aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author David Bremner <david@tethera.net> 2017-06-07 23:11:49 -0300
committer David Bremner <david@tethera.net> 2017-07-01 12:32:27 -0300
commit 6dd00d64863dfc0563877ca7899231b8c3058c49 (patch)
tree 143c3f199ff195d2f3036f6aa42e570b56d4ced4
parent 64f81f95a19b28681a74a58b8cae205bff885755 (diff)
lib/index: add simple html filter
The filter just drops all (HTML) tags. As an enabling change, pass the content type to the filter constructor so we can decide which scanner to use.
-rw-r--r-- lib/index.cc 48
-rwxr-xr-x test/T680-html-indexing.sh 5
2 files changed, 45 insertions, 8 deletions
diff --git a/lib/index.cc b/lib/index.cc
index 8a18abf4..0c4e2329 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -58,6 +58,33 @@ static const scanner_state_t uuencode_states[] = {
{12, ' ', '`', 12, 11}
};
+/* The following table is intended to implement this DFA (in 'dot'
+ format). Note that 2 and 3 are "hidden" states used to step through
+ the possible out edges of state 1.
+
+digraph html_filter {
+ 0 -> 1 [label="<"];
+ 0 -> 0;
+ 1 -> 4 [label="'"];
+ 1 -> 5 [label="\""];
+ 1 -> 0 [label=">"];
+ 1 -> 1;
+ 4 -> 1 [label="'"];
+ 4 -> 4;
+ 5 -> 1 [label="\""];
+ 5 -> 5;
+}
+*/
+static const int first_html_skipping_state = 1;
+static const scanner_state_t html_states[] = {
+ {0, '<', '<', 1, 0},
+ {1, '\'', '\'', 4, 2}, /* scanning for quote or > */
+ {1, '"', '"', 5, 3},
+ {1, '>', '>', 0, 1},
+ {4, '\'', '\'', 1, 4}, /* inside single quotes */
+ {5, '"', '"', 1, 5}, /* inside double quotes */
+};
+
/* Oh, how I wish that gobject didn't require so much noisy boilerplate!
* (Though I have at least eliminated some of the stock set...) */
typedef struct _NotmuchFilterDiscardNonTerm NotmuchFilterDiscardNonTerm;
@@ -90,6 +117,7 @@ typedef struct _NotmuchFilterDiscardNonTermClass NotmuchFilterDiscardNonTermClas
**/
struct _NotmuchFilterDiscardNonTerm {
GMimeFilter parent_object;
+ GMimeContentType *content_type;
int state;
int first_skipping_state;
const scanner_state_t *states;
@@ -99,7 +127,7 @@ struct _NotmuchFilterDiscardNonTermClass {
GMimeFilterClass parent_class;
};
-static GMimeFilter *notmuch_filter_discard_non_term_new (void);
+static GMimeFilter *notmuch_filter_discard_non_term_new (GMimeContentType *content);
static void notmuch_filter_discard_non_term_finalize (GObject *object);
@@ -138,8 +166,8 @@ notmuch_filter_discard_non_term_finalize (GObject *object)
static GMimeFilter *
filter_copy (GMimeFilter *gmime_filter)
{
- (void) gmime_filter;
- return notmuch_filter_discard_non_term_new ();
+ NotmuchFilterDiscardNonTerm *filter = (NotmuchFilterDiscardNonTerm *) gmime_filter;
+ return notmuch_filter_discard_non_term_new (filter->content_type);
}
static void
@@ -211,7 +239,7 @@ filter_reset (GMimeFilter *gmime_filter)
* Returns: a new #NotmuchFilterDiscardNonTerm filter.
**/
static GMimeFilter *
-notmuch_filter_discard_non_term_new (void)
+notmuch_filter_discard_non_term_new (GMimeContentType *content_type)
{
static GType type = 0;
NotmuchFilterDiscardNonTerm *filter;
@@ -234,9 +262,15 @@ notmuch_filter_discard_non_term_new (void)
}
filter = (NotmuchFilterDiscardNonTerm *) g_object_newv (type, 0, NULL);
+ filter->content_type = content_type;
filter->state = 0;
- filter->states = uuencode_states;
- filter->first_skipping_state = first_uuencode_skipping_state;
+ if (g_mime_content_type_is_type (content_type, "text", "html")) {
+ filter->states = html_states;
+ filter->first_skipping_state = first_html_skipping_state;
+ } else {
+ filter->states = uuencode_states;
+ filter->first_skipping_state = first_uuencode_skipping_state;
+ }
return (GMimeFilter *) filter;
}
@@ -413,7 +447,7 @@ _index_mime_part (notmuch_message_t *message,
g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
filter = g_mime_stream_filter_new (stream);
- discard_non_term_filter = notmuch_filter_discard_non_term_new ();
+ discard_non_term_filter = notmuch_filter_discard_non_term_new (content_type);
g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
discard_non_term_filter);
diff --git a/test/T680-html-indexing.sh b/test/T680-html-indexing.sh
index 5e9cc4cb..74f33708 100755
--- a/test/T680-html-indexing.sh
+++ b/test/T680-html-indexing.sh
@@ -5,10 +5,13 @@ test_description="indexing of html parts"
add_email_corpus html
test_begin_subtest 'embedded images should not be indexed'
-test_subtest_known_broken
notmuch search kwpza7svrgjzqwi8fhb2msggwtxtwgqcxp4wbqr4wjddstqmeqa7 > OUTPUT
test_expect_equal_file /dev/null OUTPUT
+test_begin_subtest 'ignore > in attribute text'
+notmuch search swordfish | notmuch_search_sanitize > OUTPUT
+test_expect_equal_file /dev/null OUTPUT
+
test_begin_subtest 'non tag text should be indexed'
notmuch search hunter2 | notmuch_search_sanitize > OUTPUT
cat <<EOF > EXPECTED