X-Git-Url: https://git.notmuchmail.org/git?p=notmuch;a=blobdiff_plain;f=notmuch-index-message.cc;h=7610aa41f97c455ef0cf650ad3a52ac03886f2d8;hp=2d0bfc28680312753018cc43bb7c628a8b7828f6;hb=5fbdbeb333b4fb8293092e8cb9f5b19da3e53ed5;hpb=7c9dbbad40e4dfb2bf412c36938771a2e93a984c diff --git a/notmuch-index-message.cc b/notmuch-index-message.cc index 2d0bfc28..7610aa41 100644 --- a/notmuch-index-message.cc +++ b/notmuch-index-message.cc @@ -17,11 +17,44 @@ * Author: Carl Worth */ +/* This indexer creates a Xapian mail index that is remarkably similar + * to that created by sup. The big difference, (and the thing that + * will keep a notmuch index from being used by sup directly), is that + * sup expects a serialized ruby data structure in the document's data + * field, but notmuch just puts the mail's filename there (trusting + * that the email client can get the data it needs from the filename). + * + * Note: One bug here is that sup actually merges together fields such + * as To, CC, Bcc etc. when finding multiple emails with the same + * message ID. To support something similar, notmuch should list + * multiple files in the data field. + * + * Other differences between sup and notmuch-index identified so far: + * + * o sup supports encrypted mime parts by prompting for a passphrase + * to decrypt the message. So far, notmuch doesn't support this, + * both because I'm too lazy to code it, and I also think doing so + * would present a security leak. + * + * o sup and notmuch have different heuristics for identifying (and + * thus ignoring) signatures. For example, sup considers a line + * consisting of two hyphens as a signature separator, while + * notmuch expects those two hyphens to be followed by a space + * character. + * + * o sup has been seen to split some numbers before indexing + * them. For example, the number 1754 in an email message was + * indexed by sup as separate terms 17 and 54. 
I couldn't find any + * explanation for this behavior and did not try to replicate it + * in notmuch. + */ + #include #include #include #include #include +#include #include @@ -138,30 +171,44 @@ gen_terms_address_name (Xapian::TermGenerator term_gen, InternetAddress *address, const char *prefix_name) { - const char *name; - int own_name = 0; - - name = internet_address_get_name (address); - - /* In the absence of a name, we'll strip the part before the @ - * from the address. */ - if (! name) { - InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address); - const char *addr = internet_address_mailbox_get_addr (mailbox); - const char *at; + if (INTERNET_ADDRESS_IS_MAILBOX(address)) { + const char *name; + int own_name = 0; + + name = internet_address_get_name (address); + + /* In the absence of a name, we'll strip the part before the @ + * from the address. */ + if (! name) { + InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address); + const char *addr = internet_address_mailbox_get_addr (mailbox); + const char *at; + + at = strchr (addr, '@'); + if (at) { + name = strndup (addr, at - addr); + own_name = 1; + } + } - at = strchr (addr, '@'); - if (at) { - name = strndup (addr, at - addr); - own_name = 1; + if (name) + gen_terms (term_gen, prefix_name, name); + + if (own_name) + free ((void *) name); + } else if (INTERNET_ADDRESS_IS_GROUP (address)) { + InternetAddressGroup *group = INTERNET_ADDRESS_GROUP (address); + InternetAddressList *list = internet_address_group_get_members(group); + if (list) { + int length = internet_address_list_length(list); + int i; + + for (i = 0; i < length; i++) + gen_terms_address_name(term_gen, + internet_address_list_get_address(list, i), + prefix_name); } } - - if (name) - gen_terms (term_gen, prefix_name, name); - - if (own_name) - free ((void *) name); } static void @@ -172,6 +219,9 @@ gen_terms_address_names (Xapian::TermGenerator term_gen, int i; InternetAddress *address; + if (addresses == NULL) + return; 
+ for (i = 0; i < internet_address_list_length (addresses); i++) { address = internet_address_list_get_address (addresses, i); gen_terms_address_name (term_gen, address, address_type); @@ -185,13 +235,27 @@ add_term_address_addr (Xapian::Document doc, InternetAddress *address, const char *prefix_name) { - InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address); - const char *addr; - - addr = internet_address_mailbox_get_addr (mailbox); - - if (addr) - add_term (doc, prefix_name, addr); + if (INTERNET_ADDRESS_IS_MAILBOX(address)) { + InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address); + const char *addr; + + addr = internet_address_mailbox_get_addr (mailbox); + + if (addr) + add_term (doc, prefix_name, addr); + } else if (INTERNET_ADDRESS_IS_GROUP (address)) { + InternetAddressGroup *group = INTERNET_ADDRESS_GROUP (address); + InternetAddressList *list = internet_address_group_get_members(group); + if (list) { + int length = internet_address_list_length(list); + int i; + + for (i = 0; i < length; i++) + add_term_address_addr(doc, + internet_address_list_get_address(list, i), + prefix_name); + } + } } static void @@ -202,6 +266,9 @@ add_terms_address_addrs (Xapian::Document doc, int i; InternetAddress *address; + if (addresses == NULL) + return; + for (i = 0; i < internet_address_list_length (addresses); i++) { address = internet_address_list_get_address (addresses, i); add_term_address_addr (doc, address, address_type); @@ -214,6 +281,9 @@ skip_re_in_subject (const char *subject) { const char *s = subject; + if (subject == NULL) + return NULL; + while (*s) { while (*s && isspace (*s)) s++; @@ -379,8 +449,8 @@ gen_terms_body_str (Xapian::TermGenerator term_gen, } line_end = next_line - 1; - /* Trim whitespace. */ - while (*next_line && isspace (*next_line)) + /* Get to the next non-blank line. */ + while (*next_line == '\n') next_line++; /* Skip lines that are quotes. 
*/ @@ -433,17 +503,41 @@ gen_terms_part (Xapian::TermGenerator term_gen, return; } - if (! GMIME_IS_PART (part)) { + if (GMIME_IS_MESSAGE_PART (part)) { + GMimeMessage *message; + + message = g_mime_message_part_get_message (GMIME_MESSAGE_PART (part)); + + gen_terms_part (term_gen, g_mime_message_get_mime_part (message)); + + return; + } + + if (! (GMIME_IS_PART (part))) { fprintf (stderr, "Warning: Not indexing unknown mime part: %s.\n", g_type_name (G_OBJECT_TYPE (part))); return; } - disposition = g_mime_object_get_content_disposition (GMIME_OBJECT (part)); + disposition = g_mime_object_get_content_disposition (part); if (disposition && strcmp (disposition->disposition, GMIME_DISPOSITION_ATTACHMENT) == 0) { + const char *filename = g_mime_part_get_filename (GMIME_PART (part)); + const char *extension; + add_term (term_gen.get_document (), "label", "attachment"); + gen_terms (term_gen, "attachment", filename); + + if (filename) { + extension = strchr (filename, '.'); + if (extension) { + add_term (term_gen.get_document (), "attachment_extension", + extension + 1); + } + } + + return; } byte_array = g_byte_array_new (); @@ -451,7 +545,8 @@ gen_terms_part (Xapian::TermGenerator term_gen, stream = g_mime_stream_mem_new_with_byte_array (byte_array); g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE); wrapper = g_mime_part_get_content_object (GMIME_PART (part)); - g_mime_data_wrapper_write_to_stream (wrapper, stream); + if (wrapper) + g_mime_data_wrapper_write_to_stream (wrapper, stream); g_object_unref (stream); @@ -567,8 +662,10 @@ index_file (Xapian::WritableDatabase db, add_term (doc, "type", "mail"); add_term (doc, "source_id", "1"); - add_term (doc, "msgid", message_id); - doc.add_value (NOTMUCH_VALUE_MESSAGE_ID, message_id); + if (message_id) { + add_term (doc, "msgid", message_id); + doc.add_value (NOTMUCH_VALUE_MESSAGE_ID, message_id); + } if (thread_ids->len) { unsigned int i; @@ -592,7 +689,7 @@ index_file (Xapian::WritableDatabase db, 
doc.add_value (NOTMUCH_VALUE_THREAD, thread_id->str); g_string_free (thread_id, TRUE); - } else { + } else if (message_id) { /* If not referenced thread, use the message ID */ add_term (doc, "thread", message_id); doc.add_value (NOTMUCH_VALUE_THREAD, message_id); @@ -624,6 +721,9 @@ main (int argc, char **argv) GIOChannel *channel; GIOStatus gio_status; GError *error = NULL; + int count; + struct timeval tv_start, tv_last, tv_now; + double elapsed; if (argc < 2) { usage (argv[0]); @@ -645,6 +745,11 @@ main (int argc, char **argv) channel = g_io_channel_unix_new (fileno (stdin)); + count = 0; + + gettimeofday (&tv_start, NULL); + tv_last = tv_start; + while (1) { gio_status = g_io_channel_read_line (channel, &filename, NULL, NULL, &error); @@ -660,8 +765,23 @@ main (int argc, char **argv) index_file (db, term_gen, filename); g_free (filename); + + count++; + if (count % 1000 == 0) { + gettimeofday (&tv_now, NULL); + printf ("Indexed %d messages (%g messages/second)\n", + count, 1000 / ((tv_now.tv_sec - tv_last.tv_sec) + + (tv_now.tv_usec - tv_last.tv_usec) / 1e6)); + tv_last = tv_now; + } } + gettimeofday (&tv_now, NULL); + elapsed = (tv_now.tv_sec - tv_start.tv_sec + + (tv_now.tv_usec - tv_start.tv_usec) / 1e6); + printf ("Completed indexing of %d messages in %g seconds (%g messages/second)\n", + count, elapsed, count / elapsed); + } catch (const Xapian::Error &error) { cerr << "A Xapian exception occurred: " << error.get_msg () << endl; exit (1);