* Author: Carl Worth <cworth@cworth.org>
*/
+/* This indexer creates a Xapian mail index that is remarkably similar
+ * to that created by sup. The big difference, (and the thing that
+ * will keep a notmuch index from being used by sup directly), is that
+ * sup expects a serialized ruby data structure in the document's data
+ * field, but notmuch just puts the mail's filename there (trusting
+ * that the email client can get the data it needs from the filename).
+ *
+ * Note: One bug here is that sup actually merges together fields such
+ * as To, CC, Bcc etc. when finding multiple emails with the same
+ * message ID. To support something similar, notmuch should list
+ * multiple files in the data field.
+ *
+ * Other differences between sup and notmuch-index identified so far:
+ *
+ * o sup supports encrypted mime parts by prompting for a passphrase
+ * to decrypt the message. So far, notmuch doesn't support this,
+ * both because I'm too lazy to code it, and I also think doing so
+ * would present a security leak.
+ *
+ * o sup and notmuch have different heuristics for identifying (and
+ * thus ignoring) signatures. For example, sup considers a line
+ * consisting of two hyphens as a signature separator, while
+ * notmuch expects those two hyphens to be followed by a space
+ * character.
+ *
+ * o sup has been seen to split some numbers before indexing
+ * them. For example, the number 1754 in an email message was
+ * indexed by sup as separate terms 17 and 54. I couldn't find any
+ * explanation for this behavior and did not try to replicate it
+ * in notmuch.
+ */
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <time.h>
+#include <sys/time.h>
#include <iostream>
return "";
}
-int TERM_COMBINED = 0;
+/* "128 bits of thread-id ought to be enough for anybody" */
+#define NOTMUCH_THREAD_ID_BITS 128
+#define NOTMUCH_THREAD_ID_DIGITS (NOTMUCH_THREAD_ID_BITS / 4)
+typedef struct _thread_id { /* One thread ID, stored as a text string. */
+ char str[NOTMUCH_THREAD_ID_DIGITS + 1]; /* NOTMUCH_THREAD_ID_DIGITS characters plus a terminating NUL. */
+} thread_id_t;
+
+/* Fill thread_id->str with NOTMUCH_THREAD_ID_DIGITS lowercase
+ * hexadecimal digits derived from bytes read from /dev/urandom,
+ * followed by a terminating NUL.
+ *
+ * thread_id must point to writable storage. The generated ID is also
+ * printed to stdout.
+ *
+ * If /dev/urandom cannot be opened, or fewer bytes than requested can
+ * be read from it, an error is printed to stderr and the process
+ * exits with status 1 (rather than building an ID from an
+ * uninitialized value).
+ */
+static void
+thread_id_generate (thread_id_t *thread_id)
+{
+    FILE *urandom;
+    uint32_t value;
+    char *s;
+    int i;
+
+    urandom = fopen ("/dev/urandom", "r");
+    if (urandom == NULL) {
+        fprintf (stderr, "Error opening /dev/urandom: %s\n",
+                 strerror (errno));
+        fprintf (stderr, "Perhaps notmuch needs some portability fixes for your platform?\n");
+        exit (1);
+    }
+
+    /* Each 32-bit random value contributes 8 hex digits. */
+    s = thread_id->str;
+    for (i = 0; i < NOTMUCH_THREAD_ID_DIGITS; i += 8) {
+        if (fread (&value, sizeof (value), 1, urandom) != 1) {
+            fprintf (stderr, "Error reading from /dev/urandom: %s\n",
+                     ferror (urandom) ? strerror (errno)
+                                      : "unexpected end of file");
+            fclose (urandom);
+            exit (1);
+        }
+        /* 8 digits plus the NUL, which the next iteration overwrites;
+         * the cast makes the argument match the %x conversion. */
+        snprintf (s, 9, "%08x", (unsigned int) value);
+        s += 8;
+    }
+
+    fclose (urandom);
+
+    printf ("Generated thread id: %s\n", thread_id->str);
+}
static void
add_term (Xapian::Document doc,
InternetAddress *address,
const char *prefix_name)
{
- const char *name;
- int own_name = 0;
-
- name = internet_address_get_name (address);
-
- /* In the absence of a name, we'll strip the part before the @
- * from the address. */
- if (! name) {
- InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
- const char *addr = internet_address_mailbox_get_addr (mailbox);
- const char *at;
+ if (INTERNET_ADDRESS_IS_MAILBOX(address)) {
+ const char *name;
+ int own_name = 0;
+
+ name = internet_address_get_name (address);
+
+ /* In the absence of a name, we'll strip the part before the @
+ * from the address. */
+ if (! name) {
+ InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
+ const char *addr = internet_address_mailbox_get_addr (mailbox);
+ const char *at;
+
+ at = strchr (addr, '@');
+ if (at) {
+ name = strndup (addr, at - addr);
+ own_name = 1;
+ }
+ }
- at = strchr (addr, '@');
- if (at) {
- name = strndup (addr, at - addr);
- own_name = 1;
+ if (name)
+ gen_terms (term_gen, prefix_name, name);
+
+ if (own_name)
+ free ((void *) name);
+ } else if (INTERNET_ADDRESS_IS_GROUP (address)) {
+ InternetAddressGroup *group = INTERNET_ADDRESS_GROUP (address);
+ InternetAddressList *list = internet_address_group_get_members(group);
+ if (list) {
+ int length = internet_address_list_length(list);
+ int i;
+
+ for (i = 0; i < length; i++)
+ gen_terms_address_name(term_gen,
+ internet_address_list_get_address(list, i),
+ prefix_name);
}
}
-
- if (name)
- gen_terms (term_gen, prefix_name, name);
-
- if (own_name)
- free ((void *) name);
}
static void
int i;
InternetAddress *address;
+ if (addresses == NULL)
+ return;
+
for (i = 0; i < internet_address_list_length (addresses); i++) {
address = internet_address_list_get_address (addresses, i);
gen_terms_address_name (term_gen, address, address_type);
InternetAddress *address,
const char *prefix_name)
{
- InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
- const char *addr;
-
- addr = internet_address_mailbox_get_addr (mailbox);
-
- if (addr)
- add_term (doc, prefix_name, addr);
+ if (INTERNET_ADDRESS_IS_MAILBOX(address)) {
+ InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
+ const char *addr;
+
+ addr = internet_address_mailbox_get_addr (mailbox);
+
+ if (addr)
+ add_term (doc, prefix_name, addr);
+ } else if (INTERNET_ADDRESS_IS_GROUP (address)) {
+ InternetAddressGroup *group = INTERNET_ADDRESS_GROUP (address);
+ InternetAddressList *list = internet_address_group_get_members(group);
+ if (list) {
+ int length = internet_address_list_length(list);
+ int i;
+
+ for (i = 0; i < length; i++)
+ add_term_address_addr(doc,
+ internet_address_list_get_address(list, i),
+ prefix_name);
+ }
+ }
}
static void
int i;
InternetAddress *address;
+ if (addresses == NULL)
+ return;
+
for (i = 0; i < internet_address_list_length (addresses); i++) {
address = internet_address_list_get_address (addresses, i);
add_term_address_addr (doc, address, address_type);
{
const char *s = subject;
+ if (subject == NULL)
+ return NULL;
+
while (*s) {
while (*s && isspace (*s))
s++;
}
line_end = next_line - 1;
- /* Trim whitespace. */
- while (*next_line && isspace (*next_line))
+ /* Get to the next non-blank line. */
+ while (*next_line == '\n')
next_line++;
/* Skip lines that are quotes. */
return;
}
- if (! GMIME_IS_PART (part)) {
+ if (GMIME_IS_MESSAGE_PART (part)) {
+ GMimeMessage *message;
+
+ message = g_mime_message_part_get_message (GMIME_MESSAGE_PART (part));
+
+ gen_terms_part (term_gen, g_mime_message_get_mime_part (message));
+
+ return;
+ }
+
+ if (! (GMIME_IS_PART (part))) {
fprintf (stderr, "Warning: Not indexing unknown mime part: %s.\n",
g_type_name (G_OBJECT_TYPE (part)));
return;
}
- disposition = g_mime_object_get_content_disposition (GMIME_OBJECT (part));
+ disposition = g_mime_object_get_content_disposition (part);
if (disposition &&
strcmp (disposition->disposition, GMIME_DISPOSITION_ATTACHMENT) == 0)
{
+ const char *filename = g_mime_part_get_filename (GMIME_PART (part));
+ const char *extension;
+
add_term (term_gen.get_document (), "label", "attachment");
+ gen_terms (term_gen, "attachment", filename);
+
+ if (filename) {
+ extension = strchr (filename, '.');
+ if (extension) {
+ add_term (term_gen.get_document (), "attachment_extension",
+ extension + 1);
+ }
+ }
+
return;
}
stream = g_mime_stream_mem_new_with_byte_array (byte_array);
g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
wrapper = g_mime_part_get_content_object (GMIME_PART (part));
- g_mime_data_wrapper_write_to_stream (wrapper, stream);
+ if (wrapper)
+ g_mime_data_wrapper_write_to_stream (wrapper, stream);
g_object_unref (stream);
add_term (doc, "type", "mail");
add_term (doc, "source_id", "1");
- add_term (doc, "msgid", message_id);
- doc.add_value (NOTMUCH_VALUE_MESSAGE_ID, message_id);
+ if (message_id) {
+ add_term (doc, "msgid", message_id);
+ doc.add_value (NOTMUCH_VALUE_MESSAGE_ID, message_id);
+ }
if (thread_ids->len) {
unsigned int i;
doc.add_value (NOTMUCH_VALUE_THREAD, thread_id->str);
g_string_free (thread_id, TRUE);
- } else {
- /* If not referenced thread, use the message ID */
- add_term (doc, "thread", message_id);
- doc.add_value (NOTMUCH_VALUE_THREAD, message_id);
+ } else if (message_id) {
+ /* If not part of any existing thread, generate a new thread_id. */
+ thread_id_t thread_id;
+
+ thread_id_generate (&thread_id);
+
+ add_term (doc, "thread", thread_id.str);
+ doc.add_value (NOTMUCH_VALUE_THREAD, thread_id.str);
}
doc.add_value (NOTMUCH_VALUE_DATE, Xapian::sortable_serialise (time));
GIOChannel *channel;
GIOStatus gio_status;
GError *error = NULL;
+ int count;
+ struct timeval tv_start, tv_last, tv_now;
+ double elapsed;
if (argc < 2) {
usage (argv[0]);
channel = g_io_channel_unix_new (fileno (stdin));
+ count = 0;
+
+ gettimeofday (&tv_start, NULL);
+ tv_last = tv_start;
+
while (1) {
gio_status = g_io_channel_read_line (channel, &filename,
NULL, NULL, &error);
index_file (db, term_gen, filename);
g_free (filename);
+
+ count++;
+ if (count % 1000 == 0) {
+ gettimeofday (&tv_now, NULL);
+ printf ("Indexed %d messages (%g messages/second)\n",
+ count, 1000 / ((tv_now.tv_sec - tv_last.tv_sec) +
+ (tv_now.tv_usec - tv_last.tv_usec) / 1e6));
+ tv_last = tv_now;
+ }
}
+ gettimeofday (&tv_now, NULL);
+ elapsed = (tv_now.tv_sec - tv_start.tv_sec +
+ (tv_now.tv_usec - tv_start.tv_usec) / 1e6);
+ printf ("Completed indexing of %d messages in %g seconds (%g messages/second)\n",
+ count, elapsed, count / elapsed);
+
} catch (const Xapian::Error &error) {
cerr << "A Xapian exception occurred: " << error.get_msg () << endl;
exit (1);