From: Carl Worth Date: Mon, 19 Oct 2009 19:54:40 +0000 (-0700) Subject: notmuch: Switch from gmime to custom, ad-hoc parsing of headers. X-Git-Tag: 0.1~841 X-Git-Url: https://git.notmuchmail.org/git?p=notmuch;a=commitdiff_plain;h=0e777a8f800af062aba39a95a003f3e1d8f33793;ds=sidebyside notmuch: Switch from gmime to custom, ad-hoc parsing of headers. Since we're currently just trying to stitch together In-Reply-To and References headers we don't need that much sophistication. It's when we later add full-text searching that GMime will be useful. So for now, even though my own code here is surely very buggy compared to GMime it's also a lot faster. And speed is what we're after for the initial index creation. --- diff --git a/Makefile b/Makefile index 4af5a2eb..b7ebfb81 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,9 @@ PROGS=notmuch notmuch-index-message xapian-dump -MYCFLAGS=-Wall -O0 -g `pkg-config --cflags gmime-2.4` +MYCFLAGS=-Wall -O0 -g `pkg-config --cflags glib-2.0` MYCXXFLAGS=$(MYCFLAGS) `xapian-config --cxxflags` -MYLDFLAGS=`pkg-config --libs gmime-2.4` `xapian-config --libs` +MYLDFLAGS=`pkg-config --libs glib-2.0` `xapian-config --libs` all: $(PROGS) @@ -13,7 +13,7 @@ all: $(PROGS) %.o: %.c $(CC) -c $(CFLAGS) $(MYCFLAGS) $^ -o $@ -notmuch: notmuch.o database.o xutil.o +notmuch: notmuch.o database.o message.o xutil.o $(CC) $(MYLDFLAGS) $^ -o $@ notmuch-index-message: notmuch-index-message.cc diff --git a/database.cc b/database.cc index 36b1b580..7ea1f416 100644 --- a/database.cc +++ b/database.cc @@ -20,20 +20,12 @@ #include "notmuch-private.h" -#include -#include -#include -#include -#include -#include -#include - #include -#include - #include +#include + using namespace std; struct _notmuch_database { @@ -278,26 +270,113 @@ find_thread_ids (Xapian::Database *db, return result; } -/* Add a term for each message-id in the References header of the - * message. */ +/* Advance 'str' past any whitespace or RFC 822 comments. 
A comment is + * a (potentially nested) parenthesized sequence with '\' used to + * escape any character (including parentheses). + * + * If the sequence to be skipped continues to the end of the string, + * then 'str' will be left pointing at the final terminating '\0' + * character. + */ +static void +skip_space_and_comments (const char **str) +{ + const char *s; + + s = *str; + while (*s && (isspace (*s) || *s == '(')) { + while (*s && isspace (*s)) + s++; + if (*s == '(') { + int nesting = 1; + s++; + while (*s && nesting) { + if (*s == '(') + nesting++; + else if (*s == ')') + nesting--; + else if (*s == '\\') + if (*(s+1)) + s++; + s++; + } + } + } + + *str = s; +} + +/* Parse an RFC 822 message-id, discarding whitespace, any RFC 822 + * comments, and the '<' and '>' delimiters. + * + * If not NULL, then *next will be made to point to the first character + * not parsed, (possibly pointing to the final '\0' terminator). + * + * Returns a newly allocated string which the caller should free() + * when done with it. + * + * Returns NULL if there is any error parsing the message-id. */ +static char * +parse_message_id (const char *message_id, const char **next) +{ + const char *s, *end; + + if (message_id == NULL) + return NULL; + + s = message_id; + + skip_space_and_comments (&s); + + /* Skip any unstructured text as well. */ + while (*s && *s != '<') + s++; + + if (*s == '<') { + s++; + } else { + if (next) + *next = s; + return NULL; + } + + skip_space_and_comments (&s); + + end = s; + while (*end && *end != '>') + end++; + if (next) { + if (*end) + *next = end + 1; + else + *next = end; + } + + if (end > s && *end == '>') + end--; + if (end > s) + return strndup (s, end - s + 1); + else + return NULL; +} + +/* Parse a References header value, putting a copy of each referenced + * message-id into 'array'.
*/ static void parse_references (GPtrArray *array, - const char *refs_str) + const char *refs) { - GMimeReferences *refs, *r; - const char *message_id; + char *ref; - if (refs_str == NULL) + if (refs == NULL) return; - refs = g_mime_references_decode (refs_str); + while (*refs) { + ref = parse_message_id (refs, &refs); - for (r = refs; r; r = r->next) { - message_id = g_mime_references_get_message_id (r); - g_ptr_array_add (array, g_strdup (message_id)); + if (ref) + g_ptr_array_add (array, ref); } - - g_mime_references_free (refs); } notmuch_database_t * @@ -344,8 +423,6 @@ notmuch_database_open (const char *path) struct stat st; int err; - g_mime_init (0); - notmuch_path = g_strdup_printf ("%s/%s", path, ".notmuch"); err = stat (notmuch_path, &st); @@ -397,31 +474,17 @@ notmuch_database_add_message (notmuch_database_t *notmuch, { Xapian::WritableDatabase *db = notmuch->xapian_db; Xapian::Document doc; + notmuch_message_t *message; - GMimeStream *stream; - GMimeParser *parser; - GMimeMessage *message; GPtrArray *parents, *thread_ids; - FILE *file; - - const char *refs, *in_reply_to; - const char *message_id; + const char *refs, *in_reply_to, *date, *header; + char *message_id; - time_t time; + time_t time_value; unsigned int i; - file = fopen (filename, "r"); - if (! 
file) { - fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno)); - exit (1); - } - - stream = g_mime_stream_file_new (file); - - parser = g_mime_parser_new_with_stream (stream); - - message = g_mime_parser_construct_message (parser); + message = notmuch_message_open (filename); try { doc = Xapian::Document (); @@ -430,16 +493,27 @@ notmuch_database_add_message (notmuch_database_t *notmuch, parents = g_ptr_array_new (); - refs = g_mime_object_get_header (GMIME_OBJECT (message), "references"); + refs = notmuch_message_get_header (message, "references"); parse_references (parents, refs); - in_reply_to = g_mime_object_get_header (GMIME_OBJECT (message), - "in-reply-to"); + in_reply_to = notmuch_message_get_header (message, "in-reply-to"); parse_references (parents, in_reply_to); + for (i = 0; i < parents->len; i++) add_term (doc, "ref", (char *) g_ptr_array_index (parents, i)); - message_id = g_mime_message_get_message_id (message); + header = notmuch_message_get_header (message, "message-id"); + if (header) { + message_id = parse_message_id (header, NULL); + /* So the header value isn't RFC-compliant, but it's + * better than no message-id at all. 
*/ + if (message_id == NULL) + message_id = xstrdup (header); + } else { + /* XXX: Should generate a message_id here, (such as a SHA1 + * sum of the message itself) */ + message_id = NULL; + } thread_ids = find_thread_ids (db, parents, message_id); @@ -478,8 +552,15 @@ notmuch_database_add_message (notmuch_database_t *notmuch, doc.add_value (NOTMUCH_VALUE_THREAD, thread_id.str); } - g_mime_message_get_date (message, &time, NULL); - doc.add_value (NOTMUCH_VALUE_DATE, Xapian::sortable_serialise (time)); + free (message_id); + +/* + date = notmuch_message_get_header (message, "date"); + time_value = notmuch_parse_date (date, NULL); + + doc.add_value (NOTMUCH_VALUE_DATE, + Xapian::sortable_serialise (time_value)); +*/ db->add_document (doc); } catch (const Xapian::Error &error) { @@ -488,9 +569,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch, return NOTMUCH_STATUS_XAPIAN_EXCEPTION; } - g_object_unref (message); - g_object_unref (parser); - g_object_unref (stream); + notmuch_message_close (message); return NOTMUCH_STATUS_SUCCESS; } diff --git a/message.c b/message.c new file mode 100644 index 00000000..ea5d239a --- /dev/null +++ b/message.c @@ -0,0 +1,300 @@ +/* message.c - Utility functions for parsing an email message for notmuch. + * + * Copyright © 2009 Carl Worth + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ . 
+ * + * Author: Carl Worth + */ + +#include + +#include "notmuch-private.h" + +#include + +struct _notmuch_message { + /* File objects */ + int fd; + void *map; + + /* Header storage */ + int restrict_headers; + GHashTable *headers; + + /* Parsing state */ + char *start; + size_t size; + const char *next_line; + int parsing_started; + int parsing_finished; +}; + +static int +strcase_equal (const void *a, const void *b) +{ + return strcasecmp (a, b) == 0; +} + +static unsigned int +strcase_hash (const void *ptr) +{ + const char *s = ptr; + + /* This is the djb2 hash. */ + unsigned int hash = 5381; + while (s && *s) { + hash = ((hash << 5) + hash) + tolower (*s); + s++; + } + + return hash; +} + +notmuch_message_t * +notmuch_message_open (const char *filename) +{ + notmuch_message_t *message; + struct stat st; + + message = xcalloc (1, sizeof (notmuch_message_t)); + + message->fd = open (filename, O_RDONLY); + if (message->fd < 0) + goto FAIL; + + if (fstat (message->fd, &st) < 0) + goto FAIL; + + message->map = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, + message->fd, 0); + if (message->map == MAP_FAILED) + goto FAIL; + + message->headers = g_hash_table_new_full (strcase_hash, + strcase_equal, + free, + free); + + message->start = (char *) message->map; + message->size = st.st_size; + message->next_line = message->start; + message->parsing_started = 0; + message->parsing_finished = 0; + + return message; + + FAIL: + fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno)); + notmuch_message_close (message); + + return NULL; +} + +void +notmuch_message_close (notmuch_message_t *message) +{ + if (message == NULL) + return; + + if (message->headers) + g_hash_table_unref (message->headers); + + if (message->map) + munmap (message->map, message->size); + if (message->fd) + close (message->fd); + + free (message); +} + +void +notmuch_message_restrict_headersv (notmuch_message_t *message, + va_list va_headers) +{ + char *header; + + if 
(message->parsing_started ) { + fprintf (stderr, "Error: notmuch_message_restrict_headers called after parsing has started\n"); + exit (1); + } + + while (1) { + header = va_arg (va_headers, char*); + if (header == NULL) + break; + g_hash_table_insert (message->headers, + xstrdup (header), NULL); + } + + message->restrict_headers = 1; +} + +void +notmuch_message_restrict_headers (notmuch_message_t *message, ...) +{ + va_list va_headers; + + va_start (va_headers, message); + + notmuch_message_restrict_headersv (message, va_headers); +} + +/* With our mmapped file, we don't get the benefit of terminated + * strings, so we can't use things like strchr(). We don't even know + * if there's a newline at the end of the file so we also have to be + * careful of that. Basically, every time we advance a pointer while + * parsing we must ensure we don't go beyond our buffer. + */ +#define WITHIN(s) (((s) - message->start) < (message->size -1)) + +/* In each of the macros below, "without overrunning the buffer" means + * that the macro will never dereference a character beyond the end of + * the buffer. However, all of the macros may return a pointer + * pointing to the first character beyond the buffer. So callers + * should test with WITHIN before dereferencing the result. */ + +/* Advance 'ptr' until pointing at a non-space character in the same + * line, (without overrunning the buffer) */ +#define SKIP_SPACE_IN_LINE(ptr) \ + while (WITHIN (ptr) && (*(ptr) == ' ' || *(ptr) == '\t')) \ + (ptr)++; + +/* Advance 'ptr' until pointing at a non-space character, (without + * overrunning the buffer) */ +#define SKIP_SPACE(ptr) \ + while (WITHIN (ptr) && isspace(*(ptr))) \ + (ptr)++; + +/* Advance 'ptr' to the first occurrence of 'c' within the same + * line, (without overrunning the buffer). 
*/ +#define ADVANCE_TO(ptr, c) \ + while (WITHIN (ptr) && *(ptr) != '\n' && \ + *(ptr) != (c)) \ + { \ + (ptr)++; \ + } + +/* Advance 'ptr' to the beginning of the next line not starting with + * an initial tab or space character, (without overrunning the buffer). */ +#define ADVANCE_TO_NEXT_HEADER_LINE(ptr) \ + do { \ + ADVANCE_TO ((ptr), '\n'); \ + if (WITHIN (ptr)) \ + (ptr)++; \ + } while (WITHIN (ptr) && \ + (*(ptr) == '\t' || *(ptr) == ' ')); + +char * +copy_header_value (const char *start, const char *end) +{ + const char *s; + char *result, *r; + int was_newline = 0; + + result = xmalloc (end - start + 1); + + s = start; + r = result; + + while (s < end) { + if (*s == '\n') { + was_newline = 1; + } else { + if (*s == '\t' && was_newline) + *r = ' '; + else + *r = *s; + r++; + was_newline = 0; + } + s++; + } + + *r = '\0'; + + return result; +} + +const char * +notmuch_message_get_header (notmuch_message_t *message, + const char *header_desired) +{ + int contains; + const char *s, *colon; + char *header, *value; + int match; + + message->parsing_started = 1; + + contains = g_hash_table_lookup_extended (message->headers, + header_desired, NULL, + (gpointer *) &value); + if (contains) + return value; + + if (message->parsing_finished) + return NULL; + + while (1) { + s = message->next_line; + + if (*s == '\n') { + message->parsing_finished = 1; + return NULL; + } + + if (*s == '\t') { + fprintf (stderr, "Warning: Unexpected continued value\n"); + ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); + continue; + } + + colon = s; + ADVANCE_TO (colon, ':'); + + if (! WITHIN (colon) || *colon == '\n') { + fprintf (stderr, "Warning: Unexpected non-header line: %s\n", s); + ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); + continue; + } + + header = xstrndup (s, colon - s); + + if (message->restrict_headers && + !
g_hash_table_lookup_extended (message->headers, + header, NULL, NULL)) + { + free (header); + message->next_line = colon; + ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); + continue; + } + + s = colon + 1; + SKIP_SPACE_IN_LINE (s); + + message->next_line = s; + ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); + + value = copy_header_value (s, message->next_line); + + match = (strcasecmp (header, header_desired) == 0); + + g_hash_table_insert (message->headers, header, value); + + if (match) + return value; + } +} diff --git a/message.h b/message.h new file mode 100644 index 00000000..d0a34a10 --- /dev/null +++ b/message.h @@ -0,0 +1,19 @@ +/* message.h - Utility functions for parsing an email message for notmuch. + * + * Copyright © 2009 Carl Worth + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ . 
+ * + * Author: Carl Worth + */ diff --git a/notmuch-private.h b/notmuch-private.h index 15d6db48..b7d27e91 100644 --- a/notmuch-private.h +++ b/notmuch-private.h @@ -23,8 +23,17 @@ #include "notmuch.h" +#include #include +#include +#include +#include +#include #include +#include +#include +#include +#include NOTMUCH_BEGIN_DECLS @@ -44,6 +53,78 @@ xstrdup (const char *s); char * xstrndup (const char *s, size_t n); +/* message.c */ + +/* XXX: I haven't decided yet whether these will actually get exported + * into the public interface in notmuch.h + */ + +typedef struct _notmuch_message notmuch_message_t; + +/* Open a file containing a single email message. + * + * The caller should call notmuch_message_close when done with this. + * + * Returns NULL if any error occurs. + */ +notmuch_message_t * +notmuch_message_open (const char *filename); + +/* Close a notmuch message previously opened with notmuch_message_open. */ +void +notmuch_message_close (notmuch_message_t *message); + +/* Restrict 'message' to only save the named headers. + * + * When the caller is only interested in a short list of headers, + * known in advance, calling this function can avoid wasted time and + * memory parsing/saving header values that will never be needed. + * + * The variable arguments should be a list of const char * with a + * final '(const char *) NULL' to terminate the list. + * + * If this function is called, it must be called before any calls to + * notmuch_message_get_header for this message. + * + * After calling this function, if notmuch_message_get_header is + * called with a header name not in this list, then NULL will be + * returned even if that header exists in the actual message. + */ +void +notmuch_message_restrict_headers (notmuch_message_t *message, ...); + +/* Identical to notmuch_message_restrict_headers but accepting a va_list.
*/ +void +notmuch_message_restrict_headersv (notmuch_message_t *message, + va_list va_headers); + +/* Get the value of the specified header from the message. + * + * The header name is case insensitive. + * + * The returned value is owned by the notmuch message and is valid + * only until the message is closed. The caller should copy it if + * needing to modify the value or to hold onto it for longer. + * + * Returns NULL if the message does not contain a header line matching + * 'header'. + */ +const char * +notmuch_message_get_header (notmuch_message_t *message, + const char *header); + +/* date.c */ + +/* Parse an RFC 822 date string to a time_t value. + * + * The tz_offset argument can be used to also obtain the time-zone + * offset, (but can be NULL if the caller is not interested in that). + * + * Returns 0 on error. + */ +time_t +notmuch_parse_date (const char *str, int *tz_offset); + NOTMUCH_END_DECLS #endif