git.notmuchmail.org Git - notmuch/blob - lib/index.cc

   1 /*
   2  * Copyright © 2009 Carl Worth
   3  *
   4  * This program is free software: you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation, either version 3 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program.  If not, see http://www.gnu.org/licenses/ .
  16  *
  17  * Author: Carl Worth <cworth@cworth.org>
  18  */
  19
  20 #include "notmuch-private.h"
  21
  22 #include <gmime/gmime.h>
  23 #include <gmime/gmime-filter.h>
  24
  25 #include <xapian.h>
  26
  27 /* Oh, how I wish that gobject didn't require so much noisy boilerplate!
  28  * (Though I have at least eliminated some of the stock set...) */
  29 typedef struct _NotmuchFilterDiscardUuencode NotmuchFilterDiscardUuencode;
  30 typedef struct _NotmuchFilterDiscardUuencodeClass NotmuchFilterDiscardUuencodeClass;
  31
  32 /**
  33  * NotmuchFilterDiscardUuencode:
  34  *
  35  * @parent_object: parent #GMimeFilter
  36  * @encode: encoding vs decoding
  37  * @state: State of the parser
  38  *
  39  * A filter to discard uuencoded portions of an email.
  40  *
  41  * A uuencoded portion is identified as beginning with a line
  42  * matching:
  43  *
  44  *      begin [0-7][0-7][0-7] .*
  45  *
  46  * After that detection, and beginning with the following line,
  47  * characters will be discarded as long as the first character of each
  48  * line begins with M and subsequent characters on the line are within
  49  * the range of ASCII characters from ' ' to '`'.
  50  *
  51  * This is not a perfect UUencode filter. It's possible to have a
  52  * message that will legitimately match that pattern, (so that some
  53  * legitimate content is discarded). And for most UUencoded files, the
  54  * final line of encoded data (the line not starting with M) will be
  55  * indexed.
  56  **/
  57 struct _NotmuchFilterDiscardUuencode {
  58     GMimeFilter parent_object;
  59     int state;
  60 };
  61
  62 struct _NotmuchFilterDiscardUuencodeClass {
  63     GMimeFilterClass parent_class;
  64 };
  65
  66 static GMimeFilter *notmuch_filter_discard_uuencode_new (void);
  67
  68 static void notmuch_filter_discard_uuencode_finalize (GObject *object);
  69
  70 static GMimeFilter *filter_copy (GMimeFilter *filter);
  71 static void filter_filter (GMimeFilter *filter, char *in, size_t len, size_t prespace,
  72                            char **out, size_t *outlen, size_t *outprespace);
  73 static void filter_complete (GMimeFilter *filter, char *in, size_t len, size_t prespace,
  74                              char **out, size_t *outlen, size_t *outprespace);
  75 static void filter_reset (GMimeFilter *filter);
  76
  77
  78 static GMimeFilterClass *parent_class = NULL;
  79
  80 static void
  81 notmuch_filter_discard_uuencode_class_init (NotmuchFilterDiscardUuencodeClass *klass)
  82 {
  83     GObjectClass *object_class = G_OBJECT_CLASS (klass);
  84     GMimeFilterClass *filter_class = GMIME_FILTER_CLASS (klass);
  85
  86     parent_class = (GMimeFilterClass *) g_type_class_ref (GMIME_TYPE_FILTER);
  87
  88     object_class->finalize = notmuch_filter_discard_uuencode_finalize;
  89
  90     filter_class->copy = filter_copy;
  91     filter_class->filter = filter_filter;
  92     filter_class->complete = filter_complete;
  93     filter_class->reset = filter_reset;
  94 }
  95
  96 static void
  97 notmuch_filter_discard_uuencode_finalize (GObject *object)
  98 {
  99     G_OBJECT_CLASS (parent_class)->finalize (object);
 100 }
 101
 102 static GMimeFilter *
 103 filter_copy (GMimeFilter *gmime_filter)
 104 {
 105     (void) gmime_filter;
 106     return notmuch_filter_discard_uuencode_new ();
 107 }
 108
 109 static void
 110 filter_filter (GMimeFilter *gmime_filter, char *inbuf, size_t inlen, size_t prespace,
 111                char **outbuf, size_t *outlen, size_t *outprespace)
 112 {
 113     NotmuchFilterDiscardUuencode *filter = (NotmuchFilterDiscardUuencode *) gmime_filter;
 114     register const char *inptr = inbuf;
 115     const char *inend = inbuf + inlen;
 116     char *outptr;
 117
 118     (void) prespace;
 119
 120     /* Simple, linear state-transition diagram for our filter.
 121      *
 122      * If the character being processed is within the range of [a, b]
 123      * for the current state then we transition next_if_match
 124      * state. If not, we transition to the next_if_not_match state.
 125      *
 126      * The final two states are special in that they are the states in
 127      * which we discard data. */
 128     static const struct {
 129         int state;
 130         int a;
 131         int b;
 132         int next_if_match;
 133         int next_if_not_match;
 134     } states[] = {
 135         {0,  'b',  'b',  1,  0},
 136         {1,  'e',  'e',  2,  0},
 137         {2,  'g',  'g',  3,  0},
 138         {3,  'i',  'i',  4,  0},
 139         {4,  'n',  'n',  5,  0},
 140         {5,  ' ',  ' ',  6,  0},
 141         {6,  '0',  '7',  7,  0},
 142         {7,  '0',  '7',  8,  0},
 143         {8,  '0',  '7',  9,  0},
 144         {9,  ' ',  ' ',  10, 0},
 145         {10, '\n', '\n', 11, 10},
 146         {11, 'M',  'M',  12, 0},
 147         {12, ' ',  '`',  12, 11}
 148     };
 149     int next;
 150
 151     g_mime_filter_set_size (gmime_filter, inlen, FALSE);
 152     outptr = gmime_filter->outbuf;
 153
 154     while (inptr < inend) {
 155         if (*inptr >= states[filter->state].a &&
 156             *inptr <= states[filter->state].b)
 157         {
 158             next = states[filter->state].next_if_match;
 159         }
 160         else
 161         {
 162             next = states[filter->state].next_if_not_match;
 163         }
 164
 165         if (filter->state < 11)
 166             *outptr++ = *inptr;
 167
 168         filter->state = next;
 169         inptr++;
 170     }
 171
 172     *outlen = outptr - gmime_filter->outbuf;
 173     *outprespace = gmime_filter->outpre;
 174     *outbuf = gmime_filter->outbuf;
 175 }
 176
 177 static void
 178 filter_complete (GMimeFilter *filter, char *inbuf, size_t inlen, size_t prespace,
 179                  char **outbuf, size_t *outlen, size_t *outprespace)
 180 {
 181     if (inbuf && inlen)
 182         filter_filter (filter, inbuf, inlen, prespace, outbuf, outlen, outprespace);
 183 }
 184
 185 static void
 186 filter_reset (GMimeFilter *gmime_filter)
 187 {
 188     NotmuchFilterDiscardUuencode *filter = (NotmuchFilterDiscardUuencode *) gmime_filter;
 189
 190     filter->state = 0;
 191 }
 192
 193 /**
 194  * notmuch_filter_discard_uuencode_new:
 195  *
 196  * Returns: a new #NotmuchFilterDiscardUuencode filter.
 197  **/
 198 static GMimeFilter *
 199 notmuch_filter_discard_uuencode_new (void)
 200 {
 201     static GType type = 0;
 202     NotmuchFilterDiscardUuencode *filter;
 203
 204     if (!type) {
 205         static const GTypeInfo info = {
 206             sizeof (NotmuchFilterDiscardUuencodeClass),
 207             NULL, /* base_class_init */
 208             NULL, /* base_class_finalize */
 209             (GClassInitFunc) notmuch_filter_discard_uuencode_class_init,
 210             NULL, /* class_finalize */
 211             NULL, /* class_data */
 212             sizeof (NotmuchFilterDiscardUuencode),
 213             0,    /* n_preallocs */
 214             NULL, /* instance_init */
 215             NULL  /* value_table */
 216         };
 217
 218         type = g_type_register_static (GMIME_TYPE_FILTER, "NotmuchFilterDiscardUuencode", &info, (GTypeFlags) 0);
 219     }
 220
 221     filter = (NotmuchFilterDiscardUuencode *) g_object_newv (type, 0, NULL);
 222     filter->state = 0;
 223
 224     return (GMimeFilter *) filter;
 225 }
 226
 227 /* We're finally down to a single (NAME + address) email "mailbox". */
 228 static void
 229 _index_address_mailbox (notmuch_message_t *message,
 230                         const char *prefix_name,
 231                         InternetAddress *address)
 232 {
 233     InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
 234     const char *name, *addr, *combined;
 235     void *local = talloc_new (message);
 236
 237     name = internet_address_get_name (address);
 238     addr = internet_address_mailbox_get_addr (mailbox);
 239
 240     /* Combine the name and address and index them as a phrase. */
 241     if (name && addr)
 242         combined = talloc_asprintf (local, "%s %s", name, addr);
 243     else if (name)
 244         combined = name;
 245     else
 246         combined = addr;
 247
 248     if (combined)
 249         _notmuch_message_gen_terms (message, prefix_name, combined);
 250
 251     talloc_free (local);
 252 }
 253
 254 static void
 255 _index_address_list (notmuch_message_t *message,
 256                      const char *prefix_name,
 257                      InternetAddressList *addresses);
 258
 259 /* The outer loop over the InternetAddressList wasn't quite enough.
 260  * There can actually be a tree here where a single member of the list
 261  * is a "group" containing another list. Recurse please.
 262  */
 263 static void
 264 _index_address_group (notmuch_message_t *message,
 265                       const char *prefix_name,
 266                       InternetAddress *address)
 267 {
 268     InternetAddressGroup *group;
 269     InternetAddressList *list;
 270
 271     group = INTERNET_ADDRESS_GROUP (address);
 272     list = internet_address_group_get_members (group);
 273
 274     if (! list)
 275         return;
 276
 277     _index_address_list (message, prefix_name, list);
 278 }
 279
 280 static void
 281 _index_address_list (notmuch_message_t *message,
 282                      const char *prefix_name,
 283                      InternetAddressList *addresses)
 284 {
 285     int i;
 286     InternetAddress *address;
 287
 288     if (addresses == NULL)
 289         return;
 290
 291     for (i = 0; i < internet_address_list_length (addresses); i++) {
 292         address = internet_address_list_get_address (addresses, i);
 293         if (INTERNET_ADDRESS_IS_MAILBOX (address)) {
 294             _index_address_mailbox (message, prefix_name, address);
 295         } else if (INTERNET_ADDRESS_IS_GROUP (address)) {
 296             _index_address_group (message, prefix_name, address);
 297         } else {
 298             INTERNAL_ERROR ("GMime InternetAddress is neither a mailbox nor a group.\n");
 299         }
 300     }
 301 }
 302
 303 /* Callback to generate terms for each mime part of a message. */
 304 static void
 305 _index_mime_part (notmuch_message_t *message,
 306                   GMimeObject *part)
 307 {
 308     GMimeStream *stream, *filter;
 309     GMimeFilter *discard_uuencode_filter;
 310     GMimeDataWrapper *wrapper;
 311     GByteArray *byte_array;
 312     GMimeContentDisposition *disposition;
 313     char *body;
 314     const char *charset;
 315
 316     if (! part) {
 317         fprintf (stderr, "Warning: Not indexing empty mime part.\n");
 318         return;
 319     }
 320
 321     if (GMIME_IS_MULTIPART (part)) {
 322         GMimeMultipart *multipart = GMIME_MULTIPART (part);
 323         int i;
 324
 325         if (GMIME_IS_MULTIPART_SIGNED (multipart))
 326           _notmuch_message_add_term (message, "tag", "signed");
 327
 328         if (GMIME_IS_MULTIPART_ENCRYPTED (multipart))
 329           _notmuch_message_add_term (message, "tag", "encrypted");
 330
 331         for (i = 0; i < g_mime_multipart_get_count (multipart); i++) {
 332             if (GMIME_IS_MULTIPART_SIGNED (multipart)) {
 333                 /* Don't index the signature. */
 334                 if (i == 1)
 335                     continue;
 336                 if (i > 1)
 337                     fprintf (stderr, "Warning: Unexpected extra parts of multipart/signed. Indexing anyway.\n");
 338             }
 339             if (GMIME_IS_MULTIPART_ENCRYPTED (multipart)) {
 340                 /* Don't index encrypted parts. */
 341                 continue;
 342             }
 343             _index_mime_part (message,
 344                               g_mime_multipart_get_part (multipart, i));
 345         }
 346         return;
 347     }
 348
 349     if (GMIME_IS_MESSAGE_PART (part)) {
 350         GMimeMessage *mime_message;
 351
 352         mime_message = g_mime_message_part_get_message (GMIME_MESSAGE_PART (part));
 353
 354         _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
 355
 356         return;
 357     }
 358
 359     if (! (GMIME_IS_PART (part))) {
 360         fprintf (stderr, "Warning: Not indexing unknown mime part: %s.\n",
 361                  g_type_name (G_OBJECT_TYPE (part)));
 362         return;
 363     }
 364
 365     disposition = g_mime_object_get_content_disposition (part);
 366     if (disposition &&
 367         strcmp (disposition->disposition, GMIME_DISPOSITION_ATTACHMENT) == 0)
 368     {
 369         const char *filename = g_mime_part_get_filename (GMIME_PART (part));
 370
 371         _notmuch_message_add_term (message, "tag", "attachment");
 372         _notmuch_message_gen_terms (message, "attachment", filename);
 373
 374         /* XXX: Would be nice to call out to something here to parse
 375          * the attachment into text and then index that. */
 376         return;
 377     }
 378
 379     byte_array = g_byte_array_new ();
 380
 381     stream = g_mime_stream_mem_new_with_byte_array (byte_array);
 382     g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
 383
 384     filter = g_mime_stream_filter_new (stream);
 385     discard_uuencode_filter = notmuch_filter_discard_uuencode_new ();
 386
 387     g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
 388                               discard_uuencode_filter);
 389
 390     charset = g_mime_object_get_content_type_parameter (part, "charset");
 391     if (charset) {
 392         GMimeFilter *charset_filter;
 393         charset_filter = g_mime_filter_charset_new (charset, "UTF-8");
 394         /* This result can be NULL for things like "unknown-8bit".
 395          * Don't set a NULL filter as that makes GMime print
 396          * annoying assertion-failure messages on stderr. */
 397         if (charset_filter) {
 398             g_mime_stream_filter_add (GMIME_STREAM_FILTER (filter),
 399                                       charset_filter);
 400             g_object_unref (charset_filter);
 401         }
 402     }
 403
 404     wrapper = g_mime_part_get_content_object (GMIME_PART (part));
 405     if (wrapper)
 406         g_mime_data_wrapper_write_to_stream (wrapper, filter);
 407
 408     g_object_unref (stream);
 409     g_object_unref (filter);
 410     g_object_unref (discard_uuencode_filter);
 411
 412     g_byte_array_append (byte_array, (guint8 *) "\0", 1);
 413     body = (char *) g_byte_array_free (byte_array, FALSE);
 414
 415     if (body) {
 416         _notmuch_message_gen_terms (message, NULL, body);
 417
 418         free (body);
 419     }
 420 }
 421
 422 notmuch_status_t
 423 _notmuch_message_index_file (notmuch_message_t *message,
 424                              notmuch_message_file_t *message_file)
 425 {
 426     GMimeMessage *mime_message;
 427     InternetAddressList *addresses;
 428     const char *from, *subject;
 429     notmuch_status_t status;
 430
 431     status = _notmuch_message_file_get_mime_message (message_file,
 432                                                      &mime_message);
 433     if (status)
 434         return status;
 435
 436     from = g_mime_message_get_sender (mime_message);
 437
 438     addresses = internet_address_list_parse_string (from);
 439     if (addresses) {
 440         _index_address_list (message, "from", addresses);
 441         g_object_unref (addresses);
 442     }
 443
 444     addresses = g_mime_message_get_all_recipients (mime_message);
 445     if (addresses) {
 446         _index_address_list (message, "to", addresses);
 447         g_object_unref (addresses);
 448     }
 449
 450     subject = g_mime_message_get_subject (mime_message);
 451     _notmuch_message_gen_terms (message, "subject", subject);
 452
 453     _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
 454
 455     return NOTMUCH_STATUS_SUCCESS;
 456 }