Optimize thread search using matched docid sets.
[notmuch] / lib / query.cc
1 /* query.cc - Support for searching a notmuch database
2  *
3  * Copyright © 2009 Carl Worth
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program.  If not, see http://www.gnu.org/licenses/ .
17  *
18  * Author: Carl Worth <cworth@cworth.org>
19  */
20
#include "notmuch-private.h"
#include "database-private.h"

#include <limits.h> /* CHAR_BIT */

#include <glib.h> /* GHashTable, GPtrArray */
25
26 struct _notmuch_query {
27     notmuch_database_t *notmuch;
28     const char *query_string;
29     notmuch_sort_t sort;
30 };
31
32 typedef struct _notmuch_mset_messages {
33     notmuch_messages_t base;
34     notmuch_database_t *notmuch;
35     Xapian::MSetIterator iterator;
36     Xapian::MSetIterator iterator_end;
37 } notmuch_mset_messages_t;
38
/* A set of document ids, stored as an array of membership flags. */
struct _notmuch_doc_id_set {
    /* Flag storage; allocated and filled by _notmuch_doc_id_set_init. */
    unsigned int *bitmap;

    /* Exclusive upper bound on member ids: an id >= bound is never
     * reported as a member and is ignored on removal. */
    unsigned int bound;
};
43
44 struct _notmuch_threads {
45     notmuch_query_t *query;
46
47     /* The ordered list of doc ids matched by the query. */
48     GArray *doc_ids;
49     /* Our iterator's current position in doc_ids. */
50     unsigned int doc_id_pos;
51     /* The set of matched docid's that have not been assigned to a
52      * thread. Initially, this contains every docid in doc_ids. */
53     notmuch_doc_id_set_t match_set;
54 };
55
56 notmuch_query_t *
57 notmuch_query_create (notmuch_database_t *notmuch,
58                       const char *query_string)
59 {
60     notmuch_query_t *query;
61
62 #ifdef DEBUG_QUERY
63     fprintf (stderr, "Query string is:\n%s\n", query_string);
64 #endif
65
66     query = talloc (NULL, notmuch_query_t);
67     if (unlikely (query == NULL))
68         return NULL;
69
70     query->notmuch = notmuch;
71
72     query->query_string = talloc_strdup (query, query_string);
73
74     query->sort = NOTMUCH_SORT_NEWEST_FIRST;
75
76     return query;
77 }
78
79 const char *
80 notmuch_query_get_query_string (notmuch_query_t *query)
81 {
82     return query->query_string;
83 }
84
85 void
86 notmuch_query_set_sort (notmuch_query_t *query, notmuch_sort_t sort)
87 {
88     query->sort = sort;
89 }
90
91 notmuch_sort_t
92 notmuch_query_get_sort (notmuch_query_t *query)
93 {
94     return query->sort;
95 }
96
97 /* We end up having to call the destructors explicitly because we had
98  * to use "placement new" in order to initialize C++ objects within a
99  * block that we allocated with talloc. So C++ is making talloc
100  * slightly less simple to use, (we wouldn't need
101  * talloc_set_destructor at all otherwise).
102  */
103 static int
104 _notmuch_messages_destructor (notmuch_mset_messages_t *messages)
105 {
106     messages->iterator.~MSetIterator ();
107     messages->iterator_end.~MSetIterator ();
108
109     return 0;
110 }
111
112 notmuch_messages_t *
113 notmuch_query_search_messages (notmuch_query_t *query)
114 {
115     notmuch_database_t *notmuch = query->notmuch;
116     const char *query_string = query->query_string;
117     notmuch_mset_messages_t *messages;
118
119     messages = talloc (query, notmuch_mset_messages_t);
120     if (unlikely (messages == NULL))
121         return NULL;
122
123     try {
124
125         messages->base.is_of_list_type = FALSE;
126         messages->base.iterator = NULL;
127         messages->notmuch = notmuch;
128         new (&messages->iterator) Xapian::MSetIterator ();
129         new (&messages->iterator_end) Xapian::MSetIterator ();
130
131         talloc_set_destructor (messages, _notmuch_messages_destructor);
132
133         Xapian::Enquire enquire (*notmuch->xapian_db);
134         Xapian::Query mail_query (talloc_asprintf (query, "%s%s",
135                                                    _find_prefix ("type"),
136                                                    "mail"));
137         Xapian::Query string_query, final_query;
138         Xapian::MSet mset;
139         unsigned int flags = (Xapian::QueryParser::FLAG_BOOLEAN |
140                               Xapian::QueryParser::FLAG_PHRASE |
141                               Xapian::QueryParser::FLAG_LOVEHATE |
142                               Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE |
143                               Xapian::QueryParser::FLAG_WILDCARD |
144                               Xapian::QueryParser::FLAG_PURE_NOT);
145
146         if (strcmp (query_string, "") == 0 ||
147             strcmp (query_string, "*") == 0)
148         {
149             final_query = mail_query;
150         } else {
151             string_query = notmuch->query_parser->
152                 parse_query (query_string, flags);
153             final_query = Xapian::Query (Xapian::Query::OP_AND,
154                                          mail_query, string_query);
155         }
156
157         enquire.set_weighting_scheme (Xapian::BoolWeight());
158
159         switch (query->sort) {
160         case NOTMUCH_SORT_OLDEST_FIRST:
161             enquire.set_sort_by_value (NOTMUCH_VALUE_TIMESTAMP, FALSE);
162             break;
163         case NOTMUCH_SORT_NEWEST_FIRST:
164             enquire.set_sort_by_value (NOTMUCH_VALUE_TIMESTAMP, TRUE);
165             break;
166         case NOTMUCH_SORT_MESSAGE_ID:
167             enquire.set_sort_by_value (NOTMUCH_VALUE_MESSAGE_ID, FALSE);
168             break;
169         case NOTMUCH_SORT_UNSORTED:
170             break;
171         }
172
173 #if DEBUG_QUERY
174         fprintf (stderr, "Final query is:\n%s\n", final_query.get_description().c_str());
175 #endif
176
177         enquire.set_query (final_query);
178
179         mset = enquire.get_mset (0, notmuch->xapian_db->get_doccount ());
180
181         messages->iterator = mset.begin ();
182         messages->iterator_end = mset.end ();
183
184         return &messages->base;
185
186     } catch (const Xapian::Error &error) {
187         fprintf (stderr, "A Xapian exception occurred performing query: %s\n",
188                  error.get_msg().c_str());
189         fprintf (stderr, "Query string was: %s\n", query->query_string);
190         notmuch->exception_reported = TRUE;
191         talloc_free (messages);
192         return NULL;
193     }
194 }
195
196 notmuch_bool_t
197 _notmuch_mset_messages_valid (notmuch_messages_t *messages)
198 {
199     notmuch_mset_messages_t *mset_messages;
200
201     mset_messages = (notmuch_mset_messages_t *) messages;
202
203     return (mset_messages->iterator != mset_messages->iterator_end);
204 }
205
206 static Xapian::docid
207 _notmuch_mset_messages_get_doc_id (notmuch_messages_t *messages)
208 {
209     notmuch_mset_messages_t *mset_messages;
210
211     mset_messages = (notmuch_mset_messages_t *) messages;
212
213     if (! _notmuch_mset_messages_valid (&mset_messages->base))
214         return 0;
215
216     return *mset_messages->iterator;
217 }
218
219 notmuch_message_t *
220 _notmuch_mset_messages_get (notmuch_messages_t *messages)
221 {
222     notmuch_message_t *message;
223     Xapian::docid doc_id;
224     notmuch_private_status_t status;
225     notmuch_mset_messages_t *mset_messages;
226
227     mset_messages = (notmuch_mset_messages_t *) messages;
228
229     if (! _notmuch_mset_messages_valid (&mset_messages->base))
230         return NULL;
231
232     doc_id = *mset_messages->iterator;
233
234     message = _notmuch_message_create (mset_messages,
235                                        mset_messages->notmuch, doc_id,
236                                        &status);
237
238     if (message == NULL &&
239        status == NOTMUCH_PRIVATE_STATUS_NO_DOCUMENT_FOUND)
240     {
241         INTERNAL_ERROR ("a messages iterator contains a non-existent document ID.\n");
242     }
243
244     return message;
245 }
246
247 void
248 _notmuch_mset_messages_move_to_next (notmuch_messages_t *messages)
249 {
250     notmuch_mset_messages_t *mset_messages;
251
252     mset_messages = (notmuch_mset_messages_t *) messages;
253
254     mset_messages->iterator++;
255 }
256
257 static notmuch_bool_t
258 _notmuch_doc_id_set_init (void *ctx,
259                           notmuch_doc_id_set_t *doc_ids,
260                           GArray *arr, unsigned int bound)
261 {
262     size_t count = (bound + sizeof (doc_ids->bitmap[0]) - 1) /
263         sizeof (doc_ids->bitmap[0]);
264     unsigned int *bitmap = talloc_zero_array (ctx, unsigned int, count);
265
266     if (bitmap == NULL)
267         return FALSE;
268
269     doc_ids->bitmap = bitmap;
270     doc_ids->bound = bound;
271
272     for (unsigned int i = 0; i < arr->len; i++) {
273         unsigned int doc_id = g_array_index(arr, unsigned int, i);
274         bitmap[doc_id / sizeof (bitmap[0])] |=
275             1 << (doc_id % sizeof (bitmap[0]));
276     }
277
278     return TRUE;
279 }
280
281 notmuch_bool_t
282 _notmuch_doc_id_set_contains (notmuch_doc_id_set_t *doc_ids,
283                               unsigned int doc_id)
284 {
285     if (doc_id >= doc_ids->bound)
286         return FALSE;
287     return (doc_ids->bitmap[doc_id / sizeof (doc_ids->bitmap[0])] &
288             (1 << (doc_id % sizeof (doc_ids->bitmap[0])))) != 0;
289 }
290
291 void
292 _notmuch_doc_id_set_remove (notmuch_doc_id_set_t *doc_ids,
293                             unsigned int doc_id)
294 {
295     if (doc_id < doc_ids->bound)
296         doc_ids->bitmap[doc_id / sizeof (doc_ids->bitmap[0])] &=
297             ~(1 << (doc_id % sizeof (doc_ids->bitmap[0])));
298 }
299
300 /* Glib objects force use to use a talloc destructor as well, (but not
301  * nearly as ugly as the for messages due to C++ objects). At
302  * this point, I'd really like to have some talloc-friendly
303  * equivalents for the few pieces of glib that I'm using. */
304 static int
305 _notmuch_threads_destructor (notmuch_threads_t *threads)
306 {
307     if (threads->doc_ids)
308         g_array_unref (threads->doc_ids);
309
310     return 0;
311 }
312
313 notmuch_threads_t *
314 notmuch_query_search_threads (notmuch_query_t *query)
315 {
316     notmuch_threads_t *threads;
317     notmuch_messages_t *messages;
318     Xapian::docid max_doc_id = 0;
319
320     threads = talloc (query, notmuch_threads_t);
321     if (threads == NULL)
322         return NULL;
323     threads->doc_ids = NULL;
324     talloc_set_destructor (threads, _notmuch_threads_destructor);
325
326     threads->query = query;
327
328     messages = notmuch_query_search_messages (query);
329     if (messages == NULL) {
330             talloc_free (threads);
331             return NULL;
332     }
333
334     threads->doc_ids = g_array_new (FALSE, FALSE, sizeof (unsigned int));
335     while (notmuch_messages_valid (messages)) {
336         unsigned int doc_id = _notmuch_mset_messages_get_doc_id (messages);
337         g_array_append_val (threads->doc_ids, doc_id);
338         max_doc_id = MAX (max_doc_id, doc_id);
339         notmuch_messages_move_to_next (messages);
340     }
341     threads->doc_id_pos = 0;
342
343     talloc_free (messages);
344
345     if (! _notmuch_doc_id_set_init (threads, &threads->match_set,
346                                     threads->doc_ids, max_doc_id + 1)) {
347         talloc_free (threads);
348         return NULL;
349     }
350
351     return threads;
352 }
353
354 void
355 notmuch_query_destroy (notmuch_query_t *query)
356 {
357     talloc_free (query);
358 }
359
360 notmuch_bool_t
361 notmuch_threads_valid (notmuch_threads_t *threads)
362 {
363     unsigned int doc_id;
364
365     while (threads->doc_id_pos < threads->doc_ids->len) {
366         doc_id = g_array_index (threads->doc_ids, unsigned int,
367                                 threads->doc_id_pos);
368         if (_notmuch_doc_id_set_contains (&threads->match_set, doc_id))
369             break;
370
371         threads->doc_id_pos++;
372     }
373
374     return threads->doc_id_pos < threads->doc_ids->len;
375 }
376
377 notmuch_thread_t *
378 notmuch_threads_get (notmuch_threads_t *threads)
379 {
380     unsigned int doc_id;
381
382     if (! notmuch_threads_valid (threads))
383         return NULL;
384
385     doc_id = g_array_index (threads->doc_ids, unsigned int,
386                             threads->doc_id_pos);
387     return _notmuch_thread_create (threads->query,
388                                    threads->query->notmuch,
389                                    doc_id,
390                                    &threads->match_set,
391                                    threads->query->sort);
392 }
393
394 void
395 notmuch_threads_move_to_next (notmuch_threads_t *threads)
396 {
397     threads->doc_id_pos++;
398 }
399
400 void
401 notmuch_threads_destroy (notmuch_threads_t *threads)
402 {
403     talloc_free (threads);
404 }
405
406 unsigned
407 notmuch_query_count_messages (notmuch_query_t *query)
408 {
409     notmuch_database_t *notmuch = query->notmuch;
410     const char *query_string = query->query_string;
411     Xapian::doccount count = 0;
412
413     try {
414         Xapian::Enquire enquire (*notmuch->xapian_db);
415         Xapian::Query mail_query (talloc_asprintf (query, "%s%s",
416                                                    _find_prefix ("type"),
417                                                    "mail"));
418         Xapian::Query string_query, final_query;
419         Xapian::MSet mset;
420         unsigned int flags = (Xapian::QueryParser::FLAG_BOOLEAN |
421                               Xapian::QueryParser::FLAG_PHRASE |
422                               Xapian::QueryParser::FLAG_LOVEHATE |
423                               Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE |
424                               Xapian::QueryParser::FLAG_WILDCARD |
425                               Xapian::QueryParser::FLAG_PURE_NOT);
426
427         if (strcmp (query_string, "") == 0 ||
428             strcmp (query_string, "*") == 0)
429         {
430             final_query = mail_query;
431         } else {
432             string_query = notmuch->query_parser->
433                 parse_query (query_string, flags);
434             final_query = Xapian::Query (Xapian::Query::OP_AND,
435                                          mail_query, string_query);
436         }
437
438         enquire.set_weighting_scheme(Xapian::BoolWeight());
439         enquire.set_docid_order(Xapian::Enquire::ASCENDING);
440
441 #if DEBUG_QUERY
442         fprintf (stderr, "Final query is:\n%s\n", final_query.get_description().c_str());
443 #endif
444
445         enquire.set_query (final_query);
446
447         mset = enquire.get_mset (0, notmuch->xapian_db->get_doccount ());
448
449         count = mset.get_matches_estimated();
450
451     } catch (const Xapian::Error &error) {
452         fprintf (stderr, "A Xapian exception occurred: %s\n",
453                  error.get_msg().c_str());
454         fprintf (stderr, "Query string was: %s\n", query->query_string);
455     }
456
457     return count;
458 }