lib/parse-sexp: support phrase queries.

author David Bremner <david@tethera.net>

Tue, 24 Aug 2021 15:17:22 +0000 (08:17 -0700)

committer David Bremner <david@tethera.net>

Sun, 5 Sep 2021 00:07:19 +0000 (17:07 -0700)
author David Bremner <david@tethera.net>
Tue, 24 Aug 2021 15:17:22 +0000 (08:17 -0700)
committer David Bremner <david@tethera.net>
Sun, 5 Sep 2021 00:07:19 +0000 (17:07 -0700)
diff --git a/doc/man7/notmuch-sexp-queries.rst b/doc/man7/notmuch-sexp-queries.rst

index 08e97cc3b29579d469a10a435937c03ff957e07d..b763876d3672ddffc51aa8cc810ec67c85ea1503 100644 (file)
--- a/doc/man7/notmuch-sexp-queries.rst
+++ b/doc/man7/notmuch-sexp-queries.rst
@@ -40,10 +40,12 @@ subqueries.
      Match all messages.
  
  *term*
-    Match all messages containing *term*, possibly after
-    stemming or phase splitting. For discussion of stemming in
-    notmuch see :any:`notmuch-search-terms(7)`. Stemming only applies
-    to unquoted terms (basic values) in s-expression queries.
+
+    Match all messages containing *term*, possibly after stemming or
+    phrase splitting. For discussion of stemming in notmuch see
+    :any:`notmuch-search-terms(7)`. Stemming only applies to unquoted
+    terms (basic values) in s-expression queries.  For information on
+    phrase splitting see :any:`fields`.
  
  ``(`` *field* |q1| |q2| ... |qn| ``)``
      Restrict the queries |q1| to |qn| to *field*, and combine with *and*
@@ -63,7 +65,7 @@ subqueries.
  FIELDS
  ``````
  
-*Fields* (also called *prefixes* in notmuch documentation)
+*Fields* [#aka-pref]_
  correspond to attributes of mail messages. Some are inherent (and
  immutable) like ``subject``, while others ``tag`` and ``property`` are
  settable by the user.  Each concrete field in
@@ -72,6 +74,13 @@ is discussed further under "Search prefixes" in
  :any:`notmuch-search-terms(7)`. The row *user* refers to user defined
  fields, described in :any:`notmuch-config(1)`.
  
+Most fields are either *phrase fields* [#aka-prob]_ (which match
+sequences of words), or *term fields* [#aka-bool]_ (which match exact
+strings). *Phrase splitting* breaks the term (basic value or quoted
+string) into words, ignoring punctuation. Phrase splitting is applied to
+terms in phrase (probabilistic) fields. Both phrase splitting and
+stemming apply only in phrase fields.
+
  .. _field-table:
  
  .. table:: Fields with supported modifiers
@@ -138,10 +147,23 @@ EXAMPLES
  ``(not Bob Marley)``
      Match messages containing neither "Bob" nor "Marley", nor their stems,
  
+``"quick fox"`` ``quick-fox`` ``quick@fox``
+    Match the *phrase* "quick" followed by "fox" in phrase fields (or
+    outside a field). Match the literal string in a term field.
+
  ``(subject quick "brown fox")``
      Match messages whose subject contains "quick" (anywhere, stemmed) and
      the phrase "brown fox".
  
+NOTES
+=====
+
+.. [#aka-pref] a.k.a. prefixes
+
+.. [#aka-prob] a.k.a. probabilistic prefixes
+
+.. [#aka-bool] a.k.a. boolean prefixes
+
  .. |q1| replace:: :math:`q_1`
  .. |q2| replace:: :math:`q_2`
  .. |qn| replace:: :math:`q_n`
diff --git a/lib/parse-sexp.cc b/lib/parse-sexp.cc

index 2555605853f3b0803ca8b463ae4288da85244aae..0917f50566ac5104b5a9839825e05e4e27b319ea 100644 (file)
--- a/lib/parse-sexp.cc
+++ b/lib/parse-sexp.cc
@@ -2,7 +2,7 @@
  
  #if HAVE_SFSEXP
  #include "sexp.h"
-
+#include "unicode-util.h"
  
  /* _sexp is used for file scope symbols to avoid clashing with
   * definitions from sexp.h */
@@ -67,6 +67,36 @@ _sexp_combine_query (notmuch_database_t *notmuch,
                                 sx->next, output);
  }
  
+static notmuch_status_t
+_sexp_parse_phrase (std::string term_prefix, const char *phrase, Xapian::Query &output)
+{
+    Xapian::Utf8Iterator p (phrase);
+    Xapian::Utf8Iterator end;
+    std::vector<std::string> terms;
+
+    while (p != end) {
+       Xapian::Utf8Iterator start;
+       while (p != end && ! Xapian::Unicode::is_wordchar (*p))
+           p++;
+
+       if (p == end)
+           break;
+
+       start = p;
+
+       while (p != end && Xapian::Unicode::is_wordchar (*p))
+           p++;
+
+       if (p != start) {
+           std::string word (start, p);
+           word = Xapian::Unicode::tolower (word);
+           terms.push_back (term_prefix + word);
+       }
+    }
+    output = Xapian::Query (Xapian::Query::OP_PHRASE, terms.begin (), terms.end ());
+    return NOTMUCH_STATUS_SUCCESS;
+}
+
  /* Here we expect the s-expression to be a proper list, with first
   * element defining and operation, or as a special case the empty
   * list */
@@ -80,13 +110,12 @@ _sexp_to_xapian_query (notmuch_database_t *notmuch, const _sexp_prefix_t *parent
         std::string term = Xapian::Unicode::tolower (sx->val);
         Xapian::Stem stem = *(notmuch->stemmer);
         std::string term_prefix = parent ? _find_prefix (parent->name) : "";
-       if (sx->aty == SEXP_BASIC)
-           term = "Z" + term_prefix + stem (term);
-       else
-           term = term_prefix + term;
-
-       output = Xapian::Query (term);
-       return NOTMUCH_STATUS_SUCCESS;
+       if (sx->aty == SEXP_BASIC && unicode_word_utf8 (sx->val)) {
+           output = Xapian::Query ("Z" + term_prefix + stem (term));
+           return NOTMUCH_STATUS_SUCCESS;
+       } else {
+           return _sexp_parse_phrase (term_prefix, sx->val, output);
+       }
      }
  
      /* Empty list */
diff --git a/test/T081-sexpr-search.sh b/test/T081-sexpr-search.sh

index 90cef50c237e48bb083a22f4e647ffdb850e06fa..4a051a50676cfc962467ff735667e3d113f5f572 100755 (executable)
--- a/test/T081-sexpr-search.sh
+++ b/test/T081-sexpr-search.sh
@@ -102,15 +102,32 @@ EOF
  test_expect_equal_file EXPECTED OUTPUT
  
  test_begin_subtest "Search by 'subject' (utf-8, phrase-token):"
-test_subtest_known_broken
  output=$(notmuch search --query=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize)
  test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
  
  test_begin_subtest "Search by 'subject' (utf-8, quoted string):"
-test_subtest_known_broken
  output=$(notmuch search --query=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize)
  test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
  
+test_begin_subtest "Search by 'subject' (combine phrase, term):"
+output=$(notmuch search --query=sexp '(subject Mac "compatibility issues")' | notmuch_search_sanitize)
+test_expect_equal "$output" "thread:XXX   2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)"
+
+test_begin_subtest "Search by 'subject' (combine phrase, term 2):"
+notmuch search --query=sexp '(subject (or utf8 "compatibility issues"))' | notmuch_search_sanitize > OUTPUT
+cat <<EOF > EXPECTED
+thread:XXX   2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
+thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "Search by 'subject' (combine phrase, term 3):"
+notmuch search --query=sexp '(subject issues X/Darwin)' | notmuch_search_sanitize > OUTPUT
+cat <<EOF > EXPECTED
+thread:XXX   2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
  test_begin_subtest "Unbalanced parens"
  # A code 1 indicates the error was handled (a crash will return e.g. 139).
  test_expect_code 1 "notmuch search --query=sexp '('"
author	David Bremner <david@tethera.net>
	Tue, 24 Aug 2021 15:17:22 +0000 (08:17 -0700)
committer	David Bremner <david@tethera.net>
	Sun, 5 Sep 2021 00:07:19 +0000 (17:07 -0700)
doc/man7/notmuch-sexp-queries.rst		patch \| blob \| history
lib/parse-sexp.cc		patch \| blob \| history
test/T081-sexpr-search.sh		patch \| blob \| history