aboutsummaryrefslogtreecommitdiff
path: root/util/unicode-util.c
diff options
context:
space:
mode:
authorDavid Bremner <david@tethera.net>2019-03-25 23:07:24 -0300
committerDavid Bremner <david@tethera.net>2019-05-25 06:51:12 -0300
commit781125c9e92a2b9a2b9fbe54adec28ddb60f35b1 (patch)
tree5625ddaa918acc0eec1b22a5d551c624fcefb311 /util/unicode-util.c
parent46ab6013a29233b32dba49cf9c50e70fd02db1c3 (diff)
util: add unicode_word_utf8
This originally used Xapian::Unicode::is_wordchar, but that forces clients to link directly to libxapian, which seems like it might be busywork if nothing else.
Diffstat (limited to 'util/unicode-util.c')
-rw-r--r--util/unicode-util.c43
1 files changed, 43 insertions, 0 deletions
diff --git a/util/unicode-util.c b/util/unicode-util.c
new file mode 100644
index 00000000..312e900f
--- /dev/null
+++ b/util/unicode-util.c
@@ -0,0 +1,43 @@
+#include "unicode-util.h"
+
+/* Word-character test modelled on Xapian::Unicode::is_wordchar; it is
+ * reimplemented here on top of GLib so that clients are not forced to
+ * link directly to libxapian.
+ *
+ * Returns true when the general category GLib reports for ch is a
+ * letter, a mark, a number or connector punctuation, false otherwise.
+ */
+static bool
+unicode_is_wordchar (notmuch_unichar ch)
+{
+    bool is_word = false;
+
+    switch (g_unichar_type (ch)) {
+    /* letters */
+    case G_UNICODE_LOWERCASE_LETTER:
+    case G_UNICODE_UPPERCASE_LETTER:
+    case G_UNICODE_TITLECASE_LETTER:
+    case G_UNICODE_OTHER_LETTER:
+    case G_UNICODE_MODIFIER_LETTER:
+    /* marks */
+    case G_UNICODE_SPACING_MARK:
+    case G_UNICODE_NON_SPACING_MARK:
+    case G_UNICODE_ENCLOSING_MARK:
+    /* numbers */
+    case G_UNICODE_DECIMAL_NUMBER:
+    case G_UNICODE_OTHER_NUMBER:
+    case G_UNICODE_LETTER_NUMBER:
+    /* connector punctuation, e.g. '_' */
+    case G_UNICODE_CONNECT_PUNCTUATION:
+        is_word = true;
+        break;
+    default:
+        /* every other category is not part of a word */
+        break;
+    }
+
+    return is_word;
+}
+
+/* Return true if utf8_str consists solely of word characters, as
+ * classified by unicode_is_wordchar.
+ *
+ * The empty string yields true (it contains no non-word character).
+ * A NULL pointer, or a string that is not valid UTF-8, yields false.
+ */
+bool
+unicode_word_utf8 (const char *utf8_str)
+{
+    gunichar *decoded;
+    const gunichar *p;
+    bool ret;
+
+    if (utf8_str == NULL)
+        return false;
+
+    /* Use the validating converter rather than g_utf8_to_ucs4_fast:
+     * the fast variant does no error checking, so its behaviour on
+     * malformed input is not defined. g_utf8_to_ucs4 returns NULL
+     * when the conversion fails. */
+    decoded = g_utf8_to_ucs4 (utf8_str, -1, NULL, NULL, NULL);
+    if (decoded == NULL)
+        return false;
+
+    /* Advance past the leading run of word characters. */
+    p = decoded;
+    while (*p && unicode_is_wordchar (*p))
+        p++;
+
+    /* Only a string made up entirely of word characters reaches the
+     * terminating 0. */
+    ret = (*p == '\0');
+
+    g_free (decoded);
+    return ret;
+}