Commit 7e381982 authored by Michael Gratton, committed by Michael Gratton

ImapDb.Database: Register new ICU-based tokeniser for FTS

SQLite's built-in tokeniser does not handle scripts that do not use
spaces for word breaking (CJK, Thai, etc), so searching in those
languages does not work well.

This adds a custom SQLite tokeniser based on ICU that breaks words for
all languages supported by that library, and uses NFKC_Casefold
normalisation to handle normalisation, case folding, and dropping of
ignorable characters.

Fixes #121
parent 90711f23
@@ -26,7 +26,7 @@ variables:
meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel
glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel
gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool
libappstream-glib-devel libgee-devel libhandy1-devel
libappstream-glib-devel libgee-devel libhandy1-devel libicu-devel
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz
@@ -37,9 +37,9 @@ variables:
itstool libappstream-glib-dev libenchant-2-dev libfolks-dev
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev
libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev
libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
libhandy-1-dev libicu-dev libjson-glib-dev libmessaging-menu-dev
libpeas-dev libsecret-1-dev libsqlite3-dev libstemmer-dev
libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb
fedora:
@@ -93,8 +93,9 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \
gnome-online-accounts-devel gspell-devel gsound-devel \
gtk3-devel iso-codes-devel itstool json-glib-devel \
libappstream-glib-devel libgee-devel libhandy1-devel \
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
libpeas-devel libsecret-devel libicu-devel libstemmer-devel \
libunwind-devel libxml2-devel libytnef-devel sqlite-devel \
webkitgtk4-devel
```
Installing dependencies on Ubuntu/Debian
@@ -108,8 +109,8 @@ sudo apt-get install meson build-essential valac \
libappstream-glib-dev libenchant-2-dev libfolks-dev \
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \
libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \
libsqlite3-dev libstemmer-dev libunwind-dev \
libjson-glib-dev libhandy-1-dev libicu-dev libpeas-dev \
libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev \
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
```
@@ -85,6 +85,7 @@ goa = dependency('goa-1.0')
gsound = dependency('gsound')
gspell = dependency('gspell-1')
gthread = dependency('gthread-2.0', version: '>=' + target_glib)
icu_uc = dependency('icu-uc', version: '>=60')
iso_codes = dependency('iso-codes')
javascriptcoregtk = dependency('javascriptcoregtk-4.0', version: '>=' + target_webkit)
json_glib = dependency('json-glib-1.0', version: '>= 1.0')
@@ -130,6 +131,15 @@ libstemmer = declare_dependency(
],
)
# Faux ICU dependency to prevent ICU from being passed to valac as a
# package by Meson
icu = declare_dependency(
dependencies: [
cc.find_library('icuuc'),
cc.find_library('icudata'),
],
)
# Optional dependencies
appstream_util = find_program('appstream-util', required: false)
desktop_file_validate = find_program('desktop-file-validate', required: false)
@@ -14,6 +14,6 @@ CREATE VIRTUAL TABLE MessageSearchTable USING fts5(
bcc,
flags,
tokenize="unicode61 remove_diacritics 2",
tokenize="geary_tokeniser",
prefix="2,4,6,8,10"
)
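For a sense of how this plugs together, a minimal smoke test along the following lines could exercise the new tokeniser (a sketch only: the table name `msg`, the sample text, and the linking arrangement are assumptions; it must be linked against the engine objects that provide `sqlite3_register_fts5_tokeniser()` from the file below):
```c
#include <stdio.h>
#include <glib.h>
#include <sqlite3.h>

// Provided by imap-db-fts5-tokeniser.c below.
extern gboolean sqlite3_register_fts5_tokeniser(sqlite3 *db);

int main(void) {
    sqlite3 *db = NULL;
    sqlite3_stmt *stmt = NULL;

    if (sqlite3_open(":memory:", &db) != SQLITE_OK ||
        !sqlite3_register_fts5_tokeniser(db)) {
        return 1;
    }

    // Same tokenize option as MessageSearchTable above; `msg` is an
    // illustrative table, not part of the Geary schema.
    sqlite3_exec(db,
                 "CREATE VIRTUAL TABLE msg USING fts5("
                 "body, tokenize=\"geary_tokeniser\")",
                 NULL, NULL, NULL);
    sqlite3_exec(db,
                 "INSERT INTO msg VALUES ('東京都に住んでいます')",
                 NULL, NULL, NULL);

    // Whether a given CJK query matches depends on ICU's
    // dictionary-based segmentation, but no spaces are needed in
    // either the text or the query.
    sqlite3_prepare_v2(db, "SELECT body FROM msg WHERE msg MATCH '東京'",
                       -1, &stmt, NULL);
    while (sqlite3_step(stmt) == SQLITE_ROW) {
        printf("%s\n", (const char *) sqlite3_column_text(stmt, 0));
    }
    sqlite3_finalize(stmt);
    sqlite3_close(db);
    return 0;
}
```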
@@ -7,6 +7,7 @@
[CCode (cname = "g_utf8_collate_key")]
extern string utf8_collate_key(string data, ssize_t len);
extern int sqlite3_register_fts5_tokeniser(Sqlite.Database db);
extern int sqlite3_register_fts5_matches(Sqlite.Database db);
extern int sqlite3_register_legacy_tokenizer(Sqlite.Database db);
@@ -630,8 +631,13 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
sqlite3_register_legacy_tokenizer(cx.db);
}
// Register custom `geary_matches()` FTS5 function to obtain
// matching tokens from FTS queries.
// Register custom FTS5 tokeniser that uses ICU to correctly
// segment at both Latin and non-Latin (e.g. CJK, Thai) word
// boundaries.
sqlite3_register_fts5_tokeniser(cx.db);
// Register custom `geary_matches()` FTS5 function that
// obtains matching tokens from FTS queries.
sqlite3_register_fts5_matches(cx.db);
if (cx.db.create_function(
/*
* Copyright © 2020 Michael Gratton <mike@vee.net>
*
* This software is licensed under the GNU Lesser General Public License
* (version 2.1 or later). See the COPYING file in this distribution.
*/
#include <sqlite3ext.h>
SQLITE_EXTENSION_INIT1
#include <glib.h>
#include <gmodule.h>
#include <unicode/ubrk.h>
#include <unicode/unorm2.h>
#include <unicode/ustring.h>
#include "unicode/utf.h"
#include "unicode/utypes.h"
// Full text search tokeniser for SQLite. This exists since SQLite's
// built-in Unicode tokeniser doesn't work with languages that don't
// use spaces as word boundaries.
//
// When generating tokens, the following process is applied to text
// using the ICU library:
//
// 1. ICU NFKC_Casefold normalisation, which handles normalisation,
// case folding and removal of default-ignorable characters.
//
// 2. ICU word-boundary tokenisation, which splits on spaces and
// other punctuation, and also uses a dictionary lookup for languages
// that do not use spaces (CJK, Thai, etc).
//
// Note: Since SQLite serialises access to a database connection,
// it's safe to use single instances of ICU services for all calls
// to a single tokeniser.
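// For example (an illustrative case, not from the original sources):
// the input "Ｇｅａｒｙメール" normalises under NFKC_Casefold to
// "gearyメール" (fullwidth Latin folded to lowercase ASCII), and
// word-boundary tokenisation then yields "geary" and "メール" as
// separate tokens even though the input contains no spaces.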
#define NORM_BUF_LEN 8
#define TOKEN_BUF_LEN 8
typedef struct {
// Singleton object, threadsafe, does not need to be deleted.
const UNormalizer2 * norm;
// Stateful object, not threadsafe, must be deleted.
UBreakIterator *iter;
} IcuTokeniser;
static int icu_create(void *context,
const char **args,
int n_args,
Fts5Tokenizer **ret) {
const UNormalizer2 *norm;
UBreakIterator *iter;
IcuTokeniser *tokeniser;
UErrorCode err = U_ZERO_ERROR;
norm = unorm2_getNFKCCasefoldInstance(&err);
if (U_FAILURE(err)) {
g_warning("Error constructing ICU normaliser: %s", u_errorName(err));
return SQLITE_ABORT;
}
// The given locale doesn't matter here since ICU doesn't
// (currently) use different word-breaking rules for different
// languages that use spaces as word boundaries, and it uses
// dictionary look-ups for CJK and other scripts that don't.
iter = ubrk_open(UBRK_WORD, "en", NULL, 0, &err);
if (U_FAILURE(err)) {
g_warning("Error constructing ICU word-breaker: %s", u_errorName(err));
// Nothing to clean up here: the tokeniser struct has not been
// allocated yet, so there is no iterator to close.
return SQLITE_ABORT;
}
tokeniser = g_new0(IcuTokeniser, 1);
tokeniser->norm = norm;
tokeniser->iter = iter;
*ret = (Fts5Tokenizer *) tokeniser;
return SQLITE_OK;
}
static void icu_delete(Fts5Tokenizer *fts5_tokeniser) {
IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser;
ubrk_close(tokeniser->iter);
g_free(tokeniser);
}
static int icu_tokenise(Fts5Tokenizer *fts5_tokeniser,
void *context,
int flags,
const char *chars,
int32_t chars_len,
int (*token_callback)(void*, int, const char*, int, int, int)) {
int ret = SQLITE_OK;
IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser;
UErrorCode err = U_ZERO_ERROR;
const UNormalizer2 *norm = tokeniser->norm;
GArray *wide_chars = NULL;
GArray *wide_offsets = NULL;
UChar *wide_data = NULL;
gsize wide_data_len_long = 0;
int32_t wide_data_len = 0;
UChar norm_buf[NORM_BUF_LEN] = {0};
UBreakIterator *iter = tokeniser->iter;
int32_t start_index, current_index = 0;
char *token_buf = NULL;
int32_t token_buf_len = TOKEN_BUF_LEN;
// Normalisation.
//
// SQLite needs the byte-index of tokens found in the chars, but
// ICU doesn't support UTF-8-based normalisation. So convert UTF-8
// input to UTF-16 char-by-char and record the byte offsets for
// each, so that when converting back to UTF-8 the byte offsets
// can be determined.
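// For example (illustrative): the UTF-8 input "ﬁn" is four bytes,
// 'ﬁ' (U+FB01, three bytes) followed by 'n'. NFKC_Casefold expands
// 'ﬁ' to "fi", so wide_chars holds {'f', 'i', 'n'} with wide_offsets
// {0, 0, 3}, and the resulting token "fin" maps back to the byte
// range [0, 4) of the original input.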
wide_chars = g_array_sized_new(FALSE, FALSE, sizeof(UChar), chars_len);
wide_offsets = g_array_sized_new(FALSE, FALSE, sizeof(int32_t), chars_len);
for (int32_t byte_offset = 0; byte_offset < chars_len;) {
UChar32 code_point;
UChar utf16_buf[U16_MAX_LENGTH];
int32_t utf16_len = 0;
UBool utf16_err = FALSE;
int32_t norm_len;
int32_t start_byte_offset = byte_offset;
U8_NEXT_OR_FFFD(chars, byte_offset, chars_len, code_point);
// Convert the code point to UTF-16 via U16_APPEND: a
// supplementary-plane character needs a surrogate pair, so
// assigning it to a single UChar would truncate it.
U16_APPEND(utf16_buf, utf16_len, U16_MAX_LENGTH, code_point, utf16_err);
norm_len = unorm2_normalize(norm,
utf16_buf, utf16_len,
norm_buf, NORM_BUF_LEN,
&err);
if (U_FAILURE(err)) {
g_warning("Token text normalisation failed");
ret = SQLITE_ABORT;
goto cleanup;
}
// NFKC may decompose a single character into multiple
// characters, e.g. 'ﬁ' into "fi", '…' into "...".
for (int i = 0; i < norm_len; i++) {
g_array_append_val(wide_chars, norm_buf[i]);
g_array_append_val(wide_offsets, start_byte_offset);
}
}
// Word breaking.
//
// UTF-16 is passed to the break iterator, hence its indexes are
// UTF-16 code unit-based. Use the offset array to convert those
// back to byte indexes for individual tokens.
wide_data = (UChar *) g_array_steal(wide_chars, &wide_data_len_long);
wide_data_len = (int32_t) wide_data_len_long;
ubrk_setText(iter, wide_data, wide_data_len, &err);
if (U_FAILURE(err)) {
g_warning("Setting word break iterator text failed");
ret = SQLITE_ABORT;
goto cleanup;
}
start_index = 0;
current_index = ubrk_first(iter);
token_buf = g_malloc0(sizeof(char) * token_buf_len);
while (current_index != UBRK_DONE && ret == SQLITE_OK) {
int32_t status = ubrk_getRuleStatus(iter);
int32_t token_char_len = current_index - start_index;
if (token_char_len > 0 &&
!(status >= UBRK_WORD_NONE && status < UBRK_WORD_NONE_LIMIT) &&
!(status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)) {
int32_t token_byte_len = 0;
int32_t token_byte_start = 0;
int32_t token_byte_end = 0;
for (;;) {
u_strToUTF8WithSub(token_buf, token_buf_len, &token_byte_len,
wide_data + start_index, token_char_len,
0xFFFD, NULL,
&err);
if (U_SUCCESS(err)) {
break;
} else if (err == U_BUFFER_OVERFLOW_ERROR) {
token_buf_len *= 2;
token_buf = g_realloc(token_buf, sizeof(char) * token_buf_len);
err = U_ZERO_ERROR;
} else {
g_warning("Conversion to UTF-8 failed");
ret = SQLITE_ABORT;
goto cleanup;
}
}
token_byte_start = g_array_index(wide_offsets, int32_t, start_index);
if (current_index < wide_data_len) {
token_byte_end = g_array_index(wide_offsets, int32_t, current_index);
} else {
token_byte_end = chars_len;
}
ret = token_callback(context,
0,
token_buf,
token_byte_len,
token_byte_start,
token_byte_end);
}
start_index = current_index;
current_index = ubrk_next(iter);
}
cleanup:
g_free(wide_data);
g_array_unref(wide_chars);
g_array_unref(wide_offsets);
g_free(token_buf);
return ret;
}
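// Looks up the fts5_api pointer using SQLite's pointer-passing
// interface (binding "fts5_api_ptr" to "SELECT fts5(?1)"), as
// described in the FTS5 extension documentation. Returns NULL if
// FTS5 is not available in this build of SQLite.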
static fts5_api *get_fts5_api(sqlite3 *db) {
int rc = SQLITE_OK;
sqlite3_stmt *stmt;
fts5_api *api = NULL;
rc = sqlite3_prepare_v2(db, "SELECT fts5(?1)",
-1, &stmt, 0);
if (rc != SQLITE_OK) {
return NULL;
}
sqlite3_bind_pointer(stmt, 1, (void*) &api, "fts5_api_ptr", NULL);
sqlite3_step(stmt);
sqlite3_finalize(stmt);
return api;
}
static const fts5_tokenizer icu_tokeniser = {
icu_create,
icu_delete,
icu_tokenise
};
gboolean sqlite3_register_fts5_tokeniser(sqlite3 *db) {
fts5_api *api;
fts5_tokenizer *tokeniser = (fts5_tokenizer *) &icu_tokeniser;
int rc = SQLITE_OK;
api = get_fts5_api(db);
if (!api) {
return FALSE;
}
rc = api->xCreateTokenizer(api,
"geary_tokeniser",
NULL,
tokeniser,
NULL);
return (rc == SQLITE_OK) ? TRUE : FALSE;
}
// Entry point for an external loadable library, required when using
// the command-line SQLite tool. The name of this function must match
// the name of the shared module.
int sqlite3_gearytokeniser_init(sqlite3 *db,
char **error_message,
const sqlite3_api_routines *api) {
g_info("Loading geary_tokeniser\n");
SQLITE_EXTENSION_INIT2(api);
return sqlite3_register_fts5_tokeniser(db) ? SQLITE_OK : SQLITE_ABORT;
}
@@ -178,6 +178,7 @@ engine_vala_sources = files(
'imap-db/imap-db-email-identifier.vala',
'imap-db/imap-db-folder.vala',
'imap-db/imap-db-fts5-matches.c',
'imap-db/imap-db-fts5-tokeniser.c',
'imap-db/imap-db-gc.vala',
'imap-db/imap-db-message-row.vala',
'imap-db/imap-db-sqlite.c',
@@ -324,6 +325,7 @@ engine_dependencies = [
gio,
glib,
gmime,
icu,
libmath,
libstemmer,
libxml,
@@ -337,10 +339,17 @@ endif
engine_build_dir = meson.current_build_dir()
engine_c_args = geary_c_args
# Suppress SQLite loadable module init code
engine_c_args += [
'-D', 'SQLITE_CORE',
]
# Generate internal VAPI for unit testing. See Meson issue
# https://github.com/mesonbuild/meson/issues/1781 for official
# internal VAPI support.
engine_vala_args = geary_vala_args
engine_vala_args += [
'--internal-header=@0@/geary-engine-internal.h'.format(engine_build_dir),
'--internal-vapi=@0@/geary-engine-internal.vapi'.format(engine_build_dir)
@@ -364,7 +373,7 @@ engine_lib = static_library('geary-engine',
dependencies: engine_dependencies,
include_directories: config_h_dir,
vala_args: engine_vala_args,
c_args: geary_c_args,
c_args: engine_c_args,
)
# Dummy target to tell Meson about the internal VAPI given the
@@ -402,3 +411,14 @@ engine_internal_dep = declare_dependency(
include_directories: include_directories('.'),
sources: engine_internal_header_fixup
)
# Compile a loadable library containing the custom tokeniser so the
# SQLite command-line app can still be used.
tokeniser_lib = shared_library('geary-tokeniser',
files('imap-db/imap-db-fts5-tokeniser.c'),
dependencies: [ glib, icu, sqlite ],
c_args: [
# Enable GLib structured logging
'-DG_LOG_USE_STRUCTURED',
],
)