From 435a5e90f4d3250b8b6d36edebec29c91706a4df Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Wed, 4 Nov 2020 00:51:38 +1100 Subject: [PATCH 01/24] Geary.NamedFlag: Add serialise method for non-debug persistence --- src/engine/api/geary-contact.vala | 2 +- src/engine/api/geary-named-flag.vala | 3 +-- src/engine/api/geary-named-flags.vala | 12 +++++++++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/engine/api/geary-contact.vala b/src/engine/api/geary-contact.vala index 7da8e6ca0..63f43071b 100644 --- a/src/engine/api/geary-contact.vala +++ b/src/engine/api/geary-contact.vala @@ -43,7 +43,7 @@ public class Geary.Contact : BaseObject { public string serialize() { string ret = ""; foreach (NamedFlag flag in list) { - ret += flag.serialize() + " "; + ret += flag.serialise() + " "; } return ret.strip(); diff --git a/src/engine/api/geary-named-flag.vala b/src/engine/api/geary-named-flag.vala index eefd8824a..7472de3ff 100644 --- a/src/engine/api/geary-named-flag.vala +++ b/src/engine/api/geary-named-flag.vala @@ -28,7 +28,7 @@ public class Geary.NamedFlag : BaseObject, Gee.Hashable { return name.down().hash(); } - public string serialize() { + public string serialise() { return name; } @@ -36,4 +36,3 @@ public class Geary.NamedFlag : BaseObject, Gee.Hashable { return name; } } - diff --git a/src/engine/api/geary-named-flags.vala b/src/engine/api/geary-named-flags.vala index 56e486134..08fc1bad5 100644 --- a/src/engine/api/geary-named-flags.vala +++ b/src/engine/api/geary-named-flags.vala @@ -90,6 +90,17 @@ public class Geary.NamedFlags : BaseObject, Gee.Hashable { return Geary.String.stri_hash(to_string()); } + /** Formats the flags for serialising in the database. */ + public string serialise() { + var builder = new GLib.StringBuilder(); + foreach (NamedFlag flag in this.list) { + builder.append(flag.serialise()); + builder.append_c(' '); + } + return builder.str; + } + + /** Formats the flags for debugging. */ public string to_string() { string ret = "["; foreach (NamedFlag flag in list) { @@ -99,4 +110,3 @@ public class Geary.NamedFlags : BaseObject, Gee.Hashable { return ret + "]"; } } - -- GitLab From 4fe0d921477fb1fc157e11fea1fc312afab1aa6c Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Wed, 4 Nov 2020 00:55:13 +1100 Subject: [PATCH 02/24] engine: Convert from SQLite FTS3/4 to FTS5 for full-text-search Add SQL migration that drops the old FTS4 MessageSearchTable table, re-create as a FTS5 table, clean up the column names a bit, and adds a flags column so unread/starred queries can be made fast. Define a SQLite FTS5 extension function `geary_matches()` to replace the FTS3 `offsets()` function which no longer exists in FTS5, based on Tracker's implementation. Update code to FTS5 conventions (docid -> rowid, etc), use new column names, populate and update the flags column as the email's flags change, and use new match function for getting matching tokens. Advanced searches are probably currently broken, these will be fixed by subsequent commits. --- BUILDING.md | 6 +- meson.build | 6 +- sql/meson.build | 1 + sql/version-030.sql | 19 ++ src/engine/imap-db/imap-db-account.vala | 53 ++---- src/engine/imap-db/imap-db-database.vala | 5 + src/engine/imap-db/imap-db-folder.vala | 61 +++++-- src/engine/imap-db/imap-db-fts5-matches.c | 167 ++++++++++++++++++ src/engine/imap-db/imap-db-gc.vala | 2 +- src/engine/imap-db/imap-db-search-query.vala | 4 +- src/engine/meson.build | 1 + src/meson.build | 2 - .../engine/imap-db/imap-db-database-test.vala | 2 +- 13 files changed, 266 insertions(+), 63 deletions(-) create mode 100644 sql/version-030.sql create mode 100644 src/engine/imap-db/imap-db-fts5-matches.c diff --git a/BUILDING.md b/BUILDING.md index ff2c7e46e..f8ca45ae3 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -74,8 +74,10 @@ Building Geary requires the following major libraries and tools: See the `meson.build` file in the top-level directory for the complete list of required dependencies and minimum versions. -Geary also requires SQLite to be built with the compiler flag -`-DSQLITE_ENABLE_FTS3`. +Geary requires SQLite is built with both FTS3 and FTS5 support. Ensure +`--enable-fts5`, `-DSQLITE_ENABLE_FTS3` and +`-DSQLITE_ENABLE_FTS3_PARENTHESIS` are passed to the SQLite configure +script. All required libraries and tools are available from major Linux distribution's package repositories: diff --git a/meson.build b/meson.build index 03533ec16..ce279f555 100644 --- a/meson.build +++ b/meson.build @@ -158,8 +158,10 @@ web_extensions_dir = client_lib_dir / 'web-extensions' # Ensure SQLite was built correctly if not cc.has_header_symbol('sqlite3.h', 'SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER', dependencies: sqlite) - error('SQLite3 is missing FTS3 tokenizer support. Please compile it with -DSQLITE_ENABLE_FTS3.\n' - + 'See https://bugzilla.gnome.org/show_bug.cgi?id=763203 for details.') + error('SQLite3 was not built with FTS3 support. See BUILDING.md for details.') +endif +if not cc.has_header_symbol('sqlite3.h', 'Fts5ExtensionApi', dependencies: sqlite) + error('SQLite3 was not built with FTS5 support. See BUILDING.md for details.') endif # diff --git a/sql/meson.build b/sql/meson.build index 6bf30dcaf..c47972276 100644 --- a/sql/meson.build +++ b/sql/meson.build @@ -28,6 +28,7 @@ sql_files = [ 'version-027.sql', 'version-028.sql', 'version-029.sql', + 'version-030.sql', ] install_data(sql_files, diff --git a/sql/version-030.sql b/sql/version-030.sql new file mode 100644 index 000000000..48af04df6 --- /dev/null +++ b/sql/version-030.sql @@ -0,0 +1,19 @@ +-- +-- Convert full-text search from FTS3/4 to FTS5 +-- + +DROP TABLE IF EXISTS MessageSearchTable; + +CREATE VIRTUAL TABLE MessageSearchTable USING fts5( + body, + attachments, + subject, + "from", + receivers, + cc, + bcc, + flags, + + tokenize="unicode61 remove_diacritics 2", + prefix="2,4,6,8,10" +) diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index 54522b90e..7514aeba8 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -670,11 +670,11 @@ private class Geary.ImapDB.Account : BaseObject { if (query_phrases.size != 0) { sql.append(""" WHERE id IN ( - SELECT docid + SELECT rowid FROM MessageSearchTable WHERE 1=1 """); - sql_add_query_phrases(sql, query_phrases, "INTERSECT", "docid", ""); + sql_add_query_phrases(sql, query_phrases, "INTERSECT", "rowid", ""); sql.append(")"); } else sql.append(" WHERE 1=1"); @@ -980,7 +980,7 @@ private class Geary.ImapDB.Account : BaseObject { // the order of seconds, so manually perform the operation var result = cx.prepare( - "SELECT docid FROM MessageSearchTable" + "SELECT rowid FROM MessageSearchTable" ).exec(cancellable); while (!result.finished) { search_ids.add(result.rowid_at(0)); @@ -1061,7 +1061,8 @@ private class Geary.ImapDB.Account : BaseObject { Email.REQUIRED_FOR_MESSAGE | Email.Field.ORIGINATORS | Email.Field.RECEIVERS | - Email.Field.SUBJECT + Email.Field.SUBJECT | + Email.Field.FLAGS ); Email.Field db_fields; @@ -1417,30 +1418,30 @@ private class Geary.ImapDB.Account : BaseObject { StringBuilder sql = new StringBuilder(); sql.append(""" - SELECT docid, offsets(MessageSearchTable), * + SELECT rowid, geary_matches(MessageSearchTable), * FROM MessageSearchTable - WHERE docid IN ( + WHERE rowid IN ( """); sql_append_ids(sql, id_map.keys); sql.append(")"); - StringBuilder condition = new StringBuilder("AND docid IN ("); + StringBuilder condition = new StringBuilder("AND rowid IN ("); sql_append_ids(condition, id_map.keys); condition.append(")"); - sql_add_query_phrases(sql, query_phrases, "UNION", "docid, offsets(MessageSearchTable), *", + sql_add_query_phrases(sql, query_phrases, "UNION", "rowid, geary_matches(MessageSearchTable), *", condition.str); Db.Statement stmt = cx.prepare(sql.str); sql_bind_query_phrases(stmt, 0, query_phrases); - Gee.Map> search_matches = new Gee.HashMap< - ImapDB.EmailIdentifier, Gee.Set>(); + var search_matches = + new Gee.HashMap>(); Db.Result result = stmt.exec(cancellable); while (!result.finished) { - int64 docid = result.rowid_at(0); - assert(id_map.has_key(docid)); - ImapDB.EmailIdentifier id = id_map.get(docid); + int64 rowid = result.rowid_at(0); + assert(id_map.has_key(rowid)); + ImapDB.EmailIdentifier id = id_map.get(rowid); // XXX Avoid a crash when "database disk image is // malformed" error occurs. Remove this when the SQLite @@ -1451,30 +1452,12 @@ private class Geary.ImapDB.Account : BaseObject { continue; } - // offsets() function returns a list of 4 strings that are ints indicating position - // and length of match string in search table corpus - string[] offset_array = result.nonnull_string_at(1).split(" "); + var matches = new Gee.HashSet(); + matches.add_all_array(result.nonnull_string_at(1).split(",")); - Gee.Set matches = new Gee.HashSet(); - - int j = 0; - while (true) { - unowned string[] offset_string = offset_array[j:j+4]; - - int column = int.parse(offset_string[0]); - int byte_offset = int.parse(offset_string[2]); - int size = int.parse(offset_string[3]); - - unowned string text = result.nonnull_string_at(column + 2); - matches.add(text[byte_offset : byte_offset + size].down()); - - j += 4; - if (j >= offset_array.length) - break; - } - - if (search_matches.has_key(id)) + if (search_matches.has_key(id)) { matches.add_all(search_matches.get(id)); + } search_matches.set(id, matches); result.next(cancellable); diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala index cd428c89e..9365f8761 100644 --- a/src/engine/imap-db/imap-db-database.vala +++ b/src/engine/imap-db/imap-db-database.vala @@ -7,6 +7,7 @@ [CCode (cname = "g_utf8_collate_key")] extern string utf8_collate_key(string data, ssize_t len); +extern int sqlite3_register_fts5_matches(Sqlite.Database db); extern int sqlite3_register_legacy_tokenizer(Sqlite.Database db); private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase { @@ -629,6 +630,10 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase { sqlite3_register_legacy_tokenizer(cx.db); } + // Register custom `geary_matches()` FTS5 function to obtain + // matching tokens from FTS queries. + sqlite3_register_fts5_matches(cx.db); + if (cx.db.create_function( UTF8_CASE_INSENSITIVE_FN, 1, // n args diff --git a/src/engine/imap-db/imap-db-folder.vala b/src/engine/imap-db/imap-db-folder.vala index 2c2d4015b..465508080 100644 --- a/src/engine/imap-db/imap-db-folder.vala +++ b/src/engine/imap-db/imap-db-folder.vala @@ -1044,7 +1044,7 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { sql = new StringBuilder(); sql.append(""" DELETE FROM MessageSearchTable - WHERE docid IN ( + WHERE rowid IN ( """); sql.append(message_ids_sql_sublist.str); sql.append(")"); @@ -1712,6 +1712,7 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { string? from = email.from != null ? email.from.to_searchable_string() : null; string? cc = email.cc != null ? email.cc.to_searchable_string() : null; string? bcc = email.bcc != null ? email.bcc.to_searchable_string() : null; + string? flags = email.email_flags != null ? email.email_flags.serialise() : null; if (!Geary.String.is_empty(body) || !Geary.String.is_empty(attachments) || @@ -1719,12 +1720,13 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { !Geary.String.is_empty(from) || !Geary.String.is_empty(recipients) || !Geary.String.is_empty(cc) || - !Geary.String.is_empty(bcc)) { + !Geary.String.is_empty(bcc) || + !Geary.String.is_empty(flags)) { Db.Statement stmt = cx.prepare(""" INSERT INTO MessageSearchTable - (docid, body, attachment, subject, from_field, receivers, cc, bcc) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) + (rowid, body, attachments, subject, "from", receivers, cc, bcc, flags) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) """); stmt.bind_rowid(0, message_id); stmt.bind_string(1, body); @@ -1734,6 +1736,7 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { stmt.bind_string(5, recipients); stmt.bind_string(6, cc); stmt.bind_string(7, bcc); + stmt.bind_string(8, flags); stmt.exec_insert(cancellable); } @@ -1741,7 +1744,7 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { private static bool do_check_for_message_search_row(Db.Connection cx, int64 message_id, Cancellable? cancellable) throws Error { - Db.Statement stmt = cx.prepare("SELECT 'TRUE' FROM MessageSearchTable WHERE docid=?"); + Db.Statement stmt = cx.prepare("SELECT 'TRUE' FROM MessageSearchTable WHERE rowid=?"); stmt.bind_rowid(0, message_id); Db.Result result = stmt.exec(cancellable); @@ -1923,10 +1926,17 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { // TODO: Unroll loop private void do_set_email_flags(Db.Connection cx, Gee.Map map, Cancellable? cancellable) throws Error { - Db.Statement update_stmt = cx.prepare( - "UPDATE MessageTable SET flags=?, fields = fields | ? WHERE id=?"); + Db.Statement update_message = cx.prepare( + "UPDATE MessageTable SET flags = ?, fields = fields | ? WHERE id = ?" + ); + Db.Statement update_search = cx.prepare(""" + UPDATE MessageSearchTable SET flags = ? WHERE rowid = ? + """ + ); foreach (ImapDB.EmailIdentifier id in map.keys) { + // Find the email location + LocationIdentifier? location = do_get_location_for_id( cx, id, @@ -1940,6 +1950,8 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { ); } + // Update MessageTable + Geary.Imap.EmailFlags? flags = map.get(id) as Geary.Imap.EmailFlags; if (flags == null) { throw new EngineError.BAD_PARAMETERS( @@ -1947,12 +1959,18 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { ); } - update_stmt.reset(Db.ResetScope.CLEAR_BINDINGS); - update_stmt.bind_string(0, flags.message_flags.serialize()); - update_stmt.bind_int(1, Geary.Email.Field.FLAGS); - update_stmt.bind_rowid(2, id.message_id); + update_message.reset(Db.ResetScope.CLEAR_BINDINGS); + update_message.bind_string(0, flags.message_flags.serialize()); + update_message.bind_int(1, Geary.Email.Field.FLAGS); + update_message.bind_rowid(2, id.message_id); + update_message.exec(cancellable); + + // Update MessageSearchTable - update_stmt.exec(cancellable); + update_search.reset(Db.ResetScope.CLEAR_BINDINGS); + update_search.bind_string(0, flags.serialise()); + update_search.bind_rowid(1, id.message_id); + update_search.exec_insert(cancellable); } } @@ -2119,9 +2137,9 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { // existing data, then do a DELETE and INSERT. See Bug 772522. Db.Statement select = cx.prepare(""" - SELECT body, attachment, subject, from_field, receivers, cc, bcc + SELECT body, attachments, subject, "from", receivers, cc, bcc, flags FROM MessageSearchTable - WHERE docid=? + WHERE rowid=? """); select.bind_rowid(0, message_id); Db.Result row = select.exec(cancellable); @@ -2133,6 +2151,7 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { string? recipients = row.string_at(4); string? cc = row.string_at(5); string? bcc = row.string_at(6); + string? flags = row.string_at(7); if (new_fields.is_any_set(Geary.Email.REQUIRED_FOR_MESSAGE) && email.fields.is_all_set(Geary.Email.REQUIRED_FOR_MESSAGE)) { @@ -2165,16 +2184,22 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { bcc = email.bcc.to_searchable_string(); } + if (new_fields.is_any_set(Geary.Email.Field.FLAGS)) { + if (email.email_flags != null) { + flags = email.email_flags.serialise(); + } + } + Db.Statement del = cx.prepare( - "DELETE FROM MessageSearchTable WHERE docid=?" + "DELETE FROM MessageSearchTable WHERE rowid=?" ); del.bind_rowid(0, message_id); del.exec(cancellable); Db.Statement insert = cx.prepare(""" INSERT INTO MessageSearchTable - (docid, body, attachment, subject, from_field, receivers, cc, bcc) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) + (rowid, body, attachments, subject, "from", receivers, cc, bcc, flags) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) """); insert.bind_rowid(0, message_id); insert.bind_string(1, body); @@ -2184,6 +2209,7 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { insert.bind_string(5, recipients); insert.bind_string(6, cc); insert.bind_string(7, bcc); + insert.bind_string(8, flags); insert.exec_insert(cancellable); } @@ -2230,7 +2256,6 @@ private class Geary.ImapDB.Folder : BaseObject, Geary.ReferenceSemantics { ); post_fields |= Geary.Email.Field.FLAGS; - } } diff --git a/src/engine/imap-db/imap-db-fts5-matches.c b/src/engine/imap-db/imap-db-fts5-matches.c new file mode 100644 index 000000000..4dff4f32b --- /dev/null +++ b/src/engine/imap-db/imap-db-fts5-matches.c @@ -0,0 +1,167 @@ +/* + * Copyright (C) 2011 Nokia + * + * Author: Carlos Garnacho + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +/* + * Borrowed from the Tracker project (see: tracker-fts-tokenizer.c) + * and adapted for Geary by Michael Gratton . + */ + +#include +#include + +typedef struct { + int start; + int end; +} Offset; + + +static int +offsets_tokenizer_func (void *data, + int flags, + const char *token, + int n_token, + int start, + int end) +{ + GArray *offsets = data; + Offset offset = { 0 }; + offset.start = start; + offset.end = end; + g_array_append_val(offsets, offset); + return SQLITE_OK; +} + +static void +geary_matches (const Fts5ExtensionApi *api, + Fts5Context *fts_ctx, + sqlite3_context *ctx, + int n_args, + sqlite3_value **args) +{ + GString *str; + int rc, n_hits, i; + GArray *offsets = NULL; + gint cur_col = -1; + gboolean first = TRUE; + + if (n_args > 0) { + sqlite3_result_error(ctx, "Invalid argument count", -1); + return; + } + + rc = api->xInstCount(fts_ctx, &n_hits); + if (rc != SQLITE_OK) { + sqlite3_result_null(ctx); + return; + } + + str = g_string_new(NULL); + + for (i = 0; i < n_hits; i++) { + int phrase, col, n_token; + const char *text; + int length; + Offset offset; + + rc = api->xInst(fts_ctx, i, &phrase, &col, &n_token); + if (rc != SQLITE_OK) + break; + + if (first || cur_col != col) { + if (offsets) { + g_array_free(offsets, TRUE); + } + + rc = api->xColumnText(fts_ctx, col, &text, &length); + if (rc != SQLITE_OK) + break; + + offsets = g_array_new(FALSE, FALSE, sizeof(Offset)); + rc = api->xTokenize(fts_ctx, + text, + length, + offsets, + &offsets_tokenizer_func); + if (rc != SQLITE_OK) { + break; + } + + cur_col = col; + } + + first = FALSE; + + if (str->len != 0) { + g_string_append_c(str, ','); + } + + offset = g_array_index(offsets, Offset, n_token); + g_string_append_len(str, text + offset.start, offset.end - offset.start); + } + + if (offsets) { + g_array_free (offsets, TRUE); + } + + if (rc == SQLITE_OK) { + sqlite3_result_text (ctx, str->str, str->len, g_free); + g_string_free (str, FALSE); + } else { + sqlite3_result_error_code (ctx, rc); + g_string_free (str, TRUE); + } +} + +static fts5_api *get_fts5_api (sqlite3 *db) { + int rc = SQLITE_OK; + sqlite3_stmt *stmt; + fts5_api *api = NULL; + + rc = sqlite3_prepare_v2(db, "SELECT fts5(?1)", + -1, &stmt, 0); + if (rc != SQLITE_OK) { + return NULL; + } + + sqlite3_bind_pointer(stmt, 1, (void*) &api, "fts5_api_ptr", NULL); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + + return api; +} + +gboolean sqlite3_register_fts5_matches(sqlite3 *db) { + fts5_api *api; + int rc = SQLITE_OK; + + api = get_fts5_api(db); + if (!api) { + return FALSE; + } + + rc = api->xCreateFunction(api, + "geary_matches", + NULL, + &geary_matches, + NULL); + + return (rc == SQLITE_OK) ? TRUE : FALSE; +} diff --git a/src/engine/imap-db/imap-db-gc.vala b/src/engine/imap-db/imap-db-gc.vala index dd090ea8e..44d268a55 100644 --- a/src/engine/imap-db/imap-db-gc.vala +++ b/src/engine/imap-db/imap-db-gc.vala @@ -433,7 +433,7 @@ private class Geary.ImapDB.GC { stmt = cx.prepare(""" DELETE FROM MessageSearchTable - WHERE docid = ? + WHERE rowid = ? """); stmt.bind_rowid(0, message_id); diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala index 968ac6eac..78f4eda33 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -18,11 +18,11 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { private const unichar[] SEARCH_TERM_CONTINUATION_CHARS = { '-', '_', '.', '@' }; // Search operator field names, eg: "to:foo@example.com" or "is:unread" - private const string SEARCH_OP_ATTACHMENT = "attachment"; + private const string SEARCH_OP_ATTACHMENT = "attachments"; private const string SEARCH_OP_BCC = "bcc"; private const string SEARCH_OP_BODY = "body"; private const string SEARCH_OP_CC = "cc"; - private const string SEARCH_OP_FROM = "from_field"; + private const string SEARCH_OP_FROM = "\"from\""; private const string SEARCH_OP_IS = "is"; private const string SEARCH_OP_SUBJECT = "subject"; private const string SEARCH_OP_TO = "receivers"; diff --git a/src/engine/meson.build b/src/engine/meson.build index 992053eaa..1e9eecf17 100644 --- a/src/engine/meson.build +++ b/src/engine/meson.build @@ -176,6 +176,7 @@ engine_vala_sources = files( 'imap-db/imap-db-database.vala', 'imap-db/imap-db-email-identifier.vala', 'imap-db/imap-db-folder.vala', + 'imap-db/imap-db-fts5-matches.c', 'imap-db/imap-db-gc.vala', 'imap-db/imap-db-message-row.vala', 'imap-db/imap-db-search-query.vala', diff --git a/src/meson.build b/src/meson.build index 7cbfa80a8..1ce6681ca 100644 --- a/src/meson.build +++ b/src/meson.build @@ -49,8 +49,6 @@ geary_c_args = [ '-DGCK_API_SUBJECT_TO_CHANGE', '-DGCR_API_SUBJECT_TO_CHANGE', '-DGOA_API_IS_SUBJECT_TO_CHANGE', - '-DSQLITE_ENABLE_FTS4', - '-DSQLITE_ENABLE_FTS4_UNICODE61' ] subdir('engine') diff --git a/test/engine/imap-db/imap-db-database-test.vala b/test/engine/imap-db/imap-db-database-test.vala index 9d69ac19b..60c69ef4b 100644 --- a/test/engine/imap-db/imap-db-database-test.vala +++ b/test/engine/imap-db/imap-db-database-test.vala @@ -106,7 +106,7 @@ class Geary.ImapDB.DatabaseTest : TestCase { ); db.open.end(async_result()); - assert_equal(db.get_schema_version(), 29, "Post-upgrade version"); + assert_equal(db.get_schema_version(), 30, "Post-upgrade version"); // Since schema v22 deletes the re-creates all attachments, // attachment 12 should no longer exist on the file system and -- GitLab From 0112c8192cae57588747c51441e91f3e6d069697 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Mon, 16 Dec 2019 23:17:55 +1100 Subject: [PATCH 03/24] Geary.SearchQuery: Allow client apps to build search queries Adds classes that allow building arbitrary query expressions and require an instance to be provided to Geary.SearchQuery, to be set as a property. This enables query expressions to be parsed by clients instead of the engine, in which ever whay they choose. --- .../application/application-main-window.vala | 7 +- .../conversation-viewer.vala | 7 +- src/client/util/util-email.vala | 578 +++++++++++++++++- src/engine/api/geary-account.vala | 24 +- src/engine/api/geary-search-query.vala | 280 ++++++++- src/engine/imap-db/imap-db-search-query.vala | 6 +- .../imap-engine-generic-account.vala | 11 +- test/client/util/util-email-test.vala | 354 ++++++++++- test/mock/mock-account.vala | 12 +- test/mock/mock-search-query.vala | 5 +- 10 files changed, 1228 insertions(+), 56 deletions(-) diff --git a/src/client/application/application-main-window.vala b/src/client/application/application-main-window.vala index d5255a415..c4f934314 100644 --- a/src/client/application/application-main-window.vala +++ b/src/client/application/application-main-window.vala @@ -981,11 +981,14 @@ public class Application.MainWindow : this.previous_non_search_folder = this.selected_folder; } - var strategy = this.application.config.get_search_strategy(); try { + var expr_factory = new Util.Email.SearchExpressionFactory( + this.application.config.get_search_strategy(), + context.account.information + ); var query = yield context.account.new_search_query( + expr_factory.parse_query(query_text), query_text, - strategy, cancellable ); this.folder_list.set_search( diff --git a/src/client/conversation-viewer/conversation-viewer.vala b/src/client/conversation-viewer/conversation-viewer.vala index 74706f8ca..09d250202 100644 --- a/src/client/conversation-viewer/conversation-viewer.vala +++ b/src/client/conversation-viewer/conversation-viewer.vala @@ -429,10 +429,13 @@ public class ConversationViewer : Gtk.Stack, Geary.BaseInterface { // opening every message in the conversation as soon as // the user presses a key if (text.length >= 2) { - var strategy = this.config.get_search_strategy(); + var expr_factory = new Util.Email.SearchExpressionFactory( + this.config.get_search_strategy(), + account.information + ); query = yield account.new_search_query( + expr_factory.parse_query(text), text, - strategy, cancellable ); } diff --git a/src/client/util/util-email.vala b/src/client/util/util-email.vala index 4a1754a70..554dbe72b 100644 --- a/src/client/util/util-email.vala +++ b/src/client/util/util-email.vala @@ -1,7 +1,9 @@ -/* Copyright 2016 Software Freedom Conservancy Inc. +/* + * Copyright 2016 Software Freedom Conservancy Inc. + * Copyright 2019 Michael Gratton * * This software is licensed under the GNU Lesser General Public License - * (version 2.1 or later). See the COPYING file in this distribution. + * (version 2.1 or later). See the COPYING file in this distribution. */ namespace Util.Email { @@ -312,3 +314,575 @@ namespace Util.Email { } } + + +/** + * Parses a human-entered email query string as a query expression. + * + * @see Geary.SearchQuery.Term + */ +public class Util.Email.SearchExpressionFactory : Geary.BaseObject { + + + private const unichar OPERATOR_SEPARATOR = ':'; + private const string OPERATOR_TEMPLATE = "%s:%s"; + + + private delegate Geary.SearchQuery.Term? OperatorFactory( + string value, + bool is_quoted + ); + + + private class FactoryContext { + + + public unowned OperatorFactory factory; + + + public FactoryContext(OperatorFactory factory) { + this.factory = factory; + } + + } + + + private class Tokeniser { + + + // These characters are chosen for being commonly used to + // continue a single word (such as extended last names, + // i.e. "Lars-Eric") or in terms commonly searched for in an + // email client, i.e. unadorned mailbox addresses. Note that + // characters commonly used for wildcards or that would be + // interpreted as wildcards by SQLite are not included here. + private const unichar[] CONTINUATION_CHARS = { + '-', '_', '.', '@' + }; + + public bool has_next { + get { return (this.current_pos < this.query.length); } + } + + public bool is_at_word { + get { return (this.attrs[this.current_c].is_word_start == 1); } + } + + public bool is_at_quote { + get { return (this.c == '"'); } + } + + public unichar current_character { get { return this.c; } } + + + private string query; + private int current_pos = -1; + private int next_pos = 0; + + private unichar c = 0; + private int current_c = -1; + private Pango.LogAttr[] attrs; + + + public Tokeniser(string query, Pango.Language language) { + this.query = query; + + // Break up search string into individual words and/or + // operators. Can't simply break on space or non-alphanumeric + // chars since some languages don't use spaces, so use Pango + // for its support for the Unicode UAX #29 word boundary spec. + this.attrs = new Pango.LogAttr[query.char_count() + 1]; + Pango.get_log_attrs( + query, query.length, -1, language, this.attrs + ); + + consume_char(); + } + + public void consume_char() { + var current_pos = this.next_pos; + if (this.query.get_next_char(ref this.next_pos, out this.c)) { + this.current_c++; + } + this.current_pos = current_pos; + } + + public void skip_to_next() { + while (this.has_next && !this.is_at_quote && !this.is_at_word) { + consume_char(); + } + } + + public string consume_word() { + var start = this.current_pos; + // the attr.is_word_end value applies to the first char + // after then end of a word, so need to move one past the + // end of the current word to determine where it ends + consume_char(); + while (this.has_next && + (this.c in CONTINUATION_CHARS || + this.attrs[this.current_c].is_word_end != 1)) { + consume_char(); + } + return this.query.slice(start, this.current_pos); + } + + public string consume_quote() { + consume_char(); // skip the leading quote + var start = this.current_pos; + var last_c = this.c; + while (this.has_next && (this.c != '"' || last_c == '\\')) { + consume_char(); + } + var quote = this.query.slice(start, this.current_pos); + consume_char(); // skip the trailing quote + return quote; + } + + } + + + public Geary.SearchQuery.Strategy default_strategy { get; private set; } + + public Geary.AccountInformation account { get; private set; } + + public Pango.Language language { + get; set; default = Pango.Language.get_default(); + } + + // Maps of localised search operator names and values to their + // internal forms + private Gee.Map text_operators = + new Gee.HashMap(); + private Gee.Map boolean_operators = + new Gee.HashMap(); + private Gee.Set search_op_to_me = new Gee.HashSet(); + private Gee.Set search_op_from_me = new Gee.HashSet(); + + + public SearchExpressionFactory(Geary.SearchQuery.Strategy default_strategy, + Geary.AccountInformation account) { + this.default_strategy = default_strategy; + this.account = account; + construct_factories(); + } + + /** Constructs a search expression from the given query string. */ + public Gee.List parse_query(string query) { + var operands = new Gee.LinkedList(); + var tokens = new Tokeniser(query, this.language); + while (tokens.has_next) { + if (tokens.is_at_word) { + Geary.SearchQuery.Term? op = null; + var word = tokens.consume_word(); + if (tokens.current_character == OPERATOR_SEPARATOR && + tokens.has_next) { + op = new_extended_operator(word, tokens); + } + if (op == null) { + op = new_text_all_operator(word, false); + } + operands.add(op); + } else if (tokens.is_at_quote) { + operands.add( + new_text_all_operator(tokens.consume_quote(), true) + ); + } else { + tokens.skip_to_next(); + } + } + + return operands; + } + + private Geary.SearchQuery.Term? new_extended_operator(string name, + Tokeniser tokens) { + Geary.SearchQuery.Term? op = null; + + // consume the ':' + tokens.consume_char(); + + bool is_quoted = false; + string? value = null; + if (tokens.is_at_word) { + value = tokens.consume_word(); + } else if (tokens.is_at_quote) { + value = tokens.consume_quote(); + is_quoted = true; + } + + FactoryContext? context = null; + if (value != null) { + context = this.text_operators[name]; + if (context == null) { + context = this.boolean_operators[ + OPERATOR_TEMPLATE.printf(name, value) + ]; + } + } + + if (context != null) { + op = context.factory(value, is_quoted); + } + + if (op == null) { + // Still no operator, so the name or value must have been + // invalid. Repair by treating each as separate ops, if + // present. + var term = ( + value == null + ? "%s:".printf(name) + : "%s:%s".printf(name, value) + ); + op = new_text_all_operator(term, false); + } + + return op; + } + + private inline Geary.SearchQuery.Strategy get_matching_strategy(bool is_quoted) { + return ( + is_quoted + ? Geary.SearchQuery.Strategy.EXACT + : this.default_strategy + ); + } + + private Gee.List get_account_addresses() { + Gee.List? mailboxes = + this.account.sender_mailboxes; + var addresses = new Gee.LinkedList(); + if (mailboxes != null) { + foreach (var mailbox in mailboxes) { + addresses.add(mailbox.address); + } + } + return addresses; + } + + private void construct_factories() { + // Maps of possibly translated search operator names and values + // to English/internal names and values. We include the + // English version anyway so that when translations provide a + // localised version of the operator names but have not also + // translated the user manual, the English version in the + // manual still works. + + // Text operators + /////////////////////////////////////////////////////////// + + FactoryContext attachment_name = new FactoryContext( + this.new_text_attachment_name_operator + ); + this.text_operators.set("attachment", attachment_name); + /// Translators: Can be typed in the search box like + /// "attachment:file.txt" to find messages with attachments + /// with a particular name. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.text_operators.set(C_("Search operator", "attachment"), + attachment_name); + + FactoryContext bcc = new FactoryContext(this.new_text_bcc_operator); + this.text_operators.set("bcc", bcc); + /// Translators: Can be typed in the search box like + /// "bcc:johndoe@example.com" to find messages bcc'd to a + /// particular person. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.text_operators.set(C_("Search operator", "bcc"), bcc); + + FactoryContext body = new FactoryContext(this.new_text_body_operator); + this.text_operators.set("body", body); + /// Translators: Can be typed in the search box like + /// "body:word" to find "word" only if it occurs in the body + /// of a message. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.text_operators.set(C_("Search operator", "body"), body); + + FactoryContext cc = new FactoryContext(this.new_text_cc_operator); + this.text_operators.set("cc", cc); + /// Translators: Can be typed in the search box like + /// "cc:johndoe@example.com" to find messages cc'd to a + /// particular person. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.text_operators.set(C_("Search operator", "cc"), cc); + + FactoryContext from = new FactoryContext(this.new_text_from_operator); + this.text_operators.set("from", from); + /// Translators: Can be typed in the search box like + /// "from:johndoe@example.com" to find messages from a + /// particular sender. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.text_operators.set(C_("Search operator", "from"), from); + + FactoryContext subject = new FactoryContext( + this.new_text_subject_operator + ); + this.text_operators.set("subject", subject); + /// Translators: Can be typed in the search box like + /// "subject:word" to find "word" only if it occurs in the + /// subject of a message. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.text_operators.set(C_("Search operator", "subject"), subject); + + FactoryContext to = new FactoryContext(this.new_text_to_operator); + this.text_operators.set("to", to); + /// Translators: Can be typed in the search box like + /// "to:johndoe@example.com" to find messages received by a + /// particular person. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.text_operators.set(C_("Search operator", "to"), to); + + /// Translators: Can be typed in the search box after "to:", + /// "cc:" and "bcc:" e.g.: "to:me". Matches conversations that + /// are addressed to the user. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.search_op_to_me.add( + C_("Search operator value - mail addressed to the user", "me") + ); + this.search_op_to_me.add("me"); + + /// Translators: Can be typed in the search box after "from:" + /// i.e.: "from:me". Matches conversations were sent by the + /// user. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + this.search_op_from_me.add( + C_("Search operator value - mail sent by the user", "me") + ); + this.search_op_from_me.add("me"); + + // Boolean operators + /////////////////////////////////////////////////////////// + + /// Translators: Can be typed in the search box like + /// "is:unread" to find messages that are read, unread, or + /// starred. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + string bool_is_name = C_("Search operator", "is"); + + /// Translators: Can be typed in the search box after "is:" + /// i.e.: "is:unread". Matches conversations that are flagged + /// unread. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + string bool_is_unread_value = C_("'is:' search operator value", "unread"); + + /// Translators: Can be typed in the search box after "is:" + /// i.e.: "is:read". Matches conversations that are flagged as + /// read. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + string bool_is_read_value = C_("'is:' search operator value", "read"); + + /// Translators: Can be typed in the search box after "is:" + /// i.e.: "is:starred". Matches conversations that are flagged + /// as starred. + /// + /// The translated string must be a single word (use '-', '_' + /// or similar to combine words into one), should be short, + /// and also match the translation in "search.page" of the + /// Geary User Guide. + string bool_is_starred_value = C_("'is:' search operator value", "starred"); + + FactoryContext is_unread = new FactoryContext( + this.new_boolean_unread_operator + ); + this.boolean_operators.set("is:unread", is_unread); + this.boolean_operators.set( + OPERATOR_TEMPLATE.printf( + bool_is_name, bool_is_unread_value + ), is_unread + ); + + FactoryContext is_read = new FactoryContext( + this.new_boolean_read_operator + ); + this.boolean_operators.set("is:read", is_read); + this.boolean_operators.set( + OPERATOR_TEMPLATE.printf( + bool_is_name, bool_is_read_value + ), is_read + ); + + FactoryContext is_starred = new FactoryContext( + this.new_boolean_starred_operator + ); + this.boolean_operators.set("is:starred", is_starred); + this.boolean_operators.set( + OPERATOR_TEMPLATE.printf( + bool_is_name, bool_is_starred_value + ), is_starred + ); + } + + private Geary.SearchQuery.Term? new_text_all_operator( + string value, bool is_quoted + ) { + return new Geary.SearchQuery.EmailTextTerm( + ALL, get_matching_strategy(is_quoted), value + ); + } + + private Geary.SearchQuery.Term? new_text_attachment_name_operator( + string value, bool is_quoted + ) { + return new Geary.SearchQuery.EmailTextTerm( + ATTACHMENT_NAME, get_matching_strategy(is_quoted), value + ); + } + + private Geary.SearchQuery.Term? new_text_bcc_operator( + string value, bool is_quoted + ) { + Geary.SearchQuery.Term? op = null; + if (!is_quoted && value in this.search_op_to_me) { + op = new Geary.SearchQuery.EmailTextTerm.disjunction( + BCC, EXACT, get_account_addresses() + ); + } else { + op = new Geary.SearchQuery.EmailTextTerm( + BCC, EXACT, value + ); + } + return op; + } + + private Geary.SearchQuery.Term? new_text_body_operator( + string value, bool is_quoted + ) { + return new Geary.SearchQuery.EmailTextTerm( + BODY, get_matching_strategy(is_quoted), value + ); + } + + private Geary.SearchQuery.Term? new_text_cc_operator( + string value, bool is_quoted + ) { + Geary.SearchQuery.Term? op = null; + if (!is_quoted && value in this.search_op_to_me) { + op = new Geary.SearchQuery.EmailTextTerm.disjunction( + CC, EXACT, get_account_addresses() + ); + } else { + op = new Geary.SearchQuery.EmailTextTerm( + CC, get_matching_strategy(is_quoted), value + ); + } + return op; + } + + private Geary.SearchQuery.Term? new_text_from_operator( + string value, bool is_quoted + ) { + Geary.SearchQuery.Term? op = null; + if (!is_quoted && value in this.search_op_from_me) { + op = new Geary.SearchQuery.EmailTextTerm.disjunction( + FROM, EXACT, get_account_addresses() + ); + } else { + op = new Geary.SearchQuery.EmailTextTerm(FROM, EXACT, value); + } + return op; + } + + private Geary.SearchQuery.Term? new_text_subject_operator( + string value, bool is_quoted + ) { + return new Geary.SearchQuery.EmailTextTerm( + SUBJECT, get_matching_strategy(is_quoted), value + ); + } + + private Geary.SearchQuery.Term? new_text_to_operator( + string value, bool is_quoted + ) { + Geary.SearchQuery.Term? op = null; + if (!is_quoted && value in this.search_op_to_me) { + op = new Geary.SearchQuery.EmailTextTerm.disjunction( + TO, EXACT, get_account_addresses() + ); + } else { + op = new Geary.SearchQuery.EmailTextTerm( + TO, EXACT, value + ); + } + return op; + } + + private Geary.SearchQuery.Term? new_boolean_unread_operator( + string value, bool is_quoted + ) { + Geary.SearchQuery.Term? op = null; + if (!is_quoted) { + op = new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.UNREAD); + } + return op; + } + + private Geary.SearchQuery.Term? new_boolean_read_operator( + string value, bool is_quoted + ) { + Geary.SearchQuery.Term? op = null; + if (!is_quoted) { + op = new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.UNREAD); + op.is_negated = true; + } + return op; + } + + private Geary.SearchQuery.Term? new_boolean_starred_operator( + string value, bool is_quoted + ) { + Geary.SearchQuery.Term? op = null; + if (!is_quoted) { + op = new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.FLAGGED); + } + return op; + } + +} diff --git a/src/engine/api/geary-account.vala b/src/engine/api/geary-account.vala index d81661cae..6bba6b90c 100644 --- a/src/engine/api/geary-account.vala +++ b/src/engine/api/geary-account.vala @@ -512,23 +512,13 @@ public abstract class Geary.Account : BaseObject, Logging.Source { ) throws GLib.Error; /** - * Create a new {@link SearchQuery} for this {@link Account}. - * - * See {@link Geary.SearchQuery.Strategy} for more information about how its interpreted by the - * Engine. In particular, note that it's an advisory parameter only and may have no effect, - * especially on server searches. However, it may also have a dramatic effect on what search - * results are returned and so should be used with some caution. Whether this parameter is - * user-configurable, available through GSettings or another configuration mechanism, or simply - * baked into the caller's code is up to the caller. CONSERVATIVE is designed to be a good - * default. - * - * The resulting object can only be used with calls into this - * account instance. - */ - public abstract async SearchQuery new_search_query(string query, - SearchQuery.Strategy strategy, - GLib.Cancellable? cancellable) - throws GLib.Error; + * Create a new search query for this account. + */ + public abstract async SearchQuery new_search_query( + Gee.List expression, + string text, + GLib.Cancellable? cancellable + ) throws GLib.Error; /** * Performs a search with the given query. Optionally, a list of folders not to search diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala index 0a15ff2ea..84f686b91 100644 --- a/src/engine/api/geary-search-query.vala +++ b/src/engine/api/geary-search-query.vala @@ -1,6 +1,6 @@ /* - * Copyright 2016 Software Freedom Conservancy Inc. - * Copyright 2019 Michael Gratton + * Copyright © 2016 Software Freedom Conservancy Inc. + * Copyright © 2019-2020 Michael Gratton * * This software is licensed under the GNU Lesser General Public License * (version 2.1 or later). See the COPYING file in this distribution. @@ -13,6 +13,16 @@ * Account.new_search_query} and then passed to search methods on * {@link Account} or {@link App.SearchFolder}. * + * Actual search queries are specified by the given {@link + * expression}, which is a list of {@link Term}. The expression + * denotes the conjunction of all given terms, that is, each term is + * combined by a Boolean AND function. While the order of the terms is + * not important, the expression should attempt to reflect the + * free-text search query it was built from (if any). A more + * expressive language is not supported since it is designed to work + * with both the Engine's built-in full text search system as well as + * other server-based systems, including IMAP. + * * @see Account.new_search_query * @see Account.local_search_async * @see Account.get_search_matches_async @@ -54,36 +64,272 @@ public abstract class Geary.SearchQuery : BaseObject { /** * Search for all textual variants, i.e. "the sky's the limit." */ - HORIZON + HORIZON; + + + /** Determines if stemming may be used for an operator. */ + internal bool is_stemming_enabled() { + return this != EXACT; + } + + /** + * The minimum term length before stemming is allowed. + * + * This prevents short words that might be stemmed from being stemmed. + */ + internal int get_min_term_length_for_stemming() { + var min = 0; + switch (this) { + case EXACT: + min = int.MAX; + break; + case CONSERVATIVE: + min = 6; + break; + case AGGRESSIVE: + min = 4; + break; + case HORIZON: + min = 0; + break; + } + return min; + } + + /** + * Maximum difference in lengths between term and stemmed variant. + * + * This prevents long words from being stemmed to much shorter + * words (which creates opportunities for greedy matching). + */ + internal int get_max_difference_term_stem_lengths() { + var max = 0; + switch (this) { + case EXACT: + max = 0; + break; + case CONSERVATIVE: + max = 2; + break; + case AGGRESSIVE: + max = 4; + break; + case HORIZON: + max =int.MAX; + break; + } + return max; + } + + /** + * Maximum difference in lengths between a matched word and the stemmed variant it matched + * against. + * + * This prevents long words being matched to short stem + * variants (which creates opportunities for greedy matching). + */ + internal int get_max_difference_match_stem_lengths() { + var max = 0; + switch (this) { + case EXACT: + max = 0; + break; + case CONSERVATIVE: + max = 2; + break; + case AGGRESSIVE: + max = 3; + break; + case HORIZON: + max = int.MAX; + break; + } + return max; + } + } - /** The account that owns this query. */ - public Account owner { get; private set; } + /** + * Parent class for terms that make up a search query's expression. + * + * @see SearchQuery.expression + */ + public abstract class Term : BaseObject { + + /** Determines opposite of the term is matched. */ + public bool is_negated { get; set; default = false; } + + /** Returns a string representation, for debugging. */ + public abstract string to_string(); + + } /** - * The original search text. + * A term that matches text properties of an email. + */ + public class EmailTextTerm : Term { + + + /** + * Supported text email properties that can be queried. + * + * @see EmailTextTerm + */ + public enum Property { + /** Search for a term in all supported properties. */ + ALL, + + /** Search for a term in the To field. */ + TO, + + /** Search for a term in the Cc field. */ + CC, + + /** Search for a term in the Bcc field. */ + BCC, + + /** Search for a term in the From field. */ + FROM, + + /** Search for a term in the email subject. */ + SUBJECT, + + /** Search for a term in the email body. */ + BODY, + + /** Search for a term in email attachment names. */ + ATTACHMENT_NAME; + } + + + /** The email property this term applies to. */ + public Property target { get; private set; } + + /** The strategy used for matching the given terms. */ + public Strategy matching_strategy { get; private set; } + + /** + * The strings to match against the given target. + * + * If more than one term is given, they are treated as the + * disjunction of all, that is they are combined using the + * Boolean OR function. + */ + public Gee.List terms { + get; private set; default = new Gee.ArrayList(); + } + + + public EmailTextTerm(Property target, + Strategy matching_strategy, + string term) { + this.target = target; + this.matching_strategy = matching_strategy; + this.terms.add(term); + } + + public EmailTextTerm.disjunction(Property target, + Strategy matching_strategy, + Gee.List terms) { + this.target = target; + this.matching_strategy = matching_strategy; + this.terms.add_all(terms); + } + + public override string to_string() { + var builder = new GLib.StringBuilder(); + if (this.is_negated) { + builder.append_c('!'); + } + + builder.append( + ObjectUtils.to_enum_nick( + typeof(Property), this.target).up() + ); + builder.append_c(':'); + builder.append( + ObjectUtils.to_enum_nick( + typeof(Strategy), this.matching_strategy + ).up() + ); + builder.append_c('('); + + var iter = this.terms.iterator(); + if (iter.next()) { + builder.append(iter.get().to_string()); + } + while (iter.next()) { + builder.append_c(','); + builder.append(iter.get().to_string()); + } + builder.append_c(')'); + return builder.str; + } + + } + + + /** + * A term that matches a given flag in an email. + */ + public class EmailFlagTerm : Term { + + + public NamedFlag value { get; private set; } + + + public EmailFlagTerm(NamedFlag value) { + this.value = value; + } + + public override string to_string() { + return "%s(%s)".printf( + this.is_negated ? "!" : "", + this.value.to_string() + ); + } + + } + + + /** + * A read-only list of search terms to be evaluated. * - * This is used mostly for debugging. + * Each given term is used in a conjunction, that is combined + * using a Boolean `AND` operator. */ - public string raw { get; private set; } + public Gee.List expression { get; private set; } + private Gee.List _rw_expression = new Gee.ArrayList(); /** - * The selected {@link Strategy} quality. + * The original search text, if any. + * + * This is used mostly for debugging. */ - public Strategy strategy { get; private set; } + public string raw { get; private set; } - protected SearchQuery(Account owner, - string raw, - Strategy strategy) { - this.owner = owner; + protected SearchQuery(Gee.Collection expression, + string raw) { + this._rw_expression.add_all(expression); + this.expression = this._rw_expression.read_only_view; this.raw = raw; - this.strategy = strategy; } public string to_string() { - return "\"%s\" (%s)".printf(this.raw, this.strategy.to_string()); + var builder = new GLib.StringBuilder(); + builder.append_printf("\"%s\": ", this.raw); + + var iter = this.expression.iterator(); + if (iter.next()) { + builder.append(iter.get().to_string()); + } + while (iter.next()) { + builder.append_c(','); + builder.append(iter.get().to_string()); + } + return builder.str; } -} +} diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala index 78f4eda33..21dee990c 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -99,6 +99,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { } } + private Geary.SearchQuery.Strategy strategy; // Maps of localised search operator names and values to their // internal forms @@ -323,10 +324,11 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { public async SearchQuery(Geary.Account owner, ImapDB.Account local, - string query, + Gee.Collection expression, + string raw, Geary.SearchQuery.Strategy strategy, GLib.Cancellable? cancellable) { - base(owner, query, strategy); + base(expression, raw); this.account = local; this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer()); diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala index ef1ba7b4d..070246586 100644 --- a/src/engine/imap-engine/imap-engine-generic-account.vala +++ b/src/engine/imap-engine/imap-engine-generic-account.vala @@ -572,12 +572,13 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { } /** {@inheritDoc} */ - public override async SearchQuery new_search_query(string query, - SearchQuery.Strategy strategy, - GLib.Cancellable? cancellable) - throws GLib.Error { + public override async SearchQuery new_search_query( + Gee.List expression, + string text, + GLib.Cancellable? cancellable + ) throws GLib.Error { return yield new ImapDB.SearchQuery( - this, local, query, strategy, cancellable + this, this.local, expression, text, EXACT, cancellable ); } diff --git a/test/client/util/util-email-test.vala b/test/client/util/util-email-test.vala index fb3c365f5..01605480c 100644 --- a/test/client/util/util-email-test.vala +++ b/test/client/util/util-email-test.vala @@ -7,14 +7,45 @@ public class Util.Email.Test : TestCase { + + private Application.Configuration? config = null; + private Geary.AccountInformation? account = null; + + public Test() { - base("UtilEmailTest"); + base("Util.Email.Test"); add_test("null_originator", null_originator); add_test("from_originator", from_originator); add_test("sender_originator", sender_originator); add_test("reply_to_originator", reply_to_originator); add_test("reply_to_via_originator", reply_to_via_originator); add_test("plain_via_originator", plain_via_originator); + add_test("empty_search_query", empty_search_query); + add_test("plain_search_terms", plain_search_terms); + add_test("continuation_search_terms", continuation_search_terms); + add_test("i18n_search_terms", i18n_search_terms); + add_test("multiple_search_terms", multiple_search_terms); + add_test("quoted_search_terms", quoted_search_terms); + add_test("text_op_terms", text_op_terms); + add_test("text_op_single_me_terms", text_op_single_me_terms); + add_test("text_op_multiple_me_terms", text_op_multiple_me_terms); + add_test("boolean_op_terms", boolean_op_terms); + add_test("invalid_op_terms", invalid_op_terms); + } + + public override void set_up() { + this.config = new Application.Configuration(Application.Client.SCHEMA_ID); + this.account = new Geary.AccountInformation( + "test", + OTHER, + new Mock.CredentialsMediator(), + new Geary.RFC822.MailboxAddress("test", "test@example.com") + ); + } + + public override void tear_down() { + this.config = null; + this.account = null; } public void null_originator() throws GLib.Error { @@ -95,6 +126,327 @@ public class Util.Email.Test : TestCase { assert_equal(originator.address, "bot@example.com"); } + public void empty_search_query() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + assert_collection(test_article.parse_query("")).is_empty(); + } + + public void plain_search_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var simple1 = test_article.parse_query("hello"); + assert_collection(simple1).size(1); + assert_true(simple1[0] is Geary.SearchQuery.EmailTextTerm); + var text1 = simple1[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text1.target == ALL); + assert_true(text1.matching_strategy == CONSERVATIVE); + assert_collection(text1.terms).size(1).contains("hello"); + + var simple2 = test_article.parse_query("h"); + assert_collection(simple2).size(1); + assert_true(simple2[0] is Geary.SearchQuery.EmailTextTerm); + var text2 = simple2[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text2.terms).size(1).contains("h"); + + var simple3 = test_article.parse_query(" h"); + assert_collection(simple3).size(1); + assert_true(simple3[0] is Geary.SearchQuery.EmailTextTerm); + var text3 = simple3[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text3.terms).size(1).contains("h"); + + var simple4 = test_article.parse_query("h "); + assert_collection(simple4).size(1); + assert_true(simple4[0] is Geary.SearchQuery.EmailTextTerm); + var text4 = simple4[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text4.terms).size(1).contains("h"); + } + + public void continuation_search_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), + this.account + ); + + var simple1 = test_article.parse_query("hello-there"); + assert_collection(simple1).size(1); + assert_true(simple1[0] is Geary.SearchQuery.EmailTextTerm); + var text1 = simple1[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text1.terms).size(1).contains("hello-there"); + + var simple2 = test_article.parse_query("hello-"); + assert_collection(simple2).size(1); + assert_true(simple2[0] is Geary.SearchQuery.EmailTextTerm); + var text2 = simple2[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text2.terms).size(1).contains("hello-"); + + var simple3 = test_article.parse_query("test@example.com"); + assert_collection(simple2).size(1); + assert_true(simple3[0] is Geary.SearchQuery.EmailTextTerm); + var text3 = simple3[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text3.terms).size(1).contains("test@example.com"); + } + + public void i18n_search_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), + this.account + ); + test_article.language = Pango.Language.from_string("th"); + + var multiple = test_article.parse_query("ภาษาไทย"); + assert_collection(multiple).size(2); + assert_true(multiple[0] is Geary.SearchQuery.EmailTextTerm); + assert_true(multiple[1] is Geary.SearchQuery.EmailTextTerm); + assert_collection( + ((Geary.SearchQuery.EmailTextTerm) multiple[0]).terms + ).size(1).contains("ภาษา"); + assert_collection( + ((Geary.SearchQuery.EmailTextTerm) multiple[1]).terms + ).size(1).contains("ไทย"); + } + + public void multiple_search_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var multiple = test_article.parse_query("hello there"); + assert_collection(multiple).size(2); + assert_true(multiple[0] is Geary.SearchQuery.EmailTextTerm); + assert_true(multiple[1] is Geary.SearchQuery.EmailTextTerm); + assert_collection( + ((Geary.SearchQuery.EmailTextTerm) multiple[0]).terms + ).size(1).contains("hello"); + assert_collection( + ((Geary.SearchQuery.EmailTextTerm) multiple[1]).terms + ).size(1).contains("there"); + } + + public void quoted_search_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var simple1 = test_article.parse_query("\"hello\""); + assert_collection(simple1).size(1); + assert_true(simple1[0] is Geary.SearchQuery.EmailTextTerm); + var text1 = simple1[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text1.target == ALL); + assert_true(text1.matching_strategy == EXACT); + assert_collection(text1.terms).size(1).contains("hello"); + + var simple2 = test_article.parse_query("\"h\""); + assert_collection(simple2).size(1); + assert_true(simple2[0] is Geary.SearchQuery.EmailTextTerm); + var text2 = simple2[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text2.terms).size(1).contains("h"); + + var simple3 = test_article.parse_query(" \"h\""); + assert_collection(simple3).size(1); + assert_true(simple3[0] is Geary.SearchQuery.EmailTextTerm); + var text3 = simple3[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text3.terms).size(1).contains("h"); + + var simple4 = test_article.parse_query("\"h"); + assert_collection(simple4).size(1); + assert_true(simple4[0] is Geary.SearchQuery.EmailTextTerm); + var text4 = simple4[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text4.terms).size(1).contains("h"); + + var simple5 = test_article.parse_query("\"h\" "); + assert_collection(simple5).size(1); + assert_true(simple5[0] is Geary.SearchQuery.EmailTextTerm); + var text5 = simple5[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text5.terms).size(1).contains("h"); + + var simple6 = test_article.parse_query("\"hello there\""); + assert_collection(simple6).size(1); + assert_true(simple6[0] is Geary.SearchQuery.EmailTextTerm); + var text6 = simple6[0] as Geary.SearchQuery.EmailTextTerm; + assert_collection(text6.terms).size(1).contains("hello there"); + } + + public void text_op_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var simple_body = test_article.parse_query("body:hello"); + assert_collection(simple_body).size(1); + assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm); + var text_body = simple_body[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_body.target == BODY); + assert_true(text_body.matching_strategy == CONSERVATIVE); + assert_collection(text_body.terms).size(1).contains("hello"); + + var simple_body_quoted = test_article.parse_query("body:\"hello\""); + assert_collection(simple_body_quoted).size(1); + assert_true(simple_body_quoted[0] is Geary.SearchQuery.EmailTextTerm); + var text_body_quoted = simple_body_quoted[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_body_quoted.target == BODY); + assert_true(text_body_quoted.matching_strategy == EXACT); + assert_collection(text_body_quoted.terms).size(1).contains("hello"); + + var simple_attach_name = test_article.parse_query("attachment:hello"); + assert_collection(simple_attach_name).size(1); + assert_true(simple_attach_name[0] is Geary.SearchQuery.EmailTextTerm); + var text_attch_name = simple_attach_name[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_attch_name.target == ATTACHMENT_NAME); + + var simple_bcc = test_article.parse_query("bcc:hello"); + assert_collection(simple_bcc).size(1); + assert_true(simple_bcc[0] is Geary.SearchQuery.EmailTextTerm); + var text_bcc = simple_bcc[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_bcc.target == BCC); + + var simple_cc = test_article.parse_query("cc:hello"); + assert_collection(simple_cc).size(1); + assert_true(simple_cc[0] is Geary.SearchQuery.EmailTextTerm); + var text_cc = simple_cc[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_cc.target == CC); + + var simple_from = test_article.parse_query("from:hello"); + assert_collection(simple_from).size(1); + assert_true(simple_from[0] is Geary.SearchQuery.EmailTextTerm); + var text_from = simple_from[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_from.target == FROM); + + var simple_subject = test_article.parse_query("subject:hello"); + assert_collection(simple_subject).size(1); + assert_true(simple_subject[0] is Geary.SearchQuery.EmailTextTerm); + var text_subject = simple_subject[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_subject.target == SUBJECT); + + var simple_to = test_article.parse_query("to:hello"); + assert_collection(simple_to).size(1); + assert_true(simple_to[0] is Geary.SearchQuery.EmailTextTerm); + var text_to = simple_to[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_to.target == TO); + } + + public void text_op_single_me_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var simple_to = test_article.parse_query("to:me"); + assert_collection(simple_to).size(1); + assert_true(simple_to[0] is Geary.SearchQuery.EmailTextTerm); + var text_to = simple_to[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_to.target == TO); + assert_true(text_to.matching_strategy == EXACT); + assert_collection(text_to.terms).size(1).contains("test@example.com"); + + var simple_cc = test_article.parse_query("cc:me"); + assert_collection(simple_cc).size(1); + assert_true(simple_cc[0] is Geary.SearchQuery.EmailTextTerm); + var text_cc = simple_cc[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_cc.target == CC); + assert_true(text_cc.matching_strategy == EXACT); + assert_collection(text_cc.terms).size(1).contains("test@example.com"); + + var simple_bcc = test_article.parse_query("bcc:me"); + assert_collection(simple_bcc).size(1); + assert_true(simple_bcc[0] is Geary.SearchQuery.EmailTextTerm); + var text_bcc = simple_bcc[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_bcc.target == BCC); + assert_true(text_bcc.matching_strategy == EXACT); + assert_collection(text_bcc.terms).size(1).contains("test@example.com"); + + var simple_from = test_article.parse_query("from:me"); + assert_collection(simple_from).size(1); + assert_true(simple_from[0] is Geary.SearchQuery.EmailTextTerm); + var text_from = simple_from[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_from.target == FROM); + assert_true(text_from.matching_strategy == EXACT); + assert_collection(text_from.terms).size(1).contains("test@example.com"); + } + + public void text_op_multiple_me_terms() throws GLib.Error { + this.account.append_sender( + new Geary.RFC822.MailboxAddress("test2", "test2@example.com") + ); + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var to = test_article.parse_query("to:me"); + assert_collection(to).size(1); + assert_true(to[0] is Geary.SearchQuery.EmailTextTerm); + var text_to = to[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text_to.target == TO); + assert_true(text_to.matching_strategy == EXACT); + assert_collection(text_to.terms).size(2).contains( + "test@example.com" + ).contains( + "test@example.com" + ); + } + + public void boolean_op_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var simple_unread = test_article.parse_query("is:unread"); + assert_true(simple_unread[0] is Geary.SearchQuery.EmailFlagTerm); + var bool_unread = simple_unread[0] as Geary.SearchQuery.EmailFlagTerm; + assert_true( + bool_unread.value.equal_to(Geary.EmailFlags.UNREAD), "unread flag" + ); + assert_false(bool_unread.is_negated, "unread negated"); + + var simple_read = test_article.parse_query("is:read"); + assert_true(simple_read[0] is Geary.SearchQuery.EmailFlagTerm); + var bool_read = simple_read[0] as Geary.SearchQuery.EmailFlagTerm; + assert_true( + bool_read.value.equal_to(Geary.EmailFlags.UNREAD), "read flag" + ); + assert_true(bool_read.is_negated, "read negated"); + + var simple_starred = test_article.parse_query("is:starred"); + assert_true(simple_starred[0] is Geary.SearchQuery.EmailFlagTerm); + var bool_starred = simple_starred[0] as Geary.SearchQuery.EmailFlagTerm; + assert_true( + bool_starred.value.equal_to(Geary.EmailFlags.FLAGGED), "starred flag" + ); + assert_false(bool_starred.is_negated, "starred negated"); + } + + public void invalid_op_terms() throws GLib.Error { + var test_article = new SearchExpressionFactory( + this.config.get_search_strategy(), this.account + ); + + var simple1 = test_article.parse_query("body:"); + assert_collection(simple1).size(1); + assert_true(simple1[0] is Geary.SearchQuery.EmailTextTerm); + var text1 = simple1[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text1.target == ALL); + assert_true(text1.matching_strategy == CONSERVATIVE); + assert_collection(text1.terms).size(1).contains("body:"); + + var simple2 = test_article.parse_query("blarg:"); + assert_collection(simple2).size(1); + assert_true(simple2[0] is Geary.SearchQuery.EmailTextTerm); + var text2 = simple2[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text2.target == ALL); + assert_true(text2.matching_strategy == CONSERVATIVE); + assert_collection(text2.terms).size(1).contains("blarg:"); + + var simple3 = test_article.parse_query("blarg:hello"); + assert_collection(simple3).size(1); + assert_true(simple3[0] is Geary.SearchQuery.EmailTextTerm); + var text3 = simple3[0] as Geary.SearchQuery.EmailTextTerm; + assert_true(text3.target == ALL); + assert_true(text3.matching_strategy == CONSERVATIVE); + assert_collection(text3.terms).size(1).contains("blarg:hello"); + } + private Geary.Email new_email(Geary.RFC822.MailboxAddress? from, Geary.RFC822.MailboxAddress? sender, Geary.RFC822.MailboxAddress? reply_to) diff --git a/test/mock/mock-account.vala b/test/mock/mock-account.vala index 173b2ecae..2d08314d0 100644 --- a/test/mock/mock-account.vala +++ b/test/mock/mock-account.vala @@ -222,12 +222,12 @@ public class Mock.Account : Geary.Account, ); } - public override async Geary.SearchQuery - new_search_query(string raw, - Geary.SearchQuery.Strategy strategy, - GLib.Cancellable? cancellable) - throws GLib.Error { - return new SearchQuery(this, raw); + public override async Geary.SearchQuery new_search_query( + Gee.List expression, + string raw, + GLib.Cancellable? cancellable + ) throws GLib.Error { + return new SearchQuery(expression, raw); } public override async Gee.Collection? diff --git a/test/mock/mock-search-query.vala b/test/mock/mock-search-query.vala index 6653f96de..310cde7ef 100644 --- a/test/mock/mock-search-query.vala +++ b/test/mock/mock-search-query.vala @@ -7,8 +7,9 @@ public class Mock.SearchQuery : Geary.SearchQuery { - internal SearchQuery(Geary.Account owner, string raw) { - base(owner, raw, Geary.SearchQuery.Strategy.EXACT); + internal SearchQuery(Gee.List expression, + string raw) { + base(expression, raw); } } -- GitLab From e0396c322ed027afab2bccbd6aacee681a9b1a71 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Wed, 4 Nov 2020 19:01:48 +1100 Subject: [PATCH 04/24] Geary.ImapDb.SearchQuery: Require stemmer to be passed in to ctor Since constructing a libstemmer object is non trivial, this allows a per-account instance to be created just once, and improves testability. --- src/engine/imap-db/imap-db-search-query.vala | 50 ++----------------- .../imap-engine-generic-account.vala | 48 +++++++++++++++++- 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala index 21dee990c..9e2c38e97 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -319,18 +319,19 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { // A list of all search terms, regardless of search op field name private Gee.ArrayList all = new Gee.ArrayList(); - private SnowBall.Stemmer stemmer; + private unowned SnowBall.Stemmer stemmer; public async SearchQuery(Geary.Account owner, ImapDB.Account local, Gee.Collection expression, string raw, + SnowBall.Stemmer stemmer, Geary.SearchQuery.Strategy strategy, GLib.Cancellable? cancellable) { base(expression, raw); this.account = local; - this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer()); + this.stemmer = stemmer; switch (strategy) { case Strategy.EXACT: @@ -674,49 +675,4 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { return stemmed; } - private string find_appropriate_search_stemmer() { - // Unfortunately, the stemmer library only accepts the full language - // name for the stemming algorithm. This translates between the user's - // preferred language ISO 639-1 code and our available stemmers. - // FIXME: the available list here is determined by what's included in - // src/sqlite3-unicodesn/CMakeLists.txt. We should pass that list in - // instead of hardcoding it here. - foreach (string l in Intl.get_language_names()) { - switch (l) { - case "ar": return "arabic"; - case "eu": return "basque"; - case "ca": return "catalan"; - case "da": return "danish"; - case "nl": return "dutch"; - case "en": return "english"; - case "fi": return "finnish"; - case "fr": return "french"; - case "de": return "german"; - case "el": return "greek"; - case "hi": return "hindi"; - case "hu": return "hungarian"; - case "id": return "indonesian"; - case "ga": return "irish"; - case "it": return "italian"; - case "lt": return "lithuanian"; - case "ne": return "nepali"; - case "no": return "norwegian"; - case "pt": return "portuguese"; - case "ro": return "romanian"; - case "ru": return "russian"; - case "sr": return "serbian"; - case "es": return "spanish"; - case "sv": return "swedish"; - case "ta": return "tamil"; - case "tr": return "turkish"; - } - } - - // Default to English because it seems to be on average the language - // most likely to be present in emails, regardless of the user's - // language setting. This is not an exact science, and search results - // should be ok either way in most cases. - return "english"; - } - } diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala index 070246586..4282ca046 100644 --- a/src/engine/imap-engine/imap-engine-generic-account.vala +++ b/src/engine/imap-engine/imap-engine-generic-account.vala @@ -60,6 +60,8 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { private Gee.Map> special_search_names = new Gee.HashMap>(); + private SnowBall.Stemmer stemmer; + protected GenericAccount(AccountInformation config, ImapDB.Account local, @@ -107,6 +109,7 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { this.db_vacuum_monitor = local.vacuum_monitor; compile_special_search_names(); + this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer()); } /** {@inheritDoc} */ @@ -578,7 +581,7 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { GLib.Cancellable? cancellable ) throws GLib.Error { return yield new ImapDB.SearchQuery( - this, this.local, expression, text, EXACT, cancellable + this, this.local, expression, text, this.stemmer, EXACT, cancellable ); } @@ -1064,6 +1067,49 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { throw new EngineError.OPEN_REQUIRED("Account %s not opened", to_string()); } + private string find_appropriate_search_stemmer() { + // Unfortunately, the stemmer library only accepts the full + // language name for the stemming algorithm. This translates + // between the desktop sessions's preferred language ISO 639-1 + // code and the available stemmers. + // + // FIXME: the available list here is determined by what's + // included in libstemmer. We should pass that list in instead + // of hardcoding it here. + foreach (string l in Intl.get_language_names()) { + switch (l) { + case "ar": return "arabic"; + case "eu": return "basque"; + case "ca": return "catalan"; + case "da": return "danish"; + case "nl": return "dutch"; + case "en": return "english"; + case "fi": return "finnish"; + case "fr": return "french"; + case "de": return "german"; + case "el": return "greek"; + case "hi": return "hindi"; + case "hu": return "hungarian"; + case "id": return "indonesian"; + case "ga": return "irish"; + case "it": return "italian"; + case "lt": return "lithuanian"; + case "ne": return "nepali"; + case "no": return "norwegian"; + case "pt": return "portuguese"; + case "ro": return "romanian"; + case "ru": return "russian"; + case "sr": return "serbian"; + case "es": return "spanish"; + case "sv": return "swedish"; + case "ta": return "tamil"; + case "tr": return "turkish"; + } + } + + return "english"; + } + private void on_operation_error(AccountOperation op, Error error) { notify_service_problem(this.information.incoming, error); } -- GitLab From 6a614adf7367cbd2a37b415c5407e74e8563dc9c Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Wed, 4 Nov 2020 19:36:39 +1100 Subject: [PATCH 05/24] Geary.ImapDb.SearchQuery: Use expression to generate FTS5 queries Move SQL generation for FTS search from ImapDb.Account to SearchQuery. Convert to use Geary.SearchQuery.Term instances to generate SQL, rather than parsing the expression. Simplify the generated SQL substantially and generate MATCH values that work with SQLite FTS5. --- src/engine/imap-db/imap-db-account.vala | 358 ++------ src/engine/imap-db/imap-db-search-query.vala | 824 +++++------------- .../imap-engine-generic-account.vala | 4 +- .../imap-db/imap-db-search-query-test.vala | 199 +++++ test/meson.build | 1 + test/test-engine.vala | 1 + 6 files changed, 480 insertions(+), 907 deletions(-) create mode 100644 test/engine/imap-db/imap-db-search-query-test.vala diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index 7514aeba8..13134d3fd 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -543,36 +543,6 @@ private class Geary.ImapDB.Account : BaseObject { return (messages.size == 0 ? null : messages); } - private void sql_add_query_phrases(StringBuilder sql, Gee.HashMap query_phrases, - string operator, string columns, string condition) { - bool is_first_field = true; - foreach (string field in query_phrases.keys) { - if (!is_first_field) - sql.append_printf(""" - %s - SELECT %s - FROM MessageSearchTable - WHERE %s - MATCH ? - %s - """, operator, columns, field, condition); - else - sql.append_printf(" AND %s MATCH ?", field); - is_first_field = false; - } - } - - private int sql_bind_query_phrases(Db.Statement stmt, int start_index, - Gee.HashMap query_phrases) throws Geary.DatabaseError { - int i = start_index; - // This relies on the keys being returned in the same order every time - // from the same map. It might not be guaranteed, but I feel pretty - // confident it'll work unless you change the map in between. - foreach (string field in query_phrases.keys) - stmt.bind_string(i++, query_phrases.get(field)); - return i - start_index; - } - // Append each id in the collection to the StringBuilder, in a format // suitable for use in an SQL statement IN (...) clause. private void sql_append_ids(StringBuilder s, Gee.Iterable ids) { @@ -612,91 +582,24 @@ private class Geary.ImapDB.Account : BaseObject { Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error { - debug("Search terms, offset/limit: %s %d/%d", - q.to_string(), offset, limit); + debug("Search query: %s", q.to_string()); check_open(); ImapDB.SearchQuery query = check_search_query(q); - Gee.HashMap query_phrases = query.get_query_phrases(); - Gee.Map removal_conditions = query.get_removal_conditions(); - if (query_phrases.size == 0 && removal_conditions.is_empty) - return null; - - foreach (string? field in query.get_fields()) { - debug(" - Field \"%s\" terms:", field); - foreach (SearchQuery.Term? term in query.get_search_terms(field)) { - if (term != null) { - debug(" - \"%s\": %s, %s", - term.original, - term.parsed, - term.stemmed - ); - debug(" SQL terms:"); - foreach (string sql in term.sql) { - debug(" - \"%s\"", sql); - } - } - } - } - // Do this outside of transaction to catch invalid search ids up-front string? search_ids_sql = get_search_ids_sql(search_ids); - bool strip_greedy = query.should_strip_greedy_results(); Gee.List matching_ids = new Gee.LinkedList(); Gee.Map>? search_matches = null; - yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => { - string blacklisted_ids_sql = do_get_blacklisted_message_ids_sql( - folder_blacklist, cx, cancellable); - - // Every mutation of this query we could think of has been tried, - // and this version was found to minimize running time. We - // discovered that just doing a JOIN between the MessageTable and - // MessageSearchTable was causing a full table scan to order the - // results. When it's written this way, and we force SQLite to use - // the correct index (not sure why it can't figure it out on its - // own), it cuts the running time roughly in half of how it was - // before. The short version is: modify with extreme caution. See - // . - StringBuilder sql = new StringBuilder(); - sql.append(""" - SELECT id - FROM MessageTable - INDEXED BY MessageTableInternalDateTimeTIndex - """); - if (query_phrases.size != 0) { - sql.append(""" - WHERE id IN ( - SELECT rowid - FROM MessageSearchTable - WHERE 1=1 - """); - sql_add_query_phrases(sql, query_phrases, "INTERSECT", "rowid", ""); - sql.append(")"); - } else - sql.append(" WHERE 1=1"); - - if (blacklisted_ids_sql != "") - sql.append(" AND id NOT IN (%s)".printf(blacklisted_ids_sql)); - if (!Geary.String.is_empty(search_ids_sql)) - sql.append(" AND id IN (%s)".printf(search_ids_sql)); - sql.append(" ORDER BY internaldate_time_t DESC"); - if (limit > 0) - sql.append(" LIMIT ? OFFSET ?"); - - Db.Statement stmt = cx.prepare(sql.str); - int bind_index = sql_bind_query_phrases(stmt, 0, query_phrases); - if (limit > 0) { - stmt.bind_int(bind_index++, limit); - stmt.bind_int(bind_index++, offset); - } - - Gee.HashMap id_map = new Gee.HashMap( + yield db.exec_transaction_async(RO, (cx) => { + var id_map = new Gee.HashMap( Collection.int64_hash_func, Collection.int64_equal_func); - + Db.Statement stmt = query.get_search_query( + cx, search_ids_sql, folder_blacklist, limit, offset, cancellable + ); Db.Result result = stmt.exec(cancellable); while (!result.finished) { int64 message_id = result.int64_at(0); @@ -707,8 +610,7 @@ private class Geary.ImapDB.Account : BaseObject { result.next(cancellable); } - - if (strip_greedy && !id_map.is_empty) { + if (query.has_stemmed_terms && !id_map.is_empty) { search_matches = do_get_search_matches( cx, query, id_map, cancellable ); @@ -719,13 +621,7 @@ private class Geary.ImapDB.Account : BaseObject { debug("Matching emails found: %d", matching_ids.size); - if (!removal_conditions.is_empty) { - yield strip_removal_conditions( - query, matching_ids, removal_conditions, cancellable - ); - } - - if (strip_greedy && search_matches != null) { + if (query.has_stemmed_terms && search_matches != null) { strip_greedy_results(query, matching_ids, search_matches); } @@ -733,40 +629,6 @@ private class Geary.ImapDB.Account : BaseObject { return matching_ids.is_empty ? null : matching_ids; } - // Strip out from the given collection any email that matches the - // given removal conditions - private async void strip_removal_conditions(ImapDB.SearchQuery query, - Gee.Collection matches, - Gee.Map conditions, - GLib.Cancellable? cancellable = null) - throws GLib.Error { - Email.Field required_fields = Geary.Email.Field.FLAGS; - Gee.Iterator iter = matches.iterator(); - - yield db.exec_transaction_async(RO, (cx) => { - while (iter.next()) { - ImapDB.EmailIdentifier id = iter.get(); - MessageRow row = Geary.ImapDB.Folder.do_fetch_message_row( - cx, id.message_id, required_fields, null, cancellable - ); - Geary.EmailFlags? flags = row.get_generic_email_flags(); - if (flags != null) { - foreach (Gee.Map.Entry condition - in conditions.entries) { - if (flags.contains(condition.key) == condition.value) { - iter.remove(); - break; - } - } - } else { - iter.remove(); - } - } - return Db.TransactionOutcome.DONE; - }, cancellable - ); - } - // Strip out from the given collection of matching ids and results // for any search results that only contain a hit due to "greedy" // matching of the stemmed variants on all search terms. @@ -774,46 +636,46 @@ private class Geary.ImapDB.Account : BaseObject { Gee.Collection matches, Gee.Map> results) { int prestripped_results = matches.size; - Gee.Iterator iter = matches.iterator(); - while (iter.next()) { - // For each matched string in this message, retain the message in the search results - // if it prefix-matches any of the straight-up parsed terms or matches a stemmed - // variant (with only max. difference in their lengths allowed, i.e. not a "greedy" - // match) - EmailIdentifier id = iter.get(); - bool good_match_found = false; - Gee.Set? result = results.get(id); - if (result != null) { - foreach (string match in result) { - foreach (SearchQuery.Term term in query.get_all_terms()) { - // if prefix-matches parsed term, then don't strip - if (match.has_prefix(term.parsed)) { - good_match_found = true; - break; - } - - // if prefix-matches stemmed term w/o doing so - // greedily, then don't strip - if (term.stemmed != null && match.has_prefix(term.stemmed)) { - int diff = match.length - term.stemmed.length; - if (diff <= query.max_difference_match_stem_lengths) { - good_match_found = true; - break; - } - } - } - } - - if (good_match_found) { - break; - } - } - - if (!good_match_found) { - iter.remove(); - matches.remove(id); - } - } + // Gee.Iterator iter = matches.iterator(); + // while (iter.next()) { + // // For each matched string in this message, retain the message in the search results + // // if it prefix-matches any of the straight-up parsed terms or matches a stemmed + // // variant (with only max. difference in their lengths allowed, i.e. not a "greedy" + // // match) + // EmailIdentifier id = iter.get(); + // bool good_match_found = false; + // Gee.Set? result = results.get(id); + // if (result != null) { + // foreach (string match in result) { + // foreach (SearchQuery.Term term in query.get_all_terms()) { + // // if prefix-matches parsed term, then don't strip + // if (match.has_prefix(term.parsed)) { + // good_match_found = true; + // break; + // } + + // // if prefix-matches stemmed term w/o doing so + // // greedily, then don't strip + // if (term.stemmed != null && match.has_prefix(term.stemmed)) { + // int diff = match.length - term.stemmed.length; + // if (diff <= query.max_difference_match_stem_lengths) { + // good_match_found = true; + // break; + // } + // } + // } + // } + + // if (good_match_found) { + // break; + // } + // } + + // if (!good_match_found) { + // iter.remove(); + // matches.remove(id); + // } + // } debug("Stripped %d emails from search for [%s] due to greedy stem matching", prestripped_results - matches.size, query.raw); @@ -831,12 +693,15 @@ private class Geary.ImapDB.Account : BaseObject { foreach (ImapDB.EmailIdentifier id in ids) id_map.set(id.message_id, id); - Gee.Map>? match_map = - do_get_search_matches(cx, query, id_map, cancellable); - if (match_map == null || match_map.size == 0) + Gee.Map>? match_map = null; + if (!id_map.is_empty) { + match_map = do_get_search_matches(cx, query, id_map, cancellable); + } + if (match_map == null || match_map.size == 0) { return Db.TransactionOutcome.DONE; + } - if (query.should_strip_greedy_results()) { + if (query.has_stemmed_terms) { strip_greedy_results(query, ids, match_map); } @@ -1219,68 +1084,6 @@ private class Geary.ImapDB.Account : BaseObject { return !result.finished; } - // Turn the collection of folder paths into actual folder ids. As a - // special case, if "folderless" or orphan emails are to be blacklisted, - // set the out bool to true. - private Gee.Collection do_get_blacklisted_folder_ids(Gee.Collection? folder_blacklist, - Db.Connection cx, out bool blacklist_folderless, Cancellable? cancellable) throws Error { - blacklist_folderless = false; - Gee.ArrayList ids = new Gee.ArrayList(); - - if (folder_blacklist != null) { - foreach (Geary.FolderPath? folder_path in folder_blacklist) { - if (folder_path == null) { - blacklist_folderless = true; - } else { - int64 id; - do_fetch_folder_id(cx, folder_path, true, out id, cancellable); - if (id != Db.INVALID_ROWID) - ids.add(id); - } - } - } - - return ids; - } - - // Return a parameterless SQL statement that selects any message ids that - // are in a blacklisted folder. This is used as a sub-select for the - // search query to omit results from blacklisted folders. - private string do_get_blacklisted_message_ids_sql(Gee.Collection? folder_blacklist, - Db.Connection cx, Cancellable? cancellable) throws Error { - bool blacklist_folderless; - Gee.Collection blacklisted_ids = do_get_blacklisted_folder_ids( - folder_blacklist, cx, out blacklist_folderless, cancellable); - - StringBuilder sql = new StringBuilder(); - if (blacklisted_ids.size > 0) { - sql.append(""" - SELECT message_id - FROM MessageLocationTable - WHERE remove_marker = 0 - AND folder_id IN ( - """); - sql_append_ids(sql, blacklisted_ids); - sql.append(")"); - - if (blacklist_folderless) - sql.append(" UNION "); - } - if (blacklist_folderless) { - sql.append(""" - SELECT id - FROM MessageTable - WHERE id NOT IN ( - SELECT message_id - FROM MessageLocationTable - WHERE remove_marker = 0 - ) - """); - } - - return sql.str; - } - // For a message row id, return a set of all folders it's in, or null if // it's not in any folders. private Gee.Set? @@ -1406,42 +1209,22 @@ private class Geary.ImapDB.Account : BaseObject { // Not using a MultiMap because when traversing want to process all values at once per iteration, // not per key-value - public Gee.Map>? do_get_search_matches(Db.Connection cx, - ImapDB.SearchQuery query, Gee.Map id_map, Cancellable? cancellable) - throws Error { - if (id_map.size == 0) - return null; - - Gee.HashMap query_phrases = query.get_query_phrases(); - if (query_phrases.size == 0) - return null; - - StringBuilder sql = new StringBuilder(); - sql.append(""" - SELECT rowid, geary_matches(MessageSearchTable), * - FROM MessageSearchTable - WHERE rowid IN ( - """); - sql_append_ids(sql, id_map.keys); - sql.append(")"); - - StringBuilder condition = new StringBuilder("AND rowid IN ("); - sql_append_ids(condition, id_map.keys); - condition.append(")"); - sql_add_query_phrases(sql, query_phrases, "UNION", "rowid, geary_matches(MessageSearchTable), *", - condition.str); - - Db.Statement stmt = cx.prepare(sql.str); - sql_bind_query_phrases(stmt, 0, query_phrases); - - var search_matches = - new Gee.HashMap>(); + public Gee.Map> do_get_search_matches( + Db.Connection cx, + ImapDB.SearchQuery query, + Gee.Map id_map, + GLib.Cancellable? cancellable + ) throws GLib.Error { + var search_ids_sql = new GLib.StringBuilder(); + sql_append_ids(search_ids_sql, id_map.keys); - Db.Result result = stmt.exec(cancellable); + var search_matches = new Gee.HashMap>(); + Db.Result result = query.get_match_query( + cx, search_ids_sql.str + ).exec(cancellable); while (!result.finished) { - int64 rowid = result.rowid_at(0); - assert(id_map.has_key(rowid)); - ImapDB.EmailIdentifier id = id_map.get(rowid); + int64 docid = result.rowid_at(0); + ImapDB.EmailIdentifier id = id_map.get(docid); // XXX Avoid a crash when "database disk image is // malformed" error occurs. Remove this when the SQLite @@ -1502,8 +1285,9 @@ private class Geary.ImapDB.Account : BaseObject { private ImapDB.SearchQuery check_search_query(Geary.SearchQuery q) throws Error { ImapDB.SearchQuery? query = q as ImapDB.SearchQuery; - if (query == null || query.account != this) + if (query == null) { throw new EngineError.BAD_PARAMETERS("Geary.SearchQuery not associated with %s", name); + } return query; } diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala index 9e2c38e97..e03e2d0ff 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -1,6 +1,6 @@ /* - * Copyright 2016 Software Freedom Conservancy Inc. - * Copyright 2019 Michael Gratton . + * Copyright © 2016 Software Freedom Conservancy Inc. + * Copyright © 2019-2020 Michael Gratton . * * This software is licensed under the GNU Lesser General Public License * (version 2.1 or later). See the COPYING file in this distribution. @@ -11,619 +11,104 @@ */ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { - // These characters are chosen for being commonly used to continue a single word (such as - // extended last names, i.e. "Lars-Eric") or in terms commonly searched for in an email client, - // i.e. unadorned mailbox addresses. Note that characters commonly used for wildcards or that - // would be interpreted as wildcards by SQLite are not included here. - private const unichar[] SEARCH_TERM_CONTINUATION_CHARS = { '-', '_', '.', '@' }; - // Search operator field names, eg: "to:foo@example.com" or "is:unread" - private const string SEARCH_OP_ATTACHMENT = "attachments"; - private const string SEARCH_OP_BCC = "bcc"; - private const string SEARCH_OP_BODY = "body"; - private const string SEARCH_OP_CC = "cc"; - private const string SEARCH_OP_FROM = "\"from\""; - private const string SEARCH_OP_IS = "is"; - private const string SEARCH_OP_SUBJECT = "subject"; - private const string SEARCH_OP_TO = "receivers"; + private const string EMAIL_TEXT_STEMMED_TERMS = "geary-stemmed-terms"; - // Operators allowing finding mail addressed to "me" - private const string[] SEARCH_OP_TO_ME_FIELDS = { - SEARCH_OP_BCC, - SEARCH_OP_CC, - SEARCH_OP_TO, - }; - // The addressable op value for "me" - private const string SEARCH_OP_ADDRESSABLE_VALUE_ME = "me"; - - // Search operator field values - private const string SEARCH_OP_VALUE_READ = "read"; - private const string SEARCH_OP_VALUE_STARRED = "starred"; - private const string SEARCH_OP_VALUE_UNREAD = "unread"; - - - /** - * Various associated state with a single term in a search query. - */ - internal class Term : GLib.Object { - - /** - * The original tokenized search term with minimal other processing performed. - * - * For example, punctuation might be removed, but no casefolding has occurred. - */ - public string original { get; private set; } - - /** - * The parsed tokenized search term. - * - * Casefolding and other normalizing text operations have been performed. - */ - public string parsed { get; private set; } - - /** - * The stemmed search term. - * - * Only used if stemming is being done ''and'' the stem is different than the {@link parsed} - * term. - */ - public string? stemmed { get; private set; } - - /** - * A list of terms ready for binding to an SQLite statement. - * - * This should include prefix operators and quotes (i.e. ["party"] or [party*]). These texts - * are guaranteed not to be null or empty strings. - */ - public Gee.List sql { get; private set; default = new Gee.ArrayList(); } - - /** - * Returns true if the {@link parsed} term is exact-match only (i.e. starts with quotes) and - * there is no {@link stemmed} variant. - */ - public bool is_exact { get { return parsed.has_prefix("\"") && stemmed == null; } } - - public Term(string original, string parsed, string? stemmed, string? sql_parsed, string? sql_stemmed) { - this.original = original; - this.parsed = parsed; - this.stemmed = stemmed; - - // for now, only two variations: the parsed string and the stemmed; since stem is usually - // shorter (and will be first in the OR statement), include it first - if (!String.is_empty(sql_stemmed)) - sql.add(sql_stemmed); - - if (!String.is_empty(sql_parsed)) - sql.add(sql_parsed); - } - } - - private Geary.SearchQuery.Strategy strategy; - - // Maps of localised search operator names and values to their - // internal forms - private static Gee.HashMap search_op_names = - new Gee.HashMap(); - private static Gee.ArrayList search_op_to_me_values = - new Gee.ArrayList(); - private static Gee.ArrayList search_op_from_me_values = - new Gee.ArrayList(); - private static Gee.HashMap search_op_is_values = - new Gee.HashMap(); - - - static construct { - // Map of possibly translated search operator names and values - // to English/internal names and values. We include the - // English version anyway so that when translations provide a - // localised version of the operator names but have not also - // translated the user manual, the English version in the - // manual still works. - - // Can be typed in the search box like "attachment:file.txt" - // to find messages with attachments with a particular name. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_names.set(C_("Search operator", "attachment"), SEARCH_OP_ATTACHMENT); - // Can be typed in the search box like - // "bcc:johndoe@example.com" to find messages bcc'd to a - // particular person. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_names.set(C_("Search operator", "bcc"), SEARCH_OP_BCC); - // Can be typed in the search box like "body:word" to find - // "word" only if it occurs in the body of a message. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_names.set(C_("Search operator", "body"), SEARCH_OP_BODY); - // Can be typed in the search box like - // "cc:johndoe@example.com" to find messages cc'd to a - // particular person. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_names.set(C_("Search operator", "cc"), SEARCH_OP_CC); - // Can be typed in the search box like - // "from:johndoe@example.com" to find messages from a - // particular sender. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_names.set(C_("Search operator", "from"), SEARCH_OP_FROM); - // Can be typed in the search box like "is:unread" to find - // messages that are read, unread, or starred. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_names.set(C_("Search operator", "is"), SEARCH_OP_IS); - // Can be typed in the search box like "subject:word" to find - // "word" only if it occurs in the subject of a message. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary - // User Guide. - search_op_names.set(C_("Search operator", "subject"), SEARCH_OP_SUBJECT); - // Can be typed in the search box like - // "to:johndoe@example.com" to find messages received by a - // particular person. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_names.set(C_("Search operator", "to"), SEARCH_OP_TO); - - // And the English language versions - search_op_names.set("attachment", SEARCH_OP_ATTACHMENT); - search_op_names.set("bcc", SEARCH_OP_BCC); - search_op_names.set("body", SEARCH_OP_BODY); - search_op_names.set("cc", SEARCH_OP_CC); - search_op_names.set("from", SEARCH_OP_FROM); - search_op_names.set("is", SEARCH_OP_IS); - search_op_names.set("subject", SEARCH_OP_SUBJECT); - search_op_names.set("to", SEARCH_OP_TO); - - // Can be typed in the search box after "to:", "cc:" and - // "bcc:" e.g.: "to:me". Matches conversations that are - // addressed to the user. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_to_me_values.add( - C_("Search operator value - mail addressed to the user", "me") - ); - search_op_to_me_values.add(SEARCH_OP_ADDRESSABLE_VALUE_ME); - - // Can be typed in the search box after "from:" i.e.: - // "from:me". Matches conversations were sent by the user. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_from_me_values.add( - C_("Search operator value - mail sent by the user", "me") - ); - search_op_from_me_values.add(SEARCH_OP_ADDRESSABLE_VALUE_ME); - - // Can be typed in the search box after "is:" i.e.: - // "is:read". Matches conversations that are flagged as read. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_is_values.set( - C_("'is:' search operator value", "read"), SEARCH_OP_VALUE_READ - ); - // Can be typed in the search box after "is:" i.e.: - // "is:starred". Matches conversations that are flagged as - // starred. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_is_values.set( - C_("'is:' search operator value", "starred"), SEARCH_OP_VALUE_STARRED - ); - // Can be typed in the search box after "is:" i.e.: - // "is:unread". Matches conversations that are flagged unread. - // - // The translated string must be a single word (use '-', '_' - // or similar to combine words into one), should be short, and - // also match the translation in "search.page" of the Geary User - // Guide. - search_op_is_values.set( - C_("'is:' search operator value", "unread"), SEARCH_OP_VALUE_UNREAD - ); - search_op_is_values.set(SEARCH_OP_VALUE_READ, SEARCH_OP_VALUE_READ); - search_op_is_values.set(SEARCH_OP_VALUE_STARRED, SEARCH_OP_VALUE_STARRED); - search_op_is_values.set(SEARCH_OP_VALUE_UNREAD, SEARCH_OP_VALUE_UNREAD); - } - - - /** - * Associated {@link ImapDB.Account}. - */ - public weak ImapDB.Account account { get; private set; } - - /** - * Returns whether stemming may be used when exerting the search. - * - * Determined by {@link Geary.SearchQuery.Strategy} passed to the - * constructor. - */ - public bool allow_stemming { get; private set; } - - /** - * Minimum length of the term before stemming is allowed. - * - * This prevents short words that might be stemmed from being stemmed. - * - * Overridden by {@link allow_stemming}. Determined by the {@link - * Geary.SearchQuery.Strategy} passed to the constructor. - */ - public int min_term_length_for_stemming { get; private set; } - - - /** - * Maximum difference in lengths between term and stemmed variant. - * - * This prevents long words from being stemmed to much shorter - * words (which creates opportunities for greedy matching). - * - * Overridden by {@link allow_stemming}. Determined by the {@link - * Geary.SearchQuery.Strategy} passed to the constructor. - */ - public int max_difference_term_stem_lengths { get; private set; } - - /** - * Maximum difference in lengths between a matched word and the stemmed variant it matched - * against. - * - * This prevents long words being matched to short stem variants (which creates opportunities - * for greedy matching). - * - * Overridden by {@link allow_stemming}. Determined by the {@link - * Geary.SearchQuery.Strategy} passed to the constructor. - */ - public int max_difference_match_stem_lengths { get; private set; } - - // Maps search operator field names such as "to", "cc", "is" to - // their search term values. Note that terms without an operator - // are stored with null as the key. Not using a MultiMap because - // we (might) need a guarantee of order. - private Gee.HashMap> field_map - = new Gee.HashMap>(); - - // A list of all search terms, regardless of search op field name - private Gee.ArrayList all = new Gee.ArrayList(); + internal bool has_stemmed_terms { get; private set; default = false; } private unowned SnowBall.Stemmer stemmer; - public async SearchQuery(Geary.Account owner, - ImapDB.Account local, - Gee.Collection expression, - string raw, - SnowBall.Stemmer stemmer, - Geary.SearchQuery.Strategy strategy, - GLib.Cancellable? cancellable) { + public SearchQuery(Gee.List expression, + string raw, + SnowBall.Stemmer stemmer) { base(expression, raw); - this.account = local; this.stemmer = stemmer; - switch (strategy) { - case Strategy.EXACT: - allow_stemming = false; - min_term_length_for_stemming = int.MAX; - max_difference_term_stem_lengths = 0; - max_difference_match_stem_lengths = 0; - break; - - case Strategy.CONSERVATIVE: - allow_stemming = true; - min_term_length_for_stemming = 6; - max_difference_term_stem_lengths = 2; - max_difference_match_stem_lengths = 2; - break; - - case Strategy.AGGRESSIVE: - allow_stemming = true; - min_term_length_for_stemming = 4; - max_difference_term_stem_lengths = 4; - max_difference_match_stem_lengths = 3; - break; - - case Strategy.HORIZON: - allow_stemming = true; - min_term_length_for_stemming = 0; - max_difference_term_stem_lengths = int.MAX; - max_difference_match_stem_lengths = int.MAX; - break; - } - - yield prepare(cancellable); - } - - public Gee.Collection get_fields() { - return field_map.keys; - } - - public Gee.List? get_search_terms(string? field) { - return field_map.has_key(field) ? field_map.get(field) : null; - } - - public Gee.List? get_all_terms() { - return all; - } - - // For some searches, results are stripped if they're too - // "greedy", but this requires examining the matched text, which - // has an expense to fetch, so avoid doing so unless necessary - internal bool should_strip_greedy_results() { - // HORIZON strategy is configured in such a way to allow all - // stemmed variants to match, so don't do any stripping in - // that case - // - // If any of the search terms is exact-match (no prefix - // matching) or none have stemmed variants, then don't do - // stripping of "greedy" stemmed matching (because in both - // cases, there are none) - - bool strip_results = true; - if (this.strategy == Geary.SearchQuery.Strategy.HORIZON) - strip_results = false; - else if (traverse(this.all).any( - term => term.stemmed == null || term.is_exact)) { - strip_results = false; - } - return strip_results; - } - - internal Gee.Map get_removal_conditions() { - Gee.Map conditions = - new Gee.HashMap(); - foreach (string? field in this.field_map.keys) { - if (field == SEARCH_OP_IS) { - Gee.List? terms = get_search_terms(field); - foreach (Term term in terms) - if (term.parsed == SEARCH_OP_VALUE_READ) - conditions.set(new NamedFlag("UNREAD"), true); - else if (term.parsed == SEARCH_OP_VALUE_UNREAD) - conditions.set(new NamedFlag("UNREAD"), false); - else if (term.parsed == SEARCH_OP_VALUE_STARRED) - conditions.set(new NamedFlag("FLAGGED"), false); + // Pre-stem search terms up front since the stemmed form is + // needed in a few different places + foreach (var term in this.expression) { + // Use this brittle form of type checking for performance + // (both here and further below in the class) - the Engine + // controls the Term hierarchy the needed assumptions can + // be made + if (term.get_type() == typeof(EmailTextTerm)) { + var text = (EmailTextTerm) term; + if (text.matching_strategy.is_stemming_enabled()) { + stem_search_terms(text); + } } } - return conditions; } - // Return a map of column -> phrase, to use as WHERE column MATCH 'phrase'. - internal Gee.HashMap get_query_phrases() { - Gee.HashMap phrases = new Gee.HashMap(); - foreach (string? field in field_map.keys) { - Gee.List? terms = get_search_terms(field); - if (terms == null || terms.size == 0 || field == "is") - continue; - - // Each Term is an AND but the SQL text within in are OR ... this allows for - // each user term to be AND but the variants of each term are or. So, if terms are - // [party] and [eventful] and stems are [parti] and [event], the search would be: - // - // (party* OR parti*) AND (eventful* OR event*) - // - // Obviously with stemming there's the possibility of the stemmed variant being nothing - // but a broader search of the original term (such as event* and eventful*) but do both - // to determine from each hit result which term caused the hit, and if it's too greedy - // a match of the stemmed variant, it can be stripped from the results. - // - // Note that this uses SQLite's "standard" query syntax for MATCH, where AND is implied - // (and would be treated as search term if included), parentheses are not allowed, and - // OR has a higher precedence than AND. So the above example in standard syntax is: - // - // party* OR parti* eventful* OR event* - StringBuilder builder = new StringBuilder(); - foreach (Term term in terms) { - if (term.sql.size == 0) - continue; - - if (term.is_exact) { - builder.append_printf("%s ", term.parsed); - } else { - bool is_first_sql = true; - foreach (string sql in term.sql) { - if (!is_first_sql) - builder.append(" OR "); - - builder.append_printf("%s ", sql); - is_first_sql = false; - } - } + internal Db.Statement get_search_query( + Db.Connection cx, + string? search_ids_sql, + Gee.Collection? folder_blacklist, + int limit, + int offset, + GLib.Cancellable? cancellable + ) throws GLib.Error { + var sql = new GLib.StringBuilder(); + var conditions_added = false; + + sql.append(""" + SELECT mst.rowid + FROM MessageSearchTable as mst + INNER JOIN MessageTable AS mt ON mt.id = mst.rowid + WHERE"""); + conditions_added = sql_add_term_conditions(sql, conditions_added); + if (!String.is_empty(search_ids_sql)) { + if (conditions_added) { + sql.append(" AND"); } - - phrases.set(field ?? "MessageSearchTable", builder.str); + sql.append(""" id IN (%s)""".printf(search_ids_sql)); } - - return phrases; - } - - private async void prepare(GLib.Cancellable? cancellable) { - // A few goals here: - // 1) Append an * after every term so it becomes a prefix search - // (see ) - // 2) Strip out common words/operators that might get interpreted as - // search operators - // 3) Parse each word into a list of which field it applies to, so - // you can do "to:johndoe@example.com thing" (quotes excluded) - // to find messages to John containing the word thing - // We ignore everything inside quotes to give the user a way to - // override our algorithm here. The idea is to offer one search query - // syntax for Geary that we can use locally and via IMAP, etc. - - string quote_balanced = this.raw; - if (Geary.String.count_char(this.raw, '"') % 2 != 0) { - // Remove the last quote if it's not balanced. This has the - // benefit of showing decent results as you type a quoted phrase. - int last_quote = this.raw.last_index_of_char('"'); - assert(last_quote >= 0); - quote_balanced = this.raw.splice(last_quote, last_quote + 1, " "); + sql.append(""" + ORDER BY mt.internaldate_time_t DESC"""); + if (limit > 0) { + sql.append(""" + LIMIT ? OFFSET ?"""); } - string[] words = quote_balanced.split_set(" \t\r\n()%*\\"); - bool in_quote = false; - foreach (string s in words) { - string? field = null; - - s = s.strip(); - - int quotes = Geary.String.count_char(s, '"'); - if (!in_quote && quotes > 0) { - in_quote = true; - --quotes; - } - - Term? term; - if (in_quote) { - // HACK: this helps prevent a syntax error when the user types - // something like from:"somebody". If we ever properly support - // quotes after : we can get rid of this. - term = new Term(s, s, null, s.replace(":", " "), null); - } else { - string original = s; - - // Some common search phrases we don't respect and - // therefore don't want to fall through to search - // results - // XXX translate these - string lower = s.down(); - switch (lower) { - case "": - case "and": - case "or": - case "not": - case "near": - continue; - - default: - if (lower.has_prefix("near/")) - continue; - break; - } - - if (s.has_prefix("-")) - s = s.substring(1); - - if (s == "") - continue; - - // TODO: support quotes after : - string[] parts = s.split(":", 2); - if (parts.length > 1) - field = extract_field_from_token(parts, ref s); - - if (field == SEARCH_OP_IS) { - // s will have been de-translated - term = new Term(original, s, null, null, null); - } else { - // SQL MATCH syntax for parsed term - string? sql_s = "%s*".printf(s); - - // stem the word, but if stemmed and stem is - // simply shorter version of original term, only - // prefix-match search for it (i.e. avoid - // searching for [archive* OR archiv*] when that's - // the same as [archiv*]), otherwise search for - // both - string? stemmed = yield stem_search_term(s, cancellable); - - string? sql_stemmed = null; - if (stemmed != null) { - sql_stemmed = "%s*".printf(stemmed); - if (s.has_prefix(stemmed)) - sql_s = null; - } - - // if term contains continuation characters, treat - // as exact search to reduce effects of tokenizer - // splitting terms w/ punctuation in them - if (String.contains_any_char(s, SEARCH_TERM_CONTINUATION_CHARS)) - s = "\"%s\"".printf(s); - - term = new Term(original, s, stemmed, sql_s, sql_stemmed); - } - } - - if (in_quote && quotes % 2 != 0) - in_quote = false; - - // Finally, add the term - if (!this.field_map.has_key(field)) { - this.field_map.set(field, new Gee.ArrayList()); - } - this.field_map.get(field).add(term); - this.all.add(term); + Db.Statement stmt = cx.prepare(sql.str); + int bind_index = sql_bind_term_conditions(stmt, false, 0); + if (limit > 0) { + stmt.bind_int(bind_index++, limit); + stmt.bind_int(bind_index++, offset); } + + return stmt; } - private string? extract_field_from_token(string[] parts, ref string token) { - string? field = null; - if (Geary.String.is_empty_or_whitespace(parts[1])) { - // User stopped at "field:", treat it as if they hadn't - // typed the ':' - token = parts[0]; - } else { - field = search_op_names.get(parts[0].down()); - if (field == SEARCH_OP_IS) { - string? value = search_op_is_values.get(parts[1].down()); - if (value != null) { - token = value; - } else { - // Unknown op value, pretend there is no search op - field = null; - } - } else if (field == SEARCH_OP_FROM && - parts[1].down() in search_op_from_me_values) { - // Search for all addresses on the account. Bug 768779 - token = this.account.account_information.primary_mailbox.address; - } else if (field in SEARCH_OP_TO_ME_FIELDS && - parts[1].down() in search_op_to_me_values) { - // Search for all addresses on the account. Bug 768779 - token = this.account.account_information.primary_mailbox.address; - } else if (field != null) { - token = parts[1]; - } - } - return field; + internal Db.Statement get_match_query( + Db.Connection cx, + string? search_ids_sql + ) throws GLib.Error { + var sql = new GLib.StringBuilder(); + sql.append(""" + SELECT mst.rowid, geary_matches(MessageSearchTable) + FROM MessageSearchTable as mst + WHERE rowid IN ( + """); + sql.append(search_ids_sql); + sql.append(") AND "); + sql_add_term_conditions(sql, false); + + Db.Statement stmt = cx.prepare(sql.str); + sql_bind_term_conditions(stmt, true, 0); + return stmt; } /** - * Converts unquoted search terms into a stemmed search term. + * Applies stemming for the given term to a specific term value. * * Prior experience with the Snowball stemmer indicates it is too * aggressive for our tastes when coupled with prefix-matching of - * all unquoted terms (see - * https://bugzilla.gnome.org/show_bug.cgi?id=713179). + * all unquoted terms. See + * https://bugzilla.gnome.org/show_bug.cgi?id=713179 and + * https://bugzilla.gnome.org/show_bug.cgi?id=720361 * * This method is part of a larger strategy designed to dampen * that aggressiveness without losing the benefits of stemming @@ -632,47 +117,152 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { * Post-search processing is then to strip results which are too * "greedy" due to prefix-matching the stemmed variant. * - * Some heuristics are in place simply to determine if stemming should occur: + * Some heuristics are in place simply to determine if stemming + * should occur: * * # If stemming is unallowed, no stemming occurs. - * # If the term is < min. term length for stemming, no stemming occurs. - * # If the stemmer returns a stem that is the same as the original term, no stemming occurs. - * # If the difference between the stemmed word and the original term is more than - * maximum allowed, no stemming occurs. This works under the assumption that if - * the user has typed a long word, they do not want to "go back" to searching for a much - * shorter version of it. (For example, "accountancies" stems to "account"). + * # If the term is < min. term length for stemming, no stemming + * occurs. + * # If the stemmer returns a stem that is the same as the + * original term, no stemming occurs. + * # If the difference between the stemmed word and the original + * term is more than maximum allowed, no stemming occurs. This + * works under the assumption that if the user has typed a long + * word, they do not want to "go back" to searching for a much + * shorter version of it. (For example, "accountancies" stems + * to "account"). * * Otherwise, the stem for the term is returned. */ - private async string? stem_search_term(string term, - GLib.Cancellable? cancellable) { - if (!this.allow_stemming) - return null; + private void stem_search_terms(EmailTextTerm text) { + var stemmed_terms = new Gee.ArrayList(); + foreach (var term in text.terms) { + int term_length = term.length; + string? stemmed = null; + if (term_length > text.matching_strategy.get_min_term_length_for_stemming()) { + stemmed = this.stemmer.stem(term, term_length); + if (String.is_empty(stemmed) || + term == stemmed || + term_length - stemmed.length > + text.matching_strategy.get_max_difference_term_stem_lengths()) { + stemmed = null; + } + } + if (stemmed != null) { + this.has_stemmed_terms = true; + debug(@"Search term \"$term\" stemmed to \"$stemmed\""); + } else { + debug(@"Search term \"$term\" not stemmed"); + } + stemmed_terms.add(stemmed); + } + text.set_data(EMAIL_TEXT_STEMMED_TERMS, stemmed_terms); + } - int term_length = term.length; - if (term_length < this.min_term_length_for_stemming) - return null; + private bool sql_add_term_conditions(GLib.StringBuilder sql, + bool have_added_sql_condition) { + if (!this.expression.is_empty) { + if (have_added_sql_condition) { + sql.append(" AND"); + } + have_added_sql_condition = true; + var is_first_match_term = true; + sql.append(" MessageSearchTable MATCH '"); + foreach (var term in this.expression) { + if (!is_first_match_term) { + sql.append(" AND"); + } - string? stemmed = this.stemmer.stem(term, term.length); - if (String.is_empty(stemmed)) { - debug("Empty stemmed term returned for \"%s\"", term); - return null; - } + if (term.is_negated) { + sql.append(" NOT"); + } - // If same term returned, treat as non-stemmed - if (stemmed == term) - return null; + if (term.get_type() == typeof(EmailTextTerm)) { + sql_add_email_text_term_conditions((EmailTextTerm) term, sql); + } else if (term.get_type() == typeof(EmailFlagTerm)) { + sql.append(" ({flags} : \"' || ? || '\")"); + } - // Don't search for stemmed words that are significantly shorter than the user's search term - if (term_length - stemmed.length > this.max_difference_term_stem_lengths) { - debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms", - stemmed, term); + is_first_match_term = false; + } + sql.append("'"); + } + return have_added_sql_condition; + } - return null; + private void sql_add_email_text_term_conditions(EmailTextTerm text, + GLib.StringBuilder sql) { + var target = ""; + switch (text.target) { + case ALL: + target = ""; + break; + case TO: + target = "receivers"; + break; + case CC: + target = "cc"; + break; + case BCC: + target = "bcc"; + break; + case FROM: + target = "from"; + break; + case SUBJECT: + target = "subject"; + break; + case BODY: + target = "body"; + break; + case ATTACHMENT_NAME: + target = "attachments"; + break; + } + + var values = text.terms; + var stemmed_values = text.get_data>( + EMAIL_TEXT_STEMMED_TERMS + ); + for (int i = 0; i < values.size; i++) { + if (target != "") { + sql.append_printf(" ({%s} :", target); + } + if (stemmed_values != null && stemmed_values[i] != null) { + sql.append(" \"' || ? || '\"* OR \"' || ? || '\"*"); + } else { + sql.append(" \"' || ? || '\"*"); + } + if (target != "") { + sql.append_c(')'); + } } + } - debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed); - return stemmed; + private int sql_bind_term_conditions(Db.Statement sql, + bool text_only, + int index) + throws Geary.DatabaseError { + int next_index = index; + foreach (var term in this.expression) { + var type = term.get_type(); + if (type == typeof(EmailTextTerm)) { + var text = (EmailTextTerm) term; + var stemmed_terms = text.get_data>( + EMAIL_TEXT_STEMMED_TERMS + ); + for (int i = 0; i < text.terms.size; i++) { + sql.bind_string(next_index++, text.terms[i]); + if (stemmed_terms != null && stemmed_terms[i] != null) { + sql.bind_string(next_index++, stemmed_terms[i]); + } + } + } else if (type == typeof(EmailFlagTerm)) { + var flag = (EmailFlagTerm) term; + sql.bind_string(next_index++, flag.value.serialise()); + } + } + return next_index; } } diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala index 4282ca046..fb5ab4d9b 100644 --- a/src/engine/imap-engine/imap-engine-generic-account.vala +++ b/src/engine/imap-engine/imap-engine-generic-account.vala @@ -580,9 +580,7 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { string text, GLib.Cancellable? cancellable ) throws GLib.Error { - return yield new ImapDB.SearchQuery( - this, this.local, expression, text, this.stemmer, EXACT, cancellable - ); + return new ImapDB.SearchQuery(expression, text, this.stemmer); } public override async Gee.MultiMap? local_search_message_id_async( diff --git a/test/engine/imap-db/imap-db-search-query-test.vala b/test/engine/imap-db/imap-db-search-query-test.vala new file mode 100644 index 000000000..510402611 --- /dev/null +++ b/test/engine/imap-db/imap-db-search-query-test.vala @@ -0,0 +1,199 @@ +/* + * Copyright © 2020 Michael Gratton + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +public class Geary.ImapDB.SearchQueryTest : TestCase { + + + private GLib.File? tmp_dir = null; + private Geary.AccountInformation? config = null; + private Account? account = null; + private SnowBall.Stemmer? stemmer = null; + + + public SearchQueryTest() { + base("Geary.ImapDB.SearchQueryTest"); + add_test("email_text_terms", email_text_terms); + add_test("email_text_terms_stemmed", email_text_terms_stemmed); + add_test("email_text_terms_specific", email_text_terms_specific); + add_test("email_flag_terms", email_flag_terms); + } + + public override void set_up() throws GLib.Error { + this.tmp_dir = GLib.File.new_for_path( + GLib.DirUtils.make_tmp("geary-imap-db-search-query-test-XXXXXX") + ); + + this.config = new Geary.AccountInformation( + "test", + ServiceProvider.OTHER, + new Mock.CredentialsMediator(), + new Geary.RFC822.MailboxAddress(null, "test@example.com") + ); + + this.account = new Account( + config, + this.tmp_dir, + GLib.File.new_for_path(_SOURCE_ROOT_DIR).get_child("sql") + ); + this.account.open_async.begin( + null, + this.async_completion + ); + this.account.open_async.end(async_result()); + + this.stemmer = new SnowBall.Stemmer("english"); + + Db.Context.enable_sql_logging = true; + } + + public override void tear_down() throws GLib.Error { + Db.Context.enable_sql_logging = false; + + this.stemmer = null; + + this.account.close_async.begin( + null, + this.async_completion + ); + this.account.close_async.end(async_result()); + this.account = null; + this.config = null; + + delete_file(this.tmp_dir); + this.tmp_dir = null; + } + + public void email_text_terms() throws GLib.Error { + var single_all_term = new_search_query( + { new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "test")}, + "test" + ); + assert_queries(single_all_term); + + var multiple_all_term = new_search_query( + { + new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "foo"), + new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "bar") + }, + "foo bar" + ); + assert_queries(multiple_all_term); + + var all_to_term = new_search_query( + { + new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "foo"), + new Geary.SearchQuery.EmailTextTerm(TO, EXACT, "bar") + }, + "foo to:bar" + ); + assert_queries(all_to_term); + + var all_to_all_term = new_search_query( + { + new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "foo"), + new Geary.SearchQuery.EmailTextTerm(TO, EXACT, "bar"), + new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "baz") + }, + "foo to:bar baz" + ); + assert_queries(all_to_all_term); + } + + public void email_text_terms_stemmed() throws GLib.Error { + var single_all_term = new_search_query( + { new Geary.SearchQuery.EmailTextTerm(ALL, CONSERVATIVE, "universal")}, + "universal" + ); + assert_queries(single_all_term); + + var multiple_all_term = new_search_query( + { + new Geary.SearchQuery.EmailTextTerm(ALL, CONSERVATIVE, "universal"), + new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "bar") + }, + "universal bar" + ); + assert_queries(multiple_all_term); + + var all_to_term = new_search_query( + { + new Geary.SearchQuery.EmailTextTerm(ALL, CONSERVATIVE, "universal"), + new Geary.SearchQuery.EmailTextTerm(TO, EXACT, "bar") + }, + "universal to:bar" + ); + assert_queries(all_to_term); + } + + public void email_text_terms_specific() throws GLib.Error { + var single_term = new_search_query( + { new Geary.SearchQuery.EmailTextTerm(SUBJECT, EXACT, "test")}, + "subject:test" + ); + assert_queries(single_term); + + var missing_term = new_search_query( + { new Geary.SearchQuery.EmailTextTerm(SUBJECT, EXACT, "")}, + "subject:" + ); + assert_queries(missing_term); + + var conflicting_property = new_search_query( + { new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "subject:")}, + "subject:" + ); + assert_queries(conflicting_property); + + var conflicting_property_and_term = new_search_query( + { new Geary.SearchQuery.EmailTextTerm(SUBJECT, EXACT, "subject:")}, + "subject:subject:" + ); + assert_queries(conflicting_property_and_term); + } + + public void email_flag_terms() throws GLib.Error { + var unread = new_search_query( + { new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.UNREAD)}, + "is:unread" + ); + assert_queries(unread); + + var flagged = new_search_query( + { new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.FLAGGED)}, + "is:flagged" + ); + assert_queries(flagged); + } + + private SearchQuery new_search_query(Geary.SearchQuery.Term[] ops, string raw) + throws GLib.Error { + return new SearchQuery( + new Gee.ArrayList.wrap(ops), + raw, + this.stemmer + ); + } + + private void assert_queries(SearchQuery query) throws GLib.Error { + var search = query.get_search_query( + this.account.db.get_primary_connection(), + null, + null, + 0, + 10, + null + ); + search.exec(null); + + var match = query.get_match_query( + this.account.db.get_primary_connection(), + "" + ); + match.exec(null); + } + +} diff --git a/test/meson.build b/test/meson.build index a50f12a6d..d4359f64b 100644 --- a/test/meson.build +++ b/test/meson.build @@ -53,6 +53,7 @@ test_engine_sources = [ 'engine/imap-db/imap-db-database-test.vala', 'engine/imap-db/imap-db-email-identifier-test.vala', 'engine/imap-db/imap-db-folder-test.vala', + 'engine/imap-db/imap-db-search-query-test.vala', 'engine/imap-engine/account-processor-test.vala', 'engine/imap-engine/imap-engine-generic-account-test.vala', 'engine/mime/mime-content-type-test.vala', diff --git a/test/test-engine.vala b/test/test-engine.vala index 0260ef6a8..a86ee94c1 100644 --- a/test/test-engine.vala +++ b/test/test-engine.vala @@ -72,6 +72,7 @@ int main(string[] args) { engine.add_suite(new Geary.ImapDB.DatabaseTest().suite); engine.add_suite(new Geary.ImapDB.EmailIdentifierTest().suite); engine.add_suite(new Geary.ImapDB.FolderTest().suite); + engine.add_suite(new Geary.ImapDB.SearchQueryTest().suite); engine.add_suite(new Geary.ImapEngine.AccountProcessorTest().suite); engine.add_suite(new Geary.ImapEngine.GenericAccountTest().suite); -- GitLab From 6ffbfcf5d441a9f3ce7de06e0d8a9241da4cf2ec Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Wed, 4 Nov 2020 22:57:41 +1100 Subject: [PATCH 06/24] Geary.ImapDb.SearchQuery: Handle folder and deleted message exclusions Ensure designated excluded folders, folderless messages, and marked-for-deletion messages are excluded from FTS search results. --- src/engine/imap-db/imap-db-account.vala | 48 ++++++++++++++++++- src/engine/imap-db/imap-db-search-query.vala | 37 ++++++++++---- .../imap-db/imap-db-search-query-test.vala | 42 +++++++++++++++- 3 files changed, 115 insertions(+), 12 deletions(-) diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index 13134d3fd..081281e7b 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -578,7 +578,7 @@ private class Geary.ImapDB.Account : BaseObject { } public async Gee.Collection? search_async(Geary.SearchQuery q, - int limit = 100, int offset = 0, Gee.Collection? folder_blacklist = null, + int limit = 100, int offset = 0, Gee.Collection? excluded_folders = null, Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error { @@ -595,10 +595,22 @@ private class Geary.ImapDB.Account : BaseObject { Gee.Map>? search_matches = null; yield db.exec_transaction_async(RO, (cx) => { + string? excluded_folder_ids_sql = null; + bool exclude_folderless = false; + if (excluded_folders != null) { + excluded_folder_ids_sql = do_get_excluded_folder_ids( + excluded_folders, cx, out exclude_folderless, cancellable + ); + } + var id_map = new Gee.HashMap( Collection.int64_hash_func, Collection.int64_equal_func); Db.Statement stmt = query.get_search_query( - cx, search_ids_sql, folder_blacklist, limit, offset, cancellable + cx, + search_ids_sql, + excluded_folder_ids_sql, + exclude_folderless, + limit, offset ); Db.Result result = stmt.exec(cancellable); while (!result.finished) { @@ -1277,6 +1289,38 @@ private class Geary.ImapDB.Account : BaseObject { } } + // Turn the collection of folder paths into actual folder ids. As a + // special case, if "folderless" or orphan emails are to be excluded, + // set the out bool to true. + private string do_get_excluded_folder_ids( + Gee.Collection excluded_folder, + Db.Connection cx, + out bool exclude_folderless, + GLib.Cancellable? cancellable + ) throws GLib.Error { + exclude_folderless = false; + + var ids = new GLib.StringBuilder(); + var is_first = true; + foreach (Geary.FolderPath? folder_path in excluded_folder) { + if (folder_path == null) { + exclude_folderless = true; + } else { + int64 id; + do_fetch_folder_id(cx, folder_path, true, out id, cancellable); + if (id != Db.INVALID_ROWID) { + if (!is_first) { + ids.append_c(','); + } + ids.append(id.to_string()); + is_first = false; + } + } + } + + return ids.str; + } + private inline void check_open() throws GLib.Error { if (!this.db.is_open) { throw new EngineError.OPEN_REQUIRED("Database not open"); diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala index e03e2d0ff..24ae267ff 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -45,19 +45,35 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { internal Db.Statement get_search_query( Db.Connection cx, string? search_ids_sql, - Gee.Collection? folder_blacklist, + string? excluded_folder_ids_sql, + bool exclude_folderless, int limit, - int offset, - GLib.Cancellable? cancellable + int offset ) throws GLib.Error { var sql = new GLib.StringBuilder(); - var conditions_added = false; sql.append(""" - SELECT mst.rowid - FROM MessageSearchTable as mst - INNER JOIN MessageTable AS mt ON mt.id = mst.rowid - WHERE"""); + SELECT DISTINCT mst.rowid + FROM MessageSearchTable as mst + INNER JOIN MessageTable AS mt ON mt.id = mst.rowid"""); + if (exclude_folderless) { + sql.append(""" + INNER JOIN MessageLocationTable AS mlt ON mt.id = mlt.message_id"""); + } else { + sql.append(""" + LEFT JOIN MessageLocationTable AS mlt ON mt.id = mlt.message_id"""); + } + + var conditions_added = false; + sql.append(""" + WHERE"""); + if (excluded_folder_ids_sql != null) { + sql.append_printf( + " mlt.folder_id NOT IN (%s)", + excluded_folder_ids_sql + ); + conditions_added = true; + } conditions_added = sql_add_term_conditions(sql, conditions_added); if (!String.is_empty(search_ids_sql)) { if (conditions_added) { @@ -65,6 +81,11 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { } sql.append(""" id IN (%s)""".printf(search_ids_sql)); } + if (conditions_added) { + sql.append(" AND"); + } + // Exclude deleted messages, but not folderless messages + sql.append(" mlt.remove_marker IN (0, null)"); sql.append(""" ORDER BY mt.internaldate_time_t DESC"""); if (limit > 0) { diff --git a/test/engine/imap-db/imap-db-search-query-test.vala b/test/engine/imap-db/imap-db-search-query-test.vala index 510402611..571e63195 100644 --- a/test/engine/imap-db/imap-db-search-query-test.vala +++ b/test/engine/imap-db/imap-db-search-query-test.vala @@ -20,6 +20,7 @@ public class Geary.ImapDB.SearchQueryTest : TestCase { add_test("email_text_terms_stemmed", email_text_terms_stemmed); add_test("email_text_terms_specific", email_text_terms_specific); add_test("email_flag_terms", email_flag_terms); + add_test("excluded_folders", excluded_folders); } public override void set_up() throws GLib.Error { @@ -169,6 +170,43 @@ public class Geary.ImapDB.SearchQueryTest : TestCase { assert_queries(flagged); } + public void excluded_folders() throws GLib.Error { + var query = new_search_query( + { new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "test")}, + "test" + ); + + var search_with_excluded_ids = query.get_search_query( + this.account.db.get_primary_connection(), + null, + "10,20,30,40", + false, + 10, + 0 + ); + search_with_excluded_ids.exec(null); + + var search_with_exclude_folderless = query.get_search_query( + this.account.db.get_primary_connection(), + null, + null, + true, + 10, + 0 + ); + search_with_exclude_folderless.exec(null); + + var search_with_both = query.get_search_query( + this.account.db.get_primary_connection(), + null, + "10,20,30,40", + true, + 10, + 0 + ); + search_with_both.exec(null); + } + private SearchQuery new_search_query(Geary.SearchQuery.Term[] ops, string raw) throws GLib.Error { return new SearchQuery( @@ -183,9 +221,9 @@ public class Geary.ImapDB.SearchQueryTest : TestCase { this.account.db.get_primary_connection(), null, null, - 0, + false, 10, - null + 0 ); search.exec(null); -- GitLab From 2ab7b2c39ec4461dc0840bd99067991726b3afbe Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 00:07:08 +1100 Subject: [PATCH 07/24] Geary.ImapDb.SearchQuery: Handle queries with some/all negated terms SQLite FTS5 doesn't allow all negated terms in matches, and since NOT is a binary operator (rather than unary) negated terms should be listed after any positive terms. --- src/engine/imap-db/imap-db-search-query.vala | 176 ++++++++++++++---- .../imap-db/imap-db-search-query-test.vala | 5 + 2 files changed, 143 insertions(+), 38 deletions(-) diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala index 24ae267ff..0ebbf21ed 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -17,6 +17,8 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { internal bool has_stemmed_terms { get; private set; default = false; } + private bool is_all_negated = true; + private unowned SnowBall.Stemmer stemmer; @@ -26,19 +28,22 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { base(expression, raw); this.stemmer = stemmer; - // Pre-stem search terms up front since the stemmed form is - // needed in a few different places foreach (var term in this.expression) { // Use this brittle form of type checking for performance // (both here and further below in the class) - the Engine // controls the Term hierarchy the needed assumptions can // be made if (term.get_type() == typeof(EmailTextTerm)) { + // Pre-stem search terms up front since the stemmed + // form is needed in a few different places var text = (EmailTextTerm) term; if (text.matching_strategy.is_stemming_enabled()) { stem_search_terms(text); } } + if (!term.is_negated) { + this.is_all_negated = false; + } } } @@ -52,10 +57,26 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { ) throws GLib.Error { var sql = new GLib.StringBuilder(); + // Select distinct since messages may exist in more than one + // folder. sql.append(""" - SELECT DISTINCT mst.rowid - FROM MessageSearchTable as mst + SELECT DISTINCT mt.id"""); + // FTS5 queries cannot be all negated terms. If not then join + // here and filter as usual, if so then exclude via subselect + // further below instead. + if (!this.is_all_negated) { + sql.append(""" + FROM MessageSearchTable AS mst INNER JOIN MessageTable AS mt ON mt.id = mst.rowid"""); + } else { + sql.append(""" + FROM MessageTable AS mt"""); + } + // If excluding folderless messages, an inner join on + // MessageLocationTable will cause them to be excluded + // automatically. Otherwise a left join always required to + // exclude messages marked for deletion, even if there are no + // folder exclusions. if (exclude_folderless) { sql.append(""" INNER JOIN MessageLocationTable AS mlt ON mt.id = mlt.message_id"""); @@ -67,6 +88,8 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { var conditions_added = false; sql.append(""" WHERE"""); + + // Folder exclusions if (excluded_folder_ids_sql != null) { sql.append_printf( " mlt.folder_id NOT IN (%s)", @@ -74,27 +97,48 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { ); conditions_added = true; } - conditions_added = sql_add_term_conditions(sql, conditions_added); + + // FTS match exclusions + if (!this.is_all_negated) { + conditions_added = sql_add_term_conditions(sql, conditions_added); + } else { + if (conditions_added) { + sql.append(" AND"); + } + sql.append( + " mt.id NOT IN (SELECT mst.rowid FROM MessageSearchTable as mst WHERE " + ); + sql_add_term_conditions(sql, false); + sql.append_c(')'); + conditions_added = true; + } + + // Email id exclusions if (!String.is_empty(search_ids_sql)) { if (conditions_added) { sql.append(" AND"); } - sql.append(""" id IN (%s)""".printf(search_ids_sql)); + sql.append(""" mt.id IN (%s)""".printf(search_ids_sql)); } + + // Marked as deleted (but not folderless) exclusions if (conditions_added) { sql.append(" AND"); } - // Exclude deleted messages, but not folderless messages sql.append(" mlt.remove_marker IN (0, null)"); + + // Ordering sql.append(""" ORDER BY mt.internaldate_time_t DESC"""); + + // Limit exclusions if (limit > 0) { sql.append(""" LIMIT ? OFFSET ?"""); } Db.Statement stmt = cx.prepare(sql.str); - int bind_index = sql_bind_term_conditions(stmt, false, 0); + int bind_index = sql_bind_term_conditions(stmt, 0); if (limit > 0) { stmt.bind_int(bind_index++, limit); stmt.bind_int(bind_index++, offset); @@ -118,7 +162,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { sql_add_term_conditions(sql, false); Db.Statement stmt = cx.prepare(sql.str); - sql_bind_term_conditions(stmt, true, 0); + sql_bind_term_conditions(stmt, 0); return stmt; } @@ -187,32 +231,71 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { sql.append(" AND"); } have_added_sql_condition = true; - var is_first_match_term = true; sql.append(" MessageSearchTable MATCH '"); + + // Add all non-negated terms first, since NOT in FTS5 is a + // binary operator (not unary, FFS), and hence all negated + // clauses must follow a positive operator and a single + // NOT clause. For example, for positive terms a,c and + // negated terms b,d, the match value must be structured + // as: (a AND c) NOT (b AND d) + + var is_first_positive_term = true; foreach (var term in this.expression) { - if (!is_first_match_term) { - sql.append(" AND"); + if (!term.is_negated) { + if (is_first_positive_term) { + sql.append(" ("); + } else { + sql.append(" AND"); + } + sql_add_term_condition(sql, term); + is_first_positive_term = false; } + } + if (!is_first_positive_term) { + sql.append_c(')'); + } + var is_first_negated_term = true; + foreach (var term in this.expression) { if (term.is_negated) { - sql.append(" NOT"); - } - - if (term.get_type() == typeof(EmailTextTerm)) { - sql_add_email_text_term_conditions((EmailTextTerm) term, sql); - } else if (term.get_type() == typeof(EmailFlagTerm)) { - sql.append(" ({flags} : \"' || ? || '\")"); + if (is_first_negated_term) { + // If all negated, there won't be any positive + // terms above, and the MATCH will be used as + // an exclusion instead, so the NOT operator + // is not required. + if (!this.is_all_negated) { + sql.append(" NOT ("); + } else { + sql.append(" ("); + } + } else { + sql.append(" AND"); + } + sql_add_term_condition(sql, term); + is_first_negated_term = false; } - - is_first_match_term = false; } + if (!is_first_negated_term) { + sql.append_c(')'); + } + sql.append("'"); } return have_added_sql_condition; } - private void sql_add_email_text_term_conditions(EmailTextTerm text, - GLib.StringBuilder sql) { + private inline void sql_add_term_condition(GLib.StringBuilder sql, + Term term) { + if (term.get_type() == typeof(EmailTextTerm)) { + sql_add_email_text_term_conditions((EmailTextTerm) term, sql); + } else if (term.get_type() == typeof(EmailFlagTerm)) { + sql.append(" ({flags} : \"' || ? || '\")"); + } + } + + private inline void sql_add_email_text_term_conditions(EmailTextTerm text, + GLib.StringBuilder sql) { var target = ""; switch (text.target) { case ALL: @@ -261,27 +344,44 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { } private int sql_bind_term_conditions(Db.Statement sql, - bool text_only, int index) throws Geary.DatabaseError { int next_index = index; + // Per sql_add_term_conditions, add all non-negated terms + // first before adding any negated terms. foreach (var term in this.expression) { - var type = term.get_type(); - if (type == typeof(EmailTextTerm)) { - var text = (EmailTextTerm) term; - var stemmed_terms = text.get_data>( - EMAIL_TEXT_STEMMED_TERMS - ); - for (int i = 0; i < text.terms.size; i++) { - sql.bind_string(next_index++, text.terms[i]); - if (stemmed_terms != null && stemmed_terms[i] != null) { - sql.bind_string(next_index++, stemmed_terms[i]); - } + if (!term.is_negated) { + next_index = sql_bind_term_condition(sql, term, next_index); + } + } + foreach (var term in this.expression) { + if (term.is_negated) { + next_index = sql_bind_term_condition(sql, term, next_index); + } + } + return next_index; + } + + private inline int sql_bind_term_condition(Db.Statement sql, + Term term, + int index) + throws Geary.DatabaseError { + int next_index = index; + var type = term.get_type(); + if (type == typeof(EmailTextTerm)) { + var text = (EmailTextTerm) term; + var stemmed_terms = text.get_data>( + EMAIL_TEXT_STEMMED_TERMS + ); + for (int i = 0; i < text.terms.size; i++) { + sql.bind_string(next_index++, text.terms[i]); + if (stemmed_terms != null && stemmed_terms[i] != null) { + sql.bind_string(next_index++, stemmed_terms[i]); } - } else if (type == typeof(EmailFlagTerm)) { - var flag = (EmailFlagTerm) term; - sql.bind_string(next_index++, flag.value.serialise()); } + } else if (type == typeof(EmailFlagTerm)) { + var flag = (EmailFlagTerm) term; + sql.bind_string(next_index++, flag.value.serialise()); } return next_index; } diff --git a/test/engine/imap-db/imap-db-search-query-test.vala b/test/engine/imap-db/imap-db-search-query-test.vala index 571e63195..4813fefaf 100644 --- a/test/engine/imap-db/imap-db-search-query-test.vala +++ b/test/engine/imap-db/imap-db-search-query-test.vala @@ -163,6 +163,11 @@ public class Geary.ImapDB.SearchQueryTest : TestCase { ); assert_queries(unread); + var read_term = new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.UNREAD); + read_term.is_negated = true; + var read = new_search_query({ read_term }, "is:read"); + assert_queries(read); + var flagged = new_search_query( { new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.FLAGGED)}, "is:flagged" -- GitLab From 20a4fd3ed2fd48d6ae70143cbce7f7a2513a2e60 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 00:21:39 +1100 Subject: [PATCH 08/24] Geary.ImapDb.SearchQuery: Rename to Geary.FtsSearchQuery There's nothing IMAP-specific about the class, so move it to common and rename to reflect what it is actually used for. --- po/POTFILES.in | 2 +- .../common-fts-search-query.vala} | 10 +++++----- src/engine/imap-db/imap-db-account.vala | 10 +++++----- .../imap-engine-generic-account.vala | 2 +- src/engine/meson.build | 2 +- .../common-fts-search-query-test.vala} | 19 ++++++++++--------- test/meson.build | 2 +- test/test-engine.vala | 4 +++- 8 files changed, 27 insertions(+), 24 deletions(-) rename src/engine/{imap-db/imap-db-search-query.vala => common/common-fts-search-query.vala} (98%) rename test/engine/{imap-db/imap-db-search-query-test.vala => common/common-fts-search-query-test.vala} (92%) diff --git a/po/POTFILES.in b/po/POTFILES.in index a5cbd6ee7..cbcaff2ee 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -211,6 +211,7 @@ src/engine/app/email-store/app-mark-operation.vala src/engine/common/common-contact-harvester.vala src/engine/common/common-contact-store-impl.vala src/engine/common/common-message-data.vala +src/engine/common/common-fts-search-query.vala src/engine/db/db.vala src/engine/db/db-connection.vala src/engine/db/db-context.vala @@ -269,7 +270,6 @@ src/engine/imap-db/imap-db-email-identifier.vala src/engine/imap-db/imap-db-folder.vala src/engine/imap-db/imap-db-gc.vala src/engine/imap-db/imap-db-message-row.vala -src/engine/imap-db/imap-db-search-query.vala src/engine/imap-engine/gmail/imap-engine-gmail-account.vala src/engine/imap-engine/gmail/imap-engine-gmail-all-mail-folder.vala src/engine/imap-engine/gmail/imap-engine-gmail-drafts-folder.vala diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/common/common-fts-search-query.vala similarity index 98% rename from src/engine/imap-db/imap-db-search-query.vala rename to src/engine/common/common-fts-search-query.vala index 0ebbf21ed..a24c16dc2 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/common/common-fts-search-query.vala @@ -7,9 +7,9 @@ */ /** - * Internal implementation of {@link Geary.SearchQuery}. + * A search query implementation that provides full-text search. */ -private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { +internal class Geary.FtsSearchQuery : Geary.SearchQuery { private const string EMAIL_TEXT_STEMMED_TERMS = "geary-stemmed-terms"; @@ -22,9 +22,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { private unowned SnowBall.Stemmer stemmer; - public SearchQuery(Gee.List expression, - string raw, - SnowBall.Stemmer stemmer) { + public FtsSearchQuery(Gee.List expression, + string raw, + SnowBall.Stemmer stemmer) { base(expression, raw); this.stemmer = stemmer; diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index 081281e7b..fecb1867c 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -585,7 +585,7 @@ private class Geary.ImapDB.Account : BaseObject { debug("Search query: %s", q.to_string()); check_open(); - ImapDB.SearchQuery query = check_search_query(q); + FtsSearchQuery query = check_search_query(q); // Do this outside of transaction to catch invalid search ids up-front string? search_ids_sql = get_search_ids_sql(search_ids); @@ -696,7 +696,7 @@ private class Geary.ImapDB.Account : BaseObject { public async Gee.Set? get_search_matches_async(Geary.SearchQuery q, Gee.Collection ids, Cancellable? cancellable = null) throws Error { check_open(); - ImapDB.SearchQuery query = check_search_query(q); + FtsSearchQuery query = check_search_query(q); Gee.Set? search_matches = null; yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => { @@ -1223,7 +1223,7 @@ private class Geary.ImapDB.Account : BaseObject { // not per key-value public Gee.Map> do_get_search_matches( Db.Connection cx, - ImapDB.SearchQuery query, + FtsSearchQuery query, Gee.Map id_map, GLib.Cancellable? cancellable ) throws GLib.Error { @@ -1327,8 +1327,8 @@ private class Geary.ImapDB.Account : BaseObject { } } - private ImapDB.SearchQuery check_search_query(Geary.SearchQuery q) throws Error { - ImapDB.SearchQuery? query = q as ImapDB.SearchQuery; + private FtsSearchQuery check_search_query(Geary.SearchQuery q) throws Error { + var query = q as FtsSearchQuery; if (query == null) { throw new EngineError.BAD_PARAMETERS("Geary.SearchQuery not associated with %s", name); } diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala index fb5ab4d9b..68ed4efcf 100644 --- a/src/engine/imap-engine/imap-engine-generic-account.vala +++ b/src/engine/imap-engine/imap-engine-generic-account.vala @@ -580,7 +580,7 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { string text, GLib.Cancellable? cancellable ) throws GLib.Error { - return new ImapDB.SearchQuery(expression, text, this.stemmer); + return new FtsSearchQuery(expression, text, this.stemmer); } public override async Gee.MultiMap? local_search_message_id_async( diff --git a/src/engine/meson.build b/src/engine/meson.build index 1e9eecf17..b3727861d 100644 --- a/src/engine/meson.build +++ b/src/engine/meson.build @@ -67,6 +67,7 @@ engine_vala_sources = files( 'common/common-contact-harvester.vala', 'common/common-contact-store-impl.vala', + 'common/common-fts-search-query.vala', 'common/common-message-data.vala', 'db/db.vala', @@ -179,7 +180,6 @@ engine_vala_sources = files( 'imap-db/imap-db-fts5-matches.c', 'imap-db/imap-db-gc.vala', 'imap-db/imap-db-message-row.vala', - 'imap-db/imap-db-search-query.vala', 'imap-db/imap-db-sqlite.c', 'imap-engine/imap-engine.vala', diff --git a/test/engine/imap-db/imap-db-search-query-test.vala b/test/engine/common/common-fts-search-query-test.vala similarity index 92% rename from test/engine/imap-db/imap-db-search-query-test.vala rename to test/engine/common/common-fts-search-query-test.vala index 4813fefaf..b575f5434 100644 --- a/test/engine/imap-db/imap-db-search-query-test.vala +++ b/test/engine/common/common-fts-search-query-test.vala @@ -5,17 +5,17 @@ * (version 2.1 or later). See the COPYING file in this distribution. */ -public class Geary.ImapDB.SearchQueryTest : TestCase { +public class Geary.FtsSearchQueryTest : TestCase { private GLib.File? tmp_dir = null; private Geary.AccountInformation? config = null; - private Account? account = null; + private ImapDB.Account? account = null; private SnowBall.Stemmer? stemmer = null; - public SearchQueryTest() { - base("Geary.ImapDB.SearchQueryTest"); + public FtsSearchQueryTest() { + base("Geary.FtsSearchQueryTest"); add_test("email_text_terms", email_text_terms); add_test("email_text_terms_stemmed", email_text_terms_stemmed); add_test("email_text_terms_specific", email_text_terms_specific); @@ -25,7 +25,7 @@ public class Geary.ImapDB.SearchQueryTest : TestCase { public override void set_up() throws GLib.Error { this.tmp_dir = GLib.File.new_for_path( - GLib.DirUtils.make_tmp("geary-imap-db-search-query-test-XXXXXX") + GLib.DirUtils.make_tmp("geary-common-fts-search-query-test-XXXXXX") ); this.config = new Geary.AccountInformation( @@ -35,7 +35,7 @@ public class Geary.ImapDB.SearchQueryTest : TestCase { new Geary.RFC822.MailboxAddress(null, "test@example.com") ); - this.account = new Account( + this.account = new ImapDB.Account( config, this.tmp_dir, GLib.File.new_for_path(_SOURCE_ROOT_DIR).get_child("sql") @@ -212,16 +212,17 @@ public class Geary.ImapDB.SearchQueryTest : TestCase { search_with_both.exec(null); } - private SearchQuery new_search_query(Geary.SearchQuery.Term[] ops, string raw) + private FtsSearchQuery new_search_query(Geary.SearchQuery.Term[] ops, + string raw) throws GLib.Error { - return new SearchQuery( + return new FtsSearchQuery( new Gee.ArrayList.wrap(ops), raw, this.stemmer ); } - private void assert_queries(SearchQuery query) throws GLib.Error { + private void assert_queries(FtsSearchQuery query) throws GLib.Error { var search = query.get_search_query( this.account.db.get_primary_connection(), null, diff --git a/test/meson.build b/test/meson.build index d4359f64b..a4fe2c292 100644 --- a/test/meson.build +++ b/test/meson.build @@ -36,6 +36,7 @@ test_engine_sources = [ 'engine/app/app-conversation-set-test.vala', 'engine/common/common-contact-store-impl-test.vala', 'engine/common/common-contact-harvester-test.vala', + 'engine/common/common-fts-search-query-test.vala', 'engine/db/db-database-test.vala', 'engine/db/db-versioned-database-test.vala', 'engine/imap/command/imap-create-command-test.vala', @@ -53,7 +54,6 @@ test_engine_sources = [ 'engine/imap-db/imap-db-database-test.vala', 'engine/imap-db/imap-db-email-identifier-test.vala', 'engine/imap-db/imap-db-folder-test.vala', - 'engine/imap-db/imap-db-search-query-test.vala', 'engine/imap-engine/account-processor-test.vala', 'engine/imap-engine/imap-engine-generic-account-test.vala', 'engine/mime/mime-content-type-test.vala', diff --git a/test/test-engine.vala b/test/test-engine.vala index a86ee94c1..e829b0bf5 100644 --- a/test/test-engine.vala +++ b/test/test-engine.vala @@ -72,7 +72,9 @@ int main(string[] args) { engine.add_suite(new Geary.ImapDB.DatabaseTest().suite); engine.add_suite(new Geary.ImapDB.EmailIdentifierTest().suite); engine.add_suite(new Geary.ImapDB.FolderTest().suite); - engine.add_suite(new Geary.ImapDB.SearchQueryTest().suite); + + // Depends on ImapDB working + engine.add_suite(new Geary.FtsSearchQueryTest().suite); engine.add_suite(new Geary.ImapEngine.AccountProcessorTest().suite); engine.add_suite(new Geary.ImapEngine.GenericAccountTest().suite); -- GitLab From 092d117c8c04a530df0331fab62f7033605d0979 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 00:48:04 +1100 Subject: [PATCH 09/24] Geary.SearchQuery: Enable checking queries and terms for equality Add `SearchQuery.equal_to` method and virtual `Term.equal_to` method, overriding it as needed in subclasses. --- src/engine/api/geary-search-query.vala | 56 ++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala index 84f686b91..60c6b62f5 100644 --- a/src/engine/api/geary-search-query.vala +++ b/src/engine/api/geary-search-query.vala @@ -160,6 +160,14 @@ public abstract class Geary.SearchQuery : BaseObject { /** Determines opposite of the term is matched. */ public bool is_negated { get; set; default = false; } + /** Determines if this term is equal to another. */ + public virtual bool equal_to(Term other) { + return ( + this.is_negated == other.is_negated && + this.get_type() == other.get_type() + ); + } + /** Returns a string representation, for debugging. */ public abstract string to_string(); @@ -237,6 +245,27 @@ public abstract class Geary.SearchQuery : BaseObject { this.terms.add_all(terms); } + public override bool equal_to(Term other) { + if (this == other) { + return true; + } + if (!base.equal_to(other)) { + return false; + } + var text = (EmailTextTerm) other; + if (this.target != text.target || + this.matching_strategy != text.matching_strategy || + this.terms.size != text.terms.size) { + return false; + } + for (int i = 0; i < this.terms.size; i++) { + if (this.terms[i] != text.terms[i]) { + return false; + } + } + return true; + } + public override string to_string() { var builder = new GLib.StringBuilder(); if (this.is_negated) { @@ -283,6 +312,16 @@ public abstract class Geary.SearchQuery : BaseObject { this.value = value; } + public override bool equal_to(Term other) { + if (this == other) { + return true; + } + if (!base.equal_to(other)) { + return false; + } + return this.value.equal_to(((EmailFlagTerm) other).value); + } + public override string to_string() { return "%s(%s)".printf( this.is_negated ? "!" : "", @@ -317,6 +356,23 @@ public abstract class Geary.SearchQuery : BaseObject { this.raw = raw; } + /** Determines if this query's expression is equal to another's. */ + public bool equal_to(SearchQuery other) { + if (this == other) { + return true; + } + if (this.expression.size != other.expression.size) { + return false; + } + for (int i = 0; i < this.expression.size; i++) { + if (!this.expression[i].equal_to(other.expression[i])) { + return false; + } + } + return true; + } + + /** Returns a string representation of this query, for debugging. */ public string to_string() { var builder = new GLib.StringBuilder(); builder.append_printf("\"%s\": ", this.raw); -- GitLab From 4c911f5be735813601ca72ba5503a529089055e0 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 00:49:57 +1100 Subject: [PATCH 10/24] Geary.App.SearchFolder: Only update search if query is different This prevents re-running a search for e.g. whitespace-only changes. --- src/engine/app/app-search-folder.vala | 34 ++++++++++++++------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/engine/app/app-search-folder.vala b/src/engine/app/app-search-folder.vala index 0ad7552f2..981cb5a79 100644 --- a/src/engine/app/app-search-folder.vala +++ b/src/engine/app/app-search-folder.vala @@ -153,27 +153,29 @@ public class Geary.App.SearchFolder : */ public async void search(SearchQuery query, GLib.Cancellable? cancellable) throws GLib.Error { - int result_mutex_token = yield result_mutex.claim_async(); + if (this.query == null || !this.query.equal_to(query)) { + int result_mutex_token = yield result_mutex.claim_async(); - clear(); + clear(); - if (cancellable != null) { - GLib.Cancellable @internal = this.executing; - cancellable.cancelled.connect(() => { @internal.cancel(); }); - } + if (cancellable != null) { + GLib.Cancellable @internal = this.executing; + cancellable.cancelled.connect(() => { @internal.cancel(); }); + } - this.query = query; - GLib.Error? error = null; - try { - yield do_search_async(null, null, this.executing); - } catch(Error e) { - error = e; - } + this.query = query; + GLib.Error? error = null; + try { + yield do_search_async(null, null, this.executing); + } catch(Error e) { + error = e; + } - result_mutex.release(ref result_mutex_token); + result_mutex.release(ref result_mutex_token); - if (error != null) { - throw error; + if (error != null) { + throw error; + } } } -- GitLab From 9cb3a0d98cfa8696be225d22355900e5be535558 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 01:12:46 +1100 Subject: [PATCH 11/24] Geary.FtsSearchQuery: Fix build with vala ~ <= 0.48 --- .../common/common-fts-search-query.vala | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/engine/common/common-fts-search-query.vala b/src/engine/common/common-fts-search-query.vala index a24c16dc2..05cee30bf 100644 --- a/src/engine/common/common-fts-search-query.vala +++ b/src/engine/common/common-fts-search-query.vala @@ -22,7 +22,7 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { private unowned SnowBall.Stemmer stemmer; - public FtsSearchQuery(Gee.List expression, + public FtsSearchQuery(Gee.List expression, string raw, SnowBall.Stemmer stemmer) { base(expression, raw); @@ -33,10 +33,10 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { // (both here and further below in the class) - the Engine // controls the Term hierarchy the needed assumptions can // be made - if (term.get_type() == typeof(EmailTextTerm)) { + if (term.get_type() == typeof(SearchQuery.EmailTextTerm)) { // Pre-stem search terms up front since the stemmed // form is needed in a few different places - var text = (EmailTextTerm) term; + var text = (SearchQuery.EmailTextTerm) term; if (text.matching_strategy.is_stemming_enabled()) { stem_search_terms(text); } @@ -199,7 +199,7 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { * * Otherwise, the stem for the term is returned. */ - private void stem_search_terms(EmailTextTerm text) { + private void stem_search_terms(SearchQuery.EmailTextTerm text) { var stemmed_terms = new Gee.ArrayList(); foreach (var term in text.terms) { int term_length = term.length; @@ -286,15 +286,15 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { } private inline void sql_add_term_condition(GLib.StringBuilder sql, - Term term) { - if (term.get_type() == typeof(EmailTextTerm)) { - sql_add_email_text_term_conditions((EmailTextTerm) term, sql); - } else if (term.get_type() == typeof(EmailFlagTerm)) { + SearchQuery.Term term) { + if (term.get_type() == typeof(SearchQuery.EmailTextTerm)) { + sql_add_email_text_term_conditions((SearchQuery.EmailTextTerm) term, sql); + } else if (term.get_type() == typeof(SearchQuery.EmailFlagTerm)) { sql.append(" ({flags} : \"' || ? || '\")"); } } - private inline void sql_add_email_text_term_conditions(EmailTextTerm text, + private inline void sql_add_email_text_term_conditions(SearchQuery.EmailTextTerm text, GLib.StringBuilder sql) { var target = ""; switch (text.target) { @@ -363,13 +363,13 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { } private inline int sql_bind_term_condition(Db.Statement sql, - Term term, + SearchQuery.Term term, int index) throws Geary.DatabaseError { int next_index = index; var type = term.get_type(); - if (type == typeof(EmailTextTerm)) { - var text = (EmailTextTerm) term; + if (type == typeof(SearchQuery.EmailTextTerm)) { + var text = (SearchQuery.EmailTextTerm) term; var stemmed_terms = text.get_data>( EMAIL_TEXT_STEMMED_TERMS ); @@ -379,8 +379,8 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { sql.bind_string(next_index++, stemmed_terms[i]); } } - } else if (type == typeof(EmailFlagTerm)) { - var flag = (EmailFlagTerm) term; + } else if (type == typeof(SearchQuery.EmailFlagTerm)) { + var flag = (SearchQuery.EmailFlagTerm) term; sql.bind_string(next_index++, flag.value.serialise()); } return next_index; -- GitLab From 9bd2359464568c1fceb6ca4b000be8e5fe51719c Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 18:56:11 +1100 Subject: [PATCH 12/24] Geary.Account: Make new_search_query synchronous Constructing a new query should be fast, and it no longer needs to be done async, so remove async from the method and simplify callers. --- .../application/application-main-window.vala | 9 ++++----- .../conversation-viewer.vala | 17 ++++++----------- src/engine/api/geary-account.vala | 5 ++--- .../imap-engine-generic-account.vala | 5 ++--- test/mock/mock-account.vala | 7 +++---- 5 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/client/application/application-main-window.vala b/src/client/application/application-main-window.vala index c4f934314..18016410e 100644 --- a/src/client/application/application-main-window.vala +++ b/src/client/application/application-main-window.vala @@ -968,7 +968,7 @@ public class Application.MainWindow : return closed; } - internal async void start_search(string query_text, bool is_interactive) { + internal void start_search(string query_text, bool is_interactive) { var context = get_selected_account_context(); if (context != null) { // Stop any search in progress @@ -986,10 +986,9 @@ public class Application.MainWindow : this.application.config.get_search_strategy(), context.account.information ); - var query = yield context.account.new_search_query( + var query = context.account.new_search_query( expr_factory.parse_query(query_text), - query_text, - cancellable + query_text ); this.folder_list.set_search( this.application.engine, context.search @@ -2219,7 +2218,7 @@ public class Application.MainWindow : if (Geary.String.is_empty_or_whitespace(text)) { stop_search(true); } else { - this.start_search.begin(text, true); + this.start_search(text, true); } } diff --git a/src/client/conversation-viewer/conversation-viewer.vala b/src/client/conversation-viewer/conversation-viewer.vala index 09d250202..4d895db46 100644 --- a/src/client/conversation-viewer/conversation-viewer.vala +++ b/src/client/conversation-viewer/conversation-viewer.vala @@ -297,9 +297,7 @@ public class ConversationViewer : Gtk.Stack, Geary.BaseInterface { // Highlight matching terms from find if active, otherwise // from the search folder if that's where we are at - Geary.SearchQuery? query = yield get_find_search_query( - conversation.base_folder.account, null - ); + var query = get_find_search_query(conversation.base_folder.account); if (query == null) { var search_folder = conversation.base_folder as Geary.App.SearchFolder; if (search_folder != null) { @@ -406,9 +404,8 @@ public class ConversationViewer : Gtk.Stack, Geary.BaseInterface { }); this.find_cancellable = cancellable; try { - Geary.SearchQuery? query = yield get_find_search_query( - list.conversation.base_folder.account, - cancellable + var query = get_find_search_query( + list.conversation.base_folder.account ); if (query != null) { yield list.search.highlight_matching_email(query, true); @@ -419,8 +416,7 @@ public class ConversationViewer : Gtk.Stack, Geary.BaseInterface { } } - private async Geary.SearchQuery? get_find_search_query(Geary.Account account, - GLib.Cancellable? cancellable) + private Geary.SearchQuery? get_find_search_query(Geary.Account account) throws GLib.Error { Geary.SearchQuery? query = null; if (this.conversation_find_bar.get_search_mode()) { @@ -433,10 +429,9 @@ public class ConversationViewer : Gtk.Stack, Geary.BaseInterface { this.config.get_search_strategy(), account.information ); - query = yield account.new_search_query( + query = account.new_search_query( expr_factory.parse_query(text), - text, - cancellable + text ); } } diff --git a/src/engine/api/geary-account.vala b/src/engine/api/geary-account.vala index 6bba6b90c..bc887459b 100644 --- a/src/engine/api/geary-account.vala +++ b/src/engine/api/geary-account.vala @@ -514,10 +514,9 @@ public abstract class Geary.Account : BaseObject, Logging.Source { /** * Create a new search query for this account. */ - public abstract async SearchQuery new_search_query( + public abstract SearchQuery new_search_query( Gee.List expression, - string text, - GLib.Cancellable? cancellable + string text ) throws GLib.Error; /** diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala index 68ed4efcf..c3ae26038 100644 --- a/src/engine/imap-engine/imap-engine-generic-account.vala +++ b/src/engine/imap-engine/imap-engine-generic-account.vala @@ -575,10 +575,9 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.Account { } /** {@inheritDoc} */ - public override async SearchQuery new_search_query( + public override SearchQuery new_search_query( Gee.List expression, - string text, - GLib.Cancellable? cancellable + string text ) throws GLib.Error { return new FtsSearchQuery(expression, text, this.stemmer); } diff --git a/test/mock/mock-account.vala b/test/mock/mock-account.vala index 2d08314d0..f617f9693 100644 --- a/test/mock/mock-account.vala +++ b/test/mock/mock-account.vala @@ -222,12 +222,11 @@ public class Mock.Account : Geary.Account, ); } - public override async Geary.SearchQuery new_search_query( + public override Geary.SearchQuery new_search_query( Gee.List expression, - string raw, - GLib.Cancellable? cancellable + string text ) throws GLib.Error { - return new SearchQuery(expression, raw); + return new SearchQuery(expression, text); } public override async Gee.Collection? -- GitLab From f56070727116bf37afcc6b028d576235b1ef03e8 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 22:52:59 +1100 Subject: [PATCH 13/24] Geary.FtsSearchQuery: Fixes for email text disjunctions Ensure OR is actually used to separate disjuncts, don't prefix match non-stemmed terms when a stemmed version exists or when EXACT is specified. Ensure column is correctly specified per disjunct. --- .../common/common-fts-search-query.vala | 22 ++++-- .../common/common-fts-search-query-test.vala | 69 +++++++++++-------- 2 files changed, 58 insertions(+), 33 deletions(-) diff --git a/src/engine/common/common-fts-search-query.vala b/src/engine/common/common-fts-search-query.vala index 05cee30bf..aef2e2df3 100644 --- a/src/engine/common/common-fts-search-query.vala +++ b/src/engine/common/common-fts-search-query.vala @@ -324,23 +324,33 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { break; } + sql.append(" ("); + var values = text.terms; var stemmed_values = text.get_data>( EMAIL_TEXT_STEMMED_TERMS ); + var is_first_disjunct = true; for (int i = 0; i < values.size; i++) { + if (!is_first_disjunct) { + sql.append(" OR"); + } if (target != "") { - sql.append_printf(" ({%s} :", target); + sql.append_printf("{%s} :", target); } if (stemmed_values != null && stemmed_values[i] != null) { - sql.append(" \"' || ? || '\"* OR \"' || ? || '\"*"); - } else { + // Original is not a prefix match, stemmed is + sql.append(" \"' || ? || '\" OR \"' || ? || '\"*"); + } else if (text.matching_strategy != EXACT) { + // A regular match, do a suffix match sql.append(" \"' || ? || '\"*"); + } else { + // EXACT is not a prefix match + sql.append(" \"' || ? || '\""); } - if (target != "") { - sql.append_c(')'); - } + is_first_disjunct = false; } + sql.append_c(')'); } private int sql_bind_term_conditions(Db.Statement sql, diff --git a/test/engine/common/common-fts-search-query-test.vala b/test/engine/common/common-fts-search-query-test.vala index b575f5434..790f89954 100644 --- a/test/engine/common/common-fts-search-query-test.vala +++ b/test/engine/common/common-fts-search-query-test.vala @@ -19,8 +19,8 @@ public class Geary.FtsSearchQueryTest : TestCase { add_test("email_text_terms", email_text_terms); add_test("email_text_terms_stemmed", email_text_terms_stemmed); add_test("email_text_terms_specific", email_text_terms_specific); + add_test("email_text_terms_disjunction", email_text_terms_disjunction); add_test("email_flag_terms", email_flag_terms); - add_test("excluded_folders", excluded_folders); } public override void set_up() throws GLib.Error { @@ -156,6 +156,28 @@ public class Geary.FtsSearchQueryTest : TestCase { assert_queries(conflicting_property_and_term); } + public void email_text_terms_disjunction() throws GLib.Error { + var multiple_all = new_search_query( + { + new Geary.SearchQuery.EmailTextTerm.disjunction( + ALL, EXACT, new Gee.ArrayList.wrap({ "foo", "bar" }) + ) + }, + "(foo|bar)" + ); + assert_queries(multiple_all); + + var multiple_subject = new_search_query( + { + new Geary.SearchQuery.EmailTextTerm.disjunction( + ALL, EXACT, new Gee.ArrayList.wrap({ "foo", "bar" }) + ) + }, + "subject:(foo|bar)" + ); + assert_queries(multiple_subject); + } + public void email_flag_terms() throws GLib.Error { var unread = new_search_query( { new Geary.SearchQuery.EmailFlagTerm(Geary.EmailFlags.UNREAD)}, @@ -175,11 +197,26 @@ public class Geary.FtsSearchQueryTest : TestCase { assert_queries(flagged); } - public void excluded_folders() throws GLib.Error { - var query = new_search_query( - { new Geary.SearchQuery.EmailTextTerm(ALL, EXACT, "test")}, - "test" + private FtsSearchQuery new_search_query(Geary.SearchQuery.Term[] ops, + string raw) + throws GLib.Error { + return new FtsSearchQuery( + new Gee.ArrayList.wrap(ops), + raw, + this.stemmer + ); + } + + private void assert_queries(FtsSearchQuery query) throws GLib.Error { + var search = query.get_search_query( + this.account.db.get_primary_connection(), + null, + null, + false, + 10, + 0 ); + search.exec(null); var search_with_excluded_ids = query.get_search_query( this.account.db.get_primary_connection(), @@ -210,28 +247,6 @@ public class Geary.FtsSearchQueryTest : TestCase { 0 ); search_with_both.exec(null); - } - - private FtsSearchQuery new_search_query(Geary.SearchQuery.Term[] ops, - string raw) - throws GLib.Error { - return new FtsSearchQuery( - new Gee.ArrayList.wrap(ops), - raw, - this.stemmer - ); - } - - private void assert_queries(FtsSearchQuery query) throws GLib.Error { - var search = query.get_search_query( - this.account.db.get_primary_connection(), - null, - null, - false, - 10, - 0 - ); - search.exec(null); var match = query.get_match_query( this.account.db.get_primary_connection(), -- GitLab From bb02e157c62e046627090b83787b7319208161da Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 5 Nov 2020 23:03:45 +1100 Subject: [PATCH 14/24] Geary.App.SearchFolder: Substantial implementation rework This does the following: * Makes updating the search query a non-async call, so that apps can call it and forget about it * Makes updating search results only update the folder's contents when finished, and not if cancelled * Cancels any existing search tasks if the query is updated * Swaps sending removed signals before inserted signals to make the conversation viewer happier --- .../application/application-controller.vala | 2 +- .../application/application-main-window.vala | 17 +- src/engine/api/geary-search-query.vala | 2 +- src/engine/app/app-search-folder.vala | 346 ++++++++++-------- 4 files changed, 199 insertions(+), 168 deletions(-) diff --git a/src/client/application/application-controller.vala b/src/client/application/application-controller.vala index 2237aa618..6aec77acf 100644 --- a/src/client/application/application-controller.vala +++ b/src/client/application/application-controller.vala @@ -1088,7 +1088,7 @@ internal class Application.Controller : update_account_status(); // Stop any background processes - context.search.clear(); + context.search.clear_query(); context.contacts.close(); context.cancellable.cancel(); diff --git a/src/client/application/application-main-window.vala b/src/client/application/application-main-window.vala index 18016410e..d0f0ae21b 100644 --- a/src/client/application/application-main-window.vala +++ b/src/client/application/application-main-window.vala @@ -323,7 +323,6 @@ public class Application.MainWindow : private GLib.Cancellable action_update_cancellable = new GLib.Cancellable(); private GLib.Cancellable folder_open = new GLib.Cancellable(); - private GLib.Cancellable search_open = new GLib.Cancellable(); private Geary.TimeoutManager update_ui_timeout; private int64 update_ui_last = 0; @@ -971,10 +970,8 @@ public class Application.MainWindow : internal void start_search(string query_text, bool is_interactive) { var context = get_selected_account_context(); if (context != null) { - // Stop any search in progress - this.search_open.cancel(); - var cancellable = this.search_open = new GLib.Cancellable(); - + // If the current folder is not the search folder, save it + // so it can be re-selected later when search is closed if (this.previous_non_search_folder == null && this.selected_folder != null && this.selected_folder.used_as != SEARCH) { @@ -993,7 +990,7 @@ public class Application.MainWindow : this.folder_list.set_search( this.application.engine, context.search ); - yield context.search.search(query, cancellable); + context.search.update_query(query); } catch (GLib.Error error) { handle_error(context.account.information, error); } @@ -1001,10 +998,8 @@ public class Application.MainWindow : } internal void stop_search(bool is_interactive) { - // Stop any search in progress - this.search_open.cancel(); - this.search_open = new GLib.Cancellable(); - + // If the search folder is current selected, deselect and + // re-select any previously selected folder if (this.selected_folder == null || this.selected_folder.used_as == SEARCH) { var to_select = this.previous_non_search_folder; @@ -1026,7 +1021,7 @@ public class Application.MainWindow : this.folder_list.remove_search(); foreach (var context in this.controller.get_account_contexts()) { - context.search.clear(); + context.search.clear_query(); } } diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala index 60c6b62f5..62f7416d7 100644 --- a/src/engine/api/geary-search-query.vala +++ b/src/engine/api/geary-search-query.vala @@ -26,7 +26,7 @@ * @see Account.new_search_query * @see Account.local_search_async * @see Account.get_search_matches_async - * @see App.SearchFolder.search + * @see App.SearchFolder.update_query */ public abstract class Geary.SearchQuery : BaseObject { diff --git a/src/engine/app/app-search-folder.vala b/src/engine/app/app-search-folder.vala index 981cb5a79..a8fa9404e 100644 --- a/src/engine/app/app-search-folder.vala +++ b/src/engine/app/app-search-folder.vala @@ -110,10 +110,10 @@ public class Geary.App.SearchFolder : new Gee.HashSet(); // The email present in the folder, sorted - private Gee.TreeSet contents; + private Gee.SortedSet entries; // Map of engine ids to search ids - private Gee.Map id_map; + private Gee.Map ids; private Nonblocking.Mutex result_mutex = new Nonblocking.Mutex(); @@ -131,7 +131,8 @@ public class Geary.App.SearchFolder : account.email_removed.connect(on_account_email_removed); account.email_locally_removed.connect(on_account_email_removed); - new_contents(); + this.entries = new_entry_set(); + this.ids = new_id_map(); // Always exclude emails that don't live anywhere from search // results. @@ -147,53 +148,40 @@ public class Geary.App.SearchFolder : } /** - * Executes the given query over the account's local email. + * Sets the current search query for the folder. * - * Calling this will block until the search is complete. + * Calling this method will start the search folder asynchronously + * in the background. If the given query is not equal to the + * existing query, the folder's contents will be updated to + * reflect the changed query. */ - public async void search(SearchQuery query, GLib.Cancellable? cancellable) - throws GLib.Error { + public void update_query(SearchQuery query) { if (this.query == null || !this.query.equal_to(query)) { - int result_mutex_token = yield result_mutex.claim_async(); - - clear(); - - if (cancellable != null) { - GLib.Cancellable @internal = this.executing; - cancellable.cancelled.connect(() => { @internal.cancel(); }); - } + this.executing.cancel(); + this.executing = new GLib.Cancellable(); this.query = query; - GLib.Error? error = null; - try { - yield do_search_async(null, null, this.executing); - } catch(Error e) { - error = e; - } - - result_mutex.release(ref result_mutex_token); - - if (error != null) { - throw error; - } + this.update.begin(); } } /** * Cancels and clears the search query and results. * - * The {@link query} property will be cleared. + * The {@link query} property will be set to null. */ - public void clear() { + public void clear_query() { this.executing.cancel(); this.executing = new GLib.Cancellable(); - var old_ids = this.id_map; - new_contents(); + this.query = null; + var old_ids = this.ids; + + this.entries = new_entry_set(); + this.ids = new_id_map(); + notify_email_removed(old_ids.keys); notify_email_count_changed(0, REMOVED); - - this.query = null; } /** @@ -220,10 +208,16 @@ public class Geary.App.SearchFolder : Gee.Collection ids, GLib.Cancellable? cancellable = null) throws GLib.Error { + debug("Waiting for checking contains"); + int result_mutex_token = yield this.result_mutex.claim_async(cancellable); + var existing_ids = this.ids; + result_mutex.release(ref result_mutex_token); + + debug("Checking contains"); return Geary.traverse( ids ).filter( - (id) => this.id_map.has_key(id) + (id) => existing_ids.has_key(id) ).to_hash_set(); } @@ -234,17 +228,22 @@ public class Geary.App.SearchFolder : Folder.ListFlags flags, Cancellable? cancellable = null ) throws GLib.Error { - int result_mutex_token = yield result_mutex.claim_async(); + debug("Waiting to list email"); + int result_mutex_token = yield this.result_mutex.claim_async(cancellable); + var existing_entries = this.entries; + var existing_ids = this.ids; + result_mutex.release(ref result_mutex_token); + debug("Listing email"); var engine_ids = new Gee.LinkedList(); if (Folder.ListFlags.OLDEST_TO_NEWEST in flags) { EmailEntry? oldest = null; - if (!this.contents.is_empty) { + if (!existing_entries.is_empty) { if (initial_id == null) { - oldest = this.contents.last(); + oldest = existing_entries.last(); } else { - oldest = this.id_map.get(initial_id); + oldest = existing_ids.get(initial_id); if (oldest == null) { throw new EngineError.NOT_FOUND( @@ -253,13 +252,13 @@ public class Geary.App.SearchFolder : } if (!(Folder.ListFlags.INCLUDING_ID in flags)) { - oldest = contents.higher(oldest); + oldest = existing_entries.higher(oldest); } } } if (oldest != null) { var iter = ( - this.contents.iterator_at(oldest) as + existing_entries.iterator_at(oldest) as Gee.BidirIterator ); engine_ids.add(oldest.id); @@ -270,11 +269,11 @@ public class Geary.App.SearchFolder : } else { // Newest to oldest EmailEntry? newest = null; - if (!this.contents.is_empty) { + if (!existing_entries.is_empty) { if (initial_id == null) { - newest = this.contents.first(); + newest = existing_entries.first(); } else { - newest = this.id_map.get(initial_id); + newest = existing_ids.get(initial_id); if (newest == null) { throw new EngineError.NOT_FOUND( @@ -283,13 +282,13 @@ public class Geary.App.SearchFolder : } if (!(Folder.ListFlags.INCLUDING_ID in flags)) { - newest = contents.lower(newest); + newest = existing_entries.lower(newest); } } } if (newest != null) { var iter = ( - this.contents.iterator_at(newest) as + existing_entries.iterator_at(newest) as Gee.BidirIterator ); engine_ids.add(newest.id); @@ -313,8 +312,6 @@ public class Geary.App.SearchFolder : } } - result_mutex.release(ref result_mutex_token); - if (list_error != null) { throw list_error; } @@ -392,7 +389,7 @@ public class Geary.App.SearchFolder : private void require_id(EmailIdentifier id) throws EngineError.NOT_FOUND { - if (!this.id_map.has_key(id)) { + if (!this.ids.has_key(id)) { throw new EngineError.NOT_FOUND( "Id not found: %s", id.to_string() ); @@ -403,17 +400,114 @@ public class Geary.App.SearchFolder : Gee.Collection to_check ) { var available = new Gee.LinkedList(); - var id_map = this.id_map; + var ids = this.ids; var iter = to_check.iterator(); while (iter.next()) { var id = iter.get(); - if (id_map.has_key(id)) { + if (ids.has_key(id)) { available.add(id); } } return available; } + private async void append(Folder folder, + Gee.Collection ids) { + // Grab the cancellable before the lock so that if the current + // search is cancelled while waiting, this doesn't go and try + // to update the new results. + var cancellable = this.executing; + + debug("Waiting to append to search results"); + try { + int result_mutex_token = yield this.result_mutex.claim_async( + cancellable + ); + try { + if (!this.exclude_folders.contains(folder.path)) { + yield do_search_async(ids, null, cancellable); + } + } catch (GLib.Error error) { + this.account.report_problem( + new AccountProblemReport(this.account.information, error) + ); + } + + this.result_mutex.release(ref result_mutex_token); + } catch (GLib.IOError.CANCELLED mutex_err) { + // all good + } catch (GLib.Error mutex_err) { + warning("Error acquiring lock: %s", mutex_err.message); + } + } + + private async void update() { + // Grab the cancellable before the lock so that if the current + // search is cancelled while waiting, this doesn't go and try + // to update the new results. + var cancellable = this.executing; + + debug("Waiting to update search results"); + try { + int result_mutex_token = yield this.result_mutex.claim_async( + cancellable + ); + try { + yield do_search_async(null, null, cancellable); + } catch (GLib.Error error) { + this.account.report_problem( + new AccountProblemReport(this.account.information, error) + ); + } + + this.result_mutex.release(ref result_mutex_token); + } catch (GLib.IOError.CANCELLED mutex_err) { + // all good + } catch (GLib.Error mutex_err) { + warning("Error acquiring lock: %s", mutex_err.message); + } + } + + private async void remove(Folder folder, + Gee.Collection ids) { + + // Grab the cancellable before the lock so that if the current + // search is cancelled while waiting, this doesn't go and try + // to update the new results. + var cancellable = this.executing; + + debug("Waiting to remove from search results"); + try { + int result_mutex_token = yield this.result_mutex.claim_async( + cancellable + ); + + // Ensure this happens inside the lock so it is working with + // up-to-date data + var id_map = this.ids; + var relevant_ids = ( + traverse(ids) + .filter(id => id_map.has_key(id)) + .to_linked_list() + ); + if (relevant_ids.size > 0) { + try { + yield do_search_async(null, relevant_ids, cancellable); + } catch (GLib.Error error) { + this.account.report_problem( + new AccountProblemReport(this.account.information, error) + ); + } + } + + this.result_mutex.release(ref result_mutex_token); + } catch (GLib.IOError.CANCELLED mutex_err) { + // all good + } catch (GLib.Error mutex_err) { + warning("Error acquiring lock: %s", mutex_err.message); + } + } + // NOTE: you must call this ONLY after locking result_mutex_token. // If both *_ids parameters are null, the results of this search are // considered to be the full new set. If non-null, the results are @@ -424,11 +518,15 @@ public class Geary.App.SearchFolder : Gee.Collection? remove_ids, GLib.Cancellable? cancellable) throws GLib.Error { - var id_map = this.id_map; - var contents = this.contents; + debug("Processing search results"); + var entries = new_entry_set(); + var ids = new_id_map(); var added = new Gee.LinkedList(); var removed = new Gee.LinkedList(); + entries.add_all(this.entries); + ids.set_all(this.ids); + if (remove_ids == null) { // Adding email to the search, either searching all local // email if to_add is null, or adding only a matching @@ -465,24 +563,24 @@ public class Geary.App.SearchFolder : var hashed_results = new Gee.HashSet(); hashed_results.add_all(id_results); - var existing = id_map.map_iterator(); + var existing = ids.map_iterator(); while (existing.next()) { if (!hashed_results.contains(existing.get_key())) { var entry = existing.get_value(); existing.unset(); - contents.remove(entry); + entries.remove(entry); removed.add(entry.id); } } } foreach (var email in email_results) { - if (!id_map.has_key(email.id)) { + if (!ids.has_key(email.id)) { var entry = new EmailEntry( email.id, email.properties.date_received ); - contents.add(entry); - id_map.set(email.id, entry); + entries.add(entry); + ids.set(email.id, entry); added.add(email.id); } } @@ -491,89 +589,53 @@ public class Geary.App.SearchFolder : // Removing email, can just remove them directly foreach (var id in remove_ids) { EmailEntry entry; - if (id_map.unset(id, out entry)) { - contents.remove(entry); + if (ids.unset(id, out entry)) { + entries.remove(entry); removed.add(id); } } } - this._properties.set_total(this.contents.size); - - // Note that we probably shouldn't be firing these signals from inside - // our mutex lock. Keep an eye on it, and if there's ever a case where - // it might cause problems, it shouldn't be too hard to move the - // firings outside. - - Folder.CountChangeReason reason = CountChangeReason.NONE; - if (added.size > 0) { - // TODO: we'd like to be able to use APPENDED here when applicable, - // but because of the potential to append a thousand results at - // once and the ConversationMonitor's inability to handle that - // gracefully (#7464), we always use INSERTED for now. - notify_email_inserted(added); - reason |= Folder.CountChangeReason.INSERTED; - } - if (removed.size > 0) { - notify_email_removed(removed); - reason |= Folder.CountChangeReason.REMOVED; - } - if (reason != CountChangeReason.NONE) - notify_email_count_changed(this.contents.size, reason); - } - - private async void do_append(Folder folder, - Gee.Collection ids, - GLib.Cancellable? cancellable) - throws GLib.Error { - int result_mutex_token = yield result_mutex.claim_async(); - - GLib.Error? error = null; - try { - if (!this.exclude_folders.contains(folder.path)) { - yield do_search_async(ids, null, cancellable); - } - } catch (GLib.Error e) { - error = e; - } - - result_mutex.release(ref result_mutex_token); + if (!cancellable.is_cancelled()) { + this.entries = entries; + this.ids = ids; - if (error != null) - throw error; - } + this._properties.set_total(entries.size); - private async void do_remove(Folder folder, - Gee.Collection ids, - GLib.Cancellable? cancellable) - throws GLib.Error { - int result_mutex_token = yield result_mutex.claim_async(); + // Note that we probably shouldn't be firing these signals from inside + // our mutex lock. Keep an eye on it, and if there's ever a case where + // it might cause problems, it shouldn't be too hard to move the + // firings outside. - GLib.Error? error = null; - try { - var id_map = this.id_map; - var relevant_ids = ( - traverse(ids) - .filter(id => id_map.has_key(id)) - .to_linked_list() - ); - - if (relevant_ids.size > 0) { - yield do_search_async(null, relevant_ids, cancellable); + Folder.CountChangeReason reason = CountChangeReason.NONE; + if (removed.size > 0) { + notify_email_removed(removed); + reason |= Folder.CountChangeReason.REMOVED; + } + if (added.size > 0) { + // TODO: we'd like to be able to use APPENDED here + // when applicable, but because of the potential to + // append a thousand results at once and the + // ConversationMonitor's inability to handle that + // gracefully (#7464), we always use INSERTED for now. + notify_email_inserted(added); + reason |= Folder.CountChangeReason.INSERTED; } - } catch (GLib.Error e) { - error = e; + if (reason != CountChangeReason.NONE) { + notify_email_count_changed(this.entries.size, reason); + } + debug("Processing done, entries/ids: %d/%d", entries.size, ids.size); + } else { + debug("Processing cancelled, dropping entries/ids: %d/%d", entries.size, ids.size); } + } - result_mutex.release(ref result_mutex_token); - - if (error != null) - throw error; + private inline Gee.SortedSet new_entry_set() { + return new Gee.TreeSet(EmailEntry.compare_to); } - private inline void new_contents() { - this.contents = new Gee.TreeSet(EmailEntry.compare_to); - this.id_map = new Gee.HashMap(); + private inline Gee.Map new_id_map() { + return new Gee.HashMap(); } private void include_folder(Folder folder) { @@ -613,40 +675,14 @@ public class Geary.App.SearchFolder : private void on_email_locally_complete(Folder folder, Gee.Collection ids) { if (this.query != null) { - this.do_append.begin( - folder, ids, null, - (obj, res) => { - try { - this.do_append.end(res); - } catch (GLib.Error error) { - this.account.report_problem( - new AccountProblemReport( - this.account.information, error - ) - ); - } - } - ); + this.append.begin(folder, ids); } } private void on_account_email_removed(Folder folder, Gee.Collection ids) { if (this.query != null) { - this.do_remove.begin( - folder, ids, null, - (obj, res) => { - try { - this.do_remove.end(res); - } catch (GLib.Error error) { - this.account.report_problem( - new AccountProblemReport( - this.account.information, error - ) - ); - } - } - ); + this.remove.begin(folder, ids); } } -- GitLab From 90711f234e3b780a5cb4b9544fd04e9d51020f3b Mon Sep 17 00:00:00 2001 From: Douglas Fuller Date: Tue, 6 Oct 2020 17:50:52 -0400 Subject: [PATCH 15/24] Geary.App.FillWindowOperation: detect stale FillWindowOperations When a user types in the search box, there may still be oustanding FillWindowOperations queued on previous instances of SearchFolder from previous keystrokes. This can result in a FillWindowOperation with a stale value of ConversationMonitor.window_lowest from a previous search executing on the current one. Detect this and return immediately. Fixes: #838 --- .../app-fill-window-operation.vala | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/engine/app/conversation-monitor/app-fill-window-operation.vala b/src/engine/app/conversation-monitor/app-fill-window-operation.vala index 9f34797b1..a7ca8b8cb 100644 --- a/src/engine/app/conversation-monitor/app-fill-window-operation.vala +++ b/src/engine/app/conversation-monitor/app-fill-window-operation.vala @@ -37,9 +37,16 @@ private class Geary.App.FillWindowOperation : ConversationOperation { num_to_load = MAX_FILL_COUNT; } - int loaded = yield this.monitor.load_by_id_async( - this.monitor.window_lowest, num_to_load, LOCAL_ONLY - ); + int loaded = 0; + + try { + loaded = yield this.monitor.load_by_id_async( + this.monitor.window_lowest, num_to_load, LOCAL_ONLY + ); + } catch (EngineError.NOT_FOUND err) { + debug("Stale FillWindowOperation: %s", err.message); + return; + } debug( "Filled %d of %d locally, window: %d, total: %d", @@ -61,9 +68,14 @@ private class Geary.App.FillWindowOperation : ConversationOperation { // Load the max amount if going to the trouble of talking // to the remote. num_to_load = MAX_FILL_COUNT; - loaded = yield this.monitor.load_by_id_async( - this.monitor.window_lowest, num_to_load, FORCE_UPDATE - ); + try { + loaded = yield this.monitor.load_by_id_async( + this.monitor.window_lowest, num_to_load, FORCE_UPDATE + ); + } catch (EngineError.NOT_FOUND err) { + debug("Stale FillWindowOperation: %s", err.message); + return; + } debug( "Filled %d of %d from the remote, window: %d, total: %d", -- GitLab From 7e381982872d74b5b135c183ee9188586bb83645 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Fri, 13 Nov 2020 08:41:08 +1100 Subject: [PATCH 16/24] ImapDb.Database: Register new ICU-based tokeniser for FTS The SQLite tokeniser does not deal with scripts that do not use spaces for word breaking (CJK, Thai, etc), thus searching in those languages does not work well. This adds a custom SQLite tokeniser based on ICU that breaks words for all languages supported by that library, and uses NFKC_Casefold normalisation to handle normalisation, case folding, and dropping of ignorable characters. Fixes #121 --- .gitlab-ci.yml | 8 +- BUILDING.md | 9 +- meson.build | 10 + sql/version-030.sql | 2 +- src/engine/imap-db/imap-db-database.vala | 10 +- src/engine/imap-db/imap-db-fts5-tokeniser.c | 275 ++++++++++++++++++++ src/engine/meson.build | 24 +- 7 files changed, 325 insertions(+), 13 deletions(-) create mode 100644 src/engine/imap-db/imap-db-fts5-tokeniser.c diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 80be0f435..2b02d93e3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,7 @@ variables: meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool - libappstream-glib-devel libgee-devel libhandy1-devel + libappstream-glib-devel libgee-devel libhandy1-devel libicu-devel libpeas-devel libsecret-devel libstemmer-devel libunwind-devel libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz @@ -37,9 +37,9 @@ variables: itstool libappstream-glib-dev libenchant-2-dev libfolks-dev libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev - libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev - libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev - libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev + libhandy-1-dev libicu-dev libjson-glib-dev libmessaging-menu-dev + libpeas-dev libsecret-1-dev libsqlite3-dev libstemmer-dev + libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb fedora: diff --git a/BUILDING.md b/BUILDING.md index f8ca45ae3..f63b10414 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -93,8 +93,9 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \ gnome-online-accounts-devel gspell-devel gsound-devel \ gtk3-devel iso-codes-devel itstool json-glib-devel \ libappstream-glib-devel libgee-devel libhandy1-devel \ - libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \ - libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel + libpeas-devel libsecret-devel libicu-devel libstemmer-devel \ + libunwind-devel libxml2-devel libytnef-devel sqlite-devel \ + webkitgtk4-devel ``` Installing dependencies on Ubuntu/Debian @@ -108,8 +109,8 @@ sudo apt-get install meson build-essential valac \ libappstream-glib-dev libenchant-2-dev libfolks-dev \ libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \ libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \ - libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \ - libsqlite3-dev libstemmer-dev libunwind-dev \ + libjson-glib-dev libhandy-1-dev libicu-dev libpeas-dev \ + libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev \ libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev ``` diff --git a/meson.build b/meson.build index ce279f555..325849953 100644 --- a/meson.build +++ b/meson.build @@ -85,6 +85,7 @@ goa = dependency('goa-1.0') gsound = dependency('gsound') gspell = dependency('gspell-1') gthread = dependency('gthread-2.0', version: '>=' + target_glib) +icu_uc = dependency('icu-uc', version: '>=60') iso_codes = dependency('iso-codes') javascriptcoregtk = dependency('javascriptcoregtk-4.0', version: '>=' + target_webkit) json_glib = dependency('json-glib-1.0', version: '>= 1.0') @@ -130,6 +131,15 @@ libstemmer = declare_dependency( ], ) +# Faux ICU dependency to prevent ICU being passed to valac as a +# package by meson +icu = declare_dependency( + dependencies: [ + cc.find_library('icuuc'), + cc.find_library('icudata'), + ], +) + # Optional dependencies appstream_util = find_program('appstream-util', required: false) desktop_file_validate = find_program('desktop-file-validate', required: false) diff --git a/sql/version-030.sql b/sql/version-030.sql index 48af04df6..4fbde30d7 100644 --- a/sql/version-030.sql +++ b/sql/version-030.sql @@ -14,6 +14,6 @@ CREATE VIRTUAL TABLE MessageSearchTable USING fts5( bcc, flags, - tokenize="unicode61 remove_diacritics 2", + tokenize="geary_tokeniser", prefix="2,4,6,8,10" ) diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala index 9365f8761..45286f9b0 100644 --- a/src/engine/imap-db/imap-db-database.vala +++ b/src/engine/imap-db/imap-db-database.vala @@ -7,6 +7,7 @@ [CCode (cname = "g_utf8_collate_key")] extern string utf8_collate_key(string data, ssize_t len); +extern int sqlite3_register_fts5_tokeniser(Sqlite.Database db); extern int sqlite3_register_fts5_matches(Sqlite.Database db); extern int sqlite3_register_legacy_tokenizer(Sqlite.Database db); @@ -630,8 +631,13 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase { sqlite3_register_legacy_tokenizer(cx.db); } - // Register custom `geary_matches()` FTS5 function to obtain - // matching tokens from FTS queries. + // Register custom FTS5 tokeniser that uses ICU to correctly + // segment at both Latin and on-Latin (e.g. CJK, Thai) word + // boundaries. + sqlite3_register_fts5_tokeniser(cx.db); + + // Register custom `geary_matches()` FTS5 function that + // obtains matching tokens from FTS queries. sqlite3_register_fts5_matches(cx.db); if (cx.db.create_function( diff --git a/src/engine/imap-db/imap-db-fts5-tokeniser.c b/src/engine/imap-db/imap-db-fts5-tokeniser.c new file mode 100644 index 000000000..2991a56e1 --- /dev/null +++ b/src/engine/imap-db/imap-db-fts5-tokeniser.c @@ -0,0 +1,275 @@ +/* + * Copyright © 2020 Michael Gratton + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +#include +SQLITE_EXTENSION_INIT1 + +#include +#include +#include +#include +#include +#include "unicode/utf.h" +#include "unicode/utypes.h" + +// Full text search tokeniser for SQLite. This exists since SQLite's +// existing Unicode tokeniser doesn't work with languages that don't +// use spaces as word boundaries. +// +// When generating tokens, the follow process is applied to text using +// the ICU library: +// +// 1. ICU NFKC_Casefold normalisation, handles normalisation, case +// folding and removal of ignorable characters such as accents. +// +// 2. ICU word-boundary tokenisation, splits both on words at spaces +// and other punctuation, and also using a dictionary lookup for +// languages that do not use spaces (CJK, Thai, etc) +// +// Note: Since SQLite is single-threaded, it's safe to use single +// instances of ICU services for all calls for a single tokeniser. + +#define NORM_BUF_LEN 8 +#define TOKEN_BUF_LEN 8 + +typedef struct { + // Singleton object, threadsafe, does not need to be deleted. + const UNormalizer2 * norm; + + // Stateful object, not threadsafe, must be deleted. + UBreakIterator *iter; +} IcuTokeniser; + + +static int icu_create(void *context, + const char **args, + int n_args, + Fts5Tokenizer **ret) { + const UNormalizer2 *norm; + UBreakIterator *iter; + IcuTokeniser *tokeniser; + UErrorCode err = U_ZERO_ERROR; + + norm = unorm2_getNFKCCasefoldInstance(&err); + if (U_FAILURE(err)) { + g_warning("Error constructing ICU normaliser: %s", u_errorName(err)); + return SQLITE_ABORT; + } + + // The given locale doesn't matter here since it ICU doesn't + // (currently) use different rules for different word breaking + // languages that uses spaces as word boundaries, and uses + // dictionary look-ups for CJK and other scripts that don't. + iter = ubrk_open(UBRK_WORD, "en", NULL, 0, &err); + if (U_FAILURE(err)) { + g_warning("Error constructing ICU word-breaker: %s", u_errorName(err)); + ubrk_close(tokeniser->iter); + return SQLITE_ABORT; + } + + tokeniser = g_new0(IcuTokeniser, 1); + tokeniser->norm = norm; + tokeniser->iter = iter; + *ret = (Fts5Tokenizer *) tokeniser; + + return SQLITE_OK; +} + +static void icu_delete(Fts5Tokenizer *fts5_tokeniser) { + IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser; + + ubrk_close(tokeniser->iter); + g_free(tokeniser); +} + +static int icu_tokenise(Fts5Tokenizer *fts5_tokeniser, + void *context, + int flags, + const char *chars, + int32_t chars_len, + int (*token_callback)(void*, int, const char*, int, int, int)) { + int ret = SQLITE_OK; + IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser; + UErrorCode err = U_ZERO_ERROR; + + const UNormalizer2 *norm = tokeniser->norm; + GArray *wide_chars = NULL; + GArray *wide_offsets = NULL; + UChar *wide_data = NULL; + gsize wide_data_len_long = 0; + int32_t wide_data_len = 0; + + UChar norm_buf[NORM_BUF_LEN] = {0}; + + UBreakIterator *iter = tokeniser->iter; + int32_t start_index, current_index = 0; + char *token_buf = NULL; + int32_t token_buf_len = NORM_BUF_LEN; + + // Normalisation. + // + // SQLite needs the byte-index of tokens found in the chars, but + // ICU doesn't support UTF-8-based normalisation. So convert UTF-8 + // input to UTF-16 char-by-char and record the byte offsets for + // each, so that when converting back to UTF-8 the byte offsets + // can be determined. + + wide_chars = g_array_sized_new(FALSE, FALSE, sizeof(UChar), chars_len); + wide_offsets = g_array_sized_new(FALSE, FALSE, sizeof(int32_t), chars_len); + + for (int32_t byte_offset = 0; byte_offset < chars_len;) { + UChar wide_char; + int32_t norm_len; + int32_t start_byte_offset = byte_offset; + + U8_NEXT_OR_FFFD(chars, byte_offset, chars_len, wide_char); + norm_len = unorm2_normalize(norm, + &wide_char, 1, + norm_buf, NORM_BUF_LEN, + &err); + if (U_FAILURE(err)) { + g_warning("Token text normalisation failed"); + err = SQLITE_ABORT; + goto cleanup; + } + + // NFKC may decompose a single character into multiple + // characters, e.g. 'fi' into "fi", '…' into "...". + for (int i = 0; i < norm_len; i++) { + g_array_append_val(wide_chars, norm_buf[i]); + g_array_append_val(wide_offsets, start_byte_offset); + } + } + + // Word breaking. + // + // UTF-16 is passed to the tokeniser, hence its indexes are + // character-based. Use the offset array to convert those back to + // byte indexes for individual tokens. + + wide_data = (UChar *) g_array_steal(wide_chars, &wide_data_len_long); + wide_data_len = (int32_t) wide_data_len_long; + + ubrk_setText(iter, wide_data, wide_data_len, &err); + if (U_FAILURE(err)) { + err = SQLITE_ABORT; + g_warning("Setting word break iterator text failed"); + goto cleanup; + } + start_index = 0; + current_index = ubrk_first(iter); + token_buf = g_malloc0(sizeof(char) * token_buf_len); + while (current_index != UBRK_DONE && ret == SQLITE_OK) { + int32_t status = ubrk_getRuleStatus(iter); + int32_t token_char_len = current_index - start_index; + if (token_char_len > 0 && + !(status >= UBRK_WORD_NONE && status < UBRK_WORD_NONE_LIMIT) && + !(status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)) { + int32_t token_byte_len = 0; + int32_t token_byte_start = 0; + int32_t token_byte_end = 0; + + for (;;) { + u_strToUTF8WithSub(token_buf, token_buf_len, &token_byte_len, + wide_data + start_index, token_char_len, + 0xFFFD, NULL, + &err); + + if (U_SUCCESS(err)) { + break; + } else if (err == U_BUFFER_OVERFLOW_ERROR) { + token_buf_len *= 2; + token_buf = g_realloc(token_buf, sizeof(char) * token_buf_len); + err = U_ZERO_ERROR; + } else { + err = SQLITE_ABORT; + g_warning("Conversion to UTF-8 failed"); + goto cleanup; + } + } + + token_byte_start = g_array_index(wide_offsets, int32_t, start_index); + if (current_index < wide_data_len) { + token_byte_end = g_array_index(wide_offsets, int32_t, current_index); + } else { + token_byte_end = chars_len; + } + + ret = token_callback(context, + 0, + token_buf, + token_byte_len, + token_byte_start, + token_byte_end); + } + + start_index = current_index; + current_index = ubrk_next(iter); + } + + cleanup: + g_free(wide_data); + g_array_unref(wide_chars); + g_array_unref(wide_offsets); + g_free(token_buf); + + return ret; +} + +static fts5_api *get_fts5_api(sqlite3 *db) { + int rc = SQLITE_OK; + sqlite3_stmt *stmt; + fts5_api *api = NULL; + + rc = sqlite3_prepare_v2(db, "SELECT fts5(?1)", + -1, &stmt, 0); + if (rc != SQLITE_OK) { + return NULL; + } + + sqlite3_bind_pointer(stmt, 1, (void*) &api, "fts5_api_ptr", NULL); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + + return api; +} + +static const fts5_tokenizer icu_tokeniser = { + icu_create, + icu_delete, + icu_tokenise +}; + +gboolean sqlite3_register_fts5_tokeniser(sqlite3 *db) { + fts5_api *api; + fts5_tokenizer *tokeniser = (fts5_tokenizer *) &icu_tokeniser; + int rc = SQLITE_OK; + + api = get_fts5_api(db); + if (!api) { + return FALSE; + } + + rc = api->xCreateTokenizer(api, + "geary_tokeniser", + NULL, + tokeniser, + NULL); + + return (rc == SQLITE_OK) ? TRUE : FALSE; +} + +// Entry point for external loadable library, required when using +// command line SQLite tool. The name of this function must match the +// name of the shared module. +int sqlite3_gearytokeniser_init(sqlite3 *db, + char **error_message, + const sqlite3_api_routines *api) { + g_info("Loading geary_tokeniser\n"); + SQLITE_EXTENSION_INIT2(api); + return sqlite3_register_fts5_tokeniser(db) ? SQLITE_OK : SQLITE_ABORT; +} diff --git a/src/engine/meson.build b/src/engine/meson.build index b3727861d..ba9941ffa 100644 --- a/src/engine/meson.build +++ b/src/engine/meson.build @@ -178,6 +178,7 @@ engine_vala_sources = files( 'imap-db/imap-db-email-identifier.vala', 'imap-db/imap-db-folder.vala', 'imap-db/imap-db-fts5-matches.c', + 'imap-db/imap-db-fts5-tokeniser.c', 'imap-db/imap-db-gc.vala', 'imap-db/imap-db-message-row.vala', 'imap-db/imap-db-sqlite.c', @@ -324,6 +325,7 @@ engine_dependencies = [ gio, glib, gmime, + icu, libmath, libstemmer, libxml, @@ -337,10 +339,17 @@ endif engine_build_dir = meson.current_build_dir() +engine_c_args = geary_c_args +engine_vala_args = geary_vala_args + +# Suppress SQLite loadable module init code +engine_c_args += [ + '-D', 'SQLITE_CORE', +] + # Generate internal VAPI for unit testing. See Meson issue # https://github.com/mesonbuild/meson/issues/1781 for official # internal VAPI support. -engine_vala_args = geary_vala_args engine_vala_args += [ '--internal-header=@0@/geary-engine-internal.h'.format(engine_build_dir), '--internal-vapi=@0@/geary-engine-internal.vapi'.format(engine_build_dir) @@ -364,7 +373,7 @@ engine_lib = static_library('geary-engine', dependencies: engine_dependencies, include_directories: config_h_dir, vala_args: engine_vala_args, - c_args: geary_c_args, + c_args: engine_c_args, ) # Dummy target to tell Meson about the internal VAPI given the @@ -402,3 +411,14 @@ engine_internal_dep = declare_dependency( include_directories: include_directories('.'), sources: engine_internal_header_fixup ) + +# Compile a loadable library containing the custom tokeniser so SQLite +# command line app can still be used. +tokeniser_lib = shared_library('geary-tokeniser', + files('imap-db/imap-db-fts5-tokeniser.c'), + dependencies: [ glib, icu, sqlite ], + c_args: [ + # Enable GLib structured logging + '-DG_LOG_USE_STRUCTURED', + ], +) -- GitLab From 36ba76b83bbdaedcc7ed971ab25d98f75af53af4 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Mon, 28 Dec 2020 19:18:42 +1030 Subject: [PATCH 17/24] src/engine/imap-db/imap-db-fts5-*.c: Fix build warnings --- src/engine/imap-db/imap-db-fts5-matches.c | 12 +++++++----- src/engine/imap-db/imap-db-fts5-tokeniser.c | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/engine/imap-db/imap-db-fts5-matches.c b/src/engine/imap-db/imap-db-fts5-matches.c index 4dff4f32b..a213cb4bb 100644 --- a/src/engine/imap-db/imap-db-fts5-matches.c +++ b/src/engine/imap-db/imap-db-fts5-matches.c @@ -27,17 +27,19 @@ #include #include +#define unused __attribute__((unused)) + + typedef struct { int start; int end; } Offset; - static int offsets_tokenizer_func (void *data, - int flags, - const char *token, - int n_token, + unused int flags, + unused const char *token, + unused int n_token, int start, int end) { @@ -54,7 +56,7 @@ geary_matches (const Fts5ExtensionApi *api, Fts5Context *fts_ctx, sqlite3_context *ctx, int n_args, - sqlite3_value **args) + unused sqlite3_value **args) { GString *str; int rc, n_hits, i; diff --git a/src/engine/imap-db/imap-db-fts5-tokeniser.c b/src/engine/imap-db/imap-db-fts5-tokeniser.c index 2991a56e1..0468fd597 100644 --- a/src/engine/imap-db/imap-db-fts5-tokeniser.c +++ b/src/engine/imap-db/imap-db-fts5-tokeniser.c @@ -16,6 +16,8 @@ SQLITE_EXTENSION_INIT1 #include "unicode/utf.h" #include "unicode/utypes.h" +#define unused __attribute__((unused)) + // Full text search tokeniser for SQLite. This exists since SQLite's // existing Unicode tokeniser doesn't work with languages that don't // use spaces as word boundaries. @@ -45,9 +47,9 @@ typedef struct { } IcuTokeniser; -static int icu_create(void *context, - const char **args, - int n_args, +static int icu_create(unused void *context, + unused const char **args, + unused int n_args, Fts5Tokenizer **ret) { const UNormalizer2 *norm; UBreakIterator *iter; @@ -88,7 +90,7 @@ static void icu_delete(Fts5Tokenizer *fts5_tokeniser) { static int icu_tokenise(Fts5Tokenizer *fts5_tokeniser, void *context, - int flags, + unused int flags, const char *chars, int32_t chars_len, int (*token_callback)(void*, int, const char*, int, int, int)) { @@ -267,7 +269,7 @@ gboolean sqlite3_register_fts5_tokeniser(sqlite3 *db) { // command line SQLite tool. The name of this function must match the // name of the shared module. int sqlite3_gearytokeniser_init(sqlite3 *db, - char **error_message, + unused char **error_message, const sqlite3_api_routines *api) { g_info("Loading geary_tokeniser\n"); SQLITE_EXTENSION_INIT2(api); -- GitLab From 67af1b2064209f4d6951447d63d3a2f9c120615c Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Mon, 28 Dec 2020 19:47:11 +1030 Subject: [PATCH 18/24] po/POFILES.in: Add missing files --- po/POTFILES.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/po/POTFILES.in b/po/POTFILES.in index cbcaff2ee..1e476d676 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -268,8 +268,11 @@ src/engine/imap-db/imap-db-attachment.vala src/engine/imap-db/imap-db-database.vala src/engine/imap-db/imap-db-email-identifier.vala src/engine/imap-db/imap-db-folder.vala +src/engine/imap-db/imap-db-fts5-matches.c +src/engine/imap-db/imap-db-fts5-tokeniser.c src/engine/imap-db/imap-db-gc.vala src/engine/imap-db/imap-db-message-row.vala +src/engine/imap-db/imap-db-sqlite.c src/engine/imap-engine/gmail/imap-engine-gmail-account.vala src/engine/imap-engine/gmail/imap-engine-gmail-all-mail-folder.vala src/engine/imap-engine/gmail/imap-engine-gmail-drafts-folder.vala -- GitLab From d74c024bc97159959c1549319892d51d1307ee5e Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Tue, 29 Dec 2020 17:44:12 +1030 Subject: [PATCH 19/24] Geary.FtsSearchQuery: Re-introduce FTS search optimisations Per commit 9a137699, ensure we tell SQLite what index it should be using, and are doing the FTS search in a sub-select. Without these, FTS search is an order of magnitude slower. --- .../common/common-fts-search-query.vala | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/engine/common/common-fts-search-query.vala b/src/engine/common/common-fts-search-query.vala index aef2e2df3..e8b73042a 100644 --- a/src/engine/common/common-fts-search-query.vala +++ b/src/engine/common/common-fts-search-query.vala @@ -60,18 +60,10 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { // Select distinct since messages may exist in more than one // folder. sql.append(""" - SELECT DISTINCT mt.id"""); - // FTS5 queries cannot be all negated terms. If not then join - // here and filter as usual, if so then exclude via subselect - // further below instead. - if (!this.is_all_negated) { - sql.append(""" - FROM MessageSearchTable AS mst - INNER JOIN MessageTable AS mt ON mt.id = mst.rowid"""); - } else { - sql.append(""" - FROM MessageTable AS mt"""); - } + SELECT DISTINCT mt.id + FROM MessageTable AS mt + INDEXED BY MessageTableInternalDateTimeTIndex"""); + // If excluding folderless messages, an inner join on // MessageLocationTable will cause them to be excluded // automatically. Otherwise a left join always required to @@ -99,14 +91,17 @@ internal class Geary.FtsSearchQuery : Geary.SearchQuery { } // FTS match exclusions - if (!this.is_all_negated) { - conditions_added = sql_add_term_conditions(sql, conditions_added); - } else { + if (!this.expression.is_empty) { if (conditions_added) { sql.append(" AND"); } sql.append( - " mt.id NOT IN (SELECT mst.rowid FROM MessageSearchTable as mst WHERE " + this.is_all_negated + ? " mt.id NOT IN" + : " mt.id IN" + ); + sql.append( + " (SELECT mst.rowid FROM MessageSearchTable as mst WHERE " ); sql_add_term_conditions(sql, false); sql.append_c(')'); -- GitLab From d98467a3f44ddcaa4d667ff7bbd70c3afd9dfb44 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Tue, 29 Dec 2020 17:50:32 +1030 Subject: [PATCH 20/24] Geary.ImapDB.Account: Print SQL for search statements before executing Helps to debug what the expression generator is doing. --- src/engine/imap-db/imap-db-account.vala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index fecb1867c..e5998afbf 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -612,6 +612,7 @@ private class Geary.ImapDB.Account : BaseObject { exclude_folderless, limit, offset ); + debug("Search SQL: %s", stmt.get_expanded_sql()); Db.Result result = stmt.exec(cancellable); while (!result.finished) { int64 message_id = result.int64_at(0); -- GitLab From 24140c07cde6998cb9a11e1343853df9dda94ff1 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 14 Jan 2021 21:09:56 +1100 Subject: [PATCH 21/24] Geary.App.RemoveOperation: Queue a window count check after executing Since when removing messages from the conversation monitor (especially when a search is changed) the window might get smaller, queue a check to re-fill if needed. --- .../app/conversation-monitor/app-remove-operation.vala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/engine/app/conversation-monitor/app-remove-operation.vala b/src/engine/app/conversation-monitor/app-remove-operation.vala index 8105d959e..f977e9023 100644 --- a/src/engine/app/conversation-monitor/app-remove-operation.vala +++ b/src/engine/app/conversation-monitor/app-remove-operation.vala @@ -35,13 +35,14 @@ private class Geary.App.RemoveOperation : BatchOperation { trimmed ); - - // Fire signals, clean up this.monitor.removed( removed, trimmed, (this.source_folder == this.monitor.base_folder) ? batch : null ); + + // Queue an update since many emails may have been removed + this.monitor.check_window_count(); } } -- GitLab From 58cae0ae40858682097de7bb06056de4a03ec74a Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 14 Jan 2021 21:20:25 +1100 Subject: [PATCH 22/24] Geary.ImapDB.Account: Drop post-search stemmed term greedy match removal Stop post-processing search results by dropping results that contain a matched term that is longer by some criterion than a stemmed term. Since this cannot be specified by SQLite's FTS queries, it has to be done outside of the search, which can have a substantial impact on performance, and either means running multiple queries outside of a transaction to get the required number of search results (potentially a large number of times), running the pruning within a transaction (potentially blocking the DB for a large length of time), or returning the wrong number of search results (potentially confusing the caller). Because of these disadvantages, and since SearchQuery's maximum difference in lengths between term and stemmed variant helps to prevent greedy matching anyway, just drop the post processing. --- src/engine/api/geary-search-query.vala | 26 ----------- src/engine/imap-db/imap-db-account.vala | 62 ------------------------- 2 files changed, 88 deletions(-) diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala index 62f7416d7..d9603d66a 100644 --- a/src/engine/api/geary-search-query.vala +++ b/src/engine/api/geary-search-query.vala @@ -121,32 +121,6 @@ public abstract class Geary.SearchQuery : BaseObject { return max; } - /** - * Maximum difference in lengths between a matched word and the stemmed variant it matched - * against. - * - * This prevents long words being matched to short stem - * variants (which creates opportunities for greedy matching). - */ - internal int get_max_difference_match_stem_lengths() { - var max = 0; - switch (this) { - case EXACT: - max = 0; - break; - case CONSERVATIVE: - max = 2; - break; - case AGGRESSIVE: - max = 3; - break; - case HORIZON: - max = int.MAX; - break; - } - return max; - } - } diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index e5998afbf..e9d3abc0d 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -633,67 +633,9 @@ private class Geary.ImapDB.Account : BaseObject { }, cancellable); debug("Matching emails found: %d", matching_ids.size); - - if (query.has_stemmed_terms && search_matches != null) { - strip_greedy_results(query, matching_ids, search_matches); - } - - debug("Final search matches: %d", matching_ids.size); return matching_ids.is_empty ? null : matching_ids; } - // Strip out from the given collection of matching ids and results - // for any search results that only contain a hit due to "greedy" - // matching of the stemmed variants on all search terms. - private void strip_greedy_results(SearchQuery query, - Gee.Collection matches, - Gee.Map> results) { - int prestripped_results = matches.size; - // Gee.Iterator iter = matches.iterator(); - // while (iter.next()) { - // // For each matched string in this message, retain the message in the search results - // // if it prefix-matches any of the straight-up parsed terms or matches a stemmed - // // variant (with only max. difference in their lengths allowed, i.e. not a "greedy" - // // match) - // EmailIdentifier id = iter.get(); - // bool good_match_found = false; - // Gee.Set? result = results.get(id); - // if (result != null) { - // foreach (string match in result) { - // foreach (SearchQuery.Term term in query.get_all_terms()) { - // // if prefix-matches parsed term, then don't strip - // if (match.has_prefix(term.parsed)) { - // good_match_found = true; - // break; - // } - - // // if prefix-matches stemmed term w/o doing so - // // greedily, then don't strip - // if (term.stemmed != null && match.has_prefix(term.stemmed)) { - // int diff = match.length - term.stemmed.length; - // if (diff <= query.max_difference_match_stem_lengths) { - // good_match_found = true; - // break; - // } - // } - // } - // } - - // if (good_match_found) { - // break; - // } - // } - - // if (!good_match_found) { - // iter.remove(); - // matches.remove(id); - // } - // } - - debug("Stripped %d emails from search for [%s] due to greedy stem matching", - prestripped_results - matches.size, query.raw); - } - public async Gee.Set? get_search_matches_async(Geary.SearchQuery q, Gee.Collection ids, Cancellable? cancellable = null) throws Error { check_open(); @@ -714,10 +656,6 @@ private class Geary.ImapDB.Account : BaseObject { return Db.TransactionOutcome.DONE; } - if (query.has_stemmed_terms) { - strip_greedy_results(query, ids, match_map); - } - search_matches = new Gee.HashSet(); foreach (Gee.Set matches in match_map.values) search_matches.add_all(matches); -- GitLab From 2f81fdf146cad612f52b70a3745ba555653f57a9 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Mon, 18 Jan 2021 22:16:24 +1100 Subject: [PATCH 23/24] build: Add minimal ICU VAPI for UBreakIterator --- bindings/vapi/icu-uc.vapi | 95 +++++++++++++++++++++++++++++++++++++++ meson.build | 16 +++---- src/client/meson.build | 1 + src/engine/meson.build | 4 +- src/meson.build | 1 + 5 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 bindings/vapi/icu-uc.vapi diff --git a/bindings/vapi/icu-uc.vapi b/bindings/vapi/icu-uc.vapi new file mode 100644 index 000000000..b470faff1 --- /dev/null +++ b/bindings/vapi/icu-uc.vapi @@ -0,0 +1,95 @@ +// Based on icu-uc.vapi from the Dino project. + +[CCode (cprefix="u_")] +namespace Icu { + + [CCode (cname = "UChar")] + [IntegerType (rank = 5, min = 0, max = 65535)] + struct Char {} + + [CCode (cname = "UErrorCode", cprefix = "U_", cheader_filename = "unicode/utypes.h")] + enum ErrorCode { + ZERO_ERROR, + INVALID_CHAR_FOUND, + INDEX_OUTOFBOUNDS_ERROR, + BUFFER_OVERFLOW_ERROR, + STRINGPREP_PROHIBITED_ERROR, + UNASSIGNED_CODE_POINT_FOUND, + IDNA_STD3_ASCII_RULES_ERROR; + + [CCode (cname = "u_errorName")] + public unowned string errorName(); + + [CCode (cname = "U_SUCCESS")] + public bool is_success(); + + [CCode (cname = "U_FAILURE")] + public bool is_failure(); + } + + [CCode (cname = "UParseError", cprefix = "U_", cheader_filename = "unicode/parseerr.h")] + struct ParseError {} + + [CCode (cname = "UText", cprefix = "utext_", free_function = "utext_close", cheader_filename = "unicode/utext.h")] + [Compact] + class Text { + [CCode (cname="utext_openUTF8")] + public static Text open_utf8(Text* existing, [CCode (array_length_type = "int64_t")] uint8[] text, ref ErrorCode status); + } + + [CCode (cname = "UBreakIterator", cprefix = "ubrk_", free_function = "ubrk_close", cheader_filename = "unicode/ubrk.h")] + [Compact] + class BreakIterator { + + [CCode (cname = "UBRK_DONE")] + public const int32 DONE; + + [CCode (cname = "UBreakIteratorType", cprefix = "UBRK_")] + public enum Type { + CHARACTER, + WORD, + LINE, + SENTENCE; + } + + [CCode (cname = "UWordBreak", cprefix = "UBRK_WORD_")] + enum WordBreak { + NONE, + NONE_LIMIT, + NUMBER, + NUMBER_LIMIT, + LETTER, + LETTER_LIMIT, + KANA, + KANA_LIMIT, + IDEO, + IDEO_LIMIT; + } + + public static BreakIterator open(Type type, string locale, Char* text, int32 text_len, ref ErrorCode status); + + public int32 current { + [CCode (cname="ubrk_current")] get; + } + public int32 rule_status { + [CCode (cname="ubrk_getRuleStatus")] get; + } + + [CCode (cname="ubrk_isBoundary")] + public bool is_boundary(int32 offset); + + public int32 first(); + public int32 last(); + + public int32 next(); + public int32 previous(); + + public int32 proceeding(int32 offset); + public int32 following(int32 offset); + + [CCode (cname="ubrk_setUText")] + public void set_utext(Text text, ref ErrorCode status); + } + + +} \ No newline at end of file diff --git a/meson.build b/meson.build index 325849953..37b527095 100644 --- a/meson.build +++ b/meson.build @@ -111,6 +111,13 @@ webkit2gtk_web_extension = dependency('webkit2gtk-web-extension-4.0', version: ' # following libraries, but the declared dependency is what we actually # build against so we can include the custom VAPI correctly. +icu_uc = declare_dependency( + dependencies: [ + valac.find_library('icu-uc', dirs: [vapi_dir]), + cc.find_library('icuuc'), + ], +) + if libunwind_dep.found() # We need to add native lib to the search path for these so Flatpak # builds can find it. @@ -131,15 +138,6 @@ libstemmer = declare_dependency( ], ) -# Faux ICU dependency to prevent ICU being passed to valac as a -# package by meson -icu = declare_dependency( - dependencies: [ - cc.find_library('icuuc'), - cc.find_library('icudata'), - ], -) - # Optional dependencies appstream_util = find_program('appstream-util', required: false) desktop_file_validate = find_program('desktop-file-validate', required: false) diff --git a/src/client/meson.build b/src/client/meson.build index aeea36d01..e4726cebb 100644 --- a/src/client/meson.build +++ b/src/client/meson.build @@ -164,6 +164,7 @@ client_dependencies = [ goa, gspell, gtk, + icu_uc, javascriptcoregtk, json_glib, libhandy, diff --git a/src/engine/meson.build b/src/engine/meson.build index ba9941ffa..a29daf0b0 100644 --- a/src/engine/meson.build +++ b/src/engine/meson.build @@ -325,7 +325,7 @@ engine_dependencies = [ gio, glib, gmime, - icu, + icu_uc, libmath, libstemmer, libxml, @@ -416,7 +416,7 @@ engine_internal_dep = declare_dependency( # command line app can still be used. tokeniser_lib = shared_library('geary-tokeniser', files('imap-db/imap-db-fts5-tokeniser.c'), - dependencies: [ glib, icu, sqlite ], + dependencies: [ glib, icu_uc, sqlite ], c_args: [ # Enable GLib structured logging '-DG_LOG_USE_STRUCTURED', diff --git a/src/meson.build b/src/meson.build index 1ce6681ca..d2b018038 100644 --- a/src/meson.build +++ b/src/meson.build @@ -149,6 +149,7 @@ foreach dep : valadoc_dependencies valadoc_dep_args += '--pkg' valadoc_dep_args += dep.name() endforeach +valadoc_dep_args += [ '--pkg', 'icu-uc' ] valadoc_dep_args += [ '--pkg', 'libstemmer' ] valadoc_dep_args += [ '--pkg', 'posix' ] -- GitLab From 642bf00e88de4e317977754ef377ed7aafe08461 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Tue, 19 Jan 2021 20:42:26 +1100 Subject: [PATCH 24/24] Util.Email.SearchExpressionFactory: Use ICU for work breaking Implement search query text word segmentaion using ICU, so that languages that don't use spaces for word delimiters are correctly tokenised. --- src/client/util/util-email.vala | 65 +++++++++++++++++---------- test/client/util/util-email-test.vala | 30 ++++++++----- 2 files changed, 62 insertions(+), 33 deletions(-) diff --git a/src/client/util/util-email.vala b/src/client/util/util-email.vala index 554dbe72b..c94af33b8 100644 --- a/src/client/util/util-email.vala +++ b/src/client/util/util-email.vala @@ -1,6 +1,6 @@ /* * Copyright 2016 Software Freedom Conservancy Inc. - * Copyright 2019 Michael Gratton + * Copyright 2019-2021 Michael Gratton * * This software is licensed under the GNU Lesser General Public License * (version 2.1 or later). See the COPYING file in this distribution. @@ -350,6 +350,10 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject { private class Tokeniser { + [Flags] + private enum CharStatus { NONE, IN_WORD, END_WORD; } + + // These characters are chosen for being commonly used to // continue a single word (such as extended last names, // i.e. "Lars-Eric") or in terms commonly searched for in an @@ -365,7 +369,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject { } public bool is_at_word { - get { return (this.attrs[this.current_c].is_word_start == 1); } + get { return CharStatus.IN_WORD in this.char_status[this.current_pos]; } } public bool is_at_quote { @@ -380,30 +384,51 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject { private int next_pos = 0; private unichar c = 0; - private int current_c = -1; - private Pango.LogAttr[] attrs; + private CharStatus[] char_status; - public Tokeniser(string query, Pango.Language language) { + public Tokeniser(string query) { this.query = query; // Break up search string into individual words and/or - // operators. Can't simply break on space or non-alphanumeric - // chars since some languages don't use spaces, so use Pango - // for its support for the Unicode UAX #29 word boundary spec. - this.attrs = new Pango.LogAttr[query.char_count() + 1]; - Pango.get_log_attrs( - query, query.length, -1, language, this.attrs + // operators. Can't simply break on space or + // non-alphanumeric chars since some languages don't use + // spaces, so use ICU for its support for the Unicode UAX + // #29 word boundary spec and dictionary-based breaking + // for languages that do not use spaces for work breaks. + + this.char_status = new CharStatus[query.length + 1]; + + var icu_err = Icu.ErrorCode.ZERO_ERROR; + var icu_text = Icu.Text.open_utf8(null, this.query.data, ref icu_err); + var word_breaker = Icu.BreakIterator.open( + WORD, "en", null, -1, ref icu_err ); + word_breaker.set_utext(icu_text, ref icu_err); + + int32 prev_index = 0; + var current_index = word_breaker.first(); + var status = 0; + while (current_index != Icu.BreakIterator.DONE) { + status = word_breaker.rule_status; + if (!(status >= Icu.BreakIterator.WordBreak.NONE && + status < Icu.BreakIterator.WordBreak.NONE_LIMIT)) { + for (int i = prev_index; i < current_index; i++) { + this.char_status[i] |= IN_WORD; + } + this.char_status[current_index] |= END_WORD; + } + + prev_index = current_index; + current_index = word_breaker.next(); + } consume_char(); } public void consume_char() { var current_pos = this.next_pos; - if (this.query.get_next_char(ref this.next_pos, out this.c)) { - this.current_c++; - } + this.query.get_next_char(ref this.next_pos, out this.c); this.current_pos = current_pos; } @@ -415,13 +440,11 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject { public string consume_word() { var start = this.current_pos; - // the attr.is_word_end value applies to the first char - // after then end of a word, so need to move one past the - // end of the current word to determine where it ends consume_char(); while (this.has_next && + this.c != OPERATOR_SEPARATOR && (this.c in CONTINUATION_CHARS || - this.attrs[this.current_c].is_word_end != 1)) { + !(CharStatus.END_WORD in this.char_status[this.current_pos]))) { consume_char(); } return this.query.slice(start, this.current_pos); @@ -446,10 +469,6 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject { public Geary.AccountInformation account { get; private set; } - public Pango.Language language { - get; set; default = Pango.Language.get_default(); - } - // Maps of localised search operator names and values to their // internal forms private Gee.Map text_operators = @@ -470,7 +489,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject { /** Constructs a search expression from the given query string. */ public Gee.List parse_query(string query) { var operands = new Gee.LinkedList(); - var tokens = new Tokeniser(query, this.language); + var tokens = new Tokeniser(query); while (tokens.has_next) { if (tokens.is_at_word) { Geary.SearchQuery.Term? op = null; diff --git a/test/client/util/util-email-test.vala b/test/client/util/util-email-test.vala index 01605480c..b3e45d7d7 100644 --- a/test/client/util/util-email-test.vala +++ b/test/client/util/util-email-test.vala @@ -195,18 +195,28 @@ public class Util.Email.Test : TestCase { this.config.get_search_strategy(), this.account ); - test_article.language = Pango.Language.from_string("th"); - var multiple = test_article.parse_query("ภาษาไทย"); - assert_collection(multiple).size(2); - assert_true(multiple[0] is Geary.SearchQuery.EmailTextTerm); - assert_true(multiple[1] is Geary.SearchQuery.EmailTextTerm); + var thai = test_article.parse_query("ภาษาไทย"); + assert_collection(thai).size(2); + assert_true(thai[0] is Geary.SearchQuery.EmailTextTerm); + assert_true(thai[1] is Geary.SearchQuery.EmailTextTerm); assert_collection( - ((Geary.SearchQuery.EmailTextTerm) multiple[0]).terms + ((Geary.SearchQuery.EmailTextTerm) thai[0]).terms ).size(1).contains("ภาษา"); assert_collection( - ((Geary.SearchQuery.EmailTextTerm) multiple[1]).terms + ((Geary.SearchQuery.EmailTextTerm) thai[1]).terms ).size(1).contains("ไทย"); + + var chinese = test_article.parse_query("男子去"); + assert_collection(chinese).size(2); + assert_true(chinese[0] is Geary.SearchQuery.EmailTextTerm); + assert_true(chinese[1] is Geary.SearchQuery.EmailTextTerm); + assert_collection( + ((Geary.SearchQuery.EmailTextTerm) chinese[0]).terms + ).size(1).contains("男子"); + assert_collection( + ((Geary.SearchQuery.EmailTextTerm) chinese[1]).terms + ).size(1).contains("去"); } public void multiple_search_terms() throws GLib.Error { @@ -277,10 +287,10 @@ public class Util.Email.Test : TestCase { var simple_body = test_article.parse_query("body:hello"); assert_collection(simple_body).size(1); - assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm); + assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm, "type"); var text_body = simple_body[0] as Geary.SearchQuery.EmailTextTerm; - assert_true(text_body.target == BODY); - assert_true(text_body.matching_strategy == CONSERVATIVE); + assert_true(text_body.target == BODY, "target"); + assert_true(text_body.matching_strategy == CONSERVATIVE, "strategy"); assert_collection(text_body.terms).size(1).contains("hello"); var simple_body_quoted = test_article.parse_query("body:\"hello\""); -- GitLab