Commit 533ab75e authored by Jim Nelson's avatar Jim Nelson

Improved search experience: Bug #720361

This introduces a new full-text search algorithm that attempts to
curb the effects of overstemming in the Porter Snowball stemmer.
The FTS table will be regenerated with this update.

The crux of this new algorithm is a configurable heuristic that
reduces stemmed matching.  The configuration is not available via the
UI (I suspect it will only confuse users) but can be changed by power
users via GSettings.  More information is available at:

https://wiki.gnome.org/Apps/Geary/FullTextSearchStrategy
parent 068b3abc
......@@ -74,6 +74,13 @@
<summary>whether to compose emails in HTML</summary>
<description>True to compose emails in HTML; false for plain text.</description>
</key>
<key name="search-strategy" type="s">
<default>"conservative"</default>
<summary>Advisory strategy for full-text searching</summary>
<description>Acceptable values are EXACT, CONSERVATIVE, AGGRESSIVE, and HORIZON.</description>
</key>
</schema>
</schemalist>
......@@ -22,3 +22,4 @@ install(FILES version-019.sql DESTINATION ${SQL_DEST})
install(FILES version-020.sql DESTINATION ${SQL_DEST})
install(FILES version-021.sql DESTINATION ${SQL_DEST})
install(FILES version-022.sql DESTINATION ${SQL_DEST})
install(FILES version-023.sql DESTINATION ${SQL_DEST})
--
-- Database upgrade to add FTS tokenize virtual table, which allows for querying the tokenizer
-- directly for stemmed words, and dropping the stemmed FTS table for an unstemmed one. We now
-- use the stemmer manually to generate search queries.
--
DROP TABLE MessageSearchTable;
CREATE VIRTUAL TABLE MessageSearchTable USING fts4(
body,
attachment,
subject,
from_field,
receivers,
cc,
bcc,
tokenize=simple,
prefix="2,4,6,8,10"
);
......@@ -170,6 +170,8 @@ engine/imap-db/imap-db-email-identifier.vala
engine/imap-db/imap-db-folder.vala
engine/imap-db/imap-db-message-addresses.vala
engine/imap-db/imap-db-message-row.vala
engine/imap-db/imap-db-search-query.vala
engine/imap-db/imap-db-search-term.vala
engine/imap-db/imap-db-search-email-identifier.vala
engine/imap-db/outbox/smtp-outbox-email-identifier.vala
engine/imap-db/outbox/smtp-outbox-email-properties.vala
......
......@@ -135,5 +135,43 @@ public class Configuration {
if (!settings.set_boolean(name, value))
message("Unable to set configuration value %s = %s", name, value.to_string());
}
public Geary.SearchQuery.Strategy get_search_strategy() {
switch (settings.get_string("search-strategy").down()) {
case "exact":
return Geary.SearchQuery.Strategy.EXACT;
case "aggressive":
return Geary.SearchQuery.Strategy.AGGRESSIVE;
case "horizon":
return Geary.SearchQuery.Strategy.HORIZON;
case "conservative":
default:
return Geary.SearchQuery.Strategy.CONSERVATIVE;
}
}
public void set_search_strategy(Geary.SearchQuery.Strategy strategy) {
switch (strategy) {
case Geary.SearchQuery.Strategy.EXACT:
settings.set_string("search-strategy", "exact");
break;
case Geary.SearchQuery.Strategy.AGGRESSIVE:
settings.set_string("search-strategy", "aggressive");
break;
case Geary.SearchQuery.Strategy.HORIZON:
settings.set_string("search-strategy", "horizon");
break;
case Geary.SearchQuery.Strategy.CONSERVATIVE:
default:
settings.set_string("search-strategy", "conservative");
break;
}
}
}
......@@ -81,7 +81,7 @@ public class GearyController : Geary.BaseObject {
private const string MOVE_MESSAGE_TOOLTIP_MULTIPLE = _("Move conversations");
private const int SELECT_FOLDER_TIMEOUT_USEC = 100 * 1000;
private const int SEARCH_TIMEOUT_MSEC = 100;
private const int SEARCH_TIMEOUT_MSEC = 250;
private const string PROP_ATTEMPT_OPEN_ACCOUNT = "attempt-open-account";
......@@ -2512,7 +2512,8 @@ public class GearyController : Geary.BaseObject {
cancel_search(); // Stop any search in progress.
folder.set_search_query(search_text, cancellable_search);
folder.search(search_text, GearyApplication.instance.config.get_search_strategy(),
cancellable_search);
main_window.folder_list.set_search(folder);
search_text_changed(main_window.main_toolbar.search_text);
......@@ -2523,7 +2524,8 @@ public class GearyController : Geary.BaseObject {
// search after a quick delay when they finish typing.
if (search_timeout_id != 0)
Source.remove(search_timeout_id);
search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout);
search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout, Priority.LOW);
}
private bool on_search_timeout() {
......
......@@ -457,11 +457,28 @@ public class ConversationViewer : Gtk.Box {
}
}
private void on_search_text_changed(string? query) {
private void on_search_text_changed(Geary.SearchQuery? query) {
if (query != null)
highlight_search_terms.begin();
}
// This applies a fudge-factor set of matches when the database results
// aren't entirely satisfactory, such as when you search for an email
// address and the database tokenizes out the @ and ., etc. It's not meant
// to be comprehensive, just a little extra highlighting applied to make
// the results look a little closer to what you typed.
private void add_literal_matches(string raw_query, Gee.Set<string>? search_matches) {
foreach (string word in raw_query.split(" ")) {
if (word.has_suffix("\""))
word = word.substring(0, word.length - 1);
if (word.has_prefix("\""))
word = word.substring(1);
if (!Geary.String.is_empty_or_whitespace(word))
search_matches.add(word);
}
}
private async void highlight_search_terms() {
if (search_folder == null)
return;
......@@ -475,8 +492,13 @@ public class ConversationViewer : Gtk.Box {
ids.add(email.id);
try {
Gee.Collection<string>? search_matches = yield search_folder.get_search_matches_async(
Gee.Set<string>? search_matches = yield search_folder.get_search_matches_async(
ids, cancellable_fetch);
if (search_matches == null)
search_matches = new Gee.HashSet<string>();
if (search_folder.search_query != null)
add_literal_matches(search_folder.search_query.raw, search_matches);
// Webkit's highlighting is ... weird. In order to actually see
// all the highlighting you're applying, it seems necessary to
......@@ -484,8 +506,7 @@ public class ConversationViewer : Gtk.Box {
// seems that shorter strings will overwrite longer ones, and
// you're left with incomplete highlighting.
Gee.ArrayList<string> ordered_matches = new Gee.ArrayList<string>();
if (search_matches != null)
ordered_matches.add_all(search_matches);
ordered_matches.add_all(search_matches);
ordered_matches.sort((a, b) => a.length - b.length);
foreach(string match in ordered_matches)
......
......@@ -118,11 +118,13 @@ public abstract class Geary.AbstractAccount : BaseObject, Geary.Account {
public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id,
Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Strategy strategy);
public abstract async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,
int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null) throws Error;
public abstract async Gee.Collection<string>? get_search_matches_async(Geary.SearchQuery query,
public abstract async Gee.Set<string>? get_search_matches_async(Geary.SearchQuery query,
Gee.Collection<Geary.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error;
public abstract async Gee.MultiMap<Geary.EmailIdentifier, Geary.FolderPath>? get_containing_folders_async(
......
......@@ -322,6 +322,23 @@ public interface Geary.Account : BaseObject {
public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id,
Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
/**
* Create a new {@link SearchQuery} for this {@link Account}.
*
* See {@link Geary.SearchQuery.Strategy} for more information about how its interpreted by the
* Engine. In particular, note that it's an advisory parameter only and may have no effect,
* especially on server searches. However, it may also have a dramatic effect on what search
* results are returned and so should be used with some caution. Whether this parameter is
* user-configurable, available through GSettings or another configuration mechanism, or simply
* baked into the caller's code is up to the caller. CONSERVATIVE is designed to be a good
* default.
*
* The SearchQuery object can only be used with calls into this Account.
*
* Dropping the last reference to the SearchQuery will close it.
*/
public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Strategy strategy);
/**
* Performs a search with the given query. Optionally, a list of folders not to search
* can be passed as well as a list of email identifiers to restrict the search to only those messages.
......@@ -335,9 +352,9 @@ public interface Geary.Account : BaseObject {
Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null) throws Error;
/**
* Given a list of mail IDs, returns a list of words that match for the query.
* Given a list of mail IDs, returns a set of casefolded words that match for the query.
*/
public abstract async Gee.Collection<string>? get_search_matches_async(Geary.SearchQuery query,
public abstract async Gee.Set<string>? get_search_matches_async(Geary.SearchQuery query,
Gee.Collection<Geary.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error;
/**
......
......@@ -49,6 +49,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
}
}
public Geary.SearchQuery? search_query { get; private set; default = null; }
private Gee.HashSet<Geary.FolderPath?> exclude_folders = new Gee.HashSet<Geary.FolderPath?>();
private Geary.SpecialFolderType[] exclude_types = {
Geary.SpecialFolderType.SPAM,
......@@ -56,7 +58,6 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
Geary.SpecialFolderType.DRAFTS,
// Orphan emails (without a folder) are also excluded; see ctor.
};
private Geary.SearchQuery? search_query = null;
private Gee.TreeSet<ImapDB.SearchEmailIdentifier> search_results;
private Geary.Nonblocking.Mutex result_mutex = new Geary.Nonblocking.Mutex();
......@@ -64,7 +65,7 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
* Fired when the search query has changed. This signal is fired *after* the search
* has completed.
*/
public signal void search_query_changed(string? query);
public signal void search_query_changed(Geary.SearchQuery? query);
public SearchFolder(Account account) {
base();
......@@ -203,8 +204,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
/**
* Sets the keyword string for this search.
*/
public void set_search_query(string query, Cancellable? cancellable = null) {
set_search_query_async.begin(query, cancellable, on_set_search_query_complete);
public void search(string query, SearchQuery.Strategy strategy, Cancellable? cancellable = null) {
set_search_query_async.begin(query, strategy, cancellable, on_set_search_query_complete);
}
private void on_set_search_query_complete(Object? source, AsyncResult result) {
......@@ -215,8 +216,9 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
}
}
private async void set_search_query_async(string query, Cancellable? cancellable = null) throws Error {
Geary.SearchQuery search_query = new Geary.SearchQuery(query);
private async void set_search_query_async(string query, SearchQuery.Strategy strategy,
Cancellable? cancellable) throws Error {
Geary.SearchQuery search_query = account.open_search(query, strategy);
int result_mutex_token = yield result_mutex.claim_async();
......@@ -230,7 +232,7 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
result_mutex.release(ref result_mutex_token);
this.search_query = search_query;
search_query_changed(search_query.raw);
search_query_changed(search_query);
if (error != null)
throw error;
......@@ -425,13 +427,14 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
}
/**
* Given a list of mail IDs, returns a list of words that match for the current
* Given a list of mail IDs, returns a set of casefolded words that match for the current
* search query.
*/
public async Gee.Collection<string>? get_search_matches_async(
public async Gee.Set<string>? get_search_matches_async(
Gee.Collection<Geary.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error {
if (search_query == null)
return null;
return yield account.get_search_matches_async(search_query, ids, cancellable);
}
......
......@@ -6,39 +6,63 @@
/**
* An object to hold state for various search subsystems that might need to
* parse the same text string different ways. The only interaction the API
* user should have with this is creating new ones and then passing them off to
* the search methods in the engine.
* parse the same text string different ways.
*
* TODO: support anything other than ImapDB.Account's search methods.
* The only interaction the API user should have with this is creating new ones and then passing
* them to the search methods in the Engine.
*
* @see Geary.Account.open_search
*/
public class Geary.SearchQuery : BaseObject {
public string raw { get; private set; }
public bool parsed { get; internal set; default = false; }
// Not using a MultiMap because we (might) need a guarantee of order.
private Gee.HashMap<string?, Gee.ArrayList<string>> field_map
= new Gee.HashMap<string?, Gee.ArrayList<string>>();
public SearchQuery(string query) {
raw = query;
public abstract class Geary.SearchQuery : BaseObject {
/**
* An advisory parameter regarding search quality, scope, and breadth.
*
* The Engine can perform searches based on (unspecified, uncontracted) textual variations of
* a query's search terms. Some of those variations may produce undesirable results due to
* "greedy" matching of terms. The Strategy parameter allows for an advisory to the Engine
* about how to use those textual variants, if any at all.
*
* This may be respected or ignored by the Engine. In particular, there's no guarantee it will
* have any effect on server search.
*/
public enum Strategy {
/**
* Only return exact matches, perform no searches for textual variants.
*
* Note that Geary's search syntax does prefix-matching for unquoted strings. EXACT means
* exact ''prefix-''matching in this case.
*/
EXACT,
/**
* Allow for searching for a small set of textual variants and small differences in search
* terms. This is a good default.
*/
CONSERVATIVE,
/**
* Allow for searching for a broad set of textual variants and larger differences in
* search terms.
*/
AGGRESSIVE,
/**
* Search for all textual variants, i.e. "the sky's the limit."
*/
HORIZON
}
internal void add_token(string? field, string token) {
if (!field_map.has_key(field))
field_map.set(field, new Gee.ArrayList<string>());
field_map.get(field).add(token);
}
/**
* The original user search text.
*/
public string raw { get; private set; }
internal Gee.Collection<string?> get_fields() {
return field_map.keys;
}
/**
* The selected {@link Strategy} quality.
*/
public Strategy strategy { get; private set; }
internal Gee.List<string>? get_tokens(string? field) {
if (!field_map.has_key(field))
return null;
return field_map.get(field);
protected SearchQuery(string raw, Strategy strategy) {
this.raw = raw;
this.strategy = strategy;
}
}
This diff is collapsed.
......@@ -107,7 +107,11 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
break;
case 22:
post_rebuild_attachments();
post_upgrade_rebuild_attachments();
break;
case 23:
post_upgrade_add_tokenizer_table();
break;
}
}
......@@ -407,7 +411,7 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
// Version 22
private void post_rebuild_attachments() {
private void post_upgrade_rebuild_attachments() {
try {
exec_transaction(Db.TransactionType.RW, (cx) => {
Db.Statement stmt = cx.prepare("""
......@@ -471,6 +475,25 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
}
// Version 23
private void post_upgrade_add_tokenizer_table() {
try {
string stemmer = find_appropriate_search_stemmer();
debug("Creating tokenizer table using %s stemmer", stemmer);
// These can't go in the .sql file because its schema (the stemmer
// algorithm) is determined at runtime.
exec("""
CREATE VIRTUAL TABLE TokenizerTable USING fts3tokenize(
unicodesn,
"stemmer=%s"
);
""".printf(stemmer));
} catch (Error e) {
error("Error creating tokenizer table: %s", e.message);
}
}
private void on_prepare_database_connection(Db.Connection cx) throws Error {
cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC);
cx.set_foreign_keys(true);
......
/* Copyright 2014 Yorba Foundation
*
* This software is licensed under the GNU Lesser General Public License
* (version 2.1 or later). See the COPYING file in this distribution.
*/
/**
* Internal implementation of {@link Geary.SearchQuery}.
*/
private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
/**
* Associated {@link ImapDB.Account}.
*/
public weak ImapDB.Account account { get; private set; }
/**
* Whether or not the query has been parsed and processed prior to search submission.
*/
public bool parsed { get; set; default = false; }
/**
* Determined by {@link strategy}.
*/
public bool allow_stemming { get; private set; }
/**
* Minimum length of the term before stemming is allowed.
*
* This prevents short words that might be stemmed from being stemmed.
*
* Overridden by {@link allow_stemming}. Determined by {@link strategy}.
*/
public int min_term_length_for_stemming { get; private set; }
/**
* Maximum difference in lengths between term and stemmed variant.
*
* This prevents long words from being stemmed to much shorter words (which creates
* opportunities for greedy matching).
*
* Overridden by {@link allow_stemming}. Determined by {@link strategy}.
*/
public int max_difference_term_stem_lengths { get; private set; }
/**
* Maximum difference in lengths between a matched word and the stemmed variant it matched
* against.
*
* This prevents long words being matched to short stem variants (which creates opportunities
* for greedy matching).
*
* Overridden by {@link allow_stemming}. Determined by {@link strategy}.
*/
public int max_difference_match_stem_lengths { get; private set; }
// Not using a MultiMap because we (might) need a guarantee of order.
private Gee.HashMap<string?, Gee.ArrayList<SearchTerm>> field_map
= new Gee.HashMap<string?, Gee.ArrayList<SearchTerm>>();
private Gee.ArrayList<SearchTerm> all = new Gee.ArrayList<SearchTerm>();
public SearchQuery(ImapDB.Account account, string query, Geary.SearchQuery.Strategy strategy) {
base (query, strategy);
this.account = account;
switch (strategy) {
case Strategy.EXACT:
allow_stemming = false;
min_term_length_for_stemming = int.MAX;
max_difference_term_stem_lengths = 0;
max_difference_match_stem_lengths = 0;
break;
case Strategy.CONSERVATIVE:
allow_stemming = true;
min_term_length_for_stemming = 6;
max_difference_term_stem_lengths = 2;
max_difference_match_stem_lengths = 2;
break;
case Strategy.AGGRESSIVE:
allow_stemming = true;
min_term_length_for_stemming = 4;
max_difference_term_stem_lengths = 4;
max_difference_match_stem_lengths = 3;
break;
case Strategy.HORIZON:
allow_stemming = true;
min_term_length_for_stemming = 0;
max_difference_term_stem_lengths = int.MAX;
max_difference_match_stem_lengths = int.MAX;
break;
default:
assert_not_reached();
}
}
public void add_search_term(string? field, SearchTerm term) {
if (!field_map.has_key(field))
field_map.set(field, new Gee.ArrayList<SearchTerm>());
field_map.get(field).add(term);
all.add(term);
}
public Gee.Collection<string?> get_fields() {
return field_map.keys;
}
public Gee.List<SearchTerm>? get_search_terms(string? field) {
return field_map.has_key(field) ? field_map.get(field) : null;
}
public Gee.List<SearchTerm>? get_all_terms() {
return all;
}
}
/* Copyright 2014 Yorba Foundation
*
* This software is licensed under the GNU Lesser General Public License
* (version 2.1 or later). See the COPYING file in this distribution.
*/
/**
* Various associated state with a single term in a {@link ImapDB.SearchQuery}.
*/
private class Geary.ImapDB.SearchTerm : BaseObject {
/**
* The original tokenized search term with minimal other processing performed.
*
* For example, punctuation might be removed, but no casefolding has occurred.
*/
public string original { get; private set; }
/**
* The parsed tokenized search term.
*
* Casefolding and other normalizing text operations have been performed.
*/
public string parsed { get; private set; }
/**
* The stemmed search term.
*
* Only used if stemming is being done ''and'' the stem is different than the {@link parsed}
* term.
*/
public string? stemmed { get; private set; }
/**
* A list of terms ready for binding to an SQLite statement.
*
* This should include prefix operators and quotes (i.e. ["party"] or [party*]). These texts
* are guaranteed not to be null or empty strings.
*/
public Gee.List<string> sql { get; private set; default = new Gee.ArrayList<string>(); }
/**
* Returns true if the {@link parsed} term is exact-match only (i.e. starts with quotes) and
* there is no {@link stemmed} variant.
*/
public bool is_exact { get { return parsed.has_prefix("\"") && stemmed == null; } }
public SearchTerm(string original, string parsed, string? stemmed, string? sql_parsed, string? sql_stemmed) {
this.original = original;
this.parsed = parsed;
this.stemmed = stemmed;
// for now, only two variations: the parsed string and the stemmed; since stem is usually
// shorter (and will be first in the OR statement), include it first
if (!String.is_empty(sql_stemmed))
sql.add(sql_stemmed);
if (!String.is_empty(sql_parsed))
sql.add(sql_parsed);
}
}
......@@ -824,6 +824,10 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount {
return yield local.fetch_email_async(check_id(email_id), required_fields, cancellable);
}
public override Geary.SearchQuery open_search(string query, SearchQuery.Strategy strategy) {
return new ImapDB.SearchQuery(local, query, strategy);
}
public override async Gee.Collection<Geary.EmailIdentifier>? local_search_async(Geary.SearchQuery query,
int limit = 100, int offset = 0, Gee.Collection<Geary.FolderPath?>? folder_blacklist = null,
Gee.Collection<Geary.EmailIdentifier>? search_ids = null, Cancellable? cancellable = null) throws Error {
......@@ -833,7 +837,7 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount {
return yield local.search_async(query, limit, offset, folder_blacklist, search_ids, cancellable);
}
public override async Gee.Collection<string>? get_search_matches_async(Geary.SearchQuery query,
public override async Gee.Set<string>? get_search_matches_async(Geary.SearchQuery query,
Gee.Collection<Geary.EmailIdentifier> ids, Cancellable? cancellable = null) throws Error {
return yield local.get_search_matches_async(query, check_ids(ids), cancellable);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment