Commit 968bc1a9 authored by Michael Gratton's avatar Michael Gratton 🍸 Committed by Michael Gratton

ImapDb.SearchQuery: Use libstemmer for stemming

Instead of shipping our own forked stemmer via SQLite, add a dependency
on libstemmer and use that instead.
parent 7c0ad051
......@@ -26,9 +26,9 @@ variables:
meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel
glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel
gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool
libappstream-glib-devel libgee-devel libhandy1-devel libpeas-devel
libsecret-devel libunwind-devel libxml2-devel libytnef-devel
sqlite-devel webkitgtk4-devel
libappstream-glib-devel libgee-devel libhandy1-devel
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz
# Ubuntu packages
......@@ -37,8 +37,8 @@ variables:
itstool libappstream-glib-dev libenchant-2-dev libfolks-dev
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev
libhandy-1-dev libjson-glib-dev libmessaging-menu-dev
libpeas-dev libsecret-1-dev libsqlite3-dev libunwind-dev
libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev
libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb
......
......@@ -91,8 +91,8 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \
gnome-online-accounts-devel gspell-devel gsound-devel \
gtk3-devel iso-codes-devel itstool json-glib-devel \
libappstream-glib-devel libgee-devel libhandy1-devel \
libpeas-devel libsecret-devel libunwind-devel libxml2-devel \
libytnef-devel sqlite-devel webkitgtk4-devel
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
```
Installing dependencies on Ubuntu/Debian
......@@ -107,8 +107,8 @@ sudo apt-get install meson build-essential valac \
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \
libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \
libsqlite3-dev libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev \
libytnef0-dev
libsqlite3-dev libstemmer-dev libunwind-dev \
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
```
And for Ubuntu Messaging Menu integration:
......
/*-
* Copyright (c) 2010-2013 Giulio Paci <giuliopaci@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
[CCode (cheader_filename = "libstemmer.h", lower_case_cprefix = "sb_", cprefix="sb_")]
namespace SnowBall {
[Compact]
[CCode (cname = "struct sb_stemmer", free_function="sb_stemmer_delete")]
public class Stemmer {
public unowned string stem(string word, int size);
public int length();
public Stemmer(string language, string encoding = "UTF_8");
[CCode (array_length = false, array_null_terminated = true)]
public static unowned string[] list();
}
}
\ No newline at end of file
......@@ -93,6 +93,7 @@ libmath = cc.find_library('m')
libpeas = dependency('libpeas-1.0', version: '>= 1.24.0')
libpeas_gtk = dependency('libpeas-gtk-1.0', version: '>= 1.24.0')
libsecret = dependency('libsecret-1', version: '>= 0.11')
libstemmer_dep = cc.find_library('stemmer')
libsoup = dependency('libsoup-2.4', version: '>= 2.48')
libunwind_dep = dependency(
'libunwind', version: '>= 1.1', required: get_option('libunwind')
......@@ -122,6 +123,13 @@ if libunwind_dep.found()
)
endif
libstemmer = declare_dependency(
dependencies: [
valac.find_library('libstemmer', dirs: [vapi_dir]),
libstemmer_dep,
],
)
# Optional dependencies
appstream_util = find_program('appstream-util', required: false)
desktop_file_validate = find_program('desktop-file-validate', required: false)
......
......@@ -318,6 +318,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
// A list of all search terms, regardless of search op field name
private Gee.ArrayList<Term> all = new Gee.ArrayList<Term>();
private SnowBall.Stemmer stemmer;
public async SearchQuery(Geary.Account owner,
ImapDB.Account local,
string query,
......@@ -325,6 +328,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
GLib.Cancellable? cancellable) {
base(owner, query, strategy);
this.account = local;
this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer());
switch (strategy) {
case Strategy.EXACT:
......@@ -611,20 +615,19 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
}
/**
* This method is used to convert an unquoted user-entered search terms into a stemmed search
* term.
* Converts unquoted search terms into a stemmed search term.
*
* Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
* tastes when coupled with prefix-matching of all unquoted terms (see
* https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy
* designed to dampen that aggressiveness without losing the benefits of stemming entirely.
* Prior experience with the Snowball stemmer indicates it is too
* aggressive for our tastes when coupled with prefix-matching of
* all unquoted terms (see
* https://bugzilla.gnome.org/show_bug.cgi?id=713179).
*
* Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
* with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite
* table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
* is still installed in Geary. Thus, we are now in the position to search for the original
* term and its stemmed variant, then do post-search processing to strip results which are
* too "greedy" due to prefix-matching the stemmed variant.
* This method is part of a larger strategy designed to dampen
* that aggressiveness without losing the benefits of stemming
* entirely: The database's FTS table uses no stemming, but
* libstemmer is used to generate stemmed search terms.
* Post-search processing is then to strip results which are too
* "greedy" due to prefix-matching the stemmed variant.
*
* Some heuristics are in place simply to determine if stemming should occur:
*
......@@ -647,36 +650,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
if (term_length < this.min_term_length_for_stemming)
return null;
string? stemmed = null;
try {
yield this.account.db.exec_transaction_async(RO,
(cx, cancellable) => {
Db.Statement stmt = cx.prepare("""
SELECT token
FROM TokenizerTable
WHERE input=?
""");
stmt.bind_string(0, term);
// get stemmed string; if no result, fall through
Db.Result result = stmt.exec(cancellable);
if (!result.finished) {
stemmed = result.string_at(0);
} else {
debug("No stemmed term returned for \"%s\"", term);
}
return COMMIT;
}, cancellable
);
} catch (Error err) {
debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
// fall-through
}
string? stemmed = this.stemmer.stem(term, term.length);
if (String.is_empty(stemmed)) {
debug("Empty stemmed term returned for \"%s\"", term);
return null;
}
......@@ -693,8 +669,52 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
}
debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
return stemmed;
}
private string find_appropriate_search_stemmer() {
// Unfortunately, the stemmer library only accepts the full language
// name for the stemming algorithm. This translates between the user's
// preferred language ISO 639-1 code and our available stemmers.
// FIXME: the available list here is determined by what's included in
// src/sqlite3-unicodesn/CMakeLists.txt. We should pass that list in
// instead of hardcoding it here.
foreach (string l in Intl.get_language_names()) {
switch (l) {
case "ar": return "arabic";
case "eu": return "basque";
case "ca": return "catalan";
case "da": return "danish";
case "nl": return "dutch";
case "en": return "english";
case "fi": return "finnish";
case "fr": return "french";
case "de": return "german";
case "el": return "greek";
case "hi": return "hindi";
case "hu": return "hungarian";
case "id": return "indonesian";
case "ga": return "irish";
case "it": return "italian";
case "lt": return "lithuanian";
case "ne": return "nepali";
case "no": return "norwegian";
case "pt": return "portuguese";
case "ro": return "romanian";
case "ru": return "russian";
case "sr": return "serbian";
case "es": return "spanish";
case "sv": return "swedish";
case "ta": return "tamil";
case "tr": return "turkish";
}
}
// Default to English because it seems to be on average the language
// most likely to be present in emails, regardless of the user's
// language setting. This is not an exact science, and search results
// should be ok either way in most cases.
return "english";
}
}
......@@ -323,6 +323,7 @@ engine_dependencies = [
glib,
gmime,
libmath,
libstemmer,
libxml,
posix,
sqlite
......
......@@ -150,6 +150,7 @@ foreach dep : valadoc_dependencies
valadoc_dep_args += '--pkg'
valadoc_dep_args += dep.name()
endforeach
valadoc_dep_args += [ '--pkg', 'libstemmer' ]
valadoc_dep_args += [ '--pkg', 'posix' ]
valadoc_vapidir_args = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment