From 3fc25f2104a37a70a14748c9548d2cfa542f33a8 Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Fri, 24 Jul 2020 22:30:31 +0200 Subject: [PATCH 1/2] tracker-extract: Ensure that we ignore failed files in all situations We rely on setting the tracker:extractorHash property to mark files which are processed. If this isn't set the extractor may get stuck repeatedly processing the same file. This commit fixes some edge cases where files might not be marked. Fixes https://gitlab.gnome.org/GNOME/tracker-miners/-/issues/118 --- .../tracker-extract-decorator.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/tracker-extract/tracker-extract-decorator.c b/src/tracker-extract/tracker-extract-decorator.c index 20152abb2..cfec34950 100644 --- a/src/tracker-extract/tracker-extract-decorator.c +++ b/src/tracker-extract/tracker-extract-decorator.c @@ -609,18 +609,21 @@ decorator_ignore_file (GFile *file, NULL, &error); if (!info) { g_warning ("Could not get mimetype: %s", error->message); - g_error_free (error); - return; - } + g_clear_error (&error); - mimetype = g_file_info_get_attribute_string (info, - G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE); - hash = tracker_extract_module_manager_get_hash (mimetype); - g_object_unref (info); + /* Use a zero hash value to mark that we tried to process the file and got nowhere */ + hash = "00000000"; + } else { + mimetype = g_file_info_get_attribute_string (info, + G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE); + hash = tracker_extract_module_manager_get_hash (mimetype); + g_object_unref (info); + } conn = tracker_miner_get_connection (TRACKER_MINER (decorator)); query = g_strdup_printf ("INSERT DATA { GRAPH tracker:FileSystem {" - " <%s> tracker:extractorHash \"%s\" ;" + " <%s> a nfo:FileDataObject ; " + " tracker:extractorHash \"%s\" ." 
"}}", uri, hash); -- GitLab From fdd783bafb19886e6fa79ffdc77292e881a5395c Mon Sep 17 00:00:00 2001 From: Sam Thursfield Date: Fri, 24 Jul 2020 22:34:20 +0200 Subject: [PATCH 2/2] functional-tests: Test some failure cases of tracker-extract --- tests/functional-tests/extractor-decorator.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/functional-tests/extractor-decorator.py b/tests/functional-tests/extractor-decorator.py index 41c8eec17..e3fd6e3d3 100755 --- a/tests/functional-tests/extractor-decorator.py +++ b/tests/functional-tests/extractor-decorator.py @@ -20,6 +20,7 @@ Tests failure cases of tracker-extract. """ +import pathlib import os import shutil import unittest as ut @@ -37,6 +38,12 @@ TRACKER_EXTRACT_FAILURE_DATA_SOURCE = 'tracker:extractor-failure-data-source' class ExtractorDecoratorTest(fixtures.TrackerMinerTest): + def create_test_file(self, path): + testfile = pathlib.Path(self.workdir).joinpath(path) + testfile.parent.mkdir(parents=True, exist_ok=True) + testfile.write_text("Hello, I'm a test file.") + return testfile + def test_reextraction(self): """Tests whether known files are still re-extracted on user request.""" miner_fs = self.miner_fs @@ -74,6 +81,45 @@ class ExtractorDecoratorTest(fixtures.TrackerMinerTest): finally: os.remove(file_path) + def await_failsafe_marker_inserted(self, graph, path, timeout=cfg.AWAIT_TIMEOUT): + url = path.as_uri() + expected = [ + f'a rdfs:Resource. <{url}> tracker:extractorHash ?hash' + ] + + return self.tracker.await_insert(graph, '; '.join(expected), timeout=timeout) + + def test_extract_failure(self): + """ + Tests a file which extractor will fail to process. + """ + + # This file will be processed by the mp3 or gstreamer extractor due to + # its extension, but it's not a valid MP3. + testfile = self.create_test_file('test-not-monitored/invalid.mp3') + + # The extractor hash should be recorded against the file, so it won't + # try to process it again. 
+ with self.await_failsafe_marker_inserted(fixtures.FILESYSTEM_GRAPH, testfile): + self.miner_fs.index_file(testfile.as_uri()) + + def test_extract_missing_file(self): + """ + Tests that there are no problems if the file to be extracted is missing. + """ + # The extractor should record the file in the store as a failure. + missing_file = pathlib.Path('/missing-file') + assert not missing_file.exists() + + with self.await_failsafe_marker_inserted(fixtures.FILESYSTEM_GRAPH, missing_file): + missing_file_url = missing_file.as_uri() + self.miner_fs.get_sparql_connection().update( + "INSERT DATA { " + " GRAPH tracker:Documents { " + f" <{missing_file_url}> a nfo:Document , nfo:FileDataObject . " + " } " + "}", None) + if __name__ == '__main__': fixtures.tracker_test_main() -- GitLab