Commit baf8b40e authored by Sam Thursfield's avatar Sam Thursfield

functional-tests: Rewrite tracker-extract tests to parse JSON-LD output

These tests have been broken for about 2 years (since the TrackerResource
work landed and changed the output generated by `tracker-extract
--file`).

In this commit they are reworked with a focus on using JSON-LD rather
than doing any custom parsing of SPARQL. The test description files are
now JSON too rather than .ini format, which is not strictly needed but
should make the code a lot simpler.

The failure messages could still be improved in many cases, help is
welcome here!
parent 441a0886
#!/usr/bin/python
#
# Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
# Copyright (C) 2018, Sam Thursfield <sam@afuera.me.uk>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
......@@ -24,12 +25,12 @@ directory (containing xxx.expected files)
"""
from common.utils import configuration as cfg
from common.utils.extractor import get_tracker_extract_output
from common.utils.extractor import get_tracker_extract_jsonld_output
import unittest2 as ut
import json
import os
import sys
import ConfigParser
class ExtractionTestCase (ut.TestCase):
......@@ -42,22 +43,22 @@ class ExtractionTestCase (ut.TestCase):
"""
ut.TestCase.__init__ (self, methodName)
# Load the description file
assert descfile
self.rel_description = descfile
self.configParser = self.__load_description_file (self.rel_description)
self.descfile = descfile
try:
with open(descfile) as f:
self.spec = json.load(f)
except ValueError as e:
self.fail("Error loading %s: %s" % (descfile, e))
# Add a method to the class called after the description file
methodName = self.rel_description.lower()[:-len(".expected")].replace (" ", "_")[-60:]
methodName = descfile.lower()[:-len(".expected")].replace (" ", "_")[-60:]
if (self.__is_expected_failure ()):
if (self.spec['test'].get('ExpectedFailure', False)):
setattr (self,
methodName,
self.expected_failure_test_extraction)
methodName,
self.expected_failure_test_extraction)
else:
setattr (self,
methodName,
self.generic_test_extraction)
setattr (self, methodName, self.generic_test_extraction)
# unittest framework will run the test called "self._testMethodName"
# So we set that variable to our new name
......@@ -69,28 +70,20 @@ class ExtractionTestCase (ut.TestCase):
"""
assert False
def __load_description_file (self, descfile):
configParser = ConfigParser.RawConfigParser ()
# Make it case sensitive:
configParser.optionxform = str
def __get_bugnumber (self):
    # Returns the Bugzilla reference string from the test spec, or None
    # when the optional 'Bugzilla' key is absent (dict.get default).
    return self.spec['test'].get('Bugzilla')
abs_description = os.path.abspath (descfile)
loaded_files = configParser.read (abs_description)
if not abs_description in loaded_files:
raise Exception("Unable to load %s" % (abs_description))
def generic_test_extraction (self):
abs_description = os.path.abspath (self.descfile)
return configParser
# Filename contains the file to extract, in a relative path to the description file
desc_root, desc_file = os.path.split (abs_description)
def __is_expected_failure (self):
assert self.configParser
return self.configParser.has_option ("TestFile", "ExpectedFailure")
filename_to_extract = self.spec['test']['Filename']
self.file_to_extract = os.path.join (desc_root, filename_to_extract)
def __get_bugnumber (self):
assert self.configParser
if self.configParser.has_option ("TestFile", "Bugzilla"):
return "'" + self.configParser.get ("TestFile", "Bugzilla") + "'"
else:
return None
result = get_tracker_extract_jsonld_output(self.file_to_extract)
self.__assert_extraction_ok (result)
def expected_failure_test_extraction (self):
try:
......@@ -103,19 +96,9 @@ class ExtractionTestCase (ut.TestCase):
else:
raise Exception ("Unexpected success. Check " + self.rel_description)
def generic_test_extraction (self):
abs_description = os.path.abspath (self.rel_description)
# Filename contains the file to extract, in a relative path to the description file
desc_root, desc_file = os.path.split (abs_description)
filename_to_extract = self.configParser.get ("TestFile", "Filename")
self.file_to_extract = os.path.join (desc_root, filename_to_extract)
result = get_tracker_extract_output(self.file_to_extract)
self.__assert_extraction_ok (result)
def assertDictHasKey (self, d, key, msg=None):
    """Assert that `d` is a dict and contains `key` (the value is not checked)."""
    if not isinstance(d, dict):
        self.fail ("Expected dict, got %s" % d)
    if not d.has_key (key):
        standardMsg = "Missing: %s\n" % (key)
        self.fail (self._formatMessage (msg, standardMsg))
......@@ -135,75 +118,111 @@ class ExtractionTestCase (ut.TestCase):
self.fail (self._formatMessage (msg, standardMsg))
def __assert_extraction_ok (self, result):
self.__check_section ("Metadata", result)
def __check_section (self, section, result):
error_missing_prop = "Property '%s' hasn't been extracted from file \n'%s'\n (requested on '%s' [%s])"
error_wrong_value = "on property '%s' from file %s\n (requested on: '%s' [%s])"
error_extra_prop = "Property '%s' was explicitely banned for file \n'%s'\n (requested on '%s' [%s])"
error_extra_prop_v = "Property '%s' with value '%s' was explicitely banned for file \n'%s'\n (requested on %s' [%s])"
try:
self.__check (self.spec['metadata'], result)
except AssertionError as e:
print("\ntracker-extract returned: %s" % json.dumps(result, indent=4))
raise
def __check (self, spec, result):
error_missing_prop = "Property '%s' hasn't been extracted from file \n'%s'\n (requested on '%s')"
error_wrong_value = "on property '%s' from file %s\n (requested on: '%s')"
error_wrong_length = "Length mismatch on property '%s' from file %s\n (requested on: '%s')"
error_extra_prop = "Property '%s' was explicitely banned for file \n'%s'\n (requested on '%s')"
error_extra_prop_v = "Property '%s' with value '%s' was explicitely banned for file \n'%s'\n (requested on %s')"
expected_pairs = [] # List of expected (key, value)
unexpected_pairs = [] # List of unexpected (key, value)
expected_keys = [] # List of expected keys (the key must be there, value doesnt matter)
for k, v in self.configParser.items (section):
for k, v in spec.items():
if k.startswith ("!"):
unexpected_pairs.append ( (k[1:].replace ("_", ":"), v) )
elif k.startswith ("@"):
expected_keys.append ( k[1:].replace ("_", ":") )
unexpected_pairs.append ( (k[1:], v) )
elif k == '@type':
expected_keys.append ( '@type' )
else:
expected_pairs.append ( (k.replace ("_", ":"), v) )
expected_pairs.append ( (k, v) )
for (prop, value) in expected_pairs:
for prop, expected_value in expected_pairs:
self.assertDictHasKey (result, prop,
error_missing_prop % (prop,
self.file_to_extract,
self.rel_description,
section))
if value == "@URNUUID@":
# Watch out! We take only the FIRST element. Incompatible with multiple-valued props.
self.assertIsURN (result [prop][0],
self.descfile))
if expected_value == "@URNUUID@":
self.assertIsURN (result [prop][0]['@id'],
error_wrong_value % (prop,
self.file_to_extract,
self.rel_description,
section))
self.descfile))
else:
self.assertIn (value, result [prop],
error_wrong_value % (prop,
self.file_to_extract,
self.rel_description,
section))
if isinstance(expected_value, list):
if not isinstance(result[prop], list):
raise AssertionError("Expected a list property for %s, but got a %s: %s" % (
prop, type(result[prop]).__name__, result[prop]))
self.assertEqual (len(expected_value), len(result[prop]),
error_wrong_length % (prop,
self.file_to_extract,
self.descfile))
for i in range(0, len(expected_value)):
self.__check(spec[prop][i], result[prop][i])
elif isinstance(expected_value, dict):
self.__check(expected_value, result[prop])
else:
self.assertEqual (str(spec[prop]), str(result [prop]),
error_wrong_value % (prop,
self.file_to_extract,
self.descfile))
for (prop, value) in unexpected_pairs:
# There is no prop, or it is but not with that value
if (value == ""):
self.assertFalse (result.has_key (prop), error_extra_prop % (prop,
self.file_to_extract,
self.rel_description,
section))
self.descfile))
else:
if (value == "@URNUUID@"):
self.assertIsURN (result [prop][0], error_extra_prop % (prop,
self.file_to_extract,
self.rel_description,
section))
self.descfile))
else:
self.assertNotIn (value, result [prop], error_extra_prop_v % (prop,
value,
self.file_to_extract,
self.rel_description,
section))
self.descfile))
for prop in expected_keys:
self.assertDictHasKey (result, prop,
error_missing_prop % (prop,
self.file_to_extract,
self.rel_description,
section))
self.descfile))
def run_all ():
    ##
    # Traverse the TEST_DATA_PATH directory looking for .expected files.
    # Add a new TestCase to the suite per .expected file and run the suite.
    #
    # If we did this inside a single TestCase an error in one test would stop
    # the whole testing.
    ##
    if (os.path.exists (os.getcwd() + "/test-extraction-data")):
        # Use local directory if available
        TEST_DATA_PATH = os.getcwd() + "/test-extraction-data"
    else:
        # Fall back to the installed test data location
        TEST_DATA_PATH = os.path.join (cfg.DATADIR, "tracker-tests",
                                       "test-extraction-data")
    print "Loading test descriptions from", TEST_DATA_PATH
    extractionTestSuite = ut.TestSuite ()
    for root, dirs, files in os.walk (TEST_DATA_PATH):
        descriptions = [os.path.join (root, f) for f in files if f.endswith ("expected")]
        for descfile in descriptions:
            tc = ExtractionTestCase(descfile=descfile)
            extractionTestSuite.addTest(tc)
    result = ut.TextTestRunner (verbosity=1).run (extractionTestSuite)

    # Exit status 0 only if every test passed.
    sys.exit(not result.wasSuccessful())
def run_one (filename):
##
# Run just one .description file
......@@ -218,10 +237,10 @@ def run_one (filename):
sys.exit(not result.wasSuccessful())
test = sys.argv[1]
if os.path.exists (sys.argv[1]) and sys.argv[1].endswith (".expected"):
if len(sys.argv) == 2:
run_one (sys.argv[1])
elif len(sys.argv) == 1:
run_all ()
else:
print("Usage: %s [FILE.expected]" % (sys.argv[0]))
sys.stderr.write("Too many arguments.")
sys.exit(1)
#!/usr/bin/python
#
# Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
# Copyright (C) 2018, Sam Thursfield <sam@afuera.me.uk>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
......@@ -20,246 +21,19 @@
from common.utils import configuration as cfg
from common.utils.helpers import log
import json
import os
import re
import subprocess
class ExtractorParser(object):
    """Parse the SPARQL text emitted by `tracker-extract --file` into a dict.

    NOTE(review): legacy, line-based ad-hoc parsing of the pre-JSON-LD
    output format; it does not use a real SPARQL parser.
    """

    def parse_tracker_extract_output(self, text):
        """
        Parse stdout of `tracker-extract --file` to get SPARQL statements.

        Calls the extractor and returns a dictionary of property, value.

        Example:
            { 'nie:filename': 'a.jpeg' ,
              'tracker:added': '2008-12-12T12:23:34Z'
            }
        """
        metadata = {}

        parts = self.get_statements_from_stdout_output(text)
        # 'where' section maps SPARQL variables (e.g. ?v) to tag labels.
        extras = self.__process_where_part(parts['where'])

        for attribute_value in self.__process_lines(parts['item']):
            att, value = attribute_value.split(" ", 1)
            # Substitute a SPARQL variable reference with its tag label.
            if value.startswith("?") and extras.has_key(value):
                value = extras[value]

            # Every property maps to a list, so multi-valued properties work.
            if metadata.has_key(att):
                metadata [att].append(value)
            else:
                metadata [att] = [value]

        return metadata

    def get_statements_from_stdout_output(self, text):
        """Split tracker-extract output into named SPARQL sections.

        Sections are announced by header lines such as 'SPARQL item:' and
        their bodies are delimited by a pair of '--' lines.  Returns a dict
        mapping section name ('preupdate', 'item', 'where', 'postupdate')
        to the section text.  Raises if a section is left open or no
        section was found at all.
        """
        lines = text.split('\n')
        parts = {}

        current_part = None
        part_start = None

        i = 0
        for i in range(0, len(lines)):
            if lines[i] == 'SPARQL pre-update:':
                current_part = 'preupdate'
            elif lines[i] == 'SPARQL item:':
                current_part = 'item'
            elif lines[i] == 'SPARQL where clause:':
                current_part = 'where'
            elif lines[i] == 'SPARQL post-update:':
                current_part = 'postupdate'

            if lines[i] == '--':
                if part_start is None:
                    # Opening '--': body starts on the next line.
                    part_start = i + 1
                else:
                    # Closing '--': store the collected body.
                    part_lines = lines[part_start:i]
                    parts[current_part] = '\n'.join(part_lines)
                    current_part = None
                    part_start = None

        if current_part is not None:
            raise Exception("End of text while parsing %s in tracker-extract "
                            "output" % current_part)

        if len(parts) == 0:
            raise Exception("No metadata was found by tracker-extract")

        return parts

    def __process_lines(self, embedded):
        """
        Translate each line in a "prop value" string, handling anonymous nodes.

        Example:
            nfo:width 699 ;  -> 'nfo:width 699'
        or
            nao:hasTag [ a nao:Tag ;
                         nao:prefLabel "tracker"] ;  -> nao:hasTag:prefLabel 'tracker'

        Would be so cool to implement this with yield and generators... :)
        """
        grouped_lines = []
        current_line = ""
        anon_node_open = False
        for l in embedded.split ("\n\t"):
            if "[" in l:
                # Start of an anonymous node: keep accumulating lines.
                current_line = current_line + l
                anon_node_open = True
                continue

            if "]" in l:
                # End of an anonymous node: flatten it into plain properties.
                anon_node_open = False
                current_line += l
                final_lines = self.__handle_anon_nodes (current_line.strip ())
                grouped_lines = grouped_lines + final_lines
                current_line = ""
                continue

            if anon_node_open:
                current_line += l
            else:
                # Skip blank lines outside anonymous nodes.
                if (len (l.strip ()) == 0):
                    continue
                final_lines = self.__handle_multivalues (l.strip ())
                grouped_lines = grouped_lines + final_lines

        return map (self.__clean_value, grouped_lines)

    def __process_where_part(self, where):
        """Map SPARQL variable names to nao:Tag labels found in the WHERE clause."""
        gettags = re.compile ("(\?\w+)\ a\ nao:Tag\ ;\ nao:prefLabel\ \"([\w\ -]+)\"")
        tags = {}
        for l in where.split ("\n"):
            if len (l) == 0:
                continue
            match = gettags.search (l)
            if (match):
                # e.g. tags['?v'] = 'tracker'
                tags [match.group(1)] = match.group (2)
            else:
                print "This line is not a tag:", l

        return tags

    def __handle_multivalues(self, line):
        """
        Split multivalues like:
        a nfo:Image, nmm:Photo ;
           -> a nfo:Image ;
           -> a nmm:Photo ;
        """
        # A comma inside a quoted string must not be treated as a separator.
        hasEscapedComma = re.compile ("\".+,.+\"")

        if "," in line and not hasEscapedComma.search (line):
            prop, multival = line.split (" ", 1)
            results = []
            for value in multival.split (","):
                results.append ("%s %s" % (prop, value.strip ()))
            return results
        else:
            return [line]

    def __handle_anon_nodes(self, line):
        """
        Translates anonymous nodes into 'flat' properties:

        nao:hasTag [a nao:Tag; nao:prefLabel "xxx"]
          -> nao:hasTag:prefLabel "xxx"

        slo:location [a slo:GeoLocation; slo:postalAddress <urn:uuid:1231-123> .]
          -> slo:location <urn:uuid:1231-123>

        nfo:hasMediaFileListEntry [ a nfo:MediaFileListEntry ; nfo:entryUrl "file://x.mp3"; nfo:listPosition 1]
          -> nfo:hasMediaFileListEntry:entryUrl "file://x.mp3"
        """
        # hasTag case
        if line.startswith ("nao:hasTag"):
            getlabel = re.compile ("nao:prefLabel\ \"([\w\ -]+)\"")
            match = getlabel.search (line)
            if (match):
                line = 'nao:hasTag:prefLabel "%s" ;' % (match.group(1))
                return [line]
            else:
                print "Whats wrong on line", line, "?"
                return [line]

        # location case
        elif line.startswith ("slo:location"):
            results = []

            # Can have country AND/OR city
            getpa = re.compile ("slo:postalAddress\ \<([\w:-]+)\>")
            pa_match = getpa.search (line)
            if (pa_match):
                results.append ('slo:location:postalAddress "%s" ;' % (pa_match.group(1)))
            else:
                print "FIXME another location subproperty in ", line

            return results
        elif line.startswith ("nco:creator"):
            getcreator = re.compile ("nco:fullname\ \"([\w\ ]+)\"")
            creator_match = getcreator.search (line)
            if (creator_match):
                new_line = 'nco:creator:fullname "%s" ;' % (creator_match.group (1))
                return [new_line]
            else:
                # NOTE(review): this branch falls through and the method
                # implicitly returns None — callers concatenate the result,
                # so this would raise; confirm it never happens in practice.
                print "Something special in this line '%s'" % (line)
        elif line.startswith ("nfo:hasMediaFileListEntry"):
            return self.__handle_playlist_entries (line)
        else:
            return [line]

    def __handle_playlist_entries(self, line):
        """
        Playlist entries come in one big line:
        nfo:hMFLE [ a nfo:MFLE; nfo:entryUrl '...'; nfo:listPosition X] , [ ... ], [ ... ]
          -> nfo:hMFLE:entryUrl '...'
          -> nfo:hMFLE:entryUrl '...'
          ...
        """
        geturl = re.compile ("nfo:entryUrl \"([\w\.\:\/]+)\"")
        entries = line.strip () [len ("nfo:hasMediaFileListEntry"):]
        results = []
        for entry in entries.split (","):
            url_match = geturl.search (entry)
            if (url_match):
                new_line = 'nfo:hasMediaFileListEntry:entryUrl "%s" ;' % (url_match.group (1))
                results.append (new_line)
            else:
                print " *** Something special in this line '%s'" % (entry)
        return results

    def __clean_value(self, value):
        """
        Strip the trailing ';' or '.' statement terminator and any quotes.
        """
        if (len (value) < 2):
            return value.strip ()

        clean = value.strip ()
        if value[-1] in [';', '.']:
            clean = value [:-1]

        clean = clean.replace ("\"", "")

        return clean.strip ()
def get_tracker_extract_output(filename, mime_type=None):
def get_tracker_extract_jsonld_output(filename, mime_type=None):
"""
Runs `tracker-extract --file` to extract metadata from a file.
"""
tracker_extract = os.path.join (cfg.TRACKER_EXTRACT_PATH)
command = [tracker_extract, '--file', filename]
command = [tracker_extract, '--verbosity=0', '--output-format=json-ld', '--file', filename]
if mime_type is not None:
command.extend(['--mime', mime_type])
......@@ -267,8 +41,12 @@ def get_tracker_extract_output(filename, mime_type=None):
log ('Running: %s' % ' '.join(command))
output = subprocess.check_output (command)
except subprocess.CalledProcessError as e:
raise Exception("Error %i from %s, output: %s" %
(e.returncode, tracker_extract, e.output))
raise Exception("Error %i from %s, output, see stderr for details" %
(e.returncode, tracker_extract))
try:
data = json.loads(output)
except ValueError as e:
raise RuntimeError("Invalid JSON returned by tracker-extract: "
"%s.\nOutput was: %s" % (e, output))
parser = ExtractorParser()
return parser.parse_tracker_extract_output(output)
return data
......@@ -18,27 +18,27 @@ functional_tests = [
]
extractor_tests = [
'audio/audio-test-1.expected',
'audio/empty_albumi_song3.expected',
'audio/Jazz_Audio_OPLs0.expected',
'audio/audio-test-2.expected',
'images/test-image-1.expected',
'images/xmp-loaded-1.expected',
'images/test-image-3.expected',
'images/corrupted-image.expected',
'images/test-iptcdata-records.expected',
'images/roi2.expected',
'images/test-image-4.expected',
'images/test-image-2.expected',
'images/comment-extension-block.expected',
'images/roi.expected',
'video/184505.expected',
'video/video-1.expected',
'video/video-2.expected',
'playlists/playlist-test-1.expected',
'office/office-doc.expected',
'office/powerpoint.expected',
'office/pdf-doc.expected',
'audio/audio-test-1.expected.json',
'audio/empty_albumi_song3.expected.json',
'audio/Jazz_Audio_OPLs0.expected.json',
'audio/audio-test-2.expected.json',
'images/test-image-1.expected.json',
'images/xmp-loaded-1.expected.json',
'images/test-image-3.expected.json',
'images/corrupted-image.expected.json',
'images/test-iptcdata-records.expected.json',
'images/roi2.expected.json',
'images/test-image-4.expected.json',
'images/test-image-2.expected.json',
'images/comment-extension-block.expected.json',
'images/roi.expected.json',
'video/184505.expected.json',
'video/video-1.expected.json',
'video/video-2.expected.json',
'playlists/playlist-test-1.expected.json',
'office/office-doc.expected.json',
'office/powerpoint.expected.json',
'office/pdf-doc.expected.json',
]
config_json_full_path = join_paths(meson.current_build_dir(), 'configuration.json')
......
test-extraction-data
====================
The python test will traverse recursively this directory, loading the .expected files as tests.
Each test for the extractor is defined by a .expected file.
.expected file format
=====================
It is a .ini (.desktop) formatted file with two mandatory sections: TestFile and Metadata.
It is a JSON file with two top-level keys: "test" and "metadata".
The TestFile section contains the
The "test" key should contain the following subkeys:
* Filename (mandatory): relative path from the .expected of the file under test.
* Bugzilla (optional): reference to a bug related with the file.
* Comment (optional): brief description of why that file is in the test suite (problematic, format example...)
* ExpectedFailure (optional): If this key is present, we expect the extraction to fail.
If ExpectedFailure is included, it is recommended to fill the Bugzilla field!
The Metadata section contains pairs of property=values with few special rules:
1. The ':' in the properties is replaced with '_'. Note that in the values it is ok to have ':'.
E.G. nfo:duration=5 -> nfo_duration=5
but a=nmm:Video -> a=nmm:Video
The "metadata" section lists the metadata we expect to see returned by
tracker-extract (when tracker-extract is run with --output-format=json-ld).
2. If the property name is prefixed with '@' then the property is expected in the extraction, but the value
won't be checked.
There are a few special rules:
E.G. The extraction is: slo:location [a slo:GeoPoint; slo:postalAddress <urn:uuid:123-123>];
1. If the property name is prefixed with '@' then the property is expected in the extraction, but the value
won't be checked.
@slo_location= -> PASS slo:location is in the extraction
3. If the property name is prefixed with '!' then the property is NOT expected in the extraction
2. If the property name is prefixed with '!' then the property is NOT expected in the extraction
If the negated property has a value, it forbids the exact value. Otherwise, it forbids the property at all.
E.G. The extraction is: a nmm:Video; nfo:duration 50.
!a=nmm:Audio -> PASS because there is no "a nmm:Audio"
!nfo_duration=12 -> PASS because duration has a different value
!nfo_duration= -> FAIL because there shouldn't be any nfo:duration at all
4. The translation of the extraction results to a python dictionary is very basic.
It handles a couple of special cases, relevant for testing:
E.G.
E.G. The extraction is: { "@type": "nmm:Video", "nfo:duration": 50 }
slo:location [a slo:GeoLocation; slo:PostalAddress "XXX"] -> slo_location_postalAddress=XXX
{ "!@type": "nmm:Audio" } -> PASS because there is no { "@type": "nmm:Audio" }
{ "!nfo_duration": "12" } -> PASS because duration has a different value
{ "!nfo_duration": null } -> FAIL because there shouldn't be any nfo:duration at all
Note 24/08/2011: Tags are translated as "nao_Tag=value, value, value" instead of
the old "nao_Tag_preflabel=value, value, value"
5. There is (so far only) one constant defined to use in the values:
3. There is (so far only) one constant defined to use in the values:
@URNUUID@ meaning an automatic generated urn:uuid:1231-123-123 URL
[This constant must not be used with multiple-valued properties. The code only check the first result.]
......@@ -56,13 +40,15 @@ Example
Everything together should look like this:
[TestFile]
Filename=x.mp4
Bugzilla=GB#123123
Comment=Video usually classified as Audio by mistake
[Metadata]
a=nmm:Video
!a=nmm:Audio
nfo_duration=123