Commit 8092dd15 authored by Martyn Russell's avatar Martyn Russell

tracker-extract: Removed support for libstreamanalyzer

This has not been compiled or used in years, and it likely doesn't produce
ontology-compatible output that we could use.

EXTERMINATE!
parent 30dd98e5
......@@ -189,8 +189,6 @@ EVO_REQUIRED=2.32.0
EVO_SHELL_REQUIRED=2.32.0
EDS_REQUIRED=2.32.0
CAMEL_REQUIRED=2.32.0
# Unlikely version for now, Nepomuk integration isn't finished in streamanalyzer atm
LIBSTREAMANALYZER_REQUIRED=0.7.0
GEE_REQUIRED=0.3
TAGLIB_REQUIRED=1.6
LIBGRSS_REQUIRED=0.5
......@@ -1500,39 +1498,6 @@ fi
AM_CONDITIONAL(HAVE_ENCA, test "$have_enca" = "yes")
####################################################################
# Check for tracker-extract: libstreamanalyzer
####################################################################
AC_ARG_ENABLE(libstreamanalyzer,
AS_HELP_STRING([--enable-libstreamanalyzer],
[enable libstreamananalyzer [[default=no]]]),,
[enable_libstreamanalyzer=no])
if test "x$enable_libstreamanalyzer" != "xno"; then
PKG_CHECK_MODULES(LIBSTREAMANALYZER,
[libstreamanalyzer >= $LIBSTREAMANALYZER_REQUIRED],
[have_libstreamanalyzer=yes],
[have_libstreamanalyzer=no])
TRACKER_EXTRACT_CFLAGS="$TRACKER_EXTRACT_CFLAGS $LIBSTREAMANALYZER_CFLAGS"
TRACKER_EXTRACT_LIBS="$TRACKER_EXTRACT_LIBS $LIBSTREAMANALYZER_LIBS"
if test "x$have_libstreamanalyzer" = "xyes"; then
AC_DEFINE(HAVE_LIBSTREAMANALYZER, [], [Define if we have libstreamanalyzer])
fi
else
have_libstreamanalyzer="no (disabled)"
fi
if test "x$enable_libstreamanalyzer" = "xyes"; then
if test "x$have_libstreamanalyzer" != "xyes"; then
AC_MSG_ERROR([Couldn't find libstreamanalyzer >= $LIBSTREAMANALYZER_REQUIRED and libstreamanalyzer.])
fi
fi
AM_CONDITIONAL(HAVE_LIBSTREAMANALYZER, test "$have_libstreamanalyzer" = "yes")
##################################################################
# Check for tracker-extract: libxml2 for XML/HTML extractor
##################################################################
......@@ -2572,7 +2537,6 @@ Applications:
Metadata Extractors:
Support libstreamanalyzer: $have_libstreamanalyzer
Support PNG: $have_libpng
Support PDF: $have_poppler
Support XPS: $have_libgxps
......
......@@ -33,10 +33,6 @@ be either a local path or a URI. It also does not have to be an absolute path.
The \fIMIME\fR type to use for the file. If one is not provided, it
will be guessed automatically.
.TP
.B \-i, \-\-force-internal-extractors
Use this option to force internal extractors over 3rd parties like
libstreamanalyzer.
.TP
.B \-m, \-\-force-module=MODULE
Force a particular module to be used. This is here as a convenience
for developers wanting to test their \fIMODULE\fR file. Only the
......
......@@ -559,10 +559,6 @@ tracker_extract_CFLAGS = $(LIBGSF_CFLAGS)
tracker_extract_LDADD += $(LIBGSF_LIBS)
endif
if HAVE_LIBSTREAMANALYZER
tracker_extract_SOURCES += tracker-topanalyzer.cpp tracker-topanalyzer.h
endif
# do nothing, output as a side-effect
tracker-extract-priority-dbus.c: tracker-extract-priority-dbus-stamp
@:
......
......@@ -37,10 +37,6 @@
#include "tracker-extract.h"
#include "tracker-main.h"
#ifdef HAVE_LIBSTREAMANALYZER
#include "tracker-topanalyzer.h"
#endif /* HAVE_STREAMANALYZER */
#ifdef THREAD_ENABLE_TRACE
#warning Main thread traces enabled
#endif /* THREAD_ENABLE_TRACE */
......@@ -72,7 +68,6 @@ typedef struct {
GHashTable *single_thread_extractors;
gboolean disable_shutdown;
gboolean force_internal_extractors;
gboolean disable_summary_on_finalize;
gchar *force_module;
......@@ -129,10 +124,6 @@ tracker_extract_init (TrackerExtract *object)
{
TrackerExtractPrivate *priv;
#ifdef HAVE_LIBSTREAMANALYZER
tracker_topanalyzer_init ();
#endif /* HAVE_STREAMANALYZER */
priv = TRACKER_EXTRACT_GET_PRIVATE (object);
priv->statistics_data = g_hash_table_new_full (NULL, NULL, NULL,
(GDestroyNotify) statistics_data_free);
......@@ -159,10 +150,6 @@ tracker_extract_finalize (GObject *object)
report_statistics (object);
}
#ifdef HAVE_LIBSTREAMANALYZER
tracker_topanalyzer_shutdown ();
#endif /* HAVE_STREAMANALYZER */
g_hash_table_destroy (priv->statistics_data);
g_mutex_clear (&priv->task_mutex);
......@@ -217,7 +204,6 @@ report_statistics (GObject *object)
TrackerExtract *
tracker_extract_new (gboolean disable_shutdown,
gboolean force_internal_extractors,
const gchar *force_module)
{
TrackerExtract *object;
......@@ -233,7 +219,6 @@ tracker_extract_new (gboolean disable_shutdown,
priv = TRACKER_EXTRACT_GET_PRIVATE (object);
priv->disable_shutdown = disable_shutdown;
priv->force_internal_extractors = force_internal_extractors;
priv->force_module = g_strdup (force_module);
return object;
......@@ -287,9 +272,6 @@ get_file_metadata (TrackerExtractTask *task,
TrackerExtractInfo *info;
GFile *file;
gchar *mime_used = NULL;
#ifdef HAVE_LIBSTREAMANALYZER
gchar *content_type = NULL;
#endif
gint items = 0;
*info_out = NULL;
......@@ -298,41 +280,10 @@ get_file_metadata (TrackerExtractTask *task,
info = tracker_extract_info_new (file, task->mimetype, task->graph);
g_object_unref (file);
#ifdef HAVE_LIBSTREAMANALYZER
/* FIXME: This entire section is completely broken,
* it doesn't even build these days. It should be removed or fixed.
* -mr (05/09/11)
*/
if (!priv->force_internal_extractors) {
g_debug ("Using libstreamanalyzer...");
tracker_topanalyzer_extract (task->file, statements, &content_type);
if (tracker_sparql_builder_get_length (statements) > 0) {
g_free (content_type);
tracker_sparql_builder_insert_close (statements);
*info_out = info;
return TRUE;
}
} else {
g_debug ("Using internal extractors ONLY...");
}
#endif /* HAVE_LIBSTREAMANALYZER */
if (task->mimetype && *task->mimetype) {
/* We know the mime */
mime_used = g_strdup (task->mimetype);
}
#ifdef HAVE_LIBSTREAMANALYZER
else if (content_type && *content_type) {
/* We know the mime from LSA */
mime_used = content_type;
g_strstrip (mime_used);
}
#endif /* HAVE_LIBSTREAMANALYZER */
else {
} else {
tracker_extract_info_unref (info);
return FALSE;
}
......
......@@ -50,7 +50,6 @@ struct TrackerExtractClass {
GType tracker_extract_get_type (void);
TrackerExtract *tracker_extract_new (gboolean disable_shutdown,
gboolean force_internal_extractors,
const gchar *force_module);
void tracker_extract_file (TrackerExtract *extract,
......
......@@ -71,14 +71,11 @@
"\n" \
" http://www.gnu.org/licenses/gpl.txt\n"
#define QUIT_TIMEOUT 30 /* 1/2 minutes worth of seconds */
static GMainLoop *main_loop;
static gint verbosity = -1;
static gchar *filename;
static gchar *mime_type;
static gboolean force_internal_extractors;
static gchar *force_module;
static gboolean version;
......@@ -98,10 +95,6 @@ static GOptionEntry entries[] = {
G_OPTION_ARG_STRING, &mime_type,
N_("MIME type for file (if not provided, this will be guessed)"),
N_("MIME") },
{ "force-internal-extractors", 'i', 0,
G_OPTION_ARG_NONE, &force_internal_extractors,
N_("Force internal extractors over 3rd parties like libstreamanalyzer"),
NULL },
{ "force-module", 'm', 0,
G_OPTION_ARG_STRING, &force_module,
N_("Force a module to be used for extraction (e.g. \"foo\" for \"foo.so\")"),
......@@ -288,9 +281,7 @@ run_standalone (TrackerConfig *config)
file = g_file_new_for_commandline_arg (filename);
uri = g_file_get_uri (file);
object = tracker_extract_new (TRUE,
force_internal_extractors,
force_module);
object = tracker_extract_new (TRUE, force_module);
if (!object) {
g_object_unref (file);
......@@ -354,20 +345,6 @@ main (int argc, char *argv[])
return EXIT_FAILURE;
}
if (force_internal_extractors && force_module) {
gchar *help;
g_printerr ("%s\n\n",
_("Options --force-internal-extractors and --force-module can't be used together"));
help = g_option_context_get_help (context, TRUE, NULL);
g_option_context_free (context);
g_printerr ("%s", help);
g_free (help);
return EXIT_FAILURE;
}
g_option_context_free (context);
if (version) {
......@@ -409,9 +386,7 @@ main (int argc, char *argv[])
tracker_db_manager_get_first_index_done () == FALSE);
tracker_memory_setrlimits ();
extract = tracker_extract_new (TRUE,
force_internal_extractors,
force_module);
extract = tracker_extract_new (TRUE, force_module);
if (!extract) {
g_object_unref (config);
......
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 2008, Nokia
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
* Authors: Philip Van Hoof <philip@codeminded.be>
*/
#include <glib.h>
#include <glib/gstdio.h>
#include <strigi/indexwriter.h>
#include <strigi/analysisresult.h>
#include <strigi/analyzerconfiguration.h>
#include <strigi/fileinputstream.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <map>
#include <sstream>
#include <algorithm>
#include <libtracker-common/tracker-type-utils.h>
#include <libtracker-common/tracker-os-dependant.h>
#include <libtracker-common/tracker-sparql-builder.h>
#include <libtracker-common/tracker-ontologies.h>
#include <libtracker-extract/tracker-utils.h>
#define NIE_PREFIX TRACKER_NIE_PREFIX
#include "tracker-main.h"
#include "tracker-topanalyzer.h"
using namespace std;
using namespace Strigi;
/* Key under which tracker_topanalyzer_init() stores the
 * TrackerTopanalyzerPrivate instance (via g_static_private_set()) and from
 * which tracker_topanalyzer_extract() retrieves it again.
 * NOTE(review): GStaticPrivate storage is presumably per-thread, so init and
 * extract would have to run on the same thread -- confirm against GLib docs. */
static GStaticPrivate private_key = G_STATIC_PRIVATE_INIT;
namespace Tracker {
/* Strigi::IndexWriter implementation that forwards what the analyzers report
 * (text, typed values, triplets) to a TrackerSparqlBuilder as
 * subject / predicate / object statements. The target builder is selected
 * with setParams() before each analysis run. */
class TripleCollector : public Strigi::IndexWriter
{
public:
TripleCollector ();
~TripleCollector ();
/* The following IndexWriter hooks have empty implementations in this file. */
void commit ();
void deleteEntries (const std::vector<std::string>& entries);
void deleteAllEntries ();
void initWriterData (const Strigi::FieldRegister&);
void releaseWriterData (const Strigi::FieldRegister&);
void startAnalysis (const AnalysisResult*);
/* Emits a nie:plainTextContent statement for the analysed path. */
void addText (const AnalysisResult*,
const char* text,
int32_t length);
/* addValue() overloads: emit one statement per reported field value, using
 * the result's path as subject and the field key as predicate, unless
 * predicateNeeded() rejects the predicate. */
void addValue (const AnalysisResult*,
const RegisteredField* field,
const std::string& value);
void addValue (const AnalysisResult*,
const RegisteredField* field,
const unsigned char* data,
uint32_t size);
void addValue (const AnalysisResult*,
const RegisteredField* field,
int32_t value);
void addValue (const AnalysisResult*,
const RegisteredField* field,
uint32_t value);
void addValue (const AnalysisResult*,
const RegisteredField* field,
double value);
/* Emits an explicit subject / predicate / object statement. */
void addTriplet (const std::string& subject,
const std::string& predicate,
const std::string& object);
/* Same as the string overload; the name argument is not used by the
 * implementation in this file. */
void addValue (const AnalysisResult*,
const RegisteredField* field,
const std::string& name,
const std::string& value);
void finishAnalysis (const AnalysisResult*);
/* Selects the URI and builder for the next run and clears content_type. */
void setParams (const gchar *uri_,
TrackerSparqlBuilder *metadata_);
/* g_strdup()'d copy of the value reported for the mimetype field at
 * depth 0, or NULL. Owned by the collector: released with g_free() in
 * setParams() and in the destructor. */
gchar *content_type;
private:
/* Return the predicate string for a field / key (currently the key itself). */
const gchar *predicateMapping (const RegisteredField *field);
const gchar *predicateMapping (const std::string &key);
/* FALSE for NULL and for predicates already covered by the miner-fs. */
gboolean predicateNeeded (const gchar *predicate);
/* Pointer handed to setParams(); stored as-is (not copied) and not read
 * anywhere else in this file. */
const gchar *uri;
/* Builder receiving the statements; stored as-is and never released here. */
TrackerSparqlBuilder *metadata;
};
/* Creates a collector that is not yet bound to any URI or builder.
 *
 * Every member is initialised (in declaration order) so that no callback can
 * ever observe an indeterminate pointer before setParams() has been called. */
Tracker::TripleCollector::TripleCollector ()
	: content_type (NULL),
	  uri (NULL),
	  metadata (NULL)
{
}
void Tracker::TripleCollector::setParams (const gchar *uri_, TrackerSparqlBuilder *metadata_)
{
uri = uri_;
metadata = metadata_;
g_free (content_type);
content_type = NULL;
}
/* Releases the remembered MIME type. The uri and metadata pointers are not
 * released here. */
Tracker::TripleCollector::~TripleCollector ()
{
	g_free (this->content_type);
}
/* The IndexWriter hooks below are intentionally empty: this writer only
 * collects statements and keeps no index of its own. */

void Tracker::TripleCollector::commit ()
{
}

void Tracker::TripleCollector::deleteEntries (const std::vector<std::string>&)
{
}

void Tracker::TripleCollector::deleteAllEntries ()
{
}

void Tracker::TripleCollector::initWriterData (const Strigi::FieldRegister&)
{
}

void Tracker::TripleCollector::releaseWriterData (const Strigi::FieldRegister&)
{
}

void Tracker::TripleCollector::startAnalysis (const AnalysisResult*)
{
}
/* Emits a nie:plainTextContent statement for a fragment of extracted text.
 *
 * @idx:    analysis result whose path() is used as the statement subject.
 * @text:   start of the text fragment.
 * @length: number of bytes in the fragment.
 *
 * The fragment is copied into a bounded, NUL-terminated buffer first, so
 * @length is honoured and the builder never reads past the end of @text.
 * A NULL @text or a negative @length is ignored. */
void Tracker::TripleCollector::addText (const AnalysisResult* idx,
                                        const char* text,
                                        int32_t length)
{
	if (!text || length < 0) {
		return;
	}

	/* Bounded copy: the builder API expects a NUL-terminated string. */
	const std::string fragment (text, static_cast<std::string::size_type> (length));

	tracker_sparql_builder_subject_iri (metadata, idx->path().c_str());
	tracker_sparql_builder_predicate_iri (metadata, NIE_PREFIX "plainTextContent");
	tracker_sparql_builder_object_unvalidated (metadata, fragment.c_str());
}
/* Maps a predicate key to the string used as the statement predicate.
 *
 * The key is currently passed through unchanged. An earlier, disabled draft
 * kept only the part after the last '/' and replaced its '#' with ':' to
 * build a prefixed name.
 *
 * The returned pointer is borrowed from @key and is only valid for as long
 * as that string is alive and unmodified. */
const gchar* Tracker::TripleCollector::predicateMapping (const std::string &key)
{
	const gchar *mapped = key.c_str();

	return mapped;
}
/* Maps a registered field to the string used as the statement predicate.
 *
 * The field key is currently passed through unchanged. An earlier, disabled
 * draft kept only the part after the last '/' and replaced its '#' with ':'
 * to build a prefixed name.
 *
 * NOTE(review): the returned pointer comes from field->key(); this presumably
 * relies on key() returning a reference to storage owned by @field -- confirm
 * against the Strigi headers. */
const gchar* Tracker::TripleCollector::predicateMapping (const RegisteredField *field)
{
	const gchar *mapped = field->key().c_str();

	return mapped;
}
/* Tells whether a statement with the given predicate should be emitted.
 *
 * Returns FALSE for a NULL predicate and for any predicate containing one of
 * the fragments listed below (those are already covered by the miner-fs);
 * returns TRUE otherwise. */
gboolean Tracker::TripleCollector::predicateNeeded (const gchar *predicate)
{
	/* We already cover these in the miner-fs */
	static const char * const covered_by_miner_fs[] = {
		"nfo#FileDataObject",
		"nfo#belongsToContainer",
		"nfo#fileName",
		"nfo#fileSize",
		"nfo#fileLastModified",
		"nfo#fileLastAccessed",
		"nie#InformationElement",
		"nie#isStoredAs",
		"nie#mimeType",
		"nie#dataSource"
	};
	const size_t n_covered = sizeof (covered_by_miner_fs) / sizeof (covered_by_miner_fs[0]);

	if (predicate == NULL) {
		return FALSE;
	}

	for (size_t i = 0; i < n_covered; i++) {
		if (strstr (predicate, covered_by_miner_fs[i]) != NULL) {
			return FALSE;
		}
	}

	return TRUE;
}
/* The methods below basically just convert the C++ world to the C world
* of TrackerSparqlBuilder. Nothing magical about it. */
void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
const RegisteredField* field,
const std::string& value)
{
const gchar *predicate = predicateMapping (field);
if (field->key() == FieldRegister::mimetypeFieldName && idx->depth() == 0) {
g_free (content_type);
content_type = g_strdup (value.c_str());
}
if (!predicateNeeded (predicate)) {
return;
}
tracker_sparql_builder_subject_iri (metadata, idx->path().c_str());
tracker_sparql_builder_predicate_iri (metadata, predicate);
tracker_sparql_builder_object_unvalidated (metadata, value.c_str());
}
void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
const RegisteredField* field,
const unsigned char* data,
uint32_t size )
{
const gchar *predicate = predicateMapping (field);
if (!predicateNeeded (predicate)) {
return;
}
tracker_sparql_builder_subject_iri (metadata, idx->path().c_str());
tracker_sparql_builder_predicate_iri (metadata, predicate);
tracker_sparql_builder_object_unvalidated (metadata, (const gchar*) data);
}
/* Handles a signed 32-bit value reported for a registered field: appends a
 * statement (path, field key, value as int64) to the builder unless
 * predicateNeeded() rejects the predicate. */
void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
                                         const RegisteredField* field,
                                         int32_t value)
{
	const gchar *predicate = predicateMapping (field);

	if (predicateNeeded (predicate)) {
		tracker_sparql_builder_subject_iri (metadata, idx->path().c_str());
		tracker_sparql_builder_predicate_iri (metadata, predicate);
		tracker_sparql_builder_object_int64 (metadata, value);
	}
}
/* Handles an unsigned 32-bit value reported for a registered field: appends a
 * statement (path, field key, value as int64) to the builder unless
 * predicateNeeded() rejects the predicate. */
void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
                                         const RegisteredField* field,
                                         uint32_t value )
{
	const gchar *predicate = predicateMapping (field);

	if (predicateNeeded (predicate)) {
		tracker_sparql_builder_subject_iri (metadata, idx->path().c_str());
		tracker_sparql_builder_predicate_iri (metadata, predicate);
		tracker_sparql_builder_object_int64 (metadata, value);
	}
}
/* Handles a floating-point value reported for a registered field: appends a
 * statement (path, field key, value as double) to the builder unless
 * predicateNeeded() rejects the predicate. */
void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
                                         const RegisteredField* field,
                                         double value )
{
	const gchar *predicate = predicateMapping (field);

	if (predicateNeeded (predicate)) {
		tracker_sparql_builder_subject_iri (metadata, idx->path().c_str());
		tracker_sparql_builder_predicate_iri (metadata, predicate);
		tracker_sparql_builder_object_double (metadata, value);
	}
}
void Tracker::TripleCollector::addTriplet (const std::string& subject,
const std::string& predicate,
const std::string& object )
{
const gchar *predicate_str = predicateMapping (predicate);
if (!predicateNeeded (predicate_str)) {
return;
}
tracker_sparql_builder_subject_iri (metadata, subject.c_str());
tracker_sparql_builder_predicate_iri (metadata, predicate_str);
tracker_sparql_builder_object_unvalidated (metadata, object.c_str());
}
void Tracker::TripleCollector::addValue (const AnalysisResult* idx,
const RegisteredField* field,
const std::string& name,
const std::string& value )
{
const gchar *predicate = predicateMapping (field);
if (field->key() == FieldRegister::mimetypeFieldName && idx->depth() == 0) {
g_free (content_type);
content_type = g_strdup (value.c_str());
}
if (!predicateNeeded (predicate)) {
return;
}
tracker_sparql_builder_subject_iri (metadata, idx->path().c_str());
tracker_sparql_builder_predicate_iri (metadata, predicate);
tracker_sparql_builder_object_unvalidated (metadata, value.c_str());
}
/* Nothing to do when an analysis ends; statements are appended as they are
 * reported. */
void Tracker::TripleCollector::finishAnalysis (const AnalysisResult*)
{
}
}
/* Analyzer state created by tracker_topanalyzer_init(), stored under
 * private_key and destroyed by private_free(). All three members are
 * allocated with new and released with delete. */
typedef struct {
/* Configuration the stream analyzer is constructed from. */
Strigi::AnalyzerConfiguration *mconfig;
/* Analyzer that gets m_writer installed as its index writer. */
Strigi::StreamAnalyzer *streamindexer;
/* Collector turning analyzer callbacks into builder statements. */
Tracker::TripleCollector *m_writer;
} TrackerTopanalyzerPrivate;
/* Destroy notify registered with g_static_private_set(): tears down the
 * analyzer objects and then the private struct itself.
 *
 * NOTE(review): the configuration is deleted before the analyzer that was
 * constructed from it; the order is kept exactly as it was -- confirm it is
 * safe with respect to Strigi's destructors. */
static void
private_free (gpointer data)
{
	TrackerTopanalyzerPrivate *state = static_cast<TrackerTopanalyzerPrivate*> (data);

	delete state->mconfig;
	delete state->streamindexer;
	delete state->m_writer;

	/* The struct itself came from g_new0(), so it goes back through g_free(). */
	g_free (state);
}
void
tracker_topanalyzer_init (void)
{
TrackerTopanalyzerPrivate *priv;
/* For added granularity of what analyzer should be elected for which
* filetype or file, you can inherit a Strigi::AnalyzerConfiguration
* and have some tuning this way. */
FieldRegister::FieldRegister ();
priv = g_new0 (TrackerTopanalyzerPrivate, 1);
priv->mconfig = new Strigi::AnalyzerConfiguration ();
priv->streamindexer = new Strigi::StreamAnalyzer (*priv->mconfig);
priv->m_writer = new Tracker::TripleCollector ();
priv->streamindexer->setIndexWriter (*priv->m_writer);
g_static_private_set (&private_key,
priv,
private_free);
}
/* Clears the value stored under private_key.
 * NOTE(review): presumably this makes GLib invoke private_free() on the
 * previously stored struct, which is what releases the analyzer objects --
 * confirm against the g_static_private_set() documentation. */
void
tracker_topanalyzer_shutdown (void)
{
g_static_private_set (&private_key, NULL, NULL);
}
void
tracker_topanalyzer_extract (const gchar *uri,
TrackerSparqlBuilder *metadata,
gchar **content_type)
{
TrackerTopanalyzerPrivate *priv;
gchar *filename;
priv = (TrackerTopanalyzerPrivate*) g_static_private_get (&private_key);
g_return_if_fail (priv != NULL);
/* We need the filename from the URI because we'll use stat() and because
* in this experiment I used FileInputStream. But any kind of stream could
* work with StreamAnalyzer's analyzers. */
filename = g_filename_from_uri (uri, NULL, NULL);
if (filename) {
struct stat s;
/* We use our own strategy as writer. Our writer writes to the @metadata
* array. I decided to call it a collector because that's what its
* implementation does (collecting triples) */
priv->m_writer->setParams (uri, metadata);
stat (filename, &s);
/* The first parameter that we pass here will influence what
* idx->path() will be above. StreamAnalyzer only ever appends
* path chunks to this initial stringvalue. So if we pass
* our://URI then idx->path will end up being:
*
* our://URI
* our://URI/child
* our://URI/child/child.
*
* For example the URI of a tar.gz will go like this:
*
* file:///path/to/my.tar.gz
* file:///path/to/my.tar.gz/dir_in_tar/file1.txt
* file:///path/to/my.tar.gz/dir_in_tar/file2.txt
*
* The URI passed here doesn't mean the stream passed later must
* not really resemble the URI. Usually it will of course.
*/
AnalysisResult analysisresult (uri, s.st_mtime, *priv->m_writer,
*priv->streamindexer);
/* If we want a remote stream, then we implement a Stream in C++
* for it and use that instead of FileInputStream. We could for
* example make a C++ wrapper for GInputStream and enjoy using
* GIO and GNIO here that way. */
FileInputStream resource (filename);
if (resource.status() == Ok) {
analysisresult.index (&resource);
if (content_type && priv->m_writer->content_type) {
*content_type = g_strdup (priv->m_writer->content_type);
}
}
g_free (filename);
}
}
/*
* Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.