Commit ede17cc2 authored by Carlos Garnacho

extract-mp3: Bail out on encoding detection if confidence is too low

Libicu's encoding detection reports how confident it is in its guess.
We should use that value: when the confidence is too low, the returned
encoding is probably bogus, and we already have an encoding to fall
back on.

This fixes detection for the file reported in bug #735515, where
a couple of 'ï' chars (valid ISO-8859-1) make libicu detect UTF-16BE,
although with an extremely low confidence.

https://bugzilla.gnome.org/show_bug.cgi?id=735515
parent 6916ee5e
......@@ -29,13 +29,15 @@
gchar *
tracker_encoding_guess_icu (const gchar *buffer,
gsize size)
gsize size,
gdouble *confidence)
{
UCharsetDetector *detector = NULL;
const UCharsetMatch *match;
gchar *charset = NULL;
UErrorCode status = 0;
const char *p_match = NULL;
int32_t conf = 0;
detector = ucsdet_open (&status);
......@@ -60,12 +62,21 @@ tracker_encoding_guess_icu (const gchar *buffer,
if (p_match == NULL || U_FAILURE (status))
goto failure;
conf = ucsdet_getConfidence (match, &status);
if (U_FAILURE (status))
goto failure;
charset = g_strdup ((const gchar *) p_match);
if (charset)
g_debug ("Guessing charset as '%s'", charset);
g_debug ("Guessing charset as '%s' (Confidence: %f)",
charset, (gdouble) conf / 100);
failure:
if (confidence)
*confidence = (gdouble) conf / 100;
if (detector)
ucsdet_close (detector);
......
......@@ -26,7 +26,8 @@ G_BEGIN_DECLS
G_GNUC_INTERNAL
gchar *tracker_encoding_guess_icu (const gchar *buffer,
gsize size);
gsize size,
gdouble *confidence);
G_END_DECLS
......
......@@ -46,9 +46,11 @@ tracker_encoding_can_guess (void)
gchar *
tracker_encoding_guess (const gchar *buffer,
gsize size)
gsize size,
gdouble *confidence)
{
gchar *encoding = NULL;
gdouble conf = 1;
#ifdef HAVE_MEEGOTOUCH
encoding = tracker_encoding_guess_meegotouch (buffer, size);
......@@ -56,14 +58,18 @@ tracker_encoding_guess (const gchar *buffer,
#ifdef HAVE_LIBICU_CHARSET_DETECTION
if (!encoding)
encoding = tracker_encoding_guess_icu (buffer, size);
encoding = tracker_encoding_guess_icu (buffer, size, &conf);
#endif /* HAVE_LIBICU_CHARSET_DETECTION */
#ifdef HAVE_ENCA
if (!encoding)
if (!encoding || conf < 0.5) {
conf = 1;
encoding = tracker_encoding_guess_enca (buffer, size);
}
#endif /* HAVE_ENCA */
if (confidence)
*confidence = conf;
return encoding;
}
......@@ -33,7 +33,8 @@ gboolean tracker_encoding_can_guess (void);
/* Returns NULL if it couldn't guess it */
gchar *tracker_encoding_guess (const gchar *buffer,
gsize size);
gsize size,
gdouble *confidence);
G_END_DECLS
......
......@@ -675,13 +675,22 @@ get_encoding (const gchar *data,
gsize size,
gboolean *encoding_found)
{
gdouble confidence = 1;
gchar *encoding;
/* Try to guess encoding */
encoding = (data && size ?
tracker_encoding_guess (data, size) :
tracker_encoding_guess (data, size, &confidence) :
NULL);
if (confidence < 0.5) {
/* Confidence on the results was too low, bail out and
* fallback to the default ISO-8859-1/Windows-1252 encoding.
*/
g_free (encoding);
encoding = NULL;
}
/* Notify if a proper detection was done */
if (encoding_found) {
*encoding_found = (encoding ? TRUE : FALSE);;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment