Merge smart converter in the document output stream.

When the conversion is done by a separate converter we cannot get
enough information about where a conversion error was produced.
Because of this, the smart converter is merged into the document
output stream, so that we can do some escaping in the future.
parent 870293e1
......@@ -123,7 +123,6 @@ NOINST_H_FILES = \
gedit-rounded-frame.h \
gedit-session.h \
gedit-settings.h \
gedit-smart-charset-converter.h \
gedit-status-combo-box.h \
gedit-style-scheme-manager.h \
gedit-tab-label.h \
......@@ -207,7 +206,6 @@ libgedit_c_files = \
gedit-rounded-frame.c \
gedit-session.c \
gedit-settings.c \
gedit-smart-charset-converter.c \
gedit-statusbar.c \
gedit-status-combo-box.c \
gedit-style-scheme-manager.c \
......
......@@ -40,7 +40,6 @@
#include "gedit-document-loader.h"
#include "gedit-document-output-stream.h"
#include "gedit-smart-charset-converter.h"
#include "gedit-debug.h"
#include "gedit-metadata-manager.h"
#include "gedit-utils.h"
......@@ -118,7 +117,6 @@ struct _GeditDocumentLoaderPrivate
GCancellable *cancellable;
GInputStream *stream;
GOutputStream *output;
GeditSmartCharsetConverter *converter;
gchar buffer[READ_CHUNK_SIZE];
......@@ -225,12 +223,6 @@ gedit_document_loader_dispose (GObject *object)
priv->output = NULL;
}
if (priv->converter != NULL)
{
g_object_unref (priv->converter);
priv->converter = NULL;
}
if (priv->error != NULL)
{
g_error_free (priv->error);
......@@ -628,7 +620,7 @@ async_read_cb (GInputStream *stream,
g_output_stream_flush (loader->priv->output, NULL, NULL);
loader->priv->auto_detected_encoding =
gedit_smart_charset_converter_get_guessed (loader->priv->converter);
gedit_document_output_stream_get_guessed (GEDIT_DOCUMENT_OUTPUT_STREAM (loader->priv->output));
loader->priv->auto_detected_newline_type =
gedit_document_output_stream_detect_newline_type (GEDIT_DOCUMENT_OUTPUT_STREAM (loader->priv->output));
......@@ -636,7 +628,7 @@ async_read_cb (GInputStream *stream,
/* Check if we needed some fallback char, if so, check if there was
a previous error and if not set a fallback used error */
/* FIXME Uncomment this when we want to manage conversion fallback */
/*if ((gedit_smart_charset_converter_get_num_fallbacks (loader->priv->converter) != 0) &&
/*if ((gedit_document_output_stream_get_num_fallbacks (GEDIT_DOCUMENT_OUTPUT_STREAM (loader->priv->output)) != 0) &&
loader->priv->error == NULL)
{
g_set_error_literal (&loader->priv->error,
......@@ -721,19 +713,6 @@ start_stream_read (AsyncData *async)
loader = async->loader;
info = loader->priv->info;
/* Get the candidate encodings */
if (loader->priv->encoding == NULL)
{
candidate_encodings = get_candidate_encodings (loader);
}
else
{
candidate_encodings = g_slist_prepend (NULL, (gpointer)loader->priv->encoding);
}
loader->priv->converter = gedit_smart_charset_converter_new (candidate_encodings);
g_slist_free (candidate_encodings);
if (g_file_info_has_attribute (info, G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE))
{
const gchar *content_type = g_file_info_get_content_type (info);
......@@ -756,12 +735,23 @@ start_stream_read (AsyncData *async)
}
g_object_unref (loader->priv->stream);
loader->priv->stream = g_converter_input_stream_new (base_stream,
G_CONVERTER (loader->priv->converter));
g_object_unref (base_stream);
loader->priv->stream = base_stream;
/* Get the candidate encodings */
if (loader->priv->encoding == NULL)
{
candidate_encodings = get_candidate_encodings (loader);
}
else
{
candidate_encodings = g_slist_prepend (NULL, (gpointer)loader->priv->encoding);
}
/* Output stream */
loader->priv->output = gedit_document_output_stream_new (loader->priv->document);
loader->priv->output = gedit_document_output_stream_new (loader->priv->document,
candidate_encodings);
g_slist_free (candidate_encodings);
/* start reading */
read_file_chunk (async);
......
......@@ -26,7 +26,9 @@
#include <glib.h>
#include <glib/gi18n.h>
#include <gio/gio.h>
#include <errno.h>
#include "gedit-document-output-stream.h"
#include "gedit-debug.h"
/* NOTE: never use async methods on this stream, the stream is just
* a wrapper around GtkTextBuffer api so that we can use GIO Stream
......@@ -48,6 +50,16 @@ struct _GeditDocumentOutputStreamPrivate
gchar *buffer;
gsize buflen;
/* Encoding detection */
GIConv iconv;
GCharsetConverter *charset_conv;
GSList *encodings;
GSList *current_encoding;
guint is_utf8 : 1;
guint use_first : 1;
guint is_initialized : 1;
guint is_closed : 1;
};
......@@ -114,12 +126,33 @@ gedit_document_output_stream_get_property (GObject *object,
}
}
/* GObject::dispose implementation.
 *
 * Closes the iconv descriptor and drops the reference held on the charset
 * converter, resets both fields, then chains up to the parent class.
 */
static void
gedit_document_output_stream_dispose (GObject *object)
{
	GeditDocumentOutputStream *stream = GEDIT_DOCUMENT_OUTPUT_STREAM (object);

	/* g_iconv_open() signals failure with (GIConv)-1 and the write path
	   stores its result directly in priv->iconv, so a failed open leaves
	   a non-NULL value here that must not be handed to g_iconv_close(). */
	if (stream->priv->iconv != NULL &&
	    stream->priv->iconv != (GIConv)-1)
	{
		g_iconv_close (stream->priv->iconv);
	}

	stream->priv->iconv = NULL;

	if (stream->priv->charset_conv != NULL)
	{
		g_object_unref (stream->priv->charset_conv);
		stream->priv->charset_conv = NULL;
	}

	G_OBJECT_CLASS (gedit_document_output_stream_parent_class)->dispose (object);
}
/* GObject::finalize implementation.
 *
 * Frees the candidate-encoding list (list nodes only, via g_slist_free) and
 * the pending byte buffer, then chains up to the parent class.
 */
static void
gedit_document_output_stream_finalize (GObject *object)
{
	GeditDocumentOutputStream *self;

	self = GEDIT_DOCUMENT_OUTPUT_STREAM (object);

	g_slist_free (self->priv->encodings);
	g_free (self->priv->buffer);

	G_OBJECT_CLASS (gedit_document_output_stream_parent_class)->finalize (object);
}
......@@ -154,6 +187,7 @@ gedit_document_output_stream_class_init (GeditDocumentOutputStreamClass *klass)
object_class->get_property = gedit_document_output_stream_get_property;
object_class->set_property = gedit_document_output_stream_set_property;
object_class->dispose = gedit_document_output_stream_dispose;
object_class->finalize = gedit_document_output_stream_finalize;
object_class->constructed = gedit_document_output_stream_constructed;
......@@ -181,8 +215,196 @@ gedit_document_output_stream_init (GeditDocumentOutputStream *stream)
stream->priv->buffer = NULL;
stream->priv->buflen = 0;
stream->priv->charset_conv = NULL;
stream->priv->encodings = NULL;
stream->priv->current_encoding = NULL;
stream->priv->is_initialized = FALSE;
stream->priv->is_closed = FALSE;
stream->priv->is_utf8 = FALSE;
stream->priv->use_first = FALSE;
}
/* Advances the stream's cursor over the candidate-encoding list and returns
 * the encoding it now points at.
 *
 * When the cursor is NULL it is placed on the head of the list; otherwise it
 * moves to the next node. Returns NULL once the cursor runs off the end of
 * the list (or when the list is empty).
 */
static const GeditEncoding *
get_encoding (GeditDocumentOutputStream *stream)
{
	GSList *node = stream->priv->current_encoding;

	node = (node == NULL) ? stream->priv->encodings : g_slist_next (node);
	stream->priv->current_encoding = node;

	return (node != NULL) ? (const GeditEncoding *) node->data : NULL;
}
/* Checks whether @inbuf can be converted with @converter into valid UTF-8.
 *
 * The input is converted into a scratch buffer four times the input size,
 * which is discarded before returning; only the verdict is reported.
 *
 * Returns TRUE when the conversion finished (or stopped only because the
 * input ends in the middle of a multibyte sequence) and the bytes produced
 * validate as UTF-8. Returns FALSE for a NULL converter, NULL or empty
 * input, any other conversion error, or invalid UTF-8 output.
 */
static gboolean
try_convert (GCharsetConverter *converter,
	     const void        *inbuf,
	     gsize              inbuf_size)
{
	GError *err = NULL;
	gsize nread = 0;
	gsize nwritten = 0;
	GConverterResult res;
	gchar *out;
	gsize out_size;
	gboolean ret;

	/* A NULL converter (charset not supported) can never succeed; without
	   this guard the failed call below would leave err unset and the
	   empty output would be reported as a successful conversion. */
	if (converter == NULL || inbuf == NULL || inbuf_size == 0)
	{
		return FALSE;
	}

	out_size = inbuf_size * 4;
	out = g_malloc (out_size);

	do
	{
		/* Reset on every pass: a failing call is not guaranteed to
		   write the counters, and they are accumulated right after. */
		gsize bytes_read = 0;
		gsize bytes_written = 0;

		res = g_converter_convert (G_CONVERTER (converter),
					   (const gchar *) inbuf + nread,
					   inbuf_size - nread,
					   out + nwritten,
					   out_size - nwritten,
					   G_CONVERTER_INPUT_AT_END,
					   &bytes_read,
					   &bytes_written,
					   &err);

		nread += bytes_read;
		nwritten += bytes_written;
	} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR && err == NULL);

	if (err != NULL)
	{
		/* FIXME We can get partial input while guessing the
		   encoding because we just take some amount of text
		   to guess from.
		   GConverter implementations report errors in the G_IO_ERROR
		   domain, so the domain is checked together with the code;
		   the G_CONVERT_ERROR variant is accepted as well. */
		ret = g_error_matches (err, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT) ||
		      g_error_matches (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT);

		g_error_free (err);
	}
	else
	{
		ret = TRUE;
	}

	/* FIXME: Check the remainder? */
	if (ret && !g_utf8_validate (out, nwritten, NULL))
	{
		ret = FALSE;
	}

	g_free (out);

	return ret;
}
/* Picks the encoding of @inbuf from the stream's candidate list.
 *
 * Candidates are tried in list order through get_encoding(), which leaves
 * priv->current_encoding on the candidate that was accepted (or NULL when
 * every candidate was rejected).
 *
 * Returns a converter from the detected charset to UTF-8 (reset and ready to
 * use), or NULL. NULL together with priv->is_utf8 == TRUE means the data is
 * taken as UTF-8 and needs no conversion (this is also the outcome for NULL
 * or empty input); NULL with is_utf8 == FALSE means nothing was detected.
 *
 * When the list holds exactly one candidate, priv->use_first is set and that
 * candidate is accepted without test-converting the data.
 */
static GCharsetConverter *
guess_encoding (GeditDocumentOutputStream *stream,
		const void                *inbuf,
		gsize                      inbuf_size)
{
	GCharsetConverter *conv = NULL;

	if (inbuf == NULL || inbuf_size == 0)
	{
		stream->priv->is_utf8 = TRUE;
		return NULL;
	}

	/* A single candidate leaves nothing to guess */
	if (stream->priv->encodings != NULL &&
	    stream->priv->encodings->next == NULL)
	{
		stream->priv->use_first = TRUE;
	}

	/* We just check the first block */
	while (TRUE)
	{
		const GeditEncoding *enc;

		/* Drop the converter rejected by the previous iteration */
		if (conv != NULL)
		{
			g_object_unref (conv);
			conv = NULL;
		}

		/* We get an encoding from the list */
		enc = get_encoding (stream);

		/* if it is NULL we didn't guess anything */
		if (enc == NULL)
		{
			break;
		}

		gedit_debug_message (DEBUG_UTILS, "trying charset: %s",
				     gedit_encoding_get_charset (enc));

		if (enc == gedit_encoding_get_utf8 ())
		{
			gsize remainder;
			const gchar *end;

			if (g_utf8_validate (inbuf, inbuf_size, &end) ||
			    stream->priv->use_first)
			{
				stream->priv->is_utf8 = TRUE;
				break;
			}

			/* Check if the end is less than one char: the block
			   may simply stop in the middle of a character */
			remainder = inbuf_size - (end - (const gchar *) inbuf);
			if (remainder < 6)
			{
				stream->priv->is_utf8 = TRUE;
				break;
			}

			continue;
		}

		conv = g_charset_converter_new ("UTF-8",
						gedit_encoding_get_charset (enc),
						NULL);

		/* With a single candidate we use it without checking */
		if (stream->priv->use_first)
		{
			break;
		}

		/* No converter could be created for this charset: move on to
		   the next candidate instead of test-converting with NULL. */
		if (conv == NULL)
		{
			continue;
		}

		/* Try to convert */
		if (try_convert (conv, inbuf, inbuf_size))
		{
			break;
		}
	}

	/* The trial conversion consumed state; hand back a clean converter */
	if (conv != NULL)
	{
		g_converter_reset (G_CONVERTER (conv));
	}

	return conv;
}
static GeditDocumentNewlineType
......@@ -216,10 +438,17 @@ get_newline_type (GtkTextIter *end)
}
GOutputStream *
gedit_document_output_stream_new (GeditDocument *doc)
gedit_document_output_stream_new (GeditDocument *doc,
GSList *candidate_encodings)
{
return G_OUTPUT_STREAM (g_object_new (GEDIT_TYPE_DOCUMENT_OUTPUT_STREAM,
"document", doc, NULL));
GeditDocumentOutputStream *stream;
stream = g_object_new (GEDIT_TYPE_DOCUMENT_OUTPUT_STREAM,
"document", doc, NULL);
stream->priv->encodings = g_slist_copy (candidate_encodings);
return G_OUTPUT_STREAM (stream);
}
GeditDocumentNewlineType
......@@ -244,6 +473,38 @@ gedit_document_output_stream_detect_newline_type (GeditDocumentOutputStream *str
return type;
}
/* Returns the encoding the stream settled on.
 *
 * If the encoding cursor points at a candidate, that candidate is returned.
 * Otherwise UTF-8 is returned when the stream was flagged as UTF-8 or was
 * never initialized, and NULL in every other case.
 */
const GeditEncoding *
gedit_document_output_stream_get_guessed (GeditDocumentOutputStream *stream)
{
	GSList *current;

	g_return_val_if_fail (GEDIT_IS_DOCUMENT_OUTPUT_STREAM (stream), NULL);

	current = stream->priv->current_encoding;

	if (current != NULL)
	{
		return (const GeditEncoding *) current->data;
	}

	/* If it is not initialized we assume that we are trying to convert
	   the empty string */
	if (!stream->priv->is_initialized || stream->priv->is_utf8)
	{
		return gedit_encoding_get_utf8 ();
	}

	return NULL;
}
/* Returns the number of fallback sequences emitted so far by the stream's
 * charset converter, as reported by g_charset_converter_get_num_fallbacks().
 *
 * Returns 0 when @stream is not a GeditDocumentOutputStream or when it has
 * no charset converter. Callers that only test the result against 0 keep
 * working unchanged.
 */
guint
gedit_document_output_stream_get_num_fallbacks (GeditDocumentOutputStream *stream)
{
	g_return_val_if_fail (GEDIT_IS_DOCUMENT_OUTPUT_STREAM (stream), 0);

	if (stream->priv->charset_conv == NULL)
	{
		return 0;
	}

	/* Return the count itself, not a boolean "!= 0" squeezed into a guint */
	return g_charset_converter_get_num_fallbacks (stream->priv->charset_conv);
}
/* If the last char is a newline, remove it from the buffer (otherwise
GtkTextView shows it as an empty line). See bug #324942. */
static void
......@@ -297,6 +558,7 @@ gedit_document_output_stream_write (GOutputStream *stream,
const gchar *end;
gsize nvalid;
gboolean valid;
gsize remainder;
if (g_cancellable_set_error_if_cancelled (cancellable, error))
{
......@@ -307,6 +569,55 @@ gedit_document_output_stream_write (GOutputStream *stream,
if (!ostream->priv->is_initialized)
{
ostream->priv->charset_conv = guess_encoding (ostream, buffer, count);
/* If we have neither a converter nor UTF-8 input at this point,
no encoding could be guessed */
if (ostream->priv->charset_conv == NULL &&
!ostream->priv->is_utf8)
{
/* FIXME: Add a different domain when we kill gedit_convert */
g_set_error_literal (error, GEDIT_DOCUMENT_ERROR,
GEDIT_DOCUMENT_ERROR_ENCODING_AUTO_DETECTION_FAILED,
_("It is not possible to detect the encoding automatically"));
return -1;
}
/* Do not initialize iconv if we are not going to convert anything */
if (!ostream->priv->is_utf8)
{
gchar *from_charset;
/* Initialize iconv */
g_object_get (G_OBJECT (ostream->priv->charset_conv),
"from-charset", &from_charset,
NULL);
ostream->priv->iconv = g_iconv_open ("UTF-8", from_charset);
if (ostream->priv->iconv == (GIConv)-1)
{
if (errno == EINVAL)
{
g_set_error (error, G_IO_ERROR, G_IO_ERROR_NOT_SUPPORTED,
_("Conversion from character set '%s' to 'UTF-8' is not supported"),
from_charset);
}
else
{
g_set_error (error, G_IO_ERROR, G_IO_ERROR_FAILED,
_("Could not open converter from '%s' to 'UTF-8'"),
from_charset);
}
g_free (from_charset);
return -1;
}
g_free (from_charset);
}
/* Init the undoable action */
gtk_source_buffer_begin_not_undoable_action (GTK_SOURCE_BUFFER (ostream->priv->doc));
......@@ -339,14 +650,74 @@ gedit_document_output_stream_write (GOutputStream *stream,
len = count;
}
if (!ostream->priv->is_utf8)
{
gchar *conv_text;
gsize conv_read;
gsize conv_written;
GError *err = NULL;
if (ostream->priv->iconv == NULL)
{
g_set_error_literal (error, G_IO_ERROR, G_IO_ERROR_NOT_INITIALIZED,
_("Invalid object, not initialized"));
if (freetext)
{
g_free (text);
}
return -1;
}
/* If we reached here is because we need to convert the text so, we
convert it with the charset converter */
conv_text = g_convert_with_iconv (text,
len,
ostream->priv->iconv,
&conv_read,
&conv_written,
&err);
if (freetext)
{
g_free (text);
}
if (err != NULL)
{
remainder = len - conv_read;
/* Store the partial char for the next conversion */
if (err->code == G_CONVERT_ERROR_ILLEGAL_SEQUENCE &&
remainder < MAX_UNICHAR_LEN &&
(g_utf8_get_char_validated (text + conv_read, remainder) == (gunichar)-2))
{
ostream->priv->buffer = g_strndup (text + conv_read, remainder);
ostream->priv->buflen = remainder;
}
else
{
/* Something went wrong with the conversion,
propagate the error and finish */
g_propagate_error (error, err);
g_free (conv_text);
return -1;
}
}
text = conv_text;
len = conv_written;
freetext = TRUE;
}
/* validate */
valid = g_utf8_validate (text, len, &end);
nvalid = end - text;
if (!valid)
{
gsize remainder;
remainder = len - nvalid;
if ((remainder < MAX_UNICHAR_LEN) &&
......
......@@ -26,6 +26,7 @@
#include <gio/gio.h>
#include "gedit-document.h"
#include "gedit-encodings.h"
G_BEGIN_DECLS
......@@ -55,10 +56,15 @@ struct _GeditDocumentOutputStreamClass
GType gedit_document_output_stream_get_type (void) G_GNUC_CONST;
GOutputStream *gedit_document_output_stream_new (GeditDocument *doc);
GOutputStream *gedit_document_output_stream_new (GeditDocument *doc,
GSList *candidate_encodings);
GeditDocumentNewlineType gedit_document_output_stream_detect_newline_type (GeditDocumentOutputStream *stream);
const GeditEncoding *gedit_document_output_stream_get_guessed (GeditDocumentOutputStream *stream);
guint gedit_document_output_stream_get_num_fallbacks (GeditDocumentOutputStream *stream);
G_END_DECLS
#endif /* __GEDIT_DOCUMENT_OUTPUT_STREAM_H__ */
......
/*
* gedit-smart-charset-converter.c
* This file is part of gedit
*
* Copyright (C) 2009 - Ignacio Casal Quinteiro
*
* gedit is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* gedit is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with gedit; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
#include "gedit-smart-charset-converter.h"
#include "gedit-debug.h"
#include "gedit-document.h"
#include <gio/gio.h>
#include <glib/gi18n.h>
#define GEDIT_SMART_CHARSET_CONVERTER_GET_PRIVATE(object)(G_TYPE_INSTANCE_GET_PRIVATE((object), GEDIT_TYPE_SMART_CHARSET_CONVERTER, GeditSmartCharsetConverterPrivate))
/* Private instance data of GeditSmartCharsetConverter. */
struct _GeditSmartCharsetConverterPrivate
{
/* Owned reference; released in dispose */
GCharsetConverter *charset_conv;
/* Candidate encodings; only the list nodes are freed in finalize */
GSList *encodings;
/* Cursor into encodings, advanced by get_encoding(); NULL before the
   first call and after the last candidate */
GSList *current_encoding;
/* Initialized to FALSE; presumably set once the input is found to be
   UTF-8 -- the code using it is outside this view, TODO confirm */
guint is_utf8 : 1;
/* Initialized to FALSE; in the visible code it is only assigned in the
   disabled (#if 0) fallback branch of get_encoding() */
guint use_first : 1;
};
/* Forward declaration: referenced by the type definition below */
static void gedit_smart_charset_converter_iface_init (GConverterIface *iface);
/* Registers GeditSmartCharsetConverter as a GObject subclass that
   implements the GConverter interface through the iface_init above */
G_DEFINE_TYPE_WITH_CODE (GeditSmartCharsetConverter, gedit_smart_charset_converter,
G_TYPE_OBJECT,
G_IMPLEMENT_INTERFACE (G_TYPE_CONVERTER,
gedit_smart_charset_converter_iface_init))
/* GObject::finalize implementation.
 *
 * Frees the nodes of the candidate-encoding list, logs a debug message and
 * chains up to the parent class.
 */
static void
gedit_smart_charset_converter_finalize (GObject *object)
{
	GeditSmartCharsetConverterPrivate *priv;

	priv = GEDIT_SMART_CHARSET_CONVERTER (object)->priv;

	g_slist_free (priv->encodings);

	gedit_debug_message (DEBUG_UTILS, "finalizing smart charset converter");

	G_OBJECT_CLASS (gedit_smart_charset_converter_parent_class)->finalize (object);
}
/* GObject::dispose implementation.
 *
 * Drops the reference on the wrapped charset converter (if any) and clears
 * the field, logs a debug message and chains up to the parent class.
 */
static void
gedit_smart_charset_converter_dispose (GObject *object)
{
	GeditSmartCharsetConverterPrivate *priv;

	priv = GEDIT_SMART_CHARSET_CONVERTER (object)->priv;

	if (priv->charset_conv != NULL)
	{
		GCharsetConverter *old_conv = priv->charset_conv;

		g_object_unref (old_conv);
		priv->charset_conv = NULL;
	}

	gedit_debug_message (DEBUG_UTILS, "disposing smart charset converter");

	G_OBJECT_CLASS (gedit_smart_charset_converter_parent_class)->dispose (object);
}
/* Class initializer: installs the dispose/finalize handlers and reserves
 * room for the private instance structure.
 */
static void
gedit_smart_charset_converter_class_init (GeditSmartCharsetConverterClass *klass)
{
	GObjectClass *gobject_class;

	gobject_class = G_OBJECT_CLASS (klass);

	gobject_class->dispose = gedit_smart_charset_converter_dispose;
	gobject_class->finalize = gedit_smart_charset_converter_finalize;

	g_type_class_add_private (gobject_class, sizeof (GeditSmartCharsetConverterPrivate));
}
/* Instance initializer: attaches the private structure and puts every field
 * in its "nothing detected yet" state, then logs a debug message.
 */
static void
gedit_smart_charset_converter_init (GeditSmartCharsetConverter *smart)
{
	GeditSmartCharsetConverterPrivate *priv;

	priv = GEDIT_SMART_CHARSET_CONVERTER_GET_PRIVATE (smart);
	smart->priv = priv;

	/* No converter and no candidates yet */
	priv->charset_conv = NULL;
	priv->encodings = NULL;
	priv->current_encoding = NULL;

	/* Detection flags start cleared */
	priv->is_utf8 = FALSE;
	priv->use_first = FALSE;

	gedit_debug_message (DEBUG_UTILS, "initializing smart charset converter");
}
/* Advances the converter's cursor over the candidate-encoding list and
 * returns the encoding it now points at.
 *
 * A NULL cursor is placed on the head of the list; otherwise it moves to the
 * next node. Returns NULL when the cursor runs off the end of the list.
 * The "fall back to the first encoding" behaviour is compiled out for now.
 */
static const GeditEncoding *
get_encoding (GeditSmartCharsetConverter *smart)
{
	GSList *node = smart->priv->current_encoding;

	node = (node == NULL) ? smart->priv->encodings : g_slist_next (node);
	smart->priv->current_encoding = node;

	if (node != NULL)
	{
		return (const GeditEncoding *) node->data;
	}

#if 0
	FIXME: uncomment this when using fallback
	/* If we tried all encodings, we return the first encoding */
	smart->priv->use_first = TRUE;
	smart->priv->current_encoding = smart->priv->encodings;
	return (const GeditEncoding *)smart->priv->current_encoding->data;
#endif

	return NULL;
}
static gboolean
try_convert (GCharsetConverter *converter,
const void *inbuf,
gsize inbuf_size)
{
GError *err;
gsize bytes_read, nread;
gsize bytes_written, nwritten;
GConverterResult res;
gchar *out;
gboolean ret;
gsize out_size;
if (inbuf == NULL || inbuf_size == 0)
{
return FALSE;
}