summary refs log tree commit diff
path: root/gedit/gedit-smart-charset-converter.c
diff options
context:
space:
mode:
Diffstat (limited to 'gedit/gedit-smart-charset-converter.c')
-rwxr-xr-x gedit/gedit-smart-charset-converter.c 422
1 files changed, 422 insertions, 0 deletions
diff --git a/gedit/gedit-smart-charset-converter.c b/gedit/gedit-smart-charset-converter.c
new file mode 100755
index 00000000..e32b0b17
--- /dev/null
+++ b/gedit/gedit-smart-charset-converter.c
@@ -0,0 +1,422 @@
+/*
+ * gedit-smart-charset-converter.c
+ * This file is part of gedit
+ *
+ * Copyright (C) 2009 - Ignacio Casal Quinteiro
+ *
+ * gedit is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * gedit is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with gedit; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301 USA
+ */
+
+#include "gedit-smart-charset-converter.h"
+#include "gedit-debug.h"
+#include "gedit-document.h"
+
+#include <gio/gio.h>
+#include <glib/gi18n.h>
+
+#define GEDIT_SMART_CHARSET_CONVERTER_GET_PRIVATE(object)(G_TYPE_INSTANCE_GET_PRIVATE((object), GEDIT_TYPE_SMART_CHARSET_CONVERTER, GeditSmartCharsetConverterPrivate))
+/*
+ * Per-instance private state of GeditSmartCharsetConverter, fetched through
+ * the GEDIT_SMART_CHARSET_CONVERTER_GET_PRIVATE() accessor defined just
+ * before this comment.
+ */
+struct _GeditSmartCharsetConverterPrivate
+{
+ GCharsetConverter *charset_conv; /* converter picked by guess_encoding(); NULL until guessed, dropped in dispose/reset */
+
+ GSList *encodings; /* shallow copy of the candidate list given to _new(); nodes freed in finalize */
+ GSList *current_encoding; /* cursor into 'encodings', advanced by get_encoding(); NULL before the first try */
+
+ guint is_utf8 : 1; /* set when the input is treated as UTF-8; convert() then copies bytes through unchanged */
+ guint use_first : 1; /* set when 'encodings' has exactly one entry; that entry is then accepted without a trial conversion */
+};
+
+static void gedit_smart_charset_converter_iface_init (GConverterIface *iface);
+/*
+ * Registers the GeditSmartCharsetConverter type as a G_TYPE_OBJECT subclass
+ * that implements the G_TYPE_CONVERTER interface through
+ * gedit_smart_charset_converter_iface_init().
+ */
+G_DEFINE_TYPE_WITH_CODE (GeditSmartCharsetConverter, gedit_smart_charset_converter,
+ G_TYPE_OBJECT,
+ G_IMPLEMENT_INTERFACE (G_TYPE_CONVERTER,
+ gedit_smart_charset_converter_iface_init))
+
+/*
+ * GObject finalize handler.
+ *
+ * Releases the nodes of the instance's private copy of the candidate
+ * encodings list (the list data itself is not freed here) and then chains
+ * up to the parent class.
+ */
+static void
+gedit_smart_charset_converter_finalize (GObject *object)
+{
+	GeditSmartCharsetConverterPrivate *priv;
+
+	priv = GEDIT_SMART_CHARSET_CONVERTER (object)->priv;
+
+	g_slist_free (priv->encodings);
+
+	gedit_debug_message (DEBUG_UTILS, "finalizing smart charset converter");
+
+	G_OBJECT_CLASS (gedit_smart_charset_converter_parent_class)->finalize (object);
+}
+
+/*
+ * GObject dispose handler.
+ *
+ * Drops the reference held on the guessed charset converter, if any, and
+ * clears the pointer so a repeated dispose does nothing, then chains up to
+ * the parent class.
+ */
+static void
+gedit_smart_charset_converter_dispose (GObject *object)
+{
+	GeditSmartCharsetConverterPrivate *priv;
+
+	priv = GEDIT_SMART_CHARSET_CONVERTER (object)->priv;
+
+	if (priv->charset_conv != NULL)
+	{
+		g_object_unref (priv->charset_conv);
+		priv->charset_conv = NULL;
+	}
+
+	gedit_debug_message (DEBUG_UTILS, "disposing smart charset converter");
+
+	G_OBJECT_CLASS (gedit_smart_charset_converter_parent_class)->dispose (object);
+}
+
+/*
+ * Class initializer: installs the dispose/finalize handlers and reserves
+ * room for the per-instance private structure.
+ */
+static void
+gedit_smart_charset_converter_class_init (GeditSmartCharsetConverterClass *klass)
+{
+	GObjectClass *gobject_class;
+
+	gobject_class = G_OBJECT_CLASS (klass);
+
+	gobject_class->dispose = gedit_smart_charset_converter_dispose;
+	gobject_class->finalize = gedit_smart_charset_converter_finalize;
+
+	g_type_class_add_private (gobject_class,
+	                          sizeof (GeditSmartCharsetConverterPrivate));
+}
+
+/*
+ * Instance initializer: binds the private structure to the instance and
+ * puts every field in its "nothing guessed yet" state.
+ */
+static void
+gedit_smart_charset_converter_init (GeditSmartCharsetConverter *smart)
+{
+	GeditSmartCharsetConverterPrivate *priv;
+
+	priv = GEDIT_SMART_CHARSET_CONVERTER_GET_PRIVATE (smart);
+	smart->priv = priv;
+
+	priv->charset_conv = NULL;
+	priv->encodings = NULL;
+	priv->current_encoding = NULL;
+	priv->is_utf8 = FALSE;
+	priv->use_first = FALSE;
+
+	gedit_debug_message (DEBUG_UTILS, "initializing smart charset converter");
+}
+
+/*
+ * get_encoding:
+ * @smart: the converter whose candidate list is being walked
+ *
+ * Moves the 'current_encoding' cursor one step forward (or onto the head of
+ * the list when the cursor is unset) and returns the encoding stored at the
+ * new position.
+ *
+ * Returns: the next candidate encoding, or NULL once the list is exhausted.
+ * On exhaustion the cursor is left NULL, so a later call starts again from
+ * the head of the list.
+ */
+static const GeditEncoding *
+get_encoding (GeditSmartCharsetConverter *smart)
+{
+	GeditSmartCharsetConverterPrivate *priv = smart->priv;
+	GSList *node;
+
+	if (priv->current_encoding != NULL)
+	{
+		node = g_slist_next (priv->current_encoding);
+	}
+	else
+	{
+		node = priv->encodings;
+	}
+
+	priv->current_encoding = node;
+
+	if (node == NULL)
+	{
+		/* FIXME: once fallback conversion is used, do not give up
+		   here: set 'use_first', rewind the cursor to the head of
+		   the list and return the first encoding instead. */
+		return NULL;
+	}
+
+	return (const GeditEncoding *) node->data;
+}
+
+/*
+ * try_convert:
+ * @converter: charset converter to test (NULL is rejected)
+ * @inbuf: sample of the input data
+ * @inbuf_size: number of bytes available in @inbuf
+ *
+ * Runs the whole sample through @converter into a scratch buffer and checks
+ * that the produced text is valid UTF-8.
+ *
+ * A "partial input" error is tolerated: the sample is only a chunk of the
+ * document, so it may legitimately end in the middle of a multi-byte
+ * sequence. Any other failure rejects the converter.
+ *
+ * Returns: TRUE if @converter looks usable for this data, FALSE otherwise.
+ */
+static gboolean
+try_convert (GCharsetConverter *converter,
+             const void        *inbuf,
+             gsize              inbuf_size)
+{
+	const gchar *in = inbuf;
+	GError *err = NULL;
+	GConverterResult res;
+	gsize nread = 0;
+	gsize nwritten = 0;
+	gsize out_size;
+	gchar *out;
+	gboolean ret;
+
+	if (converter == NULL || inbuf == NULL || inbuf_size == 0)
+	{
+		return FALSE;
+	}
+
+	/* Keep the scratch buffer size computation from wrapping around */
+	if (inbuf_size > G_MAXSIZE / 4)
+	{
+		return FALSE;
+	}
+
+	out_size = inbuf_size * 4;
+	out = g_malloc (out_size);
+
+	do
+	{
+		/* Zeroed on every pass: a failing call is not required to
+		   fill these in, and they are accumulated below. */
+		gsize bytes_read = 0;
+		gsize bytes_written = 0;
+
+		res = g_converter_convert (G_CONVERTER (converter),
+		                           in + nread,
+		                           inbuf_size - nread,
+		                           out + nwritten,
+		                           out_size - nwritten,
+		                           G_CONVERTER_INPUT_AT_END,
+		                           &bytes_read,
+		                           &bytes_written,
+		                           &err);
+
+		nread += bytes_read;
+		nwritten += bytes_written;
+	} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR && err == NULL);
+
+	if (err != NULL)
+	{
+		/* FIXME We can get partial input while guessing the
+		   encoding because we just take some amount of text
+		   to guess from.
+		   GCharsetConverter reports it in the G_IO_ERROR domain; the
+		   G_CONVERT_ERROR variant is still accepted, but the domain
+		   is now checked together with the code. */
+		ret = g_error_matches (err, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT) ||
+		      g_error_matches (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT);
+
+		g_error_free (err);
+	}
+	else
+	{
+		/* An error result without a GError is still a failure */
+		ret = (res != G_CONVERTER_ERROR);
+	}
+
+	/* FIXME: Check the remainder? */
+	if (ret && !g_utf8_validate (out, nwritten, NULL))
+	{
+		ret = FALSE;
+	}
+
+	g_free (out);
+
+	return ret;
+}
+
+/*
+ * guess_encoding:
+ * @smart: the smart converter holding the candidate encodings
+ * @inbuf: first block of input data
+ * @inbuf_size: number of bytes in @inbuf
+ *
+ * Walks the candidate encodings, starting after the current cursor, until
+ * one fits the data in @inbuf. Only this first block is inspected.
+ *
+ * - Empty input is treated as UTF-8.
+ * - For the UTF-8 candidate the data is validated directly; a failure that
+ *   leaves fewer than 6 unvalidated bytes at the end is attributed to a
+ *   character cut at the block boundary and still accepted.
+ * - For any other candidate a converter to UTF-8 is created and tried with
+ *   try_convert(). Candidates for which no converter can be created are
+ *   skipped.
+ * - When the list holds a single candidate ('use_first'), that candidate is
+ *   accepted without testing.
+ *
+ * Returns: a new, reset GCharsetConverter for the guessed encoding, or NULL.
+ * NULL together with 'is_utf8' set means "UTF-8, no conversion needed";
+ * NULL with 'is_utf8' unset means that nothing matched.
+ */
+static GCharsetConverter *
+guess_encoding (GeditSmartCharsetConverter *smart,
+                const void                 *inbuf,
+                gsize                       inbuf_size)
+{
+	GCharsetConverter *conv = NULL;
+
+	if (inbuf == NULL || inbuf_size == 0)
+	{
+		smart->priv->is_utf8 = TRUE;
+		return NULL;
+	}
+
+	if (smart->priv->encodings != NULL &&
+	    smart->priv->encodings->next == NULL)
+	{
+		smart->priv->use_first = TRUE;
+	}
+
+	/* We just check the first block */
+	while (TRUE)
+	{
+		const GeditEncoding *enc;
+
+		/* Drop the converter rejected by the previous iteration */
+		if (conv != NULL)
+		{
+			g_object_unref (conv);
+			conv = NULL;
+		}
+
+		/* We get an encoding from the list */
+		enc = get_encoding (smart);
+
+		/* if it is NULL we didn't guess anything */
+		if (enc == NULL)
+		{
+			break;
+		}
+
+		gedit_debug_message (DEBUG_UTILS, "trying charset: %s",
+		                     gedit_encoding_get_charset (enc));
+
+		if (enc == gedit_encoding_get_utf8 ())
+		{
+			gsize remainder;
+			const gchar *end;
+
+			if (g_utf8_validate (inbuf, inbuf_size, &end) ||
+			    smart->priv->use_first)
+			{
+				smart->priv->is_utf8 = TRUE;
+				break;
+			}
+
+			/* Check if the end is less than one char */
+			remainder = inbuf_size - (end - (const gchar *) inbuf);
+			if (remainder < 6)
+			{
+				smart->priv->is_utf8 = TRUE;
+				break;
+			}
+
+			continue;
+		}
+
+		conv = g_charset_converter_new ("UTF-8",
+		                                gedit_encoding_get_charset (enc),
+		                                NULL);
+
+		/* If we tried all encodings we use the first one */
+		if (smart->priv->use_first)
+		{
+			break;
+		}
+
+		/* No converter could be created for this candidate: move on
+		   to the next one instead of testing a NULL converter. */
+		if (conv == NULL)
+		{
+			continue;
+		}
+
+		/* Try to convert */
+		if (try_convert (conv, inbuf, inbuf_size))
+		{
+			break;
+		}
+	}
+
+	if (conv != NULL)
+	{
+		/* The trial run consumed the sample; start from a clean state */
+		g_converter_reset (G_CONVERTER (conv));
+
+		/* FIXME: uncomment this when we want to use the fallback
+		g_charset_converter_set_use_fallback (conv, TRUE);*/
+	}
+
+	return conv;
+}
+
+/*
+ * GConverter::convert implementation.
+ *
+ * On the first call the encoding of the stream is guessed from @inbuf. If
+ * no candidate fits, @error is set to
+ * GEDIT_DOCUMENT_ERROR_ENCODING_AUTO_DETECTION_FAILED and G_CONVERTER_ERROR
+ * is returned.
+ *
+ * UTF-8 input is copied to @outbuf unchanged, as much as fits. Any other
+ * encoding is delegated to the guessed GCharsetConverter.
+ *
+ * Returns: the GConverterResult for this step. In the UTF-8 passthrough,
+ * G_CONVERTER_FINISHED / G_CONVERTER_FLUSHED are only reported once all of
+ * @inbuf has been consumed; otherwise G_CONVERTER_CONVERTED is returned so
+ * the caller comes back with the remaining input.
+ */
+static GConverterResult
+gedit_smart_charset_converter_convert (GConverter       *converter,
+                                       const void       *inbuf,
+                                       gsize             inbuf_size,
+                                       void             *outbuf,
+                                       gsize             outbuf_size,
+                                       GConverterFlags   flags,
+                                       gsize            *bytes_read,
+                                       gsize            *bytes_written,
+                                       GError          **error)
+{
+	GeditSmartCharsetConverter *smart = GEDIT_SMART_CHARSET_CONVERTER (converter);
+
+	/* Guess the encoding if we didn't make it yet */
+	if (smart->priv->charset_conv == NULL &&
+	    !smart->priv->is_utf8)
+	{
+		smart->priv->charset_conv = guess_encoding (smart, inbuf, inbuf_size);
+
+		/* If we still have the previous case is that we didn't guess
+		   anything */
+		if (smart->priv->charset_conv == NULL &&
+		    !smart->priv->is_utf8)
+		{
+			/* FIXME: Add a different domain when we kill gedit_convert */
+			g_set_error_literal (error, GEDIT_DOCUMENT_ERROR,
+			                     GEDIT_DOCUMENT_ERROR_ENCODING_AUTO_DETECTION_FAILED,
+			                     _("It is not possible to detect the encoding automatically"));
+			return G_CONVERTER_ERROR;
+		}
+	}
+
+	/* Now if the encoding is utf8 just redirect the input to the output */
+	if (smart->priv->is_utf8)
+	{
+		gsize size = MIN (inbuf_size, outbuf_size);
+
+		/* memcpy must not be handed NULL pointers, even for 0 bytes */
+		if (size > 0)
+		{
+			memcpy (outbuf, inbuf, size);
+		}
+
+		*bytes_read = size;
+		*bytes_written = size;
+
+		/* Only signal end-of-stream or flush when nothing is left in
+		   the input; otherwise the unread tail would be dropped. */
+		if (size == inbuf_size)
+		{
+			if (flags & G_CONVERTER_INPUT_AT_END)
+				return G_CONVERTER_FINISHED;
+
+			if (flags & G_CONVERTER_FLUSH)
+				return G_CONVERTER_FLUSHED;
+		}
+
+		return G_CONVERTER_CONVERTED;
+	}
+
+	/* If we reached here is because we need to convert the text so, we
+	   convert it with the charset converter */
+	return g_converter_convert (G_CONVERTER (smart->priv->charset_conv),
+	                            inbuf,
+	                            inbuf_size,
+	                            outbuf,
+	                            outbuf_size,
+	                            flags,
+	                            bytes_read,
+	                            bytes_written,
+	                            error);
+}
+
+/*
+ * GConverter::reset implementation.
+ *
+ * Forgets the guessed encoding: the guessed converter is released, and the
+ * list cursor and the UTF-8 flag are cleared, so the next convert call
+ * guesses again. The candidate list and 'use_first' are left as they are.
+ */
+static void
+gedit_smart_charset_converter_reset (GConverter *converter)
+{
+	GeditSmartCharsetConverterPrivate *priv;
+
+	priv = GEDIT_SMART_CHARSET_CONVERTER (converter)->priv;
+
+	if (priv->charset_conv != NULL)
+	{
+		g_object_unref (priv->charset_conv);
+		priv->charset_conv = NULL;
+	}
+
+	priv->is_utf8 = FALSE;
+	priv->current_encoding = NULL;
+}
+
+/*
+ * GConverter interface initializer: plugs this type's convert and reset
+ * implementations into the interface vtable.
+ */
+static void
+gedit_smart_charset_converter_iface_init (GConverterIface *iface)
+{
+	iface->reset = gedit_smart_charset_converter_reset;
+	iface->convert = gedit_smart_charset_converter_convert;
+}
+
+/*
+ * gedit_smart_charset_converter_new:
+ * @candidate_encodings: non-empty list of encodings to try, in order
+ *
+ * Creates a smart converter that will pick one of @candidate_encodings.
+ * The list is copied shallowly (nodes only), so the caller keeps ownership
+ * of its own list.
+ *
+ * Returns: a new GeditSmartCharsetConverter, or NULL if
+ * @candidate_encodings is NULL.
+ */
+GeditSmartCharsetConverter *
+gedit_smart_charset_converter_new (GSList *candidate_encodings)
+{
+	GeditSmartCharsetConverter *converter;
+
+	g_return_val_if_fail (candidate_encodings != NULL, NULL);
+
+	converter = g_object_new (GEDIT_TYPE_SMART_CHARSET_CONVERTER, NULL);
+	converter->priv->encodings = g_slist_copy (candidate_encodings);
+
+	return converter;
+}
+
+/*
+ * gedit_smart_charset_converter_get_guessed:
+ * @smart: a GeditSmartCharsetConverter
+ *
+ * Returns: the encoding under the list cursor if there is one; otherwise
+ * the UTF-8 encoding when the input is being treated as UTF-8; otherwise
+ * NULL (also returned when @smart is not a smart charset converter).
+ */
+const GeditEncoding *
+gedit_smart_charset_converter_get_guessed (GeditSmartCharsetConverter *smart)
+{
+	GeditSmartCharsetConverterPrivate *priv;
+
+	g_return_val_if_fail (GEDIT_IS_SMART_CHARSET_CONVERTER (smart), NULL);
+
+	priv = smart->priv;
+
+	if (priv->current_encoding != NULL)
+	{
+		return (const GeditEncoding *) priv->current_encoding->data;
+	}
+
+	return priv->is_utf8 ? gedit_encoding_get_utf8 () : NULL;
+}
+
+/*
+ * gedit_smart_charset_converter_get_num_fallbacks:
+ * @smart: a GeditSmartCharsetConverter
+ *
+ * Returns: the fallback count reported by the guessed charset converter,
+ * or 0 when no charset converter is in use (nothing guessed yet, or the
+ * input is passed through as UTF-8) or when @smart is not a smart charset
+ * converter. Callers that only test the result for non-zero keep working.
+ */
+guint
+gedit_smart_charset_converter_get_num_fallbacks (GeditSmartCharsetConverter *smart)
+{
+	g_return_val_if_fail (GEDIT_IS_SMART_CHARSET_CONVERTER (smart), 0);
+
+	if (smart->priv->charset_conv == NULL)
+	{
+		return 0;
+	}
+
+	return g_charset_converter_get_num_fallbacks (smart->priv->charset_conv);
+}
+