/*
 * smart-converter.c
 * This file is part of pluma
 *
 * Copyright (C) 2009 - Ignacio Casal Quinteiro
 *
 * pluma is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * pluma is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with pluma; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301 USA
 */

#include "pluma-smart-charset-converter.h"
#include "pluma-encodings.h"

/*
 * NOTE(review): the names of the four system headers below were missing from
 * the reviewed copy of this file; they were chosen to match the GIO, GLib,
 * g_printf and string functions used here -- confirm against the build.
 */
#include <gio/gio.h>
#include <glib.h>
#include <glib/gprintf.h>
#include <string.h>

#define TEXT_TO_CONVERT "this is some text to make the tests"
#define TEXT_TO_GUESS "hello \xe6\x96\x87 world"

/* Size in bytes of the scratch and result buffers of the conversion helpers. */
#define CONVERSION_BUFFER_SIZE 200

/*
 * Debugging aid: prints the first @len bytes of @ptr as "\xNN" escapes,
 * followed by a newline. No test in this file calls it.
 */
static void
print_hex (gchar *ptr,
           gint   len)
{
	gint i;

	for (i = 0; i < len; ++i)
	{
		g_printf ("\\x%02x", (unsigned char)ptr[i]);
	}

	g_printf ("\n");
}

/*
 * Converts @text from the charset of @from to the charset of @to with a
 * GCharsetConverter.
 *
 * @text:              bytes to convert.
 * @nread:             number of bytes of @text to convert, or (gsize) -1 to
 *                     use strlen (@text).
 * @to, @from:         target and source encodings.
 * @bytes_written_aux: out, total number of bytes written to the result.
 * @care_about_error:  when TRUE a conversion error aborts the test through
 *                     g_assert_no_error(); when FALSE a conversion error, or
 *                     a result that is not valid UTF-8, makes the function
 *                     return NULL.
 *
 * Returns: a newly allocated, NUL-terminated buffer that the caller must
 * release with g_free(), or NULL as described above.
 */
static gchar *
get_encoded_text (const gchar         *text,
                  gsize                nread,
                  const PlumaEncoding *to,
                  const PlumaEncoding *from,
                  gsize               *bytes_written_aux,
                  gboolean             care_about_error)
{
	GCharsetConverter *converter;
	gchar *out, *out_aux;
	gsize bytes_read_aux;
	GConverterResult res;
	GError *err = NULL;

	converter = g_charset_converter_new (pluma_encoding_get_charset (to),
	                                     pluma_encoding_get_charset (from),
	                                     NULL);

	out = g_malloc (CONVERSION_BUFFER_SIZE);
	out_aux = g_malloc (CONVERSION_BUFFER_SIZE);
	bytes_read_aux = 0;
	*bytes_written_aux = 0;

	/* nread is unsigned, so the "-1" sentinel arrives as (gsize) -1 */
	if (nread == (gsize) -1)
	{
		nread = strlen (text);
	}

	do
	{
		/* Start from zero so a failing call cannot leave us
		 * copying or advancing by an indeterminate amount. */
		gsize bytes_read = 0;
		gsize bytes_written = 0;

		res = g_converter_convert (G_CONVERTER (converter),
		                           text + bytes_read_aux,
		                           nread,
		                           out_aux,
		                           CONVERSION_BUFFER_SIZE,
		                           G_CONVERTER_INPUT_AT_END,
		                           &bytes_read,
		                           &bytes_written,
		                           &err);

		/* Strictly smaller: one byte is kept for the terminator. */
		g_assert_cmpuint (*bytes_written_aux + bytes_written, <, CONVERSION_BUFFER_SIZE);
		memcpy (out + *bytes_written_aux, out_aux, bytes_written);

		bytes_read_aux += bytes_read;
		*bytes_written_aux += bytes_written;
		nread -= bytes_read;
	} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR);

	g_free (out_aux);
	g_object_unref (converter);

	if (care_about_error)
	{
		g_assert_no_error (err);
	}
	else if (err)
	{
		g_printf ("** You don't care, but there was an error: %s", err->message);
		g_error_free (err);
		g_free (out);
		return NULL;
	}

	out[*bytes_written_aux] = '\0';

	/* The result is only required to be valid UTF-8 when the caller
	 * tolerates failure; with care_about_error set, the target may be a
	 * non UTF-8 charset and the buffer is returned as is. */
	if (!care_about_error && !g_utf8_validate (out, *bytes_written_aux, NULL))
	{
		g_free (out);
		return NULL;
	}

	return out;
}

/*
 * Builds a list with every encoding returned by
 * pluma_encoding_get_from_index(), querying indexes 0, 1, 2... until it
 * returns NULL. Entries are prepended, so the list is in reverse index order.
 *
 * Returns: a GSList the caller must release with g_slist_free().
 */
static GSList *
get_all_encodings (void)
{
	GSList *encs = NULL;
	const PlumaEncoding *enc;
	gint i;

	for (i = 0; (enc = pluma_encoding_get_from_index (i)) != NULL; i++)
	{
		encs = g_slist_prepend (encs, (gpointer)enc);
	}

	return encs;
}

/*
 * Runs @test_in through a PlumaSmartCharsetConverter and returns the output.
 *
 * @test_in:   bytes to convert.
 * @enc:       charset name; when not NULL the converter is given only the
 *             encoding for this charset and @encodings is ignored.
 * @encodings: candidate encodings, used when @enc is NULL. Not modified.
 * @nread:     number of bytes of @test_in to convert.
 * @guessed:   out, optional; receives the encoding reported by
 *             pluma_smart_charset_converter_get_guessed().
 *
 * A conversion error aborts the test through g_assert_no_error().
 *
 * Returns: a newly allocated, NUL-terminated buffer that the caller must
 * release with g_free().
 */
static gchar *
do_test (const gchar          *test_in,
         const gchar          *enc,
         GSList               *encodings,
         gsize                 nread,
         const PlumaEncoding **guessed)
{
	PlumaSmartCharsetConverter *converter;
	GSList *own_encodings = NULL;
	gchar *out, *out_aux;
	gsize bytes_read_aux, bytes_written_aux;
	GConverterResult res;
	GError *err = NULL;

	if (enc != NULL)
	{
		own_encodings = g_slist_prepend (NULL,
		                                 (gpointer)pluma_encoding_get_from_charset (enc));
		encodings = own_encodings;
	}

	converter = pluma_smart_charset_converter_new (encodings);

	out = g_malloc (CONVERSION_BUFFER_SIZE);
	out_aux = g_malloc (CONVERSION_BUFFER_SIZE);
	bytes_read_aux = 0;
	bytes_written_aux = 0;

	do
	{
		/* Start from zero so a failing call cannot leave us
		 * copying or advancing by an indeterminate amount. */
		gsize bytes_read = 0;
		gsize bytes_written = 0;

		res = g_converter_convert (G_CONVERTER (converter),
		                           test_in + bytes_read_aux,
		                           nread,
		                           out_aux,
		                           CONVERSION_BUFFER_SIZE,
		                           G_CONVERTER_INPUT_AT_END,
		                           &bytes_read,
		                           &bytes_written,
		                           &err);

		/* Strictly smaller: one byte is kept for the terminator. */
		g_assert_cmpuint (bytes_written_aux + bytes_written, <, CONVERSION_BUFFER_SIZE);
		memcpy (out + bytes_written_aux, out_aux, bytes_written);

		bytes_read_aux += bytes_read;
		bytes_written_aux += bytes_written;
		nread -= bytes_read;
	} while (res != G_CONVERTER_FINISHED && res != G_CONVERTER_ERROR);

	g_assert_no_error (err);
	out[bytes_written_aux] = '\0';

	/* Query the converter before dropping our reference to it. */
	if (guessed != NULL)
	{
		*guessed = pluma_smart_charset_converter_get_guessed (converter);
	}

	g_free (out_aux);
	g_object_unref (converter);
	g_slist_free (own_encodings);

	return out;
}

/*
 * Converts @str from UTF-8 to @charset with a GCharsetConverter, feeds the
 * result to a PlumaSmartCharsetConverter that is given @charset as its only
 * candidate, and asserts that the bytes read back are identical to @str.
 */
static void
do_test_roundtrip (const char *str,
                   const char *charset)
{
	gsize len;
	gchar *buf, *p;
	GInputStream *in, *tmp;
	GCharsetConverter *c1;
	PlumaSmartCharsetConverter *c2;
	gsize n, tot;
	GError *err;
	GSList *enc = NULL;

	len = strlen (str);
	/* One extra zeroed byte keeps buf NUL-terminated for the string
	 * comparison below even when all len bytes are filled. */
	buf = g_new0 (char, len + 1);

	in = g_memory_input_stream_new_from_data (str, -1, NULL);

	c1 = g_charset_converter_new (charset, "UTF-8", NULL);
	tmp = in;
	in = g_converter_input_stream_new (in, G_CONVERTER (c1));
	g_object_unref (tmp);
	g_object_unref (c1);

	enc = g_slist_prepend (enc, (gpointer)pluma_encoding_get_from_charset (charset));
	c2 = pluma_smart_charset_converter_new (enc);
	g_slist_free (enc);

	tmp = in;
	in = g_converter_input_stream_new (in, G_CONVERTER (c2));
	g_object_unref (tmp);
	g_object_unref (c2);

	tot = 0;
	p = buf;
	n = len;

	while (TRUE)
	{
		gssize res;

		err = NULL;
		res = g_input_stream_read (in, p, n, NULL, &err);
		g_assert_no_error (err);

		if (res == 0)
			break;

		p += res;
		n -= res;
		tot += res;
	}

	g_assert_cmpuint (tot, ==, len);
	g_assert_cmpstr (str, ==, buf);

	g_free (buf);
	g_object_unref (in);
}

/*
 * UTF-8 input with UTF-8 as the only candidate must come out unchanged,
 * including when the input is cut in the middle of a multi-byte sequence.
 */
static void
test_utf8_utf8 (void)
{
	gchar *aux;

	aux = do_test (TEXT_TO_CONVERT, "UTF-8", NULL, strlen (TEXT_TO_CONVERT), NULL);
	g_assert_cmpstr (aux, ==, TEXT_TO_CONVERT);
	g_free (aux);

	aux = do_test ("foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz", "UTF-8", NULL, 18, NULL);
	g_assert_cmpstr (aux, ==, "foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz");
	g_free (aux);

	/* Only 9 bytes are passed: the input ends after the first byte of
	 * the second two-byte sequence. */
	aux = do_test ("foobar\xc3\xa8\xc3\xa8\xc3\xa8zzzzzz", "UTF-8", NULL, 9, NULL);
	g_assert_cmpstr (aux, ==, "foobar\xc3\xa8\xc3");
	g_free (aux);

	/* FIXME: Use the utf8 stream for a fallback? */
	//do_test_with_error ("\xef\xbf\xbezzzzzz", encs, G_IO_ERROR_FAILED);
}

/*
 * Round-trips TEXT_TO_CONVERT through every known encoding. Its registration
 * in main() is currently commented out, so this test does not run.
 */
static void
test_xxx_xxx (void)
{
	GSList *encs, *l;

	encs = get_all_encodings ();

	/* Here we just test all encodings it is just to know that the conversions
	   are done ok */
	for (l = encs; l != NULL; l = g_slist_next (l))
	{
		do_test_roundtrip (TEXT_TO_CONVERT,
		                   pluma_encoding_get_charset ((const PlumaEncoding *)l->data));
	}

	g_slist_free (encs);
}

/*
 * Empty input with a candidate list that does not contain UTF-8: the output
 * must be empty and the guessed encoding must be UTF-8.
 */
static void
test_empty (void)
{
	const PlumaEncoding *guessed;
	gchar *out;
	GSList *encodings = NULL;

	/* testing the case of an empty file and list of encodings with no
	   utf-8. In this case, the smart converter cannot determine the right
	   encoding (because there is no input), but should still default to
	   utf-8 for the detection */
	encodings = g_slist_prepend (encodings, (gpointer)pluma_encoding_get_from_charset ("UTF-16"));
	encodings = g_slist_prepend (encodings, (gpointer)pluma_encoding_get_from_charset ("ISO-8859-15"));

	out = do_test ("", NULL, encodings, 0, &guessed);

	g_assert_cmpstr (out, ==, "");
	g_assert (guessed == pluma_encoding_get_utf8 ());

	g_free (out);
	g_slist_free (encodings);
}

/*
 * UTF-16 input with the candidates ISO-8859-15 and UTF-16, in that order:
 * the converter must report UTF-16 as the guessed encoding.
 */
static void
test_guessed (void)
{
	GSList *encs = NULL;
	gchar *aux, *aux2, *fail;
	gsize aux_len, fail_len;
	const PlumaEncoding *guessed;

	aux = get_encoded_text (TEXT_TO_GUESS, -1,
	                        pluma_encoding_get_from_charset ("UTF-16"),
	                        pluma_encoding_get_from_charset ("UTF-8"),
	                        &aux_len,
	                        TRUE);

	/* Sanity check: reading the UTF-16 bytes as ISO-8859-15 must not
	 * give a usable result. */
	fail = get_encoded_text (aux, aux_len,
	                         pluma_encoding_get_from_charset ("UTF-8"),
	                         pluma_encoding_get_from_charset ("ISO-8859-15"),
	                         &fail_len,
	                         FALSE);

	g_assert (fail == NULL);

	/* ISO-8859-15 should fail */
	encs = g_slist_append (encs, (gpointer)pluma_encoding_get_from_charset ("ISO-8859-15"));
	encs = g_slist_append (encs, (gpointer)pluma_encoding_get_from_charset ("UTF-16"));

	aux2 = do_test (aux, NULL, encs, aux_len, &guessed);

	g_assert (guessed == pluma_encoding_get_from_charset ("UTF-16"));

	g_free (aux2);
	g_free (aux);
	g_slist_free (encs);
}

/* Registers the smart-converter test cases and runs them. */
int
main (int   argc,
      char *argv[])
{
	g_test_init (&argc, &argv, NULL);

	g_test_add_func ("/smart-converter/utf8-utf8", test_utf8_utf8);
	//g_test_add_func ("/smart-converter/xxx-xxx", test_xxx_xxx);
	g_test_add_func ("/smart-converter/guessed", test_guessed);
	g_test_add_func ("/smart-converter/empty", test_empty);

	return g_test_run ();
}