Audacious  $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
chardet.c
Go to the documentation of this file.
00001 /*  Audacious
00002  *  Copyright (C) 2005-2007  Audacious development team.
00003  *
00004  *  This program is free software; you can redistribute it and/or modify
00005  *  it under the terms of the GNU General Public License as published by
00006  *  the Free Software Foundation; under version 3 of the License.
00007  *
00008  *  This program is distributed in the hope that it will be useful,
00009  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00010  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00011  *  GNU General Public License for more details.
00012  *
00013  *  You should have received a copy of the GNU General Public License
00014  *  along with this program.  If not, see <http://www.gnu.org/licenses>.
00015  *
00016  *  The Audacious team does not consider modular code linking to
00017  *  Audacious or using our public API to be a derived work.
00018  */
00019 
00020 #include <string.h>
00021 #include <libaudcore/audstrings.h>
00022 
00023 #include "audconfig.h"
00024 #include "chardet.h"
00025 #include "config.h"
00026 #include "i18n.h"
00027 #include "main.h"
00028 #include "debug.h"
00029 
00030 #ifdef USE_CHARDET
00031 #  include <libguess.h>
00032 #endif
00033 
00034 gchar *
00035 cd_str_to_utf8(const gchar * str)
00036 {
00037     gchar *out_str;
00038 
00039     if (str == NULL)
00040         return NULL;
00041 
00042     /* Note: Currently, playlist calls this function repeatedly, even
00043      * if the string is already converted into utf-8.
00044      * chardet_to_utf8() would convert a valid utf-8 string into a
00045      * different utf-8 string, if fallback encodings were supplied and
00046      * the given string could be treated as a string in one of
00047      * fallback encodings. To avoid this, g_utf8_validate() had been
00048      * used at the top of evaluation.
00049      */
00050 
00051     /* Note 2: g_utf8_validate() has so called encapsulated utf-8
00052      * problem, thus chardet_to_utf8() took the place of that.
00053      */
00054 
00055     /* Note 3: As introducing madplug, the problem of conversion from
00056      * ISO-8859-1 to UTF-8 arose. This may be coped with g_convert()
00057      * located near the end of chardet_to_utf8(), but it requires utf8
00058      * validation guard where g_utf8_validate() was. New
00059      * dfa_validate_utf8() employs libguess' DFA engine to validate
00060      * utf-8 and can properly distinguish examples of encapsulated
00061      * utf-8. It is considered to be safe to use as a guard.
00062      */
00063 
00064     /* Already UTF-8? */
00065 #ifdef USE_CHARDET
00066     if (libguess_validate_utf8(str, strlen(str)))
00067         return g_strdup(str);
00068 #else
00069     if (g_utf8_validate(str, strlen(str), NULL))
00070         return g_strdup(str);
00071 #endif
00072 
00073     /* chardet encoding detector */
00074     if ((out_str = cd_chardet_to_utf8(str, strlen(str), NULL, NULL, NULL)) != NULL)
00075         return out_str;
00076 
00077     /* all else fails, we mask off character codes >= 128, replace with '?' */
00078     return str_to_utf8_fallback(str);
00079 }
00080 
00081 gchar *
00082 cd_chardet_to_utf8(const gchar * str, gssize len, gsize * arg_bytes_read,
00083                    gsize * arg_bytes_write, GError ** error)
00084 {
00085     if (error)
00086         * error = NULL;
00087 
00088 #ifdef USE_CHARDET
00089     gchar *det = NULL, *encoding = NULL;
00090 #endif
00091     gchar *ret = NULL;
00092     gsize *bytes_read, *bytes_write;
00093     gsize my_bytes_read, my_bytes_write;
00094 
00095     bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
00096     bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
00097 
00098     g_return_val_if_fail(str != NULL, NULL);
00099 
00100 #ifdef USE_CHARDET
00101     if (libguess_validate_utf8(str, len))
00102 #else
00103     if (g_utf8_validate(str, len, NULL))
00104 #endif
00105     {
00106         if (len < 0)
00107             len = strlen (str);
00108 
00109         ret = g_malloc (len + 1);
00110         memcpy (ret, str, len);
00111         ret[len] = 0;
00112 
00113         if (arg_bytes_read != NULL)
00114             * arg_bytes_read = len;
00115         if (arg_bytes_write != NULL)
00116             * arg_bytes_write = len;
00117 
00118         return ret;
00119     }
00120 #ifdef USE_CHARDET
00121     if (cfg.chardet_detector)
00122         det = cfg.chardet_detector;
00123 
00124     if (det)
00125     {
00126         AUDDBG("guess encoding (%s) %s\n", det, str);
00127         encoding = (gchar *) libguess_determine_encoding(str, len, det);
00128         AUDDBG("encoding = %s\n", encoding);
00129         if (encoding == NULL)
00130             goto fallback;
00131 
00132         ret = g_convert (str, len, "UTF-8", encoding, bytes_read, bytes_write,
00133          (error && * error) ? NULL : error);
00134     }
00135 
00136 fallback:
00137 #endif
00138 
00139     /* If detection failed or was not enabled, try fallbacks (if there are any) */
00140     if (ret == NULL && cfg.chardet_fallback_s != NULL)
00141     {
00142         gchar **enc;
00143         for (enc = cfg.chardet_fallback_s; *enc != NULL; enc++)
00144         {
00145             ret = g_convert (str, len, "UTF-8", * enc, bytes_read, bytes_write,
00146              (error && * error) ? NULL : error);
00147             if (len == *bytes_read)
00148                 break;
00149             else {
00150                 g_free(ret);
00151                 ret = NULL;
00152             }
00153         }
00154     }
00155 
00156     /* First fallback: locale (duh!) */
00157     if (ret == NULL)
00158         ret = g_locale_to_utf8 (str, len, bytes_read, bytes_write,
00159          (error && * error) ? NULL : error);
00160 
00161     /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */
00162     if (ret == NULL)
00163         ret = g_convert (str, len, "UTF-8", "ISO-8859-1", bytes_read,
00164          bytes_write, (error && * error) ? NULL : error);
00165 
00166     if (ret != NULL)
00167     {
00168         if (g_utf8_validate(ret, -1, NULL))
00169             return ret;
00170         else
00171         {
00172             g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
00173             g_free(ret);
00174             return NULL;
00175         }
00176     }
00177 
00178     return NULL; /* If we have no idea, return NULL. */
00179 }
00180 
00181 
00182 void chardet_init(void)
00183 {
00184     str_to_utf8 = cd_str_to_utf8;
00185     chardet_to_utf8 = cd_chardet_to_utf8;
00186 }