/* $TOG: $ */ /****************************************************************** Copyright 1993 by SunSoft, Inc. Copyright 1999-2000 by Bruno Haible Permission to use, copy, modify, distribute, and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation, and that the names of SunSoft, Inc. and Bruno Haible not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. SunSoft, Inc. and Bruno Haible make no representations about the suitability of this software for any purpose. It is provided "as is" without express or implied warranty. SunSoft Inc. AND Bruno Haible DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SunSoft, Inc. OR Bruno Haible BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ******************************************************************/ /* $XFree86: xc/lib/X11/lcUTF8.c,v 1.13 2001/08/09 19:14:06 dawes Exp $ */ /* * This file contains: * * I. Conversion routines CompoundText/CharSet <--> Unicode/UTF-8. * * Used for three purposes: * 1. The UTF-8 locales, see below. * 2. Unicode aware applications for which the use of 8-bit character * sets is an anachronism. * 3. For conversion from keysym to locale encoding. * * II. An UTF-8 locale loader. * Supports: all locales with codeset UTF-8. * How: Provides converters for UTF-8. * Platforms: all systems. */ /* * The conversion from UTF-8 to CompoundText is realized in a very * conservative way. Recall that CompoundText data is used for inter-client * communication purposes. We distinguish three classes of clients: * - Clients which accept only those pieces of CompoundText which belong to * the character set understood by the current locale. * (Example: clients which are linked to an older X11 library.) * - Clients which accept CompoundText with multiple character sets and parse * it themselves. * (Example: emacs, xemacs.) * - Clients which rely entirely on the X{mb,wc}TextPropertyToTextList * functions for the conversion of CompoundText to their current locale's * multi-byte/wide-character format. * For best interoperation, the UTF-8 to CompoundText conversion proceeds as * follows. For every character, it first tests whether the character is * representable in the current locale's original (non-UTF-8) character set. * If not, it goes through the list of predefined character sets for * CompoundText and tests if the character is representable in that character * set. If so, it encodes the character using its code within that character * set. If not, it uses an UTF-8-in-CompoundText encapsulation. Since * clients of the first and second kind ignore such encapsulated text, * this encapsulation is kept to a minimum and terminated as early as possible. * * In a distant future, when clients of the first and second kind will have * disappeared, we will be able to stuff UTF-8 data directly in CompoundText * without first going through the list of predefined character sets. */ #include #include "Xlibint.h" #include "XlcPubI.h" #include "XlcGeneric.h" static XlcConv create_conv( XLCd lcd, XlcConvMethods methods) { XlcConv conv; conv = (XlcConv) Xmalloc(sizeof(XlcConvRec)); if (conv == (XlcConv) NULL) return (XlcConv) NULL; conv->methods = methods; conv->state = NULL; return conv; } static void close_converter( XlcConv conv) { Xfree((char *) conv); } /* Replacement character for invalid multibyte sequence or wide character. */ #define BAD_WCHAR ((ucs4_t) 0xfffd) #define BAD_CHAR '?' /***************************************************************************/ /* Part I: Conversion routines CompoundText/CharSet <--> Unicode/UTF-8. * * Note that this code works in any locale. We store Unicode values in * `ucs4_t' variables, but don't pass them to the user. * * This code has to support all character sets that are used for CompoundText, * nothing more, nothing less. See the table in lcCT.c. * Since the conversion _to_ CompoundText is likely to need the tables for all * character sets at once, we don't use dynamic loading (of tables or shared * libraries through iconv()). Use a fixed set of tables instead. * * We use statically computed tables, not dynamically allocated arrays, * because it's more memory efficient: Different processes using the same * libX11 shared library share the "text" and read-only "data" sections. */ typedef unsigned int ucs4_t; #define conv_t XlcConv typedef struct _Utf8ConvRec { const char *name; XrmQuark xrm_name; #if NeedFunctionPrototypes int (* cstowc) (XlcConv, ucs4_t *, unsigned char const *, int); #else int (* cstowc) (); #endif #if NeedFunctionPrototypes int (* wctocs) (XlcConv, unsigned char *, ucs4_t, int); #else int (* wctocs) (); #endif } Utf8ConvRec, *Utf8Conv; /* * int xxx_cstowc (XlcConv conv, ucs4_t *pwc, unsigned char const *s, int n) * converts the byte sequence starting at s to a wide character. Up to n bytes * are available at s. n is >= 1. * Result is number of bytes consumed (if a wide character was read), * or 0 if invalid, or -1 if n too small. * * int xxx_wctocs (XlcConv conv, unsigned char *r, ucs4_t wc, int n) * converts the wide character wc to the character set xxx, and stores the * result beginning at r. Up to n bytes may be written at r. n is >= 1. * Result is number of bytes written, or 0 if invalid, or -1 if n too small. */ /* Return code if invalid. (xxx_mbtowc, xxx_wctomb) */ #define RET_ILSEQ 0 /* Return code if only a shift sequence of n bytes was read. (xxx_mbtowc) */ #define RET_TOOFEW(n) (-1-(n)) /* Return code if output buffer is too small. (xxx_wctomb, xxx_reset) */ #define RET_TOOSMALL -1 /* * The tables below are bijective. It would be possible to extend the * xxx_wctocs tables to do some transliteration (e.g. U+201C,U+201D -> 0x22) * but *only* with characters not contained in any other table, and *only* * when the current locale is not an UTF-8 locale. */ #include "lcUniConv/utf8.h" #include "lcUniConv/ucs2be.h" #ifdef notused #include "lcUniConv/ascii.h" #endif #include "lcUniConv/iso8859_1.h" #include "lcUniConv/iso8859_2.h" #include "lcUniConv/iso8859_3.h" #include "lcUniConv/iso8859_4.h" #include "lcUniConv/iso8859_5.h" #include "lcUniConv/iso8859_6.h" #include "lcUniConv/iso8859_7.h" #include "lcUniConv/iso8859_8.h" #include "lcUniConv/iso8859_9.h" #include "lcUniConv/iso8859_10.h" #include "lcUniConv/iso8859_13.h" #include "lcUniConv/iso8859_14.h" #include "lcUniConv/iso8859_15.h" #include "lcUniConv/iso8859_16.h" #include "lcUniConv/iso8859_9e.h" #include "lcUniConv/jisx0201.h" #include "lcUniConv/tis620.h" #include "lcUniConv/koi8_r.h" #include "lcUniConv/koi8_u.h" #include "lcUniConv/koi8_c.h" #include "lcUniConv/armscii_8.h" #include "lcUniConv/cp1133.h" #include "lcUniConv/mulelao.h" #include "lcUniConv/viscii.h" #include "lcUniConv/tcvn.h" #include "lcUniConv/georgian_academy.h" #include "lcUniConv/georgian_ps.h" #include "lcUniConv/cp1251.h" #include "lcUniConv/cp1255.h" #include "lcUniConv/cp1256.h" #include "lcUniConv/tatar_cyr.h" typedef struct { unsigned short indx; /* index into big table */ unsigned short used; /* bitmask of used entries */ } Summary16; #include "lcUniConv/gb2312.h" #include "lcUniConv/jisx0208.h" #include "lcUniConv/jisx0212.h" #include "lcUniConv/ksc5601.h" #include "lcUniConv/big5.h" #include "lcUniConv/big5_emacs.h" static Utf8ConvRec all_charsets[] = { /* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning (for lookup speed), once at the end (as a fallback). */ { "ISO10646-1", NULLQUARK, utf8_mbtowc, utf8_wctomb }, { "ISO8859-1", NULLQUARK, iso8859_1_mbtowc, iso8859_1_wctomb }, { "ISO8859-2", NULLQUARK, iso8859_2_mbtowc, iso8859_2_wctomb }, { "ISO8859-3", NULLQUARK, iso8859_3_mbtowc, iso8859_3_wctomb }, { "ISO8859-4", NULLQUARK, iso8859_4_mbtowc, iso8859_4_wctomb }, { "ISO8859-5", NULLQUARK, iso8859_5_mbtowc, iso8859_5_wctomb }, { "ISO8859-6", NULLQUARK, iso8859_6_mbtowc, iso8859_6_wctomb }, { "ISO8859-7", NULLQUARK, iso8859_7_mbtowc, iso8859_7_wctomb }, { "ISO8859-8", NULLQUARK, iso8859_8_mbtowc, iso8859_8_wctomb }, { "ISO8859-9", NULLQUARK, iso8859_9_mbtowc, iso8859_9_wctomb }, { "ISO8859-10", NULLQUARK, iso8859_10_mbtowc, iso8859_10_wctomb }, { "ISO8859-13", NULLQUARK, iso8859_13_mbtowc, iso8859_13_wctomb }, { "ISO8859-14", NULLQUARK, iso8859_14_mbtowc, iso8859_14_wctomb }, { "ISO8859-15", NULLQUARK, iso8859_15_mbtowc, iso8859_15_wctomb }, { "ISO8859-16", NULLQUARK, iso8859_16_mbtowc, iso8859_16_wctomb }, { "JISX0201.1976-0", NULLQUARK, jisx0201_mbtowc, jisx0201_wctomb }, { "TIS620-0", NULLQUARK, tis620_mbtowc, tis620_wctomb }, { "GB2312.1980-0", NULLQUARK, gb2312_mbtowc, gb2312_wctomb }, { "JISX0208.1983-0", NULLQUARK, jisx0208_mbtowc, jisx0208_wctomb }, { "JISX0208.1990-0", NULLQUARK, jisx0208_mbtowc, jisx0208_wctomb }, { "JISX0212.1990-0", NULLQUARK, jisx0212_mbtowc, jisx0212_wctomb }, { "KSC5601.1987-0", NULLQUARK, ksc5601_mbtowc, ksc5601_wctomb }, { "KOI8-R", NULLQUARK, koi8_r_mbtowc, koi8_r_wctomb }, { "KOI8-U", NULLQUARK, koi8_u_mbtowc, koi8_u_wctomb }, { "KOI8-C", NULLQUARK, koi8_c_mbtowc, koi8_c_wctomb }, { "TATAR-CYR", NULLQUARK, tatar_cyr_mbtowc, tatar_cyr_wctomb }, { "ARMSCII-8", NULLQUARK, armscii_8_mbtowc, armscii_8_wctomb }, { "IBM-CP1133", NULLQUARK, cp1133_mbtowc, cp1133_wctomb }, { "MULELAO-1", NULLQUARK, mulelao_mbtowc, mulelao_wctomb }, { "VISCII1.1-1", NULLQUARK, viscii_mbtowc, viscii_wctomb }, { "TCVN-5712", NULLQUARK, tcvn_mbtowc, tcvn_wctomb }, { "GEORGIAN-ACADEMY", NULLQUARK, georgian_academy_mbtowc, georgian_academy_wctomb }, { "GEORGIAN-PS", NULLQUARK, georgian_ps_mbtowc, georgian_ps_wctomb }, { "ISO8859-9E", NULLQUARK, iso8859_9e_mbtowc, iso8859_9e_wctomb }, { "MICROSOFT-CP1251", NULLQUARK, cp1251_mbtowc, cp1251_wctomb }, { "MICROSOFT-CP1255", NULLQUARK, cp1255_mbtowc, cp1255_wctomb }, { "MICROSOFT-CP1256", NULLQUARK, cp1256_mbtowc, cp1256_wctomb }, { "BIG5-0", NULLQUARK, big5_mbtowc, big5_wctomb }, { "BIG5-E0", NULLQUARK, big5_0_mbtowc, big5_0_wctomb }, { "BIG5-E1", NULLQUARK, big5_1_mbtowc, big5_1_wctomb }, /* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning (for lookup speed), once at the end (as a fallback). */ { "ISO10646-1", NULLQUARK, utf8_mbtowc, utf8_wctomb }, /* Encoding ISO10646-1 for fonts means UCS2-like encoding so for conversion to FontCharSet we need this record */ { "ISO10646-1", NULLQUARK, ucs2be_mbtowc, ucs2be_wctomb } }; #define charsets_table_size (sizeof(all_charsets)/sizeof(all_charsets[0])) #define all_charsets_count (charsets_table_size - 1) #define ucs2_conv_index (charsets_table_size - 1) static void init_all_charsets (void) { Utf8Conv convptr; int i; for (convptr = all_charsets, i = charsets_table_size; i > 0; convptr++, i--) convptr->xrm_name = XrmStringToQuark(convptr->name); } #define lazy_init_all_charsets() \ do { \ if (all_charsets[0].xrm_name == NULLQUARK) \ init_all_charsets(); \ } while (0) /* from XlcNCharSet to XlcNUtf8String */ static int cstoutf8( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { XlcCharSet charset; const char *name; Utf8Conv convptr; int i; unsigned char const *src; unsigned char const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; if (num_args < 1) return -1; charset = (XlcCharSet) args[0]; name = charset->encoding_name; /* not charset->name because the latter has a ":GL"/":GR" suffix */ for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--) if (!strcmp(convptr->name, name)) break; if (i == 0) return -1; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend) { ucs4_t wc; int consumed; int count; consumed = convptr->cstowc(conv, &wc, src, srcend-src); if (consumed == RET_ILSEQ) return -1; if (consumed == RET_TOOFEW(0)) break; count = utf8_wctomb(NULL, dst, wc, dstend-dst); if (count == RET_TOOSMALL) break; if (count == RET_ILSEQ) { count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst); if (count == RET_TOOSMALL) break; unconv_num++; } src += consumed; dst += count; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return unconv_num; } static XlcConvMethodsRec methods_cstoutf8 = { close_converter, cstoutf8, NULL }; static XlcConv open_cstoutf8( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { lazy_init_all_charsets(); return create_conv(from_lcd, &methods_cstoutf8); } /* from XlcNUtf8String to XlcNCharSet */ static XlcConv create_tocs_conv( XLCd lcd, XlcConvMethods methods) { XlcConv conv; CodeSet *codeset_list; int codeset_num; int charset_num; int i, j, k; Utf8Conv *preferred; lazy_init_all_charsets(); codeset_list = XLC_GENERIC(lcd, codeset_list); codeset_num = XLC_GENERIC(lcd, codeset_num); charset_num = 0; for (i = 0; i < codeset_num; i++) charset_num += codeset_list[i]->num_charsets; if (charset_num > all_charsets_count-1) charset_num = all_charsets_count-1; conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + (charset_num + 1) * sizeof(Utf8Conv)); if (conv == (XlcConv) NULL) return (XlcConv) NULL; preferred = (Utf8Conv *) ((char *) conv + sizeof(XlcConvRec)); /* Loop through all codesets mentioned in the locale. */ charset_num = 0; for (i = 0; i < codeset_num; i++) { XlcCharSet *charsets = codeset_list[i]->charset_list; int num_charsets = codeset_list[i]->num_charsets; for (j = 0; j < num_charsets; j++) { const char *name = charsets[j]->encoding_name; /* If it wasn't already encountered... */ for (k = charset_num-1; k >= 0; k--) if (!strcmp(preferred[k]->name, name)) break; if (k < 0) { /* Look it up in all_charsets[]. */ for (k = 0; k < all_charsets_count-1; k++) if (!strcmp(all_charsets[k].name, name)) { /* Add it to the preferred set. */ preferred[charset_num++] = &all_charsets[k]; break; } } } } preferred[charset_num] = (Utf8Conv) NULL; conv->methods = methods; conv->state = (XPointer) preferred; return conv; } static void close_tocs_converter( XlcConv conv) { /* conv->state is allocated together with conv, free both at once. */ Xfree((char *) conv); } /* * Converts a Unicode character to an appropriate character set. The NULL * terminated array of preferred character sets is passed as first argument. * If successful, *charsetp is set to the character set that was used, and * *sidep is set to the character set side (XlcGL or XlcGR). */ static int charset_wctocs( Utf8Conv *preferred, Utf8Conv *charsetp, XlcSide *sidep, XlcConv conv, unsigned char *r, ucs4_t wc, int n) { int count; Utf8Conv convptr; int i; for (; *preferred != (Utf8Conv) NULL; preferred++) { convptr = *preferred; count = convptr->wctocs(conv, r, wc, n); if (count == RET_TOOSMALL) return RET_TOOSMALL; if (count != RET_ILSEQ) { *charsetp = convptr; *sidep = (*r < 0x80 ? XlcGL : XlcGR); return count; } } for (convptr = all_charsets+1, i = all_charsets_count-1; i > 0; convptr++, i--) { count = convptr->wctocs(conv, r, wc, n); if (count == RET_TOOSMALL) return RET_TOOSMALL; if (count != RET_ILSEQ) { *charsetp = convptr; *sidep = (*r < 0x80 ? XlcGL : XlcGR); return count; } } return RET_ILSEQ; } static int utf8tocs( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { Utf8Conv *preferred_charsets; XlcCharSet last_charset = NULL; unsigned char const *src; unsigned char const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; preferred_charsets = (Utf8Conv *) conv->state; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend && dst < dstend) { Utf8Conv chosen_charset = NULL; XlcSide chosen_side = XlcNONE; ucs4_t wc; int consumed; int count; consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); if (consumed == RET_TOOFEW(0)) break; if (consumed == RET_ILSEQ) { src++; unconv_num++; continue; } count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); if (count == RET_TOOSMALL) break; if (count == RET_ILSEQ) { src += consumed; unconv_num++; continue; } if (last_charset == NULL) { last_charset = _XlcGetCharSetWithSide(chosen_charset->name, chosen_side); if (last_charset == NULL) { src += consumed; unconv_num++; continue; } } else { if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name && (last_charset->side == XlcGLGR || last_charset->side == chosen_side))) break; } src += consumed; dst += count; } if (last_charset == NULL) return -1; *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; if (num_args >= 1) *((XlcCharSet *)args[0]) = last_charset; return unconv_num; } static XlcConvMethodsRec methods_utf8tocs = { close_tocs_converter, utf8tocs, NULL }; static XlcConv open_utf8tocs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_tocs_conv(from_lcd, &methods_utf8tocs); } /* from XlcNUtf8String to XlcNChar */ static int utf8tocs1( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { Utf8Conv *preferred_charsets; XlcCharSet last_charset = NULL; unsigned char const *src; unsigned char const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; preferred_charsets = (Utf8Conv *) conv->state; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend && dst < dstend) { Utf8Conv chosen_charset = NULL; XlcSide chosen_side = XlcNONE; ucs4_t wc; int consumed; int count; consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); if (consumed == RET_TOOFEW(0)) break; if (consumed == RET_ILSEQ) { src++; unconv_num++; continue; } count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); if (count == RET_TOOSMALL) break; if (count == RET_ILSEQ) { src += consumed; unconv_num++; continue; } if (last_charset == NULL) { last_charset = _XlcGetCharSetWithSide(chosen_charset->name, chosen_side); if (last_charset == NULL) { src += consumed; unconv_num++; continue; } } else { if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name && (last_charset->side == XlcGLGR || last_charset->side == chosen_side))) break; } src += consumed; dst += count; break; } if (last_charset == NULL) return -1; *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; if (num_args >= 1) *((XlcCharSet *)args[0]) = last_charset; return unconv_num; } static XlcConvMethodsRec methods_utf8tocs1 = { close_tocs_converter, utf8tocs1, NULL }; static XlcConv open_utf8tocs1( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_tocs_conv(from_lcd, &methods_utf8tocs1); } /* from XlcNUtf8String to XlcNString */ static int utf8tostr( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { unsigned char const *src; unsigned char const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend) { unsigned char c; ucs4_t wc; int consumed; consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); if (consumed == RET_TOOFEW(0)) break; if (dst == dstend) break; if (consumed == RET_ILSEQ) { consumed = 1; c = BAD_CHAR; unconv_num++; } else { if ((wc & ~(ucs4_t)0xff) != 0) { c = BAD_CHAR; unconv_num++; } else c = (unsigned char) wc; } *dst++ = c; src += consumed; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return unconv_num; } static XlcConvMethodsRec methods_utf8tostr = { close_converter, utf8tostr, NULL }; static XlcConv open_utf8tostr( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_utf8tostr); } /* from XlcNString to XlcNUtf8String */ static int strtoutf8( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { unsigned char const *src; unsigned char const *srcend; unsigned char *dst; unsigned char *dstend; if (from == NULL || *from == NULL) return 0; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; while (src < srcend) { int count = utf8_wctomb(NULL, dst, *src, dstend-dst); if (count == RET_TOOSMALL) break; dst += count; src++; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return 0; } static XlcConvMethodsRec methods_strtoutf8 = { close_converter, strtoutf8, NULL }; static XlcConv open_strtoutf8( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_strtoutf8); } /* Support for the input methods. */ XPointer _Utf8GetConvByName( const char *name) { XrmQuark xrm_name; Utf8Conv convptr; int i; if (name == NULL) return (XPointer) NULL; lazy_init_all_charsets(); xrm_name = XrmStringToQuark(name); for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--) if (convptr->xrm_name == xrm_name) return (XPointer) convptr->wctocs; return (XPointer) NULL; } /* from XlcNUcsChar to XlcNChar, needed for input methods */ static XlcConv create_ucstocs_conv( XLCd lcd, XlcConvMethods methods) { if (XLC_PUBLIC_PART(lcd)->codeset && _XlcCompareISOLatin1(XLC_PUBLIC_PART(lcd)->codeset, "UTF-8") == 0) { XlcConv conv; Utf8Conv *preferred; lazy_init_all_charsets(); conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + 2 * sizeof(Utf8Conv)); if (conv == (XlcConv) NULL) return (XlcConv) NULL; preferred = (Utf8Conv *) ((char *) conv + sizeof(XlcConvRec)); preferred[0] = &all_charsets[0]; /* ISO10646 */ preferred[1] = (Utf8Conv) NULL; conv->methods = methods; conv->state = (XPointer) preferred; return conv; } else { return create_tocs_conv(lcd, methods); } } static int charset_wctocs_exactly( Utf8Conv *preferred, Utf8Conv *charsetp, XlcSide *sidep, XlcConv conv, unsigned char *r, ucs4_t wc, int n) { int count; Utf8Conv convptr; for (; *preferred != (Utf8Conv) NULL; preferred++) { convptr = *preferred; count = convptr->wctocs(conv, r, wc, n); if (count == RET_TOOSMALL) return RET_TOOSMALL; if (count != RET_ILSEQ) { *charsetp = convptr; *sidep = (*r < 0x80 ? XlcGL : XlcGR); return count; } } return RET_ILSEQ; } static int ucstocs1( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { ucs4_t const *src = (ucs4_t const *) *from; unsigned char *dst = (unsigned char *) *to; int unconv_num = 0; Utf8Conv *preferred_charsets = (Utf8Conv *) conv->state; Utf8Conv chosen_charset = NULL; XlcSide chosen_side = XlcNONE; XlcCharSet charset = NULL; int count; if (from == NULL || *from == NULL) return 0; count = charset_wctocs_exactly(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, *src, *to_left); if (count < 1) { unconv_num++; count = 0; } else { charset = _XlcGetCharSetWithSide(chosen_charset->name, chosen_side); } if (charset == NULL) return -1; *from = (XPointer) ++src; (*from_left)--; *to = (XPointer) dst; *to_left -= count; if (num_args >= 1) *((XlcCharSet *)args[0]) = charset; return unconv_num; } static XlcConvMethodsRec methods_ucstocs1 = { close_tocs_converter, ucstocs1, NULL }; static XlcConv open_ucstocs1( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_ucstocs_conv(from_lcd, &methods_ucstocs1); } /* from XlcNUcsChar to XlcNUtf8String, needed for input methods */ static int ucstoutf8( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { const ucs4_t *src; const ucs4_t *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; src = (const ucs4_t *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend) { int count = utf8_wctomb(NULL, dst, *src, dstend-dst); if (count == RET_TOOSMALL) break; if (count == RET_ILSEQ) unconv_num++; src++; dst += count; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return unconv_num; } static XlcConvMethodsRec methods_ucstoutf8 = { close_converter, ucstoutf8, NULL }; static XlcConv open_ucstoutf8( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_ucstoutf8); } /* Registers UTF-8 converters for a non-UTF-8 locale. */ void _XlcAddUtf8Converters( XLCd lcd) { _XlcSetConverter(lcd, XlcNCharSet, lcd, XlcNUtf8String, open_cstoutf8); _XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNCharSet, open_utf8tocs); _XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNChar, open_utf8tocs1); _XlcSetConverter(lcd, XlcNString, lcd, XlcNUtf8String, open_strtoutf8); _XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNString, open_utf8tostr); _XlcSetConverter(lcd, XlcNUcsChar, lcd, XlcNChar, open_ucstocs1); _XlcSetConverter(lcd, XlcNUcsChar, lcd, XlcNUtf8String, open_ucstoutf8); } /***************************************************************************/ /* Part II: An UTF-8 locale loader. * * Here we can assume that "multi-byte" is UTF-8 and that `wchar_t' is Unicode. */ /* from XlcNMultiByte to XlcNWideChar */ static int utf8towcs( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { unsigned char const *src; unsigned char const *srcend; wchar_t *dst; wchar_t *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (wchar_t *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend && dst < dstend) { ucs4_t wc; int consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); if (consumed == RET_TOOFEW(0)) break; if (consumed == RET_ILSEQ) { src++; *dst = BAD_WCHAR; unconv_num++; } else { src += consumed; *dst = wc; } dst++; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return unconv_num; } static XlcConvMethodsRec methods_utf8towcs = { close_converter, utf8towcs, NULL }; static XlcConv open_utf8towcs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_utf8towcs); } /* from XlcNWideChar to XlcNMultiByte */ static int wcstoutf8( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { wchar_t const *src; wchar_t const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; src = (wchar_t const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend) { int count = utf8_wctomb(NULL, dst, *src, dstend-dst); if (count == RET_TOOSMALL) break; if (count == RET_ILSEQ) { count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst); if (count == RET_TOOSMALL) break; unconv_num++; } dst += count; src++; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return unconv_num; } static XlcConvMethodsRec methods_wcstoutf8 = { close_converter, wcstoutf8, NULL }; static XlcConv open_wcstoutf8( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_wcstoutf8); } /* from XlcNString to XlcNWideChar */ static int our_strtowcs( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { unsigned char const *src; unsigned char const *srcend; wchar_t *dst; wchar_t *dstend; if (from == NULL || *from == NULL) return 0; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (wchar_t *) *to; dstend = dst + *to_left; while (src < srcend && dst < dstend) *dst++ = (wchar_t) *src++; *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return 0; } static XlcConvMethodsRec methods_strtowcs = { close_converter, our_strtowcs, NULL }; static XlcConv open_strtowcs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_strtowcs); } /* from XlcNWideChar to XlcNString */ static int our_wcstostr( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { wchar_t const *src; wchar_t const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; src = (wchar_t const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend && dst < dstend) { unsigned int wc = *src++; if (wc < 0x80) *dst = wc; else { *dst = BAD_CHAR; unconv_num++; } dst++; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return unconv_num; } static XlcConvMethodsRec methods_wcstostr = { close_converter, our_wcstostr, NULL }; static XlcConv open_wcstostr( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_wcstostr); } /* from XlcNCharSet to XlcNWideChar */ static int cstowcs( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { XlcCharSet charset; const char *name; Utf8Conv convptr; int i; unsigned char const *src; unsigned char const *srcend; wchar_t *dst; wchar_t *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; if (num_args < 1) return -1; charset = (XlcCharSet) args[0]; name = charset->encoding_name; /* not charset->name because the latter has a ":GL"/":GR" suffix */ for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--) if (!strcmp(convptr->name, name)) break; if (i == 0) return -1; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (wchar_t *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend && dst < dstend) { unsigned int wc; int consumed; consumed = convptr->cstowc(conv, &wc, src, srcend-src); if (consumed == RET_ILSEQ) return -1; if (consumed == RET_TOOFEW(0)) break; *dst++ = wc; src += consumed; } *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return unconv_num; } static XlcConvMethodsRec methods_cstowcs = { close_converter, cstowcs, NULL }; static XlcConv open_cstowcs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { lazy_init_all_charsets(); return create_conv(from_lcd, &methods_cstowcs); } /* from XlcNWideChar to XlcNCharSet */ static int wcstocs( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { Utf8Conv *preferred_charsets; XlcCharSet last_charset = NULL; wchar_t const *src; wchar_t const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; preferred_charsets = (Utf8Conv *) conv->state; src = (wchar_t const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend && dst < dstend) { Utf8Conv chosen_charset = NULL; XlcSide chosen_side = XlcNONE; wchar_t wc = *src; int count; count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); if (count == RET_TOOSMALL) break; if (count == RET_ILSEQ) { src++; unconv_num++; continue; } if (last_charset == NULL) { last_charset = _XlcGetCharSetWithSide(chosen_charset->name, chosen_side); if (last_charset == NULL) { src++; unconv_num++; continue; } } else { if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name && (last_charset->side == XlcGLGR || last_charset->side == chosen_side))) break; } src++; dst += count; } if (last_charset == NULL) return -1; *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; if (num_args >= 1) *((XlcCharSet *)args[0]) = last_charset; return unconv_num; } static XlcConvMethodsRec methods_wcstocs = { close_tocs_converter, wcstocs, NULL }; static XlcConv open_wcstocs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_tocs_conv(from_lcd, &methods_wcstocs); } /* from XlcNWideChar to XlcNChar */ static int wcstocs1( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { Utf8Conv *preferred_charsets; XlcCharSet last_charset = NULL; wchar_t const *src; wchar_t const *srcend; unsigned char *dst; unsigned char *dstend; int unconv_num; if (from == NULL || *from == NULL) return 0; preferred_charsets = (Utf8Conv *) conv->state; src = (wchar_t const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; unconv_num = 0; while (src < srcend && dst < dstend) { Utf8Conv chosen_charset = NULL; XlcSide chosen_side = XlcNONE; wchar_t wc = *src; int count; count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); if (count == RET_TOOSMALL) break; if (count == RET_ILSEQ) { src++; unconv_num++; continue; } if (last_charset == NULL) { last_charset = _XlcGetCharSetWithSide(chosen_charset->name, chosen_side); if (last_charset == NULL) { src++; unconv_num++; continue; } } else { if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name && (last_charset->side == XlcGLGR || last_charset->side == chosen_side))) break; } src++; dst += count; break; } if (last_charset == NULL) return -1; *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; if (num_args >= 1) *((XlcCharSet *)args[0]) = last_charset; return unconv_num; } static XlcConvMethodsRec methods_wcstocs1 = { close_tocs_converter, wcstocs1, NULL }; static XlcConv open_wcstocs1( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_tocs_conv(from_lcd, &methods_wcstocs1); } /* trivial, no conversion */ static int identity( XlcConv conv, XPointer *from, int *from_left, XPointer *to, int *to_left, XPointer *args, int num_args) { unsigned char const *src; unsigned char const *srcend; unsigned char *dst; unsigned char *dstend; if (from == NULL || *from == NULL) return 0; src = (unsigned char const *) *from; srcend = src + *from_left; dst = (unsigned char *) *to; dstend = dst + *to_left; while (src < srcend && dst < dstend) *dst++ = *src++; *from = (XPointer) src; *from_left = srcend - src; *to = (XPointer) dst; *to_left = dstend - dst; return 0; } static XlcConvMethodsRec methods_identity = { close_converter, identity, NULL }; static XlcConv open_identity( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_conv(from_lcd, &methods_identity); } /* from MultiByte/WideChar to FontCharSet. */ /* They really use converters to CharSet * but with different create_conv procedure. */ static XlcConv create_tofontcs_conv( XLCd lcd, XlcConvMethods methods) { XlcConv conv; int i, num, k, count; char **value, buf[20]; Utf8Conv *preferred; lazy_init_all_charsets(); for (i = 0, num = 0;; i++) { sprintf(buf, "fs%d.charset.name", i); _XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count); if (count < 1) { sprintf(buf, "fs%d.charset", i); _XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count); if (count < 1) break; } num += count; } conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + (num + 1) * sizeof(Utf8Conv)); if (conv == (XlcConv) NULL) return (XlcConv) NULL; preferred = (Utf8Conv *) ((char *) conv + sizeof(XlcConvRec)); /* Loop through all fontsets mentioned in the locale. */ for (i = 0, num = 0;; i++) { sprintf(buf, "fs%d.charset.name", i); _XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count); if (count < 1) { sprintf(buf, "fs%d.charset", i); _XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count); if (count < 1) break; } while (count-- > 0) { XlcCharSet charset = _XlcGetCharSet(*value++); const char *name = charset->encoding_name; /* If it wasn't already encountered... */ for (k = num - 1; k >= 0; k--) if (!strcmp(preferred[k]->name, name)) break; if (k < 0) { /* For fonts "ISO10646-1" means not utf8 but ucs2.*/ if (!strcmp("ISO10646-1", name)) { preferred[num++] = &all_charsets[ucs2_conv_index]; continue; } /* Look it up in all_charsets[]. */ for (k = 0; k < all_charsets_count-1; k++) if (!strcmp(all_charsets[k].name, name)) { /* Add it to the preferred set. */ preferred[num++] = &all_charsets[k]; break; } } } } preferred[num] = (Utf8Conv) NULL; conv->methods = methods; conv->state = (XPointer) preferred; return conv; } static XlcConv open_wcstofcs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_tofontcs_conv(from_lcd, &methods_wcstocs); } static XlcConv open_utf8tofcs( XLCd from_lcd, const char *from_type, XLCd to_lcd, const char *to_type) { return create_tofontcs_conv(from_lcd, &methods_utf8tocs); } XLCd _XlcUtf8Loader( const char *name) { XLCd lcd; lcd = _XlcCreateLC(name, _XlcGenericMethods); if (lcd == (XLCd) NULL) return lcd; /* The official IANA name for UTF-8 is "UTF-8" in upper case with a dash. */ if (!XLC_PUBLIC_PART(lcd)->codeset || (_XlcCompareISOLatin1(XLC_PUBLIC_PART(lcd)->codeset, "UTF-8"))) { _XlcDestroyLC(lcd); return (XLCd) NULL; } /* Register elementary converters. */ _XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNWideChar, open_utf8towcs); _XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNMultiByte, open_wcstoutf8); _XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNString, open_wcstostr); _XlcSetConverter(lcd, XlcNString, lcd, XlcNWideChar, open_strtowcs); /* Register converters for XlcNCharSet. This implicitly provides * converters from and to XlcNCompoundText. */ _XlcSetConverter(lcd, XlcNCharSet, lcd, XlcNMultiByte, open_cstoutf8); _XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNCharSet, open_utf8tocs); _XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNChar, open_utf8tocs1); _XlcSetConverter(lcd, XlcNCharSet, lcd, XlcNWideChar, open_cstowcs); _XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNCharSet, open_wcstocs); _XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNChar, open_wcstocs1); _XlcSetConverter(lcd, XlcNString, lcd, XlcNMultiByte, open_strtoutf8); _XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNString, open_utf8tostr); _XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNMultiByte, open_identity); _XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNUtf8String, open_identity); /* Register converters for XlcNFontCharSet */ _XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNFontCharSet, open_utf8tofcs); _XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNFontCharSet, open_wcstofcs); _XlcAddUtf8Converters(lcd); return lcd; }