/*
 * Copyright (c) 2010
 * by AVM GmbH Berlin, Germany
 *
 * Licence: Free, use with no restriction.
 */

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

/*
 * The definitions in this file are self-contained, so the project header is
 * only pulled in when the preprocessor can actually locate it. Compilers that
 * lack __has_include include it unconditionally.
 */
#if defined(__has_include)
#  if __has_include("charencoding.h")
#    include "charencoding.h"
#  endif
#else
#  include "charencoding.h"
#endif

/*
 * Encode one Unicode code point as UTF-8.
 *
 * codepoint          value to encode
 * buf                destination; at most max_buf bytes are written and no
 *                    terminating NUL is appended
 * max_buf            number of bytes available at buf
 * fallback_codepoint encoded instead when codepoint is above 0x10ffff
 *
 * Returns the number of bytes written (1..4). Returns 0 when the encoded
 * sequence does not fit into max_buf, or when both codepoint and
 * fallback_codepoint are above 0x10ffff (nothing is written in either case).
 *
 * NOTE: values in the surrogate range 0xd800..0xdfff are not rejected; they
 * are encoded as three bytes like any other value up to 0xffff.
 */
size_t ConvertSingleUnicodeCodepointTo_UTF8_WithFallback(unsigned long codepoint, unsigned char *buf, size_t max_buf, unsigned long fallback_codepoint)
{
	if (codepoint > 0x10ffff) {
		/* An unencodable fallback must not be retried endlessly. */
		if (fallback_codepoint > 0x10ffff)
			return 0;
		codepoint = fallback_codepoint;
	}

	if (codepoint <= 0x7f) {
		if (max_buf < 1)
			return 0;
		buf[0] = (unsigned char)codepoint;
		return 1;
	}

	if (codepoint <= 0x7ff) {
		if (max_buf < 2)
			return 0;
		buf[0] = (unsigned char)(0xc0 | ((codepoint >> 6) & 0x1f));
		buf[1] = (unsigned char)(0x80 | (codepoint & 0x3f));
		return 2;
	}

	if (codepoint <= 0xffff) {
		if (max_buf < 3)
			return 0;
		buf[0] = (unsigned char)(0xe0 | ((codepoint >> 12) & 0x0f));
		buf[1] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f));
		buf[2] = (unsigned char)(0x80 | (codepoint & 0x3f));
		return 3;
	}

	/* 0x10000 .. 0x10ffff */
	if (max_buf < 4)
		return 0;
	buf[0] = (unsigned char)(0xf0 | ((codepoint >> 18) & 0x07));
	buf[1] = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3f));
	buf[2] = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3f));
	buf[3] = (unsigned char)(0x80 | (codepoint & 0x3f));
	return 4;
}

/*
 * Decode one UTF-8 sequence from the start of buf.
 *
 * buf             input bytes; at most max_buf bytes are read
 * max_buf         number of readable bytes at buf
 * pBytesConverted out, must not be NULL: length of the decoded sequence
 *                 (1..4), or 0 on failure
 *
 * Returns the decoded code point. On failure returns 0 and stores 0 in
 * *pBytesConverted; since 0 is also a valid code point, callers must test
 * *pBytesConverted to detect errors.
 *
 * Fails when max_buf is 0, the lead byte is not a valid UTF-8 lead byte, the
 * sequence is longer than max_buf, a trailing byte is not of the form
 * 10xxxxxx, the sequence is an overlong encoding, or the value is above
 * 0x10ffff.
 *
 * NOTE: values in the surrogate range 0xd800..0xdfff are not rejected.
 */
unsigned long ConvertSingle_UTF8_ToUnicodeCodepoint(unsigned char *buf, size_t max_buf, /*out */size_t *pBytesConverted)
{
	unsigned long codepoint;
	unsigned long smallest; /* smallest value that needs this sequence length */
	size_t seq_len;
	size_t i;
	unsigned char lead;

	if (0 == max_buf)
		goto err;

	lead = buf[0];
	if (lead < 0x80) {
		*pBytesConverted = 1;
		return (unsigned long)lead;
	}

	if (0xc0 == (lead & 0xe0)) {
		seq_len = 2;
		smallest = 0x80;
		codepoint = (unsigned long)(lead & 0x1f);
	} else if (0xe0 == (lead & 0xf0)) {
		seq_len = 3;
		smallest = 0x800;
		codepoint = (unsigned long)(lead & 0x0f);
	} else if (0xf0 == (lead & 0xf8)) {
		seq_len = 4;
		smallest = 0x10000;
		codepoint = (unsigned long)(lead & 0x07);
	} else {
		goto err; /* stray continuation byte or invalid lead byte */
	}

	if (max_buf < seq_len)
		goto err;

	for (i = 1; i < seq_len; i++) {
		unsigned char c = buf[i];

		/* every trailing byte must look like 10xxxxxx */
		if (0x80 != (c & 0xc0))
			goto err;
		codepoint = (codepoint << 6) | (unsigned long)(c & 0x3f);
	}

	/* reject overlong encodings and values beyond the Unicode range */
	if (codepoint < smallest || codepoint > 0x10ffff)
		goto err;

	*pBytesConverted = seq_len;
	return codepoint;

err:
	*pBytesConverted = 0;
	return 0;
}
// -----------------------------------------------------------------------------------------

/*
 * Convert a NUL-terminated ISO-8859-1 string to a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * iso  input string; NULL is treated like a failure
 *
 * Returns a heap buffer the caller must free(). On failure (NULL input, size
 * overflow or out of memory) an allocated empty string is returned instead;
 * if even that allocation fails the result is NULL.
 */
char *ConvertStringFromISO8859_1ToUTF8_WithAlloc(char *iso)
{
	const unsigned char *pin = (const unsigned char *)iso;
	const unsigned char *scan;
	unsigned char *out_buf;
	unsigned char *pout;
	size_t in_len = 0;
	size_t high_bytes = 0; /* input bytes >= 0x80, each needs one extra output byte */
	size_t out_space;

	if (!iso)
		return (char *)calloc(1, 1); // failure

	// Size the output exactly: bytes below 0x80 stay one byte, all others
	// become a two-byte sequence. This avoids growing the buffer later.
	for (scan = pin; *scan; scan++) {
		in_len++;
		if (*scan >= 0x80)
			high_bytes++;
	}
	if (high_bytes > (size_t)-1 - 1 - in_len)
		return (char *)calloc(1, 1); // failure: size would overflow

	out_space = in_len + high_bytes;
	out_buf = (unsigned char *)malloc(out_space + 1);
	if (!out_buf)
		return (char *)calloc(1, 1); // failure

	pout = out_buf;
	while (*pin) {
		size_t BytesConverted = ConvertSingleUnicodeCodepointTo_UTF8_WithFallback(
			(unsigned long)*pin++, pout, out_space, (unsigned long)'.');

		if (!BytesConverted)
			break; // not expected, the buffer was sized for the worst case
		pout += BytesConverted;
		out_space -= BytesConverted;
	}
	*pout = '\0';
	return (char *)out_buf;
}

/*
 * Convert a NUL-terminated UTF-8 string to ISO-8859-1 in place.
 *
 * The same buffer is used for input and output because the ISO-8859-1
 * encoding is never longer than the UTF-8 encoding.
 *
 * utf8              string to convert; overwritten with the result
 * fallback_iso_char written for every code point above 0xff; if it is 0,
 *                   such a code point makes the conversion fail instead
 *
 * Returns  0 on success,
 *         -1 if utf8 is NULL or a sequence could not be decoded,
 *         -2 if a code point is above 0xff and no fallback was given.
 * On failure the part of the buffer before the offending sequence may
 * already have been overwritten with converted bytes.
 */
int ConvertStringFromUTF8ToISO8859_1_With_Fallback(char *utf8, char fallback_iso_char)
{
	unsigned char *pin;
	unsigned char *pout;
	size_t remaining; /* size_t: an int would truncate very long strings */

	if (!utf8)
		return -1; // failed

	remaining = strlen(utf8);
	pin = (unsigned char *)utf8;
	pout = (unsigned char *)utf8;

	while (remaining > 0) {
		size_t BytesConverted;
		unsigned long codepoint = ConvertSingle_UTF8_ToUnicodeCodepoint(pin, remaining, &BytesConverted);

		if (!BytesConverted || BytesConverted > remaining)
			return -1; // failed

		if (codepoint > 0xff) {
			if (0 == fallback_iso_char)
				return -2; // failed - cant represent in ISO8859-1
			codepoint = (unsigned long)(unsigned char)fallback_iso_char;
		}

		*pout++ = (unsigned char)(codepoint & 0xff);
		remaining -= BytesConverted;
		pin += BytesConverted;
	}
	*pout = '\0';
	return 0; // ok
}