/*------------------------------------------------------------------ * wcsfc_s.c * * September 2017, Reini Urban * * Copyright (c) 2017 by Reini Urban * All rights reserved. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. *------------------------------------------------------------------ */ #ifdef FOR_DOXYGEN #include "safe_str_lib.h" #else #include "safeclib_private.h" #include #include #ifdef HAVE_UNISTD_H #include /* DEBUG */ #endif #include #endif /* with lithuanian only grave, acute, tilde above, and ogonek See http://unicode.org/reports/tr21/tr21-5.html#SpecialCasing */ static inline int _is_lt_accented(uint32_t wc) { return (wc == 0x300 || /* grave */ wc == 0x301 || /* acute */ wc == 0x303 || /* tilde */ wc == 0x328) /* ogonek */ ? 1 : 0; } /** * @def wcsfc_s(dest,dmax,src,lenp) * @brief * Converts the wide string via full case-folding NFD normalized to * lowercase. The conversion stops at the first null or after dmax characters. * The conversion is determined by the LC_CTYPE category setting of the locale. * Other characters are not affected. fold-case performs full case folding, * i.e. if iswfc() of a character > 1, the length of dest might be greater than * the length of src (max 4 per char), the conversion is done via \c towfc_s() * and Unicode 10.0, the Unicode special-casing rules are obeyed, and composed * characters are normalized to NFD via \c wcsnorm_decompose_s() and \c * wcsnorm_reorder_s(). If not, the conversion is per character done via normal * \c towlower(). Note that decomposition creates larger strings, typically * 2-3 chars more. * * With SAFECLIB_STR_NULL_SLACK defined all elements following the * terminating null character (if any) written in the * array of dmax characters pointed to by dest are nulled. * * @details * SpecialCasing checks for conditional boundary context at the begin or end * of certain characters (final greek sigma), and locale sensitive rules * for the Lithuanian and the Turkish/Azeri I-dot. * * Composed characters are checked for the left-hand-side of the * Decomposition_Mapping Unicode property, which means the codepoint will be * normalized to NFD if any codepoint is composed. Technically only FCD as * all FC expansions are already properly ordered, and all mangled marks will * not be reordered, as the have the same Combining Class. * * @param[out] dest wide string to hold the result (~130% larger than src) * @param[in] dmax maximum result buffer size * @param[in] src wide string * @param[out] lenp pointer to length of the result, may be NULL * * @pre dest and src shall not be null pointers. * @pre dmax shall not be smaller than 5 and big enough for dest. * @pre dmax shall not be greater than RSIZE_MAX_WSTR and size of dest. * * @retval EOK on successful operation * @retval ESNULLP when dest or src is NULL pointer * @retval ESZEROL when dmax = 0 * @retval ESLEMAX when dmax > RSIZE_MAX_WSTR or a cp > 0x10ffff * @retval EOVERFLOW when dmax > size of dest (optionally, when the compiler * knows the object_size statically) * @retval ESLEWRNG when dmax != size of dest and --enable-error-dmax * @retval ESNOSPC when dmax is too small for the decomposition. * *lenp is still written, to know how much space is * needed. * @retval ESNOTFND Internal error as returned by towfc_s() for multi-char * foldings. happens only when the internal implementations of iswfc() and * towfc_s() are mismatched. * * @see * iswfc(), towfc_s(), towupper(), wcslwr_s(), wcsupr_s() * */ #ifdef FOR_DOXYGEN errno_t wcsfc_s(wchar_t *restrict dest, rsize_t dmax, const wchar_t *restrict src, rsize_t *restrict lenp) #else EXPORT errno_t _wcsfc_s_chk(wchar_t *restrict dest, rsize_t dmax, const wchar_t *restrict src, rsize_t *restrict lenp, const size_t destbos) #endif { wchar_t *orig_dest; rsize_t orig_dmax; int is_lithuanian = 0; int is_tr_az = 0; if (lenp) *lenp = 0; CHK_DEST_NULL("wcsfc_s") CHK_DMAX_ZERO("wcsfc_s") CHK_DMAX_MAX("wcsfc_s", RSIZE_MAX_WSTR) if (destbos == BOS_UNKNOWN) { BND_CHK_PTR_BOUNDS(dest, dmax * sizeof(wchar_t)); } else { const size_t destsz = dmax * sizeof(wchar_t); CHK_DESTW_OVR_CLEAR("wcsfc_s", destsz, destbos) } if (unlikely(src == NULL)) { handle_werror(dest, dmax, "wcsfc_s: " "src is null", ESNULLP); return (ESNULLP); } if (unlikely((*src >= 0x49 && *src <= 0x130) && (*src == 0x49 || *src == 0x4A || *src == 0x130 || *src == 0xcc || *src == 0xcd || *src == 0x128 || *src == 0x12e))) { const char *lc = setlocale(LC_CTYPE, NULL); if (lc && *lc) { if ((*lc == 't' && *(lc + 1) == 'r') || (*lc == 'a' && *(lc + 1) == 'z')) is_tr_az = 1; else if (*lc == 'l' && *(lc + 1) == 't') is_lithuanian = 1; } } orig_dest = dest; orig_dmax = dmax; while (*src && dmax > 0) { wchar_t tmp[4]; uint32_t cp = _dec_w16((wchar_t *)src); int c = iswfc(cp); #if SIZEOF_WCHAR_T == 2 if (cp > 0xffff) src++; #endif if (unlikely(c > 1)) { /* can this be further decomposed? */ errno_t rc = towfc_s(tmp, 4, cp); if (rc < 0) return rc; /* I-Dot for Turkish and Azeri */ if (unlikely(is_tr_az && *src == 0x130)) { *dest++ = 0x69; dmax--; /* skip the \x307 dot */ } else if (unlikely(*src >= 0x1f80 && *src <= 0x1ff4)) { /* only some greek PROSGEGRAMMENI + YPOGEGRAMMENI need to be * decomposed further */ /* decompose mult. chars */ wchar_t tmpd[8]; int i; for (i = 0; i < c; i++) { int d; uint32_t cp1 = _dec_w16(&tmp[i]); #if SIZEOF_WCHAR_T > 2 if (unlikely(_UNICODE_MAX < cp1)) { handle_werror(orig_dest, orig_dmax, "wcsfc_s: " "cp is too high", ESLEMAX); return ESLEMAX; } #endif d = _decomp_s(tmpd, 8, cp1, false); if (d) { /* decomp. max 4 */ memcpy(dest, tmpd, d * sizeof(wchar_t)); dest += d; dmax -= d; } else { _ENC_W16(dest, dmax, cp1); } } } else { memcpy(dest, tmp, c * sizeof(wchar_t)); dest += c; dmax -= c; } src++; } else { /* c = 0 or 1. 1 might still be special case */ if (unlikely(*src == 0x3a3)) { if (iswspace(*(src + 1))) /* final sigma? */ *dest++ = 0x3c2; else *dest++ = 0x3c3; src++; dmax--; } else if (unlikely(is_lithuanian)) { /* I-Dot/J-Dot for Lithuanian, I-Dot for Turkish and Azeri. http://unicode.org/reports/tr21/tr21-5.html#SpecialCasing */ switch (*src) { case 0xcc: *dest++ = 0x69; *dest++ = 0x307; *dest++ = 0x300; dmax -= 3; break; case 0xcd: *dest++ = 0x69; *dest++ = 0x307; *dest++ = 0x301; dmax -= 3; break; case 0x128: *dest++ = 0x69; *dest++ = 0x307; *dest++ = 0x303; dmax -= 3; break; case 0x49: /* Add the dot with accented I's or J's */ case 0x4A: /* lt More_Above */ if (_is_lt_accented(*(src + 1))) { /*towfc_s(tmp, 4, 0x0130);*/ *dest++ = (*src + 0x20); *dest++ = 0x307; dmax -= 2; src++; break; } /* FALLTHRU */ case 0x12e: if (_is_lt_accented(*(src + 1))) { *dest++ = 0x12f; *dest++ = 0x307; dmax -= 2; src++; break; } /* FALLTHRU */ default: goto is_single; } src++; } else if (unlikely(is_tr_az)) { /* pair I-Dot with i for Turkish and Azeri. http://unicode.org/reports/tr21/tr21-5.html#SpecialCasing */ if (*src == 0x130) { *dest++ = 0x307; src++; dmax--; } /* Remove dot_above in the sequence I + dot_above, which will turn into i */ else if (*src == L'I' && *(src + 1) == 0x307) { *dest++ = L'i'; dmax--; src++; src++; } /* Unless an I is before a dot_above, it turns into a dotless i. */ else if (*src == 0x49 && *(src + 1) != 0x307 && *(src + 1) != 0x130) { *dest++ = 0x131; src++; dmax--; } else { is_single: (void)_towfc_single(dest, _dec_w16((wchar_t *)src)); src++; /* even if not found dest[0] still contains towlower */ dest++; dmax--; } } else { if (unlikely(dmax < 5)) goto too_small; #if 1 (void)_towfc_single(tmp, _dec_w16((wchar_t *)src)); src++; if (tmp[0] >= 0xc0) { cp = _dec_w16(tmp); /* needed? I thought only 0x1f80-0x1ff4 needs to be * decomposed */ c = _decomp_s(dest, dmax, cp, false); } else c = 0; if (!c) { *dest++ = tmp[0]; dmax--; } else { /* eg accents 101 => 61 304 */ dest += c; dmax -= c; } #else c = _towfc_single(dest, _dec_w16((wchar_t *)src)); if (c > 0) { dest += c; dmax -= c; } src++; #endif } } } /* write the length even if too small, so the client knows how much is needed */ if (lenp) *lenp = orig_dmax - dmax; if (unlikely(dmax <= 0)) { too_small: handle_werror(orig_dest, orig_dmax, "wcsfc_s: " "dmax too small", ESNOSPC); return (ESNOSPC); } #ifdef SAFECLIB_STR_NULL_SLACK memset(dest, 0, dmax * sizeof(wchar_t)); /*while (dmax) { *dest = '\0'; dmax--; dest++; }*/ #else *dest = 0; #endif return (EOK); }