/* Unicode character classification and properties.
Copyright (C) 2002, 2005-2016 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#ifndef _UNICTYPE_H
#define _UNICTYPE_H
#include "unitypes.h"
/* Get bool. */
#include <stdbool.h>
/* Get size_t. */
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/* ========================================================================= */
/* Field 1 of Unicode Character Database: Character name.
See "uniname.h". */
/* ========================================================================= */
/* Field 2 of Unicode Character Database: General category. */
/* Data type denoting a General category value. This is not just a bitmask,
but rather a bitmask and a pointer to the lookup table, so that programs
that use only the predefined bitmasks (i.e. don't combine bitmasks with &
and |) don't have a link-time dependency towards the big general table.
The GENERIC flag acts as the discriminant of the LOOKUP union: it tells
which of the two members is the valid one. */
typedef struct
{
/* The category bits, stored in a 31-bit wide field. */
uint32_t bitmask : 31;
/* Boolean flag stored in a 1-bit field; selects the valid LOOKUP member. */
/*bool*/ unsigned int generic : 1;
union
{
const void *table; /* when generic is 0 */
bool (*lookup_fn) (ucs4_t uc, uint32_t bitmask); /* when generic is 1 */
} lookup;
}
uc_general_category_t;
/* Bits and bit masks denoting General category values.
   Every two-letter category owns exactly one bit; every one-letter category
   (and LC) is spelled out below as the union of the bits of its members.
   UnicodeData-3.2.0.html says a 32-bit integer will always suffice to
   represent them.
   These bit masks can only be used with the uc_is_general_category_withtable
   function. */
enum
{
  /* L: letters; LC is the cased subset (Lu, Ll, Lt). */
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_LC = UC_CATEGORY_MASK_Lu | UC_CATEGORY_MASK_Ll
                        | UC_CATEGORY_MASK_Lt,
  UC_CATEGORY_MASK_L = UC_CATEGORY_MASK_LC | UC_CATEGORY_MASK_Lm
                       | UC_CATEGORY_MASK_Lo,

  /* M: marks. */
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_M = UC_CATEGORY_MASK_Mn | UC_CATEGORY_MASK_Mc
                       | UC_CATEGORY_MASK_Me,

  /* N: numbers. */
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_N = UC_CATEGORY_MASK_Nd | UC_CATEGORY_MASK_Nl
                       | UC_CATEGORY_MASK_No,

  /* P: punctuation. */
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_P = UC_CATEGORY_MASK_Pc | UC_CATEGORY_MASK_Pd
                       | UC_CATEGORY_MASK_Ps | UC_CATEGORY_MASK_Pe
                       | UC_CATEGORY_MASK_Pi | UC_CATEGORY_MASK_Pf
                       | UC_CATEGORY_MASK_Po,

  /* S: symbols. */
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_S = UC_CATEGORY_MASK_Sm | UC_CATEGORY_MASK_Sc
                       | UC_CATEGORY_MASK_Sk | UC_CATEGORY_MASK_So,

  /* Z: separators. */
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_Z = UC_CATEGORY_MASK_Zs | UC_CATEGORY_MASK_Zl
                       | UC_CATEGORY_MASK_Zp,

  /* C: other. */
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000,
  UC_CATEGORY_MASK_C = UC_CATEGORY_MASK_Cc | UC_CATEGORY_MASK_Cf
                       | UC_CATEGORY_MASK_Cs | UC_CATEGORY_MASK_Co
                       | UC_CATEGORY_MASK_Cn
};
/* Predefined General category values, one constant object per category.
The group headings below follow the alias names that this header defines
for these objects (UC_LETTER, UC_MARK, UC_NUMBER, ...). */
/* Letters (LC: cased letters). */
extern const uc_general_category_t UC_CATEGORY_L;
extern const uc_general_category_t UC_CATEGORY_LC;
extern const uc_general_category_t UC_CATEGORY_Lu;
extern const uc_general_category_t UC_CATEGORY_Ll;
extern const uc_general_category_t UC_CATEGORY_Lt;
extern const uc_general_category_t UC_CATEGORY_Lm;
extern const uc_general_category_t UC_CATEGORY_Lo;
/* Marks. */
extern const uc_general_category_t UC_CATEGORY_M;
extern const uc_general_category_t UC_CATEGORY_Mn;
extern const uc_general_category_t UC_CATEGORY_Mc;
extern const uc_general_category_t UC_CATEGORY_Me;
/* Numbers. */
extern const uc_general_category_t UC_CATEGORY_N;
extern const uc_general_category_t UC_CATEGORY_Nd;
extern const uc_general_category_t UC_CATEGORY_Nl;
extern const uc_general_category_t UC_CATEGORY_No;
/* Punctuation. */
extern const uc_general_category_t UC_CATEGORY_P;
extern const uc_general_category_t UC_CATEGORY_Pc;
extern const uc_general_category_t UC_CATEGORY_Pd;
extern const uc_general_category_t UC_CATEGORY_Ps;
extern const uc_general_category_t UC_CATEGORY_Pe;
extern const uc_general_category_t UC_CATEGORY_Pi;
extern const uc_general_category_t UC_CATEGORY_Pf;
extern const uc_general_category_t UC_CATEGORY_Po;
/* Symbols. */
extern const uc_general_category_t UC_CATEGORY_S;
extern const uc_general_category_t UC_CATEGORY_Sm;
extern const uc_general_category_t UC_CATEGORY_Sc;
extern const uc_general_category_t UC_CATEGORY_Sk;
extern const uc_general_category_t UC_CATEGORY_So;
/* Separators. */
extern const uc_general_category_t UC_CATEGORY_Z;
extern const uc_general_category_t UC_CATEGORY_Zs;
extern const uc_general_category_t UC_CATEGORY_Zl;
extern const uc_general_category_t UC_CATEGORY_Zp;
/* Other. */
extern const uc_general_category_t UC_CATEGORY_C;
extern const uc_general_category_t UC_CATEGORY_Cc;
extern const uc_general_category_t UC_CATEGORY_Cf;
extern const uc_general_category_t UC_CATEGORY_Cs;
extern const uc_general_category_t UC_CATEGORY_Co;
extern const uc_general_category_t UC_CATEGORY_Cn;
/* Non-public: not part of the documented API; do not use in applications. */
extern const uc_general_category_t _UC_CATEGORY_NONE;
/* Alias names for predefined General category values.
Each alias is a descriptive name that expands to the UC_CATEGORY_xx object
with the corresponding abbreviated name. */
#define UC_LETTER UC_CATEGORY_L
#define UC_CASED_LETTER UC_CATEGORY_LC
#define UC_UPPERCASE_LETTER UC_CATEGORY_Lu
#define UC_LOWERCASE_LETTER UC_CATEGORY_Ll
#define UC_TITLECASE_LETTER UC_CATEGORY_Lt
#define UC_MODIFIER_LETTER UC_CATEGORY_Lm
#define UC_OTHER_LETTER UC_CATEGORY_Lo
#define UC_MARK UC_CATEGORY_M
#define UC_NON_SPACING_MARK UC_CATEGORY_Mn
#define UC_COMBINING_SPACING_MARK UC_CATEGORY_Mc
#define UC_ENCLOSING_MARK UC_CATEGORY_Me
#define UC_NUMBER UC_CATEGORY_N
#define UC_DECIMAL_DIGIT_NUMBER UC_CATEGORY_Nd
#define UC_LETTER_NUMBER UC_CATEGORY_Nl
#define UC_OTHER_NUMBER UC_CATEGORY_No
#define UC_PUNCTUATION UC_CATEGORY_P
#define UC_CONNECTOR_PUNCTUATION UC_CATEGORY_Pc
#define UC_DASH_PUNCTUATION UC_CATEGORY_Pd
#define UC_OPEN_PUNCTUATION UC_CATEGORY_Ps /* a.k.a. UC_START_PUNCTUATION */
#define UC_CLOSE_PUNCTUATION UC_CATEGORY_Pe /* a.k.a. UC_END_PUNCTUATION */
#define UC_INITIAL_QUOTE_PUNCTUATION UC_CATEGORY_Pi
#define UC_FINAL_QUOTE_PUNCTUATION UC_CATEGORY_Pf
#define UC_OTHER_PUNCTUATION UC_CATEGORY_Po
#define UC_SYMBOL UC_CATEGORY_S
#define UC_MATH_SYMBOL UC_CATEGORY_Sm
#define UC_CURRENCY_SYMBOL UC_CATEGORY_Sc
#define UC_MODIFIER_SYMBOL UC_CATEGORY_Sk
#define UC_OTHER_SYMBOL UC_CATEGORY_So
#define UC_SEPARATOR UC_CATEGORY_Z
#define UC_SPACE_SEPARATOR UC_CATEGORY_Zs
#define UC_LINE_SEPARATOR UC_CATEGORY_Zl
#define UC_PARAGRAPH_SEPARATOR UC_CATEGORY_Zp
#define UC_OTHER UC_CATEGORY_C
#define UC_CONTROL UC_CATEGORY_Cc
#define UC_FORMAT UC_CATEGORY_Cf
#define UC_SURROGATE UC_CATEGORY_Cs /* all of them are invalid characters */
#define UC_PRIVATE_USE UC_CATEGORY_Co
#define UC_UNASSIGNED UC_CATEGORY_Cn /* some of them are invalid characters */
/* Return the union of two general categories.
This corresponds to the union of the two sets of characters. */
extern uc_general_category_t
uc_general_category_or (uc_general_category_t category1,
uc_general_category_t category2);
/* Return the intersection of two general categories as bit masks.
Beware: this does NOT correspond to the intersection of the two sets of
characters. */
extern uc_general_category_t
uc_general_category_and (uc_general_category_t category1,
uc_general_category_t category2);
/* Return the intersection of a general category with the complement of a
second general category, as bit masks.
Beware: this does NOT correspond to the intersection with the complement,
when viewing the categories as sets of characters. */
extern uc_general_category_t
uc_general_category_and_not (uc_general_category_t category1,
uc_general_category_t category2);
/* Return the name of a general category. */
extern const char *
uc_general_category_name (uc_general_category_t category)
_UC_ATTRIBUTE_PURE;
/* Return the long name of a general category. */
extern const char *
uc_general_category_long_name (uc_general_category_t category)
_UC_ATTRIBUTE_PURE;
/* Return the general category given by name, e.g. "Lu", or by long name,
e.g. "Uppercase Letter".
NOTE(review): the result for an unknown CATEGORY_NAME is not documented
here - confirm in the implementation. */
extern uc_general_category_t
uc_general_category_byname (const char *category_name)
_UC_ATTRIBUTE_PURE;
/* Return the general category of a Unicode character. */
extern uc_general_category_t
uc_general_category (ucs4_t uc)
_UC_ATTRIBUTE_PURE;
/* Test whether a Unicode character belongs to a given category.
The CATEGORY argument can be the combination of several predefined
general categories. */
extern bool
uc_is_general_category (ucs4_t uc, uc_general_category_t category)
_UC_ATTRIBUTE_PURE;
/* Likewise test whether a Unicode character belongs to a category, but
with the category given as a plain BITMASK (see the UC_CATEGORY_MASK_*
constants). This function uses a big table comprising all categories. */
extern bool
uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Field 3 of Unicode Character Database: Canonical combining class. */
/* Symbolic names for canonical combining class values.
   uc_combining_class returns a value in the range 0..255; the meaning of
   these values is described in UCD.html. The list here is not definitive;
   more values can be added in future versions. */
enum
{
  UC_CCC_NR   =   0,    /* Not Reordered */
  UC_CCC_OV   =   1,    /* Overlay */
  UC_CCC_NK   =   7,    /* Nukta */
  UC_CCC_KV   =   8,    /* Kana Voicing */
  UC_CCC_VR   =   9,    /* Virama */
  UC_CCC_ATBL = 200,    /* Attached Below Left */
  UC_CCC_ATB  = 202,    /* Attached Below */
  UC_CCC_ATA  = 214,    /* Attached Above */
  UC_CCC_ATAR = 216,    /* Attached Above Right */
  UC_CCC_BL   = 218,    /* Below Left */
  UC_CCC_B    = 220,    /* Below */
  UC_CCC_BR   = 222,    /* Below Right */
  UC_CCC_L    = 224,    /* Left */
  UC_CCC_R    = 226,    /* Right */
  UC_CCC_AL   = 228,    /* Above Left */
  UC_CCC_A    = 230,    /* Above */
  UC_CCC_AR   = 232,    /* Above Right */
  UC_CCC_DB   = 233,    /* Double Below */
  UC_CCC_DA   = 234,    /* Double Above */
  UC_CCC_IS   = 240     /* Iota Subscript */
};
/* Return the canonical combining class of a Unicode character.
The result is in the range 0..255; some values have UC_CCC_* names. */
extern int
uc_combining_class (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Return the name of a canonical combining class.
NOTE(review): the result for a CCC value without a name is not documented
here - confirm in the implementation. */
extern const char *
uc_combining_class_name (int ccc)
_UC_ATTRIBUTE_CONST;
/* Return the long name of a canonical combining class. */
extern const char *
uc_combining_class_long_name (int ccc)
_UC_ATTRIBUTE_CONST;
/* Return the canonical combining class given by name, e.g. "BL", or by long
name, e.g. "Below Left".
NOTE(review): the result for an unknown CCC_NAME is not documented here -
confirm in the implementation. */
extern int
uc_combining_class_byname (const char *ccc_name)
_UC_ATTRIBUTE_PURE;
/* ========================================================================= */
/* Field 4 of Unicode Character Database: Bidi class.
Before Unicode 4.0, this field was called "Bidirectional category". */
/* Possible bidi class values. The numbering is consecutive, starting at 0;
   it is spelled out explicitly so that the value of each constant can be
   read off directly. */
enum
{
  UC_BIDI_L   =  0,     /* Left-to-Right */
  UC_BIDI_LRE =  1,     /* Left-to-Right Embedding */
  UC_BIDI_LRO =  2,     /* Left-to-Right Override */
  UC_BIDI_R   =  3,     /* Right-to-Left */
  UC_BIDI_AL  =  4,     /* Right-to-Left Arabic */
  UC_BIDI_RLE =  5,     /* Right-to-Left Embedding */
  UC_BIDI_RLO =  6,     /* Right-to-Left Override */
  UC_BIDI_PDF =  7,     /* Pop Directional Format */
  UC_BIDI_EN  =  8,     /* European Number */
  UC_BIDI_ES  =  9,     /* European Number Separator */
  UC_BIDI_ET  = 10,     /* European Number Terminator */
  UC_BIDI_AN  = 11,     /* Arabic Number */
  UC_BIDI_CS  = 12,     /* Common Number Separator */
  UC_BIDI_NSM = 13,     /* Non-Spacing Mark */
  UC_BIDI_BN  = 14,     /* Boundary Neutral */
  UC_BIDI_B   = 15,     /* Paragraph Separator */
  UC_BIDI_S   = 16,     /* Segment Separator */
  UC_BIDI_WS  = 17,     /* Whitespace */
  UC_BIDI_ON  = 18,     /* Other Neutral */
  UC_BIDI_LRI = 19,     /* Left-to-Right Isolate */
  UC_BIDI_RLI = 20,     /* Right-to-Left Isolate */
  UC_BIDI_FSI = 21,     /* First Strong Isolate */
  UC_BIDI_PDI = 22      /* Pop Directional Isolate */
};
/* Return the name of a bidi class.
BIDI_CLASS is one of the UC_BIDI_* values. */
extern const char *
uc_bidi_class_name (int bidi_class)
_UC_ATTRIBUTE_CONST;
/* Same as uc_bidi_class_name; obsolete function name. */
extern const char *
uc_bidi_category_name (int category)
_UC_ATTRIBUTE_CONST;
/* Return the long name of a bidi class. */
extern const char *
uc_bidi_class_long_name (int bidi_class)
_UC_ATTRIBUTE_CONST;
/* Return the bidi class given by name, e.g. "LRE", or by long name, e.g.
"Left-to-Right Embedding".
NOTE(review): the result for an unknown name is not documented here -
confirm in the implementation. */
extern int
uc_bidi_class_byname (const char *bidi_class_name)
_UC_ATTRIBUTE_PURE;
/* Same as uc_bidi_class_byname; obsolete function name. */
extern int
uc_bidi_category_byname (const char *category_name)
_UC_ATTRIBUTE_PURE;
/* Return the bidi class of a Unicode character. */
extern int
uc_bidi_class (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Same as uc_bidi_class; obsolete function name. */
extern int
uc_bidi_category (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test whether a Unicode character belongs to a given bidi class. */
extern bool
uc_is_bidi_class (ucs4_t uc, int bidi_class)
_UC_ATTRIBUTE_CONST;
/* Same as uc_is_bidi_class; obsolete function name. */
extern bool
uc_is_bidi_category (ucs4_t uc, int category)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Field 5 of Unicode Character Database: Character decomposition mapping.
See "uninorm.h". */
/* ========================================================================= */
/* Field 6 of Unicode Character Database: Decimal digit value. */
/* Return the decimal digit value of a Unicode character.
NOTE(review): the result for a character that has no decimal digit value
is not documented here - confirm in the implementation. */
extern int
uc_decimal_value (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Field 7 of Unicode Character Database: Digit value. */
/* Return the digit value of a Unicode character.
NOTE(review): the result for a character that has no digit value is not
documented here - confirm in the implementation. */
extern int
uc_digit_value (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Field 8 of Unicode Character Database: Numeric value. */
/* Result type of uc_numeric_value: a fraction, given as a pair of ints. */
typedef struct
{
int numerator;
int denominator;
}
uc_fraction_t;
/* Return the numeric value of a Unicode character, as a fraction.
NOTE(review): the result for a character that has no numeric value is not
documented here - confirm in the implementation. */
extern uc_fraction_t
uc_numeric_value (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Field 9 of Unicode Character Database: Mirrored. */
/* Return the mirrored character of a Unicode character UC in *PUC.
NOTE(review): the meaning of the bool result is not documented here;
presumably it tells whether UC has a mirrored counterpart - confirm in
the implementation. */
extern bool
uc_mirror_char (ucs4_t uc, ucs4_t *puc);
/* ========================================================================= */
/* Field 10 of Unicode Character Database: Unicode 1.0 Name.
Not available in this library. */
/* ========================================================================= */
/* Field 11 of Unicode Character Database: ISO 10646 comment.
Not available in this library. */
/* ========================================================================= */
/* Field 12, 13, 14 of Unicode Character Database: Uppercase mapping,
lowercase mapping, titlecase mapping. See "unicase.h". */
/* ========================================================================= */
/* Field 2 of the file ArabicShaping.txt in the Unicode Character Database. */
/* Possible joining types. The numbering is consecutive, starting at 0; it
   is spelled out explicitly so that the value of each constant can be read
   off directly. */
enum
{
  UC_JOINING_TYPE_U = 0,        /* Non_Joining */
  UC_JOINING_TYPE_T = 1,        /* Transparent */
  UC_JOINING_TYPE_C = 2,        /* Join_Causing */
  UC_JOINING_TYPE_L = 3,        /* Left_Joining */
  UC_JOINING_TYPE_R = 4,        /* Right_Joining */
  UC_JOINING_TYPE_D = 5         /* Dual_Joining */
};
/* Return the name of a joining type.
JOINING_TYPE is one of the UC_JOINING_TYPE_* values. */
extern const char *
uc_joining_type_name (int joining_type)
_UC_ATTRIBUTE_CONST;
/* Return the long name of a joining type. */
extern const char *
uc_joining_type_long_name (int joining_type)
_UC_ATTRIBUTE_CONST;
/* Return the joining type given by name, e.g. "D", or by long name, e.g.
"Dual Joining".
NOTE(review): the result for an unknown name is not documented here -
confirm in the implementation. */
extern int
uc_joining_type_byname (const char *joining_type_name)
_UC_ATTRIBUTE_PURE;
/* Return the joining type of a Unicode character. */
extern int
uc_joining_type (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Field 3 of the file ArabicShaping.txt in the Unicode Character Database. */
/* Possible joining groups.
The comment next to each constant gives the joining group's name.
This enumeration may be extended in the future. */
enum
{
UC_JOINING_GROUP_NONE, /* No_Joining_Group */
UC_JOINING_GROUP_AIN, /* Ain */
UC_JOINING_GROUP_ALAPH, /* Alaph */
UC_JOINING_GROUP_ALEF, /* Alef */
UC_JOINING_GROUP_BEH, /* Beh */
UC_JOINING_GROUP_BETH, /* Beth */
UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
UC_JOINING_GROUP_DAL, /* Dal */
UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
UC_JOINING_GROUP_E, /* E */
UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
UC_JOINING_GROUP_FE, /* Fe */
UC_JOINING_GROUP_FEH, /* Feh */
UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
UC_JOINING_GROUP_GAF, /* Gaf */
UC_JOINING_GROUP_GAMAL, /* Gamal */
UC_JOINING_GROUP_HAH, /* Hah */
UC_JOINING_GROUP_HE, /* He */
UC_JOINING_GROUP_HEH, /* Heh */
UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
UC_JOINING_GROUP_HETH, /* Heth */
UC_JOINING_GROUP_KAF, /* Kaf */
UC_JOINING_GROUP_KAPH, /* Kaph */
UC_JOINING_GROUP_KHAPH, /* Khaph */
UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
UC_JOINING_GROUP_LAM, /* Lam */
UC_JOINING_GROUP_LAMADH, /* Lamadh */
UC_JOINING_GROUP_MEEM, /* Meem */
UC_JOINING_GROUP_MIM, /* Mim */
UC_JOINING_GROUP_NOON, /* Noon */
UC_JOINING_GROUP_NUN, /* Nun */
UC_JOINING_GROUP_NYA, /* Nya */
UC_JOINING_GROUP_PE, /* Pe */
UC_JOINING_GROUP_QAF, /* Qaf */
UC_JOINING_GROUP_QAPH, /* Qaph */
UC_JOINING_GROUP_REH, /* Reh */
UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
UC_JOINING_GROUP_SAD, /* Sad */
UC_JOINING_GROUP_SADHE, /* Sadhe */
UC_JOINING_GROUP_SEEN, /* Seen */
UC_JOINING_GROUP_SEMKATH, /* Semkath */
UC_JOINING_GROUP_SHIN, /* Shin */
UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
UC_JOINING_GROUP_TAH, /* Tah */
UC_JOINING_GROUP_TAW, /* Taw */
UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
UC_JOINING_GROUP_TETH, /* Teth */
UC_JOINING_GROUP_WAW, /* Waw */
UC_JOINING_GROUP_YEH, /* Yeh */
UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
UC_JOINING_GROUP_YUDH, /* Yudh */
UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
UC_JOINING_GROUP_ZAIN, /* Zain */
UC_JOINING_GROUP_ZHAIN, /* Zhain */
UC_JOINING_GROUP_ROHINGYA_YEH, /* Rohingya_Yeh */
UC_JOINING_GROUP_STRAIGHT_WAW, /* Straight_Waw */
UC_JOINING_GROUP_MANICHAEAN_ALEPH, /* Manichaean_Aleph */
UC_JOINING_GROUP_MANICHAEAN_BETH, /* Manichaean_Beth */
UC_JOINING_GROUP_MANICHAEAN_GIMEL, /* Manichaean_Gimel */
UC_JOINING_GROUP_MANICHAEAN_DALETH, /* Manichaean_Daleth */
UC_JOINING_GROUP_MANICHAEAN_WAW, /* Manichaean_Waw */
UC_JOINING_GROUP_MANICHAEAN_ZAYIN, /* Manichaean_Zayin */
UC_JOINING_GROUP_MANICHAEAN_HETH, /* Manichaean_Heth */
UC_JOINING_GROUP_MANICHAEAN_TETH, /* Manichaean_Teth */
UC_JOINING_GROUP_MANICHAEAN_YODH, /* Manichaean_Yodh */
UC_JOINING_GROUP_MANICHAEAN_KAPH, /* Manichaean_Kaph */
UC_JOINING_GROUP_MANICHAEAN_LAMEDH, /* Manichaean_Lamedh */
UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, /* Manichaean_Dhamedh */
UC_JOINING_GROUP_MANICHAEAN_THAMEDH, /* Manichaean_Thamedh */
UC_JOINING_GROUP_MANICHAEAN_MEM, /* Manichaean_Mem */
UC_JOINING_GROUP_MANICHAEAN_NUN, /* Manichaean_Nun */
UC_JOINING_GROUP_MANICHAEAN_SAMEKH, /* Manichaean_Samekh */
UC_JOINING_GROUP_MANICHAEAN_AYIN, /* Manichaean_Ayin */
UC_JOINING_GROUP_MANICHAEAN_PE, /* Manichaean_Pe */
UC_JOINING_GROUP_MANICHAEAN_SADHE, /* Manichaean_Sadhe */
UC_JOINING_GROUP_MANICHAEAN_QOPH, /* Manichaean_Qoph */
UC_JOINING_GROUP_MANICHAEAN_RESH, /* Manichaean_Resh */
UC_JOINING_GROUP_MANICHAEAN_TAW, /* Manichaean_Taw */
UC_JOINING_GROUP_MANICHAEAN_ONE, /* Manichaean_One */
UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
UC_JOINING_GROUP_MANICHAEAN_HUNDRED /* Manichaean_Hundred */
};
/* Return the name of a joining group.
JOINING_GROUP is one of the UC_JOINING_GROUP_* values. */
extern const char *
uc_joining_group_name (int joining_group)
_UC_ATTRIBUTE_CONST;
/* Return the joining group given by name, e.g. "Teh_Marbuta".
NOTE(review): the result for an unknown name is not documented here -
confirm in the implementation. */
extern int
uc_joining_group_byname (const char *joining_group_name)
_UC_ATTRIBUTE_PURE;
/* Return the joining group of a Unicode character. */
extern int
uc_joining_group (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Common API for properties. */
/* Data type denoting a property. This is not just a number, but rather a
pointer to the test functions, so that programs that use only few of the
properties don't have a link-time dependency towards all the tables. */
typedef struct
{
/* Predicate taking a Unicode character and returning a bool.
A null pointer here makes uc_property_is_valid yield false. */
bool (*test_fn) (ucs4_t uc);
}
uc_property_t;
/* Predefined properties, one constant uc_property_t object per property,
grouped by topic. */
/* General. */
extern const uc_property_t UC_PROPERTY_WHITE_SPACE;
extern const uc_property_t UC_PROPERTY_ALPHABETIC;
extern const uc_property_t UC_PROPERTY_OTHER_ALPHABETIC;
extern const uc_property_t UC_PROPERTY_NOT_A_CHARACTER;
extern const uc_property_t UC_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT;
extern const uc_property_t UC_PROPERTY_OTHER_DEFAULT_IGNORABLE_CODE_POINT;
extern const uc_property_t UC_PROPERTY_DEPRECATED;
extern const uc_property_t UC_PROPERTY_LOGICAL_ORDER_EXCEPTION;
extern const uc_property_t UC_PROPERTY_VARIATION_SELECTOR;
extern const uc_property_t UC_PROPERTY_PRIVATE_USE;
extern const uc_property_t UC_PROPERTY_UNASSIGNED_CODE_VALUE;
/* Case. */
extern const uc_property_t UC_PROPERTY_UPPERCASE;
extern const uc_property_t UC_PROPERTY_OTHER_UPPERCASE;
extern const uc_property_t UC_PROPERTY_LOWERCASE;
extern const uc_property_t UC_PROPERTY_OTHER_LOWERCASE;
extern const uc_property_t UC_PROPERTY_TITLECASE;
extern const uc_property_t UC_PROPERTY_CASED;
extern const uc_property_t UC_PROPERTY_CASE_IGNORABLE;
extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_LOWERCASED;
extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_UPPERCASED;
extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_TITLECASED;
extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEFOLDED;
extern const uc_property_t UC_PROPERTY_CHANGES_WHEN_CASEMAPPED;
extern const uc_property_t UC_PROPERTY_SOFT_DOTTED;
/* Identifiers. */
extern const uc_property_t UC_PROPERTY_ID_START;
extern const uc_property_t UC_PROPERTY_OTHER_ID_START;
extern const uc_property_t UC_PROPERTY_ID_CONTINUE;
extern const uc_property_t UC_PROPERTY_OTHER_ID_CONTINUE;
extern const uc_property_t UC_PROPERTY_XID_START;
extern const uc_property_t UC_PROPERTY_XID_CONTINUE;
extern const uc_property_t UC_PROPERTY_PATTERN_WHITE_SPACE;
extern const uc_property_t UC_PROPERTY_PATTERN_SYNTAX;
/* Shaping and rendering. */
extern const uc_property_t UC_PROPERTY_JOIN_CONTROL;
extern const uc_property_t UC_PROPERTY_GRAPHEME_BASE;
extern const uc_property_t UC_PROPERTY_GRAPHEME_EXTEND;
extern const uc_property_t UC_PROPERTY_OTHER_GRAPHEME_EXTEND;
extern const uc_property_t UC_PROPERTY_GRAPHEME_LINK;
/* Bidi. */
extern const uc_property_t UC_PROPERTY_BIDI_CONTROL;
extern const uc_property_t UC_PROPERTY_BIDI_LEFT_TO_RIGHT;
extern const uc_property_t UC_PROPERTY_BIDI_HEBREW_RIGHT_TO_LEFT;
extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_RIGHT_TO_LEFT;
extern const uc_property_t UC_PROPERTY_BIDI_EUROPEAN_DIGIT;
extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_SEPARATOR;
extern const uc_property_t UC_PROPERTY_BIDI_EUR_NUM_TERMINATOR;
extern const uc_property_t UC_PROPERTY_BIDI_ARABIC_DIGIT;
extern const uc_property_t UC_PROPERTY_BIDI_COMMON_SEPARATOR;
extern const uc_property_t UC_PROPERTY_BIDI_BLOCK_SEPARATOR;
extern const uc_property_t UC_PROPERTY_BIDI_SEGMENT_SEPARATOR;
extern const uc_property_t UC_PROPERTY_BIDI_WHITESPACE;
extern const uc_property_t UC_PROPERTY_BIDI_NON_SPACING_MARK;
extern const uc_property_t UC_PROPERTY_BIDI_BOUNDARY_NEUTRAL;
extern const uc_property_t UC_PROPERTY_BIDI_PDF;
extern const uc_property_t UC_PROPERTY_BIDI_EMBEDDING_OR_OVERRIDE;
extern const uc_property_t UC_PROPERTY_BIDI_OTHER_NEUTRAL;
/* Numeric. */
extern const uc_property_t UC_PROPERTY_HEX_DIGIT;
extern const uc_property_t UC_PROPERTY_ASCII_HEX_DIGIT;
/* CJK. */
extern const uc_property_t UC_PROPERTY_IDEOGRAPHIC;
extern const uc_property_t UC_PROPERTY_UNIFIED_IDEOGRAPH;
extern const uc_property_t UC_PROPERTY_RADICAL;
extern const uc_property_t UC_PROPERTY_IDS_BINARY_OPERATOR;
extern const uc_property_t UC_PROPERTY_IDS_TRINARY_OPERATOR;
/* Misc. */
extern const uc_property_t UC_PROPERTY_ZERO_WIDTH;
extern const uc_property_t UC_PROPERTY_SPACE;
extern const uc_property_t UC_PROPERTY_NON_BREAK;
extern const uc_property_t UC_PROPERTY_ISO_CONTROL;
extern const uc_property_t UC_PROPERTY_FORMAT_CONTROL;
extern const uc_property_t UC_PROPERTY_DASH;
extern const uc_property_t UC_PROPERTY_HYPHEN;
extern const uc_property_t UC_PROPERTY_PUNCTUATION;
extern const uc_property_t UC_PROPERTY_LINE_SEPARATOR;
extern const uc_property_t UC_PROPERTY_PARAGRAPH_SEPARATOR;
extern const uc_property_t UC_PROPERTY_QUOTATION_MARK;
extern const uc_property_t UC_PROPERTY_SENTENCE_TERMINAL;
extern const uc_property_t UC_PROPERTY_TERMINAL_PUNCTUATION;
extern const uc_property_t UC_PROPERTY_CURRENCY_SYMBOL;
extern const uc_property_t UC_PROPERTY_MATH;
extern const uc_property_t UC_PROPERTY_OTHER_MATH;
extern const uc_property_t UC_PROPERTY_PAIRED_PUNCTUATION;
extern const uc_property_t UC_PROPERTY_LEFT_OF_PAIR;
extern const uc_property_t UC_PROPERTY_COMBINING;
extern const uc_property_t UC_PROPERTY_COMPOSITE;
extern const uc_property_t UC_PROPERTY_DECIMAL_DIGIT;
extern const uc_property_t UC_PROPERTY_NUMERIC;
extern const uc_property_t UC_PROPERTY_DIACRITIC;
extern const uc_property_t UC_PROPERTY_EXTENDER;
extern const uc_property_t UC_PROPERTY_IGNORABLE_CONTROL;
/* Return the property given by name, e.g. "White space".
NOTE(review): presumably an unknown PROPERTY_NAME yields a value for which
uc_property_is_valid is false - confirm in the implementation. */
extern uc_property_t
uc_property_byname (const char *property_name);
/* Test whether a property is valid, i.e. whether its test function pointer
is non-null. PROPERTY is evaluated exactly once. */
#define uc_property_is_valid(property) ((property).test_fn != NULL)
/* Test whether a Unicode character has a given property. */
extern bool
uc_is_property (ucs4_t uc, uc_property_t property);
/* Test functions for the individual properties: each one takes a Unicode
character and returns a bool. They are declared in the same order and
grouping as the UC_PROPERTY_* objects.
NOTE(review): presumably each function tests the property of the
like-named UC_PROPERTY_* object - confirm in the implementation. */
/* General. */
extern bool uc_is_property_white_space (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_alphabetic (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_alphabetic (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_not_a_character (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_default_ignorable_code_point (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_default_ignorable_code_point (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_deprecated (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_logical_order_exception (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_variation_selector (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_private_use (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_unassigned_code_value (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Case. */
extern bool uc_is_property_uppercase (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_uppercase (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_lowercase (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_lowercase (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_titlecase (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_cased (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_case_ignorable (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_changes_when_lowercased (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_changes_when_uppercased (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_changes_when_titlecased (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_changes_when_casefolded (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_changes_when_casemapped (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_soft_dotted (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Identifiers. */
extern bool uc_is_property_id_start (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_id_start (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_id_continue (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_id_continue (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_xid_start (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_xid_continue (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_pattern_white_space (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_pattern_syntax (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Shaping and rendering. */
extern bool uc_is_property_join_control (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_grapheme_base (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_grapheme_extend (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_grapheme_extend (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_grapheme_link (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Bidi. */
extern bool uc_is_property_bidi_control (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_left_to_right (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_hebrew_right_to_left (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_arabic_right_to_left (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_european_digit (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_eur_num_separator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_eur_num_terminator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_arabic_digit (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_common_separator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_block_separator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_segment_separator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_whitespace (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_non_spacing_mark (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_boundary_neutral (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_pdf (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_embedding_or_override (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_bidi_other_neutral (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Numeric. */
extern bool uc_is_property_hex_digit (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_ascii_hex_digit (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* CJK. */
extern bool uc_is_property_ideographic (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_unified_ideograph (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_radical (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_ids_binary_operator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_ids_trinary_operator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Misc. */
extern bool uc_is_property_zero_width (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_space (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_non_break (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_iso_control (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_format_control (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_dash (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_hyphen (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_punctuation (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_line_separator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_paragraph_separator (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_quotation_mark (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_sentence_terminal (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_terminal_punctuation (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_currency_symbol (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_math (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_other_math (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_paired_punctuation (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_left_of_pair (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_combining (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_composite (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_decimal_digit (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_numeric (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_diacritic (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_extender (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
extern bool uc_is_property_ignorable_control (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Subdivision of the Unicode characters into scripts. */
/* Element type of the interval table referenced by uc_script_t: a code
point value plus two one-bit flags.
NOTE(review): presumably START and END tell whether CODE is the first
and/or the last code point of an interval - confirm in the implementation. */
typedef struct
{
unsigned int code : 21; /* 21 bits wide, i.e. values up to 0x1FFFFF */
unsigned int start : 1;
unsigned int end : 1;
}
uc_interval_t;
/* Description of one script.  Objects of this type are only ever handed out
   through pointers to const by the functions declared below.  */
typedef struct
{
unsigned int nintervals; /* presumably the number of elements in 'intervals' -- confirm */
const uc_interval_t *intervals; /* read-only table of uc_interval_t entries */
const char *name; /* script name, e.g. "HAN" */
}
uc_script_t;
/* Return the script of a Unicode character.
   UC is the character to look up.  The result is a pointer to a read-only
   uc_script_t.
   NOTE(review): what is returned for a character that belongs to no script
   is not visible in this header -- presumably NULL; confirm against the
   implementation.  */
extern const uc_script_t *
uc_script (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Return the script given by name, e.g. "HAN".
   SCRIPT_NAME is the name to look up; it is only read (const char *).
   The result is a pointer to a read-only uc_script_t.
   NOTE(review): the result for an unknown name (presumably NULL) and the
   exact matching rules for SCRIPT_NAME are not visible here -- confirm.
   Declared with _UC_ATTRIBUTE_PURE rather than _UC_ATTRIBUTE_CONST,
   presumably because the result depends on the memory SCRIPT_NAME points
   to.  */
extern const uc_script_t *
uc_script_byname (const char *script_name)
_UC_ATTRIBUTE_PURE;
/* Test whether a Unicode character belongs to a given script.
   UC is the character to test; SCRIPT is the script to test against and is
   only read (pointer to const).  Returns true if UC belongs to SCRIPT.
   NOTE(review): whether SCRIPT may be NULL is not visible here -- confirm
   before passing the unchecked result of a lookup.  */
extern bool
uc_is_script (ucs4_t uc, const uc_script_t *script)
_UC_ATTRIBUTE_PURE;
/* Get the list of all scripts.
   Nothing is returned directly; the result is delivered through the two
   pointer parameters.  Presumably *SCRIPTS receives a pointer to a
   read-only array of uc_script_t and *COUNT the number of its elements --
   confirm against the implementation.
   Unlike the neighbouring declarations, this one carries no
   _UC_ATTRIBUTE_* annotation.  */
extern void
uc_all_scripts (const uc_script_t **scripts, size_t *count);
/* ========================================================================= */
/* Subdivision of the Unicode character range into blocks. */
/* Description of one block of the Unicode character range.  */
typedef struct
{
ucs4_t start; /* presumably the first character of the block -- confirm */
ucs4_t end; /* presumably the last character of the block; whether inclusive is not visible here -- confirm */
const char *name; /* name of the block */
}
uc_block_t;
/* Return the block a character belongs to.
   UC is the character to look up.  The result is a pointer to a read-only
   uc_block_t.
   NOTE(review): what is returned for a character outside every block is
   not visible in this header -- presumably NULL; confirm.  */
extern const uc_block_t *
uc_block (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test whether a Unicode character belongs to a given block.
   UC is the character to test; BLOCK is the block to test against and is
   only read (pointer to const).  Returns true if UC belongs to BLOCK.
   NOTE(review): whether BLOCK may be NULL is not visible here -- confirm.  */
extern bool
uc_is_block (ucs4_t uc, const uc_block_t *block)
_UC_ATTRIBUTE_PURE;
/* Get the list of all blocks.
   Nothing is returned directly; the result is delivered through the two
   pointer parameters.  Presumably *BLOCKS receives a pointer to a
   read-only array of uc_block_t and *COUNT the number of its elements --
   confirm against the implementation.
   Unlike the neighbouring declarations, this one carries no
   _UC_ATTRIBUTE_* annotation.  */
extern void
uc_all_blocks (const uc_block_t **blocks, size_t *count);
/* ========================================================================= */
/* Properties taken from language standards. */
/* Test whether a Unicode character is considered whitespace in ISO C 99.
   UC is the character to test; returns true if it is.  */
extern bool
uc_is_c_whitespace (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test whether a Unicode character is considered whitespace in Java.
   UC is the character to test; returns true if it is.  */
extern bool
uc_is_java_whitespace (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Categories of a character with respect to an identifier syntax.
   Presumably these are the values returned by uc_c_ident_category and
   uc_java_ident_category, which are declared as returning int.
   The enumerators have no explicit initializers, so they take the values
   0, 1, 2, 3 in the order listed.  */
enum
{
UC_IDENTIFIER_START, /* valid as first or subsequent character */
UC_IDENTIFIER_VALID, /* valid as subsequent character only */
UC_IDENTIFIER_INVALID, /* not valid */
UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* Return the categorization of a Unicode character w.r.t. the ISO C 99
identifier syntax.
   UC is the character to categorize.  The result is an int, presumably one
   of the UC_IDENTIFIER_* values; UC_IDENTIFIER_IGNORABLE is documented as
   Java only.  */
extern int
uc_c_ident_category (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Return the categorization of a Unicode character w.r.t. the Java
identifier syntax.
   UC is the character to categorize.  The result is an int, presumably one
   of the UC_IDENTIFIER_* values, including UC_IDENTIFIER_IGNORABLE.  */
extern int
uc_java_ident_category (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
/* Like ISO C <ctype.h> and <wctype.h>. These functions are deprecated,
because this set of functions was designed with ASCII in mind and cannot
reflect the more diverse reality of the Unicode character set. But they
can be a quick-and-dirty porting aid when migrating from wchar_t APIs
to Unicode strings. */
/* Test for any character for which 'uc_is_alpha' or 'uc_is_digit' is true.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.  */
extern bool
uc_is_alnum (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any character for which 'uc_is_upper' or 'uc_is_lower' is true,
or any character that is one of a locale-specific set of characters for
which none of 'uc_is_cntrl', 'uc_is_digit', 'uc_is_punct', or 'uc_is_space'
is true.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.
   NOTE(review): the "locale-specific" wording appears to be carried over
   from the ISO C description; this function takes no locale argument.  */
extern bool
uc_is_alpha (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any control character.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.  */
extern bool
uc_is_cntrl (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any character that corresponds to a decimal-digit character.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.  */
extern bool
uc_is_digit (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any character for which 'uc_is_print' is true and 'uc_is_space'
is false.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.  */
extern bool
uc_is_graph (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any character that corresponds to a lowercase letter or is one
of a locale-specific set of characters for which none of 'uc_is_cntrl',
'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.
   NOTE(review): the "locale-specific" wording appears to be carried over
   from the ISO C description; this function takes no locale argument.  */
extern bool
uc_is_lower (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any printing character.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.  */
extern bool
uc_is_print (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any printing character that is one of a locale-specific set of
characters for which neither 'uc_is_space' nor 'uc_is_alnum' is true.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.
   NOTE(review): the "locale-specific" wording appears to be carried over
   from the ISO C description; this function takes no locale argument.  */
extern bool
uc_is_punct (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any character that corresponds to a locale-specific set of
characters for which none of 'uc_is_alnum', 'uc_is_graph', or 'uc_is_punct'
is true.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.
   NOTE(review): the "locale-specific" wording appears to be carried over
   from the ISO C description; this function takes no locale argument.  */
extern bool
uc_is_space (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any character that corresponds to an uppercase letter or is one
of a locale-specific set of characters for which none of 'uc_is_cntrl',
'uc_is_digit', 'uc_is_punct', or 'uc_is_space' is true.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.
   NOTE(review): the "locale-specific" wording appears to be carried over
   from the ISO C description; this function takes no locale argument.  */
extern bool
uc_is_upper (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* Test for any character that corresponds to a hexadecimal-digit
character.
   UC is the character to test.  Deprecated, like every function in this
   ISO C style group.  */
extern bool
uc_is_xdigit (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* GNU extension. */
/* Test for any character that corresponds to a standard blank character or
a locale-specific set of characters for which 'uc_is_alnum' is false.
   UC is the character to test.  Marked above as a GNU extension, and
   deprecated like the rest of this ISO C style group.
   NOTE(review): the "locale-specific" wording appears to be carried over
   from the ISO C description; this function takes no locale argument.  */
extern bool
uc_is_blank (ucs4_t uc)
_UC_ATTRIBUTE_CONST;
/* ========================================================================= */
#ifdef __cplusplus
}
#endif
#endif /* _UNICTYPE_H */