/* * Compact and efficient reimplementation of the * xterm character class mechanism for large character sets * * Markus Kuhn -- mkuhn@acm.org -- 2000-07-03 * * Xterm allows users to select entire words with a double-click on * the left mouse button. Opinions might differ on what type of * characters are part of separate words, therefore xterm allows users * to configure a class code for each 8-bit character. Words are * maximum length sequences of neighboring characters with identical * class code. Extending this mechanism to Unicode naively would * create an at least 2^16 entries (128 kB) long class code table. * Instead, we transform the character class table into a list * of intervals, that will be accessed via a linear search. * Changes made to the table by the user will be appended. A special * class code -1 (default) marks characters who have their code number * as the class code. We could alternatively use a sorted table of * non-overlapping intervals that can be accessed via binary search, * but merging in new intervals is significantly more hassle and * not worth the effort here. */ /* $XFree86: xc/programs/xterm/charclass.c,v 1.2 2000/12/28 00:51:50 dickey Exp $ */ #include #include #if OPT_WIDE_CHARS struct classentry { int class; int first; int last; } *classtab; /* * Special convention for classtab[0]: * - classtab[0].class is the allocated number of entries in classtab * - classtab[0].first = 1 (first used entry in classtab) * - classtab[0].last is the last used entry in classtab */ int SetCharacterClassRange(int low, int high, int value) { if (high < low) return -1; /* nothing to do */ /* make sure we have at least one free entry left at table end */ if (classtab[0].last > classtab[0].class - 2) { classtab[0].class += 5 + classtab[0].class/4; classtab = realloc(classtab, classtab[0].class * sizeof(struct classentry)); if (!classtab) abort(); } /* simply append new interval to end of interval array */ classtab[0].last++; classtab[classtab[0].last].first = low; classtab[classtab[0].last].last = high; classtab[classtab[0].last].class = value; return 0; } void init_classtab(void) { const int size = 50; classtab = (struct classentry *) malloc(size * sizeof(struct classentry)); if (!classtab) abort(); classtab[0].class = size; classtab[0].first = 1; classtab[0].last = 0; /* old xterm default classes */ SetCharacterClassRange(0, 0, 32); SetCharacterClassRange(1, 31, 1); SetCharacterClassRange('\t', '\t', 32); SetCharacterClassRange('0', '9', 48); SetCharacterClassRange('A', 'Z', 48); SetCharacterClassRange('_', '_', 48); SetCharacterClassRange('a', 'z', 48); SetCharacterClassRange(127, 159, 1); SetCharacterClassRange(160, 191, -1); SetCharacterClassRange(192, 255, 48); SetCharacterClassRange(215, 215, 216); SetCharacterClassRange(247, 247, 248); /* added Unicode classes */ SetCharacterClassRange(0x0100, 0xffdf, 48); /* mostly characters */ SetCharacterClassRange(0x037e, 0x037e, -1); /* Greek question mark */ SetCharacterClassRange(0x0387, 0x0387, -1); /* Greek ano teleia */ SetCharacterClassRange(0x055a, 0x055f, -1); /* Armenian punctuation */ SetCharacterClassRange(0x0589, 0x0589, -1); /* Armenian full stop */ SetCharacterClassRange(0x0700, 0x070d, -1); /* Syriac punctuation */ SetCharacterClassRange(0x104a, 0x104f, -1); /* Myanmar punctuation */ SetCharacterClassRange(0x10fb, 0x10fb, -1); /* Georgian punctuation */ SetCharacterClassRange(0x1361, 0x1368, -1); /* Ethiopic punctuation */ SetCharacterClassRange(0x166d, 0x166e, -1); /* Canadian Syl. punctuation */ SetCharacterClassRange(0x17d4, 0x17dc, -1); /* Khmer punctuation */ SetCharacterClassRange(0x1800, 0x180a, -1); /* Mongolian punctuation */ SetCharacterClassRange(0x2000, 0x200a, 32); /* spaces */ SetCharacterClassRange(0x200b, 0x27ff, -1); /* punctuation and symbols */ SetCharacterClassRange(0x2070, 0x207f, 0x2070); /* superscript */ SetCharacterClassRange(0x2080, 0x208f, 0x2080); /* subscript */ SetCharacterClassRange(0x3000, 0x3000, 32); /* ideographic space */ SetCharacterClassRange(0x3001, 0x3020, -1); /* ideographic punctuation */ SetCharacterClassRange(0x3040, 0x309f, 0x3040); /* Hiragana */ SetCharacterClassRange(0x30a0, 0x30ff, 0x30a0); /* Katakana */ SetCharacterClassRange(0x3300, 0x9fff, 0x4e00); /* CJK Ideographs */ SetCharacterClassRange(0xac00, 0xd7a3, 0xac00); /* Hangul Syllables */ SetCharacterClassRange(0xf900, 0xfaff, 0x4e00); /* CJK Ideographs */ SetCharacterClassRange(0xfe30, 0xfe6b, -1); /* punctuation forms */ SetCharacterClassRange(0xff00, 0xff0f, -1); /* half/fullwidth ASCII */ SetCharacterClassRange(0xff1a, 0xff20, -1); /* half/fullwidth ASCII */ SetCharacterClassRange(0xff3b, 0xff40, -1); /* half/fullwidth ASCII */ SetCharacterClassRange(0xff5b, 0xff64, -1); /* half/fullwidth ASCII */ return; } int CharacterClass(int c) { int i, class = -1; for (i = classtab[0].first; i <= classtab[0].last; i++) if (classtab[i].first <= c && classtab[i].last >= c) class = classtab[i].class; if (class < 0) class = c; return class; } #endif