1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * More thorough explanations of the various classes and their algorithms
11 * can be found there.
12 *
13 *
14 * Authors:
15 * Bob Jamison
16 *
17 * Copyright (C) 2006-2008 Bob Jamison
18 *
19 * This library is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU Lesser General Public
21 * License as published by the Free Software Foundation; either
22 * version 2.1 of the License, or (at your option) any later version.
23 *
24 * This library is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 * Lesser General Public License for more details.
28 *
29 * You should have received a copy of the GNU Lesser General Public
30 * License along with this library; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 *
33 */
34 #ifndef __UCD_H__
35 #define __UCD_H__
38 /************************************************
39 ** Unicode character classification
40 ************************************************/
/**
 * Unicode General Category codes.
 *
 * Every enumerator is annotated with the two-letter General Category
 * abbreviation it represents.  The numeric values are written out
 * explicitly; uni_type() reports a category as a plain unsigned int
 * holding one of these values.
 *
 * The enum tag is 'UniCharType' while the typedef name is
 * 'UnicodeCharType'.  In C, use either 'UnicodeCharType' or
 * 'enum UniCharType'.
 */
typedef enum UniCharType
{
    UNI_UNASSIGNED                =  0, /* Cn - Other, not assigned       */
    UNI_UPPERCASE_LETTER          =  1, /* Lu - Letter, uppercase         */
    UNI_LOWERCASE_LETTER          =  2, /* Ll - Letter, lowercase         */
    UNI_TITLECASE_LETTER          =  3, /* Lt - Letter, titlecase         */
    UNI_MODIFIER_LETTER           =  4, /* Lm - Letter, modifier          */
    UNI_OTHER_LETTER              =  5, /* Lo - Letter, other             */
    UNI_NON_SPACING_MARK          =  6, /* Mn - Mark, nonspacing          */
    UNI_ENCLOSING_MARK            =  7, /* Me - Mark, enclosing           */
    UNI_COMBINING_SPACING_MARK    =  8, /* Mc - Mark, spacing combining   */
    UNI_DECIMAL_DIGIT_NUMBER      =  9, /* Nd - Number, decimal digit     */
    UNI_LETTER_NUMBER             = 10, /* Nl - Number, letter            */
    UNI_OTHER_NUMBER              = 11, /* No - Number, other             */
    UNI_SPACE_SEPARATOR           = 12, /* Zs - Separator, space          */
    UNI_LINE_SEPARATOR            = 13, /* Zl - Separator, line           */
    UNI_PARAGRAPH_SEPARATOR       = 14, /* Zp - Separator, paragraph      */
    UNI_CONTROL                   = 15, /* Cc - Other, control            */
    UNI_FORMAT                    = 16, /* Cf - Other, format             */
    UNI_UNUSED_RESERVE            = 17, /* xx - placeholder; not a General
                                                Category abbreviation     */
    UNI_PRIVATE_USE               = 18, /* Co - Other, private use        */
    UNI_SURROGATE                 = 19, /* Cs - Other, surrogate          */
    UNI_DASH_PUNCTUATION          = 20, /* Pd - Punctuation, dash         */
    UNI_START_PUNCTUATION         = 21, /* Ps - Punctuation, open         */
    UNI_END_PUNCTUATION           = 22, /* Pe - Punctuation, close        */
    UNI_CONNECTOR_PUNCTUATION     = 23, /* Pc - Punctuation, connector    */
    UNI_OTHER_PUNCTUATION         = 24, /* Po - Punctuation, other        */
    UNI_MATH_SYMBOL               = 25, /* Sm - Symbol, math              */
    UNI_CURRENCY_SYMBOL           = 26, /* Sc - Symbol, currency          */
    UNI_MODIFIER_SYMBOL           = 27, /* Sk - Symbol, modifier          */
    UNI_OTHER_SYMBOL              = 28, /* So - Symbol, other             */
    UNI_INITIAL_QUOTE_PUNCTUATION = 29, /* Pi - Punctuation, initial quote */
    UNI_FINAL_QUOTE_PUNCTUATION   = 30  /* Pf - Punctuation, final quote  */
} UnicodeCharType;
/**
 * Look up the raw property-table entry for a Unicode code point.
 *
 * The layout of the returned value is not described in this header;
 * the other uni_* functions declared here cover the individual
 * classification and case-mapping queries.
 *
 * @param ch the Unicode code point to look up
 * @return the raw UCD property table entry for ch
 */
unsigned int uni_code(int ch);
/**
 * Get the Unicode General Category of a code point.
 *
 * @param ch the Unicode code point to classify
 * @return the General Category of ch, as one of the UnicodeCharType
 *         (enum UniCharType) values, carried in an unsigned int
 */
unsigned int uni_type(int ch);
/**
 * Test whether a Unicode code point is lower case.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is lower case, else 0
 */
int uni_is_lower(int ch);
/**
 * Test whether a Unicode code point is upper case.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is upper case, else 0
 */
int uni_is_upper(int ch);
/**
 * Test whether a Unicode code point is title case.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is title case, else 0
 */
int uni_is_title(int ch);
/**
 * Test whether a Unicode code point is a numeric digit.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is a numeric digit, else 0
 */
int uni_is_digit(int ch);
/**
 * Test whether a Unicode code point is defined in the database.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is defined in the database, else 0
 */
int uni_is_defined(int ch);
/**
 * Test whether a Unicode code point is a letter.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is a letter, else 0
 */
int uni_is_letter(int ch);
/**
 * Test whether a Unicode code point is a letter or a digit.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is a letter or a digit, else 0
 */
int uni_is_letter_or_digit(int ch);
/**
 * Test whether a Unicode code point is considered to be a space.
 *
 * @param ch the Unicode code point to test
 * @return 1 if ch is considered to be a space, else 0
 */
int uni_is_space(int ch);
160 /************************************************
161 ** Unicode case conversion
162 ************************************************/
/**
 * Map a code point to its lower case equivalent.
 *
 * A code point that has no lower case mapping is returned unchanged.
 *
 * @param ch the Unicode code point to convert
 * @return the lower case mapping of ch, or ch itself if there is none
 */
int uni_to_lower(int ch);
/**
 * Map a code point to its upper case equivalent.
 *
 * A code point that has no upper case mapping is returned unchanged.
 *
 * @param ch the Unicode code point to convert
 * @return the upper case mapping of ch, or ch itself if there is none
 */
int uni_to_upper(int ch);
/**
 * Map a code point to its title case equivalent.
 *
 * A code point that has no title case mapping is returned unchanged.
 *
 * @param ch the Unicode code point to convert
 * @return the title case mapping of ch, or ch itself if there is none
 */
int uni_to_title(int ch);
189 /************************************************
190 ** Unicode blocks
191 ************************************************/
/**
 * Description of one Unicode code point block: the range of code
 * points it spans and its name.
 */
typedef struct
{
    /**
     * Low end of the block range (first code point of the block).
     */
    unsigned long low;

    /**
     * High end of the block range.  The block listing in this header
     * writes ranges with the last code point included
     * (e.g. 000000 - 00007f).
     */
    unsigned long high;

    /**
     * Name string for the block.  Declared const: the string must not
     * be modified through this pointer.
     */
    const char *name;

} UcdBlockData;
/**
 * Find the Unicode block that contains the given code point.
 *
 * Block numbers are the UCD_BLOCK_* enumerators declared further down
 * in this header.
 *
 * @param ch the Unicode code point to search for
 * @return the number of the block containing ch, or UCD_BLOCK_NO_BLOCK
 *         if no block was found
 */
int uni_block(int ch);
225 /**
226 * Return the Unicode block data for the enumerated block number.
227 * @param nr the Unicode block number
228 * @return the block data if found, else NULL
229 */
230 UcdBlockData *uni_block_data(int blockNr);
/**
 * The Unicode code point blocks as defined in Blocks.txt.
 * The block list has 171 entries (indices 0 - 170); UCD_BLOCK_NO_BLOCK
 * (171) is the "not found" value returned by uni_block().
 *
 * Each enumerator is pinned explicitly to its block index.  This
 * listing carries enumerators only for the even indices, so implicit
 * numbering would yield 0, 1, 2, ... and would no longer agree with the
 * documented indices (UCD_BLOCK_NO_BLOCK would be 86 instead of 171).
 *
 * NOTE(review): the odd-numbered blocks (indices 1, 3, ... 169) have no
 * enumerator here; presumably they were lost from this listing --
 * confirm against Blocks.txt and the implementation's block table.
 *
 * The comment in front of each enumerator is the block's code point
 * range.
 */
typedef enum
{
    /* 000000 - 00007f */ UCD_BLOCK_BASIC_LATIN                             =   0,
    /* 000100 - 00017f */ UCD_BLOCK_LATIN_EXTENDED_A                        =   2,
    /* 000250 - 0002af */ UCD_BLOCK_IPA_EXTENSIONS                          =   4,
    /* 000300 - 00036f */ UCD_BLOCK_COMBINING_DIACRITICAL_MARKS             =   6,
    /* 000400 - 0004ff */ UCD_BLOCK_CYRILLIC                                =   8,
    /* 000530 - 00058f */ UCD_BLOCK_ARMENIAN                                =  10,
    /* 000600 - 0006ff */ UCD_BLOCK_ARABIC                                  =  12,
    /* 000750 - 00077f */ UCD_BLOCK_ARABIC_SUPPLEMENT                       =  14,
    /* 0007c0 - 0007ff */ UCD_BLOCK_NKO                                     =  16,
    /* 000980 - 0009ff */ UCD_BLOCK_BENGALI                                 =  18,
    /* 000a80 - 000aff */ UCD_BLOCK_GUJARATI                                =  20,
    /* 000b80 - 000bff */ UCD_BLOCK_TAMIL                                   =  22,
    /* 000c80 - 000cff */ UCD_BLOCK_KANNADA                                 =  24,
    /* 000d80 - 000dff */ UCD_BLOCK_SINHALA                                 =  26,
    /* 000e80 - 000eff */ UCD_BLOCK_LAO                                     =  28,
    /* 001000 - 00109f */ UCD_BLOCK_MYANMAR                                 =  30,
    /* 001100 - 0011ff */ UCD_BLOCK_HANGUL_JAMO                             =  32,
    /* 001380 - 00139f */ UCD_BLOCK_ETHIOPIC_SUPPLEMENT                     =  34,
    /* 001400 - 00167f */ UCD_BLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS   =  36,
    /* 0016a0 - 0016ff */ UCD_BLOCK_RUNIC                                   =  38,
    /* 001720 - 00173f */ UCD_BLOCK_HANUNOO                                 =  40,
    /* 001760 - 00177f */ UCD_BLOCK_TAGBANWA                                =  42,
    /* 001800 - 0018af */ UCD_BLOCK_MONGOLIAN                               =  44,
    /* 001950 - 00197f */ UCD_BLOCK_TAI_LE                                  =  46,
    /* 0019e0 - 0019ff */ UCD_BLOCK_KHMER_SYMBOLS                           =  48,
    /* 001b00 - 001b7f */ UCD_BLOCK_BALINESE                                =  50,
    /* 001c00 - 001c4f */ UCD_BLOCK_LEPCHA                                  =  52,
    /* 001d00 - 001d7f */ UCD_BLOCK_PHONETIC_EXTENSIONS                     =  54,
    /* 001dc0 - 001dff */ UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT  =  56,
    /* 001f00 - 001fff */ UCD_BLOCK_GREEK_EXTENDED                          =  58,
    /* 002070 - 00209f */ UCD_BLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS             =  60,
    /* 0020d0 - 0020ff */ UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS =  62,
    /* 002150 - 00218f */ UCD_BLOCK_NUMBER_FORMS                            =  64,
    /* 002200 - 0022ff */ UCD_BLOCK_MATHEMATICAL_OPERATORS                  =  66,
    /* 002400 - 00243f */ UCD_BLOCK_CONTROL_PICTURES                        =  68,
    /* 002460 - 0024ff */ UCD_BLOCK_ENCLOSED_ALPHANUMERICS                  =  70,
    /* 002580 - 00259f */ UCD_BLOCK_BLOCK_ELEMENTS                          =  72,
    /* 002600 - 0026ff */ UCD_BLOCK_MISCELLANEOUS_SYMBOLS                   =  74,
    /* 0027c0 - 0027ef */ UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A    =  76,
    /* 002800 - 0028ff */ UCD_BLOCK_BRAILLE_PATTERNS                        =  78,
    /* 002980 - 0029ff */ UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B    =  80,
    /* 002b00 - 002bff */ UCD_BLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS        =  82,
    /* 002c60 - 002c7f */ UCD_BLOCK_LATIN_EXTENDED_C                        =  84,
    /* 002d00 - 002d2f */ UCD_BLOCK_GEORGIAN_SUPPLEMENT                     =  86,
    /* 002d80 - 002ddf */ UCD_BLOCK_ETHIOPIC_EXTENDED                       =  88,
    /* 002e00 - 002e7f */ UCD_BLOCK_SUPPLEMENTAL_PUNCTUATION                =  90,
    /* 002f00 - 002fdf */ UCD_BLOCK_KANGXI_RADICALS                         =  92,
    /* 003000 - 00303f */ UCD_BLOCK_CJK_SYMBOLS_AND_PUNCTUATION             =  94,
    /* 0030a0 - 0030ff */ UCD_BLOCK_KATAKANA                                =  96,
    /* 003130 - 00318f */ UCD_BLOCK_HANGUL_COMPATIBILITY_JAMO               =  98,
    /* 0031a0 - 0031bf */ UCD_BLOCK_BOPOMOFO_EXTENDED                       = 100,
    /* 0031f0 - 0031ff */ UCD_BLOCK_KATAKANA_PHONETIC_EXTENSIONS            = 102,
    /* 003300 - 0033ff */ UCD_BLOCK_CJK_COMPATIBILITY                       = 104,
    /* 004dc0 - 004dff */ UCD_BLOCK_YIJING_HEXAGRAM_SYMBOLS                 = 106,
    /* 00a000 - 00a48f */ UCD_BLOCK_YI_SYLLABLES                            = 108,
    /* 00a500 - 00a63f */ UCD_BLOCK_VAI                                     = 110,
    /* 00a700 - 00a71f */ UCD_BLOCK_MODIFIER_TONE_LETTERS                   = 112,
    /* 00a800 - 00a82f */ UCD_BLOCK_SYLOTI_NAGRI                            = 114,
    /* 00a880 - 00a8df */ UCD_BLOCK_SAURASHTRA                              = 116,
    /* 00a930 - 00a95f */ UCD_BLOCK_REJANG                                  = 118,
    /* 00ac00 - 00d7af */ UCD_BLOCK_HANGUL_SYLLABLES                        = 120,
    /* 00db80 - 00dbff */ UCD_BLOCK_HIGH_PRIVATE_USE_SURROGATES             = 122,
    /* 00e000 - 00f8ff */ UCD_BLOCK_PRIVATE_USE_AREA                        = 124,
    /* 00fb00 - 00fb4f */ UCD_BLOCK_ALPHABETIC_PRESENTATION_FORMS           = 126,
    /* 00fe00 - 00fe0f */ UCD_BLOCK_VARIATION_SELECTORS                     = 128,
    /* 00fe20 - 00fe2f */ UCD_BLOCK_COMBINING_HALF_MARKS                    = 130,
    /* 00fe50 - 00fe6f */ UCD_BLOCK_SMALL_FORM_VARIANTS                     = 132,
    /* 00ff00 - 00ffef */ UCD_BLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS           = 134,
    /* 010000 - 01007f */ UCD_BLOCK_LINEAR_B_SYLLABARY                      = 136,
    /* 010100 - 01013f */ UCD_BLOCK_AEGEAN_NUMBERS                          = 138,
    /* 010190 - 0101cf */ UCD_BLOCK_ANCIENT_SYMBOLS                         = 140,
    /* 010280 - 01029f */ UCD_BLOCK_LYCIAN                                  = 142,
    /* 010300 - 01032f */ UCD_BLOCK_OLD_ITALIC                              = 144,
    /* 010380 - 01039f */ UCD_BLOCK_UGARITIC                                = 146,
    /* 010400 - 01044f */ UCD_BLOCK_DESERET                                 = 148,
    /* 010480 - 0104af */ UCD_BLOCK_OSMANYA                                 = 150,
    /* 010900 - 01091f */ UCD_BLOCK_PHOENICIAN                              = 152,
    /* 010a00 - 010a5f */ UCD_BLOCK_KHAROSHTHI                              = 154,
    /* 012400 - 01247f */ UCD_BLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION       = 156,
    /* 01d100 - 01d1ff */ UCD_BLOCK_MUSICAL_SYMBOLS                         = 158,
    /* 01d300 - 01d35f */ UCD_BLOCK_TAI_XUAN_JING_SYMBOLS                   = 160,
    /* 01d400 - 01d7ff */ UCD_BLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS       = 162,
    /* 01f030 - 01f09f */ UCD_BLOCK_DOMINO_TILES                            = 164,
    /* 02f800 - 02fa1f */ UCD_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT = 166,
    /* 0e0100 - 0e01ef */ UCD_BLOCK_VARIATION_SELECTORS_SUPPLEMENT          = 168,
    /* 100000 - 10ffff */ UCD_BLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B        = 170,

    /* 000000 - 10ffff; "not found" value, one past the last block index */
    UCD_BLOCK_NO_BLOCK                                                      = 171
} UnicodeBlocks;
331 #endif /* __UCD_H__ */