dom/ucd.h

   1 /**
   2  * Phoebe DOM Implementation.
   3  *
   4  * This is a C++ approximation of the W3C DOM model, which follows
   5  * fairly closely the specifications in the various .idl files, copies of
   6  * which are provided for reference.  Most important is this one:
   7  *
   8  * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
   9  *
  10  * More thorough explanations of the various classes and their algorithms
  11  * can be found there.
  12  *
  13  *
  14  * Authors:
  15  *   Bob Jamison
  16  *
  17  * Copyright (C) 2006-2008 Bob Jamison
  18  *
  19  *  This library is free software; you can redistribute it and/or
  20  *  modify it under the terms of the GNU Lesser General Public
  21  *  License as published by the Free Software Foundation; either
  22  *  version 2.1 of the License, or (at your option) any later version.
  23  *
  24  *  This library is distributed in the hope that it will be useful,
  25  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  26  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  27  *  Lesser General Public License for more details.
  28  *
  29  *  You should have received a copy of the GNU Lesser General Public
  30  *  License along with this library; if not, write to the Free Software
  31  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  32  *
  33  */
  34 #ifndef __UCD_H__
  35 #define __UCD_H__
  36
  37
  38 /************************************************
  39 ** Unicode character classification
  40 ************************************************/
  41
  42
  43 /**
  44  * Enumerated Unicode general category types
  45  */
  46 typedef enum UniCharType
  47 {
  48     UNI_UNASSIGNED                =  0,  /* Cn */
  49     UNI_UPPERCASE_LETTER          =  1,  /* Lu */
  50     UNI_LOWERCASE_LETTER          =  2,  /* Ll */
  51     UNI_TITLECASE_LETTER          =  3,  /* Lt */
  52     UNI_MODIFIER_LETTER           =  4,  /* Lm */
  53     UNI_OTHER_LETTER              =  5,  /* Lo */
  54     UNI_NON_SPACING_MARK          =  6,  /* Mn */
  55     UNI_ENCLOSING_MARK            =  7,  /* Me */
  56     UNI_COMBINING_SPACING_MARK    =  8,  /* Mc */
  57     UNI_DECIMAL_DIGIT_NUMBER      =  9,  /* Nd */
  58     UNI_LETTER_NUMBER             = 10,  /* Nl */
  59     UNI_OTHER_NUMBER              = 11,  /* No */
  60     UNI_SPACE_SEPARATOR           = 12,  /* Zs */
  61     UNI_LINE_SEPARATOR            = 13,  /* Zl */
  62     UNI_PARAGRAPH_SEPARATOR       = 14,  /* Zp */
  63     UNI_CONTROL                   = 15,  /* Cc */
  64     UNI_FORMAT                    = 16,  /* Cf */
  65     UNI_UNUSED_RESERVE            = 17,  /* xx */
  66     UNI_PRIVATE_USE               = 18,  /* Co */
  67     UNI_SURROGATE                 = 19,  /* Cs */
  68     UNI_DASH_PUNCTUATION          = 20,  /* Pd */
  69     UNI_START_PUNCTUATION         = 21,  /* Ps */
  70     UNI_END_PUNCTUATION           = 22,  /* Pe */
  71     UNI_CONNECTOR_PUNCTUATION     = 23,  /* Pc */
  72     UNI_OTHER_PUNCTUATION         = 24,  /* Po */
  73     UNI_MATH_SYMBOL               = 25,  /* Sm */
  74     UNI_CURRENCY_SYMBOL           = 26,  /* Sc */
  75     UNI_MODIFIER_SYMBOL           = 27,  /* Sk */
  76     UNI_OTHER_SYMBOL              = 28,  /* So */
  77     UNI_INITIAL_QUOTE_PUNCTUATION = 29,  /* Pi */
  78     UNI_FINAL_QUOTE_PUNCTUATION   = 30   /* Pf */
  79 } UnicodeCharType;
  80
  81
  82 /**
  83  * Get the raw table entry for this Unicode codepoint
  84  * @param ch the Unicode codepoint to test
  85  * @return the raw UCD property table entry
  86  */
  87 unsigned int uni_code(int ch);
  88
  89
  90 /**
  91  * Get the Unicode General Category of ths character
  92  * @param ch the Unicode codepoint to test
  93  * @return the 'UniCharType' General Category enumeration (above)
  94  */
  95 unsigned int uni_type(int ch);
  96
  97
  98 /**
  99  * Test if this Unicode code point is lower case
 100  * @param ch the Unicode codepoint to test
 101  * @return 1 if successful, else 0
 102  */
 103 int uni_is_lower(int ch);
 104
 105
 106 /**
 107  * Test if this Unicode code point is upper case
 108  * @param ch the Unicode codepoint to test
 109  * @return 1 if successful, else 0
 110  */
 111 int uni_is_upper(int ch);
 112
 113
 114 /**
 115  * Test if this Unicode code point is title case
 116  * @param ch the Unicode codepoint to test
 117  * @return 1 if successful, else 0
 118  */
 119 int uni_is_title(int ch);
 120
 121
 122 /**
 123  * Test if this Unicode code point is a numeric digit
 124  * @param ch the Unicode codepoint to test
 125  * @return 1 if successful, else 0
 126  */
 127 int uni_is_digit(int ch);
 128
 129
 130 /**
 131  * Test if this Unicode code point is defined in the database
 132  * @param ch the Unicode codepoint to test
 133  * @return 1 if successful, else 0
 134  */
 135 int uni_is_defined(int ch);
 136
 137 /**
 138  * Test if this Unicode code point is a letter
 139  * @param ch the Unicode codepoint to test
 140  * @return 1 if successful, else 0
 141  */
 142 int uni_is_letter(int ch);
 143
 144
 145 /**
 146  * Test if this Unicode code point is a letter or a digit
 147  * @param ch the Unicode codepoint to test
 148  * @return 1 if successful, else 0
 149  */
 150 int uni_is_letter_or_digit(int ch);
 151
 152 /**
 153  * Test if this Unicode code point is considered to be a space
 154  * @param ch the Unicode codepoint to test
 155  * @return 1 if successful, else 0
 156  */
 157 int uni_is_space(int ch);
 158
 159
 160 /************************************************
 161 ** Unicode case conversion
 162 ************************************************/
 163
 164 /**
 165  * Convert the given codepoint to its lower case mapping.
 166  * If there is none, return the codepoint.
 167  * @param ch the Unicode codepoint to convert
 168  * @return the converted codepoint
 169  */
 170 int uni_to_lower(int ch);
 171
 172 /**
 173  * Convert the given codepoint to its upper case mapping.
 174  * If there is none, return the codepoint.
 175  * @param ch the Unicode codepoint to convert
 176  * @return the converted codepoint
 177  */
 178 int uni_to_upper(int ch);
 179
 180 /**
 181  * Convert the given codepoint to its title case mapping.
 182  * If there is none, return the codepoint.
 183  * @param ch the Unicode codepoint to convert
 184  * @return the converted codepoint
 185  */
 186 int uni_to_title(int ch);
 187
 188
 189 /************************************************
 190 ** Unicode blocks
 191 ************************************************/
 192
 193
 194
 195 /**
 196  * Used to hold the information for a Unicode codepoint
 197  * block
 198  */
 199 typedef struct
 200 {
 201     /**
 202      * Low end of the block range
 203      */
 204     unsigned long low;
 205     /**
 206      * High end of the block range
 207      */
 208     unsigned long high;
 209     /**
 210      * Name string for the block
 211      */
 212     const char    *name;
 213 } UcdBlockData;
 214
 215
 216 /**
 217  * Return the Unicode block (defined below) for the given
 218  * codepoint.  If not found, return UCD_BLOCK_NO_BLOCK.
 219  * @param ch the Unicode codepoint to search
 220  * @return the block
 221  */
 222 int uni_block(int ch);
 223
 224
 225 /**
 226  * Return the Unicode block data for the enumerated block number.
 227  * @param nr the Unicode block number
 228  * @return the block data if found, else NULL
 229  */
 230 UcdBlockData *uni_block_data(int blockNr);
 231
 232
 233
 234
 235 /**
 236  * The Unicode codepoint blocks as defined in Blocks.txt.
 237  * Block list has 171 entries
 238  */
 239 typedef enum
 240 {
 241     /*   0, 000000 - 00007f */  UCD_BLOCK_BASIC_LATIN,
 242     /*   2, 000100 - 00017f */  UCD_BLOCK_LATIN_EXTENDED_A,
 243     /*   4, 000250 - 0002af */  UCD_BLOCK_IPA_EXTENSIONS,
 244     /*   6, 000300 - 00036f */  UCD_BLOCK_COMBINING_DIACRITICAL_MARKS,
 245     /*   8, 000400 - 0004ff */  UCD_BLOCK_CYRILLIC,
 246     /*  10, 000530 - 00058f */  UCD_BLOCK_ARMENIAN,
 247     /*  12, 000600 - 0006ff */  UCD_BLOCK_ARABIC,
 248     /*  14, 000750 - 00077f */  UCD_BLOCK_ARABIC_SUPPLEMENT,
 249     /*  16, 0007c0 - 0007ff */  UCD_BLOCK_NKO,
 250     /*  18, 000980 - 0009ff */  UCD_BLOCK_BENGALI,
 251     /*  20, 000a80 - 000aff */  UCD_BLOCK_GUJARATI,
 252     /*  22, 000b80 - 000bff */  UCD_BLOCK_TAMIL,
 253     /*  24, 000c80 - 000cff */  UCD_BLOCK_KANNADA,
 254     /*  26, 000d80 - 000dff */  UCD_BLOCK_SINHALA,
 255     /*  28, 000e80 - 000eff */  UCD_BLOCK_LAO,
 256     /*  30, 001000 - 00109f */  UCD_BLOCK_MYANMAR,
 257     /*  32, 001100 - 0011ff */  UCD_BLOCK_HANGUL_JAMO,
 258     /*  34, 001380 - 00139f */  UCD_BLOCK_ETHIOPIC_SUPPLEMENT,
 259     /*  36, 001400 - 00167f */  UCD_BLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
 260     /*  38, 0016a0 - 0016ff */  UCD_BLOCK_RUNIC,
 261     /*  40, 001720 - 00173f */  UCD_BLOCK_HANUNOO,
 262     /*  42, 001760 - 00177f */  UCD_BLOCK_TAGBANWA,
 263     /*  44, 001800 - 0018af */  UCD_BLOCK_MONGOLIAN,
 264     /*  46, 001950 - 00197f */  UCD_BLOCK_TAI_LE,
 265     /*  48, 0019e0 - 0019ff */  UCD_BLOCK_KHMER_SYMBOLS,
 266     /*  50, 001b00 - 001b7f */  UCD_BLOCK_BALINESE,
 267     /*  52, 001c00 - 001c4f */  UCD_BLOCK_LEPCHA,
 268     /*  54, 001d00 - 001d7f */  UCD_BLOCK_PHONETIC_EXTENSIONS,
 269     /*  56, 001dc0 - 001dff */  UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
 270     /*  58, 001f00 - 001fff */  UCD_BLOCK_GREEK_EXTENDED,
 271     /*  60, 002070 - 00209f */  UCD_BLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS,
 272     /*  62, 0020d0 - 0020ff */  UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS,
 273     /*  64, 002150 - 00218f */  UCD_BLOCK_NUMBER_FORMS,
 274     /*  66, 002200 - 0022ff */  UCD_BLOCK_MATHEMATICAL_OPERATORS,
 275     /*  68, 002400 - 00243f */  UCD_BLOCK_CONTROL_PICTURES,
 276     /*  70, 002460 - 0024ff */  UCD_BLOCK_ENCLOSED_ALPHANUMERICS,
 277     /*  72, 002580 - 00259f */  UCD_BLOCK_BLOCK_ELEMENTS,
 278     /*  74, 002600 - 0026ff */  UCD_BLOCK_MISCELLANEOUS_SYMBOLS,
 279     /*  76, 0027c0 - 0027ef */  UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
 280     /*  78, 002800 - 0028ff */  UCD_BLOCK_BRAILLE_PATTERNS,
 281     /*  80, 002980 - 0029ff */  UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
 282     /*  82, 002b00 - 002bff */  UCD_BLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS,
 283     /*  84, 002c60 - 002c7f */  UCD_BLOCK_LATIN_EXTENDED_C,
 284     /*  86, 002d00 - 002d2f */  UCD_BLOCK_GEORGIAN_SUPPLEMENT,
 285     /*  88, 002d80 - 002ddf */  UCD_BLOCK_ETHIOPIC_EXTENDED,
 286     /*  90, 002e00 - 002e7f */  UCD_BLOCK_SUPPLEMENTAL_PUNCTUATION,
 287     /*  92, 002f00 - 002fdf */  UCD_BLOCK_KANGXI_RADICALS,
 288     /*  94, 003000 - 00303f */  UCD_BLOCK_CJK_SYMBOLS_AND_PUNCTUATION,
 289     /*  96, 0030a0 - 0030ff */  UCD_BLOCK_KATAKANA,
 290     /*  98, 003130 - 00318f */  UCD_BLOCK_HANGUL_COMPATIBILITY_JAMO,
 291     /* 100, 0031a0 - 0031bf */  UCD_BLOCK_BOPOMOFO_EXTENDED,
 292     /* 102, 0031f0 - 0031ff */  UCD_BLOCK_KATAKANA_PHONETIC_EXTENSIONS,
 293     /* 104, 003300 - 0033ff */  UCD_BLOCK_CJK_COMPATIBILITY,
 294     /* 106, 004dc0 - 004dff */  UCD_BLOCK_YIJING_HEXAGRAM_SYMBOLS,
 295     /* 108, 00a000 - 00a48f */  UCD_BLOCK_YI_SYLLABLES,
 296     /* 110, 00a500 - 00a63f */  UCD_BLOCK_VAI,
 297     /* 112, 00a700 - 00a71f */  UCD_BLOCK_MODIFIER_TONE_LETTERS,
 298     /* 114, 00a800 - 00a82f */  UCD_BLOCK_SYLOTI_NAGRI,
 299     /* 116, 00a880 - 00a8df */  UCD_BLOCK_SAURASHTRA,
 300     /* 118, 00a930 - 00a95f */  UCD_BLOCK_REJANG,
 301     /* 120, 00ac00 - 00d7af */  UCD_BLOCK_HANGUL_SYLLABLES,
 302     /* 122, 00db80 - 00dbff */  UCD_BLOCK_HIGH_PRIVATE_USE_SURROGATES,
 303     /* 124, 00e000 - 00f8ff */  UCD_BLOCK_PRIVATE_USE_AREA,
 304     /* 126, 00fb00 - 00fb4f */  UCD_BLOCK_ALPHABETIC_PRESENTATION_FORMS,
 305     /* 128, 00fe00 - 00fe0f */  UCD_BLOCK_VARIATION_SELECTORS,
 306     /* 130, 00fe20 - 00fe2f */  UCD_BLOCK_COMBINING_HALF_MARKS,
 307     /* 132, 00fe50 - 00fe6f */  UCD_BLOCK_SMALL_FORM_VARIANTS,
 308     /* 134, 00ff00 - 00ffef */  UCD_BLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS,
 309     /* 136, 010000 - 01007f */  UCD_BLOCK_LINEAR_B_SYLLABARY,
 310     /* 138, 010100 - 01013f */  UCD_BLOCK_AEGEAN_NUMBERS,
 311     /* 140, 010190 - 0101cf */  UCD_BLOCK_ANCIENT_SYMBOLS,
 312     /* 142, 010280 - 01029f */  UCD_BLOCK_LYCIAN,
 313     /* 144, 010300 - 01032f */  UCD_BLOCK_OLD_ITALIC,
 314     /* 146, 010380 - 01039f */  UCD_BLOCK_UGARITIC,
 315     /* 148, 010400 - 01044f */  UCD_BLOCK_DESERET,
 316     /* 150, 010480 - 0104af */  UCD_BLOCK_OSMANYA,
 317     /* 152, 010900 - 01091f */  UCD_BLOCK_PHOENICIAN,
 318     /* 154, 010a00 - 010a5f */  UCD_BLOCK_KHAROSHTHI,
 319     /* 156, 012400 - 01247f */  UCD_BLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION,
 320     /* 158, 01d100 - 01d1ff */  UCD_BLOCK_MUSICAL_SYMBOLS,
 321     /* 160, 01d300 - 01d35f */  UCD_BLOCK_TAI_XUAN_JING_SYMBOLS,
 322     /* 162, 01d400 - 01d7ff */  UCD_BLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
 323     /* 164, 01f030 - 01f09f */  UCD_BLOCK_DOMINO_TILES,
 324     /* 166, 02f800 - 02fa1f */  UCD_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
 325     /* 168, 0e0100 - 0e01ef */  UCD_BLOCK_VARIATION_SELECTORS_SUPPLEMENT,
 326     /* 170, 100000 - 10ffff */  UCD_BLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B,
 327     /* 171, 000000 - 10ffff */  UCD_BLOCK_NO_BLOCK
 328 } UnicodeBlocks;
 329
 330
 331 #endif /* __UCD_H__ */
 332
 333