diff --git a/src/dom/ucd.h b/src/dom/ucd.h
index 92c0a7525c0055f31560ad3f20608cef8d09e117..c4d0ab4e0b54ecaaf1db1c3f9e683a852bf74bc7 100644 (file)
--- a/src/dom/ucd.h
+++ b/src/dom/ucd.h
/**
- *
* Phoebe DOM Implementation.
*
* This is a C++ approximation of the W3C DOM model, which follows
* Authors:
* Bob Jamison
*
- * Copyright (C) 2008 Bob Jamison
+ * Copyright (C) 2006-2008 Bob Jamison
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
- * =======================================================================
- * NOTES:
- *
- * This code is from another project, also by Bob Jamison, so no permissions
- * were required. :-) It has been tested on 32 and 64 bits.
- *
- * This table contains the codepoints from the Unicode Character Database (UCD),
- * version 5.1.0.
- *
*/
#ifndef __UCD_H__
#define __UCD_H__
+/************************************************
+** Unicode character classification
+************************************************/
+
/**
* Enumerated Unicode general category types
UNI_OTHER_SYMBOL = 28, /* So */
UNI_INITIAL_QUOTE_PUNCTUATION = 29, /* Pi */
UNI_FINAL_QUOTE_PUNCTUATION = 30 /* Pf */
-} UniCharType;
+} UnicodeCharType;
/**
*/
int uni_is_space(int ch);
+
+/************************************************
+** Unicode case conversion
+************************************************/
+
/**
* Convert the given codepoint to its lower case mapping.
* If there is none, return the codepoint.
int uni_to_title(int ch);
+/************************************************
+** Unicode blocks
+************************************************/
+
+
+
+/**
+ * Used to hold the information for a Unicode codepoint
+ * block
+ */
+typedef struct
+{
+ /**
+ * Low end of the block range
+ */
+ unsigned long low;
+ /**
+ * High end of the block range
+ */
+ unsigned long high;
+ /**
+ * Name string for the block
+ */
+ const char *name;
+} UcdBlockData;
+
+
+/**
+ * Return the Unicode block (defined below) for the given
+ * codepoint. If not found, return UCD_BLOCK_NO_BLOCK.
+ * @param ch the Unicode codepoint to search
+ * @return the block
+ */
+int uni_block(int ch);
+
+
+/**
+ * Return the Unicode block data for the enumerated block number.
+ * @param nr the Unicode block number
+ * @return the block data if found, else NULL
+ */
+UcdBlockData *uni_block_data(int blockNr);
+
-#endif /* __UCD_H__ */
+/**
+ * The Unicode codepoint blocks as defined in Blocks.txt.
+ * Block list has 171 entries
+ */
+typedef enum
+{
+ /* 0, 000000 - 00007f */ UCD_BLOCK_BASIC_LATIN,
+ /* 2, 000100 - 00017f */ UCD_BLOCK_LATIN_EXTENDED_A,
+ /* 4, 000250 - 0002af */ UCD_BLOCK_IPA_EXTENSIONS,
+ /* 6, 000300 - 00036f */ UCD_BLOCK_COMBINING_DIACRITICAL_MARKS,
+ /* 8, 000400 - 0004ff */ UCD_BLOCK_CYRILLIC,
+ /* 10, 000530 - 00058f */ UCD_BLOCK_ARMENIAN,
+ /* 12, 000600 - 0006ff */ UCD_BLOCK_ARABIC,
+ /* 14, 000750 - 00077f */ UCD_BLOCK_ARABIC_SUPPLEMENT,
+ /* 16, 0007c0 - 0007ff */ UCD_BLOCK_NKO,
+ /* 18, 000980 - 0009ff */ UCD_BLOCK_BENGALI,
+ /* 20, 000a80 - 000aff */ UCD_BLOCK_GUJARATI,
+ /* 22, 000b80 - 000bff */ UCD_BLOCK_TAMIL,
+ /* 24, 000c80 - 000cff */ UCD_BLOCK_KANNADA,
+ /* 26, 000d80 - 000dff */ UCD_BLOCK_SINHALA,
+ /* 28, 000e80 - 000eff */ UCD_BLOCK_LAO,
+ /* 30, 001000 - 00109f */ UCD_BLOCK_MYANMAR,
+ /* 32, 001100 - 0011ff */ UCD_BLOCK_HANGUL_JAMO,
+ /* 34, 001380 - 00139f */ UCD_BLOCK_ETHIOPIC_SUPPLEMENT,
+ /* 36, 001400 - 00167f */ UCD_BLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
+ /* 38, 0016a0 - 0016ff */ UCD_BLOCK_RUNIC,
+ /* 40, 001720 - 00173f */ UCD_BLOCK_HANUNOO,
+ /* 42, 001760 - 00177f */ UCD_BLOCK_TAGBANWA,
+ /* 44, 001800 - 0018af */ UCD_BLOCK_MONGOLIAN,
+ /* 46, 001950 - 00197f */ UCD_BLOCK_TAI_LE,
+ /* 48, 0019e0 - 0019ff */ UCD_BLOCK_KHMER_SYMBOLS,
+ /* 50, 001b00 - 001b7f */ UCD_BLOCK_BALINESE,
+ /* 52, 001c00 - 001c4f */ UCD_BLOCK_LEPCHA,
+ /* 54, 001d00 - 001d7f */ UCD_BLOCK_PHONETIC_EXTENSIONS,
+ /* 56, 001dc0 - 001dff */ UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
+ /* 58, 001f00 - 001fff */ UCD_BLOCK_GREEK_EXTENDED,
+ /* 60, 002070 - 00209f */ UCD_BLOCK_SUPERSCRIPTS_AND_SUBSCRIPTS,
+ /* 62, 0020d0 - 0020ff */ UCD_BLOCK_COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS,
+ /* 64, 002150 - 00218f */ UCD_BLOCK_NUMBER_FORMS,
+ /* 66, 002200 - 0022ff */ UCD_BLOCK_MATHEMATICAL_OPERATORS,
+ /* 68, 002400 - 00243f */ UCD_BLOCK_CONTROL_PICTURES,
+ /* 70, 002460 - 0024ff */ UCD_BLOCK_ENCLOSED_ALPHANUMERICS,
+ /* 72, 002580 - 00259f */ UCD_BLOCK_BLOCK_ELEMENTS,
+ /* 74, 002600 - 0026ff */ UCD_BLOCK_MISCELLANEOUS_SYMBOLS,
+ /* 76, 0027c0 - 0027ef */ UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
+ /* 78, 002800 - 0028ff */ UCD_BLOCK_BRAILLE_PATTERNS,
+ /* 80, 002980 - 0029ff */ UCD_BLOCK_MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
+ /* 82, 002b00 - 002bff */ UCD_BLOCK_MISCELLANEOUS_SYMBOLS_AND_ARROWS,
+ /* 84, 002c60 - 002c7f */ UCD_BLOCK_LATIN_EXTENDED_C,
+ /* 86, 002d00 - 002d2f */ UCD_BLOCK_GEORGIAN_SUPPLEMENT,
+ /* 88, 002d80 - 002ddf */ UCD_BLOCK_ETHIOPIC_EXTENDED,
+ /* 90, 002e00 - 002e7f */ UCD_BLOCK_SUPPLEMENTAL_PUNCTUATION,
+ /* 92, 002f00 - 002fdf */ UCD_BLOCK_KANGXI_RADICALS,
+ /* 94, 003000 - 00303f */ UCD_BLOCK_CJK_SYMBOLS_AND_PUNCTUATION,
+ /* 96, 0030a0 - 0030ff */ UCD_BLOCK_KATAKANA,
+ /* 98, 003130 - 00318f */ UCD_BLOCK_HANGUL_COMPATIBILITY_JAMO,
+ /* 100, 0031a0 - 0031bf */ UCD_BLOCK_BOPOMOFO_EXTENDED,
+ /* 102, 0031f0 - 0031ff */ UCD_BLOCK_KATAKANA_PHONETIC_EXTENSIONS,
+ /* 104, 003300 - 0033ff */ UCD_BLOCK_CJK_COMPATIBILITY,
+ /* 106, 004dc0 - 004dff */ UCD_BLOCK_YIJING_HEXAGRAM_SYMBOLS,
+ /* 108, 00a000 - 00a48f */ UCD_BLOCK_YI_SYLLABLES,
+ /* 110, 00a500 - 00a63f */ UCD_BLOCK_VAI,
+ /* 112, 00a700 - 00a71f */ UCD_BLOCK_MODIFIER_TONE_LETTERS,
+ /* 114, 00a800 - 00a82f */ UCD_BLOCK_SYLOTI_NAGRI,
+ /* 116, 00a880 - 00a8df */ UCD_BLOCK_SAURASHTRA,
+ /* 118, 00a930 - 00a95f */ UCD_BLOCK_REJANG,
+ /* 120, 00ac00 - 00d7af */ UCD_BLOCK_HANGUL_SYLLABLES,
+ /* 122, 00db80 - 00dbff */ UCD_BLOCK_HIGH_PRIVATE_USE_SURROGATES,
+ /* 124, 00e000 - 00f8ff */ UCD_BLOCK_PRIVATE_USE_AREA,
+ /* 126, 00fb00 - 00fb4f */ UCD_BLOCK_ALPHABETIC_PRESENTATION_FORMS,
+ /* 128, 00fe00 - 00fe0f */ UCD_BLOCK_VARIATION_SELECTORS,
+ /* 130, 00fe20 - 00fe2f */ UCD_BLOCK_COMBINING_HALF_MARKS,
+ /* 132, 00fe50 - 00fe6f */ UCD_BLOCK_SMALL_FORM_VARIANTS,
+ /* 134, 00ff00 - 00ffef */ UCD_BLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS,
+ /* 136, 010000 - 01007f */ UCD_BLOCK_LINEAR_B_SYLLABARY,
+ /* 138, 010100 - 01013f */ UCD_BLOCK_AEGEAN_NUMBERS,
+ /* 140, 010190 - 0101cf */ UCD_BLOCK_ANCIENT_SYMBOLS,
+ /* 142, 010280 - 01029f */ UCD_BLOCK_LYCIAN,
+ /* 144, 010300 - 01032f */ UCD_BLOCK_OLD_ITALIC,
+ /* 146, 010380 - 01039f */ UCD_BLOCK_UGARITIC,
+ /* 148, 010400 - 01044f */ UCD_BLOCK_DESERET,
+ /* 150, 010480 - 0104af */ UCD_BLOCK_OSMANYA,
+ /* 152, 010900 - 01091f */ UCD_BLOCK_PHOENICIAN,
+ /* 154, 010a00 - 010a5f */ UCD_BLOCK_KHAROSHTHI,
+ /* 156, 012400 - 01247f */ UCD_BLOCK_CUNEIFORM_NUMBERS_AND_PUNCTUATION,
+ /* 158, 01d100 - 01d1ff */ UCD_BLOCK_MUSICAL_SYMBOLS,
+ /* 160, 01d300 - 01d35f */ UCD_BLOCK_TAI_XUAN_JING_SYMBOLS,
+ /* 162, 01d400 - 01d7ff */ UCD_BLOCK_MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
+ /* 164, 01f030 - 01f09f */ UCD_BLOCK_DOMINO_TILES,
+ /* 166, 02f800 - 02fa1f */ UCD_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
+ /* 168, 0e0100 - 0e01ef */ UCD_BLOCK_VARIATION_SELECTORS_SUPPLEMENT,
+ /* 170, 100000 - 10ffff */ UCD_BLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B,
+ /* 171, 000000 - 10ffff */ UCD_BLOCK_NO_BLOCK
+} UnicodeBlocks;
+
+
+#endif /* __UCD_H__ */
+
+