From 656277dbe3fa5cb2d6bbe7e21afb11e94b9815f0 Mon Sep 17 00:00:00 2001 From: ishmal Date: Sat, 8 Mar 2008 19:53:53 +0000 Subject: [PATCH] Change charclass.cpp/h to ucd.cpp/h --- src/dom/Makefile.mingw | 2 +- src/dom/Makefile_insert | 154 +++++++++++++------------- src/dom/cssparser.cpp | 2 +- src/dom/{charclass.cpp => ucd.c} | 184 ++++++++++++++++++++++++++++--- src/dom/{charclass.h => ucd.h} | 36 ++++-- src/dom/uri.cpp | 2 +- src/dom/xmlreader.cpp | 2 +- src/dom/xpathparser.cpp | 2 +- 8 files changed, 282 insertions(+), 102 deletions(-) rename src/dom/{charclass.cpp => ucd.c} (96%) rename src/dom/{charclass.h => ucd.h} (85%) diff --git a/src/dom/Makefile.mingw b/src/dom/Makefile.mingw index b47437265..987a668af 100644 --- a/src/dom/Makefile.mingw +++ b/src/dom/Makefile.mingw @@ -139,7 +139,6 @@ CFLAGS = -Wall -g -DXP_WIN LIBS = -lws2_32 DOMOBJ = \ -charclass.o \ cssparser.o \ domimpl.o \ domptr.o \ @@ -164,6 +163,7 @@ io/socket.o \ odf/odfdocument.o \ svg/svgimpl.o \ svg/svgparser.o \ +ucd.o \ util/thread.o \ util/ziptool.o diff --git a/src/dom/Makefile_insert b/src/dom/Makefile_insert index 7777c4250..05f4f254b 100644 --- a/src/dom/Makefile_insert +++ b/src/dom/Makefile_insert @@ -1,77 +1,77 @@ -## Makefile.am fragment sourced by src/Makefile.am. - -dom/all: dom/libdom.a - -dom/clean: - rm -f dom/libdom.a $(dom_libdom_a_OBJECTS) - -dom_libdom_a_SOURCES = \ - dom/charclass.cpp \ - dom/charclass.h \ - dom/css.h \ - dom/cssparser.cpp \ - dom/cssparser.h \ - dom/dom.h \ - dom/domconfig.h \ - dom/domimpl.cpp \ - dom/domimpl.h \ - dom/domstring.cpp \ - dom/domstring.h \ - dom/domstringimpl.h \ - dom/events.h \ - dom/ls.h \ - dom/lsimpl.cpp \ - dom/lsimpl.h \ - dom/phoebedom.h \ - dom/prop-css2.cpp \ - dom/prop-css.cpp \ - dom/prop-svg.cpp \ - dom/smil.h \ - dom/smilimpl.cpp \ - dom/smilimpl.h \ - dom/stylesheets.h \ - dom/traversal.h \ - dom/uri.cpp \ - dom/uri.h \ - dom/views.h \ - dom/xmlreader.cpp \ - dom/xmlreader.h \ - dom/xpath.h \ - dom/xpathimpl.cpp \ - dom/xpathimpl.h \ - dom/xpathparser.cpp \ - dom/xpathparser.h \ - dom/xpathtoken.h \ - dom/xpathtoken.cpp \ - dom/io/base64stream.cpp \ - dom/io/base64stream.h \ - dom/io/bufferstream.cpp \ - dom/io/bufferstream.h \ - dom/io/domstream.cpp \ - dom/io/domstream.h \ - dom/io/httpclient.h \ - dom/io/gzipstream.cpp \ - dom/io/gzipstream.h \ - dom/io/socket.cpp \ - dom/io/socket.h \ - dom/io/gzipstream.cpp \ - dom/io/gzipstream.h \ - dom/io/stringstream.cpp \ - dom/io/stringstream.h \ - dom/io/uristream.cpp \ - dom/io/uristream.h \ - dom/odf/odfdocument.cpp \ - dom/odf/odfdocument.h \ - dom/svg/svg.h \ - dom/svg/svgimpl.cpp \ - dom/svg/svgimpl.h \ - dom/svg/svgparser.cpp \ - dom/svg/svgparser.h \ - dom/svg/svgtypes.h \ - dom/util/digest.h \ - dom/util/digest.cpp \ - dom/util/thread.h \ - dom/util/thread.cpp \ - dom/util/ziptool.h \ - dom/util/ziptool.cpp - +## Makefile.am fragment sourced by src/Makefile.am. + +dom/all: dom/libdom.a + +dom/clean: + rm -f dom/libdom.a $(dom_libdom_a_OBJECTS) + +dom_libdom_a_SOURCES = \ + dom/css.h \ + dom/cssparser.cpp \ + dom/cssparser.h \ + dom/dom.h \ + dom/domconfig.h \ + dom/domimpl.cpp \ + dom/domimpl.h \ + dom/domstring.cpp \ + dom/domstring.h \ + dom/domstringimpl.h \ + dom/events.h \ + dom/ls.h \ + dom/lsimpl.cpp \ + dom/lsimpl.h \ + dom/phoebedom.h \ + dom/prop-css2.cpp \ + dom/prop-css.cpp \ + dom/prop-svg.cpp \ + dom/smil.h \ + dom/smilimpl.cpp \ + dom/smilimpl.h \ + dom/stylesheets.h \ + dom/traversal.h \ + dom/ucd.cpp \ + dom/ucd.h \ + dom/uri.cpp \ + dom/uri.h \ + dom/views.h \ + dom/xmlreader.cpp \ + dom/xmlreader.h \ + dom/xpath.h \ + dom/xpathimpl.cpp \ + dom/xpathimpl.h \ + dom/xpathparser.cpp \ + dom/xpathparser.h \ + dom/xpathtoken.h \ + dom/xpathtoken.cpp \ + dom/io/base64stream.cpp \ + dom/io/base64stream.h \ + dom/io/bufferstream.cpp \ + dom/io/bufferstream.h \ + dom/io/domstream.cpp \ + dom/io/domstream.h \ + dom/io/httpclient.h \ + dom/io/gzipstream.cpp \ + dom/io/gzipstream.h \ + dom/io/socket.cpp \ + dom/io/socket.h \ + dom/io/gzipstream.cpp \ + dom/io/gzipstream.h \ + dom/io/stringstream.cpp \ + dom/io/stringstream.h \ + dom/io/uristream.cpp \ + dom/io/uristream.h \ + dom/odf/odfdocument.cpp \ + dom/odf/odfdocument.h \ + dom/svg/svg.h \ + dom/svg/svgimpl.cpp \ + dom/svg/svgimpl.h \ + dom/svg/svgparser.cpp \ + dom/svg/svgparser.h \ + dom/svg/svgtypes.h \ + dom/util/digest.h \ + dom/util/digest.cpp \ + dom/util/thread.h \ + dom/util/thread.cpp \ + dom/util/ziptool.h \ + dom/util/ziptool.cpp + diff --git a/src/dom/cssparser.cpp b/src/dom/cssparser.cpp index 78ce4ce9b..83fd92909 100644 --- a/src/dom/cssparser.cpp +++ b/src/dom/cssparser.cpp @@ -28,7 +28,7 @@ */ #include "cssparser.h" -#include "charclass.h" +#include "ucd.h" #include #include diff --git a/src/dom/charclass.cpp b/src/dom/ucd.c similarity index 96% rename from src/dom/charclass.cpp rename to src/dom/ucd.c index d2b67a660..dbb0a299f 100644 --- a/src/dom/charclass.cpp +++ b/src/dom/ucd.c @@ -1,5 +1,5 @@ /* - * Generated by UcdReader at:Fri Mar 07 14:15:18 CST 2008 + * Generated by UcdReader at:Sat Mar 08 13:42:03 CST 2008 * block table size:2 (4 bytes) * plane table size:6 (64 bytes) * @@ -9,7 +9,8 @@ * * You might notice that this table is larger than some other implementations. * This is because we included the entire UCD set of codepoints, rather than - * truncating it at 16 bits for UTF-16 + * truncating it at 16 bits for UTF-16. It is similar to Java's and JS's, but + * this is our own implementation. * * This lookup table should be compared to the raw unfolded table lookup * to make sure that no bits are lost in compression. This is done internally @@ -37,24 +38,24 @@ * G 3 bits This field is for storing our own lexical information, in addition * to that given by the UCD. The fields are as follows: * 0 may not be part of an identifier - * 1 ignorable control; may continue a Unicode identifier or JS + * 1 ignorable control; may continue a Unicode identifier or C * identifier - * 2 may continue a JS identifier but not a Unicode identifier + * 2 may continue a C identifier but not a Unicode identifier * (unused) - * 3 may continue a Unicode identifier or JS identifier - * 4 is a JS whitespace character - * 5 may start or continue a JS identifier; + * 3 may continue a Unicode identifier or C identifier + * 4 is a C whitespace character + * 5 may start or continue a C identifier; * may continue but not start a Unicode identifier (_) - * 6 may start or continue a JS identifier but not a Unicode + * 6 may start or continue a C identifier but not a Unicode * identifier ($) - * 7 may start or continue a Unicode identifier or JS identifier + * 7 may start or continue a Unicode identifier or C identifier * Thus: - * 5, 6, 7 may start a JS identifier - * 1, 2, 3, 5, 6, 7 may continue a JS identifier + * 5, 6, 7 may start a C identifier + * 1, 2, 3, 5, 6, 7 may continue a C identifier * 7 may start a Unicode identifier * 1, 3, 5, 7 may continue a Unicode identifier * 1 is ignorable within an identifier - * 4 is JS whitespace + * 4 is C whitespace * H 2 bits This field is for storing additional information regarding this * codepoint's numeric properties. The values are: * 0 this character has no numeric property @@ -2005,10 +2006,139 @@ static unsigned int prop[] = #define UNI_CASE_OFFSET(ch) ((UNI_CODE(ch)>>18) & 0x1ff) +/** + * Special case mappings, not covered by UnicodeData.txt + */ +typedef struct +{ + int sourceChar; + int nrMaps; + int map[3]; +} CaseMapEntry; + +static CaseMapEntry caseMap[] = +{ + { 0x1fc2, 2, { 0xffff, 0x0159 } }, + { 0x1fc3, 1, { 0xffff } }, + { 0x1fc6, 2, { 0x018d, 0x0156 } }, + { 0x1fc7, 3, { 0x018d, 0x0156, 0x0159 } }, + { 0x1fc4, 2, { 0x0185, 0x0159 } }, + { 0x1fcc, 1, { 0xffff } }, + { 0x1fd3, 3, { 0x018f, 0x0134, 0x012d } }, + { 0x1fd2, 3, { 0x018f, 0x0134, 0x012c } }, + { 0x1fd7, 3, { 0x018f, 0x0134, 0x0156 } }, + { 0x1fd6, 2, { 0x018f, 0x0156 } }, + { 0x0130, 1, { 0x0082 } }, + { 0x1fe2, 3, { 0xffff, 0x0134, 0x012c } }, + { 0x1fe3, 3, { 0xffff, 0x0134, 0x012d } }, + { 0x1fe4, 2, { 0xffff, 0x0139 } }, + { 0x1fe6, 2, { 0xffff, 0x0156 } }, + { 0x1fe7, 3, { 0xffff, 0x0134, 0x0156 } }, + { 0x0307, 1, { 0x0133 } }, + { 0x1ff3, 1, { 0xffff } }, + { 0x1ff2, 2, { 0xffff, 0x0159 } }, + { 0x1ff4, 2, { 0xffff, 0x0159 } }, + { 0x1ff7, 3, { 0xffff, 0x0156, 0x0159 } }, + { 0x1ff6, 2, { 0xffff, 0x0156 } }, + { 0x0128, 1, { 0x0080 } }, + { 0x1ffc, 1, { 0xffff } }, + { 0x012e, 1, { 0xffff } }, + { 0x1f86, 1, { 0xffff } }, + { 0x1f87, 1, { 0xffff } }, + { 0x1f84, 1, { 0xffff } }, + { 0x1e97, 2, { 0x0036, 0x0134 } }, + { 0x1f85, 1, { 0xffff } }, + { 0x1e96, 2, { 0x0030, 0x014b } }, + { 0x1f82, 1, { 0xffff } }, + { 0x1f83, 1, { 0xffff } }, + { 0x1f80, 1, { 0xffff } }, + { 0x1f81, 1, { 0xffff } }, + { 0xfb01, 2, { 0x002e, 0x0045 } }, + { 0x1f8e, 1, { 0xffff } }, + { 0xfb00, 2, { 0x002e, 0x0042 } }, + { 0x1f8f, 1, { 0xffff } }, + { 0xfb03, 3, { 0x002e, 0x0042, 0x0045 } }, + { 0x1f8c, 1, { 0xffff } }, + { 0xfb02, 2, { 0x002e, 0xffff } }, + { 0x1f8d, 1, { 0xffff } }, + { 0xfb05, 2, { 0x0035, 0x004a } }, + { 0x1e99, 2, { 0x003b, 0xffff } }, + { 0x1f8a, 1, { 0xffff } }, + { 0x0049, 1, { 0x0031 } }, + { 0xfb04, 3, { 0x002e, 0x0042, 0xffff } }, + { 0x1e98, 2, { 0x0039, 0xffff } }, + { 0x1f8b, 1, { 0xffff } }, + { 0x004a, 1, { 0xffff } }, + { 0x1f88, 1, { 0xffff } }, + { 0xfb06, 2, { 0x0035, 0x004a } }, + { 0x1e9a, 2, { 0x0029, 0xffff } }, + { 0x1f89, 1, { 0xffff } }, + { 0x1f97, 1, { 0xffff } }, + { 0x1f96, 1, { 0xffff } }, + { 0x1f95, 1, { 0xffff } }, + { 0x1f94, 1, { 0xffff } }, + { 0x1f93, 1, { 0xffff } }, + { 0x1f92, 1, { 0xffff } }, + { 0x1f91, 1, { 0xffff } }, + { 0x1f90, 1, { 0xffff } }, + { 0x1f9f, 1, { 0xffff } }, + { 0x1f9e, 1, { 0xffff } }, + { 0x1f9d, 1, { 0xffff } }, + { 0xfb13, 2, { 0x0220, 0x0240 } }, + { 0x1f9c, 1, { 0xffff } }, + { 0xfb14, 2, { 0x0220, 0x0235 } }, + { 0x1f9b, 1, { 0xffff } }, + { 0xfb15, 2, { 0x0220, 0xffff } }, + { 0x1f9a, 1, { 0xffff } }, + { 0xfb16, 2, { 0xffff, 0x0240 } }, + { 0x1f99, 1, { 0xffff } }, + { 0xfb17, 2, { 0x0220, 0xffff } }, + { 0x0149, 2, { 0xffff, 0xffff } }, + { 0x1f98, 1, { 0xffff } }, + { 0x1fa4, 1, { 0xffff } }, + { 0x1fa5, 1, { 0xffff } }, + { 0x1fa6, 1, { 0xffff } }, + { 0x1fa7, 1, { 0xffff } }, + { 0x1fa0, 1, { 0xffff } }, + { 0x1fa1, 1, { 0xffff } }, + { 0x1fa2, 1, { 0xffff } }, + { 0x1fa3, 1, { 0xffff } }, + { 0x1fac, 1, { 0xffff } }, + { 0x1fad, 1, { 0xffff } }, + { 0x1fae, 1, { 0xffff } }, + { 0x1faf, 1, { 0xffff } }, + { 0x1fa8, 1, { 0xffff } }, + { 0x1fa9, 1, { 0xffff } }, + { 0x1faa, 1, { 0xffff } }, + { 0x0069, 1, { 0x0082 } }, + { 0x1fab, 1, { 0xffff } }, + { 0x1fb4, 2, { 0x0182, 0x0159 } }, + { 0x1fb7, 3, { 0x0187, 0x0156, 0x0159 } }, + { 0x1fb6, 2, { 0x0187, 0x0156 } }, + { 0x1fb3, 1, { 0xffff } }, + { 0x1fb2, 2, { 0xffff, 0x0159 } }, + { 0x1fbc, 1, { 0xffff } }, + { 0x03b0, 3, { 0xffff, 0x0134, 0x012d } }, + { 0x1f52, 3, { 0xffff, 0x0139, 0x012c } }, + { 0x1f50, 2, { 0xffff, 0x0139 } }, + { 0x1f56, 3, { 0xffff, 0x0139, 0x0156 } }, + { 0x03a3, 1, { 0xffff } }, + { 0x1f54, 3, { 0xffff, 0x0139, 0x012d } }, + { 0x0390, 3, { 0x018f, 0x0134, 0x012d } }, + { 0x00cd, 1, { 0xffff } }, + { 0x00cc, 1, { 0xffff } }, + { 0x00df, 2, { 0x0035, 0x0049 } }, + { 0x0587, 2, { 0x0217, 0x0246 } }, + { 0x01f0, 2, { 0xffff, 0xffff } }, + { 0, 0, {} } +}; + + + /** * Look for comments in ucd.h */ -#include "charclass.h" +#include "ucd.h" @@ -2067,6 +2197,34 @@ int uni_is_space(int ch) || c==UNI_PARAGRAPH_SEPARATOR); } +int uni_to_lower(int ch) +{ + int c = UNI_CODE(ch); + if (c == UNI_LOWERCASE_LETTER) + return ch; + ch -= (c>>18) & 0x1ff; + return ch; +} + +int uni_to_upper(int ch) +{ + int c = UNI_CODE(ch); + if (c == UNI_UPPERCASE_LETTER) + return ch; + ch += (c>>18) & 0x1ff; + return ch; +} + +int uni_to_title(int ch) +{ + int c = UNI_CODE(ch); + if (c == UNI_TITLECASE_LETTER) + return ch; + ch += (c>>18) & 0x1ff; + return ch; +} + + diff --git a/src/dom/charclass.h b/src/dom/ucd.h similarity index 85% rename from src/dom/charclass.h rename to src/dom/ucd.h index 26d5dc265..f9ce2f09c 100644 --- a/src/dom/charclass.h +++ b/src/dom/ucd.h @@ -1,12 +1,7 @@ -#ifndef __CHARCLASS_H__ -#define __CHARCLASS_H__ /** * - * Phoebe DOM Implementation. + * NGP Packrat Parser Generator * - * This is a C++ approximation of the W3C DOM model, which follows - * fairly closely the specifications in the various .idl files, copies of - * which are provided for reference. Most important is this one: * * Authors: * Bob Jamison @@ -28,6 +23,9 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ +#ifndef __UCD_H__ +#define __UCD_H__ + /** @@ -146,8 +144,32 @@ int uni_is_letter_or_digit(int ch); */ int uni_is_space(int ch); +/** + * Convert the given codepoint to its lower case mapping. + * If there is none, return the codepoint. + * @param ch the Unicode codepoint to convert + * @return the converted codepoint + */ +int uni_to_lower(int ch); + +/** + * Convert the given codepoint to its upper case mapping. + * If there is none, return the codepoint. + * @param ch the Unicode codepoint to convert + * @return the converted codepoint + */ +int uni_to_upper(int ch); + +/** + * Convert the given codepoint to its title case mapping. + * If there is none, return the codepoint. + * @param ch the Unicode codepoint to convert + * @return the converted codepoint + */ +int uni_to_title(int ch); + -#endif /* __CHARCLASS_H__ */ +#endif /* __UCD_H__ */ diff --git a/src/dom/uri.cpp b/src/dom/uri.cpp index 8b810e611..db45a2abf 100644 --- a/src/dom/uri.cpp +++ b/src/dom/uri.cpp @@ -31,7 +31,7 @@ #include "uri.h" -#include "charclass.h" +#include "ucd.h" #include #include diff --git a/src/dom/xmlreader.cpp b/src/dom/xmlreader.cpp index 32a97e0c0..c6eae8ba3 100644 --- a/src/dom/xmlreader.cpp +++ b/src/dom/xmlreader.cpp @@ -30,7 +30,7 @@ #include "xmlreader.h" -#include "charclass.h" +#include "ucd.h" #include "domimpl.h" #include diff --git a/src/dom/xpathparser.cpp b/src/dom/xpathparser.cpp index f2e000e6d..bbe0fcc40 100644 --- a/src/dom/xpathparser.cpp +++ b/src/dom/xpathparser.cpp @@ -28,7 +28,7 @@ */ -#include "charclass.h" +#include "ucd.h" #include "xpathparser.h" -- 2.30.2