1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * Authors:
11 * Bob Jamison
12 *
13 * Copyright (C) 2005 Bob Jamison
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
32 #include "xmlreader.h"
33 #include "charclass.h"
34 #include "svgimpl.h"
36 #include <stdarg.h>
38 namespace org
39 {
40 namespace w3c
41 {
42 namespace dom
43 {
46 //#########################################################################
47 //# E N T I T Y T A B L E
48 //#########################################################################
/**
 * One row of the predefined-entity lookup table.
 *
 *   escape       - the entity reference as it is spelled in the XML text
 *   escapeLength - number of characters in 'escape'
 *   value        - the literal text the reference stands for
 *
 * The members stay plain 'char *' because 'escape' is handed to
 * XmlReader::match(), which takes a 'char *'.  Nothing writes
 * through these pointers.
 */
struct EntityInfo
{
    char *escape;
    int   escapeLength;
    char *value;
};


/**
 * The entity references understood by XmlReader::parseEntity().
 * The list is ended by a row whose 'escape' is NULL.
 */
static EntityInfo entityTable[] =
{
    { "&amp;"  , 5 , "&"  },
    { "&lt;"   , 4 , "<"  },
    { "&gt;"   , 4 , ">"  },
    { "&apos;" , 6 , "'"  },
    { "&quot;" , 6 , "\"" },
    { NULL     , 0 , "\0" }
};
69 //#########################################################################
70 //# M E S S A G E S
71 //#########################################################################
74 /**
75 *
76 */
77 void XmlReader::error(char *fmt, ...)
78 {
79 va_list args;
80 fprintf(stderr, "XmlReader:error at line %d, column %d:", lineNr, colNr);
81 va_start(args, fmt);
82 vfprintf(stderr, fmt, args);
83 va_end(args) ;
84 fprintf(stderr, "\n");
85 }
89 //#########################################################################
90 //# U T I L I T Y
91 //#########################################################################
/**
 * Remove leading and trailing characters that are not printable,
 * non-blank ASCII (i.e. anything <= ' ' or > 126) from str, in place.
 * Characters in the interior of the string are left alone.  A string
 * that holds nothing else becomes empty.
 */
static void trim(DOMString &str)
{
    int len = str.size();
    if (len < 1)
        return;

    // index of the first character worth keeping (len if there is none)
    int start = 0;
    for ( ; start < len ; start++)
    {
        int ch = str[start];
        if (ch > ' ' && ch <= 126)
            break;
    }

    if (start >= len)
    {
        // nothing but strippable characters
        str = DOMString();
        return;
    }

    // index of the last character worth keeping; str[start] is known
    // to qualify, so the scan cannot run below start
    int end = len - 1;
    for ( ; end > start ; end--)
    {
        int ch = str[end];
        if (ch > ' ' && ch <= 126)
            break;
    }

    // substr() takes (position, count), not (first, last)
    str = str.substr(start, end - start + 1);
}
119 //#########################################################################
120 //# P A R S I N G
121 //#########################################################################
123 /**
124 * Get the character at the position and record the fact
125 */
126 int XmlReader::get(int p)
127 {
128 if (p >= len)
129 return -1;
130 int ch = parsebuf[p];
131 //printf("%c", ch);
132 if (ch == '\n' || ch == '\r')
133 {
134 colNr = 0;
135 lineNr++;
136 }
137 else
138 colNr++;
139 return ch;
140 }
142 /**
143 * Look at the character at the position, but don't note the fact
144 */
145 int XmlReader::peek(int p)
146 {
147 if (p >= len)
148 return -1;
149 int ch = parsebuf[p];
150 return ch;
151 }
154 /**
155 * Test if the given substring exists at the given position
156 * in parsebuf. Use peek() in case of out-of-bounds
157 */
158 bool XmlReader::match(int pos, char *str)
159 {
160 while (*str)
161 {
162 if (peek(pos++) != *str++)
163 return false;
164 }
165 return true;
166 }
170 /**
171 * Test if the given substring exists at the given position
172 * in a given buffer
173 */
174 /*
175 static bool bufMatch(const DOMString &buf, int pos, char *str)
176 {
177 while (*str)
178 {
179 if (buf[pos++] != *str++)
180 return false;
181 }
182 return true;
183 }
184 */
187 /**
188 *
189 */
190 int XmlReader::skipwhite(int p)
191 {
192 while (p < len)
193 {
194 int b = get(p);
195 if (!isWhitespace(b))
196 break;
197 p++;
198 }
199 return p;
200 }
202 /**
203 * modify this to allow all chars for an element or attribute name
204 */
205 int XmlReader::getWord(int p, DOMString &result)
206 {
207 while (p<len)
208 {
209 int b = get(p);
210 if (b<=' ' || b=='/' || b=='>' || b=='=')
211 break;
212 result.push_back(b);
213 p++;
214 }
215 return p;
216 }
218 /**
219 * get a name and prefix, if any
220 */
221 int XmlReader::getPrefixedWord(int p, DOMString &prefix,
222 DOMString &shortWord, DOMString &fullWord)
223 {
224 while (p<len)
225 {
226 int b = get(p);
227 if (b<=' ' || b=='/' || b=='>' || b=='=')
228 break;
229 else if (b == ':')
230 {
231 prefix = shortWord;
232 shortWord = "";
233 }
234 else
235 shortWord.push_back(b);
236 p++;
237 }
238 if (prefix.size() > 0)
239 fullWord = prefix + ":" + shortWord;
240 else
241 fullWord = shortWord;
242 return p;
243 }
246 /**
247 * Assume that we are starting on a quote. Ends on the char
248 * after the final '"'
249 */
250 int XmlReader::getQuoted(int p0, DOMString &result)
251 {
253 int p = p0;
255 if (peek(p)!='"' && peek(p)!='\'')
256 return p0;
258 int b = get(p++); //go to next char
260 DOMString buf;
262 while (p<len )
263 {
264 b = get(p++);
265 if (b=='"' || b=='\'')
266 break;
267 else if (b=='&')
268 {
269 p = parseEntity(p, result);
270 if (p < 0)
271 return p0;
272 }
273 else
274 {
275 buf.push_back(b);
276 }
277 }
279 //printf("quoted text:'%s'\n", buf.c_str());
281 result.append(buf);
283 return p;
284 }
288 /**
289 * Parse a <!xml> tag. Node may be null. Assumes current char is '<'
290 * ends on char after '>'
291 */
292 int XmlReader::parseVersion(int p0)
293 {
294 int p = p0;
296 if (!match(p, "<?xml"))
297 return p0;
299 p += 5;
300 colNr += 5;
302 bool quickCloseDummy;
303 Node *node = new NodeImpl();
304 int p2 = parseAttributes(p, node, &quickCloseDummy);
305 if (p2 < p)
306 {
307 delete node;
308 return p0;
309 }
310 p = p2;
312 //get the attributes that we need
313 NamedNodeMap attributes = node->getAttributes();
314 Node *attr = attributes.getNamedItem("version");
315 if (attr)
316 document->setXmlVersion(attr->getNodeValue());
317 attr = attributes.getNamedItem("encoding");
318 if (attr)
319 { /*document->setXmlEncoding(attr->getNodeValue());*/ }
320 attr = attributes.getNamedItem("standalone");
321 if (attr)
322 document->setXmlStandalone((attr->getNodeValue() == "yes"));
323 delete node;
325 //#now we should be pointing at '?>'
326 if (!match(p, "?>"))
327 {
328 return p0;
329 }
331 //skip over '?>'
332 get(p++);
333 get(p++);
335 return p;
336 }
339 /**
340 * Parse a <!DOCTYPE> tag. doctype may be null. Expects '<'
341 * on start. Ends pointing at char after '>'
342 */
343 int XmlReader::parseDoctype(int p0)
344 {
345 int p = p0;
347 if (!match(p, "<!DOCTYPE"))
348 return p0;
350 p += 9;
351 colNr += 9;
353 DocumentType *doctype = document->getDoctype();
354 if (!doctype)
355 return p0;
358 //### get the root name of the document
359 p = skipwhite(p);
360 DOMString rootName;
361 int p2 = getWord(p, rootName);
362 if (p2 <= p)
363 return p0;
364 p = p2;
365 //printf("doctype root '%s'\n", rootName.c_str());
368 while (p < len)
369 {
370 p = skipwhite(p);
371 if (peek(p) == '>')
372 break;
373 else if (peek(p) == '[') //just ignore 'internal' [] stuff
374 {
375 while (p < len)
376 {
377 int ch = get(p++);
378 if (ch == ']')
379 break;
380 }
381 p++;
382 }
383 else if (match(p, "PUBLIC"))
384 {
385 p += 6;
386 colNr += 6;
387 p = skipwhite(p);
388 DOMString pubIdLiteral;
389 int p2 = getQuoted(p, pubIdLiteral);
390 if (p2 <= p)
391 return p0;
392 p = p2;
393 p = skipwhite(p);
394 DOMString systemLiteral;
395 p2 = getQuoted(p, systemLiteral);
396 if (p2 <= p)
397 return p0;
398 p = p2;
399 //printf("PUBLIC \"%s\" \"%s\" \n",
400 // pubIdLiteral.c_str(), systemLiteral.c_str());
401 }
402 else if (match(p, "SYSTEM"))
403 {
404 p += 6;
405 colNr += 6;
406 p = skipwhite(p);
407 DOMString systemLiteral;
408 int p2 = getQuoted(p, systemLiteral);
409 if (p2 <= p)
410 return p0;
411 p = p2;
412 //printf("SYSTEM \"%s\" \n", systemLiteral.c_str());
413 }
414 }
417 //skip over '>'
418 get(p++);
420 return p;
421 }
425 /**
426 * Expects '<' on startup, ends on char after '>'
427 */
428 int XmlReader::parseComment(int p0, Comment *comment)
429 {
430 int p = p0;
432 if (!match(p, "<!--"))
433 return p0;
435 colNr += 4;
436 p += 4;
438 DOMString buf;
440 while (p<len-3)
441 {
442 if (match(p, "-->"))
443 {
444 p += 3;
445 colNr += 3;
446 break;
447 }
448 int ch = get(p++);
449 buf.push_back(ch);
450 }
452 comment->setNodeValue(buf);
454 return p;
455 }
459 /**
460 *
461 */
462 int XmlReader::parseCDATA(int p0, CDATASection *cdata)
463 {
465 int p = p0;
467 if (!match(p, "<![CDATA["))
468 return p0;
470 colNr += 9;
471 p += 9;
473 DOMString buf;
475 while (p<len)
476 {
477 if (match(p, "]]>"))
478 {
479 p +=3;
480 colNr += 3;
481 break;
482 }
483 int ch = get(p++);
484 buf.push_back(ch);
485 }
487 /*printf("Got CDATA:%s\n",buf.c_str());*/
488 cdata->setNodeValue(buf);
490 return p;
491 }
495 /**
496 *
497 */
498 int XmlReader::parseText(int p0, Text *text)
499 {
501 int p = p0;
503 DOMString buf;
505 while (p<len)
506 {
507 if (peek(p) == '&')
508 {
509 p = parseEntity(p, buf);
510 if (p < 0) //error?
511 return p0;
512 }
513 else if (peek(p) == '<')
514 {
515 break;
516 }
517 else
518 {
519 int ch = get(p++);
520 buf.push_back(ch);
521 }
522 }
524 /*printf("Got Text:%s\n",buf.c_str());*/
525 text->setNodeValue(buf);
527 return p;
528 }
534 /**
535 * Parses attributes of a node. Should end pointing at either the
536 * '?' of a version or doctype tag, or a '>' of a normal tag
537 */
538 int XmlReader::parseAttributes(int p0, Node *node, bool *quickClose)
539 {
540 *quickClose = false;
542 int p = p0;
544 NamedNodeMap attributes;
546 while (p<len)
547 {
548 /*printf("ch:%c\n",ch);*/
549 p = skipwhite(p);
550 int ch = get(p);
552 /*printf("ch:%c\n",ch);*/
553 if (ch == '?' || ch == '>')//done
554 break;
555 else if (ch=='/' && p<len+1)
556 {
557 p++;
558 p = skipwhite(p);
559 ch = peek(p);
560 if (ch == '>')
561 {
562 p++;
563 *quickClose = true;
564 /*printf("quick close\n");*/
565 return p;
566 }
567 }
568 DOMString shortName;
569 DOMString prefix;
570 DOMString qualifiedName;
571 int p2 = getPrefixedWord(p, prefix, shortName, qualifiedName);
572 if (p2 <= p)
573 break;
575 /*printf("name:%s",buf);*/
576 p = p2;
577 p = skipwhite(p);
578 ch = get(p);
579 /*printf("ch:%c\n",ch);*/
580 if (ch != '=')
581 break;
582 p++;
583 p = skipwhite(p);
584 /*ch = parsebuf[p];*/
585 /*printf("ch:%c\n",ch);*/
586 DOMString attrValue;
587 p2 = getQuoted(p, attrValue);
588 p = p2;
589 /*printf("name:'%s' value:'%s'\n",buf,buf2);*/
591 DOMString namespaceURI = "";
592 if (prefix == "xmlns" || shortName == "xmlns")
593 namespaceURI = XMLNSNAME;
595 //## Now let us make the attribute and give it to the node
596 Attr *attr = document->createAttributeNS(namespaceURI, qualifiedName);
597 attr->setValue(attrValue);
598 node->getAttributes().setNamedItemNS(attr);
600 }//while p<len
602 return p;
603 }
605 /**
606 * Appends the value of an entity to the buffer
607 */
608 int XmlReader::parseEntity(int p0, DOMString &buf)
609 {
610 int p = p0;
611 for (EntityInfo *info = entityTable ; info->escape ; info++)
612 {
613 if (match(p, info->escape))
614 {
615 p += info->escapeLength;
616 colNr += info->escapeLength;
617 buf += info->value;
618 return p;
619 }
620 }
622 error("unterminated entity");
623 return -1;
624 }
627 //#########################################################################
628 //# P A R S E A N O D E
629 //#########################################################################
631 /**
632 * Parse as a document, preserving the original structure as much as
633 * possible
634 */
635 int XmlReader::parseNode(int p0, Node *node, int depth)
636 {
638 int p = p0;
641 //### OPEN TAG
642 int ch = get(p++);
643 if (ch != '<')
644 return p0;
646 p = skipwhite(p);
647 DOMString openTagName;
648 DOMString openTagNamePrefix;
649 DOMString openTagQualifiedName;
650 int p2 = getPrefixedWord(p,openTagNamePrefix,
651 openTagName, openTagQualifiedName);
652 if (p2 <= p)
653 return p0;
654 p = p2;
655 p = skipwhite(p);
657 //printf("qualifiedName:%s\n", openTagQualifiedName.c_str());
658 DOMString namespaceURI = node->lookupNamespaceURI(openTagNamePrefix);
659 document->renameNode(node, namespaceURI, openTagQualifiedName);
661 //### ATTRIBUTES
662 bool quickClose;
663 p = parseAttributes(p, node, &quickClose);
664 if (quickClose) //trivial tag: <name/>
665 return p;
667 p++; //skip over '>'
670 DOMString nodeValue;
672 /* ### Get intervening data ### */
673 while (p<len && keepGoing)
674 {
675 //### COMMENT
676 if (match(p, "<!--"))
677 {
678 Comment *comment = document->createComment("");
679 p2 = parseComment(p, comment);
680 if (p2 <= p)
681 return p0;
682 p = p2;
683 if (parseAsData)
684 { //throw away
685 delete comment;
686 }
687 else
688 {
689 node->appendChild(comment);
690 }
691 }
692 //### VERSION
693 else if (match(p, "<?xml"))
694 {
695 p2 = parseVersion(p);
696 if (p2 <= p)
697 return p0;
698 }
699 //### DOCTYPE
700 else if (match(p, "<!DOCTYPE"))
701 {
702 p2 = parseDoctype(p);
703 if (p2 <= p)
704 return p0;
705 }
706 //### CDATA
707 else if (match(p, "<![CDATA["))
708 {
709 CDATASection *cdata = document->createCDATASection("");
710 p2 = parseCDATA(p, cdata);
711 if (p2 <= p)
712 return p0;
713 p = p2;
714 if (parseAsData)
715 {
716 nodeValue += cdata->getNodeValue();
717 delete cdata;
718 }
719 else
720 {
721 node->appendChild(cdata);
722 }
723 }
724 //### OPEN OR CLOSE TAG
725 else if (peek(p) == '<')
726 {
727 p2 = skipwhite(p+1);
728 if (peek(p2) =='/')
729 {
730 p = p2;
731 break;
732 }
733 else
734 {
735 /*Add element to tree*/
736 Element *elem = document->createElement(""); //fill in name later
737 node->appendChild(elem);
738 p2 = parseNode(p, elem, depth+1);
739 if (p2 <= p)
740 {
741 /*printf("problem on element:%ls. p2:%d p:%d\n",n->name, p2, p);*/
742 return p0;
743 }
744 p = p2;
745 }
746 }
747 //### TEXT
748 else
749 {
750 Text *text = document->createTextNode("");
751 p2 = parseText(p, text);
752 if (p2 <= p)
753 return p0;
754 p = p2;
755 if (parseAsData)
756 {
757 nodeValue += text->getNodeValue();
758 delete text;
759 }
760 else
761 {
762 node->appendChild(text);
763 }
764 }
766 }//while (p<len)
768 //printf("%d : nodeValue:'%s'\n", p, nodeValue.c_str());
769 trim(nodeValue);
770 node->setNodeValue(nodeValue);
772 //### get close tag. we should be pointing at '/'
773 p = skipwhite(p);
774 ch = get(p);
775 if (ch != '/')
776 {
777 error("no / on end tag");
778 return p0;
779 }
780 p++;
782 //### get word after '/'
783 p = skipwhite(p);
784 DOMString closeTagName;
785 DOMString closeTagNamePrefix;
786 DOMString closeTagQualifiedName;
787 p = getPrefixedWord(p, closeTagNamePrefix, closeTagName,
788 closeTagQualifiedName);
789 if (openTagQualifiedName != closeTagQualifiedName)
790 {
791 error("Mismatched closing tag. Expected </%S>. Got '%S'.",
792 openTagQualifiedName.c_str(), closeTagQualifiedName.c_str());
793 return p0;
794 }
795 p = skipwhite(p);
796 if (parsebuf[p] != '>')
797 {
798 error("no > on end tag");
799 return p0;
800 }
801 p++;
802 /*printf("close element:%ls\n",buf);*/
803 return p;
804 }
807 /**
808 *
809 */
810 org::w3c::dom::Document *
811 XmlReader::parse(const DOMString &buf, int bufferOffset, int parseLen)
812 {
813 len = parseLen;
814 parsebuf = buf;
816 DOMImplementationSourceImpl source;
817 DOMImplementation *domImpl = source.getDOMImplementation("");
819 keepGoing = true;
821 document = domImpl->createDocument("", "", NULL);
822 //document = new svg::SVGDocumentImpl(domImpl, "", "", NULL);
824 int p = bufferOffset;
825 int p2 = 0;
827 while (p<len && keepGoing)
828 {
829 p = skipwhite(p);
830 //### COMMENT
831 if (match(p, "<!--"))
832 {
833 Comment *comment = document->createComment("");
834 p2 = parseComment(p, comment);
835 if (p2 <= p)
836 return document;
837 p = p2;
838 if (parseAsData)
839 { //throw away
840 delete comment;
841 }
842 else
843 {
844 document->appendChild(comment);
845 }
846 }
847 //### VERSION
848 else if (match(p, "<?xml"))
849 {
850 p2 = parseVersion(p);
851 if (p2 <= p)
852 return document;
853 p = p2;
854 }
855 //### DOCTYPE
856 else if (match(p, "<!DOCTYPE"))
857 {
858 p2 = parseDoctype(p);
859 if (p2 <= p)
860 return document;
861 p = p2;
862 }
863 else
864 {
865 break;
866 }
867 }
869 p = skipwhite(p);
870 p = parseNode(p, document->getDocumentElement(), 0);
872 keepGoing = false;
874 return document;
875 }
878 /**
879 *
880 */
881 org::w3c::dom::Document *
882 XmlReader::parse(const DOMString &str)
883 {
885 Document *doc = parse(str, 0, str.size());
886 doc->normalizeDocument();
888 return doc;
889 }
891 /**
892 *
893 */
894 org::w3c::dom::Document *
895 XmlReader::parseFile(char *fileName)
896 {
898 DOMString buf = loadFile(fileName);
900 Document *doc = parse(buf, 0, buf.size());
902 return doc;
903 }
907 //#########################################################################
908 //# S T R E A M R E A D I N G
909 //#########################################################################
911 /**
912 *
913 */
914 org::w3c::dom::DOMString
915 XmlReader::loadFile(char *fileName)
916 {
918 if (!fileName)
919 return NULL;
920 FILE *f = fopen(fileName, "rb");
921 if (!f)
922 return NULL;
924 DOMString buf;
925 while (!feof(f))
926 {
927 int ch = fgetc(f);
928 if (ch<0)
929 break;
930 buf.push_back(ch);
931 }
932 fclose(f);
934 return buf;
935 }
938 //#########################################################################
939 //# C O N S T R U C T O R / D E S T R U C T O R
940 //#########################################################################
943 /**
944 *
945 */
946 XmlReader::XmlReader()
947 {
948 len = 0;
949 lineNr = 1;
950 colNr = 0;
951 parseAsData = false;
952 keepGoing = false;
953 }
955 /**
956 *
957 */
958 XmlReader::XmlReader(bool parseAsDataArg)
959 {
960 len = 0;
961 lineNr = 1;
962 colNr = 0;
963 parseAsData = parseAsDataArg;
964 keepGoing = false;
965 }
/**
 * Destructor.  The body is empty: nothing is released here, and in
 * particular the document handed back by parse() is not deleted.
 */
XmlReader::~XmlReader()
{
}
977 } //namespace dom
978 } //namespace w3c
979 } //namespace org
982 //#########################################################################
983 //# E N D O F F I L E
984 //#########################################################################