1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * Authors:
11 * Bob Jamison
12 *
13 * Copyright (C) 2005-2008 Bob Jamison
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
32 #include "xmlreader.h"
33 #include "ucd.h"
34 #include "domimpl.h"
36 #include <stdio.h>
37 #include <stdarg.h>
39 namespace org
40 {
41 namespace w3c
42 {
43 namespace dom
44 {
47 //#########################################################################
48 //# E N T I T Y T A B L E
49 //#########################################################################
/**
 * One row of the predefined-entity lookup table.
 *
 * escape       - the entity reference as it appears in the document,
 *                including the leading '&' and the trailing ';'
 * escapeLength - number of characters in 'escape'; the parser advances
 *                by exactly this many characters after a match, so it
 *                must always equal strlen(escape)
 * value        - the replacement text appended to the output
 */
struct EntityInfo
{
    const char *escape;
    int         escapeLength;
    const char *value;
};


/**
 * The five entities predefined by XML.  The list is terminated by an
 * entry whose 'escape' member is NULL.
 */
static EntityInfo entityTable[] =
{
    { "&amp;"  , 5 , "&"  },
    { "&lt;"   , 4 , "<"  },
    { "&gt;"   , 4 , ">"  },
    { "&apos;" , 6 , "'"  },
    { "&quot;" , 6 , "\"" },
    { NULL     , 0 , "\0" }
};
70 //#########################################################################
71 //# M E S S A G E S
72 //#########################################################################
75 /**
76 *
77 */
78 void XmlReader::error(const char *fmt, ...)
79 {
80 va_list args;
81 fprintf(stderr, "XmlReader:error at line %d, column %d:", lineNr, colNr);
82 va_start(args, fmt);
83 vfprintf(stderr, fmt, args);
84 va_end(args) ;
85 fprintf(stderr, "\n");
86 }
90 //#########################################################################
91 //# U T I L I T Y
92 //#########################################################################
94 static void trim(DOMString &str)
95 {
96 int len = str.size();
97 if (len<1)
98 return;
100 int start = 0;
101 int end = 0;
102 for (start=0 ; start<len ; start++)
103 {
104 int ch = str[start];
105 if (ch<=' ' || ch>126)
106 break;
107 }
108 for (end=len-1 ; end>=0 ; end--)
109 {
110 int ch = str[end];
111 if (ch<=' ' || ch>126)
112 break;
113 }
114 if (start<end)
115 {
116 str = str.substr(start, end+1);
117 }
118 }
120 //#########################################################################
121 //# P A R S I N G
122 //#########################################################################
124 /**
125 * Get the character at the position and record the fact
126 */
127 int XmlReader::get(int p)
128 {
129 if (p >= len)
130 return -1;
131 int ch = parsebuf[p];
132 //printf("%c", ch);
133 if (ch == '\n' || ch == '\r')
134 {
135 colNr = 0;
136 lineNr++;
137 }
138 else
139 colNr++;
140 return ch;
141 }
143 /**
144 * Look at the character at the position, but don't note the fact
145 */
146 int XmlReader::peek(int p)
147 {
148 if (p >= len)
149 return -1;
150 int ch = parsebuf[p];
151 return ch;
152 }
155 /**
156 * Test if the given substring exists at the given position
157 * in parsebuf. Use peek() in case of out-of-bounds
158 */
159 bool XmlReader::match(int pos, const char *str)
160 {
161 while (*str)
162 {
163 if (peek(pos++) != *str++)
164 return false;
165 }
166 return true;
167 }
171 /**
172 * Test if the given substring exists at the given position
173 * in a given buffer
174 */
175 /*
176 static bool bufMatch(const DOMString &buf, int pos, char *str)
177 {
178 while (*str)
179 {
180 if (buf[pos++] != *str++)
181 return false;
182 }
183 return true;
184 }
185 */
188 /**
189 *
190 */
191 int XmlReader::skipwhite(int p)
192 {
193 while (p < len)
194 {
195 int b = get(p);
196 if (!uni_is_space(b))
197 break;
198 p++;
199 }
200 return p;
201 }
203 /**
204 * modify this to allow all chars for an element or attribute name
205 */
206 int XmlReader::getWord(int p, DOMString &result)
207 {
208 while (p<len)
209 {
210 int b = get(p);
211 if (b<=' ' || b=='/' || b=='>' || b=='=')
212 break;
213 result.push_back((XMLCh)b);
214 p++;
215 }
216 return p;
217 }
219 /**
220 * get a name and prefix, if any
221 */
222 int XmlReader::getPrefixedWord(int p, DOMString &prefix,
223 DOMString &shortWord, DOMString &fullWord)
224 {
225 while (p<len)
226 {
227 int b = get(p);
228 if (b<=' ' || b=='/' || b=='>' || b=='=')
229 break;
230 else if (b == ':')
231 {
232 prefix = shortWord;
233 shortWord = "";
234 }
235 else
236 shortWord.push_back((XMLCh)b);
237 p++;
238 }
239 if (prefix.size() > 0)
240 fullWord = prefix + ":" + shortWord;
241 else
242 fullWord = shortWord;
243 return p;
244 }
247 /**
248 * Assume that we are starting on a quote. Ends on the char
249 * after the final '"'
250 */
251 int XmlReader::getQuoted(int p0, DOMString &result)
252 {
254 int p = p0;
256 if (peek(p)!='"' && peek(p)!='\'')
257 return p0;
259 int b = get(p++); //go to next char
261 DOMString buf;
263 while (p<len )
264 {
265 b = get(p++);
266 if (b=='"' || b=='\'')
267 break;
268 else if (b=='&')
269 {
270 p = parseEntity(p, result);
271 if (p < 0)
272 return p0;
273 }
274 else
275 {
276 buf.push_back((XMLCh)b);
277 }
278 }
280 //printf("quoted text:'%s'\n", buf.c_str());
282 result.append(buf);
284 return p;
285 }
289 /**
290 * Parse a <!xml> tag. Node may be null. Assumes current char is '<'
291 * ends on char after '>'
292 */
293 int XmlReader::parseVersion(int p0)
294 {
295 int p = p0;
297 if (!match(p, "<?xml"))
298 return p0;
300 p += 5;
301 colNr += 5;
303 bool quickCloseDummy;
304 NodePtr node = new NodeImpl();
305 int p2 = parseAttributes(p, node, &quickCloseDummy);
306 if (p2 < p)
307 {
308 //smart ptr!!do not delete node;
309 return p0;
310 }
311 p = p2;
313 //get the attributes that we need
314 NamedNodeMap attributes = node->getAttributes();
315 NodePtr attr = attributes.getNamedItem("version");
316 if (attr.get())
317 document->setXmlVersion(attr->getNodeValue());
318 attr = attributes.getNamedItem("encoding");
319 if (attr.get())
320 { /*document->setXmlEncoding(attr->getNodeValue());*/ }
321 attr = attributes.getNamedItem("standalone");
322 if (attr.get())
323 document->setXmlStandalone((attr->getNodeValue() == "yes"));
325 //#now we should be pointing at '?>'
326 if (!match(p, "?>"))
327 {
328 return p0;
329 }
331 //skip over '?>'
332 get(p++);
333 get(p++);
335 return p;
336 }
339 /**
340 * Parse a <!DOCTYPE> tag. doctype may be null. Expects '<'
341 * on start. Ends pointing at char after '>'
342 */
343 int XmlReader::parseDoctype(int p0)
344 {
345 int p = p0;
347 if (!match(p, "<!DOCTYPE"))
348 return p0;
350 p += 9;
351 colNr += 9;
353 DocumentTypePtr doctype = document->getDoctype();
354 if (!doctype)
355 return p0;
358 //### get the root name of the document
359 p = skipwhite(p);
360 DOMString rootName;
361 int p2 = getWord(p, rootName);
362 if (p2 <= p)
363 return p0;
364 p = p2;
365 //printf("doctype root '%s'\n", rootName.c_str());
368 while (p < len)
369 {
370 p = skipwhite(p);
371 if (peek(p) == '>')
372 break;
373 else if (peek(p) == '[') //just ignore 'internal' [] stuff
374 {
375 while (p < len)
376 {
377 int ch = get(p++);
378 if (ch == ']')
379 break;
380 }
381 p++;
382 }
383 else if (match(p, "PUBLIC"))
384 {
385 p += 6;
386 colNr += 6;
387 p = skipwhite(p);
388 DOMString pubIdLiteral;
389 int p2 = getQuoted(p, pubIdLiteral);
390 if (p2 <= p)
391 return p0;
392 p = p2;
393 p = skipwhite(p);
394 DOMString systemLiteral;
395 p2 = getQuoted(p, systemLiteral);
396 if (p2 <= p)
397 return p0;
398 p = p2;
399 //printf("PUBLIC \"%s\" \"%s\" \n",
400 // pubIdLiteral.c_str(), systemLiteral.c_str());
401 }
402 else if (match(p, "SYSTEM"))
403 {
404 p += 6;
405 colNr += 6;
406 p = skipwhite(p);
407 DOMString systemLiteral;
408 int p2 = getQuoted(p, systemLiteral);
409 if (p2 <= p)
410 return p0;
411 p = p2;
412 //printf("SYSTEM \"%s\" \n", systemLiteral.c_str());
413 }
414 }
417 //skip over '>'
418 get(p++);
420 return p;
421 }
425 /**
426 * Expects '<' on startup, ends on char after '>'
427 */
428 int XmlReader::parseComment(int p0, CommentPtr comment)
429 {
430 int p = p0;
432 if (!match(p, "<!--"))
433 return p0;
435 colNr += 4;
436 p += 4;
438 DOMString buf;
440 while (p<len-3)
441 {
442 if (match(p, "-->"))
443 {
444 p += 3;
445 colNr += 3;
446 break;
447 }
448 int ch = get(p++);
449 buf.push_back((XMLCh)ch);
450 }
452 comment->setNodeValue(buf);
454 return p;
455 }
459 /**
460 *
461 */
462 int XmlReader::parseCDATA(int p0, CDATASectionPtr cdata)
463 {
465 int p = p0;
467 if (!match(p, "<![CDATA["))
468 return p0;
470 colNr += 9;
471 p += 9;
473 DOMString buf;
475 while (p<len)
476 {
477 if (match(p, "]]>"))
478 {
479 p +=3;
480 colNr += 3;
481 break;
482 }
483 int ch = get(p++);
484 buf.push_back((XMLCh)ch);
485 }
487 /*printf("Got CDATA:%s\n",buf.c_str());*/
488 cdata->setNodeValue(buf);
490 return p;
491 }
495 /**
496 *
497 */
498 int XmlReader::parseText(int p0, TextPtr text)
499 {
501 int p = p0;
503 DOMString buf;
505 while (p<len)
506 {
507 if (peek(p) == '&')
508 {
509 p = parseEntity(p, buf);
510 if (p < 0) //error?
511 return p0;
512 }
513 else if (peek(p) == '<')
514 {
515 break;
516 }
517 else
518 {
519 int ch = get(p++);
520 buf.push_back((XMLCh)ch);
521 }
522 }
524 /*printf("Got Text:%s\n",buf.c_str());*/
525 text->setNodeValue(buf);
527 return p;
528 }
534 /**
535 * Parses attributes of a node. Should end pointing at either the
536 * '?' of a version or doctype tag, or a '>' of a normal tag
537 */
538 int XmlReader::parseAttributes(int p0, NodePtr node, bool *quickClose)
539 {
540 *quickClose = false;
542 int p = p0;
544 NamedNodeMap attributes;
546 while (p<len)
547 {
548 /*printf("ch:%c\n",ch);*/
549 p = skipwhite(p);
550 int ch = get(p);
552 /*printf("ch:%c\n",ch);*/
553 if (ch == '?' || ch == '>')//done
554 break;
555 else if (ch=='/' && p<len+1)
556 {
557 p++;
558 p = skipwhite(p);
559 ch = peek(p);
560 if (ch == '>')
561 {
562 p++;
563 *quickClose = true;
564 /*printf("quick close\n");*/
565 return p;
566 }
567 }
568 DOMString shortName;
569 DOMString prefix;
570 DOMString qualifiedName;
571 int p2 = getPrefixedWord(p, prefix, shortName, qualifiedName);
572 if (p2 <= p)
573 break;
575 /*printf("name:%s",buf);*/
576 p = p2;
577 p = skipwhite(p);
578 ch = get(p);
579 /*printf("ch:%c\n",ch);*/
580 if (ch != '=')
581 break;
582 p++;
583 p = skipwhite(p);
584 /*ch = parsebuf[p];*/
585 /*printf("ch:%c\n",ch);*/
586 DOMString attrValue;
587 p2 = getQuoted(p, attrValue);
588 p = p2;
589 /*printf("name:'%s' value:'%s'\n",buf,buf2);*/
591 DOMString namespaceURI = "";
592 if (prefix == "xmlns" || shortName == "xmlns")
593 namespaceURI = XMLNSNAME;
595 //## Now let us make the attribute and give it to the node
596 AttrPtr attr = document->createAttributeNS(namespaceURI, qualifiedName);
597 attr->setValue(attrValue);
598 node->getAttributes().setNamedItemNS(attr);
600 }//while p<len
602 return p;
603 }
605 /**
606 * Appends the value of an entity to the buffer
607 */
608 int XmlReader::parseEntity(int p0, DOMString &buf)
609 {
610 int p = p0;
611 for (EntityInfo *info = entityTable ; info->escape ; info++)
612 {
613 if (match(p, info->escape))
614 {
615 p += info->escapeLength;
616 colNr += info->escapeLength;
617 buf += info->value;
618 return p;
619 }
620 }
622 error("unterminated entity");
623 return -1;
624 }
627 //#########################################################################
628 //# P A R S E A N O D E
629 //#########################################################################
631 /**
632 * Parse as a document, preserving the original structure as much as
633 * possible
634 */
635 int XmlReader::parseNode(int p0, NodePtr node, int depth)
636 {
638 int p = p0;
641 //### OPEN TAG
642 int ch = get(p++);
643 if (ch != '<')
644 return p0;
646 p = skipwhite(p);
647 DOMString openTagName;
648 DOMString openTagNamePrefix;
649 DOMString openTagQualifiedName;
650 int p2 = getPrefixedWord(p,openTagNamePrefix,
651 openTagName, openTagQualifiedName);
652 if (p2 <= p)
653 return p0;
654 p = p2;
655 p = skipwhite(p);
657 //printf("qualifiedName:%s\n", openTagQualifiedName.c_str());
658 DOMString namespaceURI = node->lookupNamespaceURI(openTagNamePrefix);
659 document->renameNode(node, namespaceURI, openTagQualifiedName);
661 //### ATTRIBUTES
662 bool quickClose;
663 p = parseAttributes(p, node, &quickClose);
664 if (quickClose) //trivial tag: <name/>
665 return p;
667 p++; //skip over '>'
670 DOMString nodeValue;
672 /* ### Get intervening data ### */
673 while (p<len && keepGoing)
674 {
675 //### COMMENT
676 if (match(p, "<!--"))
677 {
678 CommentPtr comment = document->createComment("");
679 p2 = parseComment(p, comment);
680 if (p2 <= p)
681 return p0;
682 p = p2;
683 if (parseAsData)
684 { //throw away
685 //delete comment;
686 }
687 else
688 {
689 node->appendChild(comment);
690 }
691 }
692 //### VERSION
693 else if (match(p, "<?xml"))
694 {
695 p2 = parseVersion(p);
696 if (p2 <= p)
697 return p0;
698 }
699 //### DOCTYPE
700 else if (match(p, "<!DOCTYPE"))
701 {
702 p2 = parseDoctype(p);
703 if (p2 <= p)
704 return p0;
705 }
706 //### CDATA
707 else if (match(p, "<![CDATA["))
708 {
709 CDATASectionPtr cdata = document->createCDATASection("");
710 p2 = parseCDATA(p, cdata);
711 if (p2 <= p)
712 return p0;
713 p = p2;
714 if (parseAsData)
715 {
716 nodeValue += cdata->getNodeValue();
717 //delete cdata;
718 }
719 else
720 {
721 node->appendChild(cdata);
722 }
723 }
724 //### OPEN OR CLOSE TAG
725 else if (peek(p) == '<')
726 {
727 p2 = skipwhite(p+1);
728 if (peek(p2) =='/')
729 {
730 p = p2;
731 break;
732 }
733 else
734 {
735 /*Add element to tree*/
736 ElementPtr elem = document->createElement(""); //fill in name later
737 node->appendChild(elem);
738 p2 = parseNode(p, elem, depth+1);
739 if (p2 <= p)
740 {
741 /*printf("problem on element:%ls. p2:%d p:%d\n",n->name, p2, p);*/
742 return p0;
743 }
744 p = p2;
745 }
746 }
747 //### TEXT
748 else
749 {
750 TextPtr text = document->createTextNode("");
751 p2 = parseText(p, text);
752 if (p2 <= p)
753 return p0;
754 p = p2;
755 if (parseAsData)
756 {
757 nodeValue += text->getNodeValue();
758 //delete text;
759 }
760 else
761 {
762 node->appendChild(text);
763 }
764 }
766 }//while (p<len)
768 //printf("%d : nodeValue:'%s'\n", p, nodeValue.c_str());
769 trim(nodeValue);
770 node->setNodeValue(nodeValue);
772 //### get close tag. we should be pointing at '/'
773 p = skipwhite(p);
774 ch = get(p);
775 if (ch != '/')
776 {
777 error("no / on end tag");
778 return p0;
779 }
780 p++;
782 //### get word after '/'
783 p = skipwhite(p);
784 DOMString closeTagName;
785 DOMString closeTagNamePrefix;
786 DOMString closeTagQualifiedName;
787 p = getPrefixedWord(p, closeTagNamePrefix, closeTagName,
788 closeTagQualifiedName);
789 if (openTagQualifiedName != closeTagQualifiedName)
790 {
791 error("Mismatched closing tag. Expected </%s>. Got '%s'.",
792 openTagQualifiedName.c_str(), closeTagQualifiedName.c_str());
793 return p0;
794 }
795 p = skipwhite(p);
796 if (parsebuf[p] != '>')
797 {
798 error("no > on end tag");
799 return p0;
800 }
801 p++;
802 /*printf("close element:%ls\n",buf);*/
803 return p;
804 }
807 /**
808 *
809 */
810 org::w3c::dom::DocumentPtr
811 XmlReader::parse(const DOMString &buf, int bufferOffset, int parseLen)
812 {
813 len = parseLen;
814 parsebuf = buf;
816 keepGoing = true;
818 DOMImplementationSourceImpl source;
819 DOMImplementation *domImpl = source.getDOMImplementation("");
821 document = domImpl->createDocument("", "", NULL);
822 //document = new svg::SVGDocumentImpl(domImpl, "", "", NULL);
824 int p = bufferOffset;
825 int p2 = 0;
827 while (p<len && keepGoing)
828 {
829 p = skipwhite(p);
830 //### COMMENT
831 if (match(p, "<!--"))
832 {
833 CommentPtr comment = document->createComment("");
834 p2 = parseComment(p, comment);
835 if (p2 <= p)
836 return document;
837 p = p2;
838 if (parseAsData)
839 { //throw away
840 //delete comment;
841 }
842 else
843 {
844 document->appendChild(comment);
845 }
846 }
847 //### VERSION
848 else if (match(p, "<?xml"))
849 {
850 p2 = parseVersion(p);
851 if (p2 <= p)
852 return document;
853 p = p2;
854 }
855 //### DOCTYPE
856 else if (match(p, "<!DOCTYPE"))
857 {
858 p2 = parseDoctype(p);
859 if (p2 <= p)
860 return document;
861 p = p2;
862 }
863 else
864 {
865 break;
866 }
867 }
869 p = skipwhite(p);
870 p = parseNode(p, document->getDocumentElement(), 0);
872 keepGoing = false;
874 return document;
875 }
878 /**
879 *
880 */
881 org::w3c::dom::DocumentPtr
882 XmlReader::parse(const DOMString &str)
883 {
885 DocumentPtr doc = parse(str, 0, str.size());
886 if (!doc)
887 return doc;
888 doc->normalizeDocument();
889 return doc;
890 }
892 /**
893 *
894 */
895 org::w3c::dom::DocumentPtr
896 XmlReader::parseFile(const DOMString &fileName)
897 {
898 DocumentPtr doc;
900 DOMString buf = loadFile(fileName);
901 if (buf.size() == 0)
902 return doc; /*doc still null*/
904 doc = parse(buf, 0, buf.size());
906 return doc;
907 }
911 //#########################################################################
912 //# S T R E A M R E A D I N G
913 //#########################################################################
915 /**
916 *
917 */
918 org::w3c::dom::DOMString
919 XmlReader::loadFile(const DOMString &fileName)
920 {
921 DOMString buf;
923 if (fileName.size() == 0)
924 return buf;
925 FILE *f = fopen(fileName.c_str(), "rb");
926 if (!f)
927 {
928 //error here
929 return buf;
930 }
932 while (!feof(f))
933 {
934 int ch = fgetc(f);
935 if (ch<0)
936 break;
937 buf.push_back((XMLCh)ch);
938 }
939 fclose(f);
941 return buf;
942 }
945 //#########################################################################
946 //# C O N S T R U C T O R / D E S T R U C T O R
947 //#########################################################################
950 /**
951 *
952 */
953 XmlReader::XmlReader()
954 {
955 len = 0;
956 lineNr = 1;
957 colNr = 0;
958 parseAsData = false;
959 keepGoing = false;
960 }
962 /**
963 *
964 */
965 XmlReader::XmlReader(bool parseAsDataArg)
966 {
967 len = 0;
968 lineNr = 1;
969 colNr = 0;
970 parseAsData = parseAsDataArg;
971 keepGoing = false;
972 }
976 /**
977 *
978 */
979 XmlReader::~XmlReader()
980 {
981 }
984 } //namespace dom
985 } //namespace w3c
986 } //namespace org
989 //#########################################################################
990 //# E N D O F F I L E
991 //#########################################################################