d475afdea7f8ec46df9aec385f56d387e277bdb2
1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * Authors:
11 * Bob Jamison
12 *
13 * Copyright (C) 2005 Bob Jamison
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
32 #include "xmlreader.h"
33 #include "svgimpl.h"
35 #include <stdarg.h>
37 namespace org {
38 namespace w3c {
39 namespace dom {
42 //#########################################################################
43 //# E N T I T Y T A B L E
44 //#########################################################################
/**
 * One row of the predefined-entity lookup table used by parseEntity().
 *
 * The string members stay plain 'char *' because match() takes a 'char *'.
 */
struct EntityInfo
{
    char *escape;        // escape sequence as written in the document
    int   escapeLength;  // number of characters in 'escape'
    char *value;         // text appended in place of the escape
};


/**
 * The five entities predefined by XML. The table ends with an entry whose
 * 'escape' is NULL; parseEntity() stops scanning there.
 *
 * Each escape is written as two adjacent string literals (which the compiler
 * joins) so that a tool that decodes HTML entities in this file cannot
 * collapse the escape into its replacement character again.
 */
static EntityInfo entityTable[] =
{
    { "&" "amp;"  , 5 , "&"  },
    { "&" "lt;"   , 4 , "<"  },
    { "&" "gt;"   , 4 , ">"  },
    { "&" "apos;" , 6 , "'"  },
    { "&" "quot;" , 6 , "\"" },
    { NULL        , 0 , "\0" }
};
65 //#########################################################################
66 //# M E S S A G E S
67 //#########################################################################
70 /**
71 *
72 */
73 void XmlReader::error(char *fmt, ...)
74 {
75 va_list args;
76 fprintf(stderr, "XmlReader:error at line %d, column %d:", lineNr, colNr);
77 va_start(args, fmt);
78 vfprintf(stderr, fmt, args);
79 va_end(args) ;
80 fprintf(stderr, "\n");
81 }
85 //#########################################################################
86 //# U T I L I T Y
87 //#########################################################################
/**
 * True for the characters trim() strips: anything at or below the space
 * character, or above '~' (126).
 *
 * NOTE(review): the predicate is kept exactly as the original wrote it.
 * Depending on the character type behind DOMString it may also classify
 * non-ASCII characters as trimmable -- confirm that this is wanted.
 */
static bool isTrimmable(int ch)
{
    return ch <= ' ' || ch > 126;
}

/**
 * Strip leading and trailing trimmable characters from 'str' in place.
 * Interior characters are left alone; a string made up only of trimmable
 * characters becomes empty.
 */
static void trim(DOMString &str)
{
    const int len = str.size();

    // Skip over the leading run of trimmable characters.
    int start = 0;
    while (start < len && isTrimmable(str[start]))
        start++;

    // Nothing but trimmable characters (or an empty string).
    if (start >= len)
    {
        str = "";
        return;
    }

    // str[start] is known to be kept, so this scan stops there at the latest.
    int end = len - 1;
    while (end > start && isTrimmable(str[end]))
        end--;

    // substr() takes a start index and a COUNT, not an end index.
    str = str.substr(start, end - start + 1);
}
115 //#########################################################################
116 //# P A R S I N G
117 //#########################################################################
119 /**
120 * Get the character at the position and record the fact
121 */
122 int XmlReader::get(int p)
123 {
124 if (p >= len)
125 return -1;
126 int ch = parsebuf[p];
127 //printf("%c", ch);
128 if (ch == '\n' || ch == '\r')
129 {
130 colNr = 0;
131 lineNr++;
132 }
133 else
134 colNr++;
135 return ch;
136 }
138 /**
139 * Look at the character at the position, but don't note the fact
140 */
141 int XmlReader::peek(int p)
142 {
143 if (p >= len)
144 return -1;
145 int ch = parsebuf[p];
146 return ch;
147 }
150 /**
151 * Test if the given substring exists at the given position
152 * in parsebuf. Use peek() in case of out-of-bounds
153 */
154 bool XmlReader::match(int pos, char *str)
155 {
156 while (*str)
157 {
158 if (peek(pos++) != *str++)
159 return false;
160 }
161 return true;
162 }
166 /**
167 * Test if the given substring exists at the given position
168 * in a given buffer
169 */
170 /*
171 static bool bufMatch(const DOMString &buf, int pos, char *str)
172 {
173 while (*str)
174 {
175 if (buf[pos++] != *str++)
176 return false;
177 }
178 return true;
179 }
180 */
183 /**
184 *
185 */
186 int XmlReader::skipwhite(int p)
187 {
188 while (p < len)
189 {
190 int b = get(p);
191 if (!isspace(b))
192 break;
193 p++;
194 }
195 return p;
196 }
198 /**
199 * modify this to allow all chars for an element or attribute name
200 */
201 int XmlReader::getWord(int p, DOMString &result)
202 {
203 while (p<len)
204 {
205 int b = get(p);
206 if (b<=' ' || b=='/' || b=='>' || b=='=')
207 break;
208 result.push_back(b);
209 p++;
210 }
211 return p;
212 }
214 /**
215 * get a name and prefix, if any
216 */
217 int XmlReader::getPrefixedWord(int p, DOMString &prefix,
218 DOMString &shortWord, DOMString &fullWord)
219 {
220 while (p<len)
221 {
222 int b = get(p);
223 if (b<=' ' || b=='/' || b=='>' || b=='=')
224 break;
225 else if (b == ':')
226 {
227 prefix = shortWord;
228 shortWord = "";
229 }
230 else
231 shortWord.push_back(b);
232 p++;
233 }
234 if (prefix.size() > 0)
235 fullWord = prefix + ":" + shortWord;
236 else
237 fullWord = shortWord;
238 return p;
239 }
242 /**
243 * Assume that we are starting on a quote. Ends on the char
244 * after the final '"'
245 */
246 int XmlReader::getQuoted(int p0, DOMString &result)
247 {
249 int p = p0;
251 if (peek(p)!='"' && peek(p)!='\'')
252 return p0;
254 int b = get(p++); //go to next char
256 DOMString buf;
258 while (p<len )
259 {
260 b = get(p++);
261 if (b=='"' || b=='\'')
262 break;
263 else if (b=='&')
264 {
265 p = parseEntity(p, result);
266 if (p < 0)
267 return p0;
268 }
269 else
270 {
271 buf.push_back(b);
272 }
273 }
275 //printf("quoted text:'%s'\n", buf.c_str());
277 result.append(buf);
279 return p;
280 }
284 /**
285 * Parse a <!xml> tag. Node may be null. Assumes current char is '<'
286 * ends on char after '>'
287 */
288 int XmlReader::parseVersion(int p0)
289 {
290 int p = p0;
292 if (!match(p, "<?xml"))
293 return p0;
295 p += 5;
296 colNr += 5;
298 bool quickCloseDummy;
299 Node *node = new NodeImpl();
300 int p2 = parseAttributes(p, node, &quickCloseDummy);
301 if (p2 < p)
302 {
303 delete node;
304 return p0;
305 }
306 p = p2;
308 //get the attributes that we need
309 NamedNodeMap attributes = node->getAttributes();
310 Node *attr = attributes.getNamedItem("version");
311 if (attr)
312 document->setXmlVersion(attr->getNodeValue());
313 attr = attributes.getNamedItem("encoding");
314 if (attr)
315 { /*document->setXmlEncoding(attr->getNodeValue());*/ }
316 attr = attributes.getNamedItem("standalone");
317 if (attr)
318 document->setXmlStandalone((attr->getNodeValue() == "yes"));
319 delete node;
321 //#now we should be pointing at '?>'
322 if (!match(p, "?>"))
323 {
324 return p0;
325 }
327 //skip over '?>'
328 get(p++);
329 get(p++);
331 return p;
332 }
335 /**
336 * Parse a <!DOCTYPE> tag. doctype may be null. Expects '<'
337 * on start. Ends pointing at char after '>'
338 */
339 int XmlReader::parseDoctype(int p0)
340 {
341 int p = p0;
343 if (!match(p, "<!DOCTYPE"))
344 return p0;
346 p += 9;
347 colNr += 9;
349 DocumentType *doctype = document->getDoctype();
350 if (!doctype)
351 return p0;
354 //### get the root name of the document
355 p = skipwhite(p);
356 DOMString rootName;
357 int p2 = getWord(p, rootName);
358 if (p2 <= p)
359 return p0;
360 p = p2;
361 //printf("doctype root '%s'\n", rootName.c_str());
364 while (p < len)
365 {
366 p = skipwhite(p);
367 if (peek(p) == '>')
368 break;
369 else if (peek(p) == '[') //just ignore 'internal' [] stuff
370 {
371 while (p < len)
372 {
373 int ch = get(p++);
374 if (ch == ']')
375 break;
376 }
377 p++;
378 }
379 else if (match(p, "PUBLIC"))
380 {
381 p += 6;
382 colNr += 6;
383 p = skipwhite(p);
384 DOMString pubIdLiteral;
385 int p2 = getQuoted(p, pubIdLiteral);
386 if (p2 <= p)
387 return p0;
388 p = p2;
389 p = skipwhite(p);
390 DOMString systemLiteral;
391 p2 = getQuoted(p, systemLiteral);
392 if (p2 <= p)
393 return p0;
394 p = p2;
395 //printf("PUBLIC \"%s\" \"%s\" \n",
396 // pubIdLiteral.c_str(), systemLiteral.c_str());
397 }
398 else if (match(p, "SYSTEM"))
399 {
400 p += 6;
401 colNr += 6;
402 p = skipwhite(p);
403 DOMString systemLiteral;
404 int p2 = getQuoted(p, systemLiteral);
405 if (p2 <= p)
406 return p0;
407 p = p2;
408 //printf("SYSTEM \"%s\" \n", systemLiteral.c_str());
409 }
410 }
413 //skip over '>'
414 get(p++);
416 return p;
417 }
421 /**
422 * Expects '<' on startup, ends on char after '>'
423 */
424 int XmlReader::parseComment(int p0, Comment *comment)
425 {
426 int p = p0;
428 if (!match(p, "<!--"))
429 return p0;
431 colNr += 4;
432 p += 4;
434 DOMString buf;
436 while (p<len-3)
437 {
438 if (match(p, "-->"))
439 {
440 p += 3;
441 colNr += 3;
442 break;
443 }
444 int ch = get(p++);
445 buf.push_back(ch);
446 }
448 comment->setNodeValue(buf);
450 return p;
451 }
455 /**
456 *
457 */
458 int XmlReader::parseCDATA(int p0, CDATASection *cdata)
459 {
461 int p = p0;
463 if (!match(p, "<![CDATA["))
464 return p0;
466 colNr += 9;
467 p += 9;
469 DOMString buf;
471 while (p<len)
472 {
473 if (match(p, "]]>"))
474 {
475 p +=3;
476 colNr += 3;
477 break;
478 }
479 int ch = get(p++);
480 buf.push_back(ch);
481 }
483 /*printf("Got CDATA:%s\n",buf.c_str());*/
484 cdata->setNodeValue(buf);
486 return p;
487 }
491 /**
492 *
493 */
494 int XmlReader::parseText(int p0, Text *text)
495 {
497 int p = p0;
499 DOMString buf;
501 while (p<len)
502 {
503 if (peek(p) == '&')
504 {
505 p = parseEntity(p, buf);
506 if (p < 0) //error?
507 return p0;
508 }
509 else if (peek(p) == '<')
510 {
511 break;
512 }
513 else
514 {
515 int ch = get(p++);
516 buf.push_back(ch);
517 }
518 }
520 /*printf("Got Text:%s\n",buf.c_str());*/
521 text->setNodeValue(buf);
523 return p;
524 }
530 /**
531 * Parses attributes of a node. Should end pointing at either the
532 * '?' of a version or doctype tag, or a '>' of a normal tag
533 */
534 int XmlReader::parseAttributes(int p0, Node *node, bool *quickClose)
535 {
536 *quickClose = false;
538 int p = p0;
540 NamedNodeMap attributes;
542 while (p<len)
543 {
544 /*printf("ch:%c\n",ch);*/
545 p = skipwhite(p);
546 int ch = get(p);
548 /*printf("ch:%c\n",ch);*/
549 if (ch == '?' || ch == '>')//done
550 break;
551 else if (ch=='/' && p<len+1)
552 {
553 p++;
554 p = skipwhite(p);
555 ch = peek(p);
556 if (ch == '>')
557 {
558 p++;
559 *quickClose = true;
560 /*printf("quick close\n");*/
561 return p;
562 }
563 }
564 DOMString shortName;
565 DOMString prefix;
566 DOMString qualifiedName;
567 int p2 = getPrefixedWord(p, prefix, shortName, qualifiedName);
568 if (p2 <= p)
569 break;
571 /*printf("name:%s",buf);*/
572 p = p2;
573 p = skipwhite(p);
574 ch = get(p);
575 /*printf("ch:%c\n",ch);*/
576 if (ch != '=')
577 break;
578 p++;
579 p = skipwhite(p);
580 /*ch = parsebuf[p];*/
581 /*printf("ch:%c\n",ch);*/
582 DOMString attrValue;
583 p2 = getQuoted(p, attrValue);
584 p = p2;
585 /*printf("name:'%s' value:'%s'\n",buf,buf2);*/
587 DOMString namespaceURI = "";
588 if (prefix == "xmlns" || shortName == "xmlns")
589 namespaceURI = XMLNSNAME;
591 //## Now let us make the attribute and give it to the node
592 Attr *attr = document->createAttributeNS(namespaceURI, qualifiedName);
593 attr->setValue(attrValue);
594 node->getAttributes().setNamedItemNS(attr);
596 }//while p<len
598 return p;
599 }
601 /**
602 * Appends the value of an entity to the buffer
603 */
604 int XmlReader::parseEntity(int p0, DOMString &buf)
605 {
606 int p = p0;
607 for (EntityInfo *info = entityTable ; info->escape ; info++)
608 {
609 if (match(p, info->escape))
610 {
611 p += info->escapeLength;
612 colNr += info->escapeLength;
613 buf += info->value;
614 return p;
615 }
616 }
618 error("unterminated entity");
619 return -1;
620 }
623 //#########################################################################
624 //# P A R S E A N O D E
625 //#########################################################################
627 /**
628 * Parse as a document, preserving the original structure as much as
629 * possible
630 */
631 int XmlReader::parseNode(int p0, Node *node, int depth)
632 {
634 int p = p0;
637 //### OPEN TAG
638 int ch = get(p++);
639 if (ch != '<')
640 return p0;
642 p = skipwhite(p);
643 DOMString openTagName;
644 DOMString openTagNamePrefix;
645 DOMString openTagQualifiedName;
646 int p2 = getPrefixedWord(p,openTagNamePrefix,
647 openTagName, openTagQualifiedName);
648 if (p2 <= p)
649 return p0;
650 p = p2;
651 p = skipwhite(p);
653 //printf("qualifiedName:%s\n", openTagQualifiedName.c_str());
654 DOMString namespaceURI = node->lookupNamespaceURI(openTagNamePrefix);
655 document->renameNode(node, namespaceURI, openTagQualifiedName);
657 //### ATTRIBUTES
658 bool quickClose;
659 p = parseAttributes(p, node, &quickClose);
660 if (quickClose) //trivial tag: <name/>
661 return p;
663 p++; //skip over '>'
666 DOMString nodeValue;
668 /* ### Get intervening data ### */
669 while (p<len && keepGoing)
670 {
671 //### COMMENT
672 if (match(p, "<!--"))
673 {
674 Comment *comment = document->createComment("");
675 p2 = parseComment(p, comment);
676 if (p2 <= p)
677 return p0;
678 p = p2;
679 if (parseAsData)
680 { //throw away
681 delete comment;
682 }
683 else
684 {
685 node->appendChild(comment);
686 }
687 }
688 //### VERSION
689 else if (match(p, "<?xml"))
690 {
691 p2 = parseVersion(p);
692 if (p2 <= p)
693 return p0;
694 }
695 //### DOCTYPE
696 else if (match(p, "<!DOCTYPE"))
697 {
698 p2 = parseDoctype(p);
699 if (p2 <= p)
700 return p0;
701 }
702 //### CDATA
703 else if (match(p, "<![CDATA["))
704 {
705 CDATASection *cdata = document->createCDATASection("");
706 p2 = parseCDATA(p, cdata);
707 if (p2 <= p)
708 return p0;
709 p = p2;
710 if (parseAsData)
711 {
712 nodeValue += cdata->getNodeValue();
713 delete cdata;
714 }
715 else
716 {
717 node->appendChild(cdata);
718 }
719 }
720 //### OPEN OR CLOSE TAG
721 else if (peek(p) == '<')
722 {
723 p2 = skipwhite(p+1);
724 if (peek(p2) =='/')
725 {
726 p = p2;
727 break;
728 }
729 else
730 {
731 /*Add element to tree*/
732 Element *elem = document->createElement(""); //fill in name later
733 node->appendChild(elem);
734 p2 = parseNode(p, elem, depth+1);
735 if (p2 <= p)
736 {
737 /*printf("problem on element:%ls. p2:%d p:%d\n",n->name, p2, p);*/
738 return p0;
739 }
740 p = p2;
741 }
742 }
743 //### TEXT
744 else
745 {
746 Text *text = document->createTextNode("");
747 p2 = parseText(p, text);
748 if (p2 <= p)
749 return p0;
750 p = p2;
751 if (parseAsData)
752 {
753 nodeValue += text->getNodeValue();
754 delete text;
755 }
756 else
757 {
758 node->appendChild(text);
759 }
760 }
762 }//while (p<len)
764 //printf("%d : nodeValue:'%s'\n", p, nodeValue.c_str());
765 trim(nodeValue);
766 node->setNodeValue(nodeValue);
768 //### get close tag. we should be pointing at '/'
769 p = skipwhite(p);
770 ch = get(p);
771 if (ch != '/')
772 {
773 error("no / on end tag");
774 return p0;
775 }
776 p++;
778 //### get word after '/'
779 p = skipwhite(p);
780 DOMString closeTagName;
781 DOMString closeTagNamePrefix;
782 DOMString closeTagQualifiedName;
783 p = getPrefixedWord(p, closeTagNamePrefix, closeTagName,
784 closeTagQualifiedName);
785 if (openTagQualifiedName != closeTagQualifiedName)
786 {
787 error("Mismatched closing tag. Expected </%S>. Got '%S'.",
788 openTagQualifiedName.c_str(), closeTagQualifiedName.c_str());
789 return p0;
790 }
791 p = skipwhite(p);
792 if (parsebuf[p] != '>')
793 {
794 error("no > on end tag");
795 return p0;
796 }
797 p++;
798 /*printf("close element:%ls\n",buf);*/
799 return p;
800 }
803 /**
804 *
805 */
806 org::w3c::dom::Document *
807 XmlReader::parse(const DOMString &buf, int bufferOffset, int parseLen)
808 {
809 len = parseLen;
810 parsebuf = buf;
812 DOMImplementationSourceImpl source;
813 DOMImplementation *domImpl = source.getDOMImplementation("");
815 keepGoing = true;
817 document = domImpl->createDocument("", "", NULL);
818 //document = new svg::SVGDocumentImpl(domImpl, "", "", NULL);
820 int p = bufferOffset;
821 int p2 = 0;
823 while (p<len && keepGoing)
824 {
825 p = skipwhite(p);
826 //### COMMENT
827 if (match(p, "<!--"))
828 {
829 Comment *comment = document->createComment("");
830 p2 = parseComment(p, comment);
831 if (p2 <= p)
832 return document;
833 p = p2;
834 if (parseAsData)
835 { //throw away
836 delete comment;
837 }
838 else
839 {
840 document->appendChild(comment);
841 }
842 }
843 //### VERSION
844 else if (match(p, "<?xml"))
845 {
846 p2 = parseVersion(p);
847 if (p2 <= p)
848 return document;
849 p = p2;
850 }
851 //### DOCTYPE
852 else if (match(p, "<!DOCTYPE"))
853 {
854 p2 = parseDoctype(p);
855 if (p2 <= p)
856 return document;
857 p = p2;
858 }
859 else
860 {
861 break;
862 }
863 }
865 p = skipwhite(p);
866 p = parseNode(p, document->getDocumentElement(), 0);
868 keepGoing = false;
870 return document;
871 }
874 /**
875 *
876 */
877 org::w3c::dom::Document *
878 XmlReader::parse(const DOMString &str)
879 {
881 Document *doc = parse(str, 0, str.size());
882 doc->normalizeDocument();
884 return doc;
885 }
887 /**
888 *
889 */
890 org::w3c::dom::Document *
891 XmlReader::parseFile(char *fileName)
892 {
894 DOMString buf = loadFile(fileName);
896 Document *doc = parse(buf, 0, buf.size());
898 return doc;
899 }
903 //#########################################################################
904 //# S T R E A M R E A D I N G
905 //#########################################################################
907 /**
908 *
909 */
910 org::w3c::dom::DOMString
911 XmlReader::loadFile(char *fileName)
912 {
914 if (!fileName)
915 return NULL;
916 FILE *f = fopen(fileName, "rb");
917 if (!f)
918 return NULL;
920 DOMString buf;
921 while (!feof(f))
922 {
923 int ch = fgetc(f);
924 if (ch<0)
925 break;
926 buf.push_back(ch);
927 }
928 fclose(f);
930 return buf;
931 }
934 //#########################################################################
935 //# C O N S T R U C T O R / D E S T R U C T O R
936 //#########################################################################
939 /**
940 *
941 */
942 XmlReader::XmlReader()
943 {
944 len = 0;
945 lineNr = 1;
946 colNr = 0;
947 parseAsData = false;
948 keepGoing = false;
949 }
951 /**
952 *
953 */
954 XmlReader::XmlReader(bool parseAsDataArg)
955 {
956 len = 0;
957 lineNr = 1;
958 colNr = 0;
959 parseAsData = parseAsDataArg;
960 keepGoing = false;
961 }
965 /**
966 *
967 */
968 XmlReader::~XmlReader()
969 {
970 }
973 } //namespace dom
974 } //namespace w3c
975 } //namespace org
978 //#########################################################################
979 //# E N D O F F I L E
980 //#########################################################################