fbacf48f411aa54481098a3d8576679709940401
1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * Authors:
11 * Bob Jamison
12 *
13 * Copyright (C) 2005 Bob Jamison
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
32 #include "xmlreader.h"
33 #include "charclass.h"
34 #include "domimpl.h"
35 #include "svg/svgimpl.h"
37 #include <stdio.h>
38 #include <stdarg.h>
40 namespace org
41 {
42 namespace w3c
43 {
44 namespace dom
45 {
48 //#########################################################################
49 //# E N T I T Y T A B L E
50 //#########################################################################
/**
 * One predefined XML entity.
 *
 * The members are plain (non-const) char pointers because they are
 * handed to XmlReader::match(), which takes a 'char *'.
 */
struct EntityInfo
{
    char *escape;        // escaped spelling as found in the input, e.g. "&amp;"
    int   escapeLength;  // number of characters in 'escape'
    char *value;         // text the escape stands for
};


/**
 * The five entities predefined by XML.  The list is terminated by an
 * entry whose 'escape' is NULL.  Each escapeLength must equal the length
 * of its escape string, since parseEntity() advances by that amount.
 */
static EntityInfo entityTable[] =
{
    { "&amp;"  , 5 , "&"  },
    { "&lt;"   , 4 , "<"  },
    { "&gt;"   , 4 , ">"  },
    { "&apos;" , 6 , "'"  },
    { "&quot;" , 6 , "\"" },
    { NULL     , 0 , "\0" }
};
71 //#########################################################################
72 //# M E S S A G E S
73 //#########################################################################
76 /**
77 *
78 */
79 void XmlReader::error(char *fmt, ...)
80 {
81 va_list args;
82 fprintf(stderr, "XmlReader:error at line %d, column %d:", lineNr, colNr);
83 va_start(args, fmt);
84 vfprintf(stderr, fmt, args);
85 va_end(args) ;
86 fprintf(stderr, "\n");
87 }
91 //#########################################################################
92 //# U T I L I T Y
93 //#########################################################################
95 static void trim(DOMString &str)
96 {
97 int len = str.size();
98 if (len<1)
99 return;
101 int start = 0;
102 int end = 0;
103 for (start=0 ; start<len ; start++)
104 {
105 int ch = str[start];
106 if (ch<=' ' || ch>126)
107 break;
108 }
109 for (end=len-1 ; end>=0 ; end--)
110 {
111 int ch = str[end];
112 if (ch<=' ' || ch>126)
113 break;
114 }
115 if (start<end)
116 {
117 str = str.substr(start, end+1);
118 }
119 }
121 //#########################################################################
122 //# P A R S I N G
123 //#########################################################################
125 /**
126 * Get the character at the position and record the fact
127 */
128 int XmlReader::get(int p)
129 {
130 if (p >= len)
131 return -1;
132 int ch = parsebuf[p];
133 //printf("%c", ch);
134 if (ch == '\n' || ch == '\r')
135 {
136 colNr = 0;
137 lineNr++;
138 }
139 else
140 colNr++;
141 return ch;
142 }
144 /**
145 * Look at the character at the position, but don't note the fact
146 */
147 int XmlReader::peek(int p)
148 {
149 if (p >= len)
150 return -1;
151 int ch = parsebuf[p];
152 return ch;
153 }
156 /**
157 * Test if the given substring exists at the given position
158 * in parsebuf. Use peek() in case of out-of-bounds
159 */
160 bool XmlReader::match(int pos, char *str)
161 {
162 while (*str)
163 {
164 if (peek(pos++) != *str++)
165 return false;
166 }
167 return true;
168 }
172 /**
173 * Test if the given substring exists at the given position
174 * in a given buffer
175 */
176 /*
177 static bool bufMatch(const DOMString &buf, int pos, char *str)
178 {
179 while (*str)
180 {
181 if (buf[pos++] != *str++)
182 return false;
183 }
184 return true;
185 }
186 */
189 /**
190 *
191 */
192 int XmlReader::skipwhite(int p)
193 {
194 while (p < len)
195 {
196 int b = get(p);
197 if (!isWhitespace(b))
198 break;
199 p++;
200 }
201 return p;
202 }
204 /**
205 * modify this to allow all chars for an element or attribute name
206 */
207 int XmlReader::getWord(int p, DOMString &result)
208 {
209 while (p<len)
210 {
211 int b = get(p);
212 if (b<=' ' || b=='/' || b=='>' || b=='=')
213 break;
214 result.push_back((XMLCh)b);
215 p++;
216 }
217 return p;
218 }
220 /**
221 * get a name and prefix, if any
222 */
223 int XmlReader::getPrefixedWord(int p, DOMString &prefix,
224 DOMString &shortWord, DOMString &fullWord)
225 {
226 while (p<len)
227 {
228 int b = get(p);
229 if (b<=' ' || b=='/' || b=='>' || b=='=')
230 break;
231 else if (b == ':')
232 {
233 prefix = shortWord;
234 shortWord = "";
235 }
236 else
237 shortWord.push_back((XMLCh)b);
238 p++;
239 }
240 if (prefix.size() > 0)
241 fullWord = prefix + ":" + shortWord;
242 else
243 fullWord = shortWord;
244 return p;
245 }
248 /**
249 * Assume that we are starting on a quote. Ends on the char
250 * after the final '"'
251 */
252 int XmlReader::getQuoted(int p0, DOMString &result)
253 {
255 int p = p0;
257 if (peek(p)!='"' && peek(p)!='\'')
258 return p0;
260 int b = get(p++); //go to next char
262 DOMString buf;
264 while (p<len )
265 {
266 b = get(p++);
267 if (b=='"' || b=='\'')
268 break;
269 else if (b=='&')
270 {
271 p = parseEntity(p, result);
272 if (p < 0)
273 return p0;
274 }
275 else
276 {
277 buf.push_back((XMLCh)b);
278 }
279 }
281 //printf("quoted text:'%s'\n", buf.c_str());
283 result.append(buf);
285 return p;
286 }
290 /**
291 * Parse a <!xml> tag. Node may be null. Assumes current char is '<'
292 * ends on char after '>'
293 */
294 int XmlReader::parseVersion(int p0)
295 {
296 int p = p0;
298 if (!match(p, "<?xml"))
299 return p0;
301 p += 5;
302 colNr += 5;
304 bool quickCloseDummy;
305 Node *node = new NodeImpl();
306 int p2 = parseAttributes(p, node, &quickCloseDummy);
307 if (p2 < p)
308 {
309 delete node;
310 return p0;
311 }
312 p = p2;
314 //get the attributes that we need
315 NamedNodeMap attributes = node->getAttributes();
316 Node *attr = attributes.getNamedItem("version");
317 if (attr)
318 document->setXmlVersion(attr->getNodeValue());
319 attr = attributes.getNamedItem("encoding");
320 if (attr)
321 { /*document->setXmlEncoding(attr->getNodeValue());*/ }
322 attr = attributes.getNamedItem("standalone");
323 if (attr)
324 document->setXmlStandalone((attr->getNodeValue() == "yes"));
325 delete node;
327 //#now we should be pointing at '?>'
328 if (!match(p, "?>"))
329 {
330 return p0;
331 }
333 //skip over '?>'
334 get(p++);
335 get(p++);
337 return p;
338 }
341 /**
342 * Parse a <!DOCTYPE> tag. doctype may be null. Expects '<'
343 * on start. Ends pointing at char after '>'
344 */
345 int XmlReader::parseDoctype(int p0)
346 {
347 int p = p0;
349 if (!match(p, "<!DOCTYPE"))
350 return p0;
352 p += 9;
353 colNr += 9;
355 DocumentType *doctype = document->getDoctype();
356 if (!doctype)
357 return p0;
360 //### get the root name of the document
361 p = skipwhite(p);
362 DOMString rootName;
363 int p2 = getWord(p, rootName);
364 if (p2 <= p)
365 return p0;
366 p = p2;
367 //printf("doctype root '%s'\n", rootName.c_str());
370 while (p < len)
371 {
372 p = skipwhite(p);
373 if (peek(p) == '>')
374 break;
375 else if (peek(p) == '[') //just ignore 'internal' [] stuff
376 {
377 while (p < len)
378 {
379 int ch = get(p++);
380 if (ch == ']')
381 break;
382 }
383 p++;
384 }
385 else if (match(p, "PUBLIC"))
386 {
387 p += 6;
388 colNr += 6;
389 p = skipwhite(p);
390 DOMString pubIdLiteral;
391 int p2 = getQuoted(p, pubIdLiteral);
392 if (p2 <= p)
393 return p0;
394 p = p2;
395 p = skipwhite(p);
396 DOMString systemLiteral;
397 p2 = getQuoted(p, systemLiteral);
398 if (p2 <= p)
399 return p0;
400 p = p2;
401 //printf("PUBLIC \"%s\" \"%s\" \n",
402 // pubIdLiteral.c_str(), systemLiteral.c_str());
403 }
404 else if (match(p, "SYSTEM"))
405 {
406 p += 6;
407 colNr += 6;
408 p = skipwhite(p);
409 DOMString systemLiteral;
410 int p2 = getQuoted(p, systemLiteral);
411 if (p2 <= p)
412 return p0;
413 p = p2;
414 //printf("SYSTEM \"%s\" \n", systemLiteral.c_str());
415 }
416 }
419 //skip over '>'
420 get(p++);
422 return p;
423 }
427 /**
428 * Expects '<' on startup, ends on char after '>'
429 */
430 int XmlReader::parseComment(int p0, Comment *comment)
431 {
432 int p = p0;
434 if (!match(p, "<!--"))
435 return p0;
437 colNr += 4;
438 p += 4;
440 DOMString buf;
442 while (p<len-3)
443 {
444 if (match(p, "-->"))
445 {
446 p += 3;
447 colNr += 3;
448 break;
449 }
450 int ch = get(p++);
451 buf.push_back((XMLCh)ch);
452 }
454 comment->setNodeValue(buf);
456 return p;
457 }
461 /**
462 *
463 */
464 int XmlReader::parseCDATA(int p0, CDATASection *cdata)
465 {
467 int p = p0;
469 if (!match(p, "<![CDATA["))
470 return p0;
472 colNr += 9;
473 p += 9;
475 DOMString buf;
477 while (p<len)
478 {
479 if (match(p, "]]>"))
480 {
481 p +=3;
482 colNr += 3;
483 break;
484 }
485 int ch = get(p++);
486 buf.push_back((XMLCh)ch);
487 }
489 /*printf("Got CDATA:%s\n",buf.c_str());*/
490 cdata->setNodeValue(buf);
492 return p;
493 }
497 /**
498 *
499 */
500 int XmlReader::parseText(int p0, Text *text)
501 {
503 int p = p0;
505 DOMString buf;
507 while (p<len)
508 {
509 if (peek(p) == '&')
510 {
511 p = parseEntity(p, buf);
512 if (p < 0) //error?
513 return p0;
514 }
515 else if (peek(p) == '<')
516 {
517 break;
518 }
519 else
520 {
521 int ch = get(p++);
522 buf.push_back((XMLCh)ch);
523 }
524 }
526 /*printf("Got Text:%s\n",buf.c_str());*/
527 text->setNodeValue(buf);
529 return p;
530 }
536 /**
537 * Parses attributes of a node. Should end pointing at either the
538 * '?' of a version or doctype tag, or a '>' of a normal tag
539 */
540 int XmlReader::parseAttributes(int p0, Node *node, bool *quickClose)
541 {
542 *quickClose = false;
544 int p = p0;
546 NamedNodeMap attributes;
548 while (p<len)
549 {
550 /*printf("ch:%c\n",ch);*/
551 p = skipwhite(p);
552 int ch = get(p);
554 /*printf("ch:%c\n",ch);*/
555 if (ch == '?' || ch == '>')//done
556 break;
557 else if (ch=='/' && p<len+1)
558 {
559 p++;
560 p = skipwhite(p);
561 ch = peek(p);
562 if (ch == '>')
563 {
564 p++;
565 *quickClose = true;
566 /*printf("quick close\n");*/
567 return p;
568 }
569 }
570 DOMString shortName;
571 DOMString prefix;
572 DOMString qualifiedName;
573 int p2 = getPrefixedWord(p, prefix, shortName, qualifiedName);
574 if (p2 <= p)
575 break;
577 /*printf("name:%s",buf);*/
578 p = p2;
579 p = skipwhite(p);
580 ch = get(p);
581 /*printf("ch:%c\n",ch);*/
582 if (ch != '=')
583 break;
584 p++;
585 p = skipwhite(p);
586 /*ch = parsebuf[p];*/
587 /*printf("ch:%c\n",ch);*/
588 DOMString attrValue;
589 p2 = getQuoted(p, attrValue);
590 p = p2;
591 /*printf("name:'%s' value:'%s'\n",buf,buf2);*/
593 DOMString namespaceURI = "";
594 if (prefix == "xmlns" || shortName == "xmlns")
595 namespaceURI = XMLNSNAME;
597 //## Now let us make the attribute and give it to the node
598 Attr *attr = document->createAttributeNS(namespaceURI, qualifiedName);
599 attr->setValue(attrValue);
600 node->getAttributes().setNamedItemNS(attr);
602 }//while p<len
604 return p;
605 }
607 /**
608 * Appends the value of an entity to the buffer
609 */
610 int XmlReader::parseEntity(int p0, DOMString &buf)
611 {
612 int p = p0;
613 for (EntityInfo *info = entityTable ; info->escape ; info++)
614 {
615 if (match(p, info->escape))
616 {
617 p += info->escapeLength;
618 colNr += info->escapeLength;
619 buf += info->value;
620 return p;
621 }
622 }
624 error("unterminated entity");
625 return -1;
626 }
629 //#########################################################################
630 //# P A R S E A N O D E
631 //#########################################################################
633 /**
634 * Parse as a document, preserving the original structure as much as
635 * possible
636 */
637 int XmlReader::parseNode(int p0, Node *node, int depth)
638 {
640 int p = p0;
643 //### OPEN TAG
644 int ch = get(p++);
645 if (ch != '<')
646 return p0;
648 p = skipwhite(p);
649 DOMString openTagName;
650 DOMString openTagNamePrefix;
651 DOMString openTagQualifiedName;
652 int p2 = getPrefixedWord(p,openTagNamePrefix,
653 openTagName, openTagQualifiedName);
654 if (p2 <= p)
655 return p0;
656 p = p2;
657 p = skipwhite(p);
659 //printf("qualifiedName:%s\n", openTagQualifiedName.c_str());
660 DOMString namespaceURI = node->lookupNamespaceURI(openTagNamePrefix);
661 document->renameNode(node, namespaceURI, openTagQualifiedName);
663 //### ATTRIBUTES
664 bool quickClose;
665 p = parseAttributes(p, node, &quickClose);
666 if (quickClose) //trivial tag: <name/>
667 return p;
669 p++; //skip over '>'
672 DOMString nodeValue;
674 /* ### Get intervening data ### */
675 while (p<len && keepGoing)
676 {
677 //### COMMENT
678 if (match(p, "<!--"))
679 {
680 Comment *comment = document->createComment("");
681 p2 = parseComment(p, comment);
682 if (p2 <= p)
683 return p0;
684 p = p2;
685 if (parseAsData)
686 { //throw away
687 delete comment;
688 }
689 else
690 {
691 node->appendChild(comment);
692 }
693 }
694 //### VERSION
695 else if (match(p, "<?xml"))
696 {
697 p2 = parseVersion(p);
698 if (p2 <= p)
699 return p0;
700 }
701 //### DOCTYPE
702 else if (match(p, "<!DOCTYPE"))
703 {
704 p2 = parseDoctype(p);
705 if (p2 <= p)
706 return p0;
707 }
708 //### CDATA
709 else if (match(p, "<![CDATA["))
710 {
711 CDATASection *cdata = document->createCDATASection("");
712 p2 = parseCDATA(p, cdata);
713 if (p2 <= p)
714 return p0;
715 p = p2;
716 if (parseAsData)
717 {
718 nodeValue += cdata->getNodeValue();
719 delete cdata;
720 }
721 else
722 {
723 node->appendChild(cdata);
724 }
725 }
726 //### OPEN OR CLOSE TAG
727 else if (peek(p) == '<')
728 {
729 p2 = skipwhite(p+1);
730 if (peek(p2) =='/')
731 {
732 p = p2;
733 break;
734 }
735 else
736 {
737 /*Add element to tree*/
738 Element *elem = document->createElement(""); //fill in name later
739 node->appendChild(elem);
740 p2 = parseNode(p, elem, depth+1);
741 if (p2 <= p)
742 {
743 /*printf("problem on element:%ls. p2:%d p:%d\n",n->name, p2, p);*/
744 return p0;
745 }
746 p = p2;
747 }
748 }
749 //### TEXT
750 else
751 {
752 Text *text = document->createTextNode("");
753 p2 = parseText(p, text);
754 if (p2 <= p)
755 return p0;
756 p = p2;
757 if (parseAsData)
758 {
759 nodeValue += text->getNodeValue();
760 delete text;
761 }
762 else
763 {
764 node->appendChild(text);
765 }
766 }
768 }//while (p<len)
770 //printf("%d : nodeValue:'%s'\n", p, nodeValue.c_str());
771 trim(nodeValue);
772 node->setNodeValue(nodeValue);
774 //### get close tag. we should be pointing at '/'
775 p = skipwhite(p);
776 ch = get(p);
777 if (ch != '/')
778 {
779 error("no / on end tag");
780 return p0;
781 }
782 p++;
784 //### get word after '/'
785 p = skipwhite(p);
786 DOMString closeTagName;
787 DOMString closeTagNamePrefix;
788 DOMString closeTagQualifiedName;
789 p = getPrefixedWord(p, closeTagNamePrefix, closeTagName,
790 closeTagQualifiedName);
791 if (openTagQualifiedName != closeTagQualifiedName)
792 {
793 error("Mismatched closing tag. Expected </%S>. Got '%S'.",
794 openTagQualifiedName.c_str(), closeTagQualifiedName.c_str());
795 return p0;
796 }
797 p = skipwhite(p);
798 if (parsebuf[p] != '>')
799 {
800 error("no > on end tag");
801 return p0;
802 }
803 p++;
804 /*printf("close element:%ls\n",buf);*/
805 return p;
806 }
809 /**
810 *
811 */
812 org::w3c::dom::Document *
813 XmlReader::parse(const DOMString &buf, int bufferOffset, int parseLen)
814 {
815 len = parseLen;
816 parsebuf = buf;
818 DOMImplementationSourceImpl source;
819 DOMImplementation *domImpl = source.getDOMImplementation("");
821 keepGoing = true;
823 document = domImpl->createDocument("", "", NULL);
824 //document = new svg::SVGDocumentImpl(domImpl, "", "", NULL);
826 int p = bufferOffset;
827 int p2 = 0;
829 while (p<len && keepGoing)
830 {
831 p = skipwhite(p);
832 //### COMMENT
833 if (match(p, "<!--"))
834 {
835 Comment *comment = document->createComment("");
836 p2 = parseComment(p, comment);
837 if (p2 <= p)
838 return document;
839 p = p2;
840 if (parseAsData)
841 { //throw away
842 delete comment;
843 }
844 else
845 {
846 document->appendChild(comment);
847 }
848 }
849 //### VERSION
850 else if (match(p, "<?xml"))
851 {
852 p2 = parseVersion(p);
853 if (p2 <= p)
854 return document;
855 p = p2;
856 }
857 //### DOCTYPE
858 else if (match(p, "<!DOCTYPE"))
859 {
860 p2 = parseDoctype(p);
861 if (p2 <= p)
862 return document;
863 p = p2;
864 }
865 else
866 {
867 break;
868 }
869 }
871 p = skipwhite(p);
872 p = parseNode(p, document->getDocumentElement(), 0);
874 keepGoing = false;
876 return document;
877 }
880 /**
881 *
882 */
883 org::w3c::dom::Document *
884 XmlReader::parse(const DOMString &str)
885 {
887 Document *doc = parse(str, 0, str.size());
888 doc->normalizeDocument();
890 return doc;
891 }
893 /**
894 *
895 */
896 org::w3c::dom::Document *
897 XmlReader::parseFile(char *fileName)
898 {
900 DOMString buf = loadFile(fileName);
902 Document *doc = parse(buf, 0, buf.size());
904 return doc;
905 }
909 //#########################################################################
910 //# S T R E A M R E A D I N G
911 //#########################################################################
913 /**
914 *
915 */
916 org::w3c::dom::DOMString
917 XmlReader::loadFile(char *fileName)
918 {
920 if (!fileName)
921 return NULL;
922 FILE *f = fopen(fileName, "rb");
923 if (!f)
924 return NULL;
926 DOMString buf;
927 while (!feof(f))
928 {
929 int ch = fgetc(f);
930 if (ch<0)
931 break;
932 buf.push_back((XMLCh)ch);
933 }
934 fclose(f);
936 return buf;
937 }
940 //#########################################################################
941 //# C O N S T R U C T O R / D E S T R U C T O R
942 //#########################################################################
945 /**
946 *
947 */
948 XmlReader::XmlReader()
949 {
950 len = 0;
951 lineNr = 1;
952 colNr = 0;
953 parseAsData = false;
954 keepGoing = false;
955 }
957 /**
958 *
959 */
960 XmlReader::XmlReader(bool parseAsDataArg)
961 {
962 len = 0;
963 lineNr = 1;
964 colNr = 0;
965 parseAsData = parseAsDataArg;
966 keepGoing = false;
967 }
/**
 * Destructor.  The body is empty: nothing is released here, and in
 * particular the document created by parse() is not deleted.
 */
XmlReader::~XmlReader()
{
}
979 } //namespace dom
980 } //namespace w3c
981 } //namespace org
984 //#########################################################################
985 //# E N D O F F I L E
986 //#########################################################################