1 /*
2 * Implementation of the Pedro mini-DOM parser and tree
3 *
4 * Authors:
5 * Bob Jamison
6 *
7 * Copyright (C) 2005-2007 Bob Jamison
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 */
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdarg.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
33 #include "pedrodom.h"
35 namespace Pedro
36 {
40 //########################################################################
41 //# E L E M E N T
42 //########################################################################
44 Element *Element::clone()
45 {
46 Element *elem = new Element(name, value);
47 elem->parent = parent;
48 elem->attributes = attributes;
49 elem->namespaces = namespaces;
51 std::vector<Element *>::iterator iter;
52 for (iter = children.begin(); iter != children.end() ; iter++)
53 {
54 elem->addChild((*iter)->clone());
55 }
56 return elem;
57 }
60 void Element::findElementsRecursive(std::vector<Element *>&res, const DOMString &name)
61 {
62 if (getName() == name)
63 {
64 res.push_back(this);
65 }
66 for (unsigned int i=0; i<children.size() ; i++)
67 children[i]->findElementsRecursive(res, name);
68 }
70 std::vector<Element *> Element::findElements(const DOMString &name)
71 {
72 std::vector<Element *> res;
73 findElementsRecursive(res, name);
74 return res;
75 }
77 DOMString Element::getAttribute(const DOMString &name)
78 {
79 for (unsigned int i=0 ; i<attributes.size() ; i++)
80 if (attributes[i].getName() ==name)
81 return attributes[i].getValue();
82 return "";
83 }
85 DOMString Element::getTagAttribute(const DOMString &tagName, const DOMString &attrName)
86 {
87 std::vector<Element *>elems = findElements(tagName);
88 if (elems.size() <1)
89 return "";
90 DOMString res = elems[0]->getAttribute(attrName);
91 return res;
92 }
94 DOMString Element::getTagValue(const DOMString &tagName)
95 {
96 std::vector<Element *>elems = findElements(tagName);
97 if (elems.size() <1)
98 return "";
99 DOMString res = elems[0]->getValue();
100 return res;
101 }
103 void Element::addChild(Element *child)
104 {
105 if (!child)
106 return;
107 child->parent = this;
108 children.push_back(child);
109 }
112 void Element::addAttribute(const DOMString &name, const DOMString &value)
113 {
114 Attribute attr(name, value);
115 attributes.push_back(attr);
116 }
118 void Element::addNamespace(const DOMString &prefix, const DOMString &namespaceURI)
119 {
120 Namespace ns(prefix, namespaceURI);
121 namespaces.push_back(ns);
122 }
124 void Element::writeIndentedRecursive(FILE *f, int indent)
125 {
126 int i;
127 if (!f)
128 return;
129 //Opening tag, and attributes
130 for (i=0;i<indent;i++)
131 fputc(' ',f);
132 fprintf(f,"<%s",name.c_str());
133 for (unsigned int i=0 ; i<attributes.size() ; i++)
134 {
135 fprintf(f," %s=\"%s\"",
136 attributes[i].getName().c_str(),
137 attributes[i].getValue().c_str());
138 }
139 for (unsigned int i=0 ; i<namespaces.size() ; i++)
140 {
141 fprintf(f," xmlns:%s=\"%s\"",
142 namespaces[i].getPrefix().c_str(),
143 namespaces[i].getNamespaceURI().c_str());
144 }
145 fprintf(f,">\n");
147 //Between the tags
148 if (value.size() > 0)
149 {
150 for (int i=0;i<indent;i++)
151 fputc(' ', f);
152 fprintf(f," %s\n", value.c_str());
153 }
155 for (unsigned int i=0 ; i<children.size() ; i++)
156 children[i]->writeIndentedRecursive(f, indent+2);
158 //Closing tag
159 for (int i=0; i<indent; i++)
160 fputc(' ',f);
161 fprintf(f,"</%s>\n", name.c_str());
162 }
164 void Element::writeIndented(FILE *f)
165 {
166 writeIndentedRecursive(f, 0);
167 }
169 void Element::print()
170 {
171 writeIndented(stdout);
172 }
175 //########################################################################
176 //# P A R S E R
177 //########################################################################
/**
 * One row of the entity table: the escaped spelling as it appears in XML
 * text, and the single character it stands for.
 */
typedef struct
{
    const char *escaped;   // full entity reference, e.g. "&amp;"
    char        value;     // the character the reference decodes to
} EntityEntry;

/**
 * The five predefined XML entities.
 *
 * Each escaped form must be the complete reference, from '&' through ';',
 * because it is matched literally against the input.  The table ends with
 * a sentinel entry whose value is '\0'.
 */
static EntityEntry entities[] =
{
    { "&amp;" , '&'  },
    { "&lt;"  , '<'  },
    { "&gt;"  , '>'  },
    { "&apos;", '\'' },
    { "&quot;", '"'  },
    { NULL    , '\0' }
};
199 /**
200 * Removes whitespace from beginning and end of a string
201 */
202 DOMString Parser::trim(const DOMString &s)
203 {
204 if (s.size() < 1)
205 return s;
207 //Find first non-ws char
208 unsigned int begin = 0;
209 for ( ; begin < s.size() ; begin++)
210 {
211 if (!isspace(s[begin]))
212 break;
213 }
215 //Find first non-ws char, going in reverse
216 unsigned int end = s.size() - 1;
217 for ( ; end > begin ; end--)
218 {
219 if (!isspace(s[end]))
220 break;
221 }
222 //trace("begin:%d end:%d", begin, end);
224 DOMString res = s.substr(begin, end-begin+1);
225 return res;
226 }
228 void Parser::getLineAndColumn(long pos, long *lineNr, long *colNr)
229 {
230 long line = 1;
231 long col = 1;
232 for (long i=0 ; i<pos ; i++)
233 {
234 XMLCh ch = parsebuf[i];
235 if (ch == '\n' || ch == '\r')
236 {
237 col = 0;
238 line ++;
239 }
240 else
241 col++;
242 }
243 *lineNr = line;
244 *colNr = col;
246 }
249 void Parser::error(char *fmt, ...)
250 {
251 long lineNr;
252 long colNr;
253 getLineAndColumn(currentPosition, &lineNr, &colNr);
254 va_list args;
255 fprintf(stderr, "xml error at line %ld, column %ld:", lineNr, colNr);
256 va_start(args,fmt);
257 vfprintf(stderr,fmt,args);
258 va_end(args) ;
259 fprintf(stderr, "\n");
260 }
264 int Parser::peek(long pos)
265 {
266 if (pos >= parselen)
267 return -1;
268 currentPosition = pos;
269 int ch = parsebuf[pos];
270 //printf("ch:%c\n", ch);
271 return ch;
272 }
276 DOMString Parser::encode(const DOMString &str)
277 {
278 DOMString ret;
279 for (unsigned int i=0 ; i<str.size() ; i++)
280 {
281 XMLCh ch = (XMLCh)str[i];
282 if (ch == '&')
283 ret.append("&");
284 else if (ch == '<')
285 ret.append("<");
286 else if (ch == '>')
287 ret.append(">");
288 else if (ch == '\'')
289 ret.append("'");
290 else if (ch == '"')
291 ret.append(""");
292 else
293 ret.push_back(ch);
295 }
296 return ret;
297 }
300 int Parser::match(long p0, const char *text)
301 {
302 int p = p0;
303 while (*text)
304 {
305 if (peek(p) != *text)
306 return p0;
307 p++; text++;
308 }
309 return p;
310 }
314 int Parser::skipwhite(long p)
315 {
317 while (p<parselen)
318 {
319 int p2 = match(p, "<!--");
320 if (p2 > p)
321 {
322 p = p2;
323 while (p<parselen)
324 {
325 p2 = match(p, "-->");
326 if (p2 > p)
327 {
328 p = p2;
329 break;
330 }
331 p++;
332 }
333 }
334 XMLCh b = peek(p);
335 if (!isspace(b))
336 break;
337 p++;
338 }
339 return p;
340 }
342 /* modify this to allow all chars for an element or attribute name*/
343 int Parser::getWord(int p0, DOMString &buf)
344 {
345 int p = p0;
346 while (p<parselen)
347 {
348 XMLCh b = peek(p);
349 if (b<=' ' || b=='/' || b=='>' || b=='=')
350 break;
351 buf.push_back(b);
352 p++;
353 }
354 return p;
355 }
357 int Parser::getQuoted(int p0, DOMString &buf, int do_i_parse)
358 {
360 int p = p0;
361 if (peek(p) != '"' && peek(p) != '\'')
362 return p0;
363 p++;
365 while ( p<parselen )
366 {
367 XMLCh b = peek(p);
368 if (b=='"' || b=='\'')
369 break;
370 if (b=='&' && do_i_parse)
371 {
372 bool found = false;
373 for (EntityEntry *ee = entities ; ee->value ; ee++)
374 {
375 int p2 = match(p, ee->escaped);
376 if (p2>p)
377 {
378 buf.push_back(ee->value);
379 p = p2;
380 found = true;
381 break;
382 }
383 }
384 if (!found)
385 {
386 error("unterminated entity");
387 return false;
388 }
389 }
390 else
391 {
392 buf.push_back(b);
393 p++;
394 }
395 }
396 return p;
397 }
399 int Parser::parseVersion(int p0)
400 {
401 //printf("### parseVersion: %d\n", p0);
403 int p = p0;
405 p = skipwhite(p0);
407 if (peek(p) != '<')
408 return p0;
410 p++;
411 if (p>=parselen || peek(p)!='?')
412 return p0;
414 p++;
416 DOMString buf;
418 while (p<parselen)
419 {
420 XMLCh ch = peek(p);
421 if (ch=='?')
422 {
423 p++;
424 break;
425 }
426 buf.push_back(ch);
427 p++;
428 }
430 if (peek(p) != '>')
431 return p0;
432 p++;
434 //printf("Got version:%s\n",buf.c_str());
435 return p;
436 }
438 int Parser::parseDoctype(int p0)
439 {
440 //printf("### parseDoctype: %d\n", p0);
442 int p = p0;
443 p = skipwhite(p);
445 if (p>=parselen || peek(p)!='<')
446 return p0;
448 p++;
450 if (peek(p)!='!' || peek(p+1)=='-')
451 return p0;
452 p++;
454 DOMString buf;
455 while (p<parselen)
456 {
457 XMLCh ch = peek(p);
458 if (ch=='>')
459 {
460 p++;
461 break;
462 }
463 buf.push_back(ch);
464 p++;
465 }
467 //printf("Got doctype:%s\n",buf.c_str());
468 return p;
469 }
471 int Parser::parseElement(int p0, Element *par,int depth)
472 {
474 int p = p0;
476 int p2 = p;
478 p = skipwhite(p);
480 //## Get open tag
481 XMLCh ch = peek(p);
482 if (ch!='<')
483 return p0;
485 p++;
487 DOMString openTagName;
488 p = skipwhite(p);
489 p = getWord(p, openTagName);
490 //printf("####tag :%s\n", openTagName.c_str());
491 p = skipwhite(p);
493 //Add element to tree
494 Element *n = new Element(openTagName);
495 n->parent = par;
496 par->addChild(n);
498 // Get attributes
499 if (peek(p) != '>')
500 {
501 while (p<parselen)
502 {
503 p = skipwhite(p);
504 ch = peek(p);
505 //printf("ch:%c\n",ch);
506 if (ch=='>')
507 break;
508 else if (ch=='/' && p<parselen+1)
509 {
510 p++;
511 p = skipwhite(p);
512 ch = peek(p);
513 if (ch=='>')
514 {
515 p++;
516 //printf("quick close\n");
517 return p;
518 }
519 }
520 DOMString attrName;
521 p2 = getWord(p, attrName);
522 if (p2==p)
523 break;
524 //printf("name:%s",buf);
525 p=p2;
526 p = skipwhite(p);
527 ch = peek(p);
528 //printf("ch:%c\n",ch);
529 if (ch!='=')
530 break;
531 p++;
532 p = skipwhite(p);
533 // ch = parsebuf[p];
534 // printf("ch:%c\n",ch);
535 DOMString attrVal;
536 p2 = getQuoted(p, attrVal, true);
537 p=p2+1;
538 //printf("name:'%s' value:'%s'\n",attrName.c_str(),attrVal.c_str());
539 char *namestr = (char *)attrName.c_str();
540 if (strncmp(namestr, "xmlns:", 6)==0)
541 n->addNamespace(attrName, attrVal);
542 else
543 n->addAttribute(attrName, attrVal);
544 }
545 }
547 bool cdata = false;
549 p++;
550 // ### Get intervening data ### */
551 DOMString data;
552 while (p<parselen)
553 {
554 //# COMMENT
555 p2 = match(p, "<!--");
556 if (!cdata && p2>p)
557 {
558 p = p2;
559 while (p<parselen)
560 {
561 p2 = match(p, "-->");
562 if (p2 > p)
563 {
564 p = p2;
565 break;
566 }
567 p++;
568 }
569 }
571 ch = peek(p);
572 //# END TAG
573 if (ch=='<' && !cdata && peek(p+1)=='/')
574 {
575 break;
576 }
577 //# CDATA
578 p2 = match(p, "<![CDATA[");
579 if (p2 > p)
580 {
581 cdata = true;
582 p = p2;
583 continue;
584 }
586 //# CHILD ELEMENT
587 if (ch == '<')
588 {
589 p2 = parseElement(p, n, depth+1);
590 if (p2 == p)
591 {
592 /*
593 printf("problem on element:%s. p2:%d p:%d\n",
594 openTagName.c_str(), p2, p);
595 */
596 return p0;
597 }
598 p = p2;
599 continue;
600 }
601 //# ENTITY
602 if (ch=='&' && !cdata)
603 {
604 bool found = false;
605 for (EntityEntry *ee = entities ; ee->value ; ee++)
606 {
607 int p2 = match(p, ee->escaped);
608 if (p2>p)
609 {
610 data.push_back(ee->value);
611 p = p2;
612 found = true;
613 break;
614 }
615 }
616 if (!found)
617 {
618 error("unterminated entity");
619 return -1;
620 }
621 continue;
622 }
624 //# NONE OF THE ABOVE
625 data.push_back(ch);
626 p++;
627 }/*while*/
630 n->value = data;
631 //printf("%d : data:%s\n",p,data.c_str());
633 //## Get close tag
634 p = skipwhite(p);
635 ch = peek(p);
636 if (ch != '<')
637 {
638 error("no < for end tag\n");
639 return p0;
640 }
641 p++;
642 ch = peek(p);
643 if (ch != '/')
644 {
645 error("no / on end tag");
646 return p0;
647 }
648 p++;
649 ch = peek(p);
650 p = skipwhite(p);
651 DOMString closeTagName;
652 p = getWord(p, closeTagName);
653 if (openTagName != closeTagName)
654 {
655 error("Mismatched closing tag. Expected </%s>. Got '%s'.",
656 openTagName.c_str(), closeTagName.c_str());
657 return p0;
658 }
659 p = skipwhite(p);
660 if (peek(p) != '>')
661 {
662 error("no > on end tag for '%s'", closeTagName.c_str());
663 return p0;
664 }
665 p++;
666 // printf("close element:%s\n",closeTagName.c_str());
667 p = skipwhite(p);
668 return p;
669 }
674 Element *Parser::parse(XMLCh *buf,int pos,int len)
675 {
676 parselen = len;
677 parsebuf = buf;
678 Element *rootNode = new Element("root");
679 pos = parseVersion(pos);
680 pos = parseDoctype(pos);
681 pos = parseElement(pos, rootNode, 0);
682 return rootNode;
683 }
686 Element *Parser::parse(const char *buf, int pos, int len)
687 {
688 XMLCh *charbuf = new XMLCh[len + 1];
689 long i = 0;
690 for ( ; i < len ; i++)
691 charbuf[i] = (XMLCh)buf[i];
692 charbuf[i] = '\0';
694 Element *n = parse(charbuf, pos, len);
695 delete[] charbuf;
696 return n;
697 }
699 Element *Parser::parse(const DOMString &buf)
700 {
701 long len = (long)buf.size();
702 XMLCh *charbuf = new XMLCh[len + 1];
703 long i = 0;
704 for ( ; i < len ; i++)
705 charbuf[i] = (XMLCh)buf[i];
706 charbuf[i] = '\0';
708 Element *n = parse(charbuf, 0, len);
709 delete[] charbuf;
710 return n;
711 }
713 Element *Parser::parseFile(const DOMString &fileName)
714 {
716 //##### LOAD INTO A CHAR BUF, THEN CONVERT TO XMLCh
717 FILE *f = fopen(fileName.c_str(), "rb");
718 if (!f)
719 return NULL;
721 struct stat statBuf;
722 if (fstat(fileno(f),&statBuf)<0)
723 {
724 fclose(f);
725 return NULL;
726 }
727 long filelen = statBuf.st_size;
729 //printf("length:%d\n",filelen);
730 XMLCh *charbuf = new XMLCh[filelen + 1];
731 for (XMLCh *p=charbuf ; !feof(f) ; p++)
732 {
733 *p = (XMLCh)fgetc(f);
734 }
735 fclose(f);
736 charbuf[filelen] = '\0';
739 /*
740 printf("nrbytes:%d\n",wc_count);
741 printf("buf:%ls\n======\n",charbuf);
742 */
743 Element *n = parse(charbuf, 0, filelen);
744 delete [] charbuf;
745 return n;
746 }
754 }//namespace Pedro
756 #if 0
757 //########################################################################
758 //# T E S T
759 //########################################################################
761 bool doTest(char *fileName)
762 {
763 Pedro::Parser parser;
765 Pedro::Element *elem = parser.parseFile(fileName);
767 if (!elem)
768 {
769 printf("Parsing failed\n");
770 return false;
771 }
773 elem->print();
775 delete elem;
777 return true;
778 }
782 int main(int argc, char **argv)
783 {
784 if (argc != 2)
785 {
786 printf("usage: %s <xmlfile>\n", argv[0]);
787 return 1;
788 }
790 if (!doTest(argv[1]))
791 return 1;
793 return 0;
794 }
796 #endif
798 //########################################################################
799 //# E N D O F F I L E
800 //########################################################################