1 /*
2 * Implementation of the Pedro mini-DOM parser and tree
3 *
4 * Authors:
5 * Bob Jamison
6 *
7 * Copyright (C) 2005 Bob Jamison
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 */
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include <sys/types.h>
#include <sys/stat.h>

#include <vector>

#include "pedrodom.h"
38 namespace Pedro
39 {
43 //########################################################################
44 //# E L E M E N T
45 //########################################################################
47 Element *Element::clone()
48 {
49 Element *elem = new Element(name, value);
50 elem->parent = parent;
51 elem->attributes = attributes;
52 elem->namespaces = namespaces;
54 std::vector<Element *>::iterator iter;
55 for (iter = children.begin(); iter != children.end() ; iter++)
56 {
57 elem->addChild((*iter)->clone());
58 }
59 return elem;
60 }
63 void Element::findElementsRecursive(std::vector<Element *>&res, const DOMString &name)
64 {
65 if (getName() == name)
66 {
67 res.push_back(this);
68 }
69 for (unsigned int i=0; i<children.size() ; i++)
70 children[i]->findElementsRecursive(res, name);
71 }
73 std::vector<Element *> Element::findElements(const DOMString &name)
74 {
75 std::vector<Element *> res;
76 findElementsRecursive(res, name);
77 return res;
78 }
80 DOMString Element::getAttribute(const DOMString &name)
81 {
82 for (unsigned int i=0 ; i<attributes.size() ; i++)
83 if (attributes[i].getName() ==name)
84 return attributes[i].getValue();
85 return "";
86 }
88 DOMString Element::getTagAttribute(const DOMString &tagName, const DOMString &attrName)
89 {
90 std::vector<Element *>elems = findElements(tagName);
91 if (elems.size() <1)
92 return "";
93 DOMString res = elems[0]->getAttribute(attrName);
94 return res;
95 }
97 DOMString Element::getTagValue(const DOMString &tagName)
98 {
99 std::vector<Element *>elems = findElements(tagName);
100 if (elems.size() <1)
101 return "";
102 DOMString res = elems[0]->getValue();
103 return res;
104 }
106 void Element::addChild(Element *child)
107 {
108 if (!child)
109 return;
110 child->parent = this;
111 children.push_back(child);
112 }
115 void Element::addAttribute(const DOMString &name, const DOMString &value)
116 {
117 Attribute attr(name, value);
118 attributes.push_back(attr);
119 }
121 void Element::addNamespace(const DOMString &prefix, const DOMString &namespaceURI)
122 {
123 Namespace ns(prefix, namespaceURI);
124 namespaces.push_back(ns);
125 }
127 void Element::writeIndentedRecursive(FILE *f, int indent)
128 {
129 int i;
130 if (!f)
131 return;
132 //Opening tag, and attributes
133 for (i=0;i<indent;i++)
134 fputc(' ',f);
135 fprintf(f,"<%s",name.c_str());
136 for (unsigned int i=0 ; i<attributes.size() ; i++)
137 {
138 fprintf(f," %s=\"%s\"",
139 attributes[i].getName().c_str(),
140 attributes[i].getValue().c_str());
141 }
142 for (unsigned int i=0 ; i<namespaces.size() ; i++)
143 {
144 fprintf(f," xmlns:%s=\"%s\"",
145 namespaces[i].getPrefix().c_str(),
146 namespaces[i].getNamespaceURI().c_str());
147 }
148 fprintf(f,">\n");
150 //Between the tags
151 if (value.size() > 0)
152 {
153 for (int i=0;i<indent;i++)
154 fputc(' ', f);
155 fprintf(f," %s\n", value.c_str());
156 }
158 for (unsigned int i=0 ; i<children.size() ; i++)
159 children[i]->writeIndentedRecursive(f, indent+2);
161 //Closing tag
162 for (int i=0; i<indent; i++)
163 fputc(' ',f);
164 fprintf(f,"</%s>\n", name.c_str());
165 }
167 void Element::writeIndented(FILE *f)
168 {
169 writeIndentedRecursive(f, 0);
170 }
172 void Element::print()
173 {
174 writeIndented(stdout);
175 }
178 //########################################################################
179 //# P A R S E R
180 //########################################################################
/**
 * One row of the predefined-entity table: the escaped spelling of an
 * entity as it appears in XML text, and the single character it denotes.
 */
typedef struct
{
    const char *escaped;   // e.g. "&amp;"; NULL in the terminating row
    char        value;     // the decoded character; '\0' in the terminating row
} EntityEntry;

/**
 * The five entities predefined by XML.  The table is terminated by a row
 * whose value is '\0'; the lookup loops in this file stop on that row, so
 * it must stay last.
 */
static EntityEntry entities[] =
{
    { "&amp;" , '&'  },
    { "&lt;"  , '<'  },
    { "&gt;"  , '>'  },
    { "&apos;", '\'' },
    { "&quot;", '"'  },
    { NULL    , '\0' }
};
202 void Parser::getLineAndColumn(long pos, long *lineNr, long *colNr)
203 {
204 long line = 1;
205 long col = 1;
206 for (long i=0 ; i<pos ; i++)
207 {
208 XMLCh ch = parsebuf[i];
209 if (ch == '\n' || ch == '\r')
210 {
211 col = 0;
212 line ++;
213 }
214 else
215 col++;
216 }
217 *lineNr = line;
218 *colNr = col;
220 }
223 void Parser::error(char *fmt, ...)
224 {
225 long lineNr;
226 long colNr;
227 getLineAndColumn(currentPosition, &lineNr, &colNr);
228 va_list args;
229 fprintf(stderr, "xml error at line %ld, column %ld:", lineNr, colNr);
230 va_start(args,fmt);
231 vfprintf(stderr,fmt,args);
232 va_end(args) ;
233 fprintf(stderr, "\n");
234 }
238 int Parser::peek(long pos)
239 {
240 if (pos >= parselen)
241 return -1;
242 currentPosition = pos;
243 int ch = parsebuf[pos];
244 //printf("ch:%c\n", ch);
245 return ch;
246 }
250 DOMString Parser::encode(const DOMString &str)
251 {
252 DOMString ret;
253 for (unsigned int i=0 ; i<str.size() ; i++)
254 {
255 XMLCh ch = (XMLCh)str[i];
256 if (ch == '&')
257 ret.append("&");
258 else if (ch == '<')
259 ret.append("<");
260 else if (ch == '>')
261 ret.append(">");
262 else if (ch == '\'')
263 ret.append("'");
264 else if (ch == '"')
265 ret.append(""");
266 else
267 ret.push_back(ch);
269 }
270 return ret;
271 }
274 int Parser::match(long p0, const char *text)
275 {
276 int p = p0;
277 while (*text)
278 {
279 if (peek(p) != *text)
280 return p0;
281 p++; text++;
282 }
283 return p;
284 }
288 int Parser::skipwhite(long p)
289 {
291 while (p<parselen)
292 {
293 int p2 = match(p, "<!--");
294 if (p2 > p)
295 {
296 p = p2;
297 while (p<parselen)
298 {
299 p2 = match(p, "-->");
300 if (p2 > p)
301 {
302 p = p2;
303 break;
304 }
305 p++;
306 }
307 }
308 XMLCh b = peek(p);
309 if (!isspace(b))
310 break;
311 p++;
312 }
313 return p;
314 }
316 /* modify this to allow all chars for an element or attribute name*/
317 int Parser::getWord(int p0, DOMString &buf)
318 {
319 int p = p0;
320 while (p<parselen)
321 {
322 XMLCh b = peek(p);
323 if (b<=' ' || b=='/' || b=='>' || b=='=')
324 break;
325 buf.push_back(b);
326 p++;
327 }
328 return p;
329 }
331 int Parser::getQuoted(int p0, DOMString &buf, int do_i_parse)
332 {
334 int p = p0;
335 if (peek(p) != '"' && peek(p) != '\'')
336 return p0;
337 p++;
339 while ( p<parselen )
340 {
341 XMLCh b = peek(p);
342 if (b=='"' || b=='\'')
343 break;
344 if (b=='&' && do_i_parse)
345 {
346 bool found = false;
347 for (EntityEntry *ee = entities ; ee->value ; ee++)
348 {
349 int p2 = match(p, ee->escaped);
350 if (p2>p)
351 {
352 buf.push_back(ee->value);
353 p = p2;
354 found = true;
355 break;
356 }
357 }
358 if (!found)
359 {
360 error("unterminated entity");
361 return false;
362 }
363 }
364 else
365 {
366 buf.push_back(b);
367 p++;
368 }
369 }
370 return p;
371 }
373 int Parser::parseVersion(int p0)
374 {
375 //printf("### parseVersion: %d\n", p0);
377 int p = p0;
379 p = skipwhite(p0);
381 if (peek(p) != '<')
382 return p0;
384 p++;
385 if (p>=parselen || peek(p)!='?')
386 return p0;
388 p++;
390 DOMString buf;
392 while (p<parselen)
393 {
394 XMLCh ch = peek(p);
395 if (ch=='?')
396 {
397 p++;
398 break;
399 }
400 buf.push_back(ch);
401 p++;
402 }
404 if (peek(p) != '>')
405 return p0;
406 p++;
408 //printf("Got version:%s\n",buf.c_str());
409 return p;
410 }
412 int Parser::parseDoctype(int p0)
413 {
414 //printf("### parseDoctype: %d\n", p0);
416 int p = p0;
417 p = skipwhite(p);
419 if (p>=parselen || peek(p)!='<')
420 return p0;
422 p++;
424 if (peek(p)!='!' || peek(p+1)=='-')
425 return p0;
426 p++;
428 DOMString buf;
429 while (p<parselen)
430 {
431 XMLCh ch = peek(p);
432 if (ch=='>')
433 {
434 p++;
435 break;
436 }
437 buf.push_back(ch);
438 p++;
439 }
441 //printf("Got doctype:%s\n",buf.c_str());
442 return p;
443 }
445 int Parser::parseElement(int p0, Element *par,int depth)
446 {
448 int p = p0;
450 int p2 = p;
452 p = skipwhite(p);
454 //## Get open tag
455 XMLCh ch = peek(p);
456 if (ch!='<')
457 return p0;
459 p++;
461 DOMString openTagName;
462 p = skipwhite(p);
463 p = getWord(p, openTagName);
464 //printf("####tag :%s\n", openTagName.c_str());
465 p = skipwhite(p);
467 //Add element to tree
468 Element *n = new Element(openTagName);
469 n->parent = par;
470 par->addChild(n);
472 // Get attributes
473 if (peek(p) != '>')
474 {
475 while (p<parselen)
476 {
477 p = skipwhite(p);
478 ch = peek(p);
479 //printf("ch:%c\n",ch);
480 if (ch=='>')
481 break;
482 else if (ch=='/' && p<parselen+1)
483 {
484 p++;
485 p = skipwhite(p);
486 ch = peek(p);
487 if (ch=='>')
488 {
489 p++;
490 //printf("quick close\n");
491 return p;
492 }
493 }
494 DOMString attrName;
495 p2 = getWord(p, attrName);
496 if (p2==p)
497 break;
498 //printf("name:%s",buf);
499 p=p2;
500 p = skipwhite(p);
501 ch = peek(p);
502 //printf("ch:%c\n",ch);
503 if (ch!='=')
504 break;
505 p++;
506 p = skipwhite(p);
507 // ch = parsebuf[p];
508 // printf("ch:%c\n",ch);
509 DOMString attrVal;
510 p2 = getQuoted(p, attrVal, true);
511 p=p2+1;
512 //printf("name:'%s' value:'%s'\n",attrName.c_str(),attrVal.c_str());
513 char *namestr = (char *)attrName.c_str();
514 if (strncmp(namestr, "xmlns:", 6)==0)
515 n->addNamespace(attrName, attrVal);
516 else
517 n->addAttribute(attrName, attrVal);
518 }
519 }
521 bool cdata = false;
523 p++;
524 // ### Get intervening data ### */
525 DOMString data;
526 while (p<parselen)
527 {
528 //# COMMENT
529 p2 = match(p, "<!--");
530 if (!cdata && p2>p)
531 {
532 p = p2;
533 while (p<parselen)
534 {
535 p2 = match(p, "-->");
536 if (p2 > p)
537 {
538 p = p2;
539 break;
540 }
541 p++;
542 }
543 }
545 ch = peek(p);
546 //# END TAG
547 if (ch=='<' && !cdata && peek(p+1)=='/')
548 {
549 break;
550 }
551 //# CDATA
552 p2 = match(p, "<![CDATA[");
553 if (p2 > p)
554 {
555 cdata = true;
556 p = p2;
557 continue;
558 }
560 //# CHILD ELEMENT
561 if (ch == '<')
562 {
563 p2 = parseElement(p, n, depth+1);
564 if (p2 == p)
565 {
566 /*
567 printf("problem on element:%s. p2:%d p:%d\n",
568 openTagName.c_str(), p2, p);
569 */
570 return p0;
571 }
572 p = p2;
573 continue;
574 }
575 //# ENTITY
576 if (ch=='&' && !cdata)
577 {
578 bool found = false;
579 for (EntityEntry *ee = entities ; ee->value ; ee++)
580 {
581 int p2 = match(p, ee->escaped);
582 if (p2>p)
583 {
584 data.push_back(ee->value);
585 p = p2;
586 found = true;
587 break;
588 }
589 }
590 if (!found)
591 {
592 error("unterminated entity");
593 return -1;
594 }
595 continue;
596 }
598 //# NONE OF THE ABOVE
599 data.push_back(ch);
600 p++;
601 }/*while*/
604 n->value = data;
605 //printf("%d : data:%s\n",p,data.c_str());
607 //## Get close tag
608 p = skipwhite(p);
609 ch = peek(p);
610 if (ch != '<')
611 {
612 error("no < for end tag\n");
613 return p0;
614 }
615 p++;
616 ch = peek(p);
617 if (ch != '/')
618 {
619 error("no / on end tag");
620 return p0;
621 }
622 p++;
623 ch = peek(p);
624 p = skipwhite(p);
625 DOMString closeTagName;
626 p = getWord(p, closeTagName);
627 if (openTagName != closeTagName)
628 {
629 error("Mismatched closing tag. Expected </%S>. Got '%S'.",
630 openTagName.c_str(), closeTagName.c_str());
631 return p0;
632 }
633 p = skipwhite(p);
634 if (peek(p) != '>')
635 {
636 error("no > on end tag for '%s'", closeTagName.c_str());
637 return p0;
638 }
639 p++;
640 // printf("close element:%s\n",closeTagName.c_str());
641 p = skipwhite(p);
642 return p;
643 }
648 Element *Parser::parse(XMLCh *buf,int pos,int len)
649 {
650 parselen = len;
651 parsebuf = buf;
652 Element *rootNode = new Element("root");
653 pos = parseVersion(pos);
654 pos = parseDoctype(pos);
655 pos = parseElement(pos, rootNode, 0);
656 return rootNode;
657 }
660 Element *Parser::parse(const char *buf, int pos, int len)
661 {
663 XMLCh *charbuf = (XMLCh *)malloc((len+1) * sizeof(XMLCh));
664 long i = 0;
665 while (i< len)
666 {
667 charbuf[i] = (XMLCh)buf[i];
668 i++;
669 }
670 charbuf[i] = '\0';
671 Element *n = parse(charbuf, 0, len);
672 free(charbuf);
673 return n;
674 }
676 Element *Parser::parse(const DOMString &buf)
677 {
678 long len = buf.size();
679 XMLCh *charbuf = (XMLCh *)malloc((len+1) * sizeof(XMLCh));
680 long i = 0;
681 while (i< len)
682 {
683 charbuf[i] = (XMLCh)buf[i];
684 i++;
685 }
686 charbuf[i] = '\0';
687 Element *n = parse(charbuf, 0, len);
688 free(charbuf);
689 return n;
690 }
692 Element *Parser::parseFile(const char *fileName)
693 {
695 //##### LOAD INTO A CHAR BUF, THEN CONVERT TO XMLCh
696 if (!fileName)
697 return NULL;
699 FILE *f = fopen(fileName, "rb");
700 if (!f)
701 return NULL;
703 struct stat statBuf;
704 if (fstat(fileno(f),&statBuf)<0)
705 {
706 fclose(f);
707 return NULL;
708 }
709 long filelen = statBuf.st_size;
711 //printf("length:%d\n",filelen);
712 XMLCh *charbuf = (XMLCh *)malloc((filelen+1) * sizeof(XMLCh));
713 for (XMLCh *p=charbuf ; !feof(f) ; p++)
714 {
715 *p = (XMLCh)fgetc(f);
716 }
717 fclose(f);
718 charbuf[filelen] = '\0';
721 /*
722 printf("nrbytes:%d\n",wc_count);
723 printf("buf:%ls\n======\n",charbuf);
724 */
725 Element *n = parse(charbuf, 0, filelen);
726 free(charbuf);
727 return n;
728 }
736 }//namespace Pedro
738 #if 0
739 //########################################################################
740 //# T E S T
741 //########################################################################
743 bool doTest(char *fileName)
744 {
745 Pedro::Parser parser;
747 Pedro::Element *elem = parser.parseFile(fileName);
749 if (!elem)
750 {
751 printf("Parsing failed\n");
752 return false;
753 }
755 elem->print();
757 delete elem;
759 return true;
760 }
764 int main(int argc, char **argv)
765 {
766 if (argc != 2)
767 {
768 printf("usage: %s <xmlfile>\n", argv[0]);
769 return 1;
770 }
772 if (!doTest(argv[1]))
773 return 1;
775 return 0;
776 }
778 #endif
780 //########################################################################
781 //# E N D O F F I L E
782 //########################################################################