1 /*
2 * Implementation of the Pedro mini-DOM parser and tree
3 *
4 * Authors:
5 * Bob Jamison
6 *
7 * Copyright (C) 2005 Bob Jamison
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 */
26 #include <stdio.h>
27 #include <string.h>
28 #include <stdarg.h>
29 #include <malloc.h>
30 #include <sys/types.h>
31 #include <sys/stat.h>
34 #include "pedrodom.h"
36 namespace Pedro
37 {
41 //########################################################################
42 //# E L E M E N T
43 //########################################################################
45 Element *Element::clone()
46 {
47 Element *elem = new Element(name, value);
48 elem->parent = parent;
49 elem->attributes = attributes;
50 elem->namespaces = namespaces;
52 std::vector<Element *>::iterator iter;
53 for (iter = children.begin(); iter != children.end() ; iter++)
54 {
55 elem->addChild((*iter)->clone());
56 }
57 return elem;
58 }
61 void Element::findElementsRecursive(std::vector<Element *>&res, const DOMString &name)
62 {
63 if (getName() == name)
64 {
65 res.push_back(this);
66 }
67 for (unsigned int i=0; i<children.size() ; i++)
68 children[i]->findElementsRecursive(res, name);
69 }
71 std::vector<Element *> Element::findElements(const DOMString &name)
72 {
73 std::vector<Element *> res;
74 findElementsRecursive(res, name);
75 return res;
76 }
78 DOMString Element::getAttribute(const DOMString &name)
79 {
80 for (unsigned int i=0 ; i<attributes.size() ; i++)
81 if (attributes[i].getName() ==name)
82 return attributes[i].getValue();
83 return "";
84 }
86 DOMString Element::getTagAttribute(const DOMString &tagName, const DOMString &attrName)
87 {
88 std::vector<Element *>elems = findElements(tagName);
89 if (elems.size() <1)
90 return "";
91 DOMString res = elems[0]->getAttribute(attrName);
92 return res;
93 }
95 DOMString Element::getTagValue(const DOMString &tagName)
96 {
97 std::vector<Element *>elems = findElements(tagName);
98 if (elems.size() <1)
99 return "";
100 DOMString res = elems[0]->getValue();
101 return res;
102 }
104 void Element::addChild(Element *child)
105 {
106 if (!child)
107 return;
108 child->parent = this;
109 children.push_back(child);
110 }
113 void Element::addAttribute(const DOMString &name, const DOMString &value)
114 {
115 Attribute attr(name, value);
116 attributes.push_back(attr);
117 }
119 void Element::addNamespace(const DOMString &prefix, const DOMString &namespaceURI)
120 {
121 Namespace ns(prefix, namespaceURI);
122 namespaces.push_back(ns);
123 }
125 void Element::writeIndentedRecursive(FILE *f, int indent)
126 {
127 int i;
128 if (!f)
129 return;
130 //Opening tag, and attributes
131 for (i=0;i<indent;i++)
132 fputc(' ',f);
133 fprintf(f,"<%s",name.c_str());
134 for (unsigned int i=0 ; i<attributes.size() ; i++)
135 {
136 fprintf(f," %s=\"%s\"",
137 attributes[i].getName().c_str(),
138 attributes[i].getValue().c_str());
139 }
140 for (unsigned int i=0 ; i<namespaces.size() ; i++)
141 {
142 fprintf(f," xmlns:%s=\"%s\"",
143 namespaces[i].getPrefix().c_str(),
144 namespaces[i].getNamespaceURI().c_str());
145 }
146 fprintf(f,">\n");
148 //Between the tags
149 if (value.size() > 0)
150 {
151 for (int i=0;i<indent;i++)
152 fputc(' ', f);
153 fprintf(f," %s\n", value.c_str());
154 }
156 for (unsigned int i=0 ; i<children.size() ; i++)
157 children[i]->writeIndentedRecursive(f, indent+2);
159 //Closing tag
160 for (int i=0; i<indent; i++)
161 fputc(' ',f);
162 fprintf(f,"</%s>\n", name.c_str());
163 }
165 void Element::writeIndented(FILE *f)
166 {
167 writeIndentedRecursive(f, 0);
168 }
170 void Element::print()
171 {
172 writeIndented(stdout);
173 }
176 //########################################################################
177 //# P A R S E R
178 //########################################################################
/**
 * One row of the predefined-entity table: the escaped spelling as it
 * appears in XML text, and the single character it stands for.
 */
typedef struct
{
    const char *escaped;   // string literals are const; never written through
    char        value;
} EntityEntry;

/**
 * The five entities predefined by XML.  The last row, whose value is
 * '\0', is a sentinel: the lookup loops stop when they reach it.
 */
static EntityEntry entities[] =
{
    { "&amp;"  , '&'  },
    { "&lt;"   , '<'  },
    { "&gt;"   , '>'  },
    { "&apos;" , '\'' },
    { "&quot;" , '"'  },
    { NULL     , '\0' }
};
200 void Parser::getLineAndColumn(long pos, long *lineNr, long *colNr)
201 {
202 long line = 1;
203 long col = 1;
204 for (long i=0 ; i<pos ; i++)
205 {
206 XMLCh ch = parsebuf[i];
207 if (ch == '\n' || ch == '\r')
208 {
209 col = 0;
210 line ++;
211 }
212 else
213 col++;
214 }
215 *lineNr = line;
216 *colNr = col;
218 }
221 void Parser::error(char *fmt, ...)
222 {
223 long lineNr;
224 long colNr;
225 getLineAndColumn(currentPosition, &lineNr, &colNr);
226 va_list args;
227 fprintf(stderr, "xml error at line %ld, column %ld:", lineNr, colNr);
228 va_start(args,fmt);
229 vfprintf(stderr,fmt,args);
230 va_end(args) ;
231 fprintf(stderr, "\n");
232 }
236 int Parser::peek(long pos)
237 {
238 if (pos >= parselen)
239 return -1;
240 currentPosition = pos;
241 int ch = parsebuf[pos];
242 //printf("ch:%c\n", ch);
243 return ch;
244 }
248 DOMString Parser::encode(const DOMString &str)
249 {
250 DOMString ret;
251 for (unsigned int i=0 ; i<str.size() ; i++)
252 {
253 XMLCh ch = (XMLCh)str[i];
254 if (ch == '&')
255 ret.append("&");
256 else if (ch == '<')
257 ret.append("<");
258 else if (ch == '>')
259 ret.append(">");
260 else if (ch == '\'')
261 ret.append("'");
262 else if (ch == '"')
263 ret.append(""");
264 else
265 ret.push_back(ch);
267 }
268 return ret;
269 }
272 int Parser::match(long p0, const char *text)
273 {
274 int p = p0;
275 while (*text)
276 {
277 if (peek(p) != *text)
278 return p0;
279 p++; text++;
280 }
281 return p;
282 }
286 int Parser::skipwhite(long p)
287 {
289 while (p<parselen)
290 {
291 int p2 = match(p, "<!--");
292 if (p2 > p)
293 {
294 p = p2;
295 while (p<parselen)
296 {
297 p2 = match(p, "-->");
298 if (p2 > p)
299 {
300 p = p2;
301 break;
302 }
303 p++;
304 }
305 }
306 XMLCh b = peek(p);
307 if (!isspace(b))
308 break;
309 p++;
310 }
311 return p;
312 }
314 /* modify this to allow all chars for an element or attribute name*/
315 int Parser::getWord(int p0, DOMString &buf)
316 {
317 int p = p0;
318 while (p<parselen)
319 {
320 XMLCh b = peek(p);
321 if (b<=' ' || b=='/' || b=='>' || b=='=')
322 break;
323 buf.push_back(b);
324 p++;
325 }
326 return p;
327 }
329 int Parser::getQuoted(int p0, DOMString &buf, int do_i_parse)
330 {
332 int p = p0;
333 if (peek(p) != '"' && peek(p) != '\'')
334 return p0;
335 p++;
337 while ( p<parselen )
338 {
339 XMLCh b = peek(p);
340 if (b=='"' || b=='\'')
341 break;
342 if (b=='&' && do_i_parse)
343 {
344 bool found = false;
345 for (EntityEntry *ee = entities ; ee->value ; ee++)
346 {
347 int p2 = match(p, ee->escaped);
348 if (p2>p)
349 {
350 buf.push_back(ee->value);
351 p = p2;
352 found = true;
353 break;
354 }
355 }
356 if (!found)
357 {
358 error("unterminated entity");
359 return false;
360 }
361 }
362 else
363 {
364 buf.push_back(b);
365 p++;
366 }
367 }
368 return p;
369 }
371 int Parser::parseVersion(int p0)
372 {
373 //printf("### parseVersion: %d\n", p0);
375 int p = p0;
377 p = skipwhite(p0);
379 if (peek(p) != '<')
380 return p0;
382 p++;
383 if (p>=parselen || peek(p)!='?')
384 return p0;
386 p++;
388 DOMString buf;
390 while (p<parselen)
391 {
392 XMLCh ch = peek(p);
393 if (ch=='?')
394 {
395 p++;
396 break;
397 }
398 buf.push_back(ch);
399 p++;
400 }
402 if (peek(p) != '>')
403 return p0;
404 p++;
406 //printf("Got version:%s\n",buf.c_str());
407 return p;
408 }
410 int Parser::parseDoctype(int p0)
411 {
412 //printf("### parseDoctype: %d\n", p0);
414 int p = p0;
415 p = skipwhite(p);
417 if (p>=parselen || peek(p)!='<')
418 return p0;
420 p++;
422 if (peek(p)!='!' || peek(p+1)=='-')
423 return p0;
424 p++;
426 DOMString buf;
427 while (p<parselen)
428 {
429 XMLCh ch = peek(p);
430 if (ch=='>')
431 {
432 p++;
433 break;
434 }
435 buf.push_back(ch);
436 p++;
437 }
439 //printf("Got doctype:%s\n",buf.c_str());
440 return p;
441 }
443 int Parser::parseElement(int p0, Element *par,int depth)
444 {
446 int p = p0;
448 int p2 = p;
450 p = skipwhite(p);
452 //## Get open tag
453 XMLCh ch = peek(p);
454 if (ch!='<')
455 return p0;
457 p++;
459 DOMString openTagName;
460 p = skipwhite(p);
461 p = getWord(p, openTagName);
462 //printf("####tag :%s\n", openTagName.c_str());
463 p = skipwhite(p);
465 //Add element to tree
466 Element *n = new Element(openTagName);
467 n->parent = par;
468 par->addChild(n);
470 // Get attributes
471 if (peek(p) != '>')
472 {
473 while (p<parselen)
474 {
475 p = skipwhite(p);
476 ch = peek(p);
477 //printf("ch:%c\n",ch);
478 if (ch=='>')
479 break;
480 else if (ch=='/' && p<parselen+1)
481 {
482 p++;
483 p = skipwhite(p);
484 ch = peek(p);
485 if (ch=='>')
486 {
487 p++;
488 //printf("quick close\n");
489 return p;
490 }
491 }
492 DOMString attrName;
493 p2 = getWord(p, attrName);
494 if (p2==p)
495 break;
496 //printf("name:%s",buf);
497 p=p2;
498 p = skipwhite(p);
499 ch = peek(p);
500 //printf("ch:%c\n",ch);
501 if (ch!='=')
502 break;
503 p++;
504 p = skipwhite(p);
505 // ch = parsebuf[p];
506 // printf("ch:%c\n",ch);
507 DOMString attrVal;
508 p2 = getQuoted(p, attrVal, true);
509 p=p2+1;
510 //printf("name:'%s' value:'%s'\n",attrName.c_str(),attrVal.c_str());
511 char *namestr = (char *)attrName.c_str();
512 if (strncmp(namestr, "xmlns:", 6)==0)
513 n->addNamespace(attrName, attrVal);
514 else
515 n->addAttribute(attrName, attrVal);
516 }
517 }
519 bool cdata = false;
521 p++;
522 // ### Get intervening data ### */
523 DOMString data;
524 while (p<parselen)
525 {
526 //# COMMENT
527 p2 = match(p, "<!--");
528 if (!cdata && p2>p)
529 {
530 p = p2;
531 while (p<parselen)
532 {
533 p2 = match(p, "-->");
534 if (p2 > p)
535 {
536 p = p2;
537 break;
538 }
539 p++;
540 }
541 }
543 ch = peek(p);
544 //# END TAG
545 if (ch=='<' && !cdata && peek(p+1)=='/')
546 {
547 break;
548 }
549 //# CDATA
550 p2 = match(p, "<![CDATA[");
551 if (p2 > p)
552 {
553 cdata = true;
554 p = p2;
555 continue;
556 }
558 //# CHILD ELEMENT
559 if (ch == '<')
560 {
561 p2 = parseElement(p, n, depth+1);
562 if (p2 == p)
563 {
564 /*
565 printf("problem on element:%s. p2:%d p:%d\n",
566 openTagName.c_str(), p2, p);
567 */
568 return p0;
569 }
570 p = p2;
571 continue;
572 }
573 //# ENTITY
574 if (ch=='&' && !cdata)
575 {
576 bool found = false;
577 for (EntityEntry *ee = entities ; ee->value ; ee++)
578 {
579 int p2 = match(p, ee->escaped);
580 if (p2>p)
581 {
582 data.push_back(ee->value);
583 p = p2;
584 found = true;
585 break;
586 }
587 }
588 if (!found)
589 {
590 error("unterminated entity");
591 return -1;
592 }
593 continue;
594 }
596 //# NONE OF THE ABOVE
597 data.push_back(ch);
598 p++;
599 }/*while*/
602 n->value = data;
603 //printf("%d : data:%s\n",p,data.c_str());
605 //## Get close tag
606 p = skipwhite(p);
607 ch = peek(p);
608 if (ch != '<')
609 {
610 error("no < for end tag\n");
611 return p0;
612 }
613 p++;
614 ch = peek(p);
615 if (ch != '/')
616 {
617 error("no / on end tag");
618 return p0;
619 }
620 p++;
621 ch = peek(p);
622 p = skipwhite(p);
623 DOMString closeTagName;
624 p = getWord(p, closeTagName);
625 if (openTagName != closeTagName)
626 {
627 error("Mismatched closing tag. Expected </%S>. Got '%S'.",
628 openTagName.c_str(), closeTagName.c_str());
629 return p0;
630 }
631 p = skipwhite(p);
632 if (peek(p) != '>')
633 {
634 error("no > on end tag for '%s'", closeTagName.c_str());
635 return p0;
636 }
637 p++;
638 // printf("close element:%s\n",closeTagName.c_str());
639 p = skipwhite(p);
640 return p;
641 }
646 Element *Parser::parse(XMLCh *buf,int pos,int len)
647 {
648 parselen = len;
649 parsebuf = buf;
650 Element *rootNode = new Element("root");
651 pos = parseVersion(pos);
652 pos = parseDoctype(pos);
653 pos = parseElement(pos, rootNode, 0);
654 return rootNode;
655 }
658 Element *Parser::parse(const char *buf, int pos, int len)
659 {
661 XMLCh *charbuf = (XMLCh *)malloc((len+1) * sizeof(XMLCh));
662 long i = 0;
663 while (i< len)
664 {
665 charbuf[i] = (XMLCh)buf[i];
666 i++;
667 }
668 charbuf[i] = '\0';
669 Element *n = parse(charbuf, 0, len);
670 free(charbuf);
671 return n;
672 }
674 Element *Parser::parse(const DOMString &buf)
675 {
676 long len = buf.size();
677 XMLCh *charbuf = (XMLCh *)malloc((len+1) * sizeof(XMLCh));
678 long i = 0;
679 while (i< len)
680 {
681 charbuf[i] = (XMLCh)buf[i];
682 i++;
683 }
684 charbuf[i] = '\0';
685 Element *n = parse(charbuf, 0, len);
686 free(charbuf);
687 return n;
688 }
690 Element *Parser::parseFile(const char *fileName)
691 {
693 //##### LOAD INTO A CHAR BUF, THEN CONVERT TO XMLCh
694 if (!fileName)
695 return NULL;
697 FILE *f = fopen(fileName, "rb");
698 if (!f)
699 return NULL;
701 struct stat statBuf;
702 if (fstat(fileno(f),&statBuf)<0)
703 {
704 fclose(f);
705 return NULL;
706 }
707 long filelen = statBuf.st_size;
709 //printf("length:%d\n",filelen);
710 XMLCh *charbuf = (XMLCh *)malloc((filelen+1) * sizeof(XMLCh));
711 for (XMLCh *p=charbuf ; !feof(f) ; p++)
712 {
713 *p = (XMLCh)fgetc(f);
714 }
715 fclose(f);
716 charbuf[filelen] = '\0';
719 /*
720 printf("nrbytes:%d\n",wc_count);
721 printf("buf:%ls\n======\n",charbuf);
722 */
723 Element *n = parse(charbuf, 0, filelen);
724 free(charbuf);
725 return n;
726 }
734 }//namespace Pedro
736 #if 0
737 //########################################################################
738 //# T E S T
739 //########################################################################
741 bool doTest(char *fileName)
742 {
743 Pedro::Parser parser;
745 Pedro::Element *elem = parser.parseFile(fileName);
747 if (!elem)
748 {
749 printf("Parsing failed\n");
750 return false;
751 }
753 elem->print();
755 delete elem;
757 return true;
758 }
762 int main(int argc, char **argv)
763 {
764 if (argc != 2)
765 {
766 printf("usage: %s <xmlfile>\n", argv[0]);
767 return 1;
768 }
770 if (!doTest(argv[1]))
771 return 1;
773 return 0;
774 }
776 #endif
778 //########################################################################
779 //# E N D O F F I L E
780 //########################################################################