4 #include <stdio.h>
5 #include <string.h>
6 #include <stdarg.h>
7 #include <malloc.h>
8 #include <sys/types.h>
9 #include <sys/stat.h>
12 #include "minidom.h"
14 namespace MiniDom
15 {
19 //########################################################################
20 //# E L E M E N T
21 //########################################################################
23 void Element::findElementsRecursive(std::vector<Element *>&res, const DOMString &name)
24 {
25 if (getName() == name)
26 res.push_back(this);
27 for (unsigned int i=0; i<children.size() ; i++)
28 children[i]->findElementsRecursive(res, name);
29 }
31 std::vector<Element *> Element::findElements(const DOMString &name)
32 {
33 std::vector<Element *> res;
34 findElementsRecursive(res, name);
35 return res;
36 }
38 DOMString Element::getAttribute(const DOMString &name)
39 {
40 for (unsigned int i=0 ; i<attributes.size() ; i++)
41 if (attributes[i].getName() ==name)
42 return attributes[i].getValue();
43 return "";
44 }
46 void Element::addChild(Element *child)
47 {
48 children.push_back(child);
49 }
52 void Element::addAttribute(const DOMString &name, const DOMString &value)
53 {
54 Attribute attr(name, value);
55 attributes.push_back(attr);
56 }
58 void Element::addNamespace(const DOMString &prefix, const DOMString &namespaceURI)
59 {
60 Namespace ns(prefix, namespaceURI);
61 namespaces.push_back(ns);
62 }
64 void Element::writeIndentedRecursive(FILE *f, int indent)
65 {
66 int i;
67 if (!f)
68 return;
69 //Opening tag, and attributes
70 for (i=0;i<indent;i++)
71 fputc(' ',f);
72 fprintf(f,"<%s",name.c_str());
73 for (unsigned int i=0 ; i<attributes.size() ; i++)
74 {
75 fprintf(f," %s=\"%s\"",
76 attributes[i].getName().c_str(),
77 attributes[i].getValue().c_str());
78 }
79 for (unsigned int i=0 ; i<namespaces.size() ; i++)
80 {
81 fprintf(f," xmlns:%s=\"%s\"",
82 namespaces[i].getPrefix().c_str(),
83 namespaces[i].getNamespaceURI().c_str());
84 }
85 fprintf(f,">\n");
87 //Between the tags
88 if (value.size() > 0)
89 {
90 for (int i=0;i<indent;i++)
91 fputc(' ', f);
92 fprintf(f," %s\n", value.c_str());
93 }
95 for (unsigned int i=0 ; i<children.size() ; i++)
96 children[i]->writeIndentedRecursive(f, indent+2);
98 //Closing tag
99 for (int i=0; i<indent; i++)
100 fputc(' ',f);
101 fprintf(f,"</%s>\n", name.c_str());
102 }
104 void Element::writeIndented(FILE *f)
105 {
106 writeIndentedRecursive(f, 0);
107 }
109 void Element::print()
110 {
111 writeIndented(stdout);
112 }
115 //########################################################################
116 //# P A R S E R
117 //########################################################################
/**
 * One row of the predefined-entity table: the entity reference as it
 * appears in XML text, and the single character it stands for.
 */
typedef struct
{
    const char *escaped;   // entity reference, e.g. "&amp;"
    char        value;     // decoded character, e.g. '&'
} EntityEntry;

/**
 * The five entities predefined by XML.  The table is terminated by an
 * entry whose value is '\0'; the lookup loops stop on that sentinel.
 */
static EntityEntry entities[] =
{
    { "&amp;"  , '&'  },
    { "&lt;"   , '<'  },
    { "&gt;"   , '>'  },
    { "&apos;" , '\'' },
    { "&quot;" , '"'  },
    { NULL     , '\0' }
};
139 void Parser::getLineAndColumn(long pos, long *lineNr, long *colNr)
140 {
141 long line = 1;
142 long col = 1;
143 for (long i=0 ; i<pos ; i++)
144 {
145 XMLCh ch = parsebuf[i];
146 if (ch == '\n' || ch == '\r')
147 {
148 col = 0;
149 line ++;
150 }
151 else
152 col++;
153 }
154 *lineNr = line;
155 *colNr = col;
157 }
160 void Parser::error(char *fmt, ...)
161 {
162 long lineNr;
163 long colNr;
164 getLineAndColumn(currentPosition, &lineNr, &colNr);
165 va_list args;
166 fprintf(stderr, "xml error at line %ld, column %ld:", lineNr, colNr);
167 va_start(args,fmt);
168 vfprintf(stderr,fmt,args);
169 va_end(args) ;
170 fprintf(stderr, "\n");
171 }
175 int Parser::peek(long pos)
176 {
177 if (pos >= parselen)
178 return -1;
179 currentPosition = pos;
180 int ch = parsebuf[pos];
181 //printf("ch:%c\n", ch);
182 return ch;
183 }
187 int Parser::match(long p0, const char *text)
188 {
189 int p = p0;
190 while (*text)
191 {
192 if (peek(p) != *text)
193 return p0;
194 p++; text++;
195 }
196 return p;
197 }
201 int Parser::skipwhite(long p)
202 {
204 while (p<parselen)
205 {
206 int p2 = match(p, "<!--");
207 if (p2 > p)
208 {
209 p = p2;
210 while (p<parselen)
211 {
212 p2 = match(p, "-->");
213 if (p2 > p)
214 {
215 p = p2;
216 break;
217 }
218 p++;
219 }
220 }
221 XMLCh b = peek(p);
222 if (!isspace(b))
223 break;
224 p++;
225 }
226 return p;
227 }
229 /* modify this to allow all chars for an element or attribute name*/
230 int Parser::getWord(int p0, DOMString &buf)
231 {
232 int p = p0;
233 while (p<parselen)
234 {
235 XMLCh b = peek(p);
236 if (b<=' ' || b=='/' || b=='>' || b=='=')
237 break;
238 buf.push_back(b);
239 p++;
240 }
241 return p;
242 }
244 int Parser::getQuoted(int p0, DOMString &buf, int do_i_parse)
245 {
247 int p = p0;
248 if (peek(p) != '"' && peek(p) != '\'')
249 return p0;
250 p++;
252 while ( p<parselen )
253 {
254 XMLCh b = peek(p);
255 if (b=='"' || b=='\'')
256 break;
257 if (b=='&' && do_i_parse)
258 {
259 bool found = false;
260 for (EntityEntry *ee = entities ; ee->value ; ee++)
261 {
262 int p2 = match(p, ee->escaped);
263 if (p2>p)
264 {
265 buf.push_back(ee->value);
266 p = p2;
267 found = true;
268 break;
269 }
270 }
271 if (!found)
272 {
273 error("unterminated entity");
274 return false;
275 }
276 }
277 else
278 {
279 buf.push_back(b);
280 p++;
281 }
282 }
283 return p;
284 }
286 int Parser::parseVersion(int p0)
287 {
288 //printf("### parseVersion: %d\n", p0);
290 int p = p0;
292 p = skipwhite(p0);
294 if (peek(p) != '<')
295 return p0;
297 p++;
298 if (p>=parselen || peek(p)!='?')
299 return p0;
301 p++;
303 DOMString buf;
305 while (p<parselen)
306 {
307 XMLCh ch = peek(p++);
308 if (ch=='?')
309 break;
310 buf.push_back(ch);
311 }
312 if (peek(p) != '>')
313 return p0;
314 p++;
316 //printf("Got version:%s\n",buf.c_str());
317 return p;
318 }
320 int Parser::parseDoctype(int p0)
321 {
322 //printf("### parseDoctype: %d\n", p0);
324 int p = p0;
325 p = skipwhite(p);
327 if (p>=parselen || peek(p)!='<')
328 return p0;
330 p++;
332 if (peek(p)!='!' || peek(p+1)=='-')
333 return p0;
334 p++;
336 DOMString buf;
337 while (p<parselen)
338 {
339 XMLCh ch = peek(p);
340 if (ch=='>')
341 {
342 p++;
343 break;
344 }
345 buf.push_back(ch);
346 p++;
347 }
349 //printf("Got doctype:%s\n",buf.c_str());
350 return p;
351 }
353 int Parser::parseElement(int p0, Element *par,int depth)
354 {
356 int p = p0;
358 int p2 = p;
360 p = skipwhite(p);
362 //## Get open tag
363 XMLCh ch = peek(p);
364 if (ch!='<')
365 return p0;
367 p++;
369 DOMString openTagName;
370 p = skipwhite(p);
371 p = getWord(p, openTagName);
372 //printf("####tag :%s\n", openTagName.c_str());
373 p = skipwhite(p);
375 //Add element to tree
376 Element *n = new Element(openTagName);
377 n->parent = par;
378 par->addChild(n);
380 // Get attributes
381 if (peek(p) != '>')
382 {
383 while (p<parselen)
384 {
385 p = skipwhite(p);
386 ch = peek(p);
387 //printf("ch:%c\n",ch);
388 if (ch=='>')
389 break;
390 else if (ch=='/' && p<parselen+1)
391 {
392 p++;
393 p = skipwhite(p);
394 ch = peek(p);
395 if (ch=='>')
396 {
397 p++;
398 //printf("quick close\n");
399 return p;
400 }
401 }
402 DOMString attrName;
403 p2 = getWord(p, attrName);
404 if (p2==p)
405 break;
406 //printf("name:%s",buf);
407 p=p2;
408 p = skipwhite(p);
409 ch = peek(p);
410 //printf("ch:%c\n",ch);
411 if (ch!='=')
412 break;
413 p++;
414 p = skipwhite(p);
415 // ch = parsebuf[p];
416 // printf("ch:%c\n",ch);
417 DOMString attrVal;
418 p2 = getQuoted(p, attrVal, true);
419 p=p2+1;
420 //printf("name:'%s' value:'%s'\n",attrName.c_str(),attrVal.c_str());
421 char *namestr = (char *)attrName.c_str();
422 if (strncmp(namestr, "xmlns:", 6)==0)
423 n->addNamespace(attrName, attrVal);
424 else
425 n->addAttribute(attrName, attrVal);
426 }
427 }
429 bool cdata = false;
431 p++;
432 // ### Get intervening data ### */
433 DOMString data;
434 while (p<parselen)
435 {
436 //# COMMENT
437 p2 = match(p, "<!--");
438 if (!cdata && p2>p)
439 {
440 p = p2;
441 while (p<parselen)
442 {
443 p2 = match(p, "-->");
444 if (p2 > p)
445 {
446 p = p2;
447 break;
448 }
449 p++;
450 }
451 }
453 ch = peek(p);
454 //# END TAG
455 if (ch=='<' && !cdata && peek(p+1)=='/')
456 {
457 break;
458 }
459 //# CDATA
460 p2 = match(p, "<![CDATA[");
461 if (p2 > p)
462 {
463 cdata = true;
464 p = p2;
465 continue;
466 }
468 //# CHILD ELEMENT
469 if (ch == '<')
470 {
471 p2 = parseElement(p, n, depth+1);
472 if (p2 == p)
473 {
474 /*
475 printf("problem on element:%s. p2:%d p:%d\n",
476 openTagName.c_str(), p2, p);
477 */
478 return p0;
479 }
480 p = p2;
481 continue;
482 }
483 //# ENTITY
484 if (ch=='&' && !cdata)
485 {
486 bool found = false;
487 for (EntityEntry *ee = entities ; ee->value ; ee++)
488 {
489 int p2 = match(p, ee->escaped);
490 if (p2>p)
491 {
492 data.push_back(ee->value);
493 p = p2;
494 found = true;
495 break;
496 }
497 }
498 if (!found)
499 {
500 error("unterminated entity");
501 return -1;
502 }
503 continue;
504 }
506 //# NONE OF THE ABOVE
507 data.push_back(ch);
508 p++;
509 }/*while*/
512 n->value = data;
513 //printf("%d : data:%s\n",p,data.c_str());
515 //## Get close tag
516 p = skipwhite(p);
517 ch = peek(p);
518 if (ch != '<')
519 {
520 error("no < for end tag\n");
521 return p0;
522 }
523 p++;
524 ch = peek(p);
525 if (ch != '/')
526 {
527 error("no / on end tag");
528 return p0;
529 }
530 p++;
531 ch = peek(p);
532 p = skipwhite(p);
533 DOMString closeTagName;
534 p = getWord(p, closeTagName);
535 if (openTagName != closeTagName)
536 {
537 error("Mismatched closing tag. Expected </%S>. Got '%S'.",
538 openTagName.c_str(), closeTagName.c_str());
539 return p0;
540 }
541 p = skipwhite(p);
542 if (peek(p) != '>')
543 {
544 error("no > on end tag for '%s'", closeTagName.c_str());
545 return p0;
546 }
547 p++;
548 // printf("close element:%s\n",closeTagName.c_str());
549 p = skipwhite(p);
550 return p;
551 }
556 Element *Parser::parse(XMLCh *buf,int pos,int len)
557 {
558 parselen = len;
559 parsebuf = buf;
560 Element *rootNode = new Element("root");
561 pos = parseVersion(pos);
562 pos = parseDoctype(pos);
563 pos = parseElement(pos, rootNode, 0);
564 return rootNode;
565 }
568 Element *Parser::parse(const char *buf, int pos, int len)
569 {
571 XMLCh *charbuf = (XMLCh *)malloc((len+1) * sizeof(XMLCh));
572 long i = 0;
573 while (i< len)
574 {
575 charbuf[i] = (XMLCh)buf[i];
576 i++;
577 }
578 charbuf[i] = '\0';
579 Element *n = parse(charbuf, 0, len-1);
580 free(charbuf);
581 return n;
582 }
584 Element *Parser::parse(const DOMString &buf)
585 {
586 long len = buf.size();
587 XMLCh *charbuf = (XMLCh *)malloc((len+1) * sizeof(XMLCh));
588 long i = 0;
589 while (i< len)
590 {
591 charbuf[i] = (XMLCh)buf[i];
592 i++;
593 }
594 charbuf[i] = '\0';
595 Element *n = parse(charbuf, 0, len-1);
596 free(charbuf);
597 return n;
598 }
600 Element *Parser::parseFile(const char *fileName)
601 {
603 //##### LOAD INTO A CHAR BUF, THEN CONVERT TO XMLCh
604 if (!fileName)
605 return NULL;
607 FILE *f = fopen(fileName, "rb");
608 if (!f)
609 return NULL;
611 struct stat statBuf;
612 if (fstat(fileno(f),&statBuf)<0)
613 {
614 fclose(f);
615 return NULL;
616 }
617 long filelen = statBuf.st_size;
619 //printf("length:%d\n",filelen);
620 XMLCh *charbuf = (XMLCh *)malloc((filelen+1) * sizeof(XMLCh));
621 for (XMLCh *p=charbuf ; !feof(f) ; p++)
622 {
623 *p = (XMLCh)fgetc(f);
624 }
625 fclose(f);
626 charbuf[filelen] = '\0';
629 /*
630 printf("nrbytes:%d\n",wc_count);
631 printf("buf:%ls\n======\n",charbuf);
632 */
633 Element *n = parse(charbuf, 0, filelen-1);
634 free(charbuf);
635 return n;
636 }
644 }//namespace MiniDom
645 //########################################################################
646 //# T E S T
647 //########################################################################
649 bool doTest(char *fileName)
650 {
651 MiniDom::Parser parser;
653 MiniDom::Element *elem = parser.parseFile(fileName);
655 if (!elem)
656 {
657 printf("Parsing failed\n");
658 return false;
659 }
661 elem->print();
663 delete elem;
665 return true;
666 }
670 int main(int argc, char **argv)
671 {
672 if (argc != 2)
673 {
674 printf("usage: %s <xmlfile>\n", argv[0]);
675 return 1;
676 }
678 if (!doTest(argv[1]))
679 return 1;
681 return 0;
682 }
686 //########################################################################
687 //# E N D O F F I L E
688 //########################################################################