src/pedro/pedrodom.cpp

   1 /*
   2  * Implementation of the Pedro mini-DOM parser and tree
   3  *
   4  * Authors:
   5  *   Bob Jamison
   6  *
   7  * Copyright (C) 2005-2008 Bob Jamison
   8  *
   9  *  This library is free software; you can redistribute it and/or
  10  *  modify it under the terms of the GNU Lesser General Public
  11  *  License as published by the Free Software Foundation; either
  12  *  version 2.1 of the License, or (at your option) any later version.
  13  *
  14  *  This library is distributed in the hope that it will be useful,
  15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  *  Lesser General Public License for more details.
  18  *
  19  *  You should have received a copy of the GNU Lesser General Public
  20  *  License along with this library; if not, write to the Free Software
  21  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  22  */
  23
  24
  25
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <sys/types.h>
#include <sys/stat.h>

#include <vector>

#include "pedrodom.h"
  34
  35 namespace Pedro
  36 {
  37
  38
  39
  40 //########################################################################
  41 //# E L E M E N T
  42 //########################################################################
  43
  44 Element *Element::clone()
  45 {
  46     Element *elem = new Element(name, value);
  47     elem->parent     = parent;
  48     elem->attributes = attributes;
  49     elem->namespaces = namespaces;
  50
  51     ElementList::iterator iter;
  52     for (iter = children.begin(); iter != children.end() ; iter++)
  53         {
  54         elem->addChild((*iter)->clone());
  55         }
  56     return elem;
  57 }
  58
  59
  60 void Element::findElementsRecursive(std::vector<Element *>&res, const DOMString &name)
  61 {
  62     if (getName() == name)
  63         {
  64         res.push_back(this);
  65         }
  66     for (unsigned int i=0; i<children.size() ; i++)
  67         children[i]->findElementsRecursive(res, name);
  68 }
  69
  70 std::vector<Element *> Element::findElements(const DOMString &name)
  71 {
  72     std::vector<Element *> res;
  73     findElementsRecursive(res, name);
  74     return res;
  75 }
  76
  77 DOMString Element::getAttribute(const DOMString &name)
  78 {
  79     for (unsigned int i=0 ; i<attributes.size() ; i++)
  80         if (attributes[i].getName() ==name)
  81             return attributes[i].getValue();
  82     return "";
  83 }
  84
  85 DOMString Element::getTagAttribute(const DOMString &tagName, const DOMString &attrName)
  86 {
  87     ElementList elems = findElements(tagName);
  88     if (elems.size() <1)
  89         return "";
  90     DOMString res = elems[0]->getAttribute(attrName);
  91     return res;
  92 }
  93
  94 DOMString Element::getTagValue(const DOMString &tagName)
  95 {
  96     ElementList elems = findElements(tagName);
  97     if (elems.size() <1)
  98         return "";
  99     DOMString res = elems[0]->getValue();
 100     return res;
 101 }
 102
 103 void Element::addChild(Element *child)
 104 {
 105     if (!child)
 106         return;
 107     child->parent = this;
 108     children.push_back(child);
 109 }
 110
 111
 112 void Element::addAttribute(const DOMString &name, const DOMString &value)
 113 {
 114     Attribute attr(name, value);
 115     attributes.push_back(attr);
 116 }
 117
 118 void Element::addNamespace(const DOMString &prefix, const DOMString &namespaceURI)
 119 {
 120     Namespace ns(prefix, namespaceURI);
 121     namespaces.push_back(ns);
 122 }
 123
 124 void Element::writeIndentedRecursive(FILE *f, int indent)
 125 {
 126     int i;
 127     if (!f)
 128         return;
 129     //Opening tag, and attributes
 130     for (i=0;i<indent;i++)
 131         fputc(' ',f);
 132     fprintf(f,"<%s",name.c_str());
 133     for (unsigned int i=0 ; i<attributes.size() ; i++)
 134         {
 135         fprintf(f," %s=\"%s\"",
 136               attributes[i].getName().c_str(),
 137               attributes[i].getValue().c_str());
 138         }
 139     for (unsigned int i=0 ; i<namespaces.size() ; i++)
 140         {
 141         fprintf(f," xmlns:%s=\"%s\"",
 142               namespaces[i].getPrefix().c_str(),
 143               namespaces[i].getNamespaceURI().c_str());
 144         }
 145     fprintf(f,">\n");
 146
 147     //Between the tags
 148     if (value.size() > 0)
 149         {
 150         for (int i=0;i<indent;i++)
 151             fputc(' ', f);
 152         fprintf(f," %s\n", value.c_str());
 153         }
 154
 155     for (unsigned int i=0 ; i<children.size() ; i++)
 156         children[i]->writeIndentedRecursive(f, indent+2);
 157
 158     //Closing tag
 159     for (int i=0; i<indent; i++)
 160         fputc(' ',f);
 161     fprintf(f,"</%s>\n", name.c_str());
 162 }
 163
 164 void Element::writeIndented(FILE *f)
 165 {
 166     writeIndentedRecursive(f, 0);
 167 }
 168
 169 void Element::print()
 170 {
 171     writeIndented(stdout);
 172 }
 173
 174
 175 //########################################################################
 176 //# P A R S E R
 177 //########################################################################
 178
 179
 180
 181 typedef struct
 182     {
 183     char *escaped;
 184     char value;
 185     } EntityEntry;
 186
 187 static EntityEntry entities[] =
 188 {
 189     { "&amp;" , '&'  },
 190     { "&lt;"  , '<'  },
 191     { "&gt;"  , '>'  },
 192     { "&apos;", '\'' },
 193     { "&quot;", '"'  },
 194     { NULL    , '\0' }
 195 };
 196
 197
 198
 199 /**
 200  *  Removes whitespace from beginning and end of a string
 201  */
 202 DOMString Parser::trim(const DOMString &s)
 203 {
 204     if (s.size() < 1)
 205         return s;
 206
 207     //Find first non-ws char
 208     unsigned int begin = 0;
 209     for ( ; begin < s.size() ; begin++)
 210         {
 211         if (!isspace(s[begin]))
 212             break;
 213         }
 214
 215     //Find first non-ws char, going in reverse
 216     unsigned int end = s.size() - 1;
 217     for ( ; end > begin ; end--)
 218         {
 219         if (!isspace(s[end]))
 220             break;
 221         }
 222     //trace("begin:%d  end:%d", begin, end);
 223
 224     DOMString res = s.substr(begin, end-begin+1);
 225     return res;
 226 }
 227
 228 void Parser::getLineAndColumn(long pos, long *lineNr, long *colNr)
 229 {
 230     long line = 1;
 231     long col  = 1;
 232     for (long i=0 ; i<pos ; i++)
 233         {
 234         XMLCh ch = parsebuf[i];
 235         if (ch == '\n' || ch == '\r')
 236             {
 237             col = 0;
 238             line ++;
 239             }
 240         else
 241             col++;
 242         }
 243     *lineNr = line;
 244     *colNr  = col;
 245
 246 }
 247
 248
 249 void Parser::error(char const *fmt, ...)
 250 {
 251     long lineNr;
 252     long colNr;
 253     getLineAndColumn(currentPosition, &lineNr, &colNr);
 254     va_list args;
 255     fprintf(stderr, "xml error at line %ld, column %ld:", lineNr, colNr);
 256     va_start(args,fmt);
 257     vfprintf(stderr,fmt,args);
 258     va_end(args) ;
 259     fprintf(stderr, "\n");
 260 }
 261
 262
 263
 264 int Parser::peek(long pos)
 265 {
 266     if (pos >= parselen)
 267         return -1;
 268     currentPosition = pos;
 269     int ch = parsebuf[pos];
 270     //printf("ch:%c\n", ch);
 271     return ch;
 272 }
 273
 274
 275
 276 DOMString Parser::encode(const DOMString &str)
 277 {
 278     DOMString ret;
 279     for (unsigned int i=0 ; i<str.size() ; i++)
 280         {
 281         XMLCh ch = (XMLCh)str[i];
 282         if (ch == '&')
 283             ret.append("&amp;");
 284         else if (ch == '<')
 285             ret.append("&lt;");
 286         else if (ch == '>')
 287             ret.append("&gt;");
 288         else if (ch == '\'')
 289             ret.append("&apos;");
 290         else if (ch == '"')
 291             ret.append("&quot;");
 292         else
 293             ret.push_back(ch);
 294
 295         }
 296     return ret;
 297 }
 298
 299
 300 int Parser::match(long p0, const char *text)
 301 {
 302     int p = p0;
 303     while (*text)
 304         {
 305         if (peek(p) != *text)
 306             return p0;
 307         p++; text++;
 308         }
 309     return p;
 310 }
 311
 312
 313
 314 int Parser::skipwhite(long p)
 315 {
 316
 317     while (p<parselen)
 318         {
 319         int p2 = match(p, "<!--");
 320         if (p2 > p)
 321             {
 322             p = p2;
 323             while (p<parselen)
 324               {
 325               p2 = match(p, "-->");
 326               if (p2 > p)
 327                   {
 328                   p = p2;
 329                   break;
 330                   }
 331               p++;
 332               }
 333           }
 334       XMLCh b = peek(p);
 335       if (!isspace(b))
 336           break;
 337       p++;
 338       }
 339   return p;
 340 }
 341
 342 /* modify this to allow all chars for an element or attribute name*/
 343 int Parser::getWord(int p0, DOMString &buf)
 344 {
 345     int p = p0;
 346     while (p<parselen)
 347         {
 348         XMLCh b = peek(p);
 349         if (b<=' ' || b=='/' || b=='>' || b=='=')
 350             break;
 351         buf.push_back(b);
 352         p++;
 353         }
 354     return p;
 355 }
 356
 357 int Parser::getQuoted(int p0, DOMString &buf, int do_i_parse)
 358 {
 359
 360     int p = p0;
 361     if (peek(p) != '"' && peek(p) != '\'')
 362         return p0;
 363     p++;
 364
 365     while ( p<parselen )
 366         {
 367         XMLCh b = peek(p);
 368         if (b=='"' || b=='\'')
 369             break;
 370         if (b=='&' && do_i_parse)
 371             {
 372             bool found = false;
 373             for (EntityEntry *ee = entities ; ee->value ; ee++)
 374                 {
 375                 int p2 = match(p, ee->escaped);
 376                 if (p2>p)
 377                     {
 378                     buf.push_back(ee->value);
 379                     p = p2;
 380                     found = true;
 381                     break;
 382                     }
 383                 }
 384             if (!found)
 385                 {
 386                 error("unterminated entity");
 387                 return false;
 388                 }
 389             }
 390         else
 391             {
 392             buf.push_back(b);
 393             p++;
 394             }
 395         }
 396     return p;
 397 }
 398
 399 int Parser::parseVersion(int p0)
 400 {
 401     //printf("### parseVersion: %d\n", p0);
 402
 403     int p = p0;
 404
 405     p = skipwhite(p0);
 406
 407     if (peek(p) != '<')
 408         return p0;
 409
 410     p++;
 411     if (p>=parselen || peek(p)!='?')
 412         return p0;
 413
 414     p++;
 415
 416     DOMString buf;
 417
 418     while (p<parselen)
 419         {
 420         XMLCh ch = peek(p);
 421         if (ch=='?')
 422             {
 423             p++;
 424             break;
 425             }
 426         buf.push_back(ch);
 427         p++;
 428         }
 429
 430     if (peek(p) != '>')
 431         return p0;
 432     p++;
 433
 434     //printf("Got version:%s\n",buf.c_str());
 435     return p;
 436 }
 437
 438 int Parser::parseDoctype(int p0)
 439 {
 440     //printf("### parseDoctype: %d\n", p0);
 441
 442     int p = p0;
 443     p = skipwhite(p);
 444
 445     if (p>=parselen || peek(p)!='<')
 446         return p0;
 447
 448     p++;
 449
 450     if (peek(p)!='!' || peek(p+1)=='-')
 451         return p0;
 452     p++;
 453
 454     DOMString buf;
 455     while (p<parselen)
 456         {
 457         XMLCh ch = peek(p);
 458         if (ch=='>')
 459             {
 460             p++;
 461             break;
 462             }
 463         buf.push_back(ch);
 464         p++;
 465         }
 466
 467     //printf("Got doctype:%s\n",buf.c_str());
 468     return p;
 469 }
 470
 471 int Parser::parseElement(int p0, Element *par,int depth)
 472 {
 473
 474     int p = p0;
 475
 476     int p2 = p;
 477
 478     p = skipwhite(p);
 479
 480     //## Get open tag
 481     XMLCh ch = peek(p);
 482     if (ch!='<')
 483         return p0;
 484
 485     p++;
 486
 487     DOMString openTagName;
 488     p = skipwhite(p);
 489     p = getWord(p, openTagName);
 490     //printf("####tag :%s\n", openTagName.c_str());
 491     p = skipwhite(p);
 492
 493     //Add element to tree
 494     Element *n = new Element(openTagName);
 495     n->parent = par;
 496     par->addChild(n);
 497
 498     // Get attributes
 499     if (peek(p) != '>')
 500         {
 501         while (p<parselen)
 502             {
 503             p = skipwhite(p);
 504             ch = peek(p);
 505             //printf("ch:%c\n",ch);
 506             if (ch=='>')
 507                 break;
 508             else if (ch=='/' && p<parselen+1)
 509                 {
 510                 p++;
 511                 p = skipwhite(p);
 512                 ch = peek(p);
 513                 if (ch=='>')
 514                     {
 515                     p++;
 516                     //printf("quick close\n");
 517                     return p;
 518                     }
 519                 }
 520             DOMString attrName;
 521             p2 = getWord(p, attrName);
 522             if (p2==p)
 523                 break;
 524             //printf("name:%s",buf);
 525             p=p2;
 526             p = skipwhite(p);
 527             ch = peek(p);
 528             //printf("ch:%c\n",ch);
 529             if (ch!='=')
 530                 break;
 531             p++;
 532             p = skipwhite(p);
 533             // ch = parsebuf[p];
 534             // printf("ch:%c\n",ch);
 535             DOMString attrVal;
 536             p2 = getQuoted(p, attrVal, true);
 537             p=p2+1;
 538             //printf("name:'%s'   value:'%s'\n",attrName.c_str(),attrVal.c_str());
 539             char *namestr = (char *)attrName.c_str();
 540             if (strncmp(namestr, "xmlns:", 6)==0)
 541                 n->addNamespace(attrName, attrVal);
 542             else
 543                 n->addAttribute(attrName, attrVal);
 544             }
 545         }
 546
 547     bool cdata = false;
 548
 549     p++;
 550     // ### Get intervening data ### */
 551     DOMString data;
 552     while (p<parselen)
 553         {
 554         //# COMMENT
 555         p2 = match(p, "<!--");
 556         if (!cdata && p2>p)
 557             {
 558             p = p2;
 559             while (p<parselen)
 560                 {
 561                 p2 = match(p, "-->");
 562                 if (p2 > p)
 563                     {
 564                     p = p2;
 565                     break;
 566                     }
 567                 p++;
 568                 }
 569             }
 570
 571         ch = peek(p);
 572         //# END TAG
 573         if (ch=='<' && !cdata && peek(p+1)=='/')
 574             {
 575             break;
 576             }
 577         //# CDATA
 578         p2 = match(p, "<![CDATA[");
 579         if (p2 > p)
 580             {
 581             cdata = true;
 582             p = p2;
 583             continue;
 584             }
 585
 586         //# CHILD ELEMENT
 587         if (ch == '<')
 588             {
 589             p2 = parseElement(p, n, depth+1);
 590             if (p2 == p)
 591                 {
 592                 /*
 593                 printf("problem on element:%s.  p2:%d p:%d\n",
 594                       openTagName.c_str(), p2, p);
 595                 */
 596                 return p0;
 597                 }
 598             p = p2;
 599             continue;
 600             }
 601         //# ENTITY
 602         if (ch=='&' && !cdata)
 603             {
 604             bool found = false;
 605             for (EntityEntry *ee = entities ; ee->value ; ee++)
 606                 {
 607                 int p2 = match(p, ee->escaped);
 608                 if (p2>p)
 609                     {
 610                     data.push_back(ee->value);
 611                     p = p2;
 612                     found = true;
 613                     break;
 614                     }
 615                 }
 616             if (!found)
 617                 {
 618                 error("unterminated entity");
 619                 return -1;
 620                 }
 621             continue;
 622             }
 623
 624         //# NONE OF THE ABOVE
 625         data.push_back(ch);
 626         p++;
 627         }/*while*/
 628
 629
 630     n->value = data;
 631     //printf("%d : data:%s\n",p,data.c_str());
 632
 633     //## Get close tag
 634     p = skipwhite(p);
 635     ch = peek(p);
 636     if (ch != '<')
 637         {
 638         error("no < for end tag\n");
 639         return p0;
 640         }
 641     p++;
 642     ch = peek(p);
 643     if (ch != '/')
 644         {
 645         error("no / on end tag");
 646         return p0;
 647         }
 648     p++;
 649     ch = peek(p);
 650     p = skipwhite(p);
 651     DOMString closeTagName;
 652     p = getWord(p, closeTagName);
 653     if (openTagName != closeTagName)
 654         {
 655         error("Mismatched closing tag.  Expected </%s>. Got '%s'.",
 656                 openTagName.c_str(), closeTagName.c_str());
 657         return p0;
 658         }
 659     p = skipwhite(p);
 660     if (peek(p) != '>')
 661         {
 662         error("no > on end tag for '%s'", closeTagName.c_str());
 663         return p0;
 664         }
 665     p++;
 666     // printf("close element:%s\n",closeTagName.c_str());
 667     p = skipwhite(p);
 668     return p;
 669 }
 670
 671
 672
 673
 674 Element *Parser::parse(XMLCh *buf,int pos,int len)
 675 {
 676     parselen = len;
 677     parsebuf = buf;
 678     Element *rootNode = new Element("root");
 679     pos = parseVersion(pos);
 680     pos = parseDoctype(pos);
 681     pos = parseElement(pos, rootNode, 0);
 682     return rootNode;
 683 }
 684
 685
 686 Element *Parser::parse(const char *buf, int pos, int len)
 687 {
 688     XMLCh *charbuf = new XMLCh[len + 1];
 689     long i = 0;
 690     for ( ; i < len ; i++)
 691         charbuf[i] = (XMLCh)buf[i];
 692     charbuf[i] = '\0';
 693
 694     Element *n = parse(charbuf, pos, len);
 695     delete[] charbuf;
 696     return n;
 697 }
 698
 699 Element *Parser::parse(const DOMString &buf)
 700 {
 701     long len = (long)buf.size();
 702     XMLCh *charbuf = new XMLCh[len + 1];
 703     long i = 0;
 704     for ( ; i < len ; i++)
 705         charbuf[i] = (XMLCh)buf[i];
 706     charbuf[i] = '\0';
 707
 708     Element *n = parse(charbuf, 0, len);
 709     delete[] charbuf;
 710     return n;
 711 }
 712
 713 Element *Parser::parseFile(const DOMString &fileName)
 714 {
 715
 716     //##### LOAD INTO A CHAR BUF, THEN CONVERT TO XMLCh
 717     FILE *f = fopen(fileName.c_str(), "rb");
 718     if (!f)
 719         return NULL;
 720
 721     struct stat   statBuf;
 722     if (fstat(fileno(f),&statBuf)<0)
 723         {
 724         fclose(f);
 725         return NULL;
 726         }
 727     long filelen = statBuf.st_size;
 728
 729     //printf("length:%d\n",filelen);
 730     XMLCh *charbuf = new XMLCh[filelen + 1];
 731     for (XMLCh *p=charbuf ; !feof(f) ; p++)
 732         {
 733         *p = (XMLCh)fgetc(f);
 734         }
 735     fclose(f);
 736     charbuf[filelen] = '\0';
 737
 738
 739     /*
 740     printf("nrbytes:%d\n",wc_count);
 741     printf("buf:%ls\n======\n",charbuf);
 742     */
 743     Element *n = parse(charbuf, 0, filelen);
 744     delete [] charbuf;
 745     return n;
 746 }
 747
 748
 749
 750
 751
 752
 753
 754 }//namespace Pedro
 755
 756 #if 0
 757 //########################################################################
 758 //#  T E S T
 759 //########################################################################
 760
 761 bool doTest(char *fileName)
 762 {
 763     Pedro::Parser parser;
 764
 765     Pedro::Element *elem = parser.parseFile(fileName);
 766
 767     if (!elem)
 768         {
 769         printf("Parsing failed\n");
 770         return false;
 771         }
 772
 773     elem->print();
 774
 775     delete elem;
 776
 777     return true;
 778 }
 779
 780
 781
 782 int main(int argc, char **argv)
 783 {
 784     if (argc != 2)
 785         {
 786         printf("usage: %s <xmlfile>\n", argv[0]);
 787         return 1;
 788         }
 789
 790     if (!doTest(argv[1]))
 791         return 1;
 792
 793     return 0;
 794 }
 795
 796 #endif
 797
 798 //########################################################################
 799 //#  E N D    O F    F I L E
 800 //########################################################################
 801
 802