src/dom/uri.cpp

   1 /**
   2  * Phoebe DOM Implementation.
   3  *
   4  * This is a C++ approximation of the W3C DOM model, which follows
   5  * fairly closely the specifications in the various .idl files, copies of
   6  * which are provided for reference.  Most important is this one:
   7  *
   8  * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
   9  *
  10  * Authors:
  11  *   Bob Jamison
  12  *
  13  * Copyright (C) 2005-2008 Bob Jamison
  14  *
  15  *  This library is free software; you can redistribute it and/or
  16  *  modify it under the terms of the GNU Lesser General Public
  17  *  License as published by the Free Software Foundation; either
  18  *  version 2.1 of the License, or (at your option) any later version.
  19  *
  20  *  This library is distributed in the hope that it will be useful,
  21  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  22  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23  *  Lesser General Public License for more details.
  24  *
  25  *  You should have received a copy of the GNU Lesser General Public
  26  *  License along with this library; if not, write to the Free Software
  27  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  28  */
  29
  30
  31
  32
  33 #include "uri.h"
  34 #include "ucd.h"
  35
  36 #include <stdio.h>
  37 #include <stdarg.h>
  38 #include <vector>
  39
  40
  41 namespace org
  42 {
  43 namespace w3c
  44 {
  45 namespace dom
  46 {
  47
  48
  49 typedef struct
  50 {
  51     int  ival;
  52     char const *sval;
  53     int  port;
  54 } LookupEntry;
  55
  56 static LookupEntry schemes[] =
  57 {
  58     { URI::SCHEME_DATA,   "data:",    0 },
  59     { URI::SCHEME_HTTP,   "http:",   80 },
  60     { URI::SCHEME_HTTPS,  "https:", 443 },
  61     { URI::SCHEME_FTP,    "ftp",     12 },
  62     { URI::SCHEME_FILE,   "file:",    0 },
  63     { URI::SCHEME_LDAP,   "ldap:",  123 },
  64     { URI::SCHEME_MAILTO, "mailto:", 25 },
  65     { URI::SCHEME_NEWS,   "news:",  117 },
  66     { URI::SCHEME_TELNET, "telnet:", 23 },
  67     { 0,                  NULL,       0 }
  68 };
  69
  70
  71
  72 //#########################################################################
  73 //# C O N S T R U C T O R
  74 //#########################################################################
  75
  76 /**
  77  *
  78  */
  79 URI::URI()
  80 {
  81     init();
  82 }
  83
  84 /**
  85  *
  86  */
  87 URI::URI(const DOMString &str)
  88 {
  89     init();
  90     parse(str);
  91 }
  92
  93
  94 /**
  95  *
  96  */
  97 URI::URI(const char *str)
  98 {
  99     init();
 100     DOMString domStr = str;
 101     parse(domStr);
 102 }
 103
 104
 105 /**
 106  *
 107  */
 108 URI::URI(const URI &other)
 109 {
 110     init();
 111     assign(other);
 112 }
 113
 114
 115 /**
 116  *
 117  */
 118 URI &URI::operator=(const URI &other)
 119 {
 120     init();
 121     assign(other);
 122     return *this;
 123 }
 124
 125
 126 /**
 127  *
 128  */
 129 URI::~URI()
 130 {
 131 }
 132
 133
 134
 135
 136
 137 /**
 138  *
 139  */
 140 void URI::init()
 141 {
 142     parsebuf  = NULL;
 143     parselen  = 0;
 144     scheme    = SCHEME_NONE;
 145     schemeStr.clear();
 146     port      = 0;
 147     authority.clear();
 148     path.clear();
 149     absolute  = false;
 150     opaque    = false;
 151     query.clear();
 152     fragment.clear();
 153 }
 154
 155
 156 /**
 157  *
 158  */
 159 void URI::assign(const URI &other)
 160 {
 161     scheme    = other.scheme;
 162     schemeStr = other.schemeStr;
 163     authority = other.authority;
 164     port      = other.port;
 165     path      = other.path;
 166     absolute  = other.absolute;
 167     opaque    = other.opaque;
 168     query     = other.query;
 169     fragment  = other.fragment;
 170 }
 171
 172
 173 //#########################################################################
 174 //#A T T R I B U T E S
 175 //#########################################################################
 176 static const char *hexChars = "0123456789abcdef";
 177
 178 static DOMString toStr(const std::vector<int> &arr)
 179 {
 180     DOMString buf;
 181     std::vector<int>::const_iterator iter;
 182     for (iter=arr.begin() ; iter!=arr.end() ; iter++)
 183         {
 184         int ch = *iter;
 185         if (isprint(ch))
 186             buf.push_back((XMLCh)ch);
 187         else
 188             {
 189             buf.push_back('%');
 190             int hi = ((ch>>4) & 0xf);
 191             buf.push_back(hexChars[hi]);
 192             int lo = ((ch   ) & 0xf);
 193             buf.push_back(hexChars[lo]);
 194             }
 195         }
 196     return buf;
 197 }
 198
 199
 200 DOMString URI::toString() const
 201 {
 202     DOMString str = schemeStr;
 203     if (authority.size() > 0)
 204         {
 205         str.append("//");
 206         str.append(toStr(authority));
 207         }
 208     str.append(toStr(path));
 209     if (query.size() > 0)
 210         {
 211         str.append("?");
 212         str.append(toStr(query));
 213         }
 214     if (fragment.size() > 0)
 215         {
 216         str.append("#");
 217         str.append(toStr(fragment));
 218         }
 219     return str;
 220 }
 221
 222
 223 int URI::getScheme() const
 224 {
 225     return scheme;
 226 }
 227
 228 DOMString URI::getSchemeStr() const
 229 {
 230     return schemeStr;
 231 }
 232
 233
 234 DOMString URI::getAuthority() const
 235 {
 236     DOMString ret = toStr(authority);
 237     if (portSpecified && port>=0)
 238         {
 239         char buf[7];
 240         snprintf(buf, 6, ":%6d", port);
 241         ret.append(buf);
 242         }
 243     return ret;
 244 }
 245
 246 DOMString URI::getHost() const
 247 {
 248     DOMString str = toStr(authority);
 249     return str;
 250 }
 251
 252 int URI::getPort() const
 253 {
 254     return port;
 255 }
 256
 257
 258 DOMString URI::getPath() const
 259 {
 260     DOMString str = toStr(path);
 261     return str;
 262 }
 263
 264 DOMString URI::getNativePath() const
 265 {
 266     DOMString pathStr = toStr(path);
 267     DOMString npath;
 268 #ifdef __WIN32__
 269     unsigned int firstChar = 0;
 270     if (pathStr.size() >= 3)
 271         {
 272         if (pathStr[0] == '/' &&
 273             uni_is_letter(pathStr[1]) &&
 274             pathStr[2] == ':')
 275             firstChar++;
 276          }
 277     for (unsigned int i=firstChar ; i<pathStr.size() ; i++)
 278         {
 279         XMLCh ch = (XMLCh) pathStr[i];
 280         if (ch == '/')
 281             npath.push_back((XMLCh)'\\');
 282         else
 283             npath.push_back(ch);
 284         }
 285 #else
 286     npath = pathStr;
 287 #endif
 288     return npath;
 289 }
 290
 291
 292 bool URI::isAbsolute() const
 293 {
 294     return absolute;
 295 }
 296
 297 bool URI::isOpaque() const
 298 {
 299     return opaque;
 300 }
 301
 302
 303 DOMString URI::getQuery() const
 304 {
 305     DOMString str = toStr(query);
 306     return str;
 307 }
 308
 309
 310 DOMString URI::getFragment() const
 311 {
 312     DOMString str = toStr(fragment);
 313     return str;
 314 }
 315
 316
 317
 318
 319 static int find(const std::vector<int> &str, int ch, int startpos)
 320 {
 321     for (unsigned int i = startpos ; i < str.size() ; i++)
 322         {
 323         if (ch == str[i])
 324             return i;
 325         }
 326     return -1;
 327 }
 328
 329
 330 static int findLast(const std::vector<int> &str, int ch)
 331 {
 332     /**
 333      * Fixed.  Originally I used an unsigned int for str.size(),
 334      * which was dumb, since i>=0 would always be true.
 335      */
 336     for (int i = ((int)str.size())-1 ; i>=0 ; i--)
 337         {
 338         if (ch == str[i])
 339             return i;
 340         }
 341     return -1;
 342 }
 343
 344
 345 static bool sequ(const std::vector<int> &str, const char *key)
 346 {
 347     char *c = (char *)key;
 348     for (unsigned int i=0 ; i<str.size() ; i++)
 349         {
 350         if (! (*c))
 351             return false;
 352         if (*c != str[i])
 353             return false;
 354         }
 355     return true;
 356 }
 357
 358
 359 static std::vector<int> substr(const std::vector<int> &str,
 360                       int startpos, int len)
 361 {
 362     std::vector<int> buf;
 363     unsigned int pos = startpos;
 364     for (int i=0 ; i<len ; i++)
 365         {
 366         if (pos >= str.size())
 367             break;
 368         buf.push_back(str[pos++]);
 369         }
 370     return buf;
 371 }
 372
 373
 374 URI URI::resolve(const URI &other) const
 375 {
 376     //### According to w3c, this is handled in 3 cases
 377
 378     //## 1
 379     if (opaque || other.isAbsolute())
 380         return other;
 381
 382     //## 2
 383     if (other.fragment.size()  >  0 &&
 384         other.path.size()      == 0 &&
 385         other.scheme           == SCHEME_NONE &&
 386         other.authority.size() == 0 &&
 387         other.query.size()     == 0 )
 388         {
 389         URI fragUri = *this;
 390         fragUri.fragment = other.fragment;
 391         return fragUri;
 392         }
 393
 394     //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
 395     URI newUri;
 396     //# 3.1
 397     newUri.scheme    = scheme;
 398     newUri.schemeStr = schemeStr;
 399     newUri.query     = other.query;
 400     newUri.fragment  = other.fragment;
 401     if (other.authority.size() > 0)
 402         {
 403         //# 3.2
 404         if (absolute || other.absolute)
 405             newUri.absolute = true;
 406         newUri.authority = other.authority;
 407         newUri.port      = other.port;//part of authority
 408         newUri.path      = other.path;
 409         }
 410     else
 411         {
 412         //# 3.3
 413         if (other.absolute)
 414             {
 415             newUri.absolute = true;
 416             newUri.path     = other.path;
 417             }
 418         else
 419             {
 420             int pos = findLast(path, '/');
 421             if (pos >= 0)
 422                 {
 423                 newUri.path.clear();
 424                 //# append my path up to and including the '/'
 425                 for (int i = 0; i<=pos ; i++)
 426                        newUri.path.push_back(path[i]);
 427                 //# append other path
 428                 for (unsigned int i = 0; i<other.path.size() ; i++)
 429                        newUri.path.push_back(other.path[i]);
 430                 }
 431             else
 432                 newUri.path = other.path;
 433             }
 434         }
 435
 436     newUri.normalize();
 437
 438     return newUri;
 439 }
 440
 441
 442 /**
 443  *  This follows the Java URI algorithm:
 444  *   1. All "." segments are removed.
 445  *   2. If a ".." segment is preceded by a non-".." segment
 446  *          then both of these segments are removed. This step
 447  *          is repeated until it is no longer applicable.
 448  *   3. If the path is relative, and if its first segment
 449  *          contains a colon character (':'), then a "." segment
 450  *          is prepended. This prevents a relative URI with a path
 451  *          such as "a:b/c/d" from later being re-parsed as an
 452  *          opaque URI with a scheme of "a" and a scheme-specific
 453  *          part of "b/c/d". (Deviation from RFC 2396)
 454  */
 455 void URI::normalize()
 456 {
 457     std::vector< std::vector<int> > segments;
 458
 459     //## Collect segments
 460     if (path.size()<2)
 461         return;
 462     bool abs = false;
 463     int pos=0;
 464     int len = (int) path.size();
 465
 466     if (path[0]=='/')
 467         {
 468         abs = true;
 469         pos++;
 470         }
 471
 472     while (pos < len)
 473         {
 474         int pos2 = find(path, '/', pos);
 475         if (pos2 < 0)
 476             {
 477             std::vector<int> seg = substr(path, pos, path.size()-pos);
 478             //printf("last segment:%s\n", toStr(seg).c_str());
 479             segments.push_back(seg);
 480             break;
 481             }
 482         if (pos2>pos)
 483             {
 484             std::vector<int> seg = substr(path, pos, pos2-pos);
 485             //printf("segment:%s\n", toStr(seg).c_str());
 486             segments.push_back(seg);
 487             }
 488         pos = pos2;
 489         pos++;
 490         }
 491
 492     //## Clean up (normalize) segments
 493     bool edited = false;
 494     std::vector< std::vector<int> >::iterator iter;
 495     for (iter=segments.begin() ; iter!=segments.end() ; )
 496         {
 497         std::vector<int> s = *iter;
 498         if (sequ(s,"."))
 499             {
 500             iter = segments.erase(iter);
 501             edited = true;
 502             }
 503         else if (sequ(s, "..") && iter != segments.begin() &&
 504                  !sequ(*(iter-1), ".."))
 505             {
 506             iter--; //back up, then erase two entries
 507             iter = segments.erase(iter);
 508             iter = segments.erase(iter);
 509             edited = true;
 510             }
 511         else
 512             iter++;
 513         }
 514
 515     //## Rebuild path, if necessary
 516     if (edited)
 517         {
 518         path.clear();
 519         if (abs)
 520             {
 521             path.push_back('/');
 522             }
 523         std::vector< std::vector<int> >::iterator iter;
 524         for (iter=segments.begin() ; iter!=segments.end() ; iter++)
 525             {
 526             if (iter != segments.begin())
 527                 path.push_back('/');
 528             std::vector<int> seg = *iter;
 529             for (unsigned int i = 0; i<seg.size() ; i++)
 530                 path.push_back(seg[i]);
 531             }
 532         }
 533
 534 }
 535
 536
 537
 538 //#########################################################################
 539 //# M E S S A G E S
 540 //#########################################################################
 541
 542 void URI::error(const char *fmt, ...)
 543 {
 544     va_list args;
 545     fprintf(stderr, "URI error: ");
 546     va_start(args, fmt);
 547     vfprintf(stderr, fmt, args);
 548     va_end(args);
 549     fprintf(stderr, "\n");
 550 }
 551
 552 void URI::trace(const char *fmt, ...)
 553 {
 554     va_list args;
 555     fprintf(stdout, "URI: ");
 556     va_start(args, fmt);
 557     vfprintf(stdout, fmt, args);
 558     va_end(args);
 559     fprintf(stdout, "\n");
 560 }
 561
 562
 563
 564 //#########################################################################
 565 //# P A R S I N G
 566 //#########################################################################
 567
 568
 569
 570 int URI::peek(int p)
 571 {
 572     if (p<0 || p>=parselen)
 573         return -1;
 574     return parsebuf[p];
 575 }
 576
 577
 578
 579 int URI::match(int p0, char const *key)
 580 {
 581     int p = p0;
 582     while (p < parselen)
 583         {
 584         if (*key == '\0')
 585             return p;
 586         else if (*key != parsebuf[p])
 587             break;
 588         p++; key++;
 589         }
 590     return p0;
 591 }
 592
 593 //#########################################################################
 594 //#  Parsing is performed according to:
 595 //#  http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
 596 //#########################################################################
 597
 598 int URI::parseHex(int p0, int &result)
 599 {
 600     int p = p0;
 601     int val = 0;
 602
 603     //# Upper 4
 604     int ch = peek(p);
 605     if (ch >= '0' && ch <= '9')
 606         val += (ch - '0');
 607     else if (ch >= 'a' && ch <= 'f')
 608         val += (10 + ch - 'a');
 609     else if (ch >= 'A' && ch <= 'F')
 610         val += (10 + ch - 'A');
 611     else
 612         {
 613         error("parseHex : unexpected character : %c", ch);
 614         return -1;
 615         }
 616     p++;
 617     val <<= 4;
 618
 619     //# Lower 4
 620     ch = peek(p);
 621     if (ch >= '0' && ch <= '9')
 622         val += (ch - '0');
 623     else if (ch >= 'a' && ch <= 'f')
 624         val += (10 + ch - 'a');
 625     else if (ch >= 'A' && ch <= 'F')
 626         val += (10 + ch - 'A');
 627     else
 628         {
 629         error("parseHex : unexpected character : %c", ch);
 630         return -1;
 631         }
 632     p++;
 633     result = val;
 634     return p;
 635 }
 636
 637
 638
 639 int URI::parseEntity(int p0, int &result)
 640 {
 641     int p = p0;
 642     int ch = peek(p);
 643     if (ch != '&')
 644         return p0;
 645     p++;
 646     if (!match(p, "#x"))
 647         {
 648         error("parseEntity: expected '#x'");
 649         return -1;
 650         }
 651     p += 2;
 652     int val;
 653     p = parseHex(p, val);
 654     if (p<0)
 655         return -1;
 656     ch = peek(p);
 657     if (ch != ';')
 658         {
 659         error("parseEntity: expected ';'");
 660         return -1;
 661         }
 662     p++;
 663     result = val;
 664     return p;
 665 }
 666
 667 int URI::parseAsciiEntity(int p0, int &result)
 668 {
 669     int p = p0;
 670     int ch = peek(p);
 671     if (ch != '%')
 672         return p0;
 673     p++;
 674     int val;
 675     p = parseHex(p, val);
 676     if (p<0)
 677         return -1;
 678     result = val;
 679     return p;
 680 }
 681
 682
 683 int URI::parseScheme(int p0)
 684 {
 685     int p = p0;
 686     for (LookupEntry *entry = schemes; entry->sval ; entry++)
 687         {
 688         int p2 = match(p, entry->sval);
 689         if (p2 > p)
 690             {
 691             schemeStr = entry->sval;
 692             scheme    = entry->ival;
 693             port      = entry->port;
 694             p = p2;
 695             return p;
 696             }
 697         }
 698
 699     return p;
 700 }
 701
 702
 703 int URI::parseHierarchicalPart(int p0)
 704 {
 705     int p = p0;
 706     int ch;
 707
 708     //# Authority field (host and port, for example)
 709     int p2 = match(p, "//");
 710     if (p2 > p)
 711         {
 712         p = p2;
 713         portSpecified = false;
 714         DOMString portStr;
 715         while (p < parselen)
 716             {
 717             ch = peek(p);
 718             if (ch == '/')
 719                 break;
 720             else if (ch == '&') //IRI entity
 721                 {
 722                 int val;
 723                 p2 = parseEntity(p, val);
 724                 if (p2<p)
 725                     {
 726                     return -1;
 727                     }
 728                 p = p2;
 729                 authority.push_back((XMLCh)val);
 730                 }
 731             else if (ch == '%') //ascii hex excape
 732                 {
 733                 int val;
 734                 p2 = parseAsciiEntity(p, val);
 735                 if (p2<p)
 736                     {
 737                     return -1;
 738                     }
 739                 p = p2;
 740                 authority.push_back((XMLCh)val);
 741                 }
 742             else if (ch == ':')
 743                 {
 744                 portSpecified = true;
 745                 p++;
 746                 }
 747             else if (portSpecified)
 748                 {
 749                 portStr.push_back((XMLCh)ch);
 750                 p++;
 751                 }
 752             else
 753                 {
 754                 authority.push_back((XMLCh)ch);
 755                 p++;
 756                 }
 757             }
 758         if (portStr.size() > 0)
 759             {
 760             char *pstr = (char *)portStr.c_str();
 761             char *endStr;
 762             long val = strtol(pstr, &endStr, 10);
 763             if (endStr > pstr) //successful parse?
 764                 port = val;
 765             }
 766         }
 767
 768     //# Are we absolute?
 769     ch = peek(p);
 770     if (uni_is_letter(ch) && peek(p+1)==':')
 771         {
 772         absolute = true;
 773         path.push_back((XMLCh)'/');
 774         }
 775     else if (ch == '/')
 776         {
 777         absolute = true;
 778         if (p>p0) //in other words, if '/' is not the first char
 779             opaque = true;
 780         path.push_back((XMLCh)ch);
 781         p++;
 782         }
 783
 784     while (p < parselen)
 785         {
 786         ch = peek(p);
 787         if (ch == '?' || ch == '#')
 788             break;
 789         else if (ch == '&') //IRI entity
 790             {
 791             int val;
 792             p2 = parseEntity(p, val);
 793             if (p2<p)
 794                 {
 795                 return -1;
 796                 }
 797             p = p2;
 798             path.push_back((XMLCh)val);
 799             }
 800         else if (ch == '%') //ascii hex excape
 801             {
 802             int val;
 803             p2 = parseAsciiEntity(p, val);
 804             if (p2<p)
 805                 {
 806                 return -1;
 807                 }
 808             p = p2;
 809             path.push_back((XMLCh)val);
 810             }
 811         else
 812             {
 813             path.push_back((XMLCh)ch);
 814             p++;
 815             }
 816         }
 817     //trace("path:%s", toStr(path).c_str());
 818     return p;
 819 }
 820
 821 int URI::parseQuery(int p0)
 822 {
 823     int p = p0;
 824     int ch = peek(p);
 825     if (ch != '?')
 826         return p0;
 827
 828     p++;
 829     while (p < parselen)
 830         {
 831         ch = peek(p);
 832         if (ch == '#')
 833             break;
 834         query.push_back((XMLCh)ch);
 835         p++;
 836         }
 837
 838
 839     return p;
 840 }
 841
 842 int URI::parseFragment(int p0)
 843 {
 844
 845     int p = p0;
 846     int ch = peek(p);
 847     if (ch != '#')
 848         return p0;
 849
 850     p++;
 851     while (p < parselen)
 852         {
 853         ch = peek(p);
 854         if (ch == '?')
 855             break;
 856         fragment.push_back(ch);
 857         p++;
 858         }
 859
 860
 861     return p;
 862 }
 863
 864
 865 int URI::parse(int p0)
 866 {
 867
 868     int p = p0;
 869
 870     int p2 = parseScheme(p);
 871     if (p2 < 0)
 872         {
 873         error("Scheme");
 874         return -1;
 875         }
 876     p = p2;
 877
 878
 879     p2 = parseHierarchicalPart(p);
 880     if (p2 < 0)
 881         {
 882         error("Hierarchical part");
 883         return -1;
 884         }
 885     p = p2;
 886
 887     p2 = parseQuery(p);
 888     if (p2 < 0)
 889         {
 890         error("Query");
 891         return -1;
 892         }
 893     p = p2;
 894
 895
 896     p2 = parseFragment(p);
 897     if (p2 < 0)
 898         {
 899         error("Fragment");
 900         return -1;
 901         }
 902     p = p2;
 903
 904     return p;
 905
 906 }
 907
 908
 909
 910 bool URI::parse(const DOMString &str)
 911 {
 912
 913     parselen = str.size();
 914     parsebuf = new int[str.size()];
 915     if (!parsebuf)
 916         {
 917         error("parse : could not allocate parsebuf");
 918         return false;
 919         }
 920
 921     DOMString::const_iterator iter;
 922     unsigned int i=0;
 923     for (iter= str.begin() ; iter!=str.end() ; iter++)
 924         {
 925         int ch = *iter;
 926         if (ch == '\\')
 927             parsebuf[i++] = '/';
 928         else
 929             parsebuf[i++] = ch;
 930         }
 931
 932
 933     int p = parse(0);
 934     normalize();
 935
 936     delete[] parsebuf;
 937
 938     if (p < 0)
 939         {
 940         error("Syntax error");
 941         return false;
 942         }
 943
 944     //printf("uri:%s\n", toString().c_str());
 945     //printf("parse:%s\n", toStr(path).c_str());
 946
 947     return true;
 948
 949 }
 950
 951
 952
 953
 954
 955 }  //namespace dom
 956 }  //namespace w3c
 957 }  //namespace org
 958 //#########################################################################
 959 //# E N D    O F    F I L E
 960 //#########################################################################
 961
 962
 963