src/dom/uri.cpp

   1 /**
   2  * Phoebe DOM Implementation.
   3  *
   4  * This is a C++ approximation of the W3C DOM model, which follows
   5  * fairly closely the specifications in the various .idl files, copies of
   6  * which are provided for reference.  Most important is this one:
   7  *
   8  * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
   9  *
  10  * Authors:
  11  *   Bob Jamison
  12  *
  13  * Copyright (C) 2005-2007 Bob Jamison
  14  *
  15  *  This library is free software; you can redistribute it and/or
  16  *  modify it under the terms of the GNU Lesser General Public
  17  *  License as published by the Free Software Foundation; either
  18  *  version 2.1 of the License, or (at your option) any later version.
  19  *
  20  *  This library is distributed in the hope that it will be useful,
  21  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  22  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23  *  Lesser General Public License for more details.
  24  *
  25  *  You should have received a copy of the GNU Lesser General Public
  26  *  License along with this library; if not, write to the Free Software
  27  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  28  */
  29
  30
  31
  32
  33 #include "uri.h"
  34 #include "charclass.h"
  35
  36 #include <stdio.h>
  37 #include <stdarg.h>
  38 #include <vector>
  39
  40
  41 namespace org
  42 {
  43 namespace w3c
  44 {
  45 namespace dom
  46 {
  47
  48
  49 typedef struct
  50 {
  51     int  ival;
  52     char const *sval;
  53     int  port;
  54 } LookupEntry;
  55
  56 static LookupEntry schemes[] =
  57 {
  58     { URI::SCHEME_DATA,   "data:",    0 },
  59     { URI::SCHEME_HTTP,   "http:",   80 },
  60     { URI::SCHEME_HTTPS,  "https:", 443 },
  61     { URI::SCHEME_FTP,    "ftp",     12 },
  62     { URI::SCHEME_FILE,   "file:",    0 },
  63     { URI::SCHEME_LDAP,   "ldap:",  123 },
  64     { URI::SCHEME_MAILTO, "mailto:", 25 },
  65     { URI::SCHEME_NEWS,   "news:",  117 },
  66     { URI::SCHEME_TELNET, "telnet:", 23 },
  67     { 0,                  NULL,       0 }
  68 };
  69
  70
  71
  72 //#########################################################################
  73 //# C O N S T R U C T O R
  74 //#########################################################################
  75
  76 /**
  77  *
  78  */
  79 URI::URI()
  80 {
  81     init();
  82 }
  83
  84 /**
  85  *
  86  */
  87 URI::URI(const DOMString &str)
  88 {
  89     init();
  90     parse(str);
  91 }
  92
  93
  94 /**
  95  *
  96  */
  97 URI::URI(const char *str)
  98 {
  99     init();
 100     DOMString domStr = str;
 101     parse(domStr);
 102 }
 103
 104
 105 /**
 106  *
 107  */
 108 URI::URI(const URI &other)
 109 {
 110     init();
 111     assign(other);
 112 }
 113
 114
 115 /**
 116  *
 117  */
 118 URI &URI::operator=(const URI &other)
 119 {
 120     init();
 121     assign(other);
 122     return *this;
 123 }
 124
 125
 126 /**
 127  *
 128  */
 129 URI::~URI()
 130 {
 131 }
 132
 133
 134
 135
 136
 137 /**
 138  *
 139  */
 140 void URI::init()
 141 {
 142     parsebuf  = NULL;
 143     parselen  = 0;
 144     scheme    = SCHEME_NONE;
 145     schemeStr.clear();
 146     port      = 0;
 147     authority.clear();
 148     path.clear();
 149     absolute  = false;
 150     opaque    = false;
 151     query.clear();
 152     fragment.clear();
 153 }
 154
 155
 156 /**
 157  *
 158  */
 159 void URI::assign(const URI &other)
 160 {
 161     scheme    = other.scheme;
 162     schemeStr = other.schemeStr;
 163     authority = other.authority;
 164     port      = other.port;
 165     path      = other.path;
 166     absolute  = other.absolute;
 167     opaque    = other.opaque;
 168     query     = other.query;
 169     fragment  = other.fragment;
 170 }
 171
 172
 173 //#########################################################################
//# A T T R I B U T E S
 175 //#########################################################################
 176 static char *hexChars = "0123456789abcdef";
 177
 178 static DOMString toStr(const std::vector<int> &arr)
 179 {
 180     DOMString buf;
 181     std::vector<int>::const_iterator iter;
 182     for (iter=arr.begin() ; iter!=arr.end() ; iter++)
 183         {
 184         int ch = *iter;
 185         if (isprint(ch))
 186             buf.push_back((XMLCh)ch);
 187         else
 188             {
 189             buf.push_back('%');
 190             int hi = ((ch>>4) & 0xf);
 191             buf.push_back(hexChars[hi]);
 192             int lo = ((ch   ) & 0xf);
 193             buf.push_back(hexChars[lo]);
 194             }
 195         }
 196     return buf;
 197 }
 198
 199
 200 DOMString URI::toString() const
 201 {
 202     DOMString str = schemeStr;
 203     if (authority.size() > 0)
 204         {
 205         str.append("//");
 206         str.append(toStr(authority));
 207         }
 208     str.append(toStr(path));
 209     if (query.size() > 0)
 210         {
 211         str.append("?");
 212         str.append(toStr(query));
 213         }
 214     if (fragment.size() > 0)
 215         {
 216         str.append("#");
 217         str.append(toStr(fragment));
 218         }
 219     return str;
 220 }
 221
 222
 223 int URI::getScheme() const
 224 {
 225     return scheme;
 226 }
 227
 228 DOMString URI::getSchemeStr() const
 229 {
 230     return schemeStr;
 231 }
 232
 233
 234 DOMString URI::getAuthority() const
 235 {
 236     DOMString ret = toStr(authority);
 237     if (portSpecified && port>=0)
 238         {
 239         char buf[7];
 240         snprintf(buf, 6, ":%6d", port);
 241         ret.append(buf);
 242         }
 243     return ret;
 244 }
 245
 246 DOMString URI::getHost() const
 247 {
 248     DOMString str = toStr(authority);
 249     return str;
 250 }
 251
 252 int URI::getPort() const
 253 {
 254     return port;
 255 }
 256
 257
 258 DOMString URI::getPath() const
 259 {
 260     DOMString str = toStr(path);
 261     return str;
 262 }
 263
 264 DOMString URI::getNativePath() const
 265 {
 266     DOMString pathStr = toStr(path);
 267     DOMString npath;
 268 #ifdef __WIN32__
 269     unsigned int firstChar = 0;
 270     if (pathStr.size() >= 3)
 271         {
 272         if (pathStr[0] == '/' &&
 273             uni_is_letter(pathStr[1]) &&
 274             pathStr[2] == ':')
 275             firstChar++;
 276          }
 277     for (unsigned int i=firstChar ; i<pathStr.size() ; i++)
 278         {
 279         XMLCh ch = (XMLCh) pathStr[i];
 280         if (ch == '/')
 281             npath.push_back((XMLCh)'\\');
 282         else
 283             npath.push_back(ch);
 284         }
 285 #else
 286     npath = pathStr;
 287 #endif
 288     return npath;
 289 }
 290
 291
 292 bool URI::isAbsolute() const
 293 {
 294     return absolute;
 295 }
 296
 297 bool URI::isOpaque() const
 298 {
 299     return opaque;
 300 }
 301
 302
 303 DOMString URI::getQuery() const
 304 {
 305     DOMString str = toStr(query);
 306     return str;
 307 }
 308
 309
 310 DOMString URI::getFragment() const
 311 {
 312     DOMString str = toStr(fragment);
 313     return str;
 314 }
 315
 316
 317
 318
 319 static int find(const std::vector<int> &str, int ch, int startpos)
 320 {
 321     for (unsigned int i = startpos ; i < str.size() ; i++)
 322         {
 323         if (ch == str[i])
 324             return i;
 325         }
 326     return -1;
 327 }
 328
 329
 330 static int findLast(const std::vector<int> &str, int ch)
 331 {
 332     // TODO FIXME BUGBUG
 333     // This loop appears to be infinite, so it is probably not being called.
 334     // Test for a problem, then fix after it has been observed locking up.
 335     for (unsigned int i = str.size()-1 ; i>=0 ; i--)
 336         {
 337         if (ch == str[i])
 338             return i;
 339         }
 340     return -1;
 341 }
 342
 343
 344 static bool sequ(const std::vector<int> &str, char *key)
 345 {
 346     char *c = key;
 347     for (unsigned int i=0 ; i<str.size() ; i++)
 348         {
 349         if (! (*c))
 350             return false;
 351         if (*c != str[i])
 352             return false;
 353         }
 354     return true;
 355 }
 356
 357
 358 static std::vector<int> substr(const std::vector<int> &str,
 359                       int startpos, int len)
 360 {
 361     std::vector<int> buf;
 362     unsigned int pos = startpos;
 363     for (int i=0 ; i<len ; i++)
 364         {
 365         if (pos >= str.size())
 366             break;
 367         buf.push_back(str[pos++]);
 368         }
 369     return buf;
 370 }
 371
 372
 373 URI URI::resolve(const URI &other) const
 374 {
 375     //### According to w3c, this is handled in 3 cases
 376
 377     //## 1
 378     if (opaque || other.isAbsolute())
 379         return other;
 380
 381     //## 2
 382     if (other.fragment.size()  >  0 &&
 383         other.path.size()      == 0 &&
 384         other.scheme           == SCHEME_NONE &&
 385         other.authority.size() == 0 &&
 386         other.query.size()     == 0 )
 387         {
 388         URI fragUri = *this;
 389         fragUri.fragment = other.fragment;
 390         return fragUri;
 391         }
 392
 393     //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
 394     URI newUri;
 395     //# 3.1
 396     newUri.scheme    = scheme;
 397     newUri.schemeStr = schemeStr;
 398     newUri.query     = other.query;
 399     newUri.fragment  = other.fragment;
 400     if (other.authority.size() > 0)
 401         {
 402         //# 3.2
 403         if (absolute || other.absolute)
 404             newUri.absolute = true;
 405         newUri.authority = other.authority;
 406         newUri.port      = other.port;//part of authority
 407         newUri.path      = other.path;
 408         }
 409     else
 410         {
 411         //# 3.3
 412         if (other.absolute)
 413             {
 414             newUri.absolute = true;
 415             newUri.path     = other.path;
 416             }
 417         else
 418             {
 419             int pos = findLast(path, '/');
 420             if (pos >= 0)
 421                 {
 422                 newUri.path.clear();
 423                 //# append my path up to and including the '/'
 424                 for (int i = 0; i<=pos ; i++)
 425                        newUri.path.push_back(path[i]);
 426                 //# append other path
 427                 for (unsigned int i = 0; i<other.path.size() ; i++)
 428                        newUri.path.push_back(other.path[i]);
 429                 }
 430             else
 431                 newUri.path = other.path;
 432             }
 433         }
 434
 435     newUri.normalize();
 436
 437     return newUri;
 438 }
 439
 440
 441 /**
 442  *  This follows the Java URI algorithm:
 443  *   1. All "." segments are removed.
 444  *   2. If a ".." segment is preceded by a non-".." segment
 445  *          then both of these segments are removed. This step
 446  *          is repeated until it is no longer applicable.
 447  *   3. If the path is relative, and if its first segment
 448  *          contains a colon character (':'), then a "." segment
 449  *          is prepended. This prevents a relative URI with a path
 450  *          such as "a:b/c/d" from later being re-parsed as an
 451  *          opaque URI with a scheme of "a" and a scheme-specific
 452  *          part of "b/c/d". (Deviation from RFC 2396)
 453  */
 454 void URI::normalize()
 455 {
 456     std::vector< std::vector<int> > segments;
 457
 458     //## Collect segments
 459     if (path.size()<2)
 460         return;
 461     bool abs = false;
 462     int pos=0;
 463     int len = (int) path.size();
 464
 465     if (path[0]=='/')
 466         {
 467         abs = true;
 468         pos++;
 469         }
 470
 471     while (pos < len)
 472         {
 473         int pos2 = find(path, '/', pos);
 474         if (pos2 < 0)
 475             {
 476             std::vector<int> seg = substr(path, pos, path.size()-pos);
 477             //printf("last segment:%s\n", toStr(seg).c_str());
 478             segments.push_back(seg);
 479             break;
 480             }
 481         if (pos2>pos)
 482             {
 483             std::vector<int> seg = substr(path, pos, pos2-pos);
 484             //printf("segment:%s\n", toStr(seg).c_str());
 485             segments.push_back(seg);
 486             }
 487         pos = pos2;
 488         pos++;
 489         }
 490
 491     //## Clean up (normalize) segments
 492     bool edited = false;
 493     std::vector< std::vector<int> >::iterator iter;
 494     for (iter=segments.begin() ; iter!=segments.end() ; )
 495         {
 496         std::vector<int> s = *iter;
 497         if (sequ(s,"."))
 498             {
 499             iter = segments.erase(iter);
 500             edited = true;
 501             }
 502         else if (sequ(s, "..") && iter != segments.begin() &&
 503                  !sequ(*(iter-1), ".."))
 504             {
 505             iter--; //back up, then erase two entries
 506             iter = segments.erase(iter);
 507             iter = segments.erase(iter);
 508             edited = true;
 509             }
 510         else
 511             iter++;
 512         }
 513
 514     //## Rebuild path, if necessary
 515     if (edited)
 516         {
 517         path.clear();
 518         if (abs)
 519             {
 520             path.push_back('/');
 521             }
 522         std::vector< std::vector<int> >::iterator iter;
 523         for (iter=segments.begin() ; iter!=segments.end() ; iter++)
 524             {
 525             if (iter != segments.begin())
 526                 path.push_back('/');
 527             std::vector<int> seg = *iter;
 528             for (unsigned int i = 0; i<seg.size() ; i++)
 529                 path.push_back(seg[i]);
 530             }
 531         }
 532
 533 }
 534
 535
 536
 537 //#########################################################################
 538 //# M E S S A G E S
 539 //#########################################################################
 540
 541 void URI::error(const char *fmt, ...)
 542 {
 543     va_list args;
 544     fprintf(stderr, "URI error: ");
 545     va_start(args, fmt);
 546     vfprintf(stderr, fmt, args);
 547     va_end(args);
 548     fprintf(stderr, "\n");
 549 }
 550
 551 void URI::trace(const char *fmt, ...)
 552 {
 553     va_list args;
 554     fprintf(stdout, "URI: ");
 555     va_start(args, fmt);
 556     vfprintf(stdout, fmt, args);
 557     va_end(args);
 558     fprintf(stdout, "\n");
 559 }
 560
 561
 562
 563 //#########################################################################
 564 //# P A R S I N G
 565 //#########################################################################
 566
 567
 568
 569 int URI::peek(int p)
 570 {
 571     if (p<0 || p>=parselen)
 572         return -1;
 573     return parsebuf[p];
 574 }
 575
 576
 577
 578 int URI::match(int p0, char const *key)
 579 {
 580     int p = p0;
 581     while (p < parselen)
 582         {
 583         if (*key == '\0')
 584             return p;
 585         else if (*key != parsebuf[p])
 586             break;
 587         p++; key++;
 588         }
 589     return p0;
 590 }
 591
 592 //#########################################################################
 593 //#  Parsing is performed according to:
 594 //#  http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
 595 //#########################################################################
 596
 597 int URI::parseHex(int p0, int &result)
 598 {
 599     int p = p0;
 600     int val = 0;
 601
 602     //# Upper 4
 603     int ch = peek(p);
 604     if (ch >= '0' && ch <= '9')
 605         val += (ch - '0');
 606     else if (ch >= 'a' && ch <= 'f')
 607         val += (10 + ch - 'a');
 608     else if (ch >= 'A' && ch <= 'F')
 609         val += (10 + ch - 'A');
 610     else
 611         {
 612         error("parseHex : unexpected character : %c", ch);
 613         return -1;
 614         }
 615     p++;
 616     val <<= 4;
 617
 618     //# Lower 4
 619     ch = peek(p);
 620     if (ch >= '0' && ch <= '9')
 621         val += (ch - '0');
 622     else if (ch >= 'a' && ch <= 'f')
 623         val += (10 + ch - 'a');
 624     else if (ch >= 'A' && ch <= 'F')
 625         val += (10 + ch - 'A');
 626     else
 627         {
 628         error("parseHex : unexpected character : %c", ch);
 629         return -1;
 630         }
 631     p++;
 632     result = val;
 633     return p;
 634 }
 635
 636
 637
 638 int URI::parseEntity(int p0, int &result)
 639 {
 640     int p = p0;
 641     int ch = peek(p);
 642     if (ch != '&')
 643         return p0;
 644     p++;
 645     if (!match(p, "#x"))
 646         {
 647         error("parseEntity: expected '#x'");
 648         return -1;
 649         }
 650     p += 2;
 651     int val;
 652     p = parseHex(p, val);
 653     if (p<0)
 654         return -1;
 655     ch = peek(p);
 656     if (ch != ';')
 657         {
 658         error("parseEntity: expected ';'");
 659         return -1;
 660         }
 661     p++;
 662     result = val;
 663     return p;
 664 }
 665
 666 int URI::parseAsciiEntity(int p0, int &result)
 667 {
 668     int p = p0;
 669     int ch = peek(p);
 670     if (ch != '%')
 671         return p0;
 672     p++;
 673     int val;
 674     p = parseHex(p, val);
 675     if (p<0)
 676         return -1;
 677     result = val;
 678     return p;
 679 }
 680
 681
 682 int URI::parseScheme(int p0)
 683 {
 684     int p = p0;
 685     for (LookupEntry *entry = schemes; entry->sval ; entry++)
 686         {
 687         int p2 = match(p, entry->sval);
 688         if (p2 > p)
 689             {
 690             schemeStr = entry->sval;
 691             scheme    = entry->ival;
 692             port      = entry->port;
 693             p = p2;
 694             return p;
 695             }
 696         }
 697
 698     return p;
 699 }
 700
 701
 702 int URI::parseHierarchicalPart(int p0)
 703 {
 704     int p = p0;
 705     int ch;
 706
 707     //# Authority field (host and port, for example)
 708     int p2 = match(p, "//");
 709     if (p2 > p)
 710         {
 711         p = p2;
 712         portSpecified = false;
 713         DOMString portStr;
 714         while (p < parselen)
 715             {
 716             ch = peek(p);
 717             if (ch == '/')
 718                 break;
 719             else if (ch == '&') //IRI entity
 720                 {
 721                 int val;
 722                 p2 = parseEntity(p, val);
 723                 if (p2<p)
 724                     {
 725                     return -1;
 726                     }
 727                 p = p2;
 728                 authority.push_back((XMLCh)val);
 729                 }
 730             else if (ch == '%') //ascii hex excape
 731                 {
 732                 int val;
 733                 p2 = parseAsciiEntity(p, val);
 734                 if (p2<p)
 735                     {
 736                     return -1;
 737                     }
 738                 p = p2;
 739                 authority.push_back((XMLCh)val);
 740                 }
 741             else if (ch == ':')
 742                 {
 743                 portSpecified = true;
 744                 p++;
 745                 }
 746             else if (portSpecified)
 747                 {
 748                 portStr.push_back((XMLCh)ch);
 749                 p++;
 750                 }
 751             else
 752                 {
 753                 authority.push_back((XMLCh)ch);
 754                 p++;
 755                 }
 756             }
 757         if (portStr.size() > 0)
 758             {
 759             char *pstr = (char *)portStr.c_str();
 760             char *endStr;
 761             long val = strtol(pstr, &endStr, 10);
 762             if (endStr > pstr) //successful parse?
 763                 port = val;
 764             }
 765         }
 766
 767     //# Are we absolute?
 768     ch = peek(p);
 769     if (uni_is_letter(ch) && peek(p+1)==':')
 770         {
 771         absolute = true;
 772         path.push_back((XMLCh)'/');
 773         }
 774     else if (ch == '/')
 775         {
 776         absolute = true;
 777         if (p>p0) //in other words, if '/' is not the first char
 778             opaque = true;
 779         path.push_back((XMLCh)ch);
 780         p++;
 781         }
 782
 783     while (p < parselen)
 784         {
 785         ch = peek(p);
 786         if (ch == '?' || ch == '#')
 787             break;
 788         else if (ch == '&') //IRI entity
 789             {
 790             int val;
 791             p2 = parseEntity(p, val);
 792             if (p2<p)
 793                 {
 794                 return -1;
 795                 }
 796             p = p2;
 797             path.push_back((XMLCh)val);
 798             }
 799         else if (ch == '%') //ascii hex excape
 800             {
 801             int val;
 802             p2 = parseAsciiEntity(p, val);
 803             if (p2<p)
 804                 {
 805                 return -1;
 806                 }
 807             p = p2;
 808             path.push_back((XMLCh)val);
 809             }
 810         else
 811             {
 812             path.push_back((XMLCh)ch);
 813             p++;
 814             }
 815         }
 816     //trace("path:%s", toStr(path).c_str());
 817     return p;
 818 }
 819
 820 int URI::parseQuery(int p0)
 821 {
 822     int p = p0;
 823     int ch = peek(p);
 824     if (ch != '?')
 825         return p0;
 826
 827     p++;
 828     while (p < parselen)
 829         {
 830         ch = peek(p);
 831         if (ch == '#')
 832             break;
 833         query.push_back((XMLCh)ch);
 834         p++;
 835         }
 836
 837
 838     return p;
 839 }
 840
 841 int URI::parseFragment(int p0)
 842 {
 843
 844     int p = p0;
 845     int ch = peek(p);
 846     if (ch != '#')
 847         return p0;
 848
 849     p++;
 850     while (p < parselen)
 851         {
 852         ch = peek(p);
 853         if (ch == '?')
 854             break;
 855         fragment.push_back(ch);
 856         p++;
 857         }
 858
 859
 860     return p;
 861 }
 862
 863
 864 int URI::parse(int p0)
 865 {
 866
 867     int p = p0;
 868
 869     int p2 = parseScheme(p);
 870     if (p2 < 0)
 871         {
 872         error("Scheme");
 873         return -1;
 874         }
 875     p = p2;
 876
 877
 878     p2 = parseHierarchicalPart(p);
 879     if (p2 < 0)
 880         {
 881         error("Hierarchical part");
 882         return -1;
 883         }
 884     p = p2;
 885
 886     p2 = parseQuery(p);
 887     if (p2 < 0)
 888         {
 889         error("Query");
 890         return -1;
 891         }
 892     p = p2;
 893
 894
 895     p2 = parseFragment(p);
 896     if (p2 < 0)
 897         {
 898         error("Fragment");
 899         return -1;
 900         }
 901     p = p2;
 902
 903     return p;
 904
 905 }
 906
 907
 908
 909 bool URI::parse(const DOMString &str)
 910 {
 911
 912     parselen = str.size();
 913     parsebuf = new int[str.size()];
 914     if (!parsebuf)
 915         {
 916         error("parse : could not allocate parsebuf");
 917         return false;
 918         }
 919
 920     DOMString::const_iterator iter;
 921     unsigned int i=0;
 922     for (iter= str.begin() ; iter!=str.end() ; iter++)
 923         {
 924         int ch = *iter;
 925         if (ch == '\\')
 926             parsebuf[i++] = '/';
 927         else
 928             parsebuf[i++] = ch;
 929         }
 930
 931
 932     int p = parse(0);
 933     normalize();
 934
 935     delete[] parsebuf;
 936
 937     if (p < 0)
 938         {
 939         error("Syntax error");
 940         return false;
 941         }
 942
 943     //printf("uri:%s\n", toString().c_str());
 944     //printf("parse:%s\n", toStr(path).c_str());
 945
 946     return true;
 947
 948 }
 949
 950
 951
 952
 953
 954 }  //namespace dom
 955 }  //namespace w3c
 956 }  //namespace org
 957 //#########################################################################
 958 //# E N D    O F    F I L E
 959 //#########################################################################
 960
 961
 962