src/dom/uri.cpp

   1 /**
   2  * Phoebe DOM Implementation.
   3  *
   4  * This is a C++ approximation of the W3C DOM model, which follows
   5  * fairly closely the specifications in the various .idl files, copies of
   6  * which are provided for reference.  Most important is this one:
   7  *
   8  * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
   9  *
  10  * Authors:
  11  *   Bob Jamison
  12  *
  13  * Copyright (C) 2005-2007 Bob Jamison
  14  *
  15  *  This library is free software; you can redistribute it and/or
  16  *  modify it under the terms of the GNU Lesser General Public
  17  *  License as published by the Free Software Foundation; either
  18  *  version 2.1 of the License, or (at your option) any later version.
  19  *
  20  *  This library is distributed in the hope that it will be useful,
  21  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  22  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23  *  Lesser General Public License for more details.
  24  *
  25  *  You should have received a copy of the GNU Lesser General Public
  26  *  License along with this library; if not, write to the Free Software
  27  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  28  */
  29
  30
  31
  32
  33 #include "uri.h"
  34 #include "charclass.h"
  35
  36 #include <stdio.h>
  37 #include <stdarg.h>
  38
  39
  40
  41 namespace org
  42 {
  43 namespace w3c
  44 {
  45 namespace dom
  46 {
  47
  48
  49 typedef struct
  50 {
  51     int  ival;
  52     char *sval;
  53     int  port;
  54 } LookupEntry;
  55
  56 static LookupEntry schemes[] =
  57 {
  58     { URI::SCHEME_DATA,   "data:",    0 },
  59     { URI::SCHEME_HTTP,   "http:",   80 },
  60     { URI::SCHEME_HTTPS,  "https:", 443 },
  61     { URI::SCHEME_FTP,    "ftp",     12 },
  62     { URI::SCHEME_FILE,   "file:",    0 },
  63     { URI::SCHEME_LDAP,   "ldap:",  123 },
  64     { URI::SCHEME_MAILTO, "mailto:", 25 },
  65     { URI::SCHEME_NEWS,   "news:",  117 },
  66     { URI::SCHEME_TELNET, "telnet:", 23 },
  67     { 0,                  NULL,       0 }
  68 };
  69
  70
  71
  72 //#########################################################################
  73 //# C O N S T R U C T O R
  74 //#########################################################################
  75
  76 /**
  77  *
  78  */
  79 URI::URI()
  80 {
  81     init();
  82 }
  83
  84 /**
  85  *
  86  */
  87 URI::URI(const DOMString &str)
  88 {
  89     init();
  90     parse(str);
  91 }
  92
  93
  94 /**
  95  *
  96  */
  97 URI::URI(const char *str)
  98 {
  99     init();
 100     DOMString domStr = str;
 101     parse(domStr);
 102 }
 103
 104
 105 /**
 106  *
 107  */
 108 URI::URI(const URI &other)
 109 {
 110     init();
 111     assign(other);
 112 }
 113
 114
 115 /**
 116  *
 117  */
 118 URI &URI::operator=(const URI &other)
 119 {
 120     init();
 121     assign(other);
 122     return *this;
 123 }
 124
 125
 126 /**
 127  *
 128  */
 129 URI::~URI()
 130 {
 131 }
 132
 133
 134
 135
 136
 137 /**
 138  *
 139  */
 140 void URI::init()
 141 {
 142     parsebuf  = NULL;
 143     parselen  = 0;
 144     scheme    = SCHEME_NONE;
 145     schemeStr.clear();
 146     port      = 0;
 147     authority.clear();
 148     path.clear();
 149     absolute  = false;
 150     opaque    = false;
 151     query.clear();
 152     fragment.clear();
 153 }
 154
 155
 156 /**
 157  *
 158  */
 159 void URI::assign(const URI &other)
 160 {
 161     scheme    = other.scheme;
 162     schemeStr = other.schemeStr;
 163     authority = other.authority;
 164     port      = other.port;
 165     path      = other.path;
 166     absolute  = other.absolute;
 167     opaque    = other.opaque;
 168     query     = other.query;
 169     fragment  = other.fragment;
 170 }
 171
 172
 173 //#########################################################################
//# A T T R I B U T E S
 175 //#########################################################################
 176 static char *hexChars = "0123456789abcdef";
 177
 178 static DOMString toStr(const std::vector<int> &arr)
 179 {
 180     DOMString buf;
 181     std::vector<int>::const_iterator iter;
 182     for (iter=arr.begin() ; iter!=arr.end() ; iter++)
 183         {
 184         int ch = *iter;
 185         if (isprint(ch))
 186             buf.push_back((XMLCh)ch);
 187         else
 188             {
 189             buf.push_back('%');
 190             int hi = ((ch>>4) & 0xf);
 191             buf.push_back(hexChars[hi]);
 192             int lo = ((ch   ) & 0xf);
 193             buf.push_back(hexChars[lo]);
 194             }
 195         }
 196     return buf;
 197 }
 198
 199
 200 DOMString URI::toString() const
 201 {
 202     DOMString str = schemeStr;
 203     if (authority.size() > 0)
 204         {
 205         str.append("//");
 206         str.append(toStr(authority));
 207         }
 208     str.append(toStr(path));
 209     if (query.size() > 0)
 210         {
 211         str.append("?");
 212         str.append(toStr(query));
 213         }
 214     if (fragment.size() > 0)
 215         {
 216         str.append("#");
 217         str.append(toStr(fragment));
 218         }
 219     return str;
 220 }
 221
 222
 223 int URI::getScheme() const
 224 {
 225     return scheme;
 226 }
 227
 228 DOMString URI::getSchemeStr() const
 229 {
 230     return schemeStr;
 231 }
 232
 233
 234 DOMString URI::getAuthority() const
 235 {
 236     DOMString ret = toStr(authority);
 237     if (portSpecified && port>=0)
 238         {
 239         char buf[7];
 240         snprintf(buf, 6, ":%6d", port);
 241         ret.append(buf);
 242         }
 243     return ret;
 244 }
 245
 246 DOMString URI::getHost() const
 247 {
 248     DOMString str = toStr(authority);
 249     return str;
 250 }
 251
 252 int URI::getPort() const
 253 {
 254     return port;
 255 }
 256
 257
 258 DOMString URI::getPath() const
 259 {
 260     DOMString str = toStr(path);
 261     return str;
 262 }
 263
 264 DOMString URI::getNativePath() const
 265 {
 266     DOMString pathStr = toStr(path);
 267     DOMString npath;
 268 #ifdef __WIN32__
 269     unsigned int firstChar = 0;
 270     if (pathStr.size() >= 3)
 271         {
 272         if (pathStr[0] == '/' &&
 273             isLetter(pathStr[1]) &&
 274             pathStr[2] == ':')
 275             firstChar++;
 276          }
 277     for (unsigned int i=firstChar ; i<pathStr.size() ; i++)
 278         {
 279         XMLCh ch = (XMLCh) pathStr[i];
 280         if (ch == '/')
 281             npath.push_back((XMLCh)'\\');
 282         else
 283             npath.push_back(ch);
 284         }
 285 #else
 286     npath = pathStr;
 287 #endif
 288     return npath;
 289 }
 290
 291
 292 bool URI::isAbsolute() const
 293 {
 294     return absolute;
 295 }
 296
 297 bool URI::isOpaque() const
 298 {
 299     return opaque;
 300 }
 301
 302
 303 DOMString URI::getQuery() const
 304 {
 305     DOMString str = toStr(query);
 306     return str;
 307 }
 308
 309
 310 DOMString URI::getFragment() const
 311 {
 312     DOMString str = toStr(fragment);
 313     return str;
 314 }
 315
 316
 317
 318
 319 static int find(const std::vector<int> &str, int ch, int startpos)
 320 {
 321     for (unsigned int i = startpos ; i < str.size() ; i++)
 322         {
 323         if (ch == str[i])
 324             return i;
 325         }
 326     return -1;
 327 }
 328
 329
 330 static int findLast(const std::vector<int> &str, int ch)
 331 {
 332     for (unsigned int i = str.size()-1 ; i>=0 ; i--)
 333         {
 334         if (ch == str[i])
 335             return i;
 336         }
 337     return -1;
 338 }
 339
 340
 341 static bool sequ(const std::vector<int> &str, char *key)
 342 {
 343     char *c = key;
 344     for (unsigned int i=0 ; i<str.size() ; i++)
 345         {
 346         if (! (*c))
 347             return false;
 348         if (*c != str[i])
 349             return false;
 350         }
 351     return true;
 352 }
 353
 354
 355 static std::vector<int> substr(const std::vector<int> &str,
 356                       int startpos, int len)
 357 {
 358     std::vector<int> buf;
 359     unsigned int pos = startpos;
 360     for (int i=0 ; i<len ; i++)
 361         {
 362         if (pos >= str.size())
 363             break;
 364         buf.push_back(str[pos++]);
 365         }
 366     return buf;
 367 }
 368
 369
 370 URI URI::resolve(const URI &other) const
 371 {
 372     //### According to w3c, this is handled in 3 cases
 373
 374     //## 1
 375     if (opaque || other.isAbsolute())
 376         return other;
 377
 378     //## 2
 379     if (other.fragment.size()  >  0 &&
 380         other.path.size()      == 0 &&
 381         other.scheme           == SCHEME_NONE &&
 382         other.authority.size() == 0 &&
 383         other.query.size()     == 0 )
 384         {
 385         URI fragUri = *this;
 386         fragUri.fragment = other.fragment;
 387         return fragUri;
 388         }
 389
 390     //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
 391     URI newUri;
 392     //# 3.1
 393     newUri.scheme    = scheme;
 394     newUri.schemeStr = schemeStr;
 395     newUri.query     = other.query;
 396     newUri.fragment  = other.fragment;
 397     if (other.authority.size() > 0)
 398         {
 399         //# 3.2
 400         if (absolute || other.absolute)
 401             newUri.absolute = true;
 402         newUri.authority = other.authority;
 403         newUri.port      = other.port;//part of authority
 404         newUri.path      = other.path;
 405         }
 406     else
 407         {
 408         //# 3.3
 409         if (other.absolute)
 410             {
 411             newUri.absolute = true;
 412             newUri.path     = other.path;
 413             }
 414         else
 415             {
 416             int pos = findLast(path, '/');
 417             if (pos >= 0)
 418                 {
 419                 newUri.path.clear();
 420                 //# append my path up to and including the '/'
 421                 for (int i = 0; i<=pos ; i++)
 422                        newUri.path.push_back(path[i]);
 423                 //# append other path
 424                 for (unsigned int i = 0; i<other.path.size() ; i++)
 425                        newUri.path.push_back(other.path[i]);
 426                 }
 427             else
 428                 newUri.path = other.path;
 429             }
 430         }
 431
 432     newUri.normalize();
 433
 434     return newUri;
 435 }
 436
 437
 438 /**
 439  *  This follows the Java URI algorithm:
 440  *   1. All "." segments are removed.
 441  *   2. If a ".." segment is preceded by a non-".." segment
 442  *          then both of these segments are removed. This step
 443  *          is repeated until it is no longer applicable.
 444  *   3. If the path is relative, and if its first segment
 445  *          contains a colon character (':'), then a "." segment
 446  *          is prepended. This prevents a relative URI with a path
 447  *          such as "a:b/c/d" from later being re-parsed as an
 448  *          opaque URI with a scheme of "a" and a scheme-specific
 449  *          part of "b/c/d". (Deviation from RFC 2396)
 450  */
 451 void URI::normalize()
 452 {
 453     std::vector< std::vector<int> > segments;
 454
 455     //## Collect segments
 456     if (path.size()<2)
 457         return;
 458     bool abs = false;
 459     int pos=0;
 460     int len = (int) path.size();
 461
 462     if (path[0]=='/')
 463         {
 464         abs = true;
 465         pos++;
 466         }
 467
 468     while (pos < len)
 469         {
 470         int pos2 = find(path, '/', pos);
 471         if (pos2 < 0)
 472             {
 473             std::vector<int> seg = substr(path, pos, path.size()-pos);
 474             //printf("last segment:%s\n", toStr(seg).c_str());
 475             segments.push_back(seg);
 476             break;
 477             }
 478         if (pos2>pos)
 479             {
 480             std::vector<int> seg = substr(path, pos, pos2-pos);
 481             //printf("segment:%s\n", toStr(seg).c_str());
 482             segments.push_back(seg);
 483             }
 484         pos = pos2;
 485         pos++;
 486         }
 487
 488     //## Clean up (normalize) segments
 489     bool edited = false;
 490     std::vector< std::vector<int> >::iterator iter;
 491     for (iter=segments.begin() ; iter!=segments.end() ; )
 492         {
 493         std::vector<int> s = *iter;
 494         if (sequ(s,"."))
 495             {
 496             iter = segments.erase(iter);
 497             edited = true;
 498             }
 499         else if (sequ(s, "..") && iter != segments.begin() &&
 500                  !sequ(*(iter-1), ".."))
 501             {
 502             iter--; //back up, then erase two entries
 503             iter = segments.erase(iter);
 504             iter = segments.erase(iter);
 505             edited = true;
 506             }
 507         else
 508             iter++;
 509         }
 510
 511     //## Rebuild path, if necessary
 512     if (edited)
 513         {
 514         path.clear();
 515         if (abs)
 516             {
 517             path.push_back('/');
 518             }
 519         std::vector< std::vector<int> >::iterator iter;
 520         for (iter=segments.begin() ; iter!=segments.end() ; iter++)
 521             {
 522             if (iter != segments.begin())
 523                 path.push_back('/');
 524             std::vector<int> seg = *iter;
 525             for (unsigned int i = 0; i<seg.size() ; i++)
 526                 path.push_back(seg[i]);
 527             }
 528         }
 529
 530 }
 531
 532
 533
 534 //#########################################################################
 535 //# M E S S A G E S
 536 //#########################################################################
 537
 538 void URI::error(const char *fmt, ...)
 539 {
 540     va_list args;
 541     fprintf(stderr, "URI error: ");
 542     va_start(args, fmt);
 543     vfprintf(stderr, fmt, args);
 544     va_end(args);
 545     fprintf(stderr, "\n");
 546 }
 547
 548 void URI::trace(const char *fmt, ...)
 549 {
 550     va_list args;
 551     fprintf(stdout, "URI: ");
 552     va_start(args, fmt);
 553     vfprintf(stdout, fmt, args);
 554     va_end(args);
 555     fprintf(stdout, "\n");
 556 }
 557
 558
 559
 560 //#########################################################################
 561 //# P A R S I N G
 562 //#########################################################################
 563
 564
 565
 566 int URI::peek(int p)
 567 {
 568     if (p<0 || p>=parselen)
 569         return -1;
 570     return parsebuf[p];
 571 }
 572
 573
 574
 575 int URI::match(int p0, char *key)
 576 {
 577     int p = p0;
 578     while (p < parselen)
 579         {
 580         if (*key == '\0')
 581             return p;
 582         else if (*key != parsebuf[p])
 583             break;
 584         p++; key++;
 585         }
 586     return p0;
 587 }
 588
 589 //#########################################################################
 590 //#  Parsing is performed according to:
 591 //#  http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
 592 //#########################################################################
 593
 594 int URI::parseHex(int p0, int &result)
 595 {
 596     int p = p0;
 597     int val = 0;
 598
 599     //# Upper 4
 600     int ch = peek(p);
 601     if (ch >= '0' && ch <= '9')
 602         val += (ch - '0');
 603     else if (ch >= 'a' && ch <= 'f')
 604         val += (10 + ch - 'a');
 605     else if (ch >= 'A' && ch <= 'F')
 606         val += (10 + ch - 'A');
 607     else
 608         {
 609         error("parseHex : unexpected character : %c", ch);
 610         return -1;
 611         }
 612     p++;
 613     val <<= 4;
 614
 615     //# Lower 4
 616     ch = peek(p);
 617     if (ch >= '0' && ch <= '9')
 618         val += (ch - '0');
 619     else if (ch >= 'a' && ch <= 'f')
 620         val += (10 + ch - 'a');
 621     else if (ch >= 'A' && ch <= 'F')
 622         val += (10 + ch - 'A');
 623     else
 624         {
 625         error("parseHex : unexpected character : %c", ch);
 626         return -1;
 627         }
 628     p++;
 629     result = val;
 630     return p;
 631 }
 632
 633
 634
 635 int URI::parseEntity(int p0, int &result)
 636 {
 637     int p = p0;
 638     int ch = peek(p);
 639     if (ch != '&')
 640         return p0;
 641     p++;
 642     if (!match(p, "#x"))
 643         {
 644         error("parseEntity: expected '#x'");
 645         return -1;
 646         }
 647     p += 2;
 648     int val;
 649     p = parseHex(p, val);
 650     if (p<0)
 651         return -1;
 652     ch = peek(p);
 653     if (ch != ';')
 654         {
 655         error("parseEntity: expected ';'");
 656         return -1;
 657         }
 658     p++;
 659     result = val;
 660     return p;
 661 }
 662
 663 int URI::parseAsciiEntity(int p0, int &result)
 664 {
 665     int p = p0;
 666     int ch = peek(p);
 667     if (ch != '%')
 668         return p0;
 669     p++;
 670     int val;
 671     p = parseHex(p, val);
 672     if (p<0)
 673         return -1;
 674     result = val;
 675     return p;
 676 }
 677
 678
 679 int URI::parseScheme(int p0)
 680 {
 681     int p = p0;
 682     for (LookupEntry *entry = schemes; entry->sval ; entry++)
 683         {
 684         int p2 = match(p, entry->sval);
 685         if (p2 > p)
 686             {
 687             schemeStr = entry->sval;
 688             scheme    = entry->ival;
 689             port      = entry->port;
 690             p = p2;
 691             return p;
 692             }
 693         }
 694
 695     return p;
 696 }
 697
 698
 699 int URI::parseHierarchicalPart(int p0)
 700 {
 701     int p = p0;
 702     int ch;
 703
 704     //# Authority field (host and port, for example)
 705     int p2 = match(p, "//");
 706     if (p2 > p)
 707         {
 708         p = p2;
 709         portSpecified = false;
 710         DOMString portStr;
 711         while (p < parselen)
 712             {
 713             ch = peek(p);
 714             if (ch == '/')
 715                 break;
 716             else if (ch == '&') //IRI entity
 717                 {
 718                 int val;
 719                 p2 = parseEntity(p, val);
 720                 if (p2<p)
 721                     {
 722                     return -1;
 723                     }
 724                 p = p2;
 725                 authority.push_back((XMLCh)val);
 726                 }
 727             else if (ch == '%') //ascii hex excape
 728                 {
 729                 int val;
 730                 p2 = parseAsciiEntity(p, val);
 731                 if (p2<p)
 732                     {
 733                     return -1;
 734                     }
 735                 p = p2;
 736                 authority.push_back((XMLCh)val);
 737                 }
 738             else if (ch == ':')
 739                 {
 740                 portSpecified = true;
 741                 p++;
 742                 }
 743             else if (portSpecified)
 744                 {
 745                 portStr.push_back((XMLCh)ch);
 746                 p++;
 747                 }
 748             else
 749                 {
 750                 authority.push_back((XMLCh)ch);
 751                 p++;
 752                 }
 753             }
 754         if (portStr.size() > 0)
 755             {
 756             char *pstr = (char *)portStr.c_str();
 757             char *endStr;
 758             long val = strtol(pstr, &endStr, 10);
 759             if (endStr > pstr) //successful parse?
 760                 port = val;
 761             }
 762         }
 763
 764     //# Are we absolute?
 765     ch = peek(p);
 766     if (isLetter(ch) && peek(p+1)==':')
 767         {
 768         absolute = true;
 769         path.push_back((XMLCh)'/');
 770         }
 771     else if (ch == '/')
 772         {
 773         absolute = true;
 774         if (p>p0) //in other words, if '/' is not the first char
 775             opaque = true;
 776         path.push_back((XMLCh)ch);
 777         p++;
 778         }
 779
 780     while (p < parselen)
 781         {
 782         ch = peek(p);
 783         if (ch == '?' || ch == '#')
 784             break;
 785         else if (ch == '&') //IRI entity
 786             {
 787             int val;
 788             p2 = parseEntity(p, val);
 789             if (p2<p)
 790                 {
 791                 return -1;
 792                 }
 793             p = p2;
 794             path.push_back((XMLCh)val);
 795             }
 796         else if (ch == '%') //ascii hex excape
 797             {
 798             int val;
 799             p2 = parseAsciiEntity(p, val);
 800             if (p2<p)
 801                 {
 802                 return -1;
 803                 }
 804             p = p2;
 805             path.push_back((XMLCh)val);
 806             }
 807         else
 808             {
 809             path.push_back((XMLCh)ch);
 810             p++;
 811             }
 812         }
 813     //trace("path:%s", toStr(path).c_str());
 814     return p;
 815 }
 816
 817 int URI::parseQuery(int p0)
 818 {
 819     int p = p0;
 820     int ch = peek(p);
 821     if (ch != '?')
 822         return p0;
 823
 824     p++;
 825     while (p < parselen)
 826         {
 827         ch = peek(p);
 828         if (ch == '#')
 829             break;
 830         query.push_back((XMLCh)ch);
 831         p++;
 832         }
 833
 834
 835     return p;
 836 }
 837
 838 int URI::parseFragment(int p0)
 839 {
 840
 841     int p = p0;
 842     int ch = peek(p);
 843     if (ch != '#')
 844         return p0;
 845
 846     p++;
 847     while (p < parselen)
 848         {
 849         ch = peek(p);
 850         if (ch == '?')
 851             break;
 852         fragment.push_back(ch);
 853         p++;
 854         }
 855
 856
 857     return p;
 858 }
 859
 860
 861 int URI::parse(int p0)
 862 {
 863
 864     int p = p0;
 865
 866     int p2 = parseScheme(p);
 867     if (p2 < 0)
 868         {
 869         error("Scheme");
 870         return -1;
 871         }
 872     p = p2;
 873
 874
 875     p2 = parseHierarchicalPart(p);
 876     if (p2 < 0)
 877         {
 878         error("Hierarchical part");
 879         return -1;
 880         }
 881     p = p2;
 882
 883     p2 = parseQuery(p);
 884     if (p2 < 0)
 885         {
 886         error("Query");
 887         return -1;
 888         }
 889     p = p2;
 890
 891
 892     p2 = parseFragment(p);
 893     if (p2 < 0)
 894         {
 895         error("Fragment");
 896         return -1;
 897         }
 898     p = p2;
 899
 900     return p;
 901
 902 }
 903
 904
 905
 906 bool URI::parse(const DOMString &str)
 907 {
 908
 909     parselen = str.size();
 910     parsebuf = new int[str.size()];
 911     if (!parsebuf)
 912         {
 913         error("parse : could not allocate parsebuf");
 914         return false;
 915         }
 916
 917     DOMString::const_iterator iter;
 918     unsigned int i=0;
 919     for (iter= str.begin() ; iter!=str.end() ; iter++)
 920         {
 921         int ch = *iter;
 922         if (ch == '\\')
 923             parsebuf[i++] = '/';
 924         else
 925             parsebuf[i++] = ch;
 926         }
 927
 928
 929     int p = parse(0);
 930     normalize();
 931
 932     delete[] parsebuf;
 933
 934     if (p < 0)
 935         {
 936         error("Syntax error");
 937         return false;
 938         }
 939
 940     //printf("uri:%s\n", toString().c_str());
 941     //printf("parse:%s\n", toStr(path).c_str());
 942
 943     return true;
 944
 945 }
 946
 947
 948
 949
 950
 951 }  //namespace dom
 952 }  //namespace w3c
 953 }  //namespace org
 954 //#########################################################################
 955 //# E N D    O F    F I L E
 956 //#########################################################################
 957
 958
 959