src/dom/uri.cpp

   1 /**
   2  * Phoebe DOM Implementation.
   3  *
   4  * This is a C++ approximation of the W3C DOM model, which follows
   5  * fairly closely the specifications in the various .idl files, copies of
   6  * which are provided for reference.  Most important is this one:
   7  *
   8  * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
   9  *
  10  * Authors:
  11  *   Bob Jamison
  12  *
  13  * Copyright (C) 2005-2007 Bob Jamison
  14  *
  15  *  This library is free software; you can redistribute it and/or
  16  *  modify it under the terms of the GNU Lesser General Public
  17  *  License as published by the Free Software Foundation; either
  18  *  version 2.1 of the License, or (at your option) any later version.
  19  *
  20  *  This library is distributed in the hope that it will be useful,
  21  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  22  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23  *  Lesser General Public License for more details.
  24  *
  25  *  You should have received a copy of the GNU Lesser General Public
  26  *  License along with this library; if not, write to the Free Software
  27  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  28  */
  29
  30
  31
  32
  33 #include "uri.h"
  34 #include "charclass.h"
  35
  36 #include <stdio.h>
  37 #include <stdarg.h>
  38
  39
  40
  41 namespace org
  42 {
  43 namespace w3c
  44 {
  45 namespace dom
  46 {
  47
  48
  49 typedef struct
  50 {
  51     int  ival;
  52     char *sval;
  53     int  port;
  54 } LookupEntry;
  55
  56 static LookupEntry schemes[] =
  57 {
  58     { URI::SCHEME_DATA,   "data:",    0 },
  59     { URI::SCHEME_HTTP,   "http:",   80 },
  60     { URI::SCHEME_HTTPS,  "https:", 443 },
  61     { URI::SCHEME_FTP,    "ftp",     12 },
  62     { URI::SCHEME_FILE,   "file:",    0 },
  63     { URI::SCHEME_LDAP,   "ldap:",  123 },
  64     { URI::SCHEME_MAILTO, "mailto:", 25 },
  65     { URI::SCHEME_NEWS,   "news:",  117 },
  66     { URI::SCHEME_TELNET, "telnet:", 23 },
  67     { 0,                  NULL,       0 }
  68 };
  69
  70
  71
  72 //#########################################################################
  73 //# C O N S T R U C T O R
  74 //#########################################################################
  75
  76 /**
  77  *
  78  */
  79 URI::URI()
  80 {
  81     init();
  82 }
  83
  84 /**
  85  *
  86  */
  87 URI::URI(const DOMString &str)
  88 {
  89     init();
  90     parse(str);
  91 }
  92
  93
  94 /**
  95  *
  96  */
  97 URI::URI(const char *str)
  98 {
  99     init();
 100     DOMString domStr = str;
 101     parse(domStr);
 102 }
 103
 104
 105 /**
 106  *
 107  */
 108 URI::URI(const URI &other)
 109 {
 110     init();
 111     assign(other);
 112 }
 113
 114
 115 /**
 116  *
 117  */
 118 URI &URI::operator=(const URI &other)
 119 {
 120     init();
 121     assign(other);
 122     return *this;
 123 }
 124
 125
 126 /**
 127  *
 128  */
 129 URI::~URI()
 130 {
 131 }
 132
 133
 134
 135
 136
 137 /**
 138  *
 139  */
 140 void URI::init()
 141 {
 142     parsebuf  = NULL;
 143     parselen  = 0;
 144     scheme    = SCHEME_NONE;
 145     schemeStr = "";
 146     port      = 0;
 147     authority = "";
 148     path      = "";
 149     absolute  = false;
 150     opaque    = false;
 151     query     = "";
 152     fragment  = "";
 153 }
 154
 155
 156 /**
 157  *
 158  */
 159 void URI::assign(const URI &other)
 160 {
 161     scheme    = other.scheme;
 162     schemeStr = other.schemeStr;
 163     authority = other.authority;
 164     port      = other.port;
 165     path      = other.path;
 166     absolute  = other.absolute;
 167     opaque    = other.opaque;
 168     query     = other.query;
 169     fragment  = other.fragment;
 170 }
 171
 172
 173 //#########################################################################
//# A T T R I B U T E S
 175 //#########################################################################
 176
 177 DOMString URI::toString() const
 178 {
 179     DOMString str = schemeStr;
 180     if (authority.size() > 0)
 181         {
 182         str.append("//");
 183         str.append(authority);
 184         }
 185     str.append(path);
 186     if (query.size() > 0)
 187         {
 188         str.append("?");
 189         str.append(query);
 190         }
 191     if (fragment.size() > 0)
 192         {
 193         str.append("#");
 194         str.append(fragment);
 195         }
 196     return str;
 197 }
 198
 199
 200 int URI::getScheme() const
 201 {
 202     return scheme;
 203 }
 204
 205 DOMString URI::getSchemeStr() const
 206 {
 207     return schemeStr;
 208 }
 209
 210
 211 DOMString URI::getAuthority() const
 212 {
 213     DOMString ret = authority;
 214     if (portSpecified && port>=0)
 215         {
 216         char buf[7];
 217         snprintf(buf, 6, ":%6d", port);
 218         ret.append(buf);
 219         }
 220     return ret;
 221 }
 222
 223 DOMString URI::getHost() const
 224 {
 225     return authority;
 226 }
 227
 228 int URI::getPort() const
 229 {
 230     return port;
 231 }
 232
 233
 234 DOMString URI::getPath() const
 235 {
 236     return path;
 237 }
 238
 239 DOMString URI::getNativePath() const
 240 {
 241     DOMString npath;
 242 #ifdef __WIN32__
 243     unsigned int firstChar = 0;
 244     if (path.size() >= 3)
 245         {
 246         if (path[0] == '/' &&
 247             isLetter(path[1]) &&
 248             path[2] == ':')
 249             firstChar++;
 250          }
 251     for (unsigned int i=firstChar ; i<path.size() ; i++)
 252         {
 253         XMLCh ch = (XMLCh) path[i];
 254         if (ch == '/')
 255             npath.push_back((XMLCh)'\\');
 256         else
 257             npath.push_back(ch);
 258         }
 259 #else
 260     npath = path;
 261 #endif
 262     return npath;
 263 }
 264
 265
 266 bool URI::isAbsolute() const
 267 {
 268     return absolute;
 269 }
 270
 271 bool URI::isOpaque() const
 272 {
 273     return opaque;
 274 }
 275
 276
 277 DOMString URI::getQuery() const
 278 {
 279     return query;
 280 }
 281
 282
 283 DOMString URI::getFragment() const
 284 {
 285     return fragment;
 286 }
 287
 288
 289 URI URI::resolve(const URI &other) const
 290 {
 291     //### According to w3c, this is handled in 3 cases
 292
 293     //## 1
 294     if (opaque || other.isAbsolute())
 295         return other;
 296
 297     //## 2
 298     if (other.fragment.size()  >  0 &&
 299         other.path.size()      == 0 &&
 300         other.scheme           == SCHEME_NONE &&
 301         other.authority.size() == 0 &&
 302         other.query.size()     == 0 )
 303         {
 304         URI fragUri = *this;
 305         fragUri.fragment = other.fragment;
 306         return fragUri;
 307         }
 308
 309     //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
 310     URI newUri;
 311     //# 3.1
 312     newUri.scheme    = scheme;
 313     newUri.schemeStr = schemeStr;
 314     newUri.query     = other.query;
 315     newUri.fragment  = other.fragment;
 316     if (other.authority.size() > 0)
 317         {
 318         //# 3.2
 319         if (absolute || other.absolute)
 320             newUri.absolute = true;
 321         newUri.authority = other.authority;
 322         newUri.port      = other.port;//part of authority
 323         newUri.path      = other.path;
 324         }
 325     else
 326         {
 327         //# 3.3
 328         if (other.absolute)
 329             {
 330             newUri.absolute = true;
 331             newUri.path     = other.path;
 332             }
 333         else
 334             {
 335             DOMString::size_type pos = path.find_last_of('/');
 336             if (pos != path.npos)
 337                 {
 338                 DOMString tpath = path.substr(0, pos+1);
 339                 tpath.append(other.path);
 340                 newUri.path = tpath;
 341                 }
 342             else
 343                 newUri.path = other.path;
 344             }
 345         }
 346
 347     newUri.normalize();
 348     return newUri;
 349 }
 350
 351
 352 /**
 353  *  This follows the Java URI algorithm:
 354  *   1. All "." segments are removed.
 355  *   2. If a ".." segment is preceded by a non-".." segment
 356  *          then both of these segments are removed. This step
 357  *          is repeated until it is no longer applicable.
 358  *   3. If the path is relative, and if its first segment
 359  *          contains a colon character (':'), then a "." segment
 360  *          is prepended. This prevents a relative URI with a path
 361  *          such as "a:b/c/d" from later being re-parsed as an
 362  *          opaque URI with a scheme of "a" and a scheme-specific
 363  *          part of "b/c/d". (Deviation from RFC 2396)
 364  */
 365 void URI::normalize()
 366 {
 367     std::vector<DOMString> segments;
 368
 369     //## Collect segments
 370     if (path.size()<2)
 371         return;
 372     bool abs = false;
 373     unsigned int pos=0;
 374     if (path[0]=='/')
 375         {
 376         abs = true;
 377         pos++;
 378         }
 379     while (pos < path.size())
 380         {
 381         DOMString::size_type pos2 = path.find('/', pos);
 382         if (pos2==path.npos)
 383             {
 384             DOMString seg = path.substr(pos);
 385             //printf("last segment:%s\n", seg.c_str());
 386             segments.push_back(seg);
 387             break;
 388             }
 389         if (pos2>pos)
 390             {
 391             DOMString seg = path.substr(pos, pos2-pos);
 392             //printf("segment:%s\n", seg.c_str());
 393             segments.push_back(seg);
 394             }
 395         pos = pos2;
 396         pos++;
 397         }
 398
 399     //## Clean up (normalize) segments
 400     bool edited = false;
 401     std::vector<DOMString>::iterator iter;
 402     for (iter=segments.begin() ; iter!=segments.end() ; )
 403         {
 404         DOMString s = *iter;
 405         if (s == ".")
 406             {
 407             iter = segments.erase(iter);
 408             edited = true;
 409             }
 410         else if (s == ".." &&
 411                  iter != segments.begin() &&
 412                  *(iter-1) != "..")
 413             {
 414             iter--; //back up, then erase two entries
 415             iter = segments.erase(iter);
 416             iter = segments.erase(iter);
 417             edited = true;
 418             }
 419         else
 420             iter++;
 421         }
 422
 423     //## Rebuild path, if necessary
 424     if (edited)
 425         {
 426         path.clear();
 427         if (abs)
 428             {
 429             path.append("/");
 430             }
 431         std::vector<DOMString>::iterator iter;
 432         for (iter=segments.begin() ; iter!=segments.end() ; iter++)
 433             {
 434             if (iter != segments.begin())
 435                 path.append("/");
 436             path.append(*iter);
 437             }
 438         }
 439
 440 }
 441
 442
 443
 444 //#########################################################################
 445 //# M E S S A G E S
 446 //#########################################################################
 447
 448 void URI::error(const char *fmt, ...)
 449 {
 450     va_list args;
 451     fprintf(stderr, "URI error: ");
 452     va_start(args, fmt);
 453     vfprintf(stderr, fmt, args);
 454     va_end(args);
 455     fprintf(stderr, "\n");
 456 }
 457
 458 void URI::trace(const char *fmt, ...)
 459 {
 460     va_list args;
 461     fprintf(stdout, "URI: ");
 462     va_start(args, fmt);
 463     vfprintf(stdout, fmt, args);
 464     va_end(args);
 465     fprintf(stdout, "\n");
 466 }
 467
 468
 469
 470 //#########################################################################
 471 //# P A R S I N G
 472 //#########################################################################
 473
 474
 475
 476 int URI::peek(int p)
 477 {
 478     if (p<0 || p>=parselen)
 479         return -1;
 480     return parsebuf[p];
 481 }
 482
 483
 484
 485 int URI::match(int p0, char *key)
 486 {
 487     int p = p0;
 488     while (p < parselen)
 489         {
 490         if (*key == '\0')
 491             return p;
 492         else if (*key != parsebuf[p])
 493             break;
 494         p++; key++;
 495         }
 496     return p0;
 497 }
 498
 499 //#########################################################################
 500 //#  Parsing is performed according to:
 501 //#  http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
 502 //#########################################################################
 503
 504 int URI::parseHex(int p0, int &result)
 505 {
 506     int p = p0;
 507     int val = 0;
 508
 509     //# Upper 4
 510     XMLCh ch = peek(p);
 511     if (ch >= '0' && ch <= '9')
 512         val += (ch - '0');
 513     else if (ch >= 'a' && ch <= 'f')
 514         val += (10 + ch - 'a');
 515     else if (ch >= 'A' && ch <= 'F')
 516         val += (10 + ch - 'A');
 517     else
 518         {
 519         error("parseHex : unexpected character : %c", ch);
 520         return -1;
 521         }
 522     p++;
 523     val <<= 4;
 524
 525     //# Lower 4
 526     ch = peek(p);
 527     if (ch >= '0' && ch <= '9')
 528         val += (ch - '0');
 529     else if (ch >= 'a' && ch <= 'f')
 530         val += (10 + ch - 'a');
 531     else if (ch >= 'A' && ch <= 'F')
 532         val += (10 + ch - 'A');
 533     else
 534         {
 535         error("parseHex : unexpected character : %c", ch);
 536         return -1;
 537         }
 538     result = val;
 539     return p;
 540 }
 541
 542
 543
 544 int URI::parseEntity(int p0, int &result)
 545 {
 546     int p = p0;
 547     XMLCh ch = peek(p);
 548     if (ch != '&')
 549         return p0;
 550     p++;
 551     if (!match(p, "#x"))
 552         {
 553         error("parseEntity: expected '#x'");
 554         return -1;
 555         }
 556     p += 2;
 557     int val;
 558     p = parseHex(p, val);
 559     if (p<0)
 560         return -1;
 561     result = val;
 562     return p;
 563 }
 564
 565 int URI::parseAsciiEntity(int p0, int &result)
 566 {
 567     int p = p0;
 568     XMLCh ch = peek(p);
 569     if (ch != '%')
 570         return p0;
 571     p++;
 572     int val;
 573     p = parseHex(p, val);
 574     if (p<0)
 575         return -1;
 576     result = val;
 577     return p;
 578 }
 579
 580
 581 int URI::parseScheme(int p0)
 582 {
 583     int p = p0;
 584     for (LookupEntry *entry = schemes; entry->sval ; entry++)
 585         {
 586         int p2 = match(p, entry->sval);
 587         if (p2 > p)
 588             {
 589             schemeStr = entry->sval;
 590             scheme    = entry->ival;
 591             port      = entry->port;
 592             p = p2;
 593             return p;
 594             }
 595         }
 596
 597     return p;
 598 }
 599
 600
 601 int URI::parseHierarchicalPart(int p0)
 602 {
 603     int p = p0;
 604     int ch;
 605
 606     //# Authority field (host and port, for example)
 607     int p2 = match(p, "//");
 608     if (p2 > p)
 609         {
 610         p = p2;
 611         portSpecified = false;
 612         DOMString portStr;
 613         while (p < parselen)
 614             {
 615             ch = peek(p);
 616             if (ch == '/')
 617                 break;
 618             else if (ch == ':')
 619                 portSpecified = true;
 620             else if (portSpecified)
 621                 portStr.push_back((XMLCh)ch);
 622             else
 623                 authority.push_back((XMLCh)ch);
 624             p++;
 625             }
 626         if (portStr.size() > 0)
 627             {
 628             char *pstr = (char *)portStr.c_str();
 629             char *endStr;
 630             long val = strtol(pstr, &endStr, 10);
 631             if (endStr > pstr) //successful parse?
 632                 port = val;
 633             }
 634         }
 635
 636     //# Are we absolute?
 637     ch = peek(p);
 638     if (isLetter(ch) && peek(p+1)==':')
 639         {
 640         absolute = true;
 641         path.push_back((XMLCh)'/');
 642         }
 643     else if (ch == '/')
 644         {
 645         absolute = true;
 646         if (p>p0) //in other words, if '/' is not the first char
 647             opaque = true;
 648         path.push_back((XMLCh)ch);
 649         p++;
 650         }
 651
 652     while (p < parselen)
 653         {
 654         ch = peek(p);
 655         if (ch == '?' || ch == '#')
 656             break;
 657         else if (ch == '&') //IRI entity
 658             {
 659             int val;
 660             p2 = parseEntity(p, val);
 661             if (p2<p)
 662                 {
 663                 return -1;
 664                 }
 665             p = p2;
 666             path.push_back((XMLCh)val);
 667             }
 668         else if (ch == '%') //ascii hex excape
 669             {
 670             int val;
 671             p2 = parseAsciiEntity(p, val);
 672             if (p2<p)
 673                 {
 674                 return -1;
 675                 }
 676             p = p2;
 677             path.push_back((XMLCh)val);
 678             }
 679         else
 680             {
 681             path.push_back((XMLCh)ch);
 682             p++;
 683             }
 684         }
 685
 686     return p;
 687 }
 688
 689 int URI::parseQuery(int p0)
 690 {
 691     int p = p0;
 692     int ch = peek(p);
 693     if (ch != '?')
 694         return p0;
 695
 696     p++;
 697     while (p < parselen)
 698         {
 699         ch = peek(p);
 700         if (ch == '#')
 701             break;
 702         query.push_back((XMLCh)ch);
 703         p++;
 704         }
 705
 706
 707     return p;
 708 }
 709
 710 int URI::parseFragment(int p0)
 711 {
 712
 713     int p = p0;
 714     int ch = peek(p);
 715     if (ch != '#')
 716         return p0;
 717
 718     p++;
 719     while (p < parselen)
 720         {
 721         ch = peek(p);
 722         if (ch == '?')
 723             break;
 724         fragment.push_back((XMLCh)ch);
 725         p++;
 726         }
 727
 728
 729     return p;
 730 }
 731
 732
 733 int URI::parse(int p0)
 734 {
 735
 736     int p = p0;
 737
 738     int p2 = parseScheme(p);
 739     if (p2 < 0)
 740         {
 741         error("Scheme");
 742         return -1;
 743         }
 744     p = p2;
 745
 746
 747     p2 = parseHierarchicalPart(p);
 748     if (p2 < 0)
 749         {
 750         error("Hierarchical part");
 751         return -1;
 752         }
 753     p = p2;
 754
 755     p2 = parseQuery(p);
 756     if (p2 < 0)
 757         {
 758         error("Query");
 759         return -1;
 760         }
 761     p = p2;
 762
 763
 764     p2 = parseFragment(p);
 765     if (p2 < 0)
 766         {
 767         error("Fragment");
 768         return -1;
 769         }
 770     p = p2;
 771
 772     return p;
 773
 774 }
 775
 776
 777
 778 bool URI::parse(const DOMString &str)
 779 {
 780
 781     parselen = str.size();
 782
 783     DOMString tmp;
 784     for (unsigned int i=0 ; i<str.size() ; i++)
 785         {
 786         XMLCh ch = (XMLCh) str[i];
 787         if (ch == '\\')
 788             tmp.push_back((XMLCh)'/');
 789         else
 790             tmp.push_back(ch);
 791         }
 792     parsebuf = (char *) tmp.c_str();
 793
 794
 795     int p = parse(0);
 796     normalize();
 797
 798     if (p < 0)
 799         {
 800         error("Syntax error");
 801         return false;
 802         }
 803
 804     //printf("uri:%s\n", toString().c_str());
 805     //printf("path:%s\n", path.c_str());
 806
 807     return true;
 808
 809 }
 810
 811
 812
 813
 814
 815 }  //namespace dom
 816 }  //namespace w3c
 817 }  //namespace org
 818 //#########################################################################
 819 //# E N D    O F    F I L E
 820 //#########################################################################
 821
 822
 823