src/dom/uri.cpp

   1 /**
   2  * Phoebe DOM Implementation.
   3  *
   4  * This is a C++ approximation of the W3C DOM model, which follows
   5  * fairly closely the specifications in the various .idl files, copies of
   6  * which are provided for reference.  Most important is this one:
   7  *
   8  * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
   9  *
  10  * Authors:
  11  *   Bob Jamison
  12  *
  13  * Copyright (C) 2005 Bob Jamison
  14  *
  15  *  This library is free software; you can redistribute it and/or
  16  *  modify it under the terms of the GNU Lesser General Public
  17  *  License as published by the Free Software Foundation; either
  18  *  version 2.1 of the License, or (at your option) any later version.
  19  *
  20  *  This library is distributed in the hope that it will be useful,
  21  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  22  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23  *  Lesser General Public License for more details.
  24  *
  25  *  You should have received a copy of the GNU Lesser General Public
  26  *  License along with this library; if not, write to the Free Software
  27  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  28  */
  29
  30
  31
  32
  33 #include "uri.h"
  34 #include "charclass.h"
  35
  36 #include <stdio.h>
  37 #include <stdarg.h>
  38
  39
  40
  41 namespace org
  42 {
  43 namespace w3c
  44 {
  45 namespace dom
  46 {
  47
  48
  49 typedef struct
  50 {
  51     int  ival;
  52     char *sval;
  53     int  port;
  54 } LookupEntry;
  55
  56 LookupEntry schemes[] =
  57 {
  58     { URI::SCHEME_DATA,   "data:",    0 },
  59     { URI::SCHEME_HTTP,   "http:",   80 },
  60     { URI::SCHEME_HTTPS,  "https:", 443 },
  61     { URI::SCHEME_FTP,    "ftp",     12 },
  62     { URI::SCHEME_FILE,   "file:",    0 },
  63     { URI::SCHEME_LDAP,   "ldap:",  123 },
  64     { URI::SCHEME_MAILTO, "mailto:", 25 },
  65     { URI::SCHEME_NEWS,   "news:",  117 },
  66     { URI::SCHEME_TELNET, "telnet:", 23 },
  67     { 0,                  NULL,       0 }
  68 };
  69
  70
  71
  72 //#########################################################################
  73 //# C O N S T R U C T O R
  74 //#########################################################################
  75
  76 /**
  77  *
  78  */
  79 URI::URI()
  80 {
  81     init();
  82 }
  83
  84 /**
  85  *
  86  */
  87 URI::URI(const DOMString &str)
  88 {
  89     init();
  90     parse(str);
  91 }
  92
  93
  94 /**
  95  *
  96  */
  97 URI::URI(const char *str)
  98 {
  99     init();
 100     DOMString domStr = str;
 101     parse(domStr);
 102 }
 103
 104
 105 /**
 106  *
 107  */
 108 URI::URI(const URI &other)
 109 {
 110     init();
 111     assign(other);
 112 }
 113
 114
 115 /**
 116  *
 117  */
 118 URI &URI::operator=(const URI &other)
 119 {
 120     init();
 121     assign(other);
 122     return *this;
 123 }
 124
 125
 126 /**
 127  *
 128  */
 129 URI::~URI()
 130 {
 131 }
 132
 133
 134
 135
 136
 137 /**
 138  *
 139  */
 140 void URI::init()
 141 {
 142     parsebuf  = NULL;
 143     parselen  = 0;
 144     scheme    = SCHEME_NONE;
 145     schemeStr = "";
 146     port      = 0;
 147     authority = "";
 148     path      = "";
 149     absolute  = false;
 150     opaque    = false;
 151     query     = "";
 152     fragment  = "";
 153 }
 154
 155
 156 /**
 157  *
 158  */
 159 void URI::assign(const URI &other)
 160 {
 161     scheme    = other.scheme;
 162     schemeStr = other.schemeStr;
 163     authority = other.authority;
 164     port      = other.port;
 165     path      = other.path;
 166     absolute  = other.absolute;
 167     opaque    = other.opaque;
 168     query     = other.query;
 169     fragment  = other.fragment;
 170 }
 171
 172
 173 //#########################################################################
 174 //#A T T R I B U T E S
 175 //#########################################################################
 176
 177 DOMString URI::toString() const
 178 {
 179     DOMString str = schemeStr;
 180     if (authority.size() > 0)
 181         {
 182         str.append("//");
 183         str.append(authority);
 184         }
 185     str.append(path);
 186     if (query.size() > 0)
 187         {
 188         str.append("?");
 189         str.append(query);
 190         }
 191     if (fragment.size() > 0)
 192         {
 193         str.append("#");
 194         str.append(fragment);
 195         }
 196     return str;
 197 }
 198
 199
 200 int URI::getScheme() const
 201 {
 202     return scheme;
 203 }
 204
 205 DOMString URI::getSchemeStr() const
 206 {
 207     return schemeStr;
 208 }
 209
 210
 211 DOMString URI::getAuthority() const
 212 {
 213     DOMString ret = authority;
 214     if (portSpecified && port>=0)
 215         {
 216         char buf[7];
 217         snprintf(buf, 6, ":%6d", port);
 218         ret.append(buf);
 219         }
 220     return ret;
 221 }
 222
 223 DOMString URI::getHost() const
 224 {
 225     return authority;
 226 }
 227
 228 int URI::getPort() const
 229 {
 230     return port;
 231 }
 232
 233
 234 DOMString URI::getPath() const
 235 {
 236     return path;
 237 }
 238
 239
 240 bool URI::isAbsolute() const
 241 {
 242     return absolute;
 243 }
 244
 245 bool URI::isOpaque() const
 246 {
 247     return opaque;
 248 }
 249
 250
 251 DOMString URI::getQuery() const
 252 {
 253     return query;
 254 }
 255
 256
 257 DOMString URI::getFragment() const
 258 {
 259     return fragment;
 260 }
 261
 262
 263 URI URI::resolve(const URI &other) const
 264 {
 265     //### According to w3c, this is handled in 3 cases
 266
 267     //## 1
 268     if (opaque || other.isAbsolute())
 269         return other;
 270
 271     //## 2
 272     if (other.fragment.size()  >  0 &&
 273         other.path.size()      == 0 &&
 274         other.scheme           == SCHEME_NONE &&
 275         other.authority.size() == 0 &&
 276         other.query.size()     == 0 )
 277         {
 278         URI fragUri = *this;
 279         fragUri.fragment = other.fragment;
 280         return fragUri;
 281         }
 282
 283     //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
 284     URI newUri;
 285     //# 3.1
 286     newUri.scheme    = scheme;
 287     newUri.schemeStr = schemeStr;
 288     newUri.query     = other.query;
 289     newUri.fragment  = other.fragment;
 290     if (other.authority.size() > 0)
 291         {
 292         //# 3.2
 293         if (absolute || other.absolute)
 294             newUri.absolute = true;
 295         newUri.authority = other.authority;
 296         newUri.port      = other.port;//part of authority
 297         newUri.path      = other.path;
 298         }
 299     else
 300         {
 301         //# 3.3
 302         if (other.absolute)
 303             {
 304             newUri.absolute = true;
 305             newUri.path     = other.path;
 306             }
 307         else
 308             {
 309             unsigned int pos = path.rfind('/');
 310             if (pos != path.npos)
 311                 {
 312                 DOMString tpath = path.substr(0, pos+1);
 313                 tpath.append(other.path);
 314                 newUri.path = tpath;
 315                 }
 316             }
 317         }
 318     newUri.normalize();
 319     return newUri;
 320 }
 321
 322
 323 /**
 324  *  This follows the Java URI algorithm:
 325  *   1. All "." segments are removed.
 326  *   2. If a ".." segment is preceded by a non-".." segment
 327  *          then both of these segments are removed. This step
 328  *          is repeated until it is no longer applicable.
 329  *   3. If the path is relative, and if its first segment
 330  *          contains a colon character (':'), then a "." segment
 331  *          is prepended. This prevents a relative URI with a path
 332  *          such as "a:b/c/d" from later being re-parsed as an
 333  *          opaque URI with a scheme of "a" and a scheme-specific
 334  *          part of "b/c/d". (Deviation from RFC 2396)
 335  */
 336 void URI::normalize()
 337 {
 338     std::vector<DOMString> segments;
 339
 340     //## Collect segments
 341     if (path.size()<2)
 342         return;
 343     bool abs = false;
 344     unsigned int pos=0;
 345     if (path[0]=='/')
 346         {
 347         abs = true;
 348         pos++;
 349         }
 350     while (pos < path.size())
 351         {
 352         unsigned int pos2 = path.find('/', pos);
 353         if (pos2==path.npos)
 354             {
 355             DOMString seg = path.substr(pos);
 356             //printf("last segment:%s\n", seg.c_str());
 357             segments.push_back(seg);
 358             break;
 359             }
 360         if (pos2>pos)
 361             {
 362             DOMString seg = path.substr(pos, pos2-pos);
 363             //printf("segment:%s\n", seg.c_str());
 364             segments.push_back(seg);
 365             }
 366         pos = pos2;
 367         pos++;
 368         }
 369
 370     //## Clean up (normalize) segments
 371     bool edited = false;
 372     std::vector<DOMString>::iterator iter;
 373     for (iter=segments.begin() ; iter!=segments.end() ; )
 374         {
 375         DOMString s = *iter;
 376         if (s == ".")
 377             {
 378             iter = segments.erase(iter);
 379             edited = true;
 380             }
 381         else if (s == ".." &&
 382                  iter != segments.begin() &&
 383                  *(iter-1) != "..")
 384             {
 385             iter--; //back up, then erase two entries
 386             iter = segments.erase(iter);
 387             iter = segments.erase(iter);
 388             edited = true;
 389             }
 390         else
 391             iter++;
 392         }
 393
 394     //## Rebuild path, if necessary
 395     if (edited)
 396         {
 397         path.clear();
 398         if (abs)
 399             {
 400             path.append("/");
 401             }
 402         std::vector<DOMString>::iterator iter;
 403         for (iter=segments.begin() ; iter!=segments.end() ; iter++)
 404             {
 405             if (iter != segments.begin())
 406                 path.append("/");
 407             path.append(*iter);
 408             }
 409         }
 410
 411 }
 412
 413
 414
 415 //#########################################################################
 416 //# M E S S A G E S
 417 //#########################################################################
 418
 419 void URI::error(const char *fmt, ...)
 420 {
 421     va_list args;
 422     fprintf(stderr, "URI error: ");
 423     va_start(args, fmt);
 424     vfprintf(stderr, fmt, args);
 425     va_end(args);
 426     fprintf(stderr, "\n");
 427 }
 428
 429 void URI::trace(const char *fmt, ...)
 430 {
 431     va_list args;
 432     fprintf(stdout, "URI: ");
 433     va_start(args, fmt);
 434     vfprintf(stdout, fmt, args);
 435     va_end(args);
 436     fprintf(stdout, "\n");
 437 }
 438
 439
 440
 441 //#########################################################################
 442 //# P A R S I N G
 443 //#########################################################################
 444
 445
 446
 447 int URI::peek(int p)
 448 {
 449     if (p<0 || p>=parselen)
 450         return -1;
 451     return parsebuf[p];
 452 }
 453
 454
 455
 456 int URI::match(int p0, char *key)
 457 {
 458     int p = p0;
 459     while (p < parselen)
 460         {
 461         if (*key == '\0')
 462             return p;
 463         else if (*key != parsebuf[p])
 464             break;
 465         p++; key++;
 466         }
 467     return p0;
 468 }
 469
 470 //#########################################################################
 471 //#  Parsing is performed according to:
 472 //#  http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
 473 //#########################################################################
 474
 475 int URI::parseScheme(int p0)
 476 {
 477     int p = p0;
 478     for (LookupEntry *entry = schemes; entry->sval ; entry++)
 479         {
 480         int p2 = match(p, entry->sval);
 481         if (p2 > p)
 482             {
 483             schemeStr = entry->sval;
 484             scheme    = entry->ival;
 485             port      = entry->port;
 486             p = p2;
 487             return p;
 488             }
 489         }
 490
 491     return p;
 492 }
 493
 494
 495 int URI::parseHierarchicalPart(int p0)
 496 {
 497     int p = p0;
 498     int ch;
 499
 500     //# Authority field (host and port, for example)
 501     int p2 = match(p, "//");
 502     if (p2 > p)
 503         {
 504         p = p2;
 505         portSpecified = false;
 506         DOMString portStr;
 507         while (p < parselen)
 508             {
 509             ch = peek(p);
 510             if (ch == '/')
 511                 break;
 512             else if (ch == ':')
 513                 portSpecified = true;
 514             else if (portSpecified)
 515                 portStr.push_back((XMLCh)ch);
 516             else
 517                 authority.push_back((XMLCh)ch);
 518             p++;
 519             }
 520         if (portStr.size() > 0)
 521             {
 522             char *pstr = (char *)portStr.c_str();
 523             char *endStr;
 524             long val = strtol(pstr, &endStr, 10);
 525             if (endStr > pstr) //successful parse?
 526                 port = val;
 527             }
 528         }
 529
 530     //# Are we absolute?
 531     ch = peek(p);
 532     if (ch == '/')
 533         {
 534         absolute = true;
 535         if (p>p0) //in other words, if '/' is not the first char
 536             opaque = true;
 537         path.push_back((XMLCh)ch);
 538         p++;
 539         }
 540
 541     while (p < parselen)
 542         {
 543         ch = peek(p);
 544         if (ch == '?' || ch == '#')
 545             break;
 546         path.push_back((XMLCh)ch);
 547         p++;
 548         }
 549
 550     return p;
 551 }
 552
 553 int URI::parseQuery(int p0)
 554 {
 555     int p = p0;
 556     int ch = peek(p);
 557     if (ch != '?')
 558         return p0;
 559
 560     p++;
 561     while (p < parselen)
 562         {
 563         ch = peek(p);
 564         if (ch == '#')
 565             break;
 566         query.push_back((XMLCh)ch);
 567         p++;
 568         }
 569
 570
 571     return p;
 572 }
 573
 574 int URI::parseFragment(int p0)
 575 {
 576
 577     int p = p0;
 578     int ch = peek(p);
 579     if (ch != '#')
 580         return p0;
 581
 582     p++;
 583     while (p < parselen)
 584         {
 585         ch = peek(p);
 586         if (ch == '?')
 587             break;
 588         fragment.push_back((XMLCh)ch);
 589         p++;
 590         }
 591
 592
 593     return p;
 594 }
 595
 596
 597 int URI::parse(int p0)
 598 {
 599
 600     int p = p0;
 601
 602     int p2 = parseScheme(p);
 603     if (p2 < 0)
 604         {
 605         error("Scheme");
 606         return -1;
 607         }
 608     p = p2;
 609
 610
 611     p2 = parseHierarchicalPart(p);
 612     if (p2 < 0)
 613         {
 614         error("Hierarchical part");
 615         return -1;
 616         }
 617     p = p2;
 618
 619     p2 = parseQuery(p);
 620     if (p2 < 0)
 621         {
 622         error("Query");
 623         return -1;
 624         }
 625     p = p2;
 626
 627
 628     p2 = parseFragment(p);
 629     if (p2 < 0)
 630         {
 631         error("Fragment");
 632         return -1;
 633         }
 634     p = p2;
 635
 636     return p;
 637
 638 }
 639
 640
 641
 642 bool URI::parse(const DOMString &str)
 643 {
 644
 645     parselen = str.size();
 646     DOMString tmp = str;
 647     parsebuf = (char *) tmp.c_str();
 648
 649
 650     int p = parse(0);
 651     normalize();
 652
 653     if (p < 0)
 654         {
 655         error("Syntax error");
 656         return false;
 657         }
 658
 659     //printf("uri:%s\n", toString().c_str());
 660     //printf("path:%s\n", path.c_str());
 661
 662     return true;
 663
 664 }
 665
 666
 667
 668
 669
 670 }  //namespace dom
 671 }  //namespace w3c
 672 }  //namespace org
 673 //#########################################################################
 674 //# E N D    O F    F I L E
 675 //#########################################################################
 676
 677
 678