src/dom/uri.h

   1 #ifndef __URI_H__
   2 #define __URI_H__
   3
   4 /**
   5  * Phoebe DOM Implementation.
   6  *
   7  * This is a C++ approximation of the W3C DOM model, which follows
   8  * fairly closely the specifications in the various .idl files, copies of
   9  * which are provided for reference.  Most important is this one:
  10  *
  11  * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
  12  *
  13  * Authors:
  14  *   Bob Jamison
  15  *
  16  * Copyright (C) 2005-2008 Bob Jamison
  17  *
  18  *  This library is free software; you can redistribute it and/or
  19  *  modify it under the terms of the GNU Lesser General Public
  20  *  License as published by the Free Software Foundation; either
  21  *  version 2.1 of the License, or (at your option) any later version.
  22  *
  23  *  This library is distributed in the hope that it will be useful,
  24  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  25  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26  *  Lesser General Public License for more details.
  27  *
  28  *  You should have received a copy of the GNU Lesser General Public
  29  *  License along with this library; if not, write to the Free Software
  30  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  31  *
  32  * =======================================================================
  33  * NOTES
  34  *
  35  * Some definitions are taken from the URI RFC:
  36  * http://www.ietf.org/rfc/rfc2396.txt
  37  */
  38
  39 #include "dom.h"
  40
  41
  42 namespace org
  43 {
  44 namespace w3c
  45 {
  46 namespace dom
  47 {
  48
  49
  50 /**
  51  *  A class that implements the W3C URI resource reference.  Although this
  52  *  API attempts to process URIs as closely as possible to the needs of the
  53  *  W3C, this model is not based on any official W3C spec.
  54  */
  55 class URI
  56 {
  57 public:
  58
  59     /**
  60      * Codes that identify the scheme type.  SCHEME_NONE (0) is the first value.
  61      */
  62     typedef enum
  63         {
  64         SCHEME_NONE =0,
  65         SCHEME_DATA,
  66         SCHEME_HTTP,
  67         SCHEME_HTTPS,
  68         SCHEME_FTP,
  69         SCHEME_FILE,
  70         SCHEME_LDAP,
  71         SCHEME_MAILTO,
  72         SCHEME_NEWS,
  73         SCHEME_TELNET
  74         } SchemeTypes;
  75
  76     /**
  77      * Default constructor (takes no arguments)
  78      */
  79     URI();
  80
  81     /**
  82      * Constructs a URI from a DOMString (a converting ctor, not a copy ctor)
  83      */
  84     URI(const DOMString &str);
  85
  86
  87     /**
  88      * Parsing constructor, taking a C string (const char *)
  89      */
  90     URI(const char *str);
  91
  92     /**
  93      * Copy constructor
  94      */
  95     URI(const URI &other);
  96
  97     /**
  98      *  Copy-assignment operator; returns a URI reference
  99      */
 100     URI &operator=(const URI &other);
 101
 102     /**
 103      * Destructor (declared virtual)
 104      */
 105     virtual ~URI();
 106
 107     /**
 108      * Parse a string to initialize this URI.  Returns a bool status.
 109      */
 110     virtual bool parse(const DOMString &str);
 111
 112     /**
 113      * Produce a string displaying this URI's current value, in W3C format.
 114      */
 115     virtual DOMString toString() const;
 116
 117     /**
 118      * Return the scheme of this URI as an int code (see SchemeTypes above)
 119      */
 120     virtual int getScheme() const;
 121
 122     /**
 123      * Return the scheme value as a string
 124      * From the RFC:
 125      * Just as there are many different methods of access to resources,
 126      * there are a variety of schemes for identifying such resources.  The
 127      * URI syntax consists of a sequence of components separated by reserved
 128      * characters, with the first component defining the semantics for the
 129      * remainder of the URI string.
 130      *
 131      * Scheme names consist of a sequence of characters beginning with a
 132      * lower case letter and followed by any combination of lower case
 133      * letters, digits, plus ("+"), period ("."), or hyphen ("-").  For
 134      * resiliency, programs interpreting URI should treat upper case letters
 135      * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
 136      * well as "http").
 137      *
 138      *   scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 139      *
 140      * Relative URI references are distinguished from absolute URI in that
 141      * they do not begin with a scheme name.  Instead, the scheme is
 142      * inherited from the base URI, as described in Section 5.2.
 143      *
 144      */
 145     virtual DOMString getSchemeStr() const;
 146
 147     /**
 148      * Return the authority component.  From the RFC:
 149      * Many URI schemes include a top hierarchical element for a naming
 150      * authority, such that the namespace defined by the remainder of the
 151      * URI is governed by that authority.  This authority component is
 152      * typically defined by an Internet-based server or a scheme-specific
 153      * registry of naming authorities.
 154      *
 155      *  authority     = server | reg_name
 156      *
 157      * The authority component is preceded by a double slash "//" and is
 158      * terminated by the next slash "/", question-mark "?", or by the end of
 159      * the URI.  Within the authority component, the characters ";", ":",
 160      * "@", "?", and "/" are reserved.
 161      *
 162      * An authority component is not required for a URI scheme to make use
 163      * of relative references.  A base URI without an authority component
 164      * implies that any relative reference will also be without an authority
 165      * component.
 166      */
 167     virtual DOMString getAuthority() const;
 168
 169     /**
 170      *  Same as getAuthority(), but if the port has been specified
 171      *  as host:port, the port will not be included
 172      */
 173     virtual DOMString getHost() const;
 174
 175     /**
 176      * Return the port (TCP/IP port for transport-type schemes)
 177      */
 178     virtual int getPort() const;
 179
 180     /**
 181      * Return the path component.  From the RFC:
 182      * The path component contains data, specific to the authority (or the
 183      * scheme if there is no authority component), identifying the resource
 184      * within the scope of that scheme and authority.
 185      *
 186      * path          = [ abs_path | opaque_part ]
 187      *
 188      * path_segments = segment *( "/" segment )
 189      * segment       = *pchar *( ";" param )
 190      * param         = *pchar
 191      *
 192      * pchar         = unreserved | escaped |
 193      *                  ":" | "@" | "&" | "=" | "+" | "$" | ","
 194      *
 195      * The path may consist of a sequence of path segments separated by a
 196      * single slash "/" character.  Within a path segment, the characters
 197      * "/", ";", "=", and "?" are reserved.  Each path segment may include a
 198      * sequence of parameters, indicated by the semicolon ";" character.
 199      * The parameters are not significant to the parsing of relative
 200      * references.
 201      */
 202     virtual DOMString getPath() const;
 203
 204     /**
 205      * Converts the URI's internal canonical representation of the path to
 206      * what is meaningful on the architecture on which this method is called.
 207      */
 208     virtual DOMString getNativePath() const;
 209
 210     /**
 211      * An absolute URI contains the name of the scheme being used (<scheme>)
 212      * followed by a colon (":") and then a string (the <scheme-specific-part>)
 213      * whose interpretation depends on the scheme.
 214      */
 215     virtual bool isAbsolute() const;
 216
 217     /**
 218      * URIs that do not make use of the slash "/" character for separating
 219      *   hierarchical components are considered opaque
 220      */
 221     virtual bool isOpaque() const;
 222
 223     /**
 224      * The part of the URI following a ? in the path.
 225      *
 226      * From the RFC:
 227      * The query component is a string of information to be interpreted by
 228      * the resource.
 229      *
 230      *      query         = *uric
 231      *
 232      * Within a query component, the characters ";", "/", "?", ":", "@",
 233      * "&", "=", "+", ",", and "$" are reserved.
 234      *
 235      */
 236     virtual DOMString getQuery() const;
 237
 238     /**
 239      * Return the fragment component.  From the RFC:
 240      * When a URI reference is used to perform a retrieval action on the
 241      * identified resource, the optional fragment identifier, separated from
 242      * the URI by a crosshatch ("#") character, consists of additional
 243      * reference information to be interpreted by the user agent after the
 244      * retrieval action has been successfully completed.  As such, it is not
 245      * part of a URI, but is often used in conjunction with a URI.
 246      *
 247      *    fragment      = *uric
 248      *
 249      * The semantics of a fragment identifier is a property of the data
 250      * resulting from a retrieval action, regardless of the type of URI used
 251      * in the reference.  Therefore, the format and interpretation of
 252      * fragment identifiers is dependent on the media type [RFC2046] of the
 253      * retrieval result.  The character restrictions described in Section 2
 254      * for URI also apply to the fragment in a URI-reference.  Individual
 255      * media types may define additional restrictions or structure within
 256      * the fragment for specifying different types of "partial views" that
 257      * can be identified within that media type.
 258      *
 259      * A fragment identifier is only meaningful when a URI reference is
 260      * intended for retrieval and the result of that retrieval is a document
 261      * for which the identified fragment is consistently defined.
 262      */
 263     virtual DOMString getFragment() const;
 264
 265     /**
 266      * resolve() -- NOTE(review): presumably resolves 'other' against this URI as base; confirm
 267      * This is by far the most useful feature of a URI.  It defines a set
 268      * of rules for finding one resource relative to another, so that your
 269      * resource search is well-defined and much easier.
 270      *
 271      * From the RFC:
 272      *
 273      *  The base URI is established according to the rules of Section 5.1 and
 274      *  parsed into the four main components as described in Section 3.  Note
 275      *  that only the scheme component is required to be present in the base
 276      *  URI; the other components may be empty or undefined.  A component is
 277      *  undefined if its preceding separator does not appear in the URI
 278      *  reference; the path component is never undefined, though it may be
 279      *  empty.  The base URI's query component is not used by the resolution
 280      *  algorithm and may be discarded.
 281      *
 282      *  For each URI reference, the following steps are performed in order:
 283      *
 284      *  1) The URI reference is parsed into the potential four components and
 285      *     fragment identifier, as described in Section 4.3.
 286      *
 287      *  2) If the path component is empty and the scheme, authority, and
 288      *     query components are undefined, then it is a reference to the
 289      *     current document and we are done.  Otherwise, the reference URI's
 290      *     query and fragment components are defined as found (or not found)
 291      *     within the URI reference and not inherited from the base URI.
 292      *
 293      *  3) If the scheme component is defined, indicating that the reference
 294      *     starts with a scheme name, then the reference is interpreted as an
 295      *     absolute URI and we are done.  Otherwise, the reference URI's
 296      *     scheme is inherited from the base URI's scheme component.
 297      *
 298      *     Due to a loophole in prior specifications [RFC1630], some parsers
 299      *     allow the scheme name to be present in a relative URI if it is the
 300      *     same as the base URI scheme.  Unfortunately, this can conflict
 301      *     with the correct parsing of non-hierarchical URI.  For backwards
 302      *     compatibility, an implementation may work around such references
 303      *     by removing the scheme if it matches that of the base URI and the
 304      *     scheme is known to always use the <hier_part> syntax.  The parser
 305      *     can then continue with the steps below for the remainder of the
 306      *     reference components.  Validating parsers should mark such a
 307      *     misformed relative reference as an error.
 308      *
 309      *  4) If the authority component is defined, then the reference is a
 310      *     network-path and we skip to step 7.  Otherwise, the reference
 311      *     URI's authority is inherited from the base URI's authority
 312      *     component, which will also be undefined if the URI scheme does not
 313      *     use an authority component.
 314      *
 315      *  5) If the path component begins with a slash character ("/"), then
 316      *     the reference is an absolute-path and we skip to step 7.
 317      *
 318      *  6) If this step is reached, then we are resolving a relative-path
 319      *     reference.  The relative path needs to be merged with the base
 320      *     URI's path.  Although there are many ways to do this, we will
 321      *     describe a simple method using a separate string buffer.
 322      *
 323      *     a) All but the last segment of the base URI's path component is
 324      *        copied to the buffer.  In other words, any characters after the
 325      *        last (right-most) slash character, if any, are excluded.
 326      *
 327      *     b) The reference's path component is appended to the buffer
 328      *        string.
 329      *
 330      *     c) All occurrences of "./", where "." is a complete path segment,
 331      *        are removed from the buffer string.
 332      *
 333      *     d) If the buffer string ends with "." as a complete path segment,
 334      *        that "." is removed.
 335      *
 336      *     e) All occurrences of "<segment>/../", where <segment> is a
 337      *        complete path segment not equal to "..", are removed from the
 338      *        buffer string.  Removal of these path segments is performed
 339      *        iteratively, removing the leftmost matching pattern on each
 340      *        iteration, until no matching pattern remains.
 341      *
 342      *     f) If the buffer string ends with "<segment>/..", where <segment>
 343      *        is a complete path segment not equal to "..", that
 344      *        "<segment>/.." is removed.
 345      *
 346      *     g) If the resulting buffer string still begins with one or more
 347      *        complete path segments of "..", then the reference is
 348      *        considered to be in error.  Implementations may handle this
 349      *        error by retaining these components in the resolved path (i.e.,
 350      *        treating them as part of the final URI), by removing them from
 351      *        the resolved path (i.e., discarding relative levels above the
 352      *        root), or by avoiding traversal of the reference.
 353      *
 354      *     h) The remaining buffer string is the reference URI's new path
 355      *        component.
 356      *
 357      *  7) The resulting URI components, including any inherited from the
 358      *     base URI, are recombined to give the absolute form of the URI
 359      *     reference.  Using pseudocode, this would be
 360      *
 361      *        result = ""
 362      *
 363      *        if scheme is defined then
 364      *            append scheme to result
 365      *            append ":" to result
 366      *
 367      *        if authority is defined then
 368      *            append "//" to result
 369      *            append authority to result
 370      *
 371      *        append path to result
 372      *
 373      *        if query is defined then
 374      *            append "?" to result
 375      *            append query to result
 376      *
 377      *        if fragment is defined then
 378      *            append "#" to result
 379      *            append fragment to result
 380      *
 381      *        return result
 382      *
 383      *     Note that we must be careful to preserve the distinction between a
 384      *     component that is undefined, meaning that its separator was not
 385      *     present in the reference, and a component that is empty, meaning
 386      *     that the separator was present and was immediately followed by the
 387      *     next component separator or the end of the reference.
 388      *
 389      *  The above algorithm is intended to provide an example by which the
 390      *  output of implementations can be tested -- implementation of the
 391      *  algorithm itself is not required.  For example, some systems may find
 392      *  it more efficient to implement step 6 as a pair of segment stacks
 393      *  being merged, rather than as a series of string pattern replacements.
 394      *
 395      *     Note: Some WWW client applications will fail to separate the
 396      *     reference's query component from its path component before merging
 397      *     the base and reference paths in step 6 above.  This may result in
 398      *     a loss of information if the query component contains the strings
 399      *     "/../" or "/./".
 400      *
 401      */
 402     virtual URI resolve(const URI &other) const;
 403
 404     /**
 405      * "Mends" a URI by examining the path, and converting it to canonical
 406      *  form.  In particular, it takes patterns like "/./" and "/a/../b/../c"
 407      *  and simplifies them.  Non-const and returns void.
 408      */
 409     virtual void normalize();
 410
 411
 412 private:
 413
 414     void init();  // NOTE(review): presumably resets the members below to defaults -- confirm in uri.cpp
 415
 416     // Assigns the values of 'other' to this URI.  Used by the copy constructor.
 417     void assign(const URI &other);
 418
 419     int scheme;  // presumably a SchemeTypes code, cf. getScheme() -- confirm
 420
 421     DOMString schemeStr;  // presumably the text behind getSchemeStr() -- confirm
 422
 423     std::vector<int> authority;  // held as a vector of int, not a DOMString; presumably backs getAuthority() -- confirm
 424
 425     bool portSpecified;  // presumably set when an explicit host:port was given -- confirm
 426
 427     int port;  // presumably the value behind getPort() -- confirm
 428
 429     std::vector<int> path;  // held as a vector of int; presumably backs getPath() -- confirm
 430
 431     bool absolute;  // presumably the value behind isAbsolute() -- confirm
 432
 433     bool opaque;  // presumably the value behind isOpaque() -- confirm
 434
 435     std::vector<int> query;  // held as a vector of int; presumably backs getQuery() -- confirm
 436
 437     std::vector<int> fragment;  // held as a vector of int; presumably backs getFragment() -- confirm
 438
 439     void error(const char *fmt, ...)  // printf-style varargs; format-checked only when G_GNUC_PRINTF is defined
 440     #ifdef G_GNUC_PRINTF
 441     G_GNUC_PRINTF(2, 3)
 442     #endif
 443     ;
 444
 445     void trace(const char *fmt, ...)  // same shape as error(); indices (2, 3) because the implicit 'this' counts as argument 1
 446     #ifdef G_GNUC_PRINTF
 447     G_GNUC_PRINTF(2, 3)
 448     #endif
 449     ;
 450
 451     int peek(int p);  // NOTE(review): the helpers from here to parse(int) take an int 'p'/'p0' and return int; presumably positions in parsebuf -- confirm
 452
 453     int match(int p, char const *key);
 454
 455     int parseHex(int p, int &result);  // 'result' is an out-parameter (non-const reference)
 456
 457     int parseEntity(int p, int &result);  // 'result' is an out-parameter (non-const reference)
 458
 459     int parseAsciiEntity(int p, int &result);  // 'result' is an out-parameter (non-const reference)
 460
 461     int parseScheme(int p);
 462
 463     int parseHierarchicalPart(int p0);
 464
 465     int parseQuery(int p0);
 466
 467     int parseFragment(int p0);
 468
 469     int parse(int p);  // private overload; distinct from the public parse(const DOMString &)
 470
 471     int *parsebuf;  // raw pointer to int; allocation and ownership are not visible in this header -- confirm in uri.cpp
 472
 473     int parselen;  // presumably the number of elements in parsebuf -- confirm
 474
 475 };
 476
 477
 478
 479 }  //namespace dom
 480 }  //namespace w3c
 481 }  //namespace org
 482
 483
 484
 485 #endif /* __URI_H__ */
 486