X-Git-Url: https://git.tokkee.org/?a=blobdiff_plain;f=src%2Fdom%2Furi.h;h=40f80b077e7bc9ae0b0fde9e958674cd0ed8aad0;hb=a2000ef431355d36b8d856b79add9c2fa7550e02;hp=57c5c5554e68b35109bfede589b05d2912cd9037;hpb=eb273c67144b8e1690737756bddc8e008e89f784;p=inkscape.git diff --git a/src/dom/uri.h b/src/dom/uri.h index 57c5c5554..40f80b077 100644 --- a/src/dom/uri.h +++ b/src/dom/uri.h @@ -13,7 +13,7 @@ * Authors: * Bob Jamison * - * Copyright (C) 2005-2007 Bob Jamison + * Copyright (C) 2005-2008 Bob Jamison * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -28,6 +28,12 @@ * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * ======================================================================= + * NOTES + * + * Some definitions are taken from the URI RFC: + * http://www.ietf.org/rfc/rfc2396.txt */ #include "dom.h" @@ -42,12 +48,17 @@ namespace dom /** - * A class that implements the W3C URI resource reference. + * A class that implements the W3C URI resource reference. Although this + * API attempts to process URIs as closely as possible to the needs of W3, + * this model is not based on any official W3C spec. */ class URI { public: + /** + * Code that indicates the scheme type. + */ typedef enum { SCHEME_NONE =0, @@ -63,18 +74,18 @@ public: } SchemeTypes; /** - * + * Simple constructor */ URI(); /** - * + * Construct from a DOMString */ URI(const DOMString &str); /** - * + * Parsing constructor */ URI(const char *str); @@ -84,37 +95,74 @@ public: URI(const URI &other); /** - * Assignment + * Assignment operator */ URI &operator=(const URI &other); /** - * + * Destructor */ virtual ~URI(); /** - * + * Parse a string to initialize this URI.
*/ virtual bool parse(const DOMString &str); /** - * + * Produce a string displaying this URI's current value, in W3C format. */ virtual DOMString toString() const; /** - * + * Return the scheme (SchemeTypes above) of this URI as an enumeration */ virtual int getScheme() const; /** - * + * Return the scheme value as a string + * From the RFC: + * Just as there are many different methods of access to resources, + * there are a variety of schemes for identifying such resources. The + * URI syntax consists of a sequence of components separated by reserved + * characters, with the first component defining the semantics for the + * remainder of the URI string. + * + * Scheme names consist of a sequence of characters beginning with a + * lower case letter and followed by any combination of lower case + * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For + * resiliency, programs interpreting URI should treat upper case letters + * as equivalent to lower case in scheme names (e.g., allow "HTTP" as + * well as "http"). + * + * scheme = alpha *( alpha | digit | "+" | "-" | "." ) + * + * Relative URI references are distinguished from absolute URI in that + * they do not begin with a scheme name. Instead, the scheme is + * inherited from the base URI, as described in Section 5.2. + * */ virtual DOMString getSchemeStr() const; /** - * + * From the RFC: + * Many URI schemes include a top hierarchical element for a naming + * authority, such that the namespace defined by the remainder of the + * URI is governed by that authority. This authority component is + * typically defined by an Internet-based server or a scheme-specific + * registry of naming authorities. + * + * authority = server | reg_name + * + * The authority component is preceded by a double slash "//" and is + * terminated by the next slash "/", question-mark "?", or by the end of + * the URI. Within the authority component, the characters ";", ":", + * "@", "?", and "/" are reserved. 
+ * + * An authority component is not required for a URI scheme to make use + * of relative references. A base URI without an authority component + * implies that any relative reference will also be without an authority + * component. */ virtual DOMString getAuthority() const; @@ -125,50 +173,242 @@ public: virtual DOMString getHost() const; /** - * + * Return the port (TCP/IP port for transport-type schemes) */ virtual int getPort() const; /** - * + * From the RFC: + * The path component contains data, specific to the authority (or the + * scheme if there is no authority component), identifying the resource + * within the scope of that scheme and authority. + * + * path = [ abs_path | opaque_part ] + * + * path_segments = segment *( "/" segment ) + * segment = *pchar *( ";" param ) + * param = *pchar + * + * pchar = unreserved | escaped | + * ":" | "@" | "&" | "=" | "+" | "$" | "," + * + * The path may consist of a sequence of path segments separated by a + * single slash "/" character. Within a path segment, the characters + * "/", ";", "=", and "?" are reserved. Each path segment may include a + * sequence of parameters, indicated by the semicolon ";" character. + * The parameters are not significant to the parsing of relative + * references. */ virtual DOMString getPath() const; /** - * + * Converts the URI's internal canonical representation of the path to + * what is meaningful on the architecture on which this method is called. */ virtual DOMString getNativePath() const; /** - * - */ + * An absolute URI contains the name of the scheme being used (<scheme>) + * followed by a colon (":") and then a string (the <scheme-specific-part>) + * whose interpretation depends on the scheme. + */ virtual bool isAbsolute() const; /** - * - */ + * URI that do not make use of the slash "/" character for separating + * hierarchical components are considered opaque + */ virtual bool isOpaque() const; /** + * The part of the URI following a ? in the path.
+ * + * From the RFC: + * The query component is a string of information to be interpreted by + * the resource. * + * query = *uric + * + * Within a query component, the characters ";", "/", "?", ":", "@", + * "&", "=", "+", ",", and "$" are reserved. + * */ virtual DOMString getQuery() const; /** - * + * From the RFC: + * When a URI reference is used to perform a retrieval action on the + * identified resource, the optional fragment identifier, separated from + * the URI by a crosshatch ("#") character, consists of additional + * reference information to be interpreted by the user agent after the + * retrieval action has been successfully completed. As such, it is not + * part of a URI, but is often used in conjunction with a URI. + * + * fragment = *uric + * + * The semantics of a fragment identifier is a property of the data + * resulting from a retrieval action, regardless of the type of URI used + * in the reference. Therefore, the format and interpretation of + * fragment identifiers is dependent on the media type [RFC2046] of the + * retrieval result. The character restrictions described in Section 2 + * for URI also apply to the fragment in a URI-reference. Individual + * media types may define additional restrictions or structure within + * the fragment for specifying different types of "partial views" that + * can be identified within that media type. + * + * A fragment identifier is only meaningful when a URI reference is + * intended for retrieval and the result of that retrieval is a document + * for which the identified fragment is consistently defined. */ virtual DOMString getFragment() const; /** - * + * resolve() + * This is by far the most useful feature of a URI. It defines a set + * of rules for finding one resource relative to another, so that your + * resource search is well-defined and much easier. 
+ * + * From the RFC: + * + * The base URI is established according to the rules of Section 5.1 and + * parsed into the four main components as described in Section 3. Note + * that only the scheme component is required to be present in the base + * URI; the other components may be empty or undefined. A component is + * undefined if its preceding separator does not appear in the URI + * reference; the path component is never undefined, though it may be + * empty. The base URI's query component is not used by the resolution + * algorithm and may be discarded. + * + * For each URI reference, the following steps are performed in order: + * + * 1) The URI reference is parsed into the potential four components and + * fragment identifier, as described in Section 4.3. + * + * 2) If the path component is empty and the scheme, authority, and + * query components are undefined, then it is a reference to the + * current document and we are done. Otherwise, the reference URI's + * query and fragment components are defined as found (or not found) + * within the URI reference and not inherited from the base URI. + * + * 3) If the scheme component is defined, indicating that the reference + * starts with a scheme name, then the reference is interpreted as an + * absolute URI and we are done. Otherwise, the reference URI's + * scheme is inherited from the base URI's scheme component. + * + * Due to a loophole in prior specifications [RFC1630], some parsers + * allow the scheme name to be present in a relative URI if it is the + * same as the base URI scheme. Unfortunately, this can conflict + * with the correct parsing of non-hierarchical URI. For backwards + * compatibility, an implementation may work around such references + * by removing the scheme if it matches that of the base URI and the + * scheme is known to always use the <hier_part> syntax. The parser + * can then continue with the steps below for the remainder of the + * reference components.
Validating parsers should mark such a + * misformed relative reference as an error. + * + * 4) If the authority component is defined, then the reference is a + * network-path and we skip to step 7. Otherwise, the reference + * URI's authority is inherited from the base URI's authority + * component, which will also be undefined if the URI scheme does not + * use an authority component. + * + * 5) If the path component begins with a slash character ("/"), then + * the reference is an absolute-path and we skip to step 7. + * + * 6) If this step is reached, then we are resolving a relative-path + * reference. The relative path needs to be merged with the base + * URI's path. Although there are many ways to do this, we will + * describe a simple method using a separate string buffer. + * + * a) All but the last segment of the base URI's path component is + * copied to the buffer. In other words, any characters after the + * last (right-most) slash character, if any, are excluded. + * + * b) The reference's path component is appended to the buffer + * string. + * + * c) All occurrences of "./", where "." is a complete path segment, + * are removed from the buffer string. + * + * d) If the buffer string ends with "." as a complete path segment, + * that "." is removed. + * + * e) All occurrences of "<segment>/../", where <segment> is a + * complete path segment not equal to "..", are removed from the + * buffer string. Removal of these path segments is performed + * iteratively, removing the leftmost matching pattern on each + * iteration, until no matching pattern remains. + * + * f) If the buffer string ends with "<segment>/..", where <segment> + * is a complete path segment not equal to "..", that + * "<segment>/.." is removed. + * + * g) If the resulting buffer string still begins with one or more + * complete path segments of "..", then the reference is + * considered to be in error.
Implementations may handle this + * error by retaining these components in the resolved path (i.e., + * treating them as part of the final URI), by removing them from + * the resolved path (i.e., discarding relative levels above the + * root), or by avoiding traversal of the reference. + * + * h) The remaining buffer string is the reference URI's new path + * component. + * + * 7) The resulting URI components, including any inherited from the + * base URI, are recombined to give the absolute form of the URI + * reference. Using pseudocode, this would be + * + * result = "" + * + * if scheme is defined then + * append scheme to result + * append ":" to result + * + * if authority is defined then + * append "//" to result + * append authority to result + * + * append path to result + * + * if query is defined then + * append "?" to result + * append query to result + * + * if fragment is defined then + * append "#" to result + * append fragment to result + * + * return result + * + * Note that we must be careful to preserve the distinction between a + * component that is undefined, meaning that its separator was not + * present in the reference, and a component that is empty, meaning + * that the separator was present and was immediately followed by the + * next component separator or the end of the reference. + * + * The above algorithm is intended to provide an example by which the + * output of implementations can be tested -- implementation of the + * algorithm itself is not required. For example, some systems may find + * it more efficient to implement step 6 as a pair of segment stacks + * being merged, rather than as a series of string pattern replacements. + * + * Note: Some WWW client applications will fail to separate the + * reference's query component from its path component before merging + * the base and reference paths in step 6 above. This may result in + * a loss of information if the query component contains the strings + * "/../" or "/./". 
+ * */ virtual URI resolve(const URI &other) const; /** - * + * "Mends" a URI by examining the path, and converting it to canonical + * form. In particular, it takes patterns like "/./" and "/a/../b/../c" + * and simplifies them. */ virtual void normalize(); + private: void init(); @@ -180,21 +420,21 @@ private: DOMString schemeStr; - DOMString authority; + std::vector<int> authority; bool portSpecified; int port; - DOMString path; + std::vector<int> path; bool absolute; bool opaque; - DOMString query; + std::vector<int> query; - DOMString fragment; + std::vector<int> fragment; void error(const char *fmt, ...) #ifdef G_GNUC_PRINTF @@ -210,7 +450,7 @@ private: int peek(int p); - int match(int p, char *key); + int match(int p, char const *key); int parseHex(int p, int &result); @@ -228,7 +468,7 @@ private: int parse(int p); - char *parsebuf; + int *parsebuf; int parselen;