1 #ifndef __URI_H__
2 #define __URI_H__
4 /**
5 * Phoebe DOM Implementation.
6 *
7 * This is a C++ approximation of the W3C DOM model, which follows
8 * fairly closely the specifications in the various .idl files, copies of
9 * which are provided for reference. Most important is this one:
10 *
11 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
12 *
13 * Authors:
14 * Bob Jamison
15 *
16 * Copyright (C) 2005-2008 Bob Jamison
17 *
18 * This library is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU Lesser General Public
20 * License as published by the Free Software Foundation; either
21 * version 2.1 of the License, or (at your option) any later version.
22 *
23 * This library is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * Lesser General Public License for more details.
27 *
28 * You should have received a copy of the GNU Lesser General Public
29 * License along with this library; if not, write to the Free Software
30 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
31 *
32 * =======================================================================
33 * NOTES
34 *
35 * Some definitions are taken from the URI RFC:
36 * http://www.ietf.org/rfc/rfc2396.txt
37 */
39 #include "dom.h"
42 namespace org
43 {
44 namespace w3c
45 {
46 namespace dom
47 {
50 /**
51 * A class that implements the W3C URI resource reference. Although this
52 * API attempts to process URIs as closely as possible to the needs of W3C,
53 * this model is not based on any official W3C spec.
 *
 * The "From the RFC" excerpts in the member documentation below are quoted
 * from the URI RFC, http://www.ietf.org/rfc/rfc2396.txt
 *
 * This header only declares the class; every member function is defined
 * elsewhere.
54 */
55 class URI
56 {
57 public:
59 /**
60 * Codes that indicate the scheme type. SCHEME_NONE is 0 and the
 * remaining enumerators take the consecutive values that follow it.
61 */
62 typedef enum
63 {
64 SCHEME_NONE =0,
65 SCHEME_DATA,
66 SCHEME_HTTP,
67 SCHEME_HTTPS,
68 SCHEME_FTP,
69 SCHEME_FILE,
70 SCHEME_LDAP,
71 SCHEME_MAILTO,
72 SCHEME_NEWS,
73 SCHEME_TELNET
74 } SchemeTypes;
76 /**
77 * Simple (default) constructor; takes no arguments.
78 */
79 URI();
81 /**
82 * Construct a URI from a DOMString. This is not a copy constructor.
 * NOTE(review): presumably the string is parsed, as with the
 * const char * overload below -- confirm in the implementation.
 * The constructor is not explicit, so a DOMString converts
 * implicitly to a URI.
83 */
84 URI(const DOMString &str);
87 /**
88 * Parsing constructor taking a C string. The constructor is not
 * explicit, so a const char * converts implicitly to a URI.
89 */
90 URI(const char *str);
92 /**
93 * Copy constructor
94 */
95 URI(const URI &other);
97 /**
98 * Assignment operator. Returns a reference to a URI so that
 * assignments can be chained.
99 */
100 URI &operator=(const URI &other);
102 /**
103 * Destructor. Declared virtual, so deleting a derived object through
 * a URI pointer is well defined.
104 */
105 virtual ~URI();
107 /**
108 * Parse a string to initialize this URI.
 * NOTE(review): the meaning of the bool result is not stated in this
 * header; presumably true means the parse succeeded -- confirm.
109 */
110 virtual bool parse(const DOMString &str);
112 /**
113 * Produce a string displaying this URI's current value, in W3C format.
114 */
115 virtual DOMString toString() const;
117 /**
118 * Return the scheme (SchemeTypes above) of this URI as an enumeration.
 * The declared return type is int, not SchemeTypes.
119 */
120 virtual int getScheme() const;
122 /**
123 * Return the scheme value as a string
124 * From the RFC:
125 * Just as there are many different methods of access to resources,
126 * there are a variety of schemes for identifying such resources. The
127 * URI syntax consists of a sequence of components separated by reserved
128 * characters, with the first component defining the semantics for the
129 * remainder of the URI string.
130 *
131 * Scheme names consist of a sequence of characters beginning with a
132 * lower case letter and followed by any combination of lower case
133 * letters, digits, plus ("+"), period ("."), or hyphen ("-"). For
134 * resiliency, programs interpreting URI should treat upper case letters
135 * as equivalent to lower case in scheme names (e.g., allow "HTTP" as
136 * well as "http").
137 *
138 * scheme = alpha *( alpha | digit | "+" | "-" | "." )
139 *
140 * Relative URI references are distinguished from absolute URI in that
141 * they do not begin with a scheme name. Instead, the scheme is
142 * inherited from the base URI, as described in Section 5.2.
143 *
144 */
145 virtual DOMString getSchemeStr() const;
147 /**
 * Return the authority component of this URI as a string.
148 * From the RFC:
149 * Many URI schemes include a top hierarchical element for a naming
150 * authority, such that the namespace defined by the remainder of the
151 * URI is governed by that authority. This authority component is
152 * typically defined by an Internet-based server or a scheme-specific
153 * registry of naming authorities.
154 *
155 * authority = server | reg_name
156 *
157 * The authority component is preceded by a double slash "//" and is
158 * terminated by the next slash "/", question-mark "?", or by the end of
159 * the URI. Within the authority component, the characters ";", ":",
160 * "@", "?", and "/" are reserved.
161 *
162 * An authority component is not required for a URI scheme to make use
163 * of relative references. A base URI without an authority component
164 * implies that any relative reference will also be without an authority
165 * component.
166 */
167 virtual DOMString getAuthority() const;
169 /**
170 * Same as getAuthority, but if the port has been specified
171 * as host:port , the port will not be included
172 */
173 virtual DOMString getHost() const;
175 /**
176 * Return the port (TCP/IP port for transport-type schemes).
 * NOTE(review): the value returned when no port was specified is not
 * stated in this header -- confirm in the implementation.
177 */
178 virtual int getPort() const;
180 /**
 * Return the path component of this URI as a string.
181 * From the RFC:
182 * The path component contains data, specific to the authority (or the
183 * scheme if there is no authority component), identifying the resource
184 * within the scope of that scheme and authority.
185 *
186 * path = [ abs_path | opaque_part ]
187 *
188 * path_segments = segment *( "/" segment )
189 * segment = *pchar *( ";" param )
190 * param = *pchar
191 *
192 * pchar = unreserved | escaped |
193 * ":" | "@" | "&" | "=" | "+" | "$" | ","
194 *
195 * The path may consist of a sequence of path segments separated by a
196 * single slash "/" character. Within a path segment, the characters
197 * "/", ";", "=", and "?" are reserved. Each path segment may include a
198 * sequence of parameters, indicated by the semicolon ";" character.
199 * The parameters are not significant to the parsing of relative
200 * references.
201 */
202 virtual DOMString getPath() const;
204 /**
205 * Converts the URI's internal canonical representation of the path to
206 * what is meaningful on the architecture on which this method is called.
207 */
208 virtual DOMString getNativePath() const;
210 /**
211 * An absolute URI contains the name of the scheme being used (<scheme>)
212 * followed by a colon (":") and then a string (the <scheme-specific-part>)
213 * whose interpretation depends on the scheme.
214 */
215 virtual bool isAbsolute() const;
217 /**
218 * URI that do not make use of the slash "/" character for separating
219 * hierarchical components are considered opaque
220 */
221 virtual bool isOpaque() const;
223 /**
224 * The part of the URI following a ? in the path.
225 *
226 * From the RFC:
227 * The query component is a string of information to be interpreted by
228 * the resource.
229 *
230 * query = *uric
231 *
232 * Within a query component, the characters ";", "/", "?", ":", "@",
233 * "&", "=", "+", ",", and "$" are reserved.
234 *
235 */
236 virtual DOMString getQuery() const;
238 /**
 * Return the fragment identifier of this URI as a string.
239 * From the RFC:
240 * When a URI reference is used to perform a retrieval action on the
241 * identified resource, the optional fragment identifier, separated from
242 * the URI by a crosshatch ("#") character, consists of additional
243 * reference information to be interpreted by the user agent after the
244 * retrieval action has been successfully completed. As such, it is not
245 * part of a URI, but is often used in conjunction with a URI.
246 *
247 * fragment = *uric
248 *
249 * The semantics of a fragment identifier is a property of the data
250 * resulting from a retrieval action, regardless of the type of URI used
251 * in the reference. Therefore, the format and interpretation of
252 * fragment identifiers is dependent on the media type [RFC2046] of the
253 * retrieval result. The character restrictions described in Section 2
254 * for URI also apply to the fragment in a URI-reference. Individual
255 * media types may define additional restrictions or structure within
256 * the fragment for specifying different types of "partial views" that
257 * can be identified within that media type.
258 *
259 * A fragment identifier is only meaningful when a URI reference is
260 * intended for retrieval and the result of that retrieval is a document
261 * for which the identified fragment is consistently defined.
262 */
263 virtual DOMString getFragment() const;
265 /**
266 * resolve()
267 * This is by far the most useful feature of a URI. It defines a set
268 * of rules for finding one resource relative to another, so that your
269 * resource search is well-defined and much easier.
270 *
271 * From the RFC:
272 *
273 * The base URI is established according to the rules of Section 5.1 and
274 * parsed into the four main components as described in Section 3. Note
275 * that only the scheme component is required to be present in the base
276 * URI; the other components may be empty or undefined. A component is
277 * undefined if its preceding separator does not appear in the URI
278 * reference; the path component is never undefined, though it may be
279 * empty. The base URI's query component is not used by the resolution
280 * algorithm and may be discarded.
281 *
282 * For each URI reference, the following steps are performed in order:
283 *
284 * 1) The URI reference is parsed into the potential four components and
285 * fragment identifier, as described in Section 4.3.
286 *
287 * 2) If the path component is empty and the scheme, authority, and
288 * query components are undefined, then it is a reference to the
289 * current document and we are done. Otherwise, the reference URI's
290 * query and fragment components are defined as found (or not found)
291 * within the URI reference and not inherited from the base URI.
292 *
293 * 3) If the scheme component is defined, indicating that the reference
294 * starts with a scheme name, then the reference is interpreted as an
295 * absolute URI and we are done. Otherwise, the reference URI's
296 * scheme is inherited from the base URI's scheme component.
297 *
298 * Due to a loophole in prior specifications [RFC1630], some parsers
299 * allow the scheme name to be present in a relative URI if it is the
300 * same as the base URI scheme. Unfortunately, this can conflict
301 * with the correct parsing of non-hierarchical URI. For backwards
302 * compatibility, an implementation may work around such references
303 * by removing the scheme if it matches that of the base URI and the
304 * scheme is known to always use the <hier_part> syntax. The parser
305 * can then continue with the steps below for the remainder of the
306 * reference components. Validating parsers should mark such a
307 * misformed relative reference as an error.
308 *
309 * 4) If the authority component is defined, then the reference is a
310 * network-path and we skip to step 7. Otherwise, the reference
311 * URI's authority is inherited from the base URI's authority
312 * component, which will also be undefined if the URI scheme does not
313 * use an authority component.
314 *
315 * 5) If the path component begins with a slash character ("/"), then
316 * the reference is an absolute-path and we skip to step 7.
317 *
318 * 6) If this step is reached, then we are resolving a relative-path
319 * reference. The relative path needs to be merged with the base
320 * URI's path. Although there are many ways to do this, we will
321 * describe a simple method using a separate string buffer.
322 *
323 * a) All but the last segment of the base URI's path component is
324 * copied to the buffer. In other words, any characters after the
325 * last (right-most) slash character, if any, are excluded.
326 *
327 * b) The reference's path component is appended to the buffer
328 * string.
329 *
330 * c) All occurrences of "./", where "." is a complete path segment,
331 * are removed from the buffer string.
332 *
333 * d) If the buffer string ends with "." as a complete path segment,
334 * that "." is removed.
335 *
336 * e) All occurrences of "<segment>/../", where <segment> is a
337 * complete path segment not equal to "..", are removed from the
338 * buffer string. Removal of these path segments is performed
339 * iteratively, removing the leftmost matching pattern on each
340 * iteration, until no matching pattern remains.
341 *
342 * f) If the buffer string ends with "<segment>/..", where <segment>
343 * is a complete path segment not equal to "..", that
344 * "<segment>/.." is removed.
345 *
346 * g) If the resulting buffer string still begins with one or more
347 * complete path segments of "..", then the reference is
348 * considered to be in error. Implementations may handle this
349 * error by retaining these components in the resolved path (i.e.,
350 * treating them as part of the final URI), by removing them from
351 * the resolved path (i.e., discarding relative levels above the
352 * root), or by avoiding traversal of the reference.
353 *
354 * h) The remaining buffer string is the reference URI's new path
355 * component.
356 *
357 * 7) The resulting URI components, including any inherited from the
358 * base URI, are recombined to give the absolute form of the URI
359 * reference. Using pseudocode, this would be
360 *
361 * result = ""
362 *
363 * if scheme is defined then
364 * append scheme to result
365 * append ":" to result
366 *
367 * if authority is defined then
368 * append "//" to result
369 * append authority to result
370 *
371 * append path to result
372 *
373 * if query is defined then
374 * append "?" to result
375 * append query to result
376 *
377 * if fragment is defined then
378 * append "#" to result
379 * append fragment to result
380 *
381 * return result
382 *
383 * Note that we must be careful to preserve the distinction between a
384 * component that is undefined, meaning that its separator was not
385 * present in the reference, and a component that is empty, meaning
386 * that the separator was present and was immediately followed by the
387 * next component separator or the end of the reference.
388 *
389 * The above algorithm is intended to provide an example by which the
390 * output of implementations can be tested -- implementation of the
391 * algorithm itself is not required. For example, some systems may find
392 * it more efficient to implement step 6 as a pair of segment stacks
393 * being merged, rather than as a series of string pattern replacements.
394 *
395 * Note: Some WWW client applications will fail to separate the
396 * reference's query component from its path component before merging
397 * the base and reference paths in step 6 above. This may result in
398 * a loss of information if the query component contains the strings
399 * "/../" or "/./".
400 *
 * The method is const and returns the result by value, so this URI is
 * not modified.
 * NOTE(review): presumably this object is the base URI and `other` is
 * the reference being resolved against it -- the header does not say
 * which is which; confirm in the implementation.
401 */
402 virtual URI resolve(const URI &other) const;
404 /**
405 * "Mends" a URI by examining the path, and converting it to canonical
406 * form. In particular, it takes patterns like "/./" and "/a/../b/../c"
407 * and simplifies them. Non-const and returns nothing: the change is
 * made to this object in place.
408 */
409 virtual void normalize();
412 private:
// NOTE(review): presumably resets the data members to their defaults;
// the body is not visible from this header.
414 void init();
416 // Assign the values of other to this. Used by the copy constructor.
417 void assign(const URI &other);
// Scheme code reported by getScheme().
// NOTE(review): presumably one of the SchemeTypes values -- confirm.
419 int scheme;
// Scheme as text, reported by getSchemeStr().
421 DOMString schemeStr;
// Authority component, held as a vector of int rather than a DOMString.
// NOTE(review): presumably one character code per element -- confirm.
423 std::vector<int> authority;
// NOTE(review): presumably true when the parsed text carried an explicit
// host:port -- confirm.
425 bool portSpecified;
// Port number reported by getPort().
427 int port;
// Path component, held the same way as authority.
429 std::vector<int> path;
// Flags reported by isAbsolute() and isOpaque() respectively.
431 bool absolute;
433 bool opaque;
// Query and fragment components, held the same way as authority.
435 std::vector<int> query;
437 std::vector<int> fragment;
// printf-style error reporter. Where the message is written is decided
// by the implementation and is not visible here. When G_GNUC_PRINTF is
// defined, the format string is argument 2 and the variadic arguments
// begin at 3, because the implicit this pointer of a non-static member
// function counts as argument 1.
439 void error(const char *fmt, ...)
440 #ifdef G_GNUC_PRINTF
441 G_GNUC_PRINTF(2, 3)
442 #endif
443 ;
// printf-style trace/diagnostic reporter; same argument numbering as
// error() above.
445 void trace(const char *fmt, ...)
446 #ifdef G_GNUC_PRINTF
447 G_GNUC_PRINTF(2, 3)
448 #endif
449 ;
// Parser helpers.
// NOTE(review): presumably every int p / p0 argument is a position in
// parsebuf and the int result is the position reached after the call;
// this header does not define that contract -- confirm in the
// implementation before relying on it.
451 int peek(int p);
// Takes the key to compare against as a C string.
453 int match(int p, char const *key);
// The parsed value is handed back through the int reference `result`,
// separately from the int return value.
455 int parseHex(int p, int &result);
457 int parseEntity(int p, int &result);
459 int parseAsciiEntity(int p, int &result);
461 int parseScheme(int p);
463 int parseHierarchicalPart(int p0);
465 int parseQuery(int p0);
467 int parseFragment(int p0);
// Private overload working on an int position; distinct from the public
// parse(const DOMString &).
469 int parse(int p);
// Raw pointer to the int buffer used by the parser, and its length.
// NOTE(review): ownership and lifetime of parsebuf are not visible from
// this header -- confirm who allocates and frees it.
471 int *parsebuf;
473 int parselen;
475 };
479 } //namespace dom
480 } //namespace w3c
481 } //namespace org
485 #endif /* __URI_H__ */