1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * Authors:
11 * Bob Jamison
12 *
13 * Copyright (C) 2005-2007 Bob Jamison
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
33 #include "uri.h"
34 #include "charclass.h"
36 #include <stdio.h>
37 #include <stdarg.h>
41 namespace org
42 {
43 namespace w3c
44 {
45 namespace dom
46 {
/**
 * One row of the scheme lookup table: the numeric scheme code, the
 * textual prefix that identifies the scheme, and the scheme's default
 * port.
 */
struct LookupEntry
{
    int   ival;   // numeric scheme code
    char *sval;   // scheme prefix text; NULL marks the end of a table
    int   port;   // default port for the scheme
};
56 static LookupEntry schemes[] =
57 {
58 { URI::SCHEME_DATA, "data:", 0 },
59 { URI::SCHEME_HTTP, "http:", 80 },
60 { URI::SCHEME_HTTPS, "https:", 443 },
61 { URI::SCHEME_FTP, "ftp", 12 },
62 { URI::SCHEME_FILE, "file:", 0 },
63 { URI::SCHEME_LDAP, "ldap:", 123 },
64 { URI::SCHEME_MAILTO, "mailto:", 25 },
65 { URI::SCHEME_NEWS, "news:", 117 },
66 { URI::SCHEME_TELNET, "telnet:", 23 },
67 { 0, NULL, 0 }
68 };
72 //#########################################################################
73 //# C O N S T R U C T O R
74 //#########################################################################
76 /**
77 *
78 */
79 URI::URI()
80 {
81 init();
82 }
84 /**
85 *
86 */
87 URI::URI(const DOMString &str)
88 {
89 init();
90 parse(str);
91 }
94 /**
95 *
96 */
97 URI::URI(const char *str)
98 {
99 init();
100 DOMString domStr = str;
101 parse(domStr);
102 }
105 /**
106 *
107 */
108 URI::URI(const URI &other)
109 {
110 init();
111 assign(other);
112 }
115 /**
116 *
117 */
118 URI &URI::operator=(const URI &other)
119 {
120 init();
121 assign(other);
122 return *this;
123 }
126 /**
127 *
128 */
129 URI::~URI()
130 {
131 }
137 /**
138 *
139 */
140 void URI::init()
141 {
142 parsebuf = NULL;
143 parselen = 0;
144 scheme = SCHEME_NONE;
145 schemeStr = "";
146 port = 0;
147 authority = "";
148 path = "";
149 absolute = false;
150 opaque = false;
151 query = "";
152 fragment = "";
153 }
156 /**
157 *
158 */
159 void URI::assign(const URI &other)
160 {
161 scheme = other.scheme;
162 schemeStr = other.schemeStr;
163 authority = other.authority;
164 port = other.port;
165 path = other.path;
166 absolute = other.absolute;
167 opaque = other.opaque;
168 query = other.query;
169 fragment = other.fragment;
170 }
173 //#########################################################################
174 //#A T T R I B U T E S
175 //#########################################################################
177 DOMString URI::toString() const
178 {
179 DOMString str = schemeStr;
180 if (authority.size() > 0)
181 {
182 str.append("//");
183 str.append(authority);
184 }
185 str.append(path);
186 if (query.size() > 0)
187 {
188 str.append("?");
189 str.append(query);
190 }
191 if (fragment.size() > 0)
192 {
193 str.append("#");
194 str.append(fragment);
195 }
196 return str;
197 }
200 int URI::getScheme() const
201 {
202 return scheme;
203 }
205 DOMString URI::getSchemeStr() const
206 {
207 return schemeStr;
208 }
211 DOMString URI::getAuthority() const
212 {
213 DOMString ret = authority;
214 if (portSpecified && port>=0)
215 {
216 char buf[7];
217 snprintf(buf, 6, ":%6d", port);
218 ret.append(buf);
219 }
220 return ret;
221 }
223 DOMString URI::getHost() const
224 {
225 return authority;
226 }
228 int URI::getPort() const
229 {
230 return port;
231 }
234 DOMString URI::getPath() const
235 {
236 return path;
237 }
239 DOMString URI::getNativePath() const
240 {
241 DOMString npath;
242 #ifdef __WIN32__
243 unsigned int firstChar = 0;
244 if (path.size() >= 3)
245 {
246 if (path[0] == '/' &&
247 isLetter(path[1]) &&
248 path[2] == ':')
249 firstChar++;
250 }
251 for (unsigned int i=firstChar ; i<path.size() ; i++)
252 {
253 XMLCh ch = (XMLCh) path[i];
254 if (ch == '/')
255 npath.push_back((XMLCh)'\\');
256 else
257 npath.push_back(ch);
258 }
259 #else
260 npath = path;
261 #endif
262 return npath;
263 }
266 bool URI::isAbsolute() const
267 {
268 return absolute;
269 }
271 bool URI::isOpaque() const
272 {
273 return opaque;
274 }
277 DOMString URI::getQuery() const
278 {
279 return query;
280 }
283 DOMString URI::getFragment() const
284 {
285 return fragment;
286 }
289 URI URI::resolve(const URI &other) const
290 {
291 //### According to w3c, this is handled in 3 cases
293 //## 1
294 if (opaque || other.isAbsolute())
295 return other;
297 //## 2
298 if (other.fragment.size() > 0 &&
299 other.path.size() == 0 &&
300 other.scheme == SCHEME_NONE &&
301 other.authority.size() == 0 &&
302 other.query.size() == 0 )
303 {
304 URI fragUri = *this;
305 fragUri.fragment = other.fragment;
306 return fragUri;
307 }
309 //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
310 URI newUri;
311 //# 3.1
312 newUri.scheme = scheme;
313 newUri.schemeStr = schemeStr;
314 newUri.query = other.query;
315 newUri.fragment = other.fragment;
316 if (other.authority.size() > 0)
317 {
318 //# 3.2
319 if (absolute || other.absolute)
320 newUri.absolute = true;
321 newUri.authority = other.authority;
322 newUri.port = other.port;//part of authority
323 newUri.path = other.path;
324 }
325 else
326 {
327 //# 3.3
328 if (other.absolute)
329 {
330 newUri.absolute = true;
331 newUri.path = other.path;
332 }
333 else
334 {
335 DOMString::size_type pos = path.find_last_of('/');
336 if (pos != path.npos)
337 {
338 DOMString tpath = path.substr(0, pos+1);
339 tpath.append(other.path);
340 newUri.path = tpath;
341 }
342 else
343 newUri.path = other.path;
344 }
345 }
347 newUri.normalize();
348 return newUri;
349 }
352 /**
353 * This follows the Java URI algorithm:
354 * 1. All "." segments are removed.
355 * 2. If a ".." segment is preceded by a non-".." segment
356 * then both of these segments are removed. This step
357 * is repeated until it is no longer applicable.
358 * 3. If the path is relative, and if its first segment
359 * contains a colon character (':'), then a "." segment
360 * is prepended. This prevents a relative URI with a path
361 * such as "a:b/c/d" from later being re-parsed as an
362 * opaque URI with a scheme of "a" and a scheme-specific
363 * part of "b/c/d". (Deviation from RFC 2396)
364 */
365 void URI::normalize()
366 {
367 std::vector<DOMString> segments;
369 //## Collect segments
370 if (path.size()<2)
371 return;
372 bool abs = false;
373 unsigned int pos=0;
374 if (path[0]=='/')
375 {
376 abs = true;
377 pos++;
378 }
379 while (pos < path.size())
380 {
381 DOMString::size_type pos2 = path.find('/', pos);
382 if (pos2==path.npos)
383 {
384 DOMString seg = path.substr(pos);
385 //printf("last segment:%s\n", seg.c_str());
386 segments.push_back(seg);
387 break;
388 }
389 if (pos2>pos)
390 {
391 DOMString seg = path.substr(pos, pos2-pos);
392 //printf("segment:%s\n", seg.c_str());
393 segments.push_back(seg);
394 }
395 pos = pos2;
396 pos++;
397 }
399 //## Clean up (normalize) segments
400 bool edited = false;
401 std::vector<DOMString>::iterator iter;
402 for (iter=segments.begin() ; iter!=segments.end() ; )
403 {
404 DOMString s = *iter;
405 if (s == ".")
406 {
407 iter = segments.erase(iter);
408 edited = true;
409 }
410 else if (s == ".." &&
411 iter != segments.begin() &&
412 *(iter-1) != "..")
413 {
414 iter--; //back up, then erase two entries
415 iter = segments.erase(iter);
416 iter = segments.erase(iter);
417 edited = true;
418 }
419 else
420 iter++;
421 }
423 //## Rebuild path, if necessary
424 if (edited)
425 {
426 path.clear();
427 if (abs)
428 {
429 path.append("/");
430 }
431 std::vector<DOMString>::iterator iter;
432 for (iter=segments.begin() ; iter!=segments.end() ; iter++)
433 {
434 if (iter != segments.begin())
435 path.append("/");
436 path.append(*iter);
437 }
438 }
440 }
444 //#########################################################################
445 //# M E S S A G E S
446 //#########################################################################
448 void URI::error(const char *fmt, ...)
449 {
450 va_list args;
451 fprintf(stderr, "URI error: ");
452 va_start(args, fmt);
453 vfprintf(stderr, fmt, args);
454 va_end(args);
455 fprintf(stderr, "\n");
456 }
458 void URI::trace(const char *fmt, ...)
459 {
460 va_list args;
461 fprintf(stdout, "URI: ");
462 va_start(args, fmt);
463 vfprintf(stdout, fmt, args);
464 va_end(args);
465 fprintf(stdout, "\n");
466 }
470 //#########################################################################
471 //# P A R S I N G
472 //#########################################################################
476 int URI::peek(int p)
477 {
478 if (p<0 || p>=parselen)
479 return -1;
480 return parsebuf[p];
481 }
485 int URI::match(int p0, char *key)
486 {
487 int p = p0;
488 while (p < parselen)
489 {
490 if (*key == '\0')
491 return p;
492 else if (*key != parsebuf[p])
493 break;
494 p++; key++;
495 }
496 return p0;
497 }
499 //#########################################################################
500 //# Parsing is performed according to:
501 //# http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
502 //#########################################################################
504 int URI::parseHex(int p0, int &result)
505 {
506 int p = p0;
507 int val = 0;
509 //# Upper 4
510 XMLCh ch = peek(p);
511 if (ch >= '0' && ch <= '9')
512 val += (ch - '0');
513 else if (ch >= 'a' && ch <= 'f')
514 val += (10 + ch - 'a');
515 else if (ch >= 'A' && ch <= 'F')
516 val += (10 + ch - 'A');
517 else
518 {
519 error("parseHex : unexpected character : %c", ch);
520 return -1;
521 }
522 p++;
523 val <<= 4;
525 //# Lower 4
526 ch = peek(p);
527 if (ch >= '0' && ch <= '9')
528 val += (ch - '0');
529 else if (ch >= 'a' && ch <= 'f')
530 val += (10 + ch - 'a');
531 else if (ch >= 'A' && ch <= 'F')
532 val += (10 + ch - 'A');
533 else
534 {
535 error("parseHex : unexpected character : %c", ch);
536 return -1;
537 }
538 result = val;
539 return p;
540 }
544 int URI::parseEntity(int p0, int &result)
545 {
546 int p = p0;
547 XMLCh ch = peek(p);
548 if (ch != '&')
549 return p0;
550 p++;
551 if (!match(p, "#x"))
552 {
553 error("parseEntity: expected '#x'");
554 return -1;
555 }
556 p += 2;
557 int val;
558 p = parseHex(p, val);
559 if (p<0)
560 return -1;
561 result = val;
562 return p;
563 }
565 int URI::parseAsciiEntity(int p0, int &result)
566 {
567 int p = p0;
568 XMLCh ch = peek(p);
569 if (ch != '%')
570 return p0;
571 p++;
572 int val;
573 p = parseHex(p, val);
574 if (p<0)
575 return -1;
576 result = val;
577 return p;
578 }
581 int URI::parseScheme(int p0)
582 {
583 int p = p0;
584 for (LookupEntry *entry = schemes; entry->sval ; entry++)
585 {
586 int p2 = match(p, entry->sval);
587 if (p2 > p)
588 {
589 schemeStr = entry->sval;
590 scheme = entry->ival;
591 port = entry->port;
592 p = p2;
593 return p;
594 }
595 }
597 return p;
598 }
601 int URI::parseHierarchicalPart(int p0)
602 {
603 int p = p0;
604 int ch;
606 //# Authority field (host and port, for example)
607 int p2 = match(p, "//");
608 if (p2 > p)
609 {
610 p = p2;
611 portSpecified = false;
612 DOMString portStr;
613 while (p < parselen)
614 {
615 ch = peek(p);
616 if (ch == '/')
617 break;
618 else if (ch == ':')
619 portSpecified = true;
620 else if (portSpecified)
621 portStr.push_back((XMLCh)ch);
622 else
623 authority.push_back((XMLCh)ch);
624 p++;
625 }
626 if (portStr.size() > 0)
627 {
628 char *pstr = (char *)portStr.c_str();
629 char *endStr;
630 long val = strtol(pstr, &endStr, 10);
631 if (endStr > pstr) //successful parse?
632 port = val;
633 }
634 }
636 //# Are we absolute?
637 ch = peek(p);
638 if (isLetter(ch) && peek(p+1)==':')
639 {
640 absolute = true;
641 path.push_back((XMLCh)'/');
642 }
643 else if (ch == '/')
644 {
645 absolute = true;
646 if (p>p0) //in other words, if '/' is not the first char
647 opaque = true;
648 path.push_back((XMLCh)ch);
649 p++;
650 }
652 while (p < parselen)
653 {
654 ch = peek(p);
655 if (ch == '?' || ch == '#')
656 break;
657 else if (ch == '&') //IRI entity
658 {
659 int val;
660 p2 = parseEntity(p, val);
661 if (p2<p)
662 {
663 return -1;
664 }
665 p = p2;
666 path.push_back((XMLCh)val);
667 }
668 else if (ch == '%') //ascii hex excape
669 {
670 int val;
671 p2 = parseAsciiEntity(p, val);
672 if (p2<p)
673 {
674 return -1;
675 }
676 p = p2;
677 path.push_back((XMLCh)val);
678 }
679 else
680 {
681 path.push_back((XMLCh)ch);
682 p++;
683 }
684 }
686 return p;
687 }
689 int URI::parseQuery(int p0)
690 {
691 int p = p0;
692 int ch = peek(p);
693 if (ch != '?')
694 return p0;
696 p++;
697 while (p < parselen)
698 {
699 ch = peek(p);
700 if (ch == '#')
701 break;
702 query.push_back((XMLCh)ch);
703 p++;
704 }
707 return p;
708 }
710 int URI::parseFragment(int p0)
711 {
713 int p = p0;
714 int ch = peek(p);
715 if (ch != '#')
716 return p0;
718 p++;
719 while (p < parselen)
720 {
721 ch = peek(p);
722 if (ch == '?')
723 break;
724 fragment.push_back((XMLCh)ch);
725 p++;
726 }
729 return p;
730 }
733 int URI::parse(int p0)
734 {
736 int p = p0;
738 int p2 = parseScheme(p);
739 if (p2 < 0)
740 {
741 error("Scheme");
742 return -1;
743 }
744 p = p2;
747 p2 = parseHierarchicalPart(p);
748 if (p2 < 0)
749 {
750 error("Hierarchical part");
751 return -1;
752 }
753 p = p2;
755 p2 = parseQuery(p);
756 if (p2 < 0)
757 {
758 error("Query");
759 return -1;
760 }
761 p = p2;
764 p2 = parseFragment(p);
765 if (p2 < 0)
766 {
767 error("Fragment");
768 return -1;
769 }
770 p = p2;
772 return p;
774 }
778 bool URI::parse(const DOMString &str)
779 {
781 parselen = str.size();
783 DOMString tmp;
784 for (unsigned int i=0 ; i<str.size() ; i++)
785 {
786 XMLCh ch = (XMLCh) str[i];
787 if (ch == '\\')
788 tmp.push_back((XMLCh)'/');
789 else
790 tmp.push_back(ch);
791 }
792 parsebuf = (char *) tmp.c_str();
795 int p = parse(0);
796 normalize();
798 if (p < 0)
799 {
800 error("Syntax error");
801 return false;
802 }
804 //printf("uri:%s\n", toString().c_str());
805 //printf("path:%s\n", path.c_str());
807 return true;
809 }
815 } //namespace dom
816 } //namespace w3c
817 } //namespace org
818 //#########################################################################
819 //# E N D O F F I L E
820 //#########################################################################