1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * Authors:
11 * Bob Jamison
12 *
13 * Copyright (C) 2005-2008 Bob Jamison
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
33 #include "uri.h"
34 #include "ucd.h"
36 #include <stdio.h>
37 #include <stdarg.h>
38 #include <vector>
41 namespace org
42 {
43 namespace w3c
44 {
45 namespace dom
46 {
/**
 * One row of the scheme lookup table: a scheme identifier, the text that
 * is matched while parsing, and the port associated with that scheme.
 */
struct LookupEntry
{
    int         ival;  // scheme identifier (a URI::SCHEME_* value in the table)
    char const *sval;  // text matched at the start of a URI; NULL ends the table
    int         port;  // port copied into the URI when this entry matches
};
56 static LookupEntry schemes[] =
57 {
58 { URI::SCHEME_DATA, "data:", 0 },
59 { URI::SCHEME_HTTP, "http:", 80 },
60 { URI::SCHEME_HTTPS, "https:", 443 },
61 { URI::SCHEME_FTP, "ftp", 12 },
62 { URI::SCHEME_FILE, "file:", 0 },
63 { URI::SCHEME_LDAP, "ldap:", 123 },
64 { URI::SCHEME_MAILTO, "mailto:", 25 },
65 { URI::SCHEME_NEWS, "news:", 117 },
66 { URI::SCHEME_TELNET, "telnet:", 23 },
67 { 0, NULL, 0 }
68 };
72 //#########################################################################
73 //# C O N S T R U C T O R
74 //#########################################################################
76 /**
77 *
78 */
79 URI::URI()
80 {
81 init();
82 }
84 /**
85 *
86 */
87 URI::URI(const DOMString &str)
88 {
89 init();
90 parse(str);
91 }
94 /**
95 *
96 */
97 URI::URI(const char *str)
98 {
99 init();
100 DOMString domStr = str;
101 parse(domStr);
102 }
105 /**
106 *
107 */
108 URI::URI(const URI &other)
109 {
110 init();
111 assign(other);
112 }
115 /**
116 *
117 */
118 URI &URI::operator=(const URI &other)
119 {
120 init();
121 assign(other);
122 return *this;
123 }
126 /**
127 *
128 */
129 URI::~URI()
130 {
131 }
137 /**
138 *
139 */
140 void URI::init()
141 {
142 parsebuf = NULL;
143 parselen = 0;
144 scheme = SCHEME_NONE;
145 schemeStr.clear();
146 port = 0;
147 authority.clear();
148 path.clear();
149 absolute = false;
150 opaque = false;
151 query.clear();
152 fragment.clear();
153 }
156 /**
157 *
158 */
159 void URI::assign(const URI &other)
160 {
161 scheme = other.scheme;
162 schemeStr = other.schemeStr;
163 authority = other.authority;
164 port = other.port;
165 path = other.path;
166 absolute = other.absolute;
167 opaque = other.opaque;
168 query = other.query;
169 fragment = other.fragment;
170 }
173 //#########################################################################
//# A T T R I B U T E S
175 //#########################################################################
176 static const char *hexChars = "0123456789abcdef";
178 static DOMString toStr(const std::vector<int> &arr)
179 {
180 DOMString buf;
181 std::vector<int>::const_iterator iter;
182 for (iter=arr.begin() ; iter!=arr.end() ; iter++)
183 {
184 int ch = *iter;
185 if (isprint(ch))
186 buf.push_back((XMLCh)ch);
187 else
188 {
189 buf.push_back('%');
190 int hi = ((ch>>4) & 0xf);
191 buf.push_back(hexChars[hi]);
192 int lo = ((ch ) & 0xf);
193 buf.push_back(hexChars[lo]);
194 }
195 }
196 return buf;
197 }
200 DOMString URI::toString() const
201 {
202 DOMString str = schemeStr;
203 if (authority.size() > 0)
204 {
205 str.append("//");
206 str.append(toStr(authority));
207 }
208 str.append(toStr(path));
209 if (query.size() > 0)
210 {
211 str.append("?");
212 str.append(toStr(query));
213 }
214 if (fragment.size() > 0)
215 {
216 str.append("#");
217 str.append(toStr(fragment));
218 }
219 return str;
220 }
223 int URI::getScheme() const
224 {
225 return scheme;
226 }
228 DOMString URI::getSchemeStr() const
229 {
230 return schemeStr;
231 }
234 DOMString URI::getAuthority() const
235 {
236 DOMString ret = toStr(authority);
237 if (portSpecified && port>=0)
238 {
239 char buf[7];
240 snprintf(buf, 6, ":%6d", port);
241 ret.append(buf);
242 }
243 return ret;
244 }
246 DOMString URI::getHost() const
247 {
248 DOMString str = toStr(authority);
249 return str;
250 }
252 int URI::getPort() const
253 {
254 return port;
255 }
258 DOMString URI::getPath() const
259 {
260 DOMString str = toStr(path);
261 return str;
262 }
264 DOMString URI::getNativePath() const
265 {
266 DOMString pathStr = toStr(path);
267 DOMString npath;
268 #ifdef __WIN32__
269 unsigned int firstChar = 0;
270 if (pathStr.size() >= 3)
271 {
272 if (pathStr[0] == '/' &&
273 uni_is_letter(pathStr[1]) &&
274 pathStr[2] == ':')
275 firstChar++;
276 }
277 for (unsigned int i=firstChar ; i<pathStr.size() ; i++)
278 {
279 XMLCh ch = (XMLCh) pathStr[i];
280 if (ch == '/')
281 npath.push_back((XMLCh)'\\');
282 else
283 npath.push_back(ch);
284 }
285 #else
286 npath = pathStr;
287 #endif
288 return npath;
289 }
292 bool URI::isAbsolute() const
293 {
294 return absolute;
295 }
297 bool URI::isOpaque() const
298 {
299 return opaque;
300 }
303 DOMString URI::getQuery() const
304 {
305 DOMString str = toStr(query);
306 return str;
307 }
310 DOMString URI::getFragment() const
311 {
312 DOMString str = toStr(fragment);
313 return str;
314 }
/**
 * Locate the first occurrence of 'ch' in 'str' at or after 'startpos'.
 *
 * @return the index of the match, or -1 when there is none.
 */
static int find(const std::vector<int> &str, int ch, int startpos)
{
    unsigned int idx = startpos;
    while (idx < str.size())
    {
        if (str[idx] == ch)
            return idx;
        ++idx;
    }
    return -1;
}
/**
 * Locate the last occurrence of 'ch' in 'str'.
 *
 * The index is deliberately signed: it has to be able to drop below
 * zero for the backwards scan to terminate.
 *
 * @return the index of the match, or -1 when there is none.
 */
static int findLast(const std::vector<int> &str, int ch)
{
    int idx = (int) str.size();
    while (--idx >= 0)
    {
        if (str[idx] == ch)
            return idx;
    }
    return -1;
}
/**
 * Test whether the character codes in 'str' spell exactly the
 * NUL-terminated text 'key'.
 *
 * The key is walked in step with the sequence and must be exhausted at
 * the same time.  The earlier version never advanced through the key
 * and never checked its length, so any run of one repeated character
 * (for example "..") compared equal to a one-character key (".").
 *
 * @return true only when both have the same length and contents; false
 *         for a null key.
 */
static bool sequ(const std::vector<int> &str, const char *key)
{
    if (!key)
        return false;
    const char *c = key;
    for (unsigned int i = 0 ; i < str.size() ; i++, c++)
    {
        if (*c == '\0')       // key is shorter than the sequence
            return false;
        if (*c != str[i])
            return false;
    }
    return *c == '\0';        // false when the key is longer
}
/**
 * Copy up to 'len' elements of 'str', beginning at 'startpos'.  Copying
 * stops early at the end of 'str'; a start position beyond the end or a
 * non-positive length gives an empty result.
 */
static std::vector<int> substr(const std::vector<int> &str,
                               int startpos, int len)
{
    std::vector<int> piece;
    unsigned int cursor = startpos;
    int remaining = len;
    while (remaining > 0 && cursor < str.size())
    {
        piece.push_back(str[cursor]);
        ++cursor;
        --remaining;
    }
    return piece;
}
374 URI URI::resolve(const URI &other) const
375 {
376 //### According to w3c, this is handled in 3 cases
378 //## 1
379 if (opaque || other.isAbsolute())
380 return other;
382 //## 2
383 if (other.fragment.size() > 0 &&
384 other.path.size() == 0 &&
385 other.scheme == SCHEME_NONE &&
386 other.authority.size() == 0 &&
387 other.query.size() == 0 )
388 {
389 URI fragUri = *this;
390 fragUri.fragment = other.fragment;
391 return fragUri;
392 }
394 //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
395 URI newUri;
396 //# 3.1
397 newUri.scheme = scheme;
398 newUri.schemeStr = schemeStr;
399 newUri.query = other.query;
400 newUri.fragment = other.fragment;
401 if (other.authority.size() > 0)
402 {
403 //# 3.2
404 if (absolute || other.absolute)
405 newUri.absolute = true;
406 newUri.authority = other.authority;
407 newUri.port = other.port;//part of authority
408 newUri.path = other.path;
409 }
410 else
411 {
412 //# 3.3
413 if (other.absolute)
414 {
415 newUri.absolute = true;
416 newUri.path = other.path;
417 }
418 else
419 {
420 int pos = findLast(path, '/');
421 if (pos >= 0)
422 {
423 newUri.path.clear();
424 //# append my path up to and including the '/'
425 for (int i = 0; i<=pos ; i++)
426 newUri.path.push_back(path[i]);
427 //# append other path
428 for (unsigned int i = 0; i<other.path.size() ; i++)
429 newUri.path.push_back(other.path[i]);
430 }
431 else
432 newUri.path = other.path;
433 }
434 }
436 newUri.normalize();
438 return newUri;
439 }
442 /**
443 * This follows the Java URI algorithm:
444 * 1. All "." segments are removed.
445 * 2. If a ".." segment is preceded by a non-".." segment
446 * then both of these segments are removed. This step
447 * is repeated until it is no longer applicable.
448 * 3. If the path is relative, and if its first segment
449 * contains a colon character (':'), then a "." segment
450 * is prepended. This prevents a relative URI with a path
451 * such as "a:b/c/d" from later being re-parsed as an
452 * opaque URI with a scheme of "a" and a scheme-specific
453 * part of "b/c/d". (Deviation from RFC 2396)
454 */
455 void URI::normalize()
456 {
457 std::vector< std::vector<int> > segments;
459 //## Collect segments
460 if (path.size()<2)
461 return;
462 bool abs = false;
463 int pos=0;
464 int len = (int) path.size();
466 if (path[0]=='/')
467 {
468 abs = true;
469 pos++;
470 }
472 while (pos < len)
473 {
474 int pos2 = find(path, '/', pos);
475 if (pos2 < 0)
476 {
477 std::vector<int> seg = substr(path, pos, path.size()-pos);
478 //printf("last segment:%s\n", toStr(seg).c_str());
479 segments.push_back(seg);
480 break;
481 }
482 if (pos2>pos)
483 {
484 std::vector<int> seg = substr(path, pos, pos2-pos);
485 //printf("segment:%s\n", toStr(seg).c_str());
486 segments.push_back(seg);
487 }
488 pos = pos2;
489 pos++;
490 }
492 //## Clean up (normalize) segments
493 bool edited = false;
494 std::vector< std::vector<int> >::iterator iter;
495 for (iter=segments.begin() ; iter!=segments.end() ; )
496 {
497 std::vector<int> s = *iter;
498 if (sequ(s,"."))
499 {
500 iter = segments.erase(iter);
501 edited = true;
502 }
503 else if (sequ(s, "..") && iter != segments.begin() &&
504 !sequ(*(iter-1), ".."))
505 {
506 iter--; //back up, then erase two entries
507 iter = segments.erase(iter);
508 iter = segments.erase(iter);
509 edited = true;
510 }
511 else
512 iter++;
513 }
515 //## Rebuild path, if necessary
516 if (edited)
517 {
518 path.clear();
519 if (abs)
520 {
521 path.push_back('/');
522 }
523 std::vector< std::vector<int> >::iterator iter;
524 for (iter=segments.begin() ; iter!=segments.end() ; iter++)
525 {
526 if (iter != segments.begin())
527 path.push_back('/');
528 std::vector<int> seg = *iter;
529 for (unsigned int i = 0; i<seg.size() ; i++)
530 path.push_back(seg[i]);
531 }
532 }
534 }
538 //#########################################################################
539 //# M E S S A G E S
540 //#########################################################################
542 void URI::error(const char *fmt, ...)
543 {
544 va_list args;
545 fprintf(stderr, "URI error: ");
546 va_start(args, fmt);
547 vfprintf(stderr, fmt, args);
548 va_end(args);
549 fprintf(stderr, "\n");
550 }
552 void URI::trace(const char *fmt, ...)
553 {
554 va_list args;
555 fprintf(stdout, "URI: ");
556 va_start(args, fmt);
557 vfprintf(stdout, fmt, args);
558 va_end(args);
559 fprintf(stdout, "\n");
560 }
564 //#########################################################################
565 //# P A R S I N G
566 //#########################################################################
570 int URI::peek(int p)
571 {
572 if (p<0 || p>=parselen)
573 return -1;
574 return parsebuf[p];
575 }
579 int URI::match(int p0, char const *key)
580 {
581 int p = p0;
582 while (p < parselen)
583 {
584 if (*key == '\0')
585 return p;
586 else if (*key != parsebuf[p])
587 break;
588 p++; key++;
589 }
590 return p0;
591 }
593 //#########################################################################
594 //# Parsing is performed according to:
595 //# http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
596 //#########################################################################
598 int URI::parseHex(int p0, int &result)
599 {
600 int p = p0;
601 int val = 0;
603 //# Upper 4
604 int ch = peek(p);
605 if (ch >= '0' && ch <= '9')
606 val += (ch - '0');
607 else if (ch >= 'a' && ch <= 'f')
608 val += (10 + ch - 'a');
609 else if (ch >= 'A' && ch <= 'F')
610 val += (10 + ch - 'A');
611 else
612 {
613 error("parseHex : unexpected character : %c", ch);
614 return -1;
615 }
616 p++;
617 val <<= 4;
619 //# Lower 4
620 ch = peek(p);
621 if (ch >= '0' && ch <= '9')
622 val += (ch - '0');
623 else if (ch >= 'a' && ch <= 'f')
624 val += (10 + ch - 'a');
625 else if (ch >= 'A' && ch <= 'F')
626 val += (10 + ch - 'A');
627 else
628 {
629 error("parseHex : unexpected character : %c", ch);
630 return -1;
631 }
632 p++;
633 result = val;
634 return p;
635 }
639 int URI::parseEntity(int p0, int &result)
640 {
641 int p = p0;
642 int ch = peek(p);
643 if (ch != '&')
644 return p0;
645 p++;
646 if (!match(p, "#x"))
647 {
648 error("parseEntity: expected '#x'");
649 return -1;
650 }
651 p += 2;
652 int val;
653 p = parseHex(p, val);
654 if (p<0)
655 return -1;
656 ch = peek(p);
657 if (ch != ';')
658 {
659 error("parseEntity: expected ';'");
660 return -1;
661 }
662 p++;
663 result = val;
664 return p;
665 }
667 int URI::parseAsciiEntity(int p0, int &result)
668 {
669 int p = p0;
670 int ch = peek(p);
671 if (ch != '%')
672 return p0;
673 p++;
674 int val;
675 p = parseHex(p, val);
676 if (p<0)
677 return -1;
678 result = val;
679 return p;
680 }
683 int URI::parseScheme(int p0)
684 {
685 int p = p0;
686 for (LookupEntry *entry = schemes; entry->sval ; entry++)
687 {
688 int p2 = match(p, entry->sval);
689 if (p2 > p)
690 {
691 schemeStr = entry->sval;
692 scheme = entry->ival;
693 port = entry->port;
694 p = p2;
695 return p;
696 }
697 }
699 return p;
700 }
703 int URI::parseHierarchicalPart(int p0)
704 {
705 int p = p0;
706 int ch;
708 //# Authority field (host and port, for example)
709 int p2 = match(p, "//");
710 if (p2 > p)
711 {
712 p = p2;
713 portSpecified = false;
714 DOMString portStr;
715 while (p < parselen)
716 {
717 ch = peek(p);
718 if (ch == '/')
719 break;
720 else if (ch == '&') //IRI entity
721 {
722 int val;
723 p2 = parseEntity(p, val);
724 if (p2<p)
725 {
726 return -1;
727 }
728 p = p2;
729 authority.push_back((XMLCh)val);
730 }
731 else if (ch == '%') //ascii hex excape
732 {
733 int val;
734 p2 = parseAsciiEntity(p, val);
735 if (p2<p)
736 {
737 return -1;
738 }
739 p = p2;
740 authority.push_back((XMLCh)val);
741 }
742 else if (ch == ':')
743 {
744 portSpecified = true;
745 p++;
746 }
747 else if (portSpecified)
748 {
749 portStr.push_back((XMLCh)ch);
750 p++;
751 }
752 else
753 {
754 authority.push_back((XMLCh)ch);
755 p++;
756 }
757 }
758 if (portStr.size() > 0)
759 {
760 char *pstr = (char *)portStr.c_str();
761 char *endStr;
762 long val = strtol(pstr, &endStr, 10);
763 if (endStr > pstr) //successful parse?
764 port = val;
765 }
766 }
768 //# Are we absolute?
769 ch = peek(p);
770 if (uni_is_letter(ch) && peek(p+1)==':')
771 {
772 absolute = true;
773 path.push_back((XMLCh)'/');
774 }
775 else if (ch == '/')
776 {
777 absolute = true;
778 if (p>p0) //in other words, if '/' is not the first char
779 opaque = true;
780 path.push_back((XMLCh)ch);
781 p++;
782 }
784 while (p < parselen)
785 {
786 ch = peek(p);
787 if (ch == '?' || ch == '#')
788 break;
789 else if (ch == '&') //IRI entity
790 {
791 int val;
792 p2 = parseEntity(p, val);
793 if (p2<p)
794 {
795 return -1;
796 }
797 p = p2;
798 path.push_back((XMLCh)val);
799 }
800 else if (ch == '%') //ascii hex excape
801 {
802 int val;
803 p2 = parseAsciiEntity(p, val);
804 if (p2<p)
805 {
806 return -1;
807 }
808 p = p2;
809 path.push_back((XMLCh)val);
810 }
811 else
812 {
813 path.push_back((XMLCh)ch);
814 p++;
815 }
816 }
817 //trace("path:%s", toStr(path).c_str());
818 return p;
819 }
821 int URI::parseQuery(int p0)
822 {
823 int p = p0;
824 int ch = peek(p);
825 if (ch != '?')
826 return p0;
828 p++;
829 while (p < parselen)
830 {
831 ch = peek(p);
832 if (ch == '#')
833 break;
834 query.push_back((XMLCh)ch);
835 p++;
836 }
839 return p;
840 }
842 int URI::parseFragment(int p0)
843 {
845 int p = p0;
846 int ch = peek(p);
847 if (ch != '#')
848 return p0;
850 p++;
851 while (p < parselen)
852 {
853 ch = peek(p);
854 if (ch == '?')
855 break;
856 fragment.push_back(ch);
857 p++;
858 }
861 return p;
862 }
865 int URI::parse(int p0)
866 {
868 int p = p0;
870 int p2 = parseScheme(p);
871 if (p2 < 0)
872 {
873 error("Scheme");
874 return -1;
875 }
876 p = p2;
879 p2 = parseHierarchicalPart(p);
880 if (p2 < 0)
881 {
882 error("Hierarchical part");
883 return -1;
884 }
885 p = p2;
887 p2 = parseQuery(p);
888 if (p2 < 0)
889 {
890 error("Query");
891 return -1;
892 }
893 p = p2;
896 p2 = parseFragment(p);
897 if (p2 < 0)
898 {
899 error("Fragment");
900 return -1;
901 }
902 p = p2;
904 return p;
906 }
910 bool URI::parse(const DOMString &str)
911 {
913 parselen = str.size();
914 parsebuf = new int[str.size()];
915 if (!parsebuf)
916 {
917 error("parse : could not allocate parsebuf");
918 return false;
919 }
921 DOMString::const_iterator iter;
922 unsigned int i=0;
923 for (iter= str.begin() ; iter!=str.end() ; iter++)
924 {
925 int ch = *iter;
926 if (ch == '\\')
927 parsebuf[i++] = '/';
928 else
929 parsebuf[i++] = ch;
930 }
933 int p = parse(0);
934 normalize();
936 delete[] parsebuf;
938 if (p < 0)
939 {
940 error("Syntax error");
941 return false;
942 }
944 //printf("uri:%s\n", toString().c_str());
945 //printf("parse:%s\n", toStr(path).c_str());
947 return true;
949 }
955 } //namespace dom
956 } //namespace w3c
957 } //namespace org
958 //#########################################################################
959 //# E N D O F F I L E
960 //#########################################################################