1 /**
2 * Phoebe DOM Implementation.
3 *
4 * This is a C++ approximation of the W3C DOM model, which follows
5 * fairly closely the specifications in the various .idl files, copies of
6 * which are provided for reference. Most important is this one:
7 *
8 * http://www.w3.org/TR/2004/REC-DOM-Level-3-Core-20040407/idl-definitions.html
9 *
10 * Authors:
11 * Bob Jamison
12 *
13 * Copyright (C) 2005-2007 Bob Jamison
14 *
15 * This library is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public
17 * License as published by the Free Software Foundation; either
18 * version 2.1 of the License, or (at your option) any later version.
19 *
20 * This library is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 * Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public
26 * License along with this library; if not, write to the Free Software
27 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 */
33 #include "uri.h"
34 #include "charclass.h"
36 #include <stdio.h>
37 #include <stdarg.h>
38 #include <vector>
41 namespace org
42 {
43 namespace w3c
44 {
45 namespace dom
46 {
/**
 * One row of a lookup table: an integer identifier, the text associated
 * with it, and a port number.
 */
struct LookupEntry
{
    int         ival;  // integer identifier of the entry
    char const *sval;  // associated text; may be NULL
    int         port;  // port number stored with the entry
};
56 static LookupEntry schemes[] =
57 {
58 { URI::SCHEME_DATA, "data:", 0 },
59 { URI::SCHEME_HTTP, "http:", 80 },
60 { URI::SCHEME_HTTPS, "https:", 443 },
61 { URI::SCHEME_FTP, "ftp", 12 },
62 { URI::SCHEME_FILE, "file:", 0 },
63 { URI::SCHEME_LDAP, "ldap:", 123 },
64 { URI::SCHEME_MAILTO, "mailto:", 25 },
65 { URI::SCHEME_NEWS, "news:", 117 },
66 { URI::SCHEME_TELNET, "telnet:", 23 },
67 { 0, NULL, 0 }
68 };
72 //#########################################################################
73 //# C O N S T R U C T O R
74 //#########################################################################
76 /**
77 *
78 */
79 URI::URI()
80 {
81 init();
82 }
84 /**
85 *
86 */
87 URI::URI(const DOMString &str)
88 {
89 init();
90 parse(str);
91 }
94 /**
95 *
96 */
97 URI::URI(const char *str)
98 {
99 init();
100 DOMString domStr = str;
101 parse(domStr);
102 }
105 /**
106 *
107 */
108 URI::URI(const URI &other)
109 {
110 init();
111 assign(other);
112 }
115 /**
116 *
117 */
118 URI &URI::operator=(const URI &other)
119 {
120 init();
121 assign(other);
122 return *this;
123 }
126 /**
127 *
128 */
129 URI::~URI()
130 {
131 }
137 /**
138 *
139 */
140 void URI::init()
141 {
142 parsebuf = NULL;
143 parselen = 0;
144 scheme = SCHEME_NONE;
145 schemeStr.clear();
146 port = 0;
147 authority.clear();
148 path.clear();
149 absolute = false;
150 opaque = false;
151 query.clear();
152 fragment.clear();
153 }
156 /**
157 *
158 */
159 void URI::assign(const URI &other)
160 {
161 scheme = other.scheme;
162 schemeStr = other.schemeStr;
163 authority = other.authority;
164 port = other.port;
165 path = other.path;
166 absolute = other.absolute;
167 opaque = other.opaque;
168 query = other.query;
169 fragment = other.fragment;
170 }
173 //#########################################################################
174 //#A T T R I B U T E S
175 //#########################################################################
// Lower-case hexadecimal digits, indexed by nibble value (0..15).
// Declared const: a string literal must not be bound to a writable char*.
static const char *hexChars = "0123456789abcdef";
178 static DOMString toStr(const std::vector<int> &arr)
179 {
180 DOMString buf;
181 std::vector<int>::const_iterator iter;
182 for (iter=arr.begin() ; iter!=arr.end() ; iter++)
183 {
184 int ch = *iter;
185 if (isprint(ch))
186 buf.push_back((XMLCh)ch);
187 else
188 {
189 buf.push_back('%');
190 int hi = ((ch>>4) & 0xf);
191 buf.push_back(hexChars[hi]);
192 int lo = ((ch ) & 0xf);
193 buf.push_back(hexChars[lo]);
194 }
195 }
196 return buf;
197 }
200 DOMString URI::toString() const
201 {
202 DOMString str = schemeStr;
203 if (authority.size() > 0)
204 {
205 str.append("//");
206 str.append(toStr(authority));
207 }
208 str.append(toStr(path));
209 if (query.size() > 0)
210 {
211 str.append("?");
212 str.append(toStr(query));
213 }
214 if (fragment.size() > 0)
215 {
216 str.append("#");
217 str.append(toStr(fragment));
218 }
219 return str;
220 }
223 int URI::getScheme() const
224 {
225 return scheme;
226 }
228 DOMString URI::getSchemeStr() const
229 {
230 return schemeStr;
231 }
234 DOMString URI::getAuthority() const
235 {
236 DOMString ret = toStr(authority);
237 if (portSpecified && port>=0)
238 {
239 char buf[7];
240 snprintf(buf, 6, ":%6d", port);
241 ret.append(buf);
242 }
243 return ret;
244 }
246 DOMString URI::getHost() const
247 {
248 DOMString str = toStr(authority);
249 return str;
250 }
252 int URI::getPort() const
253 {
254 return port;
255 }
258 DOMString URI::getPath() const
259 {
260 DOMString str = toStr(path);
261 return str;
262 }
264 DOMString URI::getNativePath() const
265 {
266 DOMString pathStr = toStr(path);
267 DOMString npath;
268 #ifdef __WIN32__
269 unsigned int firstChar = 0;
270 if (pathStr.size() >= 3)
271 {
272 if (pathStr[0] == '/' &&
273 uni_is_letter(pathStr[1]) &&
274 pathStr[2] == ':')
275 firstChar++;
276 }
277 for (unsigned int i=firstChar ; i<pathStr.size() ; i++)
278 {
279 XMLCh ch = (XMLCh) pathStr[i];
280 if (ch == '/')
281 npath.push_back((XMLCh)'\\');
282 else
283 npath.push_back(ch);
284 }
285 #else
286 npath = pathStr;
287 #endif
288 return npath;
289 }
292 bool URI::isAbsolute() const
293 {
294 return absolute;
295 }
297 bool URI::isOpaque() const
298 {
299 return opaque;
300 }
303 DOMString URI::getQuery() const
304 {
305 DOMString str = toStr(query);
306 return str;
307 }
310 DOMString URI::getFragment() const
311 {
312 DOMString str = toStr(fragment);
313 return str;
314 }
/**
 * Searches 'str' for the first occurrence of 'ch' at or after
 * 'startpos'.
 *
 * @return the index of the match, or -1 if there is none
 */
static int find(const std::vector<int> &str, int ch, int startpos)
{
    unsigned int idx = startpos;
    while (idx < str.size())
    {
        if (str[idx] == ch)
            return idx;
        ++idx;
    }
    return -1;
}
/**
 * Searches 'str' backwards for the last occurrence of 'ch'.
 *
 * The index is a signed int so that the loop ends after index 0; an
 * unsigned index can never go below zero, which made the loop run
 * forever and read out of bounds when there was no match or the
 * vector was empty.
 *
 * @return the index of the last match, or -1 if there is none
 */
static int findLast(const std::vector<int> &str, int ch)
{
    for (int i = (int) str.size() - 1 ; i >= 0 ; i--)
    {
        if (ch == str[i])
            return i;
    }
    return -1;
}
/**
 * Tests whether the character sequence 'str' is exactly equal to the
 * NUL-terminated string 'key' (same length, same characters).
 *
 * The key pointer advances together with the index; previously it was
 * never moved, so every element was compared with key[0] only, and an
 * empty sequence matched any key.
 *
 * @param str the character values to test
 * @param key the text to compare against
 * @return true only when both hold the same characters
 */
static bool sequ(const std::vector<int> &str, const char *key)
{
    for (std::vector<int>::size_type i = 0 ; i < str.size() ; i++, key++)
    {
        if (*key == '\0')      // key is shorter than str
            return false;
        if (*key != str[i])
            return false;
    }
    return *key == '\0';       // false when key is longer than str
}
/**
 * Copies up to 'len' elements of 'str', beginning at 'startpos'.
 * Copying stops early when the end of 'str' is reached.
 *
 * @return the copied elements
 */
static std::vector<int> substr(const std::vector<int> &str,
                               int startpos, int len)
{
    std::vector<int> result;
    unsigned int cursor = startpos;
    int remaining = len;
    while (remaining > 0 && cursor < str.size())
    {
        result.push_back(str[cursor]);
        ++cursor;
        --remaining;
    }
    return result;
}
373 URI URI::resolve(const URI &other) const
374 {
375 //### According to w3c, this is handled in 3 cases
377 //## 1
378 if (opaque || other.isAbsolute())
379 return other;
381 //## 2
382 if (other.fragment.size() > 0 &&
383 other.path.size() == 0 &&
384 other.scheme == SCHEME_NONE &&
385 other.authority.size() == 0 &&
386 other.query.size() == 0 )
387 {
388 URI fragUri = *this;
389 fragUri.fragment = other.fragment;
390 return fragUri;
391 }
393 //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
394 URI newUri;
395 //# 3.1
396 newUri.scheme = scheme;
397 newUri.schemeStr = schemeStr;
398 newUri.query = other.query;
399 newUri.fragment = other.fragment;
400 if (other.authority.size() > 0)
401 {
402 //# 3.2
403 if (absolute || other.absolute)
404 newUri.absolute = true;
405 newUri.authority = other.authority;
406 newUri.port = other.port;//part of authority
407 newUri.path = other.path;
408 }
409 else
410 {
411 //# 3.3
412 if (other.absolute)
413 {
414 newUri.absolute = true;
415 newUri.path = other.path;
416 }
417 else
418 {
419 int pos = findLast(path, '/');
420 if (pos >= 0)
421 {
422 newUri.path.clear();
423 //# append my path up to and including the '/'
424 for (int i = 0; i<=pos ; i++)
425 newUri.path.push_back(path[i]);
426 //# append other path
427 for (unsigned int i = 0; i<other.path.size() ; i++)
428 newUri.path.push_back(other.path[i]);
429 }
430 else
431 newUri.path = other.path;
432 }
433 }
435 newUri.normalize();
437 return newUri;
438 }
441 /**
442 * This follows the Java URI algorithm:
443 * 1. All "." segments are removed.
444 * 2. If a ".." segment is preceded by a non-".." segment
445 * then both of these segments are removed. This step
446 * is repeated until it is no longer applicable.
447 * 3. If the path is relative, and if its first segment
448 * contains a colon character (':'), then a "." segment
449 * is prepended. This prevents a relative URI with a path
450 * such as "a:b/c/d" from later being re-parsed as an
451 * opaque URI with a scheme of "a" and a scheme-specific
452 * part of "b/c/d". (Deviation from RFC 2396)
453 */
454 void URI::normalize()
455 {
456 std::vector< std::vector<int> > segments;
458 //## Collect segments
459 if (path.size()<2)
460 return;
461 bool abs = false;
462 int pos=0;
463 int len = (int) path.size();
465 if (path[0]=='/')
466 {
467 abs = true;
468 pos++;
469 }
471 while (pos < len)
472 {
473 int pos2 = find(path, '/', pos);
474 if (pos2 < 0)
475 {
476 std::vector<int> seg = substr(path, pos, path.size()-pos);
477 //printf("last segment:%s\n", toStr(seg).c_str());
478 segments.push_back(seg);
479 break;
480 }
481 if (pos2>pos)
482 {
483 std::vector<int> seg = substr(path, pos, pos2-pos);
484 //printf("segment:%s\n", toStr(seg).c_str());
485 segments.push_back(seg);
486 }
487 pos = pos2;
488 pos++;
489 }
491 //## Clean up (normalize) segments
492 bool edited = false;
493 std::vector< std::vector<int> >::iterator iter;
494 for (iter=segments.begin() ; iter!=segments.end() ; )
495 {
496 std::vector<int> s = *iter;
497 if (sequ(s,"."))
498 {
499 iter = segments.erase(iter);
500 edited = true;
501 }
502 else if (sequ(s, "..") && iter != segments.begin() &&
503 !sequ(*(iter-1), ".."))
504 {
505 iter--; //back up, then erase two entries
506 iter = segments.erase(iter);
507 iter = segments.erase(iter);
508 edited = true;
509 }
510 else
511 iter++;
512 }
514 //## Rebuild path, if necessary
515 if (edited)
516 {
517 path.clear();
518 if (abs)
519 {
520 path.push_back('/');
521 }
522 std::vector< std::vector<int> >::iterator iter;
523 for (iter=segments.begin() ; iter!=segments.end() ; iter++)
524 {
525 if (iter != segments.begin())
526 path.push_back('/');
527 std::vector<int> seg = *iter;
528 for (unsigned int i = 0; i<seg.size() ; i++)
529 path.push_back(seg[i]);
530 }
531 }
533 }
537 //#########################################################################
538 //# M E S S A G E S
539 //#########################################################################
541 void URI::error(const char *fmt, ...)
542 {
543 va_list args;
544 fprintf(stderr, "URI error: ");
545 va_start(args, fmt);
546 vfprintf(stderr, fmt, args);
547 va_end(args);
548 fprintf(stderr, "\n");
549 }
551 void URI::trace(const char *fmt, ...)
552 {
553 va_list args;
554 fprintf(stdout, "URI: ");
555 va_start(args, fmt);
556 vfprintf(stdout, fmt, args);
557 va_end(args);
558 fprintf(stdout, "\n");
559 }
563 //#########################################################################
564 //# P A R S I N G
565 //#########################################################################
569 int URI::peek(int p)
570 {
571 if (p<0 || p>=parselen)
572 return -1;
573 return parsebuf[p];
574 }
578 int URI::match(int p0, char const *key)
579 {
580 int p = p0;
581 while (p < parselen)
582 {
583 if (*key == '\0')
584 return p;
585 else if (*key != parsebuf[p])
586 break;
587 p++; key++;
588 }
589 return p0;
590 }
592 //#########################################################################
593 //# Parsing is performed according to:
594 //# http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
595 //#########################################################################
597 int URI::parseHex(int p0, int &result)
598 {
599 int p = p0;
600 int val = 0;
602 //# Upper 4
603 int ch = peek(p);
604 if (ch >= '0' && ch <= '9')
605 val += (ch - '0');
606 else if (ch >= 'a' && ch <= 'f')
607 val += (10 + ch - 'a');
608 else if (ch >= 'A' && ch <= 'F')
609 val += (10 + ch - 'A');
610 else
611 {
612 error("parseHex : unexpected character : %c", ch);
613 return -1;
614 }
615 p++;
616 val <<= 4;
618 //# Lower 4
619 ch = peek(p);
620 if (ch >= '0' && ch <= '9')
621 val += (ch - '0');
622 else if (ch >= 'a' && ch <= 'f')
623 val += (10 + ch - 'a');
624 else if (ch >= 'A' && ch <= 'F')
625 val += (10 + ch - 'A');
626 else
627 {
628 error("parseHex : unexpected character : %c", ch);
629 return -1;
630 }
631 p++;
632 result = val;
633 return p;
634 }
638 int URI::parseEntity(int p0, int &result)
639 {
640 int p = p0;
641 int ch = peek(p);
642 if (ch != '&')
643 return p0;
644 p++;
645 if (!match(p, "#x"))
646 {
647 error("parseEntity: expected '#x'");
648 return -1;
649 }
650 p += 2;
651 int val;
652 p = parseHex(p, val);
653 if (p<0)
654 return -1;
655 ch = peek(p);
656 if (ch != ';')
657 {
658 error("parseEntity: expected ';'");
659 return -1;
660 }
661 p++;
662 result = val;
663 return p;
664 }
666 int URI::parseAsciiEntity(int p0, int &result)
667 {
668 int p = p0;
669 int ch = peek(p);
670 if (ch != '%')
671 return p0;
672 p++;
673 int val;
674 p = parseHex(p, val);
675 if (p<0)
676 return -1;
677 result = val;
678 return p;
679 }
682 int URI::parseScheme(int p0)
683 {
684 int p = p0;
685 for (LookupEntry *entry = schemes; entry->sval ; entry++)
686 {
687 int p2 = match(p, entry->sval);
688 if (p2 > p)
689 {
690 schemeStr = entry->sval;
691 scheme = entry->ival;
692 port = entry->port;
693 p = p2;
694 return p;
695 }
696 }
698 return p;
699 }
702 int URI::parseHierarchicalPart(int p0)
703 {
704 int p = p0;
705 int ch;
707 //# Authority field (host and port, for example)
708 int p2 = match(p, "//");
709 if (p2 > p)
710 {
711 p = p2;
712 portSpecified = false;
713 DOMString portStr;
714 while (p < parselen)
715 {
716 ch = peek(p);
717 if (ch == '/')
718 break;
719 else if (ch == '&') //IRI entity
720 {
721 int val;
722 p2 = parseEntity(p, val);
723 if (p2<p)
724 {
725 return -1;
726 }
727 p = p2;
728 authority.push_back((XMLCh)val);
729 }
730 else if (ch == '%') //ascii hex excape
731 {
732 int val;
733 p2 = parseAsciiEntity(p, val);
734 if (p2<p)
735 {
736 return -1;
737 }
738 p = p2;
739 authority.push_back((XMLCh)val);
740 }
741 else if (ch == ':')
742 {
743 portSpecified = true;
744 p++;
745 }
746 else if (portSpecified)
747 {
748 portStr.push_back((XMLCh)ch);
749 p++;
750 }
751 else
752 {
753 authority.push_back((XMLCh)ch);
754 p++;
755 }
756 }
757 if (portStr.size() > 0)
758 {
759 char *pstr = (char *)portStr.c_str();
760 char *endStr;
761 long val = strtol(pstr, &endStr, 10);
762 if (endStr > pstr) //successful parse?
763 port = val;
764 }
765 }
767 //# Are we absolute?
768 ch = peek(p);
769 if (uni_is_letter(ch) && peek(p+1)==':')
770 {
771 absolute = true;
772 path.push_back((XMLCh)'/');
773 }
774 else if (ch == '/')
775 {
776 absolute = true;
777 if (p>p0) //in other words, if '/' is not the first char
778 opaque = true;
779 path.push_back((XMLCh)ch);
780 p++;
781 }
783 while (p < parselen)
784 {
785 ch = peek(p);
786 if (ch == '?' || ch == '#')
787 break;
788 else if (ch == '&') //IRI entity
789 {
790 int val;
791 p2 = parseEntity(p, val);
792 if (p2<p)
793 {
794 return -1;
795 }
796 p = p2;
797 path.push_back((XMLCh)val);
798 }
799 else if (ch == '%') //ascii hex excape
800 {
801 int val;
802 p2 = parseAsciiEntity(p, val);
803 if (p2<p)
804 {
805 return -1;
806 }
807 p = p2;
808 path.push_back((XMLCh)val);
809 }
810 else
811 {
812 path.push_back((XMLCh)ch);
813 p++;
814 }
815 }
816 //trace("path:%s", toStr(path).c_str());
817 return p;
818 }
820 int URI::parseQuery(int p0)
821 {
822 int p = p0;
823 int ch = peek(p);
824 if (ch != '?')
825 return p0;
827 p++;
828 while (p < parselen)
829 {
830 ch = peek(p);
831 if (ch == '#')
832 break;
833 query.push_back((XMLCh)ch);
834 p++;
835 }
838 return p;
839 }
841 int URI::parseFragment(int p0)
842 {
844 int p = p0;
845 int ch = peek(p);
846 if (ch != '#')
847 return p0;
849 p++;
850 while (p < parselen)
851 {
852 ch = peek(p);
853 if (ch == '?')
854 break;
855 fragment.push_back(ch);
856 p++;
857 }
860 return p;
861 }
864 int URI::parse(int p0)
865 {
867 int p = p0;
869 int p2 = parseScheme(p);
870 if (p2 < 0)
871 {
872 error("Scheme");
873 return -1;
874 }
875 p = p2;
878 p2 = parseHierarchicalPart(p);
879 if (p2 < 0)
880 {
881 error("Hierarchical part");
882 return -1;
883 }
884 p = p2;
886 p2 = parseQuery(p);
887 if (p2 < 0)
888 {
889 error("Query");
890 return -1;
891 }
892 p = p2;
895 p2 = parseFragment(p);
896 if (p2 < 0)
897 {
898 error("Fragment");
899 return -1;
900 }
901 p = p2;
903 return p;
905 }
909 bool URI::parse(const DOMString &str)
910 {
912 parselen = str.size();
913 parsebuf = new int[str.size()];
914 if (!parsebuf)
915 {
916 error("parse : could not allocate parsebuf");
917 return false;
918 }
920 DOMString::const_iterator iter;
921 unsigned int i=0;
922 for (iter= str.begin() ; iter!=str.end() ; iter++)
923 {
924 int ch = *iter;
925 if (ch == '\\')
926 parsebuf[i++] = '/';
927 else
928 parsebuf[i++] = ch;
929 }
932 int p = parse(0);
933 normalize();
935 delete[] parsebuf;
937 if (p < 0)
938 {
939 error("Syntax error");
940 return false;
941 }
943 //printf("uri:%s\n", toString().c_str());
944 //printf("parse:%s\n", toStr(path).c_str());
946 return true;
948 }
954 } //namespace dom
955 } //namespace w3c
956 } //namespace org
957 //#########################################################################
958 //# E N D O F F I L E
959 //#########################################################################