diff --git a/src/dom/uri.cpp b/src/dom/uri.cpp
--- a/src/dom/uri.cpp
+++ b/src/dom/uri.cpp
* Authors:
* Bob Jamison
*
- * Copyright (C) 2005 Bob Jamison
+ * Copyright (C) 2005-2008 Bob Jamison
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
#include "uri.h"
+#include "ucd.h"
+#include <stdio.h>
#include <stdarg.h>
-
+#include <vector>
namespace org
+// One row of the scheme lookup table below.
typedef struct
{
- int ival;
- char *sval;
+ int ival; // scheme id; the parser stores it in URI::scheme
+ char const *sval; // scheme prefix text; the parser stores it in URI::schemeStr
+ int port; // port value the parser stores in URI::port when this scheme is recognized
} LookupEntry;
-LookupEntry schemes[] =
-{
- { URI::SCHEME_DATA, "data:" },
- { URI::SCHEME_HTTP, "http:" },
- { URI::SCHEME_FTP, "ftp" },
- { URI::SCHEME_FILE, "file:" },
- { URI::SCHEME_LDAP, "ldap:" },
- { URI::SCHEME_MAILTO, "mailto:" },
- { URI::SCHEME_NEWS, "news:" },
- { URI::SCHEME_TELNET, "telnet:" },
- { 0, NULL }
+/**
+ * Known schemes: id, prefix text (including the ':' delimiter, because the
+ * prefix is stored verbatim as the scheme string) and the default port that
+ * is assigned when the scheme is recognized. 0 means "no default port".
+ * The table is terminated by an entry whose sval is NULL.
+ */
+static LookupEntry schemes[] =
+{
+    { URI::SCHEME_DATA,   "data:",     0 },
+    { URI::SCHEME_HTTP,   "http:",    80 },
+    { URI::SCHEME_HTTPS,  "https:",  443 },
+    { URI::SCHEME_FTP,    "ftp:",     21 },  // control connection port
+    { URI::SCHEME_FILE,   "file:",     0 },
+    { URI::SCHEME_LDAP,   "ldap:",   389 },
+    { URI::SCHEME_MAILTO, "mailto:",  25 },  // SMTP
+    { URI::SCHEME_NEWS,   "news:",   119 },  // NNTP
+    { URI::SCHEME_TELNET, "telnet:",  23 },
+    { 0,                  NULL,        0 }
};
URI::URI(const URI &other)
{
init();
- scheme = other.scheme;
- schemeStr = other.schemeStr;
- authority = other.authority;
- path = other.path;
- absolute = other.absolute;
- query = other.query;
- fragment = other.fragment;
+ // init() above reset this object; assign() now copies other's fields into it
+ assign(other);
+}
+
+
+/**
+ * Copy assignment: resets this URI and then copies every field of other.
+ *
+ * Self-assignment is a no-op. Without the guard, init() would clear the
+ * very fields that assign() is about to read, leaving an empty URI.
+ *
+ * @param other the URI to copy from
+ * @return a reference to this URI
+ */
+URI &URI::operator=(const URI &other)
+{
+    if (this != &other)
+    {
+        init();
+        assign(other);
+    }
+    return *this;
}
parsebuf = NULL;
parselen = 0;
scheme = SCHEME_NONE;
- schemeStr = "";
- authority = "";
- path = "";
+ schemeStr.clear();
+ port = 0;
+ authority.clear();
+ path.clear();
absolute = false;
- query = "";
- fragment = "";
+ opaque = false;
+ query.clear();
+ fragment.clear();
}
+/**
+ * Copies the parsed components of other into this URI, field by field.
+ * The parse buffer state (parsebuf / parselen) is not copied.
+ *
+ * portSpecified is copied together with port: getAuthority() consults it to
+ * decide whether to print the port, so a copy must carry the same flag.
+ *
+ * @param other the URI to copy from
+ */
+void URI::assign(const URI &other)
+{
+    scheme        = other.scheme;
+    schemeStr     = other.schemeStr;
+    authority     = other.authority;
+    port          = other.port;
+    portSpecified = other.portSpecified;
+    path          = other.path;
+    absolute      = other.absolute;
+    opaque        = other.opaque;
+    query         = other.query;
+    fragment      = other.fragment;
+}
+
//#########################################################################
//#A T T R I B U T E S
//#########################################################################
+static const char *hexChars = "0123456789abcdef";
-DOMString URI::toString()
+/**
+ * Renders a vector of character codes as text.
+ *
+ * Printable ASCII (0x20..0x7e) is copied through unchanged; every other
+ * value is written as a '%' escape built from the two hex digits of its
+ * low byte.
+ *
+ * The range test is spelled out instead of calling isprint(): the elements
+ * are plain ints, and isprint() is undefined for arguments that are neither
+ * EOF nor representable as unsigned char, and its answer depends on the
+ * current locale.
+ *
+ * @param arr the character codes to render
+ * @return the rendered string
+ */
+static DOMString toStr(const std::vector<int> &arr)
+{
+    DOMString buf;
+    std::vector<int>::const_iterator iter;
+    for (iter = arr.begin() ; iter != arr.end() ; ++iter)
+    {
+        const int ch = *iter;
+        if (ch >= 0x20 && ch <= 0x7e)
+            buf.push_back((XMLCh)ch);
+        else
+        {
+            buf.push_back('%');
+            buf.push_back(hexChars[(ch >> 4) & 0xf]);
+            buf.push_back(hexChars[ ch       & 0xf]);
+        }
+    }
+    return buf;
+}
+
+
+/**
+ * Builds the textual form of this URI: the scheme string, then "//" plus the
+ * authority when one is present, the path, "?" plus the query when present,
+ * and "#" plus the fragment when present. Each component other than the
+ * scheme string is rendered through toStr(). The port is not written here.
+ */
+DOMString URI::toString() const
{
DOMString str = schemeStr;
- if (authority.size()>0)
+ if (authority.size() > 0)
{
str.append("//");
- str.append(authority);
+ str.append(toStr(authority));
}
- str.append(path);
+ str.append(toStr(path));
if (query.size() > 0)
{
str.append("?");
- str.append(query);
+ str.append(toStr(query));
}
if (fragment.size() > 0)
{
str.append("#");
- str.append(fragment);
+ str.append(toStr(fragment));
}
return str;
}
-int URI::getScheme()
+/**
+ * Returns the stored scheme identifier (SCHEME_NONE after init()).
+ */
+int URI::getScheme() const
{
return scheme;
}
-DOMString URI::getSchemeStr()
+/**
+ * Returns a copy of the stored scheme string (empty after init()).
+ */
+DOMString URI::getSchemeStr() const
{
return schemeStr;
}
-DOMString URI::getAuthority()
+/**
+ * Returns the authority as text, followed by ":<port>" when a port was given
+ * explicitly in the parsed URI and is not negative.
+ *
+ * @return the rendered authority, with the optional port suffix
+ */
+DOMString URI::getAuthority() const
{
- return authority;
+    DOMString ret = toStr(authority);
+    if (portSpecified && port>=0)
+    {
+        // ':' plus at most 10 decimal digits of a non-negative int plus the
+        // terminator fit in 16 bytes. No field width is used: padding would
+        // put blanks between the ':' and the digits.
+        char buf[16];
+        snprintf(buf, sizeof(buf), ":%d", port);
+        ret.append(buf);
+    }
+    return ret;
}
+/**
+ * Returns the authority rendered as text, without any port suffix.
+ */
+DOMString URI::getHost() const
+{
+    return toStr(authority);
+}
-DOMString URI::getPath()
+/**
+ * Returns the stored port number (0 after init()).
+ */
+int URI::getPort() const
{
- return path;
+ return port;
}
-bool URI::getIsAbsolute()
+/**
+ * Returns the path component rendered as text.
+ */
+DOMString URI::getPath() const
+{
+    return toStr(path);
+}
+
+/**
+ * Returns the path in the form used by the host platform.
+ *
+ * When __WIN32__ is defined, a leading '/' in front of a "<letter>:" prefix
+ * is skipped and every '/' becomes a backslash. Otherwise the rendered path
+ * is returned unchanged.
+ */
+DOMString URI::getNativePath() const
+{
+    DOMString pathStr = toStr(path);
+    DOMString npath;
+#ifdef __WIN32__
+    // "/C:..." : start copying after the leading slash
+    unsigned int firstChar = 0;
+    if (pathStr.size() >= 3 &&
+        pathStr[0] == '/' &&
+        uni_is_letter(pathStr[1]) &&
+        pathStr[2] == ':')
+    {
+        firstChar = 1;
+    }
+    for (unsigned int idx = firstChar ; idx < pathStr.size() ; idx++)
+    {
+        XMLCh ch = (XMLCh) pathStr[idx];
+        npath.push_back(ch == '/' ? (XMLCh)'\\' : ch);
+    }
+#else
+    npath = pathStr;
+#endif
+    return npath;
+}
+
+
+/**
+ * Returns the stored `absolute` flag (false after init()).
+ */
+bool URI::isAbsolute() const
{
return absolute;
}
+/**
+ * Returns the stored `opaque` flag (false after init()).
+ */
+bool URI::isOpaque() const
+{
+ return opaque;
+}
+
+
+/**
+ * Returns the query component rendered as text.
+ */
+DOMString URI::getQuery() const
+{
+    return toStr(query);
+}
+
+
+/**
+ * Returns the fragment component rendered as text.
+ */
+DOMString URI::getFragment() const
+{
+    return toStr(fragment);
+}
+
+
+
+
+/**
+ * Forward search: returns the index of the first element equal to ch at or
+ * after startpos, or -1 when there is none.
+ */
+static int find(const std::vector<int> &str, int ch, int startpos)
+{
+    unsigned int idx = startpos;
+    while (idx < str.size())
+    {
+        if (str[idx] == ch)
+            return idx;
+        ++idx;
+    }
+    return -1;
+}
+
+
+/**
+ * Backward search: returns the index of the last element equal to ch, or -1
+ * when there is none.
+ */
+static int findLast(const std::vector<int> &str, int ch)
+{
+    // The index is deliberately a signed int: with an unsigned counter the
+    // ">= 0" loop condition could never become false.
+    int idx = (int)str.size();
+    while (--idx >= 0)
+    {
+        if (str[idx] == ch)
+            return idx;
+    }
+    return -1;
+}
+
+
+/**
+ * String equality between a code vector and a NUL-terminated key.
+ *
+ * Returns true only when str and key have the same length and agree at
+ * every position. The key cursor advances together with the vector index,
+ * and the key must be exhausted when the vector is, so ".." is not equal to
+ * "." and an empty vector is equal only to the empty key.
+ *
+ * @param str the code vector to test
+ * @param key the NUL-terminated text to compare against
+ * @return true when both hold exactly the same characters
+ */
+static bool sequ(const std::vector<int> &str, const char *key)
+{
+    const char *c = key;
+    for (unsigned int i=0 ; i<str.size() ; i++)
+    {
+        if (! (*c))          // key is shorter than str
+            return false;
+        if (*c != str[i])
+            return false;
+        c++;
+    }
+    return *c == '\0';       // false when key is longer than str
+}
+
+
+/**
+ * Returns up to len elements of str starting at startpos. Copying stops
+ * early when the end of str is reached; a start position at or past the end
+ * yields an empty vector.
+ */
+static std::vector<int> substr(const std::vector<int> &str,
+                               int startpos, int len)
+{
+    std::vector<int> out;
+    unsigned int cursor = startpos;
+    int remaining = len;
+    while (remaining-- > 0 && cursor < str.size())
+        out.push_back(str[cursor++]);
+    return out;
+}
+
-DOMString URI::getQuery()
+/**
+ * Resolves the reference `other` against this URI, which acts as the base.
+ *
+ * 1. If this URI is opaque or `other` is absolute, `other` is returned as is.
+ * 2. If `other` consists of a fragment only, the result is a copy of this
+ *    URI carrying that fragment.
+ * 3. Otherwise a new URI is built from this URI's scheme and `other`'s query
+ *    and fragment. The authority (with its port) comes from `other` when it
+ *    has one and is inherited from this URI when it does not. The path is
+ *    `other`'s path, or -- for a relative path -- this URI's path up to and
+ *    including its last '/' followed by `other`'s path. The result is
+ *    normalized before it is returned.
+ *
+ * @param other the reference to resolve
+ * @return the resolved URI
+ */
+URI URI::resolve(const URI &other) const
{
- return query;
+    //### According to w3c, this is handled in 3 cases
+
+    //## 1
+    if (opaque || other.isAbsolute())
+        return other;
+
+    //## 2
+    if (other.fragment.size()  >  0 &&
+        other.path.size()      == 0 &&
+        other.scheme           == SCHEME_NONE &&
+        other.authority.size() == 0 &&
+        other.query.size()     == 0 )
+    {
+        URI fragUri = *this;
+        fragUri.fragment = other.fragment;
+        return fragUri;
+    }
+
+    //## 3 http://www.ietf.org/rfc/rfc2396.txt, section 5.2
+    URI newUri;
+    //# 3.1
+    newUri.scheme    = scheme;
+    newUri.schemeStr = schemeStr;
+    newUri.query     = other.query;
+    newUri.fragment  = other.fragment;
+    if (other.authority.size() > 0)
+    {
+        //# 3.2
+        if (absolute || other.absolute)
+            newUri.absolute = true;
+        newUri.authority     = other.authority;
+        newUri.port          = other.port;//part of authority
+        newUri.portSpecified = other.portSpecified;
+        newUri.path          = other.path;
+    }
+    else
+    {
+        //# 3.3
+        //# the reference names no authority, so the base's authority
+        //# (and its port) carries over to the result
+        newUri.authority     = authority;
+        newUri.port          = port;
+        newUri.portSpecified = portSpecified;
+        if (other.absolute)
+        {
+            newUri.absolute = true;
+            newUri.path     = other.path;
+        }
+        else
+        {
+            int pos = findLast(path, '/');
+            if (pos >= 0)
+            {
+                newUri.path.clear();
+                //# append my path up to and including the '/'
+                for (int i = 0; i<=pos ; i++)
+                    newUri.path.push_back(path[i]);
+                //# append other path
+                for (unsigned int i = 0; i<other.path.size() ; i++)
+                    newUri.path.push_back(other.path[i]);
+            }
+            else
+                newUri.path = other.path;
+        }
+    }
+
+    newUri.normalize();
+
+    return newUri;
}
-DOMString URI::getFragment()
+/**
+ * This follows the Java URI algorithm:
+ * 1. All "." segments are removed.
+ * 2. If a ".." segment is preceded by a non-".." segment
+ * then both of these segments are removed. This step
+ * is repeated until it is no longer applicable.
+ * 3. If the path is relative, and if its first segment
+ * contains a colon character (':'), then a "." segment
+ * is prepended. This prevents a relative URI with a path
+ * such as "a:b/c/d" from later being re-parsed as an
+ * opaque URI with a scheme of "a" and a scheme-specific
+ * part of "b/c/d". (Deviation from RFC 2396)
+ *
+ * NOTE(review): the body below carries out steps 1 and 2 only; nothing in
+ * it inspects the first segment for ':' (step 3) -- confirm whether that
+ * is intended.
+ * The path is rewritten only when at least one segment was removed. In that
+ * case segments are joined with single '/' characters, so empty segments
+ * and a trailing '/' of the input are not reproduced.
+ */
+void URI::normalize()
{
- return fragment;
+ std::vector< std::vector<int> > segments;
+
+ //## Collect segments
+ // a path shorter than two characters is left untouched
+ if (path.size()<2)
+ return;
+ bool abs = false;
+ int pos=0;
+ int len = (int) path.size();
+
+ // remember a leading '/' and start scanning after it
+ if (path[0]=='/')
+ {
+ abs = true;
+ pos++;
+ }
+
+ while (pos < len)
+ {
+ int pos2 = find(path, '/', pos);
+ // no further '/': the remainder of the path is the last segment
+ if (pos2 < 0)
+ {
+ std::vector<int> seg = substr(path, pos, path.size()-pos);
+ //printf("last segment:%s\n", toStr(seg).c_str());
+ segments.push_back(seg);
+ break;
+ }
+ // pos2 == pos means two adjacent '/' characters; no segment is recorded for them
+ if (pos2>pos)
+ {
+ std::vector<int> seg = substr(path, pos, pos2-pos);
+ //printf("segment:%s\n", toStr(seg).c_str());
+ segments.push_back(seg);
+ }
+ pos = pos2;
+ pos++;
+ }
+
+ //## Clean up (normalize) segments
+ bool edited = false;
+ std::vector< std::vector<int> >::iterator iter;
+ // no increment in the loop header: erase() already returns the next position
+ for (iter=segments.begin() ; iter!=segments.end() ; )
+ {
+ std::vector<int> s = *iter;
+ if (sequ(s,"."))
+ {
+ iter = segments.erase(iter);
+ edited = true;
+ }
+ // ".." is dropped together with its predecessor, unless it is the
+ // first segment or the predecessor is itself ".."
+ else if (sequ(s, "..") && iter != segments.begin() &&
+ !sequ(*(iter-1), ".."))
+ {
+ iter--; //back up, then erase two entries
+ iter = segments.erase(iter);
+ iter = segments.erase(iter);
+ edited = true;
+ }
+ else
+ iter++;
+ }
+
+ //## Rebuild path, if necessary
+ if (edited)
+ {
+ path.clear();
+ if (abs)
+ {
+ path.push_back('/');
+ }
+ std::vector< std::vector<int> >::iterator iter;
+ for (iter=segments.begin() ; iter!=segments.end() ; iter++)
+ {
+ // separator between segments only, never before the first one
+ if (iter != segments.begin())
+ path.push_back('/');
+ std::vector<int> seg = *iter;
+ for (unsigned int i = 0; i<seg.size() ; i++)
+ path.push_back(seg[i]);
+ }
+ }
+
}
-int URI::match(int p0, char *key)
+/**
+ * Tests whether the parse buffer contains `key` starting at position p0.
+ *
+ * @param p0  position in the parse buffer where the comparison starts
+ * @param key NUL-terminated text to look for
+ * @return the position just past the matched text when all of `key` is
+ *         present at p0, otherwise p0 itself. A key that ends exactly at
+ *         the end of the buffer counts as a match.
+ */
+int URI::match(int p0, char const *key)
{
int p = p0;
while (p < parselen)
{
- if (*key != parsebuf[p])
+ if (*key == '\0')
+ return p;
+ else if (*key != parsebuf[p])
break;
p++; key++;
}
- return p;
+ // Reaching this point with the key exhausted means the buffer ended on the
+ // key's last character. After a mismatch *key is never '\0', so this
+ // cannot turn a failed comparison into a match.
+ if (*key == '\0')
+ return p;
+ return p0;
}
//#########################################################################
//# http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#components
//#########################################################################
+/**
+ * Reads exactly two hexadecimal digits (either case) starting at p0 and
+ * combines them, high nibble first, into one value.
+ *
+ * @param p0     position of the first hex digit
+ * @param result receives the decoded value; left untouched on failure
+ * @return the position after the second digit, or -1 (after reporting an
+ *         error) when either character is not a hex digit
+ */
+int URI::parseHex(int p0, int &result)
+{
+    int pos   = p0;
+    int accum = 0;
+
+    //# Upper 4 bits first, then lower 4
+    for (int nibble = 0 ; nibble < 2 ; nibble++)
+    {
+        int ch = peek(pos);
+        int digit;
+        if (ch >= '0' && ch <= '9')
+            digit = ch - '0';
+        else if (ch >= 'a' && ch <= 'f')
+            digit = 10 + ch - 'a';
+        else if (ch >= 'A' && ch <= 'F')
+            digit = 10 + ch - 'A';
+        else
+        {
+            error("parseHex : unexpected character : %c", ch);
+            return -1;
+        }
+        accum = (accum << 4) + digit;
+        pos++;
+    }
+
+    result = accum;
+    return pos;
+}
+
+
+
+/**
+ * Parses an entity of the form "&#xHH;" (two hex digits) starting at p0.
+ *
+ * @param p0     position where the entity is expected
+ * @param result receives the decoded value on success
+ * @return p0 when the character at p0 is not '&'; the position after the
+ *         ';' on success; -1 (after reporting an error) when the text after
+ *         '&' is not a well-formed "#xHH;" sequence
+ */
+int URI::parseEntity(int p0, int &result)
+{
+    int p = p0;
+    int ch = peek(p);
+    if (ch != '&')
+        return p0;
+    p++;
+    // match() returns a position, not a truth value: it hands back its
+    // start position when the key is absent, so "no progress" is the
+    // failure test.
+    int p2 = match(p, "#x");
+    if (p2 <= p)
+    {
+        error("parseEntity: expected '#x'");
+        return -1;
+    }
+    p = p2;
+    int val;
+    p = parseHex(p, val);
+    if (p<0)
+        return -1;
+    ch = peek(p);
+    if (ch != ';')
+    {
+        error("parseEntity: expected ';'");
+        return -1;
+    }
+    p++;
+    result = val;
+    return p;
+}
+
+/**
+ * Parses a percent escape "%HH" (two hex digits) starting at p0.
+ *
+ * @param p0     position where the escape is expected
+ * @param result receives the decoded value on success
+ * @return p0 when the character at p0 is not '%'; the position after the
+ *         second hex digit on success; -1 when the digits are malformed
+ */
+int URI::parseAsciiEntity(int p0, int &result)
+{
+    if (peek(p0) != '%')
+        return p0;
+
+    int decoded;
+    int next = parseHex(p0 + 1, decoded);
+    if (next < 0)
+        return -1;
+
+    result = decoded;
+    return next;
+}
+
+
int URI::parseScheme(int p0)
{
int p = p0;
{
schemeStr = entry->sval;
scheme = entry->ival;
+ port = entry->port;
p = p2;
return p;
}
if (p2 > p)
{
p = p2;
+ portSpecified = false;
+ DOMString portStr;
while (p < parselen)
{
ch = peek(p);
if (ch == '/')
break;
- authority.push_back(ch);
- p++;
+ else if (ch == '&') //IRI entity
+ {
+ int val;
+ p2 = parseEntity(p, val);
+ if (p2<p)
+ {
+ return -1;
+ }
+ p = p2;
+ authority.push_back((XMLCh)val);
+ }
+ else if (ch == '%') //ascii hex excape
+ {
+ int val;
+ p2 = parseAsciiEntity(p, val);
+ if (p2<p)
+ {
+ return -1;
+ }
+ p = p2;
+ authority.push_back((XMLCh)val);
+ }
+ else if (ch == ':')
+ {
+ portSpecified = true;
+ p++;
+ }
+ else if (portSpecified)
+ {
+ portStr.push_back((XMLCh)ch);
+ p++;
+ }
+ else
+ {
+ authority.push_back((XMLCh)ch);
+ p++;
+ }
+ }
+ if (portStr.size() > 0)
+ {
+ char *pstr = (char *)portStr.c_str();
+ char *endStr;
+ long val = strtol(pstr, &endStr, 10);
+ if (endStr > pstr) //successful parse?
+ port = val;
}
}
//# Are we absolute?
ch = peek(p);
- if (ch == '/')
+ if (uni_is_letter(ch) && peek(p+1)==':')
+ {
+ absolute = true;
+ path.push_back((XMLCh)'/');
+ }
+ else if (ch == '/')
{
absolute = true;
- path.push_back(ch);
+ if (p>p0) //in other words, if '/' is not the first char
+ opaque = true;
+ path.push_back((XMLCh)ch);
p++;
}
ch = peek(p);
if (ch == '?' || ch == '#')
break;
- path.push_back(ch);
- p++;
+ else if (ch == '&') //IRI entity
+ {
+ int val;
+ p2 = parseEntity(p, val);
+ if (p2<p)
+ {
+ return -1;
+ }
+ p = p2;
+ path.push_back((XMLCh)val);
+ }
+ else if (ch == '%') //ascii hex excape
+ {
+ int val;
+ p2 = parseAsciiEntity(p, val);
+ if (p2<p)
+ {
+ return -1;
+ }
+ p = p2;
+ path.push_back((XMLCh)val);
+ }
+ else
+ {
+ path.push_back((XMLCh)ch);
+ p++;
+ }
}
-
+ //trace("path:%s", toStr(path).c_str());
return p;
}
ch = peek(p);
if (ch == '#')
break;
- query.push_back(ch);
+ query.push_back((XMLCh)ch);
p++;
}
{
parselen = str.size();
- DOMString tmp = str;
- parsebuf = (char *) tmp.c_str();
+ parsebuf = new int[str.size()];
+ if (!parsebuf)
+ {
+ error("parse : could not allocate parsebuf");
+ return false;
+ }
+
+ DOMString::const_iterator iter;
+ unsigned int i=0;
+ for (iter= str.begin() ; iter!=str.end() ; iter++)
+ {
+ int ch = *iter;
+ if (ch == '\\')
+ parsebuf[i++] = '/';
+ else
+ parsebuf[i++] = ch;
+ }
int p = parse(0);
+ normalize();
+
+ delete[] parsebuf;
if (p < 0)
{
return false;
}
+ //printf("uri:%s\n", toString().c_str());
+ //printf("parse:%s\n", toStr(path).c_str());
+
return true;
}