aboutsummaryrefslogtreecommitdiffstats
path: root/lib/xmerl/src/xmerl_uri.erl
diff options
context:
space:
mode:
Diffstat (limited to 'lib/xmerl/src/xmerl_uri.erl')
-rw-r--r--lib/xmerl/src/xmerl_uri.erl478
1 files changed, 478 insertions, 0 deletions
diff --git a/lib/xmerl/src/xmerl_uri.erl b/lib/xmerl/src/xmerl_uri.erl
new file mode 100644
index 0000000000..d8edb2e6e1
--- /dev/null
+++ b/lib/xmerl/src/xmerl_uri.erl
@@ -0,0 +1,478 @@
+%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2005-2009. All Rights Reserved.
+%%
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved online at http://www.erlang.org/.
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% %CopyrightEnd%
+%%
+
+
+-module(xmerl_uri).
+
+
+-export([parse/1,resolve/2
+% scan_abspath/1
+ ]).
+
+
+%%% Parse URI and return {Scheme,Path}
+%%% Note that Scheme specific parsing/validation is not handled here!
+resolve(_Root,_Rel) ->
+ ok.
+
+%%% See "http://www.isi.edu/in-notes/iana/assignments/url-schemes" for a list of
+%%% defined URL schemes and references to its sources.
+
+parse(URI) ->
+ case parse_scheme(URI) of
+ {http,Cont} -> parse_http(Cont,http);
+ {https,Cont} -> parse_http(Cont,https);
+ {ftp,Cont} -> parse_ftp(Cont,ftp);
+ {sip,Cont} -> parse_sip(Cont,sip);
+ {sips,Cont} -> parse_sip(Cont,sips);
+ {sms,Cont} -> parse_sms(Cont,sms); % Note: From old draft
+ {error,Error} -> {error,Error};
+ {Scheme,Cont} -> {Scheme,Cont}
+ end.
+
+
+%%% Parse the scheme.
+parse_scheme(URI) ->
+ parse_scheme(URI,[]).
+
+parse_scheme([H|URI],Acc) when $a=<H,H=<$z; $A=<H,H=<$Z ->
+ parse_scheme2(URI,[H|Acc]);
+parse_scheme(_,_) ->
+ {error,no_scheme}.
+
+parse_scheme2([H|URI],Acc)
+ when $a=<H,H=<$z; $A=<H,H=<$Z; $0=<H,H=<$9; H==$-;H==$+;H==$. ->
+ parse_scheme2(URI,[H|Acc]);
+parse_scheme2([$:|URI],Acc) ->
+ {list_to_atom(lists:reverse(Acc)),URI};
+parse_scheme2(_,_) ->
+ {error,no_scheme}.
+
+
+%%% ............................................................................
+-define(HTTP_DEFAULT_PORT, 80).
+-define(HTTPS_DEFAULT_PORT, 443).
+
+%%% HTTP (Source RFC 2396, RFC 2616)
+%%% http_URL = "*" | absoluteURI | abs_path [ "?" query ] | authority
+
+%%% http_URL = "http:" "//" host [ ":" port ] [ abs_path [ "?" query ]]
+%%% Returns a tuple {http,Host,Port,Path,Query} where
+%%% Host = string() Host value
+%%% Port = string() Port value
+%%% Path = string() Absolute path
+%%% Query = string() Query or Fragment value
+parse_http("//"++C0,Scheme) ->
+ case scan_hostport(C0,Scheme) of
+ {C1,Host,Port} ->
+ case scan_pathquery(C1) of
+ {error,Error} ->
+ {error,Error};
+ {Path,Query} ->
+ {Scheme,Host,Port,Path,Query}
+ end;
+ {error,Error} ->
+ {error,Error}
+ end;
+parse_http(_,_) ->
+ {error,invalid_url}.
+
+scan_pathquery(C0) ->
+ case scan_abspath(C0) of
+ {error,Error} ->
+ {error,Error};
+ {[],[]} -> % Add implicit path
+ {"/",""};
+ {"?"++C1,Path} ->
+ case scan_query(C1,[]) of
+ {error,Error} ->
+ {error,Error};
+ Query ->
+ {Path,"?"++Query}
+ end;
+ {"#"++C1,Path} -> % 'query' and 'fragment' are both defined as '*uric'
+ case scan_query(C1,[]) of
+ {error,Error} ->
+ {error,Error};
+ Fragment ->
+ {Path,"#"++Fragment}
+ end;
+ {[],Path} ->
+ {Path,""}
+ end.
+
+
+%%% ............................................................................
+-define(FTP_DEFAULT_PORT, 21).
+
+%%% FTP (Source RFC 2396, RFC 1738, RFC 959)
+%%% Encoded :, @, or / characters appearing within the username or
+%%% password fields (as required by RFC 1738) are not handled.
+%%%
+%%% Note: This BNF has been modified to better fit with RFC 2396
+%%% ftp_URL = "ftp:" "//" [ ftp_userinfo "@"] host [ ":" port ] ftp_abs_path
+%%% ftp_userinfo = ftp_user [ ":" ftp_password ]
+%%% ftp_abs_path = "/" ftp_path_segments [ ";type=" ftp_type ]
+%%% ftp_path_segments = ftp_segment *( "/" ftp_segment)
+%%% ftp_segment = *[ ftp_uchar | "?" | ":" | "@" | "&" | "=" ]
+%%% ftp_type = "A" | "I" | "D" | "a" | "i" | "d"
+%%% ftp_user = *[ ftp_uchar | ";" | "?" | "&" | "=" ]
+%%% ftp_password = *[ ftp_uchar | ";" | "?" | "&" | "=" ]
+%%% ftp_uchar = ftp_unreserved | escaped
+%%% ftp_unreserved = alphanum | mark | "$" | "+" | ","
+parse_ftp("//"++C0,Scheme) ->
+ case ftp_userinfo(C0) of
+ {error, Error} ->
+ {error,Error};
+ {C1,Creds} ->
+ case scan_hostport(C1,Scheme) of
+ {C2,Host,Port} ->
+ case scan_abspath(C2) of
+ {error,Error} ->
+ {error,Error};
+ {[],[]} -> % Add implicit path
+ {Scheme,Creds,Host,Port,"/"};
+ {[],Path} ->
+ {Scheme,Creds,Host,Port,Path}
+ end;
+ {error,Error} ->
+ {error,Error}
+ end
+ end.
+
+ftp_userinfo(C0) ->
+ ftp_userinfo(C0, []).
+
+ftp_userinfo([], Acc) ->
+ {lists:reverse(Acc), {"",""}};
+ftp_userinfo(C0=[$/ |_], Acc) ->
+ {lists:reverse(Acc)++C0, {"",""}};
+ftp_userinfo([$@ |C0], Acc) ->
+ {C0, ftp_userinfo_1(lists:reverse(Acc), 0, "", "")};
+ftp_userinfo([C |C0], Acc) ->
+ ftp_userinfo(C0, [C |Acc]).
+
+
+ftp_userinfo_1([], 0, Acc, []) ->
+ { lists:reverse(Acc), ""};
+ftp_userinfo_1([], 1, Acc, User) ->
+ {User, lists:reverse(Acc)};
+ftp_userinfo_1([$:|_], 0, [], []) ->
+ {error,no_user};
+ftp_userinfo_1([$:|C0], 0, Acc,[]) ->
+ ftp_userinfo_1(C0, 1, [], lists:reverse(Acc));
+
+ftp_userinfo_1([C|C0],Stage, Acc, User) ->
+ ftp_userinfo_1(C0,Stage, [C|Acc], User).
+
+
+%%% .........................................................................
+-define(SIP_DEFAULT_PORT, 5060).
+-define(SIPTLS_DEFAULT_PORT, 5061).
+
+%%% SIP (Source RFC 2396, RFC 3261)
+%%% sip_URL = "sip:" [ sip_userinfo "@" ] host [ ":" port ]
+%%% sip_uri-parameters [ sip_headers ]
+%%% sip_userinfo = (sip_user | sip_telephone-subscriber)
+%%% [ ":" sip_password ]
+%%% sip_user = *( unreserved | escaped |
+%%% "&" | "=" | "+" | "$" | "," | ";" | "?" | "/")
+%%% sip_telephone-subscriber = See RFC2806
+%%% sip_password = *( unreserved | escaped |
+%%% "&" | "=" | "+" | "$" | "," )
+%%% sip_uri-parameters = *( ";" sip_uri-parameter )
+%%% sip_uri-parameter = sip_transport-param | sip_user-param |
+%%% sip_method-param | sip_ttl-param |
+%%% sip_maddr-param | sip_lr-param | sip_other-param
+%%% sip_transport-param = "transport=" ( "udp" | "tcp" | "sctp" | "tls" | token)
+%%% sip_user-param = "user=" ( "phone" | "ip" | token)
+%%% sip_method-param = "method=" sip_Method
+%%% sip_ttl-param = "ttl=" sip_ttl
+%%% sip_maddr-param = "maddr=" host
+%%% sip_lr-param = "lr"
+%%% sip_other-param = 1*sip_paramchar [ "=" 1*sip_paramchar ]
+%%% sip_Method = "INVITE" | "ACK" | "OPTIONS" | "BYE" |
+%%% "CANCEL" | "REGISTER" | token
+%%% sip_ttl = 1*3DIGIT ; 0 to 255
+%%% sip_paramchar = sip_param-unreserved | unreserved | escaped
+%%% sip_param-unreserved = "[" | "]" | "/" | ":" | "&" | "+" | "$"
+%%% sip_headers = "?" sip_header *( "&" sip_header )
+%%% sip_header = sip_hname "=" sip_hvalue
+%%% sip_hname = 1*( sip_hnv-unreserved | unreserved | escaped )
+%%% sip_hvalue = *( sip_hnv-unreserved / unreserved / escaped )
+%%% sip_hnv-unreserved = "[" | "]" | "/" | "?" | ":" | "+" | "$"
+
+%%% Note:
+%%% - FIXME: Headers not parsed
+parse_sip(C0,Scheme) ->
+ case string:tokens(C0,"@") of
+ [Userinfo,Hostport] ->
+ {User,Pass}=sip_userinfo(Userinfo),
+ {C1,Host,Port}=scan_hostport(Hostport,Scheme),
+ {C2,Parameters}=scan_parameters(C1),
+ Headers=scan_headers(C2),
+ {Scheme,User,Pass,Host,Port,Parameters,Headers};
+ [Hostport] ->
+ {C1,Host,Port}=scan_hostport(Hostport,Scheme),
+ {C2,Parameters}=scan_parameters(C1),
+ Headers=scan_headers(C2),
+ {Scheme,none,none,Host,Port,Parameters,Headers}
+ end.
+
+%%% FIXME! User can be telephone subscriber
+sip_userinfo(Userinfo) ->
+ case string:tokens(Userinfo,":") of
+ [User,Pass] -> {User,Pass};
+ [User] -> {User,none}
+ end.
+
+scan_parameters(C1) ->
+ ParList=string:tokens(C1,";"),
+ scan_parameters2(ParList,[], []).
+
+%% Is Foo the way to go? This code needs further investigation. (As
+%% does most of this module.) If we decide to keep it!
+scan_parameters2([],Out, Foo) ->
+ {lists:reverse(Foo), lists:reverse(Out)};
+scan_parameters2(["transport"++Val|Rest],Out, Foo) ->
+ scan_parameters2(Rest,[{transport,Val}|Out], Foo);
+scan_parameters2(["user"++Val|Rest],Out, Foo) ->
+ scan_parameters2(Rest,[{user,Val}|Out], Foo);
+scan_parameters2(["method"++Val|Rest],Out, Foo) ->
+ scan_parameters2(Rest,[{method,Val}|Out], Foo);
+scan_parameters2(["ttl"++Val|Rest],Out, Foo) ->
+ scan_parameters2(Rest,[{ttl,Val}|Out], Foo);
+scan_parameters2(["maddr"++Val|Rest],Out, Foo) ->
+ scan_parameters2(Rest,[{maddr,Val}|Out], Foo);
+scan_parameters2(["lr"|Rest],Out, Foo) ->
+ scan_parameters2(Rest,[{lr,""}|Out], Foo);
+scan_parameters2([Other|Rest],Out, Foo) ->
+ scan_parameters2(Rest,[Out], [Other |Foo]).
+
+%%% FIXME!
+scan_headers(C2) ->
+ C2.
+
+%%% ............................................................................
+%%% SMS (Source draft-wilde-sms-uri-01, January 24 2002 and
+%%% draft-allocchio-gstn-01, November 2001)
+%%% The syntax definition for "gstn-phone" is taken from
+%%% [draft-allocchio-gstn-01], allowing global as well as local telephone
+%%% numbers.
+%%% Note: This BNF has been modified to better fit with RFC 2396
+%%% sms_URI = sms ":" 1*( sms-recipient ) [ sms-body ]
+%%% sms-recipient = gstn-phone sms-qualifier
+%%% [ "," sms-recipient ]
+%%% sms-qualifier = *( smsc-qualifier / pid-qualifier )
+%%% smsc-qualifier = ";smsc=" SMSC-sub-addr
+%%% pid-qualifier = ";pid=" PID-sub-addr
+%%% sms-body = ";body=" *urlc
+%%% gstn-phone = ( global-phone / local-phone )
+%%% global-phone = "+" 1*( DIGIT / written-sep )
+%%% local-phone = [ exit-code ] dial-number / exit-code [ dial-number ]
+%%% exit-code = phone-string
+%%% dial-number = phone-string
+%%% subaddr-string = phone-string
+%%% post-dial = phone-string
+%%% phone-string = 1*( DTMF / pause / tonewait / written-sep )
+%%% DTMF = ( DIGIT / "#" / "*" / "A" / "B" / "C" / "D" )
+%%% written-sep = ( "-" / "." )
+%%% pause = "p"
+%%% tonewait = "w"
+parse_sms(Cont,Scheme) ->
+ {Scheme,Cont}.
+
+
+%%% ==========================================================================
+%%% Generic URI parsing. BNF rules from RFC 2396
+
+%%% hostport = host [ ":" port ]
+scan_hostport(C0,Scheme) ->
+ case scan_host(C0) of
+ {error,Error} ->
+ {error,Error};
+ {":"++C1,Host} ->
+ {C2,Port}=scan_port(C1,[]),
+ {C2,Host,list_to_integer(Port)};
+ {C1,Host} when Scheme==http ->
+ {C1,Host,?HTTP_DEFAULT_PORT};
+ {C1,Host} when Scheme==https ->
+ {C1,Host,?HTTPS_DEFAULT_PORT};
+ {C1,Host} when Scheme==ftp ->
+ {C1,Host,?FTP_DEFAULT_PORT};
+ {C1,Host} when Scheme==sip ->
+ {C1,Host,?SIP_DEFAULT_PORT}
+ end.
+
+
+%%% host = hostname | IPv4address | IPv6reference
+%%% hostname = *( domainlabel "." ) toplabel [ "." ]
+%%% domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
+%%% toplabel = alpha | alpha *( alphanum | "-" ) alphanum
+%%% IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
+%%% ipv6reference = "[" IPv6address "]"
+%%% IPv6address = hexpart [ ":" IPv4address ]
+%%% hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
+%%% hexseq = hex4 *( ":" hex4)
+%%% hex4 = 1*4HEXDIG
+
+%%% Note:
+%%% Bitfields are set as follows:
+%%% Bit 0 = 0-9
+%%% Bit 1 = a-f
+%%% Bit 2 = g-z
+-define(BIT1, 1).
+-define(BIT2, 2).
+-define(BIT3, 4).
+
+%%% 1 = DIGIT are only digits
+%%% 3 = HEX are DIGITS + a-f
+%%% 6 = ALPHA are HEX - DIGITS + g-z
+-define(DIGIT, 1).
+-define(HEX, 3).
+-define(ALPHA, 6).
+
+
+scan_host(C0) ->
+ case scan_host2(C0,[],0,[],[]) of
+ {C1,IPv4address,[?DIGIT,?DIGIT,?DIGIT,?DIGIT]} ->
+ {C1,lists:reverse(lists:append(IPv4address))};
+%% {C1,IPv6address,[$[,Hex1,Hex2,Hex3,Hex4,$]]} when Hex1=<?HEX;
+%% Hex2=<?HEX;
+%% Hex3=<?HEX;
+%% Hex4=<?HEX ->
+%% {C1,lists:reverse(lists:append(IPv6address))};
+ {C1,Hostname,[A|_HostF]} ->
+ {C1,lists:reverse(lists:append(Hostname))};
+ _ ->
+ {error,no_host}
+ end.
+
+scan_host2([H|C0],Acc,CurF,Host,HostF) when $0=<H,H=<$9 ->
+ scan_host2(C0,[H|Acc],CurF bor ?BIT1,Host,HostF);
+scan_host2([H|C0],Acc,CurF,Host,HostF) when $a=<H,H=<$z; $A=<H,H=<$Z ->
+ scan_host2(C0,[H|Acc],CurF bor ?ALPHA,Host,HostF);
+scan_host2([$-|C0],Acc,CurF,Host,HostF) when CurF=/=0 ->
+ scan_host2(C0,[$-|Acc],CurF,Host,HostF);
+scan_host2([$.|C0],Acc,CurF,Host,HostF) when CurF=/=0 ->
+ scan_host2(C0,[],0,[".",Acc|Host],[CurF|HostF]);
+scan_host2(C0,Acc,CurF,Host,HostF) ->
+ {C0,[Acc|Host],[CurF|HostF]}.
+
+
+%%% port = *digit
+scan_port([H|C0],Acc) when $0=<H,H=<$9 ->
+ scan_port(C0,[H|Acc]);
+scan_port(C0,Acc) ->
+ {C0,lists:reverse(Acc)}.
+
+%%% abs_path = "/" path_segments
+scan_abspath([]) ->
+ {[],[]};
+scan_abspath("/"++C0) ->
+ scan_pathsegments(C0,["/"]);
+scan_abspath(_) ->
+ {error,no_abspath}.
+
+%%% path_segments = segment *( "/" segment )
+scan_pathsegments(C0,Acc) ->
+ case scan_segment(C0,[]) of
+ {"/"++C1,Segment} ->
+ scan_pathsegments(C1,["/",Segment|Acc]);
+ {C1,Segment} ->
+ {C1,lists:reverse(lists:append([Segment|Acc]))}
+ end.
+
+
+%%% segment = *pchar *( ";" param )
+%%% param = *pchar
+scan_segment(";"++C0,Acc) ->
+ {C1,ParamAcc}=scan_pchars(C0,";"++Acc),
+ scan_segment(C1,ParamAcc);
+scan_segment(C0,Acc) ->
+ case scan_pchars(C0,Acc) of
+ {";"++C1,Segment} ->
+ {C2,ParamAcc}=scan_pchars(C1,";"++Segment),
+ scan_segment(C2,ParamAcc);
+ {C1,Segment} ->
+ {C1,Segment}
+ end.
+
+%%% query = *uric
+%%% uric = reserved | unreserved | escaped
+%%% reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+%%% "$" | "," | "[" | "]"
+%%% unreserved = alphanum | mark
+%%% mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
+%%% "(" | ")"
+%%% escaped = "%" hex hex
+scan_query([],Acc) ->
+ lists:reverse(Acc);
+scan_query([$%,H1,H2|C0],Acc) -> % escaped
+ scan_query([hex2dec(H1)*16+hex2dec(H2)|C0],Acc);
+scan_query([H|C0],Acc) when $a=<H,H=<$z;$A=<H,H=<$Z;$0=<H,H=<$9 -> % alphanum
+ scan_query(C0,[H|Acc]);
+scan_query([H|C0],Acc) when H==$;; H==$/; H==$?; H==$:; H==$@; H==$[; H==$];
+ H==$&; H==$=; H==$+; H==$$; H==$, -> % reserved
+ scan_query(C0,[H|Acc]);
+scan_query([H|C0],Acc) when H==$-; H==$_; H==$.; H==$!; H==$~;
+ H==$*; H==$'; H==$(; H==$) -> % mark
+ scan_query(C0,[H|Acc]);
+scan_query([H|C0],Acc) when 0=<H,H=<127 -> % US ASCII
+ {H1,H2}=dec2hex(H),
+ scan_query(C0,[H2,H1,$%|Acc]);
+scan_query([_H|_C0],_Acc) ->
+ {error,no_query}.
+
+
+%%% pchar = unreserved | escaped |
+%%% ":" | "@" | "&" | "=" | "+" | "$" | ","
+scan_pchars([],Acc) ->
+ {[],Acc};
+scan_pchars([$%,H1,H2|C0],Acc) -> % escaped
+ scan_pchars([hex2dec(H1)*16+hex2dec(H2)|C0],Acc);
+scan_pchars([H|C0],Acc) when $a=<H,H=<$z;$A=<H,H=<$Z;$0=<H,H=<$9 -> % alphanum
+ scan_pchars(C0,[H|Acc]);
+scan_pchars([H|C0],Acc) when H==$-; H==$_; H==$.; H==$!; H==$~;
+ H==$*; H==$'; H==$(; H==$) -> % mark
+ scan_pchars(C0,[H|Acc]);
+scan_pchars([H|C0],Acc) when H==$:; H==$@; H==$&; H==$=; H==$+; H==$$; H==$, ->
+ scan_pchars(C0,[H|Acc]);
+scan_pchars([H|C0],Acc) when 0=<H,H=<127, % US ASCII
+ H=/=$?,H=/=$;,H=/=$/,H=/=$# ->
+ {H1,H2}=dec2hex(H),
+ scan_pchars(C0,[H2,H1,$%|Acc]);
+scan_pchars(C0,Acc) ->
+ {C0,Acc}.
+
+hex2dec(X) when X>=$0,X=<$9 -> X-$0;
+hex2dec(X) when X>=$A,X=<$F -> X-$A+10;
+hex2dec(X) when X>=$a,X=<$f -> X-$a+10.
+
+dec2hex(H) when H<256 ->
+ <<H1:4,H2:4>> = <<H>>,
+ {nibble2hex(H1),nibble2hex(H2)}.
+
+nibble2hex(X) when 0=<X,X=<9 -> X+$0;
+nibble2hex(10) -> $a;
+nibble2hex(11) -> $b;
+nibble2hex(12) -> $c;
+nibble2hex(13) -> $d;
+nibble2hex(14) -> $e;
+nibble2hex(15) -> $f.