diff options
-rw-r--r-- | lib/stdlib/doc/src/Makefile | 1 | ||||
-rw-r--r-- | lib/stdlib/doc/src/ref_man.xml | 1 | ||||
-rw-r--r-- | lib/stdlib/doc/src/specs.xml | 1 | ||||
-rw-r--r-- | lib/stdlib/doc/src/uri_string.xml | 255 | ||||
-rw-r--r-- | lib/stdlib/src/Makefile | 1 | ||||
-rw-r--r-- | lib/stdlib/src/uri_string.erl | 325 |
6 files changed, 584 insertions, 0 deletions
diff --git a/lib/stdlib/doc/src/Makefile b/lib/stdlib/doc/src/Makefile index 93eac8220d..aeed79408b 100644 --- a/lib/stdlib/doc/src/Makefile +++ b/lib/stdlib/doc/src/Makefile @@ -98,6 +98,7 @@ XML_REF3_FILES = \ sys.xml \ timer.xml \ unicode.xml \ + uri_string.xml \ win32reg.xml \ zip.xml diff --git a/lib/stdlib/doc/src/ref_man.xml b/lib/stdlib/doc/src/ref_man.xml index 878a3babc5..68bfddbc71 100644 --- a/lib/stdlib/doc/src/ref_man.xml +++ b/lib/stdlib/doc/src/ref_man.xml @@ -93,6 +93,7 @@ <xi:include href="sys.xml"/> <xi:include href="timer.xml"/> <xi:include href="unicode.xml"/> + <xi:include href="uri_string.xml"/> <xi:include href="win32reg.xml"/> <xi:include href="zip.xml"/> </application> diff --git a/lib/stdlib/doc/src/specs.xml b/lib/stdlib/doc/src/specs.xml index 45b207b13d..d559adf9b6 100644 --- a/lib/stdlib/doc/src/specs.xml +++ b/lib/stdlib/doc/src/specs.xml @@ -60,6 +60,7 @@ <xi:include href="../specs/specs_sys.xml"/> <xi:include href="../specs/specs_timer.xml"/> <xi:include href="../specs/specs_unicode.xml"/> + <xi:include href="../specs/specs_uri_string.xml"/> <xi:include href="../specs/specs_win32reg.xml"/> <xi:include href="../specs/specs_zip.xml"/> </specs> diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml new file mode 100644 index 0000000000..e6b2bd5e80 --- /dev/null +++ b/lib/stdlib/doc/src/uri_string.xml @@ -0,0 +1,255 @@ +<?xml version="1.0" encoding="utf-8" ?> +<!DOCTYPE erlref SYSTEM "erlref.dtd"> + +<erlref> + <header> + <copyright> + <year>2017</year><year>2017</year> + <holder>Ericsson AB. All Rights Reserved.</holder> + </copyright> + <legalnotice> + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + </legalnotice> + + <title>maps</title> + <prepared>Péter Dimitrov</prepared> + <docno>1</docno> + <date>2017-08-23</date> + <rev>A</rev> + </header> + <module>uri_string</module> + <modulesummary>RFC 3986 compliant URI processing functions.</modulesummary> + <description> + <p>This module contains functions for parsing and handling RFC 3986 compliant URIs.</p> + <p>A URI is an identifier consisting of a sequence of characters matching the syntax + rule named <em>URI</em> in <em>RFC 3986</em>.</p> + <p> The generic URI syntax consists of a hierarchical sequence of components referred + to as the scheme, authority, path, query, and fragment:<pre> + URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + hier-part = "//" authority path-abempty + / path-absolute + / path-rootless + / path-empty + scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + authority = [ userinfo "@" ] host [ ":" port ] + userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + + reserved = gen-delims / sub-delims + gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" + sub-delims = "!" / "$" / "&" / "'" / "(" / ")" + / "*" / "+" / "," / ";" / "=" + + unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + </pre><br></br> + </p> + <p>The interpretation of a URI depends only on the characters used and not on how those + characters are represented in a network protocol.</p> + <p>The functions implemented by this module covers the following use cases: + <list type="bulleted"> + <item>Parsing URIs<br></br> + <c>parse/1</c></item> + <item>Recomposing URIs<br></br> + <c>recompose/2</c></item> + <item>Resolving URI references<br></br> + <c>resolve_uri_reference/3</c></item> + <item>Creating URI references<br></br> + <c>create_uri_reference/3</c></item> + <item>Normalizing URIs<br></br> + <c>normalize/1</c></item> + <item>Transcoding URIs<br></br> + <c>transcode/2</c></item> + <item>Working with urlencoded query strings<br></br> + <c>compose_query/1, dissect_query/1</c></item> + </list> + </p> + <p>There are four different encodings present during the handling of URIs: + <list type="bulleted"> + <item>Inbound binary encoding in binaries</item> + <item>Inbound percent-encoding in lists and binaries</item> + <item>Outbound binary encoding in binaries</item> + <item>Outbound percent-encoding in lists and binaries</item> + </list> + </p> + <p>Unless otherwise specified the return value type and encoding are the same as the input + type and encoding. That is, binary input returns binary output, list input returns a list + output but mixed input returns list output. Input and output encodings are the same except + for <c>transcode/2</c>.</p> + <p>All of the functions but <c>transcode/2</c> expects input as unicode codepoints in + lists, UTF-8 encoding in binaries and UTF-8 encoding in percent-encoded URI parts. + <c>transcode/2</c> provides the means to convert between the supported URI encodings.</p> + </description> + + <datatypes> + <datatype> + <name name="bytelist"/> + <desc> + <p>Maybe improper list of bytes (0..255).</p> + </desc> + </datatype> + <datatype> + <name name="uri_map"/> + <desc> + <p>URI map holding the main components of a URI.</p> + </desc> + </datatype> + <datatype> + <name name="uri_string"/> + <desc> + <p>List of unicode codepoints, UTF-8 encoded binary, or a mix of the two, + representing an RFC 3986 compliant URI (<em>percent-encoded form</em>). + A URI is a sequence of characters from a very limited set: the letters of + the basic Latin alphabet, digits, and a few special characters.</p> + </desc> + </datatype> + </datatypes> + + <funcs> + + <func> + <name name="compose_query" arity="1"/> + <fsummary>Compose urlencoded query string.</fsummary> + <desc> + <p>Composes an urlencoded <c><anno>QueryString</anno></c> based on a + <c><anno>QueryList</anno></c>, a list of unescaped key-value pairs. + Media type <c>application/x-www-form-urlencoded</c> is defined in section + 8.2.1 of <c>RFC 1866</c> (HTML 2.0). + </p> + <p>If an argument is invalid, a <c>badarg</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>uri_string:compose_query(...).</input> +</pre> + </desc> + </func> + + <func> + <name name="create_uri_reference" arity="2"/> + <fsummary>Create references.</fsummary> + <desc> + <p>Creates an RFC 3986 compliant <c><anno>RelativeDestURI</anno></c>, + based <c><anno>AbsoluteSourceURI</anno></c> and <c><anno>AbsoluteSourceURI</anno></c> + </p> + <p>If an argument is invalid, a <c>badarg</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>uri_string:create_uri_reference(...,...).</input> +</pre> + </desc> + </func> + + <func> + <name name="dissect_query" arity="1"/> + <fsummary>Dissect query string.</fsummary> + <desc> + <p>Dissects an urlencoded <c><anno>QueryString</anno></c> and returns a + <c><anno>QueryList</anno></c>, a list of unescaped key-value pairs. + Media type <c>application/x-www-form-urlencoded</c> is defined in section + 8.2.1 of <c>RFC 1866</c> (HTML 2.0). + </p> + <p>If an argument is invalid, a <c>badarg</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>uri_string:dissect_query(...).</input> +</pre> + </desc> + </func> + + <func> + <name name="normalize" arity="1"/> + <fsummary>Normalize URI.</fsummary> + <desc> + <p>Normalizes an RFC 3986 compliant <c><anno>URIString</anno></c> and returns + a <c><anno>NormalizedURI</anno></c>. The algorithm used to shorten the input + URI is called Syntax-Based Normalization and described at + <c>Section 6.2.2 of RFC 3986</c>. + </p> + <p>If an argument is invalid, a <c>badarg</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>uri_string:normalize("http://example.org/one/two/../../one").</input> +"http://example.org/one" +</pre> + </desc> + </func> + + <func> + <name name="parse" arity="1"/> + <fsummary>Parse URI into a map.</fsummary> + <desc> + <p>Returns a <c>URIMap</c>, that is a <em>uri_map()</em> with the parsed components + of the <c><anno>URIString</anno></c>.</p> + <p>If parsing fails, a <c>parse_error</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>uri_string:parse("foo://[email protected]:8042/over/there?name=ferret#nose").</input> +#{fragment => "nose",host => "example.com", + path => "/over/there",port => 8042,query => "name=ferret", + scheme => foo,userinfo => "user"} +2> </pre> + </desc> + </func> + + <func> + <name name="recompose" arity="1"/> + <fsummary>Recompose URI.</fsummary> + <desc> + <p>Returns an RFC 3986 compliant <c><anno>URIString</anno></c> (percent-encoded).</p> + <p>If the <c><anno>URIMap</anno></c> is invalid, a <c>badarg</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>URIMap = #{fragment => "nose", host => "example.com", path => "/over/there",</input> +port => 8042, query => "name=ferret", scheme => foo, userinfo => "user"}. +#{fragment => "top",host => "example.com", + path => "/over/there",port => 8042,query => "?name=ferret", + scheme => foo,userinfo => "user"} + +2> <input>uri_string:recompose(URIMap, []).</input> +"foo://example.com:8042/over/there?name=ferret#nose"</pre> + </desc> + </func> + + <func> + <name name="resolve_uri_reference" arity="2"/> + <fsummary>Resolve URI reference.</fsummary> + <desc> + <p>Resolves an RFC 3986 compliant <c><anno>RelativeURI</anno></c>, + based <c><anno>AbsoluteBaseURI</anno></c> and returns a new absolute URI + (<c><anno>AbsoluteDestURI</anno></c>).</p> + <p>If an argument is invalid, a <c>badarg</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>uri_string:resolve_uri_reference(...,...).</input> +</pre> + </desc> + </func> + + <func> + <name name="transcode" arity="2"/> + <fsummary>Transcode URI.</fsummary> + <desc> + <p>Transcodes an RFC 3986 compliant <c><anno>URIString</anno></c>, + where <c><anno>Options</anno></c> is a list of tagged tuples, specifying the inbound + (<c>in_encoding</c>) and outbound (<c>out_encoding</c>) encodings.</p> + <p>If an argument is invalid, a <c>badarg</c> exception is raised.</p> + <p><em>Example:</em></p> + <pre> +1> <input>uri_string:transcode(<<"foo://f%20oo">>, [{in_encoding, utf8},</input> +{out_encoding, utf16}]). +<<0,102,0,111,0,111,0,58,0,47,0,47,0,102,0,37,0,48,0,48,0,37,0,50,0,48,0, + 111,0,111>> +</pre> + </desc> + </func> + + </funcs> +</erlref> diff --git a/lib/stdlib/src/Makefile b/lib/stdlib/src/Makefile index bf836203ec..8b156929d7 100644 --- a/lib/stdlib/src/Makefile +++ b/lib/stdlib/src/Makefile @@ -121,6 +121,7 @@ MODULES= \ timer \ unicode \ unicode_util \ + uri_string \ win32reg \ zip diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl new file mode 100644 index 0000000000..2c10c34f39 --- /dev/null +++ b/lib/stdlib/src/uri_string.erl @@ -0,0 +1,325 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +%% +%% [RFC 3986, Chapter 2.2. Reserved Characters] +%% +%% reserved = gen-delims / sub-delims +%% +%% gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +%% +%% sub-delims = "!" / "$" / "&" / "'" / "(" / ")" +%% / "*" / "+" / "," / ";" / "=" +%% +%% +%% [RFC 3986, Chapter 2.3. Unreserved Characters] +%% +%% unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +%% +%% +%% [RFC 3986, Chapter 3. Syntax Components] +%% +%% The generic URI syntax consists of a hierarchical sequence of +%% components referred to as the scheme, authority, path, query, and +%% fragment. +%% +%% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] +%% +%% hier-part = "//" authority path-abempty +%% / path-absolute +%% / path-rootless +%% / path-empty +%% +%% The scheme and path components are required, though the path may be +%% empty (no characters). When authority is present, the path must +%% either be empty or begin with a slash ("/") character. When +%% authority is not present, the path cannot begin with two slash +%% characters ("//"). These restrictions result in five different ABNF +%% rules for a path (Section 3.3), only one of which will match any +%% given URI reference. +%% +%% The following are two example URIs and their component parts: +%% +%% foo://example.com:8042/over/there?name=ferret#nose +%% \_/ \______________/\_________/ \_________/ \__/ +%% | | | | | +%% scheme authority path query fragment +%% | _____________________|__ +%% / \ / \ +%% urn:example:animal:ferret:nose +%% +%% +%% [RFC 3986, Chapter 3.1. Scheme] +%% +%% Each URI begins with a scheme name that refers to a specification for +%% assigning identifiers within that scheme. +%% +%% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) +%% +%% +%% [RFC 3986, Chapter 3.2. Authority] +%% +%% Many URI schemes include a hierarchical element for a naming +%% authority so that governance of the name space defined by the +%% remainder of the URI is delegated to that authority (which may, in +%% turn, delegate it further). +%% +%% authority = [ userinfo "@" ] host [ ":" port ] +%% +%% +%% [RFC 3986, Chapter 3.2.1. User Information] +%% +%% The userinfo subcomponent may consist of a user name and, optionally, +%% scheme-specific information about how to gain authorization to access +%% the resource. The user information, if present, is followed by a +%% commercial at-sign ("@") that delimits it from the host. +%% +%% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +%% +%% +%% [RFC 3986, Chapter 3.2.2. Host] +%% +%% The host subcomponent of authority is identified by an IP literal +%% encapsulated within square brackets, an IPv4 address in dotted- +%% decimal form, or a registered name. +%% +%% host = IP-literal / IPv4address / reg-name +%% +%% IP-literal = "[" ( IPv6address / IPvFuture ) "]" +%% +%% IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) +%% +%% IPv6address = 6( h16 ":" ) ls32 +%% / "::" 5( h16 ":" ) ls32 +%% / [ h16 ] "::" 4( h16 ":" ) ls32 +%% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 +%% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 +%% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 +%% / [ *4( h16 ":" ) h16 ] "::" ls32 +%% / [ *5( h16 ":" ) h16 ] "::" h16 +%% / [ *6( h16 ":" ) h16 ] "::" +%% +%% ls32 = ( h16 ":" h16 ) / IPv4address +%% ; least-significant 32 bits of address +%% +%% h16 = 1*4HEXDIG +%% ; 16 bits of address represented in hexadecimal +%% +%% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +%% +%% dec-octet = DIGIT ; 0-9 +%% / %x31-39 DIGIT ; 10-99 +%% / "1" 2DIGIT ; 100-199 +%% / "2" %x30-34 DIGIT ; 200-249 +%% / "25" %x30-35 ; 250-255 +%% +%% reg-name = *( unreserved / pct-encoded / sub-delims ) +%% +%% +%% [RFC 3986, Chapter 3.2.2. Port] +%% +%% The port subcomponent of authority is designated by an optional port +%% number in decimal following the host and delimited from it by a +%% single colon (":") character. +%% +%% port = *DIGIT +%% +%% +%% [RFC 3986, Chapter 3.3. Path] +%% +%% The path component contains data, usually organized in hierarchical +%% form, that, along with data in the non-hierarchical query component +%% (Section 3.4), serves to identify a resource within the scope of the +%% URI's scheme and naming authority (if any). The path is terminated +%% by the first question mark ("?") or number sign ("#") character, or +%% by the end of the URI. +%% +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% +%% path-abempty = *( "/" segment ) +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] +%% path-noscheme = segment-nz-nc *( "/" segment ) +%% path-rootless = segment-nz *( "/" segment ) +%% path-empty = 0<pchar> +%% segment = *pchar +%% segment-nz = 1*pchar +%% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) +%% ; non-zero-length segment without any colon ":" +%% +%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +%% +%% +%% [RFC 3986, Chapter 3.4. Query] +%% +%% The query component contains non-hierarchical data that, along with +%% data in the path component (Section 3.3), serves to identify a +%% resource within the scope of the URI's scheme and naming authority +%% (if any). The query component is indicated by the first question +%% mark ("?") character and terminated by a number sign ("#") character +%% or by the end of the URI. +%% +%% query = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 3.5. Fragment] +%% +%% The fragment identifier component of a URI allows indirect +%% identification of a secondary resource by reference to a primary +%% resource and additional identifying information. +%% +%% fragment = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 4.1. URI Reference] +%% +%% URI-reference is used to denote the most common usage of a resource +%% identifier. +%% +%% URI-reference = URI / relative-ref +%% +%% +%% [RFC 3986, Chapter 4.2. Relative Reference] +%% +%% A relative reference takes advantage of the hierarchical syntax +%% (Section 1.2.3) to express a URI reference relative to the name space +%% of another hierarchical URI. +%% +%% relative-ref = relative-part [ "?" query ] [ "#" fragment ] +%% +%% relative-part = "//" authority path-abempty +%% / path-absolute +%% / path-noscheme +%% / path-empty +%% +%% +%% [RFC 3986, Chapter 4.3. Absolute URI] +%% +%% Some protocol elements allow only the absolute form of a URI without +%% a fragment identifier. For example, defining a base URI for later +%% use by relative references calls for an absolute-URI syntax rule that +%% does not allow a fragment. +%% +%% absolute-URI = scheme ":" hier-part [ "?" query ] +%% + +-module(uri_string). + + +-export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, + parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export_type([uri_map/0, uri_string/0, bytelist/0]). + + +%%%========================================================================= +%%% API +%%%========================================================================= + + +-type bytelist() :: maybe_improper_list( + 0..255 | + binary() | bytelist(), + binary() | []). + +%% URI compliant with RFC 3986 +%% ASCII %x21 - %x7A ("!" - "z") except +%% %x34 " double quote +%% %x60 < less than +%% %x62 > greater than +%% %x92 \ backslash +%% %x94 ^ caret / circumflex +%% %x96 ` grave / accent +-type uri_string() :: bytelist() | binary(). + + +%% RFC 3986, Chapter 3. Syntax Components +-type uri_map() :: + #{fragment := unicode:chardata(), + host := unicode:chardata(), + path := unicode:chardata(), + port := non_neg_integer(), + query := unicode:chardata(), + scheme := atom(), + userinfo := unicode:chardata()}. + +%% Parse URIs +-spec parse(URIString) -> URIMap when + URIString :: uri_string(), + URIMap :: uri_map(). +parse(_) -> + ok. + +%% Recompose URIs +-spec recompose(URIMap) -> URIString when + URIMap :: uri_map(), + URIString :: uri_string(). +recompose(_) -> + ok. + +%% Resolve references +-spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when + RelativeURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + AbsoluteDestURI :: uri_string(). +resolve_uri_reference(_,_) -> + ok. + +%% Create references +-spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when + AbsoluteSourceURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + RelativeDestURI :: uri_string(). +create_uri_reference(_,_) -> + ok. + +%% Normalize URIs +-spec normalize(URIString) -> NormalizedURI when + URIString :: uri_string(), + NormalizedURI :: uri_string(). +normalize(_) -> + ok. + +%% Transcode URIs +-spec transcode(URIString, Options) -> URIString when + URIString :: uri_string(), + Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. +transcode(_, _) -> + ok. + + +%% Working with query strings +%% HTML 2.0 - application/x-www-form-urlencoded +%% RFC 1866 [8.2.1] + +%% Compose urlencoded query string from a list of unescaped key/value pairs. +-spec compose_query(QueryList) -> QueryString when + QueryList :: [{unicode:chardata(), unicode:chardata()}], + QueryString :: uri_string(). +compose_query(_) -> + ok. + +%% Dissect a query string into a list of unescaped key/value pairs. +-spec dissect_query(QueryString) -> QueryList when + QueryString :: uri_string(), + QueryList :: [{unicode:chardata(), unicode:chardata()}]. +dissect_query(_) -> + ok. |