From 80feeb36f92a923f57f740c7c28c12bb8b69ec16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 28 Jul 2017 11:04:19 +0200 Subject: stdlib: Add API and doc of uri_string module --- lib/stdlib/doc/src/Makefile | 1 + lib/stdlib/doc/src/ref_man.xml | 1 + lib/stdlib/doc/src/specs.xml | 1 + lib/stdlib/doc/src/uri_string.xml | 255 ++++++++++++++++++++++++++++++ lib/stdlib/src/Makefile | 1 + lib/stdlib/src/uri_string.erl | 325 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 lib/stdlib/doc/src/uri_string.xml create mode 100644 lib/stdlib/src/uri_string.erl (limited to 'lib/stdlib') diff --git a/lib/stdlib/doc/src/Makefile b/lib/stdlib/doc/src/Makefile index 93eac8220d..aeed79408b 100644 --- a/lib/stdlib/doc/src/Makefile +++ b/lib/stdlib/doc/src/Makefile @@ -98,6 +98,7 @@ XML_REF3_FILES = \ sys.xml \ timer.xml \ unicode.xml \ + uri_string.xml \ win32reg.xml \ zip.xml diff --git a/lib/stdlib/doc/src/ref_man.xml b/lib/stdlib/doc/src/ref_man.xml index 878a3babc5..68bfddbc71 100644 --- a/lib/stdlib/doc/src/ref_man.xml +++ b/lib/stdlib/doc/src/ref_man.xml @@ -93,6 +93,7 @@ + diff --git a/lib/stdlib/doc/src/specs.xml b/lib/stdlib/doc/src/specs.xml index 45b207b13d..d559adf9b6 100644 --- a/lib/stdlib/doc/src/specs.xml +++ b/lib/stdlib/doc/src/specs.xml @@ -60,6 +60,7 @@ + diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml new file mode 100644 index 0000000000..e6b2bd5e80 --- /dev/null +++ b/lib/stdlib/doc/src/uri_string.xml @@ -0,0 +1,255 @@ + + + + +
+ + 20172017 + Ericsson AB. All Rights Reserved. + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + uri_string + Péter Dimitrov + 1 + 2017-08-23 + A + 
+ uri_string + RFC 3986 compliant URI processing functions. + +

This module contains functions for parsing and handling RFC 3986 compliant URIs.

+

A URI is an identifier consisting of a sequence of characters matching the syntax + rule named URI in RFC 3986.

+

The generic URI syntax consists of a hierarchical sequence of components referred + to as the scheme, authority, path, query, and fragment:

+    URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+    hier-part   = "//" authority path-abempty
+                   / path-absolute
+                   / path-rootless
+                   / path-empty
+    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+    authority   = [ userinfo "@" ] host [ ":" port ]
+    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
+
+    reserved    = gen-delims / sub-delims
+    gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+    sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
+                / "*" / "+" / "," / ";" / "="
+
+    unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    


+

+

The interpretation of a URI depends only on the characters used and not on how those + characters are represented in a network protocol.

+

The functions implemented by this module cover the following use cases: + + Parsing URIs

+ parse/1
+ Recomposing URIs

+ recompose/1
+ Resolving URI references

+ resolve_uri_reference/2
+ Creating URI references

+ create_uri_reference/2
+ Normalizing URIs

+ normalize/1
+ Transcoding URIs

+ transcode/2
+ Working with urlencoded query strings

+ compose_query/1, dissect_query/1
+
+

+

There are four different encodings present during the handling of URIs: + + Inbound binary encoding in binaries + Inbound percent-encoding in lists and binaries + Outbound binary encoding in binaries + Outbound percent-encoding in lists and binaries + +

+

Unless otherwise specified the return value type and encoding are the same as the input + type and encoding. That is, binary input returns binary output, list input returns a list + output but mixed input returns list output. Input and output encodings are the same except + for transcode/2.

+

All of the functions except transcode/2 expect input as Unicode code points in + lists, UTF-8 encoding in binaries and UTF-8 encoding in percent-encoded URI parts. + transcode/2 provides the means to convert between the supported URI encodings.

+
+ + + + + +

Maybe improper list of bytes (0..255).

+
+
+ + + +

URI map holding the main components of a URI.

+
+
+ + + +

List of unicode codepoints, UTF-8 encoded binary, or a mix of the two, + representing an RFC 3986 compliant URI (percent-encoded form). + A URI is a sequence of characters from a very limited set: the letters of + the basic Latin alphabet, digits, and a few special characters.

+
+
+
+ + + + + + Compose urlencoded query string. + +

Composes a urlencoded QueryString based on a + QueryList, a list of unescaped key-value pairs. + Media type application/x-www-form-urlencoded is defined in Section + 8.2.1 of RFC 1866 (HTML 2.0). +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:compose_query(...).
+
+
+
+ + + + Create references. + +

Creates an RFC 3986 compliant RelativeDestURI, + based on AbsoluteSourceURI and AbsoluteBaseURI. +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:create_uri_reference(...,...).
+
+
+
+ + + + Dissect query string. + +

Dissects a urlencoded QueryString and returns a + QueryList, a list of unescaped key-value pairs. + Media type application/x-www-form-urlencoded is defined in Section + 8.2.1 of RFC 1866 (HTML 2.0). +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:dissect_query(...).
+
+
+
+ + + + Normalize URI. + +

Normalizes an RFC 3986 compliant URIString and returns + a NormalizedURI. The algorithm used to shorten the input + URI is called Syntax-Based Normalization and is described in + Section 6.2.2 of RFC 3986. +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:normalize("http://example.org/one/two/../../one").
+"http://example.org/one"
+
+
+
+ + + + Parse URI into a map. + +

Returns a URIMap, that is a uri_map() with the parsed components + of the URIString.

+

If parsing fails, a parse_error exception is raised.

+

Example:

+
+1> uri_string:parse("foo://user@example.com:8042/over/there?name=ferret#nose").
+#{fragment => "nose",host => "example.com",
+  path => "/over/there",port => 8042,query => "name=ferret",
+  scheme => foo,userinfo => "user"}
+2> 
+
+
+ + + + Recompose URI. + +

Returns an RFC 3986 compliant URIString (percent-encoded).

+

If the URIMap is invalid, a badarg exception is raised.

+

Example:

+
+1> URIMap = #{fragment => "nose", host => "example.com", path => "/over/there",
+port => 8042, query => "name=ferret", scheme => foo, userinfo => "user"}.
+#{fragment => "nose",host => "example.com",
+  path => "/over/there",port => 8042,query => "name=ferret",
+  scheme => foo,userinfo => "user"}
+
+2> uri_string:recompose(URIMap).
+"foo://user@example.com:8042/over/there?name=ferret#nose"
+
+
+ + + + Resolve URI reference. + +

Resolves an RFC 3986 compliant RelativeURI, + based on AbsoluteBaseURI, and returns a new absolute URI + (AbsoluteDestURI).

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:resolve_uri_reference(...,...).
+
+
+
+ + + + Transcode URI. + +

Transcodes an RFC 3986 compliant URIString, + where Options is a list of tagged tuples, specifying the inbound + (in_encoding) and outbound (out_encoding) encodings.

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:transcode(<<"foo://f%20oo">>, [{in_encoding, utf8},
+{out_encoding, utf16}]).
+<<0,102,0,111,0,111,0,58,0,47,0,47,0,102,0,37,0,48,0,48,0,37,0,50,0,48,0,
+  111,0,111>>
+
+
+
+ +
+
diff --git a/lib/stdlib/src/Makefile b/lib/stdlib/src/Makefile index bf836203ec..8b156929d7 100644 --- a/lib/stdlib/src/Makefile +++ b/lib/stdlib/src/Makefile @@ -121,6 +121,7 @@ MODULES= \ timer \ unicode \ unicode_util \ + uri_string \ win32reg \ zip diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl new file mode 100644 index 0000000000..2c10c34f39 --- /dev/null +++ b/lib/stdlib/src/uri_string.erl @@ -0,0 +1,325 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +%% +%% [RFC 3986, Chapter 2.2. Reserved Characters] +%% +%% reserved = gen-delims / sub-delims +%% +%% gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +%% +%% sub-delims = "!" / "$" / "&" / "'" / "(" / ")" +%% / "*" / "+" / "," / ";" / "=" +%% +%% +%% [RFC 3986, Chapter 2.3. Unreserved Characters] +%% +%% unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +%% +%% +%% [RFC 3986, Chapter 3. Syntax Components] +%% +%% The generic URI syntax consists of a hierarchical sequence of +%% components referred to as the scheme, authority, path, query, and +%% fragment. +%% +%% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] +%% +%% hier-part = "//" authority path-abempty +%% / path-absolute +%% / path-rootless +%% / path-empty +%% +%% The scheme and path components are required, though the path may be +%% empty (no characters). 
When authority is present, the path must +%% either be empty or begin with a slash ("/") character. When +%% authority is not present, the path cannot begin with two slash +%% characters ("//"). These restrictions result in five different ABNF +%% rules for a path (Section 3.3), only one of which will match any +%% given URI reference. +%% +%% The following are two example URIs and their component parts: +%% +%% foo://example.com:8042/over/there?name=ferret#nose +%% \_/ \______________/\_________/ \_________/ \__/ +%% | | | | | +%% scheme authority path query fragment +%% | _____________________|__ +%% / \ / \ +%% urn:example:animal:ferret:nose +%% +%% +%% [RFC 3986, Chapter 3.1. Scheme] +%% +%% Each URI begins with a scheme name that refers to a specification for +%% assigning identifiers within that scheme. +%% +%% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) +%% +%% +%% [RFC 3986, Chapter 3.2. Authority] +%% +%% Many URI schemes include a hierarchical element for a naming +%% authority so that governance of the name space defined by the +%% remainder of the URI is delegated to that authority (which may, in +%% turn, delegate it further). +%% +%% authority = [ userinfo "@" ] host [ ":" port ] +%% +%% +%% [RFC 3986, Chapter 3.2.1. User Information] +%% +%% The userinfo subcomponent may consist of a user name and, optionally, +%% scheme-specific information about how to gain authorization to access +%% the resource. The user information, if present, is followed by a +%% commercial at-sign ("@") that delimits it from the host. +%% +%% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +%% +%% +%% [RFC 3986, Chapter 3.2.2. Host] +%% +%% The host subcomponent of authority is identified by an IP literal +%% encapsulated within square brackets, an IPv4 address in dotted- +%% decimal form, or a registered name. +%% +%% host = IP-literal / IPv4address / reg-name +%% +%% IP-literal = "[" ( IPv6address / IPvFuture ) "]" +%% +%% IPvFuture = "v" 1*HEXDIG "." 
1*( unreserved / sub-delims / ":" ) +%% +%% IPv6address = 6( h16 ":" ) ls32 +%% / "::" 5( h16 ":" ) ls32 +%% / [ h16 ] "::" 4( h16 ":" ) ls32 +%% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 +%% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 +%% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 +%% / [ *4( h16 ":" ) h16 ] "::" ls32 +%% / [ *5( h16 ":" ) h16 ] "::" h16 +%% / [ *6( h16 ":" ) h16 ] "::" +%% +%% ls32 = ( h16 ":" h16 ) / IPv4address +%% ; least-significant 32 bits of address +%% +%% h16 = 1*4HEXDIG +%% ; 16 bits of address represented in hexadecimal +%% +%% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +%% +%% dec-octet = DIGIT ; 0-9 +%% / %x31-39 DIGIT ; 10-99 +%% / "1" 2DIGIT ; 100-199 +%% / "2" %x30-34 DIGIT ; 200-249 +%% / "25" %x30-35 ; 250-255 +%% +%% reg-name = *( unreserved / pct-encoded / sub-delims ) +%% +%% +%% [RFC 3986, Chapter 3.2.3. Port] +%% +%% The port subcomponent of authority is designated by an optional port +%% number in decimal following the host and delimited from it by a +%% single colon (":") character. +%% +%% port = *DIGIT +%% +%% +%% [RFC 3986, Chapter 3.3. Path] +%% +%% The path component contains data, usually organized in hierarchical +%% form, that, along with data in the non-hierarchical query component +%% (Section 3.4), serves to identify a resource within the scope of the +%% URI's scheme and naming authority (if any). The path is terminated +%% by the first question mark ("?") or number sign ("#") character, or +%% by the end of the URI. 
+%% +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% +%% path-abempty = *( "/" segment ) +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] +%% path-noscheme = segment-nz-nc *( "/" segment ) +%% path-rootless = segment-nz *( "/" segment ) +%% path-empty = 0<pchar> +%% segment = *pchar +%% segment-nz = 1*pchar +%% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) +%% ; non-zero-length segment without any colon ":" +%% +%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +%% +%% +%% [RFC 3986, Chapter 3.4. Query] +%% +%% The query component contains non-hierarchical data that, along with +%% data in the path component (Section 3.3), serves to identify a +%% resource within the scope of the URI's scheme and naming authority +%% (if any). The query component is indicated by the first question +%% mark ("?") character and terminated by a number sign ("#") character +%% or by the end of the URI. +%% +%% query = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 3.5. Fragment] +%% +%% The fragment identifier component of a URI allows indirect +%% identification of a secondary resource by reference to a primary +%% resource and additional identifying information. +%% +%% fragment = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 4.1. URI Reference] +%% +%% URI-reference is used to denote the most common usage of a resource +%% identifier. +%% +%% URI-reference = URI / relative-ref +%% +%% +%% [RFC 3986, Chapter 4.2. Relative Reference] +%% +%% A relative reference takes advantage of the hierarchical syntax +%% (Section 1.2.3) to express a URI reference relative to the name space +%% of another hierarchical URI. +%% +%% relative-ref = relative-part [ "?" 
query ] [ "#" fragment ] +%% +%% relative-part = "//" authority path-abempty +%% / path-absolute +%% / path-noscheme +%% / path-empty +%% +%% +%% [RFC 3986, Chapter 4.3. Absolute URI] +%% +%% Some protocol elements allow only the absolute form of a URI without +%% a fragment identifier. For example, defining a base URI for later +%% use by relative references calls for an absolute-URI syntax rule that +%% does not allow a fragment. +%% +%% absolute-URI = scheme ":" hier-part [ "?" query ] +%% + +-module(uri_string). + + +-export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, + parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export_type([uri_map/0, uri_string/0, bytelist/0]). + + +%%%========================================================================= +%%% API +%%%========================================================================= + + +-type bytelist() :: maybe_improper_list( + 0..255 | + binary() | bytelist(), + binary() | []). + +%% URI compliant with RFC 3986 +%% ASCII %x21 - %x7A ("!" - "z") except +%% %x34 " double quote +%% %x60 < less than +%% %x62 > greater than +%% %x92 \ backslash +%% %x94 ^ caret / circumflex +%% %x96 ` grave / accent +-type uri_string() :: bytelist() | binary(). + + +%% RFC 3986, Chapter 3. Syntax Components +-type uri_map() :: + #{fragment := unicode:chardata(), + host := unicode:chardata(), + path := unicode:chardata(), + port := non_neg_integer(), + query := unicode:chardata(), + scheme := atom(), + userinfo := unicode:chardata()}. + +%% Parse URIs +-spec parse(URIString) -> URIMap when + URIString :: uri_string(), + URIMap :: uri_map(). +parse(_) -> + ok. + +%% Recompose URIs +-spec recompose(URIMap) -> URIString when + URIMap :: uri_map(), + URIString :: uri_string(). +recompose(_) -> + ok. 
+ +%% Resolve references +-spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when + RelativeURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + AbsoluteDestURI :: uri_string(). +resolve_uri_reference(_,_) -> + ok. + +%% Create references +-spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when + AbsoluteSourceURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + RelativeDestURI :: uri_string(). +create_uri_reference(_,_) -> + ok. + +%% Normalize URIs +-spec normalize(URIString) -> NormalizedURI when + URIString :: uri_string(), + NormalizedURI :: uri_string(). +normalize(_) -> + ok. + +%% Transcode URIs +-spec transcode(URIString, Options) -> URIString when + URIString :: uri_string(), + Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. +transcode(_, _) -> + ok. + + +%% Working with query strings +%% HTML 2.0 - application/x-www-form-urlencoded +%% RFC 1866 [8.2.1] + +%% Compose urlencoded query string from a list of unescaped key/value pairs. +-spec compose_query(QueryList) -> QueryString when + QueryList :: [{unicode:chardata(), unicode:chardata()}], + QueryString :: uri_string(). +compose_query(_) -> + ok. + +%% Dissect a query string into a list of unescaped key/value pairs. +-spec dissect_query(QueryString) -> QueryList when + QueryString :: uri_string(), + QueryList :: [{unicode:chardata(), unicode:chardata()}]. +dissect_query(_) -> + ok. -- cgit v1.2.3