diff --git a/doc/modules/ROOT/examples/unit/snippets.cpp b/doc/modules/ROOT/examples/unit/snippets.cpp index 7fe895c27..d88df6d92 100644 --- a/doc/modules/ROOT/examples/unit/snippets.cpp +++ b/doc/modules/ROOT/examples/unit/snippets.cpp @@ -819,6 +819,52 @@ parsing_scheme() } } +// tag::code_compound_scheme_1[] +// Helper function to extract transport scheme from compound schemes +boost::core::string_view +scheme_ex(boost::core::string_view s) +{ + // Find the last '+' in the scheme + // Examples: "git+https" -> "https", "svn+ssh" -> "ssh" + auto pos = s.rfind('+'); + if (pos != boost::core::string_view::npos) + return s.substr(pos + 1); + return {}; +} +// end::code_compound_scheme_1[] + +void +parsing_compound_scheme() +{ + + // Parse a URL with a compound scheme + url_view u("git+https://github.com/user/repo.git"); + + // The library treats the entire string as a single scheme per RFC 3986 + assert(u.scheme() == "git+https"); + + // Extract just the transport protocol suffix + boost::core::string_view transport = scheme_ex(u.scheme()); + assert(transport == "https"); + + // Test with other compound schemes + url_view u2("svn+ssh://example.com/repo"); + assert(u2.scheme() == "svn+ssh"); + assert(scheme_ex(u2.scheme()) == "ssh"); + + // Regular schemes without '+' return empty + url_view u3("https://example.com"); + assert(u3.scheme() == "https"); + assert(scheme_ex(u3.scheme()).empty()); + + // Multiple '+' characters: rfind returns the last one + url_view u4("npm+http+custom://example.com"); + assert(u4.scheme() == "npm+http+custom"); + assert(scheme_ex(u4.scheme()) == "custom"); + + boost::ignore_unused(u, u2, u3, u4, transport); +} + void parsing_authority() { @@ -2546,6 +2592,7 @@ class snippets_test parsing_components(); formatting_components(); run_silent(&parsing_scheme); + parsing_compound_scheme(); run_silent(&parsing_authority); run_silent(&parsing_path); run_silent(&parsing_query); diff --git a/doc/modules/ROOT/nav.adoc b/doc/modules/ROOT/nav.adoc index 23e6ccbee..956f359fc 100644 --- a/doc/modules/ROOT/nav.adoc +++ b/doc/modules/ROOT/nav.adoc @@ -1,9 +1,10 @@ * xref:quicklook.adoc[] -* xref:urls/index.adoc[] +* URLs ** xref:urls/parsing.adoc[] -** xref:urls/containers.adoc[] +** xref:urls/components.adoc[] ** xref:urls/segments.adoc[] ** xref:urls/params.adoc[] +* Operations ** xref:urls/normalization.adoc[] ** xref:urls/stringtoken.adoc[] ** xref:urls/percent-encoding.adoc[] diff --git a/doc/modules/ROOT/pages/urls/containers.adoc b/doc/modules/ROOT/pages/urls/components.adoc similarity index 79% rename from doc/modules/ROOT/pages/urls/containers.adoc rename to doc/modules/ROOT/pages/urls/components.adoc index 1ec897871..3f464a9f4 100644 --- a/doc/modules/ROOT/pages/urls/containers.adoc +++ b/doc/modules/ROOT/pages/urls/components.adoc @@ -7,7 +7,9 @@ // Official repository: https://github.com/boostorg/url // -= Containers += Components + +== Containers Three containers are provided for interacting with URLs: @@ -55,6 +57,175 @@ The tables and exposition which follow describe the available observers and modi == Scheme +The most important part is the __scheme__, whose production rule is: + + +[source] +---- +scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) +---- + + +The scheme, which some informal texts incorrectly refer to as +"protocol", defines how the rest of the URL is interpreted. +Public schemes are registered and managed by the +https://en.wikipedia.org/wiki/Internet_Assigned_Numbers_Authority[Internet Assigned Numbers Authority,window=blank_] (IANA). +Here are some registered schemes and their corresponding +specifications: + +[cols="a,a"] +|=== +// Headers +|Scheme|Specification + +// Row 1, Column 1 +|**http** +// Row 1, Column 2 +|https://datatracker.ietf.org/doc/html/rfc7230#section-2.7.1[http URI Scheme (rfc7230),window=blank_] + +// Row 2, Column 1 +|**magnet** +// Row 2, Column 2 +|https://en.wikipedia.org/wiki/Magnet_URI_scheme[Magnet URI scheme,window=blank_] + +// Row 3, Column 1 +|**mailto** +// Row 3, Column 2 +|https://datatracker.ietf.org/doc/html/rfc6068[The 'mailto' URI Scheme (rfc6068),window=blank_] + +// Row 4, Column 1 +|**payto** +// Row 4, Column 2 +|https://datatracker.ietf.org/doc/html/rfc8905[The 'payto' URI Scheme for Payments (rfc8905),window=blank_] +// Row 4, Column 4 + +// Row 5, Column 1 +|**telnet** +// Row 5, Column 2 +|https://datatracker.ietf.org/doc/html/rfc4248[The telnet URI Scheme (rfc4248),window=blank_] + +// Row 6, Column 1 +|**urn** +// Row 6, Column 2 +|https://datatracker.ietf.org/doc/html/rfc2141[URN Syntax,window=blank_] + +|=== + + +Private schemes are possible, defined by organizations to enumerate internal +resources such as documents or physical devices, or to facilitate the operation +of their software. These are not subject to the same rigor as the registered +ones; they can be developed and modified by the organization to meet specific +needs with less concern for interoperability or backward compatibility. Note +that private does not imply secret; some private schemes such as Amazon's "s3" +have publicly available specifications and are quite popular. Here are some +examples: + +[cols="a,a"] +|=== +// Headers +|Scheme|Specification + +// Row 1, Column 1 +|**app** +// Row 1, Column 2 +|https://www.w3.org/TR/app-uri/[app: URL Scheme,window=blank_] + +// Row 2, Column 1 +|**odbc** +// Row 2, Column 2 +|https://datatracker.ietf.org/doc/html/draft-patrick-lambert-odbc-uri-scheme[ODBC URI Scheme,window=blank_] + +// Row 3, Column 1 +|**slack** +// Row 3, Column 2 +|https://api.slack.com/reference/deep-linking[Reference: Deep linking into Slack,window=blank_] + +|=== + + +In some cases the scheme is implied by the surrounding context and +therefore omitted. Here is a complete HTTP/1.1 GET request for the +target URL "/index.htm": + + +[source] +---- +GET /index.htm HTTP/1.1 +Host: www.example.com +Accept: text/html +User-Agent: Beast +---- + + +The scheme of "http" is implied here because the context is already an HTTP +request. The production rule for the URL in the request above is called +__origin-form__, defined in the +https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1[HTTP specification,window=blank_] +thusly: + + +[source] +---- +origin-form = absolute-path [ "?" query ] + +absolute-path = 1*( "/" segment ) +---- + + +[NOTE] +==== +All URLs have a scheme, whether it is explicit or implicit. +The scheme determines what the rest of the URL means. +==== + + +Here are some more examples of URLs using various schemes (and one example +of something that is not a URL): + + +[cols="a,a"] +|=== +// Headers +|URL|Notes + +// Row 1, Column 1 +|`pass:[https://www.boost.org/index.html]` +// Row 1, Column 2 +|Hierarchical URL with `https` protocol. Resource in the HTTP protocol. + +// Row 2, Column 1 +|`pass:[ftp://host.dom/etc/motd]` +// Row 2, Column 2 +|Hierarchical URL with `ftp` scheme. Resource in the FTP protocol. + +// Row 3, Column 1 +|`urn:isbn:045145052` +// Row 3, Column 2 +|Opaque URL with `urn` scheme. Identifies `isbn` resource. + +// Row 4, Column 1 +|`mailto:person@example.com` +// Row 4, Column 2 +|Opaque URL with `mailto` scheme. Identifies e-mail address. + +// Row 5, Column 1 +|`index.html` +// Row 5, Column 2 +|URL reference. Missing scheme and authority. + +// Row 6, Column 1 +|`www.boost.org` +// Row 6, Column 2 +|A Protocol-Relative Link (PRL). **Not a URL**. + +|=== + + + + +=== API Reference + The scheme is represented as a case-insensitive string, along with an enumeration constant which acts as a numeric identifier when the string matches one of the well-known schemes: http, https, ws, wss, file, and ftp. Characters in the scheme are never escaped; only letters and numbers are allowed, and the first character must be a letter. @@ -121,13 +292,89 @@ This includes the trailing colon (":"). |=== +[NOTE] +==== +Some package managers (pip, npm) and tools use compound schemes like `git+https://` or `svn+ssh://` where a plus sign separates a protocol from a transport mechanism. +Boost.URL treats these as single scheme strings per RFC 3986 (which allows plus signs). +To extract the transport suffix, use a helper like `scheme_ex`: + +[source,cpp] +---- +include::example$unit/snippets.cpp[tag=code_compound_scheme_1,indent=0] +---- + +This is an informal convention, not a URL standard. See https://github.com/whatwg/url/issues/230[WHATWG discussion,window=blank_]. +==== + == Authority +The authority determines how a resource can be accessed. +It contains two parts: the +https://www.rfc-editor.org/rfc/rfc3986#section-3.2.1[__userinfo__,window=blank_] +that holds identity credentials, and the +https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2[__host__,window=blank_] +and +https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3[__port__,window=blank_] +which identify a communication endpoint having dominion +over the resource described in the remainder of the URL. +This is the ABNF specification for the authority part: + +[source] +---- +authority = [ user [ ":" password ] "@" ] host [ ":" port ] +---- + + +The combination of user and optional password is called the +__userinfo__. + +image::AuthorityDiagram.svg[] + +Some observations: + +* The use of the password field is deprecated. +* The authority always has a defined host field, even if empty. +* The host can be a name, or an IPv4, an IPv6, or an IPvFuture address. +* All but the port field use percent-encoding to escape delimiters. + +The host subcomponent represents where resources +are located. + +[NOTE] +==== +Note that if an authority is present, the host is always +defined even if it is the empty string (corresponding +to a zero-length __reg-name__ in the BNF). + +[source,cpp] +---- +include::example$unit/snippets.cpp[tag=snippet_parsing_authority_10a,indent=0] +---- + +==== + + +The authority component also influences how we should +interpret the URL path. If the authority is present, +the path component must either be empty or begin with +a slash. + +[NOTE] +==== +Although the specification allows the format cpp:username:password[], +the password component should be used with care. + +It is not recommended to transfer password data through URLs +unless this is an empty string indicating no password. +==== + + + + +=== API Reference + The authority is an optional part whose presence is indicated by an unescaped double slash ("//") immediately following the scheme, or at the beginning if the scheme is not present. It contains three components: an optional userinfo, the host, and an optional port. -The authority in this diagram has all three components: - -image:::AuthorityDiagram.svg[] An empty authority, corresponding to just a zero-length host component, is distinct from the absence of an authority. These members are used to inspect and modify the authority as a whole string: diff --git a/doc/modules/ROOT/pages/urls/index.adoc b/doc/modules/ROOT/pages/urls/index.adoc deleted file mode 100644 index 1d97dd162..000000000 --- a/doc/modules/ROOT/pages/urls/index.adoc +++ /dev/null @@ -1,383 +0,0 @@ -// -// Copyright (c) 2023 Alan de Freitas (alandefreitas@gmail.com) -// -// Distributed under the Boost Software License, Version 1.0. (See accompanying -// file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt) -// -// Official repository: https://github.com/boostorg/url -// - - -= URLs - -A URL, short for "Uniform Resource Locator," is a compact string -of characters identifying an abstract or physical resource. -It has these five parts, which may be optional or disallowed -depending on the context: - -image::PartsDiagram.svg[] - -Each part's syntax is defined by a set of production rules in -https://tools.ietf.org/html/rfc3986[rfc3986,window=blank_]. All valid URLs conform to this grammar, also called -the "generic syntax." Here is an example URL which describes a -file and its location on a network host: - - -[source] ----- -https://www.example.com/path/to/file.txt?userid=1001&pages=3&results=full#page1 ----- - - -The parts and their corresponding text is as follows: - -[cols="a,a"] -|=== -// Headers -|Part|Text - -// Row 1, Column 1 -|__scheme__ -// Row 1, Column 2 -|"https" - -// Row 2, Column 1 -|__authority__ -// Row 2, Column 2 -|"www.example.com" - -// Row 3, Column 1 -|__path__ -// Row 3, Column 2 -|"/path/to/file.txt" - -// Row 4, Column 1 -|__query__ -// Row 4, Column 2 -|"userid=1001&pages=3&results=full" - -// Row 5, Column 1 -|__fragment__ -// Row 5, Column 2 -|"page1" - -|=== - - -The production rule for the example above is called a __URI__, -which can contain all five parts. The specification using -https://datatracker.ietf.org/doc/html/rfc2234[__ABNF notation__,window=blank_] -is: - - -[source] ----- -URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] - -hier-part = "//" authority path-abempty - / path-absolute - / path-rootless - / path-empty ----- - - -In this notation, the square brackets ("\// [" and "\]") denote optional -elements, quoted text represents character literals, and slashes are -used to indicate a choice between one of several elements. For the -complete specification of ABNF notation please consult -https://datatracker.ietf.org/doc/html/rfc2234[rfc2234,window=blank_], -"Augmented BNF for Syntax Specifications." -When using this library to process or create URLs, it is necessary -to choose which of these top-level production rules are applicable -for a given use-case: -__absolute-URI__, __origin-form__, __relative-ref__, __URI__, or -__URI-reference__. These are discussed in greater depth later. - - - -== Scheme - -The most important part is the __scheme__, whose production rule is: - - -[source] ----- -scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ----- - - -The scheme, which some informal texts incorrectly refer to as -"protocol", defines how the rest of the URL is interpreted. -Public schemes are registered and managed by the -https://en.wikipedia.org/wiki/Internet_Assigned_Numbers_Authority[Internet Assigned Numbers Authority,window=blank_] (IANA). -Here are some registered schemes and their corresponding -specifications: - -[cols="a,a"] -|=== -// Headers -|Scheme|Specification - -// Row 1, Column 1 -|**http** -// Row 1, Column 2 -|https://datatracker.ietf.org/doc/html/rfc7230#section-2.7.1[http URI Scheme (rfc7230),window=blank_] - -// Row 2, Column 1 -|**magnet** -// Row 2, Column 2 -|https://en.wikipedia.org/wiki/Magnet_URI_scheme[Magnet URI scheme,window=blank_] - -// Row 3, Column 1 -|**mailto** -// Row 3, Column 2 -|https://datatracker.ietf.org/doc/html/rfc6068[The 'mailto' URI Scheme (rfc6068),window=blank_] - -// Row 4, Column 1 -|**payto** -// Row 4, Column 2 -|https://datatracker.ietf.org/doc/html/rfc8905[The 'payto' URI Scheme for Payments (rfc8905),window=blank_] -// Row 4, Column 4 - -// Row 5, Column 1 -|**telnet** -// Row 5, Column 2 -|https://datatracker.ietf.org/doc/html/rfc4248[The telnet URI Scheme (rfc4248),window=blank_] - -// Row 6, Column 1 -|**urn** -// Row 6, Column 2 -|https://datatracker.ietf.org/doc/html/rfc2141[URN Syntax,window=blank_] - -|=== - - -Private schemes are possible, defined by organizations to enumerate internal -resources such as documents or physical devices, or to facilitate the operation -of their software. These are not subject to the same rigor as the registered -ones; they can be developed and modified by the organization to meet specific -needs with less concern for interoperability or backward compatibility. Note -that private does not imply secret; some private schemes such as Amazon's "s3" -have publicly available specifications and are quite popular. Here are some -examples: - -[cols="a,a"] -|=== -// Headers -|Scheme|Specification - -// Row 1, Column 1 -|**app** -// Row 1, Column 2 -|https://www.w3.org/TR/app-uri/[app: URL Scheme,window=blank_] - -// Row 2, Column 1 -|**odbc** -// Row 2, Column 2 -|https://datatracker.ietf.org/doc/html/draft-patrick-lambert-odbc-uri-scheme[ODBC URI Scheme,window=blank_] - -// Row 3, Column 1 -|**slack** -// Row 3, Column 2 -|https://api.slack.com/reference/deep-linking[Reference: Deep linking into Slack,window=blank_] - -|=== - - -In some cases the scheme is implied by the surrounding context and -therefore omitted. Here is a complete HTTP/1.1 GET request for the -target URL "/index.htm": - - -[source] ----- -GET /index.htm HTTP/1.1 -Host: www.example.com -Accept: text/html -User-Agent: Beast ----- - - -The scheme of "http" is implied here because the context is already an HTTP -request. The production rule for the URL in the request above is called -__origin-form__, defined in the -https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1[HTTP specification,window=blank_] -thusly: - - -[source] ----- -origin-form = absolute-path [ "?" query ] - -absolute-path = 1*( "/" segment ) ----- - - -[NOTE] -==== -All URLs have a scheme, whether it is explicit or implicit. -The scheme determines what the rest of the URL means. -==== - - -Here are some more examples of URLs using various schemes (and one example -of something that is not a URL): - - -[cols="a,a"] -|=== -// Headers -|URL|Notes - -// Row 1, Column 1 -|`pass:[https://www.boost.org/index.html]` -// Row 1, Column 2 -|Hierarchical URL with `https` protocol. Resource in the HTTP protocol. - -// Row 2, Column 1 -|`pass:[ftp://host.dom/etc/motd]` -// Row 2, Column 2 -|Hierarchical URL with `ftp` scheme. Resource in the FTP protocol. - -// Row 3, Column 1 -|`urn:isbn:045145052` -// Row 3, Column 2 -|Opaque URL with `urn` scheme. Identifies `isbn` resource. - -// Row 4, Column 1 -|`mailto:person@example.com` -// Row 4, Column 2 -|Opaque URL with `mailto` scheme. Identifies e-mail address. - -// Row 5, Column 1 -|`index.html` -// Row 5, Column 2 -|URL reference. Missing scheme and authority. - -// Row 6, Column 1 -|`www.boost.org` -// Row 6, Column 2 -|A Protocol-Relative Link (PRL). **Not a URL**. - -|=== - - - - -== Authority - -The authority determines how a resource can be accessed. -It contains two parts: the -https://www.rfc-editor.org/rfc/rfc3986#section-3.2.1[__userinfo__,window=blank_] -that holds identity credentials, and the -https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2[__host__,window=blank_] -and -https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3[__port__,window=blank_] -which identify a communication endpoint having dominion -over the resource described in the remainder of the URL. -This is the ABNF specification for the authority part: - -[source] ----- -authority = [ user [ ":" password ] "@" ] host [ ":" port ] ----- - - -The combination of user and optional password is called the -__userinfo__. - -The authority determines how a resource can be accessed. -It contains two parts: the -https://www.rfc-editor.org/rfc/rfc3986#section-3.2.1[__userinfo__,window=blank_] -that holds identity credentials, and the -https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2[__host__,window=blank_] -and -https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.3[__port__,window=blank_] -which identify a communication endpoint having dominion -over the resource described in the remainder of the URL. - -image::AuthorityDiagram.svg[] - -Some observations: - -* The use of the password field is deprecated. -* The authority always has a defined host field, even if empty. -* The host can be a name, or an IPv4, an IPv6, or an IPvFuture address. -* All but the port field use percent-encoding to escape delimiters. - -The host subcomponent represents where resources -are located. - -[NOTE] -==== -Note that if an authority is present, the host is always -defined even if it is the empty string (corresponding -to a zero-length __reg-name__ in the BNF). - -[source,cpp] ----- -include::example$unit/snippets.cpp[tag=snippet_parsing_authority_10a,indent=0] ----- - -==== - - -The authority component also influences how we should -interpret the URL path. If the authority is present, -the path component must either be empty or begin with -a slash. - -[NOTE] -==== -Although the specification allows the format cpp:username:password[], -the password component should be used with care. - -It is not recommended to transfer password data through URLs -unless this is an empty string indicating no password. -==== - - - - -== Containers - -This library provides the following containers, which -are capable of storing any possible URL: - -* cpp:url[]: A modifiable container for a URL. -* cpp:url_view[]: A non-owning reference to a valid URL. -* cpp:static_url[]: A URL with fixed-capacity storage. - -These containers maintain a useful invariant: they -always contain a valid URL. In addition, the library -provides the cpp:authority_view[] container which holds -a non-modifiable reference to a valid __authority__. -An authority by itself is not a valid URL. - -In the sections that follow we describe the mechanisms use to -parse strings using various specific grammars, followed by the -interface for inspecting and modifying each of the main parts of -the URL. Finally we discuss important algorithms available to -use with URLs. - -// -// URLs -// Parsing -// Containers -// Segments -// Params -// Normalization -// StringToken -// Percent Encoding - - - - - - - - - - - - diff --git a/doc/modules/ROOT/pages/urls/parsing.adoc b/doc/modules/ROOT/pages/urls/parsing.adoc index db0291c64..edb559ed5 100644 --- a/doc/modules/ROOT/pages/urls/parsing.adoc +++ b/doc/modules/ROOT/pages/urls/parsing.adoc @@ -10,6 +10,93 @@ = Parsing +A URL, short for "Uniform Resource Locator," is a compact string +of characters identifying an abstract or physical resource. +It has these five parts, which may be optional or disallowed +depending on the context: + +image::PartsDiagram.svg[] + +Each part's syntax is defined by a set of production rules in +https://tools.ietf.org/html/rfc3986[rfc3986,window=blank_]. All valid URLs conform to this grammar, also called +the "generic syntax." Here is an example URL which describes a +file and its location on a network host: + + +[source] +---- +https://www.example.com/path/to/file.txt?userid=1001&pages=3&results=full#page1 +---- + + +The parts and their corresponding text is as follows: + +[cols="a,a"] +|=== +// Headers +|Part|Text + +// Row 1, Column 1 +|__scheme__ +// Row 1, Column 2 +|"https" + +// Row 2, Column 1 +|__authority__ +// Row 2, Column 2 +|"www.example.com" + +// Row 3, Column 1 +|__path__ +// Row 3, Column 2 +|"/path/to/file.txt" + +// Row 4, Column 1 +|__query__ +// Row 4, Column 2 +|"userid=1001&pages=3&results=full" + +// Row 5, Column 1 +|__fragment__ +// Row 5, Column 2 +|"page1" + +|=== + + +The production rule for the example above is called a __URI__, +which can contain all five parts. The specification using +https://datatracker.ietf.org/doc/html/rfc2234[__ABNF notation__,window=blank_] +is: + + +[source] +---- +URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + +hier-part = "//" authority path-abempty + / path-absolute + / path-rootless + / path-empty +---- + + +In this notation, the square brackets ("\// [" and "\]") denote optional +elements, quoted text represents character literals, and slashes are +used to indicate a choice between one of several elements. For the +complete specification of ABNF notation please consult +https://datatracker.ietf.org/doc/html/rfc2234[rfc2234,window=blank_], +"Augmented BNF for Syntax Specifications." +When using this library to process or create URLs, it is necessary +to choose which of these top-level production rules are applicable +for a given use-case: +__absolute-URI__, __origin-form__, __relative-ref__, __URI__, or +__URI-reference__. These are discussed in greater depth later. + + + +== Parsing URLs + Algorithms which parse URLs return a view which references the underlying character buffer without taking ownership, avoiding memory allocations and copies. The following example parses a string literal containing a https://datatracker.ietf.org/doc/html/rfc3986#section-3[__URI__,window=blank_]: