stellar_prune/build/packages/gleam_stdlib/src/gleam/uri.gleam
2025-11-30 15:44:22 +01:00

770 lines
24 KiB
Gleam

//// Utilities for working with URIs
////
//// This module provides functions for working with URIs (for example, parsing
//// URIs or encoding query strings). The functions in this module are implemented
//// according to [RFC 3986](https://tools.ietf.org/html/rfc3986).
////
//// Query encoding (Form encoding) is defined in the
//// [W3C specification](https://www.w3.org/TR/html52/sec-forms.html#urlencoded-form-data).
import gleam/int
import gleam/list
import gleam/option.{type Option, None, Some}
import gleam/string
import gleam/string_tree.{type StringTree}
/// Type holding the parsed components of a URI.
/// All components of a URI are optional, except the path, which is always
/// present (and is the empty string when the URI has no path).
///
pub type Uri {
Uri(
// The scheme, e.g. `https`. `None` when the URI has no scheme.
scheme: Option(String),
// The userinfo part of the authority (the text before `@`), if any.
userinfo: Option(String),
// The host part of the authority, if any.
host: Option(String),
// The numeric port of the authority, if any.
port: Option(Int),
// The path. This is the only mandatory component.
path: String,
// The query string, without the leading `?`, if any.
query: Option(String),
// The fragment, without the leading `#`, if any.
fragment: Option(String),
)
}
/// Constant representing an empty URI, equivalent to "".
///
/// Every optional component is `None` and the path is the empty string, which
/// makes it a convenient starting point for building a `Uri` with the record
/// update syntax.
///
/// ## Examples
///
/// ```gleam
/// let uri = Uri(..empty, scheme: Some("https"), host: Some("example.com"))
/// // -> Uri(
/// // scheme: Some("https"),
/// // userinfo: None,
/// // host: Some("example.com"),
/// // port: None,
/// // path: "",
/// // query: None,
/// // fragment: None,
/// // )
/// ```
///
pub const empty = Uri(
scheme: None,
userinfo: None,
host: None,
port: None,
path: "",
query: None,
fragment: None,
)
/// Parses a compliant URI string into the `Uri` type.
/// If the string is not a valid URI string then an error is returned.
///
/// The opposite operation is `uri.to_string`.
///
/// ## Examples
///
/// ```gleam
/// parse("https://example.com:1234/a/b?query=true#fragment")
/// // -> Ok(
/// // Uri(
/// // scheme: Some("https"),
/// // userinfo: None,
/// // host: Some("example.com"),
/// // port: Some(1234),
/// // path: "/a/b",
/// // query: Some("query=true"),
/// // fragment: Some("fragment")
/// // )
/// // )
/// ```
///
@external(erlang, "gleam_stdlib", "uri_parse")
pub fn parse(uri_string: String) -> Result(Uri, Nil) {
// The Erlang target uses the external implementation declared above; the body
// below is the implementation used on the other targets.
//
// This parses a uri_string following the regex defined in
// https://tools.ietf.org/html/rfc3986#appendix-B
//
// TODO: This is not perfect and will be more permissive than its Erlang
// counterpart, ideally we want to replicate Erlang's implementation on the js
// target as well.
//
// Parsing starts with the scheme: the whole input is both the original string
// and the remaining string, nothing has been parsed yet and no code unit has
// been consumed.
parse_scheme_loop(uri_string, uri_string, empty, 0)
}
// Scans `uri_string` one code unit at a time looking for the end of the
// scheme.
//
// - `original` is the untouched input the scan started from.
// - `uri_string` is what is left to scan.
// - `pieces` holds the components parsed so far.
// - `size` is the number of code units consumed so far, so the first `size`
//   code units of `original` are the candidate scheme.
//
// Following the regex in RFC 3986 appendix B, a scheme only exists when the
// leading run of characters is terminated by a `:`. If a `/`, `?` or `#` is
// found first there is no scheme at all and the characters scanned so far are
// the beginning of the path (for example `foo/bar` is just a relative path).
fn parse_scheme_loop(
  original: String,
  uri_string: String,
  pieces: Uri,
  size: Int,
) -> Result(Uri, Nil) {
  case uri_string {
    // A leading `/` means there is no scheme; what follows is either an
    // authority introduced by `//` or a path.
    "/" <> _ if size == 0 -> parse_authority_with_slashes(uri_string, pieces)
    // `/` is not allowed to appear in a scheme, and we found it before any
    // `:`, so there is no scheme. An authority can only start at the very
    // beginning, so the whole input from its start is a path (possibly followed
    // by a query and a fragment).
    "/" <> _ -> parse_path(original, pieces)

    // `?` is not allowed to appear in a scheme, in an authority, or in a path;
    // so if we see it we know it marks the beginning of the query part.
    "?" <> rest if size == 0 -> parse_query_with_question_mark(rest, pieces)
    // Everything scanned before the `?` is a path, not a scheme, since no `:`
    // was found.
    "?" <> rest -> {
      let path = codeunit_slice(original, at_index: 0, length: size)
      parse_query_with_question_mark(rest, Uri(..pieces, path: path))
    }

    // `#` is not allowed to appear in a scheme, in an authority, in a path or
    // in a query; so if we see it we know it marks the beginning of the final
    // fragment.
    "#" <> rest if size == 0 -> parse_fragment(rest, pieces)
    // Everything scanned before the `#` is a path, not a scheme, since no `:`
    // was found.
    "#" <> rest -> {
      let path = codeunit_slice(original, at_index: 0, length: size)
      parse_fragment(rest, Uri(..pieces, path: path))
    }

    // A colon marks the end of a uri scheme, but if it is not preceded by any
    // character then it's not a valid URI.
    ":" <> _ if size == 0 -> Error(Nil)
    ":" <> rest -> {
      let scheme = codeunit_slice(original, at_index: 0, length: size)
      let pieces = Uri(..pieces, scheme: Some(string.lowercase(scheme)))
      parse_authority_with_slashes(rest, pieces)
    }

    // If we could get to the end of the string and we've met no special
    // chars whatsoever, that means the entire string is just a long path.
    "" -> Ok(Uri(..pieces, path: original))

    // In all other cases the character may be part of a scheme, so we keep
    // munching characters until we find out what terminates this first run.
    _ -> {
      let #(_, rest) = pop_codeunit(uri_string)
      parse_scheme_loop(original, rest, pieces, size + 1)
    }
  }
}
// Parses the optional authority that may follow the scheme.
//
// An authority is always introduced by `//`. When the remaining string does not
// start with it there is no authority and the string is parsed as a path.
fn parse_authority_with_slashes(
  uri_string: String,
  pieces: Uri,
) -> Result(Uri, Nil) {
  case uri_string {
    "//" <> after_slashes ->
      case after_slashes {
        // Nothing follows the slashes: the authority is there but the host is
        // empty.
        "" -> Ok(Uri(..pieces, host: Some("")))
        _ -> parse_authority_pieces(after_slashes, pieces)
      }
    _ -> parse_path(uri_string, pieces)
  }
}
// Parses the pieces of an authority (the text following `//`), starting from
// the optional userinfo. The scan starts with no code units consumed.
fn parse_authority_pieces(string: String, pieces: Uri) -> Result(Uri, Nil) {
parse_userinfo_loop(string, string, pieces, 0)
}
// Scans the authority looking for the `@` that terminates the userinfo.
//
// `original` is the authority string the scan started from, `uri_string` is
// what is left to scan and `size` counts the code units consumed so far.
fn parse_userinfo_loop(
  original: String,
  uri_string: String,
  pieces: Uri,
  size: Int,
) -> Result(Uri, Nil) {
  case uri_string {
    // `@` closes the userinfo; the host starts right after it.
    "@" <> after_at -> {
      let pieces = case size {
        // Nothing precedes the `@`, so no userinfo is recorded.
        0 -> pieces
        _ -> {
          let userinfo = codeunit_slice(original, at_index: 0, length: size)
          Uri(..pieces, userinfo: Some(userinfo))
        }
      }
      parse_host(after_at, pieces)
    }

    // The authority ended (end of input, or start of the path, query or
    // fragment) without any `@`: there is no userinfo, and everything scanned
    // so far actually belongs to the host. Start over from the beginning and
    // parse it as such.
    "" | "/" <> _ | "?" <> _ | "#" <> _ -> parse_host(original, pieces)

    // Any other character might belong to the userinfo: consume it and go on.
    _ -> {
      let #(_, remaining) = pop_codeunit(uri_string)
      parse_userinfo_loop(original, remaining, pieces, size + 1)
    }
  }
}
// Parses the host part of an authority, dispatching on its first character.
//
// A host string can be in two formats:
// - \[[:.a-zA-Z0-9]*\]
// - [^:]
fn parse_host(uri_string: String, pieces: Uri) -> Result(Uri, Nil) {
  case uri_string {
    // Nothing left: the host is empty and there is nothing else to parse.
    "" -> Ok(Uri(..pieces, host: Some("")))

    // A `:` right away means the host is empty and the port follows.
    ":" <> _ -> parse_port(uri_string, Uri(..pieces, host: Some("")))

    // An opening bracket announces the first, bracketed, format.
    "[" <> _ -> parse_host_within_brackets(uri_string, pieces)

    // Anything else is the second format.
    _ -> parse_host_outside_of_brackets(uri_string, pieces)
  }
}
// Parses a host written in the bracketed format. `parse_host` calls this when
// the string starts with `[`. The scan starts with no code units consumed.
fn parse_host_within_brackets(
uri_string: String,
pieces: Uri,
) -> Result(Uri, Nil) {
parse_host_within_brackets_loop(uri_string, uri_string, pieces, 0)
}
// Scans a host written in the bracketed format (`\[[:.a-zA-Z0-9]*\]`).
//
// `original` is the string the scan started from (including the opening `[`),
// `uri_string` is what is left to scan and `size` counts the code units
// consumed so far.
//
// If a character that is not allowed inside the brackets is found, the host is
// re-parsed from the start using the non-bracketed format.
fn parse_host_within_brackets_loop(
  original: String,
  uri_string: String,
  pieces: Uri,
  size: Int,
) -> Result(Uri, Nil) {
  case uri_string {
    // If the string is over the entire string we were iterating through is the
    // host part.
    "" -> Ok(Uri(..pieces, host: Some(original)))

    // The opening bracket is part of the host but it is not one of the
    // characters allowed between the brackets, so it is consumed explicitly.
    "[" <> rest if size == 0 ->
      parse_host_within_brackets_loop(original, rest, pieces, 1)

    // A `]` marks the end of the host and the start of the port part.
    "]" <> rest if size == 0 -> parse_port(rest, pieces)
    "]" <> rest -> {
      // `size + 1` so that the closing bracket is included in the host.
      let host = codeunit_slice(original, at_index: 0, length: size + 1)
      let pieces = Uri(..pieces, host: Some(host))
      parse_port(rest, pieces)
    }

    // `/` marks the beginning of a path.
    "/" <> _ if size == 0 -> parse_path(uri_string, pieces)
    "/" <> _ -> {
      let host = codeunit_slice(original, at_index: 0, length: size)
      let pieces = Uri(..pieces, host: Some(host))
      parse_path(uri_string, pieces)
    }

    // `?` marks the beginning of the query with question mark.
    "?" <> rest if size == 0 -> parse_query_with_question_mark(rest, pieces)
    "?" <> rest -> {
      let host = codeunit_slice(original, at_index: 0, length: size)
      let pieces = Uri(..pieces, host: Some(host))
      parse_query_with_question_mark(rest, pieces)
    }

    // `#` marks the beginning of the fragment part.
    "#" <> rest if size == 0 -> parse_fragment(rest, pieces)
    "#" <> rest -> {
      let host = codeunit_slice(original, at_index: 0, length: size)
      let pieces = Uri(..pieces, host: Some(host))
      parse_fragment(rest, pieces)
    }

    // In all other cases we just keep iterating.
    _ -> {
      let #(char, rest) = pop_codeunit(uri_string)
      // Inside `[...]` there can only be some characters, if we find a special
      // one then we know that we're actually parsing the other format for the
      // host and we switch to that!
      case is_valid_host_within_brackets_char(char) {
        True ->
          parse_host_within_brackets_loop(original, rest, pieces, size + 1)
        False ->
          parse_host_outside_of_brackets_loop(original, original, pieces, 0)
      }
    }
  }
}

// Tells whether the code unit `char` may appear between the brackets of a
// bracketed host: ASCII digits, ASCII letters, `:` and `.`.
fn is_valid_host_within_brackets_char(char: Int) -> Bool {
  // [0-9]
  { 48 <= char && char <= 57 }
  // [A-Z]
  || { 65 <= char && char <= 90 }
  // [a-z]
  || { 97 <= char && char <= 122 }
  // :
  || char == 58
  // .
  || char == 46
}
// Parses a host written in the non-bracketed format. The scan starts with no
// code units consumed.
fn parse_host_outside_of_brackets(
uri_string: String,
pieces: Uri,
) -> Result(Uri, Nil) {
parse_host_outside_of_brackets_loop(uri_string, uri_string, pieces, 0)
}
// Scans a host written in the non-bracketed format until one of the
// delimiters `:`, `/`, `?` or `#` (or the end of the string) is found.
//
// `original` is the string the scan started from, `uri_string` is what is left
// to scan and `size` counts the code units consumed so far, so the host is made
// of the first `size` code units of `original`.
fn parse_host_outside_of_brackets_loop(
  original: String,
  uri_string: String,
  pieces: Uri,
  size: Int,
) -> Result(Uri, Nil) {
  case uri_string {
    // No delimiter at all: the whole string is the host.
    "" -> Ok(Uri(..pieces, host: Some(original)))

    // `:` introduces the port. The port parser expects to see the colon.
    ":" <> _ ->
      parse_port(
        uri_string,
        Uri(
          ..pieces,
          host: Some(codeunit_slice(original, at_index: 0, length: size)),
        ),
      )

    // `/` introduces the path, and it is part of the path itself.
    "/" <> _ ->
      parse_path(
        uri_string,
        Uri(
          ..pieces,
          host: Some(codeunit_slice(original, at_index: 0, length: size)),
        ),
      )

    // `?` introduces the query.
    "?" <> query ->
      parse_query_with_question_mark(
        query,
        Uri(
          ..pieces,
          host: Some(codeunit_slice(original, at_index: 0, length: size)),
        ),
      )

    // `#` introduces the fragment.
    "#" <> fragment ->
      parse_fragment(
        fragment,
        Uri(
          ..pieces,
          host: Some(codeunit_slice(original, at_index: 0, length: size)),
        ),
      )

    // Any other character belongs to the host: consume it and go on.
    _ -> {
      let #(_, remaining) = pop_codeunit(uri_string)
      parse_host_outside_of_brackets_loop(original, remaining, pieces, size + 1)
    }
  }
}
// Parses the optional port that may follow the host.
//
// The string either starts with the `:` introducing the port, or directly with
// whatever follows the authority (a path, a query, a fragment or nothing).
fn parse_port(uri_string: String, pieces: Uri) -> Result(Uri, Nil) {
  case uri_string {
    ":" <> after_colon ->
      case after_colon {
        // The port starts with a digit: accumulate all of its digits starting
        // from zero.
        "0" <> _
        | "1" <> _
        | "2" <> _
        | "3" <> _
        | "4" <> _
        | "5" <> _
        | "6" <> _
        | "7" <> _
        | "8" <> _
        | "9" <> _ -> parse_port_loop(after_colon, pieces, 0)

        // The port could be empty and be followed by any of the next
        // delimiters, like `:#`, `:?` or `:/`, or by nothing at all.
        "" -> Ok(pieces)
        "?" <> query -> parse_query_with_question_mark(query, pieces)
        "#" <> fragment -> parse_fragment(fragment, pieces)
        "/" <> _ -> parse_path(after_colon, pieces)

        // Anything else after the colon is not a valid port.
        _ -> Error(Nil)
      }

    // There is no port at all.
    "" -> Ok(pieces)
    // `?` marks the beginning of the query with question mark.
    "?" <> query -> parse_query_with_question_mark(query, pieces)
    // `#` marks the beginning of the fragment part.
    "#" <> fragment -> parse_fragment(fragment, pieces)
    // `/` marks the beginning of a path.
    "/" <> _ -> parse_path(uri_string, pieces)

    _ -> Error(Nil)
  }
}
// Accumulates the decimal digits of the port into `port` and, once the digits
// are over, stores the port and carries on with what follows the authority.
fn parse_port_loop(
  uri_string: String,
  pieces: Uri,
  port: Int,
) -> Result(Uri, Nil) {
  case uri_string {
    // As long as we find digits we keep shifting the accumulated port by one
    // decimal place and adding the new digit.
    "0" <> rest -> parse_port_loop(rest, pieces, port * 10)
    "1" <> rest -> parse_port_loop(rest, pieces, port * 10 + 1)
    "2" <> rest -> parse_port_loop(rest, pieces, port * 10 + 2)
    "3" <> rest -> parse_port_loop(rest, pieces, port * 10 + 3)
    "4" <> rest -> parse_port_loop(rest, pieces, port * 10 + 4)
    "5" <> rest -> parse_port_loop(rest, pieces, port * 10 + 5)
    "6" <> rest -> parse_port_loop(rest, pieces, port * 10 + 6)
    "7" <> rest -> parse_port_loop(rest, pieces, port * 10 + 7)
    "8" <> rest -> parse_port_loop(rest, pieces, port * 10 + 8)
    "9" <> rest -> parse_port_loop(rest, pieces, port * 10 + 9)

    // The digits are over: whatever comes next, the port is now known.
    _ -> {
      let with_port = Uri(..pieces, port: Some(port))
      case uri_string {
        // The string (and so the port) is over.
        "" -> Ok(with_port)
        // `?` marks the beginning of the query with question mark.
        "?" <> query -> parse_query_with_question_mark(query, with_port)
        // `#` marks the beginning of the fragment part.
        "#" <> fragment -> parse_fragment(fragment, with_port)
        // `/` marks the beginning of a path.
        "/" <> _ -> parse_path(uri_string, with_port)
        // We've run into some invalid character inside the port so the uri
        // is invalid!
        _ -> Error(Nil)
      }
    }
  }
}
// Parses the path, and then whatever follows it. The scan starts with no code
// units consumed.
fn parse_path(uri_string: String, pieces: Uri) -> Result(Uri, Nil) {
parse_path_loop(uri_string, uri_string, pieces, 0)
}
// Scans the path until a `?` or a `#` (or the end of the string) is found.
//
// `original` is the string the scan started from, `uri_string` is what is left
// to scan and `size` counts the code units consumed so far, so the path is made
// of the first `size` code units of `original`.
fn parse_path_loop(
  original: String,
  uri_string: String,
  pieces: Uri,
  size: Int,
) -> Result(Uri, Nil) {
  case uri_string {
    // The string is over: all of it was the path, and there is neither a query
    // nor a fragment.
    "" -> Ok(Uri(..pieces, path: original))

    // `?` ends the path and starts the query.
    "?" <> query ->
      parse_query_with_question_mark(
        query,
        Uri(..pieces, path: codeunit_slice(original, at_index: 0, length: size)),
      )

    // `#` ends the path and starts the fragment.
    "#" <> fragment ->
      parse_fragment(
        fragment,
        Uri(..pieces, path: codeunit_slice(original, at_index: 0, length: size)),
      )

    // Any other character is allowed in the path: consume it and go on.
    _ -> {
      let #(_, remaining) = pop_codeunit(uri_string)
      parse_path_loop(original, remaining, pieces, size + 1)
    }
  }
}
// Parses the query, and then the fragment that may follow it. The callers in
// this module pass the text that comes after the `?`. The scan starts with no
// code units consumed.
fn parse_query_with_question_mark(
uri_string: String,
pieces: Uri,
) -> Result(Uri, Nil) {
parse_query_with_question_mark_loop(uri_string, uri_string, pieces, 0)
}
// Scans the query until a `#` (or the end of the string) is found.
//
// `original` is the string the scan started from, `uri_string` is what is left
// to scan and `size` counts the code units consumed so far, so the query is
// made of the first `size` code units of `original`.
fn parse_query_with_question_mark_loop(
  original: String,
  uri_string: String,
  pieces: Uri,
  size: Int,
) -> Result(Uri, Nil) {
  case uri_string {
    // The string is over: all of it was the query and there is no fragment.
    "" -> Ok(Uri(..pieces, query: Some(original)))

    // `#` ends the query and starts the fragment.
    "#" <> fragment -> {
      let pieces = case size {
        // Nothing precedes the `#`, so no query is recorded.
        0 -> pieces
        _ -> {
          let query = codeunit_slice(original, at_index: 0, length: size)
          Uri(..pieces, query: Some(query))
        }
      }
      parse_fragment(fragment, pieces)
    }

    // Any other character is allowed in the query: consume it and go on.
    _ -> {
      let #(_, remaining) = pop_codeunit(uri_string)
      parse_query_with_question_mark_loop(original, remaining, pieces, size + 1)
    }
  }
}
// Final step of the parsing: the fragment is the last component of a URI, so
// all of `rest` is stored as the fragment and the result is returned. This
// step never fails.
fn parse_fragment(rest: String, pieces: Uri) -> Result(Uri, Nil) {
Ok(Uri(..pieces, fragment: Some(rest)))
}
// Pops one code unit off the front of `str`. Judging by how it is used in this
// module, the result is the popped code unit (as an integer) paired with the
// rest of the string; the implementation is external, so confirm the details
// there.
//
// WARN: this function returns invalid strings!
// We need to return a String anyways to have this as the representation on the
// JavaScript target.
// Alternatively, we could rewrite the entire code to use a single
// `fold_codeunits`-style loop and a state machine.
@external(erlang, "gleam_stdlib", "string_pop_codeunit")
@external(javascript, "../gleam_stdlib.mjs", "pop_codeunit")
fn pop_codeunit(str: String) -> #(Int, String)
// Takes the slice of `str` that starts at index `from` and spans `length` code
// units. It is meant to be used with sizes counted by popping code units with
// `pop_codeunit`.
//
// On the Erlang target this is `binary:part/3`; on the JavaScript target it is
// implemented by `string_codeunit_slice`.
@external(erlang, "binary", "part")
@external(javascript, "../gleam_stdlib.mjs", "string_codeunit_slice")
fn codeunit_slice(str: String, at_index from: Int, length length: Int) -> String
/// Parses a urlencoded query string into a list of key value pairs.
/// Returns an error for invalid encoding.
///
/// The opposite operation is `uri.query_to_string`.
///
/// ## Examples
///
/// ```gleam
/// parse_query("a=1&b=2")
/// // -> Ok([#("a", "1"), #("b", "2")])
/// ```
///
@external(erlang, "gleam_stdlib", "parse_query")
@external(javascript, "../gleam_stdlib.mjs", "parse_query")
pub fn parse_query(query: String) -> Result(List(#(String, String)), Nil)
/// Encodes a list of key value pairs as a URI query string.
///
/// Keys and values are percent encoded, each pair is rendered as `key=value`
/// and the pairs are separated by `&`.
///
/// The opposite operation is `uri.parse_query`.
///
/// ## Examples
///
/// ```gleam
/// query_to_string([#("a", "1"), #("b", "2")])
/// // -> "a=1&b=2"
/// ```
///
pub fn query_to_string(query: List(#(String, String))) -> String {
  let separator = string_tree.from_string("&")
  let encoded_pairs = list.map(query, query_pair)
  let pieces = list.intersperse(encoded_pairs, separator)
  string_tree.to_string(string_tree.concat(pieces))
}
// Renders a single key value pair as `key=value`, percent encoding both the key
// and the value.
fn query_pair(pair: #(String, String)) -> StringTree {
  let #(key, value) = pair
  [percent_encode(key), "=", percent_encode(value)]
  |> string_tree.from_strings
}
/// Encodes a string into a percent encoded representation.
///
/// The opposite operation is `uri.percent_decode`.
///
/// ## Examples
///
/// ```gleam
/// percent_encode("100% great")
/// // -> "100%25%20great"
/// ```
///
@external(erlang, "gleam_stdlib", "percent_encode")
@external(javascript, "../gleam_stdlib.mjs", "percent_encode")
pub fn percent_encode(value: String) -> String
/// Decodes a percent encoded string.
///
/// The result is wrapped in a `Result`, so decoding can fail with
/// `Error(Nil)`. As the example shows, a `+` is left untouched.
///
/// The opposite operation is `uri.percent_encode`.
///
/// ## Examples
///
/// ```gleam
/// percent_decode("100%25%20great+fun")
/// // -> Ok("100% great+fun")
/// ```
///
@external(erlang, "gleam_stdlib", "percent_decode")
@external(javascript, "../gleam_stdlib.mjs", "percent_decode")
pub fn percent_decode(value: String) -> Result(String, Nil)
/// Splits the path section of a URI into its constituent segments.
///
/// Removes empty segments and resolves dot-segments as specified in
/// [section 5.2](https://www.ietf.org/rfc/rfc3986.html#section-5.2) of the RFC.
///
/// ## Examples
///
/// ```gleam
/// path_segments("/users/1")
/// // -> ["users" ,"1"]
/// ```
///
pub fn path_segments(path: String) -> List(String) {
  path
  |> string.split("/")
  |> remove_dot_segments
}
// Removes the empty segments and resolves the `.` and `..` segments of a path
// that has already been split on `/`. The work is done by
// `remove_dot_segments_loop`, started with an empty accumulator.
fn remove_dot_segments(input: List(String)) -> List(String) {
remove_dot_segments_loop(input, [])
}
// Walks the segments in `input`, collecting the ones to keep in `accumulator`
// in reverse order:
// - empty segments and `.` segments are skipped;
// - a `..` segment drops the most recently kept segment, if there is one;
// - any other segment is kept.
//
// Once the input is over the kept segments are returned in their original
// order.
fn remove_dot_segments_loop(
  input: List(String),
  accumulator: List(String),
) -> List(String) {
  case input {
    [] -> list.reverse(accumulator)

    ["", ..remaining] | [".", ..remaining] ->
      remove_dot_segments_loop(remaining, accumulator)

    ["..", ..remaining] ->
      case accumulator {
        // Nothing to go back to: stay at the root.
        [] -> remove_dot_segments_loop(remaining, [])
        [_, ..kept] -> remove_dot_segments_loop(remaining, kept)
      }

    [segment, ..remaining] ->
      remove_dot_segments_loop(remaining, [segment, ..accumulator])
  }
}
/// Encodes a `Uri` value as a URI string.
///
/// The opposite operation is `uri.parse`.
///
/// When the URI has a non-empty host and its path does not start with `/`
/// (this includes the empty path) a `/` is inserted between the authority and
/// the path. The port and the userinfo are only rendered when there is a host.
///
/// ## Examples
///
/// ```gleam
/// let uri = Uri(..empty, scheme: Some("https"), host: Some("example.com"))
/// to_string(uri)
/// // -> "https://example.com/"
/// ```
///
pub fn to_string(uri: Uri) -> String {
  // The string is built back to front, prepending each component to the list
  // of parts that follow it.
  let parts = case uri.fragment {
    Some(fragment) -> ["#", fragment]
    None -> []
  }
  let parts = case uri.query {
    Some(query) -> ["?", query, ..parts]
    None -> parts
  }
  let parts = [uri.path, ..parts]
  // With a non-empty host the path has to be separated from the authority.
  let parts = case uri.host, string.starts_with(uri.path, "/") {
    Some(host), False if host != "" -> ["/", ..parts]
    _, _ -> parts
  }
  // The port only makes sense as part of an authority, so it needs a host.
  let parts = case uri.host, uri.port {
    Some(_), Some(port) -> [":", int.to_string(port), ..parts]
    _, _ -> parts
  }
  let parts = case uri.scheme, uri.userinfo, uri.host {
    Some(s), Some(u), Some(h) -> [s, "://", u, "@", h, ..parts]
    Some(s), None, Some(h) -> [s, "://", h, ..parts]
    // Without a host there is no authority, so the userinfo is not rendered.
    Some(s), Some(_), None | Some(s), None, None -> [s, ":", ..parts]
    // A scheme-less URI can still have an authority, with or without userinfo.
    None, Some(u), Some(h) -> ["//", u, "@", h, ..parts]
    None, None, Some(h) -> ["//", h, ..parts]
    // Neither a scheme nor a host.
    None, Some(_), None | None, None, None -> parts
  }
  string.concat(parts)
}
/// Fetches the origin of a URI.
///
/// Returns the origin of a uri as defined in
/// [RFC 6454](https://tools.ietf.org/html/rfc6454)
///
/// The supported URI schemes are `http` and `https`.
/// URLs without a scheme or without a host will return `Error`.
///
/// The port is left out of the origin when it is missing or when it is the
/// default one for the scheme (`80` for `http`, `443` for `https`).
///
/// ## Examples
///
/// ```gleam
/// let assert Ok(uri) = parse("https://example.com/path?foo#bar")
/// origin(uri)
/// // -> Ok("https://example.com")
/// ```
///
pub fn origin(uri: Uri) -> Result(String, Nil) {
  case uri.scheme, uri.host, uri.port {
    // Default or missing port: the origin is just the scheme and the host.
    Some("https"), Some(host), Some(443) | Some("https"), Some(host), None ->
      Ok("https://" <> host)
    Some("http"), Some(host), Some(80) | Some("http"), Some(host), None ->
      Ok("http://" <> host)

    // Any other port is part of the origin.
    Some("https"), Some(host), Some(port) ->
      Ok("https://" <> host <> ":" <> int.to_string(port))
    Some("http"), Some(host), Some(port) ->
      Ok("http://" <> host <> ":" <> int.to_string(port))

    // Unsupported or missing scheme, or missing host.
    _, _, _ -> Error(Nil)
  }
}
/// Resolves a URI with respect to the given base URI.
///
/// The base URI must be an absolute URI (it must have both a scheme and a
/// host) or this function will return an error.
/// The algorithm for merging uris is described in
/// [RFC 3986](https://tools.ietf.org/html/rfc3986#section-5.2).
///
pub fn merge(base: Uri, relative: Uri) -> Result(Uri, Nil) {
  case base, relative {
    // The relative URI has its own host: only the scheme and the port fall
    // back to the ones of the base.
    Uri(scheme: Some(_), host: Some(_), ..), Uri(host: Some(_), ..) ->
      Ok(Uri(
        scheme: option.or(relative.scheme, base.scheme),
        userinfo: None,
        host: relative.host,
        port: option.or(relative.port, base.port),
        path: join_segments(
          remove_dot_segments(string.split(relative.path, "/")),
        ),
        query: relative.query,
        fragment: relative.fragment,
      ))

    // The relative URI has no host: scheme, host and port come from the base
    // and the path is resolved against the path of the base.
    Uri(scheme: Some(_), host: Some(_), ..), _ -> {
      let #(path, query) = case relative.path {
        // No path at all: keep the path of the base, and its query too unless
        // the relative URI brings its own.
        "" -> #(base.path, option.or(relative.query, base.query))
        relative_path -> {
          let relative_segments = string.split(relative_path, "/")
          let segments = case relative_path {
            // An absolute path replaces the path of the base.
            "/" <> _ -> relative_segments
            // Otherwise it replaces the last segment of the path of the base.
            _ ->
              list.append(
                drop_last(string.split(base.path, "/")),
                relative_segments,
              )
          }
          #(join_segments(remove_dot_segments(segments)), relative.query)
        }
      }
      Ok(Uri(
        scheme: base.scheme,
        userinfo: None,
        host: base.host,
        port: base.port,
        path: path,
        query: query,
        fragment: relative.fragment,
      ))
    }

    // The base is not an absolute URI.
    _, _ -> Error(Nil)
  }
}
// Returns `elements` without its last element. An empty list stays empty.
fn drop_last(elements: List(a)) -> List(a) {
  elements
  |> list.reverse
  |> list.drop(1)
  |> list.reverse
}
// Joins path segments back into a path, putting a `/` in front of each
// segment. With no segments at all the result is the empty string.
fn join_segments(segments: List(String)) -> String {
  segments
  |> list.prepend("")
  |> string.join("/")
}