Skip to content

Commit

Permalink
Add AsciiSet::EMPTY and boolean operators (#969)
Browse files Browse the repository at this point in the history
* Add AsciiSet::EMPTY and impl ops::Add for AsciiSet

In RFCs, the sets of characters to percent-encode are often defined as
the union of multiple sets. This change adds an `EMPTY` constant to
`AsciiSet` and implements the `Add` trait for `AsciiSet` so that sets
can be combined with the `+` operator.

AsciiSet now derives `Debug`, `PartialEq`, and `Eq` so that it can be
used in tests.

* Implement ops::Not for AsciiSet

* Add const functions for negation / union of AsciiSet
  • Loading branch information
joshka authored Sep 19, 2024
1 parent 9404ff5 commit 5505565
Showing 1 changed file with 83 additions and 1 deletion.
84 changes: 83 additions & 1 deletion percent_encoding/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ use alloc::{
string::String,
vec::Vec,
};
use core::{fmt, mem, slice, str};
use core::{fmt, mem, ops, slice, str};

/// Represents a set of characters or bytes in the ASCII range.
///
Expand All @@ -66,6 +66,7 @@ use core::{fmt, mem, slice, str};
/// /// https://url.spec.whatwg.org/#fragment-percent-encode-set
/// const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');
/// ```
#[derive(Debug, PartialEq, Eq)]
pub struct AsciiSet {
// Bitmask with one bit per byte value in the ASCII range (`ASCII_RANGE_LEN` bits in total),
// split across `Chunk`-sized words: byte `b` is tracked by bit `b % BITS_PER_CHUNK` of
// `mask[b / BITS_PER_CHUNK]`. A set bit means the byte is a member; all zeros is the empty set.
mask: [Chunk; ASCII_RANGE_LEN / BITS_PER_CHUNK],
}
Expand All @@ -77,6 +78,11 @@ const ASCII_RANGE_LEN: usize = 0x80;
const BITS_PER_CHUNK: usize = 8 * mem::size_of::<Chunk>();

impl AsciiSet {
/// An empty set.
pub const EMPTY: AsciiSet = AsciiSet {
mask: [0; ASCII_RANGE_LEN / BITS_PER_CHUNK],
};

/// Called with UTF-8 bytes rather than code points.
/// Not used for non-ASCII bytes.
const fn contains(&self, byte: u8) -> bool {
Expand All @@ -100,6 +106,39 @@ impl AsciiSet {
mask[byte as usize / BITS_PER_CHUNK] &= !(1 << (byte as usize % BITS_PER_CHUNK));
AsciiSet { mask }
}

/// Return the union of two sets.
///
/// The result contains every byte that is a member of `self`, of `other`, or of both.
/// Neither the receiver nor its mask is modified; a new set is returned.
///
/// This is a `const fn`, so it can be used to define a set in a `const` item as the
/// combination of other sets.
pub const fn union(&self, other: Self) -> Self {
    // Start from a copy of our own mask and OR in the other set one chunk at a time.
    // `for` loops are not usable in `const fn`, hence the manual `while`. Iterating up to
    // `mask.len()` keeps this correct regardless of how many chunks the mask is made of,
    // instead of hard-coding the chunk count.
    let mut mask = self.mask;
    let mut i = 0;
    while i < mask.len() {
        mask[i] |= other.mask[i];
        i += 1;
    }
    AsciiSet { mask }
}

/// Return the negation of the set.
///
/// The result contains exactly those bytes of the ASCII range that are *not* members of
/// `self`. The receiver is left untouched; a new set is returned.
///
/// This is a `const fn`, so it can be used when defining a set in a `const` item.
pub const fn complement(&self) -> Self {
    // Flip every bit of a copy of the mask, chunk by chunk. `for` loops are not usable in
    // `const fn`, hence the manual `while`. Iterating up to `mask.len()` avoids hard-coding
    // the number of chunks that make up the mask.
    let mut mask = self.mask;
    let mut i = 0;
    while i < mask.len() {
        mask[i] = !mask[i];
        i += 1;
    }
    AsciiSet { mask }
}
}

impl ops::Add for AsciiSet {
type Output = Self;

fn add(self, other: Self) -> Self {
self.union(other)
}
}

impl ops::Not for AsciiSet {
type Output = Self;

fn not(self) -> Self {
self.complement()
}
}

/// The set of 0x00 to 0x1F (C0 controls), and 0x7F (DEL).
Expand Down Expand Up @@ -478,3 +517,46 @@ fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
}
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `+` on two sets must produce the same set as adding each byte individually.
    #[test]
    fn add_op() {
        let left = AsciiSet::EMPTY.add(b'A');
        let right = AsciiSet::EMPTY.add(b'B');
        let expected = AsciiSet::EMPTY.add(b'A').add(b'B');
        assert_eq!(left + right, expected);

        // Adding the empty set must leave the other operand unchanged.
        assert_eq!(
            AsciiSet::EMPTY + AsciiSet::EMPTY.add(b'A'),
            AsciiSet::EMPTY.add(b'A')
        );
    }

    /// `!` must drop every member of the original set and include bytes that were absent.
    #[test]
    fn not_op() {
        let set = AsciiSet::EMPTY.add(b'A').add(b'B');
        let not_set = !set;
        assert!(!not_set.contains(b'A'));
        // Check the second member too, mirroring the `complement` test below.
        assert!(!not_set.contains(b'B'));
        assert!(not_set.contains(b'C'));
    }

    /// This test ensures that we can get the union of two sets as a constant value, which is
    /// useful for defining sets in a modular way.
    #[test]
    fn union() {
        const A: AsciiSet = AsciiSet::EMPTY.add(b'A');
        const B: AsciiSet = AsciiSet::EMPTY.add(b'B');
        const UNION: AsciiSet = A.union(B);
        const EXPECTED: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B');
        assert_eq!(UNION, EXPECTED);
    }

    /// This test ensures that we can get the complement of a set as a constant value, which is
    /// useful for defining sets in a modular way.
    #[test]
    fn complement() {
        const BOTH: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B');
        const COMPLEMENT: AsciiSet = BOTH.complement();
        assert!(!COMPLEMENT.contains(b'A'));
        assert!(!COMPLEMENT.contains(b'B'));
        assert!(COMPLEMENT.contains(b'C'));
    }
}

0 comments on commit 5505565

Please sign in to comment.