Skip to content

Commit eebfe1f

Browse files
committed
Improved mail subject decoding: Added support for Q-encoded words and enabled decoding of multiple words inside the subject instead of treating the subject always as a single word. See also issue #6
1 parent 3dac893 commit eebfe1f

File tree

4 files changed

+111
-20
lines changed

4 files changed

+111
-20
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ version = "1.1.2"
1010
edition = "2021"
1111

1212
[dependencies]
13+
regex = "1"
1314
axum = "0.7"
1415
anyhow = "1"
1516
flate2 = "1"

src/imap.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ fn extract_metadata(mail: &Fetch, max_size: usize) -> Result<Mail> {
222222
.as_deref()
223223
.map(|s| String::from_utf8_lossy(s))
224224
.unwrap_or("n/a".into())
225-
.to_string(),
225+
.as_ref(),
226226
);
227227
Ok(Mail {
228228
body: None,

src/mail.rs

Lines changed: 108 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
use anyhow::{bail, Context, Result};
12
use base64::engine::general_purpose::STANDARD;
23
use base64::Engine;
4+
use regex::Regex;
35
use serde::Serialize;
46

57
#[derive(Serialize)]
@@ -23,36 +25,123 @@ pub struct Mail {
2325
pub parsing_errors: usize,
2426
}
2527

26-
/// Basic decoder for MIME Encoded Words.
27-
/// Currently only UTF-8 and Base64 are supported.
28-
/// Works only if the whole subject is encoded as a single word.
29-
pub fn decode_subject(value: String) -> String {
30-
const PREFIX: &str = "=?utf-8?b?";
31-
const SUFFIX: &str = "?=";
32-
let lowercase = value.to_lowercase();
33-
if lowercase.starts_with(PREFIX) && lowercase.ends_with(SUFFIX) {
34-
let b64 = &value[PREFIX.len()..(value.len() - SUFFIX.len())];
35-
if let Ok(bytes) = STANDARD.decode(b64) {
36-
String::from_utf8(bytes).unwrap_or(value)
28+
/// Decoding of Q-encoded data as described in RFC2047
29+
fn q_decode(mut data: &str) -> Result<Vec<u8>> {
30+
let mut result = Vec::new();
31+
while !data.is_empty() {
32+
if data.starts_with('_') {
33+
// This is always ASCII space (0x20)
34+
result.push(0x20);
35+
data = &data[1..];
36+
} else if data.starts_with('=') {
37+
// This is followed by two hex digits encoding a byte
38+
if data.len() >= 3 {
39+
let hex = &data[1..3];
40+
let value = u8::from_str_radix(hex, 16)
41+
.context("Expected valid hex string but found something else")?;
42+
result.push(value);
43+
data = &data[3..];
44+
} else {
45+
bail!("The equal character must be followed by two hex characters");
46+
}
3747
} else {
38-
value
48+
// Keep everything else as is...
49+
let byte = data[0..1].as_bytes();
50+
result.extend_from_slice(byte);
51+
data = &data[1..];
3952
}
53+
}
54+
Ok(result)
55+
}
56+
57+
/// Decoding of MIME encoded words as described in RFC2047
58+
/// This implementation currently only supports UTF-8!
59+
fn decode_word(charset: &str, encoding: &str, data: &str) -> Result<String> {
60+
let charset = charset.to_lowercase();
61+
let encoding = encoding.to_lowercase();
62+
let decoded = if encoding == "b" {
63+
STANDARD
64+
.decode(data)
65+
.context("Failed to decode Base64 data")?
66+
} else if encoding == "q" {
67+
q_decode(data).context("Failed to decode Q data")?
4068
} else {
41-
value
69+
bail!("Unsupported encoding: {encoding}")
70+
};
71+
if charset == "utf-8" {
72+
String::from_utf8(decoded).context("Failed to parse UTF-8 string")
73+
} else {
74+
// Unsupported charset
75+
bail!("Unsupported charset: {charset}")
4276
}
4377
}
4478

79+
/// Basic decoder for subjects containing MIME encoded words.
80+
/// Supported charsets: Only UTF-8
81+
/// Supported encodings: Base64 and Q
82+
pub fn decode_subject(value: &str) -> String {
83+
let re = Regex::new(r"=\?(.+?)\?(.)\?(.+?)\?=").unwrap();
84+
let mut result = value.to_owned();
85+
for capture in re.captures_iter(value) {
86+
let (matched, [charset, encoding, encoded]) = capture.extract();
87+
let decoded = match decode_word(charset, encoding, encoded) {
88+
Ok(word) => word,
89+
Err(_) => continue,
90+
};
91+
result = result.replace(matched, &decoded);
92+
}
93+
result
94+
}
95+
4596
#[cfg(test)]
mod tests {
    use super::*;

    /// Q decoding of empty input, literals, underscores and hex escapes.
    #[test]
    fn q_decode_test() {
        assert_eq!(q_decode("").unwrap(), Vec::<u8>::new());
        assert_eq!(q_decode("abc").unwrap(), b"abc");
        assert_eq!(q_decode("_").unwrap(), b" ");
        assert_eq!(
            q_decode("=00=ff=AA_abc").unwrap(),
            [0x00, 0xff, 0xaa, 0x20, b'a', b'b', b'c']
        );
        assert_eq!(
            q_decode("Best=C3=A4tigen").unwrap(),
            [66, 101, 115, 116, 195, 164, 116, 105, 103, 101, 110]
        );
    }

    /// Single encoded words: supported combinations and rejected input.
    #[test]
    fn decode_word_test() {
        let accepted = [
            ("utf-8", "b", "YWJj", "abc"),
            ("UtF-8", "B", "YWJj", "abc"),
            ("utf-8", "q", "=C3=A4", "ä"),
            ("utf-8", "b", "dGV4dA==", "text"),
        ];
        for (charset, encoding, data, expected) in accepted {
            assert_eq!(decode_word(charset, encoding, data).unwrap(), expected);
        }

        let rejected = [
            ("unknown", "B", "YWJj"),
            ("utf-8", "unknown", "YWJj"),
            ("utf-8", "b", "not_valid_b64"),
        ];
        for (charset, encoding, data) in rejected {
            assert!(decode_word(charset, encoding, data).is_err());
        }
    }

    /// Whole subjects: plain text, undecodable words and multiple words.
    #[test]
    fn decode_subject_test() {
        // Can handle empty strings
        assert_eq!(decode_subject(""), "");

        // Can handle strings without encoded words
        assert_eq!(decode_subject("foobar 42"), "foobar 42");

        // Ignores invalid words that cannot be decoded
        assert_eq!(decode_subject("=?foo?z?a?="), "=?foo?z?a?=");

        // Can decode words in the middle
        assert_eq!(decode_subject(" =?UTF-8?b?YWJj?= "), " abc ");

        // Can decode multiple words in one string
        let two_words = " =?UTF-8?B?YWJj?= =?UTF-8?Q?=C3=A4?= ";
        assert_eq!(decode_subject(two_words), " abc ä ");
    }
}

0 commit comments

Comments
 (0)