Improved mail subject decoding: added support for Q-encoded words and enabled decoding of multiple encoded words inside the subject instead of always treating the whole subject as a single encoded word. See also issue #6.

cry-inc · cry-inc · commit eebfe1fe3a0c · 2025-01-04T00:00:55.000+01:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,6 +10,7 @@ version = "1.1.2"
 edition = "2021"
 
 [dependencies]
+regex = "1"
 axum = "0.7"
 anyhow = "1"
 flate2 = "1"
diff --git a/src/imap.rs b/src/imap.rs
@@ -222,7 +222,7 @@ fn extract_metadata(mail: &Fetch, max_size: usize) -> Result<Mail> {
             .as_deref()
             .map(|s| String::from_utf8_lossy(s))
             .unwrap_or("n/a".into())
-            .to_string(),
+            .as_ref(),
     );
     Ok(Mail {
         body: None,
diff --git a/src/mail.rs b/src/mail.rs
@@ -1,5 +1,7 @@
+use anyhow::{bail, Context, Result};
 use base64::engine::general_purpose::STANDARD;
 use base64::Engine;
+use regex::Regex;
 use serde::Serialize;
 
 #[derive(Serialize)]
@@ -23,36 +25,123 @@ pub struct Mail {
     pub parsing_errors: usize,
 }
 
-/// Basic decoder for MIME Encoded Words.
-/// Currently only UTF-8 and Base64 are supported.
-/// Works only if the whole subject is encoded as a single word.
-pub fn decode_subject(value: String) -> String {
-    const PREFIX: &str = "=?utf-8?b?";
-    const SUFFIX: &str = "?=";
-    let lowercase = value.to_lowercase();
-    if lowercase.starts_with(PREFIX) && lowercase.ends_with(SUFFIX) {
-        let b64 = &value[PREFIX.len()..(value.len() - SUFFIX.len())];
-        if let Ok(bytes) = STANDARD.decode(b64) {
-            String::from_utf8(bytes).unwrap_or(value)
+/// Decodes Q-encoded data as described in RFC 2047.
+fn q_decode(mut data: &str) -> Result<Vec<u8>> {
+    let mut result = Vec::new();
+    while !data.is_empty() {
+        if data.starts_with('_') {
+            // This is always ASCII space (0x20)
+            result.push(0x20);
+            data = &data[1..];
+        } else if data.starts_with('=') {
+            // This is followed by two hex digits encoding a byte
+            if data.len() >= 3 {
+                let hex = &data[1..3];
+                let value = u8::from_str_radix(hex, 16)
+                    .context("Expected valid hex string but found something else")?;
+                result.push(value);
+                data = &data[3..];
+            } else {
+                bail!("The equal character must be followed by two hex characters");
+            }
         } else {
-            value
+            // Keep everything else as is...
+            let byte = data[0..1].as_bytes();
+            result.extend_from_slice(byte);
+            data = &data[1..];
         }
+    }
+    Ok(result)
+}
+
+/// Decodes a single MIME encoded word as described in RFC 2047.
+/// Only the UTF-8 charset is currently supported; any other charset yields an error.
+fn decode_word(charset: &str, encoding: &str, data: &str) -> Result<String> {
+    let charset = charset.to_lowercase();
+    let encoding = encoding.to_lowercase();
+    let decoded = if encoding == "b" {
+        STANDARD
+            .decode(data)
+            .context("Failed to decode Base64 data")?
+    } else if encoding == "q" {
+        q_decode(data).context("Failed to decode Q data")?
     } else {
-        value
+        bail!("Unsupported encoding: {encoding}")
+    };
+    if charset == "utf-8" {
+        String::from_utf8(decoded).context("Failed to parse UTF-8 string")
+    } else {
+        // Unsupported charset
+        bail!("Unsupported charset: {charset}")
     }
 }
 
+/// Basic decoder for subjects containing MIME encoded words.
+/// Supported charsets: Only UTF-8
+/// Supported encodings: Base64 and Q
+pub fn decode_subject(value: &str) -> String {
+    let re = Regex::new(r"=\?(.+?)\?(.)\?(.+?)\?=").unwrap();
+    let mut result = value.to_owned();
+    for capture in re.captures_iter(value) {
+        let (matched, [charset, encoding, encoded]) = capture.extract();
+        let decoded = match decode_word(charset, encoding, encoded) {
+            Ok(word) => word,
+            Err(_) => continue,
+        };
+        result = result.replace(matched, &decoded);
+    }
+    result
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
+    #[test]
+    fn q_decode_test() {
+        assert_eq!(q_decode("").unwrap(), Vec::<u8>::new());
+        assert_eq!(q_decode("abc").unwrap(), vec![b'a', b'b', b'c']);
+        assert_eq!(q_decode("_").unwrap(), vec![0x20]);
+        assert_eq!(
+            q_decode("=00=ff=AA_abc").unwrap(),
+            vec![0x00, 0xff, 0xaa, 0x20, b'a', b'b', b'c']
+        );
+        assert_eq!(
+            q_decode("Best=C3=A4tigen").unwrap(),
+            vec![66, 101, 115, 116, 195, 164, 116, 105, 103, 101, 110]
+        );
+    }
+
+    #[test]
+    fn decode_word_test() {
+        assert_eq!(decode_word("utf-8", "b", "YWJj").unwrap(), "abc");
+        assert_eq!(decode_word("UtF-8", "B", "YWJj").unwrap(), "abc");
+        assert_eq!(decode_word("utf-8", "q", "=C3=A4").unwrap(), "ä");
+        assert_eq!(decode_word("utf-8", "b", "dGV4dA==").unwrap(), "text");
+
+        assert!(decode_word("unknown", "B", "YWJj").is_err());
+        assert!(decode_word("utf-8", "unknown", "YWJj").is_err());
+        assert!(decode_word("utf-8", "b", "not_valid_b64").is_err());
+    }
+
     #[test]
     fn decode_subject_test() {
-        assert_eq!(decode_subject(String::from("")), "");
-        assert_eq!(decode_subject(String::from("basic 123")), "basic 123");
-        assert_eq!(decode_subject(String::from("=?utf-8?B??=")), "");
-        assert_eq!(decode_subject(String::from("=?utf-8?B?dGV4dA==?=")), "text");
-        assert_eq!(decode_subject(String::from("=?utf-8?B?YWJj?=")), "abc");
-        assert_eq!(decode_subject(String::from("=?UTF-8?b?YWJj?=")), "abc");
+        // Can handle empty strings
+        assert_eq!(decode_subject(""), "");
+
+        // Can handle strings without encoded words
+        assert_eq!(decode_subject("foobar 42"), "foobar 42");
+
+        // Ignores invalid words that cannot be decoded
+        assert_eq!(decode_subject("=?foo?z?a?="), "=?foo?z?a?=");
+
+        // Can decode words in the middle
+        assert_eq!(decode_subject(" =?UTF-8?b?YWJj?= "), " abc ");
+
+        // Can decode multiple words in one string
+        assert_eq!(
+            decode_subject(" =?UTF-8?B?YWJj?= =?UTF-8?Q?=C3=A4?= "),
+            " abc ä "
+        );
     }
 }