1
+ use anyhow:: { bail, Context , Result } ;
1
2
use base64:: engine:: general_purpose:: STANDARD ;
2
3
use base64:: Engine ;
4
+ use regex:: Regex ;
3
5
use serde:: Serialize ;
4
6
5
7
#[ derive( Serialize ) ]
@@ -23,36 +25,123 @@ pub struct Mail {
23
25
pub parsing_errors : usize ,
24
26
}
25
27
26
- /// Basic decoder for MIME Encoded Words.
27
- /// Currently only UTF-8 and Base64 are supported.
28
- /// Works only if the whole subject is encoded as a single word.
29
- pub fn decode_subject ( value : String ) -> String {
30
- const PREFIX : & str = "=?utf-8?b?" ;
31
- const SUFFIX : & str = "?=" ;
32
- let lowercase = value. to_lowercase ( ) ;
33
- if lowercase. starts_with ( PREFIX ) && lowercase. ends_with ( SUFFIX ) {
34
- let b64 = & value[ PREFIX . len ( ) ..( value. len ( ) - SUFFIX . len ( ) ) ] ;
35
- if let Ok ( bytes) = STANDARD . decode ( b64) {
36
- String :: from_utf8 ( bytes) . unwrap_or ( value)
28
+ /// Decoding of Q-encoded data as described in RFC2047
29
+ fn q_decode ( mut data : & str ) -> Result < Vec < u8 > > {
30
+ let mut result = Vec :: new ( ) ;
31
+ while !data. is_empty ( ) {
32
+ if data. starts_with ( '_' ) {
33
+ // This is always ASCII space (0x20)
34
+ result. push ( 0x20 ) ;
35
+ data = & data[ 1 ..] ;
36
+ } else if data. starts_with ( '=' ) {
37
+ // This is followed by two hex digits encoding a byte
38
+ if data. len ( ) >= 3 {
39
+ let hex = & data[ 1 ..3 ] ;
40
+ let value = u8:: from_str_radix ( hex, 16 )
41
+ . context ( "Expected valid hex string but found something else" ) ?;
42
+ result. push ( value) ;
43
+ data = & data[ 3 ..] ;
44
+ } else {
45
+ bail ! ( "The equal character must be followed by two hex characters" ) ;
46
+ }
37
47
} else {
38
- value
48
+ // Keep everything else as is...
49
+ let byte = data[ 0 ..1 ] . as_bytes ( ) ;
50
+ result. extend_from_slice ( byte) ;
51
+ data = & data[ 1 ..] ;
39
52
}
53
+ }
54
+ Ok ( result)
55
+ }
56
+
57
+ /// Decoding of MIME encoded words as described in RFC2047
58
+ /// This implementation currently only supports UTF-8!
59
+ fn decode_word ( charset : & str , encoding : & str , data : & str ) -> Result < String > {
60
+ let charset = charset. to_lowercase ( ) ;
61
+ let encoding = encoding. to_lowercase ( ) ;
62
+ let decoded = if encoding == "b" {
63
+ STANDARD
64
+ . decode ( data)
65
+ . context ( "Failed to decode Base64 data" ) ?
66
+ } else if encoding == "q" {
67
+ q_decode ( data) . context ( "Failed to decode Q data" ) ?
40
68
} else {
41
- value
69
+ bail ! ( "Unsupported encoding: {encoding}" )
70
+ } ;
71
+ if charset == "utf-8" {
72
+ String :: from_utf8 ( decoded) . context ( "Failed to parse UTF-8 string" )
73
+ } else {
74
+ // Unsupported charset
75
+ bail ! ( "Unsupported charset: {charset}" )
42
76
}
43
77
}
44
78
79
+ /// Basic decoder for subjects containing MIME encoded words.
80
+ /// Supported charsets: Only UTF-8
81
+ /// Supported encodings: Base64 and Q
82
+ pub fn decode_subject ( value : & str ) -> String {
83
+ let re = Regex :: new ( r"=\?(.+?)\?(.)\?(.+?)\?=" ) . unwrap ( ) ;
84
+ let mut result = value. to_owned ( ) ;
85
+ for capture in re. captures_iter ( value) {
86
+ let ( matched, [ charset, encoding, encoded] ) = capture. extract ( ) ;
87
+ let decoded = match decode_word ( charset, encoding, encoded) {
88
+ Ok ( word) => word,
89
+ Err ( _) => continue ,
90
+ } ;
91
+ result = result. replace ( matched, & decoded) ;
92
+ }
93
+ result
94
+ }
95
+
45
96
#[cfg(test)]
mod tests {
    use super::*;

    /// Q decoding of empty input, plain text, `_` and `=XY` escapes.
    #[test]
    fn q_decode_test() {
        assert_eq!(q_decode("").unwrap(), b"");
        assert_eq!(q_decode("abc").unwrap(), b"abc");
        assert_eq!(q_decode("_").unwrap(), b" ");
        assert_eq!(q_decode("=00=ff=AA_abc").unwrap(), b"\x00\xff\xaa abc");
        assert_eq!(q_decode("Best=C3=A4tigen").unwrap(), b"Best\xc3\xa4tigen");
    }

    /// Single encoded words: supported combinations decode, everything else fails.
    #[test]
    fn decode_word_test() {
        // (charset, encoding, payload, expected text)
        let valid = [
            ("utf-8", "b", "YWJj", "abc"),
            ("UtF-8", "B", "YWJj", "abc"),
            ("utf-8", "q", "=C3=A4", "ä"),
            ("utf-8", "b", "dGV4dA==", "text"),
        ];
        for &(charset, encoding, data, expected) in &valid {
            assert_eq!(decode_word(charset, encoding, data).unwrap(), expected);
        }

        // (charset, encoding, payload) combinations that must be rejected
        let invalid = [
            ("unknown", "B", "YWJj"),
            ("utf-8", "unknown", "YWJj"),
            ("utf-8", "b", "not_valid_b64"),
        ];
        for &(charset, encoding, data) in &invalid {
            assert!(decode_word(charset, encoding, data).is_err());
        }
    }

    /// Whole subjects: (input, expected output).
    #[test]
    fn decode_subject_test() {
        let cases = [
            // Can handle empty strings
            ("", ""),
            // Can handle strings without encoded words
            ("foobar 42", "foobar 42"),
            // Ignores invalid words that cannot be decoded
            ("=?foo?z?a?=", "=?foo?z?a?="),
            // Can decode words in the middle
            (" =?UTF-8?b?YWJj?= ", " abc "),
            // Can decode multiple words in one string
            (" =?UTF-8?B?YWJj?= =?UTF-8?Q?=C3=A4?= ", " abc ä "),
        ];
        for &(input, expected) in &cases {
            assert_eq!(decode_subject(input), expected);
        }
    }
}
0 commit comments