-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathConversions.hs
333 lines (298 loc) · 12.9 KB
/
Conversions.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE KindSignatures #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeApplications #-}
-- {-# LANGUAGE PartialTypeSignatures #-}
-- {-# OPTIONS_GHC -Wno-partial-type-signatures #-}
-- | Examples of moving between type annotated encodings
--
-- Please also see documentation in "Data.TypedEncoding.Conv".
--
-- Haskell programs typically make these imports to do String, ByteString, and Text conversions:
--
-- @
-- import qualified Data.Text as T (pack, unpack)
-- import qualified Data.ByteString.Char8 as B8 (pack, unpack)
-- import Data.Text.Encoding (decodeUtf8, encodeUtf8)
-- @
--
-- or corresponding @Lazy@ imports (not shown).
--
-- Enc-specific equivalents can be found in:
--
-- @
-- import qualified Data.TypedEncoding.Conv.Text as EncT (pack, unpack)
-- import qualified Data.TypedEncoding.Conv.ByteString.Char8 as EncB8 (pack, unpack)
-- import Data.TypedEncoding.Conv.Text.Encoding (decodeUtf8, encodeUtf8)
-- @
--
-- Conversions aim at providing type safety when moving between encoded string-like types.
--
-- __The assumption__ made by /typed-encoding/ is that @"enc-"@ encodings work in an equivalent way independently of the payload type.
-- For example, if the following instances exist:
--
-- @
-- EncodeF SomeErr (Enc xs () String) (Enc ("enc-B64" ': xs) () String)
-- EncodeF SomeErr (Enc xs () Text) (Enc ("enc-B64" ': xs) () Text)
-- @
--
-- Then /typed-encoding/ expects @pack@ and @encodeF@ to commute (if encoding instances exist):
--
-- @
-- str -- EncT.pack --> txt
-- | |
-- encodeF encodeF
-- | |
-- v v
-- estr -- fmap EncT.pack --> etxt
-- @
--
-- (@unpack@ and @decode@ are expected to satisfy similar diagrams, not shown)
--
-- Basically, it should not matter which type we run the encoding (or decoding) on (other than performance cost).
--
-- Note that, as a consequence, multi-byte encodings (such as @enc-UTF8@ - available in the /typed-encoding-encoding/ package)
-- that encode a single Unicode character into several bytes cannot be decoded
-- in @ByteString@ as this would violate @EncB8.pack@ and @EncB8.unpack@ consistency.
--
-- Also note that this requirement is concerned about @"enc-"@ encodings, @"r-"@ encodings are much simpler to reason about
-- in conversions.
--
-- This module also discusses concepts of __Superset__ (for @"r-"@ encodings), __leniency__, and __flattening__.
module Examples.TypedEncoding.Conversions where
import Data.TypedEncoding
import Data.TypedEncoding.Instances.Enc.Base64 ()
import Data.TypedEncoding.Instances.Restriction.Base64 ()
import Data.TypedEncoding.Instances.Restriction.ASCII ()
import Data.TypedEncoding.Instances.Restriction.UTF8 ()
import Data.TypedEncoding.Instances.Restriction.D76 ()
import Data.TypedEncoding.Instances.Restriction.ByteRep ()
import qualified Data.TypedEncoding.Conv.Text as EncT
import qualified Data.TypedEncoding.Conv.Text.Encoding as EncTe -- (decodeUtf8)
import qualified Data.Text as T
import qualified Data.ByteString as B
import GHC.TypeLits
import qualified Data.TypedEncoding.Conv.ByteString.Char8 as EncB8
import Data.TypedEncoding.Instances.Restriction.BoundedAlphaNums ()
-- $setup
-- >>> :set -XDataKinds -XMultiParamTypeClasses -XKindSignatures -XFlexibleInstances -XFlexibleContexts -XOverloadedStrings -XTypeApplications -XScopedTypeVariables
-- >>> import qualified Data.TypedEncoding.Instances.Enc.Base64 as EnB64 (acceptLenientS)
-- >>> import qualified Data.TypedEncoding.Conv.Text as EncT (pack, utf8Promote, utf8Demote)
-- >>> import qualified Data.TypedEncoding.Conv.ByteString.Char8 as EncB8 (pack, unpack)
-- >>> import qualified Data.TypedEncoding.Conv.Text.Encoding as EncTe (decodeUtf8, encodeUtf8)
-- >>> import Data.Proxy
--
-- This module contains some ghci friendly values to play with.
--
-- Each value is documented in a doctest style by including an equivalent ghci ready expression.
-- These documents generate a test suite for this library as well.
-- * Moving between Text and ByteString
eHelloAsciiB :: Either EncodeEx (Enc '["r-ASCII"] () B.ByteString)
eHelloAsciiB = _runEncodings encodings (toEncoding () "HeLlo world")
-- ^ Sample @ByteString@ value to experiment with.
--
-- The literal is wrapped with 'toEncoding' and then run through the
-- @"r-ASCII"@ encoding; the outcome stays inside @Either EncodeEx@.
--
-- >>> _runEncodings encodings . toEncoding () $ "HeLlo world" :: Either EncodeEx (Enc '["r-ASCII"] () B.ByteString)
-- Right (UnsafeMkEnc Proxy () "HeLlo world")
helloAsciiB :: Enc '["r-ASCII"] () B.ByteString
helloAsciiB = either encodingFailed id eHelloAsciiB
  where
    -- Not expected to run: 'eHelloAsciiB' is documented to evaluate to 'Right'.
    -- Kept so that a regression reports the actual 'EncodeEx' instead of an
    -- anonymous irrefutable-pattern failure.
    encodingFailed err =
      error ("Examples.TypedEncoding.Conversions.helloAsciiB: " ++ show err)
-- ^ 'eHelloAsciiB' with the 'Either' removed.
helloAsciiT :: Enc '["r-ASCII"] () T.Text
helloAsciiT = EncTe.decodeUtf8 $ helloAsciiB
-- ^
-- 'helloAsciiB' moved over to @Text@ using @EncTe.decodeUtf8@, the
-- /typed-encoding/ counterpart of the popular 'Data.Text.Encoding.decodeUtf8'
-- from the /text/ package.
--
-- Notice the encoding annotation is preserved.
--
-- >>> displ $ EncTe.decodeUtf8 helloAsciiB
-- "Enc '[r-ASCII] () (Text HeLlo world)"
-- * @pack@ from String
helloZero :: Enc ('[] :: [Symbol]) () String
helloZero = toEncoding () $ "Hello"
-- ^ A 'String' carrying no encoding annotations at all.
-- To move it to @Enc '[] () ByteString@ one could try:
--
-- >>> EncB8.pack helloZero
-- ...
-- ... error:
-- ... Empty list, no last element
-- ...
--
-- This does not compile, and it should not. @pack@ from "Data.ByteString.Char8" is error prone:
-- it is not an injection as it only considers the first 8 bits of information from each 'Char'.
-- I doubt that there are any code examples of its intentional use on a String that has chars @> \'\255\'@.
--
-- @EncB8.pack@ will not compile unless the encoding has "r-CHAR8" as its superset.
-- This works:
--
-- >>> fmap (displ . EncB8.pack) . encodeFAll @'["r-ASCII"] @(Either EncodeEx) $ helloZero
-- Right "Enc '[r-ASCII] () (ByteString Hello)"
--
-- The result is a @ByteString@ with a bonus annotation describing its content.
--
-- A similar game is played for @Text@:
--
-- >>> fmap (displ . EncT.d76Demote . EncT.pack) . encodeFAll @'["r-UNICODE.D76"] @(Either EncodeEx) $ helloZero
-- Right "Enc '[] () (Text Hello)"
--
-- See "Data.TypedEncoding.Conv" for more information on this.
helloRestricted :: Either EncodeEx (Enc '["r-ban:zzzzz"] () B.ByteString)
helloRestricted = EncB8.pack <$> _runEncodings encodings (toEncoding () "Hello")
-- ^ More interestingly, @EncB8.pack@ works fine on "r-" encodings that are subsets of "r-ASCII".
-- In this example @"r-ban:zzzzz"@ restricts to 5 alpha-numeric characters, all @< \'z\'@.
--
-- >>> displ <$> helloRestricted
-- Right "Enc '[r-ban:zzzzz] () (ByteString Hello)"
--
-- Adding an @"r-ASCII"@ annotation on this ByteString would have been redundant since @"r-ban:zzzzz"@ is more
-- restrictive (see Supersets below).
--
-- @unpack@, as expected, puts us back in a String while keeping the annotation:
--
-- >>> fmap (displ . EncB8.unpack) helloRestricted
-- Right "Enc '[r-ban:zzzzz] () (String Hello)"
--
byteRep :: Either EncodeEx (Enc '["r-ByteRep"] () B.ByteString)
byteRep = EncB8.pack <$> _runEncodings encodings (toEncoding () "\254")
-- ^ For low level use of @Char@ in place of @Word8@: "r-ByteRep" represents anything under @256@.
-- Here a single-character 'String' is encoded and then packed into a @ByteString@.
-- * More complex rules
helloUtf8B64B :: Enc '["enc-B64", "r-UTF8"] () B.ByteString
helloUtf8B64B = encodePart @'["enc-B64"] $ helloUtf8B
-- ^ Base64 encoding applied on top of a ByteString that is already annotated as @"r-UTF8"@
-- ('helloUtf8B'); the existing annotation is kept underneath the new one.
--
-- >>> displ $ encodePart @'["enc-B64"] helloUtf8B
-- "Enc '[enc-B64,r-UTF8] () (ByteString SGVMbG8gd29ybGQ=)"
helloUtf8B64T :: Enc '["enc-B64"] () T.Text
helloUtf8B64T = EncT.utf8Demote (EncTe.decodeUtf8 helloUtf8B64B)
-- ^ 'helloUtf8B64B' copied over to Text.
--
-- >>> displ $ EncTe.decodeUtf8 helloUtf8B64B
-- "Enc '[enc-B64,r-UTF8] () (Text SGVMbG8gd29ybGQ=)"
--
-- UTF8 would be redundant in Text, so the "r-UTF8" can be dropped:
--
-- >>> displ . EncT.utf8Demote . EncTe.decodeUtf8 $ helloUtf8B64B
-- "Enc '[enc-B64] () (Text SGVMbG8gd29ybGQ=)"
--
-- Conversely, when moving back to ByteString we need to recover the annotation:
--
-- >>> :t EncTe.encodeUtf8 helloUtf8B64T
-- ...
-- ... Couldn't match type ...
-- ...
--
-- This is not allowed! We need to add the redundant "r-UTF8" back:
--
-- >>> displ . EncTe.encodeUtf8 . EncT.utf8Promote $ helloUtf8B64T
-- "Enc '[enc-B64,r-UTF8] () (ByteString SGVMbG8gd29ybGQ=)"
--
-- To achieve type safety, our @encodeUtf8@ and @decodeUtf8@ require the "r-UTF8" annotation.
-- But since @Text@ values can always emit a @UTF8@ layout, we can simply add and remove
-- these annotations on @Text@ encodings. This approach gives us type level safety over UTF8 encoding/decoding errors.
notTextB :: Enc '["enc-B64"] () B.ByteString
notTextB = encodeAll (toEncoding () "\195\177")
-- ^ 'notTextB' is Base64-encoded binary data; its annotation makes no claim
-- that the underlying payload is valid UTF8.
--
-- >>> encodeAll . toEncoding () $ "\195\177" :: Enc '["enc-B64"] () B.ByteString
-- UnsafeMkEnc Proxy () "w7E="
--
-- Decoding it to Text is prevented by the compiler:
--
-- >>> :t EncTe.decodeUtf8 notTextB
-- ...
-- ... error:
-- ... Couldn't match type ...
-- ...
--
-- This is good because having the payload inside of @Enc '["enc-B64"] () Text@ would allow us
-- to try to decode it to Text (causing runtime errors).
--
-- We can move it to Text, but to do that we will need to forget the "enc-B64" annotation.
-- This can be done, for example, using flattening (see below).
-- * Supersets
helloUtf8B :: Enc '["r-UTF8"] () B.ByteString
helloUtf8B = injectInto $ helloAsciiB
-- ^ Claims UTF8 on 'helloAsciiB' without running an encoding again, i.e. instead of:
--
-- >>> encodeFAll . toEncoding () $ "HeLlo world" :: Either EncodeEx (Enc '["r-UTF8"] () B.ByteString)
-- Right (UnsafeMkEnc Proxy () "HeLlo world")
--
-- We should be able to convert the ASCII annotation directly.
--
-- This is done using the 'IsSuperset' type family.
--
-- The superset to inject into can be selected with a type application on @injectInto@:
--
-- >>> displ $ injectInto @"r-UTF8" helloAsciiB
-- "Enc '[r-UTF8] () (ByteString HeLlo world)"
--
-- Superset is intended for @"r-"@ annotations only and should not be used
-- with general encodings like @"enc-B64"@; it assumes that decoding in the superset
-- can replace the decoding from the injected subset.
notTextBB64Ascii :: Enc '["r-ASCII", "enc-B64"] () B.ByteString
notTextBB64Ascii = _encodesInto $ notTextB
-- ^ /Base64/ encoding represents binary data in an ASCII string format,
-- and in Haskell we should be able to express this in types.
--
-- The 'EncodingSuperset' class is what specifies this.
--
-- It is used through the '_encodesInto' combinator;
-- 'EncodingSuperset' should not be used directly at the calling site.
--
-- >>> displ (_encodesInto @"r-ASCII" $ notTextB)
-- "Enc '[r-ASCII,enc-B64] () (ByteString w7E=)"
--
-- '_encodesInto' can be used with a superset of the encoding
-- character set as well, making it more backward compatible
-- (the definition of @EncodingSuperset "enc-B64"@ could be made more precise without breaking the code).
--
-- >>> displ (_encodesInto @"r-UTF8" $ notTextB)
-- "Enc '[r-UTF8,enc-B64] () (ByteString w7E=)"
--
notTextB64AsTxt :: Enc '["r-B64"] () T.Text
notTextB64AsTxt = EncTe.decodeUtf8 . flattenAs $ notTextB
-- ^ The /Base64/ encoding of non-text binary data can still be converted to the Text format.
-- @Enc '["r-B64"] () T.Text@ signifies that the value is a B64 encoding but that it cannot be decoded to a Text.

-- Scratch code (intentionally left disabled):
-- tst = encodeAll . toEncoding () $ "" :: Enc '["enc-B64"] () B.ByteString
-- tst2 = EncTe.decodeUtf8 $ flattenAs $ tst :: Enc '["r-B64"] () T.Text
-- * Lenient recovery
lenientSomething :: Enc '["enc-B64-len"] () B.ByteString
lenientSomething = recreateAll (toEncoding () "abc==CB")
-- ^
-- >>> recreateAll . toEncoding () $ "abc==CB" :: Enc '["enc-B64-len"] () B.ByteString
-- UnsafeMkEnc Proxy () "abc==CB"
--
-- The rest of Haskell does lenient decoding; type safety allows this library to use it for recovery.
-- Lenient algorithms are not partial and automatically fix invalid input, whereas
-- recreating the same input as strict @"enc-B64"@ fails:
--
-- >>> recreateFAll . toEncoding () $ "abc==CB" :: Either RecreateEx (Enc '["enc-B64"] () B.ByteString)
-- Left (RecreateEx "enc-B64" ("Base64-encoded bytestring is unpadded or has invalid padding"))
--
-- This library allows recovering to "enc-B64-len", which is different from "enc-B64".
--
-- 'EnB64.acceptLenientS' converts "enc-B64-len" to "enc-B64":
--
-- >>> displ $ EnB64.acceptLenientS lenientSomething
-- "Enc '[enc-B64] () (ByteString abc=)"
--
-- This is now properly encoded data:
--
-- >>> recreateFAll . toEncoding () $ "abc=" :: Either RecreateEx (Enc '["enc-B64"] () B.ByteString)
-- Right (UnsafeMkEnc Proxy () "abc=")
--
-- Except that the content could be surprising:
--
-- >>> decodeAll $ EnB64.acceptLenientS lenientSomething
-- UnsafeMkEnc Proxy () "i\183"
-- * Flattening
b64IsAscii :: Enc '["r-ASCII"] () B.ByteString
b64IsAscii = flattenAs $ helloUtf8B64B
-- ^ Base 64 encodes binary data as ASCII text.
-- Thus, we should be able to treat "enc-B64" as "r-ASCII", losing some information.
-- This is done using the 'FlattenAs' type class.
--
-- >>> :t flattenAs @"r-ASCII" helloUtf8B64B
-- flattenAs @"r-ASCII" helloUtf8B64B
-- ... :: Enc '["r-ASCII"] () B.ByteString