Skip to content

Commit

Permalink
Restore bitvec implementation of encoding/decoding
Browse files Browse the repository at this point in the history
For decoding, use an implementation from @lukaszcz to avoid calling shiftR when
writing the bits of an Integer.

For encoding I continue to use the chunked encoding of ByteString to
Integer.

Co-authored-by: Lukasz Czajka <lukasz@heliax.dev>
  • Loading branch information
paulcadman and lukaszcz committed Aug 30, 2024
1 parent 1b7df9d commit b725f2e
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 133 deletions.
11 changes: 9 additions & 2 deletions bench2/Benchmark/Nockma/Encoding/ByteString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@ randomBytes numBytes = do
sg <- getStdGen
return (fst (genByteString numBytes sg))

testBytesSize :: Int
testBytesSize = 250000

bm :: Benchmark
bm =
bgroup
"ByteString Encoding to/from integer"
[ env (randomBytes 250000) (\bs -> bench "encode bytes to integer" (nf Encoding.encodeByteString bs)),
env (Encoding.encodeByteString <$> randomBytes 250000) (\i -> bench "decode bytes from integer" (nf Encoding.decodeByteString i))
[ env
(randomBytes testBytesSize)
(\bs -> bench "encode bytes to integer" (nf Encoding.encodeByteString bs)),
env
(Encoding.encodeByteString <$> randomBytes testBytesSize)
(\i -> bench "decode bytes from integer" (nf (Encoding.decodeByteStringWithDefault (error "failed to decode")) i))
]
9 changes: 8 additions & 1 deletion src/Juvix/Compiler/Core/Evaluator.hs
Original file line number Diff line number Diff line change
Expand Up @@ -497,13 +497,20 @@ geval opts herr tab env0 = eval' env0

-- Deserialize a Integer, serialized using `serializeInteger` to a Node
deserializeFromInteger :: Integer -> Node
deserializeFromInteger = deserializeNode . Encoding.decodeByteString
deserializeFromInteger = deserializeNode . decodeByteString
{-# INLINE deserializeFromInteger #-}

serializeToInteger :: Node -> Integer
serializeToInteger = Encoding.encodeByteString . serializeNode
{-# INLINE serializeToInteger #-}

decodeByteString :: Integer -> ByteString
decodeByteString = Encoding.decodeByteStringWithDefault decodeErr
where
decodeErr :: ByteString
decodeErr = err "failed to decode Integer"
{-# INLINE decodeByteString #-}

sign :: Node -> ByteString -> Node
sign !messageNode !secretKeyBs =
let !message = serializeNode messageNode
Expand Down
38 changes: 16 additions & 22 deletions src/Juvix/Compiler/Nockma/Encoding/Base.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ module Juvix.Compiler.Nockma.Encoding.Base where

import Data.Bit as Bit
import Data.Bits
import Data.Vector.Unboxed qualified as U
import Juvix.Compiler.Nockma.Encoding.Effect.BitReader
import Juvix.Compiler.Nockma.Encoding.Effect.BitWriter
import Juvix.Prelude.Base
Expand All @@ -12,33 +11,28 @@ import Juvix.Prelude.Base
writeIntegral :: forall a r. (Integral a, Member BitWriter r) => a -> Sem r ()
writeIntegral x
| x < 0 = error "integerToVectorBits: negative integers are not supported in this implementation"
| otherwise = unfoldBits (fromIntegral x)
| otherwise = unfoldBits 0 (fromIntegral x)
where
unfoldBits :: Integer -> Sem r ()
unfoldBits n
| n == 0 = return ()
| otherwise = writeBit (Bit (testBit n 0)) <> unfoldBits (n `shiftR` 1)
len = bitLength x

integerToVectorBits :: (Integral a) => a -> Bit.Vector Bit
integerToVectorBits = run . execBitWriter . writeIntegral
unfoldBits :: Int -> Integer -> Sem r ()
unfoldBits idx n
| idx == len = return ()
| otherwise = writeBit (Bit (testBit n idx)) <> unfoldBits (idx + 1) n

-- | Computes the number of bits required to store the argument in binary
-- NB: 0 is encoded to the empty bit vector (as specified by the Hoon serialization spec), so 0 has bit length 0.
bitLength :: forall a. (Integral a) => a -> Int
bitLength =
length
. takeWhile (/= 0)
. iterate (`shiftR` 1)
. toInteger
bitLength :: (Integral a) => a -> Int
bitLength n
| n == 0 = 0
| otherwise = fromIntegral (integerLog2 (abs (fromIntegral n))) + 1

-- | Decode a vector of bits (ordered from least to most significant bits) to an integer
vectorBitsToInteger :: Bit.Vector Bit -> Integer
vectorBitsToInteger = U.ifoldl' go 0
where
go :: Integer -> Int -> Bit -> Integer
go acc idx (Bit b)
| b = setBit acc idx
| otherwise = acc
integerToVectorBits :: (Integral a) => a -> Bit.Vector Bit
integerToVectorBits = run . execBitWriter . writeIntegral

-- | Decode a vector of bits (ordered from least to most significant bits) to a ByteString
vectorBitsToByteString :: Bit.Vector Bit -> ByteString
vectorBitsToByteString = cloneToByteString

-- | Transform a Natural to an Int, computes Nothing if the Natural does not fit in an Int
safeNaturalToInt :: Natural -> Maybe Int
Expand Down
150 changes: 44 additions & 106 deletions src/Juvix/Compiler/Nockma/Encoding/ByteString.hs
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
module Juvix.Compiler.Nockma.Encoding.ByteString where

import Data.Bit (Bit)
import Data.Bit qualified as Bit
import Data.Bits
import Data.ByteString qualified as BS
import Data.ByteString.Builder qualified as BB
import Data.ByteString.Builder qualified as BS
import Juvix.Compiler.Nockma.Encoding.Base
import Juvix.Compiler.Nockma.Encoding.Effect.BitReader
import Juvix.Compiler.Nockma.Encoding.Effect.BitWriter
import Juvix.Compiler.Nockma.Language
import Juvix.Prelude.Base

Expand All @@ -27,6 +31,25 @@ naturalToByteString = integerToByteStringLE . toInteger
byteStringToIntegerLE :: ByteString -> Integer
byteStringToIntegerLE = BS.foldr (\b acc -> acc `shiftL` 8 .|. fromIntegral b) 0

byteStringToIntegerLEChunked :: ByteString -> Integer
byteStringToIntegerLEChunked = foldr' go 0 . map (first byteStringChunkToInteger) . chunkByteString
where
chunkSize :: Int
chunkSize = 1024

go :: (Integer, Int) -> Integer -> Integer
go (i, size) acc = acc `shiftL` (8 * size) .|. i

chunkByteString :: ByteString -> [(ByteString, Int)]
chunkByteString bs
| BS.null bs = []
| otherwise =
let (chunk, rest) = BS.splitAt chunkSize bs
in (chunk, BS.length chunk) : chunkByteString rest

byteStringChunkToInteger :: ByteString -> Integer
byteStringChunkToInteger = BS.foldr' (\b acc -> acc `shiftL` 8 .|. fromIntegral b) 0

integerToByteStringLE :: Integer -> ByteString
integerToByteStringLE = BS.toStrict . BS.toLazyByteString . go
where
Expand Down Expand Up @@ -64,114 +87,29 @@ padByteString n bs
| BS.length bs >= n = bs
| otherwise = BS.append bs (BS.replicate (n - BS.length bs) 0)

-- | Encode an Int with a variable-length encoding
--
-- The input Int is encoded in 7 bit chunks in LSB ordering. The most significant
-- bit of each chunk is used to indicate when there are more bytes to read,
-- 1 meaning more bytes, 0 meaning no more bytes.
--
-- For example, the binary representation of 263202 is divided into 3 7-bit chunks:
--
-- 263202 = 10000 0001000 0100010
-- chunk1 chunk2 chunk3
--
-- The chunks are then combined using 3 bytes in LSB ordering, with a 1 in the MSB of the first
-- two bytes (indicating that another byte follows). The final byte has a 0 in the MSB bit.
--
-- chunk3 chunk2 chunk1
-- 1_0100010 1_0001000 0_0010000
encodeVarInt :: Int -> ByteString
encodeVarInt = \case
0 -> BS.singleton 0
n -> BS.toStrict (BB.toLazyByteString (buildVarInt n))
where
buildVarInt :: Int -> BB.Builder
buildVarInt = \case
0 -> mempty
i ->
let byteChunk = fromIntegral (i .&. 0x7F) -- Extract a 7-bit chunk
next = i `shiftR` 7 -- Shift to the next 7-bit chunk
currentByte =
if
| next == 0 -> byteChunk -- No more bytes, so most significant bit for this chunk is 0
| otherwise -> byteChunk .|. 0x80 -- More bytes, so most significant bit for this chunk is 1
in BB.word8 currentByte <> buildVarInt next

byteStringToIntegerBE :: ByteString -> Integer
byteStringToIntegerBE = foldl' go 0 . map (first byteStringChunkToInteger) . chunkByteString
where
chunkSize :: Int
chunkSize = 1024

go :: Integer -> (Integer, Int) -> Integer
go acc (i, size) = acc `shiftL` (8 * size) .|. i

chunkByteString :: ByteString -> [(ByteString, Int)]
chunkByteString bs
| BS.null bs = []
| otherwise =
let (chunk, rest) = BS.splitAt chunkSize bs
in (chunk, BS.length chunk) : chunkByteString rest

byteStringChunkToInteger :: ByteString -> Integer
byteStringChunkToInteger = BS.foldl' (\acc b -> acc `shiftL` 8 .|. fromIntegral b) 0
vectorBitsToInteger :: Bit.Vector Bit -> Integer
vectorBitsToInteger = byteStringToIntegerLEChunked . vectorBitsToByteString

-- | encode a ByteString to an Integer (in MSB ordering) with its length as part of the encoding.
-- | encode a ByteString to an Integer with its length as part of the encoding.
encodeByteString :: ByteString -> Integer
encodeByteString bs = byteStringToIntegerBE (encodedLength <> bs)
encodeByteString = vectorBitsToInteger . run . execBitWriter . go
where
encodedLength :: ByteString
encodedLength = encodeVarInt (BS.length bs)
go :: ByteString -> Sem (BitWriter ': r) ()
go bs = do
let len = BS.length bs
writeLength len
writeByteString bs

-- | decode a ByteString that was encoded using `encodeByteString`
decodeByteString :: Integer -> ByteString
decodeByteString n = padByteString len bytes
where
(len, bytes) = decodeVarInt (integerToBytes n)

-- | Decode an integer in MSB ordering to a bytestring.
integerToBytes :: Integer -> ByteString
integerToBytes 0 = BS.singleton 0
integerToBytes n = BS.reverse $ BS.unfoldr go n
where
go :: Integer -> Maybe (Word8, Integer)
go = \case
0 -> Nothing
i -> Just (fromIntegral (i .&. 0xff), i `shiftR` 8)

-- | Decode a variable-length encoded Int (using `encodeVarInt`) from the start of a ByteString.
--
-- An Int is accumulated from the least significant 7-bits chunk of each byte. The
-- most significant bit of each byte indicates if more bytes of the input should
-- be read. If the most significant bit is one, then there are more bytes, if it
-- is 0 then there are no more bytes.
--
-- For example:
--
-- byte1 byte2 byte3 remainder
-- 1_0100010 1_0001000 0_0010000 ...
--
-- The first byte has most significant bit = 1 so we accumulate the least significant 7 bits and continue.
--
-- acc: 100010
--
-- The second byte has most significant bit = 1 so we accumulate and continue. The bytes are
-- encoded using LSB ordering so we must shift this chunk left by 7:
--
-- acc: 0001000 0100010
--
-- The third byte has most significant bit = 0 so this is the last byte. We must shift this chunk by 2 * 7 = 14
--
-- result : 10000 0001000 0100010 = 263202
decodeVarInt :: ByteString -> (Int, ByteString)
decodeVarInt bs = go 0 0 bs
decodeByteString :: forall r. (Member (Error BitReadError) r) => Integer -> Sem r ByteString
decodeByteString i = evalBitReader (integerToVectorBits i) go
where
go :: Int -> Int -> ByteString -> (Int, ByteString)
go acc toShift s = case BS.uncons s of
Nothing -> (acc, BS.empty)
Just (x, xs) ->
if
| x .&. 0x80 == 0 -> (acc .|. (fromIntegral x `shiftL` toShift), xs) -- The most significant bit is 0, no more bytes
| otherwise ->
let chunk = x .&. 0x7F -- Extract the next 7-bit chunk
in go (acc .|. (fromIntegral chunk `shiftL` toShift)) (toShift + 7) xs
go :: Sem (BitReader ': r) ByteString
go = do
len <- consumeLength
v <- consumeRemaining
return (padByteString len (Bit.cloneToByteString v))

-- | decode a ByteString that was encoded using `encodeByteString` with a default that's used if decoding fails.
decodeByteStringWithDefault :: ByteString -> Integer -> ByteString
decodeByteStringWithDefault d = fromRight d . run . runErrorNoCallStack @BitReadError . decodeByteString
1 change: 1 addition & 0 deletions src/Juvix/Compiler/Nockma/Encoding/Cue.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module Juvix.Compiler.Nockma.Encoding.Cue where

import Data.Bit as Bit
import Juvix.Compiler.Nockma.Encoding.Base
import Juvix.Compiler.Nockma.Encoding.ByteString
import Juvix.Compiler.Nockma.Encoding.Effect.BitReader
import Juvix.Compiler.Nockma.Language
import Juvix.Compiler.Nockma.Pretty.Base
Expand Down
1 change: 1 addition & 0 deletions src/Juvix/Compiler/Nockma/Encoding/Jam.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ module Juvix.Compiler.Nockma.Encoding.Jam where
import Data.Bit as Bit
import Data.Bits
import Juvix.Compiler.Nockma.Encoding.Base
import Juvix.Compiler.Nockma.Encoding.ByteString
import Juvix.Compiler.Nockma.Encoding.Effect.BitWriter
import Juvix.Compiler.Nockma.Language
import Juvix.Prelude.Base
Expand Down
4 changes: 2 additions & 2 deletions test/Nockma/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ import Test.Tasty.Hedgehog

propEncodingRoundtrip :: Property
propEncodingRoundtrip = property $ do
-- The range must be greater than the chunkSize in `byteStringToIntegerBE`
-- The range must be greater than the chunkSize in `byteStringToIntegerLEChunked`
bs <- forAll (Gen.bytes (Range.linear 0 3000))
Encoding.decodeByteString (Encoding.encodeByteString bs) === bs
Encoding.decodeByteStringWithDefault (error "failed to decode") (Encoding.encodeByteString bs) === bs

allTests :: TestTree
allTests = testGroup "Nockma encoding" [testProperty "Roundtrip ByteArray to/from integer encoding" propEncodingRoundtrip]

0 comments on commit b725f2e

Please sign in to comment.