From 820c14bd49ba76916cc30fc61a79f26850f94673 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thiago=20Rom=C3=A3o=20Barcala?= Date: Thu, 18 Apr 2024 14:32:16 +0200 Subject: [PATCH] Implement serialization of decimal logical type (#22) Thanks to @thiagorb for the contribution! --- lib/avro/datum.php | 67 ++++++++++++++++++++++++++++++++++++++++---- lib/avro/schema.php | 4 +++ test/DatumIOTest.php | 29 ++++++++++++++++++- 3 files changed, 94 insertions(+), 6 deletions(-) diff --git a/lib/avro/datum.php b/lib/avro/datum.php index 811cdd4..a810088 100644 --- a/lib/avro/datum.php +++ b/lib/avro/datum.php @@ -117,7 +117,7 @@ function write_data($writers_schema, $datum, $encoder) case AvroSchema::STRING_TYPE: return $encoder->write_string($datum); case AvroSchema::BYTES_TYPE: - return $encoder->write_bytes($datum); + return $encoder->write_bytes($writers_schema, $datum); case AvroSchema::ARRAY_SCHEMA: return $this->write_array($writers_schema, $datum, $encoder); case AvroSchema::MAP_SCHEMA: @@ -386,13 +386,25 @@ public function write_double($datum) * @param string $str * @uses self::write_bytes() */ - function write_string($str) { $this->write_bytes($str); } + function write_string($str) { $this->write_bytes(null, $str); } /** + * @param AvroSchema|null $writers_schema schema describing $bytes, or null to write $bytes unchanged; when its logical type is 'decimal', $bytes is first converted with decimal_to_bytes() using the schema's 'scale' (default 0) and 'precision' attributes * @param string $bytes + * @throws AvroException if a decimal schema has no 'precision' attribute, or if decimal_to_bytes() rejects the value */ - function write_bytes($bytes) + function write_bytes($writers_schema, $bytes) { + if ($writers_schema !== null && $writers_schema->logical_type() === 'decimal') { + $scale = $writers_schema->extra_attributes()['scale'] ?? 0; + $precision = $writers_schema->extra_attributes()['precision'] ?? 
null; + if ($precision === null) { + throw new AvroException('Decimal precision is required'); + } + + $bytes = self::decimal_to_bytes($bytes, $scale, $precision); + } + $this->write_long(strlen($bytes)); $this->write($bytes); } @@ -401,6 +413,49 @@ function write_bytes($bytes) * @param string $datum */ function write($datum) { $this->io->write($datum); } + + /** Multiplies $decimal by 10^$scale, rounds the product to an integer, packs it with pack('J') and drops leading 0x00/0xff bytes for as long as the following byte keeps the same sign bit. + * @throws AvroException if $decimal is not numeric or if the absolute scaled value exceeds 10^$precision - 1. NOTE(review): the (int) cast is applied before that range check; confirm the behaviour for products outside the 64-bit integer range. + */ + private static function decimal_to_bytes($decimal, int $scale, int $precision): string + { + if (!is_numeric($decimal)) { + throw new AvroException('Decimal must be a numeric value'); + } + + $value = $decimal * (10 ** $scale); + if (!is_int($value)) { + $value = (int)round($value); + } + if (abs($value) > (10 ** $precision - 1)) { + throw new AvroException('Decimal value is out of range'); + } + + $packed = pack('J', $value); + $significantBit = self::getMostSignificantBitAt($packed, 0); + $trimByte = $significantBit ? 0xff : 0x00; + + $offset = 0; + $packedLength = strlen($packed); + while ($offset < $packedLength - 1) { + if (ord($packed[$offset]) !== $trimByte) { + break; + } + + if (self::getMostSignificantBitAt($packed, $offset + 1) !== $significantBit) { + break; + } + + $offset++; + } + + return substr($packed, $offset); + } + + private static function getMostSignificantBitAt($bytes, $offset): int + { + return ord($bytes[$offset]) & 0x80; + } } /** @@ -925,8 +980,10 @@ static public function long_bits_to_double($bits) */ static public function bytes_to_decimal($bytes, $scale = 0) { - $int = hexdec(bin2hex($bytes)); - return $scale > 0 ? ($int / (10 ** $scale)) : $int; + $mostSignificantBit = ord($bytes[0]) & 0x80; + $padded = str_pad($bytes, 8, $mostSignificantBit ? "\xff" : "\x00", STR_PAD_LEFT); + $int = unpack('J', $padded)[1]; + return $scale > 0 ? 
($int / (10 ** $scale)) : $int; } /** diff --git a/lib/avro/schema.php b/lib/avro/schema.php index 0d7c1d0..f002f99 100644 --- a/lib/avro/schema.php +++ b/lib/avro/schema.php @@ -416,7 +416,11 @@ public static function is_valid_datum($expected_schema, $datum) case self::BOOLEAN_TYPE: return is_bool($datum); case self::STRING_TYPE: + return is_string($datum); case self::BYTES_TYPE: + if ($expected_schema->logical_type() === 'decimal') { + return is_numeric($datum); + } return is_string($datum); case self::INT_TYPE: return (is_int($datum) diff --git a/test/DatumIOTest.php b/test/DatumIOTest.php index 2ee82ec..1a07723 100644 --- a/test/DatumIOTest.php +++ b/test/DatumIOTest.php @@ -101,7 +101,34 @@ function data_provider() 'B', "\x02"), array('{"name":"rec","type":"record","fields":[{"name":"a","type":"int"},{"name":"b","type":"boolean"}]}', array('a' => 1, 'b' => false), - "\x02\x00") + "\x02\x00"), + array('{"type":"bytes","logicalType": "decimal","precision": 4,"scale": 1}', + '1', + "\x02\x0a"), + array('{"type":"bytes","logicalType": "decimal","precision": 4,"scale": 1}', + '-0.1', + "\x02\xff"), + array('{"type":"bytes","logicalType": "decimal","precision": 4,"scale": 1}', + -0.1, + "\x02\xff"), + array('{"type":"bytes","logicalType": "decimal","precision": 4,"scale": 1}', + 3.1, + "\x02\x1f"), + array('{"type":"bytes","logicalType": "decimal","precision": 4,"scale": 2}', + 2.55, + "\x04\x00\xff"), + array('{"type":"bytes","logicalType": "decimal","precision": 4,"scale": 0}', + -256, + "\x04\xff\x00"), + array('{"type":"bytes","logicalType": "decimal","precision": 4,"scale": 3}', + 0.127, + "\x02\x7f"), + array('{"type":"bytes","logicalType": "decimal","precision": 19,"scale": 0}', + PHP_INT_MAX, + "\x10\x7f\xff\xff\xff\xff\xff\xff\xff"), + array('{"type":"bytes","logicalType": "decimal","precision": 19,"scale": 0}', + PHP_INT_MIN, + "\x10\x80\x00\x00\x00\x00\x00\x00\x00") ); }