From 51ed758cedf703e8ac8f4022f6fd1bf4b887481e Mon Sep 17 00:00:00 2001
From: Sebastien Guillemot
Date: Sun, 29 Sep 2024 11:12:49 +0900
Subject: [PATCH 1/3] Fix offset value when escape characters exist

---
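[Reviewer note, not part of the commit] Before this change the tokenizer
advanced this.offset by bufferedString.byteLength, i.e. by the byte length
of the *decoded* string, so every escape sequence in the raw input made the
offsets reported for later tokens too small. A minimal standalone sketch of
the mismatch for the value this series adds to the tests ("\"\u00e1"),
assuming nothing beyond the standard TextEncoder and JSON.parse:

// Raw escaped bytes vs. decoded bytes for the new test value.
const raw = '"\\"\\u00e1"';                // exactly the value inside input4
const decoded = JSON.parse(raw) as string; // '"á'
const encoder = new TextEncoder();
const rawBytes = encoder.encode(raw).byteLength;         // 10 (2 quotes + 8 escaped bytes)
const decodedBytes = encoder.encode(decoded).byteLength; // 3 ('"' is 1 byte, 'á' is 2)
console.log(rawBytes - 2 - decodedBytes);                // 5 missing bytes: 1 for \" and 4 for \u00e1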
 packages/node/test/offset.ts            | 11 ++++++++---
 packages/plainjs/dist/deno/tokenizer.ts |  6 ++++++
 packages/plainjs/src/tokenizer.ts       |  6 ++++++
 packages/plainjs/test/offset.ts         | 11 ++++++++---
 packages/whatwg/test/offset.ts          | 11 ++++++++---
 5 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/packages/node/test/offset.ts b/packages/node/test/offset.ts
index 7de5194..2cd5986 100644
--- a/packages/node/test/offset.ts
+++ b/packages/node/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);
diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index b071519..3d0a5d5 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapeLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -336,6 +337,9 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
+          this.offset += this.escapeLength;
+          this.escapeLength = 0;
+
           this.offset += this.bufferedString.byteLength + 1;
           continue;
         }
@@ -398,12 +402,14 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
+          this.escapeLength += 1;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
+          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index 7cda24c..b0527ad 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapeLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -336,6 +337,9 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
+          this.offset += this.escapeLength;
+          this.escapeLength = 0;
+
           this.offset += this.bufferedString.byteLength + 1;
           continue;
         }
@@ -398,12 +402,14 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
+          this.escapeLength += 1;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
+          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
diff --git a/packages/plainjs/test/offset.ts b/packages/plainjs/test/offset.ts
index 39fc243..2d0a236 100644
--- a/packages/plainjs/test/offset.ts
+++ b/packages/plainjs/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "../src/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);
diff --git a/packages/whatwg/test/offset.ts b/packages/whatwg/test/offset.ts
index 7de5194..2cd5986 100644
--- a/packages/whatwg/test/offset.ts
+++ b/packages/whatwg/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);

From f2d10292e07dd9821f369845d0aeb6ea72ead8e4 Mon Sep 17 00:00:00 2001
From: Sebastien Guillemot
Date: Sun, 29 Sep 2024 12:41:25 +0900
Subject: [PATCH 2/3] handle different length utf8

---
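[Reviewer note, not part of the commit] A \uXXXX escape always occupies 6
bytes of raw input, but the decoded code point encodes to 1, 2 or 3 bytes of
UTF-8 (and an astral character arrives as two escapes, 12 raw bytes, that
decode to a 4-byte sequence). The flat escapeLength += 4 from the previous
commit is therefore only correct for 2-byte characters such as á; this
commit measures the encoded buffer instead. A small standalone illustration,
assuming only the standard TextEncoder:

// Extra raw-input bytes contributed by \uXXXX escapes of different widths.
const encoder = new TextEncoder();
const cases: Array<[escaped: string, decoded: string]> = [
  ["\\u0041", "\u0041"],           // 'A'  -> 1 UTF-8 byte,  6 - 1 = 5 extra
  ["\\u00e1", "\u00e1"],           // 'á'  -> 2 UTF-8 bytes, 6 - 2 = 4 extra
  ["\\u20ac", "\u20ac"],           // '€'  -> 3 UTF-8 bytes, 6 - 3 = 3 extra
  ["\\ud83d\\ude00", "\u{1f600}"], // '😀' -> 4 UTF-8 bytes, 12 - 4 = 8 extra
];
for (const [escaped, decoded] of cases) {
  const extra = escaped.length - encoder.encode(decoded).byteLength;
  console.log(`${escaped} hides ${extra} extra byte(s) of raw input`);
}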
 packages/plainjs/dist/deno/tokenizer.ts | 25 ++++++++++++-------------
 packages/plainjs/src/tokenizer.ts       | 25 ++++++++++++-------------
 2 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index 3d0a5d5..1d63c5f 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -402,14 +402,13 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 1;
+          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
-          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
@@ -447,24 +446,24 @@ export default class Tokenizer {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(String.fromCharCode(intVal)),
-              );
+              const buf = this.encoder.encode(String.fromCharCode(intVal));
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate, intVal),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate, intVal),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
             this.highSurrogate = undefined;
           }
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index b0527ad..c0af992 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -402,14 +402,13 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 1;
+          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
-          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
@@ -447,24 +446,24 @@ export default class Tokenizer {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(String.fromCharCode(intVal)),
-              );
+              const buf = this.encoder.encode(String.fromCharCode(intVal));
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate, intVal),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate, intVal),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
             this.highSurrogate = undefined;
           }

From 283cdb63004095b1720a6ec76038e9c36aa5b9a0 Mon Sep 17 00:00:00 2001
From: Sebastien Guillemot
Date: Wed, 13 Nov 2024 21:27:35 +0900
Subject: [PATCH 3/3] PR feedback on escape offset fix

---
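[Reviewer note, not part of the commit] This revision renames the counter to
escapedCharsByteLength, resets it when a new string starts, folds the two
offset updates into a single expression, short-circuits after storing a high
surrogate, and encodes the decoded \uXXXX result once before appending it.
The expected offsets in the tests are unchanged; taking the test data above,
they can be checked by hand with a short sketch (169 and 180 are the values
asserted in the offset tests):

// The value "\"\u00e1" in input4 opens at byte offset 169.
const stringOffset = 169;                                                 // [169, TokenType.STRING]
const decodedByteLength = new TextEncoder().encode('"\u00e1').byteLength; // 3
const escapedCharsByteLength = (2 - 1) + (6 - 2);                         // \" and \u00e1 -> 5
// opening quote + escaped content + closing quote + the space before }
const rightBrace =
  stringOffset + 1 + (decodedByteLength + escapedCharsByteLength) + 1 + 1;
console.log(rightBrace); // 180, matching [180, TokenType.RIGHT_BRACE]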
 packages/plainjs/dist/deno/tokenizer.ts | 37 +++++++++++++------------
 packages/plainjs/src/tokenizer.ts       | 37 +++++++++++++------------
 2 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index 1d63c5f..2843085 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -110,7 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
-  private escapeLength = 0;
+  private escapedCharsByteLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -301,6 +301,7 @@ export default class Tokenizer {
 
         if (n === charset.QUOTATION_MARK) {
           this.bufferedString.reset();
+          this.escapedCharsByteLength = 0;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -337,10 +338,10 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
-          this.offset += this.escapeLength;
-          this.escapeLength = 0;
-
-          this.offset += this.bufferedString.byteLength + 1;
+          this.offset +=
+            this.escapedCharsByteLength +
+            this.bufferedString.byteLength +
+            1;
           continue;
         }
 
@@ -402,7 +403,7 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
+          this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -441,32 +442,32 @@ export default class Tokenizer {
             this.unicode + String.fromCharCode(n),
             16,
           );
+          let unicodeString: string;
           if (this.highSurrogate === undefined) {
             if (intVal >= 0xd800 && intVal <= 0xdbff) {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
+              this.state = TokenizerStates.STRING_DEFAULT;
+              continue;
             } else {
-              const buf = this.encoder.encode(String.fromCharCode(intVal));
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(intVal);
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate, intVal),
+              unicodeString = String.fromCharCode(
+                this.highSurrogate,
+                intVal,
               );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate),
-              );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(this.highSurrogate);
             }
             this.highSurrogate = undefined;
           }
+          const unicodeBuffer = this.encoder.encode(unicodeString);
+          this.bufferedString.appendBuf(unicodeBuffer);
+          // len(\u0000)=6 minus the fact you're appending len(buf)
+          this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index c0af992..08a9d5d 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -110,7 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
-  private escapeLength = 0;
+  private escapedCharsByteLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -301,6 +301,7 @@ export default class Tokenizer {
 
         if (n === charset.QUOTATION_MARK) {
           this.bufferedString.reset();
+          this.escapedCharsByteLength = 0;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -337,10 +338,10 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
-          this.offset += this.escapeLength;
-          this.escapeLength = 0;
-
-          this.offset += this.bufferedString.byteLength + 1;
+          this.offset +=
+            this.escapedCharsByteLength +
+            this.bufferedString.byteLength +
+            1;
           continue;
         }
 
@@ -402,7 +403,7 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
+          this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -441,32 +442,32 @@ export default class Tokenizer {
             this.unicode + String.fromCharCode(n),
             16,
           );
+          let unicodeString: string;
           if (this.highSurrogate === undefined) {
             if (intVal >= 0xd800 && intVal <= 0xdbff) {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
+              this.state = TokenizerStates.STRING_DEFAULT;
+              continue;
             } else {
-              const buf = this.encoder.encode(String.fromCharCode(intVal));
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(intVal);
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate, intVal),
+              unicodeString = String.fromCharCode(
+                this.highSurrogate,
+                intVal,
               );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate),
-              );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(this.highSurrogate);
            }
             this.highSurrogate = undefined;
           }
+          const unicodeBuffer = this.encoder.encode(unicodeString);
+          this.bufferedString.appendBuf(unicodeBuffer);
+          // len(\u0000)=6 minus the fact you're appending len(buf)
+          this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
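[Reviewer note, not part of the patch series] A consumer-level view of what
the fix changes, sketched against the package's public Tokenizer as it is
exercised by the offset tests above. The import specifier and the shape of
the onToken callback are assumptions based on that usage, not taken from
this diff:

import { Tokenizer } from "@streamparser/json";

const tokenizer = new Tokenizer();
tokenizer.onToken = ({ token, value, offset }) => {
  // With the fix, offsets index into the raw input, escape sequences
  // included: the COMMA after "a\"b" is reported at byte 15 instead of
  // drifting to 14.
  console.log(offset, token, value);
};
tokenizer.write('{ "key": "a\\"b", "next": 1 }');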