From 51ed758cedf703e8ac8f4022f6fd1bf4b887481e Mon Sep 17 00:00:00 2001
From: Sebastien Guillemot
Date: Sun, 29 Sep 2024 11:12:49 +0900
Subject: [PATCH 1/3] Fix offset value when escape characters exist

---
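[Reviewer note, not part of the commit] Before this change the tokenizer
advanced this.offset by bufferedString.byteLength, i.e. by the byte length
of the *decoded* string, so every escape sequence in the raw input made the
offsets reported for later tokens too small. A minimal standalone sketch of
the mismatch for the value this series adds to the tests ("\"\u00e1"),
assuming nothing beyond the standard TextEncoder and JSON.parse:

// Raw escaped bytes vs. decoded bytes for the new test value.
const raw = '"\\"\\u00e1"';                // exactly the value inside input4
const decoded = JSON.parse(raw) as string; // '"á'
const encoder = new TextEncoder();
const rawBytes = encoder.encode(raw).byteLength;         // 10 (2 quotes + 8 escaped bytes)
const decodedBytes = encoder.encode(decoded).byteLength; // 3 ('"' is 1 byte, 'á' is 2)
console.log(rawBytes - 2 - decodedBytes);                // 5 missing bytes: 1 for \" and 4 for \u00e1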
 packages/node/test/offset.ts            | 11 ++++++++---
 packages/plainjs/dist/deno/tokenizer.ts |  6 ++++++
 packages/plainjs/src/tokenizer.ts       |  6 ++++++
 packages/plainjs/test/offset.ts         | 11 ++++++++---
 packages/whatwg/test/offset.ts          | 11 ++++++++---
 5 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/packages/node/test/offset.ts b/packages/node/test/offset.ts
index 7de5194..2cd5986 100644
--- a/packages/node/test/offset.ts
+++ b/packages/node/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);
diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index b071519..3d0a5d5 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapeLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -336,6 +337,9 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
+          this.offset += this.escapeLength;
+          this.escapeLength = 0;
+
           this.offset += this.bufferedString.byteLength + 1;
           continue;
         }
@@ -398,12 +402,14 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
+          this.escapeLength += 1;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
+          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index 7cda24c..b0527ad 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -110,6 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
+  private escapeLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -336,6 +337,9 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
+          this.offset += this.escapeLength;
+          this.escapeLength = 0;
+
           this.offset += this.bufferedString.byteLength + 1;
           continue;
         }
@@ -398,12 +402,14 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
+          this.escapeLength += 1;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
+          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
diff --git a/packages/plainjs/test/offset.ts b/packages/plainjs/test/offset.ts
index 39fc243..2d0a236 100644
--- a/packages/plainjs/test/offset.ts
+++ b/packages/plainjs/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "../src/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);
diff --git a/packages/whatwg/test/offset.ts b/packages/whatwg/test/offset.ts
index 7de5194..2cd5986 100644
--- a/packages/whatwg/test/offset.ts
+++ b/packages/whatwg/test/offset.ts
@@ -4,7 +4,8 @@ import TokenType from "@streamparser/json/utils/types/tokenType.js";
 
 const input1 = '{\n  "string": "value",\n  "number": 3,\n  "object"';
 const input2 = ': {\n  "key": "vд"\n  },\n  "array": [\n  -1,\n  12\n  ]\n  ';
-const input3 = '"null": null, "true": true, "false": false, "frac": 3.14 }';
+const input3 = '"null": null, "true": true, "false": false, "frac": 3.14,';
+const input4 = '"escape": "\\"\\u00e1" }';
 
 const offsets = [
   [0, TokenType.LEFT_BRACE],
@@ -46,7 +47,11 @@ const offsets = [
   [146, TokenType.STRING],
   [152, TokenType.COLON],
   [154, TokenType.NUMBER],
-  [159, TokenType.RIGHT_BRACE],
+  [158, TokenType.COMMA],
+  [159, TokenType.STRING],
+  [167, TokenType.COLON],
+  [169, TokenType.STRING],
+  [180, TokenType.RIGHT_BRACE],
 ];
 
 test("offset", async () => {
@@ -54,7 +59,7 @@ test("offset", async () => {
 
   await runTokenizerTest(
     new Tokenizer(),
-    [input1, input2, input3],
+    [input1, input2, input3, input4],
     ({ token, offset }) => {
       expect(offset).toEqual(offsets[i][0]);
       expect(token).toEqual(offsets[i][1]);

From f2d10292e07dd9821f369845d0aeb6ea72ead8e4 Mon Sep 17 00:00:00 2001
From: Sebastien Guillemot
Date: Sun, 29 Sep 2024 12:41:25 +0900
Subject: [PATCH 2/3] handle different length utf8

---
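[Reviewer note, not part of the commit] A \uXXXX escape always occupies 6
bytes of raw input, but the decoded code point encodes to 1, 2 or 3 bytes of
UTF-8 (and an astral character arrives as two escapes, 12 raw bytes, that
decode to a 4-byte sequence). The flat escapeLength += 4 from the previous
commit is therefore only correct for 2-byte characters such as á; this
commit measures the encoded buffer instead. A small standalone illustration,
assuming only the standard TextEncoder:

// Extra raw-input bytes contributed by \uXXXX escapes of different widths.
const encoder = new TextEncoder();
const cases: Array<[escaped: string, decoded: string]> = [
  ["\\u0041", "\u0041"],           // 'A'  -> 1 UTF-8 byte,  6 - 1 = 5 extra
  ["\\u00e1", "\u00e1"],           // 'á'  -> 2 UTF-8 bytes, 6 - 2 = 4 extra
  ["\\u20ac", "\u20ac"],           // '€'  -> 3 UTF-8 bytes, 6 - 3 = 3 extra
  ["\\ud83d\\ude00", "\u{1f600}"], // '😀' -> 4 UTF-8 bytes, 12 - 4 = 8 extra
];
for (const [escaped, decoded] of cases) {
  const extra = escaped.length - encoder.encode(decoded).byteLength;
  console.log(`${escaped} hides ${extra} extra byte(s) of raw input`);
}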
 packages/plainjs/dist/deno/tokenizer.ts | 25 ++++++++++++-------------
 packages/plainjs/src/tokenizer.ts       | 25 ++++++++++++-------------
 2 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index 3d0a5d5..1d63c5f 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -402,14 +402,13 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 1;
+          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
-          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
@@ -447,24 +446,24 @@ export default class Tokenizer {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(String.fromCharCode(intVal)),
-              );
+              const buf = this.encoder.encode(String.fromCharCode(intVal));
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate, intVal),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate, intVal),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
             this.highSurrogate = undefined;
           }
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index b0527ad..c0af992 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -402,14 +402,13 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 1;
+          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
 
         if (n === charset.LATIN_SMALL_LETTER_U) {
           this.unicode = "";
-          this.escapeLength += 4;
           this.state = TokenizerStates.STRING_UNICODE_DIGIT_1;
           continue;
         }
@@ -447,24 +446,24 @@ export default class Tokenizer {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(String.fromCharCode(intVal)),
-              );
+              const buf = this.encoder.encode(String.fromCharCode(intVal));
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate, intVal),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate, intVal),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              this.bufferedString.appendBuf(
-                this.encoder.encode(
-                  String.fromCharCode(this.highSurrogate),
-                ),
+              const buf = this.encoder.encode(
+                String.fromCharCode(this.highSurrogate),
               );
+              this.bufferedString.appendBuf(buf);
+              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             }
             this.highSurrogate = undefined;
           }

From 283cdb63004095b1720a6ec76038e9c36aa5b9a0 Mon Sep 17 00:00:00 2001
From: Sebastien Guillemot
Date: Wed, 13 Nov 2024 21:27:35 +0900
Subject: [PATCH 3/3] PR feedback on escape offset fix

---
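[Reviewer note, not part of the commit] This revision renames the counter to
escapedCharsByteLength, resets it when a new string starts, folds the two
offset updates into a single expression, short-circuits after storing a high
surrogate, and encodes the decoded \uXXXX result once before appending it.
The expected offsets in the tests are unchanged; taking the test data above,
they can be checked by hand with a short sketch (169 and 180 are the values
asserted in the offset tests):

// The value "\"\u00e1" in input4 opens at byte offset 169.
const stringOffset = 169;                                                 // [169, TokenType.STRING]
const decodedByteLength = new TextEncoder().encode('"\u00e1').byteLength; // 3
const escapedCharsByteLength = (2 - 1) + (6 - 2);                         // \" and \u00e1 -> 5
// opening quote + escaped content + closing quote + the space before }
const rightBrace =
  stringOffset + 1 + (decodedByteLength + escapedCharsByteLength) + 1 + 1;
console.log(rightBrace); // 180, matching [180, TokenType.RIGHT_BRACE]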
 packages/plainjs/dist/deno/tokenizer.ts | 37 +++++++++++++------------
 packages/plainjs/src/tokenizer.ts       | 37 +++++++++++++------------
 2 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/packages/plainjs/dist/deno/tokenizer.ts b/packages/plainjs/dist/deno/tokenizer.ts
index 1d63c5f..2843085 100644
--- a/packages/plainjs/dist/deno/tokenizer.ts
+++ b/packages/plainjs/dist/deno/tokenizer.ts
@@ -110,7 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
-  private escapeLength = 0;
+  private escapedCharsByteLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -301,6 +301,7 @@ export default class Tokenizer {
 
         if (n === charset.QUOTATION_MARK) {
           this.bufferedString.reset();
+          this.escapedCharsByteLength = 0;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -337,10 +338,10 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
-          this.offset += this.escapeLength;
-          this.escapeLength = 0;
-
-          this.offset += this.bufferedString.byteLength + 1;
+          this.offset +=
+            this.escapedCharsByteLength +
+            this.bufferedString.byteLength +
+            1;
           continue;
         }
 
@@ -402,7 +403,7 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
+          this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -441,32 +442,32 @@ export default class Tokenizer {
             this.unicode + String.fromCharCode(n),
             16,
           );
+          let unicodeString: string;
           if (this.highSurrogate === undefined) {
             if (intVal >= 0xd800 && intVal <= 0xdbff) {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
+              this.state = TokenizerStates.STRING_DEFAULT;
+              continue;
             } else {
-              const buf = this.encoder.encode(String.fromCharCode(intVal));
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(intVal);
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate, intVal),
+              unicodeString = String.fromCharCode(
+                this.highSurrogate,
+                intVal,
               );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate),
-              );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(this.highSurrogate);
             }
             this.highSurrogate = undefined;
           }
+          const unicodeBuffer = this.encoder.encode(unicodeString);
+          this.bufferedString.appendBuf(unicodeBuffer);
+          // len(\u0000)=6 minus the fact you're appending len(buf)
+          this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
diff --git a/packages/plainjs/src/tokenizer.ts b/packages/plainjs/src/tokenizer.ts
index c0af992..08a9d5d 100644
--- a/packages/plainjs/src/tokenizer.ts
+++ b/packages/plainjs/src/tokenizer.ts
@@ -110,7 +110,7 @@ export default class Tokenizer {
   private separator?: string;
   private separatorBytes?: Uint8Array;
   private separatorIndex = 0;
-  private escapeLength = 0;
+  private escapedCharsByteLength = 0;
 
   private bufferedString: StringBuilder;
   private bufferedNumber: StringBuilder;
@@ -301,6 +301,7 @@ export default class Tokenizer {
 
         if (n === charset.QUOTATION_MARK) {
           this.bufferedString.reset();
+          this.escapedCharsByteLength = 0;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -337,10 +338,10 @@ export default class Tokenizer {
             value: string,
             offset: this.offset,
           });
-          this.offset += this.escapeLength;
-          this.escapeLength = 0;
-
-          this.offset += this.bufferedString.byteLength + 1;
+          this.offset +=
+            this.escapedCharsByteLength +
+            this.bufferedString.byteLength +
+            1;
           continue;
         }
 
@@ -402,7 +403,7 @@ export default class Tokenizer {
         const controlChar = escapedSequences[n];
         if (controlChar) {
           this.bufferedString.appendChar(controlChar);
-          this.escapeLength += 2 - 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
+          this.escapedCharsByteLength += 1; // len(\")=2 minus the fact you're appending len(controlChar)=1
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
@@ -441,32 +442,32 @@ export default class Tokenizer {
             this.unicode + String.fromCharCode(n),
             16,
           );
+          let unicodeString: string;
           if (this.highSurrogate === undefined) {
             if (intVal >= 0xd800 && intVal <= 0xdbff) {
               //<55296,56319> - highSurrogate
               this.highSurrogate = intVal;
+              this.state = TokenizerStates.STRING_DEFAULT;
+              continue;
             } else {
-              const buf = this.encoder.encode(String.fromCharCode(intVal));
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(intVal);
             }
           } else {
             if (intVal >= 0xdc00 && intVal <= 0xdfff) {
               //<56320,57343> - lowSurrogate
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate, intVal),
+              unicodeString = String.fromCharCode(
+                this.highSurrogate,
+                intVal,
               );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
             } else {
-              const buf = this.encoder.encode(
-                String.fromCharCode(this.highSurrogate),
-              );
-              this.bufferedString.appendBuf(buf);
-              this.escapeLength += 6 - buf.byteLength; // len(\u0000)=6 minus the fact you're appending len(buf)
+              unicodeString = String.fromCharCode(this.highSurrogate);
            }
             this.highSurrogate = undefined;
           }
+          const unicodeBuffer = this.encoder.encode(unicodeString);
+          this.bufferedString.appendBuf(unicodeBuffer);
+          // len(\u0000)=6 minus the fact you're appending len(buf)
+          this.escapedCharsByteLength += 6 - unicodeBuffer.byteLength;
           this.state = TokenizerStates.STRING_DEFAULT;
           continue;
         }
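[Reviewer note, not part of the patch series] A consumer-level view of what
the fix changes, sketched against the package's public Tokenizer as it is
exercised by the offset tests above. The import specifier and the shape of
the onToken callback are assumptions based on that usage, not taken from
this diff:

import { Tokenizer } from "@streamparser/json";

const tokenizer = new Tokenizer();
tokenizer.onToken = ({ token, value, offset }) => {
  // With the fix, offsets index into the raw input, escape sequences
  // included: the COMMA after "a\"b" is reported at byte 15 instead of
  // drifting to 14.
  console.log(offset, token, value);
};
tokenizer.write('{ "key": "a\\"b", "next": 1 }');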