From bc5d4eb0fa76ea65faf98a900b145c6a4d6d595f Mon Sep 17 00:00:00 2001 From: Sashe Date: Mon, 17 Jan 2022 19:10:36 +0200 Subject: [PATCH] Optimized the StringShinglingTool --- .../functional/NearDuplicatesFinder.func.ts | 6 ++-- __tests__/unit/NearDuplicatesFinder.spec.ts | 2 +- .../ShinglingTool/StringShinglingTool.spec.ts | 36 ++----------------- .../functional/NearDuplicatesFinder.func.js | 6 ++-- .../NearDuplicatesFinder.func.js.map | 2 +- .../unit/NearDuplicatesFinder.spec.js | 2 +- .../ShinglingTool/StringShinglingTool.spec.js | 27 +------------- .../StringShinglingTool.spec.js.map | 2 +- dist/src/ShinglingTool/StringShinglingTool.js | 7 ++-- .../ShinglingTool/StringShinglingTool.js.map | 2 +- src/ShinglingTool/StringShinglingTool.ts | 10 ++++-- 11 files changed, 27 insertions(+), 75 deletions(-) diff --git a/__tests__/functional/NearDuplicatesFinder.func.ts b/__tests__/functional/NearDuplicatesFinder.func.ts index 9ae2e94..1f422e0 100644 --- a/__tests__/functional/NearDuplicatesFinder.func.ts +++ b/__tests__/functional/NearDuplicatesFinder.func.ts @@ -52,10 +52,10 @@ describe("Testing NearDuplicateFinder class", () => { const expected = { review5: [ [1, "review6"], - [0.9430284857571214, "review136"], + [0.9333333333333333, "review136"], ], - review6: [[0.9430284857571214, "review136"]], - review9: [[0.8916129032258064, "review81"]], + review6: [[0.9333333333333333, "review136"]], + review81: [[0.8853503184713376, "review9"]], }; const finder = makeDuplicatesFinderWithMocks({ diff --git a/__tests__/unit/NearDuplicatesFinder.spec.ts b/__tests__/unit/NearDuplicatesFinder.spec.ts index 78571d9..6af8897 100644 --- a/__tests__/unit/NearDuplicatesFinder.spec.ts +++ b/__tests__/unit/NearDuplicatesFinder.spec.ts @@ -25,7 +25,7 @@ describe("Testing NearDuplicateFinder class", () => { "Like The Rings of The Lord, but with pink parrots", "Like The Rings of The Lord, but with pink poodles", ], - { text0: [[0.7647058823529411, "text1"]] }, + { text0: [[0.6666666666666666, "text1"]] }, ], [ "Test case: Totally different identical texts (score=0)", diff --git a/__tests__/unit/ShinglingTool/StringShinglingTool.spec.ts b/__tests__/unit/ShinglingTool/StringShinglingTool.spec.ts index 2042cfb..afb239f 100644 --- a/__tests__/unit/ShinglingTool/StringShinglingTool.spec.ts +++ b/__tests__/unit/ShinglingTool/StringShinglingTool.spec.ts @@ -11,7 +11,7 @@ describe("Testing ShinglingTool/StringShinglingTool class", () => { [ "Test case: String with length that is bigger than the shingle length", "Not so long ", - ["Not so", "ot so ", "t so l", " so lo", "so lon", "o long", " long "], + ["Not so", " long "], ], [ "Test case: String with length that is equal the shingle length", @@ -27,39 +27,7 @@ describe("Testing ShinglingTool/StringShinglingTool class", () => { [ "Test case: String with non ascii symbols", "Като игра на тронове, ама във ваната", - [ - "Като и", - "ато иг", - "то игр", - "о игра", - " игра ", - "игра н", - "гра на", - "ра на ", - "а на т", - " на тр", - "на тро", - "а трон", - " троно", - "тронов", - "ронове", - "онове,", - "нове, ", - "ове, а", - "ве, ам", - "е, ама", - ", ама ", - " ама в", - "ама въ", - "ма във", - "а във ", - " във в", - "във ва", - "ъв ван", - "в вана", - " ванат", - "ваната", - ], + ["Като и", "гра на", " троно", "ве, ам", "а във ", "ваната"], ], ]; diff --git a/dist/__tests__/functional/NearDuplicatesFinder.func.js b/dist/__tests__/functional/NearDuplicatesFinder.func.js index 71bb725..2cac6ca 100644 --- a/dist/__tests__/functional/NearDuplicatesFinder.func.js +++ b/dist/__tests__/functional/NearDuplicatesFinder.func.js @@ -72,10 +72,10 @@ describe("Testing NearDuplicateFinder class", () => { const expected = { review5: [ [1, "review6"], - [0.9430284857571214, "review136"], + [0.9333333333333333, "review136"], ], - review6: [[0.9430284857571214, "review136"]], - review9: [[0.8916129032258064, "review81"]], + review6: [[0.9333333333333333, "review136"]], + review81: [[0.8853503184713376, "review9"]], }; const finder = (0, duplicatesFinderFactory_1.makeDuplicatesFinderWithMocks)({ minSimilarity: 0.01, diff --git a/dist/__tests__/functional/NearDuplicatesFinder.func.js.map b/dist/__tests__/functional/NearDuplicatesFinder.func.js.map index d8b372c..f9a64cc 100644 --- a/dist/__tests__/functional/NearDuplicatesFinder.func.js.map +++ b/dist/__tests__/functional/NearDuplicatesFinder.func.js.map @@ -1 +1 @@ -{"version":3,"file":"NearDuplicatesFinder.func.js","sourceRoot":"","sources":["../../../__tests__/functional/NearDuplicatesFinder.func.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;AAAA,uFAGmD;AACnD,4CAAoB;AACpB,gDAAwB;AACxB,wDAAgC;AAEhC,QAAQ,CAAC,mCAAmC,EAAE,GAAG,EAAE;IACjD,IAAI,CAAC,6DAA6D,EAAE,GAAS,EAAE;;QAC7E,MAAM,QAAQ,GAAG;YACf,OAAO,EAAE;gBACP,CAAC,CAAC,EAAE,SAAS,CAAC;gBACd,CAAC,iBAAiB,EAAE,WAAW,CAAC;aACjC;YACD,OAAO,EAAE,CAAC,CAAC,iBAAiB,EAAE,WAAW,CAAC,CAAC;YAC3C,QAAQ,EAAE,CAAC,CAAC,kBAAkB,EAAE,SAAS,CAAC,CAAC;SAC5C,CAAC;QAEF,MAAM,MAAM,GAAG,IAAA,uDAA6B,EAAC;YAC3C,aAAa,EAAE,IAAI;YACnB,YAAY,EAAE,CAAC;YACf,YAAY,EAAE,MAAM;YACpB,eAAe,EAAE,GAAG;YACpB,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,YAAE,CAAC,gBAAgB,CACpC,cAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,kBAAkB,CAAC,CACjE,CAAC;QAEF,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,UAAU;YACjB,SAAS,EAAE,QAAQ;SACpB,CAAC,CAAC;QAEH,IAAI,KAAK,GAAG,CAAC,CAAC;;YAEd,KAAyB,IAAA,OAAA,cAAA,EAAE,CAAA,QAAA;gBAAhB,MAAM,IAAI,eAAA,CAAA;gBACnB,MAAM,CAAC,GAAG,CAAC,SAAS,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC;gBACnC,KAAK,IAAI,CAAC,CAAC;gBACX,IAAI,KAAK,GAAG,GAAG,EAAE;oBACf,MAAM;iBACP;aACF;;;;;;;;;QAED,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;QACnC,MAAM,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAA,CAAC,CAAC;IAEH,IAAI,CAAC,mEAAmE,EAAE,GAAS,EAAE;;QACnF,MAAM,QAAQ,GAAG;YACf,OAAO,EAAE;gBACP,CAAC,CAAC,EAAE,SAAS,CAAC;gBACd,CAAC,kBAAkB,EAAE,WAAW,CAAC;aAClC;YACD,OAAO,EAAE,CAAC,CAAC,kBAAkB,EAAE,WAAW,CAAC,CAAC;YAC5C,OAAO,EAAE,CAAC,CAAC,kBAAkB,EAAE,UAAU,CAAC,CAAC;SAC5C,CAAC;QAEF,MAAM,MAAM,GAAG,IAAA,uDAA6B,EAAC;YAC3C,aAAa,EAAE,IAAI;YACnB,YAAY,EAAE,CAAC;YACf,YAAY,EAAE,MAAM;YACpB,eAAe,EAAE,GAAG;YACpB,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,YAAE,CAAC,gBAAgB,CACpC,cAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,kBAAkB,CAAC,CACjE,CAAC;QAEF,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,UAAU;YACjB,SAAS,EAAE,QAAQ;SACpB,CAAC,CAAC;QAEH,IAAI,KAAK,GAAG,CAAC,CAAC;;YAEd,KAAyB,IAAA,OAAA,cAAA,EAAE,CAAA,QAAA;gBAAhB,MAAM,IAAI,eAAA,CAAA;gBACnB,MAAM,CAAC,GAAG,CAAC,SAAS,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC;gBACnC,KAAK,IAAI,CAAC,CAAC;gBACX,IAAI,KAAK,GAAG,GAAG,EAAE;oBACf,MAAM;iBACP;aACF;;;;;;;;;QAED,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;QACnC,MAAM,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAA,CAAC,CAAC;IAEH,IAAI,CAAC,qEAAqE,EAAE,GAAS,EAAE;;QACrF,MAAM,QAAQ,GAAG;YACf,OAAO,EAAE;gBACP,CAAC,CAAC,EAAE,SAAS,CAAC;gBACd,CAAC,iBAAiB,EAAE,WAAW,CAAC;aACjC;YACD,OAAO,EAAE,CAAC,CAAC,iBAAiB,EAAE,WAAW,CAAC,CAAC;YAC3C,QAAQ,EAAE,CAAC,CAAC,kBAAkB,EAAE,SAAS,CAAC,CAAC;SAC5C,CAAC;QAEF,MAAM,MAAM,GAAG,IAAA,4DAAkC,EAAC;YAChD,aAAa,EAAE,IAAI;YACnB,YAAY,EAAE,CAAC;YACf,YAAY,EAAE,MAAM;YACpB,eAAe,EAAE,GAAG;YACpB,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,YAAE,CAAC,gBAAgB,CACpC,cAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,kBAAkB,CAAC,CACjE,CAAC;QAEF,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,UAAU;YACjB,SAAS,EAAE,QAAQ;SACpB,CAAC,CAAC;QAEH,IAAI,KAAK,GAAG,CAAC,CAAC;;YAEd,KAAyB,IAAA,OAAA,cAAA,EAAE,CAAA,QAAA;gBAAhB,MAAM,IAAI,eAAA,CAAA;gBACnB,MAAM,MAAM,CAAC,GAAG,CAAC,SAAS,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC;gBACzC,KAAK,IAAI,CAAC,CAAC;gBACX,IAAI,KAAK,GAAG,GAAG,EAAE;oBACf,MAAM;iBACP;aACF;;;;;;;;;QAED,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,MAAM,EAAE,CAAC;QACzC,MAAM,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAA,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"} \ No newline at end of file +{"version":3,"file":"NearDuplicatesFinder.func.js","sourceRoot":"","sources":["../../../__tests__/functional/NearDuplicatesFinder.func.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;AAAA,uFAGmD;AACnD,4CAAoB;AACpB,gDAAwB;AACxB,wDAAgC;AAEhC,QAAQ,CAAC,mCAAmC,EAAE,GAAG,EAAE;IACjD,IAAI,CAAC,6DAA6D,EAAE,GAAS,EAAE;;QAC7E,MAAM,QAAQ,GAAG;YACf,OAAO,EAAE;gBACP,CAAC,CAAC,EAAE,SAAS,CAAC;gBACd,CAAC,iBAAiB,EAAE,WAAW,CAAC;aACjC;YACD,OAAO,EAAE,CAAC,CAAC,iBAAiB,EAAE,WAAW,CAAC,CAAC;YAC3C,QAAQ,EAAE,CAAC,CAAC,kBAAkB,EAAE,SAAS,CAAC,CAAC;SAC5C,CAAC;QAEF,MAAM,MAAM,GAAG,IAAA,uDAA6B,EAAC;YAC3C,aAAa,EAAE,IAAI;YACnB,YAAY,EAAE,CAAC;YACf,YAAY,EAAE,MAAM;YACpB,eAAe,EAAE,GAAG;YACpB,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,YAAE,CAAC,gBAAgB,CACpC,cAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,kBAAkB,CAAC,CACjE,CAAC;QAEF,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,UAAU;YACjB,SAAS,EAAE,QAAQ;SACpB,CAAC,CAAC;QAEH,IAAI,KAAK,GAAG,CAAC,CAAC;;YAEd,KAAyB,IAAA,OAAA,cAAA,EAAE,CAAA,QAAA;gBAAhB,MAAM,IAAI,eAAA,CAAA;gBACnB,MAAM,CAAC,GAAG,CAAC,SAAS,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC;gBACnC,KAAK,IAAI,CAAC,CAAC;gBACX,IAAI,KAAK,GAAG,GAAG,EAAE;oBACf,MAAM;iBACP;aACF;;;;;;;;;QAED,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;QACnC,MAAM,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAA,CAAC,CAAC;IAEH,IAAI,CAAC,mEAAmE,EAAE,GAAS,EAAE;;QACnF,MAAM,QAAQ,GAAG;YACf,OAAO,EAAE;gBACP,CAAC,CAAC,EAAE,SAAS,CAAC;gBACd,CAAC,kBAAkB,EAAE,WAAW,CAAC;aAClC;YACD,OAAO,EAAE,CAAC,CAAC,kBAAkB,EAAE,WAAW,CAAC,CAAC;YAC5C,QAAQ,EAAE,CAAC,CAAC,kBAAkB,EAAE,SAAS,CAAC,CAAC;SAC5C,CAAC;QAEF,MAAM,MAAM,GAAG,IAAA,uDAA6B,EAAC;YAC3C,aAAa,EAAE,IAAI;YACnB,YAAY,EAAE,CAAC;YACf,YAAY,EAAE,MAAM;YACpB,eAAe,EAAE,GAAG;YACpB,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,YAAE,CAAC,gBAAgB,CACpC,cAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,kBAAkB,CAAC,CACjE,CAAC;QAEF,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,UAAU;YACjB,SAAS,EAAE,QAAQ;SACpB,CAAC,CAAC;QAEH,IAAI,KAAK,GAAG,CAAC,CAAC;;YAEd,KAAyB,IAAA,OAAA,cAAA,EAAE,CAAA,QAAA;gBAAhB,MAAM,IAAI,eAAA,CAAA;gBACnB,MAAM,CAAC,GAAG,CAAC,SAAS,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC;gBACnC,KAAK,IAAI,CAAC,CAAC;gBACX,IAAI,KAAK,GAAG,GAAG,EAAE;oBACf,MAAM;iBACP;aACF;;;;;;;;;QAED,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;QACnC,MAAM,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAA,CAAC,CAAC;IAEH,IAAI,CAAC,qEAAqE,EAAE,GAAS,EAAE;;QACrF,MAAM,QAAQ,GAAG;YACf,OAAO,EAAE;gBACP,CAAC,CAAC,EAAE,SAAS,CAAC;gBACd,CAAC,iBAAiB,EAAE,WAAW,CAAC;aACjC;YACD,OAAO,EAAE,CAAC,CAAC,iBAAiB,EAAE,WAAW,CAAC,CAAC;YAC3C,QAAQ,EAAE,CAAC,CAAC,kBAAkB,EAAE,SAAS,CAAC,CAAC;SAC5C,CAAC;QAEF,MAAM,MAAM,GAAG,IAAA,4DAAkC,EAAC;YAChD,aAAa,EAAE,IAAI;YACnB,YAAY,EAAE,CAAC;YACf,YAAY,EAAE,MAAM;YACpB,eAAe,EAAE,GAAG;YACpB,WAAW,EAAE,CAAC;SACf,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,YAAE,CAAC,gBAAgB,CACpC,cAAI,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,UAAU,EAAE,kBAAkB,CAAC,CACjE,CAAC;QAEF,MAAM,EAAE,GAAG,kBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,UAAU;YACjB,SAAS,EAAE,QAAQ;SACpB,CAAC,CAAC;QAEH,IAAI,KAAK,GAAG,CAAC,CAAC;;YAEd,KAAyB,IAAA,OAAA,cAAA,EAAE,CAAA,QAAA;gBAAhB,MAAM,IAAI,eAAA,CAAA;gBACnB,MAAM,MAAM,CAAC,GAAG,CAAC,SAAS,KAAK,EAAE,EAAE,IAAI,CAAC,CAAC;gBACzC,KAAK,IAAI,CAAC,CAAC;gBACX,IAAI,KAAK,GAAG,GAAG,EAAE;oBACf,MAAM;iBACP;aACF;;;;;;;;;QAED,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,MAAM,EAAE,CAAC;QACzC,MAAM,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACvC,CAAC,CAAA,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"} \ No newline at end of file diff --git a/dist/__tests__/unit/NearDuplicatesFinder.spec.js b/dist/__tests__/unit/NearDuplicatesFinder.spec.js index c8422b8..73144c3 100644 --- a/dist/__tests__/unit/NearDuplicatesFinder.spec.js +++ b/dist/__tests__/unit/NearDuplicatesFinder.spec.js @@ -25,7 +25,7 @@ describe("Testing NearDuplicateFinder class", () => { "Like The Rings of The Lord, but with pink parrots", "Like The Rings of The Lord, but with pink poodles", ], - { text0: [[0.7647058823529411, "text1"]] }, + { text0: [[0.6666666666666666, "text1"]] }, ], [ "Test case: Totally different identical texts (score=0)", diff --git a/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js b/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js index 008c18b..1f8d09c 100644 --- a/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js +++ b/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js @@ -10,7 +10,7 @@ describe("Testing ShinglingTool/StringShinglingTool class", () => { [ "Test case: String with length that is bigger than the shingle length", "Not so long ", - ["Not so", "ot so ", "t so l", " so lo", "so lon", "o long", " long "], + ["Not so", " long "], ], [ "Test case: String with length that is equal the shingle length", @@ -28,35 +28,10 @@ describe("Testing ShinglingTool/StringShinglingTool class", () => { "Като игра на тронове, ама във ваната", [ "Като и", - "ато иг", - "то игр", - "о игра", - " игра ", - "игра н", "гра на", - "ра на ", - "а на т", - " на тр", - "на тро", - "а трон", " троно", - "тронов", - "ронове", - "онове,", - "нове, ", - "ове, а", "ве, ам", - "е, ама", - ", ама ", - " ама в", - "ама въ", - "ма във", "а във ", - " във в", - "във ва", - "ъв ван", - "в вана", - " ванат", "ваната", ], ], diff --git a/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js.map b/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js.map index a1239eb..41b8a64 100644 --- a/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js.map +++ b/dist/__tests__/unit/ShinglingTool/StringShinglingTool.spec.js.map @@ -1 +1 @@ -{"version":3,"file":"StringShinglingTool.spec.js","sourceRoot":"","sources":["../../../../__tests__/unit/ShinglingTool/StringShinglingTool.spec.ts"],"names":[],"mappings":";;;;;AAAA,yGAAiF;AAGjF,QAAQ,CAAC,iDAAiD,EAAE,GAAG,EAAE;IAC/D,MAAM,IAAI,GAAG,IAAI,6BAAmB,CAClC,CAAC,EACD,CAAC,OAAe,EAAmB,EAAE,CAAC,OAAO,CAC9C,CAAC;IAEF,MAAM,IAAI,GAAiC;QACzC;YACE,sEAAsE;YACtE,cAAc;YACd,CAAC,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,CAAC;SACvE;QACD;YACE,gEAAgE;YAChE,QAAQ;YACR,CAAC,QAAQ,CAAC;SACX;QACD;YACE,+DAA+D;YAC/D,KAAK;YACL,CAAC,KAAK,CAAC;SACR;QACD,CAAC,yBAAyB,EAAE,EAAE,EAAE,EAAE,CAAC;QACnC;YACE,0CAA0C;YAC1C,sCAAsC;YACtC;gBACE,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;aACT;SACF;KACF,CAAC;IAEF,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CACb,IAAI,EACJ,CAAC,QAAgB,EAAE,IAAY,EAAE,QAAkB,EAAE,EAAE;QACrD,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,KAAa,EAAE,OAAgB,EAAE,EAAE;YACjE,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACzB,CAAC,CAAC,CAAC;QACH,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,QAAQ,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE;YAC7B,MAAM,CAAC,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;QACtC,CAAC,CAAC,CAAC;IACL,CAAC,CACF,CAAC;AACJ,CAAC,CAAC,CAAC"} \ No newline at end of file +{"version":3,"file":"StringShinglingTool.spec.js","sourceRoot":"","sources":["../../../../__tests__/unit/ShinglingTool/StringShinglingTool.spec.ts"],"names":[],"mappings":";;;;;AAAA,yGAAiF;AAGjF,QAAQ,CAAC,iDAAiD,EAAE,GAAG,EAAE;IAC/D,MAAM,IAAI,GAAG,IAAI,6BAAmB,CAClC,CAAC,EACD,CAAC,OAAe,EAAmB,EAAE,CAAC,OAAO,CAC9C,CAAC;IAEF,MAAM,IAAI,GAAiC;QACzC;YACE,sEAAsE;YACtE,cAAc;YACd,CAAC,QAAQ,EAAE,QAAQ,CAAC;SACrB;QACD;YACE,gEAAgE;YAChE,QAAQ;YACR,CAAC,QAAQ,CAAC;SACX;QACD;YACE,+DAA+D;YAC/D,KAAK;YACL,CAAC,KAAK,CAAC;SACR;QACD,CAAC,yBAAyB,EAAE,EAAE,EAAE,EAAE,CAAC;QACnC;YACE,0CAA0C;YAC1C,sCAAsC;YACtC;gBACI,QAAQ;gBACV,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;gBACR,QAAQ;aACT;SACF;KACF,CAAC;IAEF,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CACb,IAAI,EACJ,CAAC,QAAgB,EAAE,IAAY,EAAE,QAAkB,EAAE,EAAE;QACrD,MAAM,QAAQ,GAAc,EAAE,CAAC;QAC/B,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,KAAa,EAAE,OAAgB,EAAE,EAAE;YACjE,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACzB,CAAC,CAAC,CAAC;QACH,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,QAAQ,CAAC,OAAO,CAAC,CAAC,EAAE,EAAE,KAAK,EAAE,EAAE;YAC7B,MAAM,CAAC,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;QACtC,CAAC,CAAC,CAAC;IACL,CAAC,CACF,CAAC;AACJ,CAAC,CAAC,CAAC"} \ No newline at end of file diff --git a/dist/src/ShinglingTool/StringShinglingTool.js b/dist/src/ShinglingTool/StringShinglingTool.js index 13efffd..1c7e75a 100644 --- a/dist/src/ShinglingTool/StringShinglingTool.js +++ b/dist/src/ShinglingTool/StringShinglingTool.js @@ -17,8 +17,11 @@ class StringShinglingTool extends BaseShinglingTool_1.BaseShinglingTool { while (endPosition <= items.length) { const shingle = this.hasher(items.slice(startPosition, endPosition).join("")); callback(docId, shingle); - startPosition += 1; - endPosition += 1; + startPosition += this.shingleSize; + endPosition = endPosition + this.shingleSize > items.length ? items.length : endPosition + this.shingleSize; + if (startPosition >= endPosition) { + break; + } } } } diff --git a/dist/src/ShinglingTool/StringShinglingTool.js.map b/dist/src/ShinglingTool/StringShinglingTool.js.map index c896958..346a69f 100644 --- a/dist/src/ShinglingTool/StringShinglingTool.js.map +++ b/dist/src/ShinglingTool/StringShinglingTool.js.map @@ -1 +1 @@ -{"version":3,"file":"StringShinglingTool.js","sourceRoot":"","sources":["../../../src/ShinglingTool/StringShinglingTool.ts"],"names":[],"mappings":";;AAAA,2DAAwD;AAGxD,MAAqB,mBAAoB,SAAQ,qCAAiB;IACzD,OAAO,CACZ,KAAa,EACb,IAAY,EACZ,QAAmD;QAEnD;;;WAGG;QAEH,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC;QAExB,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC;QAEnC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,WAAW,EAAE;YACrD,QAAQ,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;YACtB,OAAO;SACR;QACD,OAAO,WAAW,IAAI,KAAK,CAAC,MAAM,EAAE;YAClC,MAAM,OAAO,GAAY,IAAI,CAAC,MAAM,CAClC,KAAK,CAAC,KAAK,CAAC,aAAa,EAAE,WAAW,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CACjD,CAAC;YACF,QAAQ,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;YACzB,aAAa,IAAI,CAAC,CAAC;YACnB,WAAW,IAAI,CAAC,CAAC;SAClB;IACH,CAAC;CACF;AA7BD,sCA6BC"} \ No newline at end of file +{"version":3,"file":"StringShinglingTool.js","sourceRoot":"","sources":["../../../src/ShinglingTool/StringShinglingTool.ts"],"names":[],"mappings":";;AAAA,2DAAwD;AAGxD,MAAqB,mBAAoB,SAAQ,qCAAiB;IACzD,OAAO,CACZ,KAAa,EACb,IAAY,EACZ,QAAmD;QAEnD;;;WAGG;QAEH,MAAM,KAAK,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC;QAExB,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC;QAEnC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,WAAW,EAAE;YACrD,QAAQ,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;YACtB,OAAO;SACR;QACD,OAAO,WAAW,IAAI,KAAK,CAAC,MAAM,EAAE;YAClC,MAAM,OAAO,GAAY,IAAI,CAAC,MAAM,CAClC,KAAK,CAAC,KAAK,CAAC,aAAa,EAAE,WAAW,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CACjD,CAAC;YACF,QAAQ,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;YACzB,aAAa,IAAI,IAAI,CAAC,WAAW,CAAC;YAClC,WAAW,GAAG,WAAW,GAAG,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC;YAC5G,IAAI,aAAa,IAAI,WAAW,EAAE;gBAChC,MAAM;aACP;SACF;IACH,CAAC;CACF;AAhCD,sCAgCC"} \ No newline at end of file diff --git a/src/ShinglingTool/StringShinglingTool.ts b/src/ShinglingTool/StringShinglingTool.ts index 4383b12..7db344e 100644 --- a/src/ShinglingTool/StringShinglingTool.ts +++ b/src/ShinglingTool/StringShinglingTool.ts @@ -26,8 +26,14 @@ export default class StringShinglingTool extends BaseShinglingTool { items.slice(startPosition, endPosition).join("") ); callback(docId, shingle); - startPosition += 1; - endPosition += 1; + startPosition += this.shingleSize; + endPosition = + endPosition + this.shingleSize > items.length + ? items.length + : endPosition + this.shingleSize; + if (startPosition >= endPosition) { + break; + } } } }