From 1d5539292df410f5dfad06890a03fd4994a56dde Mon Sep 17 00:00:00 2001 From: Huey Date: Sat, 12 Jun 2021 13:19:09 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20improve:=20sort=20search=20results?= =?UTF-8?q?=20by=20textual=20similarity;=20resolve=20#37?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squashed commit of the following: commit b0afbb01381f42590b43e5095c4ef043803e57e9 Author: Huey Date: Sat Jun 12 13:18:11 2021 +0800 ⚡improve: sort search results using Ratcliff/Obershelp commit 2ae77404f2296055059d90f2c1efdacb33a2e34d Author: Huey Date: Sat Jun 12 12:03:45 2021 +0800 SG: return only matching citations --- package-lock.json | 15 +++++++++++++-- package.json | 3 ++- src/utils/Finder/CaseCitationFinder/utils.ts | 2 +- src/utils/scraper/AU/AU.ts | 12 ++++++++---- src/utils/scraper/CA/CA.ts | 12 ++++++++---- src/utils/scraper/EU/EU.ts | 8 ++++++-- src/utils/scraper/HK/HK.ts | 12 ++++++++---- src/utils/scraper/NZ/NZ.ts | 10 +++++++--- src/utils/scraper/SG/SG.ts | 10 +++++++--- src/utils/scraper/SG/SGSC.ts | 6 +++++- src/utils/scraper/SG/SLW.ts | 5 +---- src/utils/scraper/UK/UK.ts | 10 +++++++--- src/utils/scraper/utils.ts | 8 ++++++++ 13 files changed, 81 insertions(+), 32 deletions(-) create mode 100644 src/utils/scraper/utils.ts diff --git a/package-lock.json b/package-lock.json index c2bf66e..e77d86f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,17 +1,18 @@ { "name": "clerkent", - "version": "2.4.2", + "version": "2.5.0", "lockfileVersion": 2, "requires": true, "packages": { "": { - "version": "2.4.2", + "version": "2.5.0", "license": "GPL-3", "dependencies": { "@babel/runtime": "^7.14.0", "axios": "^0.21.1", "axios-cache-adapter": "^2.7.3", "cheerio": "^1.0.0-rc.9", + "gestalt-pattern-matcher": "^0.0.12", "memoizee": "^0.4.15", "qs": "^6.10.1", "react": "^17.0.2", @@ -9761,6 +9762,11 @@ "node": ">=6.9.0" } }, + "node_modules/gestalt-pattern-matcher": { + "version": "0.0.12", + "resolved": "https://registry.npmjs.org/gestalt-pattern-matcher/-/gestalt-pattern-matcher-0.0.12.tgz", + "integrity": "sha512-Bz/PAkA44TCdZeeLyxO7hQFpisErATJBarhwxYOXYIh9FeURbRLB9hH+8+uN/InLzl/nyjTr4SitmQSBFEzYdQ==" + }, "node_modules/get-caller-file": { "version": "2.0.5", "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", @@ -30685,6 +30691,11 @@ "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", "dev": true }, + "gestalt-pattern-matcher": { + "version": "0.0.12", + "resolved": "https://registry.npmjs.org/gestalt-pattern-matcher/-/gestalt-pattern-matcher-0.0.12.tgz", + "integrity": "sha512-Bz/PAkA44TCdZeeLyxO7hQFpisErATJBarhwxYOXYIh9FeURbRLB9hH+8+uN/InLzl/nyjTr4SitmQSBFEzYdQ==" + }, "get-caller-file": { "version": "2.0.5", "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", diff --git a/package.json b/package.json index 1764792..70d25fd 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "clerkent", - "version": "2.5.0", + "version": "2.6.0", "private": true, "description": "quick search for international caselaw and legislation", "repository": "https://github.com/lacuna-technologies/clerkent.git", @@ -47,6 +47,7 @@ "axios": "^0.21.1", "axios-cache-adapter": "^2.7.3", "cheerio": "^1.0.0-rc.9", + "gestalt-pattern-matcher": "^0.0.12", "memoizee": "^0.4.15", "qs": "^6.10.1", "react": "^17.0.2", diff --git a/src/utils/Finder/CaseCitationFinder/utils.ts b/src/utils/Finder/CaseCitationFinder/utils.ts index 8f7e7c2..950ee26 100644 --- a/src/utils/Finder/CaseCitationFinder/utils.ts +++ b/src/utils/Finder/CaseCitationFinder/utils.ts @@ -23,4 +23,4 @@ export const sortCitationsByVolume = (abbrsList, citationsArray: any[], attribut abbrsList, citationsArray.map(c => c[attribute]), ).map(c => citationsArray.find(v => v[attribute] === c)) -} +} \ No newline at end of file diff --git a/src/utils/scraper/AU/AU.ts b/src/utils/scraper/AU/AU.ts index 0aff4d7..2052353 100644 --- a/src/utils/scraper/AU/AU.ts +++ b/src/utils/scraper/AU/AU.ts @@ -5,6 +5,7 @@ import Logger from '../../Logger' import Constants from '../../Constants' import { sortAUCitations } from '../../Finder/CaseCitationFinder/AU' import Helpers from '../../Helpers' +import { sortByNameSimilarity } from '../utils' const getCaseByName = async (caseName: string): Promise => { try { @@ -16,10 +17,13 @@ const getCaseByName = async (caseName: string): Promise => { .flatMap(({ value }: PromiseFulfilledResult) => value) .filter(({ jurisdiction }) => jurisdiction === Constants.JURISDICTIONS.AU.id) - return sortAUCitations( - Helpers.uniqueBy(results, `citation`), - `citation`, - ) + return sortByNameSimilarity( + caseName, + sortAUCitations( + Helpers.uniqueBy(results, `citation`), + `citation`, + ), + ) } catch (error) { Logger.error(error) } diff --git a/src/utils/scraper/CA/CA.ts b/src/utils/scraper/CA/CA.ts index c4d2281..6085505 100644 --- a/src/utils/scraper/CA/CA.ts +++ b/src/utils/scraper/CA/CA.ts @@ -5,6 +5,7 @@ import Logger from '../../Logger' import Constants from '../../Constants' import { sortCACitations } from '../../Finder/CaseCitationFinder/CA' import Helpers from '../../Helpers' +import { sortByNameSimilarity } from '../utils' const getCaseByName = async (caseName: string): Promise => { try { @@ -16,10 +17,13 @@ const getCaseByName = async (caseName: string): Promise => { .flatMap(({ value }: PromiseFulfilledResult) => value) .filter(({ jurisdiction }) => jurisdiction === Constants.JURISDICTIONS.CA.id) - return sortCACitations( - Helpers.uniqueBy(results, `citation`), - `citation`, - ) + return sortByNameSimilarity( + caseName, + sortCACitations( + Helpers.uniqueBy(results, `citation`), + `citation`, + ), + ) } catch (error) { Logger.error(error) } diff --git a/src/utils/scraper/EU/EU.ts b/src/utils/scraper/EU/EU.ts index da8ec71..1a7d131 100644 --- a/src/utils/scraper/EU/EU.ts +++ b/src/utils/scraper/EU/EU.ts @@ -5,6 +5,7 @@ import type Law from '../../../types/Law' import Constants from '../../Constants' import Helpers from '../../Helpers' import Logger from '../../Logger' +import { sortByNameSimilarity } from '../utils' const getLegislation = EURLex.getLegislation @@ -17,7 +18,10 @@ const getCaseByName = async (caseName: string): Promise => { .flatMap(({ value }: PromiseFulfilledResult) => value) .filter(({ jurisdiction }) => jurisdiction === Constants.JURISDICTIONS.EU.id) - return Helpers.uniqueBy(results, `citation`) + return sortByNameSimilarity( + caseName, + Helpers.uniqueBy(results, `citation`), + ) } catch (error) { Logger.error(error) } @@ -30,7 +34,7 @@ const getCaseByCitation = async (citation: string, court: string): Promise => { try { @@ -19,10 +20,13 @@ const getCaseByName = async (caseName: string): Promise => { .flatMap(({ value }: PromiseFulfilledResult) => value) .filter(({ jurisdiction }) => jurisdiction === Constants.JURISDICTIONS.HK.id) - return sortHKCitations( - Helpers.uniqueBy(results, `citation`), - `citation`, - ) + return sortByNameSimilarity( + caseName, + sortHKCitations( + Helpers.uniqueBy(results, `citation`), + `citation`, + ), + ) } catch (error) { Logger.error(error) } diff --git a/src/utils/scraper/NZ/NZ.ts b/src/utils/scraper/NZ/NZ.ts index 5ecd5d4..927ab70 100644 --- a/src/utils/scraper/NZ/NZ.ts +++ b/src/utils/scraper/NZ/NZ.ts @@ -5,6 +5,7 @@ import Logger from '../../Logger' import Helpers from '../../Helpers' import Constants from '../../Constants' import { sortNZCitations } from '../../Finder/CaseCitationFinder/NZ' +import { sortByNameSimilarity } from '../utils' const getCaseByName = async (caseName: string): Promise => { try { @@ -16,9 +17,12 @@ const getCaseByName = async (caseName: string): Promise => { .flatMap(({ value }: PromiseFulfilledResult) => value) .filter(({ jurisdiction }) => jurisdiction === Constants.JURISDICTIONS.NZ.id) - return sortNZCitations( - Helpers.uniqueBy(results, `citation`), - `citation`, + return sortByNameSimilarity( + caseName, + sortNZCitations( + Helpers.uniqueBy(results, `citation`), + `citation`, + ), ) } catch (error) { Logger.error(error) diff --git a/src/utils/scraper/SG/SG.ts b/src/utils/scraper/SG/SG.ts index 0f5e2fa..b36d5c6 100644 --- a/src/utils/scraper/SG/SG.ts +++ b/src/utils/scraper/SG/SG.ts @@ -7,6 +7,7 @@ import Helpers from '../../Helpers' import Logger from '../../Logger' import Constants from '../../Constants' import { sortSGCitations } from '../../Finder/CaseCitationFinder/SG' +import { sortByNameSimilarity } from '../utils' const getLegislation = SSO.getLegislation @@ -21,9 +22,12 @@ const getCaseByName = async (caseName: string): Promise => { .flatMap(({ value }: PromiseFulfilledResult) => value) .filter(({ jurisdiction }) => jurisdiction === Constants.JURISDICTIONS.SG.id) - return sortSGCitations( - Helpers.uniqueBy(results, `citation`), - `citation`, + return sortByNameSimilarity( + caseName, + sortSGCitations( + Helpers.uniqueBy(results, `citation`), + `citation`, + ), ) } catch (error) { Logger.error(error) diff --git a/src/utils/scraper/SG/SGSC.ts b/src/utils/scraper/SG/SGSC.ts index 6a40a9c..97450bc 100644 --- a/src/utils/scraper/SG/SGSC.ts +++ b/src/utils/scraper/SG/SGSC.ts @@ -29,6 +29,8 @@ const parseCase = ($: cheerio.Root, cheerioElement: cheerio.Element): Law.Case = } } +const trimLeadingPageZeros = (citation: string) => citation.replace(/ 0+([1-9]+)$/, ` $1`) + const getCaseByCitation = async (citation: string): Promise => { const { data } = await Request.get(getSearchResults(citation)) const $ = cheerio.load(data) @@ -36,7 +38,9 @@ const getCaseByCitation = async (citation: string): Promise => { const results = $(`.judgmentpage`) .map((_, element) => parseCase($, element)) .get() - .filter((match: Law.Case)=> match.citation.toLowerCase() === citation.toLowerCase()) + .filter(({ citation: scrapedCitation })=> ( + trimLeadingPageZeros(scrapedCitation).toLowerCase() === citation.toLowerCase() + )) Logger.log(`SGSC scrape results`, results) return results } diff --git a/src/utils/scraper/SG/SLW.ts b/src/utils/scraper/SG/SLW.ts index ba23509..dd6518e 100644 --- a/src/utils/scraper/SG/SLW.ts +++ b/src/utils/scraper/SG/SLW.ts @@ -29,10 +29,7 @@ const getCaseByCitation = async (citation: string): Promise => { const results = data .map(([name, link]) => parseCase(name, link)) - .filter(({ citation: scrapedCitation }) => ( - Helpers.isCitationValid(scrapedCitation) && - citation === scrapedCitation - )) + .filter(({ citation: scrapedCitation }) => citation.toLowerCase() === scrapedCitation.toLowerCase()) Logger.log(`SLW scrape results`, results) return results } diff --git a/src/utils/scraper/UK/UK.ts b/src/utils/scraper/UK/UK.ts index 34d19b0..4cd0b47 100644 --- a/src/utils/scraper/UK/UK.ts +++ b/src/utils/scraper/UK/UK.ts @@ -6,6 +6,7 @@ import Logger from '../../Logger' import Helpers from '../../Helpers' import { sortUKCitations } from '../../Finder/CaseCitationFinder/UK' import Constants from '../../Constants' +import { sortByNameSimilarity } from '../utils' const getLegislation = LegislationGovUk.getLegislation const getCaseByName = async (caseName: string): Promise => { @@ -18,9 +19,12 @@ const getCaseByName = async (caseName: string): Promise => { .flatMap(({ value }: PromiseFulfilledResult) => value) .filter(({ jurisdiction }) => jurisdiction === Constants.JURISDICTIONS.UK.id) - return sortUKCitations( - Helpers.uniqueBy(results, `citation`), - `citation`, + return sortByNameSimilarity( + caseName, + sortUKCitations( + Helpers.uniqueBy(results, `citation`), + `citation`, + ), ) } catch (error) { Logger.error(error) diff --git a/src/utils/scraper/utils.ts b/src/utils/scraper/utils.ts new file mode 100644 index 0000000..52d7c2a --- /dev/null +++ b/src/utils/scraper/utils.ts @@ -0,0 +1,8 @@ +import GestaltSimilarity from 'gestalt-pattern-matcher' +import Law from '../../types/Law' + +export const sortByNameSimilarity = (query: string, cases: Law.Case[]) => cases.sort((a, b) => { + const similarityA = GestaltSimilarity(query, a.name) + const similarityB = GestaltSimilarity(query, b.name) + return similarityB - similarityA +}) \ No newline at end of file