From c7a8759e77d2a57ab9ac97bfe0ad0a8e131f8573 Mon Sep 17 00:00:00 2001 From: Harry Date: Mon, 11 May 2020 09:57:09 +0800 Subject: [PATCH 1/7] feat: add parser for JSON with JS comment --- extruct/jsonld.py | 4 ++-- requirements.txt | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index f11580eb..b20fd033 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -2,7 +2,7 @@ """ JSON-LD extractor """ - +import jstyleson import json import re @@ -34,7 +34,7 @@ def _extract_items(self, node): data = json.loads(script, strict=False) except ValueError: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = json.loads( + data = jstyleson.loads( HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) if isinstance(data, list): return data diff --git a/requirements.txt b/requirements.txt index 2dff7b47..da341d69 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ mf2py>=1.1.0 six>=1.11 w3lib html-text +jstyleson From 3c0665efa2f32af353203da460ff922dea8fd6b0 Mon Sep 17 00:00:00 2001 From: Harry Date: Tue, 12 May 2020 10:16:33 +0800 Subject: [PATCH 2/7] fix: remove the regex of JS comment --- extruct/jsonld.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index b20fd033..1f4e0ca6 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -10,7 +10,6 @@ from extruct.utils import parse_html -HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|)') class JsonLdExtractor(object): @@ -34,8 +33,7 @@ def _extract_items(self, node): data = json.loads(script, strict=False) except ValueError: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = jstyleson.loads( - HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) + data = jstyleson.loads(script, strict=False) if isinstance(data, list): return data elif isinstance(data, dict): From 5bb1a87d2995c995768c39c3aba7525d7d0ab898 Mon Sep 17 00:00:00 2001 From: Harry Date: Fri, 15 May 2020 16:38:01 +0800 Subject: [PATCH 3/7] fix: adjust package --- extruct/jsonld.py | 3 ++- setup.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 1f4e0ca6..f7d267a2 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -2,9 +2,10 @@ """ JSON-LD extractor """ -import jstyleson import json import re +import jstyleson + import lxml.etree diff --git a/setup.py b/setup.py index 5620706d..aa970202 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,9 @@ def get_version(): 'mf2py', 'w3lib', 'html-text>=0.5.1', - 'six'], + 'six', + 'jstyleson' + ], extras_require={ 'cli': [ 'requests', From 8dc25e19d2af02184b46bd6ac64a2cf1b13c1f0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 19 May 2020 18:12:57 +0200 Subject: [PATCH 4/7] Update extruct/jsonld.py --- extruct/jsonld.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index f7d267a2..ebfff382 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -4,9 +4,8 @@ """ import json import re -import jstyleson - +import jstyleson import lxml.etree from extruct.utils import parse_html From 09d7f47be718b9bfede20e2f99898ab67569fbf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 19 May 2020 18:13:44 +0200 Subject: [PATCH 5/7] Update extruct/jsonld.py --- extruct/jsonld.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index ebfff382..2712da75 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -2,6 +2,7 @@ """ JSON-LD extractor """ + import json import re From 3970f7949527f665c994d90c87c15558e91e45e5 Mon Sep 17 00:00:00 2001 From: Harry Date: Wed, 20 May 2020 17:49:37 +0800 Subject: [PATCH 6/7] fix: handle unexpected jsonld format --- extruct/jsonld.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 2712da75..e30c6fe6 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -11,6 +11,7 @@ from extruct.utils import parse_html +HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|)') class JsonLdExtractor(object): @@ -34,7 +35,7 @@ def _extract_items(self, node): data = json.loads(script, strict=False) except ValueError: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = jstyleson.loads(script, strict=False) + data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub('', script),strict=False) if isinstance(data, list): return data elif isinstance(data, dict): From 8e5a60378e1cea16fc2e2ccd73569034d60b0d57 Mon Sep 17 00:00:00 2001 From: Harry Date: Wed, 20 May 2020 17:52:52 +0800 Subject: [PATCH 7/7] feat: add unittest --- .../JSONLD_with_JS_comment.html | 24 +++++++++++++++++++ .../JSONLD_with_JS_comment.jsonld | 12 ++++++++++ tests/test_jsonld.py | 5 ++++ 3 files changed, 41 insertions(+) create mode 100644 tests/samples/custom.invalid/JSONLD_with_JS_comment.html create mode 100644 tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld diff --git a/tests/samples/custom.invalid/JSONLD_with_JS_comment.html b/tests/samples/custom.invalid/JSONLD_with_JS_comment.html new file mode 100644 index 00000000..f5dd2bc5 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_JS_comment.html @@ -0,0 +1,24 @@ + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld b/tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld new file mode 100644 index 00000000..4bd52332 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_with_JS_comment.jsonld @@ -0,0 +1,12 @@ +[ + { + "@context": "http://schema.org", + "@type": "NewsArticle", + "thumbnailUrl": "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg", + "keywords": "", + "url": "https://money.udn.com/money/story/5635/4158094", + "mainEntityOfPage": "https://money.udn.com/money/story/5635/4158094", + "headline": "讓AI挑出感興趣 SparkAmplify精準行銷當紅", + "articleSection": "商情" + } +] \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 47309ee9..c5598f35 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -40,6 +40,11 @@ def test_jsonld_with_control_characters_comment(self): self.assertJsonLdCorrect( folder='custom.invalid', page='JSONLD_with_control_characters_comment') + + def test_jsonld_with_json_including_js_comment(self): + self.assertJsonLdCorrect( + folder='custom.invalid', + page='JSONLD_with_JS_comment') def assertJsonLdCorrect(self, folder, page): body, expected = self._get_body_expected(folder, page)