|
| 1 | +import json |
1 | 2 | import re
|
2 | 3 |
|
3 | 4 | from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader
|
@@ -28,8 +29,8 @@ class URLComponent(Component):
|
28 | 29 | DropdownInput(
|
29 | 30 | name="format",
|
30 | 31 | display_name="Output Format",
|
31 |
| - info="Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.", |
32 |
| - options=["Text", "Raw HTML"], |
| 32 | + info="Output Format. Use 'Text' to extract the text from the HTML, 'Raw HTML' for the raw HTML content or 'JSON' to extract JSON from the HTML ", |
| 33 | + options=["Text", "Raw HTML", "JSON"], |
33 | 34 | value="Text",
|
34 | 35 | ),
|
35 | 36 | ]
|
@@ -71,17 +72,44 @@ def ensure_url(self, string: str) -> str:
|
71 | 72 | if not url_regex.match(string):
|
72 | 73 | msg = f"Invalid URL: {string}"
|
73 | 74 | raise ValueError(msg)
|
| 75 | + if self.format == "JSON": |
| 76 | + if not ".json" in string: |
| 77 | + msg = f"Invalid JSON URL: {string}" |
| 78 | + raise ValueError(msg) |
74 | 79 |
|
75 | 80 | return string
|
76 | 81 |
|
def fetch_content(self) -> list[Data]:
    """Fetch every configured URL and wrap each loaded document in a ``Data``.

    The loader and the shape of the result depend on ``self.format``:

    * ``"Raw HTML"`` -- documents are loaded with ``AsyncHtmlLoader`` and the
      page content is stored as-is in ``text``.
    * ``"JSON"`` -- documents are loaded with ``AsyncHtmlLoader`` as well, the
      page content is parsed with ``json.loads``, ``text`` holds the
      pretty-printed JSON and, when the payload is a JSON object, each
      top-level key is also exposed as a stringified field.
    * anything else (``"Text"``) -- documents are loaded with
      ``WebBaseLoader`` and the page content is stored in ``text``.

    Document metadata is always merged into the resulting ``Data``.

    Returns:
        One ``Data`` per loaded document; the same list is stored in
        ``self.status``.

    Raises:
        ValueError: if a URL is rejected by ``ensure_url`` or, in JSON mode,
            a document's content is not valid JSON.
    """
    urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]

    # JSON must be parsed from the unmodified response body. WebBaseLoader is
    # the text-extraction loader (used for the 'Text' option) and may alter
    # payloads that contain markup or entities, so JSON shares the raw loader.
    if self.format in ("Raw HTML", "JSON"):
        loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
    else:
        loader = WebBaseLoader(web_paths=urls, encoding="utf-8")

    docs = loader.load()

    if self.format == "JSON":
        data = []
        for doc in docs:
            # Keep the try body down to the one call that can raise.
            try:
                payload = json.loads(doc.page_content)
            except json.JSONDecodeError as exc:
                msg = f"Invalid JSON content from {doc.metadata.get('source', 'unknown URL')}"
                raise ValueError(msg) from exc

            # Valid JSON is not necessarily an object: arrays and scalars have
            # no keys to expose, so they only contribute the ``text`` field.
            fields = {key: str(value) for key, value in payload.items()} if isinstance(payload, dict) else {}

            # Metadata overrides same-named payload keys; ``text`` goes last so
            # a payload key called "text" cannot clobber the serialized JSON.
            data.append(Data(**{**fields, **doc.metadata, "text": json.dumps(payload, indent=2)}))
    else:
        data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]

    self.status = data
    return data
|
87 | 115 |
|
|
0 commit comments