Skip to content

Commit 4ca1bf1

Browse files
committed
URL component improvement - JSON URL
1 parent c2411d4 commit 4ca1bf1

File tree

1 file changed

+31
-3
lines changed
  • src/backend/base/langflow/components/data

1 file changed

+31
-3
lines changed

src/backend/base/langflow/components/data/url.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import re
23

34
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader
@@ -28,8 +29,8 @@ class URLComponent(Component):
2829
DropdownInput(
2930
name="format",
3031
display_name="Output Format",
31-
info="Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
32-
options=["Text", "Raw HTML"],
32+
info="Output Format. Use 'Text' to extract the text from the HTML, 'Raw HTML' for the raw HTML content or 'JSON' to extract JSON from the HTML ",
33+
options=["Text", "Raw HTML", "JSON"],
3334
value="Text",
3435
),
3536
]
@@ -71,17 +72,44 @@ def ensure_url(self, string: str) -> str:
7172
if not url_regex.match(string):
7273
msg = f"Invalid URL: {string}"
7374
raise ValueError(msg)
75+
if self.format == "JSON":
76+
if not ".json" in string:
77+
msg = f"Invalid JSON URL: {string}"
78+
raise ValueError(msg)
7479

7580
return string
7681

7782
def fetch_content(self) -> list[Data]:
    """Load every configured URL and wrap each loaded document in a ``Data`` record.

    Reads ``self.urls`` and ``self.format``; stores the resulting list on
    ``self.status`` before returning it.

    Returns:
        One ``Data`` per loaded document. For the "JSON" format, ``text`` is the
        payload re-serialized with a 2-space indent; when the payload is a JSON
        object, its top-level keys are also exposed as stringified fields.
        Document metadata is merged in last, so it wins on key collisions.

    Raises:
        ValueError: If ``ensure_url`` rejects a URL, or if the "JSON" format is
            selected and a document's content is not valid JSON.
    """
    stripped_urls = (url.strip() for url in self.urls)
    # Blank entries are dropped; every remaining entry is validated by ensure_url.
    urls = [self.ensure_url(url) for url in stripped_urls if url]

    if self.format == "Raw HTML":
        loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
    else:
        # NOTE(review): the "JSON" format is also fetched through WebBaseLoader;
        # presumably its HTML text extraction leaves a JSON body intact -- confirm.
        loader = WebBaseLoader(web_paths=urls, encoding="utf-8")

    docs = loader.load()

    if self.format != "JSON":
        data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]
    else:
        data = []
        for doc in docs:
            # Only the parse can raise JSONDecodeError, so keep the try body minimal.
            try:
                json_content = json.loads(doc.page_content)
            except json.JSONDecodeError as err:
                msg = f"Invalid JSON content from {doc.metadata.get('source', 'unknown URL')}"
                raise ValueError(msg) from err

            # A top-level array or scalar is valid JSON but has no keys to flatten.
            fields = {}
            if isinstance(json_content, dict):
                fields = {key: str(value) for key, value in json_content.items()}

            data.append(
                Data(
                    **{
                        "text": json.dumps(json_content, indent=2),
                        **fields,
                        **doc.metadata,
                    }
                )
            )

    self.status = data
    return data
87115

0 commit comments

Comments
 (0)