From cbca9439d5eeef9ee1bdc34b4722cdfe14ca76ea Mon Sep 17 00:00:00 2001 From: Dong Shin Date: Wed, 4 Dec 2024 11:37:54 +0900 Subject: [PATCH 1/3] enable aiohttp proxy --- .../langchain_community/document_loaders/web_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/document_loaders/web_base.py b/libs/community/langchain_community/document_loaders/web_base.py index 3eab7f351a113..4c5e7d48cc706 100644 --- a/libs/community/langchain_community/document_loaders/web_base.py +++ b/libs/community/langchain_community/document_loaders/web_base.py @@ -128,6 +128,7 @@ def __init__( session: Any = None, *, show_progress: bool = True, + trust_env: bool = False, ) -> None: """Initialize loader. @@ -189,6 +190,7 @@ def __init__( self.continue_on_failure = continue_on_failure self.autoset_encoding = autoset_encoding self.encoding = encoding + self.trust_env = trust_env @property def web_path(self) -> str: @@ -199,7 +201,7 @@ def web_path(self) -> str: async def _fetch( self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5 ) -> str: - async with aiohttp.ClientSession() as session: + async with aiohttp.ClientSession(trust_env=self.trust_env) as session: for i in range(retries): try: kwargs: Dict = dict( From 0b2011b42bef86721e04ccd08c1b627325197e10 Mon Sep 17 00:00:00 2001 From: Dong SHIN Date: Thu, 5 Dec 2024 01:32:57 +0900 Subject: [PATCH 2/3] docs: usage --- docs/docs/integrations/document_loaders/web_base.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/docs/integrations/document_loaders/web_base.ipynb b/docs/docs/integrations/document_loaders/web_base.ipynb index 52589cf4b2a6f..1f76518ce809a 100644 --- a/docs/docs/integrations/document_loaders/web_base.ipynb +++ b/docs/docs/integrations/document_loaders/web_base.ipynb @@ -66,7 +66,10 @@ "source": [ "from langchain_community.document_loaders import WebBaseLoader\n", "\n", - "loader = 
WebBaseLoader(\"https://www.espn.com/\")" + "loader = WebBaseLoader(\"https://www.espn.com/\")\n", + "# If you need to use the proxy to make web requests, for example using http_proxy/https_proxy environmental variables,\n", + "# please set trust_env=True explicitly here as follows:\n", + "# loader = WebBaseLoader(\"https://www.espn.com/\", trust_env=True)" ] }, { From fc3098c07bf4900ca0109bc10fe680a01b60b566 Mon Sep 17 00:00:00 2001 From: Chester Curme Date: Tue, 17 Dec 2024 21:05:51 -0500 Subject: [PATCH 3/3] document parameter in api ref --- docs/docs/integrations/document_loaders/web_base.ipynb | 5 +---- .../langchain_community/document_loaders/web_base.py | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/docs/integrations/document_loaders/web_base.ipynb b/docs/docs/integrations/document_loaders/web_base.ipynb index 1f76518ce809a..52589cf4b2a6f 100644 --- a/docs/docs/integrations/document_loaders/web_base.ipynb +++ b/docs/docs/integrations/document_loaders/web_base.ipynb @@ -66,10 +66,7 @@ "source": [ "from langchain_community.document_loaders import WebBaseLoader\n", "\n", - "loader = WebBaseLoader(\"https://www.espn.com/\")\n", - "# If you need to use the proxy to make web requests, for example using http_proxy/https_proxy environmental variables,\n", - "# please set trust_env=True explicitly here as follows:\n", - "# loader = WebBaseLoader(\"https://www.espn.com/\", trust_env=True)" + "loader = WebBaseLoader(\"https://www.espn.com/\")" ] }, { diff --git a/libs/community/langchain_community/document_loaders/web_base.py b/libs/community/langchain_community/document_loaders/web_base.py index 4c5e7d48cc706..a115cf34606f8 100644 --- a/libs/community/langchain_community/document_loaders/web_base.py +++ b/libs/community/langchain_community/document_loaders/web_base.py @@ -71,6 +71,7 @@ class WebBaseLoader(BaseLoader): # bs_kwargs = None, # session = None, # show_progress = True, + # trust_env = False, ) Lazy load: @@ -141,6 +142,8 @@ def 
__init__( bs_get_text_kwargs: kwargs for beatifulsoup4 get_text bs_kwargs: kwargs for beatifulsoup4 web page parsing show_progress: Show progress bar when loading pages. + trust_env: Set to True to make web requests through a proxy configured via + the http_proxy/https_proxy environment variables. Defaults to False. """ # web_path kept for backwards-compatibility. if web_path and web_paths: