From 0a111aa2f076040545fc05f6adc7a8fbe87c4898 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 17:24:33 -0600 Subject: [PATCH 1/9] Lint updates --- elm/web/website_crawl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/elm/web/website_crawl.py b/elm/web/website_crawl.py index e95745ba..a167c56b 100644 --- a/elm/web/website_crawl.py +++ b/elm/web/website_crawl.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +# flake8: noqa +# pylint: disable=no-member """ELM Document retrieval from a website""" import logging From 6cbee9fb78cd01873f551aff7abb9ed91dee7794 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 17:53:56 -0600 Subject: [PATCH 2/9] Update DDG to strictly search DDG --- elm/web/search/duckduckgo.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/elm/web/search/duckduckgo.py b/elm/web/search/duckduckgo.py index 53ef7c01..4b35631a 100644 --- a/elm/web/search/duckduckgo.py +++ b/elm/web/search/duckduckgo.py @@ -4,7 +4,7 @@ import asyncio import logging -from duckduckgo_search import DDGS +from ddgs import DDGS from elm.web.search.base import (PlaywrightSearchEngineLinkSearch, SearchEngineLinkSearch) @@ -52,23 +52,14 @@ class APIDuckDuckGoSearch(SearchEngineLinkSearch): _SE_NAME = "DuckDuckGo API" - def __init__(self, region="wt-wt", backend="auto", timeout=10, - verify=True, sleep_min_seconds=10, sleep_max_seconds=20): + def __init__(self, region="us-en", timeout=10, verify=True, + sleep_min_seconds=10, sleep_max_seconds=20): """ Parameters ---------- region : str, optional - DDG search region param. By default, ``"wt-wt"``, which - signifies no region. - backend : {auto, html, lite}, optional - Option for DDG search type. - - - auto: select randomly between HTML and Lite backends - - html: collect data from https://html.duckduckgo.com - - lite: collect data from https://lite.duckduckgo.com - - By default, ``"auto"``. + DDG search region param. By default, ``"us-en"``. 
timeout : int, optional Timeout for HTTP requests, in seconds. By default, ``10``. verify : bool, optional @@ -84,7 +75,6 @@ def __init__(self, region="wt-wt", backend="auto", timeout=10, By default, ``20``. """ self.region = region - self.backend = backend self.timeout = timeout self.verify = verify self.sleep_min_seconds = sleep_min_seconds @@ -95,8 +85,8 @@ async def _search(self, query, num_results=10): ddgs = DDGS(timeout=self.timeout, verify=self.verify) results = ddgs.text(query, region=self.region, - backend=self.backend, - max_results=num_results) + backend="duckduckgo", + num_results=num_results) return list(filter(None, (info.get('href', "").replace("+", "%20") for info in results))) From 57f5ad9513fdfbe589a3affadfe95708bfcb93c1 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 17:54:01 -0600 Subject: [PATCH 3/9] Update lib --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9d565c57..c74c83a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ beautifulsoup4 camoufox click crawl4ai -duckduckgo-search +ddgs fake_useragent>=2.0.3 google-api-python-client html2text From 4f258474b2c3b38d96e7a764e57c875c34707fa3 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 17:57:07 -0600 Subject: [PATCH 4/9] Add `DuxDistributedGlobalSearch` --- elm/web/search/dux.py | 83 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 elm/web/search/dux.py diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py new file mode 100644 index 00000000..0cc0cf33 --- /dev/null +++ b/elm/web/search/dux.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +"""ELM Web Scraping - DuckDuckGo search""" +import logging + +from ddgs import DDGS + +from elm.web.search.base import SearchEngineLinkSearch + + +logger = logging.getLogger(__name__) + + +class DuxDistributedGlobalSearch(SearchEngineLinkSearch): + """Search the web for links using 
DuxDistributedGlobalSearch""" + + _SE_NAME = "DuxDistributedGlobalSearch" + + def __init__(self, region="us-en", safesearch="moderate", timelimit=None, + page=1, backend="auto", timeout=10, verify=True): + """ + + Parameters + ---------- + region : str, optional + DuxDistributedGlobalSearch search region param. + By default, ``"us-en"``. + safesearch : {on, moderate, off}, optional + The `safesearch` setting for search engines. + By default, ``"moderate"``. + timelimit : {d, w, m, y}, optional + The time limit used to bound the search results: + + - d: last day + - w: last week + - m: last month + - y: last year + + By default, ``None``. + page : int, default=1 + The page of results to return. By default, ``1``. + backend : str, optional + Option for DuxDistributedGlobalSearch backend: + + - auto: Randomly select 3 search engines to use + - all: All available search engines are used + - wikipedia: Wikipedia + - google: Google + - bing: Bing + - brave: Brave + - mojeek: Mojeek + - yahoo: Yahoo + - yandex: Yandex + - duckduckgo: Duckduckgo + + By default, ``"auto"``. + timeout : int, optional + Timeout for HTTP requests, in seconds. By default, ``10``. + verify : bool, optional + Apply SSL verification when making the request. + By default, ``True``. 
+ """ + self.region = region + self.safesearch = safesearch + self.timelimit = timelimit + self.page = page + self.backend = backend + self.timeout = timeout + self.verify = verify + + async def _search(self, query, num_results=10): + """Search web for links related to a query""" + + ddgs = DDGS(timeout=self.timeout, verify=self.verify) + results = ddgs.text(query, region=self.region, + safesearch=self.safesearch, + timelimit=self.timelimit, + page=self.page, + backend=self.backend, + num_results=num_results) + + return list(filter(None, (info.get('href', "").replace("+", "%20") + for info in results))) + From 51a92d256d35b649a6ab4285865de7017b5d2e95 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 17:57:15 -0600 Subject: [PATCH 5/9] Docstring --- elm/web/search/dux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index 0cc0cf33..ec9f5026 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""ELM Web Scraping - DuckDuckGo search""" +"""ELM Web Scraping - DuxDistributedGlobalSearch""" import logging from ddgs import DDGS From a30cc0d44f0147b4b2f9442b4a34e031bf176ce1 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 17:59:14 -0600 Subject: [PATCH 6/9] Update default --- elm/web/search/dux.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index ec9f5026..9f6475ae 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -16,7 +16,8 @@ class DuxDistributedGlobalSearch(SearchEngineLinkSearch): _SE_NAME = "DuxDistributedGlobalSearch" def __init__(self, region="us-en", safesearch="moderate", timelimit=None, - page=1, backend="auto", timeout=10, verify=True): + page=1, backend=("google", "bing", "yahoo", "duckduckgo"), + timeout=10, verify=True): """ Parameters @@ -38,7 +39,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, 
By default, ``None``. page : int, default=1 The page of results to return. By default, ``1``. - backend : str, optional + backend : str or iter of str, optional Option for DuxDistributedGlobalSearch backend: - auto: Randomly select 3 search engines to use @@ -52,7 +53,8 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, - yandex: Yandex - duckduckgo: Duckduckgo - By default, ``"auto"``. + Can also be a list or tuple of a combination of these. + By default, ``("google", "bing", "yahoo", "duckduckgo")``. timeout : int, optional Timeout for HTTP requests, in seconds. By default, ``10``. verify : bool, optional From 4362e8b54534e6c2a83ced74aa8e600cd3bd7cd6 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 18:01:16 -0600 Subject: [PATCH 7/9] Update verify default --- elm/web/search/duckduckgo.py | 4 ++-- elm/web/search/dux.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/elm/web/search/duckduckgo.py b/elm/web/search/duckduckgo.py index 4b35631a..5bd2cda0 100644 --- a/elm/web/search/duckduckgo.py +++ b/elm/web/search/duckduckgo.py @@ -52,7 +52,7 @@ class APIDuckDuckGoSearch(SearchEngineLinkSearch): _SE_NAME = "DuckDuckGo API" - def __init__(self, region="us-en", timeout=10, verify=True, + def __init__(self, region="us-en", timeout=10, verify=False, sleep_min_seconds=10, sleep_max_seconds=20): """ @@ -64,7 +64,7 @@ def __init__(self, region="us-en", timeout=10, verify=True, Timeout for HTTP requests, in seconds. By default, ``10``. verify : bool, optional Apply SSL verification when making the request. - By default, ``True``. + By default, ``False``. sleep_min_seconds : int, optional Minimum number of seconds to sleep between queries. 
We recommend not setting this below ``5`` seconds to avoid diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index 9f6475ae..0ef1b6b8 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -17,7 +17,7 @@ class DuxDistributedGlobalSearch(SearchEngineLinkSearch): def __init__(self, region="us-en", safesearch="moderate", timelimit=None, page=1, backend=("google", "bing", "yahoo", "duckduckgo"), - timeout=10, verify=True): + timeout=10, verify=False): """ Parameters @@ -59,7 +59,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, Timeout for HTTP requests, in seconds. By default, ``10``. verify : bool, optional Apply SSL verification when making the request. - By default, ``True``. + By default, ``False``. """ self.region = region self.safesearch = safesearch From cf2337e2694e25eaf258170a9340c7536f9f1a93 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 18:02:04 -0600 Subject: [PATCH 8/9] Use DDGS as new fallback --- elm/web/search/run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/elm/web/search/run.py b/elm/web/search/run.py index 3e19c451..919cd430 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -12,6 +12,7 @@ from elm.web.search.bing import PlaywrightBingLinkSearch from elm.web.search.duckduckgo import (APIDuckDuckGoSearch, PlaywrightDuckDuckGoLinkSearch) +from elm.web.search.dux import DuxDistributedGlobalSearch from elm.web.search.google import (APIGoogleCSESearch, APISerperSearch, PlaywrightGoogleCSELinkSearch, PlaywrightGoogleLinkSearch) @@ -33,6 +34,8 @@ "APISerperSearch": _SE_OPT(APISerperSearch, False, "google_serper_api_kwargs"), "APITavilySearch": _SE_OPT(APITavilySearch, False, "tavily_api_kwargs"), + "DuxDistributedGlobalSearch": _SE_OPT(DuxDistributedGlobalSearch, False, + "ddgs_kwargs"), "PlaywrightBingLinkSearch": _SE_OPT(PlaywrightBingLinkSearch, True, "pw_bing_se_kwargs"), "PlaywrightDuckDuckGoLinkSearch": _SE_OPT(PlaywrightDuckDuckGoLinkSearch, 
@@ -46,7 +49,7 @@ } """Supported search engines""" _DEFAULT_SE = ("PlaywrightGoogleLinkSearch", "PlaywrightDuckDuckGoLinkSearch", - "APIDuckDuckGoSearch") + "DuxDistributedGlobalSearch") async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, From e2800a49e1ac93e0478ea1ee41fb270a7d7b458e Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 18:02:42 -0600 Subject: [PATCH 9/9] Docstring updates --- elm/web/search/run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/elm/web/search/run.py b/elm/web/search/run.py index 919cd430..438c4d4e 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -116,6 +116,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, - google_cse_api_kwargs - google_serper_api_kwargs - tavily_api_kwargs + - ddgs_kwargs - pw_bing_se_kwargs - pw_ddg_se_kwargs - pw_google_cse_kwargs @@ -205,6 +206,7 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, - google_cse_api_kwargs - google_serper_api_kwargs - tavily_api_kwargs + - ddgs_kwargs - pw_bing_se_kwargs - pw_ddg_se_kwargs - pw_google_cse_kwargs