diff --git a/elm/web/search/duckduckgo.py b/elm/web/search/duckduckgo.py index 53ef7c01..5bd2cda0 100644 --- a/elm/web/search/duckduckgo.py +++ b/elm/web/search/duckduckgo.py @@ -4,7 +4,7 @@ import asyncio import logging -from duckduckgo_search import DDGS +from ddgs import DDGS from elm.web.search.base import (PlaywrightSearchEngineLinkSearch, SearchEngineLinkSearch) @@ -52,28 +52,19 @@ class APIDuckDuckGoSearch(SearchEngineLinkSearch): _SE_NAME = "DuckDuckGo API" - def __init__(self, region="wt-wt", backend="auto", timeout=10, - verify=True, sleep_min_seconds=10, sleep_max_seconds=20): + def __init__(self, region="us-en", timeout=10, verify=False, + sleep_min_seconds=10, sleep_max_seconds=20): """ Parameters ---------- region : str, optional - DDG search region param. By default, ``"wt-wt"``, which - signifies no region. - backend : {auto, html, lite}, optional - Option for DDG search type. - - - auto: select randomly between HTML and Lite backends - - html: collect data from https://html.duckduckgo.com - - lite: collect data from https://lite.duckduckgo.com - - By default, ``"auto"``. + DDG search region param. By default, ``"us-en"``. timeout : int, optional Timeout for HTTP requests, in seconds. By default, ``10``. verify : bool, optional Apply SSL verification when making the request. - By default, ``True``. + By default, ``False``. sleep_min_seconds : int, optional Minimum number of seconds to sleep between queries. We recommend not setting this below ``5`` seconds to avoid @@ -84,7 +75,6 @@ def __init__(self, region="wt-wt", backend="auto", timeout=10, By default, ``20``. 
""" self.region = region - self.backend = backend self.timeout = timeout self.verify = verify self.sleep_min_seconds = sleep_min_seconds @@ -95,8 +85,8 @@ async def _search(self, query, num_results=10): ddgs = DDGS(timeout=self.timeout, verify=self.verify) results = ddgs.text(query, region=self.region, - backend=self.backend, - max_results=num_results) + backend="duckduckgo", + num_results=num_results) return list(filter(None, (info.get('href', "").replace("+", "%20") for info in results))) diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py new file mode 100644 index 00000000..0ef1b6b8 --- /dev/null +++ b/elm/web/search/dux.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +"""ELM Web Scraping - DuxDistributedGlobalSearch""" +import logging + +from ddgs import DDGS + +from elm.web.search.base import SearchEngineLinkSearch + + +logger = logging.getLogger(__name__) + + +class DuxDistributedGlobalSearch(SearchEngineLinkSearch): + """Search the web for links using DuxDistributedGlobalSearch""" + + _SE_NAME = "DuxDistributedGlobalSearch" + + def __init__(self, region="us-en", safesearch="moderate", timelimit=None, + page=1, backend=("google", "bing", "yahoo", "duckduckgo"), + timeout=10, verify=False): + """ + + Parameters + ---------- + region : str, optional + DuxDistributedGlobalSearch search region param. + By default, ``"us-en"``. + safesearch : {on, moderate, off}, optional + The `safesearch` setting for search engines. + By default, ``"moderate"``. + timelimit : {d, w, m, y}, optional + The time limit used to bound the search results: + + - d: last day + - w: last week + - m: last month + - y: last year + + By default, ``None``. + page : int, default=1 + The page of results to return. By default, ``1``.
+ backend : str or iter of str, optional + Option for DuxDistributedGlobalSearch backend: + + - auto: Randomly select 3 search engines to use + - all: All available search engines are used + - wikipedia: Wikipedia + - google: Google + - bing: Bing + - brave: Brave + - mojeek: Mojeek + - yahoo: Yahoo + - yandex: Yandex + - duckduckgo: Duckduckgo + + Can also be a list or tuple of a combination of these. + By default, ``("google", "bing", "yahoo", "duckduckgo")``. + timeout : int, optional + Timeout for HTTP requests, in seconds. By default, ``10``. + verify : bool, optional + Apply SSL verification when making the request. + By default, ``False``. + """ + self.region = region + self.safesearch = safesearch + self.timelimit = timelimit + self.page = page + self.backend = backend + self.timeout = timeout + self.verify = verify + + async def _search(self, query, num_results=10): + """Search web for links related to a query""" + + ddgs = DDGS(timeout=self.timeout, verify=self.verify) + results = ddgs.text(query, region=self.region, + safesearch=self.safesearch, + timelimit=self.timelimit, + page=self.page, + backend=self.backend, + num_results=num_results) + + return list(filter(None, (info.get('href', "").replace("+", "%20") + for info in results))) + diff --git a/elm/web/search/run.py b/elm/web/search/run.py index 3e19c451..438c4d4e 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -12,6 +12,7 @@ from elm.web.search.bing import PlaywrightBingLinkSearch from elm.web.search.duckduckgo import (APIDuckDuckGoSearch, PlaywrightDuckDuckGoLinkSearch) +from elm.web.search.dux import DuxDistributedGlobalSearch from elm.web.search.google import (APIGoogleCSESearch, APISerperSearch, PlaywrightGoogleCSELinkSearch, PlaywrightGoogleLinkSearch) @@ -33,6 +34,8 @@ "APISerperSearch": _SE_OPT(APISerperSearch, False, "google_serper_api_kwargs"), "APITavilySearch": _SE_OPT(APITavilySearch, False, "tavily_api_kwargs"), + "DuxDistributedGlobalSearch": 
_SE_OPT(DuxDistributedGlobalSearch, False, + "ddgs_kwargs"), "PlaywrightBingLinkSearch": _SE_OPT(PlaywrightBingLinkSearch, True, "pw_bing_se_kwargs"), "PlaywrightDuckDuckGoLinkSearch": _SE_OPT(PlaywrightDuckDuckGoLinkSearch, @@ -46,7 +49,7 @@ } """Supported search engines""" _DEFAULT_SE = ("PlaywrightGoogleLinkSearch", "PlaywrightDuckDuckGoLinkSearch", - "APIDuckDuckGoSearch") + "DuxDistributedGlobalSearch") async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, @@ -113,6 +116,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, - google_cse_api_kwargs - google_serper_api_kwargs - tavily_api_kwargs + - ddgs_kwargs - pw_bing_se_kwargs - pw_ddg_se_kwargs - pw_google_cse_kwargs @@ -202,6 +206,7 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, - google_cse_api_kwargs - google_serper_api_kwargs - tavily_api_kwargs + - ddgs_kwargs - pw_bing_se_kwargs - pw_ddg_se_kwargs - pw_google_cse_kwargs diff --git a/elm/web/website_crawl.py b/elm/web/website_crawl.py index e95745ba..a167c56b 100644 --- a/elm/web/website_crawl.py +++ b/elm/web/website_crawl.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +# flake8: noqa +# pylint: disable=no-member """ELM Document retrieval from a website""" import logging diff --git a/requirements.txt b/requirements.txt index 9d565c57..c74c83a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ beautifulsoup4 camoufox click crawl4ai -duckduckgo-search +ddgs fake_useragent>=2.0.3 google-api-python-client html2text