24 changes: 7 additions & 17 deletions elm/web/search/duckduckgo.py
@@ -4,7 +4,7 @@
import asyncio
import logging

from duckduckgo_search import DDGS
from ddgs import DDGS

from elm.web.search.base import (PlaywrightSearchEngineLinkSearch,
SearchEngineLinkSearch)
@@ -52,28 +52,19 @@ class APIDuckDuckGoSearch(SearchEngineLinkSearch):

_SE_NAME = "DuckDuckGo API"

def __init__(self, region="wt-wt", backend="auto", timeout=10,
verify=True, sleep_min_seconds=10, sleep_max_seconds=20):
def __init__(self, region="us-en", timeout=10, verify=False,
sleep_min_seconds=10, sleep_max_seconds=20):
"""

Parameters
----------
region : str, optional
DDG search region param. By default, ``"wt-wt"``, which
signifies no region.
backend : {auto, html, lite}, optional
Option for DDG search type.

- auto: select randomly between HTML and Lite backends
- html: collect data from https://html.duckduckgo.com
- lite: collect data from https://lite.duckduckgo.com

By default, ``"auto"``.
DDG search region param. By default, ``"us-en"``.
timeout : int, optional
Timeout for HTTP requests, in seconds. By default, ``10``.
verify : bool, optional
Apply SSL verification when making the request.
By default, ``True``.
By default, ``False``.
sleep_min_seconds : int, optional
Minimum number of seconds to sleep between queries. We
recommend not setting this below ``5`` seconds to avoid
@@ -84,7 +75,6 @@ def __init__(self, region="wt-wt", backend="auto", timeout=10,
By default, ``20``.
"""
self.region = region
self.backend = backend
self.timeout = timeout
self.verify = verify
self.sleep_min_seconds = sleep_min_seconds
@@ -95,8 +85,8 @@ async def _search(self, query, num_results=10):

ddgs = DDGS(timeout=self.timeout, verify=self.verify)
results = ddgs.text(query, region=self.region,
backend=self.backend,
max_results=num_results)
backend="duckduckgo",
num_results=num_results)

return list(filter(None, (info.get('href', "").replace("+", "%20")
for info in results)))
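
A minimal sketch of exercising the updated APIDuckDuckGoSearch with its new defaults (region="us-en", verify=False, hard-coded "duckduckgo" backend). The public entry point of SearchEngineLinkSearch is not shown in this diff, so the direct `_search` call below is illustrative only:

import asyncio

from elm.web.search.duckduckgo import APIDuckDuckGoSearch


async def main():
    # New defaults from this PR: region="us-en", verify=False
    search = APIDuckDuckGoSearch(region="us-en", timeout=10, verify=False)
    # `_search` is the coroutine changed in this diff; production code would
    # presumably go through the SearchEngineLinkSearch public interface instead
    links = await search._search("offshore wind permitting", num_results=10)
    print(links)


asyncio.run(main())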
85 changes: 85 additions & 0 deletions elm/web/search/dux.py
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
"""ELM Web Scraping - DuxDistributedGlobalSearch"""
import logging

from ddgs import DDGS

from elm.web.search.base import SearchEngineLinkSearch


logger = logging.getLogger(__name__)


class DuxDistributedGlobalSearch(SearchEngineLinkSearch):
"""Search the web for links using DuxDistributedGlobalSearch"""

_SE_NAME = "DuxDistributedGlobalSearch"

def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
page=1, backend=("google", "bing", "yahoo", "duckduckgo"),
timeout=10, verify=False):
"""

Parameters
----------
region : str, optional
DuxDistributedGlobalSearch search region param.
By default, ``"us-en"``.
safesearch : {on, moderate, off}, optional
The `safesearch` setting for search engines.
By default, ``"moderate"``.
timelimit : {d, w, m, y}, optional
The time limit used to bound the search results:

- d: last day
- w: last week
- m: last month
- y: last year

By default, ``None``.
page : int, optional
The page of results to return. By default, ``1``.
backend : str or iter of str, optional
Option for DuxDistributedGlobalSearch backend:

- auto: Randomly select 3 search engines to use
- all: All available search engines are used
- wikipedia: Wikipedia
- google: Google
- bing: Bing
- brave: Brave
- mojeek: Mojeek
- yahoo: Yahoo
- yandex: Yandex
- duckduckgo: DuckDuckGo

Can also be a list or tuple of a combination of these.
By default, ``("google", "bing", "yahoo", "duckduckgo")``.
timeout : int, optional
Timeout for HTTP requests, in seconds. By default, ``10``.
verify : bool, optional
Apply SSL verification when making the request.
By default, ``False``.
"""
self.region = region
self.safesearch = safesearch
self.timelimit = timelimit
self.page = page
self.backend = backend
self.timeout = timeout
self.verify = verify

async def _search(self, query, num_results=10):
"""Search web for links related to a query"""

ddgs = DDGS(timeout=self.timeout, verify=self.verify)
results = ddgs.text(query, region=self.region,
safesearch=self.safesearch,
timelimit=self.timelimit,
page=self.page,
backend=self.backend,
num_results=num_results)

return list(filter(None, (info.get('href', "").replace("+", "%20")
for info in results)))
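
A hedged usage sketch of the new DuxDistributedGlobalSearch class, assuming the same calling pattern as the other SearchEngineLinkSearch subclasses (the base-class public interface is not part of this PR, so the `_search` call is illustrative only):

import asyncio

from elm.web.search.dux import DuxDistributedGlobalSearch


async def main():
    # Spread queries over several engines via the ddgs backend tuple
    search = DuxDistributedGlobalSearch(
        region="us-en",
        safesearch="moderate",
        backend=("google", "bing", "yahoo", "duckduckgo"),
        timeout=10,
        verify=False,
    )
    links = await search._search("grid interconnection queue", num_results=5)
    print(links)


asyncio.run(main())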

7 changes: 6 additions & 1 deletion elm/web/search/run.py
@@ -12,6 +12,7 @@
from elm.web.search.bing import PlaywrightBingLinkSearch
from elm.web.search.duckduckgo import (APIDuckDuckGoSearch,
PlaywrightDuckDuckGoLinkSearch)
from elm.web.search.dux import DuxDistributedGlobalSearch
from elm.web.search.google import (APIGoogleCSESearch, APISerperSearch,
PlaywrightGoogleCSELinkSearch,
PlaywrightGoogleLinkSearch)
@@ -33,6 +34,8 @@
"APISerperSearch": _SE_OPT(APISerperSearch, False,
"google_serper_api_kwargs"),
"APITavilySearch": _SE_OPT(APITavilySearch, False, "tavily_api_kwargs"),
"DuxDistributedGlobalSearch": _SE_OPT(DuxDistributedGlobalSearch, False,
"ddgs_kwargs"),
"PlaywrightBingLinkSearch": _SE_OPT(PlaywrightBingLinkSearch, True,
"pw_bing_se_kwargs"),
"PlaywrightDuckDuckGoLinkSearch": _SE_OPT(PlaywrightDuckDuckGoLinkSearch,
@@ -46,7 +49,7 @@
}
"""Supported search engines"""
_DEFAULT_SE = ("PlaywrightGoogleLinkSearch", "PlaywrightDuckDuckGoLinkSearch",
"APIDuckDuckGoSearch")
"DuxDistributedGlobalSearch")


async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE,
@@ -113,6 +116,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE,
- google_cse_api_kwargs
- google_serper_api_kwargs
- tavily_api_kwargs
- ddgs_kwargs
- pw_bing_se_kwargs
- pw_ddg_se_kwargs
- pw_google_cse_kwargs
@@ -202,6 +206,7 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE,
- google_cse_api_kwargs
- google_serper_api_kwargs
- tavily_api_kwargs
- ddgs_kwargs
- pw_bing_se_kwargs
- pw_ddg_se_kwargs
- pw_google_cse_kwargs
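
A sketch of calling web_search_links_as_docs with the new default engine and the new ddgs_kwargs option. The docstring above lists ddgs_kwargs as an accepted option, but the forwarding mechanism is not visible in these hunks, so the keyword usage below is an assumption:

import asyncio

from elm.web.search.run import web_search_links_as_docs


async def main():
    docs = await web_search_links_as_docs(
        ["NREL Energy Language Model"],
        search_engines=("DuxDistributedGlobalSearch", "APIDuckDuckGoSearch"),
        # Assumed to be forwarded to DuxDistributedGlobalSearch per the
        # option list above; only the docstring entry appears in this diff
        ddgs_kwargs={"backend": ("google", "duckduckgo"), "verify": False},
    )
    print(f"Retrieved {len(docs)} documents")


asyncio.run(main())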
2 changes: 2 additions & 0 deletions elm/web/website_crawl.py
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
# flake8: noqa
# pylint: disable=no-member
"""ELM Document retrieval from a website"""

import logging
2 changes: 1 addition & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ beautifulsoup4
camoufox
click
crawl4ai
duckduckgo-search
ddgs
fake_useragent>=2.0.3
google-api-python-client
html2text