Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion elm/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
ELM version number
"""

__version__ = "0.0.25"
__version__ = "0.0.26"
34 changes: 34 additions & 0 deletions elm/web/search/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import random
import asyncio
import logging
import requests
from urllib.parse import quote
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager

from serpapi.serp_api_client import SerpApiClient
from rebrowser_playwright.async_api import async_playwright
from playwright_stealth import StealthConfig

Expand Down Expand Up @@ -286,6 +288,38 @@ def __init__(self, api_key=None):
self.api_key = api_key or os.environ.get(self.API_KEY_VAR or "")


class PatchedSerpApiClient(SerpApiClient):
    """SerpApiClient patched to allow bypassing of SSL verification"""

    def __init__(self, params_dict, engine=None, timeout=60000, verify=True):
        """

        Parameters
        ----------
        params_dict : dict
            Search parameters forwarded to ``SerpApiClient``.
        engine : str, optional
            Search engine name forwarded to ``SerpApiClient``.
            By default, ``None``.
        timeout : int, default=60000
            Request timeout forwarded to ``SerpApiClient`` and used for
            the underlying ``requests.get`` call.
        verify : bool, default=True
            Option to use SSL verification when making request to API
            endpoint. By default, ``True``.
        """
        super().__init__(params_dict=params_dict, engine=engine,
                         timeout=timeout)
        self.verify = verify

    def get_response(self, path='/search'):
        """Get search response

        Parameters
        ----------
        path : str, default='/search'
            API path to use for the search.

        Returns
        -------
        Response object provided by ``requests.get``.

        Raises
        ------
        requests.HTTPError
            Propagated (after logging) if the request raises one.
        """
        url = None
        try:
            url, parameter = self.construct_url(path)
            # second positional arg to ``requests.get`` is ``params``;
            # ``verify=False`` deliberately disables SSL cert checks
            return requests.get(url, parameter, timeout=self.timeout,
                                verify=self.verify)
        except requests.HTTPError as e:
            # ``url`` may still be None if ``construct_url`` raised;
            # lazy %-formatting handles that safely
            logger.error("fail: %s", url)
            # ``e.response`` can be None, so guard before reading status
            status = (e.response.status_code
                      if e.response is not None else "unknown")
            logger.error("%s (status code: %s)", e, status)
            raise


async def _navigate_to_se_url(page, se_url, timeout=90_000):
"""Navigate to search engine url"""
await page.goto(se_url)
Expand Down
41 changes: 40 additions & 1 deletion elm/web/search/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from elm.web.search.base import (PlaywrightSearchEngineLinkSearch,
APISearchEngineLinkSearch)
APISearchEngineLinkSearch,
PatchedSerpApiClient)


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -239,6 +240,44 @@ async def _search(self, query, num_results=10):
return list(filter(None, (info.get("link") for info in results)))


class SerpAPIGoogleSearch(APISearchEngineLinkSearch):
    """Search the google for links using the SerpAPI service"""

    _SE_NAME = "SerpAPI (Google)"

    API_KEY_VAR = "SERPAPI_KEY"
    """Environment variable that should contain the SerpAPI key"""

    def __init__(self, api_key=None, verify=False):
        """

        Parameters
        ----------
        api_key : str, optional
            API key for the SerpAPI service. If ``None``, will look up
            the API key using the ``"SERPAPI_KEY"`` environment
            variable. By default, ``None``.
        verify : bool, default=False
            Option to use SSL verification when making request to API
            endpoint. By default, ``False``.
        """
        super().__init__(api_key=api_key)
        self.verify = verify

    async def _search(self, query, num_results=10, **param_kwargs):
        """Search web for links related to a query

        Parameters
        ----------
        query : str
            Query string to submit to the Google engine.
        num_results : int, default=10
            Maximum number of links to return.
        **param_kwargs
            Extra SerpAPI query parameters; these override the defaults
            (``hl``, ``gl``, etc.) on key collision.

        Returns
        -------
        list of str
            Result links (with ``"+"`` percent-encoded as ``"%20"``),
            truncated to at most `num_results` entries.
        """
        params = {"q": query, "hl": "en", "gl": "us", "api_key": self.api_key}
        params.update(param_kwargs)

        # NOTE(review): ``get_dict`` issues a blocking ``requests`` call
        # inside this coroutine, which stalls the event loop for the
        # duration of the request — consider ``asyncio.to_thread``
        client = PatchedSerpApiClient(params, engine="google",
                                      verify=self.verify)
        results = client.get_dict()
        results = results.get("organic_results", [])
        return list(filter(None, (info.get("link", "").replace("+", "%20")
                                  for info in results)))[:num_results]


class APISerperSearch(APISearchEngineLinkSearch):
"""Search the web for links using the Google Serper API"""

Expand Down
3 changes: 3 additions & 0 deletions elm/web/search/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
PlaywrightDuckDuckGoLinkSearch)
from elm.web.search.dux import DuxDistributedGlobalSearch
from elm.web.search.google import (APIGoogleCSESearch, APISerperSearch,
SerpAPIGoogleSearch,
CamoufoxGoogleLinkSearch,
PlaywrightGoogleCSELinkSearch,
PlaywrightGoogleLinkSearch)
Expand All @@ -34,6 +35,8 @@
"google_cse_api_kwargs"),
"APISerperSearch": _SE_OPT(APISerperSearch, False,
"google_serper_api_kwargs"),
"SerpAPIGoogleSearch": _SE_OPT(SerpAPIGoogleSearch, False,
"google_serpapi_kwargs"),
"APITavilySearch": _SE_OPT(APITavilySearch, False, "tavily_api_kwargs"),
"CamoufoxGoogleLinkSearch": _SE_OPT(CamoufoxGoogleLinkSearch, True,
"cf_google_se_kwargs"),
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ crawl4ai
ddgs
fake_useragent>=2.0.3
google-api-python-client
google-search-results
html2text
httpx
langchain
Expand Down
Loading