Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GitHub workflow #2

Merged
Merged 11 commits on Mar 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions .github/workflows/tests-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,20 @@ name: Tests runner

on:
push:
branches-ignore:
- '*'

jobs:
build:

runs-on: ubuntu-latest

strategy:
matrix:
python-version: [3.10, 3.11]

steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
python-version: "3.10"

- name: Install Poetry
run: |
Expand All @@ -33,5 +27,8 @@ jobs:
- name: Install dependencies
run: poetry install

- name: Linting & mypy
run: poetry run nox

- name: Run tests
run: poetry run pytest
run: poetry run pytest -m "not local_only"
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ repos:
rev: 4.0.1
hooks:
- id: flake8
args: ["--ignore=ANN101,ANN102,W503", "--max-line-length=120"]
args: ["--ignore=ANN101,ANN102,W503,INP001", "--max-line-length=120"]
additional_dependencies:
- flake8-annotations==2.9.0
- flake8-bugbear==21.9.2
Expand Down
46 changes: 12 additions & 34 deletions bearish/scrapers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,18 @@

import pandas as pd
import simplejson
import undetected_chromedriver as uc # type: ignore
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, computed_field
from selenium.common import MoveTargetOutOfBoundsException, TimeoutException
from selenium.webdriver import ActionChains, Chrome, Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webdriver import WebDriver as BaseWebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait


from bearish.scrapers.model import HistoricalData


from bearish.scrapers.settings import TradingCountry, InvestingCountry
from bearish.scrapers.model import HistoricalData, _clean
from bearish.scrapers.settings import InvestingCountry, TradingCountry
from bearish.scrapers.type import Locator


Expand Down Expand Up @@ -88,13 +85,11 @@ def move_by_x_offset_from_left_border(element: BaseElement, x_offset: int) -> bo
return right_border


def init_chrome(load_strategy_none: bool = False, headless: bool = False) -> Chrome:
option = Options()
def init_chrome(headless: bool = True) -> uc.Chrome:
options = {}
if headless:
option.add_argument("--headless")
if load_strategy_none:
option.page_load_strategy = "none"
return Chrome(options=option)
options.update({"headless": True})
return uc.Chrome(use_subprocess=False, version_main=121, **options)


def bearish_path_fun() -> Path:
Expand All @@ -108,16 +103,6 @@ class BaseSettings(BaseModel):
...


def clean_dict(data: Dict[str, Any]) -> Dict[str, Any]:
cleaned_data = {}
for name, value in data.items():
if isinstance(value, dict):
cleaned_data[str(name)] = clean_dict(value)
else:
cleaned_data[str(name)] = value
return cleaned_data


def _replace_values(
tables: list[pd.DataFrame], replace_values: Dict[str, str]
) -> list[pd.DataFrame]:
Expand Down Expand Up @@ -159,36 +144,29 @@ def move_from_left_to_right_border(
return actions


def _get_country_name_per_enum(enum: Type[TradingCountry] | Type[InvestingCountry], country: Locator | int) -> str:
def _get_country_name_per_enum(
enum: Type[TradingCountry] | Type[InvestingCountry], country: Locator | int
) -> str:
return next(
k
for k, v in enum.__dict__.items()
if isinstance(v, (Locator, int)) and v == country
)


def _clean(
data: List[Dict[str, Any]] | Dict[str, Any]
) -> List[Dict[str, Any]] | Dict[str, Any]:
if isinstance(data, list):
return [clean_dict(data_) for data_ in data]
else:
return clean_dict(data)


class CountryNameMixin:
@abc.abstractmethod
def _get_country_name(self) -> str:
...



class BasePage(BaseModel):
url: str
source: Literal["trading", "investing", "yahoo"]
settings: BaseSettings
browser: WebDriver = Field(default_factory=init_chrome, description="")
bearish_path: Path = Field(default_factory=bearish_path_fun, description="")
first_page_only: Optional[bool] = False
model_config = ConfigDict(arbitrary_types_allowed=True, use_enum_values=True)
_tables = PrivateAttr(default_factory=list)
_skip_existing = PrivateAttr(default=True)
Expand Down Expand Up @@ -296,7 +274,7 @@ def folder_path(self) -> Path:

if hasattr(self, "country"):

path = path / self._get_country_name()
path = path / self._get_country_name() # type: ignore
elif hasattr(self, "exchange"):
path = path / self.exchange
else:
Expand Down
24 changes: 17 additions & 7 deletions bearish/scrapers/investing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextlib
import datetime
from functools import partial
from typing import Any, Dict, List, Literal

Expand All @@ -20,9 +21,11 @@
_get_country_name_per_enum,
init_chrome,
)
from bearish.scrapers.type import Locator
from bearish.scrapers.model import HistoricalData
from bearish.scrapers.settings import InvestingCountry
from bearish.scrapers.type import Locator

ONE_PAGE = 3

COLUMNS_LENGTH = 2

Expand Down Expand Up @@ -78,12 +81,20 @@ def get_statements_urls(self, exchange: str) -> List[str]:
]


class UpdateInvestingSettings(InvestingSettings):
start_date: str = Field(
default_factory=lambda: (
datetime.date.today() - datetime.timedelta(days=1)
).strftime("%d-%m-%Y")
)


class InvestingScreenerScraper(BasePage, CountryNameMixin):
country: int
settings: InvestingSettings = Field(default=InvestingSettings())
source: Literal["trading", "investing", "yahoo"] = "investing"
browser: WebDriver = Field(
default_factory=lambda: init_chrome(load_strategy_none=True, headless=True),
default_factory=lambda: init_chrome(headless=True),
description="",
)

Expand All @@ -101,7 +112,8 @@ def url_validator(cls, data: Dict[str, Any]) -> Dict[str, Any]:
}

def click_one_trust_button(self) -> None:
self.click(self.settings.one_trust_button)
with contextlib.suppress(TimeoutException):
self.click(self.settings.one_trust_button)

def _preprocess_tables(self) -> List[Dict[str, Any]]:
dataframe = pd.concat([table[-1] for table in self._tables])
Expand Down Expand Up @@ -133,6 +145,8 @@ def read_next_pages(self) -> None:
except (ElementClickInterceptedException, TimeoutException):
break
page_number += 1
if (page_number == ONE_PAGE) and self.first_page_only:
break

def _custom_scrape(self) -> list[dict[str, Any]]:
self.click_one_trust_button()
Expand All @@ -145,10 +159,6 @@ class InvestingTickerScraper(BaseTickerPage):
exchange: str
source: Literal["trading", "investing", "yahoo"] = "investing"
settings: InvestingSettings = Field(default=InvestingSettings())
browser: WebDriver = Field(
default_factory=lambda: init_chrome(load_strategy_none=True, headless=False),
description="",
)

@model_validator(mode="before")
@classmethod
Expand Down
67 changes: 49 additions & 18 deletions bearish/scrapers/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,13 @@
from typing import Any, Dict, Literal, Optional, Type, Union

from pydantic import BaseModel, ConfigDict, Field
from selenium.webdriver.chrome.webdriver import WebDriver

from bearish.scrapers.base import BasePage, bearish_path_fun
from bearish.scrapers.investing import (
InvestingScreenerScraper,
InvestingTickerScraper,
)
from bearish.scrapers.base import BasePage, BaseSettings, bearish_path_fun, init_chrome
from bearish.scrapers.investing import InvestingScreenerScraper, InvestingTickerScraper
from bearish.scrapers.model import Ticker, merge, unflatten_json
from bearish.scrapers.trading import (
TradingScreenerScraper,
TradingTickerScraper,
)
from bearish.scrapers.settings import TradingCountry, InvestingCountry
from bearish.scrapers.settings import InvestingCountry, TradingCountry
from bearish.scrapers.trading import TradingScreenerScraper, TradingTickerScraper

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -54,38 +49,57 @@ class Scraper(BaseModel):
)
source: Source
country: Literal["germany", "france", "belgium", "usa"]
settings: Optional[BaseSettings] = None
browser: WebDriver = Field(default_factory=init_chrome, description="")

def _screener_scraper(self) -> BasePage:
def _screener_scraper(self, first_page_only: bool = False) -> BasePage:
return self.source.screener( # type: ignore
country=getattr(self.source.country, self.country),
bearish_path=self.bearish_path,
first_page_only=first_page_only,
settings=self.settings,
browser=self.browser,
)

def scrape(
self, skip_existing: bool = True, symbols: Optional[list[str]] = None
self,
skip_existing: bool = True,
symbols: Optional[list[str]] = None,
first_page_only: bool = False,
) -> None:
screener_scraper = self._screener_scraper()
screener_scraper = self._screener_scraper(first_page_only=first_page_only)
screener_scraper.scrape(skip_existing=skip_existing)
tickers = Ticker.from_json(screener_scraper.get_stored_raw())
tickers = Ticker.from_json(
screener_scraper.get_stored_raw(), source=screener_scraper.source
)
tickers = _filter_by_symbols(tickers=tickers, symbols=symbols)
for ticker in tickers:
scraper = self.source.ticker( # type: ignore
exchange=ticker.reference, bearish_path=self.bearish_path
browser=self.browser,
exchange=ticker.reference,
bearish_path=self.bearish_path,
settings=self.settings,
)
try:
scraper.scrape(skip_existing=skip_existing)
except Exception as e:
logger.error(f"Fail {ticker.reference}. reason: {e}")

def create_db_json(self) -> list[Dict[str, Any]]:
def create_db_json(
self, symbols: Optional[list[str]] = None
) -> list[Dict[str, Any]]:
scraper = self._screener_scraper()
if not scraper.get_stored_raw().exists():
return []
tickers = Ticker.from_json(scraper.get_stored_raw())
tickers = Ticker.from_json(scraper.get_stored_raw(), source=scraper.source)
db_json = []
tickers = _filter_by_symbols(tickers=tickers, symbols=symbols)
for ticker in tickers:
ticker_scraper = self.source.ticker( # type: ignore
browser=None, exchange=ticker.reference, bearish_path=self.bearish_path
browser=self.browser,
exchange=ticker.reference,
bearish_path=self.bearish_path,
settings=self.settings,
)
if not ticker_scraper.get_stored_raw().exists():
continue
Expand All @@ -95,3 +109,20 @@ def create_db_json(self) -> list[Dict[str, Any]]:
merge(Ticker, ticker, ticker_)
db_json.append(ticker.model_dump())
return db_json

def update_db_json(self, db_json_path: Path) -> None:
db_json = json.loads(db_json_path.read_text())
tickers = [Ticker(**ticker_json) for ticker_json in db_json]
for ticker in tickers:
ticker_scraper = self.source.ticker( # type: ignore
browser=self.browser,
exchange=ticker.reference,
bearish_path=self.bearish_path,
settings=self.settings,
)
if ticker_scraper.source != ticker.source:
continue
records = ticker_scraper.scrape(skip_existing=False)
if not records:
continue
Ticker.from_record(records, source=ticker.source)
Loading
Loading