Merge pull request #2 from andoludo/github-workflow
GitHub workflow
andoludo authored Mar 16, 2024
2 parents 219648f + 29a163a commit d45050a
Showing 14 changed files with 704 additions and 111 deletions.
15 changes: 6 additions & 9 deletions .github/workflows/tests-runner.yaml
@@ -2,26 +2,20 @@ name: Tests runner

on:
  push:
    branches-ignore:
      - '*'

jobs:
  build:

    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: [3.10, 3.11]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          python-version: "3.10"

      - name: Install Poetry
        run: |
@@ -33,5 +27,8 @@ jobs:
      - name: Install dependencies
        run: poetry install

      - name: Linting & mypy
        run: poetry run nox

      - name: Run tests
        run: poetry run pytest
        run: poetry run pytest -m "not local_only"
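The new "Linting & mypy" step assumes a noxfile.py in the repository, and the -m "not local_only" filter assumes a local_only pytest marker used to tag tests that cannot run on CI (for example tests needing a locally installed browser). Neither file is shown in this diff; the sketch below only illustrates what such a noxfile might contain, with session names and tool choices assumed rather than taken from the repository.

# noxfile.py - illustrative sketch only, not part of this commit.
import nox


@nox.session
def lint(session: nox.Session) -> None:
    # Assumed to mirror the flake8 settings from .pre-commit-config.yaml below.
    session.install("flake8")
    session.run("flake8", "bearish", "--max-line-length=120")


@nox.session
def mypy(session: nox.Session) -> None:
    session.install("mypy")
    session.run("mypy", "bearish")
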
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -19,7 +19,7 @@ repos:
    rev: 4.0.1
    hooks:
      - id: flake8
        args: ["--ignore=ANN101,ANN102,W503", "--max-line-length=120"]
        args: ["--ignore=ANN101,ANN102,W503,INP001", "--max-line-length=120"]
        additional_dependencies:
          - flake8-annotations==2.9.0
          - flake8-bugbear==21.9.2
46 changes: 12 additions & 34 deletions bearish/scrapers/base.py
@@ -9,21 +9,18 @@

import pandas as pd
import simplejson
import undetected_chromedriver as uc # type: ignore
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, computed_field
from selenium.common import MoveTargetOutOfBoundsException, TimeoutException
from selenium.webdriver import ActionChains, Chrome, Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webdriver import WebDriver as BaseWebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait


from bearish.scrapers.model import HistoricalData


from bearish.scrapers.settings import TradingCountry, InvestingCountry
from bearish.scrapers.model import HistoricalData, _clean
from bearish.scrapers.settings import InvestingCountry, TradingCountry
from bearish.scrapers.type import Locator


@@ -88,13 +85,11 @@ def move_by_x_offset_from_left_border(element: BaseElement, x_offset: int) -> bo
    return right_border


def init_chrome(load_strategy_none: bool = False, headless: bool = False) -> Chrome:
    option = Options()
def init_chrome(headless: bool = True) -> uc.Chrome:
    options = {}
    if headless:
        option.add_argument("--headless")
    if load_strategy_none:
        option.page_load_strategy = "none"
    return Chrome(options=option)
        options.update({"headless": True})
    return uc.Chrome(use_subprocess=False, version_main=121, **options)


def bearish_path_fun() -> Path:
@@ -108,16 +103,6 @@ class BaseSettings(BaseModel):
    ...


def clean_dict(data: Dict[str, Any]) -> Dict[str, Any]:
    cleaned_data = {}
    for name, value in data.items():
        if isinstance(value, dict):
            cleaned_data[str(name)] = clean_dict(value)
        else:
            cleaned_data[str(name)] = value
    return cleaned_data


def _replace_values(
    tables: list[pd.DataFrame], replace_values: Dict[str, str]
) -> list[pd.DataFrame]:
@@ -159,36 +144,29 @@ def move_from_left_to_right_border(
    return actions


def _get_country_name_per_enum(enum: Type[TradingCountry] | Type[InvestingCountry], country: Locator | int) -> str:
def _get_country_name_per_enum(
    enum: Type[TradingCountry] | Type[InvestingCountry], country: Locator | int
) -> str:
    return next(
        k
        for k, v in enum.__dict__.items()
        if isinstance(v, (Locator, int)) and v == country
    )


def _clean(
    data: List[Dict[str, Any]] | Dict[str, Any]
) -> List[Dict[str, Any]] | Dict[str, Any]:
    if isinstance(data, list):
        return [clean_dict(data_) for data_ in data]
    else:
        return clean_dict(data)


class CountryNameMixin:
    @abc.abstractmethod
    def _get_country_name(self) -> str:
        ...



class BasePage(BaseModel):
    url: str
    source: Literal["trading", "investing", "yahoo"]
    settings: BaseSettings
    browser: WebDriver = Field(default_factory=init_chrome, description="")
    bearish_path: Path = Field(default_factory=bearish_path_fun, description="")
    first_page_only: Optional[bool] = False
    model_config = ConfigDict(arbitrary_types_allowed=True, use_enum_values=True)
    _tables = PrivateAttr(default_factory=list)
    _skip_existing = PrivateAttr(default=True)
@@ -296,7 +274,7 @@ def folder_path(self) -> Path:

        if hasattr(self, "country"):

            path = path / self._get_country_name()
            path = path / self._get_country_name()  # type: ignore
        elif hasattr(self, "exchange"):
            path = path / self.exchange
        else:
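In base.py the plain selenium Chrome is replaced by undetected_chromedriver: init_chrome now defaults to headless, drops the page-load-strategy switch, and pins the driver to Chrome major version 121 via version_main=121, so the browser installed on the runner has to match that major version. A usage sketch of the new helper follows; the URL and the try/finally wrapper are illustrative and not taken from the repository.

# Illustrative usage of the new init_chrome, not part of this commit.
from bearish.scrapers.base import init_chrome

driver = init_chrome(headless=True)  # an undetected_chromedriver uc.Chrome instance
try:
    driver.get("https://www.investing.com")  # example target, assumed
    print(driver.title)
finally:
    driver.quit()
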
24 changes: 17 additions & 7 deletions bearish/scrapers/investing.py
@@ -1,4 +1,5 @@
import contextlib
import datetime
from functools import partial
from typing import Any, Dict, List, Literal

@@ -20,9 +21,11 @@
    _get_country_name_per_enum,
    init_chrome,
)
from bearish.scrapers.type import Locator
from bearish.scrapers.model import HistoricalData
from bearish.scrapers.settings import InvestingCountry
from bearish.scrapers.type import Locator

ONE_PAGE = 3

COLUMNS_LENGTH = 2

@@ -78,12 +81,20 @@ def get_statements_urls(self, exchange: str) -> List[str]:
        ]


class UpdateInvestingSettings(InvestingSettings):
    start_date: str = Field(
        default_factory=lambda: (
            datetime.date.today() - datetime.timedelta(days=1)
        ).strftime("%d-%m-%Y")
    )


class InvestingScreenerScraper(BasePage, CountryNameMixin):
    country: int
    settings: InvestingSettings = Field(default=InvestingSettings())
    source: Literal["trading", "investing", "yahoo"] = "investing"
    browser: WebDriver = Field(
        default_factory=lambda: init_chrome(load_strategy_none=True, headless=True),
        default_factory=lambda: init_chrome(headless=True),
        description="",
    )

@@ -101,7 +112,8 @@ def url_validator(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        }

    def click_one_trust_button(self) -> None:
        self.click(self.settings.one_trust_button)
        with contextlib.suppress(TimeoutException):
            self.click(self.settings.one_trust_button)

    def _preprocess_tables(self) -> List[Dict[str, Any]]:
        dataframe = pd.concat([table[-1] for table in self._tables])
@@ -133,6 +145,8 @@ def read_next_pages(self) -> None:
            except (ElementClickInterceptedException, TimeoutException):
                break
            page_number += 1
            if (page_number == ONE_PAGE) and self.first_page_only:
                break

    def _custom_scrape(self) -> list[dict[str, Any]]:
        self.click_one_trust_button()
@@ -145,10 +159,6 @@ class InvestingTickerScraper(BaseTickerPage):
    exchange: str
    source: Literal["trading", "investing", "yahoo"] = "investing"
    settings: InvestingSettings = Field(default=InvestingSettings())
    browser: WebDriver = Field(
        default_factory=lambda: init_chrome(load_strategy_none=True, headless=False),
        description="",
    )

    @model_validator(mode="before")
    @classmethod
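investing.py gains an UpdateInvestingSettings class whose start_date defaults to yesterday's date in day-month-year format, the cookie-consent click is now wrapped in contextlib.suppress(TimeoutException) so a missing banner no longer aborts the scrape, and read_next_pages stops early when first_page_only is set. Below is a quick, illustrative check of the new default, assuming UpdateInvestingSettings can be constructed without arguments (as InvestingSettings is above); it is not part of the commit.

# Illustrative only, not part of this commit.
import datetime

from bearish.scrapers.investing import UpdateInvestingSettings

settings = UpdateInvestingSettings()
# start_date defaults to yesterday in "%d-%m-%Y" form, e.g. "15-03-2024" on 16 March 2024.
assert settings.start_date == (
    datetime.date.today() - datetime.timedelta(days=1)
).strftime("%d-%m-%Y")
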
67 changes: 49 additions & 18 deletions bearish/scrapers/main.py
@@ -4,18 +4,13 @@
from typing import Any, Dict, Literal, Optional, Type, Union

from pydantic import BaseModel, ConfigDict, Field
from selenium.webdriver.chrome.webdriver import WebDriver

from bearish.scrapers.base import BasePage, bearish_path_fun
from bearish.scrapers.investing import (
    InvestingScreenerScraper,
    InvestingTickerScraper,
)
from bearish.scrapers.base import BasePage, BaseSettings, bearish_path_fun, init_chrome
from bearish.scrapers.investing import InvestingScreenerScraper, InvestingTickerScraper
from bearish.scrapers.model import Ticker, merge, unflatten_json
from bearish.scrapers.trading import (
    TradingScreenerScraper,
    TradingTickerScraper,
)
from bearish.scrapers.settings import TradingCountry, InvestingCountry
from bearish.scrapers.settings import InvestingCountry, TradingCountry
from bearish.scrapers.trading import TradingScreenerScraper, TradingTickerScraper

logger = logging.getLogger(__name__)

@@ -54,38 +49,57 @@ class Scraper(BaseModel):
    )
    source: Source
    country: Literal["germany", "france", "belgium", "usa"]
    settings: Optional[BaseSettings] = None
    browser: WebDriver = Field(default_factory=init_chrome, description="")

    def _screener_scraper(self) -> BasePage:
    def _screener_scraper(self, first_page_only: bool = False) -> BasePage:
        return self.source.screener(  # type: ignore
            country=getattr(self.source.country, self.country),
            bearish_path=self.bearish_path,
            first_page_only=first_page_only,
            settings=self.settings,
            browser=self.browser,
        )

    def scrape(
        self, skip_existing: bool = True, symbols: Optional[list[str]] = None
        self,
        skip_existing: bool = True,
        symbols: Optional[list[str]] = None,
        first_page_only: bool = False,
    ) -> None:
        screener_scraper = self._screener_scraper()
        screener_scraper = self._screener_scraper(first_page_only=first_page_only)
        screener_scraper.scrape(skip_existing=skip_existing)
        tickers = Ticker.from_json(screener_scraper.get_stored_raw())
        tickers = Ticker.from_json(
            screener_scraper.get_stored_raw(), source=screener_scraper.source
        )
        tickers = _filter_by_symbols(tickers=tickers, symbols=symbols)
        for ticker in tickers:
            scraper = self.source.ticker(  # type: ignore
                exchange=ticker.reference, bearish_path=self.bearish_path
                browser=self.browser,
                exchange=ticker.reference,
                bearish_path=self.bearish_path,
                settings=self.settings,
            )
            try:
                scraper.scrape(skip_existing=skip_existing)
            except Exception as e:
                logger.error(f"Fail {ticker.reference}. reason: {e}")

    def create_db_json(self) -> list[Dict[str, Any]]:
    def create_db_json(
        self, symbols: Optional[list[str]] = None
    ) -> list[Dict[str, Any]]:
        scraper = self._screener_scraper()
        if not scraper.get_stored_raw().exists():
            return []
        tickers = Ticker.from_json(scraper.get_stored_raw())
        tickers = Ticker.from_json(scraper.get_stored_raw(), source=scraper.source)
        db_json = []
        tickers = _filter_by_symbols(tickers=tickers, symbols=symbols)
        for ticker in tickers:
            ticker_scraper = self.source.ticker(  # type: ignore
                browser=None, exchange=ticker.reference, bearish_path=self.bearish_path
                browser=self.browser,
                exchange=ticker.reference,
                bearish_path=self.bearish_path,
                settings=self.settings,
            )
            if not ticker_scraper.get_stored_raw().exists():
                continue
@@ -95,3 +109,20 @@ def create_db_json(self) -> list[Dict[str, Any]]:
            merge(Ticker, ticker, ticker_)
            db_json.append(ticker.model_dump())
        return db_json

    def update_db_json(self, db_json_path: Path) -> None:
        db_json = json.loads(db_json_path.read_text())
        tickers = [Ticker(**ticker_json) for ticker_json in db_json]
        for ticker in tickers:
            ticker_scraper = self.source.ticker(  # type: ignore
                browser=self.browser,
                exchange=ticker.reference,
                bearish_path=self.bearish_path,
                settings=self.settings,
            )
            if ticker_scraper.source != ticker.source:
                continue
            records = ticker_scraper.scrape(skip_existing=False)
            if not records:
                continue
            Ticker.from_record(records, source=ticker.source)
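Taken together, main.py now threads a single shared browser and optional settings through the screener and ticker scrapers, lets scrape() stop after the first screener page, and adds update_db_json() to re-scrape the tickers referenced in an existing JSON dump. The driver script below is only a sketch: this excerpt does not show how Source is defined, so its construction here (screener/ticker classes plus a country enum) is an assumption inferred from how Scraper uses it.

# Illustrative driver script, not part of this commit; the Source fields are assumed.
from pathlib import Path

from bearish.scrapers.investing import InvestingScreenerScraper, InvestingTickerScraper
from bearish.scrapers.main import Scraper, Source
from bearish.scrapers.settings import InvestingCountry

scraper = Scraper(  # constructing Scraper starts a Chrome session via init_chrome
    source=Source(
        screener=InvestingScreenerScraper,
        ticker=InvestingTickerScraper,
        country=InvestingCountry,
    ),
    country="france",
)
scraper.scrape(first_page_only=True)     # screener plus ticker pass, first page only
db_json = scraper.create_db_json()       # assemble per-ticker JSON records
scraper.update_db_json(Path("db.json"))  # re-scrape tickers listed in an existing dump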