Skip to content

Commit

Permalink
[enh] add examples and fix postprocessor
Browse files Browse the repository at this point in the history
[mod] remove not needed functions

[enh] about project
  • Loading branch information
allendema committed Aug 1, 2024
1 parent d20f948 commit 7948de8
Show file tree
Hide file tree
Showing 15 changed files with 770 additions and 120 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

### Custom ###
# Ignore the entire "data/" directory
data/
# data/
# Allow specific files within the "data/" directory
!data/my_basket.sqlite3
!data/rewe-dl-jsonpp-*.json
Expand Down
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

20 changes: 5 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
Python library to call the APIs that the store itself calls.
To get good parsed results from those API calls see `parser.py`.

![GitHub Issues or Pull Requests](https://shields.sp-codes.de/github/issues/allendema/rewe_dl)
![Python Version from PEP 621 TOML](https://shields.sp-codes.de/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fallendema%2Frewe_dl%2Fmain%2Fpyproject.toml)
![Formatter](https://shields.sp-codes.de/badge/formatter-ruff-blue)
[![Python Version from PEP 621 TOML](https://shields.sp-codes.de/python/required-version-toml?tomlFilePath=https%3A%2F%2Fraw.githubusercontent.com%2Fallendema%2Frewe_dl%2Fmain%2Fpyproject.toml)](https://github.com/allendema/rewe_dl/blob/main/pyproject.toml)
[![Formatter](https://shields.sp-codes.de/badge/formatter-ruff-blue)](https://github.com/astral-sh/ruff)
[![GitHub Issues or Pull Requests](https://shields.sp-codes.de/github/issues/allendema/rewe_dl)](https://github.com/allendema/rewe_dl/issues)
[![github commits](https://shields.sp-codes.de/github/last-commit/allendema/rewe_dl)](https://github.com/allendema/rewe_dl/commits/main)

## Usage
```bash
Expand Down Expand Up @@ -39,18 +40,7 @@ You can:

<details>
<summary>examples/discounted_to_sql.py</summary>
```python
def main():
discounted_products = STORE().get_discounted_products()

all_products = Parser().parse_search_results_products(discounted_products)

this_file = Path(__file__).stem
todays_date = datetime.today().strftime("%Y-%m-%d")

file_name = f"{this_file}-{todays_date}.sqlite3"

SqlPP.save_to_sql(all_products, file_name)
```python
```
</details>

Expand Down
1 change: 1 addition & 0 deletions data/discounted_to_json-2024-08-01.json

Large diffs are not rendered by default.

Binary file added data/discounted_to_sql-2024-08-01.sqlite3
Binary file not shown.
Binary file added data/my_basket.sqlite3
Binary file not shown.
Binary file added data/newest_products_to_sql.sqlite3
Binary file not shown.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
httpx
# apprise is options - used for notifications
apprise
# apprise is optional
4 changes: 2 additions & 2 deletions rewe_dl/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
},
"cookiesB": {"marketsCookie": {
"online": {"wwIdent":"1940419","marketZipCode":"56073","serviceTypes":["PICKUP"],"customerZipCode":"56073"},
"stationary": {"wwIdent":"1763154","marketZipCode":"56179","serviceTypes":["STATIONARY"]}}
"stationary": {"wwIdent":"1940419","marketZipCode":"56073","serviceTypes":["STATIONARY"]}}
},
"cookies": {
"marketsCookie": "%7B%22online%22%3A%7B%22wwIdent%22%3A%221940419%22%2C%22marketZipCode%22%3A%2256073%22%2C%22serviceTypes%22%3A%5B%22PICKUP%22%5D%2C%22customerZipCode%22%3A%2256073%22%7D%2C%22stationary%22%3A%7B%22wwIdent%22%3A%221763154%22%2C%22marketZipCode%22%3A%2256179%22%2C%22serviceTypes%22%3A%5B%22STATIONARY%22%5D%7D%7D",
"marketsCookie": "%7B%22online%22%3A%7B%22wwIdent%22%3A%221940419%22%2C%22marketZipCode%22%3A%2256073%22%2C%22serviceTypes%22%3A%5B%22PICKUP%22%5D%2C%22customerZipCode%22%3A%2256073%22%7D%2C%22stationary%22%3A%7B%22wwIdent%22%3A%221940419%22%2C%22marketZipCode%22%3A%2256073%22%2C%22serviceTypes%22%3A%5B%22STATIONARY%22%5D%7D%7D",
"websitebot-launch": "human-mousemove"
}
}
18 changes: 9 additions & 9 deletions rewe_dl/examples/discounted_to_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,30 @@
from datetime import datetime

PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.dirname(PROJECT_DIR))
PROJECT_ROOT = os.path.dirname(PROJECT_DIR)
DATA_FOLDER = os.path.join(os.path.dirname(PROJECT_ROOT), "data")

sys.path.append(PROJECT_ROOT)

from rewe import STORE
from parser import Parser
from postprocessor.output import JsonPP
from postprocessor.metadata import MetadataPP

log = logging.getLogger(__name__)


def main():
my_store = STORE(store_id="8534540")

discounted = my_store.get_discounted_products()

product_infos = my_store.product_infos(product_ids=my_store.product_ids(discounted))
discounted_products = my_store.get_discounted_products()

all_products = [product_md for product_md in Parser().parse_product_infos(product_infos)]
all_products = Parser().parse_search_results_products(discounted_products)

todays_date = datetime.today().strftime("%Y-%m-%d")
this_file = Path(__file__).stem

file_name = f"{this_file}-{todays_date}.json"

JsonPP.savings_to_json(all_products, file_name)
options = {"directory": DATA_FOLDER, "filename": f"{this_file}-{todays_date}.json", "mode": "json"}
MetadataPP(all_products, options=options).run()


if __name__ == "__main__":
Expand Down
4 changes: 3 additions & 1 deletion rewe_dl/examples/discounted_to_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@


def main():
discounted_products = STORE().get_discounted_products()
my_store = STORE(store_id="8534540")

discounted_products = my_store.get_discounted_products()

all_products = Parser().parse_search_results_products(discounted_products)

Expand Down
47 changes: 47 additions & 0 deletions rewe_dl/examples/raw_responses_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

# Copyright 2023-2024 Allen Dema
from __future__ import annotations

import os
import sys
import logging
from pathlib import Path
from datetime import datetime

PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(PROJECT_DIR)
DATA_FOLDER = os.path.join(os.path.dirname(PROJECT_ROOT), "data")

sys.path.append(PROJECT_ROOT)

from rewe import STORE
from parser import Parser
from postprocessor.metadata import MetadataPP

log = logging.getLogger(__name__)


def main():
    """Dump every raw search-response page for one query to its own JSON file.

    Files are written into ``DATA_FOLDER`` and named
    ``<script>-<query>-<page>-<date>.json``.
    """
    query = "ja"
    store = STORE(store_id="8534540")

    this_file = Path(__file__).stem
    todays_date = datetime.today().strftime("%Y-%m-%d")

    # list() drains the search iterable before the first file is written.
    responses = list(store.search(query, max_page=1))

    for page, response in enumerate(responses, start=1):
        filename = f"{this_file}-{query}-{page:02}-{todays_date}.json"

        # NOTE(review): "ident" is probably meant to be "indent" (the option
        # name used by gallery-dl's metadata post-processor, from which
        # MetadataPP is derived) - confirm against MetadataPP before renaming.
        options = dict(
            directory=DATA_FOLDER,
            filename=filename,
            mode="json",
            ident=4,
        )

        MetadataPP(response, options=options).run()


if __name__ == "__main__":
main()
26 changes: 19 additions & 7 deletions rewe_dl/postprocessor/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@
import os
import sys
import json
import types
import logging
from pathlib import Path

from postprocessor.common import PostProcessor

class MetadataPP:
log = logging.getLogger(__name__)


class MetadataPP(PostProcessor):
"""stripped down and modified from gallery-dl"""

def __init__(self, kwdict, options):
Expand Down Expand Up @@ -40,14 +46,18 @@ def _initialize_formatter(self):
self.directory.mkdir(parents=True, exist_ok=True)

if self.filename:
self.filename = Path(self.filename).resolve().absolute()
# Keep 'self.filename' relative: '.resolve().absolute()' would make it absolute, and joining an absolute filename in '_run()' discards 'self.directory'.
if self.filename and not self.directory:
self.directory = os.path.dirname(self.filename)
os.makedirs(self.directory, exist_ok=True)

def json_default(obj):
if isinstance(obj, None):
def json_default(self, obj):
    """Convert an object the JSON encoder cannot handle into a serializable value.

    Args:
        obj: The value that could not be serialized natively.

    Returns:
        ``None`` for ``None``, a list with the drained items for a
        generator, and ``str(obj)`` for everything else.
    """
    # 'types.NoneType' only exists on Python 3.10+; the identity check is
    # equivalent and works on every interpreter version.
    if obj is None:
        return None

    # A generator has no JSON form; drain it so its items get serialized.
    if isinstance(obj, types.GeneratorType):
        return list(obj)

    return str(obj)

def _make_encoder(self):
Expand All @@ -66,12 +76,14 @@ def _write_custom(self, fp, kwdict):
def _write_json(self, fp, kwdict):
fp.write(self._json_encode(kwdict) + "\n")

def _run_file(self, kwdict):
path = Path(self.directory, self.filename)
def _run(self, kwdict):
    """Write `kwdict` to the file `self.filename` inside `self.directory`.

    Missing parent directories are created first, the content is emitted
    through `self.writer`, and the resulting path is logged.
    """
    path = Path(self.directory).joinpath(self.filename)
    parent = path.parent
    parent.mkdir(parents=True, exist_ok=True)

    with path.open(self.open_mode, encoding=self.encoding) as fp:
        self.writer(fp, kwdict)

    log.info(f"Wrote to file: {path}")

def run(self):
return self._run_file(self.kwdict)
return self._run(self.kwdict)
1 change: 1 addition & 0 deletions rewe_dl/postprocessor/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
class JsonPP(PostProcessor):
def __init__(self, md_list, options):
PostProcessor.__init__(self, md_list, options)
self.log.warning("USE metadata.MetadataPP!")

@staticmethod
def to_json(md: dict = None, file_name: str = None):
Expand Down
90 changes: 6 additions & 84 deletions rewe_dl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,13 @@
import locale
import logging
from random import choice
from functools import partial

import httpx

log = logging.getLogger(__name__)

PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(PROJECT_DIR)

import exception

global YELLOW
global RED
YELLOW = "\033[1;32;40m"
RED = "\033[31m"
log = logging.getLogger(__name__)


def create_agents() -> dict:
Expand Down Expand Up @@ -69,7 +61,7 @@ def save_to_json(json_data: dict, file_name: str, indent=4, mode="w") -> None:
else:
raise ValueError("json_data must be a dict or a list of dicts!")
except (IOError, OSError) as e:
return str(e)
log.error(str(e))


def save_to_jsonl(json_data: dict, file_name: str) -> None:
Expand All @@ -91,78 +83,7 @@ def save_to_jsonl(json_data: dict, file_name: str) -> None:
raise ValueError("json_data must be a dict or a list of dicts!")


def _htmlentity_transform(entity_with_semicolon):
"""Transforms an HTML entity to a character."""
# mod from yt-dlp
import html
import contextlib

entity = entity_with_semicolon[:-1]

# Known non-numeric HTML entity
if entity in html.entities.name2codepoint:
return chr(html.entities.name2codepoint[entity])

# TODO: HTML5 allows entities without a semicolon.
# E.g. '&Eacuteric' should be decoded as 'Éric'.
if entity_with_semicolon in html.entities.html5:
return html.entities.html5[entity_with_semicolon]

mobj = re.match(r"#(x[0-9a-fA-F]+|[0-9]+)", entity)
if mobj is not None:
numstr = mobj.group(1)
if numstr.startswith("x"):
base = 16
numstr = "0%s" % numstr
else:
base = 10
# See https://github.com/ytdl-org/youtube-dl/issues/7518
with contextlib.suppress(ValueError):
return chr(int(numstr, base))

# Unknown entity in name, return its literal representation
return "&%s;" % entity


def unescapeHTML(s):
    """Replace every ``&...;`` entity in `s` with its decoded text.

    `None` is passed through unchanged; any other non-string input trips the
    assertion below.

    Adapted from:
    https://github.com/ytdl-patched/ytdl-patched/blob/8522226d2fea04d48802a9ef402438ff79227fe4/yt_dlp/utils.py#L826
    """
    if s is None:
        return None
    assert isinstance(s, str)

    def _decode(match):
        # group(1) is the entity without '&' but including the ';'
        return _htmlentity_transform(match.group(1))

    return re.sub(r"&([^&;]+;)", _decode, s)


def clean_html(html):
    """Turn an HTML snippet into readable plain text.

    Anything that is not a string (including `None`) is returned unchanged,
    which makes the function convenient for sanitizing optional fields.
    """
    import re

    # mod from https://github.com/ytdl-patched/ytdl-patched/blob/8522226d2fea04d48802a9ef402438ff79227fe4/yt_dlp/utils.py#L580
    if not isinstance(html, str):  # Convenience for sanitizing descriptions etc.
        return html

    # Applied in order: collapse whitespace, turn <br> and </p><p> into
    # newlines, then strip every remaining tag.
    substitutions = (
        (r"\s+", " "),
        (r"(?u)\s?<\s?br\s?/?\s?>\s?", "\n"),
        (r"(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>", "\n"),
        ("<.*?>", ""),
    )
    text = html
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Replace html entities
    return unescapeHTML(text).strip()


def escapeHTML(text):
    """Escape the characters ``& < > " '`` in `text` as HTML entities."""
    # from https://github.com/ytdl-patched/ytdl-patched/blob/8522226d2fea04d48802a9ef402438ff79227fe4/yt_dlp/utils.py#L835
    # '&' must come first so the ampersands of later entities stay intact.
    entity_table = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ('"', "&quot;"),
        ("'", "&#39;"),
    )
    escaped = text
    for char, entity in entity_table:
        escaped = escaped.replace(char, entity)
    return escaped


def read_file(file_name: str):
def read_file(file_name: str) -> list:
with open(file_name) as file:
data = file.readlines()
file.close()
Expand All @@ -187,9 +108,10 @@ def append_to_file(content, file_name: str) -> None:


def slugify(value):
"""# https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L42C1-L50C1
"""
Convert a string to a URL slug
# https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L42C1-L50C1
Adapted from:
https://github.com/django/django/blob/master/django/utils/text.py
"""
Expand All @@ -212,5 +134,5 @@ def load_config(config_path: str = None) -> dict:


# No '@staticmethod' here: this is a module-level function, and a
# staticmethod object is not callable before Python 3.10.
def json_compact(obj) -> str:
    """Serialize `obj` to compact JSON text.

    Args:
        obj: Any value `json.dumps` can serialize.

    Returns:
        The JSON string with keys sorted and no whitespace after the
        ``,`` and ``:`` separators.
    """
    return json.dumps(obj, separators=(",", ":"), sort_keys=True)

0 comments on commit 7948de8

Please sign in to comment.