Skip to content

Commit

Permalink
Fix code
Browse files: browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Nov 20, 2023
1 parent 9cb9559 commit e064125
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 10 deletions.
1 change: 0 additions & 1 deletion archive_query_log/cli/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,4 +307,3 @@ def warc_query_import(config: Config, services_path: Path) -> None:
from archive_query_log.imports.yaml import import_warc_query_parsers
WarcQueryParser.init(using=config.es.client)
import_warc_query_parsers(config, services_path)

5 changes: 1 addition & 4 deletions archive_query_log/imports/yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from click import echo
from click import prompt
from cssselect import HTMLTranslator
from diskcache import Index
from elasticsearch_dsl.query import Terms
from tqdm.auto import tqdm
Expand Down Expand Up @@ -339,16 +338,14 @@ def import_warc_query_parsers(config: Config, services_path: Path) -> None:
services_list: Sequence[dict] = safe_load(file)
echo(f"Found {len(services_list)} service definitions.")

translator = HTMLTranslator()

services: Iterable[dict] = services_list
# noinspection PyTypeChecker
services = tqdm(
services,
desc="Import parsers for providers",
unit="provider",
)
for i, service in enumerate(services):
for service in services:
if ("domains" not in service or
"interpreted_query_parsers" not in service):
continue
Expand Down
4 changes: 2 additions & 2 deletions archive_query_log/monitoring/home.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ class Progress(NamedTuple):
DocumentType = Type[BaseDocument]

_statistics_cache: dict[
tuple[DocumentType,
tuple[str, ...]], Statistics,
tuple[DocumentType, tuple[str, ...]],
Statistics,
] = ExpiringDict(
max_len=100,
max_age_seconds=30,
Expand Down
9 changes: 6 additions & 3 deletions archive_query_log/parsers/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@

from cssselect import GenericTranslator
from cssselect.parser import parse as cssselect_parse
# pylint: disable=no-name-in-module
from lxml.etree import parse as etree_parse, XMLParser, HTMLParser
# noinspection PyProtectedMember
# pylint: disable=no-name-in-module
from lxml.etree import _ElementTree
from warcio.recordloader import ArcWarcRecord

Expand All @@ -20,6 +22,7 @@ def parse_xml_tree(record: ArcWarcRecord) -> _ElementTree | None:
warn(UserWarning("No MIME type given."))
return None
mime_type = mime_type.split(";", maxsplit=1)[0]
parser: XMLParser | HTMLParser
if mime_type == "text/xml":
parser = XMLParser()
elif mime_type == "text/html":
Expand All @@ -42,7 +45,7 @@ def get_xml_xpath_non_empty_string(
raise ValueError(
f"XPath {xpath} did not return a list, was: {type(results)}")
if not all(isinstance(result, str) for result in results):
types = ", ".join(type(result) for result in results)
types = ", ".join(str(type(result)) for result in results)
raise ValueError(
f"XPath {xpath} did not return a list of strings, found: {types}")
results = (result.strip() for result in results)
Expand All @@ -61,7 +64,7 @@ def get_xml_xpath_non_empty_string(
f"XPath {xpath} did not return a string, was: {type(result)}")


translator = GenericTranslator()
_translator = GenericTranslator()


def text_xpath_from_css_selector(
Expand All @@ -78,7 +81,7 @@ def text_xpath_from_css_selector(
selectors = cssselect_parse(css_selector)

xpaths = (
"//" + translator.selector_to_xpath(
"//" + _translator.selector_to_xpath(
selector,
prefix="",
translate_pseudo_elements=True,
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ disable = [
"logging-fstring-interpolation"
]

# Bandit checks skipped project-wide.
# NOTE(review): B320 and B410 appear to be Bandit's lxml blacklist checks
# (lxml.etree calls and lxml imports, respectively); presumably skipped
# because the XML parsers module imports from lxml.etree — confirm against
# the Bandit documentation for the pinned version.
[tool.bandit]
skips = ["B320", "B410"]

[tool.bandit.assert_used]
skips = ["**/test_*.py"]

Expand Down

0 comments on commit e064125

Please sign in to comment.