Skip to content

Commit

Permalink
added direct answers
Browse files Browse the repository at this point in the history
  • Loading branch information
JKlueber committed Apr 10, 2024
1 parent dabe86e commit 2da7caa
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 0 deletions.
65 changes: 65 additions & 0 deletions archive_query_log/cli/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
UrlQueryParser, UrlPageParserType, UrlPageParser, \
UrlOffsetParser, UrlOffsetParserType, WarcQueryParserType, \
WarcQueryParser, WarcSnippetsParserType, WarcSnippetsParser, \
WarcDirectAnswersParserType, WarcDirectAnswersParser, \
WarcMainContentParserType, WarcMainContentParser


Expand Down Expand Up @@ -380,6 +381,70 @@ def warc_snippets_import(config: Config, services_path: Path) -> None:
import_warc_snippets_parsers(config, services_path)


@parsers.group()
def warc_direct_answers() -> None:
    # Click command group for the WARC direct-answers parser commands
    # ("add" and "import" are registered on it). The group callback
    # itself does nothing.
    return None


# Parser types accepted by the ``--parser-type`` option of the
# ``warc_direct_answers add`` command.
CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE = ["xpath"]


# NOTE(review): the ``--priority`` and ``--title-xpath`` options were
# declared here without matching callback parameters. Click passes every
# declared option to the callback as a keyword argument, so each invocation
# failed with ``TypeError: unexpected keyword argument``. They are removed
# because the visible call to ``add_warc_direct_answers_parser`` forwards
# neither value; re-add option, parameter, and forwarding together if the
# parser function supports them.
@warc_direct_answers.command("add")
@option("--provider-id", type=str)
@option("--url-pattern-regex", type=str)
@option("--parser-type",
        type=Choice(CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE), required=True)
@option("--xpath", type=str)
@option("--url-xpath", type=str)
@option("--text-xpath", type=str)
@pass_config
def warc_direct_answers_add(
        config: Config,
        provider_id: str | None,
        url_pattern_regex: str | None,
        parser_type: str,
        xpath: str | None,
        url_xpath: str | None,
        text_xpath: str | None,
) -> None:
    """Add a single WARC direct-answers parser."""
    # Parameters (all filled in by click from the options above):
    #   config:            injected by ``pass_config``; its ``es.client`` is
    #                      used to initialise ``WarcDirectAnswersParser``.
    #   provider_id:       forwarded unchanged to the parser function.
    #   url_pattern_regex: forwarded unchanged to the parser function.
    #   parser_type:       one of CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE.
    #   xpath:             required when ``parser_type`` is "xpath".
    #   url_xpath:         forwarded unchanged to the parser function.
    #   text_xpath:        forwarded unchanged to the parser function.
    # Raises:
    #   UsageError: parser type "xpath" was chosen but no ``--xpath`` given.
    #   ValueError: ``parser_type`` is not a known parser type.

    # Function-scope import, as before. The module path now matches the one
    # ``archive_query_log.imports.yaml`` uses for the same function; the
    # previous path pointed at the ``warc_snippets`` module.
    from archive_query_log.parsers.warc_direct_answers import \
        add_warc_direct_answers_parser

    # Narrow the free-form CLI string to the literal parser type.
    parser_type_strict: WarcDirectAnswersParserType
    if parser_type == "xpath":
        parser_type_strict = "xpath"
        if xpath is None:
            raise UsageError("No XPath given.")
    else:
        raise ValueError(f"Invalid parser type: {parser_type}")

    WarcDirectAnswersParser.init(using=config.es.client)
    add_warc_direct_answers_parser(
        config=config,
        provider_id=provider_id,
        url_pattern_regex=url_pattern_regex,
        parser_type=parser_type_strict,
        xpath=xpath,
        url_xpath=url_xpath,
        text_xpath=text_xpath,
    )


@warc_direct_answers.command("import")
@option(
    "-s", "--services-file", "services_path",
    type=PathType(
        path_type=Path,
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
        resolve_path=True,
        allow_dash=False,
    ),
    default=Path("data") / "selected-services.yaml",
)
@pass_config
def warc_direct_answers_import(config: Config, services_path: Path) -> None:
    # Bulk-import direct-answers parsers from the services YAML file given
    # by ``-s/--services-file``.
    from archive_query_log.imports.yaml import \
        import_warc_direct_answers_parsers as import_parsers

    # Initialise the parser document type with the configured Elasticsearch
    # client before delegating to the YAML importer.
    es_client = config.es.client
    WarcDirectAnswersParser.init(using=es_client)
    import_parsers(config, services_path)


@parsers.group()
def warc_main_content() -> None:
    # Click command group; the group callback itself does nothing.
    return None
Expand Down
72 changes: 72 additions & 0 deletions archive_query_log/imports/yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from archive_query_log.parsers.url_query import add_url_query_parser
from archive_query_log.parsers.warc_query import add_warc_query_parser
from archive_query_log.parsers.warc_snippets import add_warc_snippets_parser
from archive_query_log.parsers.warc_direct_answers import add_warc_direct_answers_parser
from archive_query_log.parsers.xml import xpaths_from_css_selector, \
text_xpath, merge_xpaths
from archive_query_log.providers import add_provider
Expand Down Expand Up @@ -479,3 +480,74 @@ def import_warc_snippets_parsers(config: Config, services_path: Path) -> None:
title_xpath=title_xpath,
text_xpath=snippet_xpath,
)


def import_warc_direct_answers_parsers(
        config: Config,
        services_path: Path,
) -> None:
    """Import WARC direct-answers parsers from a services YAML file.

    Every service entry that has both ``domains`` and ``results_parsers``
    is processed: the providers whose domains match are looked up, and for
    each such provider one XPath-based parser is added per results parser
    of type ``html_selector``. CSS selectors from the YAML entry are
    converted to XPaths before being passed on.

    Args:
        config: Configuration; its ``es.client`` is used for the provider
            search and it is forwarded to ``add_warc_direct_answers_parser``.
        services_path: Path to the YAML file with the service definitions.
    """
    echo("Load providers from services file.")
    with services_path.open("r") as file:
        # ``safe_load`` returns ``None`` for an empty document; fall back to
        # an empty list so the count below and the loop still work.
        services_list: Sequence[dict] = safe_load(file) or []
    echo(f"Found {len(services_list)} service definitions.")

    services: Iterable[dict] = services_list
    # noinspection PyTypeChecker
    services = tqdm(
        services,
        desc="Import parsers for providers",
        unit="provider",
    )
    for service in services:
        if "domains" not in service or "results_parsers" not in service:
            continue

        results_parsers = service["results_parsers"]

        providers = (
            Provider.search(using=config.es.client)
            .query(Terms(domains=service["domains"]))
            .scan()
        )
        providers = safe_iter_scan(providers)
        for provider in providers:
            # Iterate the parser definitions directly. The previous
            # ``enumerate(...)`` bound ``(index, dict)`` tuples to
            # ``results_parser``, so ``results_parser["type"]`` raised
            # ``TypeError`` on the first entry.
            for results_parser in results_parsers:
                if results_parser["type"] != "html_selector":
                    continue
                results_selector = results_parser["results_selector"]
                url_selector = results_parser.get("url_selector")
                direct_answer_selector = results_parser.get(
                    "direct_answer_selector")

                # XPath for the result container, prefixed with "//".
                results_xpaths = [
                    "//" + result_xpath
                    for result_xpath in xpaths_from_css_selector(
                        results_selector)
                ]
                results_xpath = merge_xpaths(results_xpaths)

                # Optional XPath reading the ``href`` attribute of the
                # element matched by the URL selector.
                if url_selector is not None:
                    url_xpaths = [
                        text_xpath(xpath, attribute="href")
                        for xpath in xpaths_from_css_selector(url_selector)
                    ]
                    url_xpath = merge_xpaths(url_xpaths)
                else:
                    url_xpath = None

                # Optional XPath reading the text of the element matched by
                # the direct-answer selector.
                # NOTE(review): a parser is still added when no
                # ``direct_answer_selector`` is configured (text XPath is
                # ``None``) -- confirm whether such entries should be
                # skipped instead.
                if direct_answer_selector is not None:
                    direct_answer_xpaths = [
                        text_xpath(xpath, text=True)
                        for xpath in xpaths_from_css_selector(
                            direct_answer_selector)
                    ]
                    direct_answer_xpath = merge_xpaths(direct_answer_xpaths)
                else:
                    direct_answer_xpath = None

                add_warc_direct_answers_parser(
                    config=config,
                    provider_id=provider.meta.id,
                    url_pattern_regex=results_parser.get("url_pattern"),
                    parser_type="xpath",
                    xpath=results_xpath,
                    url_xpath=url_xpath,
                    text_xpath=direct_answer_xpath,
                )

0 comments on commit 2da7caa

Please sign in to comment.