Skip to content

Commit

Permalink
Fix WARC parsing
Browse files: browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Nov 23, 2023
1 parent: 793581e; commit: d8670c8
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 6 deletions.
12 changes: 9 additions & 3 deletions archive_query_log/parsers/warc_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from click import echo
from elasticsearch_dsl import Search
from elasticsearch_dsl.function import RandomScore
from elasticsearch_dsl.query import FunctionScore, Term, RankFeature
from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists
from tqdm.auto import tqdm
from warc_s3 import WarcS3Store

Expand Down Expand Up @@ -115,7 +115,10 @@ def _parse_serp_warc_query_action(
serp: Serp,
) -> Iterator[dict]:
# Re-check if it can be parsed.
if serp.warc_location is None:
if (serp.warc_location is None or
serp.warc_location.file is None or
serp.warc_location.offset is None or
serp.warc_location.length is None):
return

# Re-check if parsing is necessary.
Expand Down Expand Up @@ -155,7 +158,10 @@ def parse_serps_warc_query(config: Config) -> None:
Serp.index().refresh(using=config.es.client)
changed_serps_search: Search = (
Serp.search(using=config.es.client)
.filter(~Term(warc_query_parser__should_parse=False))
.filter(
Exists(field="warc_location") &
~Term(warc_query_parser__should_parse=False)
)
.query(
RankFeature(field="archive.priority", saturation={}) |
RankFeature(field="provider.priority", saturation={}) |
Expand Down
12 changes: 9 additions & 3 deletions archive_query_log/parsers/warc_snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from click import echo
from elasticsearch_dsl import Search
from elasticsearch_dsl.function import RandomScore
from elasticsearch_dsl.query import FunctionScore, Term, RankFeature
from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists
# noinspection PyProtectedMember
# pylint: disable=no-name-in-module
from lxml.etree import _Element
Expand Down Expand Up @@ -165,7 +165,10 @@ def _parse_serp_warc_snippets_action(
serp: Serp,
) -> Iterator[dict]:
# Re-check if it can be parsed.
if serp.warc_location is None:
if (serp.warc_location is None or
serp.warc_location.file is None or
serp.warc_location.offset is None or
serp.warc_location.length is None):
return

# Re-check if parsing is necessary.
Expand Down Expand Up @@ -233,7 +236,10 @@ def parse_serps_warc_snippets(config: Config) -> None:
Serp.index().refresh(using=config.es.client)
changed_serps_search: Search = (
Serp.search(using=config.es.client)
.filter(~Term(warc_snippets_parser__should_parse=False))
.filter(
Exists(field="warc_location") &
~Term(warc_snippets_parser__should_parse=False)
)
.query(
RankFeature(field="archive.priority", saturation={}) |
RankFeature(field="provider.priority", saturation={}) |
Expand Down

0 comments on commit d8670c8

Please sign in to comment.