Skip to content

Commit

Permalink
Merge pull request #3 from arXiv/add_resumption_token
Browse files Browse the repository at this point in the history
Add resumption token
  • Loading branch information
kyokukou authored Nov 18, 2024
2 parents 7da91c0 + 7202752 commit 92efc95
Show file tree
Hide file tree
Showing 6 changed files with 318 additions and 65 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ jobs:
poetry install
- name: Test with pytest
run: |
poetry run pytest --cov=oaipmh --cov-fail-under=40 tests
poetry run pytest --cov=oaipmh --cov-fail-under=80 tests
2 changes: 2 additions & 0 deletions oaipmh/data/oai_properties.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

class OAIParams:
VERB = "verb"
ID = "identifier"
Expand All @@ -20,3 +21,4 @@ def __init__(self, prefix: str, schema: str, namespace: str):
self.prefix = prefix
self.schema = schema
self.namespace = namespace

34 changes: 34 additions & 0 deletions oaipmh/processors/resume.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from typing import Dict, Tuple
import json
import base64

from oaipmh.data.oai_errors import OAIBadResumptionToken
from oaipmh.data.oai_properties import OAIParams

class ResToken:
    """Resumption token carrying the request parameters and a start position.

    The token string is the base64 encoding of a UTF-8 JSON object with
    exactly two keys: ``"params"`` (the request parameters) and
    ``"start_val"`` (an integer position).
    """

    def __init__(self, params: Dict[str, str], start_val: int):
        """Store the token data and eagerly compute its encoded form.

        Args:
            params: Request parameters to embed in the token. They must be
                JSON-serializable; string keys and values are expected.
            start_val: Integer position to embed in the token.
        """
        self.params = params
        self.start_val = start_val
        self.token_str = self.to_token()

    def to_token(self) -> str:
        """Return the base64-encoded JSON representation of this token."""
        data = {
            "params": self.params,
            "start_val": self.start_val,
        }
        json_str = json.dumps(data)
        return base64.b64encode(json_str.encode("utf-8")).decode("utf-8")

    @classmethod
    def from_token(cls, encoded_str: str) -> Tuple[Dict[str, str], int]:
        """Decode and validate a token string produced by ``to_token``.

        Args:
            encoded_str: The base64 token string received from the client.

        Returns:
            A ``(params, start_val)`` tuple.

        Raises:
            OAIBadResumptionToken: If the string is not valid base64/UTF-8/JSON,
                does not have exactly the expected keys, or holds values of
                the wrong type.
        """
        # Only decoding/parsing lives in the try block so that the specific
        # validation errors raised below are not swallowed and replaced by
        # the generic decoding message.
        try:
            # validate=True rejects characters outside the base64 alphabet
            # instead of silently discarding them.
            raw = base64.b64decode(encoded_str, validate=True)
            data = json.loads(raw.decode("utf-8"))
        except (TypeError, ValueError, RecursionError) as err:
            # binascii.Error, UnicodeDecodeError and json.JSONDecodeError are
            # all ValueError subclasses; RecursionError covers absurdly nested
            # JSON supplied by a client.
            raise OAIBadResumptionToken(
                "Token decoding failed or format is invalid."
            ) from err

        if not isinstance(data, dict) or set(data.keys()) != {"params", "start_val"}:
            raise OAIBadResumptionToken("Token structure is invalid.")

        params = data["params"]
        start_val = data["start_val"]
        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly.
        start_ok = isinstance(start_val, int) and not isinstance(start_val, bool)
        # JSON object keys are always strings; the values must be strings too
        # to honour the declared Dict[str, str] return type.
        params_ok = isinstance(params, dict) and all(
            isinstance(value, str) for value in params.values()
        )
        if not (params_ok and start_ok):
            raise OAIBadResumptionToken("Token contains invalid data types.")
        return params, start_val

127 changes: 72 additions & 55 deletions oaipmh/requests/data_queries.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from typing import Dict, Union
import re
from datetime import datetime, timezone
from datetime import datetime, timezone, timedelta

from arxiv.taxonomy.definitions import GROUPS, ARCHIVES_ACTIVE, CATEGORIES_ACTIVE
from arxiv.taxonomy.category import Group, Archive, Category

from oaipmh.data.oai_config import SUPPORTED_METADATA_FORMATS, EARLIEST_DATE
from oaipmh.data.oai_errors import OAIBadArgument, OAIBadFormat
from oaipmh.data.oai_errors import OAIBadArgument, OAIBadFormat, OAIBadResumptionToken
from oaipmh.data.oai_properties import OAIParams, OAIVerbs
from oaipmh.processors.resume import ResToken
from oaipmh.serializers.output_formats import Response
from oaipmh.requests.param_processing import process_identifier

Expand All @@ -30,72 +31,88 @@ def get_record(params: Dict[str, str]) -> Response:
if meta_type_str not in SUPPORTED_METADATA_FORMATS:
raise OAIBadFormat(reason="Did not recognize requested format", query_params=query_data)
meta_type=SUPPORTED_METADATA_FORMATS[meta_type_str]
query_data[OAIParams.META_PREFIX]=meta_type_str

#TODO parameters done, do rest of function

return "<a>b</a>", 200, {}

def list_data(params: Dict[str, str], just_ids: bool)-> Response:
"""runs both list queries. just_ids true for list identifiers, false for list records"""
query_data: Dict[OAIParams, str]={OAIParams.VERB:OAIVerbs.LIST_IDS}
query_data: Dict[OAIParams, str]={OAIParams.VERB:params[OAIParams.VERB]}

#parameter processing
given_params=set(params.keys())
if OAIParams.RES_TOKEN in given_params: #using resumption token
if OAIParams.RES_TOKEN in given_params: #get parameters from token
if given_params != {OAIParams.RES_TOKEN, OAIParams.VERB}: #resumption token is exclusive
raise OAIBadArgument(f"No other paramters allowed with {OAIParams.RES_TOKEN}")
token=params[OAIParams.RES_TOKEN]
#TODO token processing and validation

else: #using request parameters
#correct parameters present
if OAIParams.META_PREFIX not in given_params:
raise OAIBadArgument(f"{OAIParams.META_PREFIX} required.")
allowed_params={OAIParams.VERB,OAIParams.META_PREFIX, OAIParams.FROM, OAIParams.UNTIL, OAIParams.SET }
if given_params-allowed_params: #no extra keys allowed
raise OAIBadArgument(f"Unallowed parameter. Allowed parameters: {', '.join(str(param) for param in allowed_params)}")

#metadata
meta_type_str=params[OAIParams.META_PREFIX]
if meta_type_str not in SUPPORTED_METADATA_FORMATS:
raise OAIBadFormat(reason="Did not recognize requested format", query_params=query_data)
meta_type=SUPPORTED_METADATA_FORMATS[meta_type_str]
query_data[OAIParams.META_PREFIX]=meta_type_str

#dates
from_str=params.get(OAIParams.FROM)
if from_str:
try:
if not re.fullmatch(DATE_REGEX, from_str):
raise ValueError
start_date=datetime.strptime(from_str, "%Y-%m-%d")
start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc)
query_data[OAIParams.FROM]=from_str
except Exception:
raise OAIBadArgument("from date format must be YYYY-MM-DD")
else:
start_date=EARLIEST_DATE

until_str=params.get(OAIParams.UNTIL)
if until_str:
try:
if not re.fullmatch(DATE_REGEX, until_str):
raise ValueError
end_date=datetime.strptime(until_str, "%Y-%m-%d")
end_date = end_date.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc)
query_data[OAIParams.UNTIL]=until_str
except Exception:
raise OAIBadArgument("until date format must be YYYY-MM-DD")
else:
end_date=datetime.now(timezone.utc)

#sets
set_str=params.get(OAIParams.SET)
if set_str:
rq_set= _parse_set(set_str)
query_data[OAIParams.SET]=set_str

#TODO check that combined parameters are valid (dates are okay, sets are active and not test) combined with token data
token_params, start_val=ResToken.from_token(token)
query_data[OAIParams.RES_TOKEN]=token
if params[OAIParams.VERB] != token_params[OAIParams.VERB]:
raise OAIBadResumptionToken("token from different verb", query_data)
params=token_params #set request parameters from token
given_params=set(params.keys())

#process request parameters
#correct parameters present
if OAIParams.META_PREFIX not in given_params:
raise OAIBadArgument(f"{OAIParams.META_PREFIX} required.")
allowed_params={OAIParams.VERB,OAIParams.META_PREFIX, OAIParams.FROM, OAIParams.UNTIL, OAIParams.SET }
if given_params-allowed_params: #no extra keys allowed
raise OAIBadArgument(f"Unallowed parameter. Allowed parameters: {', '.join(str(param) for param in allowed_params)}")

#metadata
meta_type_str=params[OAIParams.META_PREFIX]
if meta_type_str not in SUPPORTED_METADATA_FORMATS:
raise OAIBadFormat(reason="Did not recognize requested format", query_params=query_data)
meta_type=SUPPORTED_METADATA_FORMATS[meta_type_str]
query_data[OAIParams.META_PREFIX]=meta_type_str

#dates
from_str=params.get(OAIParams.FROM)
if from_str:
try:
if not re.fullmatch(DATE_REGEX, from_str):
raise ValueError
start_date=datetime.strptime(from_str, "%Y-%m-%d")
start_date = start_date.replace(hour=0, minute=0, second=0, tzinfo=timezone.utc)
query_data[OAIParams.FROM]=from_str
except Exception:
raise OAIBadArgument("from date format must be YYYY-MM-DD")
else:
start_date=EARLIEST_DATE

until_str=params.get(OAIParams.UNTIL)
if until_str:
try:
if not re.fullmatch(DATE_REGEX, until_str):
raise ValueError
end_date=datetime.strptime(until_str, "%Y-%m-%d")
end_date = end_date.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc)
query_data[OAIParams.UNTIL]=until_str
except Exception:
raise OAIBadArgument("until date format must be YYYY-MM-DD")
else:
end_date=datetime.now(timezone.utc).replace(hour=23, minute=59, second=59)

#sets
set_str=params.get(OAIParams.SET)
if set_str:
rq_set= _parse_set(set_str)
query_data[OAIParams.SET]=set_str
if not rq_set.is_active or 'test' in rq_set.id:
raise OAIBadArgument("Invalid set request")
else:
rq_set=None

#dates are valid
if start_date>end_date:
raise OAIBadArgument("until date must be greater than or equal to from date")
if start_date < EARLIEST_DATE:
raise OAIBadArgument("start date too early")
if end_date> datetime.now(timezone.utc) + timedelta(days=1):
raise OAIBadArgument("until date too late")

#TODO rest of function

Expand Down
Loading

0 comments on commit 92efc95

Please sign in to comment.