From a069890a0a707a3c8ce14b99692712f02c6aae0c Mon Sep 17 00:00:00 2001 From: Doc Ritezel Date: Tue, 26 Nov 2024 12:15:13 -0800 Subject: [PATCH] Provide a better message when GTFS-RT validator jar skips a file Signed-off-by: Erika Pacheco --- jobs/gtfs-rt-parser-v2/gtfs_rt_parser.py | 54 ++++++++++++------------ jobs/gtfs-rt-parser-v2/poetry.lock | 53 ++++++++++++++++++++++- jobs/gtfs-rt-parser-v2/pyproject.toml | 3 +- 3 files changed, 80 insertions(+), 30 deletions(-) diff --git a/jobs/gtfs-rt-parser-v2/gtfs_rt_parser.py b/jobs/gtfs-rt-parser-v2/gtfs_rt_parser.py index 62597461df..e4b792540e 100644 --- a/jobs/gtfs-rt-parser-v2/gtfs_rt_parser.py +++ b/jobs/gtfs-rt-parser-v2/gtfs_rt_parser.py @@ -61,6 +61,18 @@ class InvalidMetadata(Exception): pass +class NoScheduleDataSpecified(Exception): + pass + + +class ScheduleDataNotFound(Exception): + pass + + +class NoValidatorResults(Exception): + pass + + class RTProcessingStep(str, Enum): parse = "parse" validate = "validate" @@ -77,14 +89,6 @@ class RTValidationMetadata(BaseModel): gtfs_validator_version: str -class NoScheduleDataSpecified(Exception): - pass - - -class ScheduleDataNotFound(Exception): - pass - - class RTHourlyAggregation(PartitionedGCSArtifact): partition_names: ClassVar[List[str]] = ["dt", "hour", "base64_url"] step: RTProcessingStep @@ -277,7 +281,7 @@ def download(self, date: datetime.datetime) -> Optional[str]: .get_url_schedule(self.base64_validation_url) ) except KeyError: - print( + typer.secho( f"no schedule data found for {self.base64_validation_url} on day {day}" ) continue @@ -287,7 +291,7 @@ def download(self, date: datetime.datetime) -> Optional[str]: self.fs.get(schedule_extract.path, gtfs_zip) return gtfs_zip except FileNotFoundError: - print( + typer.secho( f"no schedule file found for {self.base64_validation_url} on day {day}" ) continue @@ -346,17 +350,17 @@ def get_local_paths(self) -> Dict[str, GTFSRTFeedExtract]: def get_results_paths(self) -> Dict[str, GTFSRTFeedExtract]: return {e.get_results_path(): e.extract for e in self.get_extracts()} - def get_hashed_results(self): + def get_hashed_results(self) -> Dict[str, Any]: hashed = {} for e in self.get_extracts(): if e.has_results(): - hashed[e.hash()] = e.get_results() + hashed[e.hash().hex()] = e.get_results() return hashed - def get_hashes(self) -> Dict[bytes, List[GTFSRTFeedExtract]]: - hashed: Dict[bytes, List[GTFSRTFeedExtract]] = defaultdict(list) + def get_hashes(self) -> Dict[str, List[GTFSRTFeedExtract]]: + hashed: Dict[str, List[GTFSRTFeedExtract]] = defaultdict(list) for e in self.get_extracts(): - hashed[e.hash()].append(e.extract) + hashed[e.hash().hex()].append(e.extract) return hashed def download(self): @@ -507,7 +511,7 @@ def process(self, tmp_dir: str, scope) -> List[RTFileProcessingOutcome]: e = ScheduleDataNotFound( f"no recent schedule data found for {self.aggregation.extracts[0].path}" ) - print(e) + typer.secho(e) scope.fingerprint = [ type(e), @@ -571,11 +575,11 @@ def process(self, tmp_dir: str, scope) -> List[RTFileProcessingOutcome]: for hash, extracts in aggregation_extracts.get_hashes().items(): try: records = hashed_results[hash] - except KeyError as e: + except KeyError: if self.verbose: paths = ", ".join(e.path for e in extracts) typer.secho( - f"WARNING: no results found for {paths}", + f"WARNING: validator did not produce results for {paths}", fg=typer.colors.YELLOW, ) @@ -584,7 +588,7 @@ def process(self, tmp_dir: str, scope) -> List[RTFileProcessingOutcome]: RTFileProcessingOutcome( step=self.aggregation.step, success=False, - exception=e, + exception=NoValidatorResults("No validator output"), extract=extract, ) ) @@ -680,7 +684,7 @@ def process(self, tmp_dir: str, scope) -> List[RTFileProcessingOutcome]: except DecodeError as e: if self.verbose: typer.secho( - f"WARNING: DecodeError for {str(extract.path)}", + f'DecodeError: "{str(e)}" thrown when decoding {str(extract.path)}', fg=typer.colors.YELLOW, ) outcomes.append( @@ -918,13 +922,9 @@ def main( # TODO: I dislike having to exclude the records here # I need to figure out the best way to have a single type represent the "metadata" of # the content as well as the content itself - result.save_content( - fs=get_fs(), - content="\n".join( - (json.dumps(make_pydantic_model_bq_safe(o)) for o in result.outcomes) - ).encode(), - exclude={"outcomes"}, - ) + raw = [json.dumps(make_pydantic_model_bq_safe(o)) for o in result.outcomes] + content = "\n".join(raw).encode("utf-8") + result.save_content(fs=get_fs(), content=content, exclude={"outcomes"}) assert ( len(outcomes) diff --git a/jobs/gtfs-rt-parser-v2/poetry.lock b/jobs/gtfs-rt-parser-v2/poetry.lock index 2d2c695f4d..fb12c1b018 100644 --- a/jobs/gtfs-rt-parser-v2/poetry.lock +++ b/jobs/gtfs-rt-parser-v2/poetry.lock @@ -507,6 +507,22 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "flake8" +version = "7.1.1" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.8.1" +files = [ + {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"}, + {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.12.0,<2.13.0" +pyflakes = ">=3.2.0,<3.3.0" + [[package]] name = "fonttools" version = "4.54.1" @@ -1357,6 +1373,17 @@ pillow = ">=6.2.0" pyparsing = ">=2.3.1" python-dateutil = ">=2.7" +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + [[package]] name = "memory-profiler" version = "0.60.0" @@ -2007,6 +2034,17 @@ files = [ [package.dependencies] pyasn1 = ">=0.4.6,<0.7.0" +[[package]] +name = "pycodestyle" +version = "2.12.1" +description = "Python style guide checker" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"}, + {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"}, +] + [[package]] name = "pydantic" version = "1.9.2" @@ -2058,6 +2096,17 @@ typing-extensions = ">=3.7.4.3" dotenv = ["python-dotenv (>=0.10.4)"] email = ["email-validator (>=1.0.3)"] +[[package]] +name = "pyflakes" +version = "3.2.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a"}, + {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"}, +] + [[package]] name = "pyparsing" version = "3.1.4" @@ -2583,5 +2632,5 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" -python-versions = ">=3.8,<3.10" -content-hash = "2ff33638394c0014c2e5df03ad30b6e2e57ec5f048c5087b75a2546e0e0bd9fa" +python-versions = ">=3.8.1,<3.10" +content-hash = "51e6481ee50e162cc336f8581791ba5a2864a56bf00d722091837931f1a75f0f" diff --git a/jobs/gtfs-rt-parser-v2/pyproject.toml b/jobs/gtfs-rt-parser-v2/pyproject.toml index aca94cbc89..7c94bdd1f5 100644 --- a/jobs/gtfs-rt-parser-v2/pyproject.toml +++ b/jobs/gtfs-rt-parser-v2/pyproject.toml @@ -5,7 +5,7 @@ description = "" authors = ["Andrew Vaccaro "] [tool.poetry.dependencies] -python = ">=3.8,<3.10" +python = ">=3.8.1,<3.10" gtfs-realtime-bindings = "0.0.7" google-auth = "1.32.1" pathy = {extras = ["gcs"], version = "^0.6.1"} @@ -26,6 +26,7 @@ types-protobuf = "^5.28.0.20240924" types-tqdm = "^4.66.0.20240417" isort = "^5.13.2" pytest-env = "^1.1.5" +flake8 = "^7.1.1" [build-system] requires = ["poetry-core>=1.0.0"]