From 69301360dd35eb0a7ba0096e8ef8b0d3bc26ca62 Mon Sep 17 00:00:00 2001 From: Kevin Ramirez Date: Tue, 17 Sep 2024 13:26:51 -0600 Subject: [PATCH] fix(pacer_free_documents): remove @throttle_task in get pdfs process; wait longer when cycling the same court over and over again --- .../commands/scrape_pacer_free_opinions.py | 22 ++++++++++++++++--- cl/corpus_importer/tasks.py | 2 -- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py index ddb5cbb8c0..da1464e458 100644 --- a/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py +++ b/cl/corpus_importer/management/commands/scrape_pacer_free_opinions.py @@ -316,16 +316,32 @@ def get_pdfs( throttle = CeleryThrottle(queue_name=q) completed = 0 cycle_checker = CycleChecker() + current_court = None + prev_court = None for row in rows.iterator(): # Wait until the queue is short enough throttle.maybe_wait() + # Keep track of current and previous processed court + prev_court = current_court + current_court = row.court_id + if cycle_checker.check_if_cycled(row.court_id): - print( - f"Court cycle completed. Sleep 1 second before starting the next cycle." + if prev_court != current_court: + # We are cycling different courts, wait 1s before start next cycle + sleep = 1 + else: + # We are cycling the same court over and over again, waiting longer + # before queuing up more items from the same court + sleep = 3 + + logger.info( + f"Court cycle completed for: {row.court_id}. Current iteration: {cycle_checker.current_iteration}. Sleep {sleep} second(s) " + f"before starting the next cycle." 
) - time.sleep(1) + time.sleep(sleep) + logger.info(f"Processing row id: {row.id} from {row.court_id}") c = chain( process_free_opinion_result.si( row.pk, diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index e00092b791..0102a069cf 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -444,7 +444,6 @@ def get_and_save_free_document_report( @app.task(bind=True, max_retries=5, ignore_result=True) -@throttle_task("1/4s", key="court_id") def process_free_opinion_result( self, row_pk: int, @@ -595,7 +594,6 @@ def process_free_opinion_result( interval_step=5, ignore_result=True, ) -@throttle_task("1/6s", key="court_id") def get_and_process_free_pdf( self: Task, data: TaskData,