Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions compass/extraction/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,16 @@ async def check_for_ordinance_info(

chunks = model_config.text_splitter.split_text(doc.text)
chunk_parser = ParseChunksWithMemory(chunks, num_to_recall=2)
legal_text_validator = LegalTextValidator(
tech=tech,
llm_service=model_config.llm_service,
usage_tracker=usage_tracker,
doc_is_from_ocr=doc.attrs.get("from_ocr", False),
**model_config.llm_call_kwargs,
legal_text_validator = (
None
if doc.attrs.get("is_legal_doc", False)
else LegalTextValidator(
tech=tech,
llm_service=model_config.llm_service,
usage_tracker=usage_tracker,
doc_is_from_ocr=doc.attrs.get("from_ocr", False),
**model_config.llm_call_kwargs,
)
)

ordinance_text_collector = ordinance_text_collector_class(
Expand Down Expand Up @@ -142,6 +146,15 @@ async def extract_date(doc, model_config, usage_tracker=None):
the attrs will contain a ``"date"`` key that will contain the
parsed date information.
"""
if "date" in doc.attrs:
logger.debug(
"Not extracting date for doc from %s. "
"Found existing date in doc attrs: %r",
doc.attrs.get("source"),
doc.attrs["date"],
)
return doc

date_llm_caller = StructuredLLMCaller(
llm_service=model_config.llm_service,
usage_tracker=usage_tracker,
Expand Down
71 changes: 64 additions & 7 deletions compass/scripts/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
)
from compass.web.website_crawl import COMPASSCrawler, COMPASSLinkScorer
from compass.utilities.enums import LLMTasks
from compass.utilities.io import load_local_docs
from compass.pb import COMPASS_PB


Expand All @@ -50,7 +51,7 @@ async def download_known_urls(
are applied. By default, ``None``.
file_loader_kwargs : dict, optional
Dictionary of keyword arguments pairs to initialize
:class:`elm.web.file_loader.AsyncFileLoader`.
:class:`elm.web.file_loader.AsyncWebFileLoader`.
By default, ``None``.

Returns
Expand Down Expand Up @@ -93,6 +94,61 @@ async def download_known_urls(
return out_docs


async def load_known_docs(jurisdiction, fps, local_file_loader_kwargs=None):
    """Load documents from known local paths

    Parameters
    ----------
    jurisdiction : Jurisdiction
        Jurisdiction instance representing the jurisdiction
        corresponding to the documents.
    fps : iterable of path-like
        Collection of paths to load documents from. One-shot
        iterables (e.g. generators) are accepted; the paths are
        materialized into a list before loading.
    local_file_loader_kwargs : dict, optional
        Dictionary of keyword-argument pairs to initialize
        :class:`elm.web.file_loader.AsyncLocalFileLoader`. The input
        dictionary is not modified. The ``"file_cache_coroutine"``
        key is always set to ``TempFileCachePB.call``, overriding any
        user-supplied value. By default, ``None``.

    Returns
    -------
    out_docs : list
        List of :obj:`~elm.web.document.BaseDocument` instances
        containing documents from the paths, or an empty list if
        something went wrong during the retrieval process.

    Notes
    -----
    Requires :class:`~compass.services.threaded.TempFileCachePB`
    service to be running.
    """
    # Materialize once so the paths can be both counted (for the
    # progress bar) and handed to the loader, even for generators.
    fps = list(fps)

    COMPASS_PB.update_jurisdiction_task(
        jurisdiction.full_name, description="Loading known document(s)..."
    )

    # Build a fresh dict instead of calling ``.update`` on the input so
    # the caller's kwargs are never mutated.
    loader_kwargs = {
        **(local_file_loader_kwargs or {}),
        "file_cache_coroutine": TempFileCachePB.call,
    }

    async with COMPASS_PB.file_download_prog_bar(
        jurisdiction.full_name, len(fps)
    ):
        try:
            out_docs = await load_local_docs(fps, **loader_kwargs)
        except Exception as e:
            # ``KeyboardInterrupt`` (and task cancellation) derive from
            # ``BaseException``, so they are not caught here and still
            # propagate to the caller without an explicit re-raise.
            msg = (
                "Encountered error of type %r while loading known documents: "
                "%r"
            )
            err_type = type(e)
            logger.exception(msg, err_type, fps)
            out_docs = []

    return out_docs


async def find_jurisdiction_website(
jurisdiction,
model_configs,
Expand All @@ -116,7 +172,7 @@ async def find_jurisdiction_website(
for all tasks.
file_loader_kwargs : dict, optional
Dictionary of keyword arguments pairs to initialize
:class:`elm.web.file_loader.AsyncFileLoader`. If found, the
:class:`elm.web.file_loader.AsyncWebFileLoader`. If found, the
"pw_launch_kwargs" key in these will also be used to initialize
the :class:`elm.web.search.google.PlaywrightGoogleLinkSearch`
used for the Google URL search. By default, ``None``.
Expand Down Expand Up @@ -202,7 +258,7 @@ async def download_jurisdiction_ordinances_from_website(
the points are summed up.
file_loader_kwargs : dict, optional
Dictionary of keyword arguments pairs to initialize
:class:`elm.web.file_loader.AsyncFileLoader`. If found, the
:class:`elm.web.file_loader.AsyncWebFileLoader`. If found, the
"pw_launch_kwargs" key in these will also be used to initialize
the :class:`elm.web.search.google.PlaywrightGoogleLinkSearch`
used for the Google URL search. By default, ``None``.
Expand Down Expand Up @@ -327,7 +383,7 @@ async def download_jurisdiction_ordinances_from_website_compass_crawl(
the points are summed up.
file_loader_kwargs : dict, optional
Dictionary of keyword arguments pairs to initialize
:class:`elm.web.file_loader.AsyncFileLoader`. If found, the
:class:`elm.web.file_loader.AsyncWebFileLoader`. If found, the
"pw_launch_kwargs" key in these will also be used to initialize
the :class:`elm.web.search.google.PlaywrightGoogleLinkSearch`
used for the Google URL search. By default, ``None``.
Expand Down Expand Up @@ -426,9 +482,10 @@ async def download_jurisdiction_ordinance_using_search_engine(
ordinance document. By default, ``5``.
file_loader_kwargs : dict, optional
Dictionary of keyword-argument pairs to initialize
:class:`elm.web.file_loader.AsyncFileLoader` with. If found, the
"pw_launch_kwargs" key in these will also be used to initialize
the :class:`elm.web.search.google.PlaywrightGoogleLinkSearch`
:class:`elm.web.file_loader.AsyncWebFileLoader` with. If found,
the "pw_launch_kwargs" key in these will also be used to
initialize the
:class:`elm.web.search.google.PlaywrightGoogleLinkSearch`
used for the google URL search. By default, ``None``.
search_semaphore : :class:`asyncio.Semaphore`, optional
Semaphore instance that can be used to limit the number of
Expand Down
Loading