diff --git a/compass/_cli/finalize.py b/compass/_cli/finalize.py index 735034d25..e703a643c 100644 --- a/compass/_cli/finalize.py +++ b/compass/_cli/finalize.py @@ -1,6 +1,5 @@ """COMPASS CLI finalize subcommand""" -import json from datetime import datetime import click @@ -8,8 +7,8 @@ from rich.console import Console from compass.utilities import Directories +from compass.utilities.io import load_config from compass.utilities.jurisdictions import Jurisdiction -from compass.utilities.parsing import load_config from compass.utilities.finalize import save_run_meta, doc_infos_to_db, save_db from compass.scripts.process import _initialize_model_params @@ -62,8 +61,7 @@ def finalize(config): start_datetime = datetime.fromtimestamp(dirs.out.stat().st_ctime) end_datetime = datetime.fromtimestamp(jurisdictions_fp.stat().st_mtime) - with jurisdictions_fp.open("r", encoding="utf-8") as fh: - jurisdictions = json.load(fh) + jurisdictions = load_config(jurisdictions_fp) console.print("Compiling databases...") jurisdictions = jurisdictions.get("jurisdictions", []) diff --git a/compass/_cli/process.py b/compass/_cli/process.py index db708f5c0..31712d5ef 100644 --- a/compass/_cli/process.py +++ b/compass/_cli/process.py @@ -12,9 +12,10 @@ from rich.console import Console from compass.pb import COMPASS_PB +from compass.plugin import create_schema_based_one_shot_extraction_plugin from compass.scripts.process import process_jurisdictions_with_openai from compass.utilities.logs import AddLocationFilter -from compass.utilities.parsing import load_config +from compass.utilities.io import load_config @click.command @@ -41,10 +42,22 @@ is_flag=True, help="Flag to hide progress bars during processing.", ) -def process(config, verbose, no_progress): +@click.option( + "--plugin", + "-p", + required=False, + default=None, + help="One-shot plugin configuration to add to COMPASS before processing", +) +def process(config, verbose, no_progress, plugin): """Download and extract ordinances for a list of jurisdictions""" config = load_config(config) + if plugin is not None: + create_schema_based_one_shot_extraction_plugin( + config=plugin, tech=config["tech"] + ) + custom_theme = Theme({"logging.level.trace": "rgb(94,79,162)"}) console = Console(theme=custom_theme) diff --git a/compass/exceptions.py b/compass/exceptions.py index 0a833dca5..0bb269ebf 100644 --- a/compass/exceptions.py +++ b/compass/exceptions.py @@ -17,6 +17,10 @@ def __init__(self, *args, **kwargs): ) +class COMPASSFileNotFoundError(COMPASSError, FileNotFoundError): + """COMPASS FileNotFoundError""" + + class COMPASSNotInitializedError(COMPASSError): """COMPASS not initialized error""" diff --git a/compass/extraction/apply.py b/compass/extraction/apply.py index 024861bd8..c569d144c 100644 --- a/compass/extraction/apply.py +++ b/compass/extraction/apply.py @@ -3,7 +3,7 @@ import logging from warnings import warn -from compass.llm import StructuredLLMCaller +from compass.llm import JSONFromTextLLMCaller from compass.extraction.date import DateExtractor from compass.validation import ( ParseChunksWithMemory, @@ -170,7 +170,7 @@ async def extract_date(doc, model_config, usage_tracker=None): ) return doc - date_llm_caller = StructuredLLMCaller( + date_llm_caller = JSONFromTextLLMCaller( llm_service=model_config.llm_service, usage_tracker=usage_tracker, **model_config.llm_call_kwargs, diff --git a/compass/extraction/date.py b/compass/extraction/date.py index bb505ad12..ab766bf26 100644 --- a/compass/extraction/date.py +++ b/compass/extraction/date.py @@ -37,12 
+37,12 @@ class DateExtractor: ) """System message for date extraction LLM calls""" - def __init__(self, structured_llm_caller, text_splitter=None): + def __init__(self, json_llm_caller, text_splitter=None): """ Parameters ---------- - structured_llm_caller : StructuredLLMCaller + json_llm_caller : JSONFromTextLLMCaller Instance used for structured validation queries. text_splitter : LCTextSplitter, optional Optional text splitter (or subclass instance, or any object @@ -50,7 +50,7 @@ def __init__(self, structured_llm_caller, text_splitter=None): (used for splitting out pages in an HTML document). By default, ``None``. """ - self.slc = structured_llm_caller + self.jlc = json_llm_caller self.text_splitter = text_splitter async def parse(self, doc): @@ -84,7 +84,7 @@ async def _parse(self, doc): ) if can_check_url_for_date: logger.debug("Checking URL for date: %s", url) - response = await self.slc.call( + response = await self.jlc.call( sys_msg=self.SYSTEM_MESSAGE, content=( "Please extract the date from the URL for this " @@ -105,7 +105,7 @@ async def _parse(self, doc): if not text: continue - response = await self.slc.call( + response = await self.jlc.call( sys_msg=self.SYSTEM_MESSAGE, content=f"Please extract the date for this ordinance:\n{text}", usage_sub_label=LLMUsageCategory.DATE_EXTRACTION, diff --git a/compass/extraction/small_wind/ordinance.py b/compass/extraction/small_wind/ordinance.py index 60b857bf5..5c7090632 100644 --- a/compass/extraction/small_wind/ordinance.py +++ b/compass/extraction/small_wind/ordinance.py @@ -369,12 +369,12 @@ class SmallWindOrdinanceTextExtractor(PromptBasedTextExtractor): PROMPTS = [ { "key": "wind_energy_systems_text", - "out_fn": "{jurisdiction} Wind Ordinance Text.txt", + "out_fn": "{jurisdiction} Wind Ordinance.txt", "prompt": _WECS_TEXT_EXTRACTION_PROMPT, }, { "key": "cleaned_text_for_extraction", - "out_fn": "{jurisdiction} Cleaned Text.txt", + "out_fn": "{jurisdiction} Small Wind Ordinance.txt", "prompt": _SMALL_WECS_TEXT_EXTRACTION_PROMPT, }, ] @@ -398,12 +398,12 @@ class SmallWindPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): PROMPTS = [ { "key": "permitted_use_only_text", - "out_fn": "{jurisdiction} Permitted Use Only.txt", + "out_fn": "{jurisdiction} Permitted Use.txt", "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, }, { "key": "districts_text", - "out_fn": "{jurisdiction} Districts.txt", + "out_fn": "{jurisdiction} Permitted Use Districts.txt", "prompt": _WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, }, ] diff --git a/compass/extraction/small_wind/parse.py b/compass/extraction/small_wind/parse.py index 12ece59c4..5b9c90195 100644 --- a/compass/extraction/small_wind/parse.py +++ b/compass/extraction/small_wind/parse.py @@ -242,7 +242,7 @@ class StructuredSmallWindOrdinanceParser(StructuredSmallWindParser): a decision-tree-based chain-of-thought prompt on the text for each value to be extracted. Key Relationships: - Uses a StructuredLLMCaller for LLM queries and multiple + Uses a JSONFromTextLLMCaller for LLM queries and multiple AsyncDecisionTree instances to guide the extraction of individual values. """ @@ -558,7 +558,7 @@ class StructuredSmallWindPermittedUseDistrictsParser( a decision-tree-based chain-of-thought prompt on the text for each value to be extracted. Key Relationships: - Uses a StructuredLLMCaller for LLM queries and multiple + Uses a JSONFromTextLLMCaller for LLM queries and multiple AsyncDecisionTree instances to guide the extraction of individual values. 
""" diff --git a/compass/extraction/small_wind/plugin.py b/compass/extraction/small_wind/plugin.py index 2b1a10e4a..f361573d5 100644 --- a/compass/extraction/small_wind/plugin.py +++ b/compass/extraction/small_wind/plugin.py @@ -20,7 +20,7 @@ SmallWindPermittedUseDistrictsTextExtractor.OUT_LABEL ) -SMALL_WIND_QUESTION_TEMPLATES = [ +SMALL_WIND_QUERY_TEMPLATES = [ "filetype:pdf {jurisdiction} wind energy conversion system ordinances", "wind energy conversion system ordinances {jurisdiction}", "{jurisdiction} wind WECS ordinance", @@ -66,8 +66,8 @@ class COMPASSSmallWindExtractor(OrdinanceExtractionPlugin): IDENTIFIER = "small wind" """str: Identifier for extraction task """ - QUESTION_TEMPLATES = SMALL_WIND_QUESTION_TEMPLATES - """list: List of search engine question templates for extraction""" + QUERY_TEMPLATES = SMALL_WIND_QUERY_TEMPLATES + """list: List of search engine query templates for extraction""" WEBSITE_KEYWORDS = BEST_SMALL_WIND_ORDINANCE_WEBSITE_URL_KEYWORDS """list: List of keywords @@ -76,8 +76,8 @@ class COMPASSSmallWindExtractor(OrdinanceExtractionPlugin): a website scrape for a wind ordinance document. """ - heuristic = SmallWindHeuristic() - """BaseHeuristic: Object with a ``check()`` method""" + HEURISTIC = SmallWindHeuristic + """BaseHeuristic: Class with a ``check()`` method""" TEXT_COLLECTORS = [ SmallWindOrdinanceTextCollector, diff --git a/compass/extraction/solar/ordinance.py b/compass/extraction/solar/ordinance.py index 942869df0..1b85d400c 100644 --- a/compass/extraction/solar/ordinance.py +++ b/compass/extraction/solar/ordinance.py @@ -285,7 +285,7 @@ class SolarOrdinanceTextExtractor(PromptBasedTextExtractor): PROMPTS = [ { "key": "cleaned_text_for_extraction", - "out_fn": "{jurisdiction} Cleaned Text.txt", + "out_fn": "{jurisdiction} Utility Scale Solar Ordinance.txt", "prompt": _SEF_TEXT_EXTRACTION_PROMPT, }, ] @@ -309,12 +309,12 @@ class SolarPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): PROMPTS = [ { "key": "permitted_use_only_text", - "out_fn": "{jurisdiction} Permitted Use Only.txt", + "out_fn": "{jurisdiction} Permitted Use.txt", "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, }, { "key": "districts_text", - "out_fn": "{jurisdiction} Districts.txt", + "out_fn": "{jurisdiction} Permitted Use Districts.txt", "prompt": _SEF_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, }, ] diff --git a/compass/extraction/solar/parse.py b/compass/extraction/solar/parse.py index 89088a331..290aebc1a 100644 --- a/compass/extraction/solar/parse.py +++ b/compass/extraction/solar/parse.py @@ -189,7 +189,7 @@ class StructuredSolarOrdinanceParser(StructuredSolarParser): a decision-tree-based chain-of-thought prompt on the text for each value to be extracted. Key Relationships: - Uses a StructuredLLMCaller for LLM queries and multiple + Uses a JSONFromTextLLMCaller for LLM queries and multiple AsyncDecisionTree instances to guide the extraction of individual values. """ @@ -494,7 +494,7 @@ class StructuredSolarPermittedUseDistrictsParser(StructuredSolarParser): a decision-tree-based chain-of-thought prompt on the text for each value to be extracted. Key Relationships: - Uses a StructuredLLMCaller for LLM queries and multiple + Uses a JSONFromTextLLMCaller for LLM queries and multiple AsyncDecisionTree instances to guide the extraction of individual values. 
""" diff --git a/compass/extraction/solar/plugin.py b/compass/extraction/solar/plugin.py index 3e2153c1b..1d4527230 100644 --- a/compass/extraction/solar/plugin.py +++ b/compass/extraction/solar/plugin.py @@ -18,7 +18,7 @@ SolarPermittedUseDistrictsTextExtractor.OUT_LABEL ) -SOLAR_QUESTION_TEMPLATES = [ +SOLAR_QUERY_TEMPLATES = [ "filetype:pdf {jurisdiction} solar energy conversion system ordinances", "solar energy conversion system ordinances {jurisdiction}", "{jurisdiction} solar energy farm ordinance", @@ -67,8 +67,8 @@ class COMPASSSolarExtractor(OrdinanceExtractionPlugin): IDENTIFIER = "solar" """str: Identifier for extraction task """ - QUESTION_TEMPLATES = SOLAR_QUESTION_TEMPLATES - """list: List of search engine question templates for extraction""" + QUERY_TEMPLATES = SOLAR_QUERY_TEMPLATES + """list: List of search engine query templates for extraction""" WEBSITE_KEYWORDS = BEST_SOLAR_ORDINANCE_WEBSITE_URL_KEYWORDS """list: List of keywords @@ -77,8 +77,8 @@ class COMPASSSolarExtractor(OrdinanceExtractionPlugin): a website scrape for a wind ordinance document. """ - heuristic = SolarHeuristic() - """BaseHeuristic: Object with a ``check()`` method""" + HEURISTIC = SolarHeuristic + """BaseHeuristic: Class with a ``check()`` method""" TEXT_COLLECTORS = [ SolarOrdinanceTextCollector, diff --git a/compass/extraction/water/plugin.py b/compass/extraction/water/plugin.py index 0729293d2..c1826e6d2 100644 --- a/compass/extraction/water/plugin.py +++ b/compass/extraction/water/plugin.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) -WATER_RIGHTS_QUESTION_TEMPLATES = [ +WATER_RIGHTS_QUERY_TEMPLATES = [ "{jurisdiction} rules", "{jurisdiction} management plan", "{jurisdiction} well permits", @@ -66,19 +66,6 @@ class TexasWaterRightsExtractor(BaseExtractionPlugin): IDENTIFIER = "tx water rights" """str: Identifier for extraction task """ - QUESTION_TEMPLATES = WATER_RIGHTS_QUESTION_TEMPLATES - """list: List of search engine question templates for extraction""" - - WEBSITE_KEYWORDS = BEST_WATER_RIGHTS_ORDINANCE_WEBSITE_URL_KEYWORDS - """list: List of keywords - - Keywords indicate links which should be prioritized when performing - a website scrape for a wind ordinance document. - """ - - heuristic = WaterRightsHeuristic() - """BaseHeuristic: Object with a ``check()`` method""" - JURISDICTION_DATA_FP = ( importlib.resources.files("compass") / "data" @@ -86,6 +73,33 @@ class TexasWaterRightsExtractor(BaseExtractionPlugin): ) """:term:`path-like `: Path to Texas GCW names""" + async def get_query_templates(self): # noqa: PLR6301 + """Get a list of search engine query templates for extraction + + Query templates can contain the placeholder ``{jurisdiction}`` + which will be replaced with the full jurisdiction name during + the search engine query. + """ + return WATER_RIGHTS_QUERY_TEMPLATES + + async def get_website_keywords(self): # noqa: PLR6301 + """Get a dict of website search keyword scores + + Dictionary mapping keywords to scores that indicate links which + should be prioritized when performing a website scrape for a + document. + """ + return BEST_WATER_RIGHTS_ORDINANCE_WEBSITE_URL_KEYWORDS + + async def get_heuristic(self): # noqa: PLR6301 + """Get a `BaseHeuristic` instance with a `check()` method + + The ``check()`` method should accept a string of text and return + ``True`` if the text passes the heuristic check and ``False`` + otherwise. 
+ """ + return WaterRightsHeuristic() + async def filter_docs( self, extraction_context, diff --git a/compass/extraction/wind/ordinance.py b/compass/extraction/wind/ordinance.py index 95c9e7fdc..e4a464406 100644 --- a/compass/extraction/wind/ordinance.py +++ b/compass/extraction/wind/ordinance.py @@ -343,12 +343,12 @@ class WindOrdinanceTextExtractor(PromptBasedTextExtractor): PROMPTS = [ { "key": "wind_energy_systems_text", - "out_fn": "{jurisdiction} Wind Ordinance Text.txt", + "out_fn": "{jurisdiction} Wind Ordinance.txt", "prompt": _WECS_TEXT_EXTRACTION_PROMPT, }, { "key": "cleaned_text_for_extraction", - "out_fn": "{jurisdiction} Cleaned Text.txt", + "out_fn": "{jurisdiction} Utility Scale Wind Ordinance.txt", "prompt": _LARGE_WECS_TEXT_EXTRACTION_PROMPT, }, ] @@ -372,12 +372,12 @@ class WindPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): PROMPTS = [ { "key": "permitted_use_only_text", - "out_fn": "{jurisdiction} Permitted Use Only.txt", + "out_fn": "{jurisdiction} Permitted Use.txt", "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, }, { "key": "districts_text", - "out_fn": "{jurisdiction} Districts.txt", + "out_fn": "{jurisdiction} Permitted Use Districts.txt", "prompt": _WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, }, ] diff --git a/compass/extraction/wind/parse.py b/compass/extraction/wind/parse.py index 95c6f20a3..bb54b11cf 100644 --- a/compass/extraction/wind/parse.py +++ b/compass/extraction/wind/parse.py @@ -179,7 +179,7 @@ class StructuredWindOrdinanceParser(StructuredWindParser): a decision-tree-based chain-of-thought prompt on the text for each value to be extracted. Key Relationships: - Uses a StructuredLLMCaller for LLM queries and multiple + Uses a JSONFromTextLLMCaller for LLM queries and multiple AsyncDecisionTree instances to guide the extraction of individual values. """ @@ -497,7 +497,7 @@ class StructuredWindPermittedUseDistrictsParser(StructuredWindParser): a decision-tree-based chain-of-thought prompt on the text for each value to be extracted. Key Relationships: - Uses a StructuredLLMCaller for LLM queries and multiple + Uses a JSONFromTextLLMCaller for LLM queries and multiple AsyncDecisionTree instances to guide the extraction of individual values. """ diff --git a/compass/extraction/wind/plugin.py b/compass/extraction/wind/plugin.py index c8758213f..cbeddb07d 100644 --- a/compass/extraction/wind/plugin.py +++ b/compass/extraction/wind/plugin.py @@ -18,7 +18,7 @@ WindPermittedUseDistrictsTextExtractor.OUT_LABEL ) -WIND_QUESTION_TEMPLATES = [ +WIND_QUERY_TEMPLATES = [ "filetype:pdf {jurisdiction} wind energy conversion system ordinances", "wind energy conversion system ordinances {jurisdiction}", "{jurisdiction} wind WECS ordinance", @@ -65,8 +65,8 @@ class COMPASSWindExtractor(OrdinanceExtractionPlugin): IDENTIFIER = "wind" """str: Identifier for extraction task """ - QUESTION_TEMPLATES = WIND_QUESTION_TEMPLATES - """list: List of search engine question templates for extraction""" + QUERY_TEMPLATES = WIND_QUERY_TEMPLATES + """list: List of search engine query templates for extraction""" WEBSITE_KEYWORDS = BEST_WIND_ORDINANCE_WEBSITE_URL_KEYWORDS """list: List of keywords @@ -75,8 +75,8 @@ class COMPASSWindExtractor(OrdinanceExtractionPlugin): a website scrape for a wind ordinance document. 
""" - heuristic = WindHeuristic() - """BaseHeuristic: Object with a ``check()`` method""" + HEURISTIC = WindHeuristic + """BaseHeuristic: Class with a ``check()`` method""" TEXT_COLLECTORS = [ WindOrdinanceTextCollector, diff --git a/compass/llm/__init__.py b/compass/llm/__init__.py index 58aacfbe1..e3dbba9dc 100644 --- a/compass/llm/__init__.py +++ b/compass/llm/__init__.py @@ -1,4 +1,4 @@ """COMPASS Ordinance LLM callers""" -from .calling import LLMCaller, ChatLLMCaller, StructuredLLMCaller +from .calling import LLMCaller, ChatLLMCaller, JSONFromTextLLMCaller from .config import OpenAIConfig diff --git a/compass/llm/calling.py b/compass/llm/calling.py index d41dfa72e..bac7228e8 100644 --- a/compass/llm/calling.py +++ b/compass/llm/calling.py @@ -27,7 +27,7 @@ class BaseLLMCaller: invocation, allowing user to focus on only the message. 3. Track message history (ChatLLMCaller) or convert output into - JSON (StructuredLLMCaller). + JSON (JSONFromTextLLMCaller). Key Relationships: Delegates most of work to underlying ``Service`` class. @@ -65,8 +65,12 @@ class LLMCaller(BaseLLMCaller): -------- ChatLLMCaller Chat-like LLM calling functionality. - StructuredLLMCaller - Structured (JSON) LLM calling functionality. + JSONFromTextLLMCaller + LLM calling functionality that extracts structured data (JSON) + from the **text-based response**. + SchemaOutputLLMCaller + LLM calling functionality that allows you to specify the + expected output schema as part of the API call. """ async def call( @@ -107,8 +111,12 @@ class ChatLLMCaller(BaseLLMCaller): -------- LLMCaller Simple LLM caller, with no memory and no parsing utilities. - StructuredLLMCaller - Structured (JSON) LLM calling functionality. + JSONFromTextLLMCaller + LLM calling functionality that extracts structured data (JSON) + from the **text-based response**. + SchemaOutputLLMCaller + LLM calling functionality that allows you to specify the + expected output schema as part of the API call. """ def __init__( @@ -170,7 +178,7 @@ async def call(self, content, usage_sub_label=LLMUsageCategory.CHAT): return response -class StructuredLLMCaller(BaseLLMCaller): +class JSONFromTextLLMCaller(BaseLLMCaller): """Class to support structured (JSON) LLM calling functionality See Also @@ -179,6 +187,9 @@ class StructuredLLMCaller(BaseLLMCaller): Simple LLM caller, with no memory and no parsing utilities. ChatLLMCaller Chat-like LLM calling functionality. + SchemaOutputLLMCaller + LLM calling functionality that allows you to specify the + expected output schema as part of the API call. """ async def call( @@ -217,6 +228,100 @@ async def call( return llm_response_as_json(response) if response else {} +class SchemaOutputLLMCaller(BaseLLMCaller): + """Class to support structured (JSON) LLM calling functionality + + This class differs from :class:`JSONFromTextLLMCaller` in that it is + designed to work with LLM services that allow you to specify the + expected output schema as part of the API call (e.g. OpenAI function + calling). This allows for more direct retrieval of structured data + from the LLM, without needing to parse JSON from text-based + responses. The expected response format should be provided as a + parameter to the ``call`` method, and should be formatted according + to the specifications of the underlying LLM service for structured + output. + + See Also + -------- + LLMCaller + Simple LLM caller, with no memory and no parsing utilities. + ChatLLMCaller + Chat-like LLM calling functionality. 
+    JSONFromTextLLMCaller
+        LLM calling functionality that extracts structured data (JSON)
+        from the **text-based response**.
+    """
+
+    async def call(
+        self,
+        sys_msg,
+        content,
+        response_format,
+        usage_sub_label=LLMUsageCategory.DEFAULT,
+    ):
+        """Call LLM for structured data retrieval
+
+        Parameters
+        ----------
+        sys_msg : str
+            The LLM system message. Unlike
+            :class:`JSONFromTextLLMCaller`, no JSON formatting
+            instructions are appended to this text; the output
+            structure is governed entirely by ``response_format``.
+        content : str
+            LLM call content (typically some text to extract info from).
+        response_format : dict
+            Dictionary specifying the expected response format. This
+            will be passed to the underlying LLM service (e.g. OpenAI)
+            and should be formatted according to that service's
+            specifications for structured output. For example, for
+            OpenAI GPT models, this should be a dictionary with the
+            following keys:
+
+            - `type`: Should be set to `"json_schema"` to indicate
+              that the expected output is structured JSON.
+            - `json_schema`: A dictionary describing the expected
+              output. This should include the following keys:
+
+                - `name`: A string name for this response format
+                  (e.g. "extracted_features").
+                - `strict`: A boolean indicating whether the LLM
+                  should strictly adhere to the provided schema. If
+                  `True`, the LLM will be instructed to only include
+                  keys specified in the `schema` field. If `False`,
+                  the LLM may include additional keys not specified
+                  in the `schema` field.
+                - `schema`: A dictionary specifying the expected
+                  JSON schema of the output. This should be
+                  formatted according to JSON Schema specifications,
+                  and should define the expected structure of the
+                  output JSON object. For example, it may specify
+                  that the output should be an object with certain
+                  required properties, and the expected data types
+                  of those properties.
+        usage_sub_label : str, optional
+            Label to store token usage under. By default, ``"default"``.
+
+        Returns
+        -------
+        dict
+            Dictionary containing the LLM-extracted features. Dictionary
+            may be empty if there was an error during the LLM call.
+        """
+        response = await self.llm_service.call(
+            usage_tracker=self.usage_tracker,
+            usage_sub_label=usage_sub_label,
+            messages=[
+                {"role": "system", "content": sys_msg},
+                {"role": "user", "content": content},
+            ],
+            response_format=response_format,
+            **self.kwargs,
+        )
+        return llm_response_as_json(response) if response else {}
+
+
 def _add_json_instructions_if_needed(system_message):
     """Add JSON instruction to system message if needed"""
     if "JSON format" not in system_message:
diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py
index de078d647..5f50ee3f0 100644
--- a/compass/plugin/__init__.py
+++ b/compass/plugin/__init__.py
@@ -15,4 +15,6 @@
     OrdinanceParser,
     OrdinanceExtractionPlugin,
 )
+from .noop import NoOpHeuristic, NoOpTextCollector, NoOpTextExtractor
 from .registry import PLUGIN_REGISTRY, register_plugin
+from .one_shot import create_schema_based_one_shot_extraction_plugin
diff --git a/compass/plugin/base.py b/compass/plugin/base.py
index 1a65182a1..dcf9adb81 100644
--- a/compass/plugin/base.py
+++ b/compass/plugin/base.py
@@ -72,35 +72,33 @@ def IDENTIFIER(self):  # noqa: N802
         """str: Identifier for extraction task (e.g. 
"water rights")""" raise NotImplementedError - @property @abstractmethod - def QUESTION_TEMPLATES(self): # noqa: N802 - """list: List of search engine question templates for extraction + async def get_query_templates(self): + """Get a list of search engine query templates for extraction - Question templates can contain the placeholder - ``{jurisdiction}`` which will be replaced with the full - jurisdiction name during the search engine query. + Query templates can contain the placeholder ``{jurisdiction}`` + which will be replaced with the full jurisdiction name during + the search engine query. """ raise NotImplementedError - @property @abstractmethod - def WEBSITE_KEYWORDS(self): # noqa: N802 - """list: List of keywords + async def get_website_keywords(self): + """Get a dict of website search keyword scores - List of keywords that indicate links which should be prioritized - when performing a website scrape for a document. + Dictionary mapping keywords to scores that indicate links which + should be prioritized when performing a website scrape for a + document. """ raise NotImplementedError - @property @abstractmethod - def heuristic(self): - """BaseHeuristic: Object with a ``check()`` method + async def get_heuristic(self): + """Get a `BaseHeuristic` instance with a `check()` method - The ``check()`` method should accept a string of text and - return ``True`` if the text passes the heuristic check and - ``False`` otherwise. + The ``check()`` method should accept a string of text and return + ``True`` if the text passes the heuristic check and ``False`` + otherwise. """ raise NotImplementedError diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index 5f116fe3c..5e4fd26c2 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -4,10 +4,10 @@ from abc import ABC, abstractmethod from compass.plugin.base import BaseExtractionPlugin -from compass.llm.calling import LLMCaller +from compass.llm.calling import BaseLLMCaller, LLMCaller from compass.extraction import extract_relevant_text_with_ngram_validation from compass.scripts.download import filter_ordinance_docs -from compass.services.threaded import CleanedFileWriter +from compass.services.threaded import CLEANED_FP_REGISTRY, CleanedFileWriter from compass.utilities import doc_infos_to_db, save_db from compass.exceptions import COMPASSPluginConfigurationError @@ -37,7 +37,7 @@ def check(self, text): raise NotImplementedError -class BaseTextCollector(ABC): +class BaseTextCollector(BaseLLMCaller, ABC): """Base class for text collectors that gather relevant text""" @property @@ -121,12 +121,12 @@ def IDENTIFIER(self): # noqa: N802 @property @abstractmethod - def QUESTION_TEMPLATES(self): # noqa: N802 - """list: List of search engine question templates for extraction + def QUERY_TEMPLATES(self): # noqa: N802 + """list: List of search engine query templates for extraction - Question templates can contain the placeholder - ``{jurisdiction}`` which will be replaced with the full - jurisdiction name during the search engine query. + Query templates can contain the placeholder ``{jurisdiction}`` + which will be replaced with the full jurisdiction name during + the search engine query. 
""" raise NotImplementedError @@ -151,8 +151,9 @@ def TEXT_COLLECTORS(self): # noqa: N802 raise NotImplementedError @property - def heuristic(self): - """BaseHeuristic: Object with a ``check()`` method + @abstractmethod + def HEURISTIC(self): # noqa: N802 + """BaseHeuristic: Class with a ``check()`` method The ``check()`` method should accept a string of text and return ``True`` if the text passes the heuristic check and @@ -251,6 +252,33 @@ async def extract_relevant_text(self, doc, extractor_class, model_config): ) await self._write_cleaned_text(doc) + async def get_query_templates(self): + """Get a list of search engine query templates for extraction + + Query templates can contain the placeholder ``{jurisdiction}`` + which will be replaced with the full jurisdiction name during + the search engine query. + """ + return self.QUERY_TEMPLATES + + async def get_website_keywords(self): + """Get a dict of website search keyword scores + + Dictionary mapping keywords to scores that indicate links which + should be prioritized when performing a website scrape for a + document. + """ + return self.WEBSITE_KEYWORDS + + async def get_heuristic(self): + """Get a `BaseHeuristic` instance with a `check()` method + + The ``check()`` method should accept a string of text and return + ``True`` if the text passes the heuristic check and ``False`` + otherwise. + """ + return self.HEURISTIC() + async def filter_docs( self, extraction_context, need_jurisdiction_verification=True ): @@ -292,7 +320,7 @@ async def filter_docs( docs, self.jurisdiction, self.model_configs, - heuristic=self.heuristic, + heuristic=self.HEURISTIC(), tech=self.IDENTIFIER, text_collectors=self.TEXT_COLLECTORS, usage_tracker=self.usage_tracker, @@ -331,7 +359,14 @@ async def _write_cleaned_text(self, doc): def validate_plugin_configuration(self): """[NOT PUBLIC API] Validate plugin is properly configured""" - + self._validate_plugin_identifier() + self._validate_query_templates() + self._validate_website_keywords() + self._validate_text_collectors() + self._register_collected_text_file_names() + + def _validate_plugin_identifier(self): + """Validate that the plugin has a valid IDENTIFIER property""" try: __ = self.IDENTIFIER except NotImplementedError: @@ -341,23 +376,27 @@ def validate_plugin_configuration(self): ) raise COMPASSPluginConfigurationError(msg) from None + def _validate_query_templates(self): + """Validate that the plugin has valid QUERY_TEMPLATES""" try: - num_q_templates = len(self.QUESTION_TEMPLATES) + num_q_templates = len(self.QUERY_TEMPLATES) except NotImplementedError: msg = ( f"Plugin class {self.__class__.__name__} is missing required " - "property 'QUESTION_TEMPLATES'" + "property 'QUERY_TEMPLATES'" ) raise COMPASSPluginConfigurationError(msg) from None if num_q_templates == 0: msg = ( f"Plugin class {self.__class__.__name__} has an empty " - "'QUESTION_TEMPLATES' property! Please provide at least " - "one question template." + "'QUERY_TEMPLATES' property! Please provide at least " + "one query template." 
             )
             raise COMPASSPluginConfigurationError(msg)
 
+    def _validate_website_keywords(self):
+        """Validate that the plugin has valid WEBSITE_KEYWORDS"""
         try:
             num_website_keywords = len(self.WEBSITE_KEYWORDS)
         except NotImplementedError:
@@ -375,6 +414,8 @@
             )
             raise COMPASSPluginConfigurationError(msg)
 
+    def _validate_text_collectors(self):
+        """Validate that the plugin has valid TEXT_COLLECTORS"""
         try:
             collectors = self.TEXT_COLLECTORS
         except NotImplementedError:
@@ -402,3 +443,13 @@
                 f"{collector_class.__name__} is not!"
             )
             raise COMPASSPluginConfigurationError(msg)
+
+    def _register_collected_text_file_names(self):
+        """Register file names for writing cleaned text outputs"""
+
+        CLEANED_FP_REGISTRY.setdefault(self.IDENTIFIER.casefold(), {})
+        collected_text_key = list(self.TEXT_COLLECTORS)[-1].OUT_LABEL
+
+        CLEANED_FP_REGISTRY[self.IDENTIFIER.casefold()][collected_text_key] = (
+            "{jurisdiction} Collected Text.txt"
+        )
diff --git a/compass/plugin/noop.py b/compass/plugin/noop.py
new file mode 100644
index 000000000..7d7ede507
--- /dev/null
+++ b/compass/plugin/noop.py
@@ -0,0 +1,121 @@
+"""COMPASS NoOp plugin implementation"""
+
+import logging
+
+from compass.plugin.interface import BaseHeuristic, BaseTextCollector
+from compass.plugin.ordinance import BaseTextExtractor
+from compass.utilities.parsing import merge_overlapping_texts
+
+
+logger = logging.getLogger(__name__)
+
+
+class NoOpHeuristic(BaseHeuristic):
+    """NoOp heuristic check"""
+
+    def check(self, *__, **___):  # noqa: PLR6301
+        """Always return ``True``"""
+        return True
+
+
+class NoOpTextCollector(BaseTextCollector):
+    """NoOp text collector that returns the full text"""
+
+    OUT_LABEL = "relevant_text"
+    """Identifier for text collected by this class"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._chunks = {}
+
+    @property
+    def relevant_text(self):
+        """str: Combined relevant text from the individual chunks"""
+        text = [self._chunks[ind] for ind in sorted(self._chunks)]
+        return merge_overlapping_texts(text)
+
+    async def check_chunk(self, chunk_parser, ind):
+        """Check a chunk at a given ind to see if it contains relevant text
+
+        In this implementation, we store all chunks, so this method
+        always returns ``True``.
+
+        Parameters
+        ----------
+        chunk_parser : ParseChunksWithMemory
+            Instance that contains a ``parse_from_ind`` method.
+        ind : int
+            Index of the chunk to check.
+
+        Returns
+        -------
+        bool
+            Boolean flag indicating whether or not the chunk was
+            collected. Always ``True`` for this no-op implementation.
+        """
+        logger.debug(
+            "NoOpTextCollector: assuming all text is relevant, so adding "
+            "chunk at ind %d to extraction text",
+            ind,
+        )
+        self._store_chunk(chunk_parser, ind)
+        return True
+
+    def _store_chunk(self, parser, chunk_ind):
+        """Store chunk and its neighbors if it is not already stored"""
+        for offset in range(1 - parser.num_to_recall, 2):
+            ind_to_grab = chunk_ind + offset
+            if ind_to_grab < 0 or ind_to_grab >= len(parser.text_chunks):
+                continue
+
+            self._chunks.setdefault(
+                ind_to_grab, parser.text_chunks[ind_to_grab]
+            )
+
+
+class NoOpTextExtractor(BaseTextExtractor):
+    """NoOp text extractor that returns the full text"""
+
+    def __init__(self, llm_caller):
+        """
+
+        Parameters
+        ----------
+        llm_caller : LLMCaller
+            LLM Caller instance used to extract ordinance info with. 
+ """ + self.llm_caller = llm_caller + + async def return_original(self, text_chunks): # noqa: PLR6301 + """No processing, just return original text + + Parameters + ---------- + text_chunks : list of str + List of strings, each of which represent a chunk of text. + The order of the strings should be the order of the text + chunks. + + Returns + ------- + str + Ordinance text extracted from text chunks. + """ + logger.debug( + "No text extraction prompts provided; returning original text" + ) + return merge_overlapping_texts(text_chunks) + + @property + def parsers(self): + """Iterable of parsers provided by this extractor + + Yields + ------ + name : str + Name describing the type of text output by the parser. + parser : callable + Async function that takes a ``text_chunks`` input and + outputs parsed text. + """ + yield self.OUT_LABEL, self.return_original diff --git a/compass/plugin/one_shot/__init__.py b/compass/plugin/one_shot/__init__.py new file mode 100644 index 000000000..7e61a064a --- /dev/null +++ b/compass/plugin/one_shot/__init__.py @@ -0,0 +1,3 @@ +"""COMPASS one-shot extraction plugin""" + +from .base import create_schema_based_one_shot_extraction_plugin diff --git a/compass/plugin/one_shot/base.py b/compass/plugin/one_shot/base.py new file mode 100644 index 000000000..b3c4bb13f --- /dev/null +++ b/compass/plugin/one_shot/base.py @@ -0,0 +1,372 @@ +"""COMPASS one-shot extraction plugin""" + +import json +import logging +import hashlib +import importlib.resources +from pathlib import Path + +from platformdirs import user_data_dir + +from compass.llm.calling import SchemaOutputLLMCaller +from compass.plugin import ( + register_plugin, + NoOpHeuristic, + NoOpTextCollector, + NoOpTextExtractor, + PromptBasedTextCollector, + PromptBasedTextExtractor, + OrdinanceExtractionPlugin, +) +from compass.plugin.one_shot.generators import generate_query_templates +from compass.plugin.one_shot.components import ( + SchemaBasedTextCollector, + SchemaOrdinanceParser, +) +from compass.utilities.io import load_config +from compass.utilities.enums import LLMTasks + + +logger = logging.getLogger(__name__) +_SCHEMA_DIR = importlib.resources.files("compass.plugin.one_shot.schemas") + + +def create_schema_based_one_shot_extraction_plugin(config, tech): + """Create a one-shot extraction plugin based on a configuration + + Parameters + ---------- + config : dict or path-like + One-shot configuration dictionary. If not a dictionary, should + be a path to a file containing the configuration (supported + formats: JSON, JSON5, YAML, TOML). See the wind ordinance schema + for an example. The configuration must include the following + keys: + + - `schema`: A dictionary representing the schema of the + output. Can also be a path to a file that contains the + schema (supported formats: JSON, JSON5, YAML, TOML). See + the wind ordinance schema for an example. + + The configuration can also include the following optional keys: + + - `data_type_short_desc`: Short description of the type of + data being extracted with this plugin, in the format + `wind energy ordinance`, `solar energy ordinance`, + `water rights`. This is used to enhance the prompts for + the structured data extraction. + - `query_templates`: A list of search engine query + templates for document retrieval. Templates should include + ``{jurisdiction}`` as a placeholder for the jurisdiction + that is being processed. If not provided, the LLM will be + used to generate search engine queries based on the + schema input. 
+ - `website_keywords`: A list of keywords to use for + filtering websites during document retrieval. If not + provided, the LLM will be used to generate website + keywords based on the schema input. + - `collection_prompts`: A list of prompts to use for + collecting relevant text from documents. Alternatively, + this input can simply be ``True``, in which case the LLM + will be used to generate the collection prompts. If + ``False``, ``None``, or not provided, the entire document + text will be used for extraction (no text collection). + - `text_extraction_prompts`: A list of prompts to use for + consolidating and extracting relevant text from the + documents. Alternatively, this input can simply be + ``True``, in which case the LLM will be used to generate + the text extraction prompts. If ``False``, ``None``, or + not provided, the entire document text will be used for + extraction (no text consolidation). + - `cache_query_templates`: Boolean flag indicating + whether or not to cache generated query templates and + website keywords for future use. By default, ``True``. + Caching is recommended since the generation of query + templates and website keywords can be costly, but if you + are iterating on the configuration and want to see the + effect of changes to the schema on the generated query + templates and website keywords in real time, you may want + to set this flag to ``False`` to avoid caching generated + templates/keywords until you have finalized the schema. + - `extraction_system_prompt`: Custom system prompt to use + for the structured data extraction step. If not provided, + a default prompt will be used that instructs the LLM to + extract structured data from the given document(s). You + may provide a custom system prompt if you want to provide + more specific instructions to the LLM for the structured + data extraction step. + + tech : str + Technology identifier to use for the plugin (e.g., "wind", + "solar"). Must be unique from the identifiers of any existing + plugins. 
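+
+    Examples
+    --------
+    A minimal configuration sketch (the schema dict and query template
+    shown are placeholders for illustration only, not real COMPASS
+    extraction inputs):
+
+    >>> config = {
+    ...     "schema": {"type": "object"},
+    ...     "data_type_short_desc": "geothermal ordinance",
+    ...     "query_templates": ["{jurisdiction} geothermal ordinance"],
+    ... }
+    >>> create_schema_based_one_shot_extraction_plugin(
+    ...     config, tech="geothermal"
+    ... )  # doctest: +SKIP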
+ """ + if not isinstance(config, dict): + config = load_config(config) + + if isinstance(config["schema"], str): + config["schema"] = load_config(config["schema"]) + + text_collectors = _collectors_from_config(config) + text_extractors = _extractors_from_config( + config, in_label=text_collectors[-1].OUT_LABEL + ) + parsers = _parser_from_config( + config, in_label=text_extractors[-1].OUT_LABEL + ) + + class SchemaBasedExtractionPlugin(OrdinanceExtractionPlugin): + SCHEMA = config["schema"] + """dict: Schema for the output of the text extraction step""" + + IDENTIFIER = tech + """str: Identifier for extraction task """ + + # TODO: implement dynamic generation of the heuristic based on + # the extraction schema + HEURISTIC = NoOpHeuristic + """BaseHeuristic: Class with a ``check()`` method""" + + # TODO: implement dynamic generation of the website keywords + # based on the extraction schema + WEBSITE_KEYWORDS = { + "pdf": 23040, + "zoning": 11520, + "ordinance": 5760, + r"renewable%20energy": 1440, + r"renewable+energy": 1440, + "renewable energy": 1440, + "planning": 720, + "plan": 360, + "government": 180, + "code": 60, + "area": 60, + r"land%20development": 15, + r"land+development": 15, + "land development": 15, + "land": 3, + "environment": 3, + "energy": 3, + "renewable": 3, + "municipal": 1, + "department": 1, + } + + TEXT_COLLECTORS = text_collectors + """Classes for collecting text chunks from docs""" + + TEXT_EXTRACTORS = text_extractors + """Classes for extracting cleaned text from collected text""" + + PARSERS = parsers + """Classes for parsing structured ordinance data from text""" + + QUERY_TEMPLATES = [] # set by user or LLM-generated + """List: List of search engine query templates""" + + async def get_query_templates(self): + """Get a list of query templates for document retrieval + + Returns + ------- + list + List of search engine query templates for document + retrieval. Templates may include ``{jurisdiction}`` as + a placeholder for the jurisdiction that is being + processed. + """ + if self.QUERY_TEMPLATES: + return self.QUERY_TEMPLATES + + if qt := config.get("query_templates"): + self.QUERY_TEMPLATES = qt + return qt + + qt = _qt_from_cache(self.IDENTIFIER, config["schema"]) + if qt: + self.QUERY_TEMPLATES = qt + return qt + + model_config = self.model_configs.get( + LLMTasks.PLUGIN_GENERATION, + self.model_configs[LLMTasks.DEFAULT], + ) + schema_llm = SchemaOutputLLMCaller( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.debug("Generating query templates...") + qt = await generate_query_templates( + schema_llm, config["schema"], add_think_prompt=True + ) + logger.debug("Generated the following query templates:\n%r", qt) + self.QUERY_TEMPLATES = qt + + if config.get("cache_query_templates", True): + _qt_to_cache(self.IDENTIFIER, config["schema"], qt) + + return qt + + def _validate_query_templates(self): + """NoOp validation for query templates + + Since templates can be generated by LLM, we don't know until + runtime whether or not they will be valid. 
+ """ + + register_plugin(SchemaBasedExtractionPlugin) + + +def _collectors_from_config(config): + """Create a TextCollector subclass based on a config dict""" + cp = config.get("collection_prompts") + + if cp is True: + schema_fp = _SCHEMA_DIR / "validate_chunk.json5" + + class PluginCollector(SchemaBasedTextCollector): + OUT_LABEL = NoOpTextCollector.OUT_LABEL # reuse label + SCHEMA = config["schema"] + OUTPUT_SCHEMA = load_config(schema_fp) + + return [PluginCollector] + + if cp: + + class PluginCollector(PromptBasedTextCollector): + OUT_LABEL = NoOpTextCollector.OUT_LABEL # reuse label + PROMPTS = cp + + return [PluginCollector] + + return [NoOpTextCollector] + + +def _extractors_from_config(config, in_label): + """Create a TextExtractor subclass based on a config dict""" + tep = config.get("text_extraction_prompts") + + if tep is True: + # TODO: When implementing this, don't forget to register the + # text output file name so it gets store in the + # cleaned outputs directory + msg = ( + "LLM-based text extraction not implemented yet. If you would like " + "to see this feature implemented, please submit an issue or, " + "better yet, a pull request!" + ) + raise NotImplementedError(msg) + + if tep: + + class PluginTextExtractor(PromptBasedTextExtractor): + IN_LABEL = in_label + PROMPTS = tep + + return [PluginTextExtractor] + + class PluginTextExtractor(NoOpTextExtractor): + IN_LABEL = in_label + OUT_LABEL = "copied_relevant_text" + + return [PluginTextExtractor] + + +def _parser_from_config(config, in_label): + """Create a TextExtractor subclass based on a config dict""" + + new_sys_prompt = config.get( + "extraction_system_prompt", SchemaOrdinanceParser.SYSTEM_PROMPT + ) + + class PluginParser(SchemaOrdinanceParser): + IN_LABEL = in_label + OUT_LABEL = "structured_data" + SCHEMA = config["schema"] + DATA_TYPE_SHORT_DESC = config.get("data_type_short_desc") + SYSTEM_PROMPT = new_sys_prompt + + return [PluginParser] + + +def _qt_from_cache(identifier, schema): + """Get cached query templates for a given schema if they exist""" + # cspell: disable-next-line + data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) + cache_fp = data_dir / "qt_cache.json" + if not cache_fp.exists(): + return None + + logger.debug("Loading query templates from cache at %s", cache_fp) + qt = json.loads(cache_fp.read_text(encoding="utf-8")) + if identifier.casefold() not in qt: + return None + + potential_qt = qt[identifier.casefold()] + m = hashlib.sha256() + m.update(str(schema).encode()) + if potential_qt.get("sha256") != m.hexdigest(): + return None + + templates = potential_qt["templates"] + logger.debug( + "Found query templates for %r in cache:\n%r", identifier, templates + ) + return templates + + +def _qt_to_cache(identifier, schema, qt): + """Cache generated query templates for future use""" + # cspell: disable-next-line + data_dir = Path(user_data_dir(appname="INFRA-COMPASS", appauthor="NLR")) + data_dir.mkdir(parents=True, exist_ok=True) + cache_fp = data_dir / "qt_cache.json" + if not cache_fp.exists(): + logger.debug( + "Cache file for query templates not found at %s. 
Creating new " + "cache with current query templates for %r", + cache_fp, + identifier, + ) + cache = { + identifier.casefold(): { + "templates": qt, + "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), + } + } + cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") + return + + logger.debug("Loading query templates from cache at %s", cache_fp) + cache = json.loads(cache_fp.read_text(encoding="utf-8")) + if identifier.casefold() not in cache: + logger.debug( + "Adding query templates for %r to cache at %s", + identifier, + cache_fp, + ) + cache[identifier.casefold()] = { + "templates": qt, + "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), + } + cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") + return + + potential_qt = cache[identifier.casefold()] + m = hashlib.sha256() + m.update(str(schema).encode()) + if potential_qt.get("sha256") == m.hexdigest(): + logger.debug( + "Query templates for %r already in cache and schema hash " + "matches, so not updating cache", + identifier, + ) + return + + cache[identifier.casefold()] = { + "templates": qt, + "sha256": hashlib.sha256(str(schema).encode()).hexdigest(), + } + cache_fp.write_text(json.dumps(cache, indent=4), encoding="utf-8") + return diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py new file mode 100644 index 000000000..2c10b9665 --- /dev/null +++ b/compass/plugin/one_shot/components.py @@ -0,0 +1,242 @@ +"""COMPASS extraction schema-based plugin component implementations""" + +import logging +from abc import ABC, abstractmethod + +import pandas as pd + +from compass.llm.calling import SchemaOutputLLMCaller +from compass.plugin import BaseParser, BaseTextCollector +from compass.utilities.enums import LLMUsageCategory +from compass.utilities.parsing import merge_overlapping_texts + + +logger = logging.getLogger(__name__) +_TEXT_COLLECTION_SYSTEM_PROMPT = """\ +You are a structured extraction validator. You receive: +1) A text chunk. +2) An extraction schema that specifies the exact criteria for relevance \ +(e.g., technology type, document type, required data fields). + +Determine whether the chunk contains content that matches any of the \ +schema's criteria. Be strict and literal: only mark relevant if the chunk \ +clearly addresses the specific technology and document scope described in \ +the schema. Do not infer beyond the text. If relevant, summarize the \ +specific matching content; if not, state why it does not meet the schema's \ +requirements. Keep the response concise and consistent.\ +""" +_TEXT_COLLECTION_MAIN_PROMPT = """\ +Determine whether this text excerpt contains any information relevant to \ +the following extraction schema: + +{schema} + +TEXT: + +{text} + +Think before you answer.\ +""" +_DATA_PARSER_MAIN_PROMPT = """\ +Extract all {desc}features from the following text: + +{text} + +Think before you answer""" +_DATA_PARSER_SYSTEM_PROMPT = """\ +You are a legal scholar extracting structured data from {desc}documents. 
\
+Follow all instructions in the schema descriptions carefully.\
+"""
+
+
+class SchemaBasedTextCollector(SchemaOutputLLMCaller, BaseTextCollector, ABC):
+    """Text collector that checks text chunks against an extraction schema"""
+
+    @property
+    @abstractmethod
+    def SCHEMA(self):  # noqa: N802
+        """dict: Extraction schema"""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def OUTPUT_SCHEMA(self):  # noqa: N802
+        """dict: Validation output schema"""
+        raise NotImplementedError
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._chunks = {}
+
+    @property
+    def relevant_text(self):
+        """str: Combined extraction text from the individual chunks"""
+        if not self._chunks:
+            logger.debug(
+                "No relevant extraction chunk(s) found in original text",
+            )
+            return ""
+
+        logger.debug(
+            "Grabbing %d extraction chunk(s) from original text at these "
+            "indices: %s",
+            len(self._chunks),
+            list(self._chunks),
+        )
+
+        text = [self._chunks[ind] for ind in sorted(self._chunks)]
+        return merge_overlapping_texts(text)
+
+    async def check_chunk(self, chunk_parser, ind):
+        """Check a chunk at a given ind to see if it contains relevant text
+
+        Parameters
+        ----------
+        chunk_parser : ParseChunksWithMemory
+            Instance that contains a ``parse_from_ind`` method.
+        ind : int
+            Index of the chunk to check.
+
+        Returns
+        -------
+        bool
+            Boolean flag indicating whether or not the text in the chunk
+            contains information relevant to the extraction schema.
+        """
+        key = "contains_relevant_text"
+        passed_filter = await chunk_parser.parse_from_ind(
+            ind,
+            key=key,
+            llm_call_callback=self._check_chunk_with_prompt,
+        )
+
+        if not passed_filter:
+            logger.debug("Text at ind %d did not pass collection step", ind)
+            return False
+
+        logger.debug("Text at ind %d passed collection step", ind)
+
+        self._store_chunk(chunk_parser, ind)
+        logger.debug("Added text chunk at ind %d to extraction text", ind)
+        return True
+
+    async def _check_chunk_with_prompt(self, key, text_chunk):
+        """Call LLM on a chunk of text to check for relevant content"""
+        content = await self.call(
+            sys_msg=_TEXT_COLLECTION_SYSTEM_PROMPT,
+            content=_TEXT_COLLECTION_MAIN_PROMPT.format(
+                schema=self.SCHEMA, text=text_chunk
+            ),
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "chunk_validation",
+                    "strict": True,
+                    "schema": self.OUTPUT_SCHEMA,
+                },
+            },
+            usage_sub_label=LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION,
+        )
+        logger.debug("LLM response: %s", content)
+        return content.get(key, False)
+
+    def _store_chunk(self, parser, chunk_ind):
+        """Store chunk and its neighbors if it is not already stored"""
+        for offset in range(1 - parser.num_to_recall, 2):
+            ind_to_grab = chunk_ind + offset
+            if ind_to_grab < 0 or ind_to_grab >= len(parser.text_chunks):
+                continue
+
+            self._chunks.setdefault(
+                ind_to_grab, parser.text_chunks[ind_to_grab]
+            )
+
+
+class SchemaOrdinanceParser(SchemaOutputLLMCaller, BaseParser):
+    """Base class for parsing structured data"""
+
+    DATA_TYPE_SHORT_DESC = None
+    """Optional short description of the type of data being extracted
+
+    Examples
+    --------
+    - "wind energy ordinance"
+    - "solar energy ordinance"
+    - "water rights"
+    - "resource management plan geothermal restriction"
+    """
+    SYSTEM_PROMPT = _DATA_PARSER_SYSTEM_PROMPT
+    """System prompt to use for parsing structured data with an LLM"""
+
+    @property
+    @abstractmethod
+    def SCHEMA(self):  # noqa: N802
+        """dict: Extraction schema"""
+        raise NotImplementedError
+
+    async def parse(self, text):
+        """Parse text and extract structured data
+
+        Parameters
+        ----------
+        text : str
+            Text which may or may not contain information relevant to
+            the current extraction.
+
+        Returns
+        -------
+        pandas.DataFrame or None
+            DataFrame containing structured extracted data. Can also
+            be ``None`` if no relevant values can be parsed from the
+            text.
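+
+        Examples
+        --------
+        A rough usage sketch (assumes a concrete subclass with
+        ``SCHEMA`` defined and a configured LLM service):
+
+        >>> df = await parser.parse(ordinance_text)  # doctest: +SKIP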
+        """
+        desc = (
+            f"{self.DATA_TYPE_SHORT_DESC} "
+            if self.DATA_TYPE_SHORT_DESC
+            else ""
+        )
+
+        extraction = await self.call(
+            sys_msg=self.SYSTEM_PROMPT.format(desc=desc),
+            content=_DATA_PARSER_MAIN_PROMPT.format(desc=desc, text=text),
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "structured_data_extraction",
+                    "strict": True,
+                    "schema": self.SCHEMA,
+                },
+            },
+            usage_sub_label=LLMUsageCategory.ORDINANCE_VALUE_EXTRACTION,
+        )
+        data = extraction["outputs"]
+        if not data:
+            logger.debug(
+                "LLM did not extract any relevant features from the text"
+            )
+            return None
+
+        return self._to_dataframe(data)
+
+    def _to_dataframe(self, data):
+        """Convert LLM output to a DataFrame"""
+
+        output_items = self.SCHEMA["properties"]["outputs"]["items"]
+        all_features = output_items["properties"]["feature"]["enum"]
+
+        known_qual_features = set(
+            self.SCHEMA.get("$definitions", {})
+            .get("qualitative_restrictions", {})
+            .get("properties", {})
+        )
+        quant = [feat not in known_qual_features for feat in all_features]
+
+        df = pd.DataFrame(data)
+        full_df = pd.DataFrame(
+            {"feature": all_features, "quantitative": quant}
+        )
+        full_df = full_df.merge(df, on="feature", how="left")
+
+        return full_df[
+            ["feature", "value", "units", "section", "summary", "quantitative"]
+        ]
diff --git a/compass/plugin/one_shot/generators.py b/compass/plugin/one_shot/generators.py
new file mode 100644
index 000000000..20b08ff69
--- /dev/null
+++ b/compass/plugin/one_shot/generators.py
@@ -0,0 +1,139 @@
+"""COMPASS one-shot extraction plugin generators"""
+
+import importlib.resources
+
+from elm.utilities.retry import async_retry_with_exponential_backoff
+
+from compass.utilities.io import load_config
+from compass.utilities.enums import LLMUsageCategory
+from compass.exceptions import COMPASSRuntimeError
+
+
+_SCHEMA_DIR = importlib.resources.files("compass.plugin.one_shot.schemas")
+_QUERY_GENERATOR_SYSTEM_PROMPT = """\
+You are an expert search strategist for regulatory documents. \
+Goal: Given an extraction schema (JSON) for an ordinance domain, generate \
+high-quality search engine query templates that will find the legal texts \
+from which the schema's data can be extracted.
+
+Input:
+- schema_json: a JSON schema describing features/requirements to extract.
+
+Output:
+- Produce 5-10 query templates.
+- Every template must include the literal placeholder "{jurisdiction}" \
+(exactly, with braces) somewhere in the template, which will be filled in \
+**later** with a plaintext string for a specific location (e.g. "City of \
+Denver, Colorado", "Clear Fork Groundwater Conservation District, Texas", \
+etc.).
+- Do not include extra keys or any markdown.
+
+Guidelines:
+- Derive terms from the schema title/description, feature names, and \
+definitions. Prefer official/legal terminology in the schema.
+- Do not focus on specific extraction keywords; instead target the document \
+types that would include that information.
+- Include a mix of broad and precise queries, in both keyword and \
+natural-language styles.
+- Include at least one query with filetype:pdf.
+- Include terms that indicate the governing document type \
+(e.g., "ordinance", "zoning", "code", "regulations", "chapter", "section").
+- Include domain-specific synonyms and abbreviations present in the schema \
+(e.g., WECS, WES, wind energy conversion system for wind, SECS, SEF, solar \
+energy conversion system for solar, etc.).
+- If relevant to the schema, include some queries that target sites known to \
+host aggregate information (e.g. municode, american legal publishing, etc. \
+for ordinance documents).
+- Avoid jurisdiction-specific entities other than the {jurisdiction} \
+placeholder.
+- Ensure templates are for locating the legal text itself (not summaries, \
+news, or reports).\
+"""
+
+
+@async_retry_with_exponential_backoff(
+    base_delay=1,
+    exponential_base=4,
+    jitter=True,
+    max_retries=3,
+    errors=(COMPASSRuntimeError,),
+)
+async def generate_query_templates(
+    schema_llm, extraction_schema, add_think_prompt=True
+):
+    """Generate 5-10 search query templates for document retrieval
+
+    Parameters
+    ----------
+    schema_llm : SchemaOutputLLMCaller
+        An LLM caller configured to output structured data according to
+        a provided schema. This function relies on the LLM to generate
+        the query templates, so the quality of the generated templates
+        will depend on the capabilities of the LLM being used and how
+        well it can interpret the provided extraction schema. Highly
+        recommended to use the most powerful/capable instruction-tuned
+        model for this function.
+    extraction_schema : dict
+        A dictionary representing the schema of the desired extraction
+        task. The query templates will be generated based on the content
+        of this schema, so it should be as detailed and specific as
+        possible, and should include domain-specific terminology if
+        applicable. See the wind ordinance schema for an example.
+    add_think_prompt : bool, optional
+        Option to add a "Think before you answer" instruction to the end
+        of the prompt (useful for thinking models).
+        By default, ``True``.
+
+    Returns
+    -------
+    list of str
+        List of 5-10 query templates as strings, each including the
+        literal placeholder "{jurisdiction}" for later formatting.
+
+    Raises
+    ------
+    COMPASSRuntimeError
+        If the LLM fails to return any valid query templates after 3
+        attempts.
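+
+    Examples
+    --------
+    A rough usage sketch (assumes ``schema_llm`` is an initialized
+    :class:`~compass.llm.calling.SchemaOutputLLMCaller` and
+    ``extraction_schema`` is loaded from a user config):
+
+    >>> templates = await generate_query_templates(  # doctest: +SKIP
+    ...     schema_llm, extraction_schema
+    ... )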
" + f"Received response: {response}" + ) + raise COMPASSRuntimeError(msg) + + return out + + +def _is_formattable(q): + """True if the query template is formattable with a jurisdiction""" + try: + q.format(jurisdiction="test") + except Exception: # noqa: BLE001 + return False + + return True diff --git a/compass/plugin/one_shot/schemas/query_templates.json5 b/compass/plugin/one_shot/schemas/query_templates.json5 new file mode 100644 index 000000000..ff61f6355 --- /dev/null +++ b/compass/plugin/one_shot/schemas/query_templates.json5 @@ -0,0 +1,73 @@ +{ + "title": "Search Query Templates", + "description": "Schema for LLM-generated search query templates used to locate ordinance/legal text sources.", + "type": "object", + "additionalProperties": false, + "required": ["queries"], + "properties": { + "queries": { + "type": "array", + "minItems": 5, + "maxItems": 10, + "additionalProperties": false, + "items": {"type": "string"}, + } + }, + "$descriptions": { + "general": [ + "Return ONLY the fields allowed by this schema.", + "Each query MUST include the literal placeholder \"{jurisdiction}\".", + "Queries must target primary legal text (ordinances, codes, regulations, chapters, sections), not news or summaries.", + "Use official terminology from the extraction schema (title, description, feature names).", + "Include a mix of keyword-style queries and natural-language questions.", + "Include at least one query with filetype:pdf.", + "Avoid jurisdiction-specific entities other than \"{jurisdiction}\"." + ], + "query_content": [ + "Prefer ordinance and zoning terms: ordinance, zoning, code, regulation, chapter, section, unified development code, land use code.", + "Include domain synonyms and abbreviations if present in the schema (e.g., WECS, WES, WEF, WET).", + "Target code-hosting domains, but **only when relevant** (e.g., municode, amlegal, ecode360, codepublishing).", + "Keep queries short and search-friendly; avoid punctuation unless needed.", + "Do not include URLs unless using a site: filter." + ], + "quality_checks": [ + "Queries should be distinct (avoid near-duplicates).", + "5 to 10 queries total.", + "All queries must be valid search strings with the placeholder intact." 
+ ] + }, + "$examples": [ + { + "queries": [ + "{jurisdiction} code of ordinances wind energy conversion system WECS WES wind turbine regulations", + "{jurisdiction} zoning ordinance wind energy overlay district special use conditional use chapter section", + "{jurisdiction} unified development code UDC wind energy conversion systems WECS zoning", + "{jurisdiction} unified development ordinance UDO wind energy turbines WET large wind energy systems LWES commercial wind CWECS", + "{jurisdiction} land use code alternate energy systems AES commercial energy production systems CEPCS wind", + "site:library.municode.com {jurisdiction} wind energy conversion system OR WECS OR WES ordinance", + "site:codelibrary.amlegal.com {jurisdiction} wind energy ordinance zoning code", + "site:ecode360.com {jurisdiction} wind energy wind turbine zoning regulations", + "site:codepublishing.com {jurisdiction} wind energy WECS zoning ordinance", + "filetype:pdf {jurisdiction} wind energy ordinance zoning code WECS WEF" + ] + }, + { + "queries": [ + "filetype:pdf {jurisdiction} solar energy conversion system ordinances", + "solar energy conversion system ordinances {jurisdiction}", + "{jurisdiction} solar energy farm ordinance", + "Where can I find the legal text for commercial solar energy conversion system zoning ordinances in {jurisdiction}?", + "What is the specific legal information regarding zoning ordinances for commercial solar energy conversion systems in {jurisdiction}?", + ] + }, + { + "queries": [ + "{jurisdiction} water conservation rules", + "{jurisdiction} management plan", + "{jurisdiction} well permits", + "{jurisdiction} well permit requirements", + "requirements to drill a water well in {jurisdiction}", + ] + } + ] +} \ No newline at end of file diff --git a/compass/plugin/one_shot/schemas/validate_chunk.json5 b/compass/plugin/one_shot/schemas/validate_chunk.json5 new file mode 100644 index 000000000..938ec4be7 --- /dev/null +++ b/compass/plugin/one_shot/schemas/validate_chunk.json5 @@ -0,0 +1,16 @@ +{ + "type": "object", + "description": "Response indicating whether or not the text chunk contains relevant text for the extraction task. If the chunk contains relevant text, the LLM should return 'true' for 'contains_relevant_text', and if not, it should return 'false'.", + "additionalProperties": false, + "required": ["contains_relevant_text", "explanation"], + "properties": { + "contains_relevant_text": { + "type": "boolean", + "description": "Flag indicating whether the text chunk contains relevant information for the extraction task. 'true' if relevant text is present, 'false' if not.", + }, + "explanation": { + "type": "string", + "description": "Explanation for the LLM's determination of whether the chunk contains relevant text for extraction based on the input extraction schema. If 'contains_relevant_text' is 'true', the explanation should briefly summarize the relevant content found in the chunk. 
If 'contains_relevant_text' is 'false', the explanation should briefly describe why the chunk was deemed irrelevant (e.g., off-topic content, no mention of any of the extraction schema content, text does not apply to system under consideration for extraction, etc.).", + } + }, +} \ No newline at end of file diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index eff794be3..f26e2d17e 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -15,7 +15,7 @@ from compass.llm.calling import ( BaseLLMCaller, ChatLLMCaller, - StructuredLLMCaller, + JSONFromTextLLMCaller, ) from compass.plugin.interface import ( BaseHeuristic, @@ -260,7 +260,7 @@ def GOOD_TECH_PHRASES(self): # noqa: N802 raise NotImplementedError -class PromptBasedTextCollector(StructuredLLMCaller, BaseTextCollector, ABC): +class PromptBasedTextCollector(JSONFromTextLLMCaller, BaseTextCollector, ABC): """Text extractor based on a chain of prompts""" @property diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 7d5d3e1e0..556bebc26 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -481,7 +481,7 @@ async def _crawl_hook(*__, **___): # noqa: RUF029 async def download_jurisdiction_ordinance_using_search_engine( - question_templates, + query_templates, jurisdiction, num_urls=5, file_loader_kwargs=None, @@ -494,7 +494,7 @@ async def download_jurisdiction_ordinance_using_search_engine( Parameters ---------- - question_templates : sequence of str + query_templates : sequence of str Query templates that will be formatted with the jurisdiction name before submission to the search engine. jurisdiction : Jurisdiction @@ -563,7 +563,7 @@ async def _download_hook(urls): # noqa: RUF029 kwargs.update(file_loader_kwargs or {}) try: out_docs = await _docs_from_web_search( - question_templates=question_templates, + query_templates=query_templates, jurisdiction=jurisdiction, num_urls=num_urls, search_semaphore=search_semaphore, @@ -700,7 +700,7 @@ async def filter_ordinance_docs( async def _docs_from_web_search( - question_templates, + query_templates, jurisdiction, num_urls, search_semaphore, @@ -711,8 +711,8 @@ async def _docs_from_web_search( ): """Download documents from the web using jurisdiction queries""" queries = [ - question.format(jurisdiction=jurisdiction.full_name) - for question in question_templates + query.format(jurisdiction=jurisdiction.full_name) + for query in query_templates ] kwargs.update({"file_cache_coroutine": TempFileCachePB.call}) diff --git a/compass/scripts/process.py b/compass/scripts/process.py index 6a14bc440..2a72a5793 100644 --- a/compass/scripts/process.py +++ b/compass/scripts/process.py @@ -64,7 +64,8 @@ log_versions, ) from compass.utilities.base import WebSearchParams -from compass.utilities.parsing import load_config, convert_paths_to_strings +from compass.utilities.io import load_config +from compass.utilities.parsing import convert_paths_to_strings from compass.pb import COMPASS_PB @@ -1000,7 +1001,7 @@ async def _download_known_url_documents(self): async def _find_documents_using_search_engine(self): """Search the web for ordinance docs using search engines""" docs = await download_jurisdiction_ordinance_using_search_engine( - self.extractor.QUESTION_TEMPLATES, + await self.extractor.get_query_templates(), self.jurisdiction, num_urls=self.web_search_params.num_urls_to_check_per_jurisdiction, file_loader_kwargs=self.file_loader_kwargs, @@ -1107,8 +1108,8 @@ async def _try_elm_crawl(self): ) out = await 
download_jurisdiction_ordinances_from_website( self.jurisdiction_website, - heuristic=self.extractor.heuristic, - keyword_points=self.extractor.WEBSITE_KEYWORDS, + heuristic=await self.extractor.get_heuristic(), + keyword_points=await self.extractor.get_website_keywords(), file_loader_kwargs=self.file_loader_kwargs_no_ocr, crawl_semaphore=self.crawl_semaphore, pb_jurisdiction_name=self.jurisdiction.full_name, @@ -1128,8 +1129,8 @@ async def _try_compass_crawl(self, scrape_results): docs = ( await download_jurisdiction_ordinances_from_website_compass_crawl( self.jurisdiction_website, - heuristic=self.extractor.heuristic, - keyword_points=self.extractor.WEBSITE_KEYWORDS, + heuristic=await self.extractor.get_heuristic(), + keyword_points=await self.extractor.get_website_keywords(), file_loader_kwargs=self.file_loader_kwargs_no_ocr, already_visited=checked_urls, crawl_semaphore=self.crawl_semaphore, diff --git a/compass/utilities/enums.py b/compass/utilities/enums.py index 4ea9250f0..0f869207d 100644 --- a/compass/utilities/enums.py +++ b/compass/utilities/enums.py @@ -3,6 +3,44 @@ from enum import StrEnum, auto +class CaseInsensitiveEnum(StrEnum): + """A string enum that is case insensitive""" + + def __new__(cls, value): + """Create new enum member""" + + value = value.lower().strip() + obj = str.__new__(cls, value) + obj._value_ = value + return cls._new_post_hook(obj, value) + + def __format__(self, format_spec): + return str.__format__(self._value_, format_spec) + + @classmethod + def _missing_(cls, value): + """Convert value to lowercase before lookup""" + if value is None: + return None + + value = value.lower().strip() + for member in cls: + if member.value == value: + return member + + return None + + @classmethod + def _new_post_hook(cls, obj, value): # noqa: ARG003 + """Hook for post-processing after __new__""" + return obj + + @classmethod + def members_as_str(cls): + """Set of enum members as strings""" + return {member.value for member in cls} + + class LLMUsageCategory(StrEnum): """Enumerate semantic buckets for tracking LLM usage @@ -47,6 +85,8 @@ class LLMUsageCategory(StrEnum): """Usage related to ordinance value extraction tasks""" PERMITTED_USE_VALUE_EXTRACTION = auto() """Usage related to permitted use value extraction tasks""" + PLUGIN_GENERATION = auto() + """Usage related to generating plugin prompts and templates""" class LLMTasks(StrEnum): @@ -137,3 +177,6 @@ class LLMTasks(StrEnum): This task represents the extraction of structured permitted use values. """ + + PLUGIN_GENERATION = LLMUsageCategory.PLUGIN_GENERATION + """Task related to generating plugin prompts and templates""" diff --git a/compass/utilities/io.py b/compass/utilities/io.py index f7a4cd3b7..4bda89d7c 100644 --- a/compass/utilities/io.py +++ b/compass/utilities/io.py @@ -1,58 +1,382 @@ -"""COMPASS I/O utilities""" - -import pprint -import logging - -from elm.web.file_loader import AsyncLocalFileLoader - - -logger = logging.getLogger(__name__) - - -async def load_local_docs(fps, **kwargs): - """Load local documents into `elm` document instances - - Parameters - ---------- - fps : Iterable - Iterable of paths referencing local files to load. - **kwargs - Additional keyword arguments forwarded to - :class:`elm.web.file_loader.AsyncLocalFileLoader` for - configuration such as ``loader``, caching, or parsing options. - - Returns - ------- - list of BaseDocument - Non-empty loaded documents corresponding to the supplied - filepaths. 
Empty results (e.g., unreadable files) are filtered
-        out of the returned list.
-
-    Raises
-    ------
-    elm.exceptions.ELMError
-        Propagated when the underlying loader fails to read one of the
-        provided files and is configured to raise on errors.
-
-    Notes
-    -----
-    Detailed debug information about loaded page counts is emitted via
-    the ``compass.utilities.io`` logger at ``TRACE`` level to assist
-    with troubleshooting ingestion runs.
-    """
-    logger.trace("Loading docs for the following paths:\n%r", fps)
-    logger.trace(
-        "kwargs for AsyncLocalFileLoader:\n%s",
-        pprint.PrettyPrinter().pformat(kwargs),
-    )
-    file_loader = AsyncLocalFileLoader(**kwargs)
-    docs = await file_loader.fetch_all(*fps)
-
-    page_lens = {
-        doc.attrs.get("source_fp", "Unknown"): len(doc.pages) for doc in docs
-    }
-    logger.debug(
-        "Loaded the following number of pages for docs:\n%s",
-        pprint.PrettyPrinter().pformat(page_lens),
-    )
-    return [doc for doc in docs if not doc.empty]
+"""COMPASS I/O utilities
+
+A lot of this is taken directly from NLR's GAPs repo:
+https://github.com/NatLabRockies/gaps
+"""
+
+import json
+import pprint
+import logging
+import contextlib
+import collections
+from pathlib import Path
+from abc import ABC, abstractmethod
+
+import yaml
+import toml
+import pyjson5
+from elm.web.file_loader import AsyncLocalFileLoader
+
+from compass.utilities.enums import CaseInsensitiveEnum
+from compass.exceptions import COMPASSValueError, COMPASSFileNotFoundError
+
+
+logger = logging.getLogger(__name__)
+_CONFIG_HANDLER_REGISTRY = {}
+
+
+class _JSON5Formatter:
+    """Format input JSON5 data with indentation"""
+
+    def __init__(self, data):
+        self.data = data
+
+    def _format_as_json(self):
+        """Format the input data as a string with indentation"""
+        return json.dumps(self.data, indent=4)
+
+
+class Handler(ABC):
+    """ABC for configuration file handler"""
+
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        if isinstance(cls.FILE_EXTENSION, str):
+            _CONFIG_HANDLER_REGISTRY[cls.FILE_EXTENSION] = cls
+        else:
+            for file_extension in cls.FILE_EXTENSION:
+                _CONFIG_HANDLER_REGISTRY[file_extension] = cls
+
+    @classmethod
+    def load(cls, file_name):
+        """Load the file contents"""
+        config_str = Path(file_name).read_text(encoding="utf-8")
+        return cls.loads(config_str)
+
+    @classmethod
+    def write(cls, file_name, data):
+        """Write the data to a file"""
+        with Path(file_name).open("w", encoding="utf-8") as config_file:
+            cls.dump(data, config_file)
+
+    @classmethod
+    @abstractmethod
+    def dump(cls, config, stream):
+        """Write the config to a stream (file)"""
+
+    @classmethod
+    @abstractmethod
+    def dumps(cls, config):
+        """Convert the config to a string"""
+
+    @classmethod
+    @abstractmethod
+    def loads(cls, config_str):
+        """Parse the string into a config dictionary"""
+
+    @property
+    @abstractmethod
+    def FILE_EXTENSION(self):  # noqa: N802
+        """str or tuple of str: File extension(s) this handler supports
+        (also used as the ``ConfigType`` enum value)"""
+
+
+class JSONHandler(Handler):
+    """JSON config file handler"""
+
+    FILE_EXTENSION = "json"
+    """JSON file extension"""
+
+    @classmethod
+    def dump(cls, config, stream):
+        """Write the config to a stream (JSON file)"""
+        return json.dump(config, stream, indent=4)
+
+    @classmethod
+    def dumps(cls, config):
+        """Convert the config to a JSON string"""
+        return json.dumps(config, indent=4)
+
+    @classmethod
+    def loads(cls, config_str):
+        """Parse the JSON string into a config dictionary"""
+        return json.loads(config_str)
+
+
+class JSON5Handler(Handler):
+    """JSON5 config file handler"""
+
+    FILE_EXTENSION = "json5"
+    """JSON5 file 
extension""" + + @classmethod + def dump(cls, config, stream): + """Write the config to a stream (JSON5 file)""" + return pyjson5.encode_io( + _JSON5Formatter(config), + stream, + supply_bytes=False, + tojson="_format_as_json", + ) + + @classmethod + def dumps(cls, config): + """Convert the config to a JSON5 string""" + return pyjson5.encode( + _JSON5Formatter(config), + tojson="_format_as_json", + ) + + @classmethod + def loads(cls, config_str): + """Parse the JSON5 string into a config dictionary""" + return pyjson5.decode(config_str, maxdepth=-1) + + +class YAMLHandler(Handler): + """YAML config file handler""" + + FILE_EXTENSION = "yaml", "yml" + """YAML file extensions""" + + @classmethod + def dump(cls, config, stream): + """Write the config to a stream (YAML file)""" + return yaml.safe_dump(config, stream, indent=2, sort_keys=False) + + @classmethod + def dumps(cls, config): + """Convert the config to a YAML string""" + return yaml.safe_dump(config, indent=2, sort_keys=False) + + @classmethod + def loads(cls, config_str): + """Parse the YAML string into a config dictionary""" + return yaml.safe_load(config_str) + + +class TOMLHandler(Handler): + """TOML config file handler""" + + FILE_EXTENSION = "toml" + """TOML file extension""" + + @classmethod + def dump(cls, config, stream): + """Write the config to a stream (TOML file)""" + return toml.dump(config, stream) + + @classmethod + def dumps(cls, config): + """Convert the config to a TOML string""" + return toml.dumps(config) + + @classmethod + def loads(cls, config_str): + """Parse the TOML string into a config dictionary""" + return toml.loads(config_str) + + +class _ConfigType(CaseInsensitiveEnum): + """Base config type enum class only meant to be initialized once""" + + @classmethod + def _new_post_hook(cls, obj, value): + """Hook for post-processing after __new__; adds methods""" + obj.dump = _CONFIG_HANDLER_REGISTRY[value].dump + obj.dumps = _CONFIG_HANDLER_REGISTRY[value].dumps + obj.load = _CONFIG_HANDLER_REGISTRY[value].load + obj.loads = _CONFIG_HANDLER_REGISTRY[value].loads + obj.write = _CONFIG_HANDLER_REGISTRY[value].write + obj.__doc__ = f"{value.upper()} config file handler" + return obj + + +ConfigType = _ConfigType( + "ConfigType", + { + config_type.upper(): config_type + for config_type in _CONFIG_HANDLER_REGISTRY + }, +) +"""An enumeration of the parseable config types""" + + +def load_config(config_filepath, resolve_paths=True): + """Load a config file + + Parameters + ---------- + config_filepath : path-like + Path to config file. + resolve_paths : bool, optional + Option to (recursively) resolve file-paths in the dictionary + w.r.t the config file directory. + By default, ``True``. + + Returns + ------- + dict + Dictionary containing configuration parameters. + + Raises + ------ + COMPASSValueError + If input `config_filepath` has no file ending. + """ + config_filepath = Path(config_filepath).expanduser().resolve() + if "." not in config_filepath.name: + msg = ( + f"Configuration file must have a file-ending. Got: " + f"{config_filepath.name}" + ) + raise COMPASSValueError(msg) + + if not config_filepath.exists(): + msg = f"Config file does not exist: {config_filepath}" + raise COMPASSFileNotFoundError(msg) + + try: + config_type = ConfigType(config_filepath.suffix[1:]) + except ValueError as err: + msg = ( + f"Got unknown config file extension: " + f"{config_filepath.suffix!r}. 
Supported extensions are: "
+            f"{', '.join(sorted(ct.value for ct in ConfigType))}"
+        )
+        raise COMPASSValueError(msg) from err
+
+    config = config_type.load(config_filepath)
+    if resolve_paths:
+        return resolve_all_paths(config, config_filepath.parent)
+
+    return config
+
+
+def resolve_all_paths(container, base_dir):
+    """Perform a deep string replacement and path resolution in `container`
+
+    Parameters
+    ----------
+    container : dict or list
+        Container like a dictionary or list that may (or may not)
+        contain relative paths to resolve.
+    base_dir : path-like
+        Base path to directory from which to resolve path string
+        (typically current directory).
+
+    Returns
+    -------
+    dict or list
+        Input container with updated strings.
+    """
+
+    if isinstance(container, str):
+        # `resolve_path` is safe to call on any string,
+        # even if it is not a path
+        container = resolve_path(container, Path(base_dir))
+
+    elif isinstance(container, collections.abc.Mapping):
+        container = {
+            key: resolve_all_paths(val, Path(base_dir))
+            for key, val in container.items()
+        }
+
+    elif isinstance(container, collections.abc.Sequence):
+        container = [
+            resolve_all_paths(item, Path(base_dir)) for item in container
+        ]
+
+    return container
+
+
+def resolve_path(path, base_dir):
+    """Resolve a file path represented by the input string.
+
+    This function resolves the input string if it resembles a path.
+    Specifically, the string will be resolved if it starts with
+    "``./``" or "``..``", or if it contains either "``./``" or
+    "``..``" somewhere in the string body. Otherwise, the string
+    is returned unchanged, so this function *is* safe to call on any
+    string, even ones that do not resemble a path.
+    This function delegates the "resolving" logic to
+    :meth:`pathlib.Path.resolve`. This means the path is made
+    absolute, symlinks are resolved, and "``..``" components are
+    eliminated. If the ``path`` input starts with "``./``" or
+    "``..``", it is assumed to be w.r.t. the config directory, *not*
+    the run directory.
+
+    Parameters
+    ----------
+    path : str
+        Input file path.
+    base_dir : path-like
+        Base path to directory from which to resolve path string
+        (typically current directory).
+
+    Returns
+    -------
+    str
+        The resolved path.
+    """
+    base_dir = Path(base_dir)
+
+    if path.startswith("./"):
+        path = base_dir / Path(path[2:])
+    elif path.startswith(".."):
+        path = base_dir / Path(path)
+    elif "./" in path:  # this covers both './' and '../'
+        path = Path(path)
+
+    with contextlib.suppress(AttributeError):  # `path` is still a `str`
+        path = path.expanduser().resolve().as_posix()
+
+    return path
+
+
+async def load_local_docs(fps, **kwargs):
+    """Load local documents into `elm` document instances
+
+    Parameters
+    ----------
+    fps : Iterable
+        Iterable of paths referencing local files to load.
+    **kwargs
+        Additional keyword arguments forwarded to
+        :class:`elm.web.file_loader.AsyncLocalFileLoader` for
+        configuration such as ``loader``, caching, or parsing options.
+
+    Returns
+    -------
+    list of BaseDocument
+        Non-empty loaded documents corresponding to the supplied
+        filepaths. Empty results (e.g., unreadable files) are filtered
+        out of the returned list.
+
+    Raises
+    ------
+    elm.exceptions.ELMError
+        Propagated when the underlying loader fails to read one of the
+        provided files and is configured to raise on errors.
+
+    Notes
+    -----
+    Detailed debug information about loaded page counts is emitted via
+    the ``compass.utilities.io`` logger at ``TRACE`` level to assist
+    with troubleshooting ingestion runs.
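+
+    Examples
+    --------
+    A minimal sketch from within an async context (the file path here
+    is purely illustrative)::
+
+        docs = await load_local_docs(["/tmp/ordinance.pdf"])
+        sources = [doc.attrs.get("source_fp") for doc in docs]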
+ """ + logger.trace("Loading docs for the following paths:\n%r", fps) + logger.trace( + "kwargs for AsyncLocalFileLoader:\n%s", + pprint.PrettyPrinter().pformat(kwargs), + ) + file_loader = AsyncLocalFileLoader(**kwargs) + docs = await file_loader.fetch_all(*fps) + + page_lens = { + doc.attrs.get("source_fp", "Unknown"): len(doc.pages) for doc in docs + } + logger.debug( + "Loaded the following number of pages for docs:\n%s", + pprint.PrettyPrinter().pformat(page_lens), + ) + return [doc for doc in docs if not doc.empty] diff --git a/compass/utilities/parsing.py b/compass/utilities/parsing.py index f87d74c6f..11261c318 100644 --- a/compass/utilities/parsing.py +++ b/compass/utilities/parsing.py @@ -4,10 +4,8 @@ import logging from pathlib import Path -import pyjson5 import numpy as np -from compass.exceptions import COMPASSValueError logger = logging.getLogger(__name__) _ORD_CHECK_COLS = ["value", "summary"] @@ -196,51 +194,6 @@ def ordinances_bool_index(data): return found_features > 0 -def load_config(config_fp): - """Load configuration data from JSON or JSON5 sources - - Parameters - ---------- - config_fp : path-like - Path to config file to open and load. - - Returns - ------- - dict - Parsed configuration object. - - Raises - ------ - COMPASSValueError - If the file path does not exist or the extension is not - ``.json`` or ``.json5``. - - Notes - ----- - JSON5 enables comments and trailing commas, among other - quality-of-life improvements over vanilla JSON. - """ - config_fp = Path(config_fp) - - if not config_fp.exists(): - msg = f"Config file does not exist: {config_fp}" - raise COMPASSValueError(msg) - - if config_fp.suffix == ".json5": - with config_fp.open(encoding="utf-8") as fh: - return pyjson5.decode_io(fh) - - if config_fp.suffix == ".json": - with config_fp.open(encoding="utf-8") as fh: - return json.load(fh) - - msg = ( - "Got unknown config file extension: " - f"{config_fp.suffix}. Supported extensions are .json5 and .json." - ) - raise COMPASSValueError(msg) - - def convert_paths_to_strings(obj): """[NOT PUBLIC API] Convert all Path instances to strings""" logger.trace("Converting paths to strings in object: %s", obj) diff --git a/compass/validation/content.py b/compass/validation/content.py index c829c805f..34c31482e 100644 --- a/compass/validation/content.py +++ b/compass/validation/content.py @@ -9,7 +9,7 @@ from abc import ABC, abstractmethod from warnings import warn -from compass.llm.calling import ChatLLMCaller, StructuredLLMCaller +from compass.llm.calling import ChatLLMCaller, JSONFromTextLLMCaller from compass.validation.graphs import setup_graph_correct_document_type from compass.common import setup_async_decision_tree, run_async_tree from compass.utilities.enums import LLMUsageCategory @@ -291,7 +291,7 @@ async def check_chunk(self, chunk_parser, ind): raise NotImplementedError -class LegalTextValidator(TextKindValidator, StructuredLLMCaller): +class LegalTextValidator(TextKindValidator, JSONFromTextLLMCaller): """Parse chunks to determine if they contain legal text""" SYSTEM_MESSAGE = ( @@ -318,7 +318,7 @@ def __init__( check for the whole document to be considered legal text. By default, ``0.8``. *args, **kwargs - Parameters to pass to the StructuredLLMCaller initializer. + Parameters to pass to the JSONFromTextLLMCaller initializer. 
""" super().__init__(*args, **kwargs) self.tech = tech diff --git a/docs/source/conf.py b/docs/source/conf.py index 0b2b0c5f8..006802b3d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -137,6 +137,7 @@ "py:obj", "compass.extraction.wind.ordinance.WindPermittedUseDistrictsTextExtractor.OUT_LABEL", ), + ("py:class", "compass.utilities.io._ConfigType"), ] # -- Options for HTML output ------------------------------------------------- @@ -388,7 +389,8 @@ def setup(app): "OpenAIService": ":class:`~compass.services.openai.OpenAIService`", "ParseChunksWithMemory": ":class:`~compass.validation.content.ParseChunksWithMemory`", "Service": ":class:`~compass.services.base.Service`", - "StructuredLLMCaller": ":class:`~compass.llm.calling.StructuredLLMCaller`", + "JSONFromTextLLMCaller": ":class:`~compass.llm.calling.JSONFromTextLLMCaller`", + "SchemaOutputLLMCaller": ":class:`~compass.llm.calling.SchemaOutputLLMCaller`", "TimeBoundedUsageTracker": ":class:`~compass.services.usage.TimeBoundedUsageTracker`", "UsageTracker": ":class:`~compass.services.usage.UsageTracker`", "WindOrdinanceTextExtractor": ":class:`~compass.extraction.wind.ordinance.WindOrdinanceTextExtractor`", diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 1033daf82..c90e05ee5 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -9,4 +9,5 @@ get started with ``COMPASS``: :maxdepth: 1 execution_basics/README - parse_existing_docs/README + one_shot_schema_extraction/README + parse_existing_docs/code/README diff --git a/docs/source/examples/one_shot_schema_extraction b/docs/source/examples/one_shot_schema_extraction new file mode 120000 index 000000000..555ed8ef0 --- /dev/null +++ b/docs/source/examples/one_shot_schema_extraction @@ -0,0 +1 @@ +../../../examples/one_shot_schema_extraction \ No newline at end of file diff --git a/examples/README.md b/examples/README.md index afceeac5b..93ce7fcb6 100644 --- a/examples/README.md +++ b/examples/README.md @@ -6,6 +6,10 @@ This directory contains several examples/tutorials to help you get started with a ``COMPASS`` run, starting with basic inputs and working your way up to a full configuration file. - [`Quickstart Demo`](./openai_solar_demo): Small demo designed to get you running ``COMPASS`` as fast as possible. Requires a personal OpenAI API key. +- [`Run Texas Water Rights Extraction`](./water_rights_demo): An example setup for running + COMPASS water rights extraction for a groundwater conservation district in Texas. +- [`One-Shot Extraction`](./one_shot_schema_extraction): A tutorial and some example configurations + for setting up a custom one-shot extraction plugin within COMPASS. - [`Parse a Local PDF File`](./parse_existing_docs): A simple example of parsing a local PDF file for structured solar ordinance values. diff --git a/examples/one_shot_schema_extraction/README.rst b/examples/one_shot_schema_extraction/README.rst new file mode 100644 index 000000000..a89770c4c --- /dev/null +++ b/examples/one_shot_schema_extraction/README.rst @@ -0,0 +1,185 @@ +******************* +One-Shot Extraction +******************* + +This example shows how to author a one-shot extraction schema and run it +through COMPASS. The one-shot plugin uses your schema to extract structured +data in a single LLM call. + + +Prerequisites +============= +Be sure to go over the +`COMPASS Execution Basics `_ +to understand how to set up a run environment and model run configuration. 
+Once your one-shot schema is established, you will be executing the data +extraction pipeline in the same manner as described in that example. + + +Create Your Schema +================== +To start off, you will need to create a one-shot JSON schema that describes the +extraction output shape and embeds the extraction logic in schema field +descriptions. The easiest way to do this is by copying +`wind_schema.json `_ +and adjusting it for your domain. + +At a minimum, the schema must return an object with an ``outputs`` array, where +each item is an extraction record with the required fields shown below: + +.. code-block:: json + + { + "type": "object", + "required": ["outputs"], + "properties": { + "outputs": { + "type": "array", + "items": { + "type": "object", + "required": [ + "feature", + "value", + "units", + "section", + "summary" + ] + } + } + } + } + +The main field here is ``feature``, which is the ID of the extracted feature +(e.g., a setback distance or a maximum allowed height). The other fields +(``value``, ``units``, ``section``, and ``summary``) are important for keeping +the output consistent across various extractions and allowing a central database +to keep track of the scraped data. + +Once the schema for the ``outputs`` array is finalized, you can add additional +keys starting with a ``$`` to encode instructions, examples, and edge case +handling logic that the model can refer to when parsing the text. These extra +keys are not required, and they are ignored for the purposes of creating the +structure of the outputs themselves, but they often provide crucial context +that improves accuracy. + +For example, the +`wind extraction schema `_ +contains a ``$definitions`` key with detailed instructions on how to interpret +setback multipliers and how to choose the most restrictive value when multiple +setback distances are given in the text. This is reminiscent of the "decision logic" +that you would normally encode in a decision tree for a traditional plugin, +but here the logic is embedded in the schema itself and interpreted by the model +at extraction time. This approach allows you to encode complex edge case handling +logic without having to write any code, and it also allows you to easily update +the logic by simply editing the schema. + +The schema also includes a ``$examples`` key with example extractions that the model +can refer to when deciding how to parse the text. You can be as detailed as you want +in these instructions, and you can experiment with different outputs to tune the +model's understanding of the task and the desired output format. + +Finally, the same schema includes a ``$instructions`` key with general instructions +for the model to follow when parsing the text. This is a good place to reinforce the +importance of following the schema and to provide any additional context that might be +helpful for the model to know when performing the extraction. + +You can add or remove as many of these extra keys as you want, and you can experiment with +different ways of encoding the instructions and examples to see what works best for your +particular use case. The main thing to keep in mind is that the core structure of the +output must be defined by the ``outputs`` array in the schema, and any additional context +or instructions should be provided through these extra keys. + +.. 
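+For concreteness, here is an illustrative record (the values are invented for
+this example, not taken from any real ordinance) that a model might emit for a
+single setback requirement under a schema like the one above:
+
+.. code-block:: json
+
+    {
+        "outputs": [
+            {
+                "feature": "property line (non-participating)",
+                "value": 1.1,
+                "units": "tip-height-multiplier",
+                "section": "Section 4.2 - Setbacks",
+                "summary": "Turbines shall be set back 1.1 times the total system height from all non-participating property lines."
+            }
+        ]
+    }
+
+Each record like this becomes one row in the final output table, which is why
+consistent feature IDs and units matter so much for downstream aggregation.
+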
+.. NOTE:: You can compare the `one-shot wind schema `_
+   to the existing decision trees in the `wind energy plugin `_
+   to get a feel for the translation of decision tree logic to schema descriptions.
+
+
+.. Important Schema Components
+.. ---------------------------
+.. **Feature Catalog**
+.. Define the allowed feature IDs (often as an enum) under
+.. ``outputs.items.properties.feature``. These IDs are what the parser uses to
+.. create the final output rows.
+
+.. **Field Requirements**
+.. Enforce ``required`` fields and ``additionalProperties: false`` to keep the
+.. output consistent. The core fields are ``feature``, ``value``, ``units``,
+.. ``section``, and ``summary``.
+
+.. **Decision Logic in Descriptions**
+.. Use field descriptions and ``$definitions`` to encode extraction rules and
+.. edge cases (e.g., how to choose the most restrictive value or how to interpret
+.. setback multipliers).
+
+.. **Instructions and Examples**
+.. Use ``$instructions`` and ``$examples`` to reinforce the desired output and to
+.. anchor the model on your conventions.
+
+
+Build a Plugin Config
+=====================
+Once you have defined your schema, the hard work is done! The next step is to
+build a one-shot plugin config that tells COMPASS how to use the schema and
+how to retrieve and filter documents. As with all configs in COMPASS, you may
+define your plugin configuration via JSON, JSON5, YAML, or TOML.
+
+At a minimum, you must supply a ``schema`` key (either a dictionary containing the
+full schema or a path to a schema file):
+
+.. literalinclude:: plugin_config_minimal.json
+   :language: json
+
+
+If you want a bit more control over the extraction pipeline, you may
+specify several additional keys that let you customize query templates, website
+filters, and text extraction prompts:
+
+
+.. literalinclude:: plugin_config_simple.json5
+   :language: json5
+
+The key options are listed below:
+
+- ``data_type_short_desc``: Short label used in prompts (e.g., ``wind energy ordinance``).
+- ``query_templates``: Search queries with a ``{jurisdiction}`` placeholder.
+- ``website_keywords``: Keyword weights for document search prioritization.
+- ``collection_prompts``: Prompt list for chunk filtering, or ``true`` to auto-generate.
+- ``text_extraction_prompts``: Prompt list for text consolidation, or ``true`` to auto-generate.
+- ``cache_query_templates``: Cache generated query templates and keywords. By default, ``true``.
+- ``extraction_system_prompt``: Optional system prompt override for extraction.
+
+
+See `this documentation `_
+for further details.
+
+If you want full control over all of the options above, you can specify them
+directly in the config, as shown below. Supplying custom prompts for the
+collection and text extraction steps gives you the finest-grained control over
+the pipeline and lets you further tune the model's behavior.
+
+
+.. literalinclude:: plugin_config.yaml
+   :language: yaml
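+
+
+Once your plugin configuration is written, it can be handy to sanity-check it
+before launching a full run. The snippet below is a minimal sketch (it assumes
+the ``plugin_config.yaml`` file shown above and relies on the ``load_config``
+helper from ``compass.utilities.io``); it simply verifies that every query
+template retains the required ``{jurisdiction}`` placeholder:
+
+.. code-block:: python
+
+    from compass.utilities.io import load_config
+
+    # JSON, JSON5, YAML, and TOML plugin configs are all supported
+    config = load_config("plugin_config.yaml")
+
+    # every query template must keep the literal "{jurisdiction}" placeholder
+    for template in config.get("query_templates", []):
+        assert "{jurisdiction}" in template, f"Bad template: {template!r}"
+
+
+Execution
+=========
+Once both the schema and plugin configuration are set up, you can run your newly created
+one-shot plugin alongside the standard COMPASS pipeline using the ``--plugin`` flag.
+The main run config still controls core pipeline settings and must include a ``tech``
+value that matches your target technology.
+
+.. code-block:: shell
+
+    compass process -c config.json5 \
+        -p examples/one_shot_schema_extraction/plugin_config.yaml
+
+If you are using ``pixi``:
+
+.. 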
code-block:: shell + + pixi run compass process -c config.json5 \ + -p examples/one_shot_schema_extraction/plugin_config.yaml + +Add ``-v`` (or ``-vv``) if you want log output in the terminal. +See the `Execution Basics example `_ +for more details on running COMPASS pipelines. diff --git a/examples/one_shot_schema_extraction/plugin_config.yaml b/examples/one_shot_schema_extraction/plugin_config.yaml new file mode 100644 index 000000000..4da8700c5 --- /dev/null +++ b/examples/one_shot_schema_extraction/plugin_config.yaml @@ -0,0 +1,116 @@ +schema: ./wind_schema.json + +data_type_short_desc: wind energy ordinance + +query_templates: + - "filetype:pdf {jurisdiction} wind energy conversion system ordinances" + - "wind energy conversion system ordinances {jurisdiction}" + - "{jurisdiction} wind WECS ordinance" + - "Where can I find the legal text for commercial wind energy conversion system zoning ordinances in {jurisdiction}?" + - "What is the specific legal information regarding zoning ordinances for commercial wind energy conversion systems in {jurisdiction}?" + +website_keywords: + pdf: 92160 + wecs: 46080 + wind: 23040 + zoning: 11520 + ordinance: 5760 + renewable%20energy: 1440 + renewable+energy: 1440 + renewable energy: 1440 + planning: 720 + plan: 360 + government: 180 + code: 60 + area: 60 + land%20development: 15 + land+development: 15 + land development: 15 + land: 3 + environment: 3 + energy: 3 + renewable: 3 + municipal: 1 + department: 1 + +collection_prompts: + - key: contains_ord_info + label: contains ordinance info + prompt: "You extract structured data from text. Return your answer in JSON format (not markdown). Your JSON file must include exactly two keys. The first key is 'wind_reqs', which is a string that summarizes all zoning, siting, setback, system design, and operational requirements/restrictions that are explicitly enacted in the text for a wind energy system (or wind turbine/tower) for a given jurisdiction. Note that wind energy bans are an important restriction to track. Include any **closely related provisions** if they clearly pertain to the **development, operation, modification, or removal** of wind energy systems (or wind turbines/towers). All restrictions should be enforceable - ignore any text that only provides a legal definition of the regulation. If the text does not specify any concrete zoning, siting, setback, system design, or operational requirements/restrictions for a wind energy system, set this key to `null`. The last key is '{key}', which is a boolean that is set to True if the text excerpt explicitly details zoning, siting, setback, system design, or operational requirements/restrictions for a wind energy system (or wind turbine/tower) and False otherwise." + + - key: x + label: for utility-scale WECS + prompt: "You are a legal scholar that reads ordinance text and determines whether any of it applies to zoning, siting, setback, system design, or operational requirements/restrictions for **large wind energy systems**. Large wind energy systems (WES) may also be referred to as wind turbines, wind energy conversion systems (WECS), wind energy facilities (WEF), wind energy turbines (WET), large wind energy turbines (LWET), utility-scale wind energy turbines (UWET), commercial wind energy conversion systems (CWECS), alternate energy systems (AES), commercial energy production systems (CEPCS), or similar. 
Your client is a commercial wind developer that does not care about ordinances related to private, residential, micro, small, or medium sized wind energy systems. Ignore any text related to such systems. Return your answer as a dictionary in JSON format (not markdown). Your JSON file must include exactly two keys. The first key is 'summary' which contains a string that lists all of the types of wind energy systems the text applies to (if any). The second key is '{key}', which is a boolean that is set to True if any part of the text excerpt details zoning, siting, setback, system design, or operational requirements/restrictions for the **large wind energy conversion systems** (or similar) that the client is interested in and False otherwise." + +text_extraction_prompts: + - key: wind_energy_systems_text + out_fn: "{jurisdiction} Wind Ordinance.txt" + prompt: |- + # CONTEXT # + We want to reduce the provided excerpt to only contain information about **wind energy systems**. The extracted text will be used for structured data extraction, so it must be both **comprehensive** (retaining all relevant details) and **focused** (excluding unrelated content), with **zero rewriting or paraphrasing**. Ensure that all retained information is **directly applicable to wind energy systems** while preserving full context and accuracy. + + # OBJECTIVE # + Extract all text **pertaining to wind energy systems** from the provided excerpt. + + # RESPONSE # + Follow these guidelines carefully: + + 1. ## Scope of Extraction ##: + - Include all text that pertains to **wind energy systems**. + - Explicitly include any text related to **bans or prohibitions** on wind energy systems. + - Explicitly include any text related to the adoption or enactment date of the ordinance (if any). + + 2. ## Exclusions ##: + - Do **not** include text that does not pertain to wind energy systems. + + 3. ## Formatting & Structure ##: + - **Preserve _all_ section titles, headers, and numberings** for reference. + - **Maintain the original wording, formatting, and structure** to ensure accuracy. + + 4. ## Output Handling ##: + - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. + - Do not add, explain, reword, or summarize anything. + - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: 'No relevant text.' + + - key: cleaned_text_for_extraction + out_fn: "{jurisdiction} Utility Scale Wind Ordinance.txt" + prompt: |- + # CONTEXT # + We want to reduce the provided excerpt to only contain information about **large wind energy systems**. The extracted text will be used for structured data extraction, so it must be both **comprehensive** (retaining all relevant details) and **focused** (excluding unrelated content), with **zero rewriting or paraphrasing**. Ensure that all retained information is **directly applicable** to large wind energy systems while preserving full context and accuracy. + + # OBJECTIVE # + Extract all text **pertaining to large wind energy systems** from the provided excerpt. + + # RESPONSE # + Follow these guidelines carefully: + + 1. 
## Scope of Extraction ##: + - Include all text that pertains to **large wind energy systems**, even if they are referred to by different names such as: Wind turbines, wind energy conversion systems (wecs), wind energy facilities (wef), wind energy turbines (wet), large wind energy turbines (lwet), utility-scale wind energy turbines (uwet), commercial wind energy conversion systems (cwecs), alternate energy systems (aes), commercial energy production systems (cepcs), or similar + - Explicitly include any text related to **bans or prohibitions** on large wind energy systems. + - Explicitly include any text related to the adoption or enactment date of the ordinance (if any). + - **Retain all relevant technical, design, operational, safety, environmental, and infrastructure-related provisions** that apply to the topic, such as (but not limited to): + - Compliance with legal or regulatory standards. + - Site, structural, or design specifications. + - Environmental impact considerations. + - Safety and risk mitigation measures. + - Infrastructure, implementation, operation, and maintenance details. + - All other **closely related provisions**. + + 2. ## Exclusions ##: + - Do **not** include text that explicitly applies **only** to private, residential, micro, small, or medium sized wind energy systems. + - Do **not** include text that does not pertain at all to wind energy systems. + + 3. ## Formatting & Structure ##: + - **Preserve _all_ section titles, headers, and numberings** for reference. + - **Maintain the original wording, formatting, and structure** to ensure accuracy. + + 4. ## Output Handling ##: + - This is a strict extraction task — act like a text filter, **not** a summarizer or writer. + - Do not add, explain, reword, or summarize anything. + - The output must be a **copy-paste** of the original excerpt. **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: 'No relevant text.' + +extraction_system_prompt: "You are a legal scholar extracting structured data from wind energy ordinances. Follow all instructions in the schema descriptions carefully. Only extract requirements for large, commercial, utility-scale wind energy systems." diff --git a/examples/one_shot_schema_extraction/plugin_config_minimal.json b/examples/one_shot_schema_extraction/plugin_config_minimal.json new file mode 100644 index 000000000..ff8babbae --- /dev/null +++ b/examples/one_shot_schema_extraction/plugin_config_minimal.json @@ -0,0 +1,3 @@ +{ + "schema": "./wind_schema.json" +} \ No newline at end of file diff --git a/examples/one_shot_schema_extraction/plugin_config_simple.json5 b/examples/one_shot_schema_extraction/plugin_config_simple.json5 new file mode 100644 index 000000000..92b4e9260 --- /dev/null +++ b/examples/one_shot_schema_extraction/plugin_config_simple.json5 @@ -0,0 +1,15 @@ +{ + // Always required for one-shot schema extraction plugins + "schema": "./wind_schema.json", + + // The default value for ``cache_query_templates`` is ``true``, + // but we include it here anyway for completeness and to + // demonstrate that it can be set to ``false`` if desired. + "cache_query_templates": true, + + // By setting this option to ``true``, we indicate that we would + // like a text collection (filter) step, but would like to simply + // use the schema to guide the filtering (instead of providing + // custom prompts). 
+ "collection_prompts": true, +} \ No newline at end of file diff --git a/examples/one_shot_schema_extraction/wind_schema.json b/examples/one_shot_schema_extraction/wind_schema.json new file mode 100644 index 000000000..636cfe02f --- /dev/null +++ b/examples/one_shot_schema_extraction/wind_schema.json @@ -0,0 +1,278 @@ +{ + "title": "Wind Ordinance Extraction Schema", + "description": "Single-shot structured extraction schema for wind energy ordinances. This schema encodes the complete multi-step decision tree logic from COMPASS into comprehensive field descriptions that guide LLMs to extract all ordinance features in one call. The output is structured as a flat array of extraction objects, where each object represents one row in the final CSV/DataFrame output.", + "version": "1.0.0", + "type": "object", + "required": ["outputs"], + "additionalProperties": false, + "properties": { + "outputs": { + "type": "array", + "items": { + "type": "object", + "required": ["feature", "value", "units", "section", "summary"], + "additionalProperties": false, + "properties": { + "feature": { + "type": "string", + "description": "The ordinance feature being extracted. Must be one of the enumerated feature IDs.", + "enum": [ + "structures (participating)", + "structures (non-participating)", + "property line (participating)", + "property line (non-participating)", + "roads", + "railroads", + "transmission", + "water", + "public conservation lands", + "other wecs", + "noise", + "maximum turbine height", + "maximum project size", + "minimum lot size", + "maximum lot size", + "shadow flicker", + "tower density", + "blade clearance", + "color", + "decommissioning", + "lighting", + "prohibitions", + "visual impact", + "repowering", + "climbing prevention", + "signage", + "soil", + "primary use districts", + "special use districts", + "accessory use districts", + "prohibited use districts" + ] + }, + "value": { + "description": "The extracted ordinance value. Type and format depend on the feature type. For setbacks and numerical restrictions: either a numerical distance value OR a multiplier value. For qualitative restrictions: typically null (summary contains the text). For districts: an array of district names. For prohibitions: may be 'ENR' (Explicitly Not Regulated) if the jurisdiction explicitly does not regulate wind energy systems. Can be null if no value found.", + "anyOf": [ + {"type": "number"}, + {"type": "string"}, + {"type": "array", "items": {"type": "string"}, "additionalProperties": false}, + {"type": "null"} + ] + }, + "units": { + "type": ["string", "null"], + "description": "Units for the extracted value. For distance setbacks: 'feet' or 'meters'. For multiplier-based setbacks: one of 'tip-height-multiplier', 'hub-height-multiplier', or 'rotor-diameter-multiplier'. For noise: typically 'dBA'. For shadow flicker: typically 'hr/year'. For lot sizes: area units like 'acres' or 'square feet'. Can be null if no value or if value is qualitative text." + }, + "section": { + "type": ["string", "null"], + "description": "The section title or number from the ordinance where this requirement is found (e.g., 'SECTION 1308 – Performance / Construction Standards'). Include numerical labels if provided. Null if no section identifier is available." + }, + "summary": { + "type": ["string", "null"], + "description": "A short summary of the relevant ordinance requirement using direct text excerpts and quotes as much as possible. 
If multiple options exist and a selection was made, list all other options and their conditions in the summary. For qualitative restrictions, this is the primary output field containing the full extracted text. Can be null if no requirement found." + } + } + } + } + }, + "$definitions": { + "system_size_context": { + "description": "CRITICAL CONTEXT: Only extract ordinances for systems that would typically be defined as large, commercial, or utility-scale wind energy systems based on the text itself — for example, systems intended for offsite electricity generation or sale, or those above thresholds such as height, rotor diameter, or rated capacity (often 1MW+). Do NOT consider any text that applies **only** to smaller or clearly non-commercial systems (e.g., 'small', 'micro', 'private', 'personal', 'onsite', residential systems) or to meteorological towers." + }, + "setback_features": { + "description": "Setback features with their aliases and extraction guidance", + "properties": { + "structures (participating)": { + "description": "Setback requirements from occupied dwellings, buildings, structures, or residences where the property owner is a **participating** owner. Participating owners may be able to sign a waiver or enter an agreement to reduce or completely eliminate the setback requirement. ONLY extract if the ordinance explicitly distinguishes between participating and non-participating owners AND specifies a numerical value for participating owners. If participating owners can completely waive or reduce by an unspecified amount the setback requirements, leave value as null. If the ordinance specifies that participating owners must abide by the same setbacks as non-participating owners, use the same value for both. VALUE: For distance-based setbacks, provide numerical distance. For multiplier-based setbacks (e.g., '1.5 times the total height'), treat fall zone as 1x height multiplier. If multiple multiplier values exist, select the largest. UNITS: Use 'feet' or 'meters' for distance, or 'tip-height-multiplier', 'hub-height-multiplier', or 'rotor-diameter-multiplier' for multiplier-based. Default to 'tip-height-multiplier' unless text explicitly states hub height or rotor diameter. If text includes an adder value (e.g., 'multiplier * height + 100 feet'), convert adder to feet and note in summary. If text includes 'the greater of' or minimum threshold clauses, note the minimum distance in summary. If text includes 'the lesser of' or maximum limit clauses, note the maximum distance in summary. IGNORE: Do not respond based on setbacks from property lines, lot lines, facility perimeters, parcels, subdivisions, roads, railroads, transmission lines, wetlands, or public conservation lands. CLARIFICATION: Dwelling units and occupied buildings **are not equivalent** to property lines unless the text **explicitly** makes that connection. Only focus on large-scale systems as defined in system_size_context." + }, + "structures (non-participating)": { + "description": "Setback requirements from occupied dwellings, buildings, structures, or residences where the property owner is a **non-participating** owner (i.e., has not signed a waiver or agreement). ONLY extract if the ordinance explicitly distinguishes between participating and non-participating owners AND specifies a numerical value for non-participating owners. If no distinction is made, use this feature for the general structure setback. VALUE & UNITS: Same rules as 'structures (participating)'. 
IGNORE & CLARIFICATIONS: Same as 'structures (participating)'." + }, + "property line (participating)": { + "description": "Setback requirements from property lines, lot lines, facility perimeters, parcels, or subdivisions where the property owner is a **participating** owner. Participating owners may be able to sign a waiver or enter an agreement to reduce or completely eliminate the setback requirement. ONLY extract if the ordinance explicitly distinguishes between participating and non-participating owners AND specifies a numerical value for participating property owners. If participating property owners can completely waive the setback, leave value as null. VALUE & UNITS: Same rules as 'structures (participating)'. IGNORE: Do not respond based on setbacks from structures, roads, railroads, transmission lines, wetlands, or public conservation lands. CLARIFICATION: Dwelling units, structures, occupied buildings, and residences **are not equivalent** to property lines or parcel boundaries unless the text **explicitly** makes that connection. Only focus on large-scale systems." + }, + "property line (non-participating)": { + "description": "Setback requirements from property lines, lot lines, facility perimeters, parcels, or subdivisions where the property owner is a **non-participating** owner. ONLY extract if the ordinance explicitly distinguishes between participating and non-participating owners. If no distinction is made, use this feature for the general property line setback. VALUE & UNITS: Same rules as 'property line (participating)'. IGNORE & CLARIFICATIONS: Same as 'property line (participating)'." + }, + "roads": { + "description": "Setback requirements from roads. Roads may also be labeled as rights-of-way. Before extracting, verify this is truly a road setback and not better classified as a property line setback. VALUE & UNITS: Same rules as 'structures (participating)'. IGNORE: Do not respond based on setbacks from structures, property lines, railroads, transmission lines, wetlands, or public conservation lands. Only focus on large-scale systems." + }, + "railroads": { + "description": "Setback requirements from railroads. VALUE & UNITS: Same rules as 'structures (participating)'. IGNORE: Do not respond based on setbacks from structures, property lines, roads, transmission lines, wetlands, or public conservation lands. Only focus on large-scale systems." + }, + "transmission": { + "description": "Setback requirements from overhead electrical transmission lines, overhead utility lines, utility easements, utility lines, power lines, electrical lines, or transmission lines. VALUE & UNITS: Same rules as 'structures (participating)'. IGNORE: Do not respond based on setbacks from structures, property lines, roads, railroads, wetlands, or public conservation lands. Only focus on large-scale systems." + }, + "water": { + "description": "Setback requirements from lakes, reservoirs, streams, rivers, or wetlands. VALUE & UNITS: Same rules as 'structures (participating)'. IGNORE: Do not respond based on setbacks from structures, property lines, roads, railroads, transmission lines, or public conservation lands. CLARIFICATION: Public conservation lands (or similar) **are not equivalent** to wetlands (or similar) unless the text **explicitly** makes that connection. Only focus on large-scale systems." + }, + "public conservation lands": { + "description": "Setback requirements from public conservation lands, natural resource protection areas, or preservation areas. 
VALUE & UNITS: Same rules as 'structures (participating)'. IGNORE: Do not respond based on setbacks from structures, property lines, roads, railroads, transmission lines, or wetlands. CLARIFICATION: Wetlands **are not equivalent** to public conservation lands unless the text **explicitly** makes that connection. Only focus on large-scale systems." + } + } + }, + "numerical_restrictions": { + "description": "Non-setback numerical restriction features", + "properties": { + "other wecs": { + "description": "Extract the **minimum** required separation distance between the wind energy conversion system being installed and other existing or planned wind energy conversion systems (i.e., spacing between turbines in different projects or installations). VALUE: Numerical value representing the minimum separation distance. If multiple values are mentioned, select the most restrictive (largest minimum) value. UNITS: Typically 'feet' or 'meters', but may also be a multiplier ('tip-height-multiplier', 'hub-height-multiplier', 'rotor-diameter-multiplier'). VERIFICATION: Ensure this is a requirement that has been explicitly enacted in the ordinance, not just a definition. Only extract if the text contains explicit numerical restrictions. If the jurisdiction explicitly does not regulate this aspect, you may indicate 'ENR' (Explicitly Not Regulated) as a special value. Only focus on large-scale systems." + }, + "noise": { + "description": "Extract the maximum noise level allowed for wind energy conversion systems. VALUE: Numerical value representing the maximum allowable noise level. If multiple values are mentioned, select the most restrictive (lowest maximum) value. UNITS: Assume standard units for noise are 'dBA' unless explicitly stated otherwise. VERIFICATION: Ensure this is an enacted requirement, not just a definition. Only extract if explicit numerical restrictions exist. Only focus on large-scale systems." + }, + "maximum turbine height": { + "description": "Extract the maximum turbine height allowed. CRITICAL: Maximum turbine height should be given in total feet or meters from the ground and **should NOT be relative to some other feature like structure height, airspace level, etc.** VALUE: Numerical value representing the maximum allowable turbine height measured from ground level. If multiple values exist, select the most restrictive (lowest maximum). UNITS: Typically 'feet' or 'meters'. VERIFICATION: Ensure this is an absolute height limit from the ground, not a relative measurement. Only extract enacted requirements with explicit numerical restrictions. Only focus on large-scale systems." + }, + "maximum project size": { + "description": "Extract the maximum project size or total installation allowed. Maximum project size is typically specified as a maximum system size value (e.g., in MW or nameplate capacity) OR as a maximum number of turbines. It should NOT be specified as a maximum area. VALUE: Numerical value representing either maximum capacity (MW) or maximum number of turbines. If multiple limits exist, select the most restrictive. Clearly indicate in the summary whether this is a capacity limit or turbine count limit. UNITS: Typically 'MW', 'megawatts', 'kilowatts', or 'number of turbines'. VERIFICATION: Only extract enacted requirements with explicit numerical restrictions. Only focus on large-scale systems." + }, + "minimum lot size": { + "description": "Extract the **minimum** lot, parcel, or tract size allowed for wind energy system installation. 
Minimum lot size should **always** be specified as an area value. VALUE: Numerical value representing the minimum required lot/parcel/tract area. If multiple values exist, select the most restrictive (largest minimum). UNITS: Area units such as 'acres', 'square feet', 'square meters', 'hectares'. Ensure the units represent area, not linear distance. VERIFICATION: Only extract enacted requirements with explicit numerical restrictions. Only focus on large-scale systems." + }, + "maximum lot size": { + "description": "Extract the **maximum** lot, parcel, or tract size allowed for wind energy system installation. Maximum lot size should **always** be specified as an area value. VALUE: Numerical value representing the maximum allowed lot/parcel/tract area. If multiple values exist, select the most restrictive (smallest maximum). UNITS: Area units such as 'acres', 'square feet', 'square meters', 'hectares'. Ensure the units represent area, not linear distance. VERIFICATION: Only extract enacted requirements with explicit numerical restrictions. Only focus on large-scale systems." + }, + "shadow flicker": { + "description": "Extract the maximum shadow flicker allowed. If the text prohibits shadow flicker entirely (e.g., 'shadow flicker is prohibited'), treat this as a maximum value of 0 hours per year. VALUE: Numerical value representing the maximum allowable shadow flicker. If prohibition exists, use 0. If multiple values exist, select the most restrictive (lowest maximum). UNITS: Assume standard units for shadow flicker are 'hr/year' (hours per year) unless explicitly stated otherwise. May also be 'hours per year' or 'hr/yr'. VERIFICATION: Only extract enacted requirements. Note prohibition explicitly in summary if applicable. Only focus on large-scale systems." + }, + "tower density": { + "description": "Extract the **minimum** allowed spacing between individual turbines within the same project or facility. CRITICAL: Do **NOT** try to infer the spacing requirement based on other restrictions such as setbacks from facility perimeters, property lines, etc. Only extract if there is an explicit spacing requirement between turbines. VALUE: Numerical value representing the minimum spacing between turbines. UNITS: May be distance units ('feet', 'meters') OR multiplier units ('tip-height-multiplier', 'hub-height-multiplier', 'rotor-diameter-multiplier'). Assume standard units for spacing are one of these. VERIFICATION: Only extract if explicitly stated. Do not infer from other setback requirements. Only focus on large-scale systems." + }, + "blade clearance": { + "description": "Extract the minimum blade clearance allowed (i.e., minimum distance between blade tip at lowest point and ground level or final grade). VALUE: Numerical value representing the minimum clearance between blade tip and ground. If multiple values exist, select the most restrictive (largest minimum) value. UNITS: Typically 'feet' or 'meters'. VERIFICATION: Only extract enacted requirements with explicit numerical restrictions. Only focus on large-scale systems." + } + } + }, + "qualitative_restrictions": { + "description": "Non-numerical restriction features that require text extraction", + "properties": { + "color": { + "description": "Extract color or finish requirements for wind energy conversion systems, including turbines, support structures, accessory buildings, poles, antennas, and other external facilities. VALUE: Typically null (the summary contains all relevant information). 
SUMMARY: Extract the full text describing color requirements, using direct excerpts and quotes. Include requirements about paint color, finish, neutral colors, galvanization, visibility minimization, blending with surroundings, and repainting obligations. VERIFICATION: Only extract if explicitly enacted. Only focus on large-scale systems." + }, + "decommissioning": { + "description": "Extract decommissioning requirements for wind energy conversion systems. This includes requirements for removal of the facility upon abandonment, cessation of operation, or end of useful life. VALUE: Typically null. SUMMARY: Extract all text describing decommissioning requirements, using direct excerpts. Include: timeline for removal (e.g., 'within 90 days of ceasing operation for 12 months'), scope of removal (e.g., 'remove all machinery, equipment, shelters to 5 feet below grade'), site restoration requirements (e.g., 'restore to natural condition'), financial assurance requirements (e.g., 'post bond for 150% of removal cost'), and any requirements for decommissioning plans or cost estimates. VERIFICATION: Only extract if explicitly enacted. Only focus on large-scale systems." + }, + "lighting": { + "description": "Extract lighting requirements for wind energy conversion systems. This includes requirements about when lighting is required, prohibited, or restricted, as well as specifications about the type, color, or intensity of lighting. VALUE: Typically null. SUMMARY: Extract all text describing lighting requirements, prohibitions, or restrictions, using direct excerpts. Include FAA lighting requirements, security lighting restrictions, prohibition of unnecessary lighting, and any specific technical specifications. VERIFICATION: Only extract if explicitly enacted. Only focus on large-scale systems." + }, + "prohibitions": { + "description": "Extract any prohibitions, moratoria, or bans on wind energy conversion systems. VALUE: Typically null, but may be the special value 'ENR' (Explicitly Not Regulated) if the jurisdiction explicitly states it does not regulate wind energy systems. SUMMARY: Extract all text describing prohibitions, moratoria, or bans. CRITICAL: Only extract prohibitions that are **unconditional** and **currently in effect** (not proposed or temporary). If the jurisdiction allows wind energy systems in any capacity (with permits, conditions, etc.), this is NOT a prohibition. If text describes areas where wind is not allowed, this should be extracted under 'prohibited use districts' instead. Only focus on large-scale systems." + }, + "visual impact": { + "description": "Extract visual impact **assessment** requirements for wind energy conversion systems. This refers to requirements to conduct, submit, or provide visual impact studies, analyses, or assessments as part of the permitting process. VALUE: Typically null. SUMMARY: Extract all text describing visual impact assessment requirements, using direct excerpts. Include requirements for: submission of photographs or renderings showing before/after views, superimposed images of proposed systems on current site conditions, viewshed analyses, photographic simulations, or other visual impact documentation. VERIFICATION: Focus on assessment/study requirements, not general aesthetic requirements. Only extract if explicitly enacted. Only focus on large-scale systems." 
+ }, + "repowering": { + "description": "Extract requirements or regulations specific to **repowering** of wind energy systems (i.e., replacing or upgrading existing turbines with new equipment). This is distinct from initial installation or decommissioning. VALUE: Typically null. SUMMARY: Extract all text describing repowering-specific requirements, using direct excerpts. Include requirements about: conditions under which repowering is allowed, whether repowered systems must meet new/updated standards, permitting requirements specific to repowering, and any distinctions between repowering and new installation. VERIFICATION: Only extract requirements that are explicitly about repowering, replacement, or upgrade of existing systems. Only focus on large-scale systems." + }, + "climbing prevention": { + "description": "Extract climbing prevention requirements for wind turbine towers. This includes requirements for anti-climbing devices, access ladder positioning, tower access doors, and physical security measures to prevent unauthorized climbing. VALUE: Typically null. SUMMARY: Extract all text describing climbing prevention requirements, using direct excerpts. Include: minimum height requirements for access ladders or climbing apparatus (e.g., 'ladders must be 15 feet above ground level'), door requirements (e.g., 'solid steel doors with locking devices'), locking requirements (e.g., 'kept locked at all times'), fencing requirements related to tower access, and any other physical security measures. VERIFICATION: Only extract if explicitly enacted. Only focus on large-scale systems." + }, + "signage": { + "description": "Extract signage requirements for wind energy conversion systems. This includes requirements for identification signs, prohibition of advertising, emergency contact information, and warning signs. VALUE: Typically null. SUMMARY: Extract all text describing signage requirements, using direct excerpts. Include: prohibition of off-premise or advertising signage, requirements for identification signs with facility owner name, requirements for 24-hour emergency telephone numbers, requirements for warning signs, and any specifications about sign size, placement, or content. VERIFICATION: Only extract if explicitly enacted. Only focus on large-scale systems." + }, + "soil": { + "description": "Extract soil, erosion, and/or sediment control requirements for wind energy system installation and operation. VALUE: Typically null. SUMMARY: Extract all text describing soil, erosion control, or sediment control requirements, using direct excerpts. Include: requirements for erosion and sediment control plans, best management practices, soil conservation measures, restoration of disturbed areas, and prevention of soil degradation during construction or operation. VERIFICATION: Only extract if explicitly enacted. Only focus on large-scale systems." + } + } + }, + "permitted_use_districts": { + "description": "Zoning districts where wind energy systems are permitted under various conditions", + "properties": { + "primary use districts": { + "description": "Extract all zoning districts where large wind energy systems are permitted as primary use or similar (e.g., permitted by right, without special conditions or approval beyond standard permitting). Consider any wind energy overlay districts as primary use districts. VALUE: An array of district names/codes as strings (e.g., ['Agricultural A-1', 'Agricultural A-2', 'Industrial I-1']). If no districts allow primary use, use null. 
SUMMARY: Provide a brief summary with direct text excerpts describing which districts allow wind as primary use and under what general conditions. SECTION: Include the section title/number where this information is found. CLARIFICATION: Large wind energy systems (WES) may also be referred to as: wind turbines, wind energy conversion systems (WECS), wind energy facilities (WEF), wind energy turbines (WET), large wind energy systems (LWES), utility-scale wind energy turbines (UWET), commercial wind energy conversion systems (CWECS), alternate energy systems (AES), commercial energy production systems (CEPCS), or similar terminology. Only focus on large-scale systems as defined in system_size_context." + }, + "special use districts": { + "description": "Extract all zoning districts where large wind energy systems are permitted as special use or similar (e.g., requires approval by zoning appeals board, planning commission, special use permit, conditional use permit, or meeting certain conditions like completing a specific permitting process). Do NOT include wind energy overlay districts (those should be classified as primary use). VALUE: An array of district names/codes as strings. If no districts require special use approval, use null. SUMMARY: Provide summary with direct text excerpts. SECTION: Include section title/number. CLARIFICATION: Same alternative terminology as in 'primary use districts'. Only focus on large-scale systems." + }, + "accessory use districts": { + "description": "Extract all zoning districts where large wind energy systems are permitted as accessory use or similar (e.g., when integrated with an existing structure, secondary to another primary use, or as an accessory to a principal use). Do NOT include wind energy overlay districts (classify as primary use). Do NOT include districts where wind is only allowed as accessory but also separately as primary or special use. VALUE: An array of district names/codes as strings. If no districts specify accessory use, use null. SUMMARY: Provide summary with direct text excerpts explaining the accessory use conditions. SECTION: Include section title/number. CLARIFICATION: Same alternative terminology as in 'primary use districts'. Only focus on large-scale systems." + }, + "prohibited use districts": { + "description": "Extract all zoning districts where large wind energy systems are prohibited or similar (e.g., where wind energy systems are not allowed, banned, or explicitly excluded). CRITICAL: Only output specific districts where wind energy systems are prohibited **unconditionally**. Do not include districts where wind might be allowed with special approval or under certain conditions. If wind is allowed in any capacity in a district, it should not be listed here. VALUE: An array of district names/codes as strings. If no districts prohibit wind unconditionally, use null. SUMMARY: Provide summary with direct text excerpts clearly stating the prohibition. SECTION: Include section title/number. CLARIFICATION: Same alternative terminology as in 'primary use districts'. Only focus on large-scale systems." + } + } + } + }, + "$examples": [ + { + "feature": "property line (non-participating)", + "value": 1.5, + "units": "tip-height-multiplier", + "section": "SECTION 1308 – Performance / Construction Standards", + "summary": "Setback requirements for WECS: Property Lines ... shall be 1.5 times the total height. The text specifies '1.5 times the total height' as the setback from property lines for Wind Energy Conversion Systems (WECS).
No other multipliers or static distance adders are given for setbacks from property lines, lot lines, facility perimeters, parcels, or subdivisions. The multiplier applies to the total height (tip height) of the turbine. Other options such as hub-height-multiplier or rotor-diameter-multiplier are not mentioned or specified in the text." + }, + { + "feature": "roads", + "value": 1.5, + "units": "tip-height-multiplier", + "section": "SECTION 1308 – Performance / Construction Standards", + "summary": "Setback requirements for WECS: Property Lines, roads, overhead utility or transmission lines, electrical substations, communication or meteorological towers shall be 1.5 times the total height, and shall be Six Hundred Forty (640) feet from the nearest residence. The multiplier applies to the total height (tip height) of the WECS. No additional static distance value is added to the setback from roads." + }, + { + "feature": "noise", + "value": 50, + "units": "dBA", + "section": "SECTION 1308 – Performance / Construction Standards", + "summary": "The WECS and or Facility shall not generate noise in excess of Fifty (50) decibel levels at ground level to the property lines or at the nearest residence. Such measurements shall be signed by a qualified engineer, stating that noise levels are being met, per ordinance." + }, + { + "feature": "blade clearance", + "value": 30, + "units": "feet", + "section": "SECTION 1308 – Performance / Construction Standards", + "summary": "Section 1308, subsection H states: 'Minimum clearance between blade tip and ground level or final grade is to be Thirty (30) feet.' This applies to all WECS erected or placed within Decatur County after the ordinance's effective date." + }, + { + "feature": "decommissioning", + "value": null, + "units": null, + "section": "SECTION 1306 – General Requirements and SECTION 1308 – Performance / Construction Standards", + "summary": "Section 1306(B)(5) requires \"a copy of the Facility Maintenance and Removal Agreement signed by all applicants that binds the applicants and all successors in interest to the property to properly maintain and or remove the facility upon abandonment in compliance with the terms of this Ordinance.\" Section 1308(O) states \"At such time that a WECS ceases operation for a period of Twelve (12) months, the company shall physically remove the WECS and facility within Ninety (90) days from the Twelve (12) month period from when the operation ceased.\"" + }, + { + "feature": "special use districts", + "value": ["Agricultural A-1", "Agricultural A-2", "Business B-1", "Business B-2", "Business B-3", "Industrial I-1", "Industrial I-2"], + "units": null, + "section": "SECTION 1304 – Permanent Special Exceptions", + "summary": "The following are permitted as permanent special exceptions in the Agricultural A-1 & A-2 zoning districts; in the Business B-1, B-2 and B-3 zoning districts; and in the Industrial I-1 and I-2 zoning districts when approved by the Decatur County Board of Zoning Appeals: (C) Placement of a WECS as permitted under the zoning districts A-1, A-2, B-1, B-2, B-3, I-1 and I-2." + } + ], + "$instructions": { + "general": [ + "Extract ordinance requirements ONLY for large, commercial, or utility-scale wind energy systems (typically intended for offsite electricity generation or sale, often 1MW+ or above specified thresholds). 
Do NOT extract requirements that apply only to small, micro, private, personal, onsite, or residential systems, or to meteorological towers.", + "Use direct text excerpts and quotes in the summary field as much as possible. If multiple options exist and you make a selection, list all other options and their conditions in the summary.", + "Only extract explicit requirements that have been enacted. Do not extract definitions unless they also constitute requirements.", + "For features where no requirement exists, set value, units, and summary to null (omit the feature from the array entirely, or include it with null values).", + "Pay careful attention to clarifying text in parentheses, footnotes, and cross-references.", + "When multiple values are mentioned for the same feature, select the most restrictive value (largest minimum, lowest maximum, etc.) and note the other options in the summary.", + "Ensure units are expressed using standard, conventional names (e.g., 'feet', 'meters', 'dBA', 'hr/year', 'tip-height-multiplier')." + ], + "setback_specific": [ + "For distance-based setbacks, extract the numerical value and units (feet/meters).", + "For multiplier-based setbacks, extract the multiplier value and type (tip-height-multiplier, hub-height-multiplier, rotor-diameter-multiplier). Default to tip-height-multiplier unless text explicitly specifies hub or rotor diameter.", + "Treat any mention of 'fall zone' as a tip-height-multiplier of 1.", + "If text includes an adder (e.g., 'multiplier × height + X feet'), note this in the summary. Adder values should be converted to feet.", + "If text includes conditional minimums ('the greater of', 'no less than'), note minimum threshold in summary.", + "If text includes conditional maximums ('the lesser of', 'no more than'), note maximum limit in summary.", + "For structures and property lines: Check if ordinance distinguishes between participating and non-participating owners. If yes, extract separate values for each. If no distinction, use the non-participating feature for the general setback.", + "For roads: Verify the requirement is truly for roads and not better classified as a property line setback." + ], + "restriction_specific": [ + "For numerical restrictions: Extract numerical value and standard units. If multiple values exist, select the most restrictive.", + "For shadow flicker: If prohibited, use value of 0 with units 'hr/year'.", + "For maximum turbine height: Must be absolute height from ground, not relative to other features.", + "For maximum project size: Specify in summary whether limit is capacity-based (MW) or count-based (number of turbines).", + "For minimum/maximum lot size: Must be area values, not linear distances.", + "For tower density: Do not infer from other setback requirements; only extract if explicitly stated as spacing between turbines.", + "For qualitative restrictions: Extract full relevant text in summary field using direct excerpts and quotes. Value is typically null.", + "For prohibitions: Only extract if unconditional and currently in effect. If jurisdiction allows wind with any conditions, this is not a prohibition." + ], + "district_specific": [ + "For permitted use districts: Extract array of district names/codes. 
Consider overlay districts as primary use.", + "For special use districts: Include districts requiring board approval, special permits, or conditional use permits.", + "For accessory use districts: Include districts where wind is only allowed as accessory/secondary to another use.", + "For prohibited use districts: Only include districts with unconditional prohibitions currently in effect.", + "Use the exact district names/codes as they appear in the ordinance text." + ] + } +} diff --git a/examples/parse_existing_docs/README.md b/examples/parse_existing_docs/README.md new file mode 100644 index 000000000..98813816a --- /dev/null +++ b/examples/parse_existing_docs/README.md @@ -0,0 +1,11 @@ +# Using INFRA-COMPASS to Parse Local Docs + +This directory contains examples and tutorials to help you parse local documents using ``COMPASS``. + +- [`Parsing via CLI`](./CLI): Example showing how to parse local documents via the CLI + using a typical ``COMPASS`` configuration file. +- [`Parsing via Code`](./code): Tutorial using the ``COMPASS`` Python API to write code that + parses a local PDF file. + +For rendered versions of these guides, see the +[online documentation](https://natlabrockies.github.io/COMPASS/examples/index.html). diff --git a/examples/parse_existing_docs/README.rst b/examples/parse_existing_docs/code/README.rst similarity index 100% rename from examples/parse_existing_docs/README.rst rename to examples/parse_existing_docs/code/README.rst diff --git a/examples/parse_existing_docs/parse_pdf.py b/examples/parse_existing_docs/code/parse_pdf.py similarity index 100% rename from examples/parse_existing_docs/parse_pdf.py rename to examples/parse_existing_docs/code/parse_pdf.py
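Both workflows in the README begin by loading a ``COMPASS`` configuration file. A minimal sketch of that first step using the `compass.utilities.io.load_config` helper, with behavior inferred from the unit tests later in this patch (the `config.json5` filename and the `tech` key are illustrative):

```python
from compass.utilities.io import load_config

# load_config accepts .json and .json5 files (JSON5 allows comments and
# trailing commas). By default, relative paths inside the config are
# resolved against the directory containing the config file.
config = load_config("config.json5")

# Pass resolve_paths=False to keep relative path strings exactly as written.
raw_config = load_config("config.json5", resolve_paths=False)

print(config.get("tech"))  # e.g. "wind" for the ordinance schema added above
```

diff --git a/pixi.lock b/pixi.lock index 6072cf3d1..cf1d43a29 100644 --- a/pixi.lock +++ b/pixi.lock @@ -147,6 +147,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-10.4.0-py313hc9074d1_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-25.3-pyh145f28c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/pixman-0.46.4-h54a6638_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.5.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/playwright-1.51.1-hbf95b10_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/poppler-24.12.0-hd7b24de_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/poppler-data-0.4.12-hd8ed1ab_0.conda @@ -268,7 +269,6 @@ - pypi: https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/70/cb/e7cd2f6161e30a4009cf38dd00024b1303197afcd4297081b0ccd21016a8/patchright-1.51.3-py3-none-manylinux1_x86_64.whl - pypi: https://files.pythonhosted.org/packages/e0/e3/79a2ad7ca71160fb6442772155389881672c98bd44c6022303ce242cbfb9/pdftotext-2.2.2.tar.gz - - pypi: https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/7a/fd/bc60798803414ecab66456208eeff4308344d0c055ca0d294d2cdd692b60/playwright-1.51.0-py3-none-manylinux1_x86_64.whl - pypi: https://files.pythonhosted.org/packages/c3/7b/cbd5d999a07ff2a21465975d4eb477ae6f69765e8fe8c9087dab250180d8/primp-0.15.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - pypi: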
https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl @@ -477,6 +477,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-10.4.0-py313h579fd5f_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-25.3-pyh145f28c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pixman-0.46.4-h7ac5ae9_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.5.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/playwright-1.51.1-h0ee932a_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/poppler-24.12.0-h549c9f3_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/poppler-data-0.4.12-hd8ed1ab_0.conda @@ -602,7 +603,6 @@ environments: - pypi: https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/81/09/e6126d32175f96ea963616debbb8e380e7c987ca913efeb59bf7e7f39438/patchright-1.51.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - pypi: https://files.pythonhosted.org/packages/e0/e3/79a2ad7ca71160fb6442772155389881672c98bd44c6022303ce242cbfb9/pdftotext-2.2.2.tar.gz - - pypi: https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/0d/14/13db550d7b892aefe80f8581c6557a17cbfc2e084383cd09d25fdd488f6e/playwright-1.51.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl - pypi: https://files.pythonhosted.org/packages/56/0b/a87556189da4de1fc6360ca1aa05e8335509633f836cdd06dd17f0743300/primp-0.15.0.tar.gz - pypi: https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl @@ -770,6 +770,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/pillow-10.4.0-py313hcd5872a_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-25.3-pyh145f28c_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/pixman-0.46.4-ha059160_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.5.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/playwright-1.51.1-h046caca_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/poppler-24.12.0-hcc361ce_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/poppler-data-0.4.12-hd8ed1ab_0.conda @@ -887,7 +888,6 @@ environments: - pypi: https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/1a/55/d62c85fff36e9e9e515ee92407b02acb556e6832d4fbcc8624b638cf70bb/patchright-1.51.3-py3-none-macosx_11_0_universal2.whl - pypi: https://files.pythonhosted.org/packages/e0/e3/79a2ad7ca71160fb6442772155389881672c98bd44c6022303ce242cbfb9/pdftotext-2.2.2.tar.gz - - pypi: https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/ba/b1/061c322319072225beba45e8c6695b7c1429f83bb97bdb5ed51ea3a009fc/playwright-1.51.0-py3-none-macosx_11_0_universal2.whl - 
pypi: https://files.pythonhosted.org/packages/f5/5a/146ac964b99ea7657ad67eb66f770be6577dfe9200cb28f9a95baffd6c3f/primp-0.15.0-cp38-abi3-macosx_10_12_x86_64.whl - pypi: https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl @@ -1057,6 +1057,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pillow-10.4.0-py313h4ca4afe_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-25.3-pyh145f28c_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/pixman-0.46.4-h81086ad_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.5.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/playwright-1.51.1-h3339cab_0.conda - conda: https://conda.anaconda.org/conda-forge/osx-arm64/poppler-24.12.0-ha29e788_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/poppler-data-0.4.12-hd8ed1ab_0.conda @@ -1174,7 +1175,6 @@ environments: - pypi: https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/55/20/4df3f817c28938e22ee7c7c4b28d8b3a212e5a111c3bd9633bc410267daa/patchright-1.51.3-py3-none-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/e0/e3/79a2ad7ca71160fb6442772155389881672c98bd44c6022303ce242cbfb9/pdftotext-2.2.2.tar.gz - - pypi: https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/32/4a/5f2ff6866bdf88e86147930b0be86b227f3691f4eb01daad5198302a8cbe/playwright-1.51.0-py3-none-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/bc/8a/cc2321e32db3ce64d6e32950d5bcbea01861db97bfb20b5394affc45b387/primp-0.15.0-cp38-abi3-macosx_11_0_arm64.whl - pypi: https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl @@ -1333,6 +1333,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/pillow-10.4.0-py313h24ec7aa_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pip-25.3-pyh145f28c_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/pixman-0.46.4-h5112557_1.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/platformdirs-4.5.1-pyhcf101f3_0.conda - conda: https://conda.anaconda.org/microsoft/noarch/playwright-1.51.0-py_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/poppler-24.12.0-heaa0bce_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/poppler-data-0.4.12-hd8ed1ab_0.conda @@ -1452,7 +1453,6 @@ environments: - pypi: https://files.pythonhosted.org/packages/0a/08/b83b94a2b10ee7d27dddff4812a188e6669e520dafccb590613a90fa9d76/nlr_rex-0.5.0-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl - pypi: https://files.pythonhosted.org/packages/6e/fd/97e3e26893904bdeff36d54e6ea5fe5f81a245a96c1b73ebe37e956ce11d/patchright-1.51.3-py3-none-win_amd64.whl - - pypi: https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl - pypi: 
https://files.pythonhosted.org/packages/20/0f/098488de02e3d52fc77e8d55c1467f6703701b6ea6788f40409bb8c00dd4/playwright-1.51.0-py3-none-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/0c/dd/f0183ed0145e58cf9d286c1b2c14f63ccee987a4ff79ac85acc31b5d86bd/primp-0.15.0-cp38-abi3-win_amd64.whl - pypi: https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl @@ -16692,8 +16692,8 @@ packages: timestamp: 1736252433366 - pypi: ./ name: infra-compass - version: 0.13.2.dev37+g1efc31b.d20260201 - sha256: 11b69caa4a9bde6e3537064e5f9edb2b5381483014927be4c9bb2a6a45f2c3b0 + version: 0.13.2.dev5+g2a2cf74.d20260210 + sha256: 14c80efebd7b5a7937d34c6e5de47a023a4ffc6e29953d0d7267c4c258816d68 requires_dist: - beautifulsoup4>=4.12.3,<5 - click>=8.1.7,<9 @@ -16706,6 +16706,7 @@ packages: - openai>=1.1.0 - pandas>=2.2.3,<3 - pdftotext>=2.2.2,<3 + - platformdirs>=4.5.1,<5 - playwright>=1.49.0,<1.52 - pyjson5>=2.0.0,<3 - rich>=13.9.4,<14 diff --git a/pyproject.toml b/pyproject.toml index a13d3a023..1aa801681 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "openai>=1.1.0", "pandas>=2.2.3,<3", "pdftotext>=2.2.2,<3", + "platformdirs>=4.5.1,<5", "playwright>=1.49.0,<1.52", # version range required for c4ai" "pyjson5>=2.0.0,<3", "rich>=13.9.4,<14", @@ -174,6 +175,7 @@ pandas = ">=2.2.3,<3" pdf2image = ">=1.17.0,<2" pillow = ">=10.4,<11.dev0" # version range required for c4ai playwright = ">=1.49.0,<1.52" # version range required for c4ai +platformdirs = ">=4.5.1,<5" pypdf2 = ">=3.0.1,<4" pytesseract = ">=0.3.13,<0.4" python = ">=3.12" diff --git a/tests/python/unit/plugin/test_plugin_ordinances.py b/tests/python/unit/plugin/test_plugin_ordinances.py index 247d8ecbc..074c2daa5 100644 --- a/tests/python/unit/plugin/test_plugin_ordinances.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -42,8 +42,8 @@ class MYPlugin(OrdinanceExtractionPlugin): IDENTIFIER = "test" WEBSITE_KEYWORDS = ["test"] - QUESTION_TEMPLATES = ["test"] - heuristic = None + QUERY_TEMPLATES = ["test"] + HEURISTIC = None async def parse_docs_for_structured_data(self, extraction_context): return extraction_context @@ -84,8 +84,8 @@ class MYPlugin(OrdinanceExtractionPlugin): IDENTIFIER = "test" WEBSITE_KEYWORDS = ["test"] - QUESTION_TEMPLATES = ["test"] - heuristic = None + QUERY_TEMPLATES = ["test"] + HEURISTIC = None async def parse_docs_for_structured_data(self, extraction_context): return extraction_context @@ -126,8 +126,8 @@ class MYPlugin(OrdinanceExtractionPlugin): IDENTIFIER = "test" WEBSITE_KEYWORDS = ["test"] - QUESTION_TEMPLATES = ["test"] - heuristic = None + QUERY_TEMPLATES = ["test"] + HEURISTIC = None async def parse_docs_for_structured_data(self, extraction_context): return extraction_context @@ -172,8 +172,8 @@ class MYPlugin(OrdinanceExtractionPlugin): IDENTIFIER = "test" WEBSITE_KEYWORDS = ["test"] - QUESTION_TEMPLATES = ["test"] - heuristic = None + QUERY_TEMPLATES = ["test"] + HEURISTIC = None async def parse_docs_for_structured_data(self, extraction_context): return extraction_context diff --git a/tests/python/unit/scripts/test_process.py b/tests/python/unit/scripts/test_process.py index 70ce550d4..645e3754b 100644 --- a/tests/python/unit/scripts/test_process.py +++ b/tests/python/unit/scripts/test_process.py @@ -6,7 +6,7 @@ import pytest -from compass.exceptions import COMPASSValueError +from compass.exceptions import COMPASSValueError, COMPASSFileNotFoundError import compass.scripts.process 
as process_module from compass.scripts.process import ( _COMPASSRunner, @@ -63,7 +63,9 @@ def test_known_local_docs_missing_file(tmp_path): process_kwargs=ProcessKwargs(str(missing_fp), None), ) - with pytest.raises(COMPASSValueError, match="Config file does not exist"): + with pytest.raises( + COMPASSFileNotFoundError, match="Config file does not exist" + ): _ = runner.known_local_docs @@ -79,7 +81,9 @@ def test_known_local_docs_logs_missing_file(tmp_path, testing_log_file): process_kwargs=ProcessKwargs(str(missing_fp), None), ) - with pytest.raises(COMPASSValueError, match="Config file does not exist"): + with pytest.raises( + COMPASSFileNotFoundError, match="Config file does not exist" + ): _ = runner.known_local_docs assert testing_log_file.exists() diff --git a/tests/python/unit/services/test_services_threaded.py b/tests/python/unit/services/test_services_threaded.py index 6a7b72b68..c65ee02de 100644 --- a/tests/python/unit/services/test_services_threaded.py +++ b/tests/python/unit/services/test_services_threaded.py @@ -264,7 +264,9 @@ def test_write_cleaned_file_skips_missing_section(tmp_path): outputs = threaded._write_cleaned_file( doc, tmp_path, tech="wind", jurisdiction_name="Partial" ) - assert [fp.name for fp in outputs] == ["Partial Cleaned Text.txt"] + assert [fp.name for fp in outputs] == [ + "Partial Utility Scale Wind Ordinance.txt" + ] def test_write_ord_db_creates_csv(tmp_path): @@ -366,8 +368,8 @@ async def test_cleaned_file_writer_process(tmp_path, monkeypatch): writer.release_resources() assert sorted(fp.name for fp in outputs) == [ - "Writer Cleaned Text.txt", - "Writer Districts.txt", + "Writer Permitted Use Districts.txt", + "Writer Utility Scale Wind Ordinance.txt", ] diff --git a/tests/python/unit/test_exceptions.py b/tests/python/unit/test_exceptions.py index 33e80ddc9..d1cfcecc4 100644 --- a/tests/python/unit/test_exceptions.py +++ b/tests/python/unit/test_exceptions.py @@ -10,6 +10,7 @@ from compass.exceptions import ( COMPASSError, + COMPASSFileNotFoundError, COMPASSTypeError, COMPASSValueError, COMPASSNotInitializedError, @@ -57,6 +58,10 @@ def test_exceptions_log_uncaught_error(assert_message_was_logged): COMPASSNotInitializedError, [COMPASSError, COMPASSNotInitializedError], ), + ( + COMPASSFileNotFoundError, + [COMPASSError, FileNotFoundError, COMPASSFileNotFoundError], + ), ( COMPASSTypeError, [COMPASSError, TypeError, COMPASSTypeError],
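The switch from COMPASSValueError to COMPASSFileNotFoundError in test_process.py is backward compatible for callers, because the new exception class derives from both COMPASSError and the builtin FileNotFoundError (the MRO asserted in the test_exceptions.py update above). A small illustrative sketch; the `read_required_config` helper is hypothetical:

```python
from compass.exceptions import COMPASSError, COMPASSFileNotFoundError


def read_required_config(path):
    """Hypothetical helper mirroring the error raised in the tests."""
    raise COMPASSFileNotFoundError(f"Config file does not exist: {path}")


# Dual inheritance lets both builtin and COMPASS-specific handlers match:
try:
    read_required_config("missing_config.json5")
except FileNotFoundError as err:
    print("caught as builtin FileNotFoundError:", err)

try:
    read_required_config("missing_config.json5")
except COMPASSError as err:
    print("caught as COMPASSError:", err)
```

diff --git a/tests/python/unit/utilities/test_utilities_io.py b/tests/python/unit/utilities/test_utilities_io.py index 83dd5b048..65683f9ec 100644 --- a/tests/python/unit/utilities/test_utilities_io.py +++ b/tests/python/unit/utilities/test_utilities_io.py @@ -1,11 +1,17 @@ -"""Test COMPASS I/O utilities""" +"""COMPASS I/O tests""" import os +import json from pathlib import Path import pytest -from compass.utilities.io import load_local_docs +from compass.utilities.io import ( + load_config, + ConfigType, + resolve_all_paths, + load_local_docs, +) from compass.services.cpu import ( PDFLoader, OCRPDFLoader, @@ -14,12 +20,210 @@ ) from compass.services.provider import RunningAsyncServices from compass.services.threaded import read_html_file, HTMLFileLoader -from compass.exceptions import COMPASSNotInitializedError +from compass.exceptions import COMPASSNotInitializedError, COMPASSValueError PYT_CMD = os.getenv("TESSERACT_CMD") +def test_resolve_all_paths(): + """Test resolving all paths""" + + base_dir = Path.home() + + assert resolve_all_paths("test", base_dir) == "test" + assert resolve_all_paths("~test", base_dir) ==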
"~test" + assert ( + resolve_all_paths("/test/f.csv", base_dir) + == Path("/test/f.csv").as_posix() + ) + assert ( + resolve_all_paths("./test", base_dir) == (base_dir / "test").as_posix() + ) + assert resolve_all_paths("../", base_dir) == base_dir.parent.as_posix() + assert resolve_all_paths(".././", base_dir) == base_dir.parent.as_posix() + assert ( + resolve_all_paths("../test_file.json", base_dir) + == (base_dir.parent / "test_file.json").as_posix() + ) + assert ( + resolve_all_paths("../test_dir/./../", base_dir) + == base_dir.parent.as_posix() + ) + assert ( + resolve_all_paths("test_dir/./", base_dir) + == Path("test_dir").resolve().as_posix() + ) + assert ( + resolve_all_paths("test_dir/../", base_dir) + == Path("test_dir").resolve().parent.as_posix() + ) + assert ( + resolve_all_paths("~/test_dir/../", base_dir) == Path.home().as_posix() + ) + + +def test_resolve_all_paths_list(): + """Test resolving all paths in a list""" + base_dir = Path.home() + input_ = [ + "test", + "./test", + "../", + ".././", + "../test_file.json", + "../test_dir/./../", + ["test", "../test_dir/./../"], + ] + expected_output = [ + "test", + (base_dir / "test").as_posix(), + base_dir.parent.as_posix(), + base_dir.parent.as_posix(), + (base_dir.parent / "test_file.json").as_posix(), + base_dir.parent.as_posix(), + ["test", base_dir.parent.as_posix()], + ] + + assert resolve_all_paths(input_, base_dir) == expected_output + + +def test_resolve_all_paths_dict(): + """Test resolving all paths in a dict""" + base_dir = Path.home() + input_ = { + "a": "test", + "b": "./test", + "c": "../", + "d": ".././", + "e": "../test_file.json", + "f": "../test_dir/./../", + "g": ["test", "../test_dir/./../"], + "h": { + "a": "test", + "b": ["test", "../test_dir/./../"], + }, + } + expected_output = { + "a": "test", + "b": (base_dir / "test").as_posix(), + "c": base_dir.parent.as_posix(), + "d": base_dir.parent.as_posix(), + "e": (base_dir.parent / "test_file.json").as_posix(), + "f": base_dir.parent.as_posix(), + "g": [ + "test", + base_dir.parent.as_posix(), + ], + "h": { + "a": "test", + "b": [ + "test", + base_dir.parent.as_posix(), + ], + }, + } + + assert resolve_all_paths(input_, base_dir) == expected_output + + +@pytest.mark.parametrize("config_type", list(ConfigType)) +def test_write_load_config(tmp_path, config_type): + """Test loading a configuration file""" + + base_fn = f"test.{config_type}" + + test_dictionary = {"a": 1, "b": 2} + with Path(tmp_path / base_fn).open("w", encoding="utf-8") as config_file: + config_type.dump(test_dictionary, config_file) + + assert load_config(tmp_path / "." / base_fn) == test_dictionary + + test_dictionary = { + "a": 1, + "b": "A string", + "path_a": "./config.json", + "path_b": "./../another.json", + "path_c": "./something/.././../another.json", + } + config_type.write(tmp_path / base_fn, test_dictionary) + + expected_dict = { + "a": 1, + "b": "A string", + "path_a": (tmp_path / "config.json").as_posix(), + "path_b": (tmp_path.parent / "another.json").as_posix(), + "path_c": (tmp_path.parent / "another.json").as_posix(), + } + assert load_config(tmp_path / "." / base_fn) == expected_dict + + assert ( + load_config(tmp_path / "." 
/ base_fn, resolve_paths=False) + == test_dictionary + ) + + +@pytest.mark.parametrize("config_type", list(ConfigType)) +def test_config_dumps_loads(config_type): + """Test dumping and loading a configuration file to and from a str""" + + test_dictionary = { + "a": 1, + "b": "A string", + "path_a": "./config.json", + "path_b": "./../another.json", + "path_c": "./something/.././../another.json", + } + assert ( + config_type.loads(config_type.dumps(test_dictionary)) + == test_dictionary + ) + + +def test_load_config_json(tmp_path): + """Test `load_config` with JSON file""" + + config_data = {"key": "value", "number": 42} + config_file = tmp_path / "test_config.json" + with config_file.open("w", encoding="utf-8") as f: + json.dump(config_data, f) + + result = load_config(config_file) + assert result == config_data + + +def test_load_config_json5(tmp_path): + """Test `load_config` with JSON5 file""" + + config_content = """{ + // This is a comment + "key": "value", + "number": 42, + }""" + config_file = tmp_path / "test_config.json5" + with config_file.open("w", encoding="utf-8") as f: + f.write(config_content) + + result = load_config(config_file) + assert result == {"key": "value", "number": 42} + + +def test_load_config_invalid_extension(tmp_path): + """Test `load_config` with invalid file extension""" + + config_file = tmp_path / "test_config.txt" + config_file.touch() + + with pytest.raises( + COMPASSValueError, + match=( + r"Got unknown config file extension: '.txt'. Supported " + r"extensions are:" + ), + ): + load_config(config_file) + + @pytest.mark.asyncio async def test_basic_load_pdf(test_data_files_dir): """Test basic loading of local PDF document""" diff --git a/tests/python/unit/utilities/test_utilities_parsing.py b/tests/python/unit/utilities/test_utilities_parsing.py index f96e1d529..fb916e68f 100644 --- a/tests/python/unit/utilities/test_utilities_parsing.py +++ b/tests/python/unit/utilities/test_utilities_parsing.py @@ -1,8 +1,6 @@ """Test COMPASS Ordinance parsing utilities""" -import json from pathlib import Path -from unittest.mock import MagicMock import numpy as np import pandas as pd @@ -13,12 +11,10 @@ convert_paths_to_strings, extract_ord_year_from_doc_attrs, llm_response_as_json, - load_config, merge_overlapping_texts, num_ordinances_dataframe, ordinances_bool_index, ) -from compass.exceptions import COMPASSValueError @pytest.mark.parametrize( @@ -60,15 +56,19 @@ def test_llm_response_as_json(in_str, expected): [ ( [ - "Some text. Some overlap. More text. More text that " - "shouldn't be touched. Some overlap.", + ( + "Some text. Some overlap. More text. More text that " + "shouldn't be touched. Some overlap." + ), "Some overlap. More text.", "Some non-overlapping text.", ], 12, - "Some text. Some overlap. More text. More text that " - "shouldn't be touched. Some overlap. More text.\nSome " - "non-overlapping text.", + ( + "Some text. Some overlap. More text. More text that " + "shouldn't be touched. Some overlap. More text.\nSome " + "non-overlapping text." 
+ ), ), ([], 300, ""), (["single chunk"], 300, "single chunk"), @@ -185,47 +185,6 @@ def test_ordinances_bool_index_value_only(): np.testing.assert_array_equal(result, expected) -def test_load_config_json(tmp_path): - """Test `load_config` with JSON file""" - - config_data = {"key": "value", "number": 42} - config_file = tmp_path / "test_config.json" - with config_file.open("w", encoding="utf-8") as f: - json.dump(config_data, f) - - result = load_config(config_file) - assert result == config_data - - -def test_load_config_json5(tmp_path): - """Test `load_config` with JSON5 file""" - - config_content = """{ - // This is a comment - "key": "value", - "number": 42, - }""" - config_file = tmp_path / "test_config.json5" - with config_file.open("w", encoding="utf-8") as f: - f.write(config_content) - - result = load_config(config_file) - assert result == {"key": "value", "number": 42} - - -def test_load_config_invalid_extension(tmp_path): - """Test `load_config` with invalid file extension""" - - config_file = tmp_path / "test_config.txt" - config_file.touch() - - with pytest.raises( - COMPASSValueError, - match=r"Got unknown config file extension: \.txt", - ): - load_config(config_file) - - def test_convert_paths_to_strings_all_structures(): """Test `convert_paths_to_strings` across nested containers""" diff --git a/tests/python/unit/validation/test_validation_content.py b/tests/python/unit/validation/test_validation_content.py index f42ced70b..78d88d0c2 100644 --- a/tests/python/unit/validation/test_validation_content.py +++ b/tests/python/unit/validation/test_validation_content.py @@ -26,8 +26,8 @@ async def test_validation_with_mem(): keys = [] - class MockStructuredLLMCaller: - """Mock LLM caller for tests.""" + class MockJSONFromTextLLMCaller: + """Mock LLM caller for tests""" async def call(self, key, text_chunk): """Mock LLM call and record system message""" @@ -36,7 +36,7 @@ async def call(self, key, text_chunk): text_chunks = list(range(7)) validator = ParseChunksWithMemory(text_chunks, 3) - caller = MockStructuredLLMCaller() + caller = MockJSONFromTextLLMCaller() out = await validator.parse_from_ind( 0, key="test", llm_call_callback=caller.call