From 9d9af2dbacf0fd2a9e00371002b274a9f77667a3 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 15:58:28 -0700 Subject: [PATCH 01/22] Add debug logging --- compass/scripts/process.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/compass/scripts/process.py b/compass/scripts/process.py index cbb8f67a..cb8c579a 100644 --- a/compass/scripts/process.py +++ b/compass/scripts/process.py @@ -874,6 +874,11 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing had no known local docs configured", + self.jurisdiction.full_name, + ) if self.known_doc_urls: logger.debug( @@ -885,6 +890,11 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing had no known URLs configured", + self.jurisdiction.full_name, + ) if self.perform_se_search: logger.debug( @@ -897,6 +907,11 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing didn't have SE search enabled", + self.jurisdiction.full_name, + ) if self.perform_website_search: logger.debug( @@ -908,6 +923,12 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing didn't have jurisdiction website search " + "enabled", + self.jurisdiction.full_name, + ) return None From 80f5d1a205b4ceb7fd50fb7796a03493cb79aee7 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 19:16:03 -0700 Subject: [PATCH 02/22] Implement and use `PromptBasedTextExtractor` --- compass/extraction/small_wind/ordinance.py | 485 ++++++++------------- compass/extraction/solar/ordinance.py | 358 ++++++--------- compass/extraction/wind/ordinance.py | 483 ++++++++------------ compass/plugin/ordinance.py | 143 +++++- 4 files changed, 610 insertions(+), 859 deletions(-) diff --git a/compass/extraction/small_wind/ordinance.py 
b/compass/extraction/small_wind/ordinance.py index 426df3b4..d392c37f 100644 --- a/compass/extraction/small_wind/ordinance.py +++ b/compass/extraction/small_wind/ordinance.py @@ -9,7 +9,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,151 @@ _IGNORE_TYPES_MICRO = "private, micro, personal, building-mounted" _IGNORE_TYPES_LARGE = "large, utility-scale, for-sale, commercial" +_WECS_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**wind energy systems**. The extracted text will be used for structured \ +data extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable to wind energy systems** while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to wind energy systems** from the provided \ +excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **wind energy systems**. +- Explicitly include any text related to **bans or prohibitions** on wind \ +energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). + +2. ## Exclusions ##: +- Do **not** include text that does not pertain to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_SMALL_WECS_TEXT_EXTRACTION_PROMPT = f"""\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**small, medium, or non-commercial wind energy systems**. 
The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. Ensure that all \ +retained information is **directly applicable** to small, medium, or \ +non-commercial wind energy systems while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to small, medium or non-commercial wind energy \ +systems** from the provided excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **small, medium, or non-commercial wind \ +energy systems**, even if they are referred to by different names such as: \ +{_SMALL_WES_SYNONYMS.capitalize()} +- Explicitly include any text related to **bans or prohibitions** on small, \ +medium, or non-commercial wind energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). +- **Retain all relevant technical, design, operational, safety, \ +environmental, and infrastructure-related provisions** that apply to the \ +topic, such as (but not limited to): + - Compliance with legal or regulatory standards. + - Site, structural, or design specifications. + - Environmental impact considerations. + - Safety and risk mitigation measures. + - Infrastructure, implementation, operation, and maintenance details. + - All other **closely related provisions**. + +2. ## Exclusions ##: +- Do **not** include text that explicitly applies **only** to \ +{_IGNORE_TYPES_MICRO} or {_IGNORE_TYPES_LARGE} wind energy systems. +- Do **not** include text that does not pertain at all to wind energy systems. + +3.{{FORMATTING_PROMPT}} + +4. {{OUTPUT_PROMPT}}\ +""" + +_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +permitted use(s) for a district. 
The extracted text will be used for \ +structured data extraction, so it must be both **comprehensive** (retaining \ +all relevant details) and **focused** (excluding unrelated content), with \ +**zero rewriting or paraphrasing**. Ensure that all retained information is \ +**directly applicable** to permitted use(s) for one or more districts while \ +preserving full context and accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Pay extra attention to any references to **wind energy facilities** or \ +related terms. +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do **not** include unrelated regulations, procedural details, or \ +non-use-based restrictions. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +**wind energy system** permitted use(s) for a district. The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. Ensure that all \ +retained information is **directly applicable** to permitted use(s) for wind \ +energy systems in one or more districts while preserving full context and \ +accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to wind energy conversion system \ +permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. 
## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do not include text that does not pertain at all to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + class SmallWindHeuristic(OrdinanceHeuristic): """Perform a heuristic check for mention of wind turbines in text""" @@ -295,189 +440,39 @@ async def check_chunk(self, chunk_parser, ind): return False -class SmallWindOrdinanceTextExtractor(OrdinanceTextExtractor): +class SmallWindOrdinanceTextExtractor(PromptBasedTextExtractor): """Extract succinct ordinance text from input""" IN_LABEL = SmallWindOrdinanceTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "cleaned_text_for_extraction" - """Identifier for ordinance text extracted by this class""" - TASK_DESCRIPTION = "Extracting small wind ordinance text" """Task description to show in progress bar""" TASK_ID = "ordinance_text_extraction" """ID to use for this extraction for linking with LLM configs""" - WIND_ENERGY_SYSTEM_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **wind energy systems**. The extracted text will be used for " - "structured data extraction, so it must be both **comprehensive** " - "(retaining all relevant details) and **focused** (excluding " - "unrelated content), with **zero rewriting or paraphrasing**. 
" - "Ensure that all retained information is " - "**directly applicable to wind energy systems** while preserving " - "full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to wind energy systems** from the " - "provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **wind energy systems**.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that does not pertain to wind energy " - "systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for WECS""" - - SMALL_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **small, medium, or non-commercial wind energy systems**. 
The " - "extracted text will be used for structured data extraction, so it " - "must be both **comprehensive** (retaining all relevant details) and " - "**focused** (excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to small, medium, or non-commercial wind " - "energy systems while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to small, medium or non-commercial " - "wind energy systems** from the provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **small, medium, or " - "non-commercial wind energy systems**, even if they are referred to " - "by different names such as:\n" - f"\t{_SMALL_WES_SYNONYMS.capitalize()}.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on small, medium, or non-commercial wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "- **Retain all relevant technical, design, operational, safety, " - "environmental, and infrastructure-related provisions** that apply " - "to the topic, such as (but not limited to):\n" - "\t- Compliance with legal or regulatory standards.\n" - "\t- Site, structural, or design specifications.\n" - "\t- Environmental impact considerations.\n" - "\t- Safety and risk mitigation measures.\n" - "\t- Infrastructure, implementation, operation, and maintenance " - "details.\n" - "\t- All other **closely related provisions**.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that explicitly applies **only** to " - f"{_IGNORE_TYPES_MICRO} or {_IGNORE_TYPES_LARGE} " - "wind energy systems.\n" - f"- Do **not** include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for small WECS""" - - async def extract_wind_energy_system_section(self, text_chunks): - """Extract ordinance text from input text chunks for WES - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WIND_ENERGY_SYSTEM_FILTER_PROMPT, - ) - - async def extract_small_wind_energy_system_section(self, text_chunks): - """Extract small WES ordinance text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.SMALL_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield ( - "wind_energy_systems_text", - self.extract_wind_energy_system_section, - ) - yield self.OUT_LABEL, self.extract_small_wind_energy_system_section + PROMPTS = [ + { + "key": "wind_energy_systems_text", + "out_fn": "{jurisdiction} Wind Ordinance Text.txt", + "prompt": _WECS_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "cleaned_text_for_extraction", + "out_fn": "{jurisdiction} Cleaned Text.txt", + "prompt": _SMALL_WECS_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for ordinance text extraction""" -class SmallWindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): +class SmallWindPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): """Extract succinct permitted use districts text from input""" IN_LABEL = SmallWindPermittedUseDistrictsTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "districts_text" - """Identifier for permitted use text extracted by this class""" - TASK_DESCRIPTION = "Extracting small wind permitted use text" """Task description to show in progress bar""" @@ -486,148 +481,16 @@ class SmallWindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): _USAGE_LABEL = LLMUsageCategory.DOCUMENT_PERMITTED_USE_DISTRICTS_SUMMARY - PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing permitted use(s) for a district. 
The extracted text will " - "be used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to permitted use(s) for one or more " - "districts while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to permitted use(s) for " - "a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Pay extra attention to any references to **wind energy " - "facilities** or related terms.\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include unrelated regulations, procedural details, " - "or non-use-based restrictions.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. 
## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses""" - - WES_PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing **wind energy system** permitted use(s) for a district. " - "The extracted text will be used for structured data extraction, so " - "it must be both **comprehensive** (retaining all relevant details) " - "and **focused** (excluding unrelated content), with **zero rewriting " - "or paraphrasing**. Ensure that all " - "retained information is **directly applicable** to permitted use(s) " - "for wind energy systems in one or more districts while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to wind energy conversion " - "system permitted use(s) for a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do not include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses for WECS""" - - async def extract_permitted_uses(self, text_chunks): - """Extract permitted uses text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.PERMITTED_USES_FILTER_PROMPT, - ) - - async def extract_wes_permitted_uses(self, text_chunks): - """Extract permitted uses text for small WES from input text - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WES_PERMITTED_USES_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield "permitted_use_only_text", self.extract_permitted_uses - yield self.OUT_LABEL, self.extract_wes_permitted_uses + PROMPTS = [ + { + "key": "permitted_use_only_text", + "out_fn": "{jurisdiction} Permitted Use Only.txt", + "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "districts_text", + "out_fn": "{jurisdiction} Districts.txt", + "prompt": _WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text extraction""" diff --git a/compass/extraction/solar/ordinance.py b/compass/extraction/solar/ordinance.py index 3679e519..67dbb61d 100644 --- a/compass/extraction/solar/ordinance.py +++ b/compass/extraction/solar/ordinance.py @@ -9,7 +9,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -34,6 +34,109 @@ "CSP, private, residential, roof-mounted, micro, small, or medium sized" ) +_SEF_TEXT_EXTRACTION_PROMPT = f"""\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**solar energy systems**. The extracted text will be used for structured data \ +extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable to solar energy systems** while preserving full context and \ +accuracy. 
+ +# OBJECTIVE # +Extract all text **pertaining to solar energy systems** from the provided \ +excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include **all** text that pertains to** solar energy systems**, even if \ +they are referred to by different names such as: \ +{_LARGE_SEF_SYNONYMS.capitalize()} +- Explicitly include any text related to **bans or prohibitions** on solar \ +energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). + +2. ## Exclusions ##: +- Do **not** include text that does not pertain to solar energy systems. + +3. {{FORMATTING_PROMPT}} + +4. {{OUTPUT_PROMPT}}\ +""" + +_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +permitted use(s) for a district. The extracted text will be used for \ +structured data extraction, so it must be both **comprehensive** (retaining \ +all relevant details) and **focused** (excluding unrelated content), with \ +**zero rewriting or paraphrasing**. Ensure that all retained information is \ +**directly applicable** to permitted use(s) for one or more districts while \ +preserving full context and accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Pay extra attention to any references to **solar energy facilities** or \ +related terms. +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do **not** include unrelated regulations, procedural details, or \ +non-use-based restrictions. 
+ +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_SEF_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +**solar energy system** permitted use(s) for a district. The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. Ensure that all \ +retained information is **directly applicable** to permitted use(s) for solar \ +energy systems in one or more districts while preserving full context and \ +accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to solar energy conversion system \ +permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do not include text that does not pertain at all to solar energy systems. + +3. {FORMATTING_PROMPT} + +4. 
{OUTPUT_PROMPT}\ +""" + class SolarHeuristic(OrdinanceHeuristic): """Perform a heuristic check for mention of solar farms in text""" @@ -249,109 +352,34 @@ async def check_chunk(self, chunk_parser, ind): return False -class SolarOrdinanceTextExtractor(OrdinanceTextExtractor): +class SolarOrdinanceTextExtractor(PromptBasedTextExtractor): """Extract succinct ordinance text from input""" IN_LABEL = SolarOrdinanceTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "cleaned_text_for_extraction" - """Identifier for ordinance text extracted by this class""" - TASK_DESCRIPTION = "Extracting solar ordinance text" """Task description to show in progress bar""" TASK_ID = "ordinance_text_extraction" """ID to use for this extraction for linking with LLM configs""" - SOLAR_ENERGY_SYSTEM_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **solar energy systems**. The extracted text will be used for " - "structured data extraction, so it must be both **comprehensive** " - "(retaining all relevant details) and **focused** (excluding " - "unrelated content), with **zero rewriting or paraphrasing**. " - "Ensure that all retained information is " - "**directly applicable to solar energy systems** while preserving " - "full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to solar energy systems** from the " - "provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include **all** text that pertains to** solar energy systems**, " - "even if they are referred to by different names such as:\n" - f"\t{_LARGE_SEF_SYNONYMS.capitalize()}.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on solar energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "\n2. 
## Exclusions ##:\n" - "- Do **not** include text that does not pertain to solar energy " - "systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for SEF""" - - async def extract_solar_energy_system_section(self, text_chunks): - """Extract ordinance text from input text chunks for SEF - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.SOLAR_ENERGY_SYSTEM_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. 
- """ - yield self.OUT_LABEL, self.extract_solar_energy_system_section + PROMPTS = [ + { + "key": "cleaned_text_for_extraction", + "out_fn": "{jurisdiction} Cleaned Text.txt", + "prompt": _SEF_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for ordinance text extraction""" -class SolarPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): +class SolarPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): """Extract succinct permitted use districts text from input""" IN_LABEL = SolarPermittedUseDistrictsTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "districts_text" - """Identifier for permitted use text extracted by this class""" - TASK_DESCRIPTION = "Extracting solar permitted use text" """Task description to show in progress bar""" @@ -360,148 +388,16 @@ class SolarPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): _USAGE_LABEL = LLMUsageCategory.DOCUMENT_PERMITTED_USE_DISTRICTS_SUMMARY - PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing permitted use(s) for a district. The extracted text will " - "be used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to permitted use(s) for one or more " - "districts while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to permitted use(s) for " - "a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. 
## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Pay extra attention to any references to **solar energy " - "facilities** or related terms.\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include unrelated regulations, procedural details, " - "or non-use-based restrictions.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses""" - - SEF_PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing **solar energy system** permitted use(s) for a district. " - "The extracted text will be used for structured data extraction, so " - "it must be both **comprehensive** (retaining all relevant details) " - "and **focused** (excluding unrelated content), with **zero rewriting " - "or paraphrasing**. 
Ensure that all " - "retained information is **directly applicable** to permitted use(s) " - "for solar energy systems in one or more districts while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to solar energy " - "conversion system permitted use(s) for a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do not include text that does not pertain at all to solar " - "energy systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. 
## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses for SEF""" - - async def extract_permitted_uses(self, text_chunks): - """Extract permitted uses text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.PERMITTED_USES_FILTER_PROMPT, - ) - - async def extract_sef_permitted_uses(self, text_chunks): - """Extract permitted uses text for large SEF from input text - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.SEF_PERMITTED_USES_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. 
- """ - yield "permitted_use_only_text", self.extract_permitted_uses - yield self.OUT_LABEL, self.extract_sef_permitted_uses + PROMPTS = [ + { + "key": "permitted_use_only_text", + "out_fn": "{jurisdiction} Permitted Use Only.txt", + "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "districts_text", + "out_fn": "{jurisdiction} Districts.txt", + "prompt": _SEF_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text extraction""" diff --git a/compass/extraction/wind/ordinance.py b/compass/extraction/wind/ordinance.py index d1be4f93..aa547184 100644 --- a/compass/extraction/wind/ordinance.py +++ b/compass/extraction/wind/ordinance.py @@ -9,7 +9,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,151 @@ _SEARCH_TERMS_OR = _SEARCH_TERMS_AND.replace("and", "or") _IGNORE_TYPES = "private, residential, micro, small, or medium sized" +_WECS_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**wind energy systems**. The extracted text will be used for structured data \ +extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable to wind energy systems** while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to wind energy systems** from the provided \ +excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **wind energy systems**. +- Explicitly include any text related to **bans or prohibitions** on wind \ +energy systems. 
+- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). + +2. ## Exclusions ##: +- Do **not** include text that does not pertain to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_LARGE_WECS_TEXT_EXTRACTION_PROMPT = f"""\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**large wind energy systems**. The extracted text will be used for structured \ +data extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable** to large wind energy systems while preserving full context and \ +accuracy. + +# OBJECTIVE # +Extract all text **pertaining to large wind energy systems** from the \ +provided excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **large wind energy systems**, even if \ +they are referred to by different names such as: \ +{_LARGE_WES_SYNONYMS.capitalize()} +- Explicitly include any text related to **bans or prohibitions** on large \ +wind energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). +- **Retain all relevant technical, design, operational, safety, \ +environmental, and infrastructure-related provisions** that apply to the \ +topic, such as (but not limited to): + - Compliance with legal or regulatory standards. + - Site, structural, or design specifications. + - Environmental impact considerations. + - Safety and risk mitigation measures. + - Infrastructure, implementation, operation, and maintenance details. + - All other **closely related provisions**. + +2. ## Exclusions ##: +- Do **not** include text that explicitly applies **only** to {_IGNORE_TYPES} \ +wind energy systems. 
+- Do **not** include text that does not pertain at all to wind energy systems. + +3. {{FORMATTING_PROMPT}} + +4. {{OUTPUT_PROMPT}}\ +""" + +_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +permitted use(s) for a district. The extracted text will be used for \ +structured data extraction, so it must be both **comprehensive** (retaining \ +all relevant details) and **focused** (excluding unrelated content), with \ +**zero rewriting or paraphrasing**. Ensure that all retained information is \ +**directly applicable** to permitted use(s) for one or more districts while \ +preserving full context and accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Pay extra attention to any references to **wind energy facilities** or \ +related terms. +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do **not** include unrelated regulations, procedural details, or \ +non-use-based restrictions. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +**wind energy system** permitted use(s) for a district. The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. 
Ensure that all \ +retained information is **directly applicable** to permitted use(s) for wind \ +energy systems in one or more districts while preserving full context and \ +accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to wind energy conversion system \ +permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do not include text that does not pertain at all to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + class WindHeuristic(OrdinanceHeuristic): """Perform a heuristic check for mention of wind turbines in text""" @@ -267,187 +412,39 @@ async def check_chunk(self, chunk_parser, ind): return False -class WindOrdinanceTextExtractor(OrdinanceTextExtractor): +class WindOrdinanceTextExtractor(PromptBasedTextExtractor): """Extract succinct ordinance text from input""" IN_LABEL = WindOrdinanceTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "cleaned_text_for_extraction" - """Identifier for ordinance text extracted by this class""" - TASK_DESCRIPTION = "Extracting wind ordinance text" """Task description to show in progress bar""" TASK_ID = "ordinance_text_extraction" """ID to use for this extraction for linking with LLM configs""" - WIND_ENERGY_SYSTEM_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **wind energy systems**. 
The extracted text will be used for " - "structured data extraction, so it must be both **comprehensive** " - "(retaining all relevant details) and **focused** (excluding " - "unrelated content), with **zero rewriting or paraphrasing**. " - "Ensure that all retained information is " - "**directly applicable to wind energy systems** while preserving " - "full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to wind energy systems** from the " - "provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **wind energy systems**.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that does not pertain to wind energy " - "systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for WECS""" - - LARGE_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **large wind energy systems**. 
The extracted text will be " - "used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to large wind energy systems while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to large wind energy systems** from " - "the provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **large wind energy systems**, " - "even if they are referred to by different names such as:\n" - f"\t{_LARGE_WES_SYNONYMS.capitalize()}.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on large wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "- **Retain all relevant technical, design, operational, safety, " - "environmental, and infrastructure-related provisions** that apply " - "to the topic, such as (but not limited to):\n" - "\t- Compliance with legal or regulatory standards.\n" - "\t- Site, structural, or design specifications.\n" - "\t- Environmental impact considerations.\n" - "\t- Safety and risk mitigation measures.\n" - "\t- Infrastructure, implementation, operation, and maintenance " - "details.\n" - "\t- All other **closely related provisions**.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that explicitly applies **only** to " - f"{_IGNORE_TYPES} wind energy systems.\n" - f"- Do **not** include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for utility-scale WECS""" - - async def extract_wind_energy_system_section(self, text_chunks): - """Extract ordinance text from input text chunks for WES - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WIND_ENERGY_SYSTEM_FILTER_PROMPT, - ) - - async def extract_large_wind_energy_system_section(self, text_chunks): - """Extract large WES ordinance text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.LARGE_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield ( - "wind_energy_systems_text", - self.extract_wind_energy_system_section, - ) - yield self.OUT_LABEL, self.extract_large_wind_energy_system_section + PROMPTS = [ + { + "key": "wind_energy_systems_text", + "out_fn": "{jurisdiction} Wind Ordinance Text.txt", + "prompt": _WECS_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "cleaned_text_for_extraction", + "out_fn": "{jurisdiction} Cleaned Text.txt", + "prompt": _LARGE_WECS_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for ordinance text extraction""" -class WindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): +class WindPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): """Extract succinct permitted use districts text from input""" IN_LABEL = WindPermittedUseDistrictsTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "districts_text" - """Identifier for permitted use text extracted by this class""" - TASK_DESCRIPTION = "Extracting wind permitted use text" """Task description to show in progress bar""" @@ -456,148 +453,16 @@ class WindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): _USAGE_LABEL = LLMUsageCategory.DOCUMENT_PERMITTED_USE_DISTRICTS_SUMMARY - PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing permitted use(s) for a district. 
The extracted text will " - "be used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to permitted use(s) for one or more " - "districts while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to permitted use(s) for " - "a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Pay extra attention to any references to **wind energy " - "facilities** or related terms.\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include unrelated regulations, procedural details, " - "or non-use-based restrictions.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. 
## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses""" - - WES_PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing **wind energy system** permitted use(s) for a district. " - "The extracted text will be used for structured data extraction, so " - "it must be both **comprehensive** (retaining all relevant details) " - "and **focused** (excluding unrelated content), with **zero rewriting " - "or paraphrasing**. Ensure that all " - "retained information is **directly applicable** to permitted use(s) " - "for wind energy systems in one or more districts while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to wind energy conversion " - "system permitted use(s) for a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do not include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses for WECS""" - - async def extract_permitted_uses(self, text_chunks): - """Extract permitted uses text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.PERMITTED_USES_FILTER_PROMPT, - ) - - async def extract_wes_permitted_uses(self, text_chunks): - """Extract permitted uses text for large WES from input text - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WES_PERMITTED_USES_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield "permitted_use_only_text", self.extract_permitted_uses - yield self.OUT_LABEL, self.extract_wes_permitted_uses + PROMPTS = [ + { + "key": "permitted_use_only_text", + "out_fn": "{jurisdiction} Permitted Use Only.txt", + "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "districts_text", + "out_fn": "{jurisdiction} Districts.txt", + "prompt": _WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text extraction""" diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index affa5fb8..7cb2c6d1 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -197,20 +197,126 @@ def _store_chunk(self, parser, chunk_ind): ) -class OrdinanceTextExtractor(BaseTextExtractor, ABC): - """Base implementation for a text extractor""" +class PromptBasedTextExtractor(BaseTextExtractor, ABC): + """Text extractor based on a chain of prompts""" SYSTEM_MESSAGE = ( - "You are a text extraction assistant. Your job is to extract only " - "verbatim, **unmodified** excerpts from provided legal or policy " - "documents. Do not interpret or paraphrase. Do not summarize. Only " - "return exactly copied segments that match the specified scope. If " - "the relevant content appears within a table, return the entire " - "table, including headers and footers, exactly as formatted." + dedent( + """\ + You are a text extraction assistant. Your job is to extract only + verbatim, **unmodified** excerpts from the provided text. Do not + interpret or paraphrase. Do not summarize. 
Only return exactly copied + segments that match the specified scope. If the relevant content + appears within a table, return the entire table, including headers + and footers, exactly as formatted. + """ + ) + .replace("\n", " ") + .strip() ) """System message for text extraction LLM calls""" + + FORMATTING_PROMPT = ( + dedent( + """\ + ## Formatting & Structure ##: + - **Preserve _all_ section titles, headers, and numberings** for + reference. + - **Maintain the original wording, formatting, and structure** to + ensure accuracy. + """ + ) + .replace("\n ", " ") + .strip() + ) + """Prompt component instructing model to preserve text structure""" + + OUTPUT_PROMPT = ( + dedent( + """\ + ## Output Handling ##: + - This is a strict extraction task — act like a text filter, **not** + a summarizer or writer. + - Do not add, explain, reword, or summarize anything. + - The output must be a **copy-paste** of the original excerpt. + **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous + verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: + 'No relevant text.' + """ + ) + .replace("\n ", " ") + .strip() + ) + """Prompt component instructing model output guidelines""" + _USAGE_LABEL = LLMUsageCategory.DOCUMENT_ORDINANCE_SUMMARY + @property + @abstractmethod + def PROMPTS(self): # noqa: N802 + """list: List of dicts defining the prompts for text extraction + + Each dict in the list should have the following keys: + + - **prompt**: [REQUIRED] The text extraction prompt to use + for the extraction. The prompt may use the following + placeholders, which will be filled in with the + corresponding class attributes when the prompt is applied: + + - ``"{FORMATTING_PROMPT}"``: The + :obj:`PromptBasedTextExtractor.FORMATTING_PROMPT` + class attribute, which provides instructions for + preserving the formatting and structure of the + extracted text. 
+ - ``"{OUTPUT_PROMPT}"``: The + :obj:`PromptBasedTextExtractor.OUTPUT_PROMPT` + class attribute, which provides instructions for + how the model should format the output and what + content to include or exclude. + + - **key**: [OPTIONAL] A string identifier for the text + extracted by this prompt. If not provided, a default key + ``"extracted_text_{i}"`` will be used, where ``{i}`` is + the index of the prompt in the list. The value of this key + from the last dictionary in the input list will be used as + this extractor's `OUT_LABEL`, which is typically used to + link the extracted text to the appropriate parser via the + parser's `IN_LABEL`. All `key` values should be unique + across all prompts in the chain. + - **out_fn**: [OPTIONAL] A file name template that will be + used to write the extracted text to a file. The template + can include the placeholder ``{jurisdiction}``, which + will be replaced with the full jurisdiction name. If not + provided, the extracted text will not be written to a + file. This is primarily intended for debugging and + analysis purposes, and is not required for the extraction + process itself. + + The prompts will be applied in the order they appear in the + list, with the output text from each prompt being fed as input + to the next prompt in the chain. The final output of the last + prompt will be the output of the extractor. 
+ """ + raise NotImplementedError + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + if getattr(cls, "__abstractmethods__", None): + return + + if not cls.PROMPTS: # TODO: This should happen at registration + msg = ( + f"{cls.__name__} must have at least one " + "prompt defined in the PROMPTS property" + ) + raise COMPASSPluginConfigurationError(msg) + + last_prompt = cls.PROMPTS[-1] + last_index = len(cls.PROMPTS) - 1 + cls.OUT_LABEL = last_prompt.get("key", f"extracted_text_{last_index}") + def __init__(self, llm_caller): """ @@ -221,6 +327,27 @@ def __init__(self, llm_caller): """ self.llm_caller = llm_caller + @property + def parsers(self): + """Iterable of parsers provided by this extractor + + Yields + ------ + name : str + Name describing the type of text output by the parser. + parser : callable + Async function that takes a ``text_chunks`` input and + outputs parsed text. + """ + for ind, prompt_dict in enumerate(self.PROMPTS): + key = prompt_dict.get("key", f"extracted_text_{ind}") + instructions = prompt_dict["prompt"].format( + FORMATTING_PROMPT=self.FORMATTING_PROMPT, + OUTPUT_PROMPT=self.OUTPUT_PROMPT, + ) + # out_fn = prompt_dict.get("out_fn", None) + yield key, partial(self._process, instructions=instructions) + async def _process(self, text_chunks, instructions, is_valid_chunk=None): """Perform extraction processing""" if is_valid_chunk is None: From 0e06c07090df4c88268fd99729bc02b1707d0ec1 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 19:22:26 -0700 Subject: [PATCH 03/22] Break uo plugin class into a filtered one and ordinance one --- compass/extraction/small_wind/plugin.py | 4 +- compass/extraction/solar/plugin.py | 4 +- compass/extraction/wind/plugin.py | 4 +- compass/plugin/__init__.py | 9 +- compass/plugin/interface.py | 434 +---------------- compass/plugin/ordinance.py | 443 +++++++++++++++++- docs/source/conf.py | 4 +- .../unit/plugin/test_plugin_interface.py | 22 +- 8 files changed, 475 
insertions(+), 449 deletions(-) diff --git a/compass/extraction/small_wind/plugin.py b/compass/extraction/small_wind/plugin.py index e4d3de1b..02fb97ed 100644 --- a/compass/extraction/small_wind/plugin.py +++ b/compass/extraction/small_wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin.interface import ExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin from compass.extraction.small_wind.ordinance import ( SmallWindHeuristic, SmallWindOrdinanceTextCollector, @@ -60,7 +60,7 @@ } -class COMPASSSmallWindExtractor(ExtractionPlugin): +class COMPASSSmallWindExtractor(OrdinanceExtractionPlugin): """COMPASS small wind extraction plugin""" IDENTIFIER = "small wind" diff --git a/compass/extraction/solar/plugin.py b/compass/extraction/solar/plugin.py index 8123f2ac..4f6b5163 100644 --- a/compass/extraction/solar/plugin.py +++ b/compass/extraction/solar/plugin.py @@ -1,6 +1,6 @@ """COMPASS solar extraction plugin""" -from compass.plugin.interface import ExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin from compass.extraction.solar.ordinance import ( SolarHeuristic, SolarOrdinanceTextCollector, @@ -61,7 +61,7 @@ } -class COMPASSSolarExtractor(ExtractionPlugin): +class COMPASSSolarExtractor(OrdinanceExtractionPlugin): """COMPASS solar extraction plugin""" IDENTIFIER = "solar" diff --git a/compass/extraction/wind/plugin.py b/compass/extraction/wind/plugin.py index 1e22ffaa..905bcc87 100644 --- a/compass/extraction/wind/plugin.py +++ b/compass/extraction/wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin.interface import ExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin from compass.extraction.wind.ordinance import ( WindHeuristic, WindOrdinanceTextCollector, @@ -59,7 +59,7 @@ } -class COMPASSWindExtractor(ExtractionPlugin): +class COMPASSWindExtractor(OrdinanceExtractionPlugin): """COMPASS wind extraction plugin""" IDENTIFIER = "wind" diff 
--git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index f42cef35..369a3c80 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -4,13 +4,14 @@ from .interface import ( BaseHeuristic, BaseTextCollector, - BaseTextExtractor, - BaseParser, - ExtractionPlugin, + FilteredExtractionPlugin, ) from .ordinance import ( + BaseTextExtractor, + BaseParser, OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, OrdinanceParser, + OrdinanceExtractionPlugin, ) diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index f73cff8c..25e8607b 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -1,52 +1,20 @@ """COMPASS extraction plugin base class""" -import asyncio import logging -from itertools import chain from abc import ABC, abstractmethod -from contextlib import contextmanager -from functools import cached_property - -import pandas as pd from compass.plugin.base import BaseExtractionPlugin from compass.llm.calling import LLMCaller -from compass.extraction import ( - extract_ordinance_values, - extract_relevant_text_with_ngram_validation, -) +from compass.extraction import extract_relevant_text_with_ngram_validation from compass.scripts.download import filter_ordinance_docs from compass.services.threaded import CleanedFileWriter -from compass.utilities.enums import LLMTasks -from compass.utilities import ( - num_ordinances_dataframe, - doc_infos_to_db, - save_db, -) -from compass.utilities.parsing import extract_ord_year_from_doc_attrs -from compass.exceptions import COMPASSPluginConfigurationError -from compass.pb import COMPASS_PB +from compass.utilities import doc_infos_to_db, save_db -logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) -EXCLUDE_FROM_ORD_DOC_CHECK = { - # if doc only contains these, it's not good enough to count as an - # ordinance. 
Note that prohibitions are explicitly not on this list - "color", - "decommissioning", - "lighting", - "visual impact", - "glare", - "repowering", - "fencing", - "climbing prevention", - "signage", - "soil", - "primary use districts", - "special use districts", - "accessory use districts", -} +# TODO: Allow other to register own clean file outputs +# TODO: Allow other to register their own jurisdictions csv class BaseHeuristic(ABC): @@ -129,86 +97,15 @@ async def check_chunk(self, chunk_parser, ind): raise NotImplementedError -class BaseTextExtractor(ABC): - """Extract succinct extraction text from input""" - - TASK_DESCRIPTION = "Condensing text for extraction" - """Task description to show in progress bar""" - - TASK_ID = "text_extraction" - """ID to use for this extraction for linking with LLM configs""" - - @property - @abstractmethod - def IN_LABEL(self): # noqa: N802 - """str: Identifier for text ingested by this class""" - raise NotImplementedError - - @property - @abstractmethod - def OUT_LABEL(self): # noqa: N802 - """str: Identifier for final text extracted by this class""" - raise NotImplementedError - - @property - @abstractmethod - def parsers(self): - """Generator: Generator of (key, extractor) pairs - - `extractor` should be an async callable that accepts a list of - text chunks and returns the shortened (succinct) text to be used - for extraction. The `key` should be a string identifier for the - text returned by the extractor. Multiple (key, extractor) pairs - can be chained in generator order to iteratively refine the - text for extraction. 
- """ - raise NotImplementedError - - -class BaseParser(ABC): - """Extract succinct extraction text from input""" - - TASK_ID = "data_extraction" - """ID to use for this extraction for linking with LLM configs""" - - @property - @abstractmethod - def IN_LABEL(self): # noqa: N802 - """str: Identifier for text ingested by this class""" - raise NotImplementedError - - @property - @abstractmethod - def OUT_LABEL(self): # noqa: N802 - """str: Identifier for final structured data output""" - raise NotImplementedError - - @abstractmethod - async def parse(self, text): - """Parse text and extract structured data - - Parameters - ---------- - text : str - Text which may or may not contain information relevant to - the current extraction. - - Returns - ------- - pandas.DataFrame or None - DataFrame containing structured extracted data. Can also - be ``None`` if no relevant values can be parsed from the - text. - """ - raise NotImplementedError - - -class ExtractionPlugin(BaseExtractionPlugin): +class FilteredExtractionPlugin(BaseExtractionPlugin): """Base class for COMPASS extraction plugins - This class provides a good balance between ease of use and - extraction flexibility, allowing implementers to provide additional - functionality during the extraction process. + This class provides the standard COMPASS document filtering and text + collection pipeline, allowing implementers to focus primarily on the + structured data extraction step. Filtering and text collection is + provided by subclassing the `BaseTextCollector` class and setting + the `TEXT_COLLECTORS` property to a list of the desired text + collectors. 
Plugins can hook into various stages of the extraction pipeline to modify behavior, add custom processing, or integrate with @@ -255,26 +152,6 @@ def TEXT_COLLECTORS(self): # noqa: N802 """ raise NotImplementedError - @property - @abstractmethod - def TEXT_EXTRACTORS(self): # noqa: N802 - """list of BaseTextExtractor: Classes to condense text - - Should be an iterable of one or more classes to condense text in - preparation for the extraction task. - """ - raise NotImplementedError - - @property - @abstractmethod - def PARSERS(self): # noqa: N802 - """list of BaseParser: Classes to extract structured data - - Should be an iterable of one or more classes to extract - structured data from text. - """ - raise NotImplementedError - @property def heuristic(self): """BaseHeuristic: Object with a ``check()`` method @@ -312,65 +189,6 @@ def save_structured_data(cls, doc_infos, out_dir): save_db(db, out_dir) return num_docs_found - def __init__(self, jurisdiction, model_configs, usage_tracker=None): - """ - - Parameters - ---------- - jurisdiction : Jurisdiction - Jurisdiction for which extraction is being performed. - model_configs : dict - Dictionary where keys are LLMTasks and values are LLMConfig - instances to be used for those tasks. - usage_tracker : UsageTracker, optional - Usage tracker instance that can be used to record the LLM - call cost. By default, ``None``. 
- """ - super().__init__( - jurisdiction=jurisdiction, - model_configs=model_configs, - usage_tracker=usage_tracker, - ) - - # TODO: This should happen during plugin registration - self._validate_in_out_keys() - - @cached_property - def producers(self): - """list: All classes that produce attributes on the doc""" - return chain(self.PARSERS, self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS) - - @cached_property - def consumer_producer_pairs(self): - """list: Pairs of (consumer, producer) for IN/OUT validation""" - return [ - (self.PARSERS, chain(self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS)), - (self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS), - ] - - def _validate_in_out_keys(self): - """Validate that all IN_LABELs have matching OUT_LABELs""" - out_keys = {} - for producer in self.producers: - out_keys.setdefault(producer.OUT_LABEL, []).append(producer) - - dupes = {k: v for k, v in out_keys.items() if len(v) > 1} - if dupes: - formatted = "\n".join( - [ - f"{key}: {[cls.__name__ for cls in classes]}" - for key, classes in dupes.items() - ] - ) - msg = ( - "Multiple processing classes produce the same OUT_LABEL key:\n" - f"{formatted}" - ) - raise COMPASSPluginConfigurationError(msg) - - for consumers, producers in self.consumer_producer_pairs: - _validate_in_out_keys(consumers, producers) - async def pre_filter_docs_hook(self, extraction_context): # noqa: PLR6301 """Pre-process documents before running them through the filter @@ -435,61 +253,6 @@ async def extract_relevant_text(self, doc, extractor_class, model_config): ) await self._write_cleaned_text(doc) - async def extract_ordinances_from_text( - self, doc, parser_class, model_config - ): - """Extract structured data from input text - - The extracted structured data will be stored in the ``.attrs`` - dictionary of the input document under the - ``parser_class.OUT_LABEL`` key. - - Parameters - ---------- - doc : BaseDocument - Document containing text to extract structured data from. 
- parser_class : BaseParser - Class to use for structured data extraction. - model_config : LLMConfig - Configuration for the LLM model to use for structured data - extraction. - """ - parser = parser_class( - llm_service=model_config.llm_service, - usage_tracker=self.usage_tracker, - **model_config.llm_call_kwargs, - ) - logger.info( - "Extracting %s...", parser_class.OUT_LABEL.replace("_", " ") - ) - await extract_ordinance_values( - doc, - parser, - text_key=parser_class.IN_LABEL, - out_key=parser_class.OUT_LABEL, - ) - - @classmethod - def get_structured_data_row_count(cls, data_df): - """Get the number of data rows extracted from a document - - Parameters - ---------- - data_df : pandas.DataFrame or None - DataFrame to check for extracted structured data. - - Returns - ------- - int - Number of data rows extracted from the document. - """ - if data_df is None: - return 0 - - return num_ordinances_dataframe( - data_df, exclude_features=EXCLUDE_FROM_ORD_DOC_CHECK - ) - async def filter_docs( self, extraction_context, need_jurisdiction_verification=True ): @@ -560,181 +323,8 @@ async def filter_docs( extraction_context.documents = docs return extraction_context - async def parse_docs_for_structured_data(self, extraction_context): - """Parse documents to extract structured data/information - - Parameters - ---------- - extraction_context : ExtractionContext - Context containing candidate documents to parse. - - Returns - ------- - ExtractionContext or None - Context with extracted data/information stored in the - ``.attrs`` dictionary, or ``None`` if no data was extracted. 
- """ - for doc_for_extraction in extraction_context: - data_df = await self.parse_single_doc_for_structured_data( - doc_for_extraction - ) - row_count = self.get_structured_data_row_count(data_df) - if row_count > 0: - await extraction_context.mark_doc_as_data_source( - doc_for_extraction, out_fn_stem=self.jurisdiction.full_name - ) - extraction_context.attrs["structured_data"] = data_df - logger.info( - "%d ordinance value(s) found in doc from %s for %s. ", - row_count, - doc_for_extraction.attrs.get("source", "unknown source"), - self.jurisdiction.full_name, - ) - return extraction_context - - logger.debug( - "No ordinances found; searched %d docs", - extraction_context.num_documents, - ) - return None - - async def parse_single_doc_for_structured_data(self, doc_for_extraction): - """Extract all possible structured data from a document - - This method is called from the default implementation of - `parse_docs_for_structured_data()` for each document that passed - filtering. If you overwrite`parse_docs_for_structured_data()``, - you can ignore this method. - - Parameters - ---------- - doc_for_extraction : BaseDocument - Document to extract structured data from. - - Returns - ------- - BaseDocument - Document with extracted structured data stored in the - ``.attrs`` dictionary. 
- """ - with self._tracked_progress(): - tasks = [ - asyncio.create_task( - self._try_extract_ordinances( - doc_for_extraction, parser_class - ), - name=self.jurisdiction.full_name, - ) - for parser_class in filter(None, self.PARSERS) - ] - await asyncio.gather(*tasks) - - return self._concat_scrape_results(doc_for_extraction) - - async def _try_extract_ordinances(self, doc_for_extraction, parser_class): - """Apply a single extractor and parser to legal text""" - - if parser_class.IN_LABEL not in doc_for_extraction.attrs: - await self._run_text_extractors(doc_for_extraction, parser_class) - - model_config = self._get_model_config( - primary_key=parser_class.TASK_ID, - secondary_key=LLMTasks.DATA_EXTRACTION, - ) - await self.extract_ordinances_from_text( - doc_for_extraction, - parser_class=parser_class, - model_config=model_config, - ) - - await self.record_usage() - - async def _run_text_extractors(self, doc_for_extraction, parser_class): - """Run text extractor(s) on document to get text for a parser""" - te = [ - te - for te in self.TEXT_EXTRACTORS - if te.OUT_LABEL == parser_class.IN_LABEL - ] - if len(te) != 1: - msg = ( - f"Could not find unique text extractor for parser " - f"{parser_class.__name__} with IN_LABEL " - f"{parser_class.IN_LABEL!r}. Got matches: {te}" - ) - raise COMPASSPluginConfigurationError(msg) - - te = te[0] - model_config = self._get_model_config( - primary_key=te.TASK_ID, - secondary_key=LLMTasks.TEXT_EXTRACTION, - ) - logger.debug( - "Condensing text for extraction using %r for doc from %s", - te.__name__, - doc_for_extraction.attrs.get("source", "unknown source"), - ) - assert self._jsp is not None, "No progress bar set!" 
- task_id = self._jsp.add_task(te.TASK_DESCRIPTION) - await self.extract_relevant_text(doc_for_extraction, te, model_config) - await self.record_usage() - self._jsp.remove_task(task_id) - - @contextmanager - def _tracked_progress(self): - """Context manager to set up jurisdiction sub-progress bar""" - loc = self.jurisdiction.full_name - with COMPASS_PB.jurisdiction_sub_prog(loc) as self._jsp: - yield - - self._jsp = None - - def _concat_scrape_results(self, doc): - """Concatenate structured data from all parsers""" - data = [doc.attrs.get(p.OUT_LABEL, None) for p in self.PARSERS] - data = [df for df in data if df is not None and not df.empty] - if len(data) == 0: - return None - - data = data[0] if len(data) == 1 else pd.concat(data) - data["source"] = doc.attrs.get("source") - data["ord_year"] = extract_ord_year_from_doc_attrs(doc.attrs) - return data - - def _get_model_config(self, primary_key, secondary_key): - """Get model config: primary_key -> secondary_key -> default""" - if primary_key in self.model_configs: - return self.model_configs[primary_key] - return self.model_configs.get( - secondary_key, self.model_configs[LLMTasks.DEFAULT] - ) - async def _write_cleaned_text(self, doc): """Write cleaned text to `clean_files` dir""" out_fp = await CleanedFileWriter.call(doc, self.jurisdiction.full_name) doc.attrs["cleaned_fps"] = out_fp return doc - - -def _validate_in_out_keys(consumers, producers): - """Validate that all IN_LABELs have matching OUT_LABELs""" - in_keys = {} - out_keys = {} - - for producer_class in producers: - out_keys.setdefault(producer_class.OUT_LABEL, []).append( - producer_class - ) - - for consumer_class in chain(consumers): - in_keys.setdefault(consumer_class.IN_LABEL, []).append(consumer_class) - - for in_key, classes in in_keys.items(): - formatted = f"{[cls.__name__ for cls in classes]}" - if in_key not in out_keys: - msg = ( - f"One or more processing classes require IN_LABEL " - f"{in_key!r}, which is not produced by any previous " - 
f"processing class: {formatted}" - ) - raise COMPASSPluginConfigurationError(msg) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 7cb2c6d1..de75087a 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -2,9 +2,14 @@ import asyncio import logging -from abc import ABC, abstractmethod from warnings import warn +from textwrap import dedent +from itertools import chain +from functools import cached_property, partial +from abc import ABC, abstractmethod +from contextlib import contextmanager +import pandas as pd from elm import ApiBase from compass.llm.calling import ( @@ -12,22 +17,117 @@ ChatLLMCaller, StructuredLLMCaller, ) -from compass.utilities.enums import LLMUsageCategory -from compass.utilities.ngrams import convert_text_to_sentence_ngrams -from compass.warn import COMPASSWarning -from compass.utilities.parsing import ( - merge_overlapping_texts, - clean_backticks_from_llm_response, -) from compass.plugin.interface import ( BaseHeuristic, BaseTextCollector, - BaseTextExtractor, - BaseParser, + FilteredExtractionPlugin, +) +from compass.extraction import extract_ordinance_values +from compass.utilities.enums import LLMTasks, LLMUsageCategory +from compass.utilities.ngrams import convert_text_to_sentence_ngrams +from compass.utilities.parsing import ( + clean_backticks_from_llm_response, + extract_ord_year_from_doc_attrs, + merge_overlapping_texts, ) +from compass.utilities import num_ordinances_dataframe +from compass.warn import COMPASSWarning +from compass.exceptions import COMPASSPluginConfigurationError +from compass.pb import COMPASS_PB logger = logging.getLogger(__name__) +EXCLUDE_FROM_ORD_DOC_CHECK = { + # if doc only contains these, it's not good enough to count as an + # ordinance. 
Note that prohibitions are explicitly not on this list + "color", + "decommissioning", + "lighting", + "visual impact", + "glare", + "repowering", + "fencing", + "climbing prevention", + "signage", + "soil", + "primary use districts", + "special use districts", + "accessory use districts", +} + + +class BaseTextExtractor(ABC): + """Extract succinct extraction text from input""" + + TASK_DESCRIPTION = "Condensing text for extraction" + """Task description to show in progress bar""" + + TASK_ID = "text_extraction" + """ID to use for this extraction for linking with LLM configs""" + + @property + @abstractmethod + def IN_LABEL(self): # noqa: N802 + """str: Identifier for text ingested by this class""" + raise NotImplementedError + + @property + @abstractmethod + def OUT_LABEL(self): # noqa: N802 + """str: Identifier for final text extracted by this class""" + raise NotImplementedError + + @property + @abstractmethod + def parsers(self): + """Generator: Generator of (key, extractor) pairs + + `extractor` should be an async callable that accepts a list of + text chunks and returns the shortened (succinct) text to be used + for extraction. The `key` should be a string identifier for the + text returned by the extractor. Multiple (key, extractor) pairs + can be chained in generator order to iteratively refine the + text for extraction. 
+ """ + raise NotImplementedError + + +class BaseParser(ABC): + """Extract succinct extraction text from input""" + + TASK_ID = "data_extraction" + """ID to use for this extraction for linking with LLM configs""" + + @property + @abstractmethod + def IN_LABEL(self): # noqa: N802 + """str: Identifier for text ingested by this class""" + raise NotImplementedError + + @property + @abstractmethod + def OUT_LABEL(self): # noqa: N802 + """str: Identifier for final structured data output""" + raise NotImplementedError + + @abstractmethod + async def parse(self, text): + """Parse text and extract structured data + + Parameters + ---------- + text : str + Text which may or may not contain information relevant to + the current extraction. + + Returns + ------- + pandas.DataFrame or None + DataFrame containing structured extracted data. Can also + be ``None`` if no relevant values can be parsed from the + text. + """ + raise NotImplementedError class OrdinanceHeuristic(BaseHeuristic, ABC): @@ -401,6 +501,329 @@ def _init_chat_llm_caller(self, system_message): ) +class OrdinanceExtractionPlugin(FilteredExtractionPlugin): + """Base class for COMPASS extraction plugins + + This class provides a good balance between ease of use and + extraction flexibility, allowing implementers to provide additional + functionality during the extraction process. + + Plugins can hook into various stages of the extraction pipeline + to modify behavior, add custom processing, or integrate with + external systems. + + Subclasses should implement the desired hooks and override + methods as needed. + """ + + @property + @abstractmethod + def TEXT_EXTRACTORS(self): # noqa: N802 + """list of BaseTextExtractor: Classes to condense text + + Should be an iterable of one or more classes to condense text in + preparation for the extraction task. 
+ """ + raise NotImplementedError + + @property + @abstractmethod + def PARSERS(self): # noqa: N802 + """list of BaseParser: Classes to extract structured data + + Should be an iterable of one or more classes to extract + structured data from text. + """ + raise NotImplementedError + + @cached_property + def producers(self): + """list: All classes that produce attributes on the doc""" + return chain(self.PARSERS, self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS) + + @cached_property + def consumer_producer_pairs(self): + """list: Pairs of (consumer, producer) for IN/OUT validation""" + return [ + (self.PARSERS, chain(self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS)), + (self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS), + ] + + def __init__(self, jurisdiction, model_configs, usage_tracker=None): + """ + + Parameters + ---------- + jurisdiction : Jurisdiction + Jurisdiction for which extraction is being performed. + model_configs : dict + Dictionary where keys are LLMTasks and values are LLMConfig + instances to be used for those tasks. + usage_tracker : UsageTracker, optional + Usage tracker instance that can be used to record the LLM + call cost. By default, ``None``. 
+ """ + super().__init__( + jurisdiction=jurisdiction, + model_configs=model_configs, + usage_tracker=usage_tracker, + ) + + # TODO: This should happen during plugin registration + self._validate_in_out_keys() + + def _validate_in_out_keys(self): + """Validate that all IN_LABELs have matching OUT_LABELs""" + out_keys = {} + for producer in self.producers: + out_keys.setdefault(producer.OUT_LABEL, []).append(producer) + + dupes = {k: v for k, v in out_keys.items() if len(v) > 1} + if dupes: + formatted = "\n".join( + [ + f"{key}: {[cls.__name__ for cls in classes]}" + for key, classes in dupes.items() + ] + ) + msg = ( + "Multiple processing classes produce the same OUT_LABEL key:\n" + f"{formatted}" + ) + raise COMPASSPluginConfigurationError(msg) + + for consumers, producers in self.consumer_producer_pairs: + _validate_in_out_keys(consumers, producers) + + async def extract_ordinances_from_text( + self, doc, parser_class, model_config + ): + """Extract structured data from input text + + The extracted structured data will be stored in the ``.attrs`` + dictionary of the input document under the + ``parser_class.OUT_LABEL`` key. + + Parameters + ---------- + doc : BaseDocument + Document containing text to extract structured data from. + parser_class : BaseParser + Class to use for structured data extraction. + model_config : LLMConfig + Configuration for the LLM model to use for structured data + extraction. 
+ """ + parser = parser_class( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.info( + "Extracting %s...", parser_class.OUT_LABEL.replace("_", " ") + ) + await extract_ordinance_values( + doc, + parser, + text_key=parser_class.IN_LABEL, + out_key=parser_class.OUT_LABEL, + ) + + @classmethod + def get_structured_data_row_count(cls, data_df): + """Get the number of data rows extracted from a document + + Parameters + ---------- + data_df : pandas.DataFrame or None + DataFrame to check for extracted structured data. + + Returns + ------- + int + Number of data rows extracted from the document. + """ + if data_df is None: + return 0 + + return num_ordinances_dataframe( + data_df, exclude_features=EXCLUDE_FROM_ORD_DOC_CHECK + ) + + async def parse_docs_for_structured_data(self, extraction_context): + """Parse documents to extract structured data/information + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. + """ + for doc_for_extraction in extraction_context: + data_df = await self.parse_single_doc_for_structured_data( + doc_for_extraction + ) + row_count = self.get_structured_data_row_count(data_df) + if row_count > 0: + await extraction_context.mark_doc_as_data_source( + doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + ) + extraction_context.attrs["structured_data"] = data_df + logger.info( + "%d ordinance value(s) found in doc from %s for %s. 
", + row_count, + doc_for_extraction.attrs.get("source", "unknown source"), + self.jurisdiction.full_name, + ) + return extraction_context + + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None + + async def parse_single_doc_for_structured_data(self, doc_for_extraction): + """Extract all possible structured data from a document + + This method is called from the default implementation of + `parse_docs_for_structured_data()` for each document that passed + filtering. If you overwrite`parse_docs_for_structured_data()``, + you can ignore this method. + + Parameters + ---------- + doc_for_extraction : BaseDocument + Document to extract structured data from. + + Returns + ------- + BaseDocument + Document with extracted structured data stored in the + ``.attrs`` dictionary. + """ + with self._tracked_progress(): + tasks = [ + asyncio.create_task( + self._try_extract_ordinances( + doc_for_extraction, parser_class + ), + name=self.jurisdiction.full_name, + ) + for parser_class in filter(None, self.PARSERS) + ] + await asyncio.gather(*tasks) + + return self._concat_scrape_results(doc_for_extraction) + + async def _try_extract_ordinances(self, doc_for_extraction, parser_class): + """Apply a single extractor and parser to legal text""" + + if parser_class.IN_LABEL not in doc_for_extraction.attrs: + await self._run_text_extractors(doc_for_extraction, parser_class) + + model_config = self._get_model_config( + primary_key=parser_class.TASK_ID, + secondary_key=LLMTasks.DATA_EXTRACTION, + ) + await self.extract_ordinances_from_text( + doc_for_extraction, + parser_class=parser_class, + model_config=model_config, + ) + + await self.record_usage() + + async def _run_text_extractors(self, doc_for_extraction, parser_class): + """Run text extractor(s) on document to get text for a parser""" + te = [ + te + for te in self.TEXT_EXTRACTORS + if te.OUT_LABEL == parser_class.IN_LABEL + ] + if len(te) != 1: + msg = ( + f"Could not 
find unique text extractor for parser " + f"{parser_class.__name__} with IN_LABEL " + f"{parser_class.IN_LABEL!r}. Got matches: {te}" + ) + raise COMPASSPluginConfigurationError(msg) + + te = te[0] + model_config = self._get_model_config( + primary_key=te.TASK_ID, + secondary_key=LLMTasks.TEXT_EXTRACTION, + ) + logger.debug( + "Condensing text for extraction using %r for doc from %s", + te.__name__, + doc_for_extraction.attrs.get("source", "unknown source"), + ) + assert self._jsp is not None, "No progress bar set!" + task_id = self._jsp.add_task(te.TASK_DESCRIPTION) + await self.extract_relevant_text(doc_for_extraction, te, model_config) + await self.record_usage() + self._jsp.remove_task(task_id) + + @contextmanager + def _tracked_progress(self): + """Context manager to set up jurisdiction sub-progress bar""" + loc = self.jurisdiction.full_name + with COMPASS_PB.jurisdiction_sub_prog(loc) as self._jsp: + yield + + self._jsp = None + + def _concat_scrape_results(self, doc): + """Concatenate structured data from all parsers""" + data = [doc.attrs.get(p.OUT_LABEL, None) for p in self.PARSERS] + data = [df for df in data if df is not None and not df.empty] + if len(data) == 0: + return None + + data = data[0] if len(data) == 1 else pd.concat(data) + data["source"] = doc.attrs.get("source") + data["ord_year"] = extract_ord_year_from_doc_attrs(doc.attrs) + return data + + def _get_model_config(self, primary_key, secondary_key): + """Get model config: primary_key -> secondary_key -> default""" + if primary_key in self.model_configs: + return self.model_configs[primary_key] + return self.model_configs.get( + secondary_key, self.model_configs[LLMTasks.DEFAULT] + ) + + def _valid_chunk(chunk): """True if chunk has content""" return chunk and "no relevant text" not in chunk.lower() + + +def _validate_in_out_keys(consumers, producers): + """Validate that all IN_LABELs have matching OUT_LABELs""" + in_keys = {} + out_keys = {} + + for producer_class in producers: + 
out_keys.setdefault(producer_class.OUT_LABEL, []).append( + producer_class + ) + + for consumer_class in chain(consumers): + in_keys.setdefault(consumer_class.IN_LABEL, []).append(consumer_class) + + for in_key, classes in in_keys.items(): + formatted = f"{[cls.__name__ for cls in classes]}" + if in_key not in out_keys: + msg = ( + f"One or more processing classes require IN_LABEL " + f"{in_key!r}, which is not produced by any previous " + f"processing class: {formatted}" + ) + raise COMPASSPluginConfigurationError(msg) diff --git a/docs/source/conf.py b/docs/source/conf.py index c5f4d1e8..454898db 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -349,8 +349,8 @@ def setup(app): # objects from COMPASS "AsyncDecisionTree": ":class:`~compass.common.tree.AsyncDecisionTree`", "BaseDocument": ":class:`elm.web.document.BaseDocument`", - "BaseParser": ":class:`~compass.plugin.interface.BaseParser`", - "BaseTextExtractor": ":class:`~compass.plugin.interface.BaseTextExtractor`", + "BaseParser": ":class:`~compass.plugin.ordinance.BaseParser`", + "BaseTextExtractor": ":class:`~compass.plugin.ordinance.BaseTextExtractor`", "ChatLLMCaller": ":class:`~compass.llm.calling.ChatLLMCaller`", "ExtractionContext": ":class:`~compass.extraction.context.ExtractionContext`", "Jurisdiction": ":class:`~compass.utilities.jurisdictions.Jurisdiction`", diff --git a/tests/python/unit/plugin/test_plugin_interface.py b/tests/python/unit/plugin/test_plugin_interface.py index fd79ff86..1287d9ac 100644 --- a/tests/python/unit/plugin/test_plugin_interface.py +++ b/tests/python/unit/plugin/test_plugin_interface.py @@ -4,7 +4,7 @@ import pytest -from compass.plugin.interface import ExtractionPlugin +from compass.plugin.interface import FilteredExtractionPlugin from compass.exceptions import COMPASSPluginConfigurationError @@ -30,7 +30,7 @@ class PARS2: IN_LABEL = "collected" OUT_LABEL = "parsed_1" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): 
TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -40,6 +40,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", @@ -69,7 +72,7 @@ class PARS2: IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -79,6 +82,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", @@ -108,7 +114,7 @@ class PARS2: IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -118,6 +124,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match=( @@ -151,7 +160,7 @@ class PARS2: IN_LABEL = "collected_2" OUT_LABEL = "parsed_2" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -161,6 +170,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match=( From f187cd1e7846b7c4bcf6051fd781e779124218ee Mon Sep 
17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 20:11:25 -0700 Subject: [PATCH 04/22] Add and use `PromptBasedTextCollector` --- compass/extraction/small_wind/ordinance.py | 250 +++++++-------------- compass/extraction/solar/ordinance.py | 242 +++++++------------- compass/extraction/wind/ordinance.py | 246 +++++++------------- compass/plugin/__init__.py | 2 +- compass/plugin/ordinance.py | 109 ++++++++- compass/validation/content.py | 18 +- 6 files changed, 360 insertions(+), 507 deletions(-) diff --git a/compass/extraction/small_wind/ordinance.py b/compass/extraction/small_wind/ordinance.py index d392c37f..01dfa2ea 100644 --- a/compass/extraction/small_wind/ordinance.py +++ b/compass/extraction/small_wind/ordinance.py @@ -8,7 +8,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, - OrdinanceTextCollector, + PromptBasedTextCollector, PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,59 @@ _IGNORE_TYPES_MICRO = "private, micro, personal, building-mounted" _IGNORE_TYPES_LARGE = "large, utility-scale, for-sale, commercial" +_CONTAINS_ORD_COLLECTION_PROMPT = f"""\ +You extract structured data from text. Return your answer in JSON format \ +(not markdown). Your JSON file must include exactly two keys. The first \ +key is 'wind_reqs', which is a string that summarizes all {_SEARCH_TERMS_AND} \ +that are explicitly enacted in the text for a wind energy system (or wind \ +turbine/tower) for a given jurisdiction. Note that wind energy bans are \ +an important restriction to track. Include any **closely related provisions** \ +if they clearly pertain to the **development, operation, modification, or \ +removal** of wind energy systems (or wind turbines/towers). All restrictions \ +should be enforceable - ignore any text that only provides a legal definition \ +of the regulation. If the text does not specify any concrete \ +{_SEARCH_TERMS_OR} for a wind energy system, set this key to `null`. 
The last \ +key is '{{key}}', which is a boolean that is set to True if the text excerpt \ +explicitly details {_SEARCH_TERMS_OR} for a wind energy system (or wind \ +turbine/tower) and False otherwise.\ +""" + +_IS_SMALL_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether any \ +of it applies to {_SEARCH_TERMS_OR} for **small, medium, or non-commercial \ +wind energy systems**. Small, medium, or non-commercial energy systems may \ +also be referred to as {_SMALL_WES_SYNONYMS}. Your client is a private \ +resident that does not care about ordinances related to {_IGNORE_TYPES_MICRO} \ +or {_IGNORE_TYPES_LARGE} wind energy systems. Ignore any text related to such \ +systems. Return your answer as a dictionary in JSON format (not markdown). \ +Your JSON file must include exactly two keys. The first key is 'summary' \ +which contains a string that lists all of the types of wind energy systems \ +the text applies to (if any). The second key is '{{key}}', which is a boolean \ +that is set to True if any part of the text excerpt details \ +{_SEARCH_TERMS_OR} for the **small, medium, or non-commercial wind energy \ +conversion systems** (or similar) that the client is interested in and False \ +otherwise.\ +""" + +_DISTRICTS_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether the \ +text explicitly contains relevant information to determine the districts (and \ +especially the district names) where small, medium, or non-commercial wind \ +energy systems are a permitted use (i.e. accessory use), as well as the \ +districts where wind energy systems are prohibited entirely. Small wind \ +energy systems (SWES) may also be referred to as {_SMALL_WES_SYNONYMS}. Do \ +not make any inferences; only answer based on information that is explicitly \ +stated in the text. Note that relevant information may sometimes be found in \ +tables. 
Return your answer as a dictionary in JSON format (not markdown). \ +Your JSON file must include exactly two keys. The first key is 'districts' \ +which contains a string that lists all of the district names for which the \ +text explicitly permits **small, medium, or non-commercial wind energy \ +systems** (if any). The last key is '{{key}}', which is a boolean that is set \ +to True if any part of the text excerpt provides information on districts \ +where **small, medium, or non-commercial wind energy systems** (or similar) \ +are a permitted use (i.e. accessory use) in and False otherwise.\ +""" + _WECS_TEXT_EXTRACTION_PROMPT = """\ # CONTEXT # We want to reduce the provided excerpt to only contain information about \ @@ -261,183 +314,44 @@ class SmallWindHeuristic(OrdinanceHeuristic): """Phrases that indicate text is about WECS""" -class SmallWindOrdinanceTextCollector(OrdinanceTextCollector): +class SmallWindOrdinanceTextCollector(PromptBasedTextCollector): """Check text chunks for ordinances and collect them if they do""" OUT_LABEL = "relevant_text" """Identifier for text collected by this class""" - CONTAINS_ORD_PROMPT = ( - "You extract structured data from text. Return your answer in JSON " - "format (not markdown). Your JSON file must include exactly two " - "keys. The first key is 'wind_reqs', which is a string that " - f"summarizes all {_SEARCH_TERMS_AND} that are explicitly enacted " - "in the text for a wind energy system (or wind turbine/tower) for " - "a given jurisdiction. " - "Note that wind energy bans are an important restriction to track. " - "Include any **closely related provisions** if they clearly pertain " - "to the **development, operation, modification, or removal** of wind " - "energy systems (or wind turbines/towers). " - "All restrictions should be enforceable - ignore any text that only " - "provides a legal definition of the regulation. 
If the text does not " - f"specify any concrete {_SEARCH_TERMS_OR} for a wind energy system, " - "set this key to `null`. The last key is '{key}', which is a boolean " - "that is set to True if the text excerpt explicitly details " - f"{_SEARCH_TERMS_OR} for a wind energy system (or wind turbine/tower) " - "and False otherwise. " - ) - """Prompt to check if chunk contains WES ordinance info""" - - IS_SMALL_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - f"whether any of it applies to {_SEARCH_TERMS_OR} for **small, " - "medium, or non-commercial wind energy systems**. Small, medium, or " - "non-commercial energy systems may also be referred to as " - f"{_SMALL_WES_SYNONYMS}. " - "Your client is a private resident that does not care about " - f"ordinances related to {_IGNORE_TYPES_MICRO} or " - f"{_IGNORE_TYPES_LARGE} wind energy systems. Ignore any text " - "related to such systems. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'summary' which contains a string that lists all of the types of " - "wind energy systems the text applies to (if any). The second key is " - "'{key}', which is a boolean that is set to True if any part of the " - f"text excerpt details {_SEARCH_TERMS_OR} for the **small, medium, or " - "non-commercial wind energy conversion systems** (or similar) that " - "the client is interested in and False otherwise." - ) - """Prompt to check if chunk is for small WES""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk at a given ind to see if it contains ordinance - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. - - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains small wind energy conversion system ordinance text. 
- """ - contains_ord_info = await chunk_parser.parse_from_ind( - ind, - key="contains_ord_info", - llm_call_callback=self._check_chunk_contains_ord, - ) - if not contains_ord_info: - logger.debug("Text at ind %d does not contain ordinance info", ind) - return False - - logger.debug("Text at ind %d does contain ordinance info", ind) - - is_small_scale = await chunk_parser.parse_from_ind( - ind, - key="x", - llm_call_callback=self._check_chunk_is_for_small_scale, - ) - if not is_small_scale: - logger.debug("Text at ind %d is not for small WECS", ind) - return False - - logger.debug("Text at ind %d is for small WECS", ind) - - self._store_chunk(chunk_parser, ind) - logger.debug("Added text at ind %d to ordinances", ind) - - return True - - async def _check_chunk_contains_ord(self, key, text_chunk): - """Call LLM on a chunk of text to check for ordinance""" - content = await self.call( - sys_msg=self.CONTAINS_ORD_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - async def _check_chunk_is_for_small_scale(self, key, text_chunk): - """Call LLM on a chunk of text to check for small scale""" - content = await self.call( - sys_msg=self.IS_SMALL_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - -class SmallWindPermittedUseDistrictsTextCollector(OrdinanceTextCollector): + PROMPTS = [ + { + "key": "contains_ord_info", + "label": "contains ordinance info", + "prompt": _CONTAINS_ORD_COLLECTION_PROMPT, + }, + { + # Generic key like "x" makes the llm focus on the + # instruction rather than using the key name to infer the + # content, which can improve performance, + "key": "x", + "label": "for small WECS", + "prompt": _IS_SMALL_COLLECTION_PROMPT, + }, + ] + """Dicts defining the prompts for 
ordinance text collection""" + + +class SmallWindPermittedUseDistrictsTextCollector(PromptBasedTextCollector): """Check text chunks for permitted wind districts; collect them""" OUT_LABEL = "permitted_use_text" """Identifier for text collected by this class""" - DISTRICT_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - "whether the text explicitly contains relevant information to " - "determine the districts (and especially the district names) where " - "small, medium, or non-commercial wind energy systems are a permitted " - "use (i.e. accessory use), as well as the districts where wind energy" - "systems are prohibited entirely. Small wind energy systems " - f"(SWES) may also be referred to as {_SMALL_WES_SYNONYMS}. " - "Do not make any inferences; only answer based on information that " - "is explicitly stated in the text. " - "Note that relevant information may sometimes be found in tables. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'districts' which contains a string that lists all of the district " - "names for which the text explicitly permits **small, medium, or " - "non-commercial wind energy systems** (if any). The last key is " - "'{key}', which is a boolean that is set to True if any part of the " - "text excerpt provides information on districts where **small, " - "medium, or non-commercial wind energy systems** (or similar) are a " - "permitted use (i.e. accessory use) in and False otherwise." - ) - """Prompt to check if chunk contains info on permitted districts""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk to see if it contains permitted uses - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. 
- -         Returns -         ------- -         bool -             Boolean flag indicating whether or not the text in the chunk -             contains small wind energy conversion system permitted use -             text. -         """ - -         key = "contains_district_info" -         content = await self.call( -             sys_msg=self.DISTRICT_PROMPT.format(key=key), -             content=chunk_parser.text_chunks[ind], -             usage_sub_label=( -                 LLMUsageCategory.DOCUMENT_PERMITTED_USE_CONTENT_VALIDATION -             ), -         ) -         logger.debug("LLM response: %s", content) -         contains_district_info = content.get(key, False) - -         if contains_district_info: -             self._store_chunk(chunk_parser, ind) -             logger.debug("Text at ind %d contains district info", ind) -             return True - -         logger.debug("Text at ind %d does not contain district info", ind) -         return False +     PROMPTS = [ +         { +             "key": "contains_district_info", +             "label": "contains district info", +             "prompt": _DISTRICTS_COLLECTION_PROMPT, +         }, +     ] +     """Dicts defining the prompts for permitted use text collection""" class SmallWindOrdinanceTextExtractor(PromptBasedTextExtractor): diff --git a/compass/extraction/solar/ordinance.py b/compass/extraction/solar/ordinance.py index 67dbb61d..691adf12 100644 --- a/compass/extraction/solar/ordinance.py +++ b/compass/extraction/solar/ordinance.py @@ -8,7 +8,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, -    OrdinanceTextCollector, +    PromptBasedTextCollector, PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -34,6 +34,57 @@ "CSP, private, residential, roof-mounted, micro, small, or medium sized" ) +_CONTAINS_ORD_COLLECTION_PROMPT = f"""\ +You extract structured data from text. Return your answer in JSON format \ +(not markdown). Your JSON file must include exactly two keys. The first \ +key is 'solar_reqs', which is a string that summarizes all \ +{_SEARCH_TERMS_AND} that are explicitly enacted in the legal text for solar \ +energy systems for a given jurisdiction. Note that solar energy bans are an \ +important restriction to track. 
Include any **closely related provisions** \ +if they clearly pertain to the **development, operation, modification, or \ +removal** of solar energy systems (or solar panels). All restrictions should \ +be enforceable - ignore any text that only provides a legal definition of \ +the regulation. If the text does not specify any concrete {_SEARCH_TERMS_OR} \ +for a solar energy system, set this key to `null`. The last key is \ +'{{key}}', which is a boolean that is set to True if the text excerpt \ +explicitly details {_SEARCH_TERMS_OR} for a solar energy system and False \ +otherwise.\ +""" + +_IS_UTILITY_SCALE_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether it \ +applies to {_SEARCH_TERMS_OR} for **large solar energy systems**. Large \ +solar energy systems (SES) may also be referred to as \ +{_LARGE_SEF_SYNONYMS}. Your client is a commercial solar developer that does \ +not care about ordinances related to {_IGNORE_TYPES} solar energy systems. \ +Ignore any text related to such systems. Return your answer as a dictionary \ +in JSON format (not markdown). Your JSON file must include exactly two keys. \ +The first key is 'summary' which contains a string that summarizes the types \ +of solar energy systems the text applies to (if any). The second key is \ +'{{key}}', which is a boolean that is set to True if any part of the text \ +excerpt details {_SEARCH_TERMS_OR} for the **large solar energy conversion \ +systems** (or similar) that the client is interested in and False otherwise.\ +""" + +_DISTRICTS_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether it \ +explicitly contains relevant information to determine the districts (and \ +especially the district names) where large solar energy farms are a permitted \ +use (primary, special, accessory, or otherwise), as well as the districts \ +where large solar energy farms are prohibited entirely. 
Large solar energy \ +systems (SES) may also be referred to as {_LARGE_SEF_SYNONYMS}. Do not make \ +any inferences; only answer based on information that is explicitly stated in \ +the text. Note that relevant information may sometimes be found in tables. \ +Return your answer as a dictionary in JSON format (not markdown). Your JSON \ +file must include exactly two keys. The first key is 'districts' which \ +contains a string that lists all of the district names for which the text \ +explicitly permits **large solar energy farms** (if any). The last key is \ +'{{key}}', which is a boolean that is set to True if any part of the text \ +excerpt provides information on districts where **large solar energy farms** \ +(or similar) are a permitted use and False otherwise.\ +""" + + _SEF_TEXT_EXTRACTION_PROMPT = f"""\ # CONTEXT # We want to reduce the provided excerpt to only contain information about \ @@ -179,177 +230,44 @@ class SolarHeuristic(OrdinanceHeuristic): """Phrases that indicate text is about solar farms""" -class SolarOrdinanceTextCollector(OrdinanceTextCollector): +class SolarOrdinanceTextCollector(PromptBasedTextCollector): """Check text chunks for ordinances and collect them if they do""" OUT_LABEL = "relevant_text" """Identifier for text collected by this class""" - CONTAINS_ORD_PROMPT = ( - "You extract structured data from text. Return your answer in JSON " - "format (not markdown). Your JSON file must include exactly two " - "keys. The first key is 'solar_reqs', which is a string that " - f"summarizes all {_SEARCH_TERMS_AND} that are explicitly enacted " - "in the legal text for solar energy systems for a given jurisdiction. " - "Note that solar energy bans are an important restriction to track. " - "Include any **closely related provisions** if they clearly pertain " - "to the **development, operation, modification, or removal** of solar " - "energy systems (or solar panels). 
" - "All restrictions should be enforceable - ignore any text that only " - "provides a legal definition of the regulation. If the text does not " - f"specify any concrete {_SEARCH_TERMS_OR} for a solar energy system, " - "set this key to `null`. The last key is '{key}', which is a boolean " - "that is set to True if the text excerpt explicitly details " - f"{_SEARCH_TERMS_OR} for a solar energy system and False otherwise." - ) - """Prompt to check if chunk contains SEF ordinance info""" - - IS_UTILITY_SCALE_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - f"whether it applies to {_SEARCH_TERMS_OR} for **large " - "solar energy systems**. Large solar energy systems (SES) may " - f"also be referred to as {_LARGE_SEF_SYNONYMS}. " - "Your client is a commercial solar developer that does not " - f"care about ordinances related to {_IGNORE_TYPES} solar energy " - "systems. Ignore any text related to such systems. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'summary' which contains a string that summarizes the types of " - "solar energy systems the text applies to (if any). The second key " - "is '{key}', which is a boolean that is set to True if any part of " - f"the text excerpt details {_SEARCH_TERMS_OR} for the **large solar " - "energy conversion systems** (or similar) that the client is " - "interested in and False otherwise." - ) - """Prompt to check if chunk is for utility-scale SEF""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk at a given ind to see if it contains ordinance - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. - - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large solar energy farm ordinance text. 
- """ - contains_ord_info = await chunk_parser.parse_from_ind( - ind, - key="contains_ord_info", - llm_call_callback=self._check_chunk_contains_ord, - ) - if not contains_ord_info: - logger.debug("Text at ind %d does not contain ordinance info", ind) - return False - - logger.debug("Text at ind %d does contain ordinance info", ind) - - is_utility_scale = await chunk_parser.parse_from_ind( - ind, - key="x", - llm_call_callback=self._check_chunk_is_for_utility_scale, - ) - if not is_utility_scale: - logger.debug("Text at ind %d is not for utility-scale SEF", ind) - return False - - logger.debug("Text at ind %d is for utility-scale SEF", ind) - - self._store_chunk(chunk_parser, ind) - logger.debug("Added text at ind %d to ordinances", ind) - - return True - - async def _check_chunk_contains_ord(self, key, text_chunk): - """Call LLM on a chunk of text to check for ordinance""" - content = await self.call( - sys_msg=self.CONTAINS_ORD_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - async def _check_chunk_is_for_utility_scale(self, key, text_chunk): - """Call LLM on a chunk of text to check for utility scale""" - content = await self.call( - sys_msg=self.IS_UTILITY_SCALE_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - -class SolarPermittedUseDistrictsTextCollector(OrdinanceTextCollector): + PROMPTS = [ + { + "key": "contains_ord_info", + "label": "contains ordinance info", + "prompt": _CONTAINS_ORD_COLLECTION_PROMPT, + }, + { + # Generic key like "x" makes the llm focus on the + # instruction rather than using the key name to infer the + # content, which can improve performance, + "key": "x", + "label": "for utility-scale SEF", + "prompt": _IS_UTILITY_SCALE_COLLECTION_PROMPT, + 
}, + ] + """Dicts defining the prompts for ordinance text collection""" + + +class SolarPermittedUseDistrictsTextCollector(PromptBasedTextCollector): """Check text chunks for permitted solar districts; collect them""" OUT_LABEL = "permitted_use_text" """Identifier for text collected by this class""" - DISTRICT_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - "whether it explicitly contains relevant information to determine the " - "districts (and especially the district names) where large solar " - "energy farms are a permitted use (primary, special, accessory, or " - "otherwise), as well as the districts where large solar energy farms " - "are prohibited entirely. Large solar energy systems (SES) may also " - f"be referred to as {_LARGE_SEF_SYNONYMS}. " - "Do not make any inferences; only answer based on information that " - "is explicitly stated in the text. " - "Note that relevant information may sometimes be found in tables. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'districts' which contains a string that lists all of the district " - "names for which the text explicitly permits **large solar energy " - "farms** (if any). The last key is '{key}', which is a boolean that " - "is set to True if any part of the text excerpt provides information " - "on districts where **large solar energy farms** (or similar) are a " - "permitted use and False otherwise." - ) - """Prompt to check if chunk contains info on permitted districts""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk to see if it contains permitted uses - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. 
- - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large solar energy farm permitted use text. - """ - - key = "contains_district_info" - content = await self.call( - sys_msg=self.DISTRICT_PROMPT.format(key=key), - content=chunk_parser.text_chunks[ind], - usage_sub_label=( - LLMUsageCategory.DOCUMENT_PERMITTED_USE_CONTENT_VALIDATION - ), - ) - logger.debug("LLM response: %s", content) - contains_district_info = content.get(key, False) - - if contains_district_info: - self._store_chunk(chunk_parser, ind) - logger.debug("Text at ind %d contains district info", ind) - return True - - logger.debug("Text at ind %d does not contain district info", ind) - return False + PROMPTS = [ + { + "key": "contains_district_info", + "label": "contains district info", + "prompt": _DISTRICTS_COLLECTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text collection""" class SolarOrdinanceTextExtractor(PromptBasedTextExtractor): diff --git a/compass/extraction/wind/ordinance.py b/compass/extraction/wind/ordinance.py index aa547184..907ee125 100644 --- a/compass/extraction/wind/ordinance.py +++ b/compass/extraction/wind/ordinance.py @@ -8,7 +8,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, - OrdinanceTextCollector, + PromptBasedTextCollector, PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,58 @@ _SEARCH_TERMS_OR = _SEARCH_TERMS_AND.replace("and", "or") _IGNORE_TYPES = "private, residential, micro, small, or medium sized" +_CONTAINS_ORD_COLLECTION_PROMPT = f"""\ +You extract structured data from text. Return your answer in JSON format \ +(not markdown). Your JSON file must include exactly two keys. The first \ +key is 'wind_reqs', which is a string that summarizes all {_SEARCH_TERMS_AND} \ +that are explicitly enacted in the text for a wind energy system (or wind \ +turbine/tower) for a given jurisdiction. 
Note that wind energy bans are \ +an important restriction to track. Include any **closely related provisions** \ +if they clearly pertain to the **development, operation, modification, or \ +removal** of wind energy systems (or wind turbines/towers). All restrictions \ +should be enforceable - ignore any text that only provides a legal definition \ +of the regulation. If the text does not specify any concrete \ +{_SEARCH_TERMS_OR} for a wind energy system, set this key to `null`. The last \ +key is '{{key}}', which is a boolean that is set to True if the text excerpt \ +explicitly details {_SEARCH_TERMS_OR} for a wind energy system (or wind \ +turbine/tower) and False otherwise.\ +""" + +_IS_UTILITY_SCALE_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether \ +any of it applies to {_SEARCH_TERMS_OR} for **large wind energy systems**. \ +Large wind energy systems (WES) may also be referred to as \ +{_LARGE_WES_SYNONYMS}. Your client is a commercial wind developer that \ +does not care about ordinances related to {_IGNORE_TYPES} wind energy \ +systems. Ignore any text related to such systems. Return your answer as a \ +dictionary in JSON format (not markdown). Your JSON file must include \ +exactly two keys. The first key is 'summary' which contains a string that \ +lists all of the types of wind energy systems the text applies to (if any). 
\ +The second key is '{{key}}', which is a boolean that is set to True if any \ +part of the text excerpt details {_SEARCH_TERMS_OR} for the **large wind \ +energy conversion systems** (or similar) that the client is interested in \ +and False otherwise.\ +""" + +_DISTRICTS_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether \ +the text explicitly contains relevant information to determine the districts \ +(and especially the district names) where large wind energy systems are a \ +permitted use (primary, special, accessory, or otherwise), as well as the \ +districts where large wind energy systems are prohibited entirely. Large \ +wind energy systems (WES) may also be referred to as {_LARGE_WES_SYNONYMS}. \ +Do not make any inferences; only answer based on information that is \ +explicitly stated in the text. Note that relevant information may sometimes \ +be found in tables. Return your answer as a dictionary in JSON format (not \ +markdown). Your JSON file must include exactly two keys. The first key is \ +'districts' which contains a string that lists all of the district names for \ +which the text explicitly permits **large wind energy systems** (if any). 
\ +The last key is '{{key}}', which is a boolean that is set to True if any \ +part of the text excerpt provides information on districts where **large \ +wind energy systems** (or similar) are a permitted use in and False \ +otherwise.\ +""" + _WECS_TEXT_EXTRACTION_PROMPT = """\ # CONTEXT # We want to reduce the provided excerpt to only contain information about \ @@ -236,180 +288,44 @@ class WindHeuristic(OrdinanceHeuristic): """Phrases that indicate text is about WECS""" -class WindOrdinanceTextCollector(OrdinanceTextCollector): +class WindOrdinanceTextCollector(PromptBasedTextCollector): """Check text chunks for ordinances and collect them if they do""" OUT_LABEL = "relevant_text" """Identifier for text collected by this class""" - CONTAINS_ORD_PROMPT = ( - "You extract structured data from text. Return your answer in JSON " - "format (not markdown). Your JSON file must include exactly two " - "keys. The first key is 'wind_reqs', which is a string that " - f"summarizes all {_SEARCH_TERMS_AND} that are explicitly enacted " - "in the text for a wind energy system (or wind turbine/tower) for " - "a given jurisdiction. " - "Note that wind energy bans are an important restriction to track. " - "Include any **closely related provisions** if they clearly pertain " - "to the **development, operation, modification, or removal** of wind " - "energy systems (or wind turbines/towers). " - "All restrictions should be enforceable - ignore any text that only " - "provides a legal definition of the regulation. If the text does not " - f"specify any concrete {_SEARCH_TERMS_OR} for a wind energy system, " - "set this key to `null`. The last key is '{key}', which is a boolean " - "that is set to True if the text excerpt explicitly details " - f"{_SEARCH_TERMS_OR} for a wind energy system (or wind turbine/tower) " - "and False otherwise. 
" - ) - """Prompt to check if chunk contains WES ordinance info""" - - IS_UTILITY_SCALE_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - f"whether any of it applies to {_SEARCH_TERMS_OR} for " - "**large wind energy systems**. Large wind energy systems (WES) may " - f"also be referred to as {_LARGE_WES_SYNONYMS}. " - "Your client is a commercial wind developer that does not " - f"care about ordinances related to {_IGNORE_TYPES} wind energy " - "systems. Ignore any text related to such systems. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'summary' which contains a string that lists all of the types of " - "wind energy systems the text applies to (if any). The second key is " - "'{key}', which is a boolean that is set to True if any part of the " - f"text excerpt details {_SEARCH_TERMS_OR} for the **large wind energy " - "conversion systems** (or similar) that the client is interested in " - "and False otherwise." - ) - """Prompt to check if chunk is for utility-scale WES""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk at a given ind to see if it contains ordinance - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. - - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large wind energy conversion system ordinance text. 
- """ - contains_ord_info = await chunk_parser.parse_from_ind( - ind, - key="contains_ord_info", - llm_call_callback=self._check_chunk_contains_ord, - ) - if not contains_ord_info: - logger.debug("Text at ind %d does not contain ordinance info", ind) - return False - - logger.debug("Text at ind %d does contain ordinance info", ind) - - is_utility_scale = await chunk_parser.parse_from_ind( - ind, - key="x", - llm_call_callback=self._check_chunk_is_for_utility_scale, - ) - if not is_utility_scale: - logger.debug("Text at ind %d is not for utility-scale WECS", ind) - return False - - logger.debug("Text at ind %d is for utility-scale WECS", ind) - - self._store_chunk(chunk_parser, ind) - logger.debug("Added text at ind %d to ordinances", ind) - - return True - - async def _check_chunk_contains_ord(self, key, text_chunk): - """Call LLM on a chunk of text to check for ordinance""" - content = await self.call( - sys_msg=self.CONTAINS_ORD_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - async def _check_chunk_is_for_utility_scale(self, key, text_chunk): - """Call LLM on a chunk of text to check for utility scale""" - content = await self.call( - sys_msg=self.IS_UTILITY_SCALE_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - -class WindPermittedUseDistrictsTextCollector(OrdinanceTextCollector): + PROMPTS = [ + { + "key": "contains_ord_info", + "label": "contains ordinance info", + "prompt": _CONTAINS_ORD_COLLECTION_PROMPT, + }, + { + # Generic key like "x" makes the llm focus on the + # instruction rather than using the key name to infer the + # content, which can improve performance, + "key": "x", + "label": "for utility-scale WECS", + "prompt": _IS_UTILITY_SCALE_COLLECTION_PROMPT, + 
}, + ] + """Dicts defining the prompts for ordinance text collection""" + + +class WindPermittedUseDistrictsTextCollector(PromptBasedTextCollector): """Check text chunks for permitted wind districts; collect them""" OUT_LABEL = "permitted_use_text" """Identifier for text collected by this class""" - DISTRICT_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - "whether the text explicitly contains relevant information to " - "determine the districts (and especially the district names) where " - "large wind energy systems are a permitted use (primary, special, " - "accessory, or otherwise), as well as the districts where large wind " - "energy systems are prohibited entirely. Large wind energy systems " - f"(WES) may also be referred to as {_LARGE_WES_SYNONYMS}. " - "Do not make any inferences; only answer based on information that " - "is explicitly stated in the text. " - "Note that relevant information may sometimes be found in tables. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'districts' which contains a string that lists all of the district " - "names for which the text explicitly permits **large wind energy " - "systems** (if any). The last key is '{key}', which is a boolean that " - "is set to True if any part of the text excerpt provides information " - "on districts where **large wind energy systems** (or similar) are a " - "permitted use in and False otherwise." - ) - """Prompt to check if chunk contains info on permitted districts""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk to see if it contains permitted uses - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. 
- - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large wind energy conversion system permitted use - text. - """ - - key = "contains_district_info" - content = await self.call( - sys_msg=self.DISTRICT_PROMPT.format(key=key), - content=chunk_parser.text_chunks[ind], - usage_sub_label=( - LLMUsageCategory.DOCUMENT_PERMITTED_USE_CONTENT_VALIDATION - ), - ) - logger.debug("LLM response: %s", content) - contains_district_info = content.get(key, False) - - if contains_district_info: - self._store_chunk(chunk_parser, ind) - logger.debug("Text at ind %d contains district info", ind) - return True - - logger.debug("Text at ind %d does not contain district info", ind) - return False + PROMPTS = [ + { + "key": "contains_district_info", + "label": "contains district info", + "prompt": _DISTRICTS_COLLECTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text collection""" class WindOrdinanceTextExtractor(PromptBasedTextExtractor): diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 369a3c80..8dc16746 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -10,7 +10,7 @@ BaseTextExtractor, BaseParser, OrdinanceHeuristic, - OrdinanceTextCollector, + PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceParser, OrdinanceExtractionPlugin, diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index de75087a..4fae6e79 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -259,8 +259,58 @@ def GOOD_TECH_PHRASES(self): # noqa: N802 raise NotImplementedError -class OrdinanceTextCollector(StructuredLLMCaller, BaseTextCollector): - """Base class for ordinance text collectors""" +class PromptBasedTextCollector(StructuredLLMCaller, BaseTextCollector, ABC): + """Text extractor based on a chain of prompts""" + + @property + @abstractmethod + def PROMPTS(self): # noqa: N802 + """list: List of dicts defining the prompts for text 
extraction + + Each dict in the list should have the following keys: + + - **prompt**: [REQUIRED] The text filter prompt to use + to determine if a chunk of text is relevant for the + current extraction task. The prompt must instruct the LLM + to return a dictionary (as JSON) with at least one key + that outputs the filter decision. The prompt may use the + following placeholders, which will be filled in with the + corresponding class attributes when the prompt is applied: + + - ``"{key}"``: The key corresponding to this prompt. + + - **key**: [REQUIRED] A string identifier for the key that + in the output JSON dictionary that represents the LLM + filter decision (``True`` if the tech chunk should be + kept, and ``False`` otherwise). + - **label**: [OPTIONAL] A string label describing the type + of relevant text this prompt is looking for (e.g. "wind + energy conversion system ordinance text"). This is only + used for logging purposes and does not affect the + extraction process itself. If not provided, this will + default to "collector step {i}". + + The prompts will be applied in the order they appear in the + list, with the output text from each prompt being fed as input + to the next prompt in the chain. If any of the filter decisions + return ``False``, the text will be discarded and not passed to + subsequent prompts. The final output of the last prompt will + determine wether or not the chunk of text being evaluated is + kept as relevant text for extraction. 
+ """ + raise NotImplementedError + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + if getattr(cls, "__abstractmethods__", None): + return + + if not cls.PROMPTS: # TODO: This should happen at registration + msg = ( + f"{cls.__name__} must have at least one " + "prompt defined in the PROMPTS property" + ) + raise COMPASSPluginConfigurationError(msg) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -285,6 +335,59 @@ def relevant_text(self): text = [self._chunks[ind] for ind in sorted(self._chunks)] return merge_overlapping_texts(text) + async def check_chunk(self, chunk_parser, ind): + """Check a chunk at a given ind to see if it contains ordinance + + Parameters + ---------- + chunk_parser : ParseChunksWithMemory + Instance that contains a ``parse_from_ind`` method. + ind : int + Index of the chunk to check. + + Returns + ------- + bool + Boolean flag indicating whether or not the text in the chunk + contains large wind energy conversion system ordinance text. 
+ """ + for collection_step, prompt_dict in enumerate(self.PROMPTS): + key = prompt_dict["key"] + prompt = prompt_dict["prompt"].format(key=key) + label = prompt_dict.get("label", collection_step) + passed_filter = await chunk_parser.parse_from_ind( + ind, + key=key, + llm_call_callback=self._check_chunk_with_prompt, + prompt=prompt, + ) + + if not passed_filter: + logger.debug( + "Text at ind %d did not pass collection step: %s", + ind, + label, + ) + return False + + logger.debug( + "Text at ind %d passed collection step: %s ", ind, label + ) + + self._store_chunk(chunk_parser, ind) + logger.debug("Added text chunk at ind %d to extraction text", ind) + return True + + async def _check_chunk_with_prompt(self, key, text_chunk, prompt): + """Call LLM on a chunk of text to check for ordinance""" + content = await self.call( + sys_msg=prompt.format(key=key), + content=text_chunk, + usage_sub_label=LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION, + ) + logger.debug("LLM response: %s", content) + return content.get(key, False) + def _store_chunk(self, parser, chunk_ind): """Store chunk and its neighbors if it is not already stored""" for offset in range(1 - parser.num_to_recall, 2): @@ -676,7 +779,7 @@ async def parse_docs_for_structured_data(self, extraction_context): extraction_context.attrs["structured_data"] = data_df logger.info( "%d ordinance value(s) found in doc from %s for %s. 
", - row_count, + num_ordinances_dataframe(data_df), doc_for_extraction.attrs.get("source", "unknown source"), self.jurisdiction.full_name, ) diff --git a/compass/validation/content.py b/compass/validation/content.py index 7725a28d..c829c805 100644 --- a/compass/validation/content.py +++ b/compass/validation/content.py @@ -50,19 +50,19 @@ def __init__(self, text_chunks, num_to_recall=2): self.num_to_recall = num_to_recall self.memory = [{} for _ in text_chunks] - # fmt: off def _inverted_mem(self, starting_ind): """Inverted memory""" - inverted_mem = self.memory[:starting_ind + 1:][::-1] - yield from inverted_mem[:self.num_to_recall] + inverted_mem = self.memory[:starting_ind + 1:][::-1] # fmt: off + yield from inverted_mem[:self.num_to_recall] # fmt: off - # fmt: off def _inverted_text(self, starting_ind): """Inverted text chunks""" - inverted_text = self.text_chunks[:starting_ind + 1:][::-1] - yield from inverted_text[:self.num_to_recall] + inverted_text = self.text_chunks[:starting_ind + 1:][::-1] # fmt: off + yield from inverted_text[:self.num_to_recall] # fmt: off - async def parse_from_ind(self, ind, key, llm_call_callback): + async def parse_from_ind( + self, ind, key, llm_call_callback, *args, **kwargs + ): """Validate a chunk by consulting current and prior context Cached verdicts are reused to avoid redundant LLM calls when @@ -97,7 +97,9 @@ async def parse_from_ind(self, ind, key, llm_call_callback): logger.debug("Mem at ind %d is %s", step, mem) check = mem.get(key) if check is None: - check = mem[key] = await llm_call_callback(key, text) + check = mem[key] = await llm_call_callback( + key, text, *args, **kwargs + ) if check: return check return False From 5fc485734b2e146824921d5536ccd3950eeb544b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 20:12:21 -0700 Subject: [PATCH 05/22] Change class name --- compass/extraction/small_wind/ordinance.py | 4 ++-- compass/extraction/solar/ordinance.py | 4 ++-- compass/extraction/wind/ordinance.py | 4 
++-- compass/plugin/__init__.py | 2 +- compass/plugin/ordinance.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compass/extraction/small_wind/ordinance.py b/compass/extraction/small_wind/ordinance.py index 01dfa2ea..60b857bf 100644 --- a/compass/extraction/small_wind/ordinance.py +++ b/compass/extraction/small_wind/ordinance.py @@ -7,7 +7,7 @@ import logging from compass.plugin.ordinance import ( - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, ) @@ -231,7 +231,7 @@ """ -class SmallWindHeuristic(OrdinanceHeuristic): +class SmallWindHeuristic(KeywordBasedHeuristic): """Perform a heuristic check for mention of wind turbines in text""" NOT_TECH_WORDS = [ diff --git a/compass/extraction/solar/ordinance.py b/compass/extraction/solar/ordinance.py index 691adf12..942869df 100644 --- a/compass/extraction/solar/ordinance.py +++ b/compass/extraction/solar/ordinance.py @@ -7,7 +7,7 @@ import logging from compass.plugin.ordinance import ( - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, ) @@ -189,7 +189,7 @@ """ -class SolarHeuristic(OrdinanceHeuristic): +class SolarHeuristic(KeywordBasedHeuristic): """Perform a heuristic check for mention of solar farms in text""" NOT_TECH_WORDS = [ diff --git a/compass/extraction/wind/ordinance.py b/compass/extraction/wind/ordinance.py index 907ee125..95c9e7fd 100644 --- a/compass/extraction/wind/ordinance.py +++ b/compass/extraction/wind/ordinance.py @@ -7,7 +7,7 @@ import logging from compass.plugin.ordinance import ( - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, ) @@ -230,7 +230,7 @@ """ -class WindHeuristic(OrdinanceHeuristic): +class WindHeuristic(KeywordBasedHeuristic): """Perform a heuristic check for mention of wind turbines in text""" NOT_TECH_WORDS = [ diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 8dc16746..7dedc77e 100644 --- 
a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -9,7 +9,7 @@ from .ordinance import ( BaseTextExtractor, BaseParser, - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceParser, diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 4fae6e79..3b545657 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -130,7 +130,7 @@ async def parse(self, text): raise NotImplementedError -class OrdinanceHeuristic(BaseHeuristic, ABC): +class KeywordBasedHeuristic(BaseHeuristic, ABC): """Perform a heuristic check for mention of a technology in text""" _GOOD_ACRONYM_CONTEXTS = [ From fcb00fb08c90abaf3219e7248076bafbc910ad93 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:08:11 -0700 Subject: [PATCH 06/22] Add validation logic --- compass/plugin/ordinance.py | 203 ++++++++++++++++++++++++------------ 1 file changed, 138 insertions(+), 65 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 3b545657..798dd911 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -300,18 +300,6 @@ def PROMPTS(self): # noqa: N802 """ raise NotImplementedError - def __init_subclass__(cls, **kwargs): - super().__init_subclass__(**kwargs) - if getattr(cls, "__abstractmethods__", None): - return - - if not cls.PROMPTS: # TODO: This should happen at registration - msg = ( - f"{cls.__name__} must have at least one " - "prompt defined in the PROMPTS property" - ) - raise COMPASSPluginConfigurationError(msg) - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._chunks = {} @@ -509,13 +497,6 @@ def __init_subclass__(cls, **kwargs): if getattr(cls, "__abstractmethods__", None): return - if not cls.PROMPTS: # TODO: This should happen at registration - msg = ( - f"{cls.__name__} must have at least one " - "prompt defined in the PROMPTS property" - ) - raise COMPASSPluginConfigurationError(msg) - 
last_prompt = cls.PROMPTS[-1] last_index = len(cls.PROMPTS) - 1 cls.OUT_LABEL = last_prompt.get("key", f"extracted_text_{last_index}") @@ -652,52 +633,6 @@ def consumer_producer_pairs(self): (self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS), ] - def __init__(self, jurisdiction, model_configs, usage_tracker=None): - """ - - Parameters - ---------- - jurisdiction : Jurisdiction - Jurisdiction for which extraction is being performed. - model_configs : dict - Dictionary where keys are LLMTasks and values are LLMConfig - instances to be used for those tasks. - usage_tracker : UsageTracker, optional - Usage tracker instance that can be used to record the LLM - call cost. By default, ``None``. - """ - super().__init__( - jurisdiction=jurisdiction, - model_configs=model_configs, - usage_tracker=usage_tracker, - ) - - # TODO: This should happen during plugin registration - self._validate_in_out_keys() - - def _validate_in_out_keys(self): - """Validate that all IN_LABELs have matching OUT_LABELs""" - out_keys = {} - for producer in self.producers: - out_keys.setdefault(producer.OUT_LABEL, []).append(producer) - - dupes = {k: v for k, v in out_keys.items() if len(v) > 1} - if dupes: - formatted = "\n".join( - [ - f"{key}: {[cls.__name__ for cls in classes]}" - for key, classes in dupes.items() - ] - ) - msg = ( - "Multiple processing classes produce the same OUT_LABEL key:\n" - f"{formatted}" - ) - raise COMPASSPluginConfigurationError(msg) - - for consumers, producers in self.consumer_producer_pairs: - _validate_in_out_keys(consumers, producers) - async def extract_ordinances_from_text( self, doc, parser_class, model_config ): @@ -902,6 +837,144 @@ def _get_model_config(self, primary_key, secondary_key): secondary_key, self.model_configs[LLMTasks.DEFAULT] ) + def validate_plugin_configuration(self): + """[NOT PUBLIC API] Validate plugin is properly configured""" + super().validate_plugin_configuration() + self._validate_text_extractors() + self._validate_parsers() + 
self._validate_in_out_keys() + self._validate_collector_prompts() + self._validate_collector_prompts() + + def _validate_text_extractors(self): + """Validate user provided at least one text extractor class""" + try: + extractors = self.TEXT_EXTRACTORS + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'TEXT_EXTRACTORS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if len(extractors) == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'TEXT_EXTRACTORS' property! Please provide at least " + "one text extractor class." + ) + raise COMPASSPluginConfigurationError(msg) + + for extractor_class in extractors: + if not issubclass(extractor_class, BaseTextExtractor): + msg = ( + f"Plugin class {self.__class__.__name__} has invalid " + "entry in 'TEXT_EXTRACTORS' property: All entries must " + "be subclasses of " + "compass.plugin.ordinance.BaseTextExtractor, but " + f"{extractor_class.__name__} is not!" + ) + raise COMPASSPluginConfigurationError(msg) + + def _validate_parsers(self): + """Validate user provided at least one parser class""" + try: + parsers = self.PARSERS + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'PARSERS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if len(parsers) == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'PARSERS' property! Please provide at least " + "one text extractor class." + ) + raise COMPASSPluginConfigurationError(msg) + + for parsers_class in parsers: + if not issubclass(parsers_class, BaseParser): + msg = ( + f"Plugin class {self.__class__.__name__} has invalid " + "entry in 'PARSERS' property: All entries must " + "be subclasses of " + "compass.plugin.ordinance.BaseParser, but " + f"{parsers_class.__name__} is not!" 
+ ) + raise COMPASSPluginConfigurationError(msg) + + def _validate_in_out_keys(self): + """Validate that all IN_LABELs have matching OUT_LABELs""" + out_keys = {} + for producer in self.producers: + out_keys.setdefault(producer.OUT_LABEL, []).append(producer) + + dupes = {k: v for k, v in out_keys.items() if len(v) > 1} + if dupes: + formatted = "\n".join( + [ + f"{key}: {[cls.__name__ for cls in classes]}" + for key, classes in dupes.items() + ] + ) + msg = ( + "Multiple processing classes produce the same OUT_LABEL key:\n" + f"{formatted}" + ) + raise COMPASSPluginConfigurationError(msg) + + for consumers, producers in self.consumer_producer_pairs: + _validate_in_out_keys(consumers, producers) + + def _validate_collector_prompts(self): + """Validate that all text collectors have prompts defined""" + + for collector in self.TEXT_COLLECTORS: + if not issubclass(collector, PromptBasedTextCollector): + continue + try: + num_prompts = len(collector.PROMPTS) + except NotImplementedError: + msg = ( + f"Text collector {self.__class__.__name__} is missing " + "required property 'PROMPTS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_prompts == 0: + msg = ( + f"Text collector {self.__class__.__name__} has an empty " + "'PROMPTS' property! Please provide at least one prompt " + "dictionary." + ) + raise COMPASSPluginConfigurationError(msg) + + def _validate_collector_prompts(self): + """Validate that all text extractors have prompts defined""" + + for collector in self.TEXT_EXTRACTORS: + if not issubclass(collector, PromptBasedTextExtractor): + continue + try: + num_prompts = len(collector.PROMPTS) + except NotImplementedError: + msg = ( + f"Text extractor {self.__class__.__name__} is missing " + "required property 'PROMPTS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_prompts == 0: + msg = ( + f"Text extractor {self.__class__.__name__} has an empty " + "'PROMPTS' property! 
Please provide at least one prompt " + "dictionary." + ) + raise COMPASSPluginConfigurationError(msg) + def _valid_chunk(chunk): """True if chunk has content""" From bf7f128548bf9233c7770517c25bbfb621fc7301 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:08:43 -0700 Subject: [PATCH 07/22] Allow plugins to register own districts --- compass/utilities/jurisdictions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/compass/utilities/jurisdictions.py b/compass/utilities/jurisdictions.py index 9ce6100b..1a305385 100644 --- a/compass/utilities/jurisdictions.py +++ b/compass/utilities/jurisdictions.py @@ -15,7 +15,6 @@ logger = logging.getLogger(__name__) KNOWN_JURISDICTIONS_REGISTRY = { importlib.resources.files("compass") / "data" / "conus_jurisdictions.csv", - importlib.resources.files("compass") / "data" / "tx_water_districts.csv", } _JUR_COLS = [ "Jurisdiction Type", From b9e30fcdb6711e9feba4f3087aca7128ad7ef87f Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:10:33 -0700 Subject: [PATCH 08/22] Add validation logic --- compass/plugin/base.py | 3 ++ compass/plugin/interface.py | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/compass/plugin/base.py b/compass/plugin/base.py index 71957bc8..0adbd6f2 100644 --- a/compass/plugin/base.py +++ b/compass/plugin/base.py @@ -146,3 +146,6 @@ async def record_usage(self): total_usage = await UsageUpdater.call(self.usage_tracker) total_cost = compute_total_cost_from_usage(total_usage) COMPASS_PB.update_total_cost(total_cost, replace=True) + + def validate_plugin_configuration(self): # noqa: B027 + """[NOT PUBLIC API] Validate plugin is properly configured""" diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index 25e8607b..b532b096 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -328,3 +328,77 @@ async def _write_cleaned_text(self, doc): out_fp = await CleanedFileWriter.call(doc, 
self.jurisdiction.full_name) doc.attrs["cleaned_fps"] = out_fp return doc + + def validate_plugin_configuration(self): + """[NOT PUBLIC API] Validate plugin is properly configured""" + + try: + __ = self.IDENTIFIER + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'IDENTIFIER'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + try: + num_q_templates = len(self.QUESTION_TEMPLATES) + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'QUESTION_TEMPLATES'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_q_templates == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'QUESTION_TEMPLATES' property! Please provide at least " + "one question template." + ) + raise COMPASSPluginConfigurationError(msg) + + try: + num_website_keywords = len(self.WEBSITE_KEYWORDS) + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'WEBSITE_KEYWORDS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_website_keywords == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'WEBSITE_KEYWORDS' property! Please provide at least " + "one website keyword." + ) + raise COMPASSPluginConfigurationError(msg) + + try: + collectors = self.TEXT_COLLECTORS + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'TEXT_COLLECTORS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if len(collectors) == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'TEXT_COLLECTORS' property! Please provide at least " + "one text collector class." 
+ ) + raise COMPASSPluginConfigurationError(msg) + + for collector_class in collectors: + if not issubclass(collector_class, BaseTextCollector): + msg = ( + f"Plugin class {self.__class__.__name__} has invalid " + "entry in 'TEXT_COLLECTORS' property: All entries must " + "be subclasses of " + "compass.plugin.interface.BaseTextCollector, but " + f"{collector_class.__name__} is not!" + ) + raise COMPASSPluginConfigurationError(msg) From d3abfb6f1eb40c15054c3e2f4e475f17b2cb2474 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:10:48 -0700 Subject: [PATCH 09/22] Allow `JURISDICTION_DATA_FP` property --- compass/plugin/base.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/compass/plugin/base.py b/compass/plugin/base.py index 0adbd6f2..0dcf2fba 100644 --- a/compass/plugin/base.py +++ b/compass/plugin/base.py @@ -34,6 +34,38 @@ def __init__(self, jurisdiction, model_configs, usage_tracker=None): self.model_configs = model_configs self.usage_tracker = usage_tracker + JURISDICTION_DATA_FP = None + """path-like: Optional path to jurisdiction CSV + + If provided, this CSV will extend the known jurisdictions (by + default, US states, counties, and townships). This CSV must have the + following columns: + + - State: The state in which the jurisdiction is located + (e.g. "Texas") + - County: The county in which the jurisdiction is located + (e.g. "Travis"). This can be left blank if the jurisdiction is + not associated with a county. + - Subdivision: The name of the subdivision of the county in + which the jurisdiction is located. Use this input for + jurisdictions that do not map to counties/townships (e.g. + water conservation districts, resource management plan areas, + etc.). This can be left blank if the jurisdiction does not + have the notion of a "subdivision". + - Jurisdiction Type: The type of jurisdiction (e.g. "county", + "township", "city", "special district", "RMP", etc.). 
+ - FIPS: The code to be used for the jurisdiction, if applicable + (e.g. "48453" for Travis County, Texas, "22" for the + Culberson County Groundwater Conservation District, etc.). + This can be left blank if the jurisdiction does not have an + applicable code. + - Website: The official website for the jurisdiction, if + applicable (e.g. "https://www.traviscountytx.gov/"). This can + be left blank if the jurisdiction does not have an official + website or if the website is not known. + + """ + @property @abstractmethod def IDENTIFIER(self): # noqa: N802 From df89a772131e886881f817f7e5ccfd0fde6a8abc Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:11:18 -0700 Subject: [PATCH 10/22] Add import --- compass/plugin/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index b532b096..5ee98449 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -9,12 +9,12 @@ from compass.scripts.download import filter_ordinance_docs from compass.services.threaded import CleanedFileWriter from compass.utilities import doc_infos_to_db, save_db +from compass.exceptions import COMPASSPluginConfigurationError logger = logging.getLogger(__name__) # TODO: Allow other to register own clean file outputs -# TODO: Allow other to register their own jurisdictions csv class BaseHeuristic(ABC): From 8e3958dcd78d828676dc60fe4bcaddfbe3692da0 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:11:30 -0700 Subject: [PATCH 11/22] Add MVP of registry --- compass/plugin/registry.py | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 compass/plugin/registry.py diff --git a/compass/plugin/registry.py b/compass/plugin/registry.py new file mode 100644 index 00000000..d7397411 --- /dev/null +++ b/compass/plugin/registry.py @@ -0,0 +1,42 @@ +"""COMPASS plugin registry""" + +from compass.utilities.jurisdictions import 
KNOWN_JURISDICTIONS_REGISTRY +from compass.plugin.base import BaseExtractionPlugin +from compass.exceptions import COMPASSPluginConfigurationError + + +PLUGIN_REGISTRY = {} +"""dict: Registered COMPASS plugins""" + + +def register_plugin(plugin_class): + """Register a plugin class in the plugin registry + + Parameters + ---------- + plugin_class : type + The plugin class to register. Must be a subclass of + :class:`~compass.plugin.base.BaseExtractionPlugin` and must pass + the plugin configuration validation. + + Raises + ------ + COMPASSPluginConfigurationError + If the plugin class is not a subclass of + :class:`~compass.plugin.base.BaseExtractionPlugin` or if it does + not pass the plugin configuration validation. + """ + if not issubclass(plugin_class, BaseExtractionPlugin): + msg = ( + f"Plugin class {plugin_class.__name__} must be a subclass of " + "`compass.plugin.base.BaseExtractionPlugin`!" + ) + raise COMPASSPluginConfigurationError(msg) + + if plugin_class.JURISDICTION_DATA_FP is not None: + KNOWN_JURISDICTIONS_REGISTRY.add(plugin_class.JURISDICTION_DATA_FP) + + plugin_instance = plugin_class(None, None) + plugin_instance.validate_plugin_configuration() + + PLUGIN_REGISTRY[plugin_class.IDENTIFIER.casefold()] = plugin_class From de3a6648e2715a9a50e80fd0881defa0e93e26bf Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:11:59 -0700 Subject: [PATCH 12/22] Populate namespace --- compass/plugin/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 7dedc77e..de078d64 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -15,3 +15,4 @@ OrdinanceParser, OrdinanceExtractionPlugin, ) +from .registry import PLUGIN_REGISTRY, register_plugin From e820a99e4892bfead62b3b0c24826be21ab9bc8b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:12:13 -0700 Subject: [PATCH 13/22] Plugins now register themselves --- compass/extraction/small_wind/plugin.py | 5 
++++- compass/extraction/solar/plugin.py | 5 ++++- compass/extraction/water/plugin.py | 13 ++++++++++++- compass/extraction/wind/plugin.py | 5 ++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/compass/extraction/small_wind/plugin.py b/compass/extraction/small_wind/plugin.py index 02fb97ed..2b1a10e4 100644 --- a/compass/extraction/small_wind/plugin.py +++ b/compass/extraction/small_wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin import OrdinanceExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin, register_plugin from compass.extraction.small_wind.ordinance import ( SmallWindHeuristic, SmallWindOrdinanceTextCollector, @@ -96,3 +96,6 @@ class COMPASSSmallWindExtractor(OrdinanceExtractionPlugin): StructuredSmallWindPermittedUseDistrictsParser, ] """Class for parsing structured ordinance data from text""" + + +register_plugin(COMPASSSmallWindExtractor) diff --git a/compass/extraction/solar/plugin.py b/compass/extraction/solar/plugin.py index 4f6b5163..3e2153c1 100644 --- a/compass/extraction/solar/plugin.py +++ b/compass/extraction/solar/plugin.py @@ -1,6 +1,6 @@ """COMPASS solar extraction plugin""" -from compass.plugin import OrdinanceExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin, register_plugin from compass.extraction.solar.ordinance import ( SolarHeuristic, SolarOrdinanceTextCollector, @@ -97,3 +97,6 @@ class COMPASSSolarExtractor(OrdinanceExtractionPlugin): StructuredSolarPermittedUseDistrictsParser, ] """Class for parsing structured ordinance data from text""" + + +register_plugin(COMPASSSolarExtractor) diff --git a/compass/extraction/water/plugin.py b/compass/extraction/water/plugin.py index eda68709..5ca51ed7 100644 --- a/compass/extraction/water/plugin.py +++ b/compass/extraction/water/plugin.py @@ -1,6 +1,7 @@ """COMPASS water rights extraction plugin""" import logging +import importlib.resources from pathlib import Path import pandas as pd @@ -8,7 +9,7 @@ 
from elm.embed import ChunkAndEmbed from compass.extraction import extract_date -from compass.plugin.base import BaseExtractionPlugin +from compass.plugin import BaseExtractionPlugin, register_plugin from compass.utilities.enums import LLMTasks from compass.utilities.parsing import extract_ord_year_from_doc_attrs from compass.exceptions import COMPASSRuntimeError @@ -78,6 +79,13 @@ class TexasWaterRightsExtractor(BaseExtractionPlugin): heuristic = WaterRightsHeuristic() """BaseHeuristic: Object with a ``check()`` method""" + JURISDICTION_DATA_FP = ( + importlib.resources.files("compass") + / "data" + / "tx_water_districts.csv" + ) + """path-like: Path to Texas GCW names""" + async def filter_docs( self, extraction_context, @@ -290,3 +298,6 @@ def _setup_endpoints(embedding_model_config): EnergyWizard.EMBEDDING_URL = endpoint EnergyWizard.URL = "openai.azure.com" # need to trigger Azure setup + + +register_plugin(TexasWaterRightsExtractor) diff --git a/compass/extraction/wind/plugin.py b/compass/extraction/wind/plugin.py index 905bcc87..c8758213 100644 --- a/compass/extraction/wind/plugin.py +++ b/compass/extraction/wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin import OrdinanceExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin, register_plugin from compass.extraction.wind.ordinance import ( WindHeuristic, WindOrdinanceTextCollector, @@ -95,3 +95,6 @@ class COMPASSWindExtractor(OrdinanceExtractionPlugin): StructuredWindPermittedUseDistrictsParser, ] """Class for parsing structured ordinance data from text""" + + +register_plugin(COMPASSWindExtractor) From 110a4cbce59ab266123c113c2fd492778a8d5fd8 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:12:33 -0700 Subject: [PATCH 14/22] Now run based on plugin registry --- compass/scripts/process.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/compass/scripts/process.py b/compass/scripts/process.py index 
cb8c579a..6a14bc44 100644 --- a/compass/scripts/process.py +++ b/compass/scripts/process.py @@ -11,6 +11,7 @@ from elm.web.utilities import get_redirected_url +from compass.plugin import PLUGIN_REGISTRY from compass.extraction.context import ExtractionContext from compass.scripts.download import ( find_jurisdiction_website, @@ -21,10 +22,6 @@ download_jurisdiction_ordinances_from_website_compass_crawl, ) from compass.exceptions import COMPASSValueError, COMPASSError -from compass.extraction.wind import COMPASSWindExtractor -from compass.extraction.solar import COMPASSSolarExtractor -from compass.extraction.small_wind import COMPASSSmallWindExtractor -from compass.extraction.water.plugin import TexasWaterRightsExtractor from compass.validation.location import JurisdictionWebsiteValidator from compass.llm import OpenAIConfig from compass.services.cpu import ( @@ -72,12 +69,6 @@ logger = logging.getLogger(__name__) -EXTRACTION_REGISTRY = { - COMPASSWindExtractor.IDENTIFIER.casefold(): COMPASSWindExtractor, - COMPASSSolarExtractor.IDENTIFIER.casefold(): COMPASSSolarExtractor, - COMPASSSmallWindExtractor.IDENTIFIER.casefold(): COMPASSSmallWindExtractor, - TexasWaterRightsExtractor.IDENTIFIER.casefold(): TexasWaterRightsExtractor, -} MAX_CONCURRENT_SEARCH_ENGINE_QUERIES = 10 @@ -136,8 +127,10 @@ async def process_jurisdictions_with_openai( # noqa: PLR0917, PLR0913 CSV file, all downloaded ordinance documents (PDFs and HTML), usage metadata, and default subdirectories for logs and intermediate outputs (unless otherwise specified). - tech : {"wind", "solar", "small wind", "tx water rights"} - Label indicating which technology type is being processed. + tech : str + Label indicating which technology type is being processed. Must + be one of the keys of + :obj:`~compass.plugin.registry.PLUGIN_REGISTRY`. jurisdiction_fp : path-like Path to a CSV file specifying the jurisdictions to process. 
The CSV must contain at least two columns: "County" and "State", @@ -564,10 +557,10 @@ def tpe_kwargs(self): @cached_property def extractor_class(self): """obj: Extractor class for the specified technology""" - if self.tech.casefold() not in EXTRACTION_REGISTRY: + if self.tech.casefold() not in PLUGIN_REGISTRY: msg = f"Unknown tech input: {self.tech}" raise COMPASSValueError(msg) - return EXTRACTION_REGISTRY[self.tech.casefold()] + return PLUGIN_REGISTRY[self.tech.casefold()] @cached_property def _base_services(self): From 1ae2934718e20787364e8a8d24a692a830a76fbb Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:05 -0700 Subject: [PATCH 15/22] Cleaned file names now specified via a registry --- compass/services/threaded.py | 42 +++++------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/compass/services/threaded.py b/compass/services/threaded.py index c32e53ba..bc5f146f 100644 --- a/compass/services/threaded.py +++ b/compass/services/threaded.py @@ -16,13 +16,13 @@ from elm.web.document import PDFDocument, HTMLDocument from elm.web.utilities import write_url_doc_to_file -from compass import COMPASS_DEBUG_LEVEL from compass.services.base import Service from compass.utilities import compute_cost_from_totals from compass.pb import COMPASS_PB logger = logging.getLogger(__name__) +CLEANED_FP_REGISTRY = {} def _cache_file_with_hash(doc, file_content, out_dir, make_name_unique=False): @@ -69,59 +69,27 @@ def _move_file(doc, out_dir, out_fn=None): return out_fp -def _write_cleaned_file(doc, out_dir, jurisdiction_name=None): +def _write_cleaned_file(doc, out_dir, tech, jurisdiction_name=None): """Write cleaned ordinance text to directory""" if jurisdiction_name is None: return None out_dir = Path(out_dir) - if COMPASS_DEBUG_LEVEL > 0: - _write_interim_cleaned_files(doc, out_dir, jurisdiction_name) + doc_key_to_clean_fp = CLEANED_FP_REGISTRY.get(tech.casefold(), {}) - key_to_fp = { - "cleaned_text_for_extraction": ( - 
f"{jurisdiction_name} Cleaned Text.txt" - ), - "districts_text": f"{jurisdiction_name} Districts.txt", - } out_paths = [] - for key, fn in key_to_fp.items(): + for key, fn in doc_key_to_clean_fp.items(): cleaned_text = doc.attrs.get(key) if cleaned_text is None: continue - out_fp = out_dir / fn + out_fp = out_dir / fn.format(jurisdiction=jurisdiction_name) out_fp.write_text(cleaned_text, encoding="utf-8") out_paths.append(out_fp) return out_paths -def _write_interim_cleaned_files(doc, out_dir, jurisdiction_name): - """Write intermediate output texts to file; helpful for debugging""" - key_to_fp = { - "relevant_text": f"{jurisdiction_name} Ordinance Original text.txt", - "wind_energy_systems_text": ( - f"{jurisdiction_name} Wind Ordinance text.txt" - ), - "solar_energy_systems_text": ( - f"{jurisdiction_name} Solar Ordinance text.txt" - ), - "permitted_use_text": ( - f"{jurisdiction_name} Permitted Use Original text.txt" - ), - "permitted_use_only_text": ( - f"{jurisdiction_name} Permitted Use Only text.txt" - ), - } - for key, fn in key_to_fp.items(): - text = doc.attrs.get(key) - if text is None: - continue - - (out_dir / fn).write_text(text, encoding="utf-8") - - def _write_ord_db(extraction_context, out_dir, out_fn=None): """Write parsed ordinance database to directory""" ord_db = extraction_context.attrs.get("structured_data") From bee7d23dad551913a111203d8bf1cf806094e190 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:14 -0700 Subject: [PATCH 16/22] Import to register plugins --- compass/__init__.py | 9 +++++++++ compass/extraction/__init__.py | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/compass/__init__.py b/compass/__init__.py index 28cb2282..c048d1a7 100644 --- a/compass/__init__.py +++ b/compass/__init__.py @@ -3,4 +3,13 @@ from ._version import __version__ from .utilities.logs import setup_logging_levels, COMPASS_DEBUG_LEVEL +# Temporarily import to register plugins +# Can drop once plugins register themselves +from 
.extraction import ( + COMPASSWindExtractor, + COMPASSSolarExtractor, + COMPASSSmallWindExtractor, + TexasWaterRightsExtractor, +) + setup_logging_levels() diff --git a/compass/extraction/__init__.py b/compass/extraction/__init__.py index 373bf198..2ee97718 100644 --- a/compass/extraction/__init__.py +++ b/compass/extraction/__init__.py @@ -7,3 +7,10 @@ extract_relevant_text_with_ngram_validation, extract_ordinance_values, ) + +# Temporarily import to register plugins +# Can drop once plugins register themselves +from .wind import COMPASSWindExtractor +from .solar import COMPASSSolarExtractor +from .small_wind import COMPASSSmallWindExtractor +from .water import TexasWaterRightsExtractor From 42980ff43a2b9530771635b13aee3f5a18153018 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:26 -0700 Subject: [PATCH 17/22] Pass down tech name to cleaned file writing --- compass/plugin/interface.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index 5ee98449..5f116fe3 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -14,8 +14,6 @@ logger = logging.getLogger(__name__) -# TODO: Allow other to register own clean file outputs - class BaseHeuristic(ABC): """Base class for a heuristic check""" @@ -325,7 +323,9 @@ async def filter_docs( async def _write_cleaned_text(self, doc): """Write cleaned text to `clean_files` dir""" - out_fp = await CleanedFileWriter.call(doc, self.jurisdiction.full_name) + out_fp = await CleanedFileWriter.call( + doc, self.IDENTIFIER, self.jurisdiction.full_name + ) doc.attrs["cleaned_fps"] = out_fp return doc From e436bd96471909a04c5521b0f70243899533a391 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:59 -0700 Subject: [PATCH 18/22] Register cleaned file names --- compass/plugin/ordinance.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/compass/plugin/ordinance.py 
b/compass/plugin/ordinance.py index 798dd911..d0715464 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -22,6 +22,7 @@ BaseTextCollector, FilteredExtractionPlugin, ) +from compass.services.threaded import CLEANED_FP_REGISTRY from compass.extraction import extract_ordinance_values from compass.utilities.enums import LLMTasks, LLMUsageCategory from compass.utilities.ngrams import convert_text_to_sentence_ngrams @@ -529,7 +530,6 @@ def parsers(self): FORMATTING_PROMPT=self.FORMATTING_PROMPT, OUTPUT_PROMPT=self.OUTPUT_PROMPT, ) - # out_fn = prompt_dict.get("out_fn", None) yield key, partial(self._process, instructions=instructions) async def _process(self, text_chunks, instructions, is_valid_chunk=None): @@ -845,6 +845,7 @@ def validate_plugin_configuration(self): self._validate_in_out_keys() self._validate_collector_prompts() self._validate_collector_prompts() + self._register_clean_file_names() def _validate_text_extractors(self): """Validate user provided at least one text extractor class""" @@ -975,6 +976,20 @@ def _validate_collector_prompts(self): ) raise COMPASSPluginConfigurationError(msg) + def _register_clean_file_names(self): + """Register file names for writing cleaned text outputs""" + CLEANED_FP_REGISTRY.setdefault(self.IDENTIFIER.casefold(), {}) + for extractor_class in self.TEXT_EXTRACTORS: + if not issubclass(extractor_class, PromptBasedTextExtractor): + continue + for ind, prompt_dict in enumerate(extractor_class.PROMPTS): + out_fn = prompt_dict.get("out_fn", None) + if not out_fn: + continue + + key = prompt_dict.get("key", f"extracted_text_{ind}") + CLEANED_FP_REGISTRY[self.IDENTIFIER.casefold()][key] = out_fn + def _valid_chunk(chunk): """True if chunk has content""" From 86fc25959c8e877ed2378761398ca8fea8a0284e Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 22:03:01 -0700 Subject: [PATCH 19/22] Fix tests --- .../unit/plugin/test_plugin_interface.py | 79 ++++++++++--------- 
.../unit/services/test_services_threaded.py | 32 ++++++-- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/tests/python/unit/plugin/test_plugin_interface.py b/tests/python/unit/plugin/test_plugin_interface.py index 1287d9ac..e4d79cd4 100644 --- a/tests/python/unit/plugin/test_plugin_interface.py +++ b/tests/python/unit/plugin/test_plugin_interface.py @@ -4,40 +4,45 @@ import pytest -from compass.plugin.interface import FilteredExtractionPlugin +from compass.plugin.ordinance import ( + BaseTextCollector, + BaseTextExtractor, + BaseParser, + OrdinanceExtractionPlugin, +) from compass.exceptions import COMPASSPluginConfigurationError def test_plugin_validation_parse_key_same(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted_2" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected" OUT_LABEL = "parsed_1" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -47,39 +52,39 @@ async def parse_docs_for_structured_data(self, extraction_context): COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() def test_plugin_validation_extract_key_same(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): 
OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -89,39 +94,39 @@ async def parse_docs_for_structured_data(self, extraction_context): COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() def test_plugin_validation_no_in_key_for_extract(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected_2" OUT_LABEL = "extracted_1" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -135,39 
+140,39 @@ async def parse_docs_for_structured_data(self, extraction_context): r"\['EXT2'\]" ), ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() def test_plugin_validation_no_in_key_for_parse(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted_1" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected_2" OUT_LABEL = "parsed_2" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -181,7 +186,7 @@ async def parse_docs_for_structured_data(self, extraction_context): r"\['PARS2'\]" ), ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() if __name__ == "__main__": diff --git a/tests/python/unit/services/test_services_threaded.py b/tests/python/unit/services/test_services_threaded.py index 566bbecd..d4522c06 100644 --- a/tests/python/unit/services/test_services_threaded.py +++ b/tests/python/unit/services/test_services_threaded.py @@ -16,6 +16,7 @@ from compass.services import threaded from compass.services.provider import RunningAsyncServices from compass.services.threaded import ( + CLEANED_FP_REGISTRY, CleanedFileWriter, FileMover, HTMLFileLoader, @@ -194,13 +195,12 @@ def test_move_file_handles_extensionless_cached_file(tmp_path): assert moved_fp.read_text(encoding="utf-8") == "content" -def 
test_write_cleaned_file_with_debug(tmp_path, monkeypatch): +def test_write_cleaned_file_with_debug(tmp_path): """Cleaned file writer should emit cleaned and debug outputs""" doc = HTMLDocument(["payload"]) doc.attrs.update( { - "jurisdiction_name": "Sample Jurisdiction", "cleaned_text_for_extraction": "clean", "districts_text": "districts", "relevant_text": "orig", @@ -209,18 +209,36 @@ def test_write_cleaned_file_with_debug(tmp_path, monkeypatch): } ) - monkeypatch.setattr(threaded, "COMPASS_DEBUG_LEVEL", 1, raising=False) + fp_names = { + "relevant_text": "{jurisdiction} Ordinance Original text.txt", + "cleaned_text_for_extraction": "{jurisdiction} Cleaned Text.txt", + "districts_text": "{jurisdiction} Districts.txt", + } + + CLEANED_FP_REGISTRY["cleaned_file_test"] = fp_names outputs = threaded._write_cleaned_file( - doc, tmp_path, jurisdiction_name="Sample Jurisdiction" + doc, + tmp_path, + tech="cleaned_file_test", + jurisdiction_name="Sample Jurisdiction", ) expected_files = { "Sample Jurisdiction Cleaned Text.txt", "Sample Jurisdiction Districts.txt", + "Sample Jurisdiction Ordinance Original text.txt", } assert {fp.name for fp in outputs} == expected_files assert all(fp.exists() for fp in outputs) + debug_fp = tmp_path / "Sample Jurisdiction Cleaned Text.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "clean" + + debug_fp = tmp_path / "Sample Jurisdiction Districts.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "districts" + debug_fp = tmp_path / "Sample Jurisdiction Ordinance Original text.txt" assert debug_fp.exists() assert debug_fp.read_text(encoding="utf-8") == "orig" @@ -231,7 +249,7 @@ def test_write_cleaned_file_without_jurisdiction_returns_none(tmp_path): doc = HTMLDocument(["payload"]) doc.attrs["cleaned_text_for_extraction"] = "clean" - assert threaded._write_cleaned_file(doc, tmp_path) is None + assert threaded._write_cleaned_file(doc, tmp_path, tech="wind") is None def 
test_write_cleaned_file_skips_missing_section(tmp_path): @@ -241,7 +259,7 @@ def test_write_cleaned_file_skips_missing_section(tmp_path): doc.attrs.update({"cleaned_text_for_extraction": "clean"}) outputs = threaded._write_cleaned_file( - doc, tmp_path, jurisdiction_name="Partial" + doc, tmp_path, tech="wind", jurisdiction_name="Partial" ) assert [fp.name for fp in outputs] == ["Partial Cleaned Text.txt"] @@ -341,7 +359,7 @@ async def test_cleaned_file_writer_process(tmp_path, monkeypatch): writer = CleanedFileWriter(tmp_path) assert writer.can_process is True writer.acquire_resources() - outputs = await writer.process(doc, "Writer") + outputs = await writer.process(doc, "wind", "Writer") writer.release_resources() assert sorted(fp.name for fp in outputs) == [ From 9ae73eb2eeaed98313afc094d6f5e6d9aae8474f Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 22:03:43 -0700 Subject: [PATCH 20/22] Rename file --- .../{test_plugin_interface.py => test_plugin_ordinances.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/python/unit/plugin/{test_plugin_interface.py => test_plugin_ordinances.py} (99%) diff --git a/tests/python/unit/plugin/test_plugin_interface.py b/tests/python/unit/plugin/test_plugin_ordinances.py similarity index 99% rename from tests/python/unit/plugin/test_plugin_interface.py rename to tests/python/unit/plugin/test_plugin_ordinances.py index e4d79cd4..247d8ecb 100644 --- a/tests/python/unit/plugin/test_plugin_interface.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -1,4 +1,4 @@ -"""COMPASS web crawling tests""" +"""COMPASS ordinance plugin tests""" from pathlib import Path From 155b16e2ae655aabb9cf2983485553bba144b1a4 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 22:11:30 -0700 Subject: [PATCH 21/22] Fix docs --- compass/extraction/water/plugin.py | 2 +- compass/plugin/base.py | 2 +- docs/source/conf.py | 27 +++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff 
--git a/compass/extraction/water/plugin.py b/compass/extraction/water/plugin.py index 5ca51ed7..0729293d 100644 --- a/compass/extraction/water/plugin.py +++ b/compass/extraction/water/plugin.py @@ -84,7 +84,7 @@ class TexasWaterRightsExtractor(BaseExtractionPlugin): / "data" / "tx_water_districts.csv" ) - """path-like: Path to Texas GCW names""" + """:term:`path-like `: Path to Texas GCW names""" async def filter_docs( self, diff --git a/compass/plugin/base.py b/compass/plugin/base.py index 0dcf2fba..1a65182a 100644 --- a/compass/plugin/base.py +++ b/compass/plugin/base.py @@ -35,7 +35,7 @@ def __init__(self, jurisdiction, model_configs, usage_tracker=None): self.usage_tracker = usage_tracker JURISDICTION_DATA_FP = None - """path-like: Optional path to jurisdiction CSV + """:term:`path-like `: Path to jurisdiction CSV If provided, this CSV will extend the known jurisdictions (by default, US states, counties, and townships). This CSV must have the diff --git a/docs/source/conf.py b/docs/source/conf.py index 454898db..0b2b0c5f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -112,6 +112,33 @@ # Avoid warning about api.rst not in TOC suppress_warnings = ["toc.not_included"] +nitpick_ignore = [ + ( + "py:obj", + "compass.extraction.small_wind.ordinance.SmallWindOrdinanceTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.small_wind.ordinance.SmallWindPermittedUseDistrictsTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.solar.ordinance.SolarOrdinanceTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.solar.ordinance.SolarPermittedUseDistrictsTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.wind.ordinance.WindOrdinanceTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.wind.ordinance.WindPermittedUseDistrictsTextExtractor.OUT_LABEL", + ), +] + # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. 
See the documentation for From fe22287ac67f9174fdf2bf360d01169d533350bc Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Sat, 7 Feb 2026 12:20:16 -0700 Subject: [PATCH 22/22] PR review --- compass/plugin/ordinance.py | 4 +- compass/plugin/registry.py | 16 ++++-- .../unit/services/test_services_threaded.py | 55 ++++++++++--------- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index d0715464..eff794be 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -844,7 +844,7 @@ def validate_plugin_configuration(self): self._validate_parsers() self._validate_in_out_keys() self._validate_collector_prompts() - self._validate_collector_prompts() + self._validate_extractor_prompts() self._register_clean_file_names() def _validate_text_extractors(self): @@ -953,7 +953,7 @@ def _validate_collector_prompts(self): ) raise COMPASSPluginConfigurationError(msg) - def _validate_collector_prompts(self): + def _validate_extractor_prompts(self): """Validate that all text extractors have prompts defined""" for collector in self.TEXT_EXTRACTORS: diff --git a/compass/plugin/registry.py b/compass/plugin/registry.py index d7397411..c866a041 100644 --- a/compass/plugin/registry.py +++ b/compass/plugin/registry.py @@ -33,10 +33,16 @@ def register_plugin(plugin_class): ) raise COMPASSPluginConfigurationError(msg) - if plugin_class.JURISDICTION_DATA_FP is not None: - KNOWN_JURISDICTIONS_REGISTRY.add(plugin_class.JURISDICTION_DATA_FP) + if (plugin_id := plugin_class.IDENTIFIER.casefold()) in PLUGIN_REGISTRY: + msg = ( + f"Plugin identifier '{plugin_class.IDENTIFIER}' is already in " + "use by another plugin! Please choose a unique identifier for " + f"{plugin_class.__name__}." 
+ ) + raise COMPASSPluginConfigurationError(msg) - plugin_instance = plugin_class(None, None) - plugin_instance.validate_plugin_configuration() + plugin_class(None, None).validate_plugin_configuration() - PLUGIN_REGISTRY[plugin_class.IDENTIFIER.casefold()] = plugin_class + if plugin_class.JURISDICTION_DATA_FP is not None: + KNOWN_JURISDICTIONS_REGISTRY.add(plugin_class.JURISDICTION_DATA_FP) + PLUGIN_REGISTRY[plugin_id] = plugin_class diff --git a/tests/python/unit/services/test_services_threaded.py b/tests/python/unit/services/test_services_threaded.py index d4522c06..6a7b72b6 100644 --- a/tests/python/unit/services/test_services_threaded.py +++ b/tests/python/unit/services/test_services_threaded.py @@ -216,32 +216,35 @@ def test_write_cleaned_file_with_debug(tmp_path): } CLEANED_FP_REGISTRY["cleaned_file_test"] = fp_names - outputs = threaded._write_cleaned_file( - doc, - tmp_path, - tech="cleaned_file_test", - jurisdiction_name="Sample Jurisdiction", - ) - - expected_files = { - "Sample Jurisdiction Cleaned Text.txt", - "Sample Jurisdiction Districts.txt", - "Sample Jurisdiction Ordinance Original text.txt", - } - assert {fp.name for fp in outputs} == expected_files - assert all(fp.exists() for fp in outputs) - - debug_fp = tmp_path / "Sample Jurisdiction Cleaned Text.txt" - assert debug_fp.exists() - assert debug_fp.read_text(encoding="utf-8") == "clean" - - debug_fp = tmp_path / "Sample Jurisdiction Districts.txt" - assert debug_fp.exists() - assert debug_fp.read_text(encoding="utf-8") == "districts" - - debug_fp = tmp_path / "Sample Jurisdiction Ordinance Original text.txt" - assert debug_fp.exists() - assert debug_fp.read_text(encoding="utf-8") == "orig" + try: + outputs = threaded._write_cleaned_file( + doc, + tmp_path, + tech="cleaned_file_test", + jurisdiction_name="Sample Jurisdiction", + ) + + expected_files = { + "Sample Jurisdiction Cleaned Text.txt", + "Sample Jurisdiction Districts.txt", + "Sample Jurisdiction Ordinance Original text.txt", + } + 
assert {fp.name for fp in outputs} == expected_files + assert all(fp.exists() for fp in outputs) + + debug_fp = tmp_path / "Sample Jurisdiction Cleaned Text.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "clean" + + debug_fp = tmp_path / "Sample Jurisdiction Districts.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "districts" + + debug_fp = tmp_path / "Sample Jurisdiction Ordinance Original text.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "orig" + finally: + del CLEANED_FP_REGISTRY["cleaned_file_test"] def test_write_cleaned_file_without_jurisdiction_returns_none(tmp_path):