From 9d9af2dbacf0fd2a9e00371002b274a9f77667a3 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 15:58:28 -0700 Subject: [PATCH 01/22] Add debug logging --- compass/scripts/process.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/compass/scripts/process.py b/compass/scripts/process.py index cbb8f67a..cb8c579a 100644 --- a/compass/scripts/process.py +++ b/compass/scripts/process.py @@ -874,6 +874,11 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing had no known local docs configured", + self.jurisdiction.full_name, + ) if self.known_doc_urls: logger.debug( @@ -885,6 +890,11 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing had no known URLs configured", + self.jurisdiction.full_name, + ) if self.perform_se_search: logger.debug( @@ -897,6 +907,11 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing didn't have SE search enabled", + self.jurisdiction.full_name, + ) if self.perform_website_search: logger.debug( @@ -908,6 +923,12 @@ async def _run(self): ) if extraction_context is not None: return extraction_context + else: + logger.debug( + "%r processing didn't have jurisdiction website search " + "enabled", + self.jurisdiction.full_name, + ) return None From 80f5d1a205b4ceb7fd50fb7796a03493cb79aee7 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 19:16:03 -0700 Subject: [PATCH 02/22] Implement and use `PromptBasedTextExtractor` --- compass/extraction/small_wind/ordinance.py | 485 ++++++++------------- compass/extraction/solar/ordinance.py | 358 ++++++--------- compass/extraction/wind/ordinance.py | 483 ++++++++------------ compass/plugin/ordinance.py | 143 +++++- 4 files changed, 610 insertions(+), 859 deletions(-) diff --git a/compass/extraction/small_wind/ordinance.py 
b/compass/extraction/small_wind/ordinance.py index 426df3b4..d392c37f 100644 --- a/compass/extraction/small_wind/ordinance.py +++ b/compass/extraction/small_wind/ordinance.py @@ -9,7 +9,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,151 @@ _IGNORE_TYPES_MICRO = "private, micro, personal, building-mounted" _IGNORE_TYPES_LARGE = "large, utility-scale, for-sale, commercial" +_WECS_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**wind energy systems**. The extracted text will be used for structured \ +data extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable to wind energy systems** while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to wind energy systems** from the provided \ +excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **wind energy systems**. +- Explicitly include any text related to **bans or prohibitions** on wind \ +energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). + +2. ## Exclusions ##: +- Do **not** include text that does not pertain to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_SMALL_WECS_TEXT_EXTRACTION_PROMPT = f"""\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**small, medium, or non-commercial wind energy systems**. 
The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. Ensure that all \ +retained information is **directly applicable** to small, medium, or \ +non-commercial wind energy systems while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to small, medium or non-commercial wind energy \ +systems** from the provided excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **small, medium, or non-commercial wind \ +energy systems**, even if they are referred to by different names such as: \ +{_SMALL_WES_SYNONYMS.capitalize()} +- Explicitly include any text related to **bans or prohibitions** on small, \ +medium, or non-commercial wind energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). +- **Retain all relevant technical, design, operational, safety, \ +environmental, and infrastructure-related provisions** that apply to the \ +topic, such as (but not limited to): + - Compliance with legal or regulatory standards. + - Site, structural, or design specifications. + - Environmental impact considerations. + - Safety and risk mitigation measures. + - Infrastructure, implementation, operation, and maintenance details. + - All other **closely related provisions**. + +2. ## Exclusions ##: +- Do **not** include text that explicitly applies **only** to \ +{_IGNORE_TYPES_MICRO} or {_IGNORE_TYPES_LARGE} wind energy systems. +- Do **not** include text that does not pertain at all to wind energy systems. + +3.{{FORMATTING_PROMPT}} + +4. {{OUTPUT_PROMPT}}\ +""" + +_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +permitted use(s) for a district. 
The extracted text will be used for \ +structured data extraction, so it must be both **comprehensive** (retaining \ +all relevant details) and **focused** (excluding unrelated content), with \ +**zero rewriting or paraphrasing**. Ensure that all retained information is \ +**directly applicable** to permitted use(s) for one or more districts while \ +preserving full context and accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Pay extra attention to any references to **wind energy facilities** or \ +related terms. +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do **not** include unrelated regulations, procedural details, or \ +non-use-based restrictions. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +**wind energy system** permitted use(s) for a district. The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. Ensure that all \ +retained information is **directly applicable** to permitted use(s) for wind \ +energy systems in one or more districts while preserving full context and \ +accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to wind energy conversion system \ +permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. 
## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do not include text that does not pertain at all to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + class SmallWindHeuristic(OrdinanceHeuristic): """Perform a heuristic check for mention of wind turbines in text""" @@ -295,189 +440,39 @@ async def check_chunk(self, chunk_parser, ind): return False -class SmallWindOrdinanceTextExtractor(OrdinanceTextExtractor): +class SmallWindOrdinanceTextExtractor(PromptBasedTextExtractor): """Extract succinct ordinance text from input""" IN_LABEL = SmallWindOrdinanceTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "cleaned_text_for_extraction" - """Identifier for ordinance text extracted by this class""" - TASK_DESCRIPTION = "Extracting small wind ordinance text" """Task description to show in progress bar""" TASK_ID = "ordinance_text_extraction" """ID to use for this extraction for linking with LLM configs""" - WIND_ENERGY_SYSTEM_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **wind energy systems**. The extracted text will be used for " - "structured data extraction, so it must be both **comprehensive** " - "(retaining all relevant details) and **focused** (excluding " - "unrelated content), with **zero rewriting or paraphrasing**. 
" - "Ensure that all retained information is " - "**directly applicable to wind energy systems** while preserving " - "full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to wind energy systems** from the " - "provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **wind energy systems**.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that does not pertain to wind energy " - "systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for WECS""" - - SMALL_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **small, medium, or non-commercial wind energy systems**. 
The " - "extracted text will be used for structured data extraction, so it " - "must be both **comprehensive** (retaining all relevant details) and " - "**focused** (excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to small, medium, or non-commercial wind " - "energy systems while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to small, medium or non-commercial " - "wind energy systems** from the provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **small, medium, or " - "non-commercial wind energy systems**, even if they are referred to " - "by different names such as:\n" - f"\t{_SMALL_WES_SYNONYMS.capitalize()}.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on small, medium, or non-commercial wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "- **Retain all relevant technical, design, operational, safety, " - "environmental, and infrastructure-related provisions** that apply " - "to the topic, such as (but not limited to):\n" - "\t- Compliance with legal or regulatory standards.\n" - "\t- Site, structural, or design specifications.\n" - "\t- Environmental impact considerations.\n" - "\t- Safety and risk mitigation measures.\n" - "\t- Infrastructure, implementation, operation, and maintenance " - "details.\n" - "\t- All other **closely related provisions**.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that explicitly applies **only** to " - f"{_IGNORE_TYPES_MICRO} or {_IGNORE_TYPES_LARGE} " - "wind energy systems.\n" - f"- Do **not** include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for small WECS""" - - async def extract_wind_energy_system_section(self, text_chunks): - """Extract ordinance text from input text chunks for WES - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WIND_ENERGY_SYSTEM_FILTER_PROMPT, - ) - - async def extract_small_wind_energy_system_section(self, text_chunks): - """Extract small WES ordinance text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.SMALL_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield ( - "wind_energy_systems_text", - self.extract_wind_energy_system_section, - ) - yield self.OUT_LABEL, self.extract_small_wind_energy_system_section + PROMPTS = [ + { + "key": "wind_energy_systems_text", + "out_fn": "{jurisdiction} Wind Ordinance Text.txt", + "prompt": _WECS_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "cleaned_text_for_extraction", + "out_fn": "{jurisdiction} Cleaned Text.txt", + "prompt": _SMALL_WECS_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for ordinance text extraction""" -class SmallWindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): +class SmallWindPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): """Extract succinct permitted use districts text from input""" IN_LABEL = SmallWindPermittedUseDistrictsTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "districts_text" - """Identifier for permitted use text extracted by this class""" - TASK_DESCRIPTION = "Extracting small wind permitted use text" """Task description to show in progress bar""" @@ -486,148 +481,16 @@ class SmallWindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): _USAGE_LABEL = LLMUsageCategory.DOCUMENT_PERMITTED_USE_DISTRICTS_SUMMARY - PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing permitted use(s) for a district. 
The extracted text will " - "be used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to permitted use(s) for one or more " - "districts while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to permitted use(s) for " - "a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Pay extra attention to any references to **wind energy " - "facilities** or related terms.\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include unrelated regulations, procedural details, " - "or non-use-based restrictions.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. 
## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses""" - - WES_PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing **wind energy system** permitted use(s) for a district. " - "The extracted text will be used for structured data extraction, so " - "it must be both **comprehensive** (retaining all relevant details) " - "and **focused** (excluding unrelated content), with **zero rewriting " - "or paraphrasing**. Ensure that all " - "retained information is **directly applicable** to permitted use(s) " - "for wind energy systems in one or more districts while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to wind energy conversion " - "system permitted use(s) for a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do not include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses for WECS""" - - async def extract_permitted_uses(self, text_chunks): - """Extract permitted uses text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.PERMITTED_USES_FILTER_PROMPT, - ) - - async def extract_wes_permitted_uses(self, text_chunks): - """Extract permitted uses text for small WES from input text - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WES_PERMITTED_USES_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield "permitted_use_only_text", self.extract_permitted_uses - yield self.OUT_LABEL, self.extract_wes_permitted_uses + PROMPTS = [ + { + "key": "permitted_use_only_text", + "out_fn": "{jurisdiction} Permitted Use Only.txt", + "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "districts_text", + "out_fn": "{jurisdiction} Districts.txt", + "prompt": _WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text extraction""" diff --git a/compass/extraction/solar/ordinance.py b/compass/extraction/solar/ordinance.py index 3679e519..67dbb61d 100644 --- a/compass/extraction/solar/ordinance.py +++ b/compass/extraction/solar/ordinance.py @@ -9,7 +9,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -34,6 +34,109 @@ "CSP, private, residential, roof-mounted, micro, small, or medium sized" ) +_SEF_TEXT_EXTRACTION_PROMPT = f"""\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**solar energy systems**. The extracted text will be used for structured data \ +extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable to solar energy systems** while preserving full context and \ +accuracy. 
+ +# OBJECTIVE # +Extract all text **pertaining to solar energy systems** from the provided \ +excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include **all** text that pertains to** solar energy systems**, even if \ +they are referred to by different names such as: \ +{_LARGE_SEF_SYNONYMS.capitalize()} +- Explicitly include any text related to **bans or prohibitions** on solar \ +energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). + +2. ## Exclusions ##: +- Do **not** include text that does not pertain to solar energy systems. + +3. {{FORMATTING_PROMPT}} + +4. {{OUTPUT_PROMPT}}\ +""" + +_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +permitted use(s) for a district. The extracted text will be used for \ +structured data extraction, so it must be both **comprehensive** (retaining \ +all relevant details) and **focused** (excluding unrelated content), with \ +**zero rewriting or paraphrasing**. Ensure that all retained information is \ +**directly applicable** to permitted use(s) for one or more districts while \ +preserving full context and accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Pay extra attention to any references to **solar energy facilities** or \ +related terms. +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do **not** include unrelated regulations, procedural details, or \ +non-use-based restrictions. 
+ +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_SEF_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +**solar energy system** permitted use(s) for a district. The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. Ensure that all \ +retained information is **directly applicable** to permitted use(s) for solar \ +energy systems in one or more districts while preserving full context and \ +accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to solar energy conversion system \ +permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do not include text that does not pertain at all to solar energy systems. + +3. {FORMATTING_PROMPT} + +4. 
{OUTPUT_PROMPT}\ +""" + class SolarHeuristic(OrdinanceHeuristic): """Perform a heuristic check for mention of solar farms in text""" @@ -249,109 +352,34 @@ async def check_chunk(self, chunk_parser, ind): return False -class SolarOrdinanceTextExtractor(OrdinanceTextExtractor): +class SolarOrdinanceTextExtractor(PromptBasedTextExtractor): """Extract succinct ordinance text from input""" IN_LABEL = SolarOrdinanceTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "cleaned_text_for_extraction" - """Identifier for ordinance text extracted by this class""" - TASK_DESCRIPTION = "Extracting solar ordinance text" """Task description to show in progress bar""" TASK_ID = "ordinance_text_extraction" """ID to use for this extraction for linking with LLM configs""" - SOLAR_ENERGY_SYSTEM_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **solar energy systems**. The extracted text will be used for " - "structured data extraction, so it must be both **comprehensive** " - "(retaining all relevant details) and **focused** (excluding " - "unrelated content), with **zero rewriting or paraphrasing**. " - "Ensure that all retained information is " - "**directly applicable to solar energy systems** while preserving " - "full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to solar energy systems** from the " - "provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include **all** text that pertains to** solar energy systems**, " - "even if they are referred to by different names such as:\n" - f"\t{_LARGE_SEF_SYNONYMS.capitalize()}.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on solar energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "\n2. 
## Exclusions ##:\n" - "- Do **not** include text that does not pertain to solar energy " - "systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for SEF""" - - async def extract_solar_energy_system_section(self, text_chunks): - """Extract ordinance text from input text chunks for SEF - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.SOLAR_ENERGY_SYSTEM_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. 
- """ - yield self.OUT_LABEL, self.extract_solar_energy_system_section + PROMPTS = [ + { + "key": "cleaned_text_for_extraction", + "out_fn": "{jurisdiction} Cleaned Text.txt", + "prompt": _SEF_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for ordinance text extraction""" -class SolarPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): +class SolarPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): """Extract succinct permitted use districts text from input""" IN_LABEL = SolarPermittedUseDistrictsTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "districts_text" - """Identifier for permitted use text extracted by this class""" - TASK_DESCRIPTION = "Extracting solar permitted use text" """Task description to show in progress bar""" @@ -360,148 +388,16 @@ class SolarPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): _USAGE_LABEL = LLMUsageCategory.DOCUMENT_PERMITTED_USE_DISTRICTS_SUMMARY - PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing permitted use(s) for a district. The extracted text will " - "be used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to permitted use(s) for one or more " - "districts while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to permitted use(s) for " - "a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. 
## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Pay extra attention to any references to **solar energy " - "facilities** or related terms.\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include unrelated regulations, procedural details, " - "or non-use-based restrictions.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses""" - - SEF_PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing **solar energy system** permitted use(s) for a district. " - "The extracted text will be used for structured data extraction, so " - "it must be both **comprehensive** (retaining all relevant details) " - "and **focused** (excluding unrelated content), with **zero rewriting " - "or paraphrasing**. 
Ensure that all " - "retained information is **directly applicable** to permitted use(s) " - "for solar energy systems in one or more districts while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to solar energy " - "conversion system permitted use(s) for a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do not include text that does not pertain at all to solar " - "energy systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. 
## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses for SEF""" - - async def extract_permitted_uses(self, text_chunks): - """Extract permitted uses text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.PERMITTED_USES_FILTER_PROMPT, - ) - - async def extract_sef_permitted_uses(self, text_chunks): - """Extract permitted uses text for large SEF from input text - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.SEF_PERMITTED_USES_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. 
- """ - yield "permitted_use_only_text", self.extract_permitted_uses - yield self.OUT_LABEL, self.extract_sef_permitted_uses + PROMPTS = [ + { + "key": "permitted_use_only_text", + "out_fn": "{jurisdiction} Permitted Use Only.txt", + "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "districts_text", + "out_fn": "{jurisdiction} Districts.txt", + "prompt": _SEF_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text extraction""" diff --git a/compass/extraction/wind/ordinance.py b/compass/extraction/wind/ordinance.py index d1be4f93..aa547184 100644 --- a/compass/extraction/wind/ordinance.py +++ b/compass/extraction/wind/ordinance.py @@ -9,7 +9,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,151 @@ _SEARCH_TERMS_OR = _SEARCH_TERMS_AND.replace("and", "or") _IGNORE_TYPES = "private, residential, micro, small, or medium sized" +_WECS_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**wind energy systems**. The extracted text will be used for structured data \ +extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable to wind energy systems** while preserving full context and accuracy. + +# OBJECTIVE # +Extract all text **pertaining to wind energy systems** from the provided \ +excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **wind energy systems**. +- Explicitly include any text related to **bans or prohibitions** on wind \ +energy systems. 
+- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). + +2. ## Exclusions ##: +- Do **not** include text that does not pertain to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_LARGE_WECS_TEXT_EXTRACTION_PROMPT = f"""\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information about \ +**large wind energy systems**. The extracted text will be used for structured \ +data extraction, so it must be both **comprehensive** (retaining all relevant \ +details) and **focused** (excluding unrelated content), with **zero rewriting \ +or paraphrasing**. Ensure that all retained information is **directly \ +applicable** to large wind energy systems while preserving full context and \ +accuracy. + +# OBJECTIVE # +Extract all text **pertaining to large wind energy systems** from the \ +provided excerpt. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Include all text that pertains to **large wind energy systems**, even if \ +they are referred to by different names such as: \ +{_LARGE_WES_SYNONYMS.capitalize()} +- Explicitly include any text related to **bans or prohibitions** on large \ +wind energy systems. +- Explicitly include any text related to the adoption or enactment date of \ +the ordinance (if any). +- **Retain all relevant technical, design, operational, safety, \ +environmental, and infrastructure-related provisions** that apply to the \ +topic, such as (but not limited to): + - Compliance with legal or regulatory standards. + - Site, structural, or design specifications. + - Environmental impact considerations. + - Safety and risk mitigation measures. + - Infrastructure, implementation, operation, and maintenance details. + - All other **closely related provisions**. + +2. ## Exclusions ##: +- Do **not** include text that explicitly applies **only** to {_IGNORE_TYPES} \ +wind energy systems. 
+- Do **not** include text that does not pertain at all to wind energy systems. + +3. {{FORMATTING_PROMPT}} + +4. {{OUTPUT_PROMPT}}\ +""" + +_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +permitted use(s) for a district. The extracted text will be used for \ +structured data extraction, so it must be both **comprehensive** (retaining \ +all relevant details) and **focused** (excluding unrelated content), with \ +**zero rewriting or paraphrasing**. Ensure that all retained information is \ +**directly applicable** to permitted use(s) for one or more districts while \ +preserving full context and accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Pay extra attention to any references to **wind energy facilities** or \ +related terms. +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do **not** include unrelated regulations, procedural details, or \ +non-use-based restrictions. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + +_WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT = """\ +# CONTEXT # +We want to reduce the provided excerpt to only contain information detailing \ +**wind energy system** permitted use(s) for a district. The extracted text \ +will be used for structured data extraction, so it must be both \ +**comprehensive** (retaining all relevant details) and **focused** (excluding \ +unrelated content), with **zero rewriting or paraphrasing**. 
Ensure that all \ +retained information is **directly applicable** to permitted use(s) for wind \ +energy systems in one or more districts while preserving full context and \ +accuracy. + +# OBJECTIVE # +Remove all text **not directly pertinent** to wind energy conversion system \ +permitted use(s) for a district. + +# RESPONSE # +Follow these guidelines carefully: + +1. ## Scope of Extraction ##: +- Retain all text defining permitted use(s) for a district, including: + - **Primary, Special, Conditional, Accessory, Prohibited, and any other \ +use types.** + - **District names and zoning classifications.** +- Ensure that **tables, lists, and structured elements** are preserved as \ +they may contain relevant details. + +2. ## Exclusions ##: +- Do not include text that does not pertain at all to wind energy systems. + +3. {FORMATTING_PROMPT} + +4. {OUTPUT_PROMPT}\ +""" + class WindHeuristic(OrdinanceHeuristic): """Perform a heuristic check for mention of wind turbines in text""" @@ -267,187 +412,39 @@ async def check_chunk(self, chunk_parser, ind): return False -class WindOrdinanceTextExtractor(OrdinanceTextExtractor): +class WindOrdinanceTextExtractor(PromptBasedTextExtractor): """Extract succinct ordinance text from input""" IN_LABEL = WindOrdinanceTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "cleaned_text_for_extraction" - """Identifier for ordinance text extracted by this class""" - TASK_DESCRIPTION = "Extracting wind ordinance text" """Task description to show in progress bar""" TASK_ID = "ordinance_text_extraction" """ID to use for this extraction for linking with LLM configs""" - WIND_ENERGY_SYSTEM_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **wind energy systems**. 
The extracted text will be used for " - "structured data extraction, so it must be both **comprehensive** " - "(retaining all relevant details) and **focused** (excluding " - "unrelated content), with **zero rewriting or paraphrasing**. " - "Ensure that all retained information is " - "**directly applicable to wind energy systems** while preserving " - "full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to wind energy systems** from the " - "provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **wind energy systems**.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that does not pertain to wind energy " - "systems.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for WECS""" - - LARGE_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "about **large wind energy systems**. 
The extracted text will be " - "used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to large wind energy systems while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Extract all text **pertaining to large wind energy systems** from " - "the provided excerpt.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Include all text that pertains to **large wind energy systems**, " - "even if they are referred to by different names such as:\n" - f"\t{_LARGE_WES_SYNONYMS.capitalize()}.\n" - "- Explicitly include any text related to **bans or prohibitions** " - "on large wind energy systems.\n" - "- Explicitly include any text related to the adoption or enactment " - "date of the ordinance (if any).\n" - "- **Retain all relevant technical, design, operational, safety, " - "environmental, and infrastructure-related provisions** that apply " - "to the topic, such as (but not limited to):\n" - "\t- Compliance with legal or regulatory standards.\n" - "\t- Site, structural, or design specifications.\n" - "\t- Environmental impact considerations.\n" - "\t- Safety and risk mitigation measures.\n" - "\t- Infrastructure, implementation, operation, and maintenance " - "details.\n" - "\t- All other **closely related provisions**.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include text that explicitly applies **only** to " - f"{_IGNORE_TYPES} wind energy systems.\n" - f"- Do **not** include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for utility-scale WECS""" - - async def extract_wind_energy_system_section(self, text_chunks): - """Extract ordinance text from input text chunks for WES - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WIND_ENERGY_SYSTEM_FILTER_PROMPT, - ) - - async def extract_large_wind_energy_system_section(self, text_chunks): - """Extract large WES ordinance text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.LARGE_WIND_ENERGY_SYSTEM_SECTION_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield ( - "wind_energy_systems_text", - self.extract_wind_energy_system_section, - ) - yield self.OUT_LABEL, self.extract_large_wind_energy_system_section + PROMPTS = [ + { + "key": "wind_energy_systems_text", + "out_fn": "{jurisdiction} Wind Ordinance Text.txt", + "prompt": _WECS_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "cleaned_text_for_extraction", + "out_fn": "{jurisdiction} Cleaned Text.txt", + "prompt": _LARGE_WECS_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for ordinance text extraction""" -class WindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): +class WindPermittedUseDistrictsTextExtractor(PromptBasedTextExtractor): """Extract succinct permitted use districts text from input""" IN_LABEL = WindPermittedUseDistrictsTextCollector.OUT_LABEL """Identifier for collected text ingested by this class""" - OUT_LABEL = "districts_text" - """Identifier for permitted use text extracted by this class""" - TASK_DESCRIPTION = "Extracting wind permitted use text" """Task description to show in progress bar""" @@ -456,148 +453,16 @@ class WindPermittedUseDistrictsTextExtractor(OrdinanceTextExtractor): _USAGE_LABEL = LLMUsageCategory.DOCUMENT_PERMITTED_USE_DISTRICTS_SUMMARY - PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing permitted use(s) for a district. 
The extracted text will " - "be used for structured data extraction, so it must be both " - "**comprehensive** (retaining all relevant details) and **focused** " - "(excluding unrelated content), with **zero rewriting or " - "paraphrasing**. Ensure that all retained information " - "is **directly applicable** to permitted use(s) for one or more " - "districts while preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to permitted use(s) for " - "a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Pay extra attention to any references to **wind energy " - "facilities** or related terms.\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do **not** include unrelated regulations, procedural details, " - "or non-use-based restrictions.\n" - "\n3. ## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. 
## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses""" - - WES_PERMITTED_USES_FILTER_PROMPT = ( - "# CONTEXT #\n" - "We want to reduce the provided excerpt to only contain information " - "detailing **wind energy system** permitted use(s) for a district. " - "The extracted text will be used for structured data extraction, so " - "it must be both **comprehensive** (retaining all relevant details) " - "and **focused** (excluding unrelated content), with **zero rewriting " - "or paraphrasing**. Ensure that all " - "retained information is **directly applicable** to permitted use(s) " - "for wind energy systems in one or more districts while " - "preserving full context and accuracy.\n" - "\n# OBJECTIVE #\n" - "Remove all text **not directly pertinent** to wind energy conversion " - "system permitted use(s) for a district.\n" - "\n# RESPONSE #\n" - "Follow these guidelines carefully:\n" - "\n1. ## Scope of Extraction ##:\n" - "- Retain all text defining permitted use(s) for a district, " - "including:\n" - "\t- **Primary, Special, Conditional, Accessory, Prohibited, and " - "any other use types.**\n" - "\t- **District names and zoning classifications.**\n" - "- Ensure that **tables, lists, and structured elements** are " - "preserved as they may contain relevant details.\n" - "\n2. ## Exclusions ##:\n" - "- Do not include text that does not pertain at all to wind " - "energy systems.\n" - "\n3. 
## Formatting & Structure ##:\n" - "- **Preserve _all_ section titles, headers, and numberings** for " - "reference, **especially if they contain the district name**.\n" - "- **Maintain the original wording, formatting, and structure** to " - "ensure accuracy.\n" - "\n4. ## Output Handling ##:\n" - "- This is a strict extraction task — act like a text filter, **not** " - "a summarizer or writer.\n" - "- Do not add, explain, reword, or summarize anything.\n" - "- The output must be a **copy-paste** of the original excerpt.\n" - "**Absolutely no paraphrasing or rewriting.**\n" - "- The output must consist **only** of contiguous or discontiguous " - "verbatim blocks copied from the input.\n" - "- If **no relevant text** is found, return the response: " - "'No relevant text.'" - ) - """Prompt to extract ordinance text for permitted uses for WECS""" - - async def extract_permitted_uses(self, text_chunks): - """Extract permitted uses text from input text chunks - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. - """ - return await self._process( - text_chunks=text_chunks, - instructions=self.PERMITTED_USES_FILTER_PROMPT, - ) - - async def extract_wes_permitted_uses(self, text_chunks): - """Extract permitted uses text for large WES from input text - - Parameters - ---------- - text_chunks : list of str - List of strings, each of which represent a chunk of text. - The order of the strings should be the order of the text - chunks. - - Returns - ------- - str - Ordinance text extracted from text chunks. 
- """ - return await self._process( - text_chunks=text_chunks, - instructions=self.WES_PERMITTED_USES_FILTER_PROMPT, - ) - - @property - def parsers(self): - """Iterable of parsers provided by this extractor - - Yields - ------ - name : str - Name describing the type of text output by the parser. - parser : callable - Async function that takes a ``text_chunks`` input and - outputs parsed text. - """ - yield "permitted_use_only_text", self.extract_permitted_uses - yield self.OUT_LABEL, self.extract_wes_permitted_uses + PROMPTS = [ + { + "key": "permitted_use_only_text", + "out_fn": "{jurisdiction} Permitted Use Only.txt", + "prompt": _PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + { + "key": "districts_text", + "out_fn": "{jurisdiction} Districts.txt", + "prompt": _WECS_PERMITTED_USES_TEXT_EXTRACTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text extraction""" diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index affa5fb8..7cb2c6d1 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -197,20 +197,126 @@ def _store_chunk(self, parser, chunk_ind): ) -class OrdinanceTextExtractor(BaseTextExtractor, ABC): - """Base implementation for a text extractor""" +class PromptBasedTextExtractor(BaseTextExtractor, ABC): + """Text extractor based on a chain of prompts""" SYSTEM_MESSAGE = ( - "You are a text extraction assistant. Your job is to extract only " - "verbatim, **unmodified** excerpts from provided legal or policy " - "documents. Do not interpret or paraphrase. Do not summarize. Only " - "return exactly copied segments that match the specified scope. If " - "the relevant content appears within a table, return the entire " - "table, including headers and footers, exactly as formatted." + dedent( + """\ + You are a text extraction assistant. Your job is to extract only + verbatim, **unmodified** excerpts from the provided text. Do not + interpret or paraphrase. Do not summarize. 
Only return exactly copied + segments that match the specified scope. If the relevant content + appears within a table, return the entire table, including headers + and footers, exactly as formatted. + """ + ) + .replace("\n", " ") + .strip() ) """System message for text extraction LLM calls""" + + FORMATTING_PROMPT = ( + dedent( + """\ + ## Formatting & Structure ##: + - **Preserve _all_ section titles, headers, and numberings** for + reference. + - **Maintain the original wording, formatting, and structure** to + ensure accuracy. + """ + ) + .replace("\n ", " ") + .strip() + ) + """Prompt component instructing model to preserve text structure""" + + OUTPUT_PROMPT = ( + dedent( + """\ + ## Output Handling ##: + - This is a strict extraction task — act like a text filter, **not** + a summarizer or writer. + - Do not add, explain, reword, or summarize anything. + - The output must be a **copy-paste** of the original excerpt. + **Absolutely no paraphrasing or rewriting.** + - The output must consist **only** of contiguous or discontiguous + verbatim blocks copied from the input. + - If **no relevant text** is found, return the response: + 'No relevant text.' + """ + ) + .replace("\n ", " ") + .strip() + ) + """Prompt component instructing model output guidelines""" + _USAGE_LABEL = LLMUsageCategory.DOCUMENT_ORDINANCE_SUMMARY + @property + @abstractmethod + def PROMPTS(self): # noqa: N802 + """list: List of dicts defining the prompts for text extraction + + Each dict in the list should have the following keys: + + - **prompt**: [REQUIRED] The text extraction prompt to use + for the extraction. The prompt may use the following + placeholders, which will be filled in with the + corresponding class attributes when the prompt is applied: + + - ``"{FORMATTING_PROMPT}"``: The + :obj:`PromptBasedTextExtractor.FORMATTING_PROMPT` + class attribute, which provides instructions for + preserving the formatting and structure of the + extracted text. 
+ - ``"{OUTPUT_PROMPT}"``: The + :obj:`PromptBasedTextExtractor.OUTPUT_PROMPT` + class attribute, which provides instructions for + how the model should format the output and what + content to include or exclude. + + - **key**: [OPTIONAL] A string identifier for the text + extracted by this prompt. If not provided, a default key + ``"extracted_text_{i}"`` will be used, where ``{i}`` is + the index of the prompt in the list. The value of this key + from the last dictionary in the input list will be used as + this extractor's `OUT_LABEL`, which is typically used to + link the extracted text to the appropriate parser via the + parser's `IN_LABEL`. All `key` values should be unique + across all prompts in the chain. + - **out_fn**: [OPTIONAL] A file name template that will be + used to write the extracted text to a file. The template + can include the placeholder ``{jurisdiction}``, which + will be replaced with the full jurisdiction name. If not + provided, the extracted text will not be written to a + file. This is primarily intended for debugging and + analysis purposes, and is not required for the extraction + process itself. + + The prompts will be applied in the order they appear in the + list, with the output text from each prompt being fed as input + to the next prompt in the chain. The final output of the last + prompt will be the output of the extractor. 
+ """ + raise NotImplementedError + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + if getattr(cls, "__abstractmethods__", None): + return + + if not cls.PROMPTS: # TODO: This should happen at registration + msg = ( + f"{cls.__name__} must have at least one " + "prompt defined in the PROMPTS property" + ) + raise COMPASSPluginConfigurationError(msg) + + last_prompt = cls.PROMPTS[-1] + last_index = len(cls.PROMPTS) - 1 + cls.OUT_LABEL = last_prompt.get("key", f"extracted_text_{last_index}") + def __init__(self, llm_caller): """ @@ -221,6 +327,27 @@ def __init__(self, llm_caller): """ self.llm_caller = llm_caller + @property + def parsers(self): + """Iterable of parsers provided by this extractor + + Yields + ------ + name : str + Name describing the type of text output by the parser. + parser : callable + Async function that takes a ``text_chunks`` input and + outputs parsed text. + """ + for ind, prompt_dict in enumerate(self.PROMPTS): + key = prompt_dict.get("key", f"extracted_text_{ind}") + instructions = prompt_dict["prompt"].format( + FORMATTING_PROMPT=self.FORMATTING_PROMPT, + OUTPUT_PROMPT=self.OUTPUT_PROMPT, + ) + # out_fn = prompt_dict.get("out_fn", None) + yield key, partial(self._process, instructions=instructions) + async def _process(self, text_chunks, instructions, is_valid_chunk=None): """Perform extraction processing""" if is_valid_chunk is None: From 0e06c07090df4c88268fd99729bc02b1707d0ec1 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 19:22:26 -0700 Subject: [PATCH 03/22] Break uo plugin class into a filtered one and ordinance one --- compass/extraction/small_wind/plugin.py | 4 +- compass/extraction/solar/plugin.py | 4 +- compass/extraction/wind/plugin.py | 4 +- compass/plugin/__init__.py | 9 +- compass/plugin/interface.py | 434 +---------------- compass/plugin/ordinance.py | 443 +++++++++++++++++- docs/source/conf.py | 4 +- .../unit/plugin/test_plugin_interface.py | 22 +- 8 files changed, 475 
insertions(+), 449 deletions(-) diff --git a/compass/extraction/small_wind/plugin.py b/compass/extraction/small_wind/plugin.py index e4d3de1b..02fb97ed 100644 --- a/compass/extraction/small_wind/plugin.py +++ b/compass/extraction/small_wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin.interface import ExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin from compass.extraction.small_wind.ordinance import ( SmallWindHeuristic, SmallWindOrdinanceTextCollector, @@ -60,7 +60,7 @@ } -class COMPASSSmallWindExtractor(ExtractionPlugin): +class COMPASSSmallWindExtractor(OrdinanceExtractionPlugin): """COMPASS small wind extraction plugin""" IDENTIFIER = "small wind" diff --git a/compass/extraction/solar/plugin.py b/compass/extraction/solar/plugin.py index 8123f2ac..4f6b5163 100644 --- a/compass/extraction/solar/plugin.py +++ b/compass/extraction/solar/plugin.py @@ -1,6 +1,6 @@ """COMPASS solar extraction plugin""" -from compass.plugin.interface import ExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin from compass.extraction.solar.ordinance import ( SolarHeuristic, SolarOrdinanceTextCollector, @@ -61,7 +61,7 @@ } -class COMPASSSolarExtractor(ExtractionPlugin): +class COMPASSSolarExtractor(OrdinanceExtractionPlugin): """COMPASS solar extraction plugin""" IDENTIFIER = "solar" diff --git a/compass/extraction/wind/plugin.py b/compass/extraction/wind/plugin.py index 1e22ffaa..905bcc87 100644 --- a/compass/extraction/wind/plugin.py +++ b/compass/extraction/wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin.interface import ExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin from compass.extraction.wind.ordinance import ( WindHeuristic, WindOrdinanceTextCollector, @@ -59,7 +59,7 @@ } -class COMPASSWindExtractor(ExtractionPlugin): +class COMPASSWindExtractor(OrdinanceExtractionPlugin): """COMPASS wind extraction plugin""" IDENTIFIER = "wind" diff 
--git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index f42cef35..369a3c80 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -4,13 +4,14 @@ from .interface import ( BaseHeuristic, BaseTextCollector, - BaseTextExtractor, - BaseParser, - ExtractionPlugin, + FilteredExtractionPlugin, ) from .ordinance import ( + BaseTextExtractor, + BaseParser, OrdinanceHeuristic, OrdinanceTextCollector, - OrdinanceTextExtractor, + PromptBasedTextExtractor, OrdinanceParser, + OrdinanceExtractionPlugin, ) diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index f73cff8c..25e8607b 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -1,52 +1,20 @@ """COMPASS extraction plugin base class""" -import asyncio import logging -from itertools import chain from abc import ABC, abstractmethod -from contextlib import contextmanager -from functools import cached_property - -import pandas as pd from compass.plugin.base import BaseExtractionPlugin from compass.llm.calling import LLMCaller -from compass.extraction import ( - extract_ordinance_values, - extract_relevant_text_with_ngram_validation, -) +from compass.extraction import extract_relevant_text_with_ngram_validation from compass.scripts.download import filter_ordinance_docs from compass.services.threaded import CleanedFileWriter -from compass.utilities.enums import LLMTasks -from compass.utilities import ( - num_ordinances_dataframe, - doc_infos_to_db, - save_db, -) -from compass.utilities.parsing import extract_ord_year_from_doc_attrs -from compass.exceptions import COMPASSPluginConfigurationError -from compass.pb import COMPASS_PB +from compass.utilities import doc_infos_to_db, save_db -logger = logging.getLogger(__name__) +logger = logging.getLogger(__name__) -EXCLUDE_FROM_ORD_DOC_CHECK = { - # if doc only contains these, it's not good enough to count as an - # ordinance. 
Note that prohibitions are explicitly not on this list - "color", - "decommissioning", - "lighting", - "visual impact", - "glare", - "repowering", - "fencing", - "climbing prevention", - "signage", - "soil", - "primary use districts", - "special use districts", - "accessory use districts", -} +# TODO: Allow other to register own clean file outputs +# TODO: Allow other to register their own jurisdictions csv class BaseHeuristic(ABC): @@ -129,86 +97,15 @@ async def check_chunk(self, chunk_parser, ind): raise NotImplementedError -class BaseTextExtractor(ABC): - """Extract succinct extraction text from input""" - - TASK_DESCRIPTION = "Condensing text for extraction" - """Task description to show in progress bar""" - - TASK_ID = "text_extraction" - """ID to use for this extraction for linking with LLM configs""" - - @property - @abstractmethod - def IN_LABEL(self): # noqa: N802 - """str: Identifier for text ingested by this class""" - raise NotImplementedError - - @property - @abstractmethod - def OUT_LABEL(self): # noqa: N802 - """str: Identifier for final text extracted by this class""" - raise NotImplementedError - - @property - @abstractmethod - def parsers(self): - """Generator: Generator of (key, extractor) pairs - - `extractor` should be an async callable that accepts a list of - text chunks and returns the shortened (succinct) text to be used - for extraction. The `key` should be a string identifier for the - text returned by the extractor. Multiple (key, extractor) pairs - can be chained in generator order to iteratively refine the - text for extraction. 
- """ - raise NotImplementedError - - -class BaseParser(ABC): - """Extract succinct extraction text from input""" - - TASK_ID = "data_extraction" - """ID to use for this extraction for linking with LLM configs""" - - @property - @abstractmethod - def IN_LABEL(self): # noqa: N802 - """str: Identifier for text ingested by this class""" - raise NotImplementedError - - @property - @abstractmethod - def OUT_LABEL(self): # noqa: N802 - """str: Identifier for final structured data output""" - raise NotImplementedError - - @abstractmethod - async def parse(self, text): - """Parse text and extract structured data - - Parameters - ---------- - text : str - Text which may or may not contain information relevant to - the current extraction. - - Returns - ------- - pandas.DataFrame or None - DataFrame containing structured extracted data. Can also - be ``None`` if no relevant values can be parsed from the - text. - """ - raise NotImplementedError - - -class ExtractionPlugin(BaseExtractionPlugin): +class FilteredExtractionPlugin(BaseExtractionPlugin): """Base class for COMPASS extraction plugins - This class provides a good balance between ease of use and - extraction flexibility, allowing implementers to provide additional - functionality during the extraction process. + This class provides the standard COMPASS document filtering and text + collection pipeline, allowing implementers to focus primarily on the + structured data extraction step. Filtering and text collection is + provided by subclassing the `BaseTextCollector` class and setting + the `TEXT_COLLECTORS` property to a list of the desired text + collectors. 
Plugins can hook into various stages of the extraction pipeline to modify behavior, add custom processing, or integrate with @@ -255,26 +152,6 @@ def TEXT_COLLECTORS(self): # noqa: N802 """ raise NotImplementedError - @property - @abstractmethod - def TEXT_EXTRACTORS(self): # noqa: N802 - """list of BaseTextExtractor: Classes to condense text - - Should be an iterable of one or more classes to condense text in - preparation for the extraction task. - """ - raise NotImplementedError - - @property - @abstractmethod - def PARSERS(self): # noqa: N802 - """list of BaseParser: Classes to extract structured data - - Should be an iterable of one or more classes to extract - structured data from text. - """ - raise NotImplementedError - @property def heuristic(self): """BaseHeuristic: Object with a ``check()`` method @@ -312,65 +189,6 @@ def save_structured_data(cls, doc_infos, out_dir): save_db(db, out_dir) return num_docs_found - def __init__(self, jurisdiction, model_configs, usage_tracker=None): - """ - - Parameters - ---------- - jurisdiction : Jurisdiction - Jurisdiction for which extraction is being performed. - model_configs : dict - Dictionary where keys are LLMTasks and values are LLMConfig - instances to be used for those tasks. - usage_tracker : UsageTracker, optional - Usage tracker instance that can be used to record the LLM - call cost. By default, ``None``. 
- """ - super().__init__( - jurisdiction=jurisdiction, - model_configs=model_configs, - usage_tracker=usage_tracker, - ) - - # TODO: This should happen during plugin registration - self._validate_in_out_keys() - - @cached_property - def producers(self): - """list: All classes that produce attributes on the doc""" - return chain(self.PARSERS, self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS) - - @cached_property - def consumer_producer_pairs(self): - """list: Pairs of (consumer, producer) for IN/OUT validation""" - return [ - (self.PARSERS, chain(self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS)), - (self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS), - ] - - def _validate_in_out_keys(self): - """Validate that all IN_LABELs have matching OUT_LABELs""" - out_keys = {} - for producer in self.producers: - out_keys.setdefault(producer.OUT_LABEL, []).append(producer) - - dupes = {k: v for k, v in out_keys.items() if len(v) > 1} - if dupes: - formatted = "\n".join( - [ - f"{key}: {[cls.__name__ for cls in classes]}" - for key, classes in dupes.items() - ] - ) - msg = ( - "Multiple processing classes produce the same OUT_LABEL key:\n" - f"{formatted}" - ) - raise COMPASSPluginConfigurationError(msg) - - for consumers, producers in self.consumer_producer_pairs: - _validate_in_out_keys(consumers, producers) - async def pre_filter_docs_hook(self, extraction_context): # noqa: PLR6301 """Pre-process documents before running them through the filter @@ -435,61 +253,6 @@ async def extract_relevant_text(self, doc, extractor_class, model_config): ) await self._write_cleaned_text(doc) - async def extract_ordinances_from_text( - self, doc, parser_class, model_config - ): - """Extract structured data from input text - - The extracted structured data will be stored in the ``.attrs`` - dictionary of the input document under the - ``parser_class.OUT_LABEL`` key. - - Parameters - ---------- - doc : BaseDocument - Document containing text to extract structured data from. 
- parser_class : BaseParser - Class to use for structured data extraction. - model_config : LLMConfig - Configuration for the LLM model to use for structured data - extraction. - """ - parser = parser_class( - llm_service=model_config.llm_service, - usage_tracker=self.usage_tracker, - **model_config.llm_call_kwargs, - ) - logger.info( - "Extracting %s...", parser_class.OUT_LABEL.replace("_", " ") - ) - await extract_ordinance_values( - doc, - parser, - text_key=parser_class.IN_LABEL, - out_key=parser_class.OUT_LABEL, - ) - - @classmethod - def get_structured_data_row_count(cls, data_df): - """Get the number of data rows extracted from a document - - Parameters - ---------- - data_df : pandas.DataFrame or None - DataFrame to check for extracted structured data. - - Returns - ------- - int - Number of data rows extracted from the document. - """ - if data_df is None: - return 0 - - return num_ordinances_dataframe( - data_df, exclude_features=EXCLUDE_FROM_ORD_DOC_CHECK - ) - async def filter_docs( self, extraction_context, need_jurisdiction_verification=True ): @@ -560,181 +323,8 @@ async def filter_docs( extraction_context.documents = docs return extraction_context - async def parse_docs_for_structured_data(self, extraction_context): - """Parse documents to extract structured data/information - - Parameters - ---------- - extraction_context : ExtractionContext - Context containing candidate documents to parse. - - Returns - ------- - ExtractionContext or None - Context with extracted data/information stored in the - ``.attrs`` dictionary, or ``None`` if no data was extracted. 
- """ - for doc_for_extraction in extraction_context: - data_df = await self.parse_single_doc_for_structured_data( - doc_for_extraction - ) - row_count = self.get_structured_data_row_count(data_df) - if row_count > 0: - await extraction_context.mark_doc_as_data_source( - doc_for_extraction, out_fn_stem=self.jurisdiction.full_name - ) - extraction_context.attrs["structured_data"] = data_df - logger.info( - "%d ordinance value(s) found in doc from %s for %s. ", - row_count, - doc_for_extraction.attrs.get("source", "unknown source"), - self.jurisdiction.full_name, - ) - return extraction_context - - logger.debug( - "No ordinances found; searched %d docs", - extraction_context.num_documents, - ) - return None - - async def parse_single_doc_for_structured_data(self, doc_for_extraction): - """Extract all possible structured data from a document - - This method is called from the default implementation of - `parse_docs_for_structured_data()` for each document that passed - filtering. If you overwrite`parse_docs_for_structured_data()``, - you can ignore this method. - - Parameters - ---------- - doc_for_extraction : BaseDocument - Document to extract structured data from. - - Returns - ------- - BaseDocument - Document with extracted structured data stored in the - ``.attrs`` dictionary. 
- """ - with self._tracked_progress(): - tasks = [ - asyncio.create_task( - self._try_extract_ordinances( - doc_for_extraction, parser_class - ), - name=self.jurisdiction.full_name, - ) - for parser_class in filter(None, self.PARSERS) - ] - await asyncio.gather(*tasks) - - return self._concat_scrape_results(doc_for_extraction) - - async def _try_extract_ordinances(self, doc_for_extraction, parser_class): - """Apply a single extractor and parser to legal text""" - - if parser_class.IN_LABEL not in doc_for_extraction.attrs: - await self._run_text_extractors(doc_for_extraction, parser_class) - - model_config = self._get_model_config( - primary_key=parser_class.TASK_ID, - secondary_key=LLMTasks.DATA_EXTRACTION, - ) - await self.extract_ordinances_from_text( - doc_for_extraction, - parser_class=parser_class, - model_config=model_config, - ) - - await self.record_usage() - - async def _run_text_extractors(self, doc_for_extraction, parser_class): - """Run text extractor(s) on document to get text for a parser""" - te = [ - te - for te in self.TEXT_EXTRACTORS - if te.OUT_LABEL == parser_class.IN_LABEL - ] - if len(te) != 1: - msg = ( - f"Could not find unique text extractor for parser " - f"{parser_class.__name__} with IN_LABEL " - f"{parser_class.IN_LABEL!r}. Got matches: {te}" - ) - raise COMPASSPluginConfigurationError(msg) - - te = te[0] - model_config = self._get_model_config( - primary_key=te.TASK_ID, - secondary_key=LLMTasks.TEXT_EXTRACTION, - ) - logger.debug( - "Condensing text for extraction using %r for doc from %s", - te.__name__, - doc_for_extraction.attrs.get("source", "unknown source"), - ) - assert self._jsp is not None, "No progress bar set!" 
- task_id = self._jsp.add_task(te.TASK_DESCRIPTION) - await self.extract_relevant_text(doc_for_extraction, te, model_config) - await self.record_usage() - self._jsp.remove_task(task_id) - - @contextmanager - def _tracked_progress(self): - """Context manager to set up jurisdiction sub-progress bar""" - loc = self.jurisdiction.full_name - with COMPASS_PB.jurisdiction_sub_prog(loc) as self._jsp: - yield - - self._jsp = None - - def _concat_scrape_results(self, doc): - """Concatenate structured data from all parsers""" - data = [doc.attrs.get(p.OUT_LABEL, None) for p in self.PARSERS] - data = [df for df in data if df is not None and not df.empty] - if len(data) == 0: - return None - - data = data[0] if len(data) == 1 else pd.concat(data) - data["source"] = doc.attrs.get("source") - data["ord_year"] = extract_ord_year_from_doc_attrs(doc.attrs) - return data - - def _get_model_config(self, primary_key, secondary_key): - """Get model config: primary_key -> secondary_key -> default""" - if primary_key in self.model_configs: - return self.model_configs[primary_key] - return self.model_configs.get( - secondary_key, self.model_configs[LLMTasks.DEFAULT] - ) - async def _write_cleaned_text(self, doc): """Write cleaned text to `clean_files` dir""" out_fp = await CleanedFileWriter.call(doc, self.jurisdiction.full_name) doc.attrs["cleaned_fps"] = out_fp return doc - - -def _validate_in_out_keys(consumers, producers): - """Validate that all IN_LABELs have matching OUT_LABELs""" - in_keys = {} - out_keys = {} - - for producer_class in producers: - out_keys.setdefault(producer_class.OUT_LABEL, []).append( - producer_class - ) - - for consumer_class in chain(consumers): - in_keys.setdefault(consumer_class.IN_LABEL, []).append(consumer_class) - - for in_key, classes in in_keys.items(): - formatted = f"{[cls.__name__ for cls in classes]}" - if in_key not in out_keys: - msg = ( - f"One or more processing classes require IN_LABEL " - f"{in_key!r}, which is not produced by any previous " - 
f"processing class: {formatted}" - ) - raise COMPASSPluginConfigurationError(msg) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 7cb2c6d1..de75087a 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -2,9 +2,14 @@ import asyncio import logging -from abc import ABC, abstractmethod from warnings import warn +from textwrap import dedent +from itertools import chain +from functools import cached_property, partial +from abc import ABC, abstractmethod +from contextlib import contextmanager +import pandas as pd from elm import ApiBase from compass.llm.calling import ( @@ -12,22 +17,117 @@ ChatLLMCaller, StructuredLLMCaller, ) -from compass.utilities.enums import LLMUsageCategory -from compass.utilities.ngrams import convert_text_to_sentence_ngrams -from compass.warn import COMPASSWarning -from compass.utilities.parsing import ( - merge_overlapping_texts, - clean_backticks_from_llm_response, -) from compass.plugin.interface import ( BaseHeuristic, BaseTextCollector, - BaseTextExtractor, - BaseParser, + FilteredExtractionPlugin, +) +from compass.extraction import extract_ordinance_values +from compass.utilities.enums import LLMTasks, LLMUsageCategory +from compass.utilities.ngrams import convert_text_to_sentence_ngrams +from compass.utilities.parsing import ( + clean_backticks_from_llm_response, + extract_ord_year_from_doc_attrs, + merge_overlapping_texts, ) +from compass.utilities import num_ordinances_dataframe +from compass.warn import COMPASSWarning +from compass.exceptions import COMPASSPluginConfigurationError +from compass.pb import COMPASS_PB logger = logging.getLogger(__name__) +EXCLUDE_FROM_ORD_DOC_CHECK = { + # if doc only contains these, it's not good enough to count as an + # ordinance. 
Note that prohibitions are explicitly not on this list + "color", + "decommissioning", + "lighting", + "visual impact", + "glare", + "repowering", + "fencing", + "climbing prevention", + "signage", + "soil", + "primary use districts", + "special use districts", + "accessory use districts", +} + + +class BaseTextExtractor(ABC): + """Extract succinct extraction text from input""" + + TASK_DESCRIPTION = "Condensing text for extraction" + """Task description to show in progress bar""" + + TASK_ID = "text_extraction" + """ID to use for this extraction for linking with LLM configs""" + + @property + @abstractmethod + def IN_LABEL(self): # noqa: N802 + """str: Identifier for text ingested by this class""" + raise NotImplementedError + + @property + @abstractmethod + def OUT_LABEL(self): # noqa: N802 + """str: Identifier for final text extracted by this class""" + raise NotImplementedError + + @property + @abstractmethod + def parsers(self): + """Generator: Generator of (key, extractor) pairs + + `extractor` should be an async callable that accepts a list of + text chunks and returns the shortened (succinct) text to be used + for extraction. The `key` should be a string identifier for the + text returned by the extractor. Multiple (key, extractor) pairs + can be chained in generator order to iteratively refine the + text for extraction. 
+ """ + raise NotImplementedError + + +class BaseParser(ABC): + """Extract succinct extraction text from input""" + + TASK_ID = "data_extraction" + """ID to use for this extraction for linking with LLM configs""" + + @property + @abstractmethod + def IN_LABEL(self): # noqa: N802 + """str: Identifier for text ingested by this class""" + raise NotImplementedError + + @property + @abstractmethod + def OUT_LABEL(self): # noqa: N802 + """str: Identifier for final structured data output""" + raise NotImplementedError + + @abstractmethod + async def parse(self, text): + """Parse text and extract structured data + + Parameters + ---------- + text : str + Text which may or may not contain information relevant to + the current extraction. + + Returns + ------- + pandas.DataFrame or None + DataFrame containing structured extracted data. Can also + be ``None`` if no relevant values can be parsed from the + text. + """ + raise NotImplementedError class OrdinanceHeuristic(BaseHeuristic, ABC): @@ -401,6 +501,329 @@ def _init_chat_llm_caller(self, system_message): ) +class OrdinanceExtractionPlugin(FilteredExtractionPlugin): + """Base class for COMPASS extraction plugins + + This class provides a good balance between ease of use and + extraction flexibility, allowing implementers to provide additional + functionality during the extraction process. + + Plugins can hook into various stages of the extraction pipeline + to modify behavior, add custom processing, or integrate with + external systems. + + Subclasses should implement the desired hooks and override + methods as needed. + """ + + @property + @abstractmethod + def TEXT_EXTRACTORS(self): # noqa: N802 + """list of BaseTextExtractor: Classes to condense text + + Should be an iterable of one or more classes to condense text in + preparation for the extraction task. 
+ """ + raise NotImplementedError + + @property + @abstractmethod + def PARSERS(self): # noqa: N802 + """list of BaseParser: Classes to extract structured data + + Should be an iterable of one or more classes to extract + structured data from text. + """ + raise NotImplementedError + + @cached_property + def producers(self): + """list: All classes that produce attributes on the doc""" + return chain(self.PARSERS, self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS) + + @cached_property + def consumer_producer_pairs(self): + """list: Pairs of (consumer, producer) for IN/OUT validation""" + return [ + (self.PARSERS, chain(self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS)), + (self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS), + ] + + def __init__(self, jurisdiction, model_configs, usage_tracker=None): + """ + + Parameters + ---------- + jurisdiction : Jurisdiction + Jurisdiction for which extraction is being performed. + model_configs : dict + Dictionary where keys are LLMTasks and values are LLMConfig + instances to be used for those tasks. + usage_tracker : UsageTracker, optional + Usage tracker instance that can be used to record the LLM + call cost. By default, ``None``. 
+ """ + super().__init__( + jurisdiction=jurisdiction, + model_configs=model_configs, + usage_tracker=usage_tracker, + ) + + # TODO: This should happen during plugin registration + self._validate_in_out_keys() + + def _validate_in_out_keys(self): + """Validate that all IN_LABELs have matching OUT_LABELs""" + out_keys = {} + for producer in self.producers: + out_keys.setdefault(producer.OUT_LABEL, []).append(producer) + + dupes = {k: v for k, v in out_keys.items() if len(v) > 1} + if dupes: + formatted = "\n".join( + [ + f"{key}: {[cls.__name__ for cls in classes]}" + for key, classes in dupes.items() + ] + ) + msg = ( + "Multiple processing classes produce the same OUT_LABEL key:\n" + f"{formatted}" + ) + raise COMPASSPluginConfigurationError(msg) + + for consumers, producers in self.consumer_producer_pairs: + _validate_in_out_keys(consumers, producers) + + async def extract_ordinances_from_text( + self, doc, parser_class, model_config + ): + """Extract structured data from input text + + The extracted structured data will be stored in the ``.attrs`` + dictionary of the input document under the + ``parser_class.OUT_LABEL`` key. + + Parameters + ---------- + doc : BaseDocument + Document containing text to extract structured data from. + parser_class : BaseParser + Class to use for structured data extraction. + model_config : LLMConfig + Configuration for the LLM model to use for structured data + extraction. 
+ """ + parser = parser_class( + llm_service=model_config.llm_service, + usage_tracker=self.usage_tracker, + **model_config.llm_call_kwargs, + ) + logger.info( + "Extracting %s...", parser_class.OUT_LABEL.replace("_", " ") + ) + await extract_ordinance_values( + doc, + parser, + text_key=parser_class.IN_LABEL, + out_key=parser_class.OUT_LABEL, + ) + + @classmethod + def get_structured_data_row_count(cls, data_df): + """Get the number of data rows extracted from a document + + Parameters + ---------- + data_df : pandas.DataFrame or None + DataFrame to check for extracted structured data. + + Returns + ------- + int + Number of data rows extracted from the document. + """ + if data_df is None: + return 0 + + return num_ordinances_dataframe( + data_df, exclude_features=EXCLUDE_FROM_ORD_DOC_CHECK + ) + + async def parse_docs_for_structured_data(self, extraction_context): + """Parse documents to extract structured data/information + + Parameters + ---------- + extraction_context : ExtractionContext + Context containing candidate documents to parse. + + Returns + ------- + ExtractionContext or None + Context with extracted data/information stored in the + ``.attrs`` dictionary, or ``None`` if no data was extracted. + """ + for doc_for_extraction in extraction_context: + data_df = await self.parse_single_doc_for_structured_data( + doc_for_extraction + ) + row_count = self.get_structured_data_row_count(data_df) + if row_count > 0: + await extraction_context.mark_doc_as_data_source( + doc_for_extraction, out_fn_stem=self.jurisdiction.full_name + ) + extraction_context.attrs["structured_data"] = data_df + logger.info( + "%d ordinance value(s) found in doc from %s for %s. 
", + row_count, + doc_for_extraction.attrs.get("source", "unknown source"), + self.jurisdiction.full_name, + ) + return extraction_context + + logger.debug( + "No ordinances found; searched %d docs", + extraction_context.num_documents, + ) + return None + + async def parse_single_doc_for_structured_data(self, doc_for_extraction): + """Extract all possible structured data from a document + + This method is called from the default implementation of + `parse_docs_for_structured_data()` for each document that passed + filtering. If you overwrite`parse_docs_for_structured_data()``, + you can ignore this method. + + Parameters + ---------- + doc_for_extraction : BaseDocument + Document to extract structured data from. + + Returns + ------- + BaseDocument + Document with extracted structured data stored in the + ``.attrs`` dictionary. + """ + with self._tracked_progress(): + tasks = [ + asyncio.create_task( + self._try_extract_ordinances( + doc_for_extraction, parser_class + ), + name=self.jurisdiction.full_name, + ) + for parser_class in filter(None, self.PARSERS) + ] + await asyncio.gather(*tasks) + + return self._concat_scrape_results(doc_for_extraction) + + async def _try_extract_ordinances(self, doc_for_extraction, parser_class): + """Apply a single extractor and parser to legal text""" + + if parser_class.IN_LABEL not in doc_for_extraction.attrs: + await self._run_text_extractors(doc_for_extraction, parser_class) + + model_config = self._get_model_config( + primary_key=parser_class.TASK_ID, + secondary_key=LLMTasks.DATA_EXTRACTION, + ) + await self.extract_ordinances_from_text( + doc_for_extraction, + parser_class=parser_class, + model_config=model_config, + ) + + await self.record_usage() + + async def _run_text_extractors(self, doc_for_extraction, parser_class): + """Run text extractor(s) on document to get text for a parser""" + te = [ + te + for te in self.TEXT_EXTRACTORS + if te.OUT_LABEL == parser_class.IN_LABEL + ] + if len(te) != 1: + msg = ( + f"Could not 
find unique text extractor for parser " + f"{parser_class.__name__} with IN_LABEL " + f"{parser_class.IN_LABEL!r}. Got matches: {te}" + ) + raise COMPASSPluginConfigurationError(msg) + + te = te[0] + model_config = self._get_model_config( + primary_key=te.TASK_ID, + secondary_key=LLMTasks.TEXT_EXTRACTION, + ) + logger.debug( + "Condensing text for extraction using %r for doc from %s", + te.__name__, + doc_for_extraction.attrs.get("source", "unknown source"), + ) + assert self._jsp is not None, "No progress bar set!" + task_id = self._jsp.add_task(te.TASK_DESCRIPTION) + await self.extract_relevant_text(doc_for_extraction, te, model_config) + await self.record_usage() + self._jsp.remove_task(task_id) + + @contextmanager + def _tracked_progress(self): + """Context manager to set up jurisdiction sub-progress bar""" + loc = self.jurisdiction.full_name + with COMPASS_PB.jurisdiction_sub_prog(loc) as self._jsp: + yield + + self._jsp = None + + def _concat_scrape_results(self, doc): + """Concatenate structured data from all parsers""" + data = [doc.attrs.get(p.OUT_LABEL, None) for p in self.PARSERS] + data = [df for df in data if df is not None and not df.empty] + if len(data) == 0: + return None + + data = data[0] if len(data) == 1 else pd.concat(data) + data["source"] = doc.attrs.get("source") + data["ord_year"] = extract_ord_year_from_doc_attrs(doc.attrs) + return data + + def _get_model_config(self, primary_key, secondary_key): + """Get model config: primary_key -> secondary_key -> default""" + if primary_key in self.model_configs: + return self.model_configs[primary_key] + return self.model_configs.get( + secondary_key, self.model_configs[LLMTasks.DEFAULT] + ) + + def _valid_chunk(chunk): """True if chunk has content""" return chunk and "no relevant text" not in chunk.lower() + + +def _validate_in_out_keys(consumers, producers): + """Validate that all IN_LABELs have matching OUT_LABELs""" + in_keys = {} + out_keys = {} + + for producer_class in producers: + 
out_keys.setdefault(producer_class.OUT_LABEL, []).append( + producer_class + ) + + for consumer_class in chain(consumers): + in_keys.setdefault(consumer_class.IN_LABEL, []).append(consumer_class) + + for in_key, classes in in_keys.items(): + formatted = f"{[cls.__name__ for cls in classes]}" + if in_key not in out_keys: + msg = ( + f"One or more processing classes require IN_LABEL " + f"{in_key!r}, which is not produced by any previous " + f"processing class: {formatted}" + ) + raise COMPASSPluginConfigurationError(msg) diff --git a/docs/source/conf.py b/docs/source/conf.py index c5f4d1e8..454898db 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -349,8 +349,8 @@ def setup(app): # objects from COMPASS "AsyncDecisionTree": ":class:`~compass.common.tree.AsyncDecisionTree`", "BaseDocument": ":class:`elm.web.document.BaseDocument`", - "BaseParser": ":class:`~compass.plugin.interface.BaseParser`", - "BaseTextExtractor": ":class:`~compass.plugin.interface.BaseTextExtractor`", + "BaseParser": ":class:`~compass.plugin.ordinance.BaseParser`", + "BaseTextExtractor": ":class:`~compass.plugin.ordinance.BaseTextExtractor`", "ChatLLMCaller": ":class:`~compass.llm.calling.ChatLLMCaller`", "ExtractionContext": ":class:`~compass.extraction.context.ExtractionContext`", "Jurisdiction": ":class:`~compass.utilities.jurisdictions.Jurisdiction`", diff --git a/tests/python/unit/plugin/test_plugin_interface.py b/tests/python/unit/plugin/test_plugin_interface.py index fd79ff86..1287d9ac 100644 --- a/tests/python/unit/plugin/test_plugin_interface.py +++ b/tests/python/unit/plugin/test_plugin_interface.py @@ -4,7 +4,7 @@ import pytest -from compass.plugin.interface import ExtractionPlugin +from compass.plugin.interface import FilteredExtractionPlugin from compass.exceptions import COMPASSPluginConfigurationError @@ -30,7 +30,7 @@ class PARS2: IN_LABEL = "collected" OUT_LABEL = "parsed_1" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): 
TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -40,6 +40,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", @@ -69,7 +72,7 @@ class PARS2: IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -79,6 +82,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", @@ -108,7 +114,7 @@ class PARS2: IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -118,6 +124,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match=( @@ -151,7 +160,7 @@ class PARS2: IN_LABEL = "collected_2" OUT_LABEL = "parsed_2" - class MYPlugin(ExtractionPlugin): + class MYPlugin(FilteredExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] @@ -161,6 +170,9 @@ class MYPlugin(ExtractionPlugin): QUESTION_TEMPLATES = [] heuristic = None + async def parse_docs_for_structured_data(self, extraction_context): + return extraction_context + with pytest.raises( COMPASSPluginConfigurationError, match=( From f187cd1e7846b7c4bcf6051fd781e779124218ee Mon Sep 
17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 20:11:25 -0700 Subject: [PATCH 04/22] Add and use `PromptBasedTextCollector` --- compass/extraction/small_wind/ordinance.py | 250 +++++++-------------- compass/extraction/solar/ordinance.py | 242 +++++++------------- compass/extraction/wind/ordinance.py | 246 +++++++------------- compass/plugin/__init__.py | 2 +- compass/plugin/ordinance.py | 109 ++++++++- compass/validation/content.py | 18 +- 6 files changed, 360 insertions(+), 507 deletions(-) diff --git a/compass/extraction/small_wind/ordinance.py b/compass/extraction/small_wind/ordinance.py index d392c37f..01dfa2ea 100644 --- a/compass/extraction/small_wind/ordinance.py +++ b/compass/extraction/small_wind/ordinance.py @@ -8,7 +8,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, - OrdinanceTextCollector, + PromptBasedTextCollector, PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,59 @@ _IGNORE_TYPES_MICRO = "private, micro, personal, building-mounted" _IGNORE_TYPES_LARGE = "large, utility-scale, for-sale, commercial" +_CONTAINS_ORD_COLLECTION_PROMPT = f"""\ +You extract structured data from text. Return your answer in JSON format \ +(not markdown). Your JSON file must include exactly two keys. The first \ +key is 'wind_reqs', which is a string that summarizes all {_SEARCH_TERMS_AND} \ +that are explicitly enacted in the text for a wind energy system (or wind \ +turbine/tower) for a given jurisdiction. Note that wind energy bans are \ +an important restriction to track. Include any **closely related provisions** \ +if they clearly pertain to the **development, operation, modification, or \ +removal** of wind energy systems (or wind turbines/towers). All restrictions \ +should be enforceable - ignore any text that only provides a legal definition \ +of the regulation. If the text does not specify any concrete \ +{_SEARCH_TERMS_OR} for a wind energy system, set this key to `null`. 
The last \ +key is '{{key}}', which is a boolean that is set to True if the text excerpt \ +explicitly details {_SEARCH_TERMS_OR} for a wind energy system (or wind \ +turbine/tower) and False otherwise.\ +""" + +_IS_SMALL_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether any \ +of it applies to {_SEARCH_TERMS_OR} for **small, medium, or non-commercial \ +wind energy systems**. Small, medium, or non-commercial energy systems may \ +also be referred to as {_SMALL_WES_SYNONYMS}. Your client is a private \ +resident that does not care about ordinances related to {_IGNORE_TYPES_MICRO} \ +or {_IGNORE_TYPES_LARGE} wind energy systems. Ignore any text related to such \ +systems. Return your answer as a dictionary in JSON format (not markdown). \ +Your JSON file must include exactly two keys. The first key is 'summary' \ +which contains a string that lists all of the types of wind energy systems \ +the text applies to (if any). The second key is '{{key}}', which is a boolean \ +that is set to True if any part of the text excerpt details \ +{_SEARCH_TERMS_OR} for the **small, medium, or non-commercial wind energy \ +conversion systems** (or similar) that the client is interested in and False \ +otherwise.\ +""" + +_DISTRICTS_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether the \ +text explicitly contains relevant information to determine the districts (and \ +especially the district names) where small, medium, or non-commercial wind \ +energy systems are a permitted use (i.e. accessory use), as well as the \ +districts where wind energy systems are prohibited entirely. Small wind \ +energy systems (SWES) may also be referred to as {_SMALL_WES_SYNONYMS}. Do \ +not make any inferences; only answer based on information that is explicitly \ +stated in the text. Note that relevant information may sometimes be found in \ +tables. 
Return your answer as a dictionary in JSON format (not markdown). \ +Your JSON file must include exactly two keys. The first key is 'districts' \ +which contains a string that lists all of the district names for which the \ +text explicitly permits **small, medium, or non-commercial wind energy \ +systems** (if any). The last key is '{{key}}', which is a boolean that is set \ +to True if any part of the text excerpt provides information on districts \ +where **small, medium, or non-commercial wind energy systems** (or similar) \ +are a permitted use (i.e. accessory use) in and False otherwise.\ +""" + _WECS_TEXT_EXTRACTION_PROMPT = """\ # CONTEXT # We want to reduce the provided excerpt to only contain information about \ @@ -261,183 +314,44 @@ class SmallWindHeuristic(OrdinanceHeuristic): """Phrases that indicate text is about WECS""" -class SmallWindOrdinanceTextCollector(OrdinanceTextCollector): +class SmallWindOrdinanceTextCollector(PromptBasedTextCollector): """Check text chunks for ordinances and collect them if they do""" OUT_LABEL = "relevant_text" """Identifier for text collected by this class""" - CONTAINS_ORD_PROMPT = ( - "You extract structured data from text. Return your answer in JSON " - "format (not markdown). Your JSON file must include exactly two " - "keys. The first key is 'wind_reqs', which is a string that " - f"summarizes all {_SEARCH_TERMS_AND} that are explicitly enacted " - "in the text for a wind energy system (or wind turbine/tower) for " - "a given jurisdiction. " - "Note that wind energy bans are an important restriction to track. " - "Include any **closely related provisions** if they clearly pertain " - "to the **development, operation, modification, or removal** of wind " - "energy systems (or wind turbines/towers). " - "All restrictions should be enforceable - ignore any text that only " - "provides a legal definition of the regulation. 
If the text does not " - f"specify any concrete {_SEARCH_TERMS_OR} for a wind energy system, " - "set this key to `null`. The last key is '{key}', which is a boolean " - "that is set to True if the text excerpt explicitly details " - f"{_SEARCH_TERMS_OR} for a wind energy system (or wind turbine/tower) " - "and False otherwise. " - ) - """Prompt to check if chunk contains WES ordinance info""" - - IS_SMALL_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - f"whether any of it applies to {_SEARCH_TERMS_OR} for **small, " - "medium, or non-commercial wind energy systems**. Small, medium, or " - "non-commercial energy systems may also be referred to as " - f"{_SMALL_WES_SYNONYMS}. " - "Your client is a private resident that does not care about " - f"ordinances related to {_IGNORE_TYPES_MICRO} or " - f"{_IGNORE_TYPES_LARGE} wind energy systems. Ignore any text " - "related to such systems. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'summary' which contains a string that lists all of the types of " - "wind energy systems the text applies to (if any). The second key is " - "'{key}', which is a boolean that is set to True if any part of the " - f"text excerpt details {_SEARCH_TERMS_OR} for the **small, medium, or " - "non-commercial wind energy conversion systems** (or similar) that " - "the client is interested in and False otherwise." - ) - """Prompt to check if chunk is for small WES""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk at a given ind to see if it contains ordinance - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. - - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains small wind energy conversion system ordinance text. 
- """ - contains_ord_info = await chunk_parser.parse_from_ind( - ind, - key="contains_ord_info", - llm_call_callback=self._check_chunk_contains_ord, - ) - if not contains_ord_info: - logger.debug("Text at ind %d does not contain ordinance info", ind) - return False - - logger.debug("Text at ind %d does contain ordinance info", ind) - - is_small_scale = await chunk_parser.parse_from_ind( - ind, - key="x", - llm_call_callback=self._check_chunk_is_for_small_scale, - ) - if not is_small_scale: - logger.debug("Text at ind %d is not for small WECS", ind) - return False - - logger.debug("Text at ind %d is for small WECS", ind) - - self._store_chunk(chunk_parser, ind) - logger.debug("Added text at ind %d to ordinances", ind) - - return True - - async def _check_chunk_contains_ord(self, key, text_chunk): - """Call LLM on a chunk of text to check for ordinance""" - content = await self.call( - sys_msg=self.CONTAINS_ORD_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - async def _check_chunk_is_for_small_scale(self, key, text_chunk): - """Call LLM on a chunk of text to check for small scale""" - content = await self.call( - sys_msg=self.IS_SMALL_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - -class SmallWindPermittedUseDistrictsTextCollector(OrdinanceTextCollector): + PROMPTS = [ + { + "key": "contains_ord_info", + "label": "contains ordinance info", + "prompt": _CONTAINS_ORD_COLLECTION_PROMPT, + }, + { + # Generic key like "x" makes the llm focus on the + # instruction rather than using the key name to infer the + # content, which can improve performance, + "key": "x", + "label": "for small WECS", + "prompt": _IS_SMALL_COLLECTION_PROMPT, + }, + ] + """Dicts defining the prompts for 
ordinance text collection""" + + +class SmallWindPermittedUseDistrictsTextCollector(PromptBasedTextCollector): """Check text chunks for permitted wind districts; collect them""" OUT_LABEL = "permitted_use_text" """Identifier for text collected by this class""" - DISTRICT_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - "whether the text explicitly contains relevant information to " - "determine the districts (and especially the district names) where " - "small, medium, or non-commercial wind energy systems are a permitted " - "use (i.e. accessory use), as well as the districts where wind energy" - "systems are prohibited entirely. Small wind energy systems " - f"(SWES) may also be referred to as {_SMALL_WES_SYNONYMS}. " - "Do not make any inferences; only answer based on information that " - "is explicitly stated in the text. " - "Note that relevant information may sometimes be found in tables. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'districts' which contains a string that lists all of the district " - "names for which the text explicitly permits **small, medium, or " - "non-commercial wind energy systems** (if any). The last key is " - "'{key}', which is a boolean that is set to True if any part of the " - "text excerpt provides information on districts where **small, " - "medium, or non-commercial wind energy systems** (or similar) are a " - "permitted use (i.e. accessory use) in and False otherwise." - ) - """Prompt to check if chunk contains info on permitted districts""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk to see if it contains permitted uses - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. 
- -         Returns -         ------- -         bool -             Boolean flag indicating whether or not the text in the chunk -             contains small wind energy conversion system permitted use -             text. -         """ - -         key = "contains_district_info" -         content = await self.call( -             sys_msg=self.DISTRICT_PROMPT.format(key=key), -             content=chunk_parser.text_chunks[ind], -             usage_sub_label=( -                 LLMUsageCategory.DOCUMENT_PERMITTED_USE_CONTENT_VALIDATION -             ), -         ) -         logger.debug("LLM response: %s", content) -         contains_district_info = content.get(key, False) - -         if contains_district_info: -             self._store_chunk(chunk_parser, ind) -             logger.debug("Text at ind %d contains district info", ind) -             return True - -         logger.debug("Text at ind %d does not contain district info", ind) -         return False +     PROMPTS = [ +         { +             "key": "contains_district_info", +             "label": "contains district info", +             "prompt": _DISTRICTS_COLLECTION_PROMPT, +         }, +     ] +     """Dicts defining the prompts for permitted use text collection""" class SmallWindOrdinanceTextExtractor(PromptBasedTextExtractor): diff --git a/compass/extraction/solar/ordinance.py b/compass/extraction/solar/ordinance.py index 67dbb61d..691adf12 100644 --- a/compass/extraction/solar/ordinance.py +++ b/compass/extraction/solar/ordinance.py @@ -8,7 +8,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, -    OrdinanceTextCollector, +    PromptBasedTextCollector, PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -34,6 +34,57 @@ "CSP, private, residential, roof-mounted, micro, small, or medium sized" ) +_CONTAINS_ORD_COLLECTION_PROMPT = f"""\ +You extract structured data from text. Return your answer in JSON format \ +(not markdown). Your JSON file must include exactly two keys. The first \ +key is 'solar_reqs', which is a string that summarizes all \ +{_SEARCH_TERMS_AND} that are explicitly enacted in the legal text for solar \ +energy systems for a given jurisdiction. Note that solar energy bans are an \ +important restriction to track. 
Include any **closely related provisions** \ +if they clearly pertain to the **development, operation, modification, or \ +removal** of solar energy systems (or solar panels). All restrictions should \ +be enforceable - ignore any text that only provides a legal definition of \ +the regulation. If the text does not specify any concrete {_SEARCH_TERMS_OR} \ +for a solar energy system, set this key to `null`. The last key is \ +'{{key}}', which is a boolean that is set to True if the text excerpt \ +explicitly details {_SEARCH_TERMS_OR} for a solar energy system and False \ +otherwise.\ +""" + +_IS_UTILITY_SCALE_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether it \ +applies to {_SEARCH_TERMS_OR} for **large solar energy systems**. Large \ +solar energy systems (SES) may also be referred to as \ +{_LARGE_SEF_SYNONYMS}. Your client is a commercial solar developer that does \ +not care about ordinances related to {_IGNORE_TYPES} solar energy systems. \ +Ignore any text related to such systems. Return your answer as a dictionary \ +in JSON format (not markdown). Your JSON file must include exactly two keys. \ +The first key is 'summary' which contains a string that summarizes the types \ +of solar energy systems the text applies to (if any). The second key is \ +'{{key}}', which is a boolean that is set to True if any part of the text \ +excerpt details {_SEARCH_TERMS_OR} for the **large solar energy conversion \ +systems** (or similar) that the client is interested in and False otherwise.\ +""" + +_DISTRICTS_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether it \ +explicitly contains relevant information to determine the districts (and \ +especially the district names) where large solar energy farms are a permitted \ +use (primary, special, accessory, or otherwise), as well as the districts \ +where large solar energy farms are prohibited entirely. 
Large solar energy \ +systems (SES) may also be referred to as {_LARGE_SEF_SYNONYMS}. Do not make \ +any inferences; only answer based on information that is explicitly stated in \ +the text. Note that relevant information may sometimes be found in tables. \ +Return your answer as a dictionary in JSON format (not markdown). Your JSON \ +file must include exactly two keys. The first key is 'districts' which \ +contains a string that lists all of the district names for which the text \ +explicitly permits **large solar energy farms** (if any). The last key is \ +'{{key}}', which is a boolean that is set to True if any part of the text \ +excerpt provides information on districts where **large solar energy farms** \ +(or similar) are a permitted use and False otherwise.\ +""" + + _SEF_TEXT_EXTRACTION_PROMPT = f"""\ # CONTEXT # We want to reduce the provided excerpt to only contain information about \ @@ -179,177 +230,44 @@ class SolarHeuristic(OrdinanceHeuristic): """Phrases that indicate text is about solar farms""" -class SolarOrdinanceTextCollector(OrdinanceTextCollector): +class SolarOrdinanceTextCollector(PromptBasedTextCollector): """Check text chunks for ordinances and collect them if they do""" OUT_LABEL = "relevant_text" """Identifier for text collected by this class""" - CONTAINS_ORD_PROMPT = ( - "You extract structured data from text. Return your answer in JSON " - "format (not markdown). Your JSON file must include exactly two " - "keys. The first key is 'solar_reqs', which is a string that " - f"summarizes all {_SEARCH_TERMS_AND} that are explicitly enacted " - "in the legal text for solar energy systems for a given jurisdiction. " - "Note that solar energy bans are an important restriction to track. " - "Include any **closely related provisions** if they clearly pertain " - "to the **development, operation, modification, or removal** of solar " - "energy systems (or solar panels). 
" - "All restrictions should be enforceable - ignore any text that only " - "provides a legal definition of the regulation. If the text does not " - f"specify any concrete {_SEARCH_TERMS_OR} for a solar energy system, " - "set this key to `null`. The last key is '{key}', which is a boolean " - "that is set to True if the text excerpt explicitly details " - f"{_SEARCH_TERMS_OR} for a solar energy system and False otherwise." - ) - """Prompt to check if chunk contains SEF ordinance info""" - - IS_UTILITY_SCALE_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - f"whether it applies to {_SEARCH_TERMS_OR} for **large " - "solar energy systems**. Large solar energy systems (SES) may " - f"also be referred to as {_LARGE_SEF_SYNONYMS}. " - "Your client is a commercial solar developer that does not " - f"care about ordinances related to {_IGNORE_TYPES} solar energy " - "systems. Ignore any text related to such systems. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'summary' which contains a string that summarizes the types of " - "solar energy systems the text applies to (if any). The second key " - "is '{key}', which is a boolean that is set to True if any part of " - f"the text excerpt details {_SEARCH_TERMS_OR} for the **large solar " - "energy conversion systems** (or similar) that the client is " - "interested in and False otherwise." - ) - """Prompt to check if chunk is for utility-scale SEF""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk at a given ind to see if it contains ordinance - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. - - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large solar energy farm ordinance text. 
- """ - contains_ord_info = await chunk_parser.parse_from_ind( - ind, - key="contains_ord_info", - llm_call_callback=self._check_chunk_contains_ord, - ) - if not contains_ord_info: - logger.debug("Text at ind %d does not contain ordinance info", ind) - return False - - logger.debug("Text at ind %d does contain ordinance info", ind) - - is_utility_scale = await chunk_parser.parse_from_ind( - ind, - key="x", - llm_call_callback=self._check_chunk_is_for_utility_scale, - ) - if not is_utility_scale: - logger.debug("Text at ind %d is not for utility-scale SEF", ind) - return False - - logger.debug("Text at ind %d is for utility-scale SEF", ind) - - self._store_chunk(chunk_parser, ind) - logger.debug("Added text at ind %d to ordinances", ind) - - return True - - async def _check_chunk_contains_ord(self, key, text_chunk): - """Call LLM on a chunk of text to check for ordinance""" - content = await self.call( - sys_msg=self.CONTAINS_ORD_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - async def _check_chunk_is_for_utility_scale(self, key, text_chunk): - """Call LLM on a chunk of text to check for utility scale""" - content = await self.call( - sys_msg=self.IS_UTILITY_SCALE_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - -class SolarPermittedUseDistrictsTextCollector(OrdinanceTextCollector): + PROMPTS = [ + { + "key": "contains_ord_info", + "label": "contains ordinance info", + "prompt": _CONTAINS_ORD_COLLECTION_PROMPT, + }, + { + # Generic key like "x" makes the llm focus on the + # instruction rather than using the key name to infer the + # content, which can improve performance, + "key": "x", + "label": "for utility-scale SEF", + "prompt": _IS_UTILITY_SCALE_COLLECTION_PROMPT, + 
}, + ] + """Dicts defining the prompts for ordinance text collection""" + + +class SolarPermittedUseDistrictsTextCollector(PromptBasedTextCollector): """Check text chunks for permitted solar districts; collect them""" OUT_LABEL = "permitted_use_text" """Identifier for text collected by this class""" - DISTRICT_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - "whether it explicitly contains relevant information to determine the " - "districts (and especially the district names) where large solar " - "energy farms are a permitted use (primary, special, accessory, or " - "otherwise), as well as the districts where large solar energy farms " - "are prohibited entirely. Large solar energy systems (SES) may also " - f"be referred to as {_LARGE_SEF_SYNONYMS}. " - "Do not make any inferences; only answer based on information that " - "is explicitly stated in the text. " - "Note that relevant information may sometimes be found in tables. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'districts' which contains a string that lists all of the district " - "names for which the text explicitly permits **large solar energy " - "farms** (if any). The last key is '{key}', which is a boolean that " - "is set to True if any part of the text excerpt provides information " - "on districts where **large solar energy farms** (or similar) are a " - "permitted use and False otherwise." - ) - """Prompt to check if chunk contains info on permitted districts""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk to see if it contains permitted uses - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. 
- - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large solar energy farm permitted use text. - """ - - key = "contains_district_info" - content = await self.call( - sys_msg=self.DISTRICT_PROMPT.format(key=key), - content=chunk_parser.text_chunks[ind], - usage_sub_label=( - LLMUsageCategory.DOCUMENT_PERMITTED_USE_CONTENT_VALIDATION - ), - ) - logger.debug("LLM response: %s", content) - contains_district_info = content.get(key, False) - - if contains_district_info: - self._store_chunk(chunk_parser, ind) - logger.debug("Text at ind %d contains district info", ind) - return True - - logger.debug("Text at ind %d does not contain district info", ind) - return False + PROMPTS = [ + { + "key": "contains_district_info", + "label": "contains district info", + "prompt": _DISTRICTS_COLLECTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text collection""" class SolarOrdinanceTextExtractor(PromptBasedTextExtractor): diff --git a/compass/extraction/wind/ordinance.py b/compass/extraction/wind/ordinance.py index aa547184..907ee125 100644 --- a/compass/extraction/wind/ordinance.py +++ b/compass/extraction/wind/ordinance.py @@ -8,7 +8,7 @@ from compass.plugin.ordinance import ( OrdinanceHeuristic, - OrdinanceTextCollector, + PromptBasedTextCollector, PromptBasedTextExtractor, ) from compass.utilities.enums import LLMUsageCategory @@ -32,6 +32,58 @@ _SEARCH_TERMS_OR = _SEARCH_TERMS_AND.replace("and", "or") _IGNORE_TYPES = "private, residential, micro, small, or medium sized" +_CONTAINS_ORD_COLLECTION_PROMPT = f"""\ +You extract structured data from text. Return your answer in JSON format \ +(not markdown). Your JSON file must include exactly two keys. The first \ +key is 'wind_reqs', which is a string that summarizes all {_SEARCH_TERMS_AND} \ +that are explicitly enacted in the text for a wind energy system (or wind \ +turbine/tower) for a given jurisdiction. 
Note that wind energy bans are \ +an important restriction to track. Include any **closely related provisions** \ +if they clearly pertain to the **development, operation, modification, or \ +removal** of wind energy systems (or wind turbines/towers). All restrictions \ +should be enforceable - ignore any text that only provides a legal definition \ +of the regulation. If the text does not specify any concrete \ +{_SEARCH_TERMS_OR} for a wind energy system, set this key to `null`. The last \ +key is '{{key}}', which is a boolean that is set to True if the text excerpt \ +explicitly details {_SEARCH_TERMS_OR} for a wind energy system (or wind \ +turbine/tower) and False otherwise.\ +""" + +_IS_UTILITY_SCALE_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether \ +any of it applies to {_SEARCH_TERMS_OR} for **large wind energy systems**. \ +Large wind energy systems (WES) may also be referred to as \ +{_LARGE_WES_SYNONYMS}. Your client is a commercial wind developer that \ +does not care about ordinances related to {_IGNORE_TYPES} wind energy \ +systems. Ignore any text related to such systems. Return your answer as a \ +dictionary in JSON format (not markdown). Your JSON file must include \ +exactly two keys. The first key is 'summary' which contains a string that \ +lists all of the types of wind energy systems the text applies to (if any). 
\ +The second key is '{{key}}', which is a boolean that is set to True if any \ +part of the text excerpt details {_SEARCH_TERMS_OR} for the **large wind \ +energy conversion systems** (or similar) that the client is interested in \ +and False otherwise.\ +""" + +_DISTRICTS_COLLECTION_PROMPT = f"""\ +You are a legal scholar that reads ordinance text and determines whether \ +the text explicitly contains relevant information to determine the districts \ +(and especially the district names) where large wind energy systems are a \ +permitted use (primary, special, accessory, or otherwise), as well as the \ +districts where large wind energy systems are prohibited entirely. Large \ +wind energy systems (WES) may also be referred to as {_LARGE_WES_SYNONYMS}. \ +Do not make any inferences; only answer based on information that is \ +explicitly stated in the text. Note that relevant information may sometimes \ +be found in tables. Return your answer as a dictionary in JSON format (not \ +markdown). Your JSON file must include exactly two keys. The first key is \ +'districts' which contains a string that lists all of the district names for \ +which the text explicitly permits **large wind energy systems** (if any). 
\ +The last key is '{{key}}', which is a boolean that is set to True if any \ +part of the text excerpt provides information on districts where **large \ +wind energy systems** (or similar) are a permitted use in and False \ +otherwise.\ +""" + _WECS_TEXT_EXTRACTION_PROMPT = """\ # CONTEXT # We want to reduce the provided excerpt to only contain information about \ @@ -236,180 +288,44 @@ class WindHeuristic(OrdinanceHeuristic): """Phrases that indicate text is about WECS""" -class WindOrdinanceTextCollector(OrdinanceTextCollector): +class WindOrdinanceTextCollector(PromptBasedTextCollector): """Check text chunks for ordinances and collect them if they do""" OUT_LABEL = "relevant_text" """Identifier for text collected by this class""" - CONTAINS_ORD_PROMPT = ( - "You extract structured data from text. Return your answer in JSON " - "format (not markdown). Your JSON file must include exactly two " - "keys. The first key is 'wind_reqs', which is a string that " - f"summarizes all {_SEARCH_TERMS_AND} that are explicitly enacted " - "in the text for a wind energy system (or wind turbine/tower) for " - "a given jurisdiction. " - "Note that wind energy bans are an important restriction to track. " - "Include any **closely related provisions** if they clearly pertain " - "to the **development, operation, modification, or removal** of wind " - "energy systems (or wind turbines/towers). " - "All restrictions should be enforceable - ignore any text that only " - "provides a legal definition of the regulation. If the text does not " - f"specify any concrete {_SEARCH_TERMS_OR} for a wind energy system, " - "set this key to `null`. The last key is '{key}', which is a boolean " - "that is set to True if the text excerpt explicitly details " - f"{_SEARCH_TERMS_OR} for a wind energy system (or wind turbine/tower) " - "and False otherwise. 
" - ) - """Prompt to check if chunk contains WES ordinance info""" - - IS_UTILITY_SCALE_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - f"whether any of it applies to {_SEARCH_TERMS_OR} for " - "**large wind energy systems**. Large wind energy systems (WES) may " - f"also be referred to as {_LARGE_WES_SYNONYMS}. " - "Your client is a commercial wind developer that does not " - f"care about ordinances related to {_IGNORE_TYPES} wind energy " - "systems. Ignore any text related to such systems. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'summary' which contains a string that lists all of the types of " - "wind energy systems the text applies to (if any). The second key is " - "'{key}', which is a boolean that is set to True if any part of the " - f"text excerpt details {_SEARCH_TERMS_OR} for the **large wind energy " - "conversion systems** (or similar) that the client is interested in " - "and False otherwise." - ) - """Prompt to check if chunk is for utility-scale WES""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk at a given ind to see if it contains ordinance - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. - - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large wind energy conversion system ordinance text. 
- """ - contains_ord_info = await chunk_parser.parse_from_ind( - ind, - key="contains_ord_info", - llm_call_callback=self._check_chunk_contains_ord, - ) - if not contains_ord_info: - logger.debug("Text at ind %d does not contain ordinance info", ind) - return False - - logger.debug("Text at ind %d does contain ordinance info", ind) - - is_utility_scale = await chunk_parser.parse_from_ind( - ind, - key="x", - llm_call_callback=self._check_chunk_is_for_utility_scale, - ) - if not is_utility_scale: - logger.debug("Text at ind %d is not for utility-scale WECS", ind) - return False - - logger.debug("Text at ind %d is for utility-scale WECS", ind) - - self._store_chunk(chunk_parser, ind) - logger.debug("Added text at ind %d to ordinances", ind) - - return True - - async def _check_chunk_contains_ord(self, key, text_chunk): - """Call LLM on a chunk of text to check for ordinance""" - content = await self.call( - sys_msg=self.CONTAINS_ORD_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - async def _check_chunk_is_for_utility_scale(self, key, text_chunk): - """Call LLM on a chunk of text to check for utility scale""" - content = await self.call( - sys_msg=self.IS_UTILITY_SCALE_PROMPT.format(key=key), - content=text_chunk, - usage_sub_label=(LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION), - ) - logger.debug("LLM response: %s", content) - return content.get(key, False) - - -class WindPermittedUseDistrictsTextCollector(OrdinanceTextCollector): + PROMPTS = [ + { + "key": "contains_ord_info", + "label": "contains ordinance info", + "prompt": _CONTAINS_ORD_COLLECTION_PROMPT, + }, + { + # Generic key like "x" makes the llm focus on the + # instruction rather than using the key name to infer the + # content, which can improve performance, + "key": "x", + "label": "for utility-scale WECS", + "prompt": _IS_UTILITY_SCALE_COLLECTION_PROMPT, + 
}, + ] + """Dicts defining the prompts for ordinance text collection""" + + +class WindPermittedUseDistrictsTextCollector(PromptBasedTextCollector): """Check text chunks for permitted wind districts; collect them""" OUT_LABEL = "permitted_use_text" """Identifier for text collected by this class""" - DISTRICT_PROMPT = ( - "You are a legal scholar that reads ordinance text and determines " - "whether the text explicitly contains relevant information to " - "determine the districts (and especially the district names) where " - "large wind energy systems are a permitted use (primary, special, " - "accessory, or otherwise), as well as the districts where large wind " - "energy systems are prohibited entirely. Large wind energy systems " - f"(WES) may also be referred to as {_LARGE_WES_SYNONYMS}. " - "Do not make any inferences; only answer based on information that " - "is explicitly stated in the text. " - "Note that relevant information may sometimes be found in tables. " - "Return your answer as a dictionary in JSON format (not markdown). " - "Your JSON file must include exactly two keys. The first key is " - "'districts' which contains a string that lists all of the district " - "names for which the text explicitly permits **large wind energy " - "systems** (if any). The last key is '{key}', which is a boolean that " - "is set to True if any part of the text excerpt provides information " - "on districts where **large wind energy systems** (or similar) are a " - "permitted use in and False otherwise." - ) - """Prompt to check if chunk contains info on permitted districts""" - - async def check_chunk(self, chunk_parser, ind): - """Check a chunk to see if it contains permitted uses - - Parameters - ---------- - chunk_parser : ParseChunksWithMemory - Instance that contains a ``parse_from_ind`` method. - ind : int - Index of the chunk to check. 
- - Returns - ------- - bool - Boolean flag indicating whether or not the text in the chunk - contains large wind energy conversion system permitted use - text. - """ - - key = "contains_district_info" - content = await self.call( - sys_msg=self.DISTRICT_PROMPT.format(key=key), - content=chunk_parser.text_chunks[ind], - usage_sub_label=( - LLMUsageCategory.DOCUMENT_PERMITTED_USE_CONTENT_VALIDATION - ), - ) - logger.debug("LLM response: %s", content) - contains_district_info = content.get(key, False) - - if contains_district_info: - self._store_chunk(chunk_parser, ind) - logger.debug("Text at ind %d contains district info", ind) - return True - - logger.debug("Text at ind %d does not contain district info", ind) - return False + PROMPTS = [ + { + "key": "contains_district_info", + "label": "contains district info", + "prompt": _DISTRICTS_COLLECTION_PROMPT, + }, + ] + """Dicts defining the prompts for permitted use text collection""" class WindOrdinanceTextExtractor(PromptBasedTextExtractor): diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 369a3c80..8dc16746 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -10,7 +10,7 @@ BaseTextExtractor, BaseParser, OrdinanceHeuristic, - OrdinanceTextCollector, + PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceParser, OrdinanceExtractionPlugin, diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index de75087a..4fae6e79 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -259,8 +259,58 @@ def GOOD_TECH_PHRASES(self): # noqa: N802 raise NotImplementedError -class OrdinanceTextCollector(StructuredLLMCaller, BaseTextCollector): - """Base class for ordinance text collectors""" +class PromptBasedTextCollector(StructuredLLMCaller, BaseTextCollector, ABC): + """Text extractor based on a chain of prompts""" + + @property + @abstractmethod + def PROMPTS(self): # noqa: N802 + """list: List of dicts defining the prompts for text 
extraction + + Each dict in the list should have the following keys: + + - **prompt**: [REQUIRED] The text filter prompt to use + to determine if a chunk of text is relevant for the + current extraction task. The prompt must instruct the LLM + to return a dictionary (as JSON) with at least one key + that outputs the filter decision. The prompt may use the + following placeholders, which will be filled in with the + corresponding class attributes when the prompt is applied: + + - ``"{key}"``: The key corresponding to this prompt. + + - **key**: [REQUIRED] A string identifier for the key that + in the output JSON dictionary that represents the LLM + filter decision (``True`` if the tech chunk should be + kept, and ``False`` otherwise). + - **label**: [OPTIONAL] A string label describing the type + of relevant text this prompt is looking for (e.g. "wind + energy conversion system ordinance text"). This is only + used for logging purposes and does not affect the + extraction process itself. If not provided, this will + default to "collector step {i}". + + The prompts will be applied in the order they appear in the + list, with the output text from each prompt being fed as input + to the next prompt in the chain. If any of the filter decisions + return ``False``, the text will be discarded and not passed to + subsequent prompts. The final output of the last prompt will + determine wether or not the chunk of text being evaluated is + kept as relevant text for extraction. 
+ """ + raise NotImplementedError + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + if getattr(cls, "__abstractmethods__", None): + return + + if not cls.PROMPTS: # TODO: This should happen at registration + msg = ( + f"{cls.__name__} must have at least one " + "prompt defined in the PROMPTS property" + ) + raise COMPASSPluginConfigurationError(msg) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -285,6 +335,59 @@ def relevant_text(self): text = [self._chunks[ind] for ind in sorted(self._chunks)] return merge_overlapping_texts(text) + async def check_chunk(self, chunk_parser, ind): + """Check a chunk at a given ind to see if it contains ordinance + + Parameters + ---------- + chunk_parser : ParseChunksWithMemory + Instance that contains a ``parse_from_ind`` method. + ind : int + Index of the chunk to check. + + Returns + ------- + bool + Boolean flag indicating whether or not the text in the chunk + contains large wind energy conversion system ordinance text. 
+ """ + for collection_step, prompt_dict in enumerate(self.PROMPTS): + key = prompt_dict["key"] + prompt = prompt_dict["prompt"].format(key=key) + label = prompt_dict.get("label", collection_step) + passed_filter = await chunk_parser.parse_from_ind( + ind, + key=key, + llm_call_callback=self._check_chunk_with_prompt, + prompt=prompt, + ) + + if not passed_filter: + logger.debug( + "Text at ind %d did not pass collection step: %s", + ind, + label, + ) + return False + + logger.debug( + "Text at ind %d passed collection step: %s ", ind, label + ) + + self._store_chunk(chunk_parser, ind) + logger.debug("Added text chunk at ind %d to extraction text", ind) + return True + + async def _check_chunk_with_prompt(self, key, text_chunk, prompt): + """Call LLM on a chunk of text to check for ordinance""" + content = await self.call( + sys_msg=prompt.format(key=key), + content=text_chunk, + usage_sub_label=LLMUsageCategory.DOCUMENT_CONTENT_VALIDATION, + ) + logger.debug("LLM response: %s", content) + return content.get(key, False) + def _store_chunk(self, parser, chunk_ind): """Store chunk and its neighbors if it is not already stored""" for offset in range(1 - parser.num_to_recall, 2): @@ -676,7 +779,7 @@ async def parse_docs_for_structured_data(self, extraction_context): extraction_context.attrs["structured_data"] = data_df logger.info( "%d ordinance value(s) found in doc from %s for %s. 
", - row_count, + num_ordinances_dataframe(data_df), doc_for_extraction.attrs.get("source", "unknown source"), self.jurisdiction.full_name, ) diff --git a/compass/validation/content.py b/compass/validation/content.py index 7725a28d..c829c805 100644 --- a/compass/validation/content.py +++ b/compass/validation/content.py @@ -50,19 +50,19 @@ def __init__(self, text_chunks, num_to_recall=2): self.num_to_recall = num_to_recall self.memory = [{} for _ in text_chunks] - # fmt: off def _inverted_mem(self, starting_ind): """Inverted memory""" - inverted_mem = self.memory[:starting_ind + 1:][::-1] - yield from inverted_mem[:self.num_to_recall] + inverted_mem = self.memory[:starting_ind + 1:][::-1] # fmt: off + yield from inverted_mem[:self.num_to_recall] # fmt: off - # fmt: off def _inverted_text(self, starting_ind): """Inverted text chunks""" - inverted_text = self.text_chunks[:starting_ind + 1:][::-1] - yield from inverted_text[:self.num_to_recall] + inverted_text = self.text_chunks[:starting_ind + 1:][::-1] # fmt: off + yield from inverted_text[:self.num_to_recall] # fmt: off - async def parse_from_ind(self, ind, key, llm_call_callback): + async def parse_from_ind( + self, ind, key, llm_call_callback, *args, **kwargs + ): """Validate a chunk by consulting current and prior context Cached verdicts are reused to avoid redundant LLM calls when @@ -97,7 +97,9 @@ async def parse_from_ind(self, ind, key, llm_call_callback): logger.debug("Mem at ind %d is %s", step, mem) check = mem.get(key) if check is None: - check = mem[key] = await llm_call_callback(key, text) + check = mem[key] = await llm_call_callback( + key, text, *args, **kwargs + ) if check: return check return False From 5fc485734b2e146824921d5536ccd3950eeb544b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 20:12:21 -0700 Subject: [PATCH 05/22] Change class name --- compass/extraction/small_wind/ordinance.py | 4 ++-- compass/extraction/solar/ordinance.py | 4 ++-- compass/extraction/wind/ordinance.py | 4 
++-- compass/plugin/__init__.py | 2 +- compass/plugin/ordinance.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compass/extraction/small_wind/ordinance.py b/compass/extraction/small_wind/ordinance.py index 01dfa2ea..60b857bf 100644 --- a/compass/extraction/small_wind/ordinance.py +++ b/compass/extraction/small_wind/ordinance.py @@ -7,7 +7,7 @@ import logging from compass.plugin.ordinance import ( - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, ) @@ -231,7 +231,7 @@ """ -class SmallWindHeuristic(OrdinanceHeuristic): +class SmallWindHeuristic(KeywordBasedHeuristic): """Perform a heuristic check for mention of wind turbines in text""" NOT_TECH_WORDS = [ diff --git a/compass/extraction/solar/ordinance.py b/compass/extraction/solar/ordinance.py index 691adf12..942869df 100644 --- a/compass/extraction/solar/ordinance.py +++ b/compass/extraction/solar/ordinance.py @@ -7,7 +7,7 @@ import logging from compass.plugin.ordinance import ( - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, ) @@ -189,7 +189,7 @@ """ -class SolarHeuristic(OrdinanceHeuristic): +class SolarHeuristic(KeywordBasedHeuristic): """Perform a heuristic check for mention of solar farms in text""" NOT_TECH_WORDS = [ diff --git a/compass/extraction/wind/ordinance.py b/compass/extraction/wind/ordinance.py index 907ee125..95c9e7fd 100644 --- a/compass/extraction/wind/ordinance.py +++ b/compass/extraction/wind/ordinance.py @@ -7,7 +7,7 @@ import logging from compass.plugin.ordinance import ( - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, ) @@ -230,7 +230,7 @@ """ -class WindHeuristic(OrdinanceHeuristic): +class WindHeuristic(KeywordBasedHeuristic): """Perform a heuristic check for mention of wind turbines in text""" NOT_TECH_WORDS = [ diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 8dc16746..7dedc77e 100644 --- 
a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -9,7 +9,7 @@ from .ordinance import ( BaseTextExtractor, BaseParser, - OrdinanceHeuristic, + KeywordBasedHeuristic, PromptBasedTextCollector, PromptBasedTextExtractor, OrdinanceParser, diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 4fae6e79..3b545657 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -130,7 +130,7 @@ async def parse(self, text): raise NotImplementedError -class OrdinanceHeuristic(BaseHeuristic, ABC): +class KeywordBasedHeuristic(BaseHeuristic, ABC): """Perform a heuristic check for mention of a technology in text""" _GOOD_ACRONYM_CONTEXTS = [ From fcb00fb08c90abaf3219e7248076bafbc910ad93 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:08:11 -0700 Subject: [PATCH 06/22] Add validation logic --- compass/plugin/ordinance.py | 203 ++++++++++++++++++++++++------------ 1 file changed, 138 insertions(+), 65 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index 3b545657..798dd911 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -300,18 +300,6 @@ def PROMPTS(self): # noqa: N802 """ raise NotImplementedError - def __init_subclass__(cls, **kwargs): - super().__init_subclass__(**kwargs) - if getattr(cls, "__abstractmethods__", None): - return - - if not cls.PROMPTS: # TODO: This should happen at registration - msg = ( - f"{cls.__name__} must have at least one " - "prompt defined in the PROMPTS property" - ) - raise COMPASSPluginConfigurationError(msg) - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._chunks = {} @@ -509,13 +497,6 @@ def __init_subclass__(cls, **kwargs): if getattr(cls, "__abstractmethods__", None): return - if not cls.PROMPTS: # TODO: This should happen at registration - msg = ( - f"{cls.__name__} must have at least one " - "prompt defined in the PROMPTS property" - ) - raise COMPASSPluginConfigurationError(msg) - 
last_prompt = cls.PROMPTS[-1] last_index = len(cls.PROMPTS) - 1 cls.OUT_LABEL = last_prompt.get("key", f"extracted_text_{last_index}") @@ -652,52 +633,6 @@ def consumer_producer_pairs(self): (self.TEXT_EXTRACTORS, self.TEXT_COLLECTORS), ] - def __init__(self, jurisdiction, model_configs, usage_tracker=None): - """ - - Parameters - ---------- - jurisdiction : Jurisdiction - Jurisdiction for which extraction is being performed. - model_configs : dict - Dictionary where keys are LLMTasks and values are LLMConfig - instances to be used for those tasks. - usage_tracker : UsageTracker, optional - Usage tracker instance that can be used to record the LLM - call cost. By default, ``None``. - """ - super().__init__( - jurisdiction=jurisdiction, - model_configs=model_configs, - usage_tracker=usage_tracker, - ) - - # TODO: This should happen during plugin registration - self._validate_in_out_keys() - - def _validate_in_out_keys(self): - """Validate that all IN_LABELs have matching OUT_LABELs""" - out_keys = {} - for producer in self.producers: - out_keys.setdefault(producer.OUT_LABEL, []).append(producer) - - dupes = {k: v for k, v in out_keys.items() if len(v) > 1} - if dupes: - formatted = "\n".join( - [ - f"{key}: {[cls.__name__ for cls in classes]}" - for key, classes in dupes.items() - ] - ) - msg = ( - "Multiple processing classes produce the same OUT_LABEL key:\n" - f"{formatted}" - ) - raise COMPASSPluginConfigurationError(msg) - - for consumers, producers in self.consumer_producer_pairs: - _validate_in_out_keys(consumers, producers) - async def extract_ordinances_from_text( self, doc, parser_class, model_config ): @@ -902,6 +837,144 @@ def _get_model_config(self, primary_key, secondary_key): secondary_key, self.model_configs[LLMTasks.DEFAULT] ) + def validate_plugin_configuration(self): + """[NOT PUBLIC API] Validate plugin is properly configured""" + super().validate_plugin_configuration() + self._validate_text_extractors() + self._validate_parsers() + 
self._validate_in_out_keys() + self._validate_collector_prompts() + self._validate_collector_prompts() + + def _validate_text_extractors(self): + """Validate user provided at least one text extractor class""" + try: + extractors = self.TEXT_EXTRACTORS + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'TEXT_EXTRACTORS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if len(extractors) == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'TEXT_EXTRACTORS' property! Please provide at least " + "one text extractor class." + ) + raise COMPASSPluginConfigurationError(msg) + + for extractor_class in extractors: + if not issubclass(extractor_class, BaseTextExtractor): + msg = ( + f"Plugin class {self.__class__.__name__} has invalid " + "entry in 'TEXT_EXTRACTORS' property: All entries must " + "be subclasses of " + "compass.plugin.ordinance.BaseTextExtractor, but " + f"{extractor_class.__name__} is not!" + ) + raise COMPASSPluginConfigurationError(msg) + + def _validate_parsers(self): + """Validate user provided at least one parser class""" + try: + parsers = self.PARSERS + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'PARSERS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if len(parsers) == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'PARSERS' property! Please provide at least " + "one text extractor class." + ) + raise COMPASSPluginConfigurationError(msg) + + for parsers_class in parsers: + if not issubclass(parsers_class, BaseParser): + msg = ( + f"Plugin class {self.__class__.__name__} has invalid " + "entry in 'PARSERS' property: All entries must " + "be subclasses of " + "compass.plugin.ordinance.BaseParser, but " + f"{parsers_class.__name__} is not!" 
+ ) + raise COMPASSPluginConfigurationError(msg) + + def _validate_in_out_keys(self): + """Validate that all IN_LABELs have matching OUT_LABELs""" + out_keys = {} + for producer in self.producers: + out_keys.setdefault(producer.OUT_LABEL, []).append(producer) + + dupes = {k: v for k, v in out_keys.items() if len(v) > 1} + if dupes: + formatted = "\n".join( + [ + f"{key}: {[cls.__name__ for cls in classes]}" + for key, classes in dupes.items() + ] + ) + msg = ( + "Multiple processing classes produce the same OUT_LABEL key:\n" + f"{formatted}" + ) + raise COMPASSPluginConfigurationError(msg) + + for consumers, producers in self.consumer_producer_pairs: + _validate_in_out_keys(consumers, producers) + + def _validate_collector_prompts(self): + """Validate that all text collectors have prompts defined""" + + for collector in self.TEXT_COLLECTORS: + if not issubclass(collector, PromptBasedTextCollector): + continue + try: + num_prompts = len(collector.PROMPTS) + except NotImplementedError: + msg = ( + f"Text collector {self.__class__.__name__} is missing " + "required property 'PROMPTS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_prompts == 0: + msg = ( + f"Text collector {self.__class__.__name__} has an empty " + "'PROMPTS' property! Please provide at least one prompt " + "dictionary." + ) + raise COMPASSPluginConfigurationError(msg) + + def _validate_collector_prompts(self): + """Validate that all text extractors have prompts defined""" + + for collector in self.TEXT_EXTRACTORS: + if not issubclass(collector, PromptBasedTextExtractor): + continue + try: + num_prompts = len(collector.PROMPTS) + except NotImplementedError: + msg = ( + f"Text extractor {self.__class__.__name__} is missing " + "required property 'PROMPTS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_prompts == 0: + msg = ( + f"Text extractor {self.__class__.__name__} has an empty " + "'PROMPTS' property! 
Please provide at least one prompt " + "dictionary." + ) + raise COMPASSPluginConfigurationError(msg) + def _valid_chunk(chunk): """True if chunk has content""" From bf7f128548bf9233c7770517c25bbfb621fc7301 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:08:43 -0700 Subject: [PATCH 07/22] Allow plugins to register own districts --- compass/utilities/jurisdictions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/compass/utilities/jurisdictions.py b/compass/utilities/jurisdictions.py index 9ce6100b..1a305385 100644 --- a/compass/utilities/jurisdictions.py +++ b/compass/utilities/jurisdictions.py @@ -15,7 +15,6 @@ logger = logging.getLogger(__name__) KNOWN_JURISDICTIONS_REGISTRY = { importlib.resources.files("compass") / "data" / "conus_jurisdictions.csv", - importlib.resources.files("compass") / "data" / "tx_water_districts.csv", } _JUR_COLS = [ "Jurisdiction Type", From b9e30fcdb6711e9feba4f3087aca7128ad7ef87f Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:10:33 -0700 Subject: [PATCH 08/22] Add validation logic --- compass/plugin/base.py | 3 ++ compass/plugin/interface.py | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/compass/plugin/base.py b/compass/plugin/base.py index 71957bc8..0adbd6f2 100644 --- a/compass/plugin/base.py +++ b/compass/plugin/base.py @@ -146,3 +146,6 @@ async def record_usage(self): total_usage = await UsageUpdater.call(self.usage_tracker) total_cost = compute_total_cost_from_usage(total_usage) COMPASS_PB.update_total_cost(total_cost, replace=True) + + def validate_plugin_configuration(self): # noqa: B027 + """[NOT PUBLIC API] Validate plugin is properly configured""" diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index 25e8607b..b532b096 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -328,3 +328,77 @@ async def _write_cleaned_text(self, doc): out_fp = await CleanedFileWriter.call(doc, 
self.jurisdiction.full_name) doc.attrs["cleaned_fps"] = out_fp return doc + + def validate_plugin_configuration(self): + """[NOT PUBLIC API] Validate plugin is properly configured""" + + try: + __ = self.IDENTIFIER + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'IDENTIFIER'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + try: + num_q_templates = len(self.QUESTION_TEMPLATES) + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'QUESTION_TEMPLATES'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_q_templates == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'QUESTION_TEMPLATES' property! Please provide at least " + "one question template." + ) + raise COMPASSPluginConfigurationError(msg) + + try: + num_website_keywords = len(self.WEBSITE_KEYWORDS) + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'WEBSITE_KEYWORDS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if num_website_keywords == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'WEBSITE_KEYWORDS' property! Please provide at least " + "one website keyword." + ) + raise COMPASSPluginConfigurationError(msg) + + try: + collectors = self.TEXT_COLLECTORS + except NotImplementedError: + msg = ( + f"Plugin class {self.__class__.__name__} is missing required " + "property 'TEXT_COLLECTORS'" + ) + raise COMPASSPluginConfigurationError(msg) from None + + if len(collectors) == 0: + msg = ( + f"Plugin class {self.__class__.__name__} has an empty " + "'TEXT_COLLECTORS' property! Please provide at least " + "one text collector class." 
+ ) + raise COMPASSPluginConfigurationError(msg) + + for collector_class in collectors: + if not issubclass(collector_class, BaseTextCollector): + msg = ( + f"Plugin class {self.__class__.__name__} has invalid " + "entry in 'TEXT_COLLECTORS' property: All entries must " + "be subclasses of " + "compass.plugin.interface.BaseTextCollector, but " + f"{collector_class.__name__} is not!" + ) + raise COMPASSPluginConfigurationError(msg) From d3abfb6f1eb40c15054c3e2f4e475f17b2cb2474 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:10:48 -0700 Subject: [PATCH 09/22] Allow `JURISDICTION_DATA_FP` property --- compass/plugin/base.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/compass/plugin/base.py b/compass/plugin/base.py index 0adbd6f2..0dcf2fba 100644 --- a/compass/plugin/base.py +++ b/compass/plugin/base.py @@ -34,6 +34,38 @@ def __init__(self, jurisdiction, model_configs, usage_tracker=None): self.model_configs = model_configs self.usage_tracker = usage_tracker + JURISDICTION_DATA_FP = None + """path-like: Optional path to jurisdiction CSV + + If provided, this CSV will extend the known jurisdictions (by + default, US states, counties, and townships). This CSV must have the + following columns: + + - State: The state in which the jurisdiction is located + (e.g. "Texas") + - County: The county in which the jurisdiction is located + (e.g. "Travis"). This can be left blank if the jurisdiction is + not associated with a county. + - Subdivision: The name of the subdivision of the county in + which the jurisdiction is located. Use this input for + jurisdictions that do not map to counties/townships (e.g. + water conservation districts, resource management plan areas, + etc.). This can be left blank if the jurisdiction does not + have the notion of a "subdivision". + - Jurisdiction Type: The type of jurisdiction (e.g. "county", + "township", "city", "special district", "RMP", etc.). 
+ - FIPS: The code to be used for the jurisdiction, if applicable + (e.g. "48453" for Travis County, Texas, "22" for the + Culberson County Groundwater Conservation District, etc.). + This can be left blank if the jurisdiction does not have an + applicable code. + - Website: The official website for the jurisdiction, if + applicable (e.g. "https://www.traviscountytx.gov/"). This can + be left blank if the jurisdiction does not have an official + website or if the website is not known. + + """ + @property @abstractmethod def IDENTIFIER(self): # noqa: N802 From df89a772131e886881f817f7e5ccfd0fde6a8abc Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:11:18 -0700 Subject: [PATCH 10/22] Add import --- compass/plugin/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index b532b096..5ee98449 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -9,12 +9,12 @@ from compass.scripts.download import filter_ordinance_docs from compass.services.threaded import CleanedFileWriter from compass.utilities import doc_infos_to_db, save_db +from compass.exceptions import COMPASSPluginConfigurationError logger = logging.getLogger(__name__) # TODO: Allow other to register own clean file outputs -# TODO: Allow other to register their own jurisdictions csv class BaseHeuristic(ABC): From 8e3958dcd78d828676dc60fe4bcaddfbe3692da0 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:11:30 -0700 Subject: [PATCH 11/22] Add MVP of registry --- compass/plugin/registry.py | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 compass/plugin/registry.py diff --git a/compass/plugin/registry.py b/compass/plugin/registry.py new file mode 100644 index 00000000..d7397411 --- /dev/null +++ b/compass/plugin/registry.py @@ -0,0 +1,42 @@ +"""COMPASS plugin registry""" + +from compass.utilities.jurisdictions import 
KNOWN_JURISDICTIONS_REGISTRY +from compass.plugin.base import BaseExtractionPlugin +from compass.exceptions import COMPASSPluginConfigurationError + + +PLUGIN_REGISTRY = {} +"""dict: Registered COMPASS plugins""" + + +def register_plugin(plugin_class): + """Register a plugin class in the plugin registry + + Parameters + ---------- + plugin_class : type + The plugin class to register. Must be a subclass of + :class:`~compass.plugin.base.BaseExtractionPlugin` and must pass + the plugin configuration validation. + + Raises + ------ + COMPASSPluginConfigurationError + If the plugin class is not a subclass of + :class:`~compass.plugin.base.BaseExtractionPlugin` or if it does + not pass the plugin configuration validation. + """ + if not issubclass(plugin_class, BaseExtractionPlugin): + msg = ( + f"Plugin class {plugin_class.__name__} must be a subclass of " + "`compass.plugin.base.BaseExtractionPlugin`!" + ) + raise COMPASSPluginConfigurationError(msg) + + if plugin_class.JURISDICTION_DATA_FP is not None: + KNOWN_JURISDICTIONS_REGISTRY.add(plugin_class.JURISDICTION_DATA_FP) + + plugin_instance = plugin_class(None, None) + plugin_instance.validate_plugin_configuration() + + PLUGIN_REGISTRY[plugin_class.IDENTIFIER.casefold()] = plugin_class From de3a6648e2715a9a50e80fd0881defa0e93e26bf Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:11:59 -0700 Subject: [PATCH 12/22] Populate namespace --- compass/plugin/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/compass/plugin/__init__.py b/compass/plugin/__init__.py index 7dedc77e..de078d64 100644 --- a/compass/plugin/__init__.py +++ b/compass/plugin/__init__.py @@ -15,3 +15,4 @@ OrdinanceParser, OrdinanceExtractionPlugin, ) +from .registry import PLUGIN_REGISTRY, register_plugin From e820a99e4892bfead62b3b0c24826be21ab9bc8b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:12:13 -0700 Subject: [PATCH 13/22] Plugins now register themselves --- compass/extraction/small_wind/plugin.py | 5 
++++- compass/extraction/solar/plugin.py | 5 ++++- compass/extraction/water/plugin.py | 13 ++++++++++++- compass/extraction/wind/plugin.py | 5 ++++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/compass/extraction/small_wind/plugin.py b/compass/extraction/small_wind/plugin.py index 02fb97ed..2b1a10e4 100644 --- a/compass/extraction/small_wind/plugin.py +++ b/compass/extraction/small_wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin import OrdinanceExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin, register_plugin from compass.extraction.small_wind.ordinance import ( SmallWindHeuristic, SmallWindOrdinanceTextCollector, @@ -96,3 +96,6 @@ class COMPASSSmallWindExtractor(OrdinanceExtractionPlugin): StructuredSmallWindPermittedUseDistrictsParser, ] """Class for parsing structured ordinance data from text""" + + +register_plugin(COMPASSSmallWindExtractor) diff --git a/compass/extraction/solar/plugin.py b/compass/extraction/solar/plugin.py index 4f6b5163..3e2153c1 100644 --- a/compass/extraction/solar/plugin.py +++ b/compass/extraction/solar/plugin.py @@ -1,6 +1,6 @@ """COMPASS solar extraction plugin""" -from compass.plugin import OrdinanceExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin, register_plugin from compass.extraction.solar.ordinance import ( SolarHeuristic, SolarOrdinanceTextCollector, @@ -97,3 +97,6 @@ class COMPASSSolarExtractor(OrdinanceExtractionPlugin): StructuredSolarPermittedUseDistrictsParser, ] """Class for parsing structured ordinance data from text""" + + +register_plugin(COMPASSSolarExtractor) diff --git a/compass/extraction/water/plugin.py b/compass/extraction/water/plugin.py index eda68709..5ca51ed7 100644 --- a/compass/extraction/water/plugin.py +++ b/compass/extraction/water/plugin.py @@ -1,6 +1,7 @@ """COMPASS water rights extraction plugin""" import logging +import importlib.resources from pathlib import Path import pandas as pd @@ -8,7 +9,7 @@ 
from elm.embed import ChunkAndEmbed from compass.extraction import extract_date -from compass.plugin.base import BaseExtractionPlugin +from compass.plugin import BaseExtractionPlugin, register_plugin from compass.utilities.enums import LLMTasks from compass.utilities.parsing import extract_ord_year_from_doc_attrs from compass.exceptions import COMPASSRuntimeError @@ -78,6 +79,13 @@ class TexasWaterRightsExtractor(BaseExtractionPlugin): heuristic = WaterRightsHeuristic() """BaseHeuristic: Object with a ``check()`` method""" + JURISDICTION_DATA_FP = ( + importlib.resources.files("compass") + / "data" + / "tx_water_districts.csv" + ) + """path-like: Path to Texas GCW names""" + async def filter_docs( self, extraction_context, @@ -290,3 +298,6 @@ def _setup_endpoints(embedding_model_config): EnergyWizard.EMBEDDING_URL = endpoint EnergyWizard.URL = "openai.azure.com" # need to trigger Azure setup + + +register_plugin(TexasWaterRightsExtractor) diff --git a/compass/extraction/wind/plugin.py b/compass/extraction/wind/plugin.py index 905bcc87..c8758213 100644 --- a/compass/extraction/wind/plugin.py +++ b/compass/extraction/wind/plugin.py @@ -1,6 +1,6 @@ """COMPASS wind extraction plugin""" -from compass.plugin import OrdinanceExtractionPlugin +from compass.plugin import OrdinanceExtractionPlugin, register_plugin from compass.extraction.wind.ordinance import ( WindHeuristic, WindOrdinanceTextCollector, @@ -95,3 +95,6 @@ class COMPASSWindExtractor(OrdinanceExtractionPlugin): StructuredWindPermittedUseDistrictsParser, ] """Class for parsing structured ordinance data from text""" + + +register_plugin(COMPASSWindExtractor) From 110a4cbce59ab266123c113c2fd492778a8d5fd8 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:12:33 -0700 Subject: [PATCH 14/22] Now run based on plugin registry --- compass/scripts/process.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/compass/scripts/process.py b/compass/scripts/process.py index 
cb8c579a..6a14bc44 100644 --- a/compass/scripts/process.py +++ b/compass/scripts/process.py @@ -11,6 +11,7 @@ from elm.web.utilities import get_redirected_url +from compass.plugin import PLUGIN_REGISTRY from compass.extraction.context import ExtractionContext from compass.scripts.download import ( find_jurisdiction_website, @@ -21,10 +22,6 @@ download_jurisdiction_ordinances_from_website_compass_crawl, ) from compass.exceptions import COMPASSValueError, COMPASSError -from compass.extraction.wind import COMPASSWindExtractor -from compass.extraction.solar import COMPASSSolarExtractor -from compass.extraction.small_wind import COMPASSSmallWindExtractor -from compass.extraction.water.plugin import TexasWaterRightsExtractor from compass.validation.location import JurisdictionWebsiteValidator from compass.llm import OpenAIConfig from compass.services.cpu import ( @@ -72,12 +69,6 @@ logger = logging.getLogger(__name__) -EXTRACTION_REGISTRY = { - COMPASSWindExtractor.IDENTIFIER.casefold(): COMPASSWindExtractor, - COMPASSSolarExtractor.IDENTIFIER.casefold(): COMPASSSolarExtractor, - COMPASSSmallWindExtractor.IDENTIFIER.casefold(): COMPASSSmallWindExtractor, - TexasWaterRightsExtractor.IDENTIFIER.casefold(): TexasWaterRightsExtractor, -} MAX_CONCURRENT_SEARCH_ENGINE_QUERIES = 10 @@ -136,8 +127,10 @@ async def process_jurisdictions_with_openai( # noqa: PLR0917, PLR0913 CSV file, all downloaded ordinance documents (PDFs and HTML), usage metadata, and default subdirectories for logs and intermediate outputs (unless otherwise specified). - tech : {"wind", "solar", "small wind", "tx water rights"} - Label indicating which technology type is being processed. + tech : str + Label indicating which technology type is being processed. Must + be one of the keys of + :obj:`~compass.plugin.registry.PLUGIN_REGISTRY`. jurisdiction_fp : path-like Path to a CSV file specifying the jurisdictions to process. 
The CSV must contain at least two columns: "County" and "State", @@ -564,10 +557,10 @@ def tpe_kwargs(self): @cached_property def extractor_class(self): """obj: Extractor class for the specified technology""" - if self.tech.casefold() not in EXTRACTION_REGISTRY: + if self.tech.casefold() not in PLUGIN_REGISTRY: msg = f"Unknown tech input: {self.tech}" raise COMPASSValueError(msg) - return EXTRACTION_REGISTRY[self.tech.casefold()] + return PLUGIN_REGISTRY[self.tech.casefold()] @cached_property def _base_services(self): From 1ae2934718e20787364e8a8d24a692a830a76fbb Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:05 -0700 Subject: [PATCH 15/22] Cleaned file names now specified via a registry --- compass/services/threaded.py | 42 +++++------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/compass/services/threaded.py b/compass/services/threaded.py index c32e53ba..bc5f146f 100644 --- a/compass/services/threaded.py +++ b/compass/services/threaded.py @@ -16,13 +16,13 @@ from elm.web.document import PDFDocument, HTMLDocument from elm.web.utilities import write_url_doc_to_file -from compass import COMPASS_DEBUG_LEVEL from compass.services.base import Service from compass.utilities import compute_cost_from_totals from compass.pb import COMPASS_PB logger = logging.getLogger(__name__) +CLEANED_FP_REGISTRY = {} def _cache_file_with_hash(doc, file_content, out_dir, make_name_unique=False): @@ -69,59 +69,27 @@ def _move_file(doc, out_dir, out_fn=None): return out_fp -def _write_cleaned_file(doc, out_dir, jurisdiction_name=None): +def _write_cleaned_file(doc, out_dir, tech, jurisdiction_name=None): """Write cleaned ordinance text to directory""" if jurisdiction_name is None: return None out_dir = Path(out_dir) - if COMPASS_DEBUG_LEVEL > 0: - _write_interim_cleaned_files(doc, out_dir, jurisdiction_name) + doc_key_to_clean_fp = CLEANED_FP_REGISTRY.get(tech.casefold(), {}) - key_to_fp = { - "cleaned_text_for_extraction": ( - 
f"{jurisdiction_name} Cleaned Text.txt" - ), - "districts_text": f"{jurisdiction_name} Districts.txt", - } out_paths = [] - for key, fn in key_to_fp.items(): + for key, fn in doc_key_to_clean_fp.items(): cleaned_text = doc.attrs.get(key) if cleaned_text is None: continue - out_fp = out_dir / fn + out_fp = out_dir / fn.format(jurisdiction=jurisdiction_name) out_fp.write_text(cleaned_text, encoding="utf-8") out_paths.append(out_fp) return out_paths -def _write_interim_cleaned_files(doc, out_dir, jurisdiction_name): - """Write intermediate output texts to file; helpful for debugging""" - key_to_fp = { - "relevant_text": f"{jurisdiction_name} Ordinance Original text.txt", - "wind_energy_systems_text": ( - f"{jurisdiction_name} Wind Ordinance text.txt" - ), - "solar_energy_systems_text": ( - f"{jurisdiction_name} Solar Ordinance text.txt" - ), - "permitted_use_text": ( - f"{jurisdiction_name} Permitted Use Original text.txt" - ), - "permitted_use_only_text": ( - f"{jurisdiction_name} Permitted Use Only text.txt" - ), - } - for key, fn in key_to_fp.items(): - text = doc.attrs.get(key) - if text is None: - continue - - (out_dir / fn).write_text(text, encoding="utf-8") - - def _write_ord_db(extraction_context, out_dir, out_fn=None): """Write parsed ordinance database to directory""" ord_db = extraction_context.attrs.get("structured_data") From bee7d23dad551913a111203d8bf1cf806094e190 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:14 -0700 Subject: [PATCH 16/22] Import to register plugins --- compass/__init__.py | 9 +++++++++ compass/extraction/__init__.py | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/compass/__init__.py b/compass/__init__.py index 28cb2282..c048d1a7 100644 --- a/compass/__init__.py +++ b/compass/__init__.py @@ -3,4 +3,13 @@ from ._version import __version__ from .utilities.logs import setup_logging_levels, COMPASS_DEBUG_LEVEL +# Temporarily import to register plugins +# Can drop once plugins register themselves +from 
.extraction import ( + COMPASSWindExtractor, + COMPASSSolarExtractor, + COMPASSSmallWindExtractor, + TexasWaterRightsExtractor, +) + setup_logging_levels() diff --git a/compass/extraction/__init__.py b/compass/extraction/__init__.py index 373bf198..2ee97718 100644 --- a/compass/extraction/__init__.py +++ b/compass/extraction/__init__.py @@ -7,3 +7,10 @@ extract_relevant_text_with_ngram_validation, extract_ordinance_values, ) + +# Temporarily import to register plugins +# Can drop once plugins register themselves +from .wind import COMPASSWindExtractor +from .solar import COMPASSSolarExtractor +from .small_wind import COMPASSSmallWindExtractor +from .water import TexasWaterRightsExtractor From 42980ff43a2b9530771635b13aee3f5a18153018 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:26 -0700 Subject: [PATCH 17/22] Pass down tech name to cleaned file writing --- compass/plugin/interface.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compass/plugin/interface.py b/compass/plugin/interface.py index 5ee98449..5f116fe3 100644 --- a/compass/plugin/interface.py +++ b/compass/plugin/interface.py @@ -14,8 +14,6 @@ logger = logging.getLogger(__name__) -# TODO: Allow other to register own clean file outputs - class BaseHeuristic(ABC): """Base class for a heuristic check""" @@ -325,7 +323,9 @@ async def filter_docs( async def _write_cleaned_text(self, doc): """Write cleaned text to `clean_files` dir""" - out_fp = await CleanedFileWriter.call(doc, self.jurisdiction.full_name) + out_fp = await CleanedFileWriter.call( + doc, self.IDENTIFIER, self.jurisdiction.full_name + ) doc.attrs["cleaned_fps"] = out_fp return doc From e436bd96471909a04c5521b0f70243899533a391 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 21:38:59 -0700 Subject: [PATCH 18/22] Register cleaned file names --- compass/plugin/ordinance.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/compass/plugin/ordinance.py 
b/compass/plugin/ordinance.py index 798dd911..d0715464 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -22,6 +22,7 @@ BaseTextCollector, FilteredExtractionPlugin, ) +from compass.services.threaded import CLEANED_FP_REGISTRY from compass.extraction import extract_ordinance_values from compass.utilities.enums import LLMTasks, LLMUsageCategory from compass.utilities.ngrams import convert_text_to_sentence_ngrams @@ -529,7 +530,6 @@ def parsers(self): FORMATTING_PROMPT=self.FORMATTING_PROMPT, OUTPUT_PROMPT=self.OUTPUT_PROMPT, ) - # out_fn = prompt_dict.get("out_fn", None) yield key, partial(self._process, instructions=instructions) async def _process(self, text_chunks, instructions, is_valid_chunk=None): @@ -845,6 +845,7 @@ def validate_plugin_configuration(self): self._validate_in_out_keys() self._validate_collector_prompts() self._validate_collector_prompts() + self._register_clean_file_names() def _validate_text_extractors(self): """Validate user provided at least one text extractor class""" @@ -975,6 +976,20 @@ def _validate_collector_prompts(self): ) raise COMPASSPluginConfigurationError(msg) + def _register_clean_file_names(self): + """Register file names for writing cleaned text outputs""" + CLEANED_FP_REGISTRY.setdefault(self.IDENTIFIER.casefold(), {}) + for extractor_class in self.TEXT_EXTRACTORS: + if not issubclass(extractor_class, PromptBasedTextExtractor): + continue + for ind, prompt_dict in enumerate(extractor_class.PROMPTS): + out_fn = prompt_dict.get("out_fn", None) + if not out_fn: + continue + + key = prompt_dict.get("key", f"extracted_text_{ind}") + CLEANED_FP_REGISTRY[self.IDENTIFIER.casefold()][key] = out_fn + def _valid_chunk(chunk): """True if chunk has content""" From 86fc25959c8e877ed2378761398ca8fea8a0284e Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 22:03:01 -0700 Subject: [PATCH 19/22] Fix tests --- .../unit/plugin/test_plugin_interface.py | 79 ++++++++++--------- 
.../unit/services/test_services_threaded.py | 32 ++++++-- 2 files changed, 67 insertions(+), 44 deletions(-) diff --git a/tests/python/unit/plugin/test_plugin_interface.py b/tests/python/unit/plugin/test_plugin_interface.py index 1287d9ac..e4d79cd4 100644 --- a/tests/python/unit/plugin/test_plugin_interface.py +++ b/tests/python/unit/plugin/test_plugin_interface.py @@ -4,40 +4,45 @@ import pytest -from compass.plugin.interface import FilteredExtractionPlugin +from compass.plugin.ordinance import ( + BaseTextCollector, + BaseTextExtractor, + BaseParser, + OrdinanceExtractionPlugin, +) from compass.exceptions import COMPASSPluginConfigurationError def test_plugin_validation_parse_key_same(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted_2" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected" OUT_LABEL = "parsed_1" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -47,39 +52,39 @@ async def parse_docs_for_structured_data(self, extraction_context): COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() def test_plugin_validation_extract_key_same(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): 
OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -89,39 +94,39 @@ async def parse_docs_for_structured_data(self, extraction_context): COMPASSPluginConfigurationError, match="Multiple processing classes produce the same OUT_LABEL key", ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() def test_plugin_validation_no_in_key_for_extract(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected_2" OUT_LABEL = "extracted_1" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected" OUT_LABEL = "parsed_2" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -135,39 
+140,39 @@ async def parse_docs_for_structured_data(self, extraction_context): r"\['EXT2'\]" ), ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() def test_plugin_validation_no_in_key_for_parse(): """Test plugin interface validation logic""" - class COLL1: + class COLL1(BaseTextCollector): OUT_LABEL = "collected" - class EXT1: + class EXT1(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted" - class EXT2: + class EXT2(BaseTextExtractor): IN_LABEL = "collected" OUT_LABEL = "extracted_1" - class PARS1: + class PARS1(BaseParser): IN_LABEL = "extracted" OUT_LABEL = "parsed_1" - class PARS2: + class PARS2(BaseParser): IN_LABEL = "collected_2" OUT_LABEL = "parsed_2" - class MYPlugin(FilteredExtractionPlugin): + class MYPlugin(OrdinanceExtractionPlugin): TEXT_COLLECTORS = [COLL1] TEXT_EXTRACTORS = [EXT1, EXT2] PARSERS = [PARS1, PARS2] IDENTIFIER = "test" - WEBSITE_KEYWORDS = [] - QUESTION_TEMPLATES = [] + WEBSITE_KEYWORDS = ["test"] + QUESTION_TEMPLATES = ["test"] heuristic = None async def parse_docs_for_structured_data(self, extraction_context): @@ -181,7 +186,7 @@ async def parse_docs_for_structured_data(self, extraction_context): r"\['PARS2'\]" ), ): - MYPlugin(None, None, None) + MYPlugin(None, None, None).validate_plugin_configuration() if __name__ == "__main__": diff --git a/tests/python/unit/services/test_services_threaded.py b/tests/python/unit/services/test_services_threaded.py index 566bbecd..d4522c06 100644 --- a/tests/python/unit/services/test_services_threaded.py +++ b/tests/python/unit/services/test_services_threaded.py @@ -16,6 +16,7 @@ from compass.services import threaded from compass.services.provider import RunningAsyncServices from compass.services.threaded import ( + CLEANED_FP_REGISTRY, CleanedFileWriter, FileMover, HTMLFileLoader, @@ -194,13 +195,12 @@ def test_move_file_handles_extensionless_cached_file(tmp_path): assert moved_fp.read_text(encoding="utf-8") == "content" -def 
test_write_cleaned_file_with_debug(tmp_path, monkeypatch): +def test_write_cleaned_file_with_debug(tmp_path): """Cleaned file writer should emit cleaned and debug outputs""" doc = HTMLDocument(["payload"]) doc.attrs.update( { - "jurisdiction_name": "Sample Jurisdiction", "cleaned_text_for_extraction": "clean", "districts_text": "districts", "relevant_text": "orig", @@ -209,18 +209,36 @@ def test_write_cleaned_file_with_debug(tmp_path, monkeypatch): } ) - monkeypatch.setattr(threaded, "COMPASS_DEBUG_LEVEL", 1, raising=False) + fp_names = { + "relevant_text": "{jurisdiction} Ordinance Original text.txt", + "cleaned_text_for_extraction": "{jurisdiction} Cleaned Text.txt", + "districts_text": "{jurisdiction} Districts.txt", + } + + CLEANED_FP_REGISTRY["cleaned_file_test"] = fp_names outputs = threaded._write_cleaned_file( - doc, tmp_path, jurisdiction_name="Sample Jurisdiction" + doc, + tmp_path, + tech="cleaned_file_test", + jurisdiction_name="Sample Jurisdiction", ) expected_files = { "Sample Jurisdiction Cleaned Text.txt", "Sample Jurisdiction Districts.txt", + "Sample Jurisdiction Ordinance Original text.txt", } assert {fp.name for fp in outputs} == expected_files assert all(fp.exists() for fp in outputs) + debug_fp = tmp_path / "Sample Jurisdiction Cleaned Text.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "clean" + + debug_fp = tmp_path / "Sample Jurisdiction Districts.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "districts" + debug_fp = tmp_path / "Sample Jurisdiction Ordinance Original text.txt" assert debug_fp.exists() assert debug_fp.read_text(encoding="utf-8") == "orig" @@ -231,7 +249,7 @@ def test_write_cleaned_file_without_jurisdiction_returns_none(tmp_path): doc = HTMLDocument(["payload"]) doc.attrs["cleaned_text_for_extraction"] = "clean" - assert threaded._write_cleaned_file(doc, tmp_path) is None + assert threaded._write_cleaned_file(doc, tmp_path, tech="wind") is None def 
test_write_cleaned_file_skips_missing_section(tmp_path): @@ -241,7 +259,7 @@ def test_write_cleaned_file_skips_missing_section(tmp_path): doc.attrs.update({"cleaned_text_for_extraction": "clean"}) outputs = threaded._write_cleaned_file( - doc, tmp_path, jurisdiction_name="Partial" + doc, tmp_path, tech="wind", jurisdiction_name="Partial" ) assert [fp.name for fp in outputs] == ["Partial Cleaned Text.txt"] @@ -341,7 +359,7 @@ async def test_cleaned_file_writer_process(tmp_path, monkeypatch): writer = CleanedFileWriter(tmp_path) assert writer.can_process is True writer.acquire_resources() - outputs = await writer.process(doc, "Writer") + outputs = await writer.process(doc, "wind", "Writer") writer.release_resources() assert sorted(fp.name for fp in outputs) == [ From 9ae73eb2eeaed98313afc094d6f5e6d9aae8474f Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 22:03:43 -0700 Subject: [PATCH 20/22] Rename file --- .../{test_plugin_interface.py => test_plugin_ordinances.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/python/unit/plugin/{test_plugin_interface.py => test_plugin_ordinances.py} (99%) diff --git a/tests/python/unit/plugin/test_plugin_interface.py b/tests/python/unit/plugin/test_plugin_ordinances.py similarity index 99% rename from tests/python/unit/plugin/test_plugin_interface.py rename to tests/python/unit/plugin/test_plugin_ordinances.py index e4d79cd4..247d8ecb 100644 --- a/tests/python/unit/plugin/test_plugin_interface.py +++ b/tests/python/unit/plugin/test_plugin_ordinances.py @@ -1,4 +1,4 @@ -"""COMPASS web crawling tests""" +"""COMPASS ordinance plugin tests""" from pathlib import Path From 155b16e2ae655aabb9cf2983485553bba144b1a4 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 6 Feb 2026 22:11:30 -0700 Subject: [PATCH 21/22] Fix docs --- compass/extraction/water/plugin.py | 2 +- compass/plugin/base.py | 2 +- docs/source/conf.py | 27 +++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff 
--git a/compass/extraction/water/plugin.py b/compass/extraction/water/plugin.py index 5ca51ed7..0729293d 100644 --- a/compass/extraction/water/plugin.py +++ b/compass/extraction/water/plugin.py @@ -84,7 +84,7 @@ class TexasWaterRightsExtractor(BaseExtractionPlugin): / "data" / "tx_water_districts.csv" ) - """path-like: Path to Texas GCW names""" + """:term:`path-like `: Path to Texas GCW names""" async def filter_docs( self, diff --git a/compass/plugin/base.py b/compass/plugin/base.py index 0dcf2fba..1a65182a 100644 --- a/compass/plugin/base.py +++ b/compass/plugin/base.py @@ -35,7 +35,7 @@ def __init__(self, jurisdiction, model_configs, usage_tracker=None): self.usage_tracker = usage_tracker JURISDICTION_DATA_FP = None - """path-like: Optional path to jurisdiction CSV + """:term:`path-like `: Path to jurisdiction CSV If provided, this CSV will extend the known jurisdictions (by default, US states, counties, and townships). This CSV must have the diff --git a/docs/source/conf.py b/docs/source/conf.py index 454898db..0b2b0c5f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -112,6 +112,33 @@ # Avoid warning about api.rst not in TOC suppress_warnings = ["toc.not_included"] +nitpick_ignore = [ + ( + "py:obj", + "compass.extraction.small_wind.ordinance.SmallWindOrdinanceTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.small_wind.ordinance.SmallWindPermittedUseDistrictsTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.solar.ordinance.SolarOrdinanceTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.solar.ordinance.SolarPermittedUseDistrictsTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.wind.ordinance.WindOrdinanceTextExtractor.OUT_LABEL", + ), + ( + "py:obj", + "compass.extraction.wind.ordinance.WindPermittedUseDistrictsTextExtractor.OUT_LABEL", + ), +] + # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. 
See the documentation for From fe22287ac67f9174fdf2bf360d01169d533350bc Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Sat, 7 Feb 2026 12:20:16 -0700 Subject: [PATCH 22/22] PR review --- compass/plugin/ordinance.py | 4 +- compass/plugin/registry.py | 16 ++++-- .../unit/services/test_services_threaded.py | 55 ++++++++++--------- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/compass/plugin/ordinance.py b/compass/plugin/ordinance.py index d0715464..eff794be 100644 --- a/compass/plugin/ordinance.py +++ b/compass/plugin/ordinance.py @@ -844,7 +844,7 @@ def validate_plugin_configuration(self): self._validate_parsers() self._validate_in_out_keys() self._validate_collector_prompts() - self._validate_collector_prompts() + self._validate_extractor_prompts() self._register_clean_file_names() def _validate_text_extractors(self): @@ -953,7 +953,7 @@ def _validate_collector_prompts(self): ) raise COMPASSPluginConfigurationError(msg) - def _validate_collector_prompts(self): + def _validate_extractor_prompts(self): """Validate that all text extractors have prompts defined""" for collector in self.TEXT_EXTRACTORS: diff --git a/compass/plugin/registry.py b/compass/plugin/registry.py index d7397411..c866a041 100644 --- a/compass/plugin/registry.py +++ b/compass/plugin/registry.py @@ -33,10 +33,16 @@ def register_plugin(plugin_class): ) raise COMPASSPluginConfigurationError(msg) - if plugin_class.JURISDICTION_DATA_FP is not None: - KNOWN_JURISDICTIONS_REGISTRY.add(plugin_class.JURISDICTION_DATA_FP) + if (plugin_id := plugin_class.IDENTIFIER.casefold()) in PLUGIN_REGISTRY: + msg = ( + f"Plugin identifier '{plugin_class.IDENTIFIER}' is already in " + "use by another plugin! Please choose a unique identifier for " + f"{plugin_class.__name__}." 
+ ) + raise COMPASSPluginConfigurationError(msg) - plugin_instance = plugin_class(None, None) - plugin_instance.validate_plugin_configuration() + plugin_class(None, None).validate_plugin_configuration() - PLUGIN_REGISTRY[plugin_class.IDENTIFIER.casefold()] = plugin_class + if plugin_class.JURISDICTION_DATA_FP is not None: + KNOWN_JURISDICTIONS_REGISTRY.add(plugin_class.JURISDICTION_DATA_FP) + PLUGIN_REGISTRY[plugin_id] = plugin_class diff --git a/tests/python/unit/services/test_services_threaded.py b/tests/python/unit/services/test_services_threaded.py index d4522c06..6a7b72b6 100644 --- a/tests/python/unit/services/test_services_threaded.py +++ b/tests/python/unit/services/test_services_threaded.py @@ -216,32 +216,35 @@ def test_write_cleaned_file_with_debug(tmp_path): } CLEANED_FP_REGISTRY["cleaned_file_test"] = fp_names - outputs = threaded._write_cleaned_file( - doc, - tmp_path, - tech="cleaned_file_test", - jurisdiction_name="Sample Jurisdiction", - ) - - expected_files = { - "Sample Jurisdiction Cleaned Text.txt", - "Sample Jurisdiction Districts.txt", - "Sample Jurisdiction Ordinance Original text.txt", - } - assert {fp.name for fp in outputs} == expected_files - assert all(fp.exists() for fp in outputs) - - debug_fp = tmp_path / "Sample Jurisdiction Cleaned Text.txt" - assert debug_fp.exists() - assert debug_fp.read_text(encoding="utf-8") == "clean" - - debug_fp = tmp_path / "Sample Jurisdiction Districts.txt" - assert debug_fp.exists() - assert debug_fp.read_text(encoding="utf-8") == "districts" - - debug_fp = tmp_path / "Sample Jurisdiction Ordinance Original text.txt" - assert debug_fp.exists() - assert debug_fp.read_text(encoding="utf-8") == "orig" + try: + outputs = threaded._write_cleaned_file( + doc, + tmp_path, + tech="cleaned_file_test", + jurisdiction_name="Sample Jurisdiction", + ) + + expected_files = { + "Sample Jurisdiction Cleaned Text.txt", + "Sample Jurisdiction Districts.txt", + "Sample Jurisdiction Ordinance Original text.txt", + } + 
assert {fp.name for fp in outputs} == expected_files + assert all(fp.exists() for fp in outputs) + + debug_fp = tmp_path / "Sample Jurisdiction Cleaned Text.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "clean" + + debug_fp = tmp_path / "Sample Jurisdiction Districts.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "districts" + + debug_fp = tmp_path / "Sample Jurisdiction Ordinance Original text.txt" + assert debug_fp.exists() + assert debug_fp.read_text(encoding="utf-8") == "orig" + finally: + del CLEANED_FP_REGISTRY["cleaned_file_test"] def test_write_cleaned_file_without_jurisdiction_returns_none(tmp_path):