From 25c94e51d8ed490fecc4f04b17ece64f2f8ee689 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Tue, 6 Jan 2026 14:28:41 +0100 Subject: [PATCH 01/29] AI-2161 feat: add complex component config search tool --- TOOLS.md | 132 +++++++++ src/keboola_mcp_server/server.py | 2 + src/keboola_mcp_server/tools/usage.py | 395 ++++++++++++++++++++++++++ tests/test_server.py | 1 + 4 files changed, 530 insertions(+) diff --git a/TOOLS.md b/TOOLS.md index f8171479..7c025630 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -40,6 +40,7 @@ and the configuration ID. - [get_data_apps](#get_data_apps): Lists summaries of data apps in the project given the limit and offset or gets details of a data apps by providing their configuration IDs. - [modify_data_app](#modify_data_app): Creates or updates a Streamlit data app. +- [search_keboola_objects](#search_keboola_objects): Deep search across Keboola objects including their full JSON configuration data. ### Project Tools - [get_project_info](#get_project_info): Retrieves structured information about the current project, @@ -1721,6 +1722,137 @@ updating, set `authentication_type` to `default` to keep the existing authentica } ``` +--- + +## search_keboola_objects +**Annotations**: `read-only` + +**Tags**: `usage` + +**Description**: + +Deep search across Keboola objects including their full JSON configuration data. 
+ +WHAT IT SEARCHES: +- Buckets/Tables: name, description, metadata, column names, column descriptions, and entire API payload +- Components/Flows/Data Apps/Transformations: name, description, and entire configuration JSON in raw format: + * All configuration parameters and nested settings + * Storage mappings (input/output tables) + * Credentials and connection details + * SQL queries and code blocks + * Any other data stored in the configuration + +WHEN TO USE: +- Find configurations by specific parameter values (e.g., API endpoints, database hosts) +- Search deep in nested JSON structures (e.g., table mappings, processors) +- Locate objects containing specific SQL code or queries +- Find configurations with particular credentials or connection strings +- Use advanced pattern matching with wildcards or regex + +PATTERN MATCHING: +- literal (default): Exact text matching - patterns=["salesforce.com"] +- wildcard: Glob patterns with * - patterns=["sales*"] matches "sales", "salesforce", "sales_data" +- regex: Regular expressions - patterns=["flow-[0-9]+"] matches "flow-1", "flow-123" +- Multiple patterns use OR logic (matches ANY pattern) + +USAGE EXAMPLES: + +1. Find extractors connecting to a specific database: + patterns=["prod-db-server.company.com"], search_types=["component"] + +2. Find transformations using a specific input table: + patterns=["in.c-main.customers"], search_types=["transformation"] + +3. Find all objects with "test" or "staging" in their configuration: + patterns=["test", "staging"], mode="literal" + +4. Find flows starting with "daily-" prefix: + patterns=["daily-*"], mode="wildcard", search_types=["flow"] + +5. Find components with API version v2 or v3: + patterns=["api/v[23]"], mode="regex", search_types=["component"] + +6. Find data apps using specific Python packages: + patterns=["pandas", "streamlit"], search_types=["data-app"] + +7. 
Search for exact table IDs (avoid partial matches): + patterns=["in.c-bucket.table"], whole_word=True + +8. Find configs with nested JSON structure (key-value in parameters): + patterns=[""parameters":\s*\{.*api\.paychex.*\}"], mode="regex" + +9. Find configs with specific authentication type: + patterns=[""authentication":\s*\{.*"type":\s*"oauth20""], mode="regex" + +10. Find configs with incremental loading enabled: + patterns=[""incremental":\s*true"], mode="regex" + +11. Find storage mappings referencing specific tables: + patterns=[""source":\s*"in\.*\.customers""], mode="regex" + +TIPS: +- Use whole_word=True when searching for IDs to avoid partial matches +- Start with literal mode for speed, use wildcard/regex for flexibility +- Narrow results with search_types when you know the object type +- Results include direct links to objects in Keboola UI + + +**Input JSON Schema**: +```json +{ + "properties": { + "patterns": { + "description": "Search patterns to match. Multiple patterns use OR logic (matches ANY pattern). Examples: [\"customer\"], [\"sales*\", \"revenue*\"] for wildcards, [\"flow-.*\"] for regex. Do not pass empty strings.", + "items": { + "type": "string" + }, + "type": "array" + }, + "mode": { + "default": "literal", + "description": "Pattern matching mode: \"literal\" - exact text match (default, fastest), \"wildcard\" - use * for glob patterns (e.g., \"sales*\"), \"regex\" - full regular expressions (most powerful).", + "enum": [ + "literal", + "wildcard", + "regex" + ], + "type": "string" + }, + "whole_word": { + "default": false, + "description": "When true, only matches complete words. Prevents partial matches like finding \"test\" in \"latest\". Useful for searching IDs or specific terms.", + "type": "boolean" + }, + "ignore_case": { + "default": true, + "description": "When true, search ignores letter casing (e.g., \"Sales\" matches \"sales\"). 
Default: true.", + "type": "boolean" + }, + "search_types": { + "default": [], + "description": "Filter by object types: \"bucket\", \"table\", \"component\", \"transformation\", \"flow\", \"data-app\". Empty list or [\"any\"] searches all types. Use to narrow results when you know what you need.", + "items": { + "enum": [ + "bucket", + "table", + "component", + "flow", + "data-app", + "transformation", + "any" + ], + "type": "string" + }, + "type": "array" + } + }, + "required": [ + "patterns" + ], + "type": "object" +} +``` + --- # Documentation Tools diff --git a/src/keboola_mcp_server/server.py b/src/keboola_mcp_server/server.py index fff236a2..dc3fb34c 100644 --- a/src/keboola_mcp_server/server.py +++ b/src/keboola_mcp_server/server.py @@ -34,6 +34,7 @@ from keboola_mcp_server.tools.search import add_search_tools from keboola_mcp_server.tools.sql import add_sql_tools from keboola_mcp_server.tools.storage import add_storage_tools +from keboola_mcp_server.tools.usage import add_usage_tools LOG = logging.getLogger(__name__) @@ -238,6 +239,7 @@ def create_server( add_sql_tools(mcp) add_storage_tools(mcp) add_keboola_prompts(mcp) + add_usage_tools(mcp) if custom_routes_handling != 'return': return mcp diff --git a/src/keboola_mcp_server/tools/usage.py b/src/keboola_mcp_server/tools/usage.py index 5f3fe173..8e2fc6d3 100644 --- a/src/keboola_mcp_server/tools/usage.py +++ b/src/keboola_mcp_server/tools/usage.py @@ -1,4 +1,5 @@ import json +import re from typing import Annotated, Literal, Mapping, Optional, Sequence from fastmcp import Context @@ -356,3 +357,397 @@ def get_last_updated_by( configuration_id=str(configuration_id), timestamp=timestamp, ) + + +def search_json_string( + json_string: str, + pattern: str, + *, + mode: Literal['literal', 'wildcard', 'regex'] = 'literal', + whole_word: bool = False, + ignore_case: bool = True, +) -> bool: + """ + Search inside a JSON string using different pattern modes. + + Args: + json_string: Stringified JSON (e.g. 
json.dumps(obj)) + pattern: Search pattern as string + mode: + - "literal": exact text match + - "wildcard": supports '*' like shell glob + - "regex": full regular expression + whole_word: Match full words only + ignore_case: Case-insensitive search + + Returns: + True if pattern is found, False otherwise + """ + + flags = re.IGNORECASE if ignore_case else 0 + + # Escape literal pattern + if mode == 'literal': + regex = re.escape(pattern) + + # Convert wildcard -> regex + elif mode == 'wildcard': + # Escape everything except '*' + regex = re.escape(pattern).replace(r'\*', '.*') + + # Regex mode + elif mode == 'regex': + regex = pattern + + else: + raise ValueError(f'Unsupported mode: {mode}') + + # Whole word match + if whole_word: + regex = rf'\b{regex}\b' + + return re.search(regex, json_string, flags) is not None + + +def _matches_patterns( + value: JsonStruct, + patterns: Sequence[str], + *, + mode: Literal['literal', 'wildcard', 'regex'], + whole_word: bool, + ignore_case: bool, +) -> bool: + haystack = _stringify_for_search(value) + return any( + search_json_string( + haystack, + pattern, + mode=mode, + whole_word=whole_word, + ignore_case=ignore_case, + ) + for pattern in patterns + ) + + +class DataMatch(BaseModel): + item_type: Literal['bucket', 'table', 'component', 'flow', 'data-app', 'transformation'] + bucket_id: str | None = None + table_id: str | None = None + component_id: str | None = None + configuration_id: str | None = None + configuration_row_id: str | None = None + name: str | None = None + description: str | None = None + + +def add_usage_tools(mcp: KeboolaMcpServer) -> None: + """Add usage/search tools to the MCP server.""" + mcp.add_tool( + FunctionTool.from_function( + search_keboola_objects, + annotations=ToolAnnotations(readOnlyHint=True), + serializer=toon_serializer, + tags={USAGE_TOOLS_TAG}, + ) + ) + + +async def search_data_matches( + client: KeboolaClient, + patterns: Sequence[str], + *, + mode: Literal['literal', 'wildcard', 
'regex'] = 'literal', + whole_word: bool = False, + ignore_case: bool = True, + search_types: Optional[Sequence[SearchDataType]] = None, +) -> list[DataMatch]: + """ + Searches through configurations (components, flows, data apps) and optionally buckets/tables. + + :param client: The Keboola client to use. + :param patterns: Patterns to search for. + :param mode: Search mode (literal, wildcard, regex). + :param whole_word: Match whole words only. + :param ignore_case: Case-insensitive search. + :param search_types: Types to search in (bucket, table, component, flow, data-app). + :return: A list of data matches. + """ + normalized_patterns = _normalize_ids(patterns) + if not normalized_patterns: + return [] + + normalized_types = _normalize_data_search_types(search_types or []) + include_components = 'component' in normalized_types + include_transformations = 'transformation' in normalized_types + include_flow = 'flow' in normalized_types + include_data_apps = 'data-app' in normalized_types + + matches: list[DataMatch] = [] + + if 'bucket' in normalized_types: + for bucket in await client.storage_client.bucket_list(): + if _matches_patterns( + bucket, normalized_patterns, mode=mode, whole_word=whole_word, ignore_case=ignore_case + ): + matches.append( + DataMatch( + item_type='bucket', + bucket_id=bucket.get('id'), + name=bucket.get('displayName') or bucket.get('name'), + description=bucket.get('description'), + ) + ) + + if 'table' in normalized_types: + for bucket in await client.storage_client.bucket_list(): + bucket_id = bucket.get('id') + if not bucket_id: + continue + tables = await client.storage_client.bucket_table_list(bucket_id, include=['columns', 'columnMetadata']) + for table in tables: + if _matches_patterns( + table, normalized_patterns, mode=mode, whole_word=whole_word, ignore_case=ignore_case + ): + matches.append( + DataMatch( + item_type='table', + bucket_id=bucket_id, + table_id=table.get('id'), + name=table.get('displayName') or 
table.get('name'), + description=table.get('description'), + ) + ) + + if include_components or include_flow or include_data_apps: + components = await client.storage_client.component_list(include=['configuration', 'rows']) + for component in components: + component_id = component.get('id') + if not component_id: + continue + component_type = component.get('type') + + if component_id == DATA_APP_COMPONENT_ID and not include_data_apps: + continue + if component_id in {CONDITIONAL_FLOW_COMPONENT_ID, ORCHESTRATOR_COMPONENT_ID} and not include_flow: + continue + if component_id not in {DATA_APP_COMPONENT_ID, CONDITIONAL_FLOW_COMPONENT_ID, ORCHESTRATOR_COMPONENT_ID}: + if component_type == 'transformation' and not include_transformations: + continue + if component_type != 'transformation' and not include_components: + continue + + configurations = component.get('configurations', []) or [] + for configuration in configurations: + configuration_id = _get_configuration_id(configuration) + if not configuration_id: + continue + + config_name = configuration.get('name') + config_description = configuration.get('description') + config_definition = configuration.get('configuration') or {} + + config_match = _matches_patterns( + { + 'component_id': component_id, + 'component_type': component_type, + 'configuration_id': configuration_id, + 'name': config_name, + 'description': config_description, + 'configuration': config_definition, + }, + normalized_patterns, + mode=mode, + whole_word=whole_word, + ignore_case=ignore_case, + ) + + if config_match: + matches.append( + DataMatch( + item_type=( + 'data-app' + if component_id == DATA_APP_COMPONENT_ID + else ( + 'flow' + if component_id in {CONDITIONAL_FLOW_COMPONENT_ID, ORCHESTRATOR_COMPONENT_ID} + else 'transformation' if component_type == 'transformation' else 'component' + ) + ), + component_id=component_id, + configuration_id=configuration_id, + name=config_name, + description=config_description, + ) + ) + + rows = 
configuration.get('rows', []) or [] + for row in rows: + row_id = row.get('id') + row_name = row.get('name') + row_description = row.get('description') + row_config = row.get('configuration') or {} + if not row_id: + continue + + row_match = _matches_patterns( + { + 'component_id': component_id, + 'component_type': component_type, + 'configuration_id': configuration_id, + 'row_id': row_id, + 'name': row_name, + 'description': row_description, + 'configuration': row_config, + }, + normalized_patterns, + mode=mode, + whole_word=whole_word, + ignore_case=ignore_case, + ) + + if row_match: + matches.append( + DataMatch( + item_type=( + 'data-app' + if component_id == DATA_APP_COMPONENT_ID + else ( + 'flow' + if component_id in {CONDITIONAL_FLOW_COMPONENT_ID, ORCHESTRATOR_COMPONENT_ID} + else 'transformation' if component_type == 'transformation' else 'component' + ) + ), + component_id=component_id, + configuration_id=configuration_id, + configuration_row_id=row_id, + name=row_name or config_name, + description=row_description or config_description, + ) + ) + + return matches + + +@tool_errors() +async def search_keboola_objects( + ctx: Context, + patterns: Annotated[ + Sequence[str], + Field( + description=( + 'Search patterns to match. Multiple patterns use OR logic (matches ANY pattern). ' + 'Examples: ["customer"], ["sales*", "revenue*"] for wildcards, ["flow-.*"] for regex. ' + 'Do not pass empty strings.' + ) + ), + ], + mode: Annotated[ + Literal['literal', 'wildcard', 'regex'], + Field( + description=( + 'Pattern matching mode: ' + '"literal" - exact text match (default, fastest), ' + '"wildcard" - use * for glob patterns (e.g., "sales*"), ' + '"regex" - full regular expressions (most powerful).' + ) + ), + ] = 'literal', + whole_word: Annotated[ + bool, + Field( + description=( + 'When true, only matches complete words. Prevents partial matches like finding "test" in "latest". ' + 'Useful for searching IDs or specific terms.' 
+ ) + ), + ] = False, + ignore_case: Annotated[ + bool, + Field(description='When true, search ignores letter casing (e.g., "Sales" matches "sales"). Default: true.'), + ] = True, + search_types: Annotated[ + Sequence[SearchDataType], + Field( + description=( + 'Filter by object types: "bucket", "table", "component", "transformation", "flow", "data-app". ' + 'Empty list or ["any"] searches all types. Use to narrow results when you know what you need.' + ) + ), + ] = tuple(), +) -> list[DataMatch]: + """ + Deep search across Keboola objects including their full JSON configuration data. + + WHAT IT SEARCHES: + - Buckets/Tables: name, description, metadata, column names, column descriptions, and entire API payload + - Components/Flows/Data Apps/Transformations: name, description, and entire configuration JSON in raw format: + * All configuration parameters and nested settings + * Storage mappings (input/output tables) + * Credentials and connection details + * SQL queries and code blocks + * Any other data stored in the configuration + + WHEN TO USE: + - Find configurations by specific parameter values (e.g., API endpoints, database hosts) + - Search deep in nested JSON structures (e.g., table mappings, processors) + - Locate objects containing specific SQL code or queries + - Find configurations with particular credentials or connection strings + - Use advanced pattern matching with wildcards or regex + + PATTERN MATCHING: + - literal (default): Exact text matching - patterns=["salesforce.com"] + - wildcard: Glob patterns with * - patterns=["sales*"] matches "sales", "salesforce", "sales_data" + - regex: Regular expressions - patterns=["flow-[0-9]+"] matches "flow-1", "flow-123" + - Multiple patterns use OR logic (matches ANY pattern) + + USAGE EXAMPLES: + + 1. Find extractors connecting to a specific database: + patterns=["prod-db-server.company.com"], search_types=["component"] + + 2. 
Find transformations using a specific input table: + patterns=["in.c-main.customers"], search_types=["transformation"] + + 3. Find all objects with "test" or "staging" in their configuration: + patterns=["test", "staging"], mode="literal" + + 4. Find flows starting with "daily-" prefix: + patterns=["daily-*"], mode="wildcard", search_types=["flow"] + + 5. Find components with API version v2 or v3: + patterns=["api/v[23]"], mode="regex", search_types=["component"] + + 6. Find data apps using specific Python packages: + patterns=["pandas", "streamlit"], search_types=["data-app"] + + 7. Search for exact table IDs (avoid partial matches): + patterns=["in.c-bucket.table"], whole_word=True + + 8. Find configs with nested JSON structure (key-value in parameters): + patterns=["\"parameters\":\\s*\\{.*api\\.paychex.*\\}"], mode="regex" + + 9. Find configs with specific authentication type: + patterns=["\"authentication\":\\s*\\{.*\"type\":\\s*\"oauth20\""], mode="regex" + + 10. Find configs with incremental loading enabled: + patterns=["\"incremental\":\\s*true"], mode="regex" + + 11. 
Find storage mappings referencing specific tables: + patterns=["\"source\":\\s*\"in\\.*\\.customers\""], mode="regex" + + TIPS: + - Use whole_word=True when searching for IDs to avoid partial matches + - Start with literal mode for speed, use wildcard/regex for flexibility + - Narrow results with search_types when you know the object type + - Results include direct links to objects in Keboola UI + """ + client = KeboolaClient.from_state(ctx.session.state) + return await search_data_matches( + client=client, + patterns=patterns, + mode=mode, + whole_word=whole_word, + ignore_case=ignore_case, + search_types=search_types, + ) diff --git a/tests/test_server.py b/tests/test_server.py index cff89f87..890958e8 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -60,6 +60,7 @@ async def test_list_tools(self): 'query_data', 'run_job', 'search', + 'search_keboola_objects', 'update_config', 'update_config_row', 'update_descriptions', From 7478604833806d9241c057958542716f073cddd2 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Thu, 8 Jan 2026 15:34:40 +0100 Subject: [PATCH 02/29] AI-2161 feat: add instruction examples --- src/keboola_mcp_server/tools/usage.py | 28 +++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/keboola_mcp_server/tools/usage.py b/src/keboola_mcp_server/tools/usage.py index 8e2fc6d3..dfbe5701 100644 --- a/src/keboola_mcp_server/tools/usage.py +++ b/src/keboola_mcp_server/tools/usage.py @@ -431,7 +431,7 @@ def _matches_patterns( class DataMatch(BaseModel): - item_type: Literal['bucket', 'table', 'component', 'flow', 'data-app', 'transformation'] + item_type: Literal[ 'component', 'flow', 'data-app', 'transformation'] bucket_id: str | None = None table_id: str | None = None component_id: str | None = None @@ -710,10 +710,10 @@ async def search_keboola_objects( patterns=["in.c-main.customers"], search_types=["transformation"] 3. 
Find all objects with "test" or "staging" in their configuration: - patterns=["test", "staging"], mode="literal" + patterns=["test", "staging"], mode="literal", search_types=["component", "transformation", "flow", "data-app"] - 4. Find flows starting with "daily-" prefix: - patterns=["daily-*"], mode="wildcard", search_types=["flow"] + 4. Find which flows use a given component configuration, e.g. kds-team.ex-shopify / 01k9cz233cvd1rga3zzx40g8qj: + patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_types=["flow"] 5. Find components with API version v2 or v3: patterns=["api/v[23]"], mode="regex", search_types=["component"] @@ -731,10 +731,26 @@ async def search_keboola_objects( patterns=["\"authentication\":\\s*\\{.*\"type\":\\s*\"oauth20\""], mode="regex" 10. Find configs with incremental loading enabled: - patterns=["\"incremental\":\\s*true"], mode="regex" + patterns=["\"incremental\":\\s*true"], mode="regex", search_types=["component", "transformation"] 11. Find storage mappings referencing specific tables: - patterns=["\"source\":\\s*\"in\\.*\\.customers\""], mode="regex" + patterns=["\"source\":\\s*\"in\\..*\\.customers\""], mode="regex", search_types=["transformation", "component"] + + 12. Find SQL transformations that calculate avg_monetary_value or create rfm_segment_summary: + patterns=["avg_monetary_value", "rfm_segment_summary"], mode="literal", search_types=["transformation"] + + 13. 
Find which components use a specific table in input/output mappings (both directions): + patterns=["out\\.c-RFM-Segment-Summary-for-App\\.rfm_segment_summary"], + mode="regex", + search_types=["component", "transformation"] + + # Or more specific - find only input mappings: + patterns=["\"source\":\\s*\"out\\.c-RFM-Segment-Summary-for-App\\.rfm_segment_summary\""], + mode="regex" + + # Or find only output mappings: + patterns=["\"destination\":\\s*\"out\\.c-RFM-Segment-Summary-for-App\\.rfm_segment_summary\""], + mode="regex" TIPS: - Use whole_word=True when searching for IDs to avoid partial matches From c8a43cba8d21769e6244be4d2f7db009832231df Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 12 Jan 2026 18:17:13 +0100 Subject: [PATCH 03/29] AI-2161 fix: ignore private matches in SearchHit equality --- src/keboola_mcp_server/tools/search/tools.py | 237 ++++++++++++------- src/keboola_mcp_server/tools/search/usage.py | 2 +- 2 files changed, 147 insertions(+), 92 deletions(-) diff --git a/src/keboola_mcp_server/tools/search/tools.py b/src/keboola_mcp_server/tools/search/tools.py index bdda9062..638d6d05 100644 --- a/src/keboola_mcp_server/tools/search/tools.py +++ b/src/keboola_mcp_server/tools/search/tools.py @@ -12,11 +12,11 @@ from keboola_mcp_server.clients.base import JsonDict from keboola_mcp_server.clients.client import ( CONDITIONAL_FLOW_COMPONENT_ID, + DATA_APP_COMPONENT_ID, ORCHESTRATOR_COMPONENT_ID, KeboolaClient, get_metadata_property, ) -from keboola_mcp_server.clients.storage import ItemType from keboola_mcp_server.config import MetadataField from keboola_mcp_server.errors import tool_errors from keboola_mcp_server.links import Link, ProjectLinksManager @@ -45,29 +45,17 @@ 'state', ] -ITEM_TYPE_TO_COMPONENT_TYPES: Mapping[ItemType, Sequence[str]] = { - 'flow': ['other'], - 'transformation': ['transformation'], - 'configuration': ['extractor', 'writer'], - 'configuration-row': ['extractor', 'writer'], - 'workspace': ['other'], -} - 
SEARCH_ITEM_TYPE_TO_COMPONENT_TYPES: Mapping[SearchItemType, Sequence[str]] = { 'data-app': ['other'], 'flow': ['other'], 'transformation': ['transformation'], 'component': ['extractor', 'writer', 'application'], - 'configuration': ['extractor', 'writer'], - 'configuration-row': ['extractor', 'writer'], + 'configuration': ['extractor', 'writer', 'application'], + 'configuration-row': ['extractor', 'writer', 'application'], 'workspace': ['other'], } SearchType = Literal['textual', 'config-based'] -SearchConfigurationScope = Literal['any', 'parameters', 'storage', 'processors', 'authorization', 'tasks', 'phases'] -SearchConfigurationScopeResp = Literal[ - 'any', 'parameters', 'storage', 'storage.input', 'storage.output', 'processors', 'authorization', 'tasks', 'phases' -] SearchPatternMode = Literal['regex', 'literal'] @@ -109,7 +97,7 @@ class SearchHit(BaseModel): configuration_id: str | None = Field(default=None, description='The ID of the configuration.') configuration_row_id: str | None = Field(default=None, description='The ID of the configuration row.') - item_type: ItemType = Field(description='The type of the item (e.g. table, bucket, configuration, etc.).') + item_type: SearchItemType = Field(description='The type of the item (e.g. 
table, bucket, configuration, etc.).') updated: str = Field(description='The date and time the item was created in ISO 8601 format.') name: str | None = Field(default=None, description='Name of the item.') @@ -118,6 +106,11 @@ class SearchHit(BaseModel): links: list[Link] = Field(default_factory=list, description='Links to the item.') _matches: list[PatternMatch] = PrivateAttr(default_factory=list) + def __eq__(self, other: object) -> bool: + if isinstance(other, SearchHit): + return self.model_dump() == other.model_dump() + return False + @model_validator(mode='after') def check_id_fields(self) -> 'SearchHit': id_fields = [ @@ -154,7 +147,8 @@ class SearchSpec(BaseModel): case_sensitive: bool = False search_scopes: Sequence[str] = tuple() search_type: SearchType = 'textual' - return_matched_patterns: bool = False + # If True, returns all matched patterns instead of only the first one. + return_all_matched_patterns: bool = False # If True, stops searching scopes after the first match is found. stop_searching_after_first_value_match: bool = True @@ -218,7 +212,7 @@ def match_patterns(self, value: str | JsonDict | None) -> list[str]: if compiled.search(haystack) ) - if self.return_matched_patterns: + if self.return_all_matched_patterns: return list(matches) return [m] if (m := next(matches, None)) else [] @@ -250,6 +244,12 @@ def match_configuration_scopes(self, configuration: JsonDict | None) -> list[Pat ) def match_texts(self, texts: Sequence[str]) -> list[PatternMatch]: + """ + Matches a sequence of strings against the patterns. + + :param texts: The sequence of strings to match against the patterns. + :return: A list of PatternMatch objects. 
+ """ matches: list[PatternMatch] = [] matches = ( @@ -270,21 +270,17 @@ def _get_field_value(item: JsonDict, fields: Sequence[str]) -> Any | None: return None -def _check_column_match(table: JsonDict, cfg: SearchSpec) -> bool: +def _check_column_match(table: JsonDict, cfg: SearchSpec) -> list[PatternMatch]: """Check if any column name or description matches the patterns.""" # Check column names (list of strings) - for col_name in table.get('columns', []): - if cfg.match_patterns(col_name): - return True - - # Check column descriptions (from columnMetadata) - column_metadata = table.get('columnMetadata', {}) - for col_meta in column_metadata.values(): - col_description = get_metadata_property(col_meta, MetadataField.DESCRIPTION) - if cfg.match_patterns(col_description): - return True + if col_names := table.get('columns', []): + if matched := cfg.match_texts(col_names): + return matched - return False + if col_metadata := table.get('columnMetadata', {}): + col_descs = (get_metadata_property(col_meta, MetadataField.DESCRIPTION) for col_meta in col_metadata.values()) + if matched := cfg.match_texts(col_descs): + return matched async def _fetch_buckets(client: KeboolaClient, cfg: SearchSpec) -> list[SearchHit]: @@ -298,12 +294,7 @@ async def _fetch_buckets(client: KeboolaClient, cfg: SearchSpec) -> list[SearchH bucket_display_name = bucket.get('displayName') bucket_description = get_metadata_property(bucket.get('metadata', []), MetadataField.DESCRIPTION) - if ( - cfg.match_patterns(bucket_id) - or cfg.match_patterns(bucket_name) - or cfg.match_patterns(bucket_display_name) - or cfg.match_patterns(bucket_description) - ): + if matches := cfg.match_texts([bucket_id, bucket_name, bucket_display_name, bucket_description]): hits.append( SearchHit( bucket_id=bucket_id, @@ -312,7 +303,7 @@ async def _fetch_buckets(client: KeboolaClient, cfg: SearchSpec) -> list[SearchH name=bucket_name, display_name=bucket_display_name, description=bucket_description, - ) + 
).with_matches(matches) ) return hits @@ -333,13 +324,9 @@ async def _fetch_tables(client: KeboolaClient, cfg: SearchSpec) -> list[SearchHi table_display_name = table.get('displayName') table_description = get_metadata_property(table.get('metadata', []), MetadataField.DESCRIPTION) - if ( - cfg.match_patterns(table_id) - or cfg.match_patterns(table_name) - or cfg.match_patterns(table_display_name) - or cfg.match_patterns(table_description) - or _check_column_match(table, cfg) - ): + if matches := cfg.match_texts( + [table_id, table_name, table_display_name, table_description] + ) or _check_column_match(table, cfg): hits.append( SearchHit( table_id=table_id, @@ -348,7 +335,7 @@ async def _fetch_tables(client: KeboolaClient, cfg: SearchSpec) -> list[SearchHi name=table_name, display_name=table_display_name, description=table_description, - ) + ).with_matches(matches) ) return hits @@ -373,6 +360,13 @@ async def _fetch_configs( client: KeboolaClient, spec: SearchSpec, component_type: str | None = None ) -> AsyncGenerator[SearchHit, None]: components = await client.storage_client.component_list(component_type, include=['configuration', 'rows']) + + allowed_transformations = 'transformation' in spec.item_types + allowed_components = 'component' in spec.item_types + allowed_flows = 'flow' in spec.item_types + allowed_workspaces = 'workspace' in spec.item_types + allowed_data_apps = 'data-app' in spec.item_types + for component in components: if not (component_id := component.get('id')): continue @@ -380,10 +374,24 @@ async def _fetch_configs( current_component_type = component.get('type') if component_id in [ORCHESTRATOR_COMPONENT_ID, CONDITIONAL_FLOW_COMPONENT_ID]: item_type = 'flow' + if not allowed_flows: + continue elif current_component_type == 'transformation': item_type = 'transformation' + if not allowed_transformations: + continue elif component_id == 'keboola.sandboxes': item_type = 'workspace' + if not allowed_workspaces: + continue + elif component_id == 
DATA_APP_COMPONENT_ID: + item_type = 'data-app' + if not allowed_data_apps: + continue + elif current_component_type in ['extractor', 'writer', 'application']: + item_type = 'component' + if not allowed_components: + continue else: item_type = 'configuration' @@ -396,11 +404,7 @@ async def _fetch_configs( config_updated = _get_field_value(config, ['currentVersion.created', 'created']) or '' if spec.search_type == 'textual': - if ( - spec.match_patterns(config_id) - or spec.match_patterns(config_name) - or spec.match_patterns(config_description) - ): + if matches := spec.match_texts([config_id, config_name, config_description]): yield SearchHit( component_id=component_id, configuration_id=config_id, @@ -408,7 +412,7 @@ async def _fetch_configs( updated=config_updated, name=config_name, description=config_description, - ) + ).with_matches(matches) elif spec.search_type == 'config-based': if matches := spec.match_configuration_scopes(config.get('configuration')): yield SearchHit( @@ -428,11 +432,7 @@ async def _fetch_configs( row_description = row.get('description') if spec.search_type == 'textual': - if ( - spec.match_patterns(row_id) - or spec.match_patterns(row_name) - or spec.match_patterns(row_description) - ): + if matches := spec.match_texts([row_id, row_name, row_description]): yield SearchHit( component_id=component_id, configuration_id=config_id, @@ -441,7 +441,7 @@ async def _fetch_configs( updated=config_updated or _get_field_value(row, ['created']), name=row_name, description=row_description, - ) + ).with_matches(matches) elif spec.search_type == 'config-based': if matches := spec.match_configuration_scopes(row.get('configuration')): @@ -463,19 +463,46 @@ async def search( list[str], Field( description='One or more search patterns to match against item ID, name, display name, or description. ' - 'Supports regex patterns. Case-insensitive. Examples: ["customer"], ["sales", "revenue"], ' - '["test.*table"]. Do not use empty strings or empty lists.' 
+ 'Supports regex patterns. Case-insensitive by default. Examples: ["customer"], ["sales", "revenue"], ' + '["test.*table"], ["key1.*:.*key2.*:.*value.*"]. Do not use empty strings or empty lists.' ), ], item_types: Annotated[ - Sequence[ItemType], + Sequence[SearchItemType], Field( - description='Optional filter for specific Keboola item types. Leave empty to search all types. ' + description='Filter for specific Keboola item types. ' 'Common values: "table" (data tables), "bucket" (table containers), "transformation" ' - '(SQL/Python transformations), "configuration" (extractor/writer configs), "flow" (orchestration flows). ' - "Use when you know what type of item you're looking for." + '(SQL/Python transformations), "component" (extractor/writer/application components), ' + '"data-app" (data apps), "flow" (orchestration flows). ' + "Use when you know what type of item you're looking for or leave empty to search all types." + ), + ] = tuple(), + search_type: Annotated[ + SearchType, + Field( + description='Search mode: "textual" (name/id/description) or "config-based" (stringified configuration ' + 'payloads).' + ), + ] = 'textual', + scopes: Annotated[ + Sequence[str], + Field( + description='Dot-separated keys to search in configuration payloads, used with "config-based" search. ' + 'Example: "parameters.field", "storage.input", "storage.output", "processors.before", "processors.after", ' + '"authorization", "tasks", "phases". Leave empty to search the whole configuration.' ), ] = tuple(), + mode: Annotated[ + SearchPatternMode, + Field( + description='How to interpret patterns: "regex" for regular expressions or "literal" for exact text ' + '(default: "literal").' 
+ ), + ] = 'literal', + case_sensitive: Annotated[ + bool, + Field(description='If true, match patterns with case sensitivity (default: false).'), + ] = False, limit: Annotated[ int, Field( @@ -486,59 +513,89 @@ async def search( offset: Annotated[int, Field(description='Number of matching items to skip for pagination (default: 0).')] = 0, ) -> list[SearchHit]: """ - Searches for Keboola items (tables, buckets, configurations, transformations, flows, etc.) in the current project - by matching patterns against item ID, name, display name, or description. Returns matching items grouped by type - with their IDs and metadata. + Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc.) in the current + project. + Supports two modes: + - textual: match patterns against ID, name, display name, description (and table columns) + - config-based: match patterns against stringified configuration payloads, optionally limited to specific scopes + Returns matching items with IDs and metadata. WHEN TO USE: - - User asks to "find", "locate", or "search for" something by name + - User asks to "find", "locate", or "search for" something by name or text - User mentions a partial name and you need to find the full item (e.g., "find the customer table") - User asks "what tables/configs/flows do I have with X in the name?" + - User asks to find configs containing a value in parameters (use config-based + scopes and regex patterns) + - Use this tool to trace lineage by searching for IDs referenced in configurations, or to find flows using a + specific component, or find usage of a bucket/table in transformations, or to find items with specific parameters. - You need to discover items before performing operations on them - - User asks to "list all items with [name] in it" + - User asks "what is the genesis of this item?" or to "explain the business logic of this item"
+ - User asks to "list all items with [name] or [configuration value/part] in it" - DO NOT use for listing all items of a specific type. Use get_configs, list_tables, get_flows, etc instead. HOW IT WORKS: - - Searches by regex pattern matching against id, name, displayName, and description fields - - For tables, also searches column names and column descriptions - - Case-insensitive search - - Multiple patterns work as OR condition - matches items containing ANY of the patterns - - Returns grouped results by item type (tables, buckets, configurations, flows, etc.) - - Each result includes the item's ID, name, creation date, and relevant metadata + - mode: "literal" (default; patterns are matched as plain text) or "regex" (regular expressions) + - case_sensitive: false by default; set true for exact casing + - search_type: + - "textual": matches id/name/display_name/description fields + - "config-based": matches stringified configuration payloads (JSON) via scopes or the whole config using + regex patterns. + - scopes: dot-separated paths (e.g., "parameters", "storage.input", "parameters.script") + - For tables, textual search also checks column names and column descriptions + - Multiple patterns are ORed: any match includes the item + - Results are ordered by update time, newest first, and can be paginated via limit/offset IMPORTANT: - Always use this tool when the user mentions a name but you don't have the exact ID - The search returns IDs that you can use with other tools (e.g., get_table, get_configs, get_flows) - - Results are ordered by update time. The most recently updated items are returned first.
+ - Use item_types to make the search more efficient when you know the type; scanning buckets and tables can be + expensive - For exact ID lookups, use specific tools like get_table, get_configs, get_flows instead - - Use find_component_id and get_configs tools to find configurations related to a specific component USAGE EXAMPLES: - user_input: "Find all tables with 'customer' in the name" - → patterns=["customer"], item_types=["table"] - → Returns all tables whose id, name, displayName, or description contains "customer" + → patterns=["customer"], search_type="textual", mode="literal", item_types=["table"] - user_input: "Find tables with 'email' column" - → patterns=["email"], item_types=["table"] - → Returns all tables that have a column named "email" or with "email" in column description + → patterns=["email"], search_type="textual", mode="literal", item_types=["table"] - user_input: "Search for the sales transformation" - → patterns=["sales"], item_types=["transformation"] + → patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"] → Returns transformations with "sales" in any searchable field - user_input: "Find items named 'daily report' or 'weekly summary'" - → patterns=["daily.*report", "weekly.*summary"], item_types=[] - → Returns all items matching any of these patterns + → patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[] + + - user_input: "Show me all configurations/components related to Google Analytics" + → patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"] + + - user_input: "Find storage input mappings referencing specific tables:" + → patterns=["\"storage\"\\.*\"input\"\\.*:\\s*\"in\\.*\\.customers\""], search_type="config-based", mode="regex", + item_types=["transformation", "component"] - - user_input: "Show me all configurations related to Google Analytics" - → patterns=["google.*analytics"], item_types=["configuration"] - → 
Returns configurations with matching patterns + - user input: "Find components or transformations using 'my_bucket' in output mappings" + → patterns=["my_bucket"], item_types=["component", "transformation"], search_type="config-based", + scopes=["storage.output"], mode="literal" + + - user input: "Find configs with specific authentication type" + → patterns=["\"authentication\":\\s*\\{.*\"type\":\\s*\"oauth20\""], search_type="config-based", mode="regex", + item_types=["component"] + + - user input: "Find flows using this configuration ID: 01k9cz233cvd1rga3zzx40g8qj" + → patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", + scopes=["tasks"] + + - user input: "Find data apps using specific code part ..." + → patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], + mode="regex", scopes=["script"] """ cfg = SearchSpec( patterns=patterns, item_types=item_types, - search_type='textual', + pattern_mode=mode, + case_sensitive=case_sensitive, + search_type=search_type, + search_scopes=scopes, ) offset = max(0, offset) @@ -567,10 +624,12 @@ async def search( tasks.append(_fetch_configurations(client, cfg)) elif types_to_fetch & { 'configuration', + 'component', 'transformation', 'flow', 'configuration-row', 'workspace', + 'data-app', }: tasks.append(_fetch_configurations(client, cfg)) @@ -586,10 +645,6 @@ async def search( else: all_hits.extend(result) - # Filter by item_types if specified - if types_to_fetch: - all_hits = [item for item in all_hits if item.item_type in types_to_fetch] - # TODO: Should we sort by the item type too?
all_hits.sort( key=lambda x: ( diff --git a/src/keboola_mcp_server/tools/search/usage.py b/src/keboola_mcp_server/tools/search/usage.py index 2bf1f6ce..778021d9 100644 --- a/src/keboola_mcp_server/tools/search/usage.py +++ b/src/keboola_mcp_server/tools/search/usage.py @@ -49,7 +49,7 @@ async def find_id_usage( search_scopes=scopes, pattern_mode='literal', search_type='config-based', - return_matched_patterns=True, + return_all_matched_patterns=True, stop_searching_after_first_value_match=False, ) From 88d66156d731810c5b12750a63369aa6dc58ee9c Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 12 Jan 2026 18:18:17 +0100 Subject: [PATCH 04/29] AI-2161 test: use regex mode in search regex test --- tests/search/tools_test.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/tests/search/tools_test.py b/tests/search/tools_test.py index da4587c4..ded3f3a8 100644 --- a/tests/search/tools_test.py +++ b/tests/search/tools_test.py @@ -9,11 +9,11 @@ from keboola_mcp_server.clients.ai_service import ComponentSuggestionResponse, SuggestedComponent from keboola_mcp_server.clients.base import JsonDict from keboola_mcp_server.clients.client import KeboolaClient -from keboola_mcp_server.clients.storage import ItemType from keboola_mcp_server.config import MetadataField from keboola_mcp_server.links import Link from keboola_mcp_server.tools.search.tools import ( SearchHit, + SearchItemType, SearchSpec, SuggestedComponentOutput, find_component_id, @@ -83,7 +83,7 @@ def component_list_side_effect(component_type, include=None): result = await search( ctx=mcp_context_client, patterns=['test'], - item_types=(cast(ItemType, 'table'), cast(ItemType, 'configuration')), + item_types=(cast(SearchItemType, 'table'), cast(SearchItemType, 'configuration')), limit=20, offset=0, ) @@ -144,7 +144,12 @@ async def test_search_with_regex_pattern(self, mocker: MockerFixture, mcp_contex keboola_client.storage_client.component_list = 
mocker.AsyncMock(return_value=[]) keboola_client.storage_client.workspace_list = mocker.AsyncMock(return_value=[]) - result = await search(ctx=mcp_context_client, patterns=['customer.*'], item_types=(cast(ItemType, 'bucket'),)) + result = await search( + ctx=mcp_context_client, + patterns=['customer.*'], + item_types=(cast(SearchItemType, 'bucket'),), + mode='regex', + ) assert isinstance(result, list) assert result == [ @@ -361,7 +366,7 @@ async def test_search_matches_description(self, mocker: MockerFixture, mcp_conte keboola_client.storage_client.component_list = mocker.AsyncMock(return_value=[]) keboola_client.storage_client.workspace_list = mocker.AsyncMock(return_value=[]) - result = await search(ctx=mcp_context_client, patterns=['test'], item_types=(cast(ItemType, 'bucket'),)) + result = await search(ctx=mcp_context_client, patterns=['test'], item_types=(cast(SearchItemType, 'bucket'),)) assert isinstance(result, list) assert result == [ @@ -680,7 +685,9 @@ async def test_search_table_by_columns( # Mock bucket_table_list with provided test data keboola_client.storage_client.bucket_table_list = mocker.AsyncMock(return_value=tables_data) - result = await search(ctx=mcp_context_client, patterns=[search_pattern], item_types=(cast(ItemType, 'table'),)) + result = await search( + ctx=mcp_context_client, patterns=[search_pattern], item_types=(cast(SearchItemType, 'table'),) + ) assert isinstance(result, list) assert len(result) == expected_count @@ -696,7 +703,7 @@ def test_match_texts_with_literal_and_regex(self): patterns=['foo.*'], item_types=('bucket',), pattern_mode='literal', - return_matched_patterns=True, + return_all_matched_patterns=True, ) matches = spec.match_texts(['foo.*', 'foobar']) assert [match.model_dump() for match in matches] == [ @@ -707,7 +714,7 @@ def test_match_texts_with_literal_and_regex(self): patterns=['foo.*'], item_types=('bucket',), pattern_mode='regex', - return_matched_patterns=True, + return_all_matched_patterns=True, ) 
regex_matches = regex_spec.match_texts(['foo.*', 'foobar']) assert [match.model_dump() for match in regex_matches] == [ @@ -718,7 +725,7 @@ def test_match_texts_case_sensitivity_and_stop(self): spec = SearchSpec( patterns=['foo', 'bar'], item_types=('bucket',), - return_matched_patterns=True, + return_all_matched_patterns=True, stop_searching_after_first_value_match=True, ) matches = spec.match_texts(['Foo baz', 'BAR qux']) @@ -729,7 +736,7 @@ def test_match_texts_case_sensitivity_and_stop(self): all_spec = SearchSpec( patterns=['foo', 'bar'], item_types=('bucket',), - return_matched_patterns=True, + return_all_matched_patterns=True, stop_searching_after_first_value_match=False, ) all_matches = all_spec.match_texts(['Foo baz', 'BAR qux']) @@ -748,7 +755,7 @@ def test_match_configuration_scopes(self): patterns=['alpha', 'beta'], item_types=('bucket',), search_scopes=('parameters', 'storage.input'), - return_matched_patterns=True, + return_all_matched_patterns=True, stop_searching_after_first_value_match=True, ) matches = spec.match_configuration_scopes(configuration) @@ -761,7 +768,7 @@ def test_match_configuration_scopes(self): patterns=['alpha', 'beta'], item_types=('bucket',), search_scopes=('parameters', 'storage.input'), - return_matched_patterns=True, + return_all_matched_patterns=True, stop_searching_after_first_value_match=False, ) first_only_matches = first_only_spec.match_configuration_scopes(configuration) @@ -772,7 +779,7 @@ def test_match_configuration_scopes(self): any_scope_spec = SearchSpec( patterns=['gamma'], item_types=('bucket',), - return_matched_patterns=True, + return_all_matched_patterns=True, ) any_scope_matches = any_scope_spec.match_configuration_scopes(configuration) assert [match.model_dump() for match in any_scope_matches] == [ From ff2f68bfb0bfa257be7b31568b653e96286b91ec Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 12 Jan 2026 18:18:48 +0100 Subject: [PATCH 05/29] AI-2161 refactor: remove usage tool registration --- TOOLS.md 
| 246 +++++++++++-------------------- src/keboola_mcp_server/server.py | 2 - tests/test_server.py | 1 - 3 files changed, 87 insertions(+), 162 deletions(-) diff --git a/TOOLS.md b/TOOLS.md index 6bcece3d..c6f39486 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -40,7 +40,6 @@ and the configuration ID. - [get_data_apps](#get_data_apps): Lists summaries of data apps in the project given the limit and offset or gets details of a data apps by providing their configuration IDs. - [modify_data_app](#modify_data_app): Creates or updates a Streamlit data app. -- [search_keboola_objects](#search_keboola_objects): Deep search across Keboola objects including their full JSON configuration data. ### Project Tools - [get_project_info](#get_project_info): Retrieves structured information about the current project, @@ -52,7 +51,7 @@ including essential context and base instructions for working with it ### Search Tools - [find_component_id](#find_component_id): Returns list of component IDs that match the given query. -- [search](#search): Searches for Keboola items (tables, buckets, configurations, transformations, flows, etc. +- [search](#search): Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc. ### Storage Tools - [get_buckets](#get_buckets): Lists buckets or retrieves full details of specific buckets, including metadata-derived descriptions, @@ -1723,137 +1722,6 @@ updating, set `authentication_type` to `default` to keep the existing authentica } ``` ---- - -## search_keboola_objects -**Annotations**: `read-only` - -**Tags**: `usage` - -**Description**: - -Deep search across Keboola objects including their full JSON configuration data. 
- -WHAT IT SEARCHES: -- Buckets/Tables: name, description, metadata, column names, column descriptions, and entire API payload -- Components/Flows/Data Apps/Transformations: name, description, and entire configuration JSON in raw format: - * All configuration parameters and nested settings - * Storage mappings (input/output tables) - * Credentials and connection details - * SQL queries and code blocks - * Any other data stored in the configuration - -WHEN TO USE: -- Find configurations by specific parameter values (e.g., API endpoints, database hosts) -- Search deep in nested JSON structures (e.g., table mappings, processors) -- Locate objects containing specific SQL code or queries -- Find configurations with particular credentials or connection strings -- Use advanced pattern matching with wildcards or regex - -PATTERN MATCHING: -- literal (default): Exact text matching - patterns=["salesforce.com"] -- wildcard: Glob patterns with * - patterns=["sales*"] matches "sales", "salesforce", "sales_data" -- regex: Regular expressions - patterns=["flow-[0-9]+"] matches "flow-1", "flow-123" -- Multiple patterns use OR logic (matches ANY pattern) - -USAGE EXAMPLES: - -1. Find extractors connecting to a specific database: - patterns=["prod-db-server.company.com"], search_types=["component"] - -2. Find transformations using a specific input table: - patterns=["in.c-main.customers"], search_types=["transformation"] - -3. Find all objects with "test" or "staging" in their configuration: - patterns=["test", "staging"], mode="literal" - -4. Find flows starting with "daily-" prefix: - patterns=["daily-*"], mode="wildcard", search_types=["flow"] - -5. Find components with API version v2 or v3: - patterns=["api/v[23]"], mode="regex", search_types=["component"] - -6. Find data apps using specific Python packages: - patterns=["pandas", "streamlit"], search_types=["data-app"] - -7. 
Search for exact table IDs (avoid partial matches): - patterns=["in.c-bucket.table"], whole_word=True - -8. Find configs with nested JSON structure (key-value in parameters): - patterns=[""parameters":\s*\{.*api\.paychex.*\}"], mode="regex" - -9. Find configs with specific authentication type: - patterns=[""authentication":\s*\{.*"type":\s*"oauth20""], mode="regex" - -10. Find configs with incremental loading enabled: - patterns=[""incremental":\s*true"], mode="regex" - -11. Find storage mappings referencing specific tables: - patterns=[""source":\s*"in\.*\.customers""], mode="regex" - -TIPS: -- Use whole_word=True when searching for IDs to avoid partial matches -- Start with literal mode for speed, use wildcard/regex for flexibility -- Narrow results with search_types when you know the object type -- Results include direct links to objects in Keboola UI - - -**Input JSON Schema**: -```json -{ - "properties": { - "patterns": { - "description": "Search patterns to match. Multiple patterns use OR logic (matches ANY pattern). Examples: [\"customer\"], [\"sales*\", \"revenue*\"] for wildcards, [\"flow-.*\"] for regex. Do not pass empty strings.", - "items": { - "type": "string" - }, - "type": "array" - }, - "mode": { - "default": "literal", - "description": "Pattern matching mode: \"literal\" - exact text match (default, fastest), \"wildcard\" - use * for glob patterns (e.g., \"sales*\"), \"regex\" - full regular expressions (most powerful).", - "enum": [ - "literal", - "wildcard", - "regex" - ], - "type": "string" - }, - "whole_word": { - "default": false, - "description": "When true, only matches complete words. Prevents partial matches like finding \"test\" in \"latest\". Useful for searching IDs or specific terms.", - "type": "boolean" - }, - "ignore_case": { - "default": true, - "description": "When true, search ignores letter casing (e.g., \"Sales\" matches \"sales\"). 
Default: true.", - "type": "boolean" - }, - "search_types": { - "default": [], - "description": "Filter by object types: \"bucket\", \"table\", \"component\", \"transformation\", \"flow\", \"data-app\". Empty list or [\"any\"] searches all types. Use to narrow results when you know what you need.", - "items": { - "enum": [ - "bucket", - "table", - "component", - "flow", - "data-app", - "transformation", - "any" - ], - "type": "string" - }, - "type": "array" - } - }, - "required": [ - "patterns" - ], - "type": "object" -} -``` - --- # Documentation Tools @@ -2505,53 +2373,80 @@ USAGE EXAMPLES: **Description**: -Searches for Keboola items (tables, buckets, configurations, transformations, flows, etc.) in the current project -by matching patterns against item ID, name, display name, or description. Returns matching items grouped by type -with their IDs and metadata. +Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc.) in the current +project. +Supports two modes: +- textual: match patterns against ID, name, display name, description (and table columns) +- config-based: match patterns against stringified configuration payloads, optionally limited to specific scopes +Returns matching items with IDs and metadata. WHEN TO USE: -- User asks to "find", "locate", or "search for" something by name +- User asks to "find", "locate", or "search for" something by name or text - User mentions a partial name and you need to find the full item (e.g., "find the customer table") - User asks "what tables/configs/flows do I have with X in the name?" +- User asks to find configs containing a value in parameters (use config-based + scopes and regex patterns) +- Use this tool to trace lineage by searching for IDs referenced in configurations, or to find flows using a +specific component, or find usage of a bucket/table in transformations, or to find items with specific parameters. 
- You need to discover items before performing operations on them -- User asks to "list all items with [name] in it" +- User asks "what is the genesis of this item?" or to "explain the business logic of this item" +- User asks to "list all items with [name] or [configuration value/part] in it" - DO NOT use for listing all items of a specific type. Use get_configs, list_tables, get_flows, etc instead. HOW IT WORKS: -- Searches by regex pattern matching against id, name, displayName, and description fields -- For tables, also searches column names and column descriptions -- Case-insensitive search -- Multiple patterns work as OR condition - matches items containing ANY of the patterns -- Returns grouped results by item type (tables, buckets, configurations, flows, etc.) -- Each result includes the item's ID, name, creation date, and relevant metadata +- mode: "literal" (default; patterns are matched as plain text) or "regex" (regular expressions) +- case_sensitive: false by default; set true for exact casing +- search_type: + - "textual": matches id/name/display_name/description fields + - "config-based": matches stringified configuration payloads (JSON) via scopes or the whole config using + regex patterns. +- scopes: dot-separated paths (e.g., "parameters", "storage.input", "parameters.script") +- For tables, textual search also checks column names and column descriptions +- Multiple patterns are ORed: any match includes the item +- Results are ordered by update time, newest first, and can be paginated via limit/offset IMPORTANT: - Always use this tool when the user mentions a name but you don't have the exact ID - The search returns IDs that you can use with other tools (e.g., get_table, get_configs, get_flows) -- Results are ordered by update time. The most recently updated items are returned first.
+- Use item_types to make the search more efficient when you know the type; scanning buckets and tables can be +expensive - For exact ID lookups, use specific tools like get_table, get_configs, get_flows instead -- Use find_component_id and get_configs tools to find configurations related to a specific component USAGE EXAMPLES: - user_input: "Find all tables with 'customer' in the name" - → patterns=["customer"], item_types=["table"] - → Returns all tables whose id, name, displayName, or description contains "customer" + → patterns=["customer"], search_type="textual", mode="literal", item_types=["table"] - user_input: "Find tables with 'email' column" - → patterns=["email"], item_types=["table"] - → Returns all tables that have a column named "email" or with "email" in column description + → patterns=["email"], search_type="textual", mode="literal", item_types=["table"] - user_input: "Search for the sales transformation" - → patterns=["sales"], item_types=["transformation"] + → patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"] → Returns transformations with "sales" in any searchable field - user_input: "Find items named 'daily report' or 'weekly summary'" - → patterns=["daily.*report", "weekly.*summary"], item_types=[] - → Returns all items matching any of these patterns + → patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[] + +- user_input: "Show me all configurations/components related to Google Analytics" + → patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"] + +- user_input: "Find storage input mappings referencing specific tables:" + → patterns=[""storage"\.*"input"\.*:\s*"in\.*\.customers""], search_type="config-based", mode="regex", + item_types=["transformation", "component"] + +- user input: "Find components or transformations using 'my_bucket' in output mappings" + → patterns=["my_bucket"], item_types=["component", 
"transformation"], search_type="config-based", + scopes=["storage.output"], mode="literal" + +- user input: "Find configs with specific authentication type" + → patterns=[""authentication":\s*\{.*"type":\s*"oauth20""], search_type="config-based", mode="regex", + item_types=["component"] -- user_input: "Show me all configurations related to Google Analytics" - → patterns=["google.*analytics"], item_types=["configuration"] - → Returns configurations with matching patterns +- user input: "Find flows using this configuration ID: 01k9cz233cvd1rga3zzx40g8qj" + → patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", + scopes=["tasks"] + +- user input: "Find data apps using specific code part ..." + → patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], + mode="regex"], scopes=["script"] **Input JSON Schema**: @@ -2559,7 +2454,7 @@ USAGE EXAMPLES: { "properties": { "patterns": { - "description": "One or more search patterns to match against item ID, name, display name, or description. Supports regex patterns. Case-insensitive. Examples: [\"customer\"], [\"sales\", \"revenue\"], [\"test.*table\"]. Do not use empty strings or empty lists.", + "description": "One or more search patterns to match against item ID, name, display name, or description. Supports regex patterns. Case-insensitive by default. Examples: [\"customer\"], [\"sales\", \"revenue\"], [\"test.*table\"], [\"key1.*:.*key2.*:.*value.*\"]. Do not use empty strings or empty lists.", "items": { "type": "string" }, @@ -2567,15 +2462,17 @@ USAGE EXAMPLES: }, "item_types": { "default": [], - "description": "Optional filter for specific Keboola item types. Leave empty to search all types. Common values: \"table\" (data tables), \"bucket\" (table containers), \"transformation\" (SQL/Python transformations), \"configuration\" (extractor/writer configs), \"flow\" (orchestration flows). 
Use when you know what type of item you're looking for.", + "description": "Filter for specific Keboola item types. Common values: \"table\" (data tables), \"bucket\" (table containers), \"transformation\" (SQL/Python transformations), \"component\" (extractor/writer/application components), \"data-app\" (data apps), \"flow\" (orchestration flows). Use when you know what type of item you're looking for or leave empty to search all types.", "items": { "enum": [ - "flow", "bucket", "table", + "data-app", + "flow", "transformation", "configuration", "configuration-row", + "component", "workspace", "shared-code", "rows", @@ -2585,6 +2482,37 @@ USAGE EXAMPLES: }, "type": "array" }, + "search_type": { + "default": "textual", + "description": "Search mode: \"textual\" (name/id/description) or \"config-based\" (stringified configuration payloads).", + "enum": [ + "textual", + "config-based" + ], + "type": "string" + }, + "scopes": { + "default": [], + "description": "Dot-separated keys to search in configuration payloads, used with \"config-based\" search. Example: \"parameters.field\", \"storage.input\", \"storage.output\", \"processors.before\", \"processors.after\", \"authorization\", \"tasks\", \"phases\". 
Leave empty to search the whole configuration.", + "items": { + "type": "string" + }, + "type": "array" + }, + "mode": { + "default": "literal", + "description": "How to interpret patterns: \"regex\" for regular expressions or \"literal\" for exact text (default: \"literal\").", + "enum": [ + "regex", + "literal" + ], + "type": "string" + }, + "case_sensitive": { + "default": false, + "description": "If true, match patterns with case sensitivity (default: false).", + "type": "boolean" + }, "limit": { "default": 50, "description": "Maximum number of items to return (default: 50, max: 100).", diff --git a/src/keboola_mcp_server/server.py b/src/keboola_mcp_server/server.py index 6fae8ac9..c7090624 100644 --- a/src/keboola_mcp_server/server.py +++ b/src/keboola_mcp_server/server.py @@ -34,7 +34,6 @@ from keboola_mcp_server.tools.search.tools import add_search_tools from keboola_mcp_server.tools.sql import add_sql_tools from keboola_mcp_server.tools.storage import add_storage_tools -from keboola_mcp_server.tools.usage import add_usage_tools LOG = logging.getLogger(__name__) @@ -239,7 +238,6 @@ def create_server( add_sql_tools(mcp) add_storage_tools(mcp) add_keboola_prompts(mcp) - add_usage_tools(mcp) if custom_routes_handling != 'return': return mcp diff --git a/tests/test_server.py b/tests/test_server.py index ee2d4973..cacdceb0 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -60,7 +60,6 @@ async def test_list_tools(self): 'query_data', 'run_job', 'search', - 'search_keboola_objects', 'update_config', 'update_config_row', 'update_descriptions', From dc1da3ea81d556043345825dcecaecec829f793b Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 12 Jan 2026 18:50:05 +0100 Subject: [PATCH 06/29] AI-2161 fix: return extractor configs as configuration items --- src/keboola_mcp_server/tools/search/tools.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/keboola_mcp_server/tools/search/tools.py 
b/src/keboola_mcp_server/tools/search/tools.py index 638d6d05..fb8385d1 100644 --- a/src/keboola_mcp_server/tools/search/tools.py +++ b/src/keboola_mcp_server/tools/search/tools.py @@ -38,7 +38,6 @@ 'transformation', 'configuration', 'configuration-row', - 'component', 'workspace', 'shared-code', 'rows', @@ -49,7 +48,6 @@ 'data-app': ['other'], 'flow': ['other'], 'transformation': ['transformation'], - 'component': ['extractor', 'writer', 'application'], 'configuration': ['extractor', 'writer', 'application'], 'configuration-row': ['extractor', 'writer', 'application'], 'workspace': ['other'], @@ -361,11 +359,13 @@ async def _fetch_configs( ) -> AsyncGenerator[SearchHit, None]: components = await client.storage_client.component_list(component_type, include=['configuration', 'rows']) - allowed_transformations = 'transformation' in spec.item_types - allowed_components = 'component' in spec.item_types - allowed_flows = 'flow' in spec.item_types - allowed_workspaces = 'workspace' in spec.item_types - allowed_data_apps = 'data-app' in spec.item_types + allowed_transformations = 'transformation' in spec.item_types or component_type is None + allowed_components = ( + 'configuration' in spec.item_types or 'configuration-row' in spec.item_types or component_type is None + ) + allowed_flows = 'flow' in spec.item_types or component_type is None + allowed_workspaces = 'workspace' in spec.item_types or component_type is None + allowed_data_apps = 'data-app' in spec.item_types or component_type is None for component in components: if not (component_id := component.get('id')): @@ -389,7 +389,7 @@ async def _fetch_configs( if not allowed_data_apps: continue elif current_component_type in ['extractor', 'writer', 'application']: - item_type = 'component' + item_type = 'configuration' if not allowed_components: continue else: @@ -624,7 +624,6 @@ async def search( tasks.append(_fetch_configurations(client, cfg)) elif types_to_fetch & { 'configuration', - 'component', 'transformation', 
'flow', 'configuration-row', From 4ef683b517dbab549b147b651fe50699003d8b87 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 12 Jan 2026 18:50:09 +0100 Subject: [PATCH 07/29] AI-2161 fix: include configurations in table usage search --- src/keboola_mcp_server/tools/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/keboola_mcp_server/tools/storage.py b/src/keboola_mcp_server/tools/storage.py index 0707bdbe..5be81202 100644 --- a/src/keboola_mcp_server/tools/storage.py +++ b/src/keboola_mcp_server/tools/storage.py @@ -619,7 +619,7 @@ async def _fetch_table_detail(_table_id: str) -> TableDetail | str: # Add the component usage to the table detail if include_usage: usage_by_ids = await find_id_usage( - client, table_ids, ['component', 'transformation'], ['storage.input', 'storage.output'] + client, table_ids, ['configuration', 'configuration-row', 'transformation'], ['storage.input', 'storage.output'] ) for usage_by_id in usage_by_ids: if usage_by_id.target_id in tables_by_id and usage_by_id.usage_references: From da22a0929a22642212e65c61d8a41ef3b021e086 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 12 Jan 2026 18:50:14 +0100 Subject: [PATCH 08/29] AI-2161 test: add config-based search integration test --- integtests/tools/test_search.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/integtests/tools/test_search.py b/integtests/tools/test_search.py index f7342ac6..26187cf0 100644 --- a/integtests/tools/test_search.py +++ b/integtests/tools/test_search.py @@ -103,3 +103,32 @@ async def test_find_component_id(mcp_client: Client): assert full_result.content[0].type == 'text' decoded_toon = toon_format.decode(full_result.content[0].text) assert decoded_toon == result + + +@pytest.mark.asyncio +async def test_search_config_based_simple_query( + mcp_client: Client, + configs: list[ConfigDef], +) -> None: + """ + Test config-based search with a simple scoped query. 
+ """ + config = next(cfg for cfg in configs if cfg.component_id == 'ex-generic-v2') + full_result = await mcp_client.call_tool( + 'search', + { + 'patterns': ['wttr.in'], + 'item_types': ['configuration'], + 'search_type': 'config-based', + 'scopes': ['parameters.api.baseUrl'], + 'limit': 20, + 'offset': 0, + }, + ) + + assert full_result.structured_content is not None + result = [SearchHit.model_validate(hit) for hit in full_result.structured_content['result']] + + assert any( + hit.component_id == 'ex-generic-v2' and hit.configuration_id == config.configuration_id for hit in result + ), f'Expected config {config.configuration_id} to be returned. Found: {result}' From cbe58d72b2e4f54504ecc4c7b9af2976be48c41e Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 12 Jan 2026 18:52:21 +0100 Subject: [PATCH 09/29] AI-2161 style: apply tox --- TOOLS.md | 1 - src/keboola_mcp_server/tools/storage.py | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/TOOLS.md b/TOOLS.md index c6f39486..3e935a5d 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -2472,7 +2472,6 @@ USAGE EXAMPLES: "transformation", "configuration", "configuration-row", - "component", "workspace", "shared-code", "rows", diff --git a/src/keboola_mcp_server/tools/storage.py b/src/keboola_mcp_server/tools/storage.py index 5be81202..1ff3f65e 100644 --- a/src/keboola_mcp_server/tools/storage.py +++ b/src/keboola_mcp_server/tools/storage.py @@ -619,7 +619,10 @@ async def _fetch_table_detail(_table_id: str) -> TableDetail | str: # Add the component usage to the table detail if include_usage: usage_by_ids = await find_id_usage( - client, table_ids, ['configuration', 'configuration-row', 'transformation'], ['storage.input', 'storage.output'] + client, + table_ids, + ['configuration', 'configuration-row', 'transformation'], + ['storage.input', 'storage.output'], ) for usage_by_id in usage_by_ids: if usage_by_id.target_id in tables_by_id and usage_by_id.usage_references: From 
e10c074079919e0a95b164a7238d4c156a1cafbc Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 21 Jan 2026 15:14:00 +0100 Subject: [PATCH 10/29] AI-2161 chore: update version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 07873846..259efcc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "keboola-mcp-server" -version = "1.39.0" +version = "1.40.0" description = "MCP server for interacting with Keboola Connection" readme = "README.md" requires-python = ">=3.10" From e421ed804033847f69222991989b3d6adff99a60 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 21 Jan 2026 15:35:34 +0100 Subject: [PATCH 11/29] AI-2161 refactor: add component to search type --- TOOLS.md | 6 ++++-- src/keboola_mcp_server/tools/search.py | 13 +++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/TOOLS.md b/TOOLS.md index 40192983..f95a890b 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -2550,11 +2550,12 @@ USAGE EXAMPLES: **Description**: Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc.) in the current -project. +project. Returns matching items with IDs and metadata. Supports two modes: - textual: match patterns against ID, name, display name, description (and table columns) - config-based: match patterns against stringified configuration payloads, optionally limited to specific scopes -Returns matching items with IDs and metadata. +which can be derived from the configuration schemas or objects. 
+ WHEN TO USE: - User asks to "find", "locate", or "search for" something by name or text @@ -2646,6 +2647,7 @@ USAGE EXAMPLES: "data-app", "flow", "transformation", + "component", "configuration", "configuration-row", "workspace", diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index 0d9069b5..c4237572 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -36,6 +36,7 @@ 'data-app', 'flow', 'transformation', + 'component', 'configuration', 'configuration-row', 'workspace', @@ -48,6 +49,7 @@ SearchComponentItemType = Literal[ 'flow', 'transformation', + 'component', 'configuration', 'configuration-row', 'workspace', @@ -60,6 +62,7 @@ 'transformation': ['transformation'], 'configuration': ['extractor', 'writer', 'application'], 'configuration-row': ['extractor', 'writer', 'application'], + 'component': ['extractor', 'writer', 'application'], 'workspace': ['other'], } @@ -189,6 +192,11 @@ def _validate_component_args(self) -> 'SearchSpec': ) return self + @model_validator(mode='after') + def _validate_item_types(self) -> 'SearchSpec': + if 'component' in self.item_types: + self.item_types = list(set(self.item_types + ['configuration', 'configuration-row'])) + @staticmethod def _stringify(value: JsonDict) -> str: try: @@ -508,11 +516,12 @@ async def search( ) -> list[SearchHit]: """ Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc.) in the current - project. + project. Returns matching items with IDs and metadata. Supports two modes: - textual: match patterns against ID, name, display name, description (and table columns) - config-based: match patterns against stringified configuration payloads, optionally limited to specific scopes - Returns matching items with IDs and metadata. + which can be derived from the configuration schemas or objects. 
+ WHEN TO USE: - User asks to "find", "locate", or "search for" something by name or text From 5ca04b55875f620ab308f0b57c797f7b94f8aa88 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Thu, 22 Jan 2026 12:43:32 +0100 Subject: [PATCH 12/29] AI-2161 fix: update import --- integtests/test_errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integtests/test_errors.py b/integtests/test_errors.py index d366f41a..b22fa19c 100644 --- a/integtests/test_errors.py +++ b/integtests/test_errors.py @@ -17,7 +17,7 @@ from keboola_mcp_server.tools.doc import docs_query from keboola_mcp_server.tools.jobs import get_jobs from keboola_mcp_server.tools.sql import query_data -from keboola_mcp_server.tools.storage import GetBucketsOutput, get_buckets +from keboola_mcp_server.tools.storage.tools import GetBucketsOutput, get_buckets class TestHttpErrors: From c1fc287f77e4fa7819614a34dd4959ac5c35a70f Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 26 Jan 2026 12:57:05 +0100 Subject: [PATCH 13/29] AI-2161 chore: update version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 259efcc7..b2a74a52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "keboola-mcp-server" -version = "1.40.0" +version = "1.41.0" description = "MCP server for interacting with Keboola Connection" readme = "README.md" requires-python = ">=3.10" From 05b868effa277058d3f63a118ecce2f8e933bd0c Mon Sep 17 00:00:00 2001 From: Vita Stejskal Date: Fri, 6 Feb 2026 12:06:33 +0100 Subject: [PATCH 14/29] AI-2161 fix: missing return in validator, doc defaults, typos, and None filtering Co-Authored-By: Claude Opus 4.6 --- TOOLS.md | 4 ++-- src/keboola_mcp_server/tools/search.py | 31 +++++++++++++------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/TOOLS.md b/TOOLS.md index f640ec0b..315200a1 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -2565,12 
+2565,12 @@ WHEN TO USE: - Use this tool to trace lineage by searching for IDs referenced in configurations, or to find flows using a specific component, or find usage of a bucket/table in transformations, or to find items with specific parameters. - You need to discover items before performing operations on them -- User assks to "what is the genesis of this item?" or "explain me bussiness logic of this item?" +- User asks to "what is the genesis of this item?" or "explain me business logic of this item?" - User asks to "list all items with [name] or [configuration value/part] in it" - DO NOT use for listing all items of a specific type. Use get_configs, list_tables, get_flows, etc instead. HOW IT WORKS: -- mode: "regex" (default) or "literal" (escape special characters) +- mode: "literal" (default) or "regex" (regular expressions) - case_sensitive: false by default; set true for exact casing - search_type: - "textual": matches id/name/display_name/description fields diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index c4237572..8bd5144c 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -2,7 +2,7 @@ import json import logging import re -from typing import Annotated, Any, AsyncGenerator, Literal, Mapping, Sequence +from typing import Annotated, Any, AsyncGenerator, Iterable, Literal, Mapping, Sequence from fastmcp import Context, FastMCP from fastmcp.tools import FunctionTool @@ -195,7 +195,8 @@ def _validate_component_args(self) -> 'SearchSpec': @model_validator(mode='after') def _validate_item_types(self) -> 'SearchSpec': if 'component' in self.item_types: - self.item_types = list(set(self.item_types + ['configuration', 'configuration-row'])) + self.item_types = list({*self.item_types, 'configuration', 'configuration-row'}) + return self @staticmethod def _stringify(value: JsonDict) -> str: @@ -246,7 +247,7 @@ def match_configuration_scopes(self, configuration: JsonDict 
| None) -> list[Pat return [PatternMatch(scope=None, patterns=matched)] return [] - def match_texts(self, texts: Sequence[str]) -> list[PatternMatch]: + def match_texts(self, texts: Iterable[str]) -> list[PatternMatch]: """ Matches a sequence of strings against the patterns. @@ -278,7 +279,7 @@ def _check_column_match(table: JsonDict, cfg: SearchSpec) -> list[PatternMatch]: if col_metadata := table.get('columnMetadata', {}): col_descs = (get_metadata_property(col_meta, MetadataField.DESCRIPTION) for col_meta in col_metadata.values()) - if matched := cfg.match_texts(col_descs): + if matched := cfg.match_texts(filter(None, col_descs)): return matched return [] @@ -324,9 +325,9 @@ async def _fetch_tables(client: KeboolaClient, spec: SearchSpec) -> list[SearchH table_display_name = table.get('displayName') table_description = get_metadata_property(table.get('metadata', []), MetadataField.DESCRIPTION) - if matches := spec.match_texts( - [table_id, table_name, table_display_name, table_description] - ) or _check_column_match(table, spec): + matches = spec.match_texts([table_id, table_name, table_display_name, table_description]) + matches.extend(_check_column_match(table, spec)) + if matches: hits.append( SearchHit( table_id=table_id, @@ -375,27 +376,27 @@ async def _fetch_configs( current_component_type = component.get('type') if component_id in [ORCHESTRATOR_COMPONENT_ID, CONDITIONAL_FLOW_COMPONENT_ID]: - item_type = 'flow' + item_type: SearchItemType = 'flow' if not allowed_flows: continue elif current_component_type == 'transformation': - item_type = 'transformation' + item_type: SearchItemType = 'transformation' if not allowed_transformations: continue elif component_id == 'keboola.sandboxes': - item_type = 'workspace' + item_type: SearchItemType = 'workspace' if not allowed_workspaces: continue elif component_id == DATA_APP_COMPONENT_ID: - item_type = 'data-app' + item_type: SearchItemType = 'data-app' if not allowed_data_apps: continue elif current_component_type 
in ['extractor', 'writer', 'application']: - item_type = 'configuration' + item_type: SearchItemType = 'configuration' if not allowed_components: continue else: - item_type = 'configuration' + item_type: SearchItemType = 'configuration' for config in component.get('configurations', []): if not (config_id := config.get('id')): @@ -531,12 +532,12 @@ async def search( - Use this tool to trace lineage by searching for IDs referenced in configurations, or to find flows using a specific component, or find usage of a bucket/table in transformations, or to find items with specific parameters. - You need to discover items before performing operations on them - - User assks to "what is the genesis of this item?" or "explain me bussiness logic of this item?" + - User asks to "what is the genesis of this item?" or "explain me business logic of this item?" - User asks to "list all items with [name] or [configuration value/part] in it" - DO NOT use for listing all items of a specific type. Use get_configs, list_tables, get_flows, etc instead. 
HOW IT WORKS: - - mode: "regex" (default) or "literal" (escape special characters) + - mode: "literal" (default) or "regex" (regular expressions) - case_sensitive: false by default; set true for exact casing - search_type: - "textual": matches id/name/display_name/description fields From 6de7ea405ad22e53885680c87d1f9385b9423f49 Mon Sep 17 00:00:00 2001 From: Vita Stejskal Date: Sun, 8 Feb 2026 20:40:12 +0100 Subject: [PATCH 15/29] AI-2161: fix search tool's docstring --- TOOLS.md | 31 +++++++++++++------------- src/keboola_mcp_server/tools/search.py | 31 +++++++++++++------------- 2 files changed, 30 insertions(+), 32 deletions(-) diff --git a/TOOLS.md b/TOOLS.md index 7fb36445..71935805 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -2628,40 +2628,39 @@ expensive USAGE EXAMPLES: - user_input: "Find all tables with 'customer' in the name" - → patterns=["customer"], search_type="textual", mode="literal", item_types=["table"] + → `patterns=["customer"], search_type="textual", mode="literal", item_types=["table"]` - user_input: "Find tables with 'email' column" - → patterns=["email"], search_type="textual", mode="literal", item_types=["table"] + → `patterns=["email"], search_type="textual", mode="literal", item_types=["table"]` - user_input: "Search for the sales transformation" - → patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"] - → Returns transformations with "sales" in any searchable field + → `patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"]` - user_input: "Find items named 'daily report' or 'weekly summary'" - → patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[] + → `patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[]` - user_input: "Show me all configurations/components related to Google Analytics" - → patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"] + 
→ `patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"]` - user_input: "Find storage input mappings referencing specific tables:" - → patterns=[""storage"\.*"input"\.*:\s*"in\.*\.customers""], search_type="config-based", mode="regex", - item_types=["transformation", "component"] + → `patterns=["\"storage\".*\"input\".*:\s*\"in\..*\.customers\""], search_type="config-based", + mode="regex", item_types=["transformation", "component"]` - user input: "Find components or transformations using 'my_bucket' in output mappings" - → patterns=["my_bucket"], item_types=["component", "transformation"], search_type="config-based", - scopes=["storage.output"], mode="literal" + → `patterns=["my_bucket"], item_types=["component", "transformation"], search_type="config-based", + scopes=["storage.output"], mode="literal"` - user input: "Find configs with specific authentication type" - → patterns=[""authentication":\s*\{.*"type":\s*"oauth20""], search_type="config-based", mode="regex", - item_types=["component"] + → `patterns=["\"authentication\":\s*\{.*\"type\":\s*\"oauth20\""], search_type="config-based", + mode="regex", item_types=["component"]` - user input: "Find flows using this configuration ID: 01k9cz233cvd1rga3zzx40g8qj" - → patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", - scopes=["tasks"] + → `patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", + scopes=["tasks"]` - user input: "Find data apps using specific code part ..." 
- → patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], - mode="regex"], scopes=["script"] + → `patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], + mode="regex"], scopes=["script"]` **Input JSON Schema**: diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index e33738e2..42ae7714 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -557,40 +557,39 @@ async def search( USAGE EXAMPLES: - user_input: "Find all tables with 'customer' in the name" - → patterns=["customer"], search_type="textual", mode="literal", item_types=["table"] + → `patterns=["customer"], search_type="textual", mode="literal", item_types=["table"]` - user_input: "Find tables with 'email' column" - → patterns=["email"], search_type="textual", mode="literal", item_types=["table"] + → `patterns=["email"], search_type="textual", mode="literal", item_types=["table"]` - user_input: "Search for the sales transformation" - → patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"] - → Returns transformations with "sales" in any searchable field + → `patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"]` - user_input: "Find items named 'daily report' or 'weekly summary'" - → patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[] + → `patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[]` - user_input: "Show me all configurations/components related to Google Analytics" - → patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"] + → `patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"]` - user_input: "Find storage input mappings referencing specific tables:" - → 
patterns=["\"storage\"\\.*\"input\"\\.*:\\s*\"in\\.*\\.customers\""], search_type="config-based", mode="regex", - item_types=["transformation", "component"] + → `patterns=["\\"storage\\".*\\"input\\".*:\\s*\\"in\\..*\\.customers\\""], search_type="config-based", + mode="regex", item_types=["transformation", "component"]` - user input: "Find components or transformations using 'my_bucket' in output mappings" - → patterns=["my_bucket"], item_types=["component", "transformation"], search_type="config-based", - scopes=["storage.output"], mode="literal" + → `patterns=["my_bucket"], item_types=["component", "transformation"], search_type="config-based", + scopes=["storage.output"], mode="literal"` - user input: "Find configs with specific authentication type" - → patterns=["\"authentication\":\\s*\\{.*\"type\":\\s*\"oauth20\""], search_type="config-based", mode="regex", - item_types=["component"] + → `patterns=["\\"authentication\\":\\s*\\{.*\\"type\\":\\s*\\"oauth20\\""], search_type="config-based", + mode="regex", item_types=["component"]` - user input: "Find flows using this configuration ID: 01k9cz233cvd1rga3zzx40g8qj" - → patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", - scopes=["tasks"] + → `patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", + scopes=["tasks"]` - user input: "Find data apps using specific code part ..." 
- → patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], - mode="regex"], scopes=["script"] + → `patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], + mode="regex"], scopes=["script"]` """ spec = SearchSpec( From 0e6686e9614359390a703f876f890c52de6d8b0c Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 18 Feb 2026 06:31:24 +0100 Subject: [PATCH 16/29] AI-2161 feat: simplify search API and improve config scope matching --- src/keboola_mcp_server/tools/search.py | 269 ++++++++++++++++++------- 1 file changed, 194 insertions(+), 75 deletions(-) diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index 42ae7714..90a430f1 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -4,8 +4,10 @@ import re from typing import Annotated, Any, AsyncGenerator, Iterable, Literal, Mapping, Sequence +import jsonpath_ng from fastmcp import Context, FastMCP from fastmcp.tools import FunctionTool +from jsonpath_ng.jsonpath import JSONPath from mcp.types import ToolAnnotations from pydantic import BaseModel, Field, PrivateAttr, model_validator @@ -114,6 +116,11 @@ class SearchHit(BaseModel): name: str | None = Field(default=None, description='Name of the item.') display_name: str | None = Field(default=None, description='Display name of the item.') description: str | None = Field(default=None, description='Description of the item.') + match_scopes: list[str] = Field( + default_factory=list, + description='Most specific JSONPath scopes within the configuration where a pattern was matched ' + '(config-based search only).', + ) links: list[Link] = Field(default_factory=list, description='Links to the item.') _matches: list[PatternMatch] = PrivateAttr(default_factory=list) @@ -148,6 +155,15 @@ def check_id_fields(self) -> 'SearchHit': def with_matches(self, matches: list['PatternMatch']) -> 'SearchHit': 
"""Assign pattern matches to this search hit and return self for chaining.""" self._matches = matches + unique_scopes = list(dict.fromkeys(match.scope for match in matches if match.scope)) + self.match_scopes = [ + scope + for scope in unique_scopes + if not any( + other != scope and other.startswith(scope) and other[len(scope) : len(scope) + 1] in {'.', '['} + for other in unique_scopes + ) + ] return self @@ -163,6 +179,8 @@ class SearchSpec(BaseModel): _component_types: Sequence[str] = PrivateAttr(default_factory=tuple) _compiled_patterns: list[re.Pattern] = PrivateAttr(default_factory=list) _clean_patterns: list[str] = PrivateAttr(default_factory=list) + _all_nodes_expr: JSONPath | None = PrivateAttr(default=None) + _scope_exprs: list[tuple[str, JSONPath, JSONPath]] = PrivateAttr(default_factory=list) @model_validator(mode='after') def _compile_patterns(self) -> 'SearchSpec': @@ -198,8 +216,21 @@ def _validate_item_types(self) -> 'SearchSpec': self.item_types = list({*self.item_types, 'configuration', 'configuration-row'}) return self + @model_validator(mode='after') + def _compile_jsonpath_exprs(self) -> 'SearchSpec': + # Compile commonly used expressions once per SearchSpec instance. 
+ self._all_nodes_expr = jsonpath_ng.parse('$..*') + self._scope_exprs = [] + for scope in self.search_scopes: + normalized = scope if scope.startswith('$') else f'$.{scope}' + try: + self._scope_exprs.append((scope, jsonpath_ng.parse(normalized), jsonpath_ng.parse(f'{normalized}..*'))) + except Exception as e: + LOG.warning(f'Invalid JSONPath scope "{scope}": {e}') + return self + @staticmethod - def _stringify(value: JsonDict) -> str: + def _stringify(value: Any) -> str: try: return json.dumps(value, sort_keys=True, default=str, ensure_ascii=False) except (TypeError, ValueError): @@ -227,25 +258,76 @@ def match_patterns(self, value: str | JsonDict | None) -> list[str]: return matches + def _find_matches_for_expr(self, configuration: JsonDict, parsed_expr: JSONPath) -> list[PatternMatch]: + """Find pattern matches on JSON nodes matched by a JSONPath expression.""" + matches: list[PatternMatch] = [] + for jpath_match in parsed_expr.find(configuration): + value = jpath_match.value + if matched := self.match_patterns(value): + matches.append( + PatternMatch( + scope=re.sub(r'\.\[', '[', str(jpath_match.full_path)), + patterns=matched, + ) + ) + if not self.return_all_matched_patterns: + return matches + return matches + + def _find_scalar_matches_for_expr(self, configuration: JsonDict, parsed_expr: JSONPath) -> list[PatternMatch]: + """Find pattern matches only on scalar nodes matched by a JSONPath expression.""" + matches: list[PatternMatch] = [] + for jpath_match in parsed_expr.find(configuration): + value = jpath_match.value + if value is None or isinstance(value, (dict, list)): + continue + if matched := self.match_patterns(value): + matches.append( + PatternMatch( + scope=re.sub(r'\.\[', '[', str(jpath_match.full_path)), + patterns=matched, + ) + ) + if not self.return_all_matched_patterns: + return matches + return matches + def match_configuration_scopes(self, configuration: JsonDict | None) -> list[PatternMatch]: """ - Checks configuration fields within 
specified scopes for pattern matches. + Checks configuration fields within specified JSONPath scopes for pattern matches. + Walks matching nodes within each scope and returns the exact path where the match + was found. When no scopes are specified, walks the entire configuration. :param configuration: The configuration to match against the patterns. - :return: A tuple of scopes and patterns that matched the configuration; empty patterns if no matches. + :return: List of PatternMatch with matching JSONPath scopes; empty list if no matches. """ + if configuration is None: + return [] + if self.search_scopes: - matches: list[PatternMatch] = [] - for scope in self.search_scopes: - if matched := self.match_patterns(get_nested(configuration, scope, default=None)): - matches.append(PatternMatch(scope=scope, patterns=matched)) + all_matches: list[PatternMatch] = [] + # Deduplicate hits when scopes overlap (e.g. "parameters" + "parameters.query") + # or the same logical scope is provided multiple times. + seen: set[str | None] = set() + for _scope, self_expr, desc_expr in self._scope_exprs: + # Include self scope only for scalar values. For objects/lists, include descendants only. + self_matches = self._find_scalar_matches_for_expr(configuration, self_expr) + desc_matches = self._find_matches_for_expr(configuration, desc_expr) + + scope_matches = desc_matches if desc_matches else self_matches + for match in scope_matches: + if match.scope in seen: + continue + seen.add(match.scope) + all_matches.append(match) if not self.return_all_matched_patterns: - break - return matches + return all_matches + return all_matches - if matched := self.match_patterns(configuration): - return [PatternMatch(scope=None, patterns=matched)] - return [] + # No scope provided – search all descendants and return exact match paths. 
+ if self._all_nodes_expr is None: + self._all_nodes_expr = jsonpath_ng.parse('$..*') + return self._find_matches_for_expr(configuration, self._all_nodes_expr) def match_texts(self, texts: Iterable[str]) -> list[PatternMatch]: """ @@ -465,9 +547,10 @@ async def search( patterns: Annotated[ list[str], Field( - description='One or more search patterns to match against item ID, name, display name, or description. ' - 'Supports regex patterns. Case-insensitive by default. Examples: ["customer"], ["sales", "revenue"], ' - '["test.*table"], ["key1.*:.*key2.*:.*value.*"]. Do not use empty strings or empty lists.' + description='One or more search patterns to match against item ID, name, display name, description, ' + 'or configuration JSON objects. Case-insensitive by default. ' + 'Examples: ["customer"], ["sales", "revenue"], ["my_bucket"]. ' + 'Do not use empty strings or empty lists.' ), ], item_types: Annotated[ @@ -484,15 +567,16 @@ async def search( SearchType, Field( description='Search mode: "textual" (name/id/description) or "config-based" (stringified configuration ' - 'payloads).' + 'payloads). (default: "textual")' ), ] = 'textual', scopes: Annotated[ Sequence[str], Field( - description='Dot-separated keys to search in configuration payloads, used with "config-based" search. ' - 'Example: "parameters.field", "storage.input", "storage.output", "processors.before", "processors.after", ' - '"authorization", "tasks", "phases". Leave empty to search the whole configuration.' + description='JSONPath expressions to narrow config-based search to specific parts of the configuration. ' + 'Simple dot-notation (e.g. "parameters", "storage.input") and full JSONPath (e.g. "$.tasks[*]") are both ' + 'supported (e.g. "parameters.host", "storage.input[0].source"). ' + 'Leave empty to search the whole configuration.' ), ] = tuple(), mode: Annotated[ @@ -502,10 +586,6 @@ async def search( '(default: "literal").' 
), ] = 'literal', - case_sensitive: Annotated[ - bool, - Field(description='If true, match patterns with case sensitivity (default: false).'), - ] = False, limit: Annotated[ int, Field( @@ -516,89 +596,128 @@ async def search( offset: Annotated[int, Field(description='Number of matching items to skip for pagination (default: 0).')] = 0, ) -> list[SearchHit]: """ - Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc.) in the current - project. Returns matching items with IDs and metadata. - Supports two modes: - - textual: match patterns against ID, name, display name, description (and table columns) - - config-based: match patterns against stringified configuration payloads, optionally limited to specific scopes - which can be derived from the configuration schemas or objects. + Searches for Keboola items (tables, buckets, components, configurations, transformations, flows, data-apps, etc.) + in the current project and returns matching ID + metadata. + + This tool supports two complementary search types: + + 1) textual + - Searches item metadata fields by matching patterns against id, name, displayName, and description. + - For tables, also searches column names and column descriptions. + 2) config-based + - Searches item configurations (JSON objects) by matching patterns against the configuration values converted + to a string, optionally narrowed by JSON path `scopes`. + - Also returns `match_scopes` with JSON paths in configuration where a pattern was found. + + THIS IS THE PRIMARY DISCOVERY TOOL. Always use it BEFORE any get_* tool when you need to find items + by name or specific configuration content. Do NOT enumerate items with get_buckets, get_tables, get_configs, + get_flows, or get_data_apps just to locate a specific item — use this tool instead.
WHEN TO USE: - - User asks to "find", "locate", or "search for" something by name or text + - User asks to "find", "locate", or "search for" something by name, keyword, text pattern, configuration content or + value - User mentions a partial name and you need to find the full item (e.g., "find the customer table") - User asks "what tables/configs/flows do I have with X in the name?" - - User asks to find configs containing a value in parameters (use config-based + scopes and regex patterns) - - Use this tool to trace lineage by searching for IDs referenced in configurations, or to find flows using a - specific component, or find usage of a bucket/table in transformations, or to find items with specific parameters. - You need to discover items before performing operations on them - - User asks to "what is the genesis of this item?" or "explain me business logic of this item?" - User asks to "list all items with [name] or [configuration value/part] in it" - - DO NOT use for listing all items of a specific type. Use get_configs, list_tables, get_flows, etc instead. + - User asks where a value, table, component, specific configuration ID, or specific settings is used in components, + data-apps, flows, or transformations + - You need to trace lineage by searching for IDs referenced in configurations, or to find flows using a + specific component, or find usage of a bucket/table in transformations or components, or to find items with + specific parameters. + - User asks to "what is the genesis of this item?" or "explain me business logic of this item?" HOW IT WORKS: - - mode: "literal" (default) or "regex" (regular expressions) - - case_sensitive: false by default; set true for exact casing - - search_type: - - "textual": matches id/name/display_name/description fields - - "config-based": matches stringified configuration payloads (JSON) via scopes or the whole config using - regex patterns. 
- - scopes: dot-separated paths (e.g., "parameters", "storage.input", "parameters.script") - - For tables, textual search also checks column names and column descriptions - - Multiple patterns are ORed: any match includes the item - - Results are ordered by update time, newest first, and can be paginated via limit/offset + - Supports two types: + - search_type="textual": matches against id, name, displayName, and description, for tables also column names + and column descriptions + - search_type="config-based": matches inside configuration JSON objects, optionally narrowed by JSON path `scopes` + - case-insensitive search + - mode for pattern search: `literal` (default) or `regex` + - Multiple patterns work as OR condition - matches items containing ANY of the patterns + - Each result includes the item's ID, name, creation date, and relevant metadata + - scopes (config-based) narrow matching to specific JSONPath areas within configurations; matching is performed + against the stringified JSON node content in those areas. + - config-based always returns all matched paths per item in `match_scopes` IMPORTANT: - Always use this tool when the user mentions a name but you don't have the exact ID - - The search returns IDs that you can use with other tools (e.g., get_table, get_configs, get_flows) - - Use item_types to make the search more efficient when you know the type; scanning buckets and tables can be - expensive - - For exact ID lookups, use specific tools like get_table, get_configs, get_flows instead + - The search returns IDs that you can use with other tools (e.g., get_tables, get_configs, get_flows) + - Results are ordered by update time. The most recently updated items are returned first. 
+ - Fill `item_types` to make the search more efficient when you know the item type; scanning buckets and tables can + be expensive + - For exact ID lookups, use specific tools like get_tables, get_configs, get_flows instead + - Use specific `scopes` only when you know the config structure (schema or real example); otherwise run config-based + search without scopes. + - Use find_component_id and get_configs tools to find configurations related to a specific component + - If results are too numerous or empty, ask the user to refine their query rather than enumerating all items. USAGE EXAMPLES: - - user_input: "Find all tables with 'customer' in the name" - → `patterns=["customer"], search_type="textual", mode="literal", item_types=["table"]` + 1) textual search examples: + - user_input: "Find all tables with 'customer' in the name" + → patterns=["customer"], item_types=["table"] + → Returns all tables whose id, name, displayName, or description contains "customer" + + - user_input: "Find tables with 'email' column" + → patterns=["email"], item_types=["table"] + → Returns all tables that have a column named "email" or with "email" in column description + + - user_input: "Search for the sales transformation" + → patterns=["sales"], item_types=["transformation"] + → Returns transformations with "sales" in any searchable field + + - user_input: "Find items named 'daily report' or 'weekly summary'" + → patterns=["daily.*report", "weekly.*summary"], item_types=[], mode="regex" + → Returns all items matching any of these patterns + + - user_input: "Show me all configurations related to Google Analytics" + → patterns=["google.*analytics"], item_types=["configuration"], mode="regex" + → Returns configurations with matching patterns - - user_input: "Find tables with 'email' column" - → `patterns=["email"], search_type="textual", mode="literal", item_types=["table"]` + 2) config-based search examples: + - user_input: "Find transformations/configs/components referencing table 
in.c-prod.customers" + -> patterns=["in.c-prod.customers"], item_types=["transformation", "configuration"], search_type="config-based" + -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths - - user_input: "Search for the sales transformation" - → `patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"]` + - user_input: "Find configurations (etc.) using specific setting / id anywhere" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", - - user_input: "Find items named 'daily report' or 'weekly summary'" - → `patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[]` + - user_input: "Find configurations (etc.) using specific setting /id in parameters" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["parameters"] - - user_input: "Show me all configurations/components related to Google Analytics" - → `patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"]` + - user_input: "Find configurations (etc.) using specific setting / id in storage" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["storage"] - - user_input: "Find storage input mappings referencing specific tables:" - → `patterns=["\\"storage\\".*\\"input\\".*:\\s*\\"in\\..*\\.customers\\""], search_type="config-based", - mode="regex", item_types=["transformation", "component"]` + - user_input: "Find configurations (etc.) 
using specific setting / id in authorization" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", + scopes=["parameters.authorization", "authorization"] - - user input: "Find components or transformations using 'my_bucket' in output mappings" - → `patterns=["my_bucket"], item_types=["component", "transformation"], search_type="config-based", - scopes=["storage.output"], mode="literal"` + - user_input: "Find components/transformations using my_bucket in input or output mappings" + -> patterns=["my_bucket"], item_types=["configuration", "transformation"], search_type="config-based", + scopes=["storage.input", "storage.output"] + -> Returns matches with paths like `storage.input[0].source` or `storage.output[0].target` - - user input: "Find configs with specific authentication type" - → `patterns=["\\"authentication\\":\\s*\\{.*\\"type\\":\\s*\\"oauth20\\""], search_type="config-based", - mode="regex", item_types=["component"]` + - user_input: "Find flows using configuration ID 01k9cz233cvd1rga3zzx40g8qj" + -> patterns=["01k9cz233cvd1rga3zzx40g8qj"], item_types=["flow"], search_type="config-based", + scopes=["tasks", "phases"] - - user input: "Find flows using this configuration ID: 01k9cz233cvd1rga3zzx40g8qj" - → `patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", - scopes=["tasks"]` + - user_input: "Find transformations using this table / column / specific code in its script" + -> patterns=["element"], item_types=["transformation"], search_type="config-based", + scopes=["parameters"] - - user input: "Find data apps using specific code part ..." 
- → `patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], - mode="regex"], scopes=["script"]` + - user_input: "Find data apps using something in its config / python code / setting" + -> patterns=["something"], item_types=["data-app"], search_type="config-based" + -> Returns data apps where script/config sections contain the keyword and includes `match_scopes` """ spec = SearchSpec( patterns=patterns, item_types=item_types, pattern_mode=mode, - case_sensitive=case_sensitive, search_type=search_type, search_scopes=scopes, + return_all_matched_patterns=(search_type == 'config-based'), ) offset = max(0, offset) From ac4e665e53047c2c83a0359361e1c355bc7d657a Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 18 Feb 2026 06:31:26 +0100 Subject: [PATCH 17/29] AI-2161 test: add config-based search scope coverage --- tests/tools/test_search.py | 175 +++++++++++++++++++++++++++++++++++-- 1 file changed, 169 insertions(+), 6 deletions(-) diff --git a/tests/tools/test_search.py b/tests/tools/test_search.py index 1e302e76..7688b1e6 100644 --- a/tests/tools/test_search.py +++ b/tests/tools/test_search.py @@ -694,6 +694,135 @@ async def test_search_table_by_columns( if expected_count > 0: assert result[0].table_id == expected_first_table_id + @pytest.mark.asyncio + @pytest.mark.parametrize( + ( + 'patterns', + 'scopes', + 'component_configurations', + 'expected_hits', + ), + [ + ( + ['alpha', 'beta'], + ('parameters', 'storage.input'), + [ + { + 'id': 'test-config', + 'name': 'Test Config', + 'created': '2024-01-02T00:00:00Z', + 'configuration': { + 'parameters': {'query': 'alpha'}, + 'storage': {'input': [{'source': 'beta'}]}, + }, + 'rows': [], + } + ], + [('test-config', ['parameters.query', 'storage.input[0].source'])], + ), + ( + ['gamma'], + tuple(), + [ + { + 'id': 'test-config', + 'name': 'Test Config', + 'created': '2024-01-02T00:00:00Z', + 'configuration': { + 'parameters': {'query': 'alpha'}, + 'storage': { + 'input': 
[{'source': 'beta'}, {'source': 'gamma'}], + 'output': [{'destination': 'gamma'}], + }, + }, + 'rows': [], + } + ], + [('test-config', ['storage.input[1].source', 'storage.output[0].destination'])], + ), + ( + ['alpha', 'gamma'], + tuple(), + [ + { + 'id': 'test-config-a', + 'name': 'Test Config A', + 'created': '2024-01-02T00:00:00Z', + 'configuration': { + 'parameters': {'query': 'alpha'}, + 'storage': {'input': [{'source': 'beta'}]}, + }, + 'rows': [], + }, + { + 'id': 'test-config-b', + 'name': 'Test Config B', + 'created': '2024-01-03T00:00:00Z', + 'configuration': { + 'storage': {'output': [{'destination': 'gamma'}]}, + }, + 'rows': [], + }, + { + 'id': 'test-config-c', + 'name': 'Test Config C', + 'created': '2024-01-01T00:00:00Z', + 'configuration': { + 'parameters': {'query': 'nomatch'}, + }, + 'rows': [], + }, + ], + [ + ('test-config-b', ['storage.output[0].destination']), + ('test-config-a', ['parameters.query']), + ], + ), + ], + ids=[ + 'all_matches_in_scopes', + 'most_specific_scope_only', + 'multiple_configurations_returned', + ], + ) + async def test_search_config_based_match_scopes( + self, + mocker: MockerFixture, + mcp_context_client: Context, + patterns: list[str], + scopes: tuple[str, ...], + component_configurations: list[dict[str, Any]], + expected_hits: list[tuple[str, list[str]]], + ): + keboola_client = KeboolaClient.from_state(mcp_context_client.session.state) + + keboola_client.storage_client.bucket_list = mocker.AsyncMock(return_value=[]) + keboola_client.storage_client.bucket_table_list = mocker.AsyncMock(return_value=[]) + keboola_client.storage_client.component_list = mocker.AsyncMock( + side_effect=lambda component_type, include=None: ( + [ + { + 'id': 'keboola.ex-db-mysql', + 'type': 'extractor', + 'configurations': component_configurations, + } + ] + if component_type == 'extractor' + else [] + ) + ) + keboola_client.storage_client.workspace_list = mocker.AsyncMock(return_value=[]) + + result = await search( + 
ctx=mcp_context_client, + patterns=patterns, + item_types=(cast(SearchItemType, 'configuration'),), + search_type='config-based', + scopes=scopes, + ) + + assert [(hit.configuration_id, hit.match_scopes) for hit in result] == expected_hits + @pytest.mark.parametrize( ('spec_kwargs', 'texts', 'expected'), @@ -774,6 +903,7 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li ('spec_kwargs', 'configuration', 'expected'), [ ( + # Scopes provided; each scope has one matching leaf – returns the exact leaf path. { 'patterns': ['alpha', 'beta'], 'item_types': ('configuration',), @@ -785,11 +915,12 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li 'storage': {'input': [{'source': 'beta'}], 'output': [{'destination': 'gamma'}]}, }, [ - {'scope': 'parameters', 'patterns': ['alpha']}, - {'scope': 'storage.input', 'patterns': ['beta']}, + {'scope': 'parameters.query', 'patterns': ['alpha']}, + {'scope': 'storage.input[0].source', 'patterns': ['beta']}, ], ), ( + # Both patterns match across two leaves inside the same scope; each leaf gets its own entry. { 'patterns': ['alpha', 'beta'], 'item_types': ('configuration',), @@ -801,11 +932,13 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li 'storage': {'input': [{'source': 'beta'}, {'source': 'alpha'}], 'output': [{'destination': 'gamma'}]}, }, [ - {'scope': 'parameters', 'patterns': ['alpha']}, - {'scope': 'storage.input', 'patterns': ['alpha', 'beta']}, + {'scope': 'parameters.query', 'patterns': ['alpha']}, + {'scope': 'storage.input[0].source', 'patterns': ['beta']}, + {'scope': 'storage.input[1].source', 'patterns': ['alpha']}, ], ), ( + # Pattern not present in any of the specified scopes → empty result. 
{ 'patterns': ['gamma'], 'item_types': ('configuration',), @@ -819,6 +952,7 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li [], ), ( + # No scopes → walk the whole config; can match parent nodes containing the searched fragment. { 'patterns': ['gamma'], 'item_types': ('configuration',), @@ -828,9 +962,14 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li 'parameters': {'query': 'alpha'}, 'storage': {'input': [{'source': 'beta'}], 'output': [{'destination': 'gamma'}]}, }, - [{'scope': None, 'patterns': ['gamma']}], + [ + {'scope': 'storage', 'patterns': ['gamma']}, + {'scope': 'storage.output', 'patterns': ['gamma']}, + {'scope': 'storage.output[0].destination', 'patterns': ['gamma']}, + ], ), ( + # return_all_matched_patterns=False → stop after first matching leaf. { 'patterns': ['alpha', 'beta'], 'item_types': ('configuration',), @@ -841,7 +980,29 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li 'parameters': {'query': 'alpha'}, 'storage': {'input': [{'source': 'beta'}], 'output': [{'destination': 'gamma'}]}, }, - [{'scope': 'parameters', 'patterns': ['alpha']}], + [{'scope': 'parameters.query', 'patterns': ['alpha']}], + ), + ( + # Overlapping scopes should not return duplicate leaf hits. + { + 'patterns': ['alpha'], + 'item_types': ('configuration',), + 'search_scopes': ('parameters', 'parameters.query'), + 'return_all_matched_patterns': True, + }, + {'parameters': {'query': 'alpha'}}, + [{'scope': 'parameters.query', 'patterns': ['alpha']}], + ), + ( + # Scope pointing directly to scalar should still match (self-scope fallback). 
+ { + 'patterns': ['wttr.in'], + 'item_types': ('configuration',), + 'search_scopes': ('parameters.api.baseUrl',), + 'return_all_matched_patterns': True, + }, + {'parameters': {'api': {'baseUrl': 'https://wttr.in'}}}, + [{'scope': 'parameters.api.baseUrl', 'patterns': ['wttr.in']}], ), ], ids=[ @@ -850,6 +1011,8 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li 'no_patterns_in_scope', 'all_patterns_no_scope', 'any_patterns_return_first_match', + 'overlapping_scopes_deduplicated', + 'scalar_scope_matches_self', ], ) def test_match_configuration_scopes(spec_kwargs: dict[str, Any], configuration: dict[str, Any], expected: list[dict]): From b390a73c55eda7ab16bef3e133dde434516fba2b Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 18 Feb 2026 06:31:31 +0100 Subject: [PATCH 18/29] AI-2161 docs: refresh search tool documentation --- TOOLS.md | 154 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 94 insertions(+), 60 deletions(-) diff --git a/TOOLS.md b/TOOLS.md index 71935805..9f3e2b22 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -53,7 +53,7 @@ including essential context and base instructions for working with it ### Search Tools - [find_component_id](#find_component_id): Returns list of component IDs that match the given query. -- [search](#search): Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc. +- [search](#search): Searches for Keboola items (tables, buckets, components, configurations, transformations, flows, data-apps, etc. ### Storage Tools - [get_buckets](#get_buckets): Lists buckets or retrieves full details of specific buckets, including descriptions, @@ -2587,80 +2587,119 @@ USAGE EXAMPLES: **Description**: -Searches for Keboola items (tables, buckets, configurations, transformations, flows, data-apps etc.) in the current -project. Returns matching items with IDs and metadata. 
-Supports two modes: -- textual: match patterns against ID, name, display name, description (and table columns) -- config-based: match patterns against stringified configuration payloads, optionally limited to specific scopes -which can be derived from the configuration schemas or objects. +Searches for Keboola items (tables, buckets, components, configurations, transformations, flows, data-apps, etc.) +in the current project and returns matching ID + metadata. +This tool supports two complementary search types: + +1) textual +- Searches item metadata fields by matching patterns against id, name, displayName, and description. +- For tables, also searches column names and column descriptions. + +2) config-based +- Searches item configurations (JSON objects) by matching patterns against the configuration values converted +to a string, optionally narrowed by JSON path `scopes`. +- Returns also `match_scopes` with JSON paths in configuration where a pattern was found. + +THIS IS THE PRIMARY DISCOVERY TOOL. Always use it BEFORE any get_* tool when you need to find items +by name or specific configuration content. Do NOT enumerate items with get_buckets, get_tables, get_configs, +get_flows, or get_data_apps just to locate a specific item — use this tool instead. WHEN TO USE: -- User asks to "find", "locate", or "search for" something by name or text +- User asks to "find", "locate", or "search for" something by name, keyword, text pattern, configuration content or +value - User mentions a partial name and you need to find the full item (e.g., "find the customer table") - User asks "what tables/configs/flows do I have with X in the name?" -- User asks to find configs containing a value in parameters (use config-based + scopes and regex patterns) -- Use this tool to trace lineage by searching for IDs referenced in configurations, or to find flows using a -specific component, or find usage of a bucket/table in transformations, or to find items with specific parameters.
- You need to discover items before performing operations on them -- User asks to "what is the genesis of this item?" or "explain me business logic of this item?" - User asks to "list all items with [name] or [configuration value/part] in it" -- DO NOT use for listing all items of a specific type. Use get_configs, list_tables, get_flows, etc instead. +- User asks where a value, table, component, specific configuration ID, or specific settings is used in components, +data-apps, flows, or transformations +- You need to trace lineage by searching for IDs referenced in configurations, or to find flows using a + specific component, or find usage of a bucket/table in transformations or components, or to find items with + specific parameters. +- User asks to "what is the genesis of this item?" or "explain me business logic of this item?" HOW IT WORKS: -- mode: "literal" (default) or "regex" (regular expressions) -- case_sensitive: false by default; set true for exact casing -- search_type: - - "textual": matches id/name/display_name/description fields - - "config-based": matches stringified configuration payloads (JSON) via scopes or the whole config using - regex patterns. 
-- scopes: dot-separated paths (e.g., "parameters", "storage.input", "parameters.script") -- For tables, textual search also checks column names and column descriptions -- Multiple patterns are ORed: any match includes the item -- Results are ordered by update time, newest first, and can be paginated via limit/offset +- Supports two types: + - search_type="textual": matches against id, name, displayName, and description, for tables also column names + and column descriptions + - search_type="config-based": matches inside configuration JSON objects, optionally narrowed by JSON path `scopes` +- case-insensitive search +- mode for pattern search: `literal` (default) or `regex` +- Multiple patterns work as OR condition - matches items containing ANY of the patterns +- Each result includes the item's ID, name, creation date, and relevant metadata +- scopes (config-based) narrow matching to specific JSONPath areas within configurations; matching is performed +against the stringified JSON node content in those areas. +- config-based always returns all matched paths per item in `match_scopes` IMPORTANT: - Always use this tool when the user mentions a name but you don't have the exact ID -- The search returns IDs that you can use with other tools (e.g., get_table, get_configs, get_flows) -- Use item_types to make the search more efficient when you know the type; scanning buckets and tables can be -expensive -- For exact ID lookups, use specific tools like get_table, get_configs, get_flows instead +- The search returns IDs that you can use with other tools (e.g., get_tables, get_configs, get_flows) +- Results are ordered by update time. The most recently updated items are returned first. 
+- Fill `item_types` to make the search more efficient when you know the item type; scanning buckets and tables can +be expensive +- For exact ID lookups, use specific tools like get_tables, get_configs, get_flows instead +- Use specific `scopes` only when you know the config structure (schema or real example); otherwise run config-based +search without scopes. +- Use find_component_id and get_configs tools to find configurations related to a specific component +- If results are too numerous or empty, ask the user to refine their query rather than enumerating all items. USAGE EXAMPLES: -- user_input: "Find all tables with 'customer' in the name" - → `patterns=["customer"], search_type="textual", mode="literal", item_types=["table"]` +1) textual search examples: + - user_input: "Find all tables with 'customer' in the name" + → patterns=["customer"], item_types=["table"] + → Returns all tables whose id, name, displayName, or description contains "customer" + + - user_input: "Find tables with 'email' column" + → patterns=["email"], item_types=["table"] + → Returns all tables that have a column named "email" or with "email" in column description -- user_input: "Find tables with 'email' column" - → `patterns=["email"], search_type="textual", mode="literal", item_types=["table"]` + - user_input: "Search for the sales transformation" + → patterns=["sales"], item_types=["transformation"] + → Returns transformations with "sales" in any searchable field -- user_input: "Search for the sales transformation" - → `patterns=["sales"], search_type="textual", mode="literal", item_types=["transformation"]` + - user_input: "Find items named 'daily report' or 'weekly summary'" + → patterns=["daily.*report", "weekly.*summary"], item_types=[], mode="regex" + → Returns all items matching any of these patterns -- user_input: "Find items named 'daily report' or 'weekly summary'" - → `patterns=["daily.*report", "weekly.*summary"], search_type="textual", mode="regex", item_types=[]` + - 
user_input: "Show me all configurations related to Google Analytics" + → patterns=["google.*analytics"], item_types=["configuration"], mode="regex" + → Returns configurations with matching patterns -- user_input: "Show me all configurations/components related to Google Analytics" - → `patterns=["google.*analytics"], search_type="textual", mode="regex", item_types=["component"]` +2) config-based search examples: + - user_input: "Find transformations/configs/components referencing table in.c-prod.customers" + -> patterns=["in.c-prod.customers"], item_types=["transformation", "configuration"], search_type="config-based" + -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths -- user_input: "Find storage input mappings referencing specific tables:" - → `patterns=["\"storage\".*\"input\".*:\s*\"in\..*\.customers\""], search_type="config-based", - mode="regex", item_types=["transformation", "component"]` + - user_input: "Find configurations (etc.) using specific setting / id anywhere" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based" -- user input: "Find components or transformations using 'my_bucket' in output mappings" - → `patterns=["my_bucket"], item_types=["component", "transformation"], search_type="config-based", - scopes=["storage.output"], mode="literal"` + - user_input: "Find configurations (etc.) using specific setting / id in parameters" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["parameters"] -- user input: "Find configs with specific authentication type" - → `patterns=["\"authentication\":\s*\{.*\"type\":\s*\"oauth20\""], search_type="config-based", - mode="regex", item_types=["component"]` + - user_input: "Find configurations (etc.)
using specific setting / id in storage" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["storage"] -- user input: "Find flows using this configuration ID: 01k9cz233cvd1rga3zzx40g8qj" - → `patterns=["01k9cz233cvd1rga3zzx40g8qj"], search_type="config-based", item_types=["flow"], mode="literal", - scopes=["tasks"]` + - user_input: "Find configurations (etc.) using specific setting / id in authorization" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", + scopes=["parameters.authorization", "authorization"] -- user input: "Find data apps using specific code part ..." - → `patterns=["regex-representing-the-code-part"], search_type="config-based", item_types=["data-app"], - mode="regex"], scopes=["script"]` + - user_input: "Find components/transformations using my_bucket in input or output mappings" + -> patterns=["my_bucket"], item_types=["configuration", "transformation"], search_type="config-based", + scopes=["storage.input", "storage.output"] + -> Returns matches with paths like `storage.input[0].source` or `storage.output[0].target` + + - user_input: "Find flows using configuration ID 01k9cz233cvd1rga3zzx40g8qj" + -> patterns=["01k9cz233cvd1rga3zzx40g8qj"], item_types=["flow"], search_type="config-based", + scopes=["tasks", "phases"] + + - user_input: "Find transformations using this table / column / specific code in its script" + -> patterns=["element"], item_types=["transformation"], search_type="config-based", + scopes=["parameters"] + + - user_input: "Find data apps using something in its config / python code / setting" + -> patterns=["something"], item_types=["data-app"], search_type="config-based" + -> Returns data apps where script/config sections contain the keyword and includes `match_scopes` **Input JSON Schema**: @@ -2668,7 +2707,7 @@ USAGE EXAMPLES: { "properties": { "patterns": { - "description": "One or more search patterns to match against item ID, name, 
display name, or description. Supports regex patterns. Case-insensitive by default. Examples: [\"customer\"], [\"sales\", \"revenue\"], [\"test.*table\"], [\"key1.*:.*key2.*:.*value.*\"]. Do not use empty strings or empty lists.", + "description": "One or more search patterns to match against item ID, name, display name, description, or configuration JSON objects. Case-insensitive by default. Examples: [\"customer\"], [\"sales\", \"revenue\"], [\"my_bucket\"]. Do not use empty strings or empty lists.", "items": { "type": "string" }, @@ -2698,7 +2737,7 @@ USAGE EXAMPLES: }, "search_type": { "default": "textual", - "description": "Search mode: \"textual\" (name/id/description) or \"config-based\" (stringified configuration payloads).", + "description": "Search mode: \"textual\" (name/id/description) or \"config-based\" (stringified configuration payloads). (default: \"textual\")", "enum": [ "textual", "config-based" @@ -2707,7 +2746,7 @@ USAGE EXAMPLES: }, "scopes": { "default": [], - "description": "Dot-separated keys to search in configuration payloads, used with \"config-based\" search. Example: \"parameters.field\", \"storage.input\", \"storage.output\", \"processors.before\", \"processors.after\", \"authorization\", \"tasks\", \"phases\". Leave empty to search the whole configuration.", + "description": "JSONPath expressions to narrow config-based search to specific parts of the configuration. Simple dot-notation (e.g. \"parameters\", \"storage.input\") and full JSONPath (e.g. \"$.tasks[*]\") are both supported (e.g. \"parameters.host\", \"storage.input[0].source\"). 
Leave empty to search the whole configuration.", "items": { "type": "string" }, @@ -2722,11 +2761,6 @@ USAGE EXAMPLES: ], "type": "string" }, - "case_sensitive": { - "default": false, - "description": "If true, match patterns with case sensitivity (default: false).", - "type": "boolean" - }, "limit": { "default": 50, "description": "Maximum number of items to return (default: 50, max: 100).", From 19ba6582738ccd02017af0c466eb1fdd07659394 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 18 Feb 2026 06:31:34 +0100 Subject: [PATCH 19/29] AI-2161 docs: add draft search description --- search_description_draft.txt | 112 +++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 search_description_draft.txt diff --git a/search_description_draft.txt b/search_description_draft.txt new file mode 100644 index 00000000..c43b56e6 --- /dev/null +++ b/search_description_draft.txt @@ -0,0 +1,112 @@ +Searches for Keboola items (tables, buckets, components, configurations, transformations, flows, data-apps, etc.) +in the current project and returns matching ID + metadata. + +This tool supports two complementary search types: + +1) textual +- Searches item metadata fields by matching patterns against id, name, displayName, and description. +- For tables, also searches column names and column descriptions. + +2) config-based +- Searches item configurations (JSON objects) by matching patterns against the configuration values converted +to a string, optionally narrowed by JSON path `scopes`. +- Returns also `match_scopes` with JSON paths in configuration where a pattern was found. + +THIS IS THE PRIMARY DISCOVERY TOOL. Always use it BEFORE any get_* tool when you need to find items +by name or specific configuration content. Do NOT enumerate items with get_buckets, get_tables, get_configs, +get_flows, or get_data_apps just to locate a specific item — use this tool instead.
+ +WHEN TO USE: +- User asks to "find", "locate", or "search for" something by name, keyword, text pattern, configuration content or +value +- User mentions a partial name and you need to find the full item (e.g., "find the customer table") +- User asks "what tables/configs/flows do I have with X in the name?" +- You need to discover items before performing operations on them +- User asks to "list all items with [name] or [configuration value/part] in it" +- User asks where a value, table, component, specific configuration ID, or specific settings is used in components, +data-apps, flows, or transformations +- You need to trace lineage by searching for IDs referenced in configurations, or to find flows using a + specific component, or find usage of a bucket/table in transformations or components, or to find items with + specific parameters. +- User asks to "what is the genesis of this item?" or "explain me business logic of this item?" + +HOW IT WORKS: +- Supports two types: + - search_type="textual": matches against id, name, displayName, and description, for tables also column names + and column descriptions + - search_type="config-based": matches inside configuration JSON objects, optionally narrowed by JSON path `scopes` +- case-insensitive search (default) +- mode for pattern search: `literal` (default) or `regex +- Multiple patterns work as OR condition - matches items containing ANY of the patterns +- Each result includes the item's ID, name, creation date, and relevant metadata +- scopes (config-based) narrow matching to specific JSONPath areas within configurations; matching is performed +against the stringified JSON node content in those areas. + +IMPORTANT: +- Always use this tool when the user mentions a name but you don't have the exact ID +- The search returns IDs that you can use with other tools (e.g., get_tables, get_configs, get_flows) +- Results are ordered by update time. The most recently updated items are returned first. 
+- Fill `item_types` to make the search more efficient when you know the item type; scanning buckets and tables can be +expensive +- For exact ID lookups, use specific tools like get_tables, get_configs, get_flows instead +- Use specific `scopes` only when you know the config structure (schema or real example); otherwise run config-based +search without scopes. +- Use find_component_id and get_configs tools to find configurations related to a specific component +- If results are too numerous or empty, ask the user to refine their query rather than enumerating all items. + +USAGE EXAMPLES: +1) textual search examples: + - user_input: "Find all tables with 'customer' in the name" + → patterns=["customer"], item_types=["table"] + → Returns all tables whose id, name, displayName, or description contains "customer" + + - user_input: "Find tables with 'email' column" + → patterns=["email"], item_types=["table"] + → Returns all tables that have a column named "email" or with "email" in column description + + - user_input: "Search for the sales transformation" + → patterns=["sales"], item_types=["transformation"] + → Returns transformations with "sales" in any searchable field + + - user_input: "Find items named 'daily report' or 'weekly summary'" + → patterns=["daily.*report", "weekly.*summary"], item_types=[], mode="regex" + → Returns all items matching any of these patterns + + - user_input: "Show me all configurations related to Google Analytics" + → patterns=["google.*analytics"], item_types=["configuration"], mode="regex" + → Returns configurations with matching patterns + +2) config-based search examples: + - user_input: "Find transformations/configs/components referencing table in.c-prod.customers" + -> patterns=["in.c-prod.customers"], item_types=["transformation", "configuration"], search_type="config-based" + -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths + + - user_input: "Find configurations (etc.) 
using specific setting / id anywhere" + -> patterns=["setting", "id], item_types=["configuration"], search_type="config-based", + + - user_input: "Find configurations (etc.) using specific setting /id in parameters" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["storage"] + + - user_input: "Find configurations (etc.) using specific setting / id in storage" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["storage"] + + - user_input: "Find configurations (etc.) using specific setting / id in authorization" + -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", + scopes=["parameters.authorization", "authorization"] + + - user_input: "Find components/transformations using my_bucket in input or output mappings" + -> patterns=["my_bucket"], item_types=["configuration", "transformation"], search_type="config-based", + scopes=["storage.input", "storage.output"], return_all_matches=true + -> Returns matches with paths like `storage.input[0].source` or `storage.output[0].target` + + - user_input: "Find flows using configuration ID 01k9cz233cvd1rga3zzx40g8qj" + -> patterns=["01k9cz233cvd1rga3zzx40g8qj"], item_types=["flow"], search_type="config-based", + scopes=["tasks", "phases"], return_all_matches=true + + - user_input: "Find transformations using this table / column / specific code in its script" + -> patterns=["element"], item_types=["transformation"], search_type="config-based", + scopes=["parameters"], return_all_matches=true + + - user_input: "Find data apps using something in its config / python code / setting" + -> patterns=["something"], item_types=["data-app"], search_type="config-based", return_all_matches=true + -> Returns data apps where script/config sections contain the keyword and includes `match_scopes` \ No newline at end of file From f2a489ed305d6578e650683062cb54bcbefd39ed Mon Sep 17 00:00:00 2001 From: mariankrotil 
Date: Wed, 18 Feb 2026 06:31:37 +0100 Subject: [PATCH 20/29] AI-2161 chore: update lockfile package version --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 1f67e055..f84955db 100644 --- a/uv.lock +++ b/uv.lock @@ -1200,7 +1200,7 @@ wheels = [ [[package]] name = "keboola-mcp-server" -version = "1.43.3" +version = "1.44.0" source = { editable = "." } dependencies = [ { name = "cryptography" }, From 6f4e672849123cfabd35f49a83aca1d90283d9c6 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 18 Feb 2026 06:32:02 +0100 Subject: [PATCH 21/29] AI-2161 chore: remove draft search description file --- search_description_draft.txt | 112 ----------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 search_description_draft.txt diff --git a/search_description_draft.txt b/search_description_draft.txt deleted file mode 100644 index c43b56e6..00000000 --- a/search_description_draft.txt +++ /dev/null @@ -1,112 +0,0 @@ -Searches for Keboola items (tables, buckets, components, configurations, transformations, flows, data-apps, etc.) -in the current project and returns matching ID + metadata. - -This tool supports two complementary search types: - -1) textual -- Searches item metadata fields by matching patterns against id, name, displayName, and description. -- For tables, also searches column names and column descriptions. - -2) config-based -- Searches item configurations (JSON objects) by matching patterns against the configuration values ​​converted -to a string, optionally narrowed by JSON path `scopes`. -- Returns also `match_scopes` with JSON paths in configuration where a pattern was found. - -THIS IS THE PRIMARY DISCOVERY TOOL. Always use it BEFORE any get_* tool when you need to find items -by name or specific configuration content. Do NOT enumerate items with get_buckets, get_tables, get_configs, -get_flows, or get_data_apps just to locate a specific item — use this tool instead. 
- -WHEN TO USE: -- User asks to "find", "locate", or "search for" something by name, keyword, text pattern, configuration content or -value -- User mentions a partial name and you need to find the full item (e.g., "find the customer table") -- User asks "what tables/configs/flows do I have with X in the name?" -- You need to discover items before performing operations on them -- User asks to "list all items with [name] or [configuration value/part] in it" -- User asks where a value, table, component, specific configuration ID, or specific settings is used in components, -data-apps, flows, or transformations -- You need to trace lineage by searching for IDs referenced in configurations, or to find flows using a - specific component, or find usage of a bucket/table in transformations or components, or to find items with - specific parameters. -- User asks to "what is the genesis of this item?" or "explain me business logic of this item?" - -HOW IT WORKS: -- Supports two types: - - search_type="textual": matches against id, name, displayName, and description, for tables also column names - and column descriptions - - search_type="config-based": matches inside configuration JSON objects, optionally narrowed by JSON path `scopes` -- case-insensitive search (default) -- mode for pattern search: `literal` (default) or `regex -- Multiple patterns work as OR condition - matches items containing ANY of the patterns -- Each result includes the item's ID, name, creation date, and relevant metadata -- scopes (config-based) narrow matching to specific JSONPath areas within configurations; matching is performed -against the stringified JSON node content in those areas. - -IMPORTANT: -- Always use this tool when the user mentions a name but you don't have the exact ID -- The search returns IDs that you can use with other tools (e.g., get_tables, get_configs, get_flows) -- Results are ordered by update time. The most recently updated items are returned first. 
-- Fill `item_types` to make the search more efficient when you know the item type; scanning buckets and tables can be -expensive -- For exact ID lookups, use specific tools like get_tables, get_configs, get_flows instead -- Use specific `scopes` only when you know the config structure (schema or real example); otherwise run config-based -search without scopes. -- Use find_component_id and get_configs tools to find configurations related to a specific component -- If results are too numerous or empty, ask the user to refine their query rather than enumerating all items. - -USAGE EXAMPLES: -1) textual search examples: - - user_input: "Find all tables with 'customer' in the name" - → patterns=["customer"], item_types=["table"] - → Returns all tables whose id, name, displayName, or description contains "customer" - - - user_input: "Find tables with 'email' column" - → patterns=["email"], item_types=["table"] - → Returns all tables that have a column named "email" or with "email" in column description - - - user_input: "Search for the sales transformation" - → patterns=["sales"], item_types=["transformation"] - → Returns transformations with "sales" in any searchable field - - - user_input: "Find items named 'daily report' or 'weekly summary'" - → patterns=["daily.*report", "weekly.*summary"], item_types=[], mode="regex" - → Returns all items matching any of these patterns - - - user_input: "Show me all configurations related to Google Analytics" - → patterns=["google.*analytics"], item_types=["configuration"], mode="regex" - → Returns configurations with matching patterns - -2) config-based search examples: - - user_input: "Find transformations/configs/components referencing table in.c-prod.customers" - -> patterns=["in.c-prod.customers"], item_types=["transformation", "configuration"], search_type="config-based" - -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths - - - user_input: "Find configurations (etc.) 
using specific setting / id anywhere" - -> patterns=["setting", "id], item_types=["configuration"], search_type="config-based", - - - user_input: "Find configurations (etc.) using specific setting /id in parameters" - -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["storage"] - - - user_input: "Find configurations (etc.) using specific setting / id in storage" - -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", scopes=["storage"] - - - user_input: "Find configurations (etc.) using specific setting / id in authorization" - -> patterns=["setting", "id"], item_types=["configuration"], search_type="config-based", - scopes=["parameters.authorization", "authorization"] - - - user_input: "Find components/transformations using my_bucket in input or output mappings" - -> patterns=["my_bucket"], item_types=["configuration", "transformation"], search_type="config-based", - scopes=["storage.input", "storage.output"], return_all_matches=true - -> Returns matches with paths like `storage.input[0].source` or `storage.output[0].target` - - - user_input: "Find flows using configuration ID 01k9cz233cvd1rga3zzx40g8qj" - -> patterns=["01k9cz233cvd1rga3zzx40g8qj"], item_types=["flow"], search_type="config-based", - scopes=["tasks", "phases"], return_all_matches=true - - - user_input: "Find transformations using this table / column / specific code in its script" - -> patterns=["element"], item_types=["transformation"], search_type="config-based", - scopes=["parameters"], return_all_matches=true - - - user_input: "Find data apps using something in its config / python code / setting" - -> patterns=["something"], item_types=["data-app"], search_type="config-based", return_all_matches=true - -> Returns data apps where script/config sections contain the keyword and includes `match_scopes` \ No newline at end of file From d6240cbf06c8f0d809752d7d6f90bb60a4ba04d7 Mon Sep 17 00:00:00 2001 From: mariankrotil 
Date: Wed, 18 Feb 2026 07:15:36 +0100 Subject: [PATCH 22/29] AI-2161 docs: mention config-based search in project system prompt Co-authored-by: Codex --- .../resources/prompts/project_system_prompt.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/keboola_mcp_server/resources/prompts/project_system_prompt.md b/src/keboola_mcp_server/resources/prompts/project_system_prompt.md index 355f2cdd..fb74c746 100644 --- a/src/keboola_mcp_server/resources/prompts/project_system_prompt.md +++ b/src/keboola_mcp_server/resources/prompts/project_system_prompt.md @@ -1,13 +1,18 @@ ### Finding Items by Name When looking for specific items (tables, buckets, configurations, flows, data apps) by name, description, -or partial match, **always use the `search` tool first** rather than listing all items with `get_*` tools. +partial match, or configuration content/reference, **always use the `search` tool first** rather than listing all +items with `get_*` tools. -- `search` matches by regex against names, IDs, descriptions, and (for tables) column names. +- `search` supports: + - textual search over names, IDs, descriptions, and (for tables) column names + - config-based search over item configuration JSON contents, including scoped JSONPath search when useful - Listing all items with empty IDs (e.g., `get_buckets(bucket_ids=[])`, `get_configs()`, `get_flows(flow_ids=[])`) is wasteful on large projects and should only be used when you genuinely need a complete inventory. - If the user mentions a name but you do not have the exact ID, call `search` with an appropriate pattern and `item_types` filter. +- If the user asks where a table/component/config ID/value is used, call `search` with + `search_type="config-based"` (and use `scopes` when you know the config structure). - If `search` returns too many results or zero results, ask the user to be more specific rather than falling back to enumerating all items. 
From 52c32eac80238545522d718361687148af47afb6 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Wed, 18 Feb 2026 08:24:55 +0100 Subject: [PATCH 23/29] AI-2161 docs: rename project prompt section to finding items --- .../resources/prompts/project_system_prompt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/keboola_mcp_server/resources/prompts/project_system_prompt.md b/src/keboola_mcp_server/resources/prompts/project_system_prompt.md index fb74c746..e4f394db 100644 --- a/src/keboola_mcp_server/resources/prompts/project_system_prompt.md +++ b/src/keboola_mcp_server/resources/prompts/project_system_prompt.md @@ -1,4 +1,4 @@ -### Finding Items by Name +### Finding Items When looking for specific items (tables, buckets, configurations, flows, data apps) by name, description, partial match, or configuration content/reference, **always use the `search` tool first** rather than listing all From 241fbb20e2538f878cab233555367ad3c7594d3f Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 23 Feb 2026 14:52:02 +0100 Subject: [PATCH 24/29] AI-2161 feat: address search review feedback --- src/keboola_mcp_server/tools/search.py | 71 +++++++++++--------------- tests/tools/storage/test_usage.py | 4 +- tests/tools/test_search.py | 48 +++++++++++++++++ 3 files changed, 79 insertions(+), 44 deletions(-) diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index 0e83ff7d..8af88bab 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -23,7 +23,7 @@ from keboola_mcp_server.errors import tool_errors from keboola_mcp_server.links import Link, ProjectLinksManager from keboola_mcp_server.mcp import toon_serializer_compact -from keboola_mcp_server.tools.components.utils import get_nested +from keboola_mcp_server.tools.components.utils import _normalize_jsonpath, get_nested LOG = logging.getLogger(__name__) @@ -152,7 +152,7 @@ def check_id_fields(self) -> 'SearchHit': return self 
- def with_matches(self, matches: list['PatternMatch']) -> 'SearchHit': + def set_matches(self, matches: list['PatternMatch']) -> 'SearchHit': """Assign pattern matches to this search hit and return self for chaining.""" self._matches = matches unique_scopes = list(dict.fromkeys(match.scope for match in matches if match.scope)) @@ -160,7 +160,7 @@ def with_matches(self, matches: list['PatternMatch']) -> 'SearchHit': scope for scope in unique_scopes if not any( - other != scope and other.startswith(scope) and other[len(scope) : len(scope) + 1] in {'.', '['} + other.startswith(scope) and len(other) > len(scope) and other[len(scope)] in ('.', '[') for other in unique_scopes ) ] @@ -180,6 +180,7 @@ class SearchSpec(BaseModel): _compiled_patterns: list[re.Pattern] = PrivateAttr(default_factory=list) _clean_patterns: list[str] = PrivateAttr(default_factory=list) _all_nodes_expr: JSONPath | None = PrivateAttr(default=None) + # Tuple fields: (original_scope, parsed_scope_expr, parsed_descendants_expr) _scope_exprs: list[tuple[str, JSONPath, JSONPath]] = PrivateAttr(default_factory=list) @model_validator(mode='after') @@ -222,7 +223,7 @@ def _compile_jsonpath_exprs(self) -> 'SearchSpec': self._all_nodes_expr = jsonpath_ng.parse('$..*') self._scope_exprs = [] for scope in self.search_scopes: - normalized = scope if scope.startswith('$') else f'$.{scope}' + normalized = _normalize_jsonpath(scope if scope.startswith('$') else f'$.{scope}') try: self._scope_exprs.append((scope, jsonpath_ng.parse(normalized), jsonpath_ng.parse(f'{normalized}..*'))) except Exception as e: @@ -258,28 +259,14 @@ def match_patterns(self, value: str | JsonDict | None) -> list[str]: return matches - def _find_matches_for_expr(self, configuration: JsonDict, parsed_expr: JSONPath) -> list[PatternMatch]: - """Find pattern matches on JSON nodes matched by a JSONPath expression.""" + def _find_matches_for_expr( + self, configuration: JsonDict, parsed_expr: JSONPath, scalar_only: bool = False + ) -> 
list[PatternMatch]: + """Find pattern matches on JSON nodes matched by a JSONPath expression. If scalar_only is True, only scalar nodes are matched.""" matches: list[PatternMatch] = [] for jpath_match in parsed_expr.find(configuration): value = jpath_match.value - if matched := self.match_patterns(value): - matches.append( - PatternMatch( - scope=re.sub(r'\.\[', '[', str(jpath_match.full_path)), - patterns=matched, - ) - ) - if not self.return_all_matched_patterns: - return matches - return matches - - def _find_scalar_matches_for_expr(self, configuration: JsonDict, parsed_expr: JSONPath) -> list[PatternMatch]: - """Find pattern matches only on scalar nodes matched by a JSONPath expression.""" - matches: list[PatternMatch] = [] - for jpath_match in parsed_expr.find(configuration): - value = jpath_match.value - if value is None or isinstance(value, (dict, list)): + if scalar_only and isinstance(value, (dict, list)): continue if matched := self.match_patterns(value): matches.append( @@ -310,12 +297,13 @@ def match_configuration_scopes(self, configuration: JsonDict | None) -> list[Pat # or the same logical scope is provided multiple times. seen: set[str | None] = set() for _scope, self_expr, desc_expr in self._scope_exprs: - # Include self scope only for scalar values. For objects/lists, include descendants only. 
- self_matches = self._find_scalar_matches_for_expr(configuration, self_expr) - desc_matches = self._find_matches_for_expr(configuration, desc_expr) - - scope_matches = desc_matches if desc_matches else self_matches - for match in scope_matches: + # Search in self expression node for scalar matches first + self_matches = self._find_matches_for_expr(configuration, self_expr, scalar_only=True) + # If no scalar matches, search in descendants nodes + desc_matches: list[PatternMatch] = [] + if not self_matches: + desc_matches = self._find_matches_for_expr(configuration, desc_expr) + for match in self_matches or desc_matches: if match.scope in seen: continue seen.add(match.scope) @@ -323,11 +311,9 @@ def match_configuration_scopes(self, configuration: JsonDict | None) -> list[Pat if not self.return_all_matched_patterns: return all_matches return all_matches - - # No scope provided – search all descendants and return exact match paths. - if self._all_nodes_expr is None: - self._all_nodes_expr = jsonpath_ng.parse('$..*') - return self._find_matches_for_expr(configuration, self._all_nodes_expr) + else: + # No scope provided – search all descendants and return exact match paths. 
+ return self._find_matches_for_expr(configuration, self._all_nodes_expr) def match_texts(self, texts: Iterable[str]) -> list[PatternMatch]: """ @@ -386,7 +372,7 @@ async def _fetch_buckets(client: KeboolaClient, spec: SearchSpec) -> list[Search name=bucket_name, display_name=bucket_display_name, description=bucket_description, - ).with_matches(matches) + ).set_matches(matches) ) return hits @@ -418,7 +404,7 @@ async def _fetch_tables(client: KeboolaClient, spec: SearchSpec) -> list[SearchH name=table_name, display_name=table_display_name, description=table_description, - ).with_matches(matches) + ).set_matches(matches) ) return hits @@ -497,7 +483,7 @@ async def _fetch_configs( updated=config_updated, name=config_name, description=config_description, - ).with_matches(matches) + ).set_matches(matches) elif spec.search_type == 'config-based': if matches := spec.match_configuration_scopes(config.get('configuration')): yield SearchHit( @@ -507,7 +493,7 @@ async def _fetch_configs( updated=config_updated, name=config_name, description=config_description, - ).with_matches(matches) + ).set_matches(matches) for row in config.get('rows', []): if not (row_id := row.get('id')): @@ -526,7 +512,7 @@ async def _fetch_configs( updated=config_updated or _get_field_value(row, ['created']), name=row_name, description=row_description, - ).with_matches(matches) + ).set_matches(matches) elif spec.search_type == 'config-based': if matches := spec.match_configuration_scopes(row.get('configuration')): @@ -538,7 +524,7 @@ async def _fetch_configs( updated=config_updated or _get_field_value(row, ['created']), name=row_name, description=row_description, - ).with_matches(matches) + ).set_matches(matches) @tool_errors() @@ -699,7 +685,8 @@ async def search( - user_input: "Find components/transformations using my_bucket in input or output mappings" -> patterns=["my_bucket"], item_types=["configuration", "transformation"], search_type="config-based", scopes=["storage.input", "storage.output"] - 
-> Returns matches with paths like `storage.input[0].source` or `storage.output[0].target` + -> Returns matches with paths like `storage.input.tables[0].source`, `storage.input.files[0].source`, + or `storage.output.tables[0].destination` - user_input: "Find flows using configuration ID 01k9cz233cvd1rga3zzx40g8qj" -> patterns=["01k9cz233cvd1rga3zzx40g8qj"], item_types=["flow"], search_type="config-based", @@ -707,7 +694,7 @@ async def search( - user_input: "Find transformations using this table / column / specific code in its script" -> patterns=["element"], item_types=["transformation"], search_type="config-based", - scopes=["parameters"] + scopes=["parameters", "storage"] - user_input: "Find data apps using something in its config / python code / setting" -> patterns=["something"], item_types=["data-app"], search_type="config-based" diff --git a/tests/tools/storage/test_usage.py b/tests/tools/storage/test_usage.py index dbe7c173..fe7d883e 100644 --- a/tests/tools/storage/test_usage.py +++ b/tests/tools/storage/test_usage.py @@ -24,14 +24,14 @@ def _sorted_usage(output: Sequence[storage_usage.UsageById]) -> list[storage_usa item_type='configuration', updated='2024-01-01T00:00:00Z', name='Config 1', - ).with_matches([PatternMatch(scope='storage.input', patterns=['id-1', 'id-2'])]), + ).set_matches([PatternMatch(scope='storage.input', patterns=['id-1', 'id-2'])]), SearchHit( component_id='keboola.ex-db', configuration_id='cfg-2', item_type='configuration', updated='2024-01-02T00:00:00Z', name='Config 2', - ).with_matches([PatternMatch(scope='storage.output', patterns=['id-1'])]), + ).set_matches([PatternMatch(scope='storage.output', patterns=['id-1'])]), ], { 'id-1': [ diff --git a/tests/tools/test_search.py b/tests/tools/test_search.py index 7688b1e6..f842e985 100644 --- a/tests/tools/test_search.py +++ b/tests/tools/test_search.py @@ -740,6 +740,40 @@ async def test_search_table_by_columns( ], [('test-config', ['storage.input[1].source', 
'storage.output[0].destination'])], ), + ( + ['alpha'], + ('parameters',), + [ + { + 'id': 'test-config', + 'name': 'Test Config', + 'created': '2024-01-02T00:00:00Z', + 'configuration': { + 'parameters': {'query': 'alpha'}, + 'storage': {'input': [{'source': 'alpha'}]}, + }, + 'rows': [], + } + ], + [('test-config', ['parameters.query'])], + ), + ( + ['alpha'], + ('authorization.#apiKey',), + [ + { + 'id': 'test-config', + 'name': 'Test Config', + 'created': '2024-01-02T00:00:00Z', + 'configuration': { + 'authorization': {'#apiKey': 'alpha'}, + 'parameters': {'query': 'nomatch'}, + }, + 'rows': [], + } + ], + [('test-config', ['authorization.#apiKey'])], + ), ( ['alpha', 'gamma'], tuple(), @@ -782,6 +816,8 @@ async def test_search_table_by_columns( ids=[ 'all_matches_in_scopes', 'most_specific_scope_only', + 'scope_constrains_same_value_in_other_path', + 'hash_prefixed_scope_key_in_search_tool', 'multiple_configurations_returned', ], ) @@ -1004,6 +1040,17 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li {'parameters': {'api': {'baseUrl': 'https://wttr.in'}}}, [{'scope': 'parameters.api.baseUrl', 'patterns': ['wttr.in']}], ), + ( + # Scope with #-prefixed key should be normalized and parsed correctly. 
+ { + 'patterns': ['alpha'], + 'item_types': ('configuration',), + 'search_scopes': ('authorization.#apiKey',), + 'return_all_matched_patterns': True, + }, + {'authorization': {'#apiKey': 'alpha'}}, + [{'scope': 'authorization.#apiKey', 'patterns': ['alpha']}], + ), ], ids=[ 'all_patterns_many_scopes', @@ -1013,6 +1060,7 @@ def test_match_texts(spec_kwargs: dict[str, Any], texts: list[str], expected: li 'any_patterns_return_first_match', 'overlapping_scopes_deduplicated', 'scalar_scope_matches_self', + 'hash_prefixed_scope_key_matches', ], ) def test_match_configuration_scopes(spec_kwargs: dict[str, Any], configuration: dict[str, Any], expected: list[dict]): From 6eb6277a46ae8ae8d10fd94360ab28a1fab6597a Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 23 Feb 2026 15:32:13 +0100 Subject: [PATCH 25/29] AI-2161 feat: group matched patterns by scope in search results --- src/keboola_mcp_server/tools/search.py | 25 ++++++--- tests/tools/test_search.py | 77 +++++++++++++++++++++++--- 2 files changed, 86 insertions(+), 16 deletions(-) diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index 8af88bab..77437fb0 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -2,6 +2,7 @@ import json import logging import re +from collections import defaultdict from typing import Annotated, Any, AsyncGenerator, Iterable, Literal, Mapping, Sequence import jsonpath_ng @@ -116,10 +117,9 @@ class SearchHit(BaseModel): name: str | None = Field(default=None, description='Name of the item.') display_name: str | None = Field(default=None, description='Display name of the item.') description: str | None = Field(default=None, description='Description of the item.') - match_scopes: list[str] = Field( + match_scopes: list[PatternMatch] = Field( default_factory=list, - description='Most specific JSONPath scopes within the configuration where a pattern was matched ' - '(config-based search only).', + 
description='Most specific JSONPath scopes with grouped matched patterns ' '(config-based search only).', ) links: list[Link] = Field(default_factory=list, description='Links to the item.') _matches: list[PatternMatch] = PrivateAttr(default_factory=list) @@ -155,8 +155,14 @@ def check_id_fields(self) -> 'SearchHit': def set_matches(self, matches: list['PatternMatch']) -> 'SearchHit': """Assign pattern matches to this search hit and return self for chaining.""" self._matches = matches - unique_scopes = list(dict.fromkeys(match.scope for match in matches if match.scope)) - self.match_scopes = [ + grouped_patterns_by_scope: dict[str, set[str]] = defaultdict(set) + for match in matches: + if not match.scope: + continue + grouped_patterns_by_scope[match.scope].update(match.patterns) + + unique_scopes = list(grouped_patterns_by_scope) + most_specific_scopes = [ scope for scope in unique_scopes if not any( @@ -164,6 +170,9 @@ def set_matches(self, matches: list['PatternMatch']) -> 'SearchHit': for other in unique_scopes ) ] + self.match_scopes = [ + PatternMatch(scope=scope, patterns=list(grouped_patterns_by_scope[scope])) for scope in most_specific_scopes + ] return self @@ -594,7 +603,7 @@ async def search( 2) config-based - Searches item configurations (JSON objects) by matching patterns against the configuration values ​​converted to a string, optionally narrowed by JSON path `scopes`. - - Returns also `match_scopes` with JSON paths in configuration where a pattern was found. + - Returns also `match_scopes` with JSON paths and matched patterns per scope. THIS IS THE PRIMARY DISCOVERY TOOL. Always use it BEFORE any get_* tool when you need to find items by name or specific configuration content. 
Do NOT enumerate items with get_buckets, get_tables, get_configs, @@ -625,7 +634,7 @@ async def search( - Each result includes the item's ID, name, creation date, and relevant metadata - scopes (config-based) narrow matching to specific JSONPath areas within configurations; matching is performed against the stringified JSON node content in those areas. - - config-based always returns all matched paths per item in `match_scopes` + - config-based always returns all matched paths per item in `match_scopes` (including matched patterns) IMPORTANT: - Always use this tool when the user mentions a name but you don't have the exact ID @@ -665,7 +674,7 @@ async def search( - user_input: "Find transformations/configs/components referencing table in.c-prod.customers" -> patterns=["in.c-prod.customers"], item_types=["transformation", "configuration"], search_type="config-based" - -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths + -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths + patterns - user_input: "Find configurations/transformations (etc.) 
using specific setting / id anywhere" -> patterns=["setting", "id"], item_types=["configuration", "transformations"], search_type="config-based", diff --git a/tests/tools/test_search.py b/tests/tools/test_search.py index f842e985..6ca83ea9 100644 --- a/tests/tools/test_search.py +++ b/tests/tools/test_search.py @@ -718,7 +718,15 @@ async def test_search_table_by_columns( 'rows': [], } ], - [('test-config', ['parameters.query', 'storage.input[0].source'])], + [ + ( + 'test-config', + [ + {'scope': 'parameters.query', 'patterns': ['alpha']}, + {'scope': 'storage.input[0].source', 'patterns': ['beta']}, + ], + ) + ], ), ( ['gamma'], @@ -738,7 +746,15 @@ async def test_search_table_by_columns( 'rows': [], } ], - [('test-config', ['storage.input[1].source', 'storage.output[0].destination'])], + [ + ( + 'test-config', + [ + {'scope': 'storage.input[1].source', 'patterns': ['gamma']}, + {'scope': 'storage.output[0].destination', 'patterns': ['gamma']}, + ], + ) + ], ), ( ['alpha'], @@ -755,7 +771,7 @@ async def test_search_table_by_columns( 'rows': [], } ], - [('test-config', ['parameters.query'])], + [('test-config', [{'scope': 'parameters.query', 'patterns': ['alpha']}])], ), ( ['alpha'], @@ -772,7 +788,31 @@ async def test_search_table_by_columns( 'rows': [], } ], - [('test-config', ['authorization.#apiKey'])], + [('test-config', [{'scope': 'authorization.#apiKey', 'patterns': ['alpha']}])], + ), + ( + ['alpha', 'beta'], + ('parameters',), + [ + { + 'id': 'test-config', + 'name': 'Test Config', + 'created': '2024-01-02T00:00:00Z', + 'configuration': { + 'parameters': {'query': 'alpha beta', 'query2': 'beta'}, + }, + 'rows': [], + } + ], + [ + ( + 'test-config', + [ + {'scope': 'parameters.query', 'patterns': ['alpha', 'beta']}, + {'scope': 'parameters.query2', 'patterns': ['beta']}, + ], + ) + ], ), ( ['alpha', 'gamma'], @@ -808,8 +848,8 @@ async def test_search_table_by_columns( }, ], [ - ('test-config-b', ['storage.output[0].destination']), - ('test-config-a', 
['parameters.query']), + ('test-config-b', [{'scope': 'storage.output[0].destination', 'patterns': ['gamma']}]), + ('test-config-a', [{'scope': 'parameters.query', 'patterns': ['alpha']}]), ], ), ], @@ -818,6 +858,7 @@ async def test_search_table_by_columns( 'most_specific_scope_only', 'scope_constrains_same_value_in_other_path', 'hash_prefixed_scope_key_in_search_tool', + 'group_two_patterns_in_one_scope', 'multiple_configurations_returned', ], ) @@ -828,7 +869,7 @@ async def test_search_config_based_match_scopes( patterns: list[str], scopes: tuple[str, ...], component_configurations: list[dict[str, Any]], - expected_hits: list[tuple[str, list[str]]], + expected_hits: list[tuple[str, list[dict[str, Any]]]], ): keboola_client = KeboolaClient.from_state(mcp_context_client.session.state) @@ -857,7 +898,27 @@ async def test_search_config_based_match_scopes( scopes=scopes, ) - assert [(hit.configuration_id, hit.match_scopes) for hit in result] == expected_hits + normalized_actual = [ + ( + hit.configuration_id, + sorted( + ({'scope': m.scope, 'patterns': sorted(m.patterns)} for m in hit.match_scopes), + key=lambda x: x['scope'] or '', + ), + ) + for hit in result + ] + normalized_expected = [ + ( + config_id, + sorted( + ({'scope': m['scope'], 'patterns': sorted(m['patterns'])} for m in matches), + key=lambda x: x['scope'] or '', + ), + ) + for config_id, matches in expected_hits + ] + assert normalized_actual == normalized_expected @pytest.mark.parametrize( From 26e8869f8cdfe2452d6872be54df007c6725ce28 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 23 Feb 2026 15:39:10 +0100 Subject: [PATCH 26/29] AI-2161 feat: adapt usage to grouped match scopes --- src/keboola_mcp_server/tools/search.py | 14 ++++++-------- src/keboola_mcp_server/tools/storage/usage.py | 2 +- tests/tools/test_search.py | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index 77437fb0..901ed97f 
100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -117,12 +117,11 @@ class SearchHit(BaseModel): name: str | None = Field(default=None, description='Name of the item.') display_name: str | None = Field(default=None, description='Display name of the item.') description: str | None = Field(default=None, description='Description of the item.') - match_scopes: list[PatternMatch] = Field( + matches: list[PatternMatch] = Field( default_factory=list, description='Most specific JSONPath scopes with grouped matched patterns ' '(config-based search only).', ) links: list[Link] = Field(default_factory=list, description='Links to the item.') - _matches: list[PatternMatch] = PrivateAttr(default_factory=list) def __eq__(self, other: object) -> bool: if isinstance(other, SearchHit): @@ -154,14 +153,13 @@ def check_id_fields(self) -> 'SearchHit': def set_matches(self, matches: list['PatternMatch']) -> 'SearchHit': """Assign pattern matches to this search hit and return self for chaining.""" - self._matches = matches - grouped_patterns_by_scope: dict[str, set[str]] = defaultdict(set) + patterns_by_scope: dict[str, set[str]] = defaultdict(set) for match in matches: if not match.scope: continue - grouped_patterns_by_scope[match.scope].update(match.patterns) + patterns_by_scope[match.scope].update(match.patterns) - unique_scopes = list(grouped_patterns_by_scope) + unique_scopes = list(patterns_by_scope) most_specific_scopes = [ scope for scope in unique_scopes @@ -170,8 +168,8 @@ def set_matches(self, matches: list['PatternMatch']) -> 'SearchHit': for other in unique_scopes ) ] - self.match_scopes = [ - PatternMatch(scope=scope, patterns=list(grouped_patterns_by_scope[scope])) for scope in most_specific_scopes + self.matches = [ + PatternMatch(scope=scope, patterns=list(patterns_by_scope[scope])) for scope in most_specific_scopes ] return self diff --git a/src/keboola_mcp_server/tools/storage/usage.py 
b/src/keboola_mcp_server/tools/storage/usage.py index 83e2e201..01fb399c 100644 --- a/src/keboola_mcp_server/tools/storage/usage.py +++ b/src/keboola_mcp_server/tools/storage/usage.py @@ -67,7 +67,7 @@ async def find_id_usage( # group usage references by pattern = target_id output: dict[str, list[ComponentUsageReference]] = defaultdict(list) for search_hit in search_hits: - for match in search_hit._matches: + for match in search_hit.matches: for target_id in match.patterns: output[target_id].append( # TODO: Consider whether adding configuration description is useful, it could overload context. diff --git a/tests/tools/test_search.py b/tests/tools/test_search.py index 6ca83ea9..91d6b52c 100644 --- a/tests/tools/test_search.py +++ b/tests/tools/test_search.py @@ -902,7 +902,7 @@ async def test_search_config_based_match_scopes( ( hit.configuration_id, sorted( - ({'scope': m.scope, 'patterns': sorted(m.patterns)} for m in hit.match_scopes), + ({'scope': m.scope, 'patterns': sorted(m.patterns)} for m in hit.matches), key=lambda x: x['scope'] or '', ), ) From a25546cb4f8fce57d0dd2511849655beccfaad7b Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 23 Feb 2026 15:41:17 +0100 Subject: [PATCH 27/29] AI-2161 docs: update TOOLS reference --- TOOLS.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/TOOLS.md b/TOOLS.md index d1335643..e866239d 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -2631,7 +2631,7 @@ This tool supports two complementary search types: 2) config-based - Searches item configurations (JSON objects) by matching patterns against the configuration values ​​converted to a string, optionally narrowed by JSON path `scopes`. -- Returns also `match_scopes` with JSON paths in configuration where a pattern was found. +- Returns also `match_scopes` with JSON paths and matched patterns per scope. THIS IS THE PRIMARY DISCOVERY TOOL. Always use it BEFORE any get_* tool when you need to find items by name or specific configuration content. 
Do NOT enumerate items with get_buckets, get_tables, get_configs, @@ -2662,7 +2662,7 @@ HOW IT WORKS: - Each result includes the item's ID, name, creation date, and relevant metadata - scopes (config-based) narrow matching to specific JSONPath areas within configurations; matching is performed against the stringified JSON node content in those areas. -- config-based always returns all matched paths per item in `match_scopes` +- config-based always returns all matched paths per item in `match_scopes` (including matched patterns) IMPORTANT: - Always use this tool when the user mentions a name but you don't have the exact ID @@ -2702,7 +2702,7 @@ USAGE EXAMPLES: - user_input: "Find transformations/configs/components referencing table in.c-prod.customers" -> patterns=["in.c-prod.customers"], item_types=["transformation", "configuration"], search_type="config-based" - -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths + -> No scopes = search whole stringified config; result includes `match_scopes` with exact paths + patterns - user_input: "Find configurations/transformations (etc.) 
using specific setting / id anywhere" -> patterns=["setting", "id"], item_types=["configuration", "transformations"], search_type="config-based", @@ -2722,7 +2722,8 @@ scopes=["storage"] - user_input: "Find components/transformations using my_bucket in input or output mappings" -> patterns=["my_bucket"], item_types=["configuration", "transformation"], search_type="config-based", scopes=["storage.input", "storage.output"] - -> Returns matches with paths like `storage.input[0].source` or `storage.output[0].target` + -> Returns matches with paths like `storage.input.tables[0].source`, `storage.input.files[0].source`, + or `storage.output.tables[0].destination` - user_input: "Find flows using configuration ID 01k9cz233cvd1rga3zzx40g8qj" -> patterns=["01k9cz233cvd1rga3zzx40g8qj"], item_types=["flow"], search_type="config-based", @@ -2730,7 +2731,7 @@ scopes=["storage"] - user_input: "Find transformations using this table / column / specific code in its script" -> patterns=["element"], item_types=["transformation"], search_type="config-based", - scopes=["parameters"] + scopes=["parameters", "storage"] - user_input: "Find data apps using something in its config / python code / setting" -> patterns=["something"], item_types=["data-app"], search_type="config-based" From 42eac29d1bfe7a78f3f9988a07316600c8f89805 Mon Sep 17 00:00:00 2001 From: mariankrotil Date: Mon, 23 Feb 2026 15:57:36 +0100 Subject: [PATCH 28/29] AI-2161 style: apply flake --- src/keboola_mcp_server/tools/search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index 901ed97f..59cd4211 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -269,7 +269,8 @@ def match_patterns(self, value: str | JsonDict | None) -> list[str]: def _find_matches_for_expr( self, configuration: JsonDict, parsed_expr: JSONPath, scalar_only: bool = False ) -> list[PatternMatch]: - """Find 
pattern matches on JSON nodes matched by a JSONPath expression. If scalar_only is True, only scalar nodes are matched.""" + """Find pattern matches on JSON nodes matched by a JSONPath expression. If scalar_only is True, only scalar + nodes are matched.""" matches: list[PatternMatch] = [] for jpath_match in parsed_expr.find(configuration): value = jpath_match.value From 5ceae6b11edfea15c3c95664735472cc893ff710 Mon Sep 17 00:00:00 2001 From: Marian Krotil <154078172+mariankrotil@users.noreply.github.com> Date: Tue, 24 Feb 2026 14:55:57 +0100 Subject: [PATCH 29/29] Update src/keboola_mcp_server/tools/search.py --- src/keboola_mcp_server/tools/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/keboola_mcp_server/tools/search.py b/src/keboola_mcp_server/tools/search.py index 59cd4211..ab7af0b1 100644 --- a/src/keboola_mcp_server/tools/search.py +++ b/src/keboola_mcp_server/tools/search.py @@ -119,7 +119,7 @@ class SearchHit(BaseModel): description: str | None = Field(default=None, description='Description of the item.') matches: list[PatternMatch] = Field( default_factory=list, - description='Most specific JSONPath scopes with grouped matched patterns ' '(config-based search only).', + description='Most specific JSONPath scopes with grouped matched patterns (config-based search only).', ) links: list[Link] = Field(default_factory=list, description='Links to the item.')