-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathserver.py
More file actions
198 lines (150 loc) · 5.69 KB
/
server.py
File metadata and controls
198 lines (150 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
MCP Server for Cloudscape Design System Documentation.
Designed for high token efficiency and agentic workflows.
"""
import re
from pathlib import Path
from typing import Any
import lancedb
import torch
from mcp.server.fastmcp import FastMCP
from sentence_transformers import SentenceTransformer
from loguru import logger
# --- Configuration ---
# Location of the LanceDB vector index produced by the ingestion step.
DB_URI = Path("./data/lancedb")
# Embedding model; must match the model used when the index was built.
MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"
# NOTE(review): described as "used for path validation" but not referenced
# by the visible code — confirm intended use.
DOCS_ROOT = Path("docs")  # Used for path validation
# Cap on deduplicated search hits returned to the agent (token budget).
MAX_UNIQUE_RESULTS = 5

# --- Initialize Server ---
mcp = FastMCP(
    "CloudscapeDocs",
    dependencies=["lancedb", "sentence-transformers", "torch"],
)

# --- Global Resources (Lazy Loaded) ---
# All three are populated together on the first tool call by get_resources();
# they stay None until then so server startup is fast.
_db: Any = None     # lancedb connection
_model: Any = None  # SentenceTransformer instance
_table: Any = None  # handle to the open "docs" table
def _determine_device() -> str:
    """Select the torch device for embedding: MPS, then CUDA, then CPU."""
    if torch.backends.mps.is_available():
        logger.debug("MPS is available")
        device = "mps"
    elif torch.cuda.is_available():
        logger.debug("CUDA is available")
        device = "cuda"
    else:
        # Fallback when no accelerator is present.
        logger.debug("CPU is available")
        device = "cpu"
    return device
def get_resources():
    """Return the (table, model) pair, creating both on first use.

    Opening the LanceDB table and loading the SentenceTransformer are
    expensive, so the results are cached in module-level globals and the
    work happens only once.
    """
    global _db, _model, _table  # noqa: PLW0603
    if _table is not None:
        # Already initialized by an earlier call.
        return _table, _model
    device = _determine_device()
    logger.debug(f"Initializing LanceDB and Embedding Model on {device.upper()}...")
    _db = lancedb.connect(DB_URI)
    _table = _db.open_table("docs")
    # float16 halves memory use (Mac-optimized settings).
    _model = SentenceTransformer(
        MODEL_NAME,
        device=device,
        model_kwargs={"torch_dtype": torch.float16},
        trust_remote_code=True,
    )
    return _table, _model
def _extract_title(content: str, filename: str) -> str:
"""
Extract YAML frontmatter title from a chunk.
Falls back to the filename if not found.
"""
# Regex to find 'title: "Something"' inside --- blocks
match = re.search(r'title:\s*"([^"]+)"', content)
if match:
return match.group(1)
# Fallback: Clean up filename (e.g., "collection_preferences.md" -> "Collection Preferences")
return filename.replace(".md", "").replace("_", " ").title()
@mcp.tool()
def cloudscape_search_docs(query: str) -> str:
    """
    Search the Cloudscape documentation index for relevant files.

    Use this tool FIRST to find the correct file paths.
    It returns a list of files with their relevance scores.
    It does NOT return the full content.

    Args:
        query: The search term (e.g., "collection preferences", "table sorting props").

    Returns:
        A concise list of relevant files and their paths.
    """
    table, model = get_resources()
    query_vec = model.encode(
        query,
        normalize_embeddings=True,
    ).tolist()
    # Fetch more candidates than we need to allow for deduplication
    # (because one file might have 10 matching chunks).
    candidates = table.search(query_vec).limit(25).to_list()
    if not candidates:
        return "No relevant documentation found."
    # Deduplicate by file path, keeping the first (best-ranked) chunk per file.
    seen_paths = set()
    unique_results = []
    for r in candidates:
        path = r["path"]
        if path in seen_paths:
            continue
        seen_paths.add(path)
        # Extract a friendly title from content or filename.
        title = _extract_title(r["content"], r["filename"])
        unique_results.append(
            {
                "title": title,
                "path": path,
                "filename": r["filename"],
                # LanceDB reports "_distance"; lower distance = better match
                # usually, or similarity.
                "score": r.get("_distance", 0.0),
            },
        )
        # Stop after MAX_UNIQUE_RESULTS to save tokens.
        if len(unique_results) >= MAX_UNIQUE_RESULTS:
            break
    # Format output as a concise list for the agent.
    output = ["Found the following relevant documentation files:\n"]
    for i, res in enumerate(unique_results, 1):
        output.append(f"{i}. [Title: {res['title']}]\n Path: {res['path']}\n")
    # BUGFIX: the read tool is named 'cloudscape_read_doc', not 'cloudscape_read';
    # the old hint pointed agents at a nonexistent tool.
    output.append("\nUse 'cloudscape_read_doc' with a specific 'Path' to view content.")
    return "\n".join(output)
@mcp.tool()
def cloudscape_read_doc(file_path: str) -> str:
    """
    Read the FULL content of a documentation file.

    Use this tool SECOND, after finding the correct path via 'cloudscape_search_docs'.

    Args:
        file_path: The exact path provided by the search tool (e.g., "docs/components/button.md").

    Returns:
        The file content wrapped in BEGIN/END markers, or an "Error: ..."
        string describing why the file could not be read.
    """
    target_path = Path(file_path)
    # 1. Security check: resolve the path and confine reads to the current
    # working directory. Previously this check was commented out, so
    # "../" traversal (or an absolute path) could read any file on disk.
    try:
        abs_path = target_path.resolve()
        root = Path.cwd().resolve()
        if not abs_path.is_relative_to(root):
            return "Error: Access denied. Path is outside allowed directories."
        if not abs_path.exists():
            return f"Error: File not found at {file_path}. Please check the path."
    except OSError as e:
        return f"Error validating path: {e!s}"
    if not abs_path.is_file():
        return f"Error: {file_path} is not a file."
    try:
        content = abs_path.read_text(encoding="utf-8")
    except OSError as e:
        return f"Error reading file: {e!s}"
    else:
        return (
            f"--- BEGIN FILE: {file_path} ---\n{content}\n--- END FILE: {file_path} ---"
        )
if __name__ == "__main__":
    # Resources are lazy-loaded on first tool call to avoid startup timeout.
    # Start the MCP server (blocks until the client disconnects).
    mcp.run()