Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,11 @@ select = ["E", "F", "I", "N", "W", "UP"]
python_version = "3.11"
strict = true
ignore_missing_imports = true
explicit_package_bases = true

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_functions = ["test_*"]
addopts = "-v --tb=short"
asyncio_mode = "auto"
31 changes: 31 additions & 0 deletions src/cocoindex_code/chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Public API for writing custom chunkers.

Example usage::

    from pathlib import Path
    from cocoindex_code.chunking import Chunk, ChunkerFn, TextPosition

    def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]:
        pos = TextPosition(byte_offset=0, char_offset=0, line=1, column=0)
        return "mylang", [Chunk(text=content, start=pos, end=pos)]
"""

from __future__ import annotations

import pathlib as _pathlib
from collections.abc import Callable as _Callable

import cocoindex as _coco

from cocoindex.resources.chunk import Chunk
from cocoindex.resources.chunk import TextPosition

# Signature of a custom chunker: ``(path, content) -> (language_override, chunks)``.
# Callable alias (not Protocol) — consistent with codebase style.
# language_override=None keeps the language detected by detect_code_language.
# path is not resolved (no syscall); call path.resolve() inside the chunker if needed.
ChunkerFn = _Callable[[_pathlib.Path, str], tuple[str | None, list[Chunk]]]

# Context key for the ``{suffix: chunker}`` registry.
# tracked=False: callables are not fingerprint-able; daemon restart re-indexes anyway.
CHUNKER_REGISTRY = _coco.ContextKey[dict[str, ChunkerFn]]("chunker_registry", tracked=False)

# Names that make up the public chunking API of this module.
__all__ = ["Chunk", "ChunkerFn", "CHUNKER_REGISTRY", "TextPosition"]
28 changes: 27 additions & 1 deletion src/cocoindex_code/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import asyncio
import importlib
import logging
import os
import signal
Expand Down Expand Up @@ -45,8 +46,10 @@
decode_request,
encode_response,
)
from .chunking import ChunkerFn as _ChunkerFn
from .query import query_codebase
from .settings import (
ChunkerMapping,
global_settings_mtime_us,
load_project_settings,
load_user_settings,
Expand All @@ -56,6 +59,26 @@

logger = logging.getLogger(__name__)


def _resolve_chunker_registry(mappings: list[ChunkerMapping]) -> dict[str, _ChunkerFn]:
"""Resolve ``ChunkerMapping`` settings entries to a ``{suffix: fn}`` dict.

Each ``mapping.module`` must be a ``"module.path:callable"`` string importable
from the current environment.
"""
registry: dict[str, _ChunkerFn] = {}
for cm in mappings:
module_path, _, attr = cm.module.partition(":")
if not attr:
raise ValueError(f"chunker module {cm.module!r} must use 'module.path:callable' format")
mod = importlib.import_module(module_path)
fn = getattr(mod, attr)
if not callable(fn):
raise ValueError(f"chunker {cm.module!r}: {attr!r} is not callable")
registry[f".{cm.ext}"] = fn
return registry


# ---------------------------------------------------------------------------
# Daemon paths
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -123,7 +146,10 @@ async def get_project(self, project_root: str, *, suppress_auto_index: bool = Fa
if project_root not in self._projects:
root = Path(project_root)
project_settings = load_project_settings(root)
project = await Project.create(root, project_settings, self._embedder)
chunker_registry = _resolve_chunker_registry(project_settings.chunkers)
project = await Project.create(
root, project_settings, self._embedder, chunker_registry=chunker_registry
)
self._projects[project_root] = project
self._index_locks[project_root] = asyncio.Lock()
self._load_time_done[project_root] = asyncio.Event()
Expand Down
22 changes: 15 additions & 7 deletions src/cocoindex_code/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from cocoindex.resources.id import IdGenerator
from pathspec import GitIgnoreSpec

from .chunking import CHUNKER_REGISTRY
from .settings import PROJECT_SETTINGS
from .shared import (
CODEBASE_DIR,
Expand Down Expand Up @@ -158,13 +159,20 @@ async def process_file(
or "text"
)

chunks = splitter.split(
content,
chunk_size=CHUNK_SIZE,
min_chunk_size=MIN_CHUNK_SIZE,
chunk_overlap=CHUNK_OVERLAP,
language=language,
)
chunker_registry = coco.use_context(CHUNKER_REGISTRY)
chunker = chunker_registry.get(suffix)
if chunker is not None:
language_override, chunks = chunker(Path(file.file_path.path), content)
if language_override is not None:
language = language_override
else:
chunks = splitter.split(
content,
chunk_size=CHUNK_SIZE,
min_chunk_size=MIN_CHUNK_SIZE,
chunk_overlap=CHUNK_OVERLAP,
language=language,
)

id_gen = IdGenerator()

Expand Down
17 changes: 16 additions & 1 deletion src/cocoindex_code/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import cocoindex as coco
from cocoindex.connectors import sqlite

from .chunking import CHUNKER_REGISTRY, ChunkerFn
from .indexer import indexer_main
from .protocol import IndexingProgress
from .settings import PROJECT_SETTINGS, ProjectSettings, load_gitignore_spec
Expand Down Expand Up @@ -86,8 +87,21 @@ async def create(
project_root: Path,
project_settings: ProjectSettings,
embedder: Embedder,
chunker_registry: dict[str, ChunkerFn] | None = None,
) -> Project:
"""Create a project with explicit settings and embedder."""
"""Create a project with explicit settings and embedder.
Args:
project_root: Root directory of the codebase to index.
project_settings: Include/exclude patterns and language overrides.
embedder: Embedding model instance.
chunker_registry: Optional mapping of file suffix (e.g. ``".sls"``)
to a ``ChunkerFn``. When a suffix matches, the registered
chunker is called instead of the built-in ``RecursiveSplitter``.
Defaults to an empty registry. Shallow-copied on creation.
Passed as a parameter rather than via ``env`` to keep
``env`` internals out of the public API.
"""
index_dir = project_root / ".cocoindex_code"
index_dir.mkdir(parents=True, exist_ok=True)

Expand All @@ -107,6 +121,7 @@ async def create(
{f".{lo.ext}": lo.lang for lo in project_settings.language_overrides},
)
context.provide(GITIGNORE_SPEC, gitignore_spec)
context.provide(CHUNKER_REGISTRY, dict(chunker_registry) if chunker_registry else {})

env = coco.Environment(settings, context_provider=context)
app = coco.App(
Expand Down
11 changes: 11 additions & 0 deletions src/cocoindex_code/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,18 @@ class LanguageOverride:
lang: str # e.g. "php"


@dataclass
class ChunkerMapping:
    """Settings entry that maps a file extension to a custom chunker callable."""

    # File extension the chunker applies to.
    ext: str  # without dot, e.g. "toml"
    # Import reference of the chunker, split on ":" into module path and attribute name.
    module: str  # "module.path:callable", e.g. "cocoindex_code.toml_chunker:toml_chunker"


@dataclass
class ProjectSettings:
    """Per-project settings: file patterns, language overrides and custom chunkers.

    Every field defaults to a fresh list per instance, so instances never share
    mutable defaults.
    """

    # Include patterns; defaults to a copy of DEFAULT_INCLUDED_PATTERNS.
    include_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_INCLUDED_PATTERNS))
    # Exclude patterns; defaults to a copy of DEFAULT_EXCLUDED_PATTERNS.
    exclude_patterns: list[str] = field(default_factory=lambda: list(DEFAULT_EXCLUDED_PATTERNS))
    # Per-extension language overrides; empty by default.
    language_overrides: list[LanguageOverride] = field(default_factory=list)
    # Per-extension custom chunker mappings; empty by default.
    chunkers: list[ChunkerMapping] = field(default_factory=list)


# CocoIndex context key for project settings
Expand Down Expand Up @@ -265,17 +272,21 @@ def _project_settings_to_dict(settings: ProjectSettings) -> dict[str, Any]:
d["language_overrides"] = [
{"ext": lo.ext, "lang": lo.lang} for lo in settings.language_overrides
]
if settings.chunkers:
d["chunkers"] = [{"ext": cm.ext, "module": cm.module} for cm in settings.chunkers]
return d


def _project_settings_from_dict(d: dict[str, Any]) -> ProjectSettings:
    """Build a ``ProjectSettings`` from its plain-dict form.

    Missing keys fall back to their defaults.  A key that is present with a
    null value (e.g. a bare ``chunkers:`` line with nothing under it) is
    treated the same as a missing key instead of raising ``TypeError`` or
    storing ``None`` in a list-typed field.  An explicitly empty list is kept
    as given.

    Args:
        d: Mapping with the optional keys ``include_patterns``,
            ``exclude_patterns``, ``language_overrides`` and ``chunkers``.

    Returns:
        The populated ``ProjectSettings``.

    Raises:
        KeyError: If a ``language_overrides`` entry lacks ``ext``/``lang`` or a
            ``chunkers`` entry lacks ``ext``/``module``.
    """
    # Compare against None (not truthiness) so that an explicit [] is preserved.
    include_patterns = d.get("include_patterns")
    if include_patterns is None:
        include_patterns = list(DEFAULT_INCLUDED_PATTERNS)
    exclude_patterns = d.get("exclude_patterns")
    if exclude_patterns is None:
        exclude_patterns = list(DEFAULT_EXCLUDED_PATTERNS)

    # ``or []`` covers both a missing key and a null value.
    overrides = [
        LanguageOverride(ext=lo["ext"], lang=lo["lang"])
        for lo in d.get("language_overrides") or []
    ]
    chunkers = [
        ChunkerMapping(ext=cm["ext"], module=cm["module"]) for cm in d.get("chunkers") or []
    ]
    return ProjectSettings(
        include_patterns=include_patterns,
        exclude_patterns=exclude_patterns,
        language_overrides=overrides,
        chunkers=chunkers,
    )


Expand Down
45 changes: 45 additions & 0 deletions tests/example_toml_chunker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Demo chunker: splits TOML files at top-level [section] boundaries.

Each ``[section]`` header starts a new chunk, keeping the section header
and its key-value pairs together. This produces semantically coherent units
instead of the arbitrary line-window slices from the default splitter.

Register in ``.cocoindex_code/settings.yml``::

    chunkers:
      - ext: toml
        module: example_toml_chunker:toml_chunker
"""

from __future__ import annotations

import re as _re
from pathlib import Path as _Path

from cocoindex_code.chunking import Chunk, TextPosition

# Matches a line that opens with a single "[" (a ``[table]`` header).  The negative
# lookahead rejects "[[", so ``[[array-of-tables]]`` headers do not start a new chunk.
_SECTION_RE = _re.compile(r"^\[(?!\[)")


def _pos(line: int) -> TextPosition:
    """Return a ``TextPosition`` for *line* with offsets and column set to 0.

    Only the line number is filled in; ``byte_offset``, ``char_offset`` and
    ``column`` are always zero.  Callers in this module pass 1-based line numbers.
    """
    return TextPosition(byte_offset=0, char_offset=0, line=line, column=0)


def toml_chunker(path: _Path, content: str) -> tuple[str | None, list[Chunk]]:
    """Split a TOML file at top-level ``[section]`` headers.

    Each header starts a new chunk that runs up to the next header.  Lines that
    precede the first header (root-table keys, leading comments) are emitted as
    their own leading chunk rather than being dropped.

    Args:
        path: Path of the file being chunked.  Unused here; accepted because it
            is part of the chunker call signature.
        content: Full text of the TOML file.

    Returns:
        ``("toml", chunks)``.  A file without any section header yields one
        chunk holding the whole content; blank content yields no chunks.
    """
    # Nothing worth indexing; also avoids emitting a chunk that ends on line 0.
    if not content.strip():
        return "toml", []

    lines = content.splitlines()
    section_starts = [i for i, ln in enumerate(lines) if _SECTION_RE.match(ln)]

    if not section_starts:
        return "toml", [Chunk(text=content, start=_pos(1), end=_pos(len(lines)))]

    # Open the first span at line index 0 so text before the first header is kept.
    preamble_start = [] if section_starts[0] == 0 else [0]
    boundaries = [*preamble_start, *section_starts, len(lines)]

    chunks: list[Chunk] = []
    for start_idx, end_idx in zip(boundaries, boundaries[1:]):
        text = "\n".join(lines[start_idx:end_idx]).strip()
        if text:  # skip spans that contain only blank lines
            chunks.append(Chunk(text=text, start=_pos(start_idx + 1), end=_pos(end_idx)))
    return "toml", chunks


__all__ = ["toml_chunker"]
Loading