From 1b9ddc88eb0e7e78235945af3134f59f82bfe611 Mon Sep 17 00:00:00 2001
From: Daniel Smilkov
Date: Tue, 27 Feb 2024 09:48:08 -0500
Subject: [PATCH 1/3] save

---
 lilac/concepts/db_concept.py                  |  2 +
 lilac/embeddings/bge.py                       |  5 +-
 lilac/embeddings/cohere.py                    |  7 +--
 lilac/embeddings/gte.py                       |  8 ++-
 lilac/embeddings/nomic_embed.py               |  7 +--
 lilac/embeddings/openai.py                    |  7 +--
 lilac/embeddings/sbert.py                     |  5 +-
 .../components/concepts/ConceptView.svelte    | 55 +------------------
 .../datasetView/ItemMediaTextContent.svelte   |  8 +--
 9 files changed, 27 insertions(+), 77 deletions(-)

diff --git a/lilac/concepts/db_concept.py b/lilac/concepts/db_concept.py
index 45a406561..97d0513ad 100644
--- a/lilac/concepts/db_concept.py
+++ b/lilac/concepts/db_concept.py
@@ -473,6 +473,8 @@ def _validate_examples(
     self, examples: List[Union[ExampleIn, Example]], type: ConceptType
   ) -> None:
     for example in examples:
+      if not example.text and not example.img:
+        raise ValueError('The example must have a text or image associated with it.')
       inferred_type = 'text' if example.text else 'unknown'
       if inferred_type != type:
         raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py
index 0ba749a24..696565ea0 100644
--- a/lilac/embeddings/bge.py
+++ b/lilac/embeddings/bge.py
@@ -16,7 +16,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,11 +69,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
       lambda docs: self._model.encode(docs)['dense_vecs'],
       docs,
       self.local_batch_size * 16,
-      chunker=clustering_spacy_chunker,
+      chunker=chunker,
     )
 
   @override
diff --git a/lilac/embeddings/cohere.py b/lilac/embeddings/cohere.py
index a95eb437c..7559bc315 100644
--- a/lilac/embeddings/cohere.py
+++ b/lilac/embeddings/cohere.py
@@ -9,7 +9,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 
 if TYPE_CHECKING:
   from cohere import Client
@@ -65,6 +65,5 @@ def _embed_fn(docs: list[str]) -> list[np.ndarray]:
         ).embeddings
       ]
 
-    return chunked_compute_embedding(
-      _embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(_embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/gte.py b/lilac/embeddings/gte.py
index 1da5d509d..5e4d71928 100644
--- a/lilac/embeddings/gte.py
+++ b/lilac/embeddings/gte.py
@@ -19,7 +19,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,8 +69,9 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )
 
   @override
@@ -78,8 +79,9 @@ def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
     # Trim the docs to the max context size.
     trimmed_docs = (doc[:GTE_CONTEXT_SIZE] for doc in docs)
 
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     text_chunks: Iterator[tuple[int, TextChunk]] = (
-      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in clustering_spacy_chunker(doc)
+      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in chunker(doc)
     )
     text_chunks, text_chunks_2 = itertools.tee(text_chunks)
     chunk_texts = (chunk[0] for _, chunk in text_chunks)
diff --git a/lilac/embeddings/nomic_embed.py b/lilac/embeddings/nomic_embed.py
index 24f703eed..32c52e01b 100644
--- a/lilac/embeddings/nomic_embed.py
+++ b/lilac/embeddings/nomic_embed.py
@@ -14,7 +14,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -76,9 +76,8 @@ def _encode(doc: list[str]) -> list[np.ndarray]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    return chunked_compute_embedding(
-      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(_encode, docs, self.local_batch_size * 16, chunker=chunker)
 
   @override
   def teardown(self) -> None:
diff --git a/lilac/embeddings/openai.py b/lilac/embeddings/openai.py
index a9a70124c..318cfe4fa 100644
--- a/lilac/embeddings/openai.py
+++ b/lilac/embeddings/openai.py
@@ -10,7 +10,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 
 API_NUM_PARALLEL_REQUESTS = 10
 API_OPENAI_BATCH_SIZE = 128
@@ -92,6 +92,5 @@ def embed_fn(texts: list[str]) -> list[np.ndarray]:
       )
       return [np.array(embedding.embedding, dtype=np.float32) for embedding in response.data]
 
-    return chunked_compute_embedding(
-      embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/sbert.py b/lilac/embeddings/sbert.py
index 3beba604f..645623029 100644
--- a/lilac/embeddings/sbert.py
+++ b/lilac/embeddings/sbert.py
@@ -12,7 +12,7 @@
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2`` is 5 times
@@ -47,8 +47,9 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )
 
   @override
diff --git a/web/blueprint/src/lib/components/concepts/ConceptView.svelte b/web/blueprint/src/lib/components/concepts/ConceptView.svelte
index 4fd7c5b32..02159c908 100644
--- a/web/blueprint/src/lib/components/concepts/ConceptView.svelte
+++ b/web/blueprint/src/lib/components/concepts/ConceptView.svelte
@@ -1,5 +1,4 @@
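
Every embedding file in this patch gains the same one-liner, `chunker = clustering_spacy_chunker if self._split else identity_chunker`, falling back to a pass-through chunker when the signal is configured not to split documents. The `identity_chunker` imported from `lilac/embeddings/embedding.py` is not shown in the diff; below is a minimal sketch of what such a chunker plausibly looks like, assuming `TextChunk` pairs the chunk text with its `(start, end)` character span, as suggested by the `chunk[0]` access in the gte.py hunk. This is an illustration, not the patch's actual definition.

    # Hypothetical sketch; the real identity_chunker lives in
    # lilac/embeddings/embedding.py and is not part of this patch.

    # Assumed shape: a chunk is the chunk's text plus its (start, end)
    # character span in the source document, matching how the gte.py hunk
    # reads chunk[0] for each chunk's text.
    TextChunk = tuple[str, tuple[int, int]]


    def identity_chunker(text: str) -> list[TextChunk]:
      """Return the whole document as a single chunk spanning the full text."""
      return [(text, (0, len(text)))]

With that shape, `chunked_compute_embedding` sees exactly one chunk per document whenever `self._split` is false, so each document is embedded whole instead of in spaCy-clustered sentence chunks.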