From 1b9ddc88eb0e7e78235945af3134f59f82bfe611 Mon Sep 17 00:00:00 2001
From: Daniel Smilkov
Date: Tue, 27 Feb 2024 09:48:08 -0500
Subject: [PATCH 1/3] save

---
 lilac/concepts/db_concept.py                  |  2 +
 lilac/embeddings/bge.py                       |  5 +-
 lilac/embeddings/cohere.py                    |  7 +--
 lilac/embeddings/gte.py                       |  8 ++-
 lilac/embeddings/nomic_embed.py               |  7 +--
 lilac/embeddings/openai.py                    |  7 +--
 lilac/embeddings/sbert.py                     |  5 +-
 .../components/concepts/ConceptView.svelte    | 55 +------------------
 .../datasetView/ItemMediaTextContent.svelte   |  8 +--
 9 files changed, 27 insertions(+), 77 deletions(-)

diff --git a/lilac/concepts/db_concept.py b/lilac/concepts/db_concept.py
index 45a406561..97d0513ad 100644
--- a/lilac/concepts/db_concept.py
+++ b/lilac/concepts/db_concept.py
@@ -473,6 +473,8 @@ def _validate_examples(
     self, examples: List[Union[ExampleIn, Example]], type: ConceptType
   ) -> None:
     for example in examples:
+      if not example.text and not example.img:
+        raise ValueError('The example must have a text or image associated with it.')
       inferred_type = 'text' if example.text else 'unknown'
       if inferred_type != type:
         raise ValueError(f'Example type "{inferred_type}" does not match concept type "{type}".')
diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py
index 0ba749a24..696565ea0 100644
--- a/lilac/embeddings/bge.py
+++ b/lilac/embeddings/bge.py
@@ -16,7 +16,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,11 +69,12 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
       lambda docs: self._model.encode(docs)['dense_vecs'],
       docs,
       self.local_batch_size * 16,
-      chunker=clustering_spacy_chunker,
+      chunker=chunker,
     )
 
   @override
diff --git a/lilac/embeddings/cohere.py b/lilac/embeddings/cohere.py
index a95eb437c..7559bc315 100644
--- a/lilac/embeddings/cohere.py
+++ b/lilac/embeddings/cohere.py
@@ -9,7 +9,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 
 if TYPE_CHECKING:
   from cohere import Client
@@ -65,6 +65,5 @@ def _embed_fn(docs: list[str]) -> list[np.ndarray]:
         ).embeddings
       ]
 
-    return chunked_compute_embedding(
-      _embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(_embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/gte.py b/lilac/embeddings/gte.py
index 1da5d509d..5e4d71928 100644
--- a/lilac/embeddings/gte.py
+++ b/lilac/embeddings/gte.py
@@ -19,7 +19,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -69,8 +69,9 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )
 
   @override
@@ -78,8 +79,9 @@ def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
     # Trim the docs to the max context size.
     trimmed_docs = (doc[:GTE_CONTEXT_SIZE] for doc in docs)
 
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     text_chunks: Iterator[tuple[int, TextChunk]] = (
-      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in clustering_spacy_chunker(doc)
+      (i, chunk) for i, doc in enumerate(trimmed_docs) for chunk in chunker(doc)
     )
     text_chunks, text_chunks_2 = itertools.tee(text_chunks)
     chunk_texts = (chunk[0] for _, chunk in text_chunks)
diff --git a/lilac/embeddings/nomic_embed.py b/lilac/embeddings/nomic_embed.py
index 24f703eed..32c52e01b 100644
--- a/lilac/embeddings/nomic_embed.py
+++ b/lilac/embeddings/nomic_embed.py
@@ -14,7 +14,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
@@ -76,9 +76,8 @@ def _encode(doc: list[str]) -> list[np.ndarray]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
-    return chunked_compute_embedding(
-      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(_encode, docs, self.local_batch_size * 16, chunker=chunker)
 
   @override
   def teardown(self) -> None:
diff --git a/lilac/embeddings/openai.py b/lilac/embeddings/openai.py
index a9a70124c..318cfe4fa 100644
--- a/lilac/embeddings/openai.py
+++ b/lilac/embeddings/openai.py
@@ -10,7 +10,7 @@
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 
 API_NUM_PARALLEL_REQUESTS = 10
 API_OPENAI_BATCH_SIZE = 128
@@ -92,6 +92,5 @@ def embed_fn(texts: list[str]) -> list[np.ndarray]:
       )
       return [np.array(embedding.embedding, dtype=np.float32) for embedding in response.data]
 
-    return chunked_compute_embedding(
-      embed_fn, docs, self.local_batch_size, chunker=clustering_spacy_chunker
-    )
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
+    return chunked_compute_embedding(embed_fn, docs, self.local_batch_size, chunker=chunker)
diff --git a/lilac/embeddings/sbert.py b/lilac/embeddings/sbert.py
index 3beba604f..645623029 100644
--- a/lilac/embeddings/sbert.py
+++ b/lilac/embeddings/sbert.py
@@ -12,7 +12,7 @@
 from ..schema import Item
 from ..signal import TextEmbeddingSignal
 from ..splitters.spacy_splitter import clustering_spacy_chunker
-from .embedding import chunked_compute_embedding
+from .embedding import chunked_compute_embedding, identity_chunker
 from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
 
 # The `all-mpnet-base-v2` model provides the best quality, while `all-MiniLM-L6-v2`` is 5 times
@@ -47,8 +47,9 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
     # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
     # The sentence transformer API actually does batching internally, so we pass
     # local_batch_size * 16 to allow the library to see all the chunks at once.
+    chunker = clustering_spacy_chunker if self._split else identity_chunker
     return chunked_compute_embedding(
-      self._model.encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+      self._model.encode, docs, self.local_batch_size * 16, chunker=chunker
     )
 
   @override
diff --git a/web/blueprint/src/lib/components/concepts/ConceptView.svelte b/web/blueprint/src/lib/components/concepts/ConceptView.svelte
index 4fd7c5b32..02159c908 100644
--- a/web/blueprint/src/lib/components/concepts/ConceptView.svelte
+++ b/web/blueprint/src/lib/components/concepts/ConceptView.svelte
@@ -1,5 +1,4 @@
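
Every embedding file in this patch gains the same one-liner, `chunker = clustering_spacy_chunker if self._split else identity_chunker`, falling back to a pass-through chunker when the signal is configured not to split documents. The `identity_chunker` imported from `lilac/embeddings/embedding.py` is not shown in the diff; below is a minimal sketch of what such a chunker plausibly looks like, assuming `TextChunk` pairs the chunk text with its `(start, end)` character span, as suggested by the `chunk[0]` access in the gte.py hunk. This is an illustration, not the patch's actual definition.

    # Hypothetical sketch; the real identity_chunker lives in
    # lilac/embeddings/embedding.py and is not part of this patch.

    # Assumed shape: a chunk is the chunk's text plus its (start, end)
    # character span in the source document, matching how the gte.py hunk
    # reads chunk[0] for each chunk's text.
    TextChunk = tuple[str, tuple[int, int]]


    def identity_chunker(text: str) -> list[TextChunk]:
      """Return the whole document as a single chunk spanning the full text."""
      return [(text, (0, len(text)))]

With that shape, `chunked_compute_embedding` sees exactly one chunk per document whenever `self._split` is false, so each document is embedded whole instead of in spaCy-clustered sentence chunks.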