diff --git a/lilac/data/clustering.py b/lilac/data/clustering.py index 7530aafe..dff1f866 100644 --- a/lilac/data/clustering.py +++ b/lilac/data/clustering.py @@ -53,7 +53,7 @@ UMAP_DIM = 5 UMAP_SEED = 42 HDBSCAN_SELECTION_EPS = 0.05 -BATCH_SOFT_CLUSTER_NOISE = 1024 +BATCH_SOFT_CLUSTER_NOISE = 512 def cluster_impl( @@ -68,6 +68,7 @@ def cluster_impl( task_id: Optional[TaskId] = None, recompute_titles: bool = False, batch_size_titling: Optional[int] = None, + skip_noisy_assignment: bool = False, ) -> None: """Compute clusters for a field of the dataset.""" topic_fn = topic_fn or generate_title_openai @@ -108,6 +109,13 @@ def cluster_impl( else: raise ValueError('input must be provided.') + if use_garden and skip_noisy_assignment: + raise ValueError( + '`use_garden` and `skip_noisy_assignment` cannot both be True. ' + 'The garden implementation is heavily optimizied and will always ' + 'assign noisy points to the nearest cluster.' + ) + # Extract the text from the input path into a temporary column. 
TEXT_COLUMN = 'text' temp_text_path = (*cluster_output_path, TEXT_COLUMN) @@ -154,7 +162,12 @@ def cluster_documents(items: Iterator[Item]) -> Iterator[Item]: cluster_items = sparse_to_dense_compute( docs, lambda x: _hdbscan_cluster( - x, min_cluster_size, use_garden, num_docs=total_len, task_info=task_info + x, + min_cluster_size, + use_garden, + num_docs=total_len, + task_info=task_info, + skip_noisy_assignment=skip_noisy_assignment, ), ) for item, cluster_item in zip(items2, cluster_items): @@ -208,7 +221,13 @@ def cluster_titles(items: Iterator[Item]) -> Iterator[Item]: items, items2 = itertools.tee(items) docs = (item.get(CLUSTER_TITLE) for item in items) cluster_items = sparse_to_dense_compute( - docs, lambda x: _hdbscan_cluster(x, MIN_CLUSTER_SIZE_CATEGORY, use_garden) + docs, + lambda x: _hdbscan_cluster( + x, + MIN_CLUSTER_SIZE_CATEGORY, + use_garden=use_garden, + skip_noisy_assignment=skip_noisy_assignment, + ), ) for item, cluster_item in zip(items2, cluster_items): item[CATEGORY_ID] = (cluster_item or {}).get(CLUSTER_ID, -1) @@ -298,6 +317,7 @@ def _hdbscan_cluster( use_garden: bool = False, num_docs: Optional[int] = None, task_info: Optional[TaskInfo] = None, + skip_noisy_assignment: bool = False, ) -> Iterator[Item]: """Cluster docs with HDBSCAN.""" if use_garden: @@ -338,9 +358,9 @@ def _hdbscan_cluster( from umap import UMAP dim = all_vectors[0].size - with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'): - n_neighbors = min(30, len(all_vectors) - 1) - if UMAP_DIM < dim and UMAP_DIM < len(all_vectors): + n_neighbors = min(30, len(all_vectors) - 1) + if UMAP_DIM < dim and UMAP_DIM < len(all_vectors): + with DebugTimer(f'UMAP: Reducing dim from {dim} to {UMAP_DIM} of {len(all_vectors)} vectors'): reducer = UMAP( n_components=UMAP_DIM, n_neighbors=n_neighbors, @@ -375,14 +395,13 @@ def _hdbscan_cluster( if cluster_id == -1: noisy_vectors.append(all_vectors[i]) num_noisy = len(noisy_vectors) - perc_noisy = 100 * 
num_noisy / len(clusterer.labels_) - log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.') - noisy_labels: list[np.ndarray] = [] noisy_probs: list[np.ndarray] = [] labels = clusterer.labels_ memberships = clusterer.probabilities_ - if num_noisy > 0 and num_noisy < len(clusterer.labels_): + if not skip_noisy_assignment and num_noisy > 0 and num_noisy < len(clusterer.labels_): + perc_noisy = 100 * num_noisy / len(clusterer.labels_) + log(f'{num_noisy} noise points ({perc_noisy:.1f}%) will be assigned to nearest cluster.') with DebugTimer('HDBSCAN: Computing membership for the noise points'): for batch_noisy_vectors in chunks(noisy_vectors, BATCH_SOFT_CLUSTER_NOISE): batch_noisy_vectors = np.array(batch_noisy_vectors, dtype=np.float32) diff --git a/lilac/data/clustering_test.py b/lilac/data/clustering_test.py index 488e81a4..87d12a2b 100644 --- a/lilac/data/clustering_test.py +++ b/lilac/data/clustering_test.py @@ -62,9 +62,9 @@ def compute(docs: list[str]) -> list[Item]: if 'summar' in doc or 'hello' in doc or 'greeting' in doc: result.append([chunk_embedding(0, len(doc), np.array([1, 1, 1]))]) elif 'simpl' in doc or 'whats' in doc or 'time' in doc: - result.append([chunk_embedding(0, len(doc), np.array([0, 0, 0]))]) + result.append([chunk_embedding(0, len(doc), np.array([-1, -1, -1]))]) else: - result.append([chunk_embedding(0, len(doc), np.array([0.5, 0.5, 0.5]))]) + result.append([chunk_embedding(0, len(doc), np.array([100, 0, -100]))]) return result mocker.patch.object(JinaV2Small, 'compute', side_effect=compute) @@ -718,3 +718,93 @@ def topic_fn(docs: list[tuple[str, float]]) -> str: }, }, ] + + +def test_clusters_skip_noisy_assignment( + make_test_data: TestDataMaker, mocker: MockerFixture +) -> None: + texts: list[str] = [ + 'Can you summarize this article', + 'Can you rewrite this in a simpler way', + 'Can you provide a short summary of the following text', + 'Can you simplify this text', + 'Hello world', + ] + dataset = 
make_test_data([{'text': t} for t in texts]) + + def topic_fn(docs: list[tuple[str, float]]) -> str: + if 'summar' in docs[0][0]: + return 'summarization' + elif 'simpl' in docs[0][0]: + return 'simplification' + return 'other' + + mocker.patch.object(clustering, 'MIN_CLUSTER_SIZE_CATEGORY', 2) + _mock_jina(mocker) + + dataset.cluster( + 'text', + min_cluster_size=2, + topic_fn=topic_fn, + category_fn=lambda _: 'MockCategory', + skip_noisy_assignment=True, + ) + + rows = list(dataset.select_rows(['text', 'text__cluster'], combine_columns=True)) + assert rows == [ + { + 'text': 'Can you summarize this article', + 'text__cluster': { + 'cluster_id': 0, + 'cluster_membership_prob': 1.0, + 'cluster_title': 'summarization', + 'category_id': 0, + 'category_membership_prob': 1.0, + 'category_title': 'MockCategory', + }, + }, + { + 'text': 'Can you rewrite this in a simpler way', + 'text__cluster': { + 'cluster_id': 1, + 'cluster_membership_prob': 1.0, + 'cluster_title': 'simplification', + 'category_id': 1, + 'category_membership_prob': 1.0, + 'category_title': 'MockCategory', + }, + }, + { + 'text': 'Can you provide a short summary of the following text', + 'text__cluster': { + 'cluster_id': 0, + 'cluster_membership_prob': 1.0, + 'cluster_title': 'summarization', + 'category_id': 0, + 'category_membership_prob': 1.0, + 'category_title': 'MockCategory', + }, + }, + { + 'text': 'Can you simplify this text', + 'text__cluster': { + 'cluster_id': 1, + 'cluster_membership_prob': 1.0, + 'cluster_title': 'simplification', + 'category_id': 1, + 'category_membership_prob': 1.0, + 'category_title': 'MockCategory', + }, + }, + { + 'text': 'Hello world', + 'text__cluster': { + 'cluster_id': -1, + 'cluster_membership_prob': 0.0, + 'cluster_title': None, + 'category_id': -1, + 'category_membership_prob': 0.0, + 'category_title': None, + }, + }, + ] diff --git a/lilac/data/dataset.py b/lilac/data/dataset.py index ec7945d8..874d3138 100644 --- a/lilac/data/dataset.py +++ 
b/lilac/data/dataset.py @@ -506,6 +506,7 @@ def cluster( task_id: Optional[TaskId] = None, # TODO(0.4.0): colocate with topic_fn. category_fn: Optional[TopicFn] = None, + skip_noisy_assignment: bool = False, ) -> None: """Compute clusters for a field of the dataset. @@ -524,6 +525,9 @@ def cluster( of the task. category_fn: A function that returns a category for a set of related titles. It takes a list of (doc, membership_score) tuples and returns a single category name. + skip_noisy_assignment: If true, noisy points will not be assigned to the nearest cluster. + This only has an effect when the clustering is done locally (use_garden=False) and will + speed up clustering. """ pass diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py index 78bcdf5c..3827d063 100644 --- a/lilac/data/dataset_duckdb.py +++ b/lilac/data/dataset_duckdb.py @@ -3334,6 +3334,7 @@ def cluster( use_garden: bool = False, task_id: Optional[TaskId] = None, category_fn: Optional[TopicFn] = cluster_titling.generate_category_openai, + skip_noisy_assignment: bool = False, ) -> None: topic_fn = topic_fn or cluster_titling.generate_title_openai category_fn = category_fn or cluster_titling.generate_category_openai @@ -3347,6 +3348,7 @@ def cluster( overwrite=overwrite, use_garden=use_garden, task_id=task_id, + skip_noisy_assignment=skip_noisy_assignment, ) @override @@ -3950,6 +3952,7 @@ def _auto_bins(stats: StatsResult) -> list[Bin]: return [('0', const_val, None)] is_integer = stats.value_samples and all(isinstance(val, int) for val in stats.value_samples) + def _round(value: float) -> float: # Select a round ndigits as a function of the value range. We offset it by 2 to allow for some # decimal places as a function of the range. 
diff --git a/lilac/router_dataset_signals.py b/lilac/router_dataset_signals.py index 4c4f4d8d..da7e27c8 100644 --- a/lilac/router_dataset_signals.py +++ b/lilac/router_dataset_signals.py @@ -91,6 +91,11 @@ class ClusterOptions(BaseModel): use_garden: bool = PydanticField( default=False, description='Accelerate computation by running remotely on Lilac Garden.' ) + skip_noisy_assignment: bool = PydanticField( + default=False, + description='Skip assignment of noisy points to the nearest cluster to speed up clustering.', + ) + overwrite: bool = False @@ -145,6 +150,7 @@ def run() -> None: use_garden=options.use_garden, overwrite=options.overwrite, task_id=task_id, + skip_noisy_assignment=options.skip_noisy_assignment, ) launch_task(task_id, run) diff --git a/web/blueprint/src/lib/components/ComputeClusterModal.svelte b/web/blueprint/src/lib/components/ComputeClusterModal.svelte index 79d553d3..69fc0dcf 100644 --- a/web/blueprint/src/lib/components/ComputeClusterModal.svelte +++ b/web/blueprint/src/lib/components/ComputeClusterModal.svelte @@ -7,6 +7,7 @@ input: Path; output_path?: Path; use_garden?: boolean; + skip_noisy_assignment?: boolean; overwrite?: boolean; }; @@ -101,7 +102,8 @@ use_garden: options.use_garden, output_path: outputColumn, input_selector: selectedFormatSelector, - overwrite: options.overwrite + overwrite: options.overwrite, + skip_noisy_assignment: options.skip_noisy_assignment } ]); close(); @@ -173,6 +175,20 @@ {/if} + +
+
Skip noisy assignment
+
+ Skip assignment of noisy points to the nearest cluster to speed up clustering. +
+ +
+
Overwrite
diff --git a/web/lib/fastapi_client/models/ClusterOptions.ts b/web/lib/fastapi_client/models/ClusterOptions.ts index a5b200fe..c7471b9f 100644 --- a/web/lib/fastapi_client/models/ClusterOptions.ts +++ b/web/lib/fastapi_client/models/ClusterOptions.ts @@ -14,6 +14,10 @@ export type ClusterOptions = { * Accelerate computation by running remotely on Lilac Garden. */ use_garden?: boolean; + /** + * Skip assignment of noisy points to the nearest cluster to speed up clustering. + */ + skip_noisy_assignment?: boolean; overwrite?: boolean; };