From 5baaca69360bdc3dfec7636e0deae8586d6c54ad Mon Sep 17 00:00:00 2001
From: william ghysels
Date: Sun, 18 Jan 2026 17:46:07 +0100
Subject: [PATCH] Remove UMAP tooling from core

Drop visualize_umap.py and related deps/docs so core stays focused on scan/search.
---
.github/workflows/ci.yml | 2 +-
CONTRIBUTING.md | 2 +-
README.md | 37 +----
config.json.example | 5 +-
requirements.txt | 3 -
visualize_umap.py | 337 ---------------------------------------
6 files changed, 11 insertions(+), 375 deletions(-)
delete mode 100644 visualize_umap.py
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index adaa30b..f565527 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,4 +14,4 @@ jobs:
python-version: "3.11"
- name: Compile Python (syntax check)
run: |
- python -m py_compile image_database.py visualize_umap.py
+ python -m py_compile image_database.py
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2718794..3d5d778 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -10,4 +10,4 @@ Please open a pull request for any change.
### Checks
-CI runs a lightweight syntax check (`python -m py_compile`) on `image_database.py` and `visualize_umap.py`.
+CI runs a lightweight syntax check (`python -m py_compile`) on `image_database.py`.
diff --git a/README.md b/README.md
index 5f6e96c..b0f6ffc 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,6 @@ A searchable image database using SigLIP 2 (CLIP) embeddings and SQLite-vec for
- **Image Search**: Find similar images using a reference image
- **Combined Search**: Combine text and image queries with weighted blending
- **Interactive Mode**: Load model once and run multiple queries
-- **3D Visualization**: UMAP-based 3D visualization of image embeddings with clustering
- **HTML Gallery**: Beautiful search results with image previews and direct file access

@@ -34,7 +33,7 @@ cd CLIP-database
2. Install dependencies:
```bash
-cd github
+cd core
pip install -r requirements.txt
```
@@ -53,7 +52,7 @@ pip install sqlite-vec
Scan a directory and build the image database:
```bash
-cd github
+cd core
python image_database.py scan /path/to/images --db "/path/to/database.db"
```
@@ -64,7 +63,7 @@ Options:
- `--limit`: Limit number of images to process (for testing)
```bash
-cd github
+cd core
python image_database.py scan /path/to/images --batch-size 75 --inference-batch-size 16 --profile --limit 100
```
@@ -72,7 +71,7 @@ python image_database.py scan /path/to/images --batch-size 75 --inference-batch-
#### Text Search
```bash
-cd github
+cd core
python image_database.py search "a red car" -k 20 --db "/path/to/database.db"
```
@@ -83,25 +82,25 @@ python image_database.py search "a red car" --db "/path/to/database.db" -k 20
#### Image Search
```bash
-cd github
+cd core
python image_database.py search /path/to/image.jpg --image -k 20
```
#### Combined Search
```bash
-cd github
+cd core
python image_database.py search "sunset" --query2 /path/to/image.jpg --weights 0.7 0.3 -k 20
```
#### Negative Prompts
```bash
-cd github
+cd core
python image_database.py search "nature" --negative "buildings" -k 20
```
#### Interactive Mode
```bash
-cd github
+cd core
python image_database.py search --interactive
```
@@ -114,25 +113,6 @@ In interactive mode:
- Type `quit` or `exit` to end session
-### 3D Visualization
-
-Generate a UMAP 3D visualization of all image embeddings:
-
-```bash
-cd github
-python visualize_umap.py
-```
-
-This will:
-1. Load embeddings from the database
-2. Compute UMAP projections (cached for future runs)
-3. Cluster embeddings for color coding
-4. Generate an interactive HTML visualization
-
-Open the generated HTML file in your browser and click on points to see image previews.
-
-**Note:** The HTML results include "Open Image" and "Open Folder" links that use the `localexplorer:` protocol. To use these links, install a browser extension like [Local Explorer](https://chrome.google.com/webstore/detail/local-explorer/llbiblehpbpeflfgjcdfcpcakjhddedi) for Chrome/Edge. Without the extension, images will still display, but the file/folder links won't work.
-
## Model
This project uses [SigLIP 2 SO400M](https://huggingface.co/google/siglip2-so400m-patch14-224) from Google, which provides:
@@ -154,7 +134,6 @@ The SQLite database contains:
- Use `--inference-batch-size` to optimize GPU memory usage
- Enable `--profile` to identify bottlenecks
- The database uses WAL mode for better concurrent access
-- UMAP projections are cached to avoid recomputation
## License
diff --git a/config.json.example b/config.json.example
index 6488553..7ca1ec4 100644
--- a/config.json.example
+++ b/config.json.example
@@ -2,8 +2,5 @@
"database_dir": "C:\\MyExampleProject",
"model_cache_dir": "C:\\MyExampleProject\\models",
"results_dir": "results",
- "thumbnails_dir": "thumbnails",
- "umap_output_file": "umap_3d_visualization.html",
- "umap_cache_file": "umap_projections_cache.pkl",
- "umap_metadata_file": "umap_image_metadata.json"
+ "thumbnails_dir": "thumbnails"
}
diff --git a/requirements.txt b/requirements.txt
index a766da6..8e498c6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,4 @@ tqdm>=4.66.0
numpy>=1.24.0
sqlite-vec>=0.0.1
sentencepiece>=0.1.99
-umap-learn>=0.5.5
-plotly>=5.18.0
-pandas>=2.0.0
PyMuPDF>=1.23.0
\ No newline at end of file
diff --git a/visualize_umap.py b/visualize_umap.py
deleted file mode 100644
index 48ea33b..0000000
--- a/visualize_umap.py
+++ /dev/null
@@ -1,337 +0,0 @@
-#!/usr/bin/env python3
-"""
-Generate a UMAP 3D visualization of image embeddings with clustering and image preview
-
-NOTE: This code was generated with AI assistance.
-"""
-
-import sqlite3
-import sqlite_vec
-import numpy as np
-import pandas as pd
-import umap
-import plotly.express as px
-from sklearn.cluster import KMeans
-from tqdm import tqdm
-import pickle
-import os
-import json
-from pathlib import Path
-from typing import Dict
-
-# ============================================================================
-# Configuration Loading - Reads from config.json in project root
-# ============================================================================
-def load_config() -> Dict[str, str]:
- """Load configuration from config.json in the same directory as the script."""
- script_dir = Path(__file__).parent.absolute()
- config_path = script_dir / "config.json"
-
- if config_path.exists():
- try:
- with open(config_path, 'r', encoding='utf-8') as f:
- config = json.load(f)
- return config
- except Exception as e:
- print(f"Warning: Could not load config.json: {e}")
- print("Using default configuration.")
-
- # Default configuration if config.json doesn't exist
- return {
- "database_path": "image_database.db",
- "umap_output_file": "umap_3d_visualization.html",
- "umap_cache_file": "umap_projections_cache.pkl",
- "umap_metadata_file": "umap_image_metadata.json"
- }
-
-def resolve_path(config_path: str, base_dir: Path) -> str:
- """Resolve a path from config - use as-is if absolute, otherwise join with base directory."""
- if not config_path:
- return ""
- path = Path(config_path)
- # If path is already absolute, use it as-is
- if path.is_absolute():
- return str(path)
- # Otherwise, join with base directory (parent of code folder for outputs)
- return str(base_dir / path)
-
-# Load configuration
-_CONFIG = load_config()
-# For outputs, use parent directory. For absolute paths in config, they'll be used as-is.
-_OUTPUT_BASE = Path(__file__).parent.absolute().parent
-
-# Configuration variables (paths relative to output base, or absolute if specified)
-DB_PATH = resolve_path(_CONFIG.get("database_path", "image_database.db"), _OUTPUT_BASE)
-OUTPUT_HTML_FILE = resolve_path(_CONFIG.get("umap_output_file", "umap_3d_visualization.html"), _OUTPUT_BASE)
-UMAP_CACHE_FILE = resolve_path(_CONFIG.get("umap_cache_file", "umap_projections_cache.pkl"), _OUTPUT_BASE)
-IMAGE_METADATA_FILE = resolve_path(_CONFIG.get("umap_metadata_file", "umap_image_metadata.json"), _OUTPUT_BASE)
-# ============================================================================
-
-print("Loading embeddings from database...")
-conn = sqlite3.connect(DB_PATH, timeout=30.0)
-conn.execute("PRAGMA journal_mode=WAL")
-conn.enable_load_extension(True)
-sqlite_vec.load(conn)
-cursor = conn.cursor()
-
-# Get all embeddings with their file paths
-cursor.execute("""
- SELECT
- i.file_path,
- vec0.embedding
- FROM vec0
- JOIN image_embeddings ie ON vec0.rowid = ie.rowid
- JOIN images i ON ie.image_id = i.id
-""")
-
-print("Fetching all embeddings...")
-all_results = cursor.fetchall()
-conn.close()
-
-if not all_results:
- print("No embeddings found in database!")
- exit(1)
-
-print(f"Found {len(all_results):,} embeddings")
-
-# Extract vectors and paths
-print("Converting embeddings to numpy arrays...")
-all_vectors = []
-all_image_paths = []
-
-for file_path, emb_data in tqdm(all_results, desc="Processing embeddings"):
- try:
- # Handle both binary and JSON formats
- if isinstance(emb_data, bytes):
- # Try binary format first
- try:
- emb = np.frombuffer(emb_data, dtype=np.float32)
- if emb.shape[0] != 1152:
- # Wrong size, might be JSON string encoded as bytes
- emb = np.array(json.loads(emb_data.decode('utf-8')), dtype=np.float32)
- except:
- # Not binary, try JSON
- emb = np.array(json.loads(emb_data.decode('utf-8')), dtype=np.float32)
- elif isinstance(emb_data, str):
- # JSON string format
- emb = np.array(json.loads(emb_data), dtype=np.float32)
- else:
- continue
-
- if emb.shape[0] == 1152: # Verify dimension
- all_vectors.append(emb)
- all_image_paths.append(file_path)
- except Exception as e:
- print(f"Error processing {file_path}: {e}")
- continue
-
-if not all_vectors:
- print("No valid embeddings found!")
- exit(1)
-
-all_vectors = np.array(all_vectors)
-print(f"Embedding matrix shape: {all_vectors.shape}")
-
-# Fit UMAP (Takes ~5-10 mins for 100k points on a good CPU)
-print("\nFitting UMAP reducer (this may take a while)...")
-reducer = umap.UMAP(
- n_neighbors=15, # Balance between local and global structure
- min_dist=0.1, # How tightly points pack together
- n_components=3, # For a 3D plot
- metric='cosine', # Best for SigLIP/CLIP vectors
- verbose=True
-)
-
-# Check if we have saved UMAP projections
-use_cache = False
-
-if os.path.exists(UMAP_CACHE_FILE):
- print(f"\nLoading cached UMAP projections from {UMAP_CACHE_FILE}...")
- try:
- with open(UMAP_CACHE_FILE, 'rb') as f:
- cache_data = pickle.load(f)
- if (cache_data.get('vectors_hash') == hash(all_vectors.tobytes()) and
- len(cache_data.get('projections', [])) == len(all_vectors)):
- projections = cache_data['projections']
- print("Using cached UMAP projections (no recalculation needed!)")
- use_cache = True
- else:
- print("Cache outdated, will recalculate UMAP...")
- except Exception as e:
- print(f"Error loading cache: {e}, will recalculate UMAP...")
-
-if not use_cache:
- print("\nFitting UMAP reducer (this may take a while)...")
- projections = reducer.fit_transform(all_vectors)
- print(f"UMAP projections shape: {projections.shape}")
- # Save for future use
- print(f"Saving UMAP projections to {UMAP_CACHE_FILE} for future use...")
- with open(UMAP_CACHE_FILE, 'wb') as f:
- pickle.dump({
- 'projections': projections,
- 'vectors_hash': hash(all_vectors.tobytes())
- }, f)
-
-# Cluster the embeddings for coloring (fast, separate from UMAP)
-# Note: Clustering happens in 1152D space, but visualization is in 3D UMAP space.
-# This means clusters may appear intermingled because:
-# 1. UMAP preserves local structure but not global cluster boundaries
-# 2. 3D projection loses information from 1152 dimensions
-# 3. Points close in 1152D might be far in 3D, and vice versa
-# This is normal and expected behavior!
-print("\nClustering embeddings for color coding...")
-n_clusters = min(20, len(all_vectors) // 100) # Adaptive number of clusters
-if n_clusters < 2:
- n_clusters = 2
-
-kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-cluster_labels = kmeans.fit_predict(all_vectors)
-print(f"Created {n_clusters} clusters")
-print("Note: Clusters are computed in 1152D embedding space, but visualized in 3D UMAP space.")
-print(" Some intermingling is expected - clusters represent semantic similarity in high-D space.")
-
-# Create DataFrame
-print("\nCreating 3D visualization...")
-df = pd.DataFrame(projections, columns=['x', 'y', 'z'])
-df['path'] = all_image_paths
-df['cluster'] = cluster_labels
-df['filename'] = df['path'].apply(lambda p: p.split('\\')[-1] if '\\' in p else p.split('/')[-1])
-# Convert file paths to file:// URLs for direct image display
-df['image_url'] = df['path'].apply(lambda p: f"file:///{p.replace(chr(92), '/')}")
-
-# Create 3D scatter plot with cluster colors
-fig = px.scatter_3d(
- df,
- x='x',
- y='y',
- z='z',
- color='cluster',
- color_discrete_sequence=px.colors.qualitative.Set3,
- hover_name='filename',
- hover_data={'path': True, 'cluster': True, 'filename': False},
- title=f"UMAP Photo Universe ({len(df):,} images, {n_clusters} clusters)",
- opacity=0.6,
- labels={'x': 'UMAP 1', 'y': 'UMAP 2', 'z': 'UMAP 3', 'cluster': 'Cluster'},
- custom_data=['path', 'image_url', 'filename']
-)
-
-# Add JavaScript for sidebar image display
-fig.update_layout(
- scene=dict(
- xaxis_title='UMAP 1',
- yaxis_title='UMAP 2',
- zaxis_title='UMAP 3',
- ),
- width=1400,
- height=800
-)
-
-# Save image metadata to separate JSON file (much smaller)
-print(f"Saving image metadata to {IMAGE_METADATA_FILE}...")
-image_metadata = df[['path', 'image_url', 'filename']].to_dict('records')
-with open(IMAGE_METADATA_FILE, 'w', encoding='utf-8') as f:
- json.dump(image_metadata, f)
-
-# Use Plotly's efficient HTML writer (uses CDN, doesn't embed plotly.js)
-print(f"Generating HTML with Plotly's efficient writer...")
-fig.write_html(
- OUTPUT_HTML_FILE,
- include_plotlyjs='cdn', # Use CDN instead of embedding (much smaller file)
- div_id='plot'
-)
-
-# Post-process: add sidebar and load metadata from external JSON
-print("Adding custom sidebar functionality...")
-with open(OUTPUT_HTML_FILE, 'r', encoding='utf-8') as f:
- html = f.read()
-
-# Add CSS before
-css = """
-"""
-html = html.replace('', css + '')
-
-# Wrap plot div in container and add sidebar before
)
-html = html.replace('', '\\n ')
-# Add sidebar before closing body
-html = html.replace('', sidebar_html + '')
-
-# Add JavaScript before