diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index adaa30b..f565527 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,4 +14,4 @@ jobs: python-version: "3.11" - name: Compile Python (syntax check) run: | - python -m py_compile image_database.py visualize_umap.py + python -m py_compile image_database.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2718794..3d5d778 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,4 +10,4 @@ Please open a pull request for any change. ### Checks -CI runs a lightweight syntax check (`python -m py_compile`) on `image_database.py` and `visualize_umap.py`. +CI runs a lightweight syntax check (`python -m py_compile`) on `image_database.py`. diff --git a/README.md b/README.md index 5f6e96c..b0f6ffc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ A searchable image database using SigLIP 2 (CLIP) embeddings and SQLite-vec for - **Image Search**: Find similar images using a reference image - **Combined Search**: Combine text and image queries with weighted blending - **Interactive Mode**: Load model once and run multiple queries -- **3D Visualization**: UMAP-based 3D visualization of image embeddings with clustering - **HTML Gallery**: Beautiful search results with image previews and direct file access  @@ -34,7 +33,7 @@ cd CLIP-database 2. Install dependencies: ```bash -cd github +cd core pip install -r requirements.txt ``` @@ -53,7 +52,7 @@ pip install sqlite-vec Scan a directory and build the image database: ```bash -cd github +cd core python image_database.py scan /path/to/images --db "/path/to/database.db" ``` @@ -64,7 +63,7 @@ Options: - `--limit`: Limit number of images to process (for testing) ```bash -cd github +cd core python image_database.py scan /path/to/images --batch-size 75 --inference-batch-size 16 --profile --limit 100 ``` @@ -72,7 +71,7 @@ python image_database.py scan /path/to/images --batch-size 75 --inference-batch- #### Text Search ```bash -cd github +cd core python image_database.py search "a red car" -k 20 --db "/path/to/database.db" ``` @@ -83,25 +82,25 @@ python image_database.py search "a red car" --db "/path/to/database.db" -k 20 #### Image Search ```bash -cd github +cd core python image_database.py search /path/to/image.jpg --image -k 20 ``` #### Combined Search ```bash -cd github +cd core python image_database.py search "sunset" --query2 /path/to/image.jpg --weights 0.7 0.3 -k 20 ``` #### Negative Prompts ```bash -cd github +cd core python image_database.py search "nature" --negative "buildings" -k 20 ``` #### Interactive Mode ```bash -cd github +cd core python image_database.py search --interactive ``` @@ -114,25 +113,6 @@ In interactive mode: - Type `quit` or `exit` to end session -### 3D Visualization - -Generate a UMAP 3D visualization of all image embeddings: - -```bash -cd github -python visualize_umap.py -``` - -This will: -1. Load embeddings from the database -2. Compute UMAP projections (cached for future runs) -3. Cluster embeddings for color coding -4. Generate an interactive HTML visualization - -Open the generated HTML file in your browser and click on points to see image previews. - -**Note:** The HTML results include "Open Image" and "Open Folder" links that use the `localexplorer:` protocol. To use these links, install a browser extension like [Local Explorer](https://chrome.google.com/webstore/detail/local-explorer/llbiblehpbpeflfgjcdfcpcakjhddedi) for Chrome/Edge. Without the extension, images will still display, but the file/folder links won't work. - ## Model This project uses [SigLIP 2 SO400M](https://huggingface.co/google/siglip2-so400m-patch14-224) from Google, which provides: @@ -154,7 +134,6 @@ The SQLite database contains: - Use `--inference-batch-size` to optimize GPU memory usage - Enable `--profile` to identify bottlenecks - The database uses WAL mode for better concurrent access -- UMAP projections are cached to avoid recomputation ## License diff --git a/config.json.example b/config.json.example index 6488553..7ca1ec4 100644 --- a/config.json.example +++ b/config.json.example @@ -2,8 +2,5 @@ "database_dir": "C:\\MyExampleProject", "model_cache_dir": "C:\\MyExampleProject\\models", "results_dir": "results", - "thumbnails_dir": "thumbnails", - "umap_output_file": "umap_3d_visualization.html", - "umap_cache_file": "umap_projections_cache.pkl", - "umap_metadata_file": "umap_image_metadata.json" + "thumbnails_dir": "thumbnails" } diff --git a/requirements.txt b/requirements.txt index a766da6..8e498c6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,4 @@ tqdm>=4.66.0 numpy>=1.24.0 sqlite-vec>=0.0.1 sentencepiece>=0.1.99 -umap-learn>=0.5.5 -plotly>=5.18.0 -pandas>=2.0.0 PyMuPDF>=1.23.0 \ No newline at end of file diff --git a/visualize_umap.py b/visualize_umap.py deleted file mode 100644 index 48ea33b..0000000 --- a/visualize_umap.py +++ /dev/null @@ -1,337 +0,0 @@ -#!/usr/bin/env python3 -""" -Generate a UMAP 3D visualization of image embeddings with clustering and image preview - -NOTE: This code was generated with AI assistance. -""" - -import sqlite3 -import sqlite_vec -import numpy as np -import pandas as pd -import umap -import plotly.express as px -from sklearn.cluster import KMeans -from tqdm import tqdm -import pickle -import os -import json -from pathlib import Path -from typing import Dict - -# ============================================================================ -# Configuration Loading - Reads from config.json in project root -# ============================================================================ -def load_config() -> Dict[str, str]: - """Load configuration from config.json in the same directory as the script.""" - script_dir = Path(__file__).parent.absolute() - config_path = script_dir / "config.json" - - if config_path.exists(): - try: - with open(config_path, 'r', encoding='utf-8') as f: - config = json.load(f) - return config - except Exception as e: - print(f"Warning: Could not load config.json: {e}") - print("Using default configuration.") - - # Default configuration if config.json doesn't exist - return { - "database_path": "image_database.db", - "umap_output_file": "umap_3d_visualization.html", - "umap_cache_file": "umap_projections_cache.pkl", - "umap_metadata_file": "umap_image_metadata.json" - } - -def resolve_path(config_path: str, base_dir: Path) -> str: - """Resolve a path from config - use as-is if absolute, otherwise join with base directory.""" - if not config_path: - return "" - path = Path(config_path) - # If path is already absolute, use it as-is - if path.is_absolute(): - return str(path) - # Otherwise, join with base directory (parent of code folder for outputs) - return str(base_dir / path) - -# Load configuration -_CONFIG = load_config() -# For outputs, use parent directory. For absolute paths in config, they'll be used as-is. -_OUTPUT_BASE = Path(__file__).parent.absolute().parent - -# Configuration variables (paths relative to output base, or absolute if specified) -DB_PATH = resolve_path(_CONFIG.get("database_path", "image_database.db"), _OUTPUT_BASE) -OUTPUT_HTML_FILE = resolve_path(_CONFIG.get("umap_output_file", "umap_3d_visualization.html"), _OUTPUT_BASE) -UMAP_CACHE_FILE = resolve_path(_CONFIG.get("umap_cache_file", "umap_projections_cache.pkl"), _OUTPUT_BASE) -IMAGE_METADATA_FILE = resolve_path(_CONFIG.get("umap_metadata_file", "umap_image_metadata.json"), _OUTPUT_BASE) -# ============================================================================ - -print("Loading embeddings from database...") -conn = sqlite3.connect(DB_PATH, timeout=30.0) -conn.execute("PRAGMA journal_mode=WAL") -conn.enable_load_extension(True) -sqlite_vec.load(conn) -cursor = conn.cursor() - -# Get all embeddings with their file paths -cursor.execute(""" - SELECT - i.file_path, - vec0.embedding - FROM vec0 - JOIN image_embeddings ie ON vec0.rowid = ie.rowid - JOIN images i ON ie.image_id = i.id -""") - -print("Fetching all embeddings...") -all_results = cursor.fetchall() -conn.close() - -if not all_results: - print("No embeddings found in database!") - exit(1) - -print(f"Found {len(all_results):,} embeddings") - -# Extract vectors and paths -print("Converting embeddings to numpy arrays...") -all_vectors = [] -all_image_paths = [] - -for file_path, emb_data in tqdm(all_results, desc="Processing embeddings"): - try: - # Handle both binary and JSON formats - if isinstance(emb_data, bytes): - # Try binary format first - try: - emb = np.frombuffer(emb_data, dtype=np.float32) - if emb.shape[0] != 1152: - # Wrong size, might be JSON string encoded as bytes - emb = np.array(json.loads(emb_data.decode('utf-8')), dtype=np.float32) - except: - # Not binary, try JSON - emb = np.array(json.loads(emb_data.decode('utf-8')), dtype=np.float32) - elif isinstance(emb_data, str): - # JSON string format - emb = np.array(json.loads(emb_data), dtype=np.float32) - else: - continue - - if emb.shape[0] == 1152: # Verify dimension - all_vectors.append(emb) - all_image_paths.append(file_path) - except Exception as e: - print(f"Error processing {file_path}: {e}") - continue - -if not all_vectors: - print("No valid embeddings found!") - exit(1) - -all_vectors = np.array(all_vectors) -print(f"Embedding matrix shape: {all_vectors.shape}") - -# Fit UMAP (Takes ~5-10 mins for 100k points on a good CPU) -print("\nFitting UMAP reducer (this may take a while)...") -reducer = umap.UMAP( - n_neighbors=15, # Balance between local and global structure - min_dist=0.1, # How tightly points pack together - n_components=3, # For a 3D plot - metric='cosine', # Best for SigLIP/CLIP vectors - verbose=True -) - -# Check if we have saved UMAP projections -use_cache = False - -if os.path.exists(UMAP_CACHE_FILE): - print(f"\nLoading cached UMAP projections from {UMAP_CACHE_FILE}...") - try: - with open(UMAP_CACHE_FILE, 'rb') as f: - cache_data = pickle.load(f) - if (cache_data.get('vectors_hash') == hash(all_vectors.tobytes()) and - len(cache_data.get('projections', [])) == len(all_vectors)): - projections = cache_data['projections'] - print("Using cached UMAP projections (no recalculation needed!)") - use_cache = True - else: - print("Cache outdated, will recalculate UMAP...") - except Exception as e: - print(f"Error loading cache: {e}, will recalculate UMAP...") - -if not use_cache: - print("\nFitting UMAP reducer (this may take a while)...") - projections = reducer.fit_transform(all_vectors) - print(f"UMAP projections shape: {projections.shape}") - # Save for future use - print(f"Saving UMAP projections to {UMAP_CACHE_FILE} for future use...") - with open(UMAP_CACHE_FILE, 'wb') as f: - pickle.dump({ - 'projections': projections, - 'vectors_hash': hash(all_vectors.tobytes()) - }, f) - -# Cluster the embeddings for coloring (fast, separate from UMAP) -# Note: Clustering happens in 1152D space, but visualization is in 3D UMAP space. -# This means clusters may appear intermingled because: -# 1. UMAP preserves local structure but not global cluster boundaries -# 2. 3D projection loses information from 1152 dimensions -# 3. Points close in 1152D might be far in 3D, and vice versa -# This is normal and expected behavior! -print("\nClustering embeddings for color coding...") -n_clusters = min(20, len(all_vectors) // 100) # Adaptive number of clusters -if n_clusters < 2: - n_clusters = 2 - -kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) -cluster_labels = kmeans.fit_predict(all_vectors) -print(f"Created {n_clusters} clusters") -print("Note: Clusters are computed in 1152D embedding space, but visualized in 3D UMAP space.") -print(" Some intermingling is expected - clusters represent semantic similarity in high-D space.") - -# Create DataFrame -print("\nCreating 3D visualization...") -df = pd.DataFrame(projections, columns=['x', 'y', 'z']) -df['path'] = all_image_paths -df['cluster'] = cluster_labels -df['filename'] = df['path'].apply(lambda p: p.split('\\')[-1] if '\\' in p else p.split('/')[-1]) -# Convert file paths to file:// URLs for direct image display -df['image_url'] = df['path'].apply(lambda p: f"file:///{p.replace(chr(92), '/')}") - -# Create 3D scatter plot with cluster colors -fig = px.scatter_3d( - df, - x='x', - y='y', - z='z', - color='cluster', - color_discrete_sequence=px.colors.qualitative.Set3, - hover_name='filename', - hover_data={'path': True, 'cluster': True, 'filename': False}, - title=f"UMAP Photo Universe ({len(df):,} images, {n_clusters} clusters)", - opacity=0.6, - labels={'x': 'UMAP 1', 'y': 'UMAP 2', 'z': 'UMAP 3', 'cluster': 'Cluster'}, - custom_data=['path', 'image_url', 'filename'] -) - -# Add JavaScript for sidebar image display -fig.update_layout( - scene=dict( - xaxis_title='UMAP 1', - yaxis_title='UMAP 2', - zaxis_title='UMAP 3', - ), - width=1400, - height=800 -) - -# Save image metadata to separate JSON file (much smaller) -print(f"Saving image metadata to {IMAGE_METADATA_FILE}...") -image_metadata = df[['path', 'image_url', 'filename']].to_dict('records') -with open(IMAGE_METADATA_FILE, 'w', encoding='utf-8') as f: - json.dump(image_metadata, f) - -# Use Plotly's efficient HTML writer (uses CDN, doesn't embed plotly.js) -print(f"Generating HTML with Plotly's efficient writer...") -fig.write_html( - OUTPUT_HTML_FILE, - include_plotlyjs='cdn', # Use CDN instead of embedding (much smaller file) - div_id='plot' -) - -# Post-process: add sidebar and load metadata from external JSON -print("Adding custom sidebar functionality...") -with open(OUTPUT_HTML_FILE, 'r', encoding='utf-8') as f: - html = f.read() - -# Add CSS before -css = """ -""" -html = html.replace('', css + '') - -# Wrap plot div in container and add sidebar before
) -html = html.replace('', '\\n