6 changes: 6 additions & 0 deletions .env.example
@@ -0,0 +1,6 @@
# GitHub Token for Gist creation (Required for "Open in Colab")
# Create one at: https://github.com/settings/tokens (Scope: gist)
GITHUB_TOKEN=your_github_token_here

# Allowed CORS origins (comma separated)
ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000,http://127.0.0.1:8000
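For context, a minimal sketch of how these variables are typically consumed in a FastAPI backend, assuming `python-dotenv` and FastAPI's `CORSMiddleware`; the actual wiring in `api/main.py` may differ:

```python
# Sketch only: loading .env values and wiring CORS, under the assumptions above.
import os

from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

load_dotenv()  # reads .env from the current working directory

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")  # used for Gist creation ("Open in Colab")
ALLOWED_ORIGINS = [
    o.strip()
    for o in os.getenv("ALLOWED_ORIGINS", "http://localhost:8000").split(",")
    if o.strip()
]

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    allow_methods=["*"],
    allow_headers=["*"],
)
```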
76 changes: 76 additions & 0 deletions CLI_README.md
@@ -0,0 +1,76 @@
# TuneKit CLI Manager

Command-line tool for managing the models supported by TuneKit. It lets you add, list, and remove models from the configuration without editing code.

## Basic Usage

The script lives at `TuneKit/cli.py`. Run it from the project root.

### List Models

Shows all currently configured models.

```bash
python TuneKit/cli.py list
```

### Add a Model

Starts an interactive, "AI"-assisted wizard that guides you step by step.

```bash
python TuneKit/cli.py add
```

**Smart Features:**
- **Autocomplete from Hugging Face**: When you enter a model ID (e.g. `google/gemma-3-270m-it`), the CLI automatically queries the Hugging Face API (see the sketch at the end of this section) to fetch:
  - The context window.
  - A suggested friendly name.
- **Validation**: Prevents duplicates and empty fields.

The wizard will prompt you for:
1. **Metadata**: ID, name, size, recommended GPU.
2. **Scoring**: How well the model performs on different tasks (0-100).
3. **Reasoning**: Key characteristics to show the user.
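
A minimal sketch of such a Hugging Face lookup using `huggingface_hub`; this is illustrative, not the CLI's actual code, and the context-window field name varies by architecture:

```python
# Illustrative sketch of the Hugging Face metadata lookup; not the CLI's actual code.
import json

from huggingface_hub import hf_hub_download

model_id = "google/gemma-3-270m-it"  # example from above

# Download the model's config.json and read the context window from it.
config_path = hf_hub_download(repo_id=model_id, filename="config.json")
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)

# Many (not all) architectures expose the context window under this key.
context_window = config.get("max_position_embeddings")

# A simple friendly-name suggestion derived from the model ID.
friendly_name = model_id.split("/")[-1].replace("-", " ").title()
print(friendly_name, context_window)
```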

### Remove a Model

Removes an existing model by its unique key.

```bash
python TuneKit/cli.py remove <model_key>
```

Example:
```bash
python TuneKit/cli.py remove deepseek-1.3b
```

## Data Tools

### Data Enrichment (Enrich)

Automatically improves dataset quality through quality metrics, filtering, and class balancing (a toy illustration of the scoring idea appears at the end of this section).

```bash
python TuneKit/cli.py enrich <file.jsonl> [options]
```

**Options:**
- `--top_n <N>`: Keeps only the N best examples according to their quality score.
- `--no-balance`: Disables automatic class balancing (useful when the task is not classification).
- `-o <file>`: Specifies the output file (default: `<name>_enriched.jsonl`).

**Example:**
```bash
# Enrich and keep only the 100 best examples
python TuneKit/cli.py enrich data.jsonl --top_n 100
```
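
To make the scoring idea concrete, here is a toy heuristic in the spirit of the metrics described above (lexical diversity, length, dialogue balance). It is not TuneKit's actual scoring logic, and it assumes records in the standard `{"messages": [...]}` conversation format:

```python
# Toy quality heuristic for a chat-format record; NOT TuneKit's real scoring.
def quality_score(example: dict) -> float:
    """Score a {"messages": [{"role": ..., "content": ...}, ...]} record in [0, 100]."""
    messages = example.get("messages", [])
    text = " ".join(m.get("content", "") for m in messages)
    tokens = text.split()
    if not tokens:
        return 0.0

    lexical_diversity = len(set(tokens)) / len(tokens)   # unique / total tokens
    length_signal = min(len(tokens) / 200, 1.0)          # saturates around 200 tokens
    n_user = sum(m.get("role") == "user" for m in messages)
    n_assistant = sum(m.get("role") == "assistant" for m in messages)
    balance = min(n_user, n_assistant) / max(n_user, n_assistant, 1)

    return 100 * (0.4 * lexical_diversity + 0.3 * length_signal + 0.3 * balance)
```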

## Configuration Files

Configuration is stored in `TuneKit/tunekit/data/models.json`. This file is generated and managed automatically by the CLI, but it can be edited manually if needed.
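
For reference, one entry presumably looks something like the sketch below. The field names are hypothetical, inferred from what the `add` wizard collects; check the real file before relying on them.

```python
# Hypothetical shape of one models.json entry; real key names may differ.
example_entry = {
    "gemma-3-270m-it": {                       # unique model key (used by `remove`)
        "id": "google/gemma-3-270m-it",        # Hugging Face model ID
        "name": "Gemma 3 270M IT",             # friendly display name
        "size": "270M",
        "recommended_gpu": "T4",
        "context_window": 32768,               # placeholder value
        "scores": {"classification": 70, "summarization": 60},  # 0-100 per task
        "reasoning": ["Tiny footprint", "Fast to fine-tune"],    # shown to the user
    }
}
```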

## Development

If you add new fields to the recommendation logic in `model_rec.py`, make sure to update the CLI to support them.
66 changes: 55 additions & 11 deletions README.md
@@ -116,21 +116,65 @@ TuneKit uses the standard conversation format:

---

## Run Locally
## Data Enrichment

```bash
# Clone the repo
git clone https://github.com/riyanshibohra/TuneKit.git
cd TuneKit
TuneKit now includes tools to automatically improve your dataset quality before training:

# Install dependencies
pip install -r requirements.txt
- **Quality Scoring**: Evaluates every conversation on complexity, lexical diversity, and dialogue balance.
- **Smart Prioritization**: Automatically ranks examples and filters out low-quality ones.
- **Class Balancing**: Detects underrepresented classes in classification datasets and automatically balances them.

# Start the server
uvicorn api.main:app --reload
```
---

Open [http://localhost:8000](http://localhost:8000) in your browser.
## Development Setup

### Prerequisites
- Python 3.10+
- Git

### Installation

1. **Clone the repository**
```bash
git clone https://github.com/riyanshibohra/TuneKit.git
cd TuneKit
```

2. **Create a virtual environment (Recommended)**
```bash
# Windows
python -m venv venv
.\venv\Scripts\activate

# macOS/Linux
python3 -m venv venv
source venv/bin/activate
```

3. **Install dependencies**
```bash
pip install -r requirements.txt
```

4. **Configuration**
Copy `.env.example` to `.env` and configure your tokens:
```bash
# Windows (PowerShell)
Copy-Item .env.example .env

# macOS/Linux
cp .env.example .env
```

> **Note:** A `GITHUB_TOKEN` is required to automatically create private Gists for the Colab notebooks.
> [Generate a token here](https://github.com/settings/tokens) (Scope: `gist`).

5. **Start the server**
```bash
uvicorn api.main:app --reload
```

The app will be available at [http://localhost:8000](http://localhost:8000).
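
Once the server is running, the new `/enrich` endpoint added in this PR can be exercised with a short script. The sketch below assumes `requests` is installed and that a `session_id` was already obtained through the app's normal upload flow (not shown in this diff):

```python
# Sketch of calling the /enrich endpoint introduced in this PR.
import requests

session_id = "YOUR_SESSION_ID"  # placeholder: obtained from the upload flow

resp = requests.post(
    "http://localhost:8000/enrich",
    json={"session_id": session_id, "top_n": 100, "balance": True},
    timeout=120,
)
resp.raise_for_status()
data = resp.json()
print(data["status"], data["quality_score"])
print(data["stats"])
```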

---

79 changes: 79 additions & 0 deletions api/main.py
@@ -31,6 +31,7 @@
analyze_dataset,
generate_package,
recommend_model,
enrich_dataset,
)
from tunekit.training import (
generate_training_notebook,
@@ -188,6 +189,12 @@ class GenerateRequest(BaseModel):
session_id: str


class EnrichRequest(BaseModel):
session_id: str
top_n: Optional[int] = None
balance: bool = True


class SessionResponse(BaseModel):
session_id: str
status: str
@@ -748,6 +755,78 @@ async def generate(request: GenerateRequest):
)


@app.post("/enrich")
async def enrich(request: EnrichRequest):
"""Enrich the dataset with metrics, prioritization and balancing."""
session_id = request.session_id

if session_id not in sessions:
raise HTTPException(status_code=404, detail="Session not found")

session = sessions[session_id]
state = session.get("state")

if not state:
raise HTTPException(status_code=400, detail="No data found")

# Reload raw_data if needed
if not reload_raw_data_if_needed(session):
raise HTTPException(status_code=400, detail="Could not load data")

# Set config
state["enrich_config"] = {
"top_n": request.top_n,
"balance": request.balance
}

result = enrich_dataset(state)

# Update state with enriched data
# IMPORTANT: We replace raw_data so downstream tools use the improved version
if result.get("enriched_data"):
state["raw_data"] = result["enriched_data"]
state["num_rows"] = len(result["enriched_data"])

    # Persist the enriched data to a new file (instead of overwriting the original
    # upload) so it can be re-used by later steps and downloaded.
original_path = session["file_path"]
name, ext = os.path.splitext(original_path)
enriched_path = f"{name}_enriched{ext}"

try:
with open(enriched_path, 'w', encoding='utf-8') as f:
for entry in result["enriched_data"]:
f.write(json.dumps(entry) + '\n')

# Update session to point to new file
session["file_path"] = enriched_path
state["file_path"] = enriched_path

except Exception as e:
print(f"Warning: Failed to save enriched file: {e}")

state.update(result)

# Re-run validation/analysis on new data
val_res = validate_quality(state)
state.update(val_res)

# Clear raw_data to save memory
state["raw_data"] = None
sessions[session_id]["state"] = state

return {
"session_id": session_id,
"status": "success",
"stats": result.get("enrichment_stats", {}),
"quality_score": state.get("quality_score"),
"quality_issues": state.get("quality_issues")
}


@app.get("/download/{session_id}")
async def download(session_id: str):
"""Download the generated training package as a ZIP file."""