diff --git a/.github/workflows/data-integrity.yml b/.github/workflows/data-integrity.yml new file mode 100644 index 0000000..9e9f51f --- /dev/null +++ b/.github/workflows/data-integrity.yml @@ -0,0 +1,31 @@ +name: Data Integrity Validation + +on: + push: + branches: [dev] + pull_request: + branches: [dev] + +jobs: + validate-data-integrity: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install optional dependencies + run: | + pip install jsonschema + + - name: Run data integrity validator on valid example data + run: | + python -m data_integrity.validator \ + --input data_integrity/examples/valid_data.json \ + --schema data_integrity/examples/sample_schema.json \ + --json diff --git a/.gitignore b/.gitignore index cc70900..b3c3516 100644 --- a/.gitignore +++ b/.gitignore @@ -180,3 +180,11 @@ cython_debug/ # PyPI configuration file .pypirc + +# Virtual environments +venv310/ +venv/ + +# Federated Learning Models +dreamsApp/app/models/production_chime_model/ +dreamsApp/app/models/temp_training_artifact/ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3fcfddc --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ +"python.terminal.useEnvFile": true, +"python-envs.defaultEnvManager": "ms-python.python:system", +"python-envs.pythonProjects": [] +} diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..f1382aa --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,357 @@ +# DREAMS Architecture Overview + +**Note**: This document provides a high-level system architecture overview. For detailed implementation of the DREAMS application components, see `dreamsApp/docs/architecture.md`. + +## System Architecture + +``` +Dreams + +┌─────────────────────────────────────────────────────────────────┐ +│ USER INTERFACE │ +│ (Beehive Frontend - Photo Upload with Captions) │ +└────────────────────────────┬────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ DREAMS API LAYER │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ Upload Route: /upload │ │ +│ │ - Receives: Image + Caption + Timestamp │ │ +│ │ - Returns: Post ID + Sentiment + Location │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└────────────────────────────┬────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ PROCESSING PIPELINE │ +│ │ +│ ┌─────────────────┐ ┌──────────────────┐ ┌────────────────┐ │ +│ │ Image Analysis │ │ Text Analysis │ │ Location │ │ +│ │ │ │ │ │ Analysis │ │ +│ │ • BLIP Caption │ │ • Sentiment │ │ • GPS Extract │ │ +│ │ • DeepFace │ │ • Keywords │ │ • Proximity │ │ +│ │ Emotion │ │ • Clustering │ │ • Patterns │ │ +│ └────────┬────────┘ └────────┬─────────┘ └────────┬───────┘ │ +│ │ │ │ │ +│ └────────────────────┴─────────────────────┘ │ +│ │ │ +└────────────────────────────────┼────────────────────────────────┘ + │ + ▼ +┌────────────────────────────────────────────────────────────────┐ +│ LOCATION-PROXIMITY MODULE │ +│ (Your Contribution) │ +│ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ 1. 
Location Extractor │ │ +│ │ Input: Image file │ │ +│ │ Output: {lat, lon, timestamp} │ │ +│ │ Tech: Pillow EXIF parsing │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ 2. Place Enrichment (Optional - Future) │ │ +│ │ Input: {lat, lon} │ │ +│ │ Output: {place_type, name, language, cultural_tags} │ │ +│ │ Tech: Google Places API / Nominatim │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ 3. Proximity Calculator │ │ +│ │ Input: Place1, Place2 │ │ +│ │ Output: Proximity score (0-1) │ │ +│ │ │ │ +│ │ Components: │ │ +│ │ • Geographic: Haversine distance │ │ +│ │ • Categorical: Place type matching │ │ +│ │ • Linguistic: Language similarity │ │ +│ │ • Cultural: Tag similarity (Jaccard) │ │ +│ │ • Composite: Weighted combination │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ 4. Emotion-Location Mapper │ │ +│ │ Input: location_id, sentiment, score, metadata │ │ +│ │ Storage: In-memory dictionary │ │ +│ │ Note: Temporary storage for prototype; should be │ │ +│ │ moved to persistent database (MongoDB) for │ │ +│ │ production scalability and data persistence. │ │ +│ │ │ │ +│ │ Functions: │ │ +│ │ • add_entry() │ │ +│ │ • get_location_sentiment_profile() │ │ +│ │ • find_emotional_hotspots() │ │ +│ │ • compare_place_types() │ │ +│ │ • temporal_emotion_trend() │ │ +│ └──────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ 5. Semantic Clustering │ │ +│ │ Input: Proximity matrix + Emotion profiles │ │ +│ │ Algorithm: DBSCAN │ │ +│ │ Output: Cluster labels + Emotion statistics │ │ +│ │ │ │ +│ │ Parameters: │ │ +│ │ • eps: 0.3-0.5 (neighborhood distance) │ │ +│ │ • min_samples: 2-3 (core point threshold) │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└────────────────────────────┬───────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ DATA STORAGE LAYER │ +│ │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ MongoDB Collections: │ │ +│ │ │ │ +│ │ • posts: { │ │ +│ │ user_id, caption, timestamp, image_path, │ │ +│ │ sentiment: {label, score}, │ │ +│ │ location: {lat, lon, place_type, language} │ │ +│ │ } │ │ +│ │ │ │ +│ │ • keywords: { │ │ +│ │ user_id, │ │ +│ │ positive_keywords: [{keyword, embedding, timestamp}],│ │ +│ │ negative_keywords: [{keyword, embedding, timestamp}] │ │ +│ │ } │ │ +│ │ │ │ +│ │ • location_analysis: { │ │ +│ │ user_id, │ │ +│ │ locations: [{id, lat, lon, visits, emotions}], │ │ +│ │ clusters: [{id, members, emotion_dist}], │ │ +│ │ hotspots: [{location_id, sentiment, confidence}] │ │ +│ │ } │ │ +│ │ │ │ +│ │ • thematic_analysis: { │ │ +│ │ user_id, │ │ +│ │ data: {positive: [{theme: string, │ │ +│ │ interpretation: string}], │ │ +│ │ negative: [{theme: string, │ │ +│ │ interpretation: string}]} │ │ +│ │ } │ │ +│ └───────────────────────────────────────────────────────────┘ │ +└────────────────────────────┬────────────────────────────────────┘ + │ + ▼ +┌────────────────────────────────────────────────────────────────┐ +│ VISUALIZATION LAYER │ +│ │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ Dashboard Routes: │ │ +│ │ │ │ +│ │ • /dashboard/ │ │ +│ 
│ - Sentiment timeline │ │ +│ │ - Word clouds │ │ +│ │ - Thematic cards │ │ +│ │ │ │ +│ │ • /location_analysis/ (NEW) │ │ +│ │ - Emotional hotspots map │ │ +│ │ - Place type comparison chart │ │ +│ │ - Location clusters visualization │ │ +│ │ - Temporal-spatial patterns │ │ +│ │ │ │ +│ │ • /clusters/ │ │ +│ │ - Cluster details │ │ +│ │ - Member locations │ │ +│ │ - Emotion distributions │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└────────────────────────────────────────────────────────────────┘ +``` + +## Location-Proximity Pipeline + +```mermaid +flowchart LR + subgraph "Input Processing" + IMG[Photo Upload] + EXIF[EXIF Extraction] + GPS[GPS Coordinates] + end + + subgraph "Multi-Dimensional Analysis" + GEO[Geographic Distance] + CAT[Categorical Similarity] + LING[Linguistic Context] + CULT[Cultural Attributes] + end + + subgraph "Proximity Calculation" + WEIGHT[Weighted Composite] + SCORE[Proximity Score] + end + + subgraph "Emotion Integration" + SENT[Sentiment Analysis] + MAP[Location-Emotion Mapping] + PATTERN[Pattern Detection] + end + + IMG --> EXIF + EXIF --> GPS + GPS --> GEO + GPS --> CAT + GPS --> LING + GPS --> CULT + + GEO --> WEIGHT + CAT --> WEIGHT + LING --> WEIGHT + CULT --> WEIGHT + + WEIGHT --> SCORE + SCORE --> MAP + SENT --> MAP + MAP --> PATTERN +``` + +## Semantic Clustering Workflow + +```mermaid +graph TD + subgraph "Data Preparation" + LOCATIONS[Location Dataset] + FEATURES[Feature Extraction] + MATRIX[Proximity Matrix] + end + + subgraph "Clustering Process" + DBSCAN[DBSCAN Algorithm] + PARAMS[Parameter Tuning] + CLUSTERS[Generated Clusters] + end + + subgraph "Analysis & Validation" + METRICS[Quality Metrics] + EMOTION[Emotion Analysis] + INSIGHTS[Pattern Insights] + end + + subgraph "Output" + GROUPS[Semantic Groups] + HOTSPOTS[Emotional Hotspots] + TRENDS[Temporal Trends] + end + + LOCATIONS --> FEATURES + FEATURES --> MATRIX + MATRIX --> DBSCAN + DBSCAN --> PARAMS + PARAMS --> CLUSTERS + + CLUSTERS --> METRICS + CLUSTERS --> EMOTION + EMOTION --> INSIGHTS + + INSIGHTS --> GROUPS + INSIGHTS --> HOTSPOTS + INSIGHTS --> TRENDS +``` + +## Component Integration Points + +### 1. Photo Ingestion Integration + +**`dreamsApp/app/ingestion/routes.py`** +- Extract EXIF data from uploaded photos +- Call `location_proximity.location_extractor` +- Store location data in post schema + +### 2. Dashboard Integration + +**`dreamsApp/app/dashboard/main.py`** +- Location analysis endpoints +- Proximity visualization routes +- Emotion-location pattern displays + +### 3. API Extensions + +**`/api/v1/locations/proximity`** +- Calculate proximity between locations +- Return multi-dimensional scores +- Support batch processing + +## Data Flow Architecture + +### Input Data Sources +- **Photo EXIF**: GPS coordinates, timestamp, camera metadata +- **User Captions**: Text for sentiment analysis +- **External APIs**: Place type information, cultural context + +### Processing Layers +1. **Extraction Layer**: GPS, metadata, place information +2. **Analysis Layer**: Multi-dimensional proximity calculation +3. **Mapping Layer**: Emotion-location pattern detection +4. 
**Clustering Layer**: Semantic grouping and insights + +### Output Interfaces +- **REST API**: Programmatic access to proximity data +- **Dashboard**: Visual analytics and pattern exploration +- **Database**: Persistent storage of computed relationships + +## Scalability Considerations + +### Performance Optimization +- **Caching**: Proximity calculations for frequently accessed locations +- **Batch Processing**: Bulk analysis for large photo collections +- **Indexing**: Spatial indexing for geographic queries + +### Extensibility Points +- **Plugin Architecture**: Additional proximity dimensions +- **Model Integration**: Custom ML models for place recognition +- **API Versioning**: Backward compatibility for evolving features + +## Security & Privacy + +### Data Protection +- **Location Privacy**: Configurable precision levels +- **User Consent**: Explicit permission for location analysis +- **Data Retention**: Configurable storage policies + +### Access Control +- **Authentication**: User-specific location data +- **Authorization**: Role-based access to analytics +- **Audit Logging**: Track access to sensitive location information + +## Technology Stack + +### Core Technologies +- **Backend**: Python Flask, SQLAlchemy +- **Analysis**: NumPy, scikit-learn, Pandas +- **Visualization**: Matplotlib, Folium (future) +- **Database**: MongoDB (recommended), SQLite (development) + +### External Dependencies +- **Image Processing**: Pillow (EXIF extraction) +- **ML Models**: Hugging Face Transformers +- **Geospatial**: Haversine distance calculations +- **Clustering**: DBSCAN implementation + +## Development Phases + +### Phase 1: Foundation (Current) +- Multi-dimensional proximity calculation +- Basic emotion-location mapping +- DBSCAN clustering implementation +- Comprehensive testing suite + +### Phase 2: Integration +- DREAMS application integration +- REST API development +- Dashboard visualization +- Performance optimization + +### Phase 3: Enhancement +- Advanced ML models +- Real-time processing +- Interactive visualizations +- Cross-user analysis capabilities + +--- + +**Note**: This architecture document provides a high-level overview without implementation details. Specific implementation decisions will be made during the development phase based on performance requirements and integration constraints. \ No newline at end of file diff --git a/LOCATION_PROXIMITY_SUMMARY.md b/LOCATION_PROXIMITY_SUMMARY.md new file mode 100644 index 0000000..0c387f6 --- /dev/null +++ b/LOCATION_PROXIMITY_SUMMARY.md @@ -0,0 +1,260 @@ +# Location-Proximity Analysis Extension + +## Overview + +A new module for DREAMS that analyzes **multi-dimensional location proximity** to understand how semantically similar places influence emotional patterns in recovery journeys. + +**Key Innovation**: Goes beyond GPS coordinates to consider categorical, linguistic, and cultural dimensions of location similarity. 
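+
+As a rough illustration of that idea, the sketch below blends the four dimensions into one weighted score. The weights mirror those used in the test plan (α=0.3, β=0.4, γ=0.15, δ=0.15); the 50 km distance-decay scale and the function names are illustrative assumptions, not the module's actual API.
+
+```python
+import math
+
+def haversine_km(lat1, lon1, lat2, lon2):
+    """Great-circle distance in kilometres between two (lat, lon) points."""
+    r = 6371.0
+    p1, p2 = math.radians(lat1), math.radians(lat2)
+    dp, dl = math.radians(lat2 - lat1), math.radians(lon2 - lon1)
+    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
+    return 2 * r * math.asin(math.sqrt(a))
+
+def composite_proximity(p1, p2, weights=(0.3, 0.4, 0.15, 0.15)):
+    """Weighted blend of geographic, categorical, linguistic, and cultural similarity."""
+    alpha, beta, gamma, delta = weights
+    # Geographic: decays with distance (50 km half-scale chosen only for illustration)
+    geo = 1.0 / (1.0 + haversine_km(p1["lat"], p1["lon"], p2["lat"], p2["lon"]) / 50.0)
+    # Categorical: exact place-type match
+    cat = 1.0 if p1.get("type") and p1.get("type") == p2.get("type") else 0.0
+    # Linguistic: shared language context
+    ling = 1.0 if p1.get("language") and p1.get("language") == p2.get("language") else 0.0
+    # Cultural: Jaccard similarity over cultural tags
+    t1, t2 = set(p1.get("cultural_tags", [])), set(p2.get("cultural_tags", []))
+    cult = len(t1 & t2) / len(t1 | t2) if (t1 | t2) else 0.0
+    return alpha * geo + beta * cat + gamma * ling + delta * cult
+```
+
+Because the categorical and cultural terms carry most of the weight, two churches in different cities still score highly, which is the behaviour the demo below highlights.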
+ +--- + +## Module Location + +``` +DREAMS/location_proximity/ +├── __init__.py +├── location_extractor.py # Extract GPS from images +├── proximity_calculator.py # Multi-dimensional proximity +├── emotion_location_mapper.py # Emotion-location patterns +├── semantic_clustering.py # Cluster similar places +├── demo.py # Demonstration script +├── test_proximity.py # Test suite +├── requirements.txt # Dependencies +├── README.md # Module documentation +└── RESEARCH.md # Research foundation +``` + +--- + +## Quick Demo + +```bash +cd location_proximity +pip install -r requirements.txt +python demo.py +``` + +**Output**: +``` +DEMO 1: Multi-Dimensional Proximity Calculation +================================================================ +St. Mary's Church ↔ Holy Trinity Church : 0.850 +Alaska Native Medical Center ↔ Providence Hospital : 0.725 + +✓ Notice: Two churches have high proximity despite different locations +✓ Notice: Two hospitals cluster together semantically +``` + +--- + +## Key Features + +### 1. Multi-Dimensional Proximity +Calculates location similarity using: +- **Geographic**: Physical distance (Haversine) +- **Categorical**: Place type (church ↔ church) +- **Linguistic**: Language context +- **Cultural**: Shared cultural tags + +### 2. Emotion-Location Mapping +- Track emotional patterns at specific locations +- Identify "emotional hotspots" (places with consistent emotions) +- Compare emotions across place categories +- Temporal emotion trends at locations + +### 3. Semantic Clustering +- Group semantically similar places +- Analyze emotional patterns within clusters +- Discover cross-location patterns +- DBSCAN-based clustering (no predefined cluster count) + +--- + +## Research Questions Addressed + +1. **Do semantically similar places evoke similar emotions?** + - Compare two different churches vs. church and hospital + +2. **Is categorical proximity more predictive than geographic proximity?** + - Correlation analysis between proximity dimensions and emotions + +3. **Do certain place types consistently associate with specific emotions?** + - Statistical analysis across place categories + +4. **How do place-emotion associations evolve during recovery?** + - Temporal analysis of location patterns + +--- + +## Example Use Cases + +### Use Case 1: Categorical Analysis +**Question**: Do all churches evoke positive emotions, or just specific ones? + +```python +from location_proximity.emotion_location_mapper import EmotionLocationMapper + +mapper = EmotionLocationMapper() +# Add data... + +# Compare specific church vs. all churches +church_a_profile = mapper.get_location_sentiment_profile("church_a") +all_churches = mapper.compare_place_types()["church"] +``` + +### Use Case 2: Cross-Location Patterns +**Question**: Do Portuguese restaurants evoke similar emotions despite different locations? + +```python +from location_proximity.semantic_clustering import find_similar_place_patterns + +patterns = find_similar_place_patterns( + places=portuguese_restaurants, + emotion_mapper=mapper, + proximity_threshold=0.6 +) +``` + +### Use Case 3: Recovery Journey Mapping +**Question**: How does a person's emotional relationship with healthcare facilities change over time? + +```python +hospitals = [p for p in places if p['type'] == 'hospital'] +for hospital in hospitals: + trend = mapper.temporal_emotion_trend(hospital['id']) + # Analyze trend... 
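+    # For example (hypothetical return shape, not the confirmed API): the trend
+    # could be summarised as 'improving', 'stable', or 'declining' from the slope
+    # of sentiment over time, as described in the test plan.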
+``` + +--- + +## Integration with DREAMS + +### Extend Post Schema +```python +# Add to dreamsApp/app/ingestion/routes.py +from location_proximity.location_extractor import extract_location_from_image + +location = extract_location_from_image(image_path) +if location: + post_doc['location'] = { + 'lat': location['lat'], + 'lon': location['lon'] + } +``` + +### Add Dashboard Route +```python +# Add to dreamsApp/app/dashboard/main.py +@bp.route('/location_analysis/') +def location_analysis(user_id): + # Use EmotionLocationMapper to analyze patterns + # Render visualization +``` + +--- + +## Metrics & Validation + +### Clustering Quality +- Silhouette score +- Davies-Bouldin index + +### Emotion Prediction +- Accuracy of predicting emotion from place type +- Precision/Recall for hotspot identification + +### Statistical Significance +- Chi-square tests for sentiment distributions +- ANOVA across place types +- Effect size calculations + +--- + +## Research Contribution + +**Novel Contribution**: First formalization of multi-dimensional location proximity for emotion analysis in recovery journeys. + +**Potential Publications**: +1. "Beyond GPS: Multi-Dimensional Location Proximity in Emotional Recovery Analysis" +2. "Semantic Place Similarity and Emotional Patterns in Digitized Memories" +3. "Affective Geography of Recovery: A Computational Approach" + +--- + +## Dependencies + +``` +Pillow>=10.0.0 # Image EXIF extraction +numpy>=1.24.0 # Numerical computations +scikit-learn>=1.3.0 # Clustering algorithms +``` + +--- + +## Testing + +```bash +cd location_proximity +pytest test_proximity.py -v +``` + +**Test Coverage**: +- Geographic distance calculation +- Proximity metrics (all dimensions) +- Emotion-location mapping +- Clustering functionality +- Edge cases and error handling + +--- + +## Future Enhancements + +### Phase 1 (Current) +- [x] Multi-dimensional proximity calculation +- [x] Emotion-location mapping +- [x] Semantic clustering +- [x] Demo and tests + +### Phase 2 (Next) +- [ ] Google Places API integration +- [ ] Automated place type detection +- [ ] Interactive map visualization (Folium) +- [ ] Real-time clustering + +### Phase 3 (Future) +- [ ] Image-based place recognition (CNN) +- [ ] Temporal-spatial modeling +- [ ] Cross-user analysis +- [ ] Causal inference methods + +--- + +## Contributing + +This module was developed as part of GSoC 2026 project: +> "Understanding proximity in locations and emotions through digitized memories" + +**Mentors**: Jihye Kwon, Pradeeban Kathiravelu +**Institution**: University of Alaska Fairbanks + +Contributions welcome! See [location_proximity/README.md](location_proximity/README.md) for details. + +--- + +## Contact + +- **Project**: https://github.com/KathiraveluLab/DREAMS +- **Discussions**: https://github.com/KathiraveluLab/DREAMS/discussions +- **Mentors**: jkwon2@alaska.edu, pkathiravelu@alaska.edu + +--- + +## License + +Same as DREAMS project (see [LICENSE](LICENSE)) + +--- + +**Status**: ✅ Ready for integration and testing +**Version**: 0.1.0 +**Last Updated**: 2024 diff --git a/README.md b/README.md index bab37ec..f7c7dca 100644 --- a/README.md +++ b/README.md @@ -4,33 +4,57 @@ Digitization for Recovery: Exploring Arts with Mining for Societal well-being. DREAMS is an extension of the Beehive project, focused on exploring time and ordering across photo memories to better understand personal recovery journeys. 
The goal is to build tools that help track and analyze visual narratives over time using data mining and intelligent processing. -## 🔨 Current Progress +## Current Progress -- ✅ Set up core infrastructure using Flask and Hugging Face models. -- ✅ Implemented a basic **Caption Sentiment Analysis API** to classify emotional tone in user-submitted captions. -- 🔄 Integrating this API into Beehive to capture sentiment when users upload photos. -- 🔬 Exploring time-based data structuring and narrative analysis features. +- Set up core infrastructure using Flask and Hugging Face models. +- Implemented a basic **Caption Sentiment Analysis API** to classify emotional tone in user-submitted captions. +- Integrating this API into Beehive to capture sentiment when users upload photos. +- Exploring time-based data structuring and narrative analysis features. -### 📁 [View the API Module](./dreamsApp/README.md) +### [View the API Module](./dreamsApp/README.md) -## 📦 Repositories +## Repositories - Beehive: [github.com/KathiraveluLab/beehive](https://github.com/KathiraveluLab/Beehive) - DREAMS: [github.com/KathiraveluLab/DREAMS](https://github.com/KathiraveluLab/DREAMS) -## 📁 Repository Structure +## Repository Structure + +```text +DREAMS/ +├── dreamsApp/ # Main application package +│ ├── app/ # Flask app package (app factory + blueprints) +│ │ ├── __init__.py # create_app() factory +│ │ ├── config.py # App configuration +│ │ ├── models.py # Database models +│ │ ├── auth.py # Authentication routes +│ │ │ +│ │ ├── ingestion/ # Image ingestion & processing +│ │ │ ├── __init__.py +│ │ │ └── routes.py +│ │ │ +│ │ ├── dashboard/ # Dashboard & analytics views +│ │ │ ├── __init__.py +│ │ │ └── main.py +│ │ │ +│ │ └── utils/ # Core ML / NLP utilities +│ │ ├── sentiment.py # Caption sentiment analysis +│ │ ├── keywords.py # Keyword extraction +│ │ ├── clustering.py # Keyword clustering (HDBSCAN) +│ │ └── llms.py # LLM (Gemini) integration +│ │ +│ └── docs/ # Project documentation +│ +├── data_integrity/ # Data validation utilities +├── location_proximity/ # Location-based analysis (future) +├── dream-integration/ # Integration & experimental code +├── tests/ # Unit and integration tests +│ +├── requirements.txt # Python dependencies +├── pytest.ini # Pytest configuration +└── README.md # Project documentation - DREAMS/ - ├── dreamsApp/ - │ ├── __init__.py # App factory - │ ├── captionSentiments.py # API logic and model loading - | ├── README.md - ├──tests/ - | ├──test_sentiment.py - ├──pytest.ini - ├──README.md - ├──requirements.txt ## Installation and Setup @@ -50,7 +74,7 @@ pip install -r requirements.txt pytest # 5. Start the Flask server in debug mode -flask --app dreamsApp run --debug +flask --app "dreamsApp.app:create_app()" run --debug ``` More coming soon! diff --git a/data_integrity/README.md b/data_integrity/README.md new file mode 100644 index 0000000..a6e967a --- /dev/null +++ b/data_integrity/README.md @@ -0,0 +1,214 @@ +# DREAMS Data Integrity Layer (Phase-1) + +A lightweight, optional validation utility for multimodal time-series data in DREAMS. + +## Purpose + +This module validates data **before analysis** to catch common issues: +- Malformed JSON structure +- Missing media files +- Future timestamps +- Out-of-order events + +**Important**: This layer **only reports issues** — it does NOT modify or "fix" data. 
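+
+When issues are found, each one is listed with a severity, a category, and the location in the data where the problem sits. Abridged, the human-readable report for a single future-timestamp error looks roughly like this (exact wording may differ):
+
+```
+============================================================
+VALIDATION REPORT
+============================================================
+
+✗ ERROR (temporal) [samples[4].timestamp]
+   Future timestamp detected: 2025-12-15T18:00:00+00:00
+
+------------------------------------------------------------
+Summary: 1 errors, 0 warnings, 0 info
+============================================================
+```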
+ +## Quick Start + +### Basic Usage + +```bash +# From DREAMS root directory +python -m data_integrity.validator \ + --input data/person-01/data.json \ + --base-dir data/ +``` + +### With Schema Validation + +```bash +python -m data_integrity.validator \ + --input data/person-01/data.json \ + --schema schemas/sample_schema.json \ + --base-dir data/ +``` + +### Strict Temporal Ordering + +```bash +python -m data_integrity.validator \ + --input data/person-01/data.json \ + --base-dir data/ \ + --strict-temporal +``` + +## Try the Examples + +```bash +# Run on example data (will show intentional errors) +python -m data_integrity.validator \ + --input data_integrity/examples/sample_data.json \ + --schema data_integrity/examples/sample_schema.json \ + --base-dir . +``` + +The example data includes intentional errors to demonstrate validation: +- Out-of-order timestamps +- Future timestamps +- Missing media files + +## Architecture + +``` +data_integrity/ +├── __init__.py # Package initialization +├── validator.py # CLI entry point and orchestration +├── schema_validator.py # JSON Schema validation +├── path_validator.py # Media file existence checks +├── temporal_validator.py # Timestamp ordering validation +├── reporter.py # Unified error formatting +└── examples/ + ├── sample_data.json # Example DREAMS data + └── sample_schema.json # Example JSON schema +``` + +### Modular Design + +Each validator is independent and returns a list of `ValidationIssue` objects. This makes it easy to: +- Add new validators (e.g., geo-validators, embedding validators) +- Run validators selectively +- Customize error handling + +## Validation Checks + +### Schema Validation +- Validates JSON structure against optional schema +- Falls back gracefully if `jsonschema` is not installed +- Schema is **optional** — not enforced + +### Path Validation +- Checks that all referenced media files exist +- Supports common field names: `image`, `audio`, `video`, `media`, etc. +- Uses pathlib for cross-platform compatibility +- Resolves paths relative to `--base-dir` + +### Temporal Validation +- Detects future timestamps +- Detects out-of-order events in sequences +- Optional strict mode (no duplicate timestamps) +- Handles ISO 8601 strings and Unix timestamps + +## Exit Codes + +- `0` - All validations passed (or only warnings) +- `1` - Validation errors found + +## Options + +``` +--input, -i Path to input JSON data file (required) +--schema, -s Path to JSON schema file (optional) +--base-dir, -b Base directory for media paths (default: current directory) +--strict-temporal Require strictly increasing timestamps +--quiet, -q Only show errors (suppress warnings/info) +``` + +## Design Philosophy + +### Phase-1 Foundation +This is intentionally minimal and focused. Future phases may add: +- Geo-location validation +- Embedding quality checks +- Multi-person proximity analysis +- Statistical anomaly detection + +### Non-Invasive +- **Optional**: Contributors can ignore this layer +- **Read-only**: Never modifies data +- **Informative**: Clear error messages with location context + +### Extensible +Adding a new validator: +1. Create a module (e.g., `geo_validator.py`) +2. Implement a function returning `List[ValidationIssue]` +3. 
Call it from `validator.py` + +## Dependencies + +**Required:** +- Python 3.7+ +- Standard library only (pathlib, json, datetime) + +**Optional:** +- `jsonschema` - for schema validation (graceful fallback if missing) + +## Testing + +The example data includes intentional issues for testing: + +```bash +# Should report 4 errors: +# - Out-of-order timestamp (s04 < s03) +# - Future timestamp (s05) +# - Missing audio file (s04) +# - Missing image file (s05) + +python -m data_integrity.validator \ + --input data_integrity/examples/sample_data.json \ + --base-dir . +``` + +## Integration Examples + +### Pre-Analysis Check + +```python +from data_integrity.validator import main +import sys + +# Run validation before analysis +result = main() +if result != 0: + print("Data validation failed. Please fix issues before analysis.") + sys.exit(1) + +# Proceed with analysis... +``` + +### Programmatic Use + +```python +import json +from pathlib import Path +from data_integrity.reporter import ValidationReport +from data_integrity.schema_validator import validate_schema +from data_integrity.path_validator import validate_paths +from data_integrity.temporal_validator import validate_temporal + +# Load data +with open("data.json") as f: + data = json.load(f) + +# Run validators +report = ValidationReport() +report.extend(validate_schema(data, Path("schema.json"))) +report.extend(validate_paths(data, Path("."))) +report.extend(validate_temporal(data)) + +# Check results +if report.has_errors(): + print(report.format_summary()) +``` + +## Contributing + +When extending this layer: +- Keep validators independent +- Return `List[ValidationIssue]` +- Use clear, actionable error messages +- Include location context +- Avoid domain-specific assumptions +- Document the validation logic + +## License + +Same as DREAMS project. diff --git a/data_integrity/__init__.py b/data_integrity/__init__.py new file mode 100644 index 0000000..a899b28 --- /dev/null +++ b/data_integrity/__init__.py @@ -0,0 +1,14 @@ +""" +DREAMS Data Integrity Layer (Phase-1) + +A lightweight, optional validation utility for multimodal time-series data. +Validates structure, media paths, and temporal consistency WITHOUT modifying data. + +Usage: + python -m data_integrity.validator --input data.json --schema schema.json --base-dir ./ + +Extensibility: + Add new validators by implementing the validation pattern in new modules. 
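+
+    A new validator module (a hypothetical geo_validator, shown here with
+    illustrative field names) would follow the same pattern:
+
+        from typing import List
+        from .reporter import ValidationIssue, Severity
+
+        def validate_geo(data: dict) -> List[ValidationIssue]:
+            issues = []
+            for idx, sample in enumerate(data.get("samples", [])):
+                lat = sample.get("lat")
+                if lat is not None and not -90 <= lat <= 90:
+                    issues.append(ValidationIssue(
+                        severity=Severity.ERROR,
+                        category="geo",
+                        message=f"Latitude out of range: {lat}",
+                        location=f"samples[{idx}]",
+                    ))
+            return issues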
+""" + +__version__ = "0.1.0" diff --git a/data_integrity/__main__.py b/data_integrity/__main__.py new file mode 100644 index 0000000..d6c8b0d --- /dev/null +++ b/data_integrity/__main__.py @@ -0,0 +1,7 @@ +"""Allow running as: python -m data_integrity""" + +from .validator import main +import sys + +if __name__ == "__main__": + sys.exit(main()) diff --git a/data_integrity/examples/invalid_schema_data.json b/data_integrity/examples/invalid_schema_data.json new file mode 100644 index 0000000..7e77469 --- /dev/null +++ b/data_integrity/examples/invalid_schema_data.json @@ -0,0 +1,8 @@ +{ + "samples": [ + { + "sample_id": 123, + "description": "Missing required timestamp field" + } + ] +} diff --git a/data_integrity/examples/millisecond_timestamps.json b/data_integrity/examples/millisecond_timestamps.json new file mode 100644 index 0000000..bfe7a1b --- /dev/null +++ b/data_integrity/examples/millisecond_timestamps.json @@ -0,0 +1,29 @@ +{ + "person_id": "p04", + "metadata": { + "collection_date": "2024-12-01", + "timestamp_format": "milliseconds" + }, + "samples": [ + { + "sample_id": "s01", + "timestamp": 1701417600000, + "description": "Unix timestamp in milliseconds (2023-12-01 08:00:00)" + }, + { + "sample_id": "s02", + "timestamp": 1701424800000, + "description": "Unix timestamp in milliseconds (2023-12-01 10:00:00)" + }, + { + "sample_id": "s03", + "timestamp": 1701432000, + "description": "Unix timestamp in seconds (2023-12-01 12:00:00)" + }, + { + "sample_id": "s04", + "timestamp": "2024-12-01T14:00:00Z", + "description": "ISO 8601 format" + } + ] +} diff --git a/data_integrity/examples/remote_urls_data.json b/data_integrity/examples/remote_urls_data.json new file mode 100644 index 0000000..b50dde8 --- /dev/null +++ b/data_integrity/examples/remote_urls_data.json @@ -0,0 +1,29 @@ +{ + "person_id": "p03", + "metadata": { + "collection_date": "2024-12-01", + "storage": "cloud" + }, + "samples": [ + { + "sample_id": "s01", + "timestamp": "2024-12-01T09:00:00Z", + "description": "Remote storage test", + "image": "https://storage.example.com/images/frame_001.jpg", + "audio": "s3://my-bucket/audio/recording_001.wav" + }, + { + "sample_id": "s02", + "timestamp": "2024-12-01T10:00:00Z", + "description": "Mixed local and remote", + "image": "data_integrity/README.md", + "video": "ftp://server.example.com/videos/clip_002.mp4" + }, + { + "sample_id": "s03", + "timestamp": "2024-12-01T11:00:00Z", + "description": "HTTP URL", + "media": "http://cdn.example.com/media/sample.mp3" + } + ] +} diff --git a/data_integrity/examples/sample_data.json b/data_integrity/examples/sample_data.json new file mode 100644 index 0000000..cce6f91 --- /dev/null +++ b/data_integrity/examples/sample_data.json @@ -0,0 +1,43 @@ +{ + "person_id": "p01", + "metadata": { + "collection_date": "2024-12-01", + "researcher": "Dr. 
Smith" + }, + "samples": [ + { + "sample_id": "s01", + "timestamp": "2024-12-01T08:00:00Z", + "description": "Morning recording session", + "image": "data/person-01/sample-01/frame_001.jpg", + "audio": "data/person-01/sample-01/audio_001.wav" + }, + { + "sample_id": "s02", + "timestamp": "2024-12-01T10:30:00Z", + "description": "Mid-morning follow-up", + "image": "data/person-01/sample-02/frame_002.jpg", + "audio": "data/person-01/sample-02/audio_002.wav" + }, + { + "sample_id": "s03", + "timestamp": "2024-12-01T14:15:00Z", + "description": "Afternoon session", + "image": "data/person-01/sample-03/frame_003.jpg", + "audio": "data/person-01/sample-03/audio_003.wav" + }, + { + "sample_id": "s04", + "timestamp": "2024-12-01T14:10:00Z", + "description": "Out-of-order timestamp (should trigger error)", + "image": "data/person-01/sample-04/frame_004.jpg", + "audio": "data/person-01/sample-04/nonexistent.wav" + }, + { + "sample_id": "s05", + "timestamp": "2025-12-15T18:00:00Z", + "description": "Future timestamp (should trigger error)", + "image": "data/person-01/sample-05/missing_image.jpg" + } + ] +} diff --git a/data_integrity/examples/sample_schema.json b/data_integrity/examples/sample_schema.json new file mode 100644 index 0000000..920e7ca --- /dev/null +++ b/data_integrity/examples/sample_schema.json @@ -0,0 +1,53 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "DREAMS Data Sample", + "description": "Minimal schema for validating DREAMS multimodal time-series data", + "type": "object", + "required": ["person_id", "samples"], + "properties": { + "person_id": { + "type": "string", + "description": "Unique identifier for the person/subject" + }, + "metadata": { + "type": "object", + "description": "Optional metadata about the collection" + }, + "samples": { + "type": "array", + "description": "Time-ordered sequence of samples/events", + "items": { + "type": "object", + "required": ["sample_id", "timestamp"], + "properties": { + "sample_id": { + "type": "string", + "description": "Unique identifier for this sample" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of when this sample was captured" + }, + "description": { + "type": "string", + "description": "Human-readable description" + }, + "image": { + "type": "string", + "description": "Path to image file" + }, + "audio": { + "type": "string", + "description": "Path to audio file" + }, + "video": { + "type": "string", + "description": "Path to video file" + } + } + }, + "minItems": 1 + } + } +} diff --git a/data_integrity/examples/valid_data.json b/data_integrity/examples/valid_data.json new file mode 100644 index 0000000..8914faa --- /dev/null +++ b/data_integrity/examples/valid_data.json @@ -0,0 +1,22 @@ +{ + "person_id": "p02", + "metadata": { + "collection_date": "2024-11-15", + "researcher": "Dr. Jones" + }, + "samples": [ + { + "sample_id": "s01", + "timestamp": "2024-11-15T09:00:00Z", + "description": "First session", + "image": "data_integrity/examples/sample_data.json", + "audio": "data_integrity/README.md" + }, + { + "sample_id": "s02", + "timestamp": "2024-11-15T11:00:00Z", + "description": "Second session", + "image": "data_integrity/examples/sample_schema.json" + } + ] +} diff --git a/data_integrity/path_validator.py b/data_integrity/path_validator.py new file mode 100644 index 0000000..9b0706c --- /dev/null +++ b/data_integrity/path_validator.py @@ -0,0 +1,132 @@ +""" +Media path validation for DREAMS data. 
+ +Verifies that referenced image, audio, and video files exist on the filesystem. +Uses pathlib for cross-platform compatibility. +""" + +from typing import List, Dict, Any +from pathlib import Path + +from .reporter import ValidationIssue, Severity + + +def validate_paths(data: dict, base_dir: Path) -> List[ValidationIssue]: + """ + Validate that all media file paths exist. + + Args: + data: The data containing media paths + base_dir: Base directory for resolving relative paths + + Returns: + List of validation issues + """ + issues = [] + + # Extract paths from data structure + paths_to_check = _extract_media_paths(data) + + for path_info in paths_to_check: + path_str = path_info["path"] + location = path_info["location"] + + if not path_str: + issues.append(ValidationIssue( + severity=Severity.WARNING, + category="path", + message="Empty media path", + location=location + )) + continue + + # Skip remote URLs (for CI/CD and cloud storage) + if _is_remote_url(path_str): + issues.append(ValidationIssue( + severity=Severity.INFO, + category="path", + message="Skipping remote URL validation", + location=location, + details={"url": path_str} + )) + continue + + # Resolve path relative to base_dir + path = Path(path_str) + if not path.is_absolute(): + path = base_dir / path + + # Check existence + if not path.exists(): + issues.append(ValidationIssue( + severity=Severity.ERROR, + category="path", + message=f"Media file not found: {path_str}", + location=location, + details={"resolved_path": str(path)} + )) + elif not path.is_file(): + issues.append(ValidationIssue( + severity=Severity.ERROR, + category="path", + message=f"Path exists but is not a file: {path_str}", + location=location, + details={"resolved_path": str(path)} + )) + + return issues + + +def _is_remote_url(path_str: str) -> bool: + """ + Check if a path is a remote URL. + + Args: + path_str: Path string to check + + Returns: + True if path is a remote URL, False otherwise + """ + remote_schemes = ("http://", "https://", "s3://", "ftp://") + return path_str.lower().startswith(remote_schemes) + + +def _extract_media_paths(data: dict, parent_key: str = "") -> List[Dict[str, str]]: + """ + Recursively extract media paths from nested data structure. + + Looks for common media field names: image, audio, video, media, file_path, etc. + + Args: + data: Data structure to search + parent_key: Parent location for building path strings + + Returns: + List of dicts with 'path' and 'location' keys + """ + media_fields = { + "image", "audio", "video", "media", + "image_path", "audio_path", "video_path", "file_path", + "img", "sound", "recording" + } + + paths = [] + + if isinstance(data, dict): + for key, value in data.items(): + location = f"{parent_key}.{key}" if parent_key else key + + # Check if this key is a media field + if key.lower() in media_fields and isinstance(value, str): + paths.append({"path": value, "location": location}) + + # Recurse into nested structures + elif isinstance(value, (dict, list)): + paths.extend(_extract_media_paths(value, location)) + + elif isinstance(data, list): + for idx, item in enumerate(data): + location = f"{parent_key}[{idx}]" + paths.extend(_extract_media_paths(item, location)) + + return paths diff --git a/data_integrity/reporter.py b/data_integrity/reporter.py new file mode 100644 index 0000000..76cf559 --- /dev/null +++ b/data_integrity/reporter.py @@ -0,0 +1,105 @@ +""" +Unified error reporting for validation issues. 
+ +All validators return lists of ValidationIssue objects for consistent reporting. +""" + +from dataclasses import dataclass +from typing import List, Optional +from enum import Enum + + +class Severity(Enum): + """Issue severity levels.""" + ERROR = "ERROR" + WARNING = "WARNING" + INFO = "INFO" + + +@dataclass +class ValidationIssue: + """Represents a single validation issue.""" + severity: Severity + category: str # e.g., "schema", "path", "temporal" + message: str + location: Optional[str] = None # e.g., "samples[3]", "person_id: p01" + details: Optional[dict] = None + + def to_dict(self) -> dict: + """Convert to JSON-serializable dictionary.""" + return { + "severity": self.severity.value, + "category": self.category, + "message": self.message, + "location": self.location, + "details": self.details + } + + +class ValidationReport: + """Aggregates and formats validation issues.""" + + def __init__(self): + self.issues: List[ValidationIssue] = [] + + def add(self, issue: ValidationIssue): + """Add a validation issue.""" + self.issues.append(issue) + + def extend(self, issues: List[ValidationIssue]): + """Add multiple validation issues.""" + self.issues.extend(issues) + + def has_errors(self) -> bool: + """Check if any errors exist.""" + return any(issue.severity == Severity.ERROR for issue in self.issues) + + def count_by_severity(self) -> dict: + """Count issues by severity.""" + counts = {severity: 0 for severity in Severity} + for issue in self.issues: + counts[issue.severity] += 1 + return counts + + def to_dict(self) -> dict: + """Convert to JSON-serializable dictionary for CI/CD pipelines.""" + counts = self.count_by_severity() + return { + "summary": { + "total_issues": len(self.issues), + "errors": counts[Severity.ERROR], + "warnings": counts[Severity.WARNING], + "info": counts[Severity.INFO], + "has_errors": self.has_errors() + }, + "issues": [issue.to_dict() for issue in self.issues] + } + + def format_summary(self) -> str: + """Generate human-readable summary.""" + if not self.issues: + return "✓ All validation checks passed." + + counts = self.count_by_severity() + lines = [ + "\n" + "=" * 60, + "VALIDATION REPORT", + "=" * 60, + ] + + for issue in self.issues: + icon = "✗" if issue.severity == Severity.ERROR else "⚠" if issue.severity == Severity.WARNING else "ℹ" + location_str = f" [{issue.location}]" if issue.location else "" + lines.append(f"\n{icon} {issue.severity.value} ({issue.category}){location_str}") + lines.append(f" {issue.message}") + if issue.details: + for key, value in issue.details.items(): + lines.append(f" {key}: {value}") + + lines.append("\n" + "-" * 60) + lines.append(f"Summary: {counts[Severity.ERROR]} errors, " + f"{counts[Severity.WARNING]} warnings, " + f"{counts[Severity.INFO]} info") + lines.append("=" * 60 + "\n") + + return "\n".join(lines) diff --git a/data_integrity/schema_validator.py b/data_integrity/schema_validator.py new file mode 100644 index 0000000..3450c94 --- /dev/null +++ b/data_integrity/schema_validator.py @@ -0,0 +1,82 @@ +""" +JSON Schema validation for DREAMS data structure. + +Validates that JSON data conforms to an expected schema. +Falls back gracefully if jsonschema library is not installed. +""" + +from typing import List, Optional +from pathlib import Path +import json + +from .reporter import ValidationIssue, Severity + + +def validate_schema(data: dict, schema_path: Optional[Path] = None) -> List[ValidationIssue]: + """ + Validate data against JSON schema. 
+ + Args: + data: The data to validate + schema_path: Path to JSON schema file (optional) + + Returns: + List of validation issues + """ + issues = [] + + if schema_path is None: + return issues # Schema validation is optional + + if not schema_path.exists(): + issues.append(ValidationIssue( + severity=Severity.WARNING, + category="schema", + message=f"Schema file not found: {schema_path}", + location="validator" + )) + return issues + + # Try to load schema + try: + with open(schema_path, 'r', encoding='utf-8') as f: + schema = json.load(f) + except json.JSONDecodeError as e: + issues.append(ValidationIssue( + severity=Severity.ERROR, + category="schema", + message=f"Invalid JSON in schema file: {e}", + location=str(schema_path) + )) + return issues + + # Try to use jsonschema if available + try: + import jsonschema + from jsonschema import Draft7Validator + + validator = Draft7Validator(schema) + errors = sorted(validator.iter_errors(data), key=lambda e: e.path) + + for error in errors: + # Build location path + location = ".".join(str(p) for p in error.absolute_path) if error.absolute_path else "root" + + issues.append(ValidationIssue( + severity=Severity.ERROR, + category="schema", + message=error.message, + location=location, + details={"schema_path": str(error.schema_path)} + )) + + except ImportError: + issues.append(ValidationIssue( + severity=Severity.INFO, + category="schema", + message="jsonschema library not installed - skipping schema validation", + location="validator", + details={"hint": "pip install jsonschema"} + )) + + return issues diff --git a/data_integrity/temporal_validator.py b/data_integrity/temporal_validator.py new file mode 100644 index 0000000..b20190b --- /dev/null +++ b/data_integrity/temporal_validator.py @@ -0,0 +1,248 @@ +""" +Temporal consistency validation for DREAMS data. + +Validates timestamp ordering and detects temporal anomalies: +- Future timestamps +- Non-monotonic sequences +- Out-of-order events +""" + +from typing import List, Dict, Any, Optional +from datetime import datetime, timezone + +from .reporter import ValidationIssue, Severity + + +def validate_temporal(data: dict, strict_monotonic: bool = False) -> List[ValidationIssue]: + """ + Validate temporal consistency of time-ordered data. + + Args: + data: The data containing timestamps + strict_monotonic: If True, require strictly increasing timestamps (no duplicates) + + Returns: + List of validation issues + """ + issues = [] + + # Extract timestamps from data structure + timestamps = _extract_timestamps(data) + + if not timestamps: + issues.append(ValidationIssue( + severity=Severity.WARNING, + category="temporal", + message="No timestamps found in data", + location="root" + )) + return issues + + # Check for future timestamps + now = datetime.now(timezone.utc) + for ts_info in timestamps: + ts = ts_info["timestamp"] + location = ts_info["location"] + + if ts > now: + issues.append(ValidationIssue( + severity=Severity.ERROR, + category="temporal", + message=f"Future timestamp detected: {ts.isoformat()}", + location=location, + details={"current_time": now.isoformat()} + )) + + # Check temporal ordering + issues.extend(_check_ordering(timestamps, strict_monotonic)) + + return issues + + +def _extract_timestamps(data: dict, parent_key: str = "") -> List[Dict[str, Any]]: + """ + Recursively extract timestamps from data structure. + + Looks for common timestamp field names and ISO 8601 formatted strings. 
+ + Args: + data: Data structure to search + parent_key: Parent location for building path strings + + Returns: + List of dicts with 'timestamp', 'location', and 'raw' keys + """ + timestamp_fields = { + "timestamp", "time", "datetime", "created_at", "recorded_at", + "date", "ts", "event_time", "capture_time" + } + + timestamps = [] + + if isinstance(data, dict): + for key, value in data.items(): + location = f"{parent_key}.{key}" if parent_key else key + + # Check if this is a timestamp field + if key.lower() in timestamp_fields: + ts = _parse_timestamp(value) + if ts: + timestamps.append({ + "timestamp": ts, + "location": location, + "raw": value + }) + + # Recurse into nested structures + elif isinstance(value, (dict, list)): + timestamps.extend(_extract_timestamps(value, location)) + + elif isinstance(data, list): + for idx, item in enumerate(data): + location = f"{parent_key}[{idx}]" + timestamps.extend(_extract_timestamps(item, location)) + + return timestamps + + +def _parse_timestamp(value: Any) -> Optional[datetime]: + """ + Parse a timestamp from various formats. + + Supports: + - ISO 8601 strings + - Unix timestamps in seconds (int/float) + - Unix timestamps in milliseconds (auto-detected) + - datetime objects + + Returns: + datetime object with timezone, or None if parsing fails + """ + if isinstance(value, datetime): + # Ensure timezone awareness + if value.tzinfo is None: + return value.replace(tzinfo=timezone.utc) + return value + + if isinstance(value, str): + # Try ISO 8601 format + try: + dt = datetime.fromisoformat(value.replace('Z', '+00:00')) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except (ValueError, AttributeError): + pass + + if isinstance(value, (int, float)): + # Try Unix timestamp + try: + timestamp_value = value + # Check if timestamp is in milliseconds + # Year 9999 in seconds = 253402300800 + if timestamp_value > 253402300800: + timestamp_value = timestamp_value / 1000 + + return datetime.fromtimestamp(timestamp_value, tz=timezone.utc) + except (ValueError, OSError): + pass + + return None + + +def _check_ordering(timestamps: List[Dict[str, Any]], strict: bool) -> List[ValidationIssue]: + """ + Check if timestamps are in correct temporal order. 
+ + Args: + timestamps: List of timestamp info dicts + strict: If True, require strictly increasing (no duplicates) + + Returns: + List of validation issues + """ + issues = [] + + if len(timestamps) < 2: + return issues + + # Sort by location to check sequence within arrays + # Group by parent array (e.g., all items in "samples") + grouped = _group_by_parent_array(timestamps) + + for group_name, group_timestamps in grouped.items(): + for i in range(1, len(group_timestamps)): + prev = group_timestamps[i - 1] + curr = group_timestamps[i] + + prev_ts = prev["timestamp"] + curr_ts = curr["timestamp"] + + if strict: + if curr_ts <= prev_ts: + issues.append(ValidationIssue( + severity=Severity.ERROR, + category="temporal", + message=f"Non-strictly-monotonic timestamps: {curr_ts.isoformat()} <= {prev_ts.isoformat()}", + location=f"{curr['location']} (previous: {prev['location']})", + details={ + "current": curr["raw"], + "previous": prev["raw"] + } + )) + else: + if curr_ts < prev_ts: + issues.append(ValidationIssue( + severity=Severity.ERROR, + category="temporal", + message=f"Out-of-order timestamps: {curr_ts.isoformat()} < {prev_ts.isoformat()}", + location=f"{curr['location']} (previous: {prev['location']})", + details={ + "current": curr["raw"], + "previous": prev["raw"] + } + )) + + return issues + + +def _group_by_parent_array(timestamps: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: + """ + Group timestamps by their parent array for sequential checking. + + For example, "samples[0].timestamp" and "samples[1].timestamp" + should be in the same group. + """ + groups = {} + + for ts_info in timestamps: + location = ts_info["location"] + + # Extract parent array (e.g., "samples" from "samples[0].timestamp") + if '[' in location: + parent = location.split('[')[0] + else: + parent = "root" + + if parent not in groups: + groups[parent] = [] + groups[parent].append(ts_info) + + # Sort each group by index + for parent, group in groups.items(): + group.sort(key=lambda x: _extract_index(x["location"])) + + return groups + + +def _extract_index(location: str) -> int: + """Extract array index from location string.""" + if '[' not in location: + return 0 + + try: + start = location.index('[') + end = location.index(']', start) + return int(location[start + 1:end]) + except (ValueError, IndexError): + return 0 diff --git a/data_integrity/validator.py b/data_integrity/validator.py new file mode 100644 index 0000000..2cb8c38 --- /dev/null +++ b/data_integrity/validator.py @@ -0,0 +1,137 @@ +""" +DREAMS Data Integrity Validator - CLI Entry Point + +Usage: + python -m data_integrity.validator --input data.json [options] + +Example: + python -m data_integrity.validator \\ + --input examples/sample_data.json \\ + --schema examples/sample_schema.json \\ + --base-dir ./ +""" + +import argparse +import json +import sys +from pathlib import Path + +from .reporter import ValidationReport, Severity +from .schema_validator import validate_schema +from .path_validator import validate_paths +from .temporal_validator import validate_temporal + + +def main(): + """Main CLI entry point.""" + parser = argparse.ArgumentParser( + description="DREAMS Data Integrity Validator (Phase-1)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Validate with schema + python -m data_integrity.validator --input data.json --schema schema.json + + # Validate without schema + python -m data_integrity.validator --input data.json --base-dir ./data + + # Strict temporal ordering + python -m 
data_integrity.validator --input data.json --strict-temporal + """ + ) + + parser.add_argument( + "--input", "-i", + required=True, + type=Path, + help="Path to input JSON data file" + ) + + parser.add_argument( + "--schema", "-s", + type=Path, + help="Path to JSON schema file (optional)" + ) + + parser.add_argument( + "--base-dir", "-b", + type=Path, + default=Path("."), + help="Base directory for resolving relative media paths (default: current directory)" + ) + + parser.add_argument( + "--strict-temporal", + action="store_true", + help="Require strictly increasing timestamps (no duplicates)" + ) + + parser.add_argument( + "--quiet", "-q", + action="store_true", + help="Only print errors, suppress warnings and info" + ) + + parser.add_argument( + "--json", + action="store_true", + help="Output results as JSON (for CI/CD pipelines)" + ) + + args = parser.parse_args() + + # Validate input file exists + if not args.input.exists(): + print(f"Error: Input file not found: {args.input}", file=sys.stderr) + return 1 + + # Load data + try: + with open(args.input, 'r', encoding='utf-8') as f: + data = json.load(f) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in input file: {e}", file=sys.stderr) + return 1 + + # Initialize report + report = ValidationReport() + + # Run validators + print(f"Validating: {args.input}") + print(f"Base directory: {args.base_dir.resolve()}") + if args.schema: + print(f"Schema: {args.schema}") + print() + + # Schema validation + if args.schema: + print("Running schema validation...") + issues = validate_schema(data, args.schema) + report.extend(issues) + + # Path validation + print("Running path validation...") + issues = validate_paths(data, args.base_dir) + report.extend(issues) + + # Temporal validation + print("Running temporal validation...") + issues = validate_temporal(data, strict_monotonic=args.strict_temporal) + report.extend(issues) + + # Filter by severity if quiet mode + if args.quiet: + report.issues = [i for i in report.issues if i.severity == Severity.ERROR] + + # Print report in requested format + if args.json: + print(json.dumps(report.to_dict(), indent=2)) + else: + print(report.format_summary()) + + # Exit with appropriate code + return 1 if report.has_errors() else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/TEST_PLAN.md b/docs/TEST_PLAN.md new file mode 100644 index 0000000..5a3a43c --- /dev/null +++ b/docs/TEST_PLAN.md @@ -0,0 +1,514 @@ +# DREAMS Testing Strategy and Validation Plan + +## Overview + +Comprehensive testing strategy for the DREAMS (Digitization for Recovery: Exploring Arts with Mining for Societal well-being) project, covering all modules including sentiment analysis, keyword clustering, location-proximity analysis, and system integration. This plan ensures robust validation of photo memory analysis for personal recovery journeys. 
+ +## Overall Testing Strategy + +### Testing Pyramid +- **Unit Tests (70%)**: Individual component testing with high coverage +- **Integration Tests (20%)**: Module interaction and API validation +- **End-to-End Tests (10%)**: Complete user workflow verification + +### Testing Principles +- **Test-Driven Development**: Write tests before implementation where possible +- **Continuous Integration**: Automated testing on every commit +- **Code Coverage**: Minimum 85% coverage for critical paths +- **Performance Benchmarks**: Establish and monitor performance metrics +- **Security Testing**: Include privacy and data protection validation + +### Test Environments +- **Development**: Local testing with mock data +- **Staging**: Full system testing with realistic datasets +- **Production**: Monitoring and canary deployments + +### Quality Gates +- All unit tests pass +- Integration tests successful +- Code review approval +- Performance benchmarks met +- Security scan clean + +## Validation Plan + +### Functional Validation +- **Feature Completeness**: All requirements implemented and tested +- **Data Accuracy**: Sentiment scores, proximity calculations, clustering results validated against expected outcomes +- **API Compliance**: REST endpoints return correct responses +- **User Interface**: Dashboard displays accurate analytics + +### Performance Validation +- **Response Times**: API calls < 2 seconds, analysis < 30 seconds +- **Scalability**: Handle 1000+ photos per user +- **Resource Usage**: Memory < 1GB, CPU utilization reasonable +- **Concurrent Users**: Support multiple simultaneous analyses + +### Security Validation +- **Data Privacy**: Location data anonymized, user consent enforced +- **Access Control**: Authentication and authorization working +- **Input Validation**: SQL injection, XSS, and other attacks prevented +- **Audit Logging**: Sensitive operations logged appropriately + +### Usability Validation +- **User Experience**: Intuitive dashboard navigation +- **Error Handling**: Clear error messages and recovery options +- **Accessibility**: WCAG compliance for web interfaces +- **Cross-browser**: Compatible with major browsers + +## Module-Specific Testing + +### Core DREAMS Modules +- **Sentiment Analysis**: Caption emotion classification accuracy +- **Keyword Clustering**: Thematic grouping validation +- **Location-Proximity Analysis**: Multi-dimensional proximity calculations +- **Image Analysis**: Object detection and emotion recognition + +### Integration Testing +- **Data Flow**: Photo upload → analysis → storage → visualization +- **API Integration**: Frontend-backend communication +- **Database Operations**: CRUD operations and data consistency +- **External Services**: Model loading and API calls + +## Continuous Integration Pipeline + +### Automated Testing Stages +1. **Linting**: Code style and quality checks +2. **Unit Tests**: Fast feedback on component changes +3. **Integration Tests**: Module interaction validation +4. **Performance Tests**: Benchmarking against thresholds +5. **Security Scans**: Vulnerability assessment +6. 
**Deployment**: Automated staging deployment + +### Test Reporting +- **Coverage Reports**: Detailed coverage by module +- **Performance Metrics**: Historical performance tracking +- **Failure Analysis**: Root cause identification +- **Trend Analysis**: Test stability and reliability + +## Risk Mitigation + +### High-Risk Areas +- **ML Model Accuracy**: Regular validation against ground truth +- **Location Privacy**: Strict data handling protocols +- **Scalability**: Load testing and optimization +- **Data Loss**: Backup and recovery testing + +### Contingency Plans +- **Test Failures**: Automated rollback procedures +- **Performance Issues**: Optimization sprints +- **Security Vulnerabilities**: Immediate patching protocols +- **Data Incidents**: Incident response procedures + +--- + +## Location-Proximity Module Test Plan + +## Overview + +Comprehensive testing strategy for the location-proximity analysis module, covering unit tests, integration tests, and edge case validation. + +## Test Categories + +### 1. Unit Tests +- Individual component functionality +- Input validation and error handling +- Mathematical accuracy verification + +### 2. Integration Tests +- Component interaction validation +- End-to-end workflow testing +- API endpoint verification + +### 3. Performance Tests +- Large dataset processing +- Memory usage optimization +- Response time benchmarks + +### 4. Edge Case Tests +- Boundary conditions +- Invalid input handling +- Error recovery scenarios + +--- + +## Location Extractor Tests + +### Unit Tests + +#### Test Case: LE-UT-001 +**Description**: Extract GPS coordinates from valid EXIF data +**Input**: Image with GPS EXIF tags (lat: 64.8378, lon: -147.7164) +**Expected Output**: `{'lat': 64.8378, 'lon': -147.7164, 'altitude': None}` +**Priority**: High + +#### Test Case: LE-UT-002 +**Description**: Handle image without GPS data +**Input**: Image file with no EXIF GPS information +**Expected Output**: `None` +**Priority**: High + +#### Test Case: LE-UT-003 +**Description**: Process corrupted image file +**Input**: Corrupted/invalid image file +**Expected Output**: Exception handling, return `None` +**Priority**: Medium + +#### Test Case: LE-UT-004 +**Description**: Extract GPS with altitude information +**Input**: Image with GPS + altitude EXIF data +**Expected Output**: `{'lat': 64.8378, 'lon': -147.7164, 'altitude': 150.5}` +**Priority**: Low + +### Edge Cases + +#### Test Case: LE-EC-001 +**Description**: GPS coordinates at boundary values +**Input**: Image with lat=90.0, lon=180.0 +**Expected Output**: Valid coordinate extraction +**Priority**: Medium + +#### Test Case: LE-EC-002 +**Description**: Non-existent file path +**Input**: Path to non-existent image file +**Expected Output**: FileNotFoundError handling +**Priority**: High + +--- + +## Proximity Calculator Tests + +### Unit Tests + +#### Test Case: PC-UT-001 +**Description**: Calculate geographic distance (Haversine) +**Input**: +- Location 1: (64.8378, -147.7164) - Fairbanks +- Location 2: (61.2181, -149.9003) - Anchorage +**Expected Output**: ~417 km +**Tolerance**: ±1 km +**Priority**: High + +#### Test Case: PC-UT-002 +**Description**: Categorical proximity - same type +**Input**: +- Place 1: {'type': 'church', 'name': 'St. 
Mary'} +- Place 2: {'type': 'church', 'name': 'Holy Trinity'} +**Expected Output**: 1.0 (perfect match) +**Priority**: High + +#### Test Case: PC-UT-003 +**Description**: Categorical proximity - related types +**Input**: +- Place 1: {'type': 'hospital', 'name': 'General Hospital'} +- Place 2: {'type': 'clinic', 'name': 'Health Clinic'} +**Expected Output**: 0.5 (related match) +**Priority**: Medium + +#### Test Case: PC-UT-004 +**Description**: Categorical proximity - unrelated types +**Input**: +- Place 1: {'type': 'church', 'name': 'St. Mary'} +- Place 2: {'type': 'restaurant', 'name': 'Pizza Place'} +**Expected Output**: 0.0 (no match) +**Priority**: Medium + +#### Test Case: PC-UT-005 +**Description**: Linguistic similarity calculation +**Input**: +- Place 1: {'language': 'portuguese', 'name': 'Casa do Bacalhau'} +- Place 2: {'language': 'portuguese', 'name': 'Restaurante Lisboa'} +**Expected Output**: 1.0 (same language) +**Priority**: Medium + +#### Test Case: PC-UT-006 +**Description**: Cultural similarity (Jaccard index) +**Input**: +- Place 1: {'cultural_tags': ['european', 'catholic', 'traditional']} +- Place 2: {'cultural_tags': ['european', 'traditional', 'historic']} +**Expected Output**: 0.5 (2 common out of 4 total unique tags) +**Priority**: Medium + +#### Test Case: PC-UT-007 +**Description**: Composite proximity calculation +**Input**: Two locations with all proximity dimensions +**Expected Output**: Weighted sum: α·P_geo + β·P_cat + γ·P_ling + δ·P_cult +**Weights**: α=0.3, β=0.4, γ=0.15, δ=0.15 +**Priority**: High + +### Edge Cases + +#### Test Case: PC-EC-001 +**Description**: Identical locations +**Input**: Same GPS coordinates and attributes +**Expected Output**: Proximity score = 1.0 +**Priority**: High + +#### Test Case: PC-EC-002 +**Description**: Maximum distance locations +**Input**: Antipodal points (opposite sides of Earth) +**Expected Output**: Geographic proximity ≈ 0.0 +**Priority**: Low + +#### Test Case: PC-EC-003 +**Description**: Missing attribute handling +**Input**: Location with missing 'type' field +**Expected Output**: Graceful degradation, use available dimensions +**Priority**: Medium + +--- + +## Emotion-Location Mapper Tests + +### Unit Tests + +#### Test Case: ELM-UT-001 +**Description**: Add sentiment data for location +**Input**: +- Location ID: 'loc_001' +- Sentiment: 0.8 (positive) +- Timestamp: '2024-01-15T10:30:00Z' +**Expected Output**: Successfully stored sentiment record +**Priority**: High + +#### Test Case: ELM-UT-002 +**Description**: Calculate location sentiment profile +**Input**: Location with 5 sentiment records: [0.8, 0.6, 0.9, 0.7, 0.5] +**Expected Output**: +- Mean: 0.7 +- Std: 0.15 +- Count: 5 +**Priority**: High + +#### Test Case: ELM-UT-003 +**Description**: Identify emotional hotspot +**Input**: Location with consistent positive sentiment (≥60% above 0.6) +**Expected Output**: Classified as positive hotspot +**Priority**: Medium + +#### Test Case: ELM-UT-004 +**Description**: Compare place types sentiment +**Input**: Multiple churches and hospitals with sentiment data +**Expected Output**: Mean sentiment for 'church' (e.g., 0.75) is significantly higher than for 'hospital' (e.g., 0.4), with a t-test p-value < 0.05 +**Priority**: Medium + +#### Test Case: ELM-UT-005 +**Description**: Temporal emotion trend analysis +**Input**: Location with sentiment data over 6 months +**Expected Output**: Trend is 'improving' if linear regression slope > 0.1, 'declining' if slope < -0.1, 'stable' if -0.1 ≤ slope ≤ 0.1 +**Priority**: Low 
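+
+The temporal-trend expectation in ELM-UT-005 can be pinned down with a small reference helper. The sketch below is illustrative only: `sentiment_slope` and `classify_trend` are hypothetical names rather than part of the module API, and the slope unit (sentiment change per 30-day month) is an assumption. It fits an ordinary least-squares line to (timestamp, score) pairs and applies the ±0.1 thresholds stated above.
+
+```python
+from datetime import datetime
+from typing import List, Tuple
+
+
+def sentiment_slope(records: List[Tuple[datetime, float]]) -> float:
+    """Ordinary least-squares slope of sentiment per 30-day month."""
+    if len(records) < 2:
+        return 0.0
+    xs = [(t - records[0][0]).days / 30.0 for t, _ in records]
+    ys = [s for _, s in records]
+    n = len(records)
+    mean_x, mean_y = sum(xs) / n, sum(ys) / n
+    num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
+    den = sum((x - mean_x) ** 2 for x in xs)
+    return num / den if den else 0.0
+
+
+def classify_trend(slope: float) -> str:
+    """Thresholds from ELM-UT-005: > 0.1 improving, < -0.1 declining, else stable."""
+    if slope > 0.1:
+        return "improving"
+    if slope < -0.1:
+        return "declining"
+    return "stable"
+```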
+ +### Edge Cases + +#### Test Case: ELM-EC-001 +**Description**: Location with single sentiment record +**Input**: One sentiment value for location +**Expected Output**: Profile with count=1, std=0 +**Priority**: Medium + +#### Test Case: ELM-EC-002 +**Description**: Location with no sentiment data +**Input**: Request profile for location without data +**Expected Output**: Empty profile or appropriate default +**Priority**: High + +--- + +## Semantic Clustering Tests + +### Unit Tests + +#### Test Case: SC-UT-001 +**Description**: DBSCAN clustering with optimal parameters +**Input**: +- 15 locations (5 churches, 5 hospitals, 5 restaurants) +- eps=0.4, min_samples=2 +**Expected Output**: 3 distinct clusters +**Priority**: High + +#### Test Case: SC-UT-002 +**Description**: Cluster quality metrics calculation +**Input**: Generated clusters from test dataset +**Expected Output**: +- Silhouette score > 0.5 +- Davies-Bouldin index < 1.0 +**Priority**: Medium + +#### Test Case: SC-UT-003 +**Description**: Find similar place patterns +**Input**: +- Portuguese restaurants across different locations +- Proximity threshold: 0.6 +**Expected Output**: Grouped similar restaurants +**Priority**: Medium + +#### Test Case: SC-UT-004 +**Description**: Noise point identification +**Input**: Dataset with outlier locations +**Expected Output**: Outliers labeled as noise (-1) +**Priority**: Low + +### Edge Cases + +#### Test Case: SC-EC-001 +**Description**: Insufficient data for clustering +**Input**: Dataset with only 2 locations +**Expected Output**: Both points are labeled as noise (cluster label -1) if their distance exceeds eps. Otherwise, they form a single cluster. +**Priority**: Medium + +#### Test Case: SC-EC-002 +**Description**: All locations identical +**Input**: Multiple locations with identical attributes +**Expected Output**: Single cluster containing all points +**Priority**: Low + +--- + +## Integration Tests + +### Test Case: INT-001 +**Description**: End-to-end proximity analysis workflow +**Steps**: +1. Extract GPS from test images +2. Calculate multi-dimensional proximity +3. Map emotions to locations +4. Perform semantic clustering +**Expected Output**: Complete analysis pipeline execution +**Priority**: High + +### Test Case: INT-002 +**Description**: API endpoint integration +**Steps**: +1. POST /api/locations/analyze with image data +2. Verify proximity calculations +3. Check emotion mapping results +**Expected Output**: Valid JSON response with analysis results +**Priority**: High + +### Test Case: INT-003 +**Description**: Database integration +**Steps**: +1. Store location and sentiment data +2. Retrieve for analysis +3. 
Update with new calculations +**Expected Output**: Persistent data storage and retrieval +**Priority**: Medium + +--- + +## Performance Tests + +### Test Case: PERF-001 +**Description**: Large dataset processing +**Input**: 1000 locations with full attribute data +**Expected Output**: Processing time < 30 seconds +**Priority**: Medium + +### Test Case: PERF-002 +**Description**: Memory usage optimization +**Input**: Batch processing of 500 images +**Expected Output**: Memory usage < 1GB peak +**Priority**: Low + +### Test Case: PERF-003 +**Description**: Concurrent request handling +**Input**: 10 simultaneous proximity calculations +**Expected Output**: All requests complete successfully +**Priority**: Low + +--- + +## Test Data Requirements + +### Synthetic Test Dataset +``` +tests/data/ +├── images/ +│ ├── church_with_gps.jpg +│ ├── hospital_no_gps.jpg +│ ├── restaurant_corrupted.jpg +│ └── park_with_altitude.jpg +├── locations.json +├── sentiments.csv +└── expected_results.json +``` + +### Location Test Data Structure +```json +{ + "locations": [ + { + "id": "loc_001", + "name": "St. Mary's Church", + "type": "church", + "coordinates": {"lat": 64.8378, "lon": -147.7164}, + "language": "english", + "cultural_tags": ["christian", "traditional", "community"] + } + ] +} +``` + +### Sentiment Test Data Structure +```csv +location_id,sentiment_score,timestamp,user_id +loc_001,0.8,2024-01-15T10:30:00Z,user_123 +loc_001,0.6,2024-01-16T14:20:00Z,user_123 +``` + +--- + +## Test Execution Strategy + +### Automated Testing +- **Unit Tests**: Run on every commit (pytest) +- **Integration Tests**: Run on pull requests +- **Performance Tests**: Run weekly on main branch + +### Manual Testing +- **Edge Case Validation**: Monthly review +- **User Acceptance Testing**: Before major releases +- **Security Testing**: Quarterly assessment + +### Continuous Integration +```yaml +# .github/workflows/test.yml +- name: Run Location-Proximity Tests + run: | + cd location_proximity + pytest test_proximity.py -v --cov=. 
+ pytest ../tests/integration/ -v +``` + +--- + +## Success Criteria + +### Unit Test Coverage +- **Minimum**: 85% code coverage +- **Target**: 95% code coverage +- **Critical paths**: 100% coverage + +### Performance Benchmarks +- **Proximity calculation**: < 10ms per pair +- **Clustering**: < 5 seconds for 100 locations +- **Memory usage**: < 100MB for standard operations + +### Quality Metrics +- **All tests pass**: Zero failing tests in CI +- **Code quality**: Pylint score > 8.0 +- **Documentation**: All public functions documented + +--- + +## Test Maintenance + +### Regular Updates +- Update test data quarterly +- Review edge cases after bug reports +- Benchmark performance monthly + +### Test Data Management +- Version control test datasets +- Anonymize real-world test data +- Maintain data consistency across environments \ No newline at end of file diff --git a/dream-integration/.gitignore b/dream-integration/.gitignore new file mode 100644 index 0000000..1d87888 --- /dev/null +++ b/dream-integration/.gitignore @@ -0,0 +1 @@ +venv310/ \ No newline at end of file diff --git a/dream-integration/app/.env.example b/dream-integration/app/.env.example new file mode 100644 index 0000000..f715f6c --- /dev/null +++ b/dream-integration/app/.env.example @@ -0,0 +1 @@ +MONGO_URI= \ No newline at end of file diff --git a/dream-integration/app/.gitignore b/dream-integration/app/.gitignore new file mode 100644 index 0000000..2eea525 --- /dev/null +++ b/dream-integration/app/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/dream-integration/app/app.py b/dream-integration/app/app.py index a22763c..72fba06 100644 --- a/dream-integration/app/app.py +++ b/dream-integration/app/app.py @@ -1,146 +1,90 @@ +import cmd import os import json import glob import subprocess import sys -from flask import Flask, render_template, request, redirect, url_for, send_from_directory, flash +from flask import Flask, render_template, request, redirect, url_for, flash, Response + +from db import users_col, samples_col, results_col, fs app = Flask(__name__, template_folder="templates", static_folder="static") app.secret_key = "dev" -# ---- PATHS ---- -BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) #dream-integration + +BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) DATA_DIR = os.path.join(BASE_DIR, "data") ANALYSIS_SCRIPTS_DIR = os.path.join(BASE_DIR, "analysis") -ALLOWED_IMG_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"} + def list_persons(): - """Return sorted list of person folder names under data/.""" - if not os.path.isdir(DATA_DIR): - return [] - return sorted([d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))]) + return sorted(u["person_id"] for u in users_col.find()) def list_samples(person): - pdir = os.path.join(DATA_DIR, person) - if not os.path.isdir(pdir): - return [] - return sorted([d for d in os.listdir(pdir) if os.path.isdir(os.path.join(pdir, d)) and d.startswith("sample")]) - -def find_image(sample_dir): - for f in os.listdir(sample_dir): - _, ext = os.path.splitext(f) - if ext.lower() in ALLOWED_IMG_EXTS: - return f - return None - -def find_transcript(sample_dir): - # For both clip-*.txt and transcript-*.txt - p = os.path.join(sample_dir, "transcript.txt") - if os.path.isfile(p): - return os.path.basename(p) - - cand = sorted(glob.glob(os.path.join(sample_dir, "transcript-*.txt"))) - if cand: - return os.path.basename(cand[0]) - - cand2 = sorted(glob.glob(os.path.join(sample_dir, 
"clip-*.txt"))) - if cand2: - return os.path.basename(cand2[0]) - - return None - -def find_audio(sample_dir): - """Find any .mp3 or .wav file in sample_dir. Returns basename or None.""" - cand = sorted(glob.glob(os.path.join(sample_dir, "*.mp3")) + - glob.glob(os.path.join(sample_dir, "*.wav"))) - return os.path.basename(cand[0]) if cand else None - -def find_description(sample_dir): - cand = sorted(glob.glob(os.path.join(sample_dir, "description*.txt"))) - return os.path.basename(cand[0]) if cand else None - -def get_analysis_dir(person, sample): - """ - Preferred: data//analysis-// - Backward-compat: if data//analysis-p01 exists, use that instead. - """ - person_dir = os.path.join(DATA_DIR, person) - legacy = os.path.join(person_dir, "analysis-p01", sample) - if os.path.isdir(os.path.join(person_dir, "analysis-p01")): - os.makedirs(legacy, exist_ok=True) - return legacy - - modern = os.path.join(person_dir, f"analysis-{person}", sample) - os.makedirs(modern, exist_ok=True) - return modern - -def read_text(path): - if path and os.path.isfile(path): - with open(path, "r", encoding="utf-8") as f: - return f.read() - return "" - -def read_json(path): - if path and os.path.isfile(path): - with open(path, "r", encoding="utf-8") as f: - return json.load(f) - return {} - -def media_url(rel_path): - # serve any file from data/ through /media/ - return url_for("serve_media", path=rel_path) - -@app.route("/media/") -def serve_media(path): - """Serve files from data/ for previews (images, etc.).""" - safe_root = DATA_DIR - directory = os.path.dirname(path) - filename = os.path.basename(path) - return send_from_directory(os.path.join(safe_root, directory), filename) + return sorted( + s["sample_id"] + for s in samples_col.find({"person_id": person}) + ) + +def get_sample(person, sample): + return samples_col.find_one( + {"person_id": person, "sample_id": sample} + ) + +def read_text(text): + return text or "" + +def read_scores(person, sample, key): + r = results_col.find_one( + {"person_id": person, "sample_id": sample} + ) + return r.get(key, {}) if r else {} + + +@app.route("/media/image//") +def serve_image(person, sample): + s = get_sample(person, sample) + if not s or "image_id" not in s: + return "", 404 + + file = fs.get(s["image_id"]) + return Response(file.read(), mimetype="image/png") + + @app.route("/", methods=["GET", "POST"]) def index(): persons = list_persons() if not persons: - return "No persons found under ./data/. Create data/person-01/sample-01 and add files.", 200 + return "No persons found in MongoDB.", 200 + - # selection (defaults to first) person = request.values.get("person", persons[0]) samples = list_samples(person) if not samples: - return f"No samples found for {person}. 
Create data/{person}/sample-01.", 200 - sample = request.values.get("sample", samples[0]) + return f"No samples found for {person}.", 200 - # build paths - sample_dir = os.path.join(DATA_DIR, person, sample) - img_name = find_image(sample_dir) - transcript_name = find_transcript(sample_dir) - description_name = find_description(sample_dir) - - # relative paths (for /media) - img_rel = f"{person}/{sample}/{img_name}" if img_name else None - transcript_rel = f"{person}/{sample}/{transcript_name}" if transcript_name else None - description_rel = f"{person}/{sample}/{description_name}" if description_name else None - - # analysis output dir + expected jsons - out_dir = get_analysis_dir(person, sample) - text_json = os.path.join(out_dir, "text_scores.json") - image_json = os.path.join(out_dir, "image_scores.json") + sample = request.values.get("sample", samples[0]) + s = get_sample(person, sample) context = { "persons": persons, "samples": samples, "selected_person": person, "selected_sample": sample, - "img_url": media_url(img_rel) if img_rel else None, - "transcript_text": read_text(os.path.join(DATA_DIR, transcript_rel)) if transcript_rel else "", - "description_text": read_text(os.path.join(DATA_DIR, description_rel)) if description_rel else "", - "text_scores": read_json(text_json), - "image_scores": read_json(image_json), + "img_url": url_for("serve_image", person=person, sample=sample) + if s and "image_id" in s else None, + "transcript_text": read_text(s.get("transcript") if s else ""), + "description_text": read_text(s.get("description") if s else ""), + "text_scores": read_scores(person, sample, "text_scores"), + "image_scores": read_scores(person, sample, "image_scores"), } + return render_template("index.html", **context) + + @app.route("/analyze", methods=["POST"]) def analyze(): """ @@ -148,117 +92,120 @@ def analyze(): text_analysis.py: python analysis/text_analysis.py --transcript --description --output (As text_scores.json) - image_analysis.py: python analysis/image_analysis.py --image --output + + Analysis scripts REQUIRE filesystem paths. + We keep data/ as a staging area. 
""" person = request.form["person"] sample = request.form["sample"] sample_dir = os.path.join(DATA_DIR, person, sample) person_dir = os.path.join(DATA_DIR, person) - out_dir = get_analysis_dir(person, sample) + os.makedirs(sample_dir, exist_ok=True) + + + image_patterns = ["*.png", "*.jpg", "*.jpeg", "*.gif", "*.bmp", "*.webp"] + + def _find_first_file(directory, patterns): + for pattern in patterns: + matches = glob.glob(os.path.join(directory, pattern)) + if matches: + return matches[0] + return None +# In analyze(): + img_path = _find_first_file(sample_dir, ["*.png", "*.jpg", "*.jpeg", "*.gif", "*.bmp", "*.webp"]) + transcript_path = _find_first_file(sample_dir, ["transcript*.txt", "clip-*.txt"]) + description_path = _find_first_file(sample_dir, ["description*.txt"]) + + transcript_path = None + for pattern in ("transcript*.txt", "clip-*.txt"): + matches = glob.glob(os.path.join(sample_dir, pattern)) + if matches: + transcript_path = matches[0] + break + + description_matches = glob.glob(os.path.join(sample_dir, "description*.txt")) + description_path = description_matches[0] if description_matches else None + + + out_dir = os.path.join(sample_dir, "analysis") os.makedirs(out_dir, exist_ok=True) - # Find files - img = find_image(sample_dir) - transcript = find_transcript(sample_dir) - description = find_description(sample_dir) - - # Transcribe audio if transcript missing - if not transcript: - audio_basename = find_audio(sample_dir) - if audio_basename: - audio_path = os.path.join(sample_dir, audio_basename) - before_txt = set(glob.glob(os.path.join(sample_dir, "*.txt"))) - - try: - cmd_audio = [ - sys.executable, - os.path.join(ANALYSIS_SCRIPTS_DIR, "transcribe_and_save.py"), - audio_path - ] - subprocess.run(cmd_audio, check=True) - - after_txt = set(glob.glob(os.path.join(sample_dir, "*.txt"))) - new_txt = sorted(list(after_txt - before_txt)) - - # ignore description files - new_txt_filtered = [p for p in new_txt if not os.path.basename(p).lower().startswith("description")] - - chosen = None - if new_txt_filtered: - chosen = new_txt_filtered[0] - else: - fallback = find_transcript(sample_dir) - if fallback: - chosen = os.path.join(sample_dir, fallback) - - if chosen: - target = os.path.join(sample_dir, "transcript.txt") - if os.path.abspath(chosen) != os.path.abspath(target): - if os.path.exists(target): - os.remove(target) - os.replace(chosen, target) - transcript = "transcript.txt" - else: - flash("Transcription finished but no transcript file detected.", "error") - - except subprocess.CalledProcessError as e: - flash(f"Audio transcription failed: {e}", "error") - - # Paths - img_path = os.path.join(sample_dir, img) if img else None - transcript_path = os.path.join(sample_dir, transcript) if transcript else None - description_path = os.path.join(sample_dir, description) if description else None - - # Text analysis - NEW: Check if output already exists + # Text analysis text_out = os.path.join(out_dir, "text_scores.json") if not os.path.exists(text_out) and (transcript_path or description_path): - cmd_text = [ + cmd = [ sys.executable, os.path.join(ANALYSIS_SCRIPTS_DIR, "text_analysis.py"), "--output", text_out ] if transcript_path: - cmd_text += ["--transcript", transcript_path] + cmd += ["--transcript", transcript_path] if description_path: - cmd_text += ["--description", description_path] + cmd += ["--description", description_path] try: - subprocess.run(cmd_text, check=True) - # Fallback if script ignored --output (check inside sample folder too) - legacy_candidate = 
os.path.join(person_dir, "analysis-p01", sample, "text_scores.json") + subprocess.run(cmd, check=True) + + + legacy_candidate = os.path.join( + person_dir, "analysis-p01", sample, "text_scores.json" + ) if not os.path.exists(text_out) and os.path.exists(legacy_candidate): os.makedirs(os.path.dirname(text_out), exist_ok=True) os.replace(legacy_candidate, text_out) + except subprocess.CalledProcessError as e: flash(f"Text analysis failed: {e}", "error") - # Image analysis - ensure output lands in correct folder + + + # Image analysis img_out = os.path.join(out_dir, "image_scores.json") - if img_path: - # Only run if file doesn't already exist - if not os.path.exists(img_out): - cmd_img = [ - sys.executable, - os.path.join(ANALYSIS_SCRIPTS_DIR, "image_analysis.py"), - "--image", img_path, - "--output", img_out - ] - try: - subprocess.run(cmd_img, check=True) - # Fallback: if script ignored --output - legacy_candidate_img = os.path.join(person_dir, "analysis-p01", sample, "image_scores.json") - if not os.path.exists(img_out) and os.path.exists(legacy_candidate_img): - os.makedirs(os.path.dirname(img_out), exist_ok=True) - os.replace(legacy_candidate_img, img_out) - except subprocess.CalledProcessError as e: - flash(f"Image analysis failed: {e}", "error") + if img_path and not os.path.exists(img_out): + cmd_img = [ + sys.executable, + os.path.join(ANALYSIS_SCRIPTS_DIR, "image_analysis.py"), + "--image", img_path, + "--output", img_out + ] + + try: + subprocess.run(cmd_img, check=True) + + # Fallback if script ignored --output (legacy behavior) + legacy_candidate_img = os.path.join( + person_dir, "analysis-p01", sample, "image_scores.json" + ) + if not os.path.exists(img_out) and os.path.exists(legacy_candidate_img): + os.makedirs(os.path.dirname(img_out), exist_ok=True) + os.replace(legacy_candidate_img, img_out) + + except subprocess.CalledProcessError as e: + flash(f"Image analysis failed: {e}", "error") + + results = {} + if os.path.exists(text_out): + with open(text_out) as f: + results["text_scores"] = json.load(f) + + if os.path.exists(img_out): + with open(img_out) as f: + results["image_scores"] = json.load(f) + + if results: + results_col.update_one( + {"person_id": person, "sample_id": sample}, + {"$set": results}, + upsert=True + ) flash("Analysis complete.", "success") return redirect(url_for("index", person=person, sample=sample)) + + if __name__ == "__main__": - # run: python app/app.py - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/dream-integration/app/db.py b/dream-integration/app/db.py new file mode 100644 index 0000000..50b47aa --- /dev/null +++ b/dream-integration/app/db.py @@ -0,0 +1,18 @@ +from pymongo import MongoClient +from gridfs import GridFS +import os +from dotenv import load_dotenv + +load_dotenv() + +mongo_uri = os.getenv("MONGO_URI") +if not mongo_uri: + raise RuntimeError("MONGO_URI environment variable not set. Please create a .env file with it.") +client = MongoClient(mongo_uri) + +db = client["dreams"] + +users_col = db.users +samples_col = db.samples +results_col = db.results +fs = GridFS(db) diff --git a/dream-integration/contributing.md b/dream-integration/contributing.md index 024d2d1..ff72cb3 100644 --- a/dream-integration/contributing.md +++ b/dream-integration/contributing.md @@ -1,23 +1,26 @@ -# Contributing to DREAMS Multimodal Emotion & Sentiment Analysis +# Contributing to DREAMS Multimodal Emotion & Sentiment Analysis Thank you for your interest in contributing! 
🙌 -This project is the groundwork for multimodal (audio, text, image) emotion and sentiment analysis with a structured pipeline and visualization. +This project is the groundwork for multimodal (audio, text, image) emotion and sentiment analysis with a structured pipeline and visualization. -We welcome improvements, fixes, and feature enhancements. Please follow the setup and usage steps carefully before contributing. +We welcome improvements, fixes, and feature enhancements. Please follow the setup and usage steps carefully before contributing. --- -## Setup Instructions +## Setup Instructions -1. **Fork and Clone** - - Fork this repository to your GitHub account. - - Clone your fork locally: - ```bash - git clone https://github.com//dream-integration.git - cd dream-integration - ``` +1. **Fork and Clone** + + - Fork this repository to your GitHub account. + - Clone your fork locally: + + ```bash + git clone https://github.com//dream-integration.git + cd dream-integration + ``` + +2. **Create Virtual Environment** -2. **Create Virtual Environment** ```bash python3 -m venv venv source venv/bin/activate # On Linux/Mac @@ -25,69 +28,68 @@ We welcome improvements, fixes, and feature enhancements. Please follow the setu ``` 3. **Install Requirements** - ```bash - pip install -r requirements.txt - ``` + ```bash + pip install -r requirements.txt + ``` 4. **Run Local Setup** - Navigate to the app directory and start Flask: + Navigate to the app directory and start Flask: - ```bash - cd app - python app.py - ``` + ```bash + cd app + python app.py + ``` ## Usage Instructions 1. Place your data in the following structure: - ```text - data/ - person-01/ - sample-01/ - clip-01.mp3 - image-01.jpg - description-01.txt - ``` + ```text + data/ + person-01/ + sample-01/ + clip-01.mp3 + image-01.jpg + description-01.txt + ``` 2. Run Modules Individually (to verify setup): - - Audio → Text (Whisper): - ```bash - python analysis/transcribe_and_save.py data/person-01/sample-01/clip-01.mp3 - ``` - **Note: A transcript.txt file should appear inside the sample folder.** - - - Text Emotion Analysis (DistilBERT): - ```bash - python analysis/text_analysis.py --transcript data/person-01/sample-01/transcript.txt \ - --description data/person-01/sample-01/description-01.txt \ - --output data/person-01/analysis-p01/sample-01/text_scores.json - ``` - **Note: text_scores.json should be created with proper emotion scores for the description and transcript both.** + - Audio → Text (Whisper): + ```bash + python analysis/transcribe_and_save.py data/person-01/sample-01/clip-01.mp3 + ``` + **Note: A transcript.txt file should appear inside the sample folder.** - - Image Emotion Analysis (DeepFace): - ```bash - python analysis/image_analysis.py --image data/person-01/sample-01/image-01.jpg \ - --output data/person-01/analysis-p01/sample-01/image_scores.json - ``` - **Note: image_scores.json should be created successfully.** + - Text Emotion Analysis (DistilBERT): + ```bash + python analysis/text_analysis.py --transcript data/person-01/sample-01/transcript.txt \ + --description data/person-01/sample-01/description-01.txt \ + --output data/person-01/analysis-p01/sample-01/text_scores.json + ``` -3. 
Once all modules generate their outputs, run the web app again: - ```bash - python app/app.py - ``` - **Note: The setup is complete now, you should now see all results (transcript + scores + visualizations) in the browser.** + **Note: text_scores.json should be created with proper emotion scores for the description and transcript both.** + - Image Emotion Analysis (DeepFace): -## Contributing Guidelines + ```bash + python analysis/image_analysis.py --image data/person-01/sample-01/image-01.jpg \ + --output data/person-01/analysis-p01/sample-01/image_scores.json + ``` -- Please read the Guidelines.md file carefully before contributing. -- Keep commits meaningful and modular (e.g., fix: improve transcript path handling). -- Always test your changes locally before submitting a Pull Request. -- Ensure the existing folder structure is not broken. -- Open an issue if you are proposing a change or a new feature. + **Note: image_scores.json should be created successfully.** +3. Once all modules generate their outputs, run the web app again: + ```bash + python app/app.py + ``` + **Note: The setup is complete now, you should now see all results (transcript + scores + visualizations) in the browser.** +## Contributing Guidelines +- Please read the Guidelines.md file carefully before contributing. +- Keep commits meaningful and modular (e.g., fix: improve transcript path handling). +- Always test your changes locally before submitting a Pull Request. +- Ensure the existing folder structure is not broken. +- Open an issue if you are proposing a change or a new feature. diff --git a/dream-integration/data/person-01/sample-01/analysis/image_scores.json b/dream-integration/data/person-01/sample-01/analysis/image_scores.json new file mode 100644 index 0000000..19c08ef --- /dev/null +++ b/dream-integration/data/person-01/sample-01/analysis/image_scores.json @@ -0,0 +1,12 @@ +{ + "dominant_emotion": "happy", + "emotion_scores": { + "angry": 0.023879146203398705, + "disgust": 1.6406268343871488e-07, + "fear": 0.013068304397165775, + "happy": 97.26760864257812, + "sad": 0.010611607693135738, + "surprise": 0.04025931656360626, + "neutral": 2.6445772647857666 + } +} \ No newline at end of file diff --git a/dream-integration/data/person-01/sample-01/analysis/text_scores.json b/dream-integration/data/person-01/sample-01/analysis/text_scores.json new file mode 100644 index 0000000..f5e1a73 --- /dev/null +++ b/dream-integration/data/person-01/sample-01/analysis/text_scores.json @@ -0,0 +1,32 @@ +{ + "description-01.txt": [ + { + "label": "joy", + "score": 0.9864851236343384 + }, + { + "label": "neutral", + "score": 0.005278781056404114 + }, + { + "label": "sadness", + "score": 0.004085569176822901 + }, + { + "label": "disgust", + "score": 0.0014205715851858258 + }, + { + "label": "anger", + "score": 0.001338689005933702 + }, + { + "label": "surprise", + "score": 0.0010697629768401384 + }, + { + "label": "fear", + "score": 0.0003214319876860827 + } + ] +} \ No newline at end of file diff --git a/dream-integration/script.py b/dream-integration/script.py new file mode 100644 index 0000000..e03246f --- /dev/null +++ b/dream-integration/script.py @@ -0,0 +1,151 @@ +import os +import json +from app.db import users_col, samples_col, results_col, fs + + + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.path.join(BASE_DIR, "data") + + + + +IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".bmp"} +AUDIO_EXTS = {".mp3", ".wav"} + + + +def store_file_gridfs(path): + with open(path, "rb") as f: + return 
fs.put(f, filename=os.path.basename(path)) + +def read_text_file(path): + with open(path, "r", encoding="utf-8") as f: + return f.read() + + + +def migrate(): + if not os.path.isdir(DATA_DIR): + print("❌ data/ directory not found") + return + + for person in os.listdir(DATA_DIR): + person_dir = os.path.join(DATA_DIR, person) + if not os.path.isdir(person_dir): + continue + + print(f"\n👤 Migrating person: {person}") + + users_col.update_one( + {"person_id": person}, + {"$set": {"person_id": person}}, + upsert=True + ) + + for sample in os.listdir(person_dir): + if not sample.startswith("sample"): + continue + + sample_dir = os.path.join(person_dir, sample) + if not os.path.isdir(sample_dir): + continue + + print(f" 📦 Sample: {sample}") + + image_id = None + audio_id = None + transcript = None + transcript_fallback = None + description = "" + + + for file in os.listdir(sample_dir): + path = os.path.join(sample_dir, file) + name, ext = os.path.splitext(file) + ext = ext.lower() + + if ext in IMAGE_EXTS: + image_id = store_file_gridfs(path) + print(f" 🖼 Image stored: {file}") + + elif ext in AUDIO_EXTS: + audio_id = store_file_gridfs(path) + print(f" 🔊 Audio stored: {file}") + + elif ext == ".txt": + if name.startswith("transcript"): + transcript = read_text_file(path) + print(" 📄 Transcript loaded (transcript*)") + + elif name.startswith("clip") and transcript_fallback is None: + transcript_fallback = read_text_file(path) + + elif name.startswith("description"): + description = read_text_file(path) + print(" 📝 Description loaded") + + + if transcript is None and transcript_fallback is not None: + transcript = transcript_fallback + print(" 📄 Transcript loaded (clip* fallback)") + + samples_col.update_one( + {"person_id": person, "sample_id": sample}, + { + "$set": { + "person_id": person, + "sample_id": sample, + "image_id": image_id, + "audio_id": audio_id, + "transcript": transcript or "", + "description": description + } + }, + upsert=True + ) + + + analysis_dirs = [ + os.path.join(person_dir, "analysis-p01", sample), + os.path.join(person_dir, f"analysis-{person}", sample) + ] + + for a_dir in analysis_dirs: + if not os.path.isdir(a_dir): + continue + + print(f" 📊 Migrating analysis from {a_dir}") + + text_scores = {} + image_scores = {} + + text_path = os.path.join(a_dir, "text_scores.json") + image_path = os.path.join(a_dir, "image_scores.json") + + if os.path.isfile(text_path): + with open(text_path) as f: + text_scores = json.load(f) + + if os.path.isfile(image_path): + with open(image_path) as f: + image_scores = json.load(f) + + if text_scores or image_scores: + results_col.update_one( + {"person_id": person, "sample_id": sample}, + {"$set": { + "text_scores": text_scores, + "image_scores": image_scores + }}, + upsert=True + ) + + print("\n✅ Migration completed successfully.") + + + + + +if __name__ == "__main__": + migrate() diff --git a/dreamsApp/analytics/__init__.py b/dreamsApp/analytics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dreamsApp/analytics/emotion_episode.py b/dreamsApp/analytics/emotion_episode.py new file mode 100644 index 0000000..a6327f0 --- /dev/null +++ b/dreamsApp/analytics/emotion_episode.py @@ -0,0 +1,86 @@ +# dreamsApp/analytics/emotion_episode.py + +from dataclasses import dataclass +from datetime import datetime +from typing import Tuple, Optional, Dict, Any + +from .emotion_timeline import EmotionEvent + + +__all__ = ['Episode'] + + +@dataclass(frozen=True) +class Episode: + start_time: datetime + end_time: datetime + events: 
Tuple[EmotionEvent, ...] = () + source_subject_id: Optional[str] = None + + def __post_init__(self) -> None: + if not isinstance(self.events, tuple): + object.__setattr__(self, 'events', tuple(self.events)) + + if self.start_time > self.end_time: + raise ValueError( + f"start_time must be <= end_time: " + f"{self.start_time} > {self.end_time}" + ) + + for i, event in enumerate(self.events): + if event.timestamp < self.start_time: + raise ValueError( + f"Event at index {i} has timestamp {event.timestamp} " + f"before episode start_time {self.start_time}" + ) + if event.timestamp >= self.end_time: + raise ValueError( + f"Event at index {i} has timestamp {event.timestamp} " + f"at or after episode end_time {self.end_time}" + ) + + for i in range(len(self.events) - 1): + if self.events[i].timestamp > self.events[i + 1].timestamp: + raise ValueError( + f"Events must be chronologically ordered. " + f"Event at index {i} ({self.events[i].timestamp}) " + f"occurs after event at index {i + 1} ({self.events[i + 1].timestamp})" + ) + + def __len__(self) -> int: + return len(self.events) + + def is_empty(self) -> bool: + return len(self.events) == 0 + + def duration(self) -> float: + return (self.end_time - self.start_time).total_seconds() + + def contains_timestamp(self, timestamp: datetime) -> bool: + return self.start_time <= timestamp < self.end_time + + def to_dict(self) -> Dict[str, Any]: + events_list = [] + for event in self.events: + event_dict = { + 'timestamp': event.timestamp.isoformat(), + 'emotion_label': event.emotion_label, + } + if event.score is not None: + event_dict['score'] = event.score + if event.source_id is not None: + event_dict['source_id'] = event.source_id + if event.metadata is not None: + event_dict['metadata'] = event.metadata + events_list.append(event_dict) + + result: Dict[str, Any] = { + 'start_time': self.start_time.isoformat(), + 'end_time': self.end_time.isoformat(), + 'events': events_list, + } + + if self.source_subject_id is not None: + result['source_subject_id'] = self.source_subject_id + + return result diff --git a/dreamsApp/analytics/emotion_proximity.py b/dreamsApp/analytics/emotion_proximity.py new file mode 100644 index 0000000..1763a6d --- /dev/null +++ b/dreamsApp/analytics/emotion_proximity.py @@ -0,0 +1,506 @@ +# dreamsApp/analytics/emotion_proximity.py + +""" +Time-Aware Emotion Proximity Utility (PR-3) + +Provides deterministic, structural comparison utilities for EmotionTimeline objects. +This module segments timelines into fixed time windows and computes simple numeric +distance metrics between aligned windows. 
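+
+Illustrative usage (a sketch; `events_a` and `events_b` stand for lists of
+EmotionEvent objects assembled elsewhere):
+
+    >>> from datetime import timedelta
+    >>> tl_a = EmotionTimeline(events_a)
+    >>> tl_b = EmotionTimeline(events_b)
+    >>> result = compare_timelines_distance(tl_a, tl_b, timedelta(minutes=5))
+    >>> result['window_count'] >= result['matched_windows']
+    True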
+ +WHAT THIS MODULE DOES: +- Maps emotion labels to ordinal numeric values (positive=1, neutral=0, negative=-1) +- Segments an EmotionTimeline into fixed-duration time windows +- Aggregates emotion scores per window (mean of mapped values) +- Compares two timelines over aligned windows using simple distance metrics + +WHAT THIS MODULE DOES NOT DO: +- Perform ML, inference, clustering, or learning +- Interpret emotions semantically or psychologically +- Detect trends, patterns, or anomalies +- Read from databases or external storage +- Persist results or expose APIs +- Visualize or render data +- Make assumptions about causality or meaning + +All operations are: +- Deterministic (same input → same output, when anchor_time is provided) +- Reversible (no data loss in transformations) +- Structural (no interpretation or inference) +- Side-effect free (pure functions) + +IMPORTANT DESIGN NOTES: +- When comparing two empty timelines without an explicit anchor_time, + behavior is undefined; callers should always provide anchor_time for + deterministic results in edge cases. +- Window indices can be negative if events occur before the anchor_time. +- Sparse window representation: only windows with events are stored. + +Dependencies: +- EmotionTimeline and EmotionEvent from emotion_timeline.py (PR-2) +""" + +from datetime import datetime, timedelta +from typing import Dict, List, Tuple, Optional, Any +from .emotion_timeline import EmotionTimeline, EmotionEvent + + +__all__ = [ + 'EMOTION_LABEL_MAP', + 'map_emotion_label', + 'segment_timeline_into_windows', + 'aggregate_window_scores', + 'get_aligned_window_range', + 'compare_timelines_distance', + 'compute_timeline_self_similarity', +] + + +# Simple ordinal mapping for emotion labels +# This is a structural convention, NOT a semantic interpretation +# The numeric values are ordinal only; distances between them are not meaningful +EMOTION_LABEL_MAP: Dict[str, float] = { + 'positive': 1.0, + 'neutral': 0.0, + 'negative': -1.0, +} + + +def map_emotion_label(label: str) -> float: + """ + Map an emotion label to its numeric ordinal value. + + This mapping is purely structural for distance computation. + It does NOT imply semantic meaning, intensity, or psychological interpretation. + + Args: + label: Emotion label string (case-insensitive). Must be a non-empty string. + + Returns: + Numeric value: 1.0 (positive), 0.0 (neutral), -1.0 (negative) + Returns 0.0 for unrecognized labels (treated as neutral structurally) + + Raises: + TypeError: If label is not a string + ValueError: If label is an empty string + + Example: + >>> map_emotion_label('positive') + 1.0 + >>> map_emotion_label('NEGATIVE') + -1.0 + >>> map_emotion_label('unknown') + 0.0 + """ + if not isinstance(label, str): + raise TypeError(f"label must be a string, got {type(label).__name__}") + if not label: + raise ValueError("label must not be an empty string") + return EMOTION_LABEL_MAP.get(label.lower(), 0.0) + + +def segment_timeline_into_windows( + timeline: EmotionTimeline, + window_duration: timedelta, + anchor_time: Optional[datetime] = None +) -> Dict[int, List[EmotionEvent]]: + """ + Segment an EmotionTimeline into fixed-duration time windows. + + Events are assigned to windows based on their timestamp. Window indices + are integers starting from 0 at the anchor time. Each window spans + [anchor + i*duration, anchor + (i+1)*duration). + + This is a structural grouping operation. 
It does NOT: + - Interpolate missing windows + - Smooth or aggregate data + - Interpret temporal patterns + + Design notes: + - Window indices can be negative if events precede anchor_time + - Only windows containing events are returned (sparse representation) + - Events exactly on window boundaries belong to the new window + + Args: + timeline: EmotionTimeline to segment (must be an EmotionTimeline instance) + window_duration: Duration of each window (must be positive timedelta) + anchor_time: Reference time for window alignment. + Defaults to timeline.start_time() if not provided. + + Returns: + Dict mapping window index (int) to list of EmotionEvent objects in that window. + Only windows containing events are included (sparse representation). + + Raises: + TypeError: If timeline is not an EmotionTimeline + TypeError: If window_duration is not a timedelta + ValueError: If window_duration is not positive + ValueError: If timeline is empty and no anchor_time provided + + Example: + >>> # Timeline with events at t=0s, t=30s, t=90s + >>> # Window duration = 60s + >>> # Result: {0: [event_0s, event_30s], 1: [event_90s]} + """ + if not isinstance(timeline, EmotionTimeline): + raise TypeError(f"timeline must be an EmotionTimeline, got {type(timeline).__name__}") + if not isinstance(window_duration, timedelta): + raise TypeError(f"window_duration must be a timedelta, got {type(window_duration).__name__}") + if window_duration <= timedelta(0): + raise ValueError("window_duration must be positive") + + if timeline.is_empty(): + if anchor_time is None: + raise ValueError("Cannot segment empty timeline without anchor_time") + return {} + + if anchor_time is None: + anchor_time = timeline.start_time() + + windows: Dict[int, List[EmotionEvent]] = {} + + for event in timeline.events: + # Compute window index for this event + time_offset = event.timestamp - anchor_time + offset_seconds = time_offset.total_seconds() + window_seconds = window_duration.total_seconds() + + # Handle events before anchor time (negative indices) + # Floor division ensures correct window assignment for negative offsets + window_index = int(offset_seconds // window_seconds) + + if window_index not in windows: + windows[window_index] = [] + windows[window_index].append(event) + + return windows + + +def aggregate_window_scores( + windowed_events: Dict[int, List[EmotionEvent]], + use_event_scores: bool = False +) -> Dict[int, float]: + """ + Compute aggregate score for each window. + + For each window, computes the mean of either: + - Mapped emotion labels (default): map_emotion_label(event.emotion_label) + - Event scores (if use_event_scores=True and scores exist) + + This is a simple arithmetic mean with no weighting, smoothing, or + statistical adjustment. It does NOT interpret or infer meaning. + + Design notes: + - Windows with no valid scores (e.g., all None when use_event_scores=True) + are omitted from output + - Empty input returns empty dict + + Args: + windowed_events: Dict from segment_timeline_into_windows() + use_event_scores: If True, use event.score values instead of mapped labels. + Events without scores are skipped. + + Returns: + Dict mapping window index to aggregate score (float). + Windows with no valid scores are omitted. 
+ + Raises: + TypeError: If windowed_events is not a dict + + Example: + >>> # Window 0 has ['positive', 'neutral'] → mean([1.0, 0.0]) = 0.5 + >>> # Window 1 has ['negative'] → mean([-1.0]) = -1.0 + >>> # Result: {0: 0.5, 1: -1.0} + """ + if not isinstance(windowed_events, dict): + raise TypeError(f"windowed_events must be a dict, got {type(windowed_events).__name__}") + + aggregates: Dict[int, float] = {} + + for window_index, events in windowed_events.items(): + if not events: + # Skip empty event lists (should not occur from segment_timeline_into_windows, + # but guard defensively) + continue + + if use_event_scores: + scores = [e.score for e in events if e.score is not None] + else: + scores = [map_emotion_label(e.emotion_label) for e in events] + + if scores: + aggregates[window_index] = sum(scores) / len(scores) + + return aggregates + + +def get_aligned_window_range( + scores_a: Dict[int, float], + scores_b: Dict[int, float] +) -> Tuple[int, int]: + """ + Determine the overlapping window index range between two score dicts. + + Returns the inclusive range [min_index, max_index] covering all windows + present in either input. This defines the alignment range for comparison. + + Design notes: + - Returns union of indices, not intersection + - (0, -1) is a sentinel indicating empty/invalid range (max < min) + + Args: + scores_a: Aggregated scores for timeline A + scores_b: Aggregated scores for timeline B + + Returns: + Tuple (min_index, max_index) of the union of window indices. + Returns (0, -1) if both inputs are empty (indicating no valid range). + + Raises: + TypeError: If either argument is not a dict + + Example: + >>> get_aligned_window_range({0: 0.5, 2: -0.5}, {1: 0.0, 3: 1.0}) + (0, 3) + """ + if not isinstance(scores_a, dict): + raise TypeError(f"scores_a must be a dict, got {type(scores_a).__name__}") + if not isinstance(scores_b, dict): + raise TypeError(f"scores_b must be a dict, got {type(scores_b).__name__}") + + all_indices = set(scores_a.keys()) | set(scores_b.keys()) + + if not all_indices: + # Sentinel value: max < min indicates empty range + return (0, -1) + + return (min(all_indices), max(all_indices)) + + +def compare_timelines_distance( + timeline_a: EmotionTimeline, + timeline_b: EmotionTimeline, + window_duration: timedelta, + anchor_time: Optional[datetime] = None, + use_event_scores: bool = False, + missing_value: float = 0.0 +) -> Dict[str, Any]: + """ + Compare two EmotionTimelines using simple distance metrics over aligned windows. + + Segments both timelines into fixed windows, aggregates scores per window, + then computes distance metrics across the aligned window range. + + Distance metrics computed: + - mean_absolute_difference: Mean of |score_a - score_b| per window + - sum_squared_difference: Sum of (score_a - score_b)^2 per window + - window_count: Number of windows in aligned range + - matched_windows: Number of windows with data in both timelines + - per_window_differences: Dict of window_index → (score_a, score_b, difference) + + This is a structural comparison. It does NOT: + - Interpret differences as meaningful changes + - Detect trends or anomalies + - Apply statistical tests or significance measures + - Handle causality or directionality + + Design notes: + - If both timelines are empty and no anchor_time is provided, the function + uses datetime.now() as a fallback. For deterministic behavior, always + provide an explicit anchor_time. 
+ - missing_value is used for windows that exist in the aligned range but + have no events in one timeline (sparse window handling) + + Args: + timeline_a: First EmotionTimeline + timeline_b: Second EmotionTimeline + window_duration: Duration of each window + anchor_time: Reference time for alignment. + Defaults to earliest start_time of either timeline. + WARNING: If both timelines are empty, defaults to datetime.now() + which breaks determinism. Provide explicit anchor_time for + deterministic results. + use_event_scores: Use event.score instead of mapped labels + missing_value: Value to use when a window has no data (default 0.0) + + Returns: + Dict containing: + - mean_absolute_difference: float (or None if no windows) + - sum_squared_difference: float + - window_count: int + - matched_windows: int + - per_window_differences: Dict[int, Tuple[float, float, float]] + - anchor_time: ISO 8601 string of datetime used for alignment + - window_duration_seconds: float + + Raises: + TypeError: If timeline_a or timeline_b is not an EmotionTimeline + TypeError: If window_duration is not a timedelta + ValueError: If window_duration is not positive + + Example: + >>> result = compare_timelines_distance(timeline_a, timeline_b, timedelta(minutes=5)) + >>> result['mean_absolute_difference'] + 0.25 + """ + if not isinstance(timeline_a, EmotionTimeline): + raise TypeError(f"timeline_a must be an EmotionTimeline, got {type(timeline_a).__name__}") + if not isinstance(timeline_b, EmotionTimeline): + raise TypeError(f"timeline_b must be an EmotionTimeline, got {type(timeline_b).__name__}") + if not isinstance(window_duration, timedelta): + raise TypeError(f"window_duration must be a timedelta, got {type(window_duration).__name__}") + if window_duration <= timedelta(0): + raise ValueError("window_duration must be positive") + + # Determine anchor time for alignment + if anchor_time is None: + start_a = timeline_a.start_time() + start_b = timeline_b.start_time() + + if start_a is None and start_b is None: + # Both timelines are empty and no anchor_time was provided. + # Raise an error to enforce deterministic behavior. 
+ raise ValueError("Cannot compare two empty timelines without an explicit anchor_time.") + elif start_a is None: + anchor_time = start_b + elif start_b is None: + anchor_time = start_a + else: + anchor_time = min(start_a, start_b) + + # Segment both timelines (anchor_time is guaranteed non-None here) + windows_a = segment_timeline_into_windows(timeline_a, window_duration, anchor_time) + windows_b = segment_timeline_into_windows(timeline_b, window_duration, anchor_time) + + # Aggregate scores per window + scores_a = aggregate_window_scores(windows_a, use_event_scores) + scores_b = aggregate_window_scores(windows_b, use_event_scores) + + # Get aligned range + min_idx, max_idx = get_aligned_window_range(scores_a, scores_b) + + # Handle empty case (sentinel: max < min) + if max_idx < min_idx: + return { + 'mean_absolute_difference': None, + 'sum_squared_difference': 0.0, + 'window_count': 0, + 'matched_windows': 0, + 'per_window_differences': {}, + 'anchor_time': anchor_time.isoformat(), + 'window_duration_seconds': window_duration.total_seconds(), + } + + # Compute differences across aligned windows + per_window_differences: Dict[int, Tuple[float, float, float]] = {} + absolute_differences: List[float] = [] + sum_squared = 0.0 + matched_count = 0 + + for idx in range(min_idx, max_idx + 1): + score_a = scores_a.get(idx, missing_value) + score_b = scores_b.get(idx, missing_value) + diff = score_a - score_b + + per_window_differences[idx] = (score_a, score_b, diff) + absolute_differences.append(abs(diff)) + sum_squared += diff * diff + + # Count as matched if both have actual data (not using missing_value) + if idx in scores_a and idx in scores_b: + matched_count += 1 + + window_count = max_idx - min_idx + 1 + mean_abs_diff = sum(absolute_differences) / len(absolute_differences) if absolute_differences else None + + return { + 'mean_absolute_difference': mean_abs_diff, + 'sum_squared_difference': sum_squared, + 'window_count': window_count, + 'matched_windows': matched_count, + 'per_window_differences': per_window_differences, + 'anchor_time': anchor_time.isoformat(), + 'window_duration_seconds': window_duration.total_seconds(), + } + + +def compute_timeline_self_similarity( + timeline: EmotionTimeline, + window_duration: timedelta, + use_event_scores: bool = False +) -> Dict[str, Any]: + """ + Compute self-similarity structure of a single timeline. + + Segments the timeline into windows and returns the score distribution + across windows. This provides a structural fingerprint of the timeline + without interpreting patterns or trends. + + Design notes: + - Uses population variance (divides by N), not sample variance (N-1) + - Empty timelines return zeroed structure + - score_range is (0.0, 0.0) for empty or scoreless timelines + + Args: + timeline: EmotionTimeline to analyze + window_duration: Duration of each window (must be positive) + use_event_scores: Use event.score instead of mapped labels + + Returns: + Dict containing: + - window_scores: Dict[int, float] of window index to score + - score_range: Tuple[float, float] of (min_score, max_score) + - score_variance: float (population variance of scores) + - window_count: int + - total_events: int + + Raises: + TypeError: If timeline is not an EmotionTimeline + TypeError: If window_duration is not a timedelta + ValueError: If window_duration is not positive + + Does NOT interpret variance as volatility or detect meaningful patterns. 
+ """ + if not isinstance(timeline, EmotionTimeline): + raise TypeError(f"timeline must be an EmotionTimeline, got {type(timeline).__name__}") + if not isinstance(window_duration, timedelta): + raise TypeError(f"window_duration must be a timedelta, got {type(window_duration).__name__}") + if window_duration <= timedelta(0): + raise ValueError("window_duration must be positive") + + if timeline.is_empty(): + return { + 'window_scores': {}, + 'score_range': (0.0, 0.0), + 'score_variance': 0.0, + 'window_count': 0, + 'total_events': 0, + } + + windows = segment_timeline_into_windows(timeline, window_duration) + scores = aggregate_window_scores(windows, use_event_scores) + + if not scores: + # All events had no valid scores (e.g., use_event_scores=True but all scores are None) + return { + 'window_scores': {}, + 'score_range': (0.0, 0.0), + 'score_variance': 0.0, + 'window_count': 0, + 'total_events': len(timeline), + } + + score_values = list(scores.values()) + min_score = min(score_values) + max_score = max(score_values) + + # Population variance (not sample variance) - divides by N, not N-1 + mean_score = sum(score_values) / len(score_values) + variance = sum((s - mean_score) ** 2 for s in score_values) / len(score_values) + + return { + 'window_scores': scores, + 'score_range': (min_score, max_score), + 'score_variance': variance, + 'window_count': len(scores), + 'total_events': len(timeline), + } diff --git a/dreamsApp/analytics/emotion_segmentation.py b/dreamsApp/analytics/emotion_segmentation.py new file mode 100644 index 0000000..2e26efa --- /dev/null +++ b/dreamsApp/analytics/emotion_segmentation.py @@ -0,0 +1,297 @@ +# dreamsApp/analytics/emotion_segmentation.py + +""" +Temporal Segmentation Utilities (PR-4) + +Provides structural utilities for slicing and aligning EmotionTimeline objects. +Performs segmentation and alignment without aggregation or comparison. + +Key Features: +- Fixed-duration window segmentation +- Gap-based timeline splitting +- Multi-timeline alignment to shared windows + +Design Principles: +- Immutable operations (returns new EmotionTimeline objects) +- Preserves all events (no data loss) +- Deterministic and side-effect free +- Window boundaries: [start, end) convention + +Dependencies: EmotionTimeline and EmotionEvent from emotion_timeline.py +""" + +from datetime import datetime, timedelta +from typing import List, Dict, Tuple, Optional +from dataclasses import dataclass +from .emotion_timeline import EmotionTimeline, EmotionEvent + + +__all__ = [ + 'TimeWindow', + 'segment_timeline_fixed_windows', + 'segment_timeline_by_gaps', + 'align_timelines_to_windows', +] + + +@dataclass(frozen=True) +class TimeWindow: + """ + Time window with explicit boundaries [start_time, end_time). 
+
+    Attributes:
+        start_time: Start of window (inclusive)
+        end_time: End of window (exclusive)
+        index: Optional numeric index
+    """
+    start_time: datetime
+    end_time: datetime
+    index: Optional[int] = None
+
+    def __post_init__(self):
+        if self.end_time <= self.start_time:
+            raise ValueError(f"end_time must be after start_time: {self.start_time} >= {self.end_time}")
+
+    def duration(self) -> timedelta:
+        """Return the duration of this window."""
+        return self.end_time - self.start_time
+
+    def contains(self, timestamp: datetime) -> bool:
+        """Check if timestamp falls within [start, end)."""
+        return self.start_time <= timestamp < self.end_time
+
+    def __repr__(self) -> str:
+        """Return string representation for debugging."""
+        idx_str = f"idx={self.index}, " if self.index is not None else ""
+        return f"TimeWindow({idx_str}{self.start_time.isoformat()} to {self.end_time.isoformat()})"
+
+
+def segment_timeline_fixed_windows(
+    timeline: EmotionTimeline,
+    window_duration: timedelta,
+    anchor_time: Optional[datetime] = None
+) -> List[Tuple[TimeWindow, EmotionTimeline]]:
+    """
+    Segment timeline into fixed-duration windows.
+
+    Returns new EmotionTimeline objects for each window. Preserves all events.
+    Empty windows included. Window boundaries: [start, end).
+
+    Args:
+        timeline: EmotionTimeline to segment
+        window_duration: Duration of each window (must be positive)
+        anchor_time: Reference time for alignment (defaults to timeline start)
+
+    Returns:
+        List of (TimeWindow, EmotionTimeline) tuples ordered by window index.
+
+    Raises:
+        TypeError: If arguments have wrong types
+        ValueError: If window_duration <= 0 or empty timeline without anchor_time
+    """
+    if not isinstance(timeline, EmotionTimeline):
+        raise TypeError(f"timeline must be an EmotionTimeline, got {type(timeline).__name__}")
+    if not isinstance(window_duration, timedelta):
+        raise TypeError(f"window_duration must be a timedelta, got {type(window_duration).__name__}")
+    if window_duration <= timedelta(0):
+        raise ValueError("window_duration must be positive")
+
+    # Handle empty timeline
+    if timeline.is_empty():
+        if anchor_time is None:
+            raise ValueError("Cannot segment empty timeline without anchor_time")
+        return []
+
+    # Determine anchor time
+    if anchor_time is None:
+        anchor_time = timeline.start_time()
+
+    # Compute window range
+    first_timestamp = timeline.start_time()
+    last_timestamp = timeline.end_time()
+
+    # Calculate window indices for first and last events
+    first_offset = (first_timestamp - anchor_time).total_seconds()
+    last_offset = (last_timestamp - anchor_time).total_seconds()
+    window_seconds = window_duration.total_seconds()
+
+    first_window_idx = int(first_offset // window_seconds)
+    last_window_idx = int(last_offset // window_seconds)
+
+    # Generate all windows in range (including empty ones)
+    segments: List[Tuple[TimeWindow, EmotionTimeline]] = []
+
+    for window_idx in range(first_window_idx, last_window_idx + 1):
+        # Define window boundaries
+        window_start = anchor_time + timedelta(seconds=window_idx * window_seconds)
+        window_end = anchor_time + timedelta(seconds=(window_idx + 1) * window_seconds)
+
+        window = TimeWindow(
+            start_time=window_start,
+            end_time=window_end,
+            index=window_idx
+        )
+
+        # Filter events that fall within this window
+        events_in_window = [
+            event for event in timeline.events
+            if window.contains(event.timestamp)
+        ]
+
+        # Create new EmotionTimeline for this segment, keeping the subject_id and
+        # passing the events tuple explicitly (EmotionTimeline's first field is subject_id)
+        segment_timeline = EmotionTimeline(subject_id=timeline.subject_id, events=tuple(events_in_window))
+
+        segments.append((window, segment_timeline))
+
+ 
return segments + + +def segment_timeline_by_gaps( + timeline: EmotionTimeline, + gap_threshold: timedelta +) -> List[Tuple[TimeWindow, EmotionTimeline]]: + """ + Split timeline at points where time gaps exceed threshold. + + Identifies session boundaries or recording breaks. Each continuous sequence + becomes a separate segment. + + Args: + timeline: EmotionTimeline to split + gap_threshold: Minimum gap to trigger split (must be positive) + + Returns: + List of (TimeWindow, EmotionTimeline) tuples ordered chronologically. + Each segment contains continuous events with gaps < threshold. + + Raises: + TypeError: If arguments have wrong types + ValueError: If gap_threshold <= 0 + """ + if not isinstance(timeline, EmotionTimeline): + raise TypeError(f"timeline must be an EmotionTimeline, got {type(timeline).__name__}") + if not isinstance(gap_threshold, timedelta): + raise TypeError(f"gap_threshold must be a timedelta, got {type(gap_threshold).__name__}") + if gap_threshold <= timedelta(0): + raise ValueError("gap_threshold must be positive") + + # Handle empty timeline + if timeline.is_empty(): + return [] + + # Handle single event + if len(timeline.events) == 1: + event = timeline.events[0] + window = TimeWindow( + start_time=event.timestamp, + end_time=event.timestamp + timedelta(microseconds=1), + index=0 + ) + segment_timeline = EmotionTimeline(subject_id=timeline.subject_id, events=(event,)) + return [(window, segment_timeline)] + + # Split based on gaps + segments: List[Tuple[TimeWindow, EmotionTimeline]] = [] + current_segment_events = [timeline.events[0]] + segment_start = timeline.events[0].timestamp + + for i in range(1, len(timeline.events)): + prev_event = timeline.events[i - 1] + curr_event = timeline.events[i] + gap = curr_event.timestamp - prev_event.timestamp + + if gap >= gap_threshold: + # Gap exceeds threshold - finalize current segment + segment_end = prev_event.timestamp + timedelta(microseconds=1) + window = TimeWindow( + start_time=segment_start, + end_time=segment_end, + index=len(segments) + ) + segment_timeline = EmotionTimeline(subject_id=timeline.subject_id, events=tuple(current_segment_events)) + segments.append((window, segment_timeline)) + + # Start new segment + current_segment_events = [curr_event] + segment_start = curr_event.timestamp + else: + # Continue current segment + current_segment_events.append(curr_event) + + # Finalize last segment + segment_end = timeline.events[-1].timestamp + timedelta(microseconds=1) + window = TimeWindow( + start_time=segment_start, + end_time=segment_end, + index=len(segments) + ) + segment_timeline = EmotionTimeline(subject_id=timeline.subject_id, events=tuple(current_segment_events)) + segments.append((window, segment_timeline)) + + return segments + + +def align_timelines_to_windows( + timelines: List[EmotionTimeline], + windows: List[TimeWindow] +) -> Dict[int, List[EmotionTimeline]]: + """ + Align multiple timelines to shared window boundaries. + + Extracts events from each timeline that fall within each window. + Returns aligned segments as new EmotionTimeline objects. + + Args: + timelines: List of EmotionTimeline objects to align + windows: List of TimeWindow objects defining boundaries + + Returns: + Dict mapping window index to list of aligned segments. + Format: {window_index: [timeline_1_segment, timeline_2_segment, ...]} + Empty segments included as empty EmotionTimeline objects. 
+ + Raises: + TypeError: If arguments have wrong types + ValueError: If timelines or windows lists are empty + """ + # Validate inputs + if not isinstance(timelines, list): + raise TypeError(f"timelines must be a list, got {type(timelines).__name__}") + if not isinstance(windows, list): + raise TypeError(f"windows must be a list, got {type(windows).__name__}") + if not timelines: + raise ValueError("timelines list cannot be empty") + if not windows: + raise ValueError("windows list cannot be empty") + + # Validate timeline types + for i, timeline in enumerate(timelines): + if not isinstance(timeline, EmotionTimeline): + raise TypeError(f"timelines[{i}] must be an EmotionTimeline, got {type(timeline).__name__}") + + # Validate window types + for i, window in enumerate(windows): + if not isinstance(window, TimeWindow): + raise TypeError(f"windows[{i}] must be a TimeWindow, got {type(window).__name__}") + + # Align each timeline to each window + aligned: Dict[int, List[EmotionTimeline]] = {} + + for window in windows: + window_index = window.index if window.index is not None else windows.index(window) + aligned_segments = [] + + for timeline in timelines: + # Extract events within this window + events_in_window = [ + event for event in timeline.events + if window.contains(event.timestamp) + ] + + # Create new EmotionTimeline for this segment + segment_timeline = EmotionTimeline(subject_id=timeline.subject_id, events=tuple(events_in_window)) + aligned_segments.append(segment_timeline) + + aligned[window_index] = aligned_segments + + return aligned diff --git a/dreamsApp/analytics/emotion_timeline.py b/dreamsApp/analytics/emotion_timeline.py new file mode 100644 index 0000000..acb03fb --- /dev/null +++ b/dreamsApp/analytics/emotion_timeline.py @@ -0,0 +1,219 @@ +# dreamsApp/analytics/emotion_timeline.py + +""" +Time-Aware Emotion Timeline Engine + +Provides immutable, chronologically-ordered structural containers for temporal +emotion data. This module is PURELY STRUCTURAL and does NOT perform: +- Sentiment analysis or inference +- Trend detection or prediction +- Statistical aggregation or smoothing +- Emotion interpretation or classification +""" + +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Optional, Dict, Any, Tuple, List + + +@dataclass(frozen=True) +class EmotionEvent: + """ + A single emotional observation tied to a timestamp. + + Immutable structural container for one data point in an emotion timeline. + All validation is assumed to occur upstream. + + Expected emotion_label values: 'positive', 'negative', 'neutral' + (any string accepted; interpretation happens elsewhere) + + Attributes: + timestamp: When the emotion was observed + emotion_label: Emotion category (e.g., 'positive', 'negative', 'neutral') + score: Optional intensity/confidence value + source_id: Optional identifier for data origin (e.g., 'video_analysis') + metadata: Optional additional context + + Does NOT: + - Validate or normalize scores + - Interpret or infer emotions + - Validate timestamps + """ + timestamp: datetime + emotion_label: str + score: Optional[float] = None + source_id: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None + + +@dataclass(frozen=True) +class EmotionTimeline: + """ + Chronologically-ordered, immutable collection of EmotionEvent objects. + + Single reusable temporal abstraction for emotion data across one subject. + Enforces chronological ordering and provides lightweight temporal utilities. 
+ + This is a structural container only—it does NOT perform trend analysis, + volatility detection, proximity logic, or statistical operations. + + Attributes: + subject_id: Identifier for the person/entity + events: Immutable tuple of EmotionEvent objects (must be chronological) + metadata: Optional timeline-level metadata + + Does NOT: + - Analyze or interpret emotions + - Detect trends or predict future states + - Aggregate or smooth data + - Filter or modify events + """ + subject_id: str + events: Tuple[EmotionEvent, ...] = () + metadata: Optional[Dict[str, Any]] = None + + def __post_init__(self): + """ + Enforce chronological ordering of events. + + Raises: + ValueError: If events are not in strict chronological order + """ + if len(self.events) > 1: + for i, (current_event, next_event) in enumerate(zip(self.events, self.events[1:])): + if current_event.timestamp > next_event.timestamp: + raise ValueError( + f"Events must be chronologically ordered. " + f"Event at index {i} ({current_event.timestamp}) " + f"occurs after event at index {i + 1} ({next_event.timestamp})" + ) + + def __len__(self) -> int: + """Number of events in timeline.""" + return len(self.events) + + def is_empty(self) -> bool: + """Check if timeline has no events.""" + return len(self.events) == 0 + + def is_chronologically_ordered(self) -> bool: + """ + Check if events are in chronological order. + + Since EmotionTimeline enforces ordering via __post_init__, + this always returns True for successfully constructed instances. + Provided for testing and validation purposes. + + Returns: + bool: True (ordering is guaranteed by construction) + """ + return True + + def start_time(self) -> Optional[datetime]: + """ + Timestamp of the first event. + + Returns: + datetime of first event, or None if timeline is empty + """ + return None if self.is_empty() else self.events[0].timestamp + + def end_time(self) -> Optional[datetime]: + """ + Timestamp of the last event. + + Returns: + datetime of last event, or None if timeline is empty + """ + return None if self.is_empty() else self.events[-1].timestamp + + def time_span(self) -> Optional[timedelta]: + """ + Total time span from first to last event. + + Returns: + timedelta between first and last event, or None if timeline + has fewer than 2 events + """ + if len(self.events) < 2: + return None + return self.events[-1].timestamp - self.events[0].timestamp + + def time_gaps(self) -> Tuple[timedelta, ...]: + """ + Time deltas between consecutive events. + + Returns tuple of timedelta objects representing gaps between adjacent + events. For N events, returns N-1 gaps. Empty tuple if < 2 events. + + This is a lightweight structural helper—does NOT analyze or interpret gaps. + """ + if len(self.events) < 2: + return () + + gaps = [] + for i in range(len(self.events) - 1): + gaps.append(self.events[i + 1].timestamp - self.events[i].timestamp) + return tuple(gaps) + + def to_dict(self) -> Dict[str, Any]: + """ + Export as JSON-serializable dictionary. + + Converts timestamps to ISO 8601 format. Does NOT modify, aggregate, + or interpret data. 
+ + Returns: + Dict with keys: subject_id, events (list), metadata (optional) + """ + events_list = [] + for event in self.events: + event_dict = { + 'timestamp': event.timestamp.isoformat(), + 'emotion_label': event.emotion_label, + } + if event.score is not None: + event_dict['score'] = event.score + if event.source_id is not None: + event_dict['source_id'] = event.source_id + if event.metadata is not None: + event_dict['metadata'] = event.metadata + events_list.append(event_dict) + + result = { + 'subject_id': self.subject_id, + 'events': events_list, + } + + if self.metadata is not None: + result['metadata'] = self.metadata + + return result + + @classmethod + def from_events( + cls, + subject_id: str, + events: List[EmotionEvent], + metadata: Optional[Dict[str, Any]] = None + ) -> 'EmotionTimeline': + """ + Construct timeline from unsorted list of EmotionEvent objects. + + Convenience constructor that sorts events by timestamp before + creating the immutable timeline. + + Args: + subject_id: Identifier for the subject + events: List of EmotionEvent objects (will be sorted by timestamp) + metadata: Optional timeline-level metadata + + Returns: + EmotionTimeline with chronologically ordered events + """ + sorted_events = sorted(events, key=lambda e: e.timestamp) + return cls( + subject_id=subject_id, + events=tuple(sorted_events), + metadata=metadata + ) diff --git a/dreamsApp/analytics/episode_proximity.py b/dreamsApp/analytics/episode_proximity.py new file mode 100644 index 0000000..97b8122 --- /dev/null +++ b/dreamsApp/analytics/episode_proximity.py @@ -0,0 +1,117 @@ +# dreamsApp/analytics/episode_proximity.py + +from datetime import datetime, timedelta +from enum import Enum +from typing import Tuple + +from .emotion_episode import Episode + + +__all__ = [ + 'ProximityRelation', + 'compute_temporal_overlap', + 'are_episodes_adjacent', + 'classify_episode_proximity', +] + + +class ProximityRelation(Enum): + OVERLAPPING = "overlapping" + ADJACENT = "adjacent" + DISJOINT = "disjoint" + + +def compute_temporal_overlap( + episode_a: Episode, + episode_b: Episode +) -> float: + if not isinstance(episode_a, Episode): + raise TypeError(f"episode_a must be an Episode, got {type(episode_a).__name__}") + if not isinstance(episode_b, Episode): + raise TypeError(f"episode_b must be an Episode, got {type(episode_b).__name__}") + + overlap_start = max(episode_a.start_time, episode_b.start_time) + overlap_end = min(episode_a.end_time, episode_b.end_time) + + if overlap_start >= overlap_end: + return 0.0 + + overlap_duration = (overlap_end - overlap_start).total_seconds() + + duration_a = episode_a.duration() + duration_b = episode_b.duration() + + if duration_a == 0.0 and duration_b == 0.0: + return 1.0 if episode_a.start_time == episode_b.start_time else 0.0 + + min_duration = min(duration_a, duration_b) if min(duration_a, duration_b) > 0 else max(duration_a, duration_b) + + return overlap_duration / min_duration + + +def compute_temporal_gap( + episode_a: Episode, + episode_b: Episode +) -> float: + if not isinstance(episode_a, Episode): + raise TypeError(f"episode_a must be an Episode, got {type(episode_a).__name__}") + if not isinstance(episode_b, Episode): + raise TypeError(f"episode_b must be an Episode, got {type(episode_b).__name__}") + + if episode_a.end_time <= episode_b.start_time: + gap = (episode_b.start_time - episode_a.end_time).total_seconds() + elif episode_b.end_time <= episode_a.start_time: + gap = (episode_a.start_time - episode_b.end_time).total_seconds() + else: + gap = 0.0 + 
+ return max(0.0, gap) + + +def are_episodes_adjacent( + episode_a: Episode, + episode_b: Episode, + adjacency_threshold: timedelta = timedelta(0) +) -> bool: + if not isinstance(episode_a, Episode): + raise TypeError(f"episode_a must be an Episode, got {type(episode_a).__name__}") + if not isinstance(episode_b, Episode): + raise TypeError(f"episode_b must be an Episode, got {type(episode_b).__name__}") + if not isinstance(adjacency_threshold, timedelta): + raise TypeError(f"adjacency_threshold must be a timedelta, got {type(adjacency_threshold).__name__}") + if adjacency_threshold < timedelta(0): + raise ValueError("adjacency_threshold must be non-negative") + + overlap = compute_temporal_overlap(episode_a, episode_b) + if overlap > 0.0: + return False + + gap_seconds = compute_temporal_gap(episode_a, episode_b) + threshold_seconds = adjacency_threshold.total_seconds() + + return gap_seconds <= threshold_seconds + + +def classify_episode_proximity( + episode_a: Episode, + episode_b: Episode, + adjacency_threshold: timedelta = timedelta(0) +) -> ProximityRelation: + if not isinstance(episode_a, Episode): + raise TypeError(f"episode_a must be an Episode, got {type(episode_a).__name__}") + if not isinstance(episode_b, Episode): + raise TypeError(f"episode_b must be an Episode, got {type(episode_b).__name__}") + if not isinstance(adjacency_threshold, timedelta): + raise TypeError(f"adjacency_threshold must be a timedelta, got {type(adjacency_threshold).__name__}") + if adjacency_threshold < timedelta(0): + raise ValueError("adjacency_threshold must be non-negative") + + overlap = compute_temporal_overlap(episode_a, episode_b) + if overlap > 0.0: + return ProximityRelation.OVERLAPPING + + gap_seconds = compute_temporal_gap(episode_a, episode_b) + if gap_seconds <= adjacency_threshold.total_seconds(): + return ProximityRelation.ADJACENT + + return ProximityRelation.DISJOINT diff --git a/dreamsApp/analytics/episode_segmentation.py b/dreamsApp/analytics/episode_segmentation.py new file mode 100644 index 0000000..b4b22c8 --- /dev/null +++ b/dreamsApp/analytics/episode_segmentation.py @@ -0,0 +1,41 @@ +# dreamsApp/analytics/episode_segmentation.py + +from datetime import timedelta +from typing import List + +from .emotion_timeline import EmotionTimeline +from .emotion_episode import Episode +from .emotion_segmentation import segment_timeline_by_gaps + + +__all__ = ['segment_timeline_to_episodes'] + + +def segment_timeline_to_episodes( + timeline: EmotionTimeline, + gap_threshold: timedelta +) -> List[Episode]: + if not isinstance(timeline, EmotionTimeline): + raise TypeError(f"timeline must be an EmotionTimeline, got {type(timeline).__name__}") + if not isinstance(gap_threshold, timedelta): + raise TypeError(f"gap_threshold must be a timedelta, got {type(gap_threshold).__name__}") + if gap_threshold <= timedelta(0): + raise ValueError("gap_threshold must be positive") + + if timeline.is_empty(): + return [] + + segments = segment_timeline_by_gaps(timeline, gap_threshold) + + episodes: List[Episode] = [] + + for window, segment_timeline in segments: + episode = Episode( + start_time=window.start_time, + end_time=window.end_time, + events=segment_timeline.events, + source_subject_id=timeline.subject_id + ) + episodes.append(episode) + + return episodes diff --git a/dreamsApp/analytics/temporal_narrative_graph.py b/dreamsApp/analytics/temporal_narrative_graph.py new file mode 100644 index 0000000..2274085 --- /dev/null +++ b/dreamsApp/analytics/temporal_narrative_graph.py @@ -0,0 +1,136 @@ +# 
dreamsApp/analytics/temporal_narrative_graph.py + +from dataclasses import dataclass +from datetime import timedelta +from typing import Tuple, List, Dict, Any, Optional + +from .emotion_episode import Episode +from .episode_proximity import ProximityRelation, classify_episode_proximity + + +__all__ = [ + 'NarrativeEdge', + 'TemporalNarrativeGraph', + 'build_narrative_graph', +] + + +@dataclass(frozen=True) +class NarrativeEdge: + source_index: int + target_index: int + relation: ProximityRelation + + def __post_init__(self) -> None: + if self.source_index < 0 or self.target_index < 0: + raise ValueError( + f"Indices must be non-negative: " + f"source_index={self.source_index}, target_index={self.target_index}" + ) + if self.source_index >= self.target_index: + raise ValueError( + f"source_index must be less than target_index for canonical ordering: " + f"{self.source_index} >= {self.target_index}" + ) + + def to_dict(self) -> Dict[str, Any]: + return { + 'source_index': self.source_index, + 'target_index': self.target_index, + 'relation': self.relation.value, + } + + +@dataclass(frozen=True) +class TemporalNarrativeGraph: + nodes: Tuple[Episode, ...] + edges: Tuple[NarrativeEdge, ...] + adjacency_threshold: Optional[timedelta] = None + + def __len__(self) -> int: + return len(self.nodes) + + def node_count(self) -> int: + return len(self.nodes) + + def edge_count(self) -> int: + return len(self.edges) + + def is_empty(self) -> bool: + return len(self.nodes) == 0 + + def edges_for_node(self, node_index: int) -> Tuple[NarrativeEdge, ...]: + if node_index < 0 or node_index >= len(self.nodes): + raise IndexError(f"node_index {node_index} out of bounds for graph with {len(self.nodes)} nodes") + + return tuple( + edge for edge in self.edges + if edge.source_index == node_index or edge.target_index == node_index + ) + + def edges_by_relation(self, relation: ProximityRelation) -> Tuple[NarrativeEdge, ...]: + return tuple(edge for edge in self.edges if edge.relation == relation) + + def to_dict(self) -> Dict[str, Any]: + return { + 'nodes': [node.to_dict() for node in self.nodes], + 'edges': [edge.to_dict() for edge in self.edges], + 'adjacency_threshold_seconds': ( + self.adjacency_threshold.total_seconds() + if self.adjacency_threshold is not None + else None + ), + 'node_count': self.node_count(), + 'edge_count': self.edge_count(), + } + + +def build_narrative_graph( + episodes: List[Episode], + adjacency_threshold: timedelta = timedelta(0), + include_disjoint_edges: bool = False +) -> TemporalNarrativeGraph: + if not isinstance(episodes, list): + raise TypeError(f"episodes must be a list, got {type(episodes).__name__}") + if not isinstance(adjacency_threshold, timedelta): + raise TypeError(f"adjacency_threshold must be a timedelta, got {type(adjacency_threshold).__name__}") + if adjacency_threshold < timedelta(0): + raise ValueError("adjacency_threshold must be non-negative") + + for i, episode in enumerate(episodes): + if not isinstance(episode, Episode): + raise TypeError(f"episodes[{i}] must be an Episode, got {type(episode).__name__}") + + if not episodes: + return TemporalNarrativeGraph( + nodes=(), + edges=(), + adjacency_threshold=adjacency_threshold + ) + + nodes = tuple(episodes) + + edges: List[NarrativeEdge] = [] + n = len(nodes) + + for i in range(n): + for j in range(i + 1, n): + relation = classify_episode_proximity( + nodes[i], + nodes[j], + adjacency_threshold + ) + + if relation != ProximityRelation.DISJOINT or include_disjoint_edges: + edge = NarrativeEdge( + source_index=i, 
+ target_index=j, + relation=relation + ) + edges.append(edge) + + return TemporalNarrativeGraph( + nodes=nodes, + edges=tuple(edges), + adjacency_threshold=adjacency_threshold + ) diff --git a/dreamsApp/analytics/time_aware_proximity.py b/dreamsApp/analytics/time_aware_proximity.py new file mode 100644 index 0000000..fe7ca95 --- /dev/null +++ b/dreamsApp/analytics/time_aware_proximity.py @@ -0,0 +1,161 @@ +# dreamsApp/analytics/time_aware_proximity.py + +""" +Time-aware proximity and comparison for EmotionTimeline objects. +Purely structural - no ML, statistics, or interpretation. +""" + +from datetime import datetime, timedelta +from typing import Tuple, Dict, Optional, List, Literal + +from .emotion_timeline import EmotionTimeline, EmotionEvent + + +def align_timelines_by_window( + timelines: Tuple[EmotionTimeline, ...], + window: timedelta, + anchor: Literal["start", "end", "explicit"], + anchor_time: Optional[datetime] = None +) -> Dict[int, Tuple[Optional[EmotionEvent], ...]]: + """ + Align timelines into fixed windows. At most one event per timeline per window. + Returns dict mapping window index to tuple of events (None if missing). + """ + if window <= timedelta(0): + raise ValueError("Window must be a positive timedelta") + + if anchor == "explicit" and anchor_time is None: + raise ValueError("anchor_time required when anchor='explicit'") + + if not timelines: + return {} + + all_empty = all(timeline.is_empty() for timeline in timelines) + if all_empty: + return {} + + all_timestamps: List[datetime] = [] + for timeline in timelines: + for event in timeline.events: + all_timestamps.append(event.timestamp) + + if not all_timestamps: + return {} + + global_start = min(all_timestamps) + global_end = max(all_timestamps) + + if anchor == "start": + anchor_dt = global_start + elif anchor == "end": + anchor_dt = global_end + else: + anchor_dt = anchor_time + + if anchor == "end": + num_windows_before = 0 + temp = anchor_dt + while temp > global_start: + temp -= window + num_windows_before += 1 + + window_start_base = anchor_dt - (num_windows_before * window) + else: + window_start_base = anchor_dt + + total_span = global_end - window_start_base + num_windows = max(1, int(total_span / window) + 1) + + result: Dict[int, Tuple[Optional[EmotionEvent], ...]] = {} + + for window_idx in range(num_windows): + window_start = window_start_base + (window_idx * window) + window_end = window_start + window + + events_in_window: List[Optional[EmotionEvent]] = [] + + for timeline in timelines: + selected_event: Optional[EmotionEvent] = None + + # Select earliest event in window (deterministic) + for event in timeline.events: + if window_start <= event.timestamp < window_end: + selected_event = event + break + + events_in_window.append(selected_event) + + result[window_idx] = tuple(events_in_window) + + return result + + +def temporal_distance( + a: EmotionTimeline, + b: EmotionTimeline, + window: timedelta, + anchor: Literal["start", "end", "explicit"] = "start", + anchor_time: Optional[datetime] = None +) -> float: + """ + Distance between timelines based on presence/absence in windows. + Symmetric, deterministic. Zero only for perfect alignment. 
+ """ + # Empty timelines are treated as perfectly aligned by definition + if a.is_empty() and b.is_empty(): + return 0.0 + + if a.is_empty() or b.is_empty(): + non_empty = a if not a.is_empty() else b + return float(len(non_empty.events)) + + aligned = align_timelines_by_window( + timelines=(a, b), + window=window, + anchor=anchor, + anchor_time=anchor_time + ) + + if not aligned: + return 0.0 + + mismatches = 0 + total_windows = len(aligned) + + for window_idx, events in aligned.items(): + event_a = events[0] + event_b = events[1] + + a_present = event_a is not None + b_present = event_b is not None + + if a_present != b_present: + mismatches += 1 + + if total_windows == 0: + return 0.0 + + return float(mismatches) + + +def proximity_matrix( + timelines: Tuple[EmotionTimeline, ...], + window: timedelta +) -> List[List[float]]: + """ + Pairwise distance matrix. Square, symmetric, diagonal = 0. + """ + n = len(timelines) + + if n == 0: + return [] + + matrix: List[List[float]] = [[0.0] * n for _ in range(n)] + + for i in range(n): + for j in range(i + 1, n): + dist = temporal_distance(timelines[i], timelines[j], window) + matrix[i][j] = dist + matrix[j][i] = dist + + return matrix diff --git a/dreamsApp/app/__init__.py b/dreamsApp/app/__init__.py index 8ca3035..5d989bd 100644 --- a/dreamsApp/app/__init__.py +++ b/dreamsApp/app/__init__.py @@ -47,7 +47,7 @@ def load_user(user_id): return User(user_data) return None - from app.auth import bp as auth_bp + from .auth import bp as auth_bp app.register_blueprint(auth_bp, url_prefix='/auth') from .ingestion.routes import bp as ingestion_bp diff --git a/dreamsApp/app/auth.py b/dreamsApp/app/auth.py index 0c769ae..6847290 100644 --- a/dreamsApp/app/auth.py +++ b/dreamsApp/app/auth.py @@ -1,6 +1,6 @@ from flask import Blueprint, render_template, redirect, url_for, flash, request, current_app from flask_login import login_user, logout_user, login_required -from app.models import User +from .models import User from werkzeug.security import generate_password_hash bp = Blueprint('auth', __name__) diff --git a/dreamsApp/app/builder.py b/dreamsApp/app/builder.py new file mode 100644 index 0000000..28ae119 --- /dev/null +++ b/dreamsApp/app/builder.py @@ -0,0 +1,48 @@ +# app/builder.py + +from datetime import datetime +from typing import List, Dict, Any, Optional +from ..analytics.emotion_timeline import EmotionEvent, EmotionTimeline + + +def build_emotion_timeline( + subject_id: str, + records: List[Dict[str, Any]], + timeline_metadata: Optional[Dict[str, Any]] = None +) -> EmotionTimeline: + """ + Construct an EmotionTimeline from validated records. + + Sorts records by timestamp and creates EmotionEvent objects. + No filtering, aggregation, smoothing, or inference is performed. 
+ + Args: + subject_id: Identifier for the subject (person) + records: List of dicts, each with keys: timestamp, emotion_label, + and optionally: score, source_id, metadata + timeline_metadata: Optional metadata for the timeline itself + + Returns: + EmotionTimeline: Immutable ordered collection of EmotionEvent objects + """ + # Sort records by timestamp + sorted_records = sorted(records, key=lambda r: r['timestamp']) + + # Build EmotionEvent objects + events = [] + for record in sorted_records: + event = EmotionEvent( + timestamp=record['timestamp'], + emotion_label=record['emotion_label'], + score=record.get('score'), + source_id=record.get('source_id'), + metadata=record.get('metadata') + ) + events.append(event) + + # Create and return immutable timeline + return EmotionTimeline( + subject_id=subject_id, + events=tuple(events), + metadata=timeline_metadata + ) diff --git a/dreamsApp/app/dashboard/main.py b/dreamsApp/app/dashboard/main.py index 7f5ca27..75c8148 100644 --- a/dreamsApp/app/dashboard/main.py +++ b/dreamsApp/app/dashboard/main.py @@ -6,10 +6,38 @@ import numpy as np import io import base64 -from flask_login import login_required +import threading +from flask_login import login_required, current_user from wordcloud import WordCloud -from app.utils.llms import generate +from ..utils.llms import generate from flask import jsonify +import datetime +from bson.objectid import ObjectId +from bson.errors import InvalidId + +# Security: Whitelist of valid CHIME labels +VALID_CHIME_LABELS = {'Connectedness', 'Hope', 'Identity', 'Meaning', 'Empowerment', 'None'} + +# Security: Rate limiting configuration +MAX_CORRECTIONS_PER_HOUR = 10 + +def generate_wordcloud_b64(keywords, colormap): + """Refactor: Helper to generate base64 encoded word cloud image.""" + if not keywords: + return None + wordcloud = WordCloud( + width=800, + height=400, + background_color='#121212', + colormap=colormap + ).generate(' '.join(keywords)) + + buf = io.BytesIO() + wordcloud.to_image().save(buf, 'png') + buf.seek(0) + data = base64.b64encode(buf.read()).decode('utf-8') + buf.close() + return data @bp.route('/', methods =['GET']) @login_required @@ -18,9 +46,6 @@ def main(): unique_users = mongo.distinct('user_id') return render_template('dashboard/main.html', users=unique_users) - -import matplotlib.pyplot as plt - @bp.route('/user/', methods =['GET']) @login_required def profile(target): @@ -51,37 +76,102 @@ def profile(target): df["rolling_avg"] = df["score"].rolling(window=5, min_periods=1).mean() df["ema_score"] = df["score"].ewm(span=5, adjust=False).mean() - # 📈 Create user-friendly visual - plt.figure(figsize=(12, 6)) + # Create user-friendly visual + plt.style.use('dark_background') + plt.figure(figsize=(12, 6), facecolor='#121212') + ax = plt.gca() + ax.set_facecolor('#1e1e1e') plt.plot(df["timestamp"], df["cumulative_score"], - label="Overall Emotional Journey", color="blue", marker="o", alpha=0.5) + label="Overall Emotional Journey", color="#90caf9", marker="o", alpha=0.5) plt.plot(df["timestamp"], df["rolling_avg"], - label="5-Day Emotional Smoothing", color="orange", linestyle="--", marker="x") + label="5-Day Emotional Smoothing", color="#ffcc80", linestyle="--", marker="x") plt.plot(df["timestamp"], df["ema_score"], - label="Recent Emotional Trend", color="green", linestyle="-", marker="s") + label="Recent Emotional Trend", color="#a5d6a7", linestyle="-", marker="s") - plt.axhline(0, color="gray", linestyle="--", linewidth=1) + plt.axhline(0, color="#555555", linestyle="--", linewidth=1) # 
Friendly and interpretive title and axis labels - plt.title("How This Person’s Feelings Shifted Over Time", fontsize=14) - plt.xlabel("When Posts Were Made", fontsize=12) - plt.ylabel("Mood Score (Higher = Happier)", fontsize=12) + plt.title("How This Person’s Feelings Shifted Over Time", fontsize=14, color='white', fontweight='bold') + plt.xlabel("When Posts Were Made", fontsize=12, color='#e0e0e0') + plt.ylabel("Mood Score (Higher = Happier)", fontsize=12, color='#e0e0e0') # Improve legend - plt.legend(title="What the Lines Mean", fontsize=10) - plt.grid(True) - plt.xticks(rotation=45) + plt.legend(title="What the Lines Mean", fontsize=10, facecolor='#222', edgecolor='#444') + plt.grid(color='#333333', linestyle=':', alpha=0.5) + plt.xticks(rotation=45, color='#888888') + plt.yticks(color='#888888') plt.tight_layout() # Save to base64 for embedding buf = io.BytesIO() - plt.savefig(buf, format='png') + plt.savefig(buf, format='png', facecolor='#121212') buf.seek(0) plot_data = base64.b64encode(buf.read()).decode('utf-8') buf.close() + plt.clf() # Clear timeline plot + + # --- CHIME Radar Chart --- + chime_counts = { + "Connectedness": 0, "Hope": 0, "Identity": 0, + "Meaning": 0, "Empowerment": 0 + } + + # Optimize lookup for case-insensitivity + chime_lookup = {k.lower(): k for k in chime_counts} + + for post in user_posts: + # Prioritize user correction if available + label_to_use = post.get('corrected_label') + if not label_to_use and post.get('chime_analysis'): + label_to_use = post['chime_analysis'].get('label', '') + + if label_to_use: + original_key = chime_lookup.get(label_to_use.lower()) + if original_key: + chime_counts[original_key] += 1 + + categories = list(chime_counts.keys()) + values = list(chime_counts.values()) + + # Radar chart requires closing the loop + N = len(categories) + angles = [n / float(N) * 2 * np.pi for n in range(N)] + values += values[:1] + angles += angles[:1] + + # Setup the plot with dark theme colors to match dashboard + plt.style.use('dark_background') + fig = plt.figure(figsize=(7, 7), facecolor='#121212') # Deep dark background + ax = plt.subplot(111, polar=True) + ax.set_facecolor('#1e1e1e') # Slightly lighter plot area + + # Set radial limits based on data but with a minimum for visual clarity + max_val = max(values) if any(values) else 1 + limit = max(2, max_val + 1) + ax.set_ylim(0, limit) + + # Draw axes and labels + plt.xticks(angles[:-1], categories, color='#00d4ff', size=12, fontweight='bold') + ax.tick_params(colors='#888888') # Radial scale label color + ax.grid(color='#444444', linestyle='--') + + # Plot data with vibrant blue fill and markers + ax.plot(angles, values, color='#00d4ff', linewidth=3, linestyle='solid', marker='o', markersize=8) + ax.fill(angles, values, color='#00d4ff', alpha=0.3) + + plt.title("Personal Recovery Footprint", size=18, color='white', pad=20, fontweight='bold') + + buf = io.BytesIO() + # Save with specific facecolor to ensure transparency/consistency + plt.savefig(buf, format='png', bbox_inches='tight', facecolor='#121212') + buf.seek(0) + chime_plot_data = base64.b64encode(buf.read()).decode('utf-8') + buf.close() + plt.clf() # Clean up radar plot + plt.style.use('default') # Reset style for next plots # Fetch keywords from MongoDB keywords_data = current_app.mongo['keywords'].find_one({'user_id': target_user_id}) @@ -97,27 +187,24 @@ def profile(target): else: thematics = thematics_data["data"] - # Generate word cloud for positive keywords - wordcloud_positive = WordCloud(width=800, height=400, 
background_color='white').generate(' '.join(positive_keywords)) - - # Save word cloud to buffer - buf = io.BytesIO() - wordcloud_positive.to_image().save(buf, 'png') - buf.seek(0) - wordcloud_positive_data = base64.b64encode(buf.read()).decode('utf-8') - buf.close() - - # Generate word cloud for negative keywords - wordcloud_negative = WordCloud(width=800, height=400, background_color='white').generate(' '.join(negative_keywords)) + # Generate word clouds using helper function + wordcloud_positive_data = generate_wordcloud_b64(positive_keywords, 'GnBu') + wordcloud_negative_data = generate_wordcloud_b64(negative_keywords, 'OrRd') - # Save word cloud to buffer - buf = io.BytesIO() - wordcloud_negative.to_image().save(buf, 'png') - buf.seek(0) - wordcloud_negative_data = base64.b64encode(buf.read()).decode('utf-8') - buf.close() + # Sort posts to get the latest one + # The user_posts list is already sorted by timestamp ascending. The latest post is the last one. + latest_post = user_posts[-1] if user_posts else None - return render_template('dashboard/profile.html', plot_url=plot_data, positive_wordcloud_url=wordcloud_positive_data, negative_wordcloud_url=wordcloud_negative_data, thematics=thematics,user_id=str(target_user_id)) + return render_template( + 'dashboard/profile.html', + plot_url=plot_data, + chime_plot_url=chime_plot_data, + positive_wordcloud_url=wordcloud_positive_data, + negative_wordcloud_url=wordcloud_negative_data, + thematics=thematics, + user_id=str(target_user_id), + latest_post=latest_post # Pass only the latest post for feedback + ) @bp.route('/clusters/') @login_required @@ -166,4 +253,138 @@ def thematic_refresh(user_id): return jsonify({ "success": False, "message": str(e) - }), 500 \ No newline at end of file + }), 500 + +@bp.route('/correct_chime', methods=['POST']) +@login_required +def correct_chime(): + data = request.get_json() + post_id = data.get('post_id') + corrected_label = data.get('corrected_label') + + if not all([post_id, corrected_label]): + return jsonify({'success': False, 'error': 'Missing fields'}), 400 + + # SECURITY: Validate ObjectId format + try: + post_object_id = ObjectId(post_id) + except (InvalidId, TypeError): + return jsonify({'success': False, 'error': 'Invalid post ID format'}), 400 + + # SECURITY: Validate label is in allowed set + if corrected_label not in VALID_CHIME_LABELS: + return jsonify({'success': False, 'error': 'Invalid label value'}), 400 + + mongo = current_app.mongo['posts'] + + # SECURITY: Rate limiting - max corrections per user per hour + one_hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1) + recent_corrections = mongo.count_documents({ + 'user_id': current_user.get_id(), + 'correction_timestamp': {'$gte': one_hour_ago} + }) + + if recent_corrections >= MAX_CORRECTIONS_PER_HOUR: + return jsonify({'success': False, 'error': 'Rate limit exceeded. 
Try again later.'}), 429 + + # Step 1: ALWAYS save the correction to the queue first + now = datetime.datetime.utcnow() + result = mongo.update_one( + {'_id': post_object_id, 'user_id': current_user.get_id()}, + { + '$set': { + 'corrected_label': corrected_label, # Current correction + 'is_fl_processed': False, # Added to queue + 'correction_timestamp': now + }, + '$push': { + # AUDIT TRAIL: Keep history of all corrections for auditing + 'correction_history': { + 'label': corrected_label, + 'timestamp': now, + 'user_id': current_user.get_id() + } + } + } + ) + + if result.modified_count > 0: + # Step 2: Check if we should trigger training (non-blocking) + _maybe_trigger_fl_training(current_app._get_current_object()) + return jsonify({'success': True}) + else: + return jsonify({'success': False, 'error': 'Post not found or no change'}), 404 + + +def _maybe_trigger_fl_training(app): + """ + Check queue size and trigger training if threshold is met. + Uses atomic database lock to ensure only ONE training runs at a time. + If lock is busy, the correction is already saved - it will be processed next round. + """ + FL_BATCH_SIZE = app.config.get('FL_BATCH_SIZE', 50) + LOCK_TIMEOUT_HOURS = 2 # If lock is older than this, assume it's stale + + with app.app_context(): + mongo = app.mongo + + # Quick count check + pending_count = mongo['posts'].count_documents({ + 'corrected_label': {'$exists': True}, + 'is_fl_processed': False + }) + + if pending_count < FL_BATCH_SIZE: + return # Not enough corrections yet, exit quickly + + # Try to acquire atomic lock + # Only ONE request can successfully flip is_running from False to True + lock_collection = mongo['fl_training_lock'] + + # Ensure lock document exists (first-time setup) + lock_collection.update_one( + {'_id': 'singleton'}, + {'$setOnInsert': {'is_running': False}}, + upsert=True + ) + + # SECURITY: Check for stale lock (stuck for more than LOCK_TIMEOUT_HOURS) + stale_threshold = datetime.datetime.utcnow() - datetime.timedelta(hours=LOCK_TIMEOUT_HOURS) + lock_collection.update_one( + {'_id': 'singleton', 'is_running': True, 'started_at': {'$lt': stale_threshold}}, + {'$set': {'is_running': False, 'stale_reset_at': datetime.datetime.now()}} + ) + + # Atomically try to acquire lock + lock_result = lock_collection.find_one_and_update( + {'_id': 'singleton', 'is_running': False}, + {'$set': {'is_running': True, 'started_at': datetime.datetime.now()}}, + return_document=False # Return the OLD document + ) + + if lock_result is None or lock_result.get('is_running', True): + # Lock is busy - another training is running + # Our correction is already saved in queue, it will be processed next round + return + + # We got the lock! 
Start training in background thread + def run_training_with_lock(): + # Wrap entire function in app_context since this runs in a separate thread + with app.app_context(): + try: + # Import here to avoid circular dependency (fl_worker imports create_app) + from dreamsApp.app.fl_worker import run_federated_round + run_federated_round() + except Exception as e: + # Log the error since daemon threads fail silently + import logging + logging.error(f"FL Training failed in background thread: {str(e)}", exc_info=True) + finally: + # Always release lock when done (success or failure) + mongo['fl_training_lock'].update_one( + {'_id': 'singleton'}, + {'$set': {'is_running': False, 'finished_at': datetime.datetime.now()}} + ) + + thread = threading.Thread(target=run_training_with_lock, daemon=True) + thread.start() \ No newline at end of file diff --git a/dreamsApp/app/exporters.py b/dreamsApp/app/exporters.py new file mode 100644 index 0000000..d4a23dd --- /dev/null +++ b/dreamsApp/app/exporters.py @@ -0,0 +1,89 @@ +# app/exporters.py + +""" +Export utilities for EmotionTimeline. + +Note: For basic JSON export, use timeline.to_dict() method directly. +This module provides additional specialized export formats. +""" + +from typing import Dict, List, Any +from ..analytics.emotion_timeline import EmotionTimeline + + +def timeline_to_csv_rows(timeline: EmotionTimeline) -> List[Dict[str, Any]]: + """ + Convert EmotionTimeline to a list of CSV-compatible dicts. + + Each dict represents one event row. Suitable for exporting to CSV + or pandas DataFrames for analysis. + + Args: + timeline: EmotionTimeline to export + + Returns: + List of dicts, each representing an event with flattened fields + """ + rows = [] + for event in timeline.events: + row = { + 'subject_id': timeline.subject_id, + 'timestamp': event.timestamp.isoformat(), + 'emotion_label': event.emotion_label, + 'score': event.score, + 'source_id': event.source_id, + } + rows.append(row) + + return rows + + +def timeline_events_summary(timeline: EmotionTimeline) -> Dict[str, Any]: + """ + Export timeline as a summary with event list and metadata. + + Suitable for research notebooks and dashboard visualization. + Preserves all information in a structure-friendly format. 
+ + Args: + timeline: EmotionTimeline to summarize + + Returns: + Dict with subject_id, event_count, time_span, and events + """ + if timeline.is_empty(): + return { + 'subject_id': timeline.subject_id, + 'event_count': 0, + 'time_span_seconds': None, + 'first_event': None, + 'last_event': None, + 'events': [], + 'metadata': timeline.metadata, + } + + first_timestamp = timeline.start_time() + last_timestamp = timeline.end_time() + time_span = timeline.time_span() + time_span_seconds = time_span.total_seconds() if time_span is not None else 0.0 + + events_data = [] + for i, event in enumerate(timeline.events): + events_data.append({ + 'index': i, + 'timestamp': event.timestamp.isoformat(), + 'emotion_label': event.emotion_label, + 'score': event.score, + 'source_id': event.source_id, + 'metadata': event.metadata, + }) + + return { + 'subject_id': timeline.subject_id, + 'event_count': len(timeline.events), + 'time_span_seconds': time_span_seconds, + 'first_event': first_timestamp.isoformat(), + 'last_event': last_timestamp.isoformat(), + 'events': events_data, + 'metadata': timeline.metadata, + } diff --git a/dreamsApp/app/fl_worker.py b/dreamsApp/app/fl_worker.py new file mode 100644 index 0000000..7d80404 --- /dev/null +++ b/dreamsApp/app/fl_worker.py @@ -0,0 +1,292 @@ +import torch +import shutil +import os +import datetime +from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig +from dreamsApp.app import create_app +from dreamsApp.app.utils.logger import setup_logger + +# Setup Logger +logger = setup_logger('fl_worker') + +# --- CONFIGURATION --- +BASE_MODEL_ID = "ashh007/dreams-chime-bert" +# Determine absolute paths based on app location to ensure robustness +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +# dreamsApp/app/models/production_chime_model +PRODUCTION_MODEL_DIR = os.path.join(BASE_DIR, "models", "production_chime_model") +# dreamsApp/app/models/temp_training_artifact +TEMP_MODEL_DIR = os.path.join(BASE_DIR, "models", "temp_training_artifact") + +BATCH_SIZE = 50 +LEARNING_RATE = 1e-5 # Conservative learning rate + +# "Anchor Set": 5 obvious examples that MUST remain correct (Prevent catastrophic forgetting) +ANCHOR_EXAMPLES = [ + {"text": "I feel completely safe and surrounded.", "label": "Connectedness"}, + {"text": "I see a bright future ahead.", "label": "Hope"}, + {"text": "I don't know who I am anymore.", "label": "Identity"}, + {"text": "My life has deep purpose.", "label": "Meaning"}, + {"text": "I have the power to change my situation.", "label": "Empowerment"} +] + +def validate_model(model, tokenizer, training_samples, label2id): + """ + Returns True if model passes BOTH Safety Checks and Improvement Checks. + """ + model.eval() + logger.info("Running Validation Gate...") + + # 1. 
ANCHOR CHECK (Safety) + correct_anchors = 0 + with torch.no_grad(): + for example in ANCHOR_EXAMPLES: + inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, padding=True) + outputs = model(**inputs) + pred_id = torch.argmax(outputs.logits).item() + + # Dynamic Label Check + target_str = example["label"] + target_id = label2id.get(target_str) + + # If the label exists in model config and matches prediction + if target_id is not None and pred_id == target_id: + correct_anchors += 1 + else: + # Debug print for failure + # Get the string label for the prediction + id2label = {v: k for k, v in label2id.items()} + pred_str = id2label.get(pred_id, "Unknown") + # SECURITY NOTE: Only logging hardcoded anchor examples, not user data + logger.debug(f"[Anchor Fail] Text: '{example['text'][:30]}...' Expected: {target_str}, Got: {pred_str}") + + logger.info(f"[Safety Check] Anchor Accuracy: {correct_anchors}/{len(ANCHOR_EXAMPLES)}") + if correct_anchors < 4: # Stricter check for catastrophic forgetting + logger.error("FAIL: Model has forgotten basic concepts (Catastrophic Forgetting).") + return False + + # 2. IMPROVEMENT CHECK (Did it learn?) + correct_new = 0 + total_new = len(training_samples) + with torch.no_grad(): + for text, label_idx in training_samples: + inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True) + outputs = model(**inputs) + pred_label = torch.argmax(outputs.logits).item() + if pred_label == label_idx: + correct_new += 1 + + logger.info(f"[Improvement Check] Training Set Accuracy: {correct_new}/{total_new}") + + if total_new > 0 and correct_new / total_new < 0.5: + logger.error("FAIL: Model failed to learn the new corrections.") + return False + + return True + +def run_federated_round(): + app = create_app() + with app.app_context(): + mongo = app.mongo + logger.info("FL WORKER: Waking up...") + + try: + # CLEANUP: Reset any stale 'processing' documents (older than 1 hour) + one_hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1) + stale_reset = mongo['posts'].update_many( + { + 'is_fl_processed': 'processing', + 'processing_started_at': {'$lt': one_hour_ago} + }, + {'$set': {'is_fl_processed': False}, '$unset': {'processing_started_at': ''}} + ) + if stale_reset.modified_count > 0: + logger.warning(f"Reset {stale_reset.modified_count} stale 'processing' documents.") + + # 1. Atomically CLAIM Pending Data (Prevents Race Condition) + # Step 1a: Find IDs of documents to claim + query = { + 'corrected_label': {'$exists': True}, + 'is_fl_processed': False # Only unclaimed documents + } + + BATCH_SIZE = app.config.get('FL_BATCH_SIZE', 50) + candidate_ids = [doc['_id'] for doc in mongo['posts'].find(query, {'_id': 1}).limit(BATCH_SIZE)] + + if len(candidate_ids) < BATCH_SIZE: + logger.info(f"Only {len(candidate_ids)} corrections available. Waiting for {BATCH_SIZE}.") + return + + # Step 1b: Atomically claim these documents by setting status to 'processing' + claim_result = mongo['posts'].update_many( + {'_id': {'$in': candidate_ids}, 'is_fl_processed': False}, # Re-check status! + {'$set': { + 'is_fl_processed': 'processing', + 'processing_started_at': datetime.datetime.now() + }} + ) + + if claim_result.modified_count < BATCH_SIZE: + logger.warning(f"Race condition detected: Only claimed {claim_result.modified_count}/{BATCH_SIZE} documents. Another worker may be running. 
Aborting.") + # Release any documents we did claim back to 'False' + mongo['posts'].update_many( + {'_id': {'$in': candidate_ids}, 'is_fl_processed': 'processing'}, + {'$set': {'is_fl_processed': False}, '$unset': {'processing_started_at': ''}} + ) + return + + # Step 1c: Now fetch the full documents we successfully claimed + pending_posts = list(mongo['posts'].find({'_id': {'$in': candidate_ids}, 'is_fl_processed': 'processing'})) + logger.info(f"Successfully claimed {len(pending_posts)} documents for training.") + + # Prepare Data + # We need to fetch the configuration to know the label map + try: + config = AutoConfig.from_pretrained(BASE_MODEL_ID) + label2id = config.label2id + except Exception as e: + # Fallback if config fetch fails + logger.warning(f"Could not load config from HuggingFace: {e}. Using fallback label map.") + label2id = {"Connectedness": 0, "Hope": 1, "Identity": 2, "Meaning": 3, "Empowerment": 4} + + training_data = [] # List of (text, label_idx) + valid_ids = [] + + # SECURITY: Do not log the caption/text content to avoid exposing user data + for p in pending_posts: + lbl = p.get('corrected_label') + if lbl in label2id: + training_data.append((p.get('caption'), label2id[lbl])) + valid_ids.append(p['_id']) + elif lbl == 'None': + # Mark 'None' as processed but don't train + mongo['posts'].update_one({'_id': p['_id']}, {'$set': {'is_fl_processed': True, 'fl_status': 'skipped'}}) + # Log only the document ID, not the content + logger.debug(f"Skipped 'None' label for post {p['_id']}") + + if not training_data: + logger.info("No valid labels found (mostly 'None'). Marking processed and exiting.") + return + + # SECURITY: Only log counts/statistics, never actual user text + logger.info(f"Starting Training Round with {len(training_data)} samples.") + + # 2. Load Model (CONTINUOUS LEARNING) + if os.path.exists(PRODUCTION_MODEL_DIR): + logger.info(f"Loading existing Production Model from {PRODUCTION_MODEL_DIR}...") + load_path = PRODUCTION_MODEL_DIR + else: + logger.info("First run: Loading Base Model from Hugging Face...") + load_path = BASE_MODEL_ID + + tokenizer = AutoTokenizer.from_pretrained(load_path) + model = AutoModelForSequenceClassification.from_pretrained(load_path, num_labels=len(label2id)) + + # Freeze BERT Base, Train Head + base_model_prefix = model.base_model_prefix + if hasattr(model, base_model_prefix): + base_model = getattr(model, base_model_prefix) + for param in base_model.parameters(): + param.requires_grad = False + else: + logger.warning(f"Could not find base model with prefix '{base_model_prefix}'. Training all layers, which may be unintended.") + + logger.debug("Base layers frozen. Training classifier head only.") + + # 3. Training Loop + model.train() + optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=LEARNING_RATE) + + texts = [item[0] for item in training_data] + labels_tensor = torch.tensor([item[1] for item in training_data]) + inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") + + EPOCHS = 3 + for epoch in range(EPOCHS): + optimizer.zero_grad() + outputs = model(**inputs, labels=labels_tensor) + loss = outputs.loss + loss.backward() + optimizer.step() + logger.info(f"[Epoch {epoch+1}/{EPOCHS}] Loss: {loss.item():.4f}") + + # 4. Save to TEMP + if os.path.exists(TEMP_MODEL_DIR): + shutil.rmtree(TEMP_MODEL_DIR) # Clean start + model.save_pretrained(TEMP_MODEL_DIR) + tokenizer.save_pretrained(TEMP_MODEL_DIR) + logger.debug(f"Model saved to temp directory: {TEMP_MODEL_DIR}") + + # 5. 
Validation Gate + passed = validate_model(model, tokenizer, training_data, label2id) + + if passed: + logger.info("Update Accepted! Promoting to Production...") + # ATOMIC SWAP using os.rename (instant on same filesystem) + backup_dir = PRODUCTION_MODEL_DIR + "_backup" + + # Ensure parent directory exists + os.makedirs(os.path.dirname(PRODUCTION_MODEL_DIR), exist_ok=True) + + try: + # Step 1: Move current production to backup (atomic) + if os.path.exists(PRODUCTION_MODEL_DIR): + if os.path.exists(backup_dir): + shutil.rmtree(backup_dir) # Clear old backup + os.rename(PRODUCTION_MODEL_DIR, backup_dir) + + # Step 2: Move temp to production (atomic) + os.rename(TEMP_MODEL_DIR, PRODUCTION_MODEL_DIR) + + # Step 3: Remove backup (safe, production already updated) + if os.path.exists(backup_dir): + shutil.rmtree(backup_dir) + + logger.info(f"SUCCESS: Central Model updated at {PRODUCTION_MODEL_DIR}") + except OSError as e: + # Rollback: restore backup if swap failed + logger.error(f"Atomic swap failed: {e}") + if os.path.exists(backup_dir) and not os.path.exists(PRODUCTION_MODEL_DIR): + os.rename(backup_dir, PRODUCTION_MODEL_DIR) + logger.info("Restored previous production model from backup.") + raise + else: + logger.warning("Update Rejected. Discarding changes.") + + # Cleanup Temp + if os.path.exists(TEMP_MODEL_DIR): + shutil.rmtree(TEMP_MODEL_DIR) + + # 6. Finish + logger.info("Updating database records...") + mongo['posts'].update_many( + {'_id': {'$in': valid_ids}}, + { + '$set': { + 'is_fl_processed': True, # Mark as fully processed (was 'processing') + 'fl_round_date': datetime.datetime.now() + }, + '$unset': {'processing_started_at': ''} # Clean up temp field + } + ) + logger.info(f"Round Successfully Completed. Processed {len(valid_ids)} items.") + + except Exception as e: + logger.error(f"CRITICAL FAILURE during FL round: {str(e)}", exc_info=True) + # Cleanup temp if it exists after a failure + if os.path.exists(TEMP_MODEL_DIR): + shutil.rmtree(TEMP_MODEL_DIR) + # Release any documents we claimed back to the queue + try: + mongo['posts'].update_many( + {'is_fl_processed': 'processing'}, + {'$set': {'is_fl_processed': False}, '$unset': {'processing_started_at': ''}} + ) + logger.info("Released claimed documents back to queue after failure.") + except Exception as release_error: + logger.warning(f"Failed to release claimed documents back to queue: {release_error}") + + +# Allow running as standalone script for manual testing +if __name__ == "__main__": + run_federated_round() diff --git a/dreamsApp/app/ingestion/routes.py b/dreamsApp/app/ingestion/routes.py index 2eb0ed5..630a44f 100644 --- a/dreamsApp/app/ingestion/routes.py +++ b/dreamsApp/app/ingestion/routes.py @@ -6,9 +6,10 @@ from . 
import bp -from app.utils.sentiment import get_image_caption_and_sentiment -from app.utils.keywords import extract_keywords_and_vectors -from app.utils.clustering import cluster_keywords_for_all_users +from ..utils.sentiment import get_image_caption_and_sentiment, get_chime_category, select_text_for_analysis +from ..utils.keywords import extract_keywords_and_vectors +from ..utils.clustering import cluster_keywords_for_all_users +from ..utils.location_extractor import extract_gps_from_image from sentence_transformers import SentenceTransformer model = SentenceTransformer("all-MiniLM-L6-V2") @@ -28,10 +29,19 @@ def upload_post(): upload_path = current_app.config['UPLOAD_FOLDER'] image_path = os.path.join(upload_path, filename) image.save(image_path) + + # Extract GPS from EXIF if available + gps_data = extract_gps_from_image(image_path) + result = get_image_caption_and_sentiment(image_path, caption) sentiment = result["sentiment"] generated_caption = result["imgcaption"] + + # Refactor: Use shared selection logic to determine which text to analyze for recovery + text_for_analysis = select_text_for_analysis(caption, generated_caption) + chime_result = get_chime_category(text_for_analysis) + # keyword generation from the caption # Extract keyword + vector pairs @@ -73,6 +83,8 @@ def upload_post(): 'image_path': image_path, 'generated_caption': generated_caption, 'sentiment' : sentiment, + 'chime_analysis': chime_result, + 'location': gps_data, } mongo = current_app.mongo diff --git a/dreamsApp/app/templates/dashboard/profile.html b/dreamsApp/app/templates/dashboard/profile.html index 1e0ad72..2f30fcf 100644 --- a/dreamsApp/app/templates/dashboard/profile.html +++ b/dreamsApp/app/templates/dashboard/profile.html @@ -48,9 +48,8 @@ .img-section img { border-radius: 0.75rem; box-shadow: 0 0.5rem 1rem rgba(255, 255, 255, 0.05); - max-height: 350px; + max-height: 500px; object-fit: contain; - background-color: #222; } .btn-outline-primary { @@ -93,6 +92,24 @@

  "How Has the Mood Changed Over Time?" (existing card heading, context line)
+ [The template markup for this hunk did not survive extraction. Recoverable content: the hunk inserts a "Recovery Dimensions (CHIME)" card after the mood-over-time chart. The card carries the caption "This chart visualizes alignment with the five key processes of personal recovery: Connectedness, Hope, Identity, Meaning, and Empowerment.", the CHIME radar chart image (alt text "CHIME Radar Chart", presumably rendered from the new chime_plot_url value passed by the profile view), and a five-item legend: Connectedness, Hope, Identity, Meaning, Empowerment.]
@@ -156,10 +173,109 @@

  "Challenging Themes" (existing card heading, context line)
+ [The template markup for this hunk did not survive extraction either. Recoverable content: the hunk adds a "Latest Entry Analysis" card rendered only {% if latest_post %}. It shows the entry timestamp via {{ latest_post.timestamp.strftime('%Y-%m-%d %H:%M') }}, a "Sentiment: {{ latest_post.sentiment.label }}" badge, and the quoted caption "{{ latest_post.caption }}". An "AI Classification:" line displays {% set current_label = latest_post.corrected_label if latest_post.corrected_label else latest_post.chime_analysis.label %} and adds a "Verified ✓" marker {% if latest_post.corrected_label %}. A correction control block (markup not recoverable) is shown {% if not latest_post.corrected_label %}, presumably wired to the new /correct_chime dashboard route.]
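
The new analytics layer is meant to be usable as plain Python, independent of the Flask app. The sketch below is a minimal, hypothetical usage example and is not part of the diff: it assumes dreamsApp and dreamsApp.analytics are importable packages on the path, and the subject id, timestamps, and labels are invented for illustration.

```python
from datetime import datetime, timedelta

from dreamsApp.analytics.emotion_timeline import EmotionEvent, EmotionTimeline
from dreamsApp.analytics.emotion_segmentation import (
    segment_timeline_by_gaps,
    segment_timeline_fixed_windows,
)

# from_events sorts the (possibly unsorted) events before freezing the timeline.
timeline = EmotionTimeline.from_events(
    subject_id="demo-user",  # invented identifier
    events=[
        EmotionEvent(datetime(2024, 1, 1, 9, 0), "positive", score=0.8, source_id="caption"),
        EmotionEvent(datetime(2024, 1, 1, 9, 45), "neutral", score=0.1),
        EmotionEvent(datetime(2024, 1, 2, 18, 5), "negative", score=-0.6),
    ],
)

# Fixed one-hour windows anchored at the first event; empty windows are kept.
for window, segment in segment_timeline_fixed_windows(timeline, timedelta(hours=1)):
    print(window, len(segment))

# Gap-based splitting: a new segment starts wherever consecutive events are
# at least 6 hours apart.
sessions = segment_timeline_by_gaps(timeline, gap_threshold=timedelta(hours=6))
print(f"{len(sessions)} session segment(s)")
```

Because EmotionEvent and EmotionTimeline are frozen dataclasses, every segmentation call returns new objects and the source timeline is never mutated.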
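The episode layer composes with the same structures. A sketch under the same assumptions follows; note that the Episode dataclass lives in dreamsApp/analytics/emotion_episode.py, which the new modules import but which is not shown in these hunks, so its fields are taken on trust from the call sites above.

```python
from datetime import datetime, timedelta

from dreamsApp.analytics.emotion_timeline import EmotionEvent, EmotionTimeline
from dreamsApp.analytics.episode_segmentation import segment_timeline_to_episodes
from dreamsApp.analytics.episode_proximity import ProximityRelation, classify_episode_proximity
from dreamsApp.analytics.temporal_narrative_graph import build_narrative_graph

timeline = EmotionTimeline.from_events(
    subject_id="demo-user",  # invented identifier
    events=[
        EmotionEvent(datetime(2024, 1, 1, 9, 0), "positive"),
        EmotionEvent(datetime(2024, 1, 1, 9, 30), "neutral"),
        EmotionEvent(datetime(2024, 1, 3, 20, 0), "negative"),
    ],
)

# Runs of events separated by 12 hours or more become separate Episodes.
episodes = segment_timeline_to_episodes(timeline, gap_threshold=timedelta(hours=12))

# Pairwise relation between two episodes: OVERLAPPING, ADJACENT, or DISJOINT.
if len(episodes) >= 2:
    relation = classify_episode_proximity(
        episodes[0], episodes[1], adjacency_threshold=timedelta(days=3)
    )
    print(relation)

# Graph over all episodes; disjoint pairs are dropped unless include_disjoint_edges=True.
graph = build_narrative_graph(episodes, adjacency_threshold=timedelta(days=3))
print(graph.node_count(), graph.edge_count())
print(graph.edges_by_relation(ProximityRelation.ADJACENT))
```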
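Finally, a sketch of the alignment and distance helpers under the same assumptions. temporal_distance counts per-window presence/absence mismatches only, so a value of 0.0 means the two timelines post in the same windows, not that they express the same emotions.

```python
from datetime import datetime, timedelta

from dreamsApp.analytics.emotion_timeline import EmotionEvent, EmotionTimeline
from dreamsApp.analytics.emotion_segmentation import (
    align_timelines_to_windows,
    segment_timeline_fixed_windows,
)
from dreamsApp.analytics.time_aware_proximity import proximity_matrix, temporal_distance

# Two tiny timelines with invented subjects and timestamps.
a = EmotionTimeline.from_events("subject-a", [
    EmotionEvent(datetime(2024, 1, 1, 9, 0), "positive"),
    EmotionEvent(datetime(2024, 1, 1, 11, 0), "negative"),
])
b = EmotionTimeline.from_events("subject-b", [
    EmotionEvent(datetime(2024, 1, 1, 9, 20), "neutral"),
])

# Shared windows derived from one timeline, then both timelines sliced against them.
windows = [window for window, _ in segment_timeline_fixed_windows(a, timedelta(hours=1))]
aligned = align_timelines_to_windows([a, b], windows)  # {window_index: [segment_of_a, segment_of_b]}
print(len(aligned), "aligned windows")

# Presence/absence distance over one-hour windows, plus the full pairwise matrix.
print(temporal_distance(a, b, window=timedelta(hours=1), anchor="start"))
print(proximity_matrix((a, b), window=timedelta(hours=1)))
```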