From d752b8fbf3caacb408259909d66ded4f6ea7204a Mon Sep 17 00:00:00 2001 From: KrishanYadav333 Date: Fri, 6 Feb 2026 13:57:22 +0530 Subject: [PATCH 1/3] Pre-GSoC: Multi-dimensional location proximity framework with documentation, code stubs, and tests This PR establishes the foundation for multi-dimensional location-proximity analysis in DREAMS, building upon existing EXIF extraction (PR #77) and emotion proximity (PR #70). ## Documentation (9 new/updated files) - docs/api_design.md: REST API specification for location-proximity endpoints - docs/evaluation_metrics.md: Quantitative metrics and ablation study plan - docs/exif_extraction_research.md: Library comparison research (informed PR #77) - docs/integration_guide.md: Step-by-step integration instructions - docs/project_roadmap.md: GSoC 2026 timeline aligned with official dates (350h) - docs/risk_analysis.md: Risk matrix and mitigation strategies - docs/TEST_PLAN.md: Extended with 50+ location-proximity test cases - plans/pre_gsoc_contribution_plan.md: 7-week, 18-PR contribution roadmap - dreamsApp/docs/data-model.md: Added location_analysis and emotion_location_entries collections ## Code Implementation - dreamsApp/exif_extractor.py: NEW - Complete EXIF extraction with dual-library fallback - dreamsApp/location_proximity.py: Updated stubs with EXIFExtractor integration - ARCHITECTURE.md: Updated diagram to show integration with PR #77 and #70 - LOCATION_PROXIMITY_SUMMARY.md: Added acknowledgment of existing work ## Tests - tests/test_exif_extraction.py: NEW - Unit tests for EXIF extractor with mocking ## Code Quality - Removed emojis from entire project (8 files) for professional documentation - data_integrity/reporter.py: Replaced emoji indicators with text - dream-integration/app/templates/index.html: Replaced emoji UI elements - dreamsApp/app/dashboard/main.py: Removed emoji comments ## Integration Points - Builds upon PR #77 (kunal-595): Uses existing EXIFExtractor class - Complements PR #70 (AnvayKharb): 
Adds spatial proximity to time-aware emotion analysis - Aligns with PR #79 (anish1206): Emotion-location work supports CHIME framework ## Key Features - Multi-dimensional proximity: geographic + categorical + linguistic + cultural - Emotion-location hotspot detection - Semantic clustering with DBSCAN - MongoDB schema extensions for location data - Performance benchmarks and evaluation framework Total: 10 new files, 9 updated files, 350 hours planned for GSoC 2026 implementation --- .gitignore | 1 + ARCHITECTURE.md | 8 +- LOCATION_PROXIMITY_SUMMARY.md | 11 +- data_integrity/reporter.py | 4 +- docs/TEST_PLAN.md | 246 ++++++++++ docs/api_design.md | 447 ++++++++++++++++++ docs/evaluation_metrics.md | 380 ++++++++++++++++ docs/exif_extraction_research.md | 205 +++++++++ docs/integration_guide.md | 498 +++++++++++++++++++++ docs/project_roadmap.md | 463 +++++++++++++++++++ docs/risk_analysis.md | 404 +++++++++++++++++ dream-integration/app/templates/index.html | 8 +- dreamsApp/app/dashboard/main.py | 2 +- dreamsApp/docs/data-model.md | 109 +++++ dreamsApp/exif_extractor.py | 172 +++++++ dreamsApp/location_proximity.py | 11 +- location_proximity/README.md | 9 +- plans/pre_gsoc_contribution_plan.md | 197 ++++++++ tests/test_exif_extraction.py | 70 +++ 19 files changed, 3227 insertions(+), 18 deletions(-) create mode 100644 docs/api_design.md create mode 100644 docs/evaluation_metrics.md create mode 100644 docs/exif_extraction_research.md create mode 100644 docs/integration_guide.md create mode 100644 docs/project_roadmap.md create mode 100644 docs/risk_analysis.md create mode 100644 dreamsApp/exif_extractor.py create mode 100644 plans/pre_gsoc_contribution_plan.md create mode 100644 tests/test_exif_extraction.py diff --git a/.gitignore b/.gitignore index f14c4d8..2ba88cd 100644 --- a/.gitignore +++ b/.gitignore @@ -38,6 +38,7 @@ MANIFEST # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other 
infos into it. *.manifest +docs/PR_SUMMARY.md *.spec # Installer logs diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index f1382aa..18df4f5 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -40,14 +40,14 @@ Dreams │ ▼ ┌────────────────────────────────────────────────────────────────┐ -│ LOCATION-PROXIMITY MODULE │ -│ (Your Contribution) │ +│ LOCATION-PROXIMITY MODULE (GSoC 2026) │ +│ Building upon PR #77 (EXIF) & PR #70 (Emotion) │ │ │ │ ┌──────────────────────────────────────────────────────────┐ │ -│ │ 1. Location Extractor │ │ +│ │ 1. Location Extractor (uses existing EXIFExtractor) │ │ │ │ Input: Image file │ │ │ │ Output: {lat, lon, timestamp} │ │ -│ │ Tech: Pillow EXIF parsing │ │ +│ │ Tech: Pillow EXIF parsing (from PR #77) │ │ │ └──────────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ diff --git a/LOCATION_PROXIMITY_SUMMARY.md b/LOCATION_PROXIMITY_SUMMARY.md index 0c387f6..843840b 100644 --- a/LOCATION_PROXIMITY_SUMMARY.md +++ b/LOCATION_PROXIMITY_SUMMARY.md @@ -6,6 +6,11 @@ A new module for DREAMS that analyzes **multi-dimensional location proximity** t **Key Innovation**: Goes beyond GPS coordinates to consider categorical, linguistic, and cultural dimensions of location similarity. +**Building Upon Existing Work**: +- **PR #77** (kunal-595): EXIF GPS extraction - we use `dreamsApp/exif_extractor.py` for location data +- **PR #70** (AnvayKharb): Time-aware emotion proximity - we integrate with `analytics/emotion_proximity.py` +- **Our Contribution**: Multi-dimensional spatial proximity analysis (geographic + categorical + linguistic + cultural) + --- ## Module Location @@ -41,8 +46,8 @@ DEMO 1: Multi-Dimensional Proximity Calculation St. 
Mary's Church ↔ Holy Trinity Church : 0.850 Alaska Native Medical Center ↔ Providence Hospital : 0.725 -✓ Notice: Two churches have high proximity despite different locations -✓ Notice: Two hospitals cluster together semantically +Notice: Two churches have high proximity despite different locations +Notice: Two hospitals cluster together semantically ``` --- @@ -255,6 +260,6 @@ Same as DREAMS project (see [LICENSE](LICENSE)) --- -**Status**: ✅ Ready for integration and testing +**Status**: Ready for integration and testing **Version**: 0.1.0 **Last Updated**: 2024 diff --git a/data_integrity/reporter.py b/data_integrity/reporter.py index 76cf559..d523ade 100644 --- a/data_integrity/reporter.py +++ b/data_integrity/reporter.py @@ -78,7 +78,7 @@ def to_dict(self) -> dict: def format_summary(self) -> str: """Generate human-readable summary.""" if not self.issues: - return "✓ All validation checks passed." + return "All validation checks passed." counts = self.count_by_severity() lines = [ @@ -88,7 +88,7 @@ def format_summary(self) -> str: ] for issue in self.issues: - icon = "✗" if issue.severity == Severity.ERROR else "⚠" if issue.severity == Severity.WARNING else "ℹ" + icon = "X" if issue.severity == Severity.ERROR else "!" 
if issue.severity == Severity.WARNING else "i" location_str = f" [{issue.location}]" if issue.location else "" lines.append(f"\n{icon} {issue.severity.value} ({issue.category}){location_str}") lines.append(f" {issue.message}") diff --git a/docs/TEST_PLAN.md b/docs/TEST_PLAN.md index 5a3a43c..7eb50a0 100644 --- a/docs/TEST_PLAN.md +++ b/docs/TEST_PLAN.md @@ -249,7 +249,253 @@ Comprehensive testing strategy for the location-proximity analysis module, cover **Input**: Antipodal points (opposite sides of Earth) **Expected Output**: Geographic proximity ≈ 0.0 **Priority**: Low +#### Test Case: PC-EC-003 +**Description**: Missing dimensions (no cultural tags) +**Input**: Places without cultural_tags field +**Expected Output**: Cultural similarity defaults to 0.0, weights redistributed +**Priority**: Medium + +#### Test Case: PC-EC-004 +**Description**: Zero weight dimension +**Input**: Composite proximity with one dimension weight = 0 +**Expected Output**: Excluded dimension ignored, other weights sum to 1.0 +**Priority**: Medium + +--- + +## Clustering Test Cases + +### Unit Tests - DBSCAN Clustering + +#### Test Case: CL-UT-001 +**Description**: Cluster homogeneous place types +**Input**: 9 locations (3 parks, 3 hospitals, 3 churches) from `tests/data/locations.json` +**Expected Output**: 3 clusters, each containing same place type +**Validation**: +- Cluster 0: [park_001, park_002, park_003] +- Cluster 1: [hospital_001, hospital_002, hospital_003] +- Cluster 2: [church_001, church_002, church_003] +**Priority**: Critical + +#### Test Case: CL-UT-002 +**Description**: DBSCAN parameter sensitivity +**Input**: Same 9 locations with varying eps (0.2, 0.4, 0.6) +**Expected Output**: +- eps=0.2: More clusters (over-segmentation) +- eps=0.4: 3 clean clusters (optimal) +- eps=0.6: Fewer clusters (under-segmentation) +**Priority**: High + +#### Test Case: CL-UT-003 +**Description**: Noise point detection +**Input**: 9 locations + 2 outliers with unique attributes +**Expected 
Output**: Outliers labeled as noise (cluster_id = -1) +**Priority**: Medium + +#### Test Case: CL-UT-004 +**Description**: Minimum cluster size enforcement +**Input**: min_samples=3, locations with 2 similar + 1 outlier +**Expected Output**: Group of 2 not forming cluster (below threshold) +**Priority**: Medium + +### Integration Tests - Clustering with Emotions + +#### Test Case: CL-IT-001 +**Description**: Cluster emotion profile aggregation +**Input**: +- 9 locations clustered into 3 groups +- Sentiment data from `tests/data/sentiments.json` +**Expected Output**: +- Church cluster: 80%+ positive emotions +- Hospital cluster: 60%+ negative emotions +- Park cluster: 70%+ positive emotions +**Priority**: Critical + +#### Test Case: CL-IT-002 +**Description**: Temporal emotion evolution within cluster +**Input**: Cluster with visits across 2 months +**Expected Output**: Timeline showing emotion trend over time +**Priority**: Medium + +### Quality Metrics Tests + +#### Test Case: CL-QM-001 +**Description**: Silhouette score calculation +**Input**: Clustered locations with proximity matrix +**Expected Output**: Silhouette score > 0.5 (good separation) +**Priority**: High + +#### Test Case: CL-QM-002 +**Description**: Davies-Bouldin index +**Input**: Clustered locations +**Expected Output**: DB index < 1.0 (tight, well-separated clusters) +**Priority**: Medium + +#### Test Case: CL-QM-003 +**Description**: Clustering purity +**Input**: Predicted clusters vs. 
ground truth (place types) +**Expected Output**: Purity > 0.80 (accurate grouping) +**Priority**: High + +--- + +## Emotion-Location Pattern Detection + +### Hotspot Detection Tests + +#### Test Case: HS-UT-001 +**Description**: Positive emotional hotspot identification +**Input**: Location with 5 visits, 4 positive (80%), 1 neutral +**Expected Output**: Identified as positive hotspot (confidence=0.80) +**Min Visits**: 3 +**Min Confidence**: 0.60 +**Priority**: Critical + +#### Test Case: HS-UT-002 +**Description**: Negative emotional hotspot identification +**Input**: Hospital with 6 visits, 5 negative (83%), 1 neutral +**Expected Output**: Identified as negative hotspot (confidence=0.83) +**Priority**: Critical + +#### Test Case: HS-UT-003 +**Description**: Insufficient visits - no hotspot +**Input**: Location with 2 visits (below min_visits=3) +**Expected Output**: Not classified as hotspot +**Priority**: Medium + +#### Test Case: HS-UT-004 +**Description**: Mixed emotions - no dominant sentiment +**Input**: Location with balanced emotions (33% each) +**Expected Output**: No hotspot (confidence < 0.60 threshold) +**Priority**: Medium + +### Place-Type Emotion Comparison + +#### Test Case: PT-UT-001 +**Description**: Aggregate emotions by place type +**Input**: All church visits from `tests/data/sentiments.json` +**Expected Output**: +- Mean positive score: 0.82 +- Dominant sentiment: positive (>75%) +**Priority**: High + +#### Test Case: PT-UT-002 +**Description**: Statistical significance test +**Input**: Church emotions vs. 
Hospital emotions +**Expected Output**: t-test p-value < 0.05 (significantly different) +**Priority**: Medium + +### Temporal Emotion Trends + +#### Test Case: TE-UT-001 +**Description**: Weekly emotion aggregation +**Input**: Location with 8 visits across 4 weeks +**Expected Output**: +- Week 1-4 emotion distribution per week +- Trend direction (improving/declining/stable) +**Priority**: Medium + +#### Test Case: TE-UT-002 +**Description**: Seasonal pattern detection +**Input**: Year-long visit history at location +**Expected Output**: Identify seasonal variations (e.g., positive in summer) +**Priority**: Low (future enhancement) + +--- + +## End-to-End Integration Tests + +### Test Case: E2E-001 +**Description**: Complete photo upload to dashboard pipeline +**Steps**: +1. Upload photo with GPS EXIF data +2. Extract location and sentiment +3. Store in MongoDB +4. Compute proximity to existing locations +5. Update location_analysis collection +6. Trigger clustering if threshold met +7. Display on dashboard + +**Expected Results**: +- Photo processed < 3 seconds +- Location extracted correctly +- Proximity scores computed for nearby locations +- Dashboard shows updated analysis within 5 seconds + +**Priority**: Critical + +### Test Case: E2E-002 +**Description**: No GPS fallback to manual location +**Steps**: +1. Upload photo without GPS data +2. System prompts for manual location +3. User provides coordinates +4. Pipeline continues normally + +**Expected Results**: +- Graceful handling of missing GPS +- Manual location stored with accuracy='manual' +- All analysis proceeds as normal + +**Priority**: High + +### Test Case: E2E-003 +**Description**: Real-time dashboard updates +**Steps**: +1. User has existing location analysis dashboard open +2. Upload new photo at new location +3. 
Dashboard refreshes automatically or shows update notification +**Expected Results**: +- New location appears on map +- Cluster assignments updated if applicable +- Hotspots recalculated + +**Priority**: Medium + +--- + +## Performance & Load Testing + +### Test Case: PERF-001 +**Description**: Upload processing time benchmark +**Input**: Single photo upload with location +**Expected**: Complete processing < 3 seconds +**Measurement**: Average over 100 uploads +**Priority**: Critical + +### Test Case: PERF-002 +**Description**: Proximity calculation latency +**Input**: Compute proximity between 2 locations +**Expected**: < 100 milliseconds +**Measurement**: Average over 1000 calculations +**Priority**: High + +### Test Case: PERF-003 +**Description**: Clustering performance scaling +**Input**: Varying number of locations (10, 50, 100, 500) +**Expected**: +- 100 locations: < 2 seconds +- 500 locations: < 10 seconds +**Priority**: High + +### Test Case: PERF-004 +**Description**: Dashboard load time +**Input**: Request location analysis dashboard +**Expected**: Initial load < 1 second (excluding map tiles) +**Priority**: Medium + +### Test Case: LOAD-001 +**Description**: Concurrent upload handling +**Input**: 100 simultaneous photo uploads +**Expected**: All complete successfully, average time < 5 seconds +**Priority**: High + +### Test Case: LOAD-002 +**Description**: Database query performance under load +**Input**: 50 concurrent dashboard requests +**Expected**: All respond < 2 seconds +**Priority**: Medium #### Test Case: PC-EC-003 **Description**: Missing attribute handling **Input**: Location with missing 'type' field diff --git a/docs/api_design.md b/docs/api_design.md new file mode 100644 index 0000000..99f371c --- /dev/null +++ b/docs/api_design.md @@ -0,0 +1,447 @@ +# DREAMS API Design - Location Proximity & Emotion Analysis + +## Overview + +This document outlines the REST API design for multi-dimensional location-proximity analysis within DREAMS. 
The API builds upon: + +- **Existing EXIF extraction** (PR #77 by kunal-595): GPS coordinate extraction from image metadata +- **Existing emotion proximity** (PR #70 by AnvayKharb): Time-aware emotion timeline comparison + +Our API adds **spatial proximity endpoints** for geographic clustering, place-type similarity, and emotion-location mapping. + +--- + +## API Endpoints + +### 1. Ingestion & Analysis Endpoints + +#### POST `/api/upload` +**Description**: Upload photo with caption, extract location, analyze sentiment, and compute proximity patterns. + +**Request**: +```json +{ + "user_id": "string", + "image": "base64_encoded_image", + "caption": "string", + "timestamp": "ISO8601_datetime", + "manual_location": { // Optional fallback if no EXIF GPS + "lat": 61.2181, + "lon": -149.9003 + } +} +``` + +**Response**: +```json +{ + "post_id": "string", + "sentiment": { + "label": "positive|neutral|negative", + "score": 0.85 + }, + "location": { + "lat": 61.2181, + "lon": -149.9003, + "accuracy": "high|medium|low|none", + "place_type": "park", // Inferred or manual + "nearby_locations": [ + { + "location_id": "string", + "distance_meters": 150.5, + "proximity_score": 0.75 + } + ] + }, + "keywords": ["keyword1", "keyword2"], + "processing_time_ms": 1234 +} +``` + +**Integration Point**: `dreamsApp/app/ingestion/routes.py` + +--- + +#### GET `/api/location/proximity` +**Description**: Calculate multi-dimensional proximity between two locations. 
+ +**Query Parameters**: +- `location1_id` (string): First location ID +- `location2_id` (string): Second location ID +- `weights` (optional string): JSON object `{"geo": 0.3, "cat": 0.4, "ling": 0.15, "cult": 0.15}` + +**Response**: +```json +{ + "location1": { + "id": "park_001", + "name": "Delaney Park Strip", + "type": "park" + }, + "location2": { + "id": "park_002", + "name": "Chugach State Park", + "type": "park" + }, + "proximity_scores": { + "geographic": 0.45, + "categorical": 1.0, + "linguistic": 1.0, + "cultural": 0.67, + "composite": 0.78 + }, + "distance_meters": 8542.3 +} +``` + +--- + +#### POST `/api/location/cluster` +**Description**: Cluster user's locations using multi-dimensional proximity. + +**Request**: +```json +{ + "user_id": "string", + "method": "dbscan|kmeans", + "params": { + "eps": 0.4, + "min_samples": 2 + } +} +``` + +**Response**: +```json +{ + "clusters": [ + { + "cluster_id": 0, + "label": "Parks & Recreation", + "members": ["park_001", "park_002", "park_003"], + "centroid": {"lat": 61.19, "lon": -149.88}, + "emotion_profile": { + "positive": 0.75, + "neutral": 0.15, + "negative": 0.10 + } + } + ], + "noise_points": ["location_xyz"], + "silhouette_score": 0.68 +} +``` + +--- + +### 2. Emotion-Location Query Endpoints + +#### GET `/api/location/{location_id}/emotions` +**Description**: Get emotion profile for a specific location. + +**Response**: +```json +{ + "location_id": "church_001", + "name": "St. Mary's Catholic Church", + "total_visits": 5, + "emotion_distribution": { + "positive": 0.80, + "neutral": 0.15, + "negative": 0.05 + }, + "mean_score": 0.82, + "timeline": [ + { + "timestamp": "2024-01-21T10:00:00Z", + "sentiment": "positive", + "score": 0.88 + } + ] +} +``` + +--- + +#### GET `/api/location/hotspots` +**Description**: Find emotional hotspots for a user. 
+ +**Query Parameters**: +- `user_id` (string): User ID +- `sentiment` (string): Filter by `positive|neutral|negative` +- `min_confidence` (float): Minimum confidence threshold (default 0.6) +- `min_visits` (int): Minimum visits required (default 3) + +**Response**: +```json +{ + "hotspots": [ + { + "location_id": "church_001", + "name": "St. Mary's Catholic Church", + "sentiment": "positive", + "confidence": 0.80, + "visit_count": 5, + "coordinates": {"lat": 61.2167, "lon": -149.8944} + } + ] +} +``` + +--- + +#### GET `/api/location/place-type-comparison` +**Description**: Compare emotions across place types. + +**Query Parameters**: +- `user_id` (string): User ID + +**Response**: +```json +{ + "place_types": { + "church": { + "positive": 0.85, + "neutral": 0.10, + "negative": 0.05, + "mean_score": 0.82, + "visit_count": 8 + }, + "hospital": { + "positive": 0.15, + "neutral": 0.20, + "negative": 0.65, + "mean_score": 0.31, + "visit_count": 6 + }, + "park": { + "positive": 0.70, + "neutral": 0.20, + "negative": 0.10, + "mean_score": 0.75, + "visit_count": 10 + } + } +} +``` + +--- + +### 3. Dashboard Visualization Endpoints + +#### GET `/api/dashboard/{user_id}/location-analysis` +**Description**: Get comprehensive location analysis for dashboard. 
+ +**Response**: +```json +{ + "summary": { + "total_locations": 24, + "unique_place_types": 5, + "clusters": 4, + "hotspots": 3 + }, + "clusters": [...], // Same as cluster endpoint + "hotspots": [...], // Same as hotspots endpoint + "temporal_patterns": { + "weekly_distribution": { + "Monday": {"positive": 0.7, "neutral": 0.2, "negative": 0.1}, + "Tuesday": {...} + }, + "place_type_evolution": [ + { + "week": "2024-W01", + "church": {"positive": 0.8}, + "hospital": {"negative": 0.6} + } + ] + } +} +``` + +--- + +## Data Flow Architecture + +``` +┌─────────────────┐ +│ Photo Upload │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ EXIF Extraction │ ──► GPS Coordinates +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Sentiment │ ──► Emotion Score +│ Analysis │ +└────────┬────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ Location-Emotion Mapper │ +│ - Store location + emotion pair │ +│ - Update visit history │ +└────────┬────────────────────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ Proximity Calculation │ +│ - Find nearby locations │ +│ - Compute multi-dim scores │ +└────────┬────────────────────────┘ + │ + ▼ +┌─────────────────────────────────┐ +│ Pattern Detection │ +│ - Identify hotspots │ +│ - Cluster analysis │ +│ - Temporal trends │ +└────────┬────────────────────────┘ + │ + ▼ +┌─────────────────┐ +│ MongoDB Storage │ +└─────────────────┘ +``` + +--- + +## Error Handling + +### Standard Error Response +```json +{ + "error": { + "code": "ERROR_CODE", + "message": "Human-readable error message", + "details": { + "field": "Additional context" + } + } +} +``` + +### Error Codes +- `INVALID_IMAGE_FORMAT`: Unsupported image format +- `NO_GPS_DATA`: No GPS coordinates in EXIF or manual location +- `LOCATION_NOT_FOUND`: Location ID doesn't exist +- `INSUFFICIENT_DATA`: Not enough data for clustering/analysis +- `INVALID_COORDINATES`: GPS coordinates out of valid range +- `PROCESSING_FAILED`: General processing error + +--- + +## Rate 
Limiting + +- **Upload**: 10 requests/minute per user +- **Query endpoints**: 100 requests/minute per user +- **Dashboard**: 20 requests/minute per user + +Headers: +``` +X-RateLimit-Limit: 10 +X-RateLimit-Remaining: 7 +X-RateLimit-Reset: 1640995200 +``` + +--- + +## Authentication + +All endpoints require Bearer token authentication: +``` +Authorization: Bearer +``` + +User context is extracted from the JWT token. The `user_id` in requests must match the authenticated user (except for admin users). + +--- + +## Integration Points + +### Existing DREAMS Components + +1. **Ingestion Pipeline** (`dreamsApp/app/ingestion/routes.py`) + - Extend `POST /upload` to include location extraction + - Add location-proximity calculations after sentiment analysis + +2. **Dashboard** (`dreamsApp/app/dashboard/main.py`) + - Add new route `/location_analysis/` + - Integrate location map visualization + - Display cluster cards and hotspot markers + +3. **Data Models** (`dreamsApp/app/models.py`) + - Extend `Post` model with location fields + - Add `LocationAnalysis` model for storing clusters/hotspots + - Add `EmotionLocationEntry` for tracking location-emotion pairs + +### New Components + +1. **Location Proximity Calculator** (`location_proximity/proximity_calculator.py`) + - Called by API endpoints to compute multi-dimensional scores + +2. **Emotion-Location Mapper** (`location_proximity/emotion_location_mapper.py`) + - Manages emotion-location associations + - Provides hotspot detection and pattern analysis + +3. 
**Semantic Clusterer** (`location_proximity/semantic_clustering.py`) + - Clusters locations using DBSCAN + - Generates emotion profiles per cluster + +--- + +## Performance Considerations + +### Caching Strategy +- Cache proximity scores between location pairs (TTL: 1 hour) +- Cache cluster results per user (invalidate on new upload) +- Cache hotspot calculations (invalidate on new location-emotion pair) + +### Optimization +- Batch proximity calculations for nearby locations +- Precompute distance matrices for frequent queries +- Use spatial indexing (MongoDB geospatial queries) for radius searches + +### Expected Performance +- Upload processing: < 3 seconds (including all analysis) +- Proximity query: < 100ms +- Clustering: < 2 seconds for 100 locations +- Dashboard load: < 1 second + +--- + +## Future Enhancements + +1. **Real-time Place Enrichment** + - Google Places API integration for place type inference + - Automatic tagging of cultural/linguistic attributes + +2. **Collaborative Filtering** + - Cross-user emotion patterns at shared locations + - Privacy-preserving aggregation + +3. **Temporal Predictions** + - Predict likely emotional response at a location based on history + - Recommend emotionally beneficial locations + +4. 
**WebSocket Support** + - Real-time clustering updates + - Live emotion-location mapping during photo uploads + +--- + +## Testing Strategy + +- **Unit Tests**: Mock external dependencies (DB, ML models) +- **Integration Tests**: Test full pipeline with synthetic data +- **Load Tests**: Simulate 100 concurrent users +- **Validation**: Compare results against `tests/data/expected_results.json` + +--- + +**Document Version**: 1.0 +**Last Updated**: February 3, 2026 +**Author**: Krishan (GSoC 2026 Contributor) diff --git a/docs/evaluation_metrics.md b/docs/evaluation_metrics.md new file mode 100644 index 0000000..a49a891 --- /dev/null +++ b/docs/evaluation_metrics.md @@ -0,0 +1,380 @@ +# Evaluation Metrics & Ablation Study Plan + +## Overview + +This document defines the evaluation methodology for the DREAMS location-proximity module, including quantitative metrics, qualitative assessment criteria, and a systematic ablation study plan to validate the multi-dimensional proximity approach. + +--- + +## 1. 
Quantitative Evaluation Metrics + +### 1.1 Proximity Calculation Accuracy + +**Metric**: Mean Absolute Error (MAE) against human-annotated proximity scores + +**Method**: +- Collect human judgments for 50 location pairs +- Humans rate semantic similarity on 0-1 scale +- Compare with computed proximity scores + +**Formula**: +``` +MAE = (1/n) * Σ|human_score_i - computed_score_i| +``` + +**Success Criteria**: MAE < 0.15 + +### 1.2 Clustering Quality + +**Metrics**: +- **Silhouette Score**: Measures cluster cohesion and separation (-1 to +1) +- **Davies-Bouldin Index**: Lower is better (minimum 0) +- **Purity**: Percentage of correctly clustered items + +**Success Criteria**: +- Silhouette Score > 0.5 +- Davies-Bouldin Index < 1.0 +- Purity > 0.80 + +**Validation**: +```python +from sklearn.metrics import silhouette_score, davies_bouldin_score + +# Using synthetic data from tests/data/expected_results.json +expected_clusters = { + 0: ["church_001", "church_002", "church_003"], + 1: ["hospital_001", "hospital_002", "hospital_003"], + 2: ["park_001", "park_002", "park_003"] +} + +# Compute metrics +silhouette = silhouette_score(proximity_matrix, cluster_labels) +davies_bouldin = davies_bouldin_score(proximity_matrix, cluster_labels) +purity = compute_purity(cluster_labels, ground_truth_labels) +``` + +### 1.3 Hotspot Detection Precision & Recall + +**Metrics**: +- **Precision**: Of detected hotspots, how many are true positives? +- **Recall**: Of true hotspots, how many were detected? 
+- **F1 Score**: Harmonic mean of precision and recall + +**Ground Truth**: Manually labeled hotspots in test dataset + +**Formula**: +``` +Precision = TP / (TP + FP) +Recall = TP / (TP + FN) +F1 = 2 * (Precision * Recall) / (Precision + Recall) +``` + +**Success Criteria**: F1 Score > 0.75 + +### 1.4 Emotion Prediction Accuracy + +**Metric**: Accuracy of predicting emotion at a location based on place type + +**Method**: +- Hold out 20% of emotion-location pairs +- Predict sentiment using place-type averages +- Compare with ground truth + +**Formula**: +``` +Accuracy = (Correct Predictions) / (Total Predictions) +``` + +**Success Criteria**: Accuracy > 0.65 (better than random baseline of 0.33) + +### 1.5 Performance Benchmarks + +**Metrics**: +- Processing time per photo upload +- Proximity calculation latency +- Clustering computation time +- Memory usage + +**Success Criteria**: +- Upload processing: < 3 seconds +- Proximity query: < 100ms +- Clustering (100 locations): < 2 seconds +- Memory footprint: < 500MB + +--- + +## 2. Qualitative Evaluation + +### 2.1 User Study Design + +**Participants**: 10-15 mental health researchers and clinicians + +**Tasks**: +1. Review 5 user recovery timelines with location-emotion visualizations +2. Assess whether location clusters match their clinical intuition +3. Evaluate usefulness of hotspot identification +4. Rate interpretability of proximity scores (1-5 scale) + +**Questions**: +- "Do the location clusters make semantic sense?" +- "Are emotional hotspots clinically meaningful?" +- "Would this analysis support recovery tracking?" +- "Are proximity scores interpretable?" 
+ +**Success Criteria**: +- Mean usefulness rating > 3.5/5 +- 70%+ agreement on cluster meaningfulness + +### 2.2 Case Study Analysis + +**Method**: Detailed analysis of 3 synthetic user journeys + +**Dimensions**: +- Temporal evolution of place-emotion associations +- Identification of recovery milestones via location patterns +- Discovery of unexpected semantic proximity patterns + +**Documentation**: Rich narrative descriptions with visualizations + +--- + +## 3. Ablation Study Plan + +### 3.1 Study Overview + +**Purpose**: Determine the contribution of each proximity dimension (geographic, categorical, linguistic, cultural) to overall system performance. + +**Method**: Systematically remove each dimension and measure impact on clustering quality and emotion prediction accuracy. + +### 3.2 Experimental Conditions + +| Condition | Geographic | Categorical | Linguistic | Cultural | Description | +|-----------|------------|-------------|------------|----------|-------------| +| **Full** | Yes | Yes | Yes | Yes | All dimensions (baseline) | +| **Ablate-Geo** | No | Yes | Yes | Yes | Remove geographic distance | +| **Ablate-Cat** | Yes | No | Yes | Yes | Remove categorical similarity | +| **Ablate-Ling** | Yes | Yes | No | Yes | Remove linguistic context | +| **Ablate-Cult** | Yes | Yes | Yes | No | Remove cultural tags | +| **Geo-Only** | Yes | No | No | No | Geographic distance only | +| **Cat-Only** | No | Yes | No | No | Categorical similarity only | + +### 3.3 Evaluation for Each Condition + +**Metrics Measured**: +- Silhouette Score +- Davies-Bouldin Index +- Clustering Purity +- Emotion Prediction Accuracy +- Human Interpretability Rating (qualitative) + +**Dataset**: `tests/data/locations.json` with 17 locations across 7 types + +### 3.4 Expected Outcomes + +**Hypothesis 1**: Categorical dimension contributes most to clustering quality +- **Rationale**: Place type (church, hospital) is strongest semantic signal +- **Test**: Ablate-Cat should show largest 
performance drop + +**Hypothesis 2**: Geographic dimension alone is insufficient +- **Rationale**: Two distant churches are more similar than a church and nearby hospital +- **Test**: Geo-Only should have poor clustering purity + +**Hypothesis 3**: Multi-dimensional approach outperforms single dimensions +- **Rationale**: Combined signals capture richer semantics +- **Test**: Full model should achieve best metrics + +### 3.5 Implementation + +```python +# ablation_study.py + +import json +import numpy as np +from sklearn.metrics import silhouette_score +from location_proximity.proximity_calculator import composite_proximity + +def run_ablation_study(): + """Run systematic ablation study on proximity dimensions.""" + + # Load test data + with open('tests/data/locations.json') as f: + locations = json.load(f)['locations'] + + # Define ablation conditions + conditions = { + 'Full': {'geo': 0.3, 'cat': 0.4, 'ling': 0.15, 'cult': 0.15}, + 'Ablate-Geo': {'geo': 0.0, 'cat': 0.55, 'ling': 0.225, 'cult': 0.225}, + 'Ablate-Cat': {'geo': 0.5, 'cat': 0.0, 'ling': 0.25, 'cult': 0.25}, + 'Ablate-Ling': {'geo': 0.35, 'cat': 0.47, 'ling': 0.0, 'cult': 0.18}, + 'Ablate-Cult': {'geo': 0.35, 'cat': 0.47, 'ling': 0.18, 'cult': 0.0}, + 'Geo-Only': {'geo': 1.0, 'cat': 0.0, 'ling': 0.0, 'cult': 0.0}, + 'Cat-Only': {'geo': 0.0, 'cat': 1.0, 'ling': 0.0, 'cult': 0.0} + } + + results = {} + + for condition_name, weights in conditions.items(): + # Compute proximity matrix with current weights + proximity_matrix = compute_proximity_matrix(locations, weights) + + # Cluster using DBSCAN + from location_proximity.semantic_clustering import SemanticLocationClusterer + clusterer = SemanticLocationClusterer(eps=0.4, min_samples=2) + labels = clusterer.cluster_by_proximity(proximity_matrix) + + # Compute metrics + silhouette = silhouette_score(proximity_matrix, labels) if len(set(labels)) > 1 else 0 + purity = compute_purity(labels, ground_truth_from_place_types(locations)) + + 
results[condition_name] = { + 'silhouette': silhouette, + 'purity': purity, + 'num_clusters': len(set(labels)) - (1 if -1 in labels else 0) + } + + return results +``` + +### 3.6 Results Documentation + +Results will be documented in: +- **Quantitative Table**: Metrics for each condition +- **Visualization**: Bar charts comparing conditions +- **Statistical Analysis**: ANOVA to test significance of differences +- **Interpretation**: Narrative explanation of findings + +--- + +## 4. Validation Against Expected Results + +### 4.1 Synthetic Dataset Validation + +**File**: `tests/data/expected_results.json` + +**Tests**: +1. **Proximity Scores**: Verify computed scores fall within expected ranges +```python +# Church-Church proximity should be 0.8-1.0 +assert 0.8 <= compute_proximity(church_001, church_002) <= 1.0 + +# Church-Hospital proximity should be 0.1-0.4 +assert 0.1 <= compute_proximity(church_001, hospital_001) <= 0.4 +``` + +2. **Clustering**: Verify 3 clusters detected (parks, hospitals, churches) +```python +assert num_clusters == 3 +assert set(clusters[0]) == set(["church_001", "church_002", "church_003"]) +``` + +3. **Emotion Patterns**: Verify place-type emotion distributions +```python +church_sentiment = aggregate_by_place_type('church') +assert church_sentiment['positive'] >= 0.70 # Expected mean 0.75 +``` + +### 4.2 Test Suite + +All validation tests in `tests/test_evaluation_metrics.py`: + +```python +def test_proximity_accuracy(): + """Test proximity scores against expected ranges.""" + # Implementation + +def test_clustering_quality(): + """Test clustering meets quality thresholds.""" + # Implementation + +def test_hotspot_detection(): + """Test hotspot detection precision/recall.""" + # Implementation + +def test_emotion_prediction(): + """Test emotion prediction accuracy.""" + # Implementation + +def test_performance_benchmarks(): + """Test processing times meet requirements.""" + # Implementation +``` + +--- + +## 5. 
Baseline Comparisons + +### 5.1 Baseline Methods + +**Baseline 1: Geographic Distance Only** +- Use Haversine formula only +- No semantic considerations + +**Baseline 2: K-Means Clustering (Fixed K=3)** +- Traditional clustering without proximity matrix +- Geographic features only + +**Baseline 3: Random Emotion Prediction** +- Predict emotions randomly (33% each class) +- Lower bound on performance + +### 5.2 Comparison Metrics + +| Metric | Random | Geo-Only | K-Means | Multi-Dim (Ours) | +|--------|--------|----------|---------|------------------| +| Silhouette Score | - | TBD | TBD | **Target: > 0.5** | +| Clustering Purity | 33% | TBD | TBD | **Target: > 80%** | +| Emotion Prediction | 33% | TBD | TBD | **Target: > 65%** | +| Interpretability | Low | Medium | Low | **Target: High** | + +--- + +## 6. Continuous Monitoring + +### 6.1 Production Metrics + +Once deployed, monitor: +- Average proximity calculation time +- Clustering success rate (% of users with valid clusters) +- User engagement with location analysis dashboard +- Error rates in EXIF extraction + +### 6.2 A/B Testing + +**Test**: Multi-dimensional proximity vs. Geographic-only + +**Metrics**: +- Dashboard engagement time +- User-reported usefulness +- Clinical insights discovered + +**Duration**: 4 weeks with 50 users per group + +--- + +## 7. Timeline + +| Phase | Duration | Deliverables | +|-------|----------|-------------| +| **Metric Implementation** | Week 1 | All metrics coded and tested | +| **Ablation Study** | Week 2 | Results for all conditions | +| **User Study** | Week 3-4 | Qualitative feedback collected | +| **Baseline Comparison** | Week 2 | Comparison table completed | +| **Documentation** | Week 5 | Final evaluation report | + +--- + +## 8. 
Success Criteria Summary + +- **Proximity Accuracy**: MAE < 0.15 +- **Clustering Quality**: Silhouette > 0.5, Purity > 0.80 +- **Hotspot Detection**: F1 > 0.75 +- **Emotion Prediction**: Accuracy > 0.65 +- **Performance**: Upload < 3s, Query < 100ms +- **User Study**: Usefulness > 3.5/5 +- **Ablation Study**: Multi-dimensional > single dimensions + +--- + +**Version**: 1.0 +**Last Updated**: February 3, 2026 +**Author**: Krishan (GSoC 2026 Contributor) diff --git a/docs/exif_extraction_research.md b/docs/exif_extraction_research.md new file mode 100644 index 0000000..e908ecf --- /dev/null +++ b/docs/exif_extraction_research.md @@ -0,0 +1,205 @@ +# EXIF Extraction Research + +## Overview + +**Note**: EXIF extraction has been implemented in `dreamsApp/exif_extractor.py` by PR #77 (kunal-595). This research document provided the foundation for that implementation. + +This document compares EXIF extraction libraries for photo metadata analysis in the DREAMS project, focusing on location data, timestamps, and camera information needed for recovery journey tracking. 
+ +## Library Comparison: Pillow vs exifread + +### Pillow (PIL.ExifTags) + +**Pros:** +- Built into PIL/Pillow (already used for image processing) +- Simple API with `Image._getexif()` +- Good for basic EXIF data +- Lightweight for standard use cases + +**Cons:** +- Limited EXIF tag support +- No GPS coordinate parsing helpers +- Inconsistent handling of malformed data +- Returns numeric tag IDs requiring manual mapping + +**Code Example:** +```python +from PIL import Image +from PIL.ExifTags import TAGS, GPSTAGS + +def extract_exif_pillow(image_path): + image = Image.open(image_path) + exif = image._getexif() + if not exif: + return {} + + data = {} + for tag_id, value in exif.items(): + tag = TAGS.get(tag_id, tag_id) + if tag == 'GPSInfo': + gps_data = {} + for gps_tag_id, gps_value in value.items(): + gps_tag = GPSTAGS.get(gps_tag_id, gps_tag_id) + gps_data[gps_tag] = gps_value + data[tag] = gps_data + else: + data[tag] = value + return data +``` + +### exifread + +**Pros:** +- Comprehensive EXIF tag support +- Better handling of malformed/corrupted data +- Detailed GPS parsing +- More robust for edge cases +- Returns human-readable tag names + +**Cons:** +- Additional dependency +- Slightly more complex API +- Larger memory footprint + +**Code Example:** +```python +import exifread + +def extract_exif_exifread(image_path): + with open(image_path, 'rb') as f: + tags = exifread.process_file(f) + + data = {} + for tag, value in tags.items(): + if tag.startswith('GPS'): + data[tag] = str(value) + elif tag in ['EXIF DateTime', 'Image DateTime']: + data[tag] = str(value) + elif tag == 'Image Make': + data[tag] = str(value) + return data +``` + +## Edge Cases Identified + +### 1. Missing GPS Data +- **Issue:** Many photos lack GPS coordinates +- **Impact:** Cannot determine location for proximity analysis +- **Mitigation:** Fallback to user-provided location or skip location-based features + +### 2. 
Corrupted EXIF Headers +- **Issue:** Malformed EXIF data causes parsing failures +- **Impact:** Complete metadata loss +- **Mitigation:** Use exifread's robust parsing + try/catch blocks + +### 3. Timezone Inconsistencies +- **Issue:** EXIF timestamps don't include timezone info +- **Impact:** Incorrect temporal ordering across locations +- **Mitigation:** Use GPS coordinates to infer timezone or prompt user + +### 4. Camera-Specific Formats +- **Issue:** Different manufacturers use proprietary EXIF extensions +- **Impact:** Inconsistent metadata availability +- **Mitigation:** Normalize to common subset of tags + +### 5. Privacy-Stripped Images +- **Issue:** Social media platforms remove EXIF data +- **Impact:** No metadata available for analysis +- **Mitigation:** Detect stripped images and request manual input + +### 6. Large File Handling +- **Issue:** High-resolution images may cause memory issues +- **Impact:** Processing failures on resource-constrained systems +- **Mitigation:** Stream processing or thumbnail extraction + +## Recommended Implementation + +### Primary Choice: exifread +- Better edge case handling +- More comprehensive GPS support +- Robust parsing for corrupted data + +### Fallback Strategy +```python +def extract_metadata(image_path): + try: + return extract_exif_exifread(image_path) + except Exception: + try: + return extract_exif_pillow(image_path) + except Exception: + return {} # No metadata available +``` + +## GPS Coordinate Conversion + +Both libraries require manual GPS coordinate conversion: + +```python +def convert_gps_to_decimal(gps_coord, direction): + """Convert GPS coordinates from DMS to decimal degrees.""" + if not gps_coord: + return None + + degrees = float(gps_coord[0]) + minutes = float(gps_coord[1]) + seconds = float(gps_coord[2]) + + decimal = degrees + (minutes / 60.0) + (seconds / 3600.0) + + if direction in ['S', 'W']: + decimal = -decimal + + return decimal +``` + +## Testing Strategy + +### Test Cases Required 
+1. **Standard photos** with complete EXIF +2. **GPS-enabled photos** from different devices +3. **Corrupted EXIF** data scenarios +4. **Privacy-stripped** images +5. **Various camera manufacturers** (Canon, Nikon, iPhone, Android) +6. **Different file formats** (JPEG, TIFF, RAW) + +### Performance Benchmarks +- Processing time per image +- Memory usage with large files +- Error handling robustness + +## Integration with DREAMS + +### Metadata Schema +```python +{ + "timestamp": "2024-01-15T14:30:00", + "location": { + "lat": 61.2181, + "lon": -149.9003, + "accuracy": "high" # high/medium/low/none + }, + "camera": { + "make": "Apple", + "model": "iPhone 12", + "settings": {...} + }, + "processing": { + "exif_source": "exifread", # exifread/pillow/manual + "extraction_time": "2024-01-15T14:35:00" + } +} +``` + +### Error Handling +- Log extraction failures for debugging +- Graceful degradation when metadata unavailable +- User prompts for critical missing data (location, timestamp) + +## Next Steps + +1. Implement robust EXIF extraction module +2. Create comprehensive test suite +3. Add GPS coordinate validation +4. Integrate with photo upload pipeline +5. Add user interface for manual metadata entry \ No newline at end of file diff --git a/docs/integration_guide.md b/docs/integration_guide.md new file mode 100644 index 0000000..f4f0936 --- /dev/null +++ b/docs/integration_guide.md @@ -0,0 +1,498 @@ +# DREAMS Integration Guide - Location Proximity Module + +## Overview + +This guide provides step-by-step instructions for integrating the multi-dimensional location-proximity analysis module into the existing DREAMS platform. 
This work builds upon: + +- **PR #77** (by kunal-595): EXIF GPS extraction already implemented in `dreamsApp/exif_extractor.py` +- **PR #70** (by AnvayKharb): Time-aware emotion proximity in `dreamsApp/analytics/emotion_proximity.py` + +Our contribution adds **multi-dimensional spatial proximity** (geographic + categorical + linguistic + cultural) to complement the existing time-aware emotion analysis. + +--- + +## Prerequisites + +- DREAMS platform installed and running +- MongoDB instance configured +- Python 3.8+ environment +- Required packages: `exifread`, `scikit-learn`, `numpy` + +--- + +## Integration Architecture + +``` +┌──────────────────────────────────────────────────────┐ +│ Existing DREAMS Platform │ +│ │ +│ ┌─────────────┐ ┌──────────────┐ │ +│ │ Beehive │─────▶│ Ingestion │ │ +│ │ Frontend │ │ Pipeline │ │ +│ └─────────────┘ └──────┬───────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ Sentiment Analysis │ │ +│ │ (existing) │ │ +│ └──────────────┬───────────────────────┘ │ +│ │ │ +└─────────────────┼────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────┐ +│ NEW: Location-Proximity Module │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ │ +│ │ EXIF │─────▶│ Proximity │ │ +│ │ Extractor │ │ Calculator │ │ +│ └──────────────┘ └──────┬───────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────────┐ │ +│ │ Emotion-Location Mapper │ │ +│ └──────────────┬───────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────────────────┐ │ +│ │ Semantic Clusterer │ │ +│ └──────────────┬───────────────────┘ │ +└─────────────────┼────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────┐ +│ MongoDB Storage │ +│ - posts (extended with location) │ +│ - location_analysis │ +│ - emotion_location_entries │ +└──────────────────────────────────────────────────────┘ +``` + +--- + +## Step 1: Extend Post Ingestion Route + +**File**: 
`dreamsApp/app/ingestion/routes.py` + +### 1.1 Import Location Modules + +Add to the top of the file: +```python +from dreamsApp.exif_extractor import EXIFExtractor # From PR #77 (kunal-595) +from dreamsApp.analytics.emotion_proximity import segment_timeline_into_windows # From PR #70 (AnvayKharb) +from dreamsApp.location_proximity import extract_location, find_nearby_locations # New multi-dimensional proximity +``` + +### 1.2 Modify Upload Route + +Extend the existing `POST /upload` endpoint: + +```python +@ingestion_bp.route('/upload', methods=['POST']) +def upload_photo(): + # Existing code for image upload and sentiment analysis... + + # Use existing EXIF extractor from PR #77 + extractor = EXIFExtractor() + metadata = extractor.extract_metadata(image_path) + location_data = metadata.get('location', {}) + + # Fallback to manual location if no GPS in EXIF + if location_data.get('accuracy') == 'none' and 'manual_location' in request.json: + manual = request.json['manual_location'] + location_data = { + 'lat': manual['lat'], + 'lon': manual['lon'], + 'accuracy': 'manual' + } + + # Store post with location + post_doc = { + 'user_id': user_id, + 'caption': caption, + 'timestamp': datetime.utcnow(), + 'image_path': image_path, + 'sentiment': sentiment_result, + 'location': location_data # NEW FIELD + } + + post_id = db.posts.insert_one(post_doc).inserted_id + + # NEW: If location available, find nearby locations and update analysis + if location_data.get('lat') and location_data.get('lon'): + from location_proximity.emotion_location_mapper import EmotionLocationMapper + + mapper = EmotionLocationMapper() + + # Add emotion-location entry + mapper.add_entry( + location_id=str(post_id), # Use post_id as location_id initially + sentiment=sentiment_result['label'], + score=sentiment_result['score'], + metadata={ + 'timestamp': post_doc['timestamp'], + 'coordinates': location_data, + 'user_id': user_id + } + ) + + # Store in emotion_location_entries collection + 
db.emotion_location_entries.insert_one({
+            'user_id': user_id,
+            'location_id': str(post_id),
+            'post_id': post_id,
+            'sentiment': sentiment_result['label'],
+            'score': sentiment_result['score'],
+            'timestamp': post_doc['timestamp'],
+            'coordinates': location_data
+        })
+
+        # Find nearby locations
+        user_locations = list(db.posts.find({
+            'user_id': user_id,
+            'location.lat': {'$exists': True}
+        }))
+
+        nearby = find_nearby_locations(
+            target_location={'lat': location_data['lat'], 'lon': location_data['lon']},
+            locations=[
+                {'lat': loc['location']['lat'], 'lon': loc['location']['lon']}
+                for loc in user_locations
+            ],
+            radius_meters=1000  # 1km radius
+        )
+
+        # Update location_analysis collection
+        db.location_analysis.update_one(
+            {'user_id': user_id},
+            {
+                '$push': {
+                    'locations': {
+                        'id': str(post_id),
+                        'coordinates': location_data,
+                        'timestamp': post_doc['timestamp'],
+                        'sentiment': sentiment_result['label'],
+                        'nearby_count': len(nearby)
+                    }
+                },
+                '$set': {'updated_at': datetime.utcnow()}
+            },
+            upsert=True
+        )
+
+    return jsonify({
+        'post_id': str(post_id),
+        'sentiment': sentiment_result,
+        'location': location_data,
+        'nearby_locations': len(nearby) if location_data.get('lat') else 0
+    })
+```
+
+---
+
+## Step 2: Create Location Analysis Dashboard Route
+
+**File**: `dreamsApp/app/dashboard/main.py`
+
+### 2.1 Add New Route
+
+```python
+from flask import render_template
+from location_proximity.semantic_clustering import SemanticLocationClusterer
+from location_proximity.emotion_location_mapper import EmotionLocationMapper
+
+@dashboard_bp.route('/location_analysis/<user_id>')
+def location_analysis(user_id):
+    """Display location-emotion analysis dashboard."""
+
+    # Get user's location data
+    analysis_doc = db.location_analysis.find_one({'user_id': user_id})
+
+    if not analysis_doc:
+        return render_template('dashboard/location_analysis.html',
+                               error="No location data available")
+
+    # Get emotion-location entries
+    entries = 
list(db.emotion_location_entries.find({'user_id': user_id})) + + # Initialize mapper and load data + mapper = EmotionLocationMapper() + for entry in entries: + mapper.add_entry( + location_id=entry['location_id'], + sentiment=entry['sentiment'], + score=entry['score'], + metadata={ + 'timestamp': entry['timestamp'], + 'coordinates': entry['coordinates'] + } + ) + + # Find hotspots + positive_hotspots = mapper.find_emotional_hotspots('positive', min_visits=3) + negative_hotspots = mapper.find_emotional_hotspots('negative', min_visits=3) + + # Perform clustering if enough locations + clusters = [] + if len(analysis_doc.get('locations', [])) >= 6: + # Build proximity matrix (simplified - use actual multi-dimensional in production) + from location_proximity.proximity_calculator import compute_proximity_matrix + + locations = analysis_doc['locations'] + proximity_matrix = compute_proximity_matrix(locations) + + clusterer = SemanticLocationClusterer(eps=0.4, min_samples=2) + cluster_labels = clusterer.cluster_by_proximity(proximity_matrix) + + # Get emotion profiles for clusters + clusters = clusterer.cluster_with_emotions(proximity_matrix, entries) + + return render_template('dashboard/location_analysis.html', + user_id=user_id, + locations=analysis_doc.get('locations', []), + positive_hotspots=positive_hotspots, + negative_hotspots=negative_hotspots, + clusters=clusters, + total_locations=len(analysis_doc.get('locations', []))) +``` + +### 2.2 Create Template + +**File**: `dreamsApp/app/templates/dashboard/location_analysis.html` + +```html +{% extends "base.html" %} + +{% block content %} +
+<div class="container">
+    <h1>Location-Emotion Analysis</h1>
+
+    {% if error %}
+    <div class="alert alert-warning">{{ error }}</div>
+    {% else %}
+
+    <!-- Summary cards -->
+    <div class="row">
+        <div class="col-md-3">
+            <div class="card">
+                <div class="card-header">Total Locations</div>
+                <div class="card-body">
+                    <h2>{{ total_locations }}</h2>
+                </div>
+            </div>
+        </div>
+        <div class="col-md-3">
+            <div class="card">
+                <div class="card-header">Positive Hotspots</div>
+                <div class="card-body">
+                    <h2>{{ positive_hotspots|length }}</h2>
+                </div>
+            </div>
+        </div>
+        <div class="col-md-3">
+            <div class="card">
+                <div class="card-header">Negative Hotspots</div>
+                <div class="card-body">
+                    <h2>{{ negative_hotspots|length }}</h2>
+                </div>
+            </div>
+        </div>
+        <div class="col-md-3">
+            <div class="card">
+                <div class="card-header">Clusters</div>
+                <div class="card-body">
+                    <h2>{{ clusters|length }}</h2>
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <!-- Hotspots map -->
+    <div class="row">
+        <div class="col-12">
+            <h3>Emotional Hotspots Map</h3>
+            <div id="hotspot-map" style="height: 400px;"></div>
+        </div>
+    </div>
+
+    <!-- Location clusters -->
+    <div class="row">
+        <div class="col-12">
+            <h3>Location Clusters</h3>
+            {% for cluster in clusters %}
+            <div class="card">
+                <div class="card-header">
+                    Cluster {{ cluster.cluster_id }}: {{ cluster.label }}
+                </div>
+                <div class="card-body">
+                    <p>Members: {{ cluster.members|length }}</p>
+                    <p>Emotion Profile:</p>
+                    <div class="progress">
+                        <div class="progress-bar bg-success">
+                            Positive {{ "%.0f"|format(cluster.emotion_distribution.positive * 100) }}%
+                        </div>
+                        <div class="progress-bar bg-secondary">
+                            Neutral {{ "%.0f"|format(cluster.emotion_distribution.neutral * 100) }}%
+                        </div>
+                        <div class="progress-bar bg-danger">
+                            Negative {{ "%.0f"|format(cluster.emotion_distribution.negative * 100) }}%
+                        </div>
+                    </div>
+                </div>
+            </div>
+            {% endfor %}
+        </div>
+    </div>
+    {% endif %}
+</div>
+{% endblock %} +``` + +--- + +## Step 3: Database Indexes + +Add indexes for efficient querying: + +```python +# In a migration script or app initialization +db.posts.create_index([('user_id', 1), ('location.lat', 1)]) +db.emotion_location_entries.create_index([('user_id', 1), ('timestamp', -1)]) +db.emotion_location_entries.create_index([('user_id', 1), ('location_id', 1)]) +db.location_analysis.create_index([('user_id', 1)]) +``` + +--- + +## Step 4: Configuration + +**File**: `dreamsApp/app/config.py` + +Add location-proximity settings: + +```python +class Config: + # Existing config... + + # Location-Proximity Settings + LOCATION_PROXIMITY_WEIGHTS = { + 'geographic': 0.3, + 'categorical': 0.4, + 'linguistic': 0.15, + 'cultural': 0.15 + } + + CLUSTERING_PARAMS = { + 'eps': 0.4, + 'min_samples': 2 + } + + HOTSPOT_MIN_VISITS = 3 + HOTSPOT_MIN_CONFIDENCE = 0.6 + + NEARBY_RADIUS_METERS = 1000 +``` + +--- + +## Step 5: Testing Integration + +Create integration test: + +**File**: `tests/test_location_emotion_integration.py` + +```python +import pytest +import json +from dreamsApp.app import create_app + +def test_full_pipeline(): + """Test full pipeline: upload → location extraction → emotion mapping → clustering.""" + + app = create_app('testing') + client = app.test_client() + + # Load test data + with open('tests/data/locations.json') as f: + test_locations = json.load(f)['locations'] + + # Simulate uploads for multiple locations + for loc in test_locations[:5]: + response = client.post('/upload', json={ + 'user_id': 'test_user', + 'image': 'base64_image_here', + 'caption': f'Visit to {loc["name"]}', + 'manual_location': loc['coordinates'] + }) + + assert response.status_code == 200 + data = response.json + assert 'location' in data + assert data['location']['lat'] == loc['coordinates']['lat'] + + # Check location analysis was created + response = client.get('/dashboard/test_user/location_analysis') + assert response.status_code == 200 +``` + +--- + +## Step 6: 
Deployment Checklist + +- [ ] Install required packages: `pip install exifread scikit-learn` +- [ ] Create MongoDB indexes +- [ ] Update `requirements.txt` +- [ ] Add location-proximity settings to config +- [ ] Extend ingestion route with location extraction +- [ ] Create location analysis dashboard route and template +- [ ] Run integration tests +- [ ] Update API documentation +- [ ] Deploy to staging environment +- [ ] Monitor performance and errors + +--- + +## Troubleshooting + +### Issue: No GPS data in uploaded images + +**Solution**: Ensure fallback to manual location: +```python +if 'manual_location' in request.json: + location_data = request.json['manual_location'] +``` + +### Issue: Clustering fails with too few locations + +**Solution**: Add minimum check: +```python +if len(locations) < 6: + return {'error': 'Need at least 6 locations for clustering'} +``` + +### Issue: Slow proximity calculations + +**Solution**: Implement caching: +```python +from functools import lru_cache + +@lru_cache(maxsize=1000) +def cached_proximity(loc1_id, loc2_id): + return compute_proximity(loc1, loc2) +``` + +--- + +## Next Steps + +1. **Add Place Type Inference**: Use Google Places API to automatically tag locations +2. **Implement Real-time Updates**: WebSocket support for live clustering +3. **Cross-User Analysis**: Privacy-preserving aggregation of emotion patterns +4. **Mobile Support**: Optimize for mobile dashboard viewing + +--- + +**Integration Version**: 1.0 +**Last Updated**: February 3, 2026 +**Author**: Krishan (GSoC 2026 Contributor) diff --git a/docs/project_roadmap.md b/docs/project_roadmap.md new file mode 100644 index 0000000..ba1c972 --- /dev/null +++ b/docs/project_roadmap.md @@ -0,0 +1,463 @@ +# DREAMS Project Roadmap & Future Work + +## Overview + +This document outlines the current state of the DREAMS platform, the location-proximity module implementation roadmap for GSoC 2026, and future enhancements for continued development post-GSoC. 
+ +--- + +## Current State (February 2026) + +### Completed Features + +**Core DREAMS Platform**: +- Flask backend with user authentication +- Image upload and caption processing +- BLIP-based image captioning +- RoBERTa sentiment analysis +- Keyword extraction and HDBSCAN clustering +- MongoDB storage for posts, keywords, and themes +- LLM-based thematic analysis (Gemini integration) +- Dashboard with sentiment timelines and word clouds + +**Location-Proximity Foundation** (Pre-GSoC): +- Comprehensive research documentation +- System architecture diagrams +- Integration with existing EXIF extractor (PR #77 by kunal-595) +- Integration with emotion-timeline proximity (PR #70 by AnvayKharb) +- Test plan and evaluation framework +- Synthetic dataset with 17 locations and expected results +- Function stubs and interface definitions for multi-dimensional proximity +- API design specification +- Data model extensions + +### In Progress + +- Implementation of 6 basic location functions (building on PR #77) +- Multi-dimensional proximity calculator (geographic + categorical + linguistic + cultural) +- Emotion-location mapper (integrating with PR #70's emotion proximity) +- Semantic clustering module (complementing existing emotion timeline segmentation) + +--- + + +## GSoC 2026 Roadmap (Aligned with Official Timeline) + + + +### Pre-GSoC: Community Bonding & Planning (Feb 19 – Apr 30, 2026) +**Duration**: 10 weeks | **Effort**: ~40h (prep, onboarding, planning; not counted in GSoC 350h coding period) + +- Finalize project requirements and architecture +- Deep-dive into DREAMS codebase and data models +- Refine test plans and synthetic datasets +- Mentor meetings and onboarding + + +### Phase 1: Core Implementation (May 1 – July 7, 2026) +**Duration**: 10 weeks | **Effort**: 150h + +#### Deliverables: +1. 
**Basic Location Functions** (May) + - `calculate_distance()` - Haversine formula + - `validate_coordinates()` - GPS validation + - `extract_location()` - EXIF integration + - `compute_proximity()` - Distance + threshold check + - `find_nearby_locations()` - Radius-based search + - `cluster_locations()` - Simple geographic clustering + +2. **Multi-Dimensional Proximity** (June) + - `Place` class with categorical/linguistic/cultural attributes + - Geographic proximity (normalized Haversine) + - Categorical similarity (place type matching) + - Linguistic similarity (language context) + - Cultural similarity (Jaccard index on tags) + - Composite proximity with configurable weights + +3. **Emotion-Location Integration** (late June – early July) + - `EmotionLocationMapper` class + - `add_entry()` - Store emotion-location pairs + - `get_location_sentiment_profile()` - Aggregate per location + - `find_emotional_hotspots()` - Detect consistent emotions + - `compare_place_types()` - Category-level patterns + - `temporal_emotion_trend()` - Time-series analysis + +4. **Semantic Clustering** (July) + - `SemanticLocationClusterer` with DBSCAN + - Cluster emotion profile aggregation + - Visualization support + - Parameter tuning utilities + +**Milestones**: +- All unit tests passing (90%+ coverage) +- Validated against synthetic dataset +- Performance benchmarks met + +--- + + +### Phase 2: Integration & Testing (July 8 – September 1, 2026) +**Duration**: 8 weeks | **Effort**: 110h + +#### Deliverables: +1. **Backend Integration** (July) + - Extend `app/ingestion/routes.py` with location extraction + - Implement 4 REST API endpoints: + - `POST /api/upload` (enhanced with location) + - `GET /api/location/proximity` + - `POST /api/location/cluster` + - `GET /api/location/hotspots` + - MongoDB schema extensions and indexes + - API authentication and rate limiting + +2. 
**Dashboard Visualization** (August) + - `/location_analysis/` route + - HTML/CSS template with: + - Interactive map (Leaflet.js) with hotspot markers + - Cluster visualization cards + - Place-type comparison bar charts + - Temporal emotion patterns + - JavaScript for dynamic content loading + - Mobile-responsive design + +3. **End-to-End Testing** (August) + - Integration tests for full pipeline + - Performance testing (upload < 3s, clustering < 2s) + - Load testing (100 concurrent users) + - Cross-browser compatibility testing + +**Milestones**: +- Complete backend API functional +- Dashboard displays all analyses correctly +- All integration tests passing +- Performance targets achieved + +--- + + +### Phase 3: Evaluation, User Study & Polish (September 2 – October 15, 2026) +**Duration**: 6 weeks | **Effort**: 60h + +#### Deliverables: +1. **Evaluation Metrics** (early September) + - Proximity accuracy (MAE < 0.15) + - Clustering quality (Silhouette > 0.5, Purity > 0.80) + - Hotspot detection (F1 > 0.75) + - Emotion prediction accuracy (> 0.65) + - Performance benchmarks + +2. **Ablation Study** (mid September) + - 7 experimental conditions (Full, Ablate-Geo, etc.) + - Statistical analysis of results + - Baseline comparisons (Geo-only, K-means, Random) + - Results visualization and documentation + +3. **User Study** (late September) + - Protocol design + - Recruit 10-15 mental health researchers + - Conduct interviews/surveys + - Analyze qualitative feedback + - Document findings and recommendations + +4. 
**Documentation & Demo** (early October) + - Comprehensive demo script + - Video demonstration + - Case study analyses + - Updated architecture documentation + - API reference guide + +**Milestones**: +- All evaluation metrics meet success criteria +- Ablation study confirms multi-dimensional approach +- User feedback validates usefulness +- Complete documentation ready for handoff + +--- + + +### Final Phase: Wrap-up & Submission (October 16 – November 11, 2026) +**Duration**: 4 weeks | **Effort**: 30h + +#### Deliverables: +- GSoC final report +- Final presentation to mentors/community +- Code cleanup and refactoring +- Contributor guide for future developers +- Knowledge transfer documentation + +**Milestone**: Project ready for production deployment + +--- + +**Total GSoC Coding Effort (Phases 1–4): 350 hours** + +--- + +## Post-GSoC Enhancements (Future Work) + +### Short-term (3-6 months) + +#### 1. Advanced Place Enrichment +**Description**: Integrate external APIs for automatic place type detection and tagging. + +**Features**: +- Google Places API integration for: + - Automatic place type inference from GPS + - Business name and category extraction + - Photo matching for location verification +- Nominatim (OpenStreetMap) as free alternative +- Automatic cultural tag extraction from place descriptions + +**Benefits**: +- Reduces manual input burden +- Improves proximity accuracy with rich metadata +- Enables cross-user location matching + +**Effort**: 40 hours + +--- + +#### 2. Real-time Collaborative Features +**Description**: Enable cross-user emotion pattern analysis while preserving privacy. 
+ +**Features**: +- Anonymized aggregation of emotions at public locations +- "Others felt positive here too" insights +- Heatmap of community emotional landscape +- Privacy-preserving differential privacy techniques + +**Benefits**: +- Social validation for recovery journeys +- Community-level mental health insights +- Research opportunities for population-level analysis + +**Effort**: 60 hours + +--- + +#### 3. Mobile App Integration +**Description**: Native mobile support for location-aware photo uploads. + +**Features**: +- React Native mobile app +- Automatic GPS capture on photo upload +- Offline mode with sync +- Push notifications for emotional hotspot proximity +- Map-based photo browsing + +**Benefits**: +- Improves data quality (native GPS access) +- Better user experience +- Real-time location-emotion tracking + +**Effort**: 120 hours + +--- + +### Medium-term (6-12 months) + +#### 4. Predictive Emotion Modeling +**Description**: ML models to predict emotional responses at locations. + +**Features**: +- Train models on historical location-emotion data +- Predict likely emotion at new/unvisited locations +- Personalized recommendations for emotionally beneficial places +- Transfer learning across users with similar patterns + +**Benefits**: +- Proactive mental health support +- Personalized location recommendations +- Research insights into emotion-place associations + +**Effort**: 80 hours + +--- + +#### 5. Temporal Pattern Mining +**Description**: Advanced analysis of how location-emotion associations evolve. + +**Features**: +- Change point detection in temporal trends +- Season/time-of-day emotion patterns +- Recovery milestone identification +- Longitudinal trajectory modeling + +**Benefits**: +- Identify recovery inflection points +- Understand cyclical patterns +- Support intervention timing decisions + +**Effort**: 60 hours + +--- + +#### 6. Clinician Dashboard +**Description**: Specialized interface for therapists and researchers. 
+ +**Features**: +- Multi-patient aggregate view (with consent) +- Customizable reports for clinical sessions +- Export data in standard formats (CSV, FHIR) +- Annotation tools for clinical notes +- HIPAA compliance and data security + +**Benefits**: +- Clinical research support +- Therapy integration +- Evidence-based interventions + +**Effort**: 100 hours + +--- + +### Long-term (1-2 years) + +#### 7. Multi-modal Emotion Analysis +**Description**: Integrate additional data sources beyond photos/captions. + +**Features**: +- Audio analysis (voice emotion detection) +- Video micro-expression analysis +- Wearable integration (heart rate, activity) +- Social media cross-posting analysis +- Calendar/schedule correlation + +**Benefits**: +- Richer emotional context +- Triangulation of emotion signals +- Holistic recovery tracking + +**Effort**: 200 hours + +--- + +#### 8. Intervention Recommendation Engine +**Description**: AI-powered suggestions for mental health interventions. + +**Features**: +- Pattern-based intervention recommendations +- "You felt better after visiting parks" insights +- Evidence-based coping strategies +- Connection to mental health resources +- Crisis detection and emergency protocols + +**Benefits**: +- Actionable guidance for users +- Bridge between analysis and intervention +- Potential life-saving crisis support + +**Effort**: 150 hours + +--- + +#### 9. Research Platform & Data Sharing +**Description**: Enable academic research while protecting privacy. 
+ +**Features**: +- De-identified data export for researchers +- Federated learning across institutions +- Open dataset publication (with consent) +- Replication tools for published studies +- IRB-compliant data access workflows + +**Benefits**: +- Accelerate mental health research +- Validate findings across populations +- Build evidence base for digital therapeutics + +**Effort**: 120 hours + +--- + +## Technology Evolution + +### Current Stack +- **Backend**: Python Flask +- **Database**: MongoDB +- **ML**: Hugging Face Transformers, scikit-learn +- **Frontend**: Jinja2 templates, vanilla JavaScript +- **Visualization**: Leaflet.js (maps) + +### Planned Upgrades + +**Near-term**: +- **Frontend**: Migrate to React or Vue.js for richer interactivity +- **API**: GraphQL for flexible data querying +- **Caching**: Redis for proximity score caching +- **Task Queue**: Celery for async clustering jobs + +**Long-term**: +- **Database**: TimescaleDB for better time-series performance +- **ML Ops**: MLflow for model versioning and deployment +- **Monitoring**: Prometheus + Grafana for production observability +- **Scaling**: Kubernetes for horizontal scaling + +--- + +## Community & Ecosystem + +### Open Source Growth +- **Contributors**: Attract additional GSoC students for future summers +- **Plugin System**: Allow third-party emotion analysis models +- **API Clients**: Official Python/JavaScript client libraries +- **Documentation**: Sphinx-generated API docs, video tutorials + +### Integration Partnerships +- **Beehive**: Deeper integration with photo storytelling +- **Mental Health Apps**: Partnerships with existing platforms +- **Wearable Devices**: Official integrations with Fitbit, Apple Health +- **EHR Systems**: FHIR-compliant data export for clinical records + +### Research Collaborations +- **University Partnerships**: Pilot studies with psychology departments +- **Funding**: Grants for large-scale clinical trials +- **Publications**: Research papers on 
location-emotion findings +- **Conferences**: Presentations at mental health informatics conferences + +--- + +## Timeline Overview + +``` +2026 +│ +├─ Feb-Apr: Pre-GSoC Contributions (18 PRs) +├─ May-Aug: GSoC Implementation (350h) +├─ Sep-Dec: Production Deployment + Short-term Enhancements +│ +2027 +│ +├─ Q1-Q2: Medium-term Features (Predictive Modeling, Temporal Mining) +├─ Q3-Q4: Long-term Features (Multi-modal Analysis) +│ +2028+ +│ +└─ Research Platform, Intervention Engine, Ecosystem Growth +``` + +--- + +## Success Metrics (Long-term) + +### Usage Metrics +- **Users**: 10,000+ active users by 2027 +- **Photos Analyzed**: 1M+ photos with location data +- **Research Studies**: 10+ published papers using DREAMS data + +### Impact Metrics +- **Recovery Outcomes**: Demonstrated improvement in recovery trajectories +- **Clinical Adoption**: 50+ clinicians actively using platform +- **Community Engagement**: Active open-source community (100+ stars) + +### Technical Metrics +- **Performance**: 99.9% uptime, < 2s average response time +- **Scalability**: Support 100k+ concurrent users +- **Accuracy**: Emotion prediction accuracy > 0.75 + +--- + +**Version**: 1.0 +**Last Updated**: February 3, 2026 +**Author**: Krishan (GSoC 2026 Contributor) +**Status**: Living document - updated quarterly diff --git a/docs/risk_analysis.md b/docs/risk_analysis.md new file mode 100644 index 0000000..5ec8d7b --- /dev/null +++ b/docs/risk_analysis.md @@ -0,0 +1,404 @@ +# Risk Analysis & Mitigation Strategies + +## Overview + +This document identifies potential risks to the DREAMS location-proximity GSoC project and outlines comprehensive mitigation strategies to ensure successful completion within the 350-hour timeframe. 
+ +--- + +## Risk Matrix + +| Risk ID | Risk | Probability | Impact | Severity | Mitigation Priority | +|---------|------|-------------|--------|----------|-------------------| +| R1 | EXIF data unavailable in most photos | High | High | **Critical** | 1 | +| R2 | Multi-dimensional proximity doesn't improve results | Medium | High | **High** | 2 | +| R3 | Integration conflicts with existing code | Medium | Medium | **Medium** | 3 | +| R4 | Performance issues with large datasets | Medium | High | **High** | 4 | +| R5 | Scope creep beyond 350 hours | Medium | High | **High** | 5 | +| R6 | MongoDB schema changes break existing features | Low | High | **Medium** | 6 | +| R7 | User study recruitment difficulties | Medium | Low | **Low** | 7 | +| R8 | Clustering produces meaningless results | Low | Medium | **Low** | 8 | +| R9 | Mentor availability constraints | Low | Medium | **Low** | 9 | +| R10 | Technical dependencies (libraries, APIs) fail | Low | Medium | **Low** | 10 | + +**Severity Calculation**: Probability × Impact + +--- + +## Detailed Risk Analysis & Mitigation + +### R1: EXIF Data Unavailable in Most Photos +**Probability**: High (70%) +**Impact**: High +**Description**: Users may upload photos without GPS metadata (social media downloads, privacy-stripped images, scanned photos). 
+ +#### Mitigation Strategies + +**Primary**: Fallback to manual location entry +```python +if location_data.get('accuracy') == 'none': + # Prompt user for manual location + return {"requires_manual_location": True} +``` + +**Secondary**: Place inference from caption/keywords +```python +# Extract location names from caption +location_mentions = extract_place_names(caption) +# Use geocoding API to get coordinates +coords = geocode(location_mentions[0]) +``` + +**Tertiary**: Use IP-based geolocation as rough estimate +```python +# For logged-in users, approximate from IP +approx_location = geolocate_ip(request.remote_addr) +``` + +**Validation**: Track percentage of photos with GPS in test data. If < 30%, prioritize fallback mechanisms. + +**Timeline Impact**: +5 hours for robust fallback implementation + +--- + +### R2: Multi-Dimensional Proximity Doesn't Improve Results +**Probability**: Medium (40%) +**Impact**: High +**Description**: Ablation study may show that additional dimensions (linguistic, cultural) don't significantly improve clustering or emotion prediction over geographic distance alone. + +#### Mitigation Strategies + +**Primary**: Rigorous ablation study early (Week 8) +- Run all 7 experimental conditions +- If multi-dimensional doesn't outperform, pivot to geo + categorical only +- Document findings as research contribution (negative results are valuable) + +**Secondary**: Adaptive weighting +```python +# Learn optimal weights from data +weights = optimize_weights(validation_set) +``` + +**Tertiary**: Focus on interpretability over performance +- Even if metrics are similar, multi-dimensional may be more interpretable +- User study can validate semantic meaningfulness + +**Success Criteria Adjustment**: If multi-dimensional < 5% better than geo+categorical, simplify to two dimensions. 
+ +**Timeline Impact**: None (ablation already planned) + +--- + +### R3: Integration Conflicts with Existing Code +**Probability**: Medium (50%) +**Impact**: Medium +**Description**: Extending ingestion pipeline and dashboard may conflict with ongoing development or existing functionality. + +#### Mitigation Strategies + +**Primary**: Regular communication with mentors +- Weekly check-ins on any parallel development +- Review PRs in main branch before integration + +**Secondary**: Modular design with clear interfaces +```python +# Use dependency injection for easy testing +class LocationProximityService: + def __init__(self, db_client, exif_extractor): + self.db = db_client + self.exif = exif_extractor +``` + +**Tertiary**: Feature flags for gradual rollout +```python +if app.config.get('ENABLE_LOCATION_PROXIMITY'): + # New functionality +``` + +**Validation**: Integration tests run against latest main branch weekly. + +**Timeline Impact**: +10 hours for conflict resolution (already budgeted in Phase 2) + +--- + +### R4: Performance Issues with Large Datasets +**Probability**: Medium (40%) +**Impact**: High +**Description**: Proximity calculations for 1000+ locations may exceed 3-second upload target. + +#### Mitigation Strategies + +**Primary**: Optimization techniques +- **Spatial indexing**: MongoDB geospatial queries for nearby locations +- **Caching**: Cache proximity scores between location pairs +- **Batch processing**: Compute proximity matrix in background task +- **Approximate algorithms**: Use locality-sensitive hashing for large-scale + +**Secondary**: Performance benchmarks early +```python +@pytest.mark.benchmark +def test_proximity_performance(): + """Ensure proximity calculation < 100ms for 100 locations.""" + start = time.time() + compute_proximity_matrix(100_locations) + assert time.time() - start < 0.1 +``` + +**Tertiary**: Incremental computation +```python +# Only compute proximity for new location vs. 
existing +# Don't recompute entire matrix on each upload +``` + +**Success Criteria**: If upload > 3s with 100 locations, move clustering to async background job. + +**Timeline Impact**: +8 hours for optimization (included in Week 6-7) + +--- + +### R5: Scope Creep Beyond 350 Hours +**Probability**: Medium (50%) +**Impact**: High +**Description**: Feature requests or perfectionism may expand scope beyond planned milestones. + +#### Mitigation Strategies + +**Primary**: Strict scope management +- **MVP focus**: Core features only (proximity, clustering, hotspots) +- **Future work list**: Document "nice-to-haves" for post-GSoC +- **Weekly hour tracking**: Monitor actual vs. planned hours + +**Secondary**: Ruthless prioritization +``` +P0 (Must-have): Basic proximity, clustering, integration +P1 (Should-have): Dashboard visualization, ablation study +P2 (Nice-to-have): Advanced analytics, real-time updates +P3 (Future): Cross-user analysis, ML predictions +``` + +**Tertiary**: Timeboxing +- Each task has maximum hour allocation +- If exceeded, move to "polish" phase or defer + +**Validation**: If cumulative hours > planned by 10%, cut P2 features. + +**Timeline Impact**: None (proactive management) + +--- + +### R6: MongoDB Schema Changes Break Existing Features +**Probability**: Low (20%) +**Impact**: High +**Description**: Adding new collections or fields may inadvertently break existing queries or functionality. + +#### Mitigation Strategies + +**Primary**: Backward compatibility +```python +# Add fields, don't modify existing ones +post_doc = { + # ... 
existing fields unchanged + 'location': location_data # NEW, optional +} +``` + +**Secondary**: Comprehensive testing +- Run full existing test suite before/after schema changes +- Integration tests validate old functionality still works + +**Tertiary**: Database migrations +```python +# Migration script to add new fields safely +def migrate_add_location_field(): + db.posts.update_many( + {'location': {'$exists': False}}, + {'$set': {'location': {'accuracy': 'none'}}} + ) +``` + +**Validation**: All existing tests pass after schema extension. + +**Timeline Impact**: +4 hours for careful migration (budgeted) + +--- + +### R7: User Study Recruitment Difficulties +**Probability**: Medium (40%) +**Impact**: Low +**Description**: May struggle to recruit 10-15 mental health researchers for user study in August. + +#### Mitigation Strategies + +**Primary**: Early recruitment +- Start outreach in Week 8 (2 weeks before user study) +- Leverage mentors' professional networks +- Offer small incentive (e.g., $25 Amazon gift card) + +**Secondary**: Alternative participants +- PhD students in clinical psychology +- Recovery support group facilitators +- DREAMS/Beehive existing community members + +**Tertiary**: Internal validation +- If < 5 external participants, conduct internal review with mentors +- Document as "expert evaluation" instead of "user study" + +**Success Criteria**: Minimum 5 participants provides sufficient qualitative feedback. + +**Timeline Impact**: None (user study is enhancement, not blocker) + +--- + +### R8: Clustering Produces Meaningless Results +**Probability**: Low (25%) +**Impact**: Medium +**Description**: DBSCAN may produce many outliers or fail to find coherent clusters with real data. 
+ +#### Mitigation Strategies + +**Primary**: Adaptive parameters +```python +# Automatically tune eps and min_samples +from sklearn.model_selection import GridSearchCV +best_params = grid_search_dbscan(proximity_matrix) +``` + +**Secondary**: Alternative algorithms +- Try HDBSCAN (hierarchical DBSCAN) for adaptive density +- Try Agglomerative Clustering with proximity distance matrix +- Ensemble of multiple clustering methods + +**Tertiary**: Fallback to simpler grouping +```python +# If clustering fails, fall back to place-type grouping +if silhouette_score < 0.3: + # Just group by place_type + clusters = group_by_place_type(locations) +``` + +**Validation**: Synthetic dataset should always produce 3 clean clusters. + +**Timeline Impact**: +6 hours for parameter tuning (budgeted) + +--- + +### R9: Mentor Availability Constraints +**Probability**: Low (20%) +**Impact**: Medium +**Description**: Mentors may have limited availability during summer for weekly meetings. + +#### Mitigation Strategies + +**Primary**: Asynchronous communication +- Detailed weekly progress reports via email/GitHub discussions +- Use project board (GitHub Projects) for transparency +- Record demo videos for async review + +**Secondary**: Flexible meeting schedule +- Schedule meetings 2 weeks in advance +- Offer multiple time slot options +- Accept shorter 30-min check-ins if needed + +**Tertiary**: Self-sufficiency +- Make decisions independently when appropriate +- Document rationale for mentor review later +- Escalate only blockers that require immediate input + +**Success Criteria**: Minimum 1 mentor interaction per week (meeting or detailed async feedback). + +**Timeline Impact**: None + +--- + +### R10: Technical Dependencies Fail +**Probability**: Low (15%) +**Impact**: Medium +**Description**: External libraries (scikit-learn, exifread) or services (Google Places API) may have issues. 
+ +#### Mitigation Strategies + +**Primary**: Pin dependency versions +```txt +# requirements.txt +scikit-learn==1.4.0 +exifread==3.0.0 +``` + +**Secondary**: Fallback implementations +```python +# If exifread fails, use Pillow +try: + from exifread import process_file +except ImportError: + # Use Pillow fallback + from PIL import Image +``` + +**Tertiary**: No external API dependencies for MVP +- Defer Google Places integration to future work +- Core functionality works offline with synthetic place types + +**Validation**: Test in clean environment before each phase. + +**Timeline Impact**: None (good practice) + +--- + +## Risk Monitoring & Response Plan + +### Weekly Risk Review +Every mentor meeting, review: +1. Have any risks materialized? +2. Has probability/impact changed for any risk? +3. Are mitigation strategies working? + +### Escalation Criteria +Escalate to mentors immediately if: +- Any critical risk materializes +- Cumulative hours > 10% over plan +- Core functionality blocker arises + +### Risk Log +Maintain `docs/risk_log.md` with: +- Date risk identified +- Mitigation actions taken +- Current status +- Lessons learned + +--- + +## Contingency Plans by Phase + +### Phase 1 Contingency +**If**: Core implementation takes 140h instead of 120h +**Then**: Reduce dashboard polish in Phase 2 (cut 20h) + +### Phase 2 Contingency +**If**: Integration issues consume extra time +**Then**: Defer advanced visualizations, focus on basic dashboard + +### Phase 3 Contingency +**If**: Evaluation reveals major issues +**Then**: Allocate Final Week hours to fixes instead of documentation + +--- + +## Success Probability Assessment + +Given mitigation strategies: + +| Outcome | Probability | +|---------|-------------| +| **Complete Success** (All deliverables, on time) | 70% | +| **Partial Success** (Core features, minor delays) | 25% | +| **Significant Issues** (Major delays or missing features) | 5% | + +**Overall Project Risk Level**: **LOW-MEDIUM** + +With 
proactive risk management, rigorous testing, and mentor collaboration, this project has a high likelihood of successful completion within the 350-hour GSoC timeframe. + +--- + +**Version**: 1.0 +**Last Updated**: February 3, 2026 +**Author**: Krishan (GSoC 2026 Contributor) diff --git a/dream-integration/app/templates/index.html b/dream-integration/app/templates/index.html index 6e5a54e..f111e69 100644 --- a/dream-integration/app/templates/index.html +++ b/dream-integration/app/templates/index.html @@ -70,7 +70,7 @@

Description

{% if text_scores or image_scores %}
-

📊 Enhanced Emotion Analysis Results

+

Enhanced Emotion Analysis Results

Debug Information:
@@ -82,10 +82,10 @@

📊 Enhanced Emotion Analysis Results

@@ -108,7 +108,7 @@

📊 Enhanced Emotion Analysis Results

- 💡 Tip: Use Ctrl+Scroll to zoom charts for better detail viewing + Tip: Use Ctrl+Scroll to zoom charts for better detail viewing
diff --git a/dreamsApp/app/dashboard/main.py b/dreamsApp/app/dashboard/main.py index 2231508..1897fa7 100644 --- a/dreamsApp/app/dashboard/main.py +++ b/dreamsApp/app/dashboard/main.py @@ -51,7 +51,7 @@ def profile(target): df["rolling_avg"] = df["score"].rolling(window=5, min_periods=1).mean() df["ema_score"] = df["score"].ewm(span=5, adjust=False).mean() - # 📈 Create user-friendly visual + # Create user-friendly visual plt.figure(figsize=(12, 6)) plt.plot(df["timestamp"], df["cumulative_score"], diff --git a/dreamsApp/docs/data-model.md b/dreamsApp/docs/data-model.md index b99d60f..b58dfdd 100644 --- a/dreamsApp/docs/data-model.md +++ b/dreamsApp/docs/data-model.md @@ -93,10 +93,119 @@ Each theme entry includes: --- +--- + +## 5. Collection: `location_analysis` + +Stores location-proximity analysis results, clusters, and emotional hotspots per user. + +| Field | Type | Description | +|--------------|----------------|----------------------------------------------| +| `_id` | ObjectId | Document ID | +| `user_id` | string | Associated user | +| `locations` | array | All locations with visit history and emotions| +| `clusters` | array | Semantic clusters of similar locations | +| `hotspots` | array | Emotional hotspots (consistent emotions) | +| `updated_at` | datetime | Last update timestamp | + +**Location Entry Example:** +```json +{ + "id": "loc_001", + "name": "St. 
Mary's Church", + "coordinates": {"lat": 61.2167, "lon": -149.8944}, + "place_type": "church", + "language": "english", + "cultural_tags": ["catholic", "christian", "traditional"], + "visits": [ + { + "timestamp": "2024-01-21T10:00:00Z", + "post_id": "...", + "sentiment": "positive", + "score": 0.88 + } + ], + "emotion_profile": { + "positive": 0.80, + "neutral": 0.15, + "negative": 0.05 + } +} +``` + +**Cluster Example:** +```json +{ + "cluster_id": 0, + "label": "Religious Places", + "members": ["loc_001", "loc_002", "loc_003"], + "centroid": {"lat": 61.2186, "lon": -149.8870}, + "emotion_distribution": { + "positive": 0.82, + "neutral": 0.12, + "negative": 0.06 + }, + "place_types": ["church", "church", "church"], + "created_at": "2024-02-01T00:00:00Z" +} +``` + +**Hotspot Example:** +```json +{ + "location_id": "loc_001", + "name": "St. Mary's Church", + "sentiment": "positive", + "confidence": 0.80, + "visit_count": 5, + "coordinates": {"lat": 61.2167, "lon": -149.8944} +} +``` + +--- + +## 6. Collection: `emotion_location_entries` + +Stores individual emotion-location mappings for fine-grained temporal analysis. 
+ +| Field | Type | Description | +|---------------|----------|------------------------------------------| +| `_id` | ObjectId | Document ID | +| `user_id` | string | Associated user | +| `location_id` | string | Reference to location in location_analysis | +| `post_id` | ObjectId | Reference to post | +| `sentiment` | string | Emotion label (positive/neutral/negative) | +| `score` | float | Sentiment confidence (0-1) | +| `timestamp` | datetime | When emotion was recorded | +| `place_type` | string | Type of place (church, hospital, park) | +| `coordinates` | object | GPS coordinates | + +**Example:** +```json +{ + "user_id": "user_001", + "location_id": "loc_001", + "post_id": "...", + "sentiment": "positive", + "score": 0.88, + "timestamp": "2024-01-21T10:00:00Z", + "place_type": "church", + "coordinates": {"lat": 61.2167, "lon": -149.8944} +} +``` + +This collection enables: +- Temporal emotion trend analysis at specific locations +- Cross-location emotion pattern detection +- Place-type emotion aggregation + +--- + ## Access Control - Only the `users` collection is authenticated via Flask-Login. - All other collections are accessed programmatically by the backend and admin panel. +- Location data is pseudonymized with user IDs to protect privacy. 
--- diff --git a/dreamsApp/exif_extractor.py b/dreamsApp/exif_extractor.py new file mode 100644 index 0000000..132c589 --- /dev/null +++ b/dreamsApp/exif_extractor.py @@ -0,0 +1,172 @@ +"""EXIF metadata extraction module for photo analysis.""" + +import exifread +from PIL import Image +from PIL.ExifTags import TAGS, GPSTAGS +from datetime import datetime +import logging + +logger = logging.getLogger(__name__) + +class EXIFExtractor: + """Extract and process EXIF metadata from images.""" + + def extract_metadata(self, image_path): + """Extract metadata using fallback strategy.""" + try: + return self._extract_exifread(image_path) + except Exception as e: + logger.warning(f"exifread failed: {e}, trying Pillow") + try: + return self._extract_pillow(image_path) + except Exception as e2: + logger.error(f"Both extractors failed: {e2}") + return self._empty_metadata() + + def _extract_exifread(self, image_path): + """Extract using exifread library.""" + with open(image_path, 'rb') as f: + tags = exifread.process_file(f) + + return { + "timestamp": self._parse_timestamp(tags), + "location": self._parse_gps_exifread(tags), + "camera": self._parse_camera_exifread(tags), + "processing": {"exif_source": "exifread"} + } + + def _extract_pillow(self, image_path): + """Extract using Pillow as fallback.""" + image = Image.open(image_path) + exif = image._getexif() + + if not exif: + return self._empty_metadata() + + return { + "timestamp": self._parse_timestamp_pillow(exif), + "location": self._parse_gps_pillow(exif), + "camera": self._parse_camera_pillow(exif), + "processing": {"exif_source": "pillow"} + } + + def _parse_gps_exifread(self, tags): + """Parse GPS coordinates from exifread tags.""" + lat = self._get_gps_coordinate(tags, 'GPS GPSLatitude', 'GPS GPSLatitudeRef') + lon = self._get_gps_coordinate(tags, 'GPS GPSLongitude', 'GPS GPSLongitudeRef') + + if lat and lon: + return {"lat": lat, "lon": lon, "accuracy": "high"} + return {"accuracy": "none"} + + def 
_get_gps_coordinate(self, tags, coord_key, ref_key): + """Convert GPS coordinate to decimal degrees.""" + coord = tags.get(coord_key) + ref = tags.get(ref_key) + + if not coord or not ref: + return None + + coord_str = str(coord) + ref_str = str(ref) + + # Parse coordinate string format + parts = coord_str.replace('[', '').replace(']', '').split(', ') + if len(parts) != 3: + return None + + try: + degrees = float(parts[0]) + minutes = float(parts[1]) + seconds = float(parts[2]) + + decimal = degrees + (minutes / 60.0) + (seconds / 3600.0) + + if ref_str in ['S', 'W']: + decimal = -decimal + + return decimal + except (ValueError, IndexError): + return None + + def _parse_timestamp(self, tags): + """Parse timestamp from exifread tags.""" + for key in ['EXIF DateTime', 'Image DateTime', 'EXIF DateTimeOriginal']: + if key in tags: + try: + dt_str = str(tags[key]) + return datetime.strptime(dt_str, '%Y:%m:%d %H:%M:%S').isoformat() + except ValueError: + continue + return None + + def _parse_camera_exifread(self, tags): + """Parse camera info from exifread tags.""" + return { + "make": str(tags.get('Image Make', '')), + "model": str(tags.get('Image Model', '')) + } + + def _parse_timestamp_pillow(self, exif): + """Parse timestamp from Pillow exif data.""" + for tag in [36867, 306, 36868]: # DateTimeOriginal, DateTime, DateTimeDigitized + if tag in exif: + try: + dt_str = exif[tag] + if isinstance(dt_str, bytes): + dt_str = dt_str.decode('utf-8') + return datetime.strptime(dt_str, '%Y:%m:%d %H:%M:%S').isoformat() + except (ValueError, AttributeError): + continue + return None + + def _parse_gps_pillow(self, exif): + """Parse GPS coordinates from Pillow exif data.""" + if 'GPSInfo' not in exif: + return {"accuracy": "none"} + + gps_info = exif['GPSInfo'] + + def get_coordinate(coord, ref): + if coord not in gps_info or ref not in gps_info: + return None + coord_vals = gps_info[coord] + ref_val = gps_info[ref] + + try: + degrees = coord_vals[0][0] / coord_vals[0][1] + 
minutes = coord_vals[1][0] / coord_vals[1][1] + seconds = coord_vals[2][0] / coord_vals[2][1] + + decimal = degrees + (minutes / 60.0) + (seconds / 3600.0) + + if ref_val in [b'S', b'W']: + decimal = -decimal + + return decimal + except (IndexError, TypeError, ZeroDivisionError): + return None + + lat = get_coordinate(2, 1) # GPSLatitude, GPSLatitudeRef + lon = get_coordinate(4, 3) # GPSLongitude, GPSLongitudeRef + + if lat and lon: + return {"lat": lat, "lon": lon, "accuracy": "high"} + return {"accuracy": "none"} + + def _parse_camera_pillow(self, exif): + """Parse camera info from Pillow exif data.""" + return { + "make": exif.get(271, "").decode('utf-8') if isinstance(exif.get(271), bytes) else str(exif.get(271, "")), + "model": exif.get(272, "").decode('utf-8') if isinstance(exif.get(272), bytes) else str(exif.get(272, "")) + } + + def _empty_metadata(self): + """Return empty metadata structure.""" + return { + "timestamp": None, + "location": {"accuracy": "none"}, + "camera": {"make": "", "model": ""}, + "processing": {"exif_source": "none"} + } + diff --git a/dreamsApp/location_proximity.py b/dreamsApp/location_proximity.py index 240515a..12340f3 100644 --- a/dreamsApp/location_proximity.py +++ b/dreamsApp/location_proximity.py @@ -1,6 +1,11 @@ -"""Location proximity analysis module for photo clustering.""" +"""Location proximity analysis module for photo clustering. + +Builds upon existing EXIF extraction (PR #77) and emotion proximity (PR #70) +to add multi-dimensional location-based clustering and analysis. +""" from typing import List, Dict, Optional, Tuple, TypedDict +from dreamsApp.exif_extractor import EXIFExtractor class Location(TypedDict): @@ -18,13 +23,15 @@ class ProximityResult(TypedDict): def extract_location(metadata: Dict) -> Optional[Location]: """Extract location data from photo metadata. + Integrates with existing EXIFExtractor from PR #77. 
+ Args: metadata: Photo metadata dictionary containing location information Returns: Dictionary with lat/lon coordinates and accuracy, or None if no location data """ - raise NotImplementedError + raise NotImplementedError # TODO: Use EXIFExtractor for actual implementation def compute_proximity(location1: Location, location2: Location, threshold_meters: float) -> ProximityResult: diff --git a/location_proximity/README.md b/location_proximity/README.md index 242e9f6..de24e01 100644 --- a/location_proximity/README.md +++ b/location_proximity/README.md @@ -4,6 +4,11 @@ This module extends DREAMS to analyze how semantically similar locations (not just geographically close ones) influence emotional patterns in recovery journeys. +**Building Upon**: +- **PR #77** (kunal-595): Uses existing `dreamsApp/exif_extractor.py` for GPS extraction +- **PR #70** (AnvayKharb): Integrates with `analytics/emotion_proximity.py` for emotion timelines +- **This Module**: Adds multi-dimensional spatial proximity (geographic + categorical + linguistic + cultural) + --- ## Core Concept @@ -157,8 +162,8 @@ St. 
Mary's Church ↔ Alaska Native Medical Center : 0.120 Holy Trinity Church ↔ Providence Hospital : 0.115 Alaska Native Medical Center ↔ Providence Hospital : 0.725 -✓ Notice: Two churches have high proximity despite different locations -✓ Notice: Two hospitals cluster together semantically +Notice: Two churches have high proximity despite different locations +Notice: Two hospitals cluster together semantically ``` --- diff --git a/plans/pre_gsoc_contribution_plan.md b/plans/pre_gsoc_contribution_plan.md new file mode 100644 index 0000000..b0e635c --- /dev/null +++ b/plans/pre_gsoc_contribution_plan.md @@ -0,0 +1,197 @@ +# Pre-GSoC Contribution Plan for DREAMS: Multi-Dimensional Location Proximity and Emotion Analysis + +## Overview + +This pre-GSoC contribution plan outlines 18 pull requests (PRs) spread over 7 weeks, designed to strengthen the DREAMS project's foundation in multi-dimensional location proximity and emotion analysis. The plan focuses on enhancing the Flask backend with MongoDB integration, building upon existing modules like `exif_extractor` and `location_proximity`, while ensuring alignment with the GSoC 2026 proposal. + +**Recent Contributions by Other Developers**: +- **PR #77** (kunal-595): EXIF GPS extraction implemented - we integrate with this +- **PR #70** (AnvayKharb): Time-aware emotion proximity - we complement this with spatial proximity +- **PR #79** (anish1206): CHIME mental health framework - our emotion-location work aligns with CHIME dimensions + +**Total Duration**: 7 weeks +**Total PRs**: 18 +**Focus Areas**: Architecture (4 PRs), Research (3 PRs), Interfaces (4 PRs), Testing (4 PRs), Proposal Alignment (3 PRs) +**Key Technologies**: Python Flask, MongoDB, scikit-learn, Hugging Face Transformers + +--- + +## Weekly Breakdown + +### Week 1: Research Foundation and Architecture Setup +**Focus**: Establish research base and architectural foundations. 
+ +#### PR 1: Research Literature Review Update +**Description**: Expand the research foundation in `location_proximity/RESEARCH.md` with additional literature on affective geography and semantic similarity. Add references to recent papers on emotion-location associations in mental health recovery. +**Dependencies**: None +**Deliverables**: +- Updated `location_proximity/RESEARCH.md` with 5+ new citations +- Summary of key findings in `docs/research_summary.md` + +#### PR 2: Architecture Documentation Refinement +**Description**: Refine `ARCHITECTURE.md` to include detailed MongoDB schema designs for location-emotion data storage. Update Mermaid diagrams to reflect MongoDB integration points. +**Dependencies**: PR 1 +**Deliverables**: +- Enhanced `ARCHITECTURE.md` with MongoDB-specific sections +- New schema diagrams for location and emotion collections + +#### PR 3: Database Schema Implementation +**Description**: Implement MongoDB schemas in `dreamsApp/app/models.py` for storing location proximity data and emotion-location mappings. Ensure compatibility with existing post schema. +**Dependencies**: PR 2 +**Deliverables**: +- Updated `dreamsApp/app/models.py` with new MongoDB collections +- Migration scripts for schema updates + +### Week 2: Core Location Proximity Enhancements +**Focus**: Strengthen core proximity calculation modules. + +#### PR 4: Enhanced EXIF Location Extraction +**Status**: **Completed by PR #77** (kunal-595) - EXIF extractor already implemented in `dreamsApp/exif_extractor.py` +**Description**: ~~Improve `dreamsApp/exif_extractor.py`~~ Integration with existing EXIF extractor to ensure compatibility with multi-dimensional proximity module. 
+**Dependencies**: PR 3 +**Deliverables**: +- ~~Enhanced `dreamsApp/exif_extractor.py` with better error handling~~ Already exists +- Integration tests with existing `EXIFExtractor` class + +#### PR 5: Multi-Dimensional Proximity Calculator Refinement +**Description**: Refine `location_proximity/proximity_calculator.py` to optimize weighted proximity calculations and add configurable dimension weights. +**Dependencies**: PR 4 +**Deliverables**: +- Updated `location_proximity/proximity_calculator.py` +- Performance benchmarks for proximity calculations + +#### PR 6: Emotion-Location Mapper Implementation +**Description**: Complete implementation of `location_proximity/emotion_location_mapper.py` with methods for temporal emotion trends and hotspot identification. +**Dependencies**: PR 5 +**Deliverables**: +- Functional `location_proximity/emotion_location_mapper.py` +- Integration with sentiment analysis from `dreamsApp/app/utils/sentiment.py` + +### Week 3: Semantic Clustering and Interface Development +**Focus**: Implement clustering algorithms and initial API interfaces. + +#### PR 7: Semantic Clustering Enhancements +**Description**: Enhance `location_proximity/semantic_clustering.py` with improved DBSCAN parameters and add visualization support for clusters. +**Dependencies**: PR 6 +**Deliverables**: +- Updated `location_proximity/semantic_clustering.py` +- Clustering quality metrics implementation + +#### PR 8: REST API Endpoints for Location Analysis +**Description**: Add new API endpoints in `dreamsApp/app/ingestion/routes.py` for proximity calculations and location-emotion queries. +**Dependencies**: PR 7 +**Deliverables**: +- New routes in `dreamsApp/app/ingestion/routes.py` +- API documentation updates + +#### PR 9: Dashboard UI Components for Location Proximity +**Description**: Create new dashboard templates in `dreamsApp/app/templates/dashboard/` for visualizing location proximity patterns. 
+**Dependencies**: PR 8 +**Deliverables**: +- New HTML templates for location analysis +- Basic JavaScript for map visualizations + +### Week 4: Emotion Analysis Integration and Testing +**Focus**: Integrate emotion analysis and begin comprehensive testing. + +#### PR 10: Sentiment Analysis Integration with Locations +**Description**: Integrate emotion analysis from `dreamsApp/app/utils/sentiment.py` with location data in the ingestion pipeline. +**Dependencies**: PR 9 +**Deliverables**: +- Updated ingestion routes with emotion-location mapping +- Data flow integration in `dreamsApp/app/ingestion/routes.py` + +#### PR 11: Unit Tests for Location Proximity Modules +**Description**: Create comprehensive unit tests for all location proximity components in `tests/test_location_proximity.py`. +**Dependencies**: PR 10 +**Deliverables**: +- Complete test suite in `tests/test_location_proximity.py` +- Test data fixtures for locations and emotions + +#### PR 12: Integration Tests for Location-Emotion Pipeline +**Description**: Develop integration tests covering the full pipeline from image upload to emotion-location analysis. +**Dependencies**: PR 11 +**Deliverables**: +- New integration test file `tests/test_location_emotion_integration.py` +- End-to-end test scenarios + +### Week 5: Performance Optimization and Documentation +**Focus**: Optimize performance and enhance documentation. + +#### PR 13: Performance Optimization for Proximity Calculations +**Description**: Implement caching and batch processing optimizations for proximity calculations in the Flask app. +**Dependencies**: PR 12 +**Deliverables**: +- Caching layer in `dreamsApp/app/utils/` +- Performance improvements documentation + +#### PR 14: Comprehensive Documentation Updates +**Description**: Update all README files and create user guides for location proximity features. 
+**Dependencies**: PR 13 +**Deliverables**: +- Updated `README.md`, `location_proximity/README.md` +- User guide in `docs/location_proximity_guide.md` + +#### PR 15: Demo Script and Example Improvements +**Description**: Enhance the demo script in `location_proximity/demo.py` with more comprehensive examples and better output formatting. +**Dependencies**: PR 14 +**Deliverables**: +- Improved `location_proximity/demo.py` +- Sample data for demonstrations + +### Week 6: Advanced Features and Validation +**Focus**: Implement advanced features and validation metrics. + +#### PR 16: Validation Metrics and Statistical Analysis +**Description**: Implement validation metrics from the research foundation, including clustering quality and emotion prediction accuracy. +**Dependencies**: PR 15 +**Deliverables**: +- New module `location_proximity/validation_metrics.py` +- Statistical analysis functions + +#### PR 17: Cross-User Location-Emotion Analysis +**Description**: Add features for analyzing location-emotion patterns across multiple users while maintaining privacy. +**Dependencies**: PR 16 +**Deliverables**: +- Cross-user analysis functions in `location_proximity/emotion_location_mapper.py` +- Privacy-preserving aggregation methods + +### Week 7: Final Integration and Proposal Alignment +**Focus**: Finalize integrations and ensure proposal compliance. + +#### PR 18: Final Proposal Alignment and Integration Testing +**Description**: Conduct final review to ensure all contributions align with GSoC 2026 proposal requirements. Perform comprehensive integration testing. 
+**Dependencies**: PR 17 +**Deliverables**: +- Proposal alignment checklist +- Final integration test results +- Updated project roadmap + +--- + +## Dependencies and Prerequisites + +- **Technical Prerequisites**: Python 3.8+, Flask, MongoDB, scikit-learn, Pillow +- **Project Knowledge**: Familiarity with DREAMS architecture and existing modules +- **Testing Environment**: Access to test MongoDB instance and sample image data + +## Success Metrics + +- All 18 PRs merged successfully +- 90%+ test coverage for new location proximity code +- Performance benchmarks meeting requirements +- Documentation completeness for all new features +- Alignment with GSoC proposal objectives + +## Risk Mitigation + +- Weekly code reviews to catch integration issues early +- Incremental testing to ensure stability +- Regular alignment checks with project mentors +- Backup plans for complex PRs with multiple dependencies + +--- + +**Plan Created**: December 2025 +**Total Estimated Effort**: 18 PRs across 7 weeks +**Primary Contributor**: Krishan (Pre-GSoC Contributor) \ No newline at end of file diff --git a/tests/test_exif_extraction.py b/tests/test_exif_extraction.py new file mode 100644 index 0000000..47f10e3 --- /dev/null +++ b/tests/test_exif_extraction.py @@ -0,0 +1,70 @@ +"""Tests for EXIF extraction functionality.""" + +import pytest +from unittest.mock import patch, mock_open +from dreamsApp.exif_extractor import EXIFExtractor + +class TestEXIFExtractor: + + def setup_method(self): + self.extractor = EXIFExtractor() + + def test_empty_metadata_structure(self): + """Test empty metadata has correct structure.""" + empty = self.extractor._empty_metadata() + + assert "timestamp" in empty + assert "location" in empty + assert "camera" in empty + assert "processing" in empty + assert empty["location"]["accuracy"] == "none" + + @patch('exifread.process_file') + @patch('builtins.open', new_callable=mock_open) + def test_exifread_extraction(self, mock_file, mock_process): + """Test 
 successful exifread extraction.""" + mock_process.return_value = { + 'EXIF DateTime': type('Tag', (), {'__str__': lambda self: '2024:01:15 14:30:00'})(), + 'GPS GPSLatitude': type('Tag', (), {'values': [61, 13, 4.68], '__str__': lambda self: '[61, 13, 4.68]'})(), + 'GPS GPSLatitudeRef': type('Tag', (), {'values': 'N', '__str__': lambda self: 'N'})(), + 'Image Make': type('Tag', (), {'__str__': lambda self: 'Apple'})() + } + + result = self.extractor._extract_exifread('test.jpg') + + assert result["processing"]["exif_source"] == "exifread" + assert result["timestamp"] == "2024-01-15T14:30:00" + assert result["camera"]["make"] == "Apple" + + def test_gps_coordinate_conversion(self): + """Test GPS coordinate conversion to decimal.""" + tags = { + 'GPS GPSLatitude': type('Tag', (), {'values': [61, 13, 4.68], '__str__': lambda self: '[61, 13, 4.68]'})(), + 'GPS GPSLatitudeRef': type('Tag', (), {'values': 'N', '__str__': lambda self: 'N'})() + } + + coord = self.extractor._get_gps_coordinate(tags, 'GPS GPSLatitude', 'GPS GPSLatitudeRef') + + assert coord is not None + assert abs(coord - 61.2180) < 0.001 # Approximate check + + def test_missing_gps_data(self): + """Test handling of missing GPS data.""" + tags = {} + + coord = self.extractor._get_gps_coordinate(tags, 'GPS GPSLatitude', 'GPS GPSLatitudeRef') + + assert coord is None + + @patch('dreamsApp.exif_extractor.EXIFExtractor._extract_exifread') + @patch('dreamsApp.exif_extractor.EXIFExtractor._extract_pillow') + def test_fallback_strategy(self, mock_pillow, mock_exifread): + """Test fallback from exifread to Pillow.""" + mock_exifread.side_effect = Exception("exifread failed") + mock_pillow.return_value = {"processing": {"exif_source": "pillow"}} + + result = self.extractor.extract_metadata('test.jpg') + + assert result["processing"]["exif_source"] == "pillow" + mock_exifread.assert_called_once() + mock_pillow.assert_called_once() \ No newline at end of file From 80837b1fb061c718e905aa069ace8fbbbc37a156 Mon Sep 17 00:00:00 2001 From: Krishan Yadav Date: Sun, 8 Feb 2026 12:05:44 +0530 Subject: [PATCH 2/3] Update 
exif_extractor.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- dreamsApp/exif_extractor.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/dreamsApp/exif_extractor.py b/dreamsApp/exif_extractor.py index 132c589..69d6cf2 100644 --- a/dreamsApp/exif_extractor.py +++ b/dreamsApp/exif_extractor.py @@ -67,26 +67,24 @@ def _get_gps_coordinate(self, tags, coord_key, ref_key): if not coord or not ref: return None - coord_str = str(coord) - ref_str = str(ref) - - # Parse coordinate string format - parts = coord_str.replace('[', '').replace(']', '').split(', ') - if len(parts) != 3: - return None - try: - degrees = float(parts[0]) - minutes = float(parts[1]) - seconds = float(parts[2]) - + # Directly access the 'values' attribute which contains a list of Ratio objects. + # This is more robust than parsing the string representation. + if not hasattr(coord, 'values') or len(coord.values) != 3: + return None + + degrees = float(coord.values[0]) + minutes = float(coord.values[1]) + seconds = float(coord.values[2]) + decimal = degrees + (minutes / 60.0) + (seconds / 3600.0) - - if ref_str in ['S', 'W']: + + if hasattr(ref, 'values') and str(ref.values) in ['S', 'W']: decimal = -decimal return decimal - except (ValueError, IndexError): + except (ValueError, IndexError, TypeError) as e: + logger.warning(f"Could not parse GPS coordinate: {coord}. 
Error: {e}") return None def _parse_timestamp(self, tags): From 9db0ecbb7c850893c183f8ccdde684a34798419a Mon Sep 17 00:00:00 2001 From: Krishan Yadav Date: Sun, 8 Feb 2026 12:05:54 +0530 Subject: [PATCH 3/3] Update exif_extractor.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- dreamsApp/exif_extractor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dreamsApp/exif_extractor.py b/dreamsApp/exif_extractor.py index 69d6cf2..2593d21 100644 --- a/dreamsApp/exif_extractor.py +++ b/dreamsApp/exif_extractor.py @@ -154,9 +154,11 @@ def get_coordinate(coord, ref): def _parse_camera_pillow(self, exif): """Parse camera info from Pillow exif data.""" + make_val = exif.get(271, "") + model_val = exif.get(272, "") return { - "make": exif.get(271, "").decode('utf-8') if isinstance(exif.get(271), bytes) else str(exif.get(271, "")), - "model": exif.get(272, "").decode('utf-8') if isinstance(exif.get(272), bytes) else str(exif.get(272, "")) + "make": make_val.decode('utf-8') if isinstance(make_val, bytes) else str(make_val), + "model": model_val.decode('utf-8') if isinstance(model_val, bytes) else str(model_val) } def _empty_metadata(self):