diff --git a/.gitignore b/.gitignore index 71ef3ad..ba30730 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,5 @@ src/lib/services/initFirebase.js *.log /prescraped-data-*/ /prescraped-*.json -/scraping-data/ \ No newline at end of file +# Test files with secrets +test-*.js diff --git a/documentation/CACHE_STRATEGY_README.md b/CACHE_STRATEGY_README.md similarity index 100% rename from documentation/CACHE_STRATEGY_README.md rename to CACHE_STRATEGY_README.md diff --git a/documentation/CACHE_TESTING_README.md b/CACHE_TESTING_README.md similarity index 100% rename from documentation/CACHE_TESTING_README.md rename to CACHE_TESTING_README.md diff --git a/documentation/FIRESTORE_UPLOAD_README.md b/FIRESTORE_UPLOAD_README.md similarity index 100% rename from documentation/FIRESTORE_UPLOAD_README.md rename to FIRESTORE_UPLOAD_README.md diff --git a/documentation/FIX_NULL_LYRICS_README.md b/FIX_NULL_LYRICS_README.md similarity index 89% rename from documentation/FIX_NULL_LYRICS_README.md rename to FIX_NULL_LYRICS_README.md index b2583d3..29c3d0f 100644 --- a/documentation/FIX_NULL_LYRICS_README.md +++ b/FIX_NULL_LYRICS_README.md @@ -40,17 +40,17 @@ npm run fix-lyrics-cached 1. **Fix songs for a specific artist** (FASTEST): ```bash -node scripts/fix-null-lyrics.js --artist grace-petrie --dry-run +node fix-null-lyrics.js --artist grace-petrie --dry-run ``` 2. **Fix only cached songs** (FAST): ```bash -node scripts/fix-null-lyrics.js --check-cached-only --dry-run +node fix-null-lyrics.js --check-cached-only --dry-run ``` 3. 
**Scan limited songs** (SLOW): ```bash -node scripts/fix-null-lyrics.js --max-songs 100 --dry-run +node fix-null-lyrics.js --max-songs 100 --dry-run ``` ### Filter by Artist (Recommended) @@ -58,12 +58,12 @@ node scripts/fix-null-lyrics.js --max-songs 100 --dry-run Fix only songs from a specific artist - this is the fastest method: ```bash # Dry run - the script will search for the artist automatically -node scripts/fix-null-lyrics.js --artist "grace petrie" --dry-run -node scripts/fix-null-lyrics.js --artist "Grace Petrie" --dry-run -node scripts/fix-null-lyrics.js --artist grace-petrie --dry-run +node fix-null-lyrics.js --artist "grace petrie" --dry-run +node fix-null-lyrics.js --artist "Grace Petrie" --dry-run +node fix-null-lyrics.js --artist grace-petrie --dry-run # Actually fix -node scripts/fix-null-lyrics.js --artist "kendrick lamar" +node fix-null-lyrics.js --artist "kendrick lamar" ``` **The script now smartly searches for artists!** You can use: @@ -82,7 +82,7 @@ This method: Only process songs that are in artists' `cachedSongIds` arrays: ```bash -node scripts/fix-null-lyrics.js --check-cached-only +node fix-null-lyrics.js --check-cached-only ``` **This is the recommended approach** as it: @@ -95,12 +95,12 @@ node scripts/fix-null-lyrics.js --check-cached-only Process songs in smaller batches: ```bash -node scripts/fix-null-lyrics.js --batch-size 5 +node fix-null-lyrics.js --batch-size 5 ``` Limit total number of songs to process: ```bash -node scripts/fix-null-lyrics.js --max-songs 50 +node fix-null-lyrics.js --max-songs 50 ``` **Note:** When scanning ALL songs (without `--artist` or `--check-cached-only`), the script defaults to a maximum of 10,000 songs to prevent runaway scans. Use `--max-songs` to adjust this limit. 
@@ -109,14 +109,14 @@ node scripts/fix-null-lyrics.js --max-songs 50 See detailed information about each song: ```bash -node scripts/fix-null-lyrics.js --verbose +node fix-null-lyrics.js --verbose ``` ### Combine Options ```bash -node scripts/fix-null-lyrics.js --artist baby-jey --dry-run --verbose -node scripts/fix-null-lyrics.js --check-cached-only --batch-size 3 --max-songs 20 +node fix-null-lyrics.js --artist baby-jey --dry-run --verbose +node fix-null-lyrics.js --check-cached-only --batch-size 3 --max-songs 20 ``` ## What It Does @@ -279,9 +279,9 @@ For permanently failed songs: **Solutions:** 1. Try different name formats: ```bash - node scripts/fix-null-lyrics.js --artist "grace petrie" --dry-run - node scripts/fix-null-lyrics.js --artist "Grace Petrie" --dry-run - node scripts/fix-null-lyrics.js --artist grace-petrie --dry-run + node fix-null-lyrics.js --artist "grace petrie" --dry-run + node fix-null-lyrics.js --artist "Grace Petrie" --dry-run + node fix-null-lyrics.js --artist grace-petrie --dry-run ``` 2. The script will search using: @@ -299,7 +299,7 @@ For permanently failed songs: **Solution:** Use the `--check-cached-only` flag: ```bash -node scripts/fix-null-lyrics.js --check-cached-only --dry-run +node fix-null-lyrics.js --check-cached-only --dry-run ``` This targets only songs in `cachedSongIds` arrays (songs that should have lyrics) and fetches them one at a time instead of all at once, avoiding timeouts. @@ -326,7 +326,7 @@ If you see HTTP 429 errors, the script is hitting Genius too fast. Try: ### Firebase Connection Issues -The script uses the same Firebase configuration as your other scripts (`scripts/firebase-uploader.js`, etc.). If those work, this will too! +The script uses the same Firebase configuration as your other scripts (`firebase-uploader.js`, etc.). If those work, this will too! If you encounter connection issues: 1. 
Check that your Firebase config in `src/lib/services/initFirebase.js` is correct diff --git a/documentation/IMAGE_RENDERING_FIXES.md b/IMAGE_RENDERING_FIXES.md similarity index 100% rename from documentation/IMAGE_RENDERING_FIXES.md rename to IMAGE_RENDERING_FIXES.md diff --git a/documentation/IMAGE_TEST_TOOL_README.md b/IMAGE_TEST_TOOL_README.md similarity index 100% rename from documentation/IMAGE_TEST_TOOL_README.md rename to IMAGE_TEST_TOOL_README.md diff --git a/Most Popular J Artists on Genius.html b/Most Popular J Artists on Genius.html new file mode 100644 index 0000000..ffba41f --- /dev/null +++ b/Most Popular J Artists on Genius.html @@ -0,0 +1,4006 @@ + + + + + Most Popular J Artists on Genius + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+ GENIUS +
+ + + +
+ + Sign Up + + +
+
+ +
+ + + + + + + + +
+ + + + +
+
+ + + + + +

Most Popular J Artists on Genius

+ + + + + + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
\ No newline at end of file diff --git a/documentation/NULL_LYRICS_HANDLING.md b/NULL_LYRICS_HANDLING.md similarity index 96% rename from documentation/NULL_LYRICS_HANDLING.md rename to NULL_LYRICS_HANDLING.md index 9a40b69..c8f915d 100644 --- a/documentation/NULL_LYRICS_HANDLING.md +++ b/NULL_LYRICS_HANDLING.md @@ -99,23 +99,23 @@ This prevents them from being treated as "cached" in the future and cluttering t A bulk scraper utility has been created to proactively fix all songs with null lyrics across your entire database! ### Location -- **Script**: `scripts/fix-null-lyrics.js` +- **Script**: `fix-null-lyrics.js` - **Documentation**: `FIX_NULL_LYRICS_README.md` ### Quick Start ```bash # Dry run to see what would be fixed -node scripts/fix-null-lyrics.js --dry-run +node fix-null-lyrics.js --dry-run # Fix all songs with null lyrics -node scripts/fix-null-lyrics.js +node fix-null-lyrics.js # Fix only songs from a specific artist -node scripts/fix-null-lyrics.js --artist baby-jey +node fix-null-lyrics.js --artist baby-jey # Fix only songs that are supposed to be cached -node scripts/fix-null-lyrics.js --check-cached-only +node fix-null-lyrics.js --check-cached-only ``` ### Features diff --git a/documentation/PRECISE_LYRICS_SCRAPER_SUMMARY.md b/PRECISE_LYRICS_SCRAPER_SUMMARY.md similarity index 100% rename from documentation/PRECISE_LYRICS_SCRAPER_SUMMARY.md rename to PRECISE_LYRICS_SCRAPER_SUMMARY.md diff --git a/documentation/PRESCRAPER_README.md b/PRESCRAPER_README.md similarity index 92% rename from documentation/PRESCRAPER_README.md rename to PRESCRAPER_README.md index 8f67606..bd2ca4c 100644 --- a/documentation/PRESCRAPER_README.md +++ b/PRESCRAPER_README.md @@ -6,14 +6,14 @@ A comprehensive bulk scraping and upload system for LyricType that fetches artis The prescraper system consists of two main components: -1. **`scripts/prescraper.js`** - Scrapes artist songs and lyrics from Genius API -2. 
**`scripts/firebase-uploader.js`** - Uploads prescraped data to Firebase Firestore +1. **`prescraper.js`** - Scrapes artist songs and lyrics from Genius API +2. **`firebase-uploader.js`** - Uploads prescraped data to Firebase Firestore This system allows you to bulk-populate your database with artist data, song metadata, and lyrics for a better user experience. ## Features -### 🚀 Prescraper (`scripts/prescraper.js`) +### 🚀 Prescraper (`prescraper.js`) - ✅ Loads artists from existing `genius-artists-*.json` files - ✅ Fetches complete song lists for each artist (up to 1000 songs) - ✅ Scrapes lyrics for configurable number of top songs per artist @@ -24,7 +24,7 @@ This system allows you to bulk-populate your database with artist data, song met - ✅ Detailed logging and statistics - ✅ Configurable via CLI arguments -### 🔥 Firebase Uploader (`scripts/firebase-uploader.js`) +### 🔥 Firebase Uploader (`firebase-uploader.js`) - ✅ Uploads artists, songs, and lyrics to Firestore - ✅ Batch operations for efficiency - ✅ Duplicate detection and skip existing data @@ -81,7 +81,7 @@ Either: #### Basic Usage ```bash # Scrape 10 songs per artist for all letters -node scripts/prescraper.js +node prescraper.js # Or use npm script npm start @@ -90,13 +90,13 @@ npm start #### Advanced Options ```bash # Test with limited data -node scripts/prescraper.js --test 5 --letters a,b --songs 3 +node prescraper.js --test 5 --letters a,b --songs 3 # Scrape specific letters only -node scripts/prescraper.js --letters j,k,l --songs 15 +node prescraper.js --letters j,k,l --songs 15 # Help -node scripts/prescraper.js --help +node prescraper.js --help ``` #### CLI Options @@ -110,7 +110,7 @@ node scripts/prescraper.js --help #### Basic Usage ```bash # Upload latest prescraped data -node scripts/firebase-uploader.js +node firebase-uploader.js # Or use npm script npm run upload @@ -119,17 +119,17 @@ npm run upload #### Advanced Options ```bash # Dry run (test without uploading) -node 
scripts/firebase-uploader.js --dry-run +node firebase-uploader.js --dry-run npm run upload-dry # Upload specific directory -node scripts/firebase-uploader.js --dir ./prescraped-data-2025-09-14/ +node firebase-uploader.js --dir ./prescraped-data-2025-09-14/ # Force overwrite existing data -node scripts/firebase-uploader.js --force +node firebase-uploader.js --force # Help -node scripts/firebase-uploader.js --help +node firebase-uploader.js --help ``` #### CLI Options @@ -334,10 +334,10 @@ The prescraper creates partial files as it works. To resume: The uploader skips existing artists by default: ```bash # Skip existing data (default) -node scripts/firebase-uploader.js +node firebase-uploader.js # Or force overwrite -node scripts/firebase-uploader.js --force +node firebase-uploader.js --force ``` ## Advanced Configuration diff --git a/documentation/QUEUE_INTEGRATION_SUMMARY.md b/QUEUE_INTEGRATION_SUMMARY.md similarity index 100% rename from documentation/QUEUE_INTEGRATION_SUMMARY.md rename to QUEUE_INTEGRATION_SUMMARY.md diff --git a/documentation/QUICK_START_NULL_LYRICS.md b/QUICK_START_NULL_LYRICS.md similarity index 91% rename from documentation/QUICK_START_NULL_LYRICS.md rename to QUICK_START_NULL_LYRICS.md index e8d38f9..685ec01 100644 --- a/documentation/QUICK_START_NULL_LYRICS.md +++ b/QUICK_START_NULL_LYRICS.md @@ -54,13 +54,13 @@ Fix null lyrics for a single artist (much faster than scanning all songs): ```bash # Dry run for specific artist - try any of these formats! 
-node scripts/fix-null-lyrics.js --artist "grace petrie" --dry-run -node scripts/fix-null-lyrics.js --artist "Grace Petrie" --dry-run -node scripts/fix-null-lyrics.js --artist grace-petrie --dry-run +node fix-null-lyrics.js --artist "grace petrie" --dry-run +node fix-null-lyrics.js --artist "Grace Petrie" --dry-run +node fix-null-lyrics.js --artist grace-petrie --dry-run # Fix for specific artist -node scripts/fix-null-lyrics.js --artist "kendrick lamar" -node scripts/fix-null-lyrics.js --artist "Kendrick Lamar" +node fix-null-lyrics.js --artist "kendrick lamar" +node fix-null-lyrics.js --artist "Kendrick Lamar" ``` **Smart artist search!** The script will find artists using: @@ -79,22 +79,22 @@ The artist filter: Process songs in smaller batches (slower but safer): ```bash -node scripts/fix-null-lyrics.js --batch-size 3 +node fix-null-lyrics.js --batch-size 3 ``` Limit how many songs to process: ```bash -node scripts/fix-null-lyrics.js --max-songs 20 +node fix-null-lyrics.js --max-songs 20 ``` See detailed info about each song: ```bash -node scripts/fix-null-lyrics.js --verbose +node fix-null-lyrics.js --verbose ``` Combine options: ```bash -node scripts/fix-null-lyrics.js --artist drake --batch-size 5 --verbose +node fix-null-lyrics.js --artist drake --batch-size 5 --verbose ``` ## How the Automatic System Works diff --git a/documentation/SCRAPER_README.md b/SCRAPER_README.md similarity index 96% rename from documentation/SCRAPER_README.md rename to SCRAPER_README.md index 3ec8c6b..a8873ce 100644 --- a/documentation/SCRAPER_README.md +++ b/SCRAPER_README.md @@ -25,25 +25,25 @@ A Node.js script to scrape artist links from Genius.com artist index pages. 
#### Single Letter Scraping ```bash # Scrape artists for letter 'j' with IDs (default, slower) -node scripts/genius-scraper.js +node genius-scraper.js # Scrape artists for a specific letter with IDs -node scripts/genius-scraper.js a -node scripts/genius-scraper.js k -node scripts/genius-scraper.js z +node genius-scraper.js a +node genius-scraper.js k +node genius-scraper.js z # Fast mode: Skip ID extraction for quicker results -node scripts/genius-scraper.js j --no-ids -node scripts/genius-scraper.js a --no-ids +node genius-scraper.js j --no-ids +node genius-scraper.js a --no-ids ``` #### Bulk Scraping (All Letters A-Z) ```bash # Scrape ALL letters with IDs (very slow - several hours!) -node scripts/genius-scraper.js all +node genius-scraper.js all # Bulk scrape ALL letters without IDs (much faster - ~30 minutes) -node scripts/genius-scraper.js all --no-ids +node genius-scraper.js all --no-ids ``` ### Example Output (with ID extraction) diff --git a/documentation/SCRAPING_TEST_RESULTS.md b/SCRAPING_TEST_RESULTS.md similarity index 100% rename from documentation/SCRAPING_TEST_RESULTS.md rename to SCRAPING_TEST_RESULTS.md diff --git a/SSR_REMOVAL_PLAN.md b/SSR_REMOVAL_PLAN.md new file mode 100644 index 0000000..9e8a76f --- /dev/null +++ b/SSR_REMOVAL_PLAN.md @@ -0,0 +1,289 @@ +# SSR Removal & Image Processing Optimization Plan + +## Overview +This document outlines the comprehensive plan to remove Server-Side Rendering (SSR) from LyricType and implement an optimized image processing system using server-side dithering with client-side WebGL color mapping. 
+ +## Current State Analysis + +### Current Architecture +- **SSR Function**: Handles all `/api/**` routes including image proxying +- **Image Processing**: Client-side dithering using Canvas API +- **Function Invocations**: Every image load = 1 function call +- **Cost Structure**: High due to repeated processing of same images + +### Current Image Flow +``` +Genius Image URL → Firebase Function Proxy → Client Download → Client Dithering → Display +``` + +### Identified Problems +1. **High Function Costs**: Every image request hits Firebase Functions +2. **Repeated Processing**: Same images dithered multiple times +3. **Performance**: Client-side dithering blocks UI thread +4. **Scalability**: Processing cost scales linearly with users +5. **Network**: Full color images downloaded for binary output + +## Target Architecture + +### New Image Flow +``` +Genius Image URL → [One-time] Server Dither → Binary Storage → Client WebGL Coloring → Display +``` + +### Key Principles +1. **Process Once, Use Forever**: Dither images server-side once +2. **Store Binary Data**: Only 1-bit per pixel needed +3. **Client Coloring**: Real-time theme application via WebGL +4. **Theme Decoupling**: Backend agnostic to frontend themes +5. 
**Graceful Fallback**: Maintain compatibility during transition + +## Implementation Phases + +### Phase 1: Foundation & Testing (Week 1) +#### 1.1 Binary Format Implementation +- [ ] Modify image proxy to return binary dithered data +- [ ] Implement server-side Atkinson dithering algorithm +- [ ] Add binary data logging for verification +- [ ] Test binary format compression ratios + +#### 1.2 WebGL Renderer Development +- [ ] Create WebGL shader for binary→color mapping +- [ ] Implement fallback for WebGL-unsupported browsers +- [ ] Performance testing and optimization +- [ ] Integration with existing component architecture + +#### 1.3 Testing Infrastructure +- [ ] Unit tests for binary conversion +- [ ] Visual regression tests for dithering accuracy +- [ ] Performance benchmarks (WebGL vs Canvas) +- [ ] Browser compatibility testing + +### Phase 2: Storage & Caching (Week 2) +#### 2.1 Database Schema Design +```javascript +// Firestore document structure +{ + imageId: "hash_of_original_url", + originalUrl: "https://genius.com/...", + binaryData: "compressed_binary_string", // or blob reference + width: 200, + height: 200, + processedAt: timestamp, + compressionFormat: "gzip" | "lz4" | "custom", + processingVersion: "1.0" // for future algorithm updates +} +``` + +#### 2.2 Caching Strategy +- [ ] Implement cache-first lookup in client +- [ ] Add cache warming for popular artists +- [ ] Implement cache invalidation strategy +- [ ] Add metrics for cache hit rates + +#### 2.3 Background Processing +- [ ] Create background function for batch processing +- [ ] Implement queue system for new image processing +- [ ] Add retry logic for failed processing +- [ ] Create admin tools for cache management + +### Phase 3: Migration & Optimization (Week 3) +#### 3.1 Gradual Migration +- [ ] Implement feature flag for new vs old system +- [ ] A/B testing infrastructure +- [ ] User preference storage +- [ ] Rollback mechanisms + +#### 3.2 Performance Optimization +- [ ] Implement 
image prefetching for popular artists +- [ ] Add service worker caching +- [ ] Optimize WebGL shader performance +- [ ] Implement lazy loading for large artist lists + +#### 3.3 SSR Removal +- [ ] Audit all SSR usage points +- [ ] Migrate remaining functionality to client-side +- [ ] Update Firebase hosting configuration +- [ ] Remove SSR function and dependencies + +### Phase 4: Production & Monitoring (Week 4) +#### 4.1 Production Deployment +- [ ] Blue-green deployment strategy +- [ ] Production monitoring setup +- [ ] Error tracking and alerting +- [ ] Performance monitoring dashboard + +#### 4.2 Cost Analysis +- [ ] Function invocation tracking +- [ ] Storage cost monitoring +- [ ] Performance metrics collection +- [ ] ROI calculation and reporting + +## Technical Specifications + +### Binary Format Design +```javascript +// Proposed binary format +{ + header: { + width: uint16, // 2 bytes + height: uint16, // 2 bytes + version: uint8, // 1 byte + compression: uint8, // 1 byte + checksum: uint32 // 4 bytes + }, + data: compressed_binary_array // 1 bit per pixel, compressed +} +``` + +### WebGL Shader Specifications +```glsl +// Vertex Shader +attribute vec2 a_position; +attribute vec2 a_texCoord; +varying vec2 v_texCoord; + +void main() { + gl_Position = vec4(a_position, 0.0, 1.0); + v_texCoord = a_texCoord; +} + +// Fragment Shader +precision mediump float; +uniform vec3 u_primaryColor; +uniform vec3 u_secondaryColor; +uniform sampler2D u_texture; +varying vec2 v_texCoord; + +void main() { + float value = texture2D(u_texture, v_texCoord).r; + vec3 color = mix(u_primaryColor, u_secondaryColor, value); + gl_FragColor = vec4(color, 1.0); +} +``` + +### API Design +```javascript +// New image service API +class ImageService { + async getDitheredImage(originalUrl, options = {}) { + // 1. Check cache first + // 2. Fallback to processing + // 3. 
Return binary data + } + + renderWithTheme(binaryData, primaryColor, secondaryColor) { + // WebGL rendering with theme colors + } + + prefetchImages(urls) { + // Background prefetching + } +} +``` + +## Performance Targets + +### Function Invocation Reduction +- **Current**: 1 invocation per image load +- **Target**: 1 invocation per unique image (lifetime) +- **Expected Reduction**: 95%+ + +### Image Loading Performance +- **Current**: 500-2000ms (download + dither) +- **Target**: 50-200ms (cache lookup + WebGL render) +- **Expected Improvement**: 5-10x faster + +### Storage Efficiency +- **Current**: ~40KB per 200x200 color image +- **Target**: ~2-5KB per dithered binary image +- **Expected Reduction**: 80-90% + +### Theme Switching +- **Current**: Re-download and re-process all images +- **Target**: Instant WebGL re-rendering +- **Expected Improvement**: Near-instantaneous + +## Risk Assessment + +### High Risk +1. **WebGL Compatibility**: Some older browsers may not support WebGL + - **Mitigation**: Canvas fallback implementation + +2. **Binary Format Changes**: Future algorithm updates may require format changes + - **Mitigation**: Versioned format with migration tools + +### Medium Risk +1. **Storage Costs**: Large number of cached images + - **Mitigation**: Compression and LRU eviction + +2. **Processing Queue Bottlenecks**: High demand for new image processing + - **Mitigation**: Horizontal scaling and priority queues + +### Low Risk +1. 
**Visual Quality Differences**: Minor differences in dithering output + - **Mitigation**: Extensive visual testing and user feedback + +## Success Metrics + +### Primary KPIs +- Function invocation count reduction +- Image loading performance improvement +- User experience metrics (bounce rate, engagement) +- Cost reduction percentage + +### Secondary KPIs +- Cache hit rate +- WebGL vs Canvas usage ratio +- Storage utilization +- Processing queue performance + +## Rollback Strategy + +### Immediate Rollback +- Feature flag to disable new system +- Automatic fallback to existing proxy +- No data loss or corruption risk + +### Gradual Rollback +- Percentage-based traffic routing +- User-specific opt-out mechanism +- Detailed monitoring during transition + +## Post-Implementation Optimizations + +### Future Enhancements +1. **Machine Learning**: Predictive image prefetching based on user behavior +2. **Progressive Enhancement**: Higher quality images for high-DPI displays +3. **Batch Processing**: Bulk image processing for new artist imports +4. **Edge Computing**: Regional processing for global performance +5. **Advanced Compression**: Custom compression algorithms for binary data + +### Monitoring & Analytics +1. **Real-time Dashboards**: Function costs, cache performance, user experience +2. **Automated Alerts**: Performance degradation, error rates, cost spikes +3. 
**A/B Testing Framework**: Continuous optimization and feature testing + +## Timeline Summary + +| Week | Focus Area | Key Deliverables | +|------|------------|------------------| +| 1 | Foundation | Binary format, WebGL renderer, testing | +| 2 | Storage | Database schema, caching, background processing | +| 3 | Migration | Feature flags, optimization, SSR removal | +| 4 | Production | Deployment, monitoring, cost analysis | + +## Dependencies + +### Internal +- Firebase Functions runtime compatibility +- Firestore storage limits and pricing +- SvelteKit client-side architecture + +### External +- Browser WebGL support levels +- Genius.com image availability and formats +- Third-party monitoring and analytics tools + +--- + +This plan provides a comprehensive roadmap for eliminating SSR while dramatically improving performance and reducing costs through intelligent caching and modern web technologies. diff --git a/scripts/add-search-tokens.js b/add-search-tokens.js similarity index 99% rename from scripts/add-search-tokens.js rename to add-search-tokens.js index 8672adf..58017ba 100644 --- a/scripts/add-search-tokens.js +++ b/add-search-tokens.js @@ -1,14 +1,12 @@ #!/usr/bin/env node -//TODO: REMOVE THIS SCRIPT, PUT THE INDIVIDUAL FUNCTIONALITY INTO THE UPLOAD ARTISTS SCRIPT - import fs from 'fs/promises'; import path from 'path'; import { fileURLToPath } from 'url'; import { initializeApp } from 'firebase/app'; import { getFirestore, collection, doc, getDoc, updateDoc, query, orderBy, limit, startAfter, getDocs } from 'firebase/firestore'; import unidecode from 'unidecode'; -import { firebaseConfig } from '../src/lib/services/initFirebase.js'; +import { firebaseConfig } from './src/lib/services/initFirebase.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); diff --git a/scripts/check-database.js b/check-database.js similarity index 100% rename from scripts/check-database.js rename to check-database.js diff --git 
a/cloudflare-image-proxy/package.json b/cloudflare-image-proxy/package.json deleted file mode 100644 index 015ff4a..0000000 --- a/cloudflare-image-proxy/package.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "name": "lyrictype-image-proxy", - "version": "1.0.0", - "description": "Cloudflare Worker to proxy Genius image requests", - "main": "worker.js", - "scripts": { - "deploy": "wrangler deploy", - "dev": "wrangler dev" - }, - "keywords": ["cloudflare", "worker", "proxy", "image"], - "author": "", - "license": "MIT" -} - diff --git a/cloudflare-image-proxy/wrangler.toml b/cloudflare-image-proxy/wrangler.toml deleted file mode 100644 index 96225ae..0000000 --- a/cloudflare-image-proxy/wrangler.toml +++ /dev/null @@ -1,12 +0,0 @@ -name = "lyrictype-image-proxy" -main = "worker.js" -compatibility_date = "2024-01-01" - -# After deploying, set this environment variable in Cloudflare dashboard -# [vars] -# AUTH_KEY = "your-secret-key-here" - -# For production deployment -# [[env.production]] -# name = "lyrictype-image-proxy" - diff --git a/documentation/ARTIST_UPDATE_SYSTEM_PLAN.md b/documentation/ARTIST_UPDATE_SYSTEM_PLAN.md deleted file mode 100644 index 3f7247d..0000000 --- a/documentation/ARTIST_UPDATE_SYSTEM_PLAN.md +++ /dev/null @@ -1,936 +0,0 @@ -# Artist Update System - Implementation Plan - -## 📋 Overview - -A comprehensive system for periodically updating the artist database by: -1. Scraping the latest artist list from Genius -2. Identifying new artists not in our database -3. Prescraping song data for only new artists -4. Manually uploading new data with popular flag updates - -**Run Frequency:** ~Once per month (manual execution) - ---- - -## 🗂️ Data Organization Structure - -All scraped data will be stored locally under `scraping-data/` with timestamp-based organization: - -``` -scraping-data/ -├── artist-lists/ -│ └── 2026-01-04-18-30/ -│ ├── artists-0.json # Numbers/symbols -│ ├── artists-a.json -│ ├── artists-b.json -│ ├── ... 
-│ ├── artists-z.json -│ ├── summary.json # Totals, timestamp, metadata -│ └── .complete # Marker file indicating completion -│ -├── new-artists/ -│ └── 2026-01-04-18-30/ -│ ├── new-artists-0.json # Filtered: only new artists -│ ├── new-artists-a.json -│ ├── ... -│ ├── new-artists-z.json -│ ├── comparison-report.json # Details on what's new vs existing -│ └── .complete -│ -└── song-data/ - └── 2026-01-04-18-30/ - ├── songs-0.json # Prescraped songs for new artists - ├── songs-a.json - ├── ... - ├── songs-z.json - ├── scraping-summary.json # Stats on lyrics scraped - └── .complete -``` - ---- - -## 🔄 Complete Workflow - -### **Phase 1: Scrape Artist Lists** -Fetch current artist data from Genius for all letters. - -**Script:** `scripts/scrape-artists.js` - -**Input:** None -**Output:** `scraping-data/artist-lists/{timestamp}/` - -**Data Structure (per file):** -```json -{ - "letter": "a", - "scrapedAt": "2026-01-04T18:30:00.000Z", - "totalArtists": 1245, - "artists": { - "popular": [ - { - "name": "Artist Name", - "url": "https://genius.com/artists/Artist-name", - "id": "123456", - "type": "popular" - } - ], - "regular": [ - { - "name": "Regular Artist", - "url": "https://genius.com/artists/Regular-artist", - "id": "789012", - "type": "regular" - } - ] - } -} -``` - -### **Phase 2: Compare with Database** -Identify which artists are new vs. already in Firestore. 
- -**Script:** `scripts/compare-artists.js` - -**Input:** -- Latest artist list from Phase 1 -- Current Firestore artists collection - -**Output:** `scraping-data/new-artists/{timestamp}/` - -**Comparison Report Structure:** -```json -{ - "timestamp": "2026-01-04T18:30:00.000Z", - "sourceDirectory": "scraping-data/artist-lists/2026-01-04-18-30", - "statistics": { - "totalGeniusArtists": 50000, - "totalFirestoreArtists": 48500, - "newArtists": 1500, - "existingArtists": 48500, - "popularChanges": { - "addedToPopular": 15, - "removedFromPopular": 12, - "unchangedPopular": 505 - } - }, - "perLetter": { - "a": { - "geniusTotal": 2000, - "firestoreTotal": 1950, - "newCount": 50, - "popularInGenius": 20, - "popularInFirestore": 20 - } - }, - "newArtistsByLetter": { - "a": 50, - "b": 45, - "...": "..." - } -} -``` - -**New Artists File Structure (per letter):** -```json -{ - "letter": "a", - "comparisonDate": "2026-01-04T18:30:00.000Z", - "newArtists": [ - { - "name": "New Artist", - "url": "https://genius.com/artists/New-artist", - "id": "999999", - "type": "regular", - "isNew": true, - "reason": "not_in_firestore" - } - ], - "popularUpdates": [ - { - "name": "Existing Artist", - "id": "111111", - "action": "add_popular", - "reason": "now_in_genius_popular_top_20" - }, - { - "name": "Another Artist", - "id": "222222", - "action": "remove_popular", - "reason": "no_longer_in_genius_popular_top_20" - } - ] -} -``` - -### **Phase 3: Prescrape Songs** -Scrape songs and lyrics for ONLY new artists. - -**Script:** `scripts/prescrape-new-artists.js` - -**Input:** -- New artist lists from Phase 2 -- Configuration (songs per artist, delays, etc.) 
- -**Output:** `scraping-data/song-data/{timestamp}/` - -**Song Data Structure (matches current prescraper format):** -```json -{ - "letter": "a", - "scrapedAt": "2026-01-04T19:15:00.000Z", - "artists": [ - { - "name": "New Artist", - "urlKey": "new-artist", - "url": "https://genius.com/artists/New-artist", - "geniusId": "999999", - "totalSongs": 25, - "allSongs": [...], - "scrapedSongs": [...], - "processingStats": { - "totalSongs": 25, - "songsAttempted": 10, - "lyricsScraped": 8, - "lyricsFailed": 2 - } - } - ], - "summary": { - "totalArtists": 50, - "totalSongs": 1250, - "totalLyrics": 890 - } -} -``` - -### **Phase 4: Upload to Database** -Manual step to upload new data after inspection. - -**Script:** `scripts/upload-update.js` - -**Input:** -- Song data from Phase 3 -- Comparison report from Phase 2 - -**Actions:** -1. **Clear all popular flags** from Firestore artists -2. **Upload new artists** with songs, lyrics, and search tokens -3. **Update popular flags** for exactly 20 artists per letter (from Genius) -4. **Update metadata** (lastUpdated timestamps) - -**Output:** Updated Firestore database - ---- - -## 🛠️ Script Details - -### **1. 
scripts/scrape-artists.js** - -**Purpose:** Scrape all artist lists from Genius - -**Features:** -- Scrapes all 27 letters (0, a-z) -- Saves to timestamped directory -- Includes artist IDs (via iOS app link extraction) -- Separates popular vs regular artists -- Creates summary.json with totals -- Creates .complete marker when done -- **TUI with progress bar and real-time statistics** -- Error handling: continues on failures, logs to errors.json - -**CLI Options:** -```bash -node scripts/scrape-artists.js # Full scrape -node scripts/scrape-artists.js --letters j,k # Specific letters only -node scripts/scrape-artists.js --no-ids # Skip ID extraction (faster) -node scripts/scrape-artists.js --output-dir ./custom/path # Custom output -node scripts/scrape-artists.js --quiet # Minimal output (no TUI) -``` - -**Based on:** Current `genius-scraper.js` (refactored) - -**TUI Display:** -- Current letter being scraped -- Artists processed / total artists -- Current artist being processed -- Error counts by type -- Estimated time remaining - ---- - -### **2. 
scripts/compare-artists.js** - -**Purpose:** Compare Genius artists with Firestore to identify new artists - -**Features:** -- Reads latest (or specified) artist list -- Queries Firestore for existing artists -- Identifies new artists not in database -- Detects popular status changes -- Generates filtered lists of only new artists -- Creates detailed comparison report -- **TUI with progress bar for Firestore queries** -- Error handling: continues on failures, logs to errors.json - -**CLI Options:** -```bash -node scripts/compare-artists.js # Use latest artist list -node scripts/compare-artists.js --date 2026-01-04-18-30 # Specific timestamp -node scripts/compare-artists.js --dry-run # Preview only -node scripts/compare-artists.js --quiet # Minimal output -``` - -**Logic:** -```javascript -// Pseudo-code -const geniusArtists = loadArtistLists(timestamp); -const firestoreArtists = await fetchAllFirestoreArtists(); - -const newArtists = geniusArtists.filter(artist => - !firestoreArtists.some(fa => fa.geniusId === artist.id) -); - -const popularUpdates = calculatePopularChanges( - geniusArtists.popular, - firestoreArtists.filter(fa => fa.type === 'popular') -); -``` - -**TUI Display:** -- Loading progress for Firestore queries -- Artists compared / total artists -- Current letter being compared -- New artists found count -- Popular changes detected - ---- - -### **3. 
scripts/prescrape-new-artists.js** - -**Purpose:** Scrape songs and lyrics for only new artists - -**Features:** -- Reads new-artists lists from Phase 2 -- Uses same scraping logic as current prescraper -- Configurable songs per artist (default: 10) -- Rate limiting and retries -- Progress tracking per letter -- Saves results in timestamp-matching directory -- **TUI with detailed progress and error tracking** -- Error handling: continues on failures, logs to errors.json - -**CLI Options:** -```bash -node scripts/prescrape-new-artists.js # Use latest comparison -node scripts/prescrape-new-artists.js --date 2026-01-04-18-30 -node scripts/prescrape-new-artists.js --songs 15 # Scrape 15 songs per artist -node scripts/prescrape-new-artists.js --letters j,k # Only specific letters -node scripts/prescrape-new-artists.js --quiet # Minimal output -``` - -**Based on:** Current `prescraper.js` (adapted for artist list input) - -**TUI Display:** -- Overall progress (artists processed / total) -- Current letter being processed -- Current artist and song being scraped -- Songs scraped / lyrics found counts -- Error counts by type (network, parsing, rate limit) -- Processing speed (songs/second) -- Estimated time remaining - ---- - -### **4. 
scripts/upload-update.js** - -**Purpose:** Upload new data to Firestore (manual inspection step) - -**Features:** -- Reads song data and comparison report -- Shows preview of changes before upload -- Clears all popular flags first -- Uploads new artists with search tokens -- Sets popular flags for top 20 per letter -- Batch uploads with smart rate limiting -- **TUI with upload progress tracking** -- Error handling: continues with partial, logs to errors.json - -**CLI Options:** -```bash -node scripts/upload-update.js # Use latest data -node scripts/upload-update.js --date 2026-01-04-18-30 # Specific timestamp -node scripts/upload-update.js --dry-run # Preview changes only -node scripts/upload-update.js --skip-popular # Don't update popular flags -node scripts/upload-update.js --letters j,k # Only specific letters -node scripts/upload-update.js --batch-size 50 # Slower batching -node scripts/upload-update.js --quiet # Minimal output -node scripts/upload-update.js --yes # Skip confirmation prompt -``` - -**Upload Steps:** -1. Load comparison report and song data -2. **Preview Mode** - Show what will be uploaded: - - X new artists to add - - Y popular flags to add - - Z popular flags to remove -3. Confirm with user (Y/n) - unless --yes flag -4. Clear all popular flags in Firestore -5. Upload new artists (with search tokens) -6. Upload new songs -7. Set popular flags (exactly 20 per letter) -8. Display summary - -**TUI Display:** -- Current upload phase (artists / songs / flags) -- Items uploaded / total items -- Current batch being uploaded -- Upload speed (items/second) -- Error counts by type -- Estimated time remaining - ---- - -### **5. 
scripts/artist-uploader.js** (Consolidated Utility) - -**Purpose:** Core upload functionality (used by upload-update.js) - -**Features:** -- Merges `upload-to-firestore.js` + `upload-remaining-artists.js` -- Generates search tokens automatically -- Smart batching (auto-adjusts on rate limits) -- Skip existing artists option -- Update vs create modes -- Validation and sanitization - -**Exports functions used by other scripts** - ---- - -## 📝 NPM Scripts (package.json) - -```json -{ - "scripts": { - "update:scrape-artists": "node scripts/scrape-artists.js", - "update:compare": "node scripts/compare-artists.js", - "update:prescrape": "node scripts/prescrape-new-artists.js", - "update:upload": "node scripts/upload-update.js", - "update:all": "npm run update:scrape-artists && npm run update:compare && npm run update:prescrape", - "update:latest": "node scripts/upload-update.js" - } -} -``` - -**Typical Monthly Workflow:** -```bash -# Step 1: Scrape latest artists from Genius (~10 minutes) -npm run update:scrape-artists -# TUI shows real-time progress, no emojis - -# Step 2: Compare with database and identify new artists (~2 minutes) -npm run update:compare -# TUI shows comparison progress and results - -# Step 3: Prescrape songs for new artists only (~2 hours) -npm run update:prescrape -# TUI shows detailed progress: current artist, song, errors - -# Step 4: Inspect data in scraping-data/ directories (manual) -# Review comparison-report.json and errors.json files - -# Step 5: Upload to database (after manual inspection, ~5 minutes) -npm run update:upload -# Shows preview, asks for confirmation, then uploads with TUI - -# Or run all scraping steps at once (Steps 1-3): -npm run update:all -# Then inspect and upload separately -``` - ---- - -## 🔧 Implementation Order - -### **Week 1: Data Structure & Core Utilities** -- [ ] Create `scraping-data/` directory structure -- [ ] Create timestamp utility functions -- [ ] Set up TUI libraries (`cli-progress`, `chalk` for subtle 
colors) -- [ ] Create shared TUI module for progress bars -- [ ] Create shared error logging module (errors.json) -- [ ] Refactor `artist-uploader.js` (merge upload scripts) -- [ ] Add search token generation to upload process -- [ ] Test upload with rate limiting - -### **Week 2: Artist Scraping** -- [ ] Refactor `genius-scraper.js` → `scrape-artists.js` -- [ ] Add TUI with progress bar and statistics -- [ ] Update to save in new directory structure -- [ ] Add summary.json generation -- [ ] Add errors.json generation -- [ ] Add .complete marker creation -- [ ] Remove all emojis from output -- [ ] Test full artist scraping workflow - -### **Week 3: Comparison Logic** -- [ ] Create `compare-artists.js` -- [ ] Add TUI with progress tracking -- [ ] Implement Firestore artist fetching -- [ ] Build comparison logic (new vs existing) -- [ ] Implement popular status change detection -- [ ] Generate filtered new-artists lists -- [ ] Create detailed comparison report -- [ ] Add error handling and logging -- [ ] Remove all emojis from output -- [ ] Test with real data - -### **Week 4: Prescraper Adaptation** -- [ ] Create `prescrape-new-artists.js` -- [ ] Add comprehensive TUI with detailed progress -- [ ] Adapt prescraper to read artist lists -- [ ] Update to use new directory structure -- [ ] Add error handling (continue on failures) -- [ ] Add errors.json generation -- [ ] Remove all emojis from output -- [ ] Test with small artist list -- [ ] Test with full new artists list - -### **Week 5: Upload System** -- [ ] Create `upload-update.js` -- [ ] Add TUI with upload progress -- [ ] Implement preview mode -- [ ] Add popular flag clearing logic -- [ ] Add popular flag setting (top 20 per letter) -- [ ] Implement confirmation prompts -- [ ] Add error handling (continue with partial) -- [ ] Add errors.json generation -- [ ] Remove all emojis from output -- [ ] Test with dry-run mode -- [ ] Test full upload workflow - -### **Week 6: Integration & Testing** -- [ ] End-to-end 
testing of full workflow -- [ ] Verify all TUIs work correctly -- [ ] Verify no emojis in any output -- [ ] Test error handling (graceful degradation) -- [ ] Test errors.json generation -- [ ] Create comprehensive documentation -- [ ] Update README with new workflow -- [ ] Archive old scripts to `scripts/archived/` -- [ ] Create migration guide -- [ ] Document TUI libraries and dependencies - ---- - -## 🎯 Key Features - -### **1. Incremental Updates** -- Only process new artists (saves hours of scraping) -- Preserves existing data (no overwriting) -- Only updates popular flags (no re-uploads) - -### **2. Data Safety** -- Everything saved locally first -- Manual inspection before upload -- Dry-run modes for the compare and upload scripts -- No rollback mechanism (not needed; see Implementation Decisions) - -### **3. Transparency** -- Detailed comparison reports -- Clear summary of changes -- Progress tracking throughout -- Comprehensive logs - -### **4. Flexibility** -- Can run full workflow or individual steps -- Can target specific letters -- Can reprocess data from any timestamp -- Configurable batch sizes for rate limiting - -### **5. Popular Artists Management** -- Clear all flags before update (ensures exactly 20 per letter) -- Based on Genius's current popular list -- Automatic detection of status changes -- Detailed logging of changes - ---- - -## Example Complete Run - -```bash -# January 4, 2026 - Monthly update - -# 1. 
Scrape latest artist lists from Genius -$ npm run update:scrape-artists - -================================================================================ -ARTIST LIST SCRAPER -================================================================================ - -Progress: [████████████████████████] 100% (27/27 letters) | Time: 8m 42s - -Statistics: - Popular Artists: 540 - Regular Artists: 49,460 - Total Artists: 50,000 - -Errors: - Network errors: 2 - ID extraction failed: 18 - -Output: scraping-data/artist-lists/2026-01-04-18-30/ -[SUCCESS] Artist scraping complete -================================================================================ - -# 2. Compare with database -$ npm run update:compare - -================================================================================ -ARTIST COMPARISON -================================================================================ - -Progress: [████████████████████████] 100% | Time: 1m 23s - -Results: - Genius artists: 50,000 - Firestore artists: 48,753 - New artists found: 1,247 - Popular changes: 23 - -Output: scraping-data/new-artists/2026-01-04-18-30/ -[SUCCESS] Comparison complete -================================================================================ - -# 3. 
Prescrape songs for new artists only -$ npm run update:prescrape - -================================================================================ -SONG PRESCRAPER - New Artists Only -================================================================================ - -Progress: [████████████████████████] 100% (1247/1247) | Time: 1h 47m - -Current: Letter J - Artist: John Doe - Song: Example Song - -Statistics: - Artists processed: 1,247 - Songs scraped: 12,470 - Lyrics found: 9,856 - Processing speed: 1.9 songs/sec - -Errors: - Network timeout: 45 - Lyrics not found: 2,569 - Parsing failed: 12 - -Output: scraping-data/song-data/2026-01-04-18-30/ -[SUCCESS] Prescraping complete -================================================================================ - -# 4. Inspect data (manual step) -$ cat scraping-data/new-artists/2026-01-04-18-30/comparison-report.json -# Review: New artists look good, popular changes make sense - -# 5. Upload to database -$ npm run update:upload - -================================================================================ -UPLOAD PREVIEW -================================================================================ - -Data Source: scraping-data/song-data/2026-01-04-18-30/ - -Changes to be made: - New artists: 1,247 - New songs: 12,470 - Popular flags to add: 540 (20 per letter x 27) - Popular flags to remove: 540 - -Proceed with upload? 
(Y/n): y - -================================================================================ -UPLOADING TO FIRESTORE -================================================================================ - -Progress: [████████████████████████] 100% | Time: 4m 12s - -Phase: Uploading Songs (Batch 125/125) - -Statistics: - Artists uploaded: 1,247 - Songs uploaded: 12,470 - Popular flags set: 540 - Upload speed: 3.2 items/sec - -Errors: - Rate limit (retried): 3 - Write failed: 0 - -[SUCCESS] Database update complete -================================================================================ -``` - ---- - -## 📊 Data Flow Diagram - -``` -┌─────────────────────┐ -│ Genius Website │ -└──────────┬──────────┘ - │ - ▼ -┌─────────────────────────────────────────────────┐ -│ Step 1: scrape-artists.js │ -│ Output: scraping-data/artist-lists/{timestamp} │ -└──────────┬──────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────┐ -│ Step 2: compare-artists.js │ -│ Input: Artist lists + Firestore artists │ -│ Output: scraping-data/new-artists/{timestamp} │ -│ (filtered: only new + popular changes) │ -└──────────┬──────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────┐ -│ Step 3: prescrape-new-artists.js │ -│ Input: New artist lists │ -│ Output: scraping-data/song-data/{timestamp} │ -│ (songs + lyrics for new artists only) │ -└──────────┬──────────────────────────────────────┘ - │ - ▼ - [Manual Inspection] - │ - ▼ -┌─────────────────────────────────────────────────┐ -│ Step 4: upload-update.js │ -│ Input: Song data + Comparison report │ -│ Actions: │ -│ 1. Clear all popular flags │ -│ 2. Upload new artists (with search tokens) │ -│ 3. Upload new songs │ -│ 4. 
Set popular flags (top 20 per letter) │ -│ Output: Updated Firestore database │ -└─────────────────────────────────────────────────┘ -``` - ---- - -## 🗑️ Script Cleanup Plan - -### **Scripts to Keep (Maintain/Refactor)** -- `genius-scraper.js` → refactor to `scrape-artists.js` -- `prescraper.js` → adapt to `prescrape-new-artists.js` -- `firebase-uploader.js` → keep for backward compatibility with manual prescraping -- `fix-null-lyrics.js` → keep (unrelated to update system) -- `search-songs-by-id.js` → keep (utility) -- `check-database.js` → keep (utility) - -### **Scripts to Consolidate** -- `upload-to-firestore.js` + `upload-remaining-artists.js` → merge into `artist-uploader.js` -- `add-search-tokens.js` → integrate into upload process (no separate step) - -### **Scripts to Archive** -Move to `scripts/archived/` for reference: -- `upload-to-firestore.js` (after merging) -- `upload-remaining-artists.js` (after merging) -- `add-search-tokens.js` (after integration) - ---- - -## Documentation Updates Needed - -1. **Update README.md** - Add new workflow section -2. **Create ARTIST_UPDATE_GUIDE.md** - User guide for monthly updates -3. **Update documentation/** - Update references to old scripts -4. **Create migration guide** - For transitioning from old to new system -5. **Document TUI usage** - How to interpret progress displays -6. 
**Document error handling** - Where to find errors.json files - -## Dependencies to Add - -Add to package.json: -```json -{ - "dependencies": { - "cli-progress": "^3.12.0", - "chalk": "^5.3.0" - } -} -``` - ---- - -## Success Criteria - -- [ ] Can scrape all artists from Genius in <15 minutes -- [ ] Can identify new artists in <5 minutes -- [ ] Can prescrape only new artists (saves 90%+ time vs full scrape) -- [ ] Can upload with manual inspection step -- [ ] Popular flags maintained at exactly 20 per letter -- [ ] Search tokens automatically included -- [ ] All data preserved locally with timestamps -- [ ] Can rerun any step independently -- [ ] TUI shows real-time progress and statistics -- [ ] No emojis in any script output or logs -- [ ] Errors logged to errors.json with clean console output -- [ ] Handles rate limits gracefully (continues with partial results) -- [ ] All phases show estimated time remaining -- [ ] Error counts displayed by type during execution - - ---- - -## 📋 Implementation Decisions - -### **Configuration Settings** -- **Songs per artist:** 10 (configurable via CLI) -- **Old data archival:** Manual deletion only (no automated cleanup) -- **Error handling:** Continue with partial results, document in summary.json -- **Rollback support:** Not needed -- **Logging style:** NO EMOJIS in any scripts or logs - -### **TUI Requirements** - -All main scraping scripts must include a Terminal User Interface (TUI) with: - -#### **Progress Tracking** -- Progress bar showing % completion of current phase -- Estimated time remaining for current phase -- Current letter being processed (e.g., "Processing: Letter J") -- Current item being processed (artist name or song title) - -#### **Statistics Display** -- Items processed / total items (e.g., "Artists: 45/1247") -- Success count -- Error count by type: - - Network errors - - Parsing errors - - Rate limit errors - - Other errors -- Items per second (processing speed) - -#### **Real-time Updates** -- Updates 
every 100ms for smooth progress bar -- Current action description (e.g., "Scraping: Artist Name - Song Title") -- No emoji characters in any output -- Clean, professional terminal output - -#### **Example TUI Layout** -``` -================================================================================ -ARTIST LIST SCRAPER - Phase 1/3 -================================================================================ - -Progress: [████████████████░░░░░░░░] 63.0% (17/27 letters) | ETA: 3m 15s - -Current Letter: Q -Current Artist: Queen -Action: Extracting artist ID from page - -Statistics: - Popular Artists: 340 | Regular Artists: 12,450 - Total Artists: 12,790 - -Errors: - Network errors: 3 - ID extraction failed: 12 - Other errors: 0 - -Processing Speed: 2.3 artists/sec -================================================================================ -``` - -#### **TUI Libraries** -Consider using: -- `cli-progress` - Progress bars -- `ora` - Spinners (if needed) -- `chalk` - Colors (optional, subtle use) -- `boxen` - Bordered boxes (optional) - ---- - -## 🎨 Code Style Guidelines - -### **Logging Standards** -- **NO EMOJIS** in any script output or logs -- Use simple prefixes: `[INFO]`, `[WARN]`, `[ERROR]`, `[SUCCESS]` -- Keep logs concise and machine-readable -- Timestamps in ISO format: `2026-01-04T18:30:00.000Z` -- Errors logged to `errors.json`, not verbose console output - -### **Error Handling** -- Scripts must continue on errors (graceful degradation) -- Collect all errors during processing -- Write error summary to `errors.json` in output directory -- Console shows error counts only, not full error messages -- Only critical errors halt execution - -### **Example Error Summary (errors.json)** -```json -{ - "phase": "prescraping", - "timestamp": "2026-01-04T19:15:00.000Z", - "totalErrors": 15, - "errorsByType": { - "network_timeout": 3, - "parsing_failed": 7, - "rate_limit": 2, - "lyrics_not_found": 3 - }, - "errors": [ - { - "type": "network_timeout", - 
"artist": "Artist Name", - "song": "Song Title", - "url": "https://genius.com/...", - "timestamp": "2026-01-04T19:15:23.000Z", - "message": "Request timeout after 10000ms" - } - ] -} -``` - ---- - -## Summary of Key Requirements - -### **User Experience** -- Terminal User Interface (TUI) with progress bars on all main scripts -- Real-time statistics and error tracking -- Estimated time remaining for all operations -- Clean, professional output with NO EMOJIS -- Simple log prefixes: [INFO], [WARN], [ERROR], [SUCCESS] - -### **Error Handling** -- Continue processing on errors (graceful degradation) -- Errors logged to `errors.json` in each output directory -- Console shows only error counts, not verbose messages -- Summary includes error breakdown by type - -### **Data Management** -- 10 songs per artist (configurable) -- All data saved locally in `scraping-data/` with timestamps -- No automated archival (manual deletion when needed) -- Timestamped directories for tracking and re-processing - -### **Popular Artists** -- Determined by Genius's current popular list -- Exactly 20 popular artists per letter -- Clear all flags before update to ensure accuracy -- Changes tracked in comparison report - -### **Workflow** -- 4 separate phases: Scrape → Compare → Prescrape → Upload -- Manual inspection before upload -- Can process specific letters only -- Can reprocess from any timestamp - ---- - -**Last Updated:** 2026-01-04 -**Status:** Planning Phase - Requirements Finalized -**Next Step:** Begin Week 1 implementation (TUI setup and core utilities) - diff --git a/firebase-uploader.js b/firebase-uploader.js new file mode 100644 index 0000000..eed1be8 --- /dev/null +++ b/firebase-uploader.js @@ -0,0 +1,575 @@ +#!/usr/bin/env node + +import fs from 'fs/promises'; +import path from 'path'; +import { fileURLToPath } from 'url'; +import { initializeApp } from 'firebase/app'; +import { getFirestore, collection, writeBatch, doc, getDoc, setDoc } from 'firebase/firestore'; +import { 
firebaseConfig } from './src/lib/services/initFirebase.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +/** + * Firebase Uploader Configuration + */ +const config = { + // Input configuration + input: { + directory: null, // Will be set via CLI or default to latest + pattern: 'prescraped-*.json', // File pattern to match + skipExisting: true // Skip artists that already exist in Firestore + }, + + // Upload configuration + upload: { + batchSize: 100, // Number of operations per batch + delayBetweenBatches: 1000, // Delay between batches (ms) + collections: { + artists: 'artists', + songs: 'songs', + albumArt: 'albumArt' + } + }, + + // Processing options + processing: { + uploadArtistImages: false, // Skip artist image processing for now + uploadAlbumArt: false, // Skip album art processing for now + dryRun: false // If true, don't actually upload + }, + + // Filtering options + filtering: { + startLetter: null, // Filter artists starting with this letter (a-z) + endLetter: null, // Filter artists ending with this letter (a-z) + maxArtists: null // Limit number of artists to process (for testing) + } +}; + +/** + * Global state tracking + */ +const state = { + uploaded: { + artists: 0, + songs: 0, + albumArt: 0 + }, + skipped: { + artists: 0, + songs: 0, + albumArt: 0 + }, + errors: { + artists: 0, + songs: 0, + albumArt: 0 + }, + startTime: null +}; + +// Initialize Firebase +let db = null; + +/** + * Initialize Firebase connection + */ +async function initializeFirebase() { + try { + console.log('🔧 Initializing Firebase...'); + + // Initialize Firebase using centralized config (same as working script) + const app = initializeApp(firebaseConfig); + db = getFirestore(app); + + // Check if we should use emulator + if (process.env.FIRESTORE_EMULATOR_HOST) { + console.log('🧪 Using Firestore emulator at', process.env.FIRESTORE_EMULATOR_HOST); + } + + console.log('✅ Firebase initialized successfully'); + + return 
db; + } catch (error) { + console.error('❌ Error initializing Firebase:', error); + throw error; + } +} + +/** + * Load prescraped data files + */ +async function loadPrescrapedData(inputDir) { + console.log(`📂 Loading prescraped data from: ${inputDir}`); + + try { + const files = await fs.readdir(inputDir); + const jsonFiles = files.filter(file => file.match(/^prescraped-.+\.json$/)); + + if (jsonFiles.length === 0) { + throw new Error(`No prescraped files found in ${inputDir}`); + } + + console.log(`Found ${jsonFiles.length} prescraped files: ${jsonFiles.join(', ')}`); + + const allData = []; + + for (const file of jsonFiles) { + const filePath = path.join(inputDir, file); + const content = await fs.readFile(filePath, 'utf8'); + const data = JSON.parse(content); + + console.log(`📄 Loaded ${file}: ${data.artists.length} artists, ${data.summary.totalSongs} songs, ${data.summary.totalLyrics} lyrics`); + allData.push(data); + } + + // Flatten all artists from all files + const allArtists = allData.flatMap(data => data.artists); + console.log(`✅ Total loaded: ${allArtists.length} artists`); + + return allArtists; + + } catch (error) { + console.error('❌ Error loading prescraped data:', error); + throw error; + } +} + +/** + * Check if artist already exists in Firestore + */ +async function checkArtistExists(urlKey) { + try { + const artistRef = doc(db, config.upload.collections.artists, urlKey); + const docSnap = await getDoc(artistRef); + return docSnap.exists(); + } catch (error) { + console.error(`❌ Error checking if artist exists (${urlKey}):`, error); + return false; // Assume doesn't exist if we can't check + } +} + +/** + * Upload artist to Firestore + */ +async function uploadArtist(artistData) { + const urlKey = artistData.urlKey; + + try { + // Check if already exists and skip if configured to do so + if (config.input.skipExisting) { + const exists = await checkArtistExists(urlKey); + if (exists) { + console.log(` ⏭️ Artist ${artistData.name} already exists, 
skipping`); + state.skipped.artists++; + return { skipped: true }; + } + } + + // Prepare artist document data (matching your Firebase Functions structure) + const artistDoc = { + name: artistData.name, + geniusId: parseInt(artistData.geniusId, 10), + url: artistData.url, + imageUrl: artistData.imageUrl || null, + totalSongs: artistData.totalSongs, + songIds: artistData.allSongs.map(song => song.id), + cachedSongIds: artistData.scrapedSongs.filter(song => song.lyrics).map(song => song.id), + songsLastUpdated: new Date(), + lyricsScraped: artistData.processingStats.lyricsScraped, + isFullyCached: true, // We've fetched all available songs + cacheVersion: 1, + createdAt: new Date(), + // Prescraped metadata + prescrapedAt: new Date(artistData.processedAt), + prescrapedStats: artistData.processingStats + }; + + if (!config.processing.dryRun) { + const artistRef = doc(db, config.upload.collections.artists, urlKey); + await setDoc(artistRef, artistDoc); + } + + console.log(` ✅ Uploaded artist: ${artistData.name} (${artistData.totalSongs} songs, ${artistData.processingStats.lyricsScraped} lyrics)`); + state.uploaded.artists++; + + return { uploaded: true, songIds: artistDoc.songIds, cachedSongIds: artistDoc.cachedSongIds }; + + } catch (error) { + console.error(` ❌ Error uploading artist ${artistData.name}:`, error); + state.errors.artists++; + return { error: error.message }; + } +} + +/** + * Upload songs to Firestore in batches + */ +async function uploadSongs(artistData) { + console.log(` 📚 Uploading ${artistData.allSongs.length} songs...`); + + try { + // Prepare all songs (both with and without lyrics) + const songsToUpload = []; + + // Create a map of scraped songs for quick lookup + const scrapedSongsMap = new Map(); + artistData.scrapedSongs.forEach(song => { + scrapedSongsMap.set(song.id, song); + }); + + // Process all songs + for (const song of artistData.allSongs) { + const scrapedSong = scrapedSongsMap.get(song.id); + + const songDoc = { + title: 
song.title, + url: song.url, + songArtImageUrl: song.songArtImageUrl, + artistNames: song.artistNames, + primaryArtist: song.primaryArtist, + albumArtId: song.albumArtId, + addedAt: new Date(), + // Lyrics data (if available) + lyrics: scrapedSong?.lyrics || null, + lyricsScrapedAt: scrapedSong?.lyrics ? new Date(scrapedSong.scrapedAt) : null, + scrapingAttempts: scrapedSong?.lyrics ? 1 : 0, + scrapingError: scrapedSong?.scrapingError || null, + scrapingStatus: scrapedSong?.lyrics ? 'completed' : (scrapedSong?.scrapingError ? 'failed' : 'pending'), + scrapingDuration: scrapedSong?.scrapingDuration || null + }; + + songsToUpload.push({ id: song.id, data: songDoc }); + } + + // Upload in batches + const batchSize = config.upload.batchSize; + let uploaded = 0; + + for (let i = 0; i < songsToUpload.length; i += batchSize) { + const batch = writeBatch(db); + const batchSongs = songsToUpload.slice(i, i + batchSize); + + for (const song of batchSongs) { + if (!config.processing.dryRun) { + const songRef = doc(db, config.upload.collections.songs, song.id); + batch.set(songRef, song.data); + } + } + + if (!config.processing.dryRun) { + await batch.commit(); + } + + uploaded += batchSongs.length; + console.log(` 📄 Uploaded batch: ${uploaded}/${songsToUpload.length} songs`); + + // Delay between batches + if (i + batchSize < songsToUpload.length) { + await new Promise(resolve => setTimeout(resolve, config.upload.delayBetweenBatches)); + } + } + + state.uploaded.songs += uploaded; + console.log(` ✅ Completed song upload: ${uploaded} songs`); + + return { uploaded: uploaded }; + + } catch (error) { + console.error(` ❌ Error uploading songs for ${artistData.name}:`, error); + state.errors.songs += artistData.allSongs.length; + return { error: error.message }; + } +} + +/** + * Process a single artist: upload artist and songs + */ +async function processArtist(artistData, artistIndex, totalArtists) { + console.log(`\n[${artistIndex + 1}/${totalArtists}] 🎨 Processing: 
${artistData.name}`); + + try { + // Upload artist document + const artistResult = await uploadArtist(artistData); + + if (artistResult.skipped) { + console.log(` ⏭️ Skipped artist and songs`); + return; + } + + if (artistResult.error) { + console.log(` ❌ Skipping songs due to artist upload error`); + return; + } + + // Upload songs + const songsResult = await uploadSongs(artistData); + + if (songsResult.error) { + console.log(` ⚠️ Artist uploaded but songs failed`); + } + + } catch (error) { + console.error(` 💥 Critical error processing ${artistData.name}:`, error); + state.errors.artists++; + } +} + +/** + * Get the sorting letter for an artist name (ignores common articles) + */ +function getSortingLetter(artistName) { + const name = artistName.toLowerCase(); + const articles = ['the ', 'a ', 'an ']; + + for (const article of articles) { + if (name.startsWith(article)) { + return name.charAt(article.length); + } + } + + return name.charAt(0); +} + +/** + * Filter artists by letter range and/or limit count + */ +function filterArtists(artists) { + let filtered = [...artists]; + + // Apply letter filtering + if (config.filtering.startLetter || config.filtering.endLetter) { + const startLetter = config.filtering.startLetter?.toLowerCase() || 'a'; + const endLetter = config.filtering.endLetter?.toLowerCase() || 'z'; + + filtered = filtered.filter(artist => { + const sortingChar = getSortingLetter(artist.name); + return sortingChar >= startLetter && sortingChar <= endLetter; + }); + + console.log(`🔤 Filtered by letters ${startLetter.toUpperCase()}-${endLetter.toUpperCase()}: ${filtered.length}/${artists.length} artists`); + } + + // Apply count limit + if (config.filtering.maxArtists && config.filtering.maxArtists > 0) { + const originalCount = filtered.length; + filtered = filtered.slice(0, config.filtering.maxArtists); + console.log(`🔢 Limited to ${config.filtering.maxArtists} artists: ${filtered.length}/${originalCount} selected`); + } + + return filtered; +} + 
+/** + * Find the latest prescraped directory + */ +async function findLatestPrescrapedDir() { + try { + const entries = await fs.readdir(__dirname, { withFileTypes: true }); + const prescrapedDirs = entries + .filter(entry => entry.isDirectory() && entry.name.startsWith('prescraped-data-')) + .map(entry => entry.name) + .sort() + .reverse(); // Latest first + + if (prescrapedDirs.length === 0) { + throw new Error('No prescraped data directories found'); + } + + return path.join(__dirname, prescrapedDirs[0]); + } catch (error) { + console.error('❌ Error finding prescraped directories:', error); + throw error; + } +} + +/** + * Print configuration summary + */ +function printConfig() { + console.log('\n' + '='.repeat(60)); + console.log('🚀 LYRICTYPE FIREBASE UPLOADER STARTING'); + console.log('='.repeat(60)); + console.log(`📋 Configuration:`); + console.log(` Input directory: ${config.input.directory || 'auto-detect latest'}`); + console.log(` Skip existing: ${config.input.skipExisting}`); + console.log(` Batch size: ${config.upload.batchSize}`); + console.log(` Dry run: ${config.processing.dryRun}`); + if (config.filtering.startLetter || config.filtering.endLetter) { + const start = config.filtering.startLetter?.toUpperCase() || 'A'; + const end = config.filtering.endLetter?.toUpperCase() || 'Z'; + console.log(` Letter filter: ${start}-${end}`); + } + if (config.filtering.maxArtists) { + console.log(` Max artists: ${config.filtering.maxArtists}`); + } + if (config.processing.dryRun) { + console.log(' ⚠️ DRY RUN MODE: No data will be uploaded'); + } + console.log('='.repeat(60) + '\n'); +} + +/** + * Main execution function + */ +async function main() { + try { + state.startTime = Date.now(); + + printConfig(); + + // Parse command line arguments + if (process.argv.includes('--help') || process.argv.includes('-h')) { + console.log(` +Usage: node firebase-uploader.js [options] + +Options: + --dir Directory containing prescraped JSON files + --start-letter Only 
process artists starting with this letter (a-z) + --end-letter Only process artists up to this letter (a-z) + --max-artists Limit number of artists to process (for testing) + --dry-run Don't actually upload, just show what would be done + --force Upload even if artists already exist + --emulator Use local Firestore emulator (requires firebase emulators:start) + --help, -h Show this help message + +Examples: + node firebase-uploader.js --start-letter n --end-letter z # Upload artists N-Z + node firebase-uploader.js --start-letter n --max-artists 5 --dry-run # Test with 5 artists starting with N + node firebase-uploader.js --dir ./prescraped-data-2025-09-14/ + node firebase-uploader.js --dry-run # Test run without uploading + node firebase-uploader.js --force # Overwrite existing artists + node firebase-uploader.js --emulator # Use local emulator for testing + `); + return; + } + + // Parse CLI arguments + const dirIndex = process.argv.indexOf('--dir'); + if (dirIndex !== -1 && process.argv[dirIndex + 1]) { + config.input.directory = process.argv[dirIndex + 1]; + } + + const startLetterIndex = process.argv.indexOf('--start-letter'); + if (startLetterIndex !== -1 && process.argv[startLetterIndex + 1]) { + const letter = process.argv[startLetterIndex + 1].toLowerCase(); + if (letter.match(/^[a-z]$/)) { + config.filtering.startLetter = letter; + } else { + console.error('❌ --start-letter must be a single letter (a-z)'); + process.exit(1); + } + } + + const endLetterIndex = process.argv.indexOf('--end-letter'); + if (endLetterIndex !== -1 && process.argv[endLetterIndex + 1]) { + const letter = process.argv[endLetterIndex + 1].toLowerCase(); + if (letter.match(/^[a-z]$/)) { + config.filtering.endLetter = letter; + } else { + console.error('❌ --end-letter must be a single letter (a-z)'); + process.exit(1); + } + } + + const maxArtistsIndex = process.argv.indexOf('--max-artists'); + if (maxArtistsIndex !== -1 && process.argv[maxArtistsIndex + 1]) { + const count = 
parseInt(process.argv[maxArtistsIndex + 1], 10); + if (count > 0) { + config.filtering.maxArtists = count; + } else { + console.error('❌ --max-artists must be a positive number'); + process.exit(1); + } + } + + if (process.argv.includes('--dry-run')) { + config.processing.dryRun = true; + console.log('🧪 DRY RUN MODE: No data will be uploaded'); + } + + if (process.argv.includes('--force')) { + config.input.skipExisting = false; + console.log('💪 FORCE MODE: Will overwrite existing artists'); + } + + if (process.argv.includes('--emulator')) { + process.env.FIRESTORE_EMULATOR_HOST = 'localhost:8080'; + console.log('🧪 EMULATOR MODE: Using local Firestore emulator'); + } + + // Determine input directory + if (!config.input.directory) { + config.input.directory = await findLatestPrescrapedDir(); + console.log(`📁 Auto-detected input directory: ${config.input.directory}`); + } + + // Initialize Firebase + await initializeFirebase(); + + // Load prescraped data + const allArtists = await loadPrescrapedData(config.input.directory); + + if (allArtists.length === 0) { + console.log('⚠️ No artists found in prescraped data'); + return; + } + + // Apply filtering + const artists = filterArtists(allArtists); + + if (artists.length === 0) { + console.log('⚠️ No artists match the specified filters'); + return; + } + + console.log(`\n🎯 Uploading ${artists.length} artists to Firestore...`); + console.log('Press Ctrl+C to stop gracefully\n'); + + // Process each artist + for (let i = 0; i < artists.length; i++) { + const artist = artists[i]; + await processArtist(artist, i, artists.length); + + // Small delay between artists to be gentle on Firestore + if (i < artists.length - 1) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + } + + // Final summary + const totalTime = Date.now() - state.startTime; + console.log(`\n${'='.repeat(60)}`); + console.log('🎉 FIREBASE UPLOAD COMPLETED!'); + console.log('='.repeat(60)); + console.log(`⏱️ Total time: ${Math.round(totalTime / 
1000)}s`); + console.log(`📊 Final stats:`); + console.log(` Artists uploaded: ${state.uploaded.artists}, skipped: ${state.skipped.artists}, errors: ${state.errors.artists}`); + console.log(` Songs uploaded: ${state.uploaded.songs}, errors: ${state.errors.songs}`); + console.log(` Total operations: ${state.uploaded.artists + state.uploaded.songs}`); + if (config.processing.dryRun) { + console.log(` ⚠️ DRY RUN: No actual data was uploaded`); + } + console.log('='.repeat(60)); + + } catch (error) { + console.error('💥 Fatal error:', error); + process.exit(1); + } +} + +// Handle graceful shutdown +process.on('SIGINT', () => { + console.log('\n\n🛑 Received SIGINT, shutting down gracefully...'); + console.log(`📊 Final stats:`); + console.log(` Artists uploaded: ${state.uploaded.artists}`); + console.log(` Songs uploaded: ${state.uploaded.songs}`); + console.log(` Total errors: ${state.errors.artists + state.errors.songs}`); + process.exit(0); +}); + +// Run the script +if (import.meta.url === `file://${process.argv[1]}`) { + main(); +} diff --git a/scripts/fix-null-lyrics.js b/fix-null-lyrics.js similarity index 99% rename from scripts/fix-null-lyrics.js rename to fix-null-lyrics.js index 99a466c..63e9b33 100644 --- a/scripts/fix-null-lyrics.js +++ b/fix-null-lyrics.js @@ -19,7 +19,7 @@ import { getFirestore, doc, getDoc, updateDoc, arrayRemove, arrayUnion, incremen import { fileURLToPath } from 'url'; import { dirname } from 'path'; import * as cheerio from 'cheerio'; -import { firebaseConfig } from '../src/lib/services/initFirebase.js'; +import { firebaseConfig } from './src/lib/services/initFirebase.js'; // Note: fetch is built-in for Node.js 18+, no need to import diff --git a/functions/.gitignore b/functions/.gitignore index 918c24d..d225730 100644 --- a/functions/.gitignore +++ b/functions/.gitignore @@ -3,4 +3,5 @@ local-config.json build *.log /prescraped-data-*/ -/prescraped-*.json \ No newline at end of file +/prescraped-*.json +test-*.js \ No newline at end of 
file diff --git a/functions/check-database.js b/functions/check-database.js new file mode 100644 index 0000000..22bb47c --- /dev/null +++ b/functions/check-database.js @@ -0,0 +1,49 @@ +import { initializeApp } from 'firebase-admin/app'; +import { getFirestore } from 'firebase-admin/firestore'; + +const app = initializeApp(); +const db = getFirestore(app); + +// Check a few songs to see their structure +async function checkSongs() { + try { + // Get the artist document for Demi Lovato first + const artistDoc = await db.collection('artists').doc('Demi-lovato').get(); + if (!artistDoc.exists) { + console.log('❌ Artist Demi-lovato not found'); + return; + } + + const artistData = artistDoc.data(); + const songIds = artistData.songIds || []; + console.log(`✅ Artist Demi-lovato has ${songIds.length} songs`); + + // Check the first 3 songs + for (let i = 0; i < Math.min(3, songIds.length); i++) { + const songId = songIds[i]; + console.log(`\n📝 Checking song ${i + 1}: ${songId}`); + + const songDoc = await db.collection('songs').doc(songId).get(); + if (songDoc.exists) { + const songData = songDoc.data(); + console.log(` Title: ${songData.title || 'N/A'}`); + console.log(` Has lyrics: ${!!songData.lyrics}`); + console.log(` Lyrics length: ${songData.lyrics ? 
songData.lyrics.length : 0}`); + if (songData.lyrics) { + console.log(` First 100 chars: ${songData.lyrics.substring(0, 100)}`); + } else { + console.log(` No lyrics found!`); + } + } else { + console.log(` ❌ Song document not found!`); + } + } + + process.exit(0); + } catch (error) { + console.error('Error:', error); + process.exit(1); + } +} + +checkSongs(); diff --git a/functions/index.js b/functions/index.js index bbd3358..eace090 100644 --- a/functions/index.js +++ b/functions/index.js @@ -7,7 +7,7 @@ import pako from 'pako'; import fs from 'node:fs'; import path from 'node:path'; import { fileURLToPath } from 'url'; -import { HttpsProxyAgent } from 'https-proxy-agent'; +import HttpsProxyAgent from 'https-proxy-agent'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); @@ -308,10 +308,9 @@ async function processAndStoreArtistImage(imageUrl, artistUrlKey) { const sharp = (await import('sharp')).default; let image = sharp(Buffer.from(imageBuffer)); - // Get image metadata - const metadata = await image.metadata(); - const nativeWidth = metadata.width; - const nativeHeight = metadata.height; + // Use native image dimensions + const nativeWidth = img.naturalWidth || img.width; + const nativeHeight = img.naturalHeight || img.height; console.log(`📐 Native resolution: ${nativeWidth}x${nativeHeight} (${metadata.format})`); @@ -331,18 +330,12 @@ async function processAndStoreArtistImage(imageUrl, artistUrlKey) { image = image.resize(finalWidth, finalHeight, { fit: 'inside' }); } - // Convert to raw RGBA pixels - const { data, info } = await image - .raw() - .ensureAlpha() - .toBuffer({ resolveWithObject: true }); - - // Create ImageData-like object for convertToGrayscale - const imageData = { - data: data, - width: info.width, - height: info.height - }; + // Create canvas with native size + const canvas = createCanvas(nativeWidth, nativeHeight); + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0, nativeWidth, 
nativeHeight); + + const imageData = ctx.getImageData(0, 0, nativeWidth, nativeHeight); // Convert to 8-bit grayscale const grayscaleData = convertToGrayscale(imageData); @@ -532,10 +525,9 @@ async function processAndStoreAlbumArt(imageUrl, albumArtId) { const sharp = (await import('sharp')).default; let image = sharp(Buffer.from(imageBuffer)); - // Get image metadata - const metadata = await image.metadata(); - const nativeWidth = metadata.width; - const nativeHeight = metadata.height; + // Use native image dimensions + const nativeWidth = img.naturalWidth || img.width; + const nativeHeight = img.naturalHeight || img.height; console.log(`📐 Native resolution: ${nativeWidth}x${nativeHeight} (${metadata.format})`); @@ -555,18 +547,12 @@ async function processAndStoreAlbumArt(imageUrl, albumArtId) { image = image.resize(finalWidth, finalHeight, { fit: 'inside' }); } - // Convert to raw RGBA pixels - const { data, info } = await image - .raw() - .ensureAlpha() - .toBuffer({ resolveWithObject: true }); - - // Create ImageData-like object for convertToGrayscale - const imageData = { - data: data, - width: info.width, - height: info.height - }; + // Create canvas with native size + const canvas = createCanvas(nativeWidth, nativeHeight); + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0, nativeWidth, nativeHeight); + + const imageData = ctx.getImageData(0, 0, nativeWidth, nativeHeight); // Convert to 8-bit grayscale const grayscaleData = convertToGrayscale(imageData); diff --git a/functions/index.js.backup b/functions/index.js.backup new file mode 100644 index 0000000..1318bcf --- /dev/null +++ b/functions/index.js.backup @@ -0,0 +1,1970 @@ +import { onRequest, onCall, HttpsError } from 'firebase-functions/v2/https'; +import { defineString } from 'firebase-functions/params'; +import { initializeApp } from 'firebase-admin/app'; +import { getFirestore, FieldValue } from 'firebase-admin/firestore'; +import * as cheerio from 'cheerio'; +import pako from 'pako'; 
+import fs from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'url'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +// Initialize Firebase Admin SDK +const app = initializeApp(); +const db = getFirestore(app); + +const geniusApiKeyParam = defineString('GENIUS_KEY'); + +// Use the global fetch that comes with Node.js 18+ instead of node-fetch +// This is more compatible with Firebase Functions environment +const fetchWithTimeout = async (url, options = {}) => { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), options.timeout || 10000); + + try { + const response = await fetch(url, { + ...options, + signal: controller.signal + }); + clearTimeout(timeoutId); + return response; + } catch (error) { + clearTimeout(timeoutId); + if (error.name === 'AbortError') { + throw new Error(`Request timeout after ${options.timeout || 10000}ms`); + } + throw error; + } +}; + +// SSR server removed - now using pure static hosting with optimized binary image system +// All image processing is now done via dedicated Firebase Functions with Firestore caching + +// Keep existing health check +export const healthCheck = onRequest({ + timeoutSeconds: 10, + region: 'us-central1' +}, (req, res) => { + res.status(200).send('OK'); +}); + +/** + * Server-side Atkinson dithering algorithm + * @param {object} imageData - Canvas ImageData object + * @returns {Uint8Array} Binary array (1 bit per pixel, packed into bytes) + */ +function atkinsonDitherToBinary(imageData) { + const width = imageData.width; + const height = imageData.height; + const data = new Uint8ClampedArray(imageData.data); + + // Convert to grayscale first + for (let i = 0; i < data.length; i += 4) { + const gray = (data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114); + data[i] = data[i + 1] = data[i + 2] = gray; + } + + // Atkinson dithering matrix + const matrix = [ + [0, 0, 1/8, 1/8], + 
[1/8, 1/8, 1/8, 0], + [0, 1/8, 0, 0] + ]; + + // Create binary output array (1 bit per pixel, packed into bytes) + const binarySize = Math.ceil((width * height) / 8); + const binaryData = new Uint8Array(binarySize); + + for (let y = 0; y < height; y++) { + for (let x = 0; x < width; x++) { + const idx = (y * width + x) * 4; + const oldPixel = data[idx]; + + // Determine if pixel should be dark or light + const isDark = oldPixel < 128; + + // Store binary result (1 for light, 0 for dark) + const bitIndex = y * width + x; + const byteIndex = Math.floor(bitIndex / 8); + const bitPosition = 7 - (bitIndex % 8); + + if (!isDark) { // Light pixel = 1 + binaryData[byteIndex] |= (1 << bitPosition); + } + + const newPixel = isDark ? 0 : 255; + const error = (oldPixel - newPixel) / 8; + + // Propagate error using Atkinson matrix + for (let i = 0; i < matrix.length; i++) { + for (let j = 0; j < matrix[i].length; j++) { + if (matrix[i][j] === 0) continue; + + const ny = y + i; + const nx = x + j - 1; + + if (ny < height && nx >= 0 && nx < width) { + const nidx = (ny * width + nx) * 4; + data[nidx] += error; + data[nidx + 1] += error; + data[nidx + 2] += error; + } + } + } + } + } + + return binaryData; +} + +/** + * Analyze binary dithered data for compression and statistics + */ +function analyzeBinaryData(binaryData, width, height) { + const totalPixels = width * height; + const totalBytes = binaryData.length; + const originalSize = totalPixels * 4; // RGBA + const compressionRatio = totalBytes / originalSize; + + // Calculate white pixel count + let setBits = 0; + for (let byte of binaryData) { + setBits += byte.toString(2).split('1').length - 1; + } + + return { + totalPixels, + totalBytes, + originalSize, + compressionRatio: compressionRatio.toFixed(3), + compressionPercent: ((1 - compressionRatio) * 100).toFixed(1), + setBits, + whiteFraction: (setBits / totalPixels).toFixed(3), + whitePercent: ((setBits / totalPixels) * 100).toFixed(1), + }; +} + +/** + * Process artist 
image to binary format and store in database + * Fast response - client gets binary data ASAP + */ +async function processAndStoreArtistImage(imageUrl, artistUrlKey, targetSize = 200) { + try { + console.log(`🚀 FAST processing artist image: ${imageUrl}`); + const startTime = Date.now(); + + // Fetch the image + const imageResponse = await fetchWithTimeout(imageUrl, { timeout: 8000 }); + if (!imageResponse.ok) { + throw new Error(`Failed to fetch image: ${imageResponse.status}`); + } + + const imageBuffer = await imageResponse.arrayBuffer(); + console.log(`📦 Downloaded: ${imageBuffer.byteLength} bytes in ${Date.now() - startTime}ms`); + + // Process with canvas + const { createCanvas, loadImage } = await import('canvas'); + const img = await loadImage(Buffer.from(imageBuffer)); + + // Create canvas with target size + const canvas = createCanvas(targetSize, targetSize); + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0, targetSize, targetSize); + + const imageData = ctx.getImageData(0, 0, targetSize, targetSize); + + // Apply dithering and get binary data + const binaryData = atkinsonDitherToBinary(imageData); + const analysis = analyzeBinaryData(binaryData, targetSize, targetSize); + + console.log(`⚡ Binary processed in ${Date.now() - startTime}ms: ${analysis.totalBytes} bytes (${analysis.compressionPercent}% compression)`); + + // Compress with Pako + const compressedData = pako.deflate(binaryData); + console.log(`🗜️ Pako compressed: ${binaryData.length} → ${compressedData.length} bytes (${((1 - compressedData.length / binaryData.length) * 100).toFixed(1)}% reduction)`); + + // Store in artist document + const base64Binary = Buffer.from(compressedData).toString('base64'); + const imageMetadata = { + binaryImageData: base64Binary, + imageWidth: targetSize, + imageHeight: targetSize, + originalImageUrl: imageUrl, + originalSize: imageBuffer.byteLength, + binarySize: analysis.totalBytes, + compressedSize: compressedData.length, + base64Size: 
base64Binary.length, + compressionRatio: analysis.compressionRatio, + pakoCompressionRatio: compressedData.length / binaryData.length, + totalCompressionRatio: compressedData.length / imageBuffer.byteLength, + processedAt: new Date(), + processingVersion: '1.1-pako', + compressionMethod: 'pako-deflate' + }; + + // Update or create artist document with binary data + try { + await db.collection('artists').doc(artistUrlKey).update({ + imageUrl: imageUrl, + ...imageMetadata + }); + } catch (updateError) { + if (updateError.code === 'not-found') { + // Document doesn't exist, create it + console.log(`📝 Creating new artist document: ${artistUrlKey}`); + await db.collection('artists').doc(artistUrlKey).set({ + imageUrl: imageUrl, + ...imageMetadata, + createdAt: new Date() + }); + } else { + throw updateError; + } + } + + console.log(`💾 Stored binary data for artist ${artistUrlKey} in ${Date.now() - startTime}ms total`); + + return { + success: true, + binaryData: base64Binary, + metadata: { + width: targetSize, + height: targetSize, + originalSize: imageBuffer.byteLength, + binarySize: analysis.totalBytes, + compressedSize: compressedData.length, + base64Size: base64Binary.length, + compressionRatio: analysis.compressionRatio, + pakoCompressionRatio: compressedData.length / binaryData.length, + totalCompressionRatio: compressedData.length / imageBuffer.byteLength, + compressionPercent: analysis.compressionPercent, + pakoCompressionPercent: ((1 - compressedData.length / binaryData.length) * 100).toFixed(1), + totalCompressionPercent: ((1 - compressedData.length / imageBuffer.byteLength) * 100).toFixed(1), + whitePixelPercent: analysis.whitePercent, + processingTimeMs: Date.now() - startTime, + compressionMethod: 'pako-deflate' + } + }; + + } catch (error) { + console.error(`❌ Error processing artist image:`, error); + throw error; + } +} + +// Fast Artist Image Processing - prioritizes speed for client response +export const processArtistImageBinary = onRequest({ + 
timeoutSeconds: 15, + minInstances: 0, + maxInstances: 20, + region: 'us-central1', + invoker: 'public' +}, async (req, res) => { + // Enable CORS + res.set('Access-Control-Allow-Origin', '*'); + res.set('Access-Control-Allow-Methods', 'GET, POST, OPTIONS'); + res.set('Access-Control-Allow-Headers', 'Content-Type'); + + if (req.method === 'OPTIONS') { + res.status(200).send(''); + return; + } + + const imageUrl = req.query.url || req.body?.url; + const artistUrlKey = req.query.artistKey || req.body?.artistKey; + const targetSize = parseInt(req.query.size || req.body?.size || '200'); + + if (!imageUrl) { + res.status(400).json({ error: 'No image URL provided' }); + return; + } + + if (!artistUrlKey) { + res.status(400).json({ error: 'No artist key provided' }); + return; + } + + try { + const result = await processAndStoreArtistImage(imageUrl, artistUrlKey, targetSize); + res.json(result); + } catch (error) { + console.error('❌ Error in processArtistImageBinary:', error); + res.status(500).json({ + error: 'Failed to process artist image', + details: error.message + }); + } +}); + +/** + * Try to convert unsupported image formats to supported ones + * @param {string} imageUrl - Original image URL + * @returns {string} Modified URL that might work better + */ +function tryAlternativeImageFormat(imageUrl) { + // Convert .webp to .jpg - Genius often has both formats + if (imageUrl.includes('.webp')) { + const jpgUrl = imageUrl.replace('.webp', '.jpg'); + console.log(`🔄 Trying alternative format: ${jpgUrl}`); + return jpgUrl; + } + + // For other formats, try to get a .jpg version by removing size specifications + // e.g., file.464x464x1.png -> file.jpg + const baseUrl = imageUrl.replace(/\.\d+x\d+x?\d*\.(png|gif|webp)$/i, '.jpg'); + if (baseUrl !== imageUrl) { + console.log(`🔄 Trying simplified format: ${baseUrl}`); + return baseUrl; + } + + return imageUrl; +} + +/** + * Process album art to binary format and store in database + * Similar to artist processing but 
stores in albumArt collection + * Uses 800x800 resolution for high quality on results screen + */ +async function processAndStoreAlbumArt(imageUrl, albumArtId, targetSize = 800) { + const startTime = Date.now(); + let lastError = null; + + // Try original URL first, then alternative formats + const urlsToTry = [imageUrl]; + + // Add alternative format if original might be problematic + const altUrl = tryAlternativeImageFormat(imageUrl); + if (altUrl !== imageUrl) { + urlsToTry.push(altUrl); + } + + // Also try without size specification for backup + const simpleUrl = imageUrl.replace(/\.\d+x\d+x?\d*\./g, '.'); + if (simpleUrl !== imageUrl && !urlsToTry.includes(simpleUrl)) { + urlsToTry.push(simpleUrl); + } + + for (let i = 0; i < urlsToTry.length; i++) { + const currentUrl = urlsToTry[i]; + + try { + console.log(`🚀 FAST processing album art: ${currentUrl} (ID: ${albumArtId})${i > 0 ? ` [attempt ${i + 1}]` : ''}`); + + // Fetch the image + const imageResponse = await fetchWithTimeout(currentUrl, { timeout: 8000 }); + if (!imageResponse.ok) { + throw new Error(`Failed to fetch image: ${imageResponse.status}`); + } + + const imageBuffer = await imageResponse.arrayBuffer(); + console.log(`📦 Downloaded: ${imageBuffer.byteLength} bytes in ${Date.now() - startTime}ms`); + + // Process with canvas + const { createCanvas, loadImage } = await import('canvas'); + const img = await loadImage(Buffer.from(imageBuffer)); + + // Create canvas with target size + const canvas = createCanvas(targetSize, targetSize); + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0, targetSize, targetSize); + + const imageData = ctx.getImageData(0, 0, targetSize, targetSize); + + // Apply dithering and get binary data + const binaryData = atkinsonDitherToBinary(imageData); + const analysis = analyzeBinaryData(binaryData, targetSize, targetSize); + + console.log(`⚡ Binary processed in ${Date.now() - startTime}ms: ${analysis.totalBytes} bytes (${analysis.compressionPercent}% 
compression)`); + + // Compress with Pako + const compressedData = pako.deflate(binaryData); + console.log(`🗜️ Pako compressed: ${binaryData.length} → ${compressedData.length} bytes (${((1 - compressedData.length / binaryData.length) * 100).toFixed(1)}% reduction)`); + + // Store in albumArt collection + const base64Binary = Buffer.from(compressedData).toString('base64'); + const albumArtMetadata = { + binaryImageData: base64Binary, + imageWidth: targetSize, + imageHeight: targetSize, + originalImageUrl: imageUrl, // Store original URL for reference + processedImageUrl: currentUrl, // Store URL that actually worked + processedAt: new Date(), + processingVersion: '1.1-pako', + compressionMethod: 'pako-deflate' + }; + + // Store in albumArt collection using the provided ID + await db.collection('albumArt').doc(albumArtId).set(albumArtMetadata); + + console.log(`💾 Stored album art binary data for ${albumArtId} in ${Date.now() - startTime}ms total${i > 0 ? ` (used fallback URL)` : ''}`); + + return { + success: true, + binaryData: base64Binary, + metadata: { + albumArtId: albumArtId, + width: targetSize, + height: targetSize, + originalSize: imageBuffer.byteLength, + binarySize: analysis.totalBytes, + compressedSize: compressedData.length, + compressionRatio: analysis.compressionRatio, + pakoCompressionRatio: compressedData.length / binaryData.length, + totalCompressionRatio: compressedData.length / imageBuffer.byteLength, + compressionPercent: analysis.compressionPercent, + pakoCompressionPercent: ((1 - compressedData.length / binaryData.length) * 100).toFixed(1), + totalCompressionPercent: ((1 - compressedData.length / imageBuffer.byteLength) * 100).toFixed(1), + whitePixelPercent: analysis.whitePercent, + processingTimeMs: Date.now() - startTime, + compressionMethod: 'pako-deflate', + usedFallbackUrl: i > 0 + } + }; + + } catch (error) { + lastError = error; + const isUnsupportedFormat = error.message.includes('Unsupported image type'); + const isLastAttempt = i === 
urlsToTry.length - 1; + + if (isUnsupportedFormat && !isLastAttempt) { + console.log(`⚠️ Format not supported for ${currentUrl}, trying alternative...`); + continue; // Try next URL + } else if (isLastAttempt) { + console.error(`❌ Error processing album art ${albumArtId} (all ${urlsToTry.length} URLs failed):`, error.message); + break; // Give up after all attempts + } else { + console.log(`⚠️ Error with ${currentUrl}, trying alternative:`, error.message); + continue; // Try next URL + } + } + } + + // If we get here, all attempts failed + throw lastError || new Error('All image format attempts failed'); +} + +// Fast Album Art Processing - prioritizes speed for client response +export const processAlbumArtBinary = onRequest({ + timeoutSeconds: 15, + minInstances: 0, + maxInstances: 20, + region: 'us-central1', + invoker: 'public' +}, async (req, res) => { + // Enable CORS + res.set('Access-Control-Allow-Origin', '*'); + res.set('Access-Control-Allow-Methods', 'GET, POST, OPTIONS'); + res.set('Access-Control-Allow-Headers', 'Content-Type'); + + if (req.method === 'OPTIONS') { + res.status(200).send(''); + return; + } + + const imageUrl = req.query.url || req.body?.url; + const albumArtId = req.query.albumArtId || req.body?.albumArtId; + const targetSize = parseInt(req.query.size || req.body?.size || '200'); + + if (!imageUrl) { + res.status(400).json({ error: 'No image URL provided' }); + return; + } + + if (!albumArtId) { + res.status(400).json({ error: 'No album art ID provided' }); + return; + } + + try { + const result = await processAndStoreAlbumArt(imageUrl, albumArtId, targetSize); + res.json(result); + } catch (error) { + console.error('❌ Error in processAlbumArtBinary:', error); + res.status(500).json({ + error: 'Failed to process album art', + details: error.message + }); + } +}); + +// Binary Image Processing Function +export const processImageBinary = onRequest({ + timeoutSeconds: 30, + minInstances: 0, + maxInstances: 10, + region: 'us-central1', + 
invoker: 'public' +}, async (req, res) => { + // Enable CORS + res.set('Access-Control-Allow-Origin', '*'); + res.set('Access-Control-Allow-Methods', 'GET, POST, OPTIONS'); + res.set('Access-Control-Allow-Headers', 'Content-Type'); + + if (req.method === 'OPTIONS') { + res.status(200).send(''); + return; + } + + const imageUrl = req.query.url || req.body?.url; + const targetSize = parseInt(req.query.size || req.body?.size || '200'); + const returnBinary = req.query.binary === 'true' || req.body?.binary === true; + const logAnalysis = req.query.log === 'true' || req.body?.log === true; + + if (!imageUrl) { + res.status(400).json({ error: 'No image URL provided' }); + return; + } + + try { + console.log(`🎨 Processing image: ${imageUrl} (size: ${targetSize}, binary: ${returnBinary})`); + + // Fetch the image + const imageResponse = await fetchWithTimeout(imageUrl, { timeout: 10000 }); + if (!imageResponse.ok) { + throw new Error(`Failed to fetch image: ${imageResponse.status}`); + } + + const imageBuffer = await imageResponse.arrayBuffer(); + console.log(`📦 Downloaded image: ${imageBuffer.byteLength} bytes`); + + // Process with canvas + const { createCanvas, loadImage } = await import('canvas'); + const img = await loadImage(Buffer.from(imageBuffer)); + + // Create canvas with target size + const canvas = createCanvas(targetSize, targetSize); + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0, targetSize, targetSize); + + const imageData = ctx.getImageData(0, 0, targetSize, targetSize); + console.log(`🖼️ Got image data: ${imageData.width}x${imageData.height}`); + + if (returnBinary) { + // Apply dithering and get binary data + const binaryData = atkinsonDitherToBinary(imageData); + const analysis = analyzeBinaryData(binaryData, targetSize, targetSize); + + if (logAnalysis) { + console.log('🔍 BINARY IMAGE ANALYSIS:'); + console.log(`📊 Image: ${targetSize}x${targetSize} pixels`); + console.log(`📦 Original size: ${imageBuffer.byteLength} bytes`); + 
console.log(`🗜️ Binary size: ${analysis.totalBytes} bytes`); + console.log(`📉 Compression: ${analysis.compressionPercent}% reduction (${analysis.compressionRatio}x)`); + console.log(`⚫ White pixels: ${analysis.whitePercent}% (${analysis.setBits}/${analysis.totalPixels})`); + } + + // Return binary data with metadata + const base64Binary = Buffer.from(binaryData).toString('base64'); + + res.json({ + success: true, + format: 'binary', + data: base64Binary, + metadata: { + width: targetSize, + height: targetSize, + originalSize: imageBuffer.byteLength, + binarySize: analysis.totalBytes, + compressionRatio: analysis.compressionRatio, + compressionPercent: analysis.compressionPercent, + whitePixelPercent: analysis.whitePercent + } + }); + } else { + // Return original or processed image + const buffer = canvas.toBuffer('image/png'); + res.set('Content-Type', 'image/png'); + res.send(buffer); + } + + } catch (error) { + console.error('❌ Error processing image:', error); + res.status(500).json({ + error: 'Failed to process image', + details: error.message + }); + } +}); + +// ======================================== +// NEW CACHE STRATEGY FUNCTIONS +// ======================================== + +/** + * Helper function to get Genius API key with fallback + * @returns {string} The Genius API key + */ +async function getGeniusApiKey() { + let geniusApiKey = geniusApiKeyParam.value(); + + if (!geniusApiKey) { + try { + const localConfigPath = path.join(__dirname, 'local-config.json'); + const localConfig = JSON.parse(fs.readFileSync(localConfigPath, 'utf8')); + geniusApiKey = localConfig.genius.key; + } catch (error) { + console.error('Error loading local config:', error); + throw new Error('API key not found. 
Please configure your API key.'); + } + } + + return geniusApiKey; +} + +/** + * Helper function to check if artist songs need refresh (older than 1 week) + * @param {Date} songsLastUpdated - Last update timestamp + * @returns {boolean} Whether refresh is needed + */ +function needsRefresh(songsLastUpdated, isFullyCached) { + // If we know the artist is NOT fully cached, always continue fetching pages + if (isFullyCached === false) return true; + + if (!songsLastUpdated) return true; + const oneWeekAgo = new Date(); + oneWeekAgo.setDate(oneWeekAgo.getDate() - 7); + return new Date(songsLastUpdated) < oneWeekAgo; +} + +/** + * Extract artist image URL from songs data + * Searches through songs to find a matching artist ID and returns their image_url + * @param {Object[]} songs - Array of song objects from Genius API + * @param {number} targetArtistId - The artist ID to search for + * @param {number} maxSongsToCheck - Maximum number of songs to check (default: 11) + * @returns {string|null} Artist image URL or null if not found + */ +function extractArtistImageUrl(songs, targetArtistId, maxSongsToCheck = 11) { + console.log(`Searching for image URL for artist ID ${targetArtistId} in ${Math.min(songs.length, maxSongsToCheck)} songs`); + + const songsToCheck = songs.slice(0, maxSongsToCheck); + // Ensure targetArtistId is a number for comparison with API response + const targetId = typeof targetArtistId === 'string' ? 
parseInt(targetArtistId, 10) : targetArtistId; + + for (const song of songsToCheck) { + // Check primary artist first + if (song.primary_artist && song.primary_artist.id === targetId) { + const imageUrl = song.primary_artist.image_url; + if (imageUrl) { + console.log(`Found artist image URL in primary artist: ${imageUrl}`); + return imageUrl; + } + } + + // Check featured artists if primary artist doesn't match + if (song.featured_artists && Array.isArray(song.featured_artists)) { + for (const featuredArtist of song.featured_artists) { + if (featuredArtist.id === targetId) { + const imageUrl = featuredArtist.image_url; + if (imageUrl) { + console.log(`Found artist image URL in featured artists: ${imageUrl}`); + return imageUrl; + } + } + } + } + } + + console.log(`No image URL found for artist ID ${targetId} in ${songsToCheck.length} songs`); + return null; +} + +/** + * Fetch song metadata from Genius API for a specific artist page + * @param {number} artistId - Genius artist ID + * @param {number} page - Page number (1-based) + * @returns {Object} { songs: Song[], hasMore: boolean, totalSongs: number } + */ +async function getSongsByArtist(artistId, page = 1) { + console.log(`Fetching songs for artist ${artistId}, page ${page}`); + + try { + const geniusApiKey = await getGeniusApiKey(); + const headers = { "Authorization": `Bearer ${geniusApiKey}` }; + + // Fetch 50 songs per page, sorted by popularity + const response = await fetchWithTimeout( + `https://api.genius.com/artists/${artistId}/songs?per_page=50&page=${page}&sort=popularity`, + { headers } + ); + + if (!response.ok) { + throw new Error(`Genius API error: ${response.status} ${response.statusText}`); + } + + const data = await response.json(); + + if (!data.response || !data.response.songs) { + throw new Error('Invalid API response structure'); + } + + const songs = data.response.songs; + console.log(`Fetched ${songs.length} songs for artist ${artistId}, page ${page}`); + + // Transform songs to our 
schema format + const transformedSongs = songs.map(song => ({ + id: song.id.toString(), // Use as Firestore document ID + title: song.title, + url: song.url, + songArtImageUrl: song.song_art_image_url, + artistNames: song.artist_names, + primaryArtist: { + id: song.primary_artist.id, + name: song.primary_artist.name, + url: song.primary_artist.url + }, + // Lyrics fields - initially null + lyrics: null, + lyricsScrapedAt: null, + scrapingAttempts: 0, + scrapingError: null, + // Metadata + addedAt: new Date(), + scrapingStatus: 'pending' + })); + + const hasMore = songs.length === 50; // If we got a full page, there might be more + + return { + songs: transformedSongs, + rawSongs: songs, // Include raw API response for image URL extraction + hasMore, + totalSongs: songs.length, // This is just for current page, will be updated later + pageNumber: page + }; + + } catch (error) { + console.error(`Error fetching songs for artist ${artistId}, page ${page}:`, error); + throw error; + } +} + +/** + * Extract the hash/ID from a Genius image URL for album art deduplication + * Example: https://images.genius.com/bda1518357007cbd7ab978c4a6764e26.711x711x1.jpg + * Returns: bda1518357007cbd7ab978c4a6764e26 + */ +function extractGeniusImageHash(imageUrl) { + try { + if (!imageUrl) return null; + + // Extract the filename from the URL + const filename = imageUrl.split('/').pop(); + + // Extract the hash (everything before the first dot) + const hash = filename.split('.')[0]; + + // Validate it looks like a hash (32 character hex string) + if (hash && /^[a-f0-9]{32}$/i.test(hash)) { + return hash.toLowerCase(); + } + + // Fallback: use the full filename if it doesn't match expected pattern + console.warn(`Unexpected Genius URL format: ${imageUrl}, using filename as ID`); + return filename.replace(/[^a-zA-Z0-9]/g, '-').toLowerCase(); + + } catch (error) { + console.error('Error extracting hash from Genius URL:', error); + return null; + } +} + +/** + * Store song documents in 
Firestore songs collection + * @param {Object[]} songs - Array of song objects + * @returns {Promise} Array of song IDs that were stored + */ +async function storeSongsInFirestore(songs) { + console.log(`Storing ${songs.length} songs in Firestore`); + + try { + const batch = db.batch(); + const songsCollection = db.collection('songs'); + const storedSongIds = []; + + for (const song of songs) { + const songRef = songsCollection.doc(song.id); + + // Check if song already exists to avoid overwriting existing data + const existingDoc = await songRef.get(); + + if (!existingDoc.exists) { + // Extract album art ID for future processing (but don't process yet) + let albumArtId = null; + if (song.songArtImageUrl) { + albumArtId = extractGeniusImageHash(song.songArtImageUrl); + } + + // Remove the id from the document data since it's used as the document ID + const { id, ...songData } = song; + + // Add album art ID to song data for later processing during lyric scraping + songData.albumArtId = albumArtId; + + batch.set(songRef, songData); + storedSongIds.push(song.id); + console.log(`Queued song ${song.id} for storage: ${song.title}${albumArtId ? 
` (album art ID: ${albumArtId})` : ''}`); + } else { + console.log(`Song ${song.id} already exists, skipping: ${song.title}`); + storedSongIds.push(song.id); // Still include in list since it's available + } + } + + if (storedSongIds.length > 0) { + await batch.commit(); + console.log(`Successfully stored ${storedSongIds.length} songs in Firestore`); + } else { + console.log('No new songs to store'); + } + + return storedSongIds; + + } catch (error) { + console.error('Error storing songs in Firestore:', error); + throw error; + } +} + +/** + * Check if album art exists, process if not + * @param {string} imageUrl - Original album art URL + * @param {string} albumArtId - Extracted hash ID + * @returns {Promise} True if processed/exists, false if failed + */ +async function checkAndProcessAlbumArt(imageUrl, albumArtId) { + try { + // Check if album art already exists + const albumArtRef = db.collection('albumArt').doc(albumArtId); + const albumArtDoc = await albumArtRef.get(); + + if (albumArtDoc.exists) { + // Already processed + return true; + } + + // Process and store the album art + console.log(`🎨 Processing new album art: ${albumArtId}`); + await processAndStoreAlbumArt(imageUrl, albumArtId, 800); + console.log(`✅ Successfully processed album art: ${albumArtId}`); + return true; + + } catch (error) { + console.error(`❌ Error processing album art ${albumArtId}:`, error); + return false; + } +} + +/** + * Update artist document with new song IDs and metadata + * @param {string} artistUrlKey - Artist document ID (URL slug) + * @param {string[]} newSongIds - Array of new song IDs to add + * @param {Object} metadata - Additional metadata to update + */ +async function updateArtistSongList(artistUrlKey, newSongIds, metadata) { + console.log(`Updating artist ${artistUrlKey} with ${newSongIds.length} new songs`); + + try { + const artistRef = db.collection('artists').doc(artistUrlKey); + + // First, get the current artist document to check existing songIds + const 
artistDoc = await artistRef.get(); + if (!artistDoc.exists) { + throw new Error(`Artist document not found: ${artistUrlKey}`); + } + + const artistData = artistDoc.data(); + const existingSongIds = artistData.songIds || []; + + // Filter out song IDs that are already in the artist's list + const trulyNewSongIds = newSongIds.filter(id => !existingSongIds.includes(id)); + + if (trulyNewSongIds.length === 0) { + console.log('No new song IDs to add to artist document'); + return; + } + + const updateData = { + songIds: FieldValue.arrayUnion(...trulyNewSongIds), + songsFetched: metadata.songsFetched, + totalSongs: metadata.totalSongs, + songsLastUpdated: new Date(), + isFullyCached: metadata.isFullyCached || false, + cacheVersion: 1 + }; + + await artistRef.update(updateData); + console.log(`Successfully updated artist ${artistUrlKey} with ${trulyNewSongIds.length} new song IDs`); + + } catch (error) { + console.error(`Error updating artist song list for ${artistUrlKey}:`, error); + throw error; + } +} + +/** + * Core logic for populating artist songs (without Firebase Functions wrapper) + * @param {string} artistUrlKey - Artist document ID + * @returns {Promise} Result object + */ +async function populateArtistSongsCore(artistUrlKey, { onlyFirstPage = false } = {}) { + console.log(`Starting song population for artist: ${artistUrlKey}`); + + // Get artist document from Firestore + const artistDoc = await db.collection('artists').doc(artistUrlKey).get(); + if (!artistDoc.exists) { + throw new Error('Artist not found'); + } + + const artistData = artistDoc.data(); + console.log(`Found artist: ${artistData.name} (Genius ID: ${artistData.geniusId})`); + + const artistId = artistData.geniusId; + if (!artistId) { + throw new Error('Artist does not have a Genius ID'); + } + + // Check if refresh is needed + if (!needsRefresh(artistData.songsLastUpdated, artistData.isFullyCached)) { + console.log('Artist songs are up to date, no refresh needed'); + + // Even if songs are up to 
date, check if we need to extract image URL + // Only attempt if imageUrl is undefined (never attempted), not null (already attempted) + if (artistData.imageUrl === undefined && artistData.songIds && artistData.songIds.length > 0) { + console.log('Songs up to date but missing image URL - attempting extraction from existing songs...'); + + try { + // Try to extract image URL from first few existing songs + const firstSongIds = artistData.songIds.slice(0, 5); // Check first 5 songs + let foundImageUrl = null; + + for (const songId of firstSongIds) { + const songDoc = await db.collection('songs').doc(songId).get(); + if (songDoc.exists) { + const songData = songDoc.data(); + if (songData.primaryArtist && songData.primaryArtist.id === artistId) { + // This is a bit tricky since we don't have the full API response here + // We need to make a small API call to get the artist image + console.log('Found matching song, need to fetch from API for image URL...'); + + // Make a quick API call to get the artist details + const geniusApiKey = await getGeniusApiKey(); + const headers = { "Authorization": `Bearer ${geniusApiKey}` }; + + const artistResponse = await fetchWithTimeout( + `https://api.genius.com/artists/${artistId}`, + { headers } + ); + + if (artistResponse.ok) { + const artistApiData = await artistResponse.json(); + if (artistApiData.response && artistApiData.response.artist) { + foundImageUrl = artistApiData.response.artist.image_url; + break; + } + } + } + } + } + + // If no image URL found through songs, try direct artist API call as fallback + if (!foundImageUrl) { + console.log('No matching songs found, trying direct artist API call...'); + try { + const geniusApiKey = await getGeniusApiKey(); + const headers = { "Authorization": `Bearer ${geniusApiKey}` }; + + const artistResponse = await fetchWithTimeout( + `https://api.genius.com/artists/${artistId}`, + { headers } + ); + + if (artistResponse.ok) { + const artistApiData = await artistResponse.json(); + if 
(artistApiData.response && artistApiData.response.artist) { + foundImageUrl = artistApiData.response.artist.image_url; + console.log(`Found image URL via direct artist API call: ${foundImageUrl}`); + } + } + } catch (directApiError) { + console.error('Error in direct artist API call:', directApiError); + } + } + + // Process and store image if found, otherwise store null + if (foundImageUrl) { + try { + console.log(`🖼️ Found artist image via API, processing to binary format...`); + await processAndStoreArtistImage(foundImageUrl, artistUrlKey, 200); + console.log(`✅ Successfully processed and stored artist image binary data`); + } catch (imageUpdateError) { + console.error('Error processing/storing artist image:', imageUpdateError); + // Fallback: store just the URL if binary processing fails + await db.collection('artists').doc(artistUrlKey).update( { + imageUrl: foundImageUrl + }); + console.log(`⚠️ Stored URL only due to processing error: ${foundImageUrl}`); + } + } else { + await db.collection('artists').doc(artistUrlKey).update( { + imageUrl: null + }); + console.log('No image URL found, stored null'); + } + + } catch (imageError) { + console.error('Error extracting image URL for up-to-date artist:', imageError); + // Don't fail the entire operation if image extraction fails + } + } + + return { + success: true, + message: 'Songs already up to date', + totalSongs: (artistData.songIds || []).length, + isUpToDate: true + }; + } + + let page = 1; + let allSongIds = [...(artistData.songIds || [])]; + let totalFetched = 0; + const maxSongs = 1000; + let artistImageUrlExtracted = false; // Track if we've already extracted the image URL + + console.log(`Starting with ${allSongIds.length} existing songs`); + + // Fetch songs page by page + while (totalFetched < maxSongs) { + console.log(`Fetching page ${page}...`); + + const result = await getSongsByArtist(artistId, page); + + if (result.songs.length === 0) { + console.log('No more songs available'); + break; + } + + 
// Extract and store artist image URL (only once, from first few songs) + if (!artistImageUrlExtracted && artistData.imageUrl === undefined) { + console.log('Attempting to extract artist image URL from songs data...'); + + // Calculate how many songs we should check based on which page we're on + // We want to check up to 11 songs total across pages + const songsCheckedSoFar = (page - 1) * 50; + const maxSongsToCheckThisPage = Math.max(0, 11 - songsCheckedSoFar); + + if (maxSongsToCheckThisPage > 0) { + // Use raw songs data from API response for image URL extraction + const artistImageUrl = extractArtistImageUrl(result.rawSongs, artistId, maxSongsToCheckThisPage); + + if (artistImageUrl) { + try { + // Process image to binary format and store immediately + console.log(`🖼️ Found artist image, processing to binary format...`); + await processAndStoreArtistImage(artistImageUrl, artistUrlKey, 200); + console.log(`✅ Successfully processed and stored artist image binary data`); + artistImageUrlExtracted = true; + } catch (imageUpdateError) { + console.error('Error processing/storing artist image:', imageUpdateError); + // Fallback: store just the URL if binary processing fails + try { + await db.collection('artists').doc(artistUrlKey).update( { + imageUrl: artistImageUrl + }); + console.log(`⚠️ Stored URL only due to processing error: ${artistImageUrl}`); + artistImageUrlExtracted = true; + } catch (urlFallbackError) { + console.error('Error storing artist image URL fallback:', urlFallbackError); + } + // Don't fail the entire operation if image update fails + } + } + } + + // If we've checked 11 songs total and still haven't found an image URL, store null + const totalSongsChecked = Math.min(songsCheckedSoFar + result.songs.length, 11); + if (totalSongsChecked >= 11 && !artistImageUrlExtracted) { + try { + await db.collection('artists').doc(artistUrlKey).update( { + imageUrl: null + }); + console.log('No artist image URL found after checking 11 songs, stored null'); + 
} catch (imageUpdateError) { + console.error('Error updating artist image URL to null:', imageUpdateError); + } + artistImageUrlExtracted = true; + } + } + + // Store songs in Firestore + const storedSongIds = await storeSongsInFirestore(result.songs); + + // Add new song IDs to our list (filter out duplicates) + const newSongIds = storedSongIds.filter(id => !allSongIds.includes(id)); + allSongIds.push(...newSongIds); + + totalFetched += result.songs.length; + + // Update artist document with progress + await updateArtistSongList(artistUrlKey, storedSongIds, { + songsFetched: allSongIds.length, + totalSongs: allSongIds.length, + isFullyCached: !result.hasMore + }); + + console.log(`Page ${page} complete: ${newSongIds.length} new songs, ${allSongIds.length} total`); + + if (page === 1) { + console.log(`✅ First 50 songs cached for ${artistUrlKey}. Triggering queue build on client...`); + } + + // Early exit after first page if requested + if (onlyFirstPage) { + console.log(`⏹️ Early return after first page for ${artistUrlKey}`); + break; + } + + // Break if no more pages + if (!result.hasMore) { + console.log('Reached end of songs for artist'); + break; + } + + page++; + + // Small delay to be respectful to the API + await new Promise(resolve => setTimeout(resolve, 200)); + } + + console.log(`Completed song population for ${artistUrlKey}: ${allSongIds.length} total songs`); + + return { + success: true, + totalSongs: allSongIds.length, + newSongs: totalFetched, + isFullyCached: totalFetched < maxSongs, + pagesProcessed: page - 1 + }; +} + +/** + * Orchestrates fetching all songs for an artist (up to 1000 songs) + * Called when client finds empty songIds array or when refresh is needed + */ +export const populateArtistSongs = onCall({ + timeoutSeconds: 300, // 5 minutes for large artists + minInstances: 0, + maxInstances: 10, + region: 'us-central1' +}, async (request, context) => { + const { artistUrlKey, onlyFirstPage = false } = request.data; + + if (!artistUrlKey) 
{ + throw new HttpsError('invalid-argument', 'Artist URL key is required'); + } + + try { + return await populateArtistSongsCore(artistUrlKey, { onlyFirstPage }); + } catch (error) { + console.error(`Error populating songs for artist ${artistUrlKey}:`, error); + throw new HttpsError('internal', `Failed to populate artist songs: ${error.message}`); + } +}); + +/** + * Core logic for scraping song lyrics (without Firebase Functions wrapper) + * @param {string[]} songIds - Array of song IDs to scrape + * @param {string} artistUrlKey - Artist document ID + * @returns {Promise} Result object + */ +async function scrapeSongLyricsCore(songIds, artistUrlKey) { + console.log(`Starting lyrics scraping for ${songIds.length} songs`); + + const results = { + successful: [], + failed: [], + skipped: [] + }; + + for (const songId of songIds) { + try { + console.log(`Scraping lyrics for song ${songId}`); + + // Get song document from Firestore + const songDoc = await db.collection('songs').doc(songId).get(); + if (!songDoc.exists) { + results.failed.push({ songId, error: 'Song not found' }); + continue; + } + + const songData = songDoc.data(); + + // Skip if already has lyrics or failed permanently + if (songData.lyrics || songData.scrapingStatus === 'failed') { + console.log(`Skipping song ${songId}: already processed`); + results.skipped.push(songId); + continue; + } + + // Check retry limit + if (songData.scrapingAttempts >= 2) { + console.log(`Skipping song ${songId}: max retries exceeded`); + results.failed.push({ songId, error: 'Max retries exceeded' }); + continue; + } + + // Update status to 'scraping' + await db.collection('songs').doc(songId).update( { + scrapingStatus: 'scraping', + scrapingAttempts: (songData.scrapingAttempts || 0) + 1 + }); + + // Use existing lyrics scraping logic + const lyrics = await scrapeLyricsFromUrl(songData.url); + + if (lyrics && lyrics.trim().length > 0) { + // Process album art now that we know this song will be used + if 
(songData.albumArtId && songData.songArtImageUrl) { + try { + console.log(`🎨 Processing album art for scraped song: ${songData.albumArtId}`); + await checkAndProcessAlbumArt(songData.songArtImageUrl, songData.albumArtId); + console.log(`✅ Album art processed for song ${songId}`); + } catch (albumArtError) { + console.warn(`⚠️ Album art processing failed for song ${songId}: ${albumArtError.message}`); + // Don't fail lyric scraping if album art processing fails + } + } + + // Update song document with lyrics + await db.collection('songs').doc(songId).update( { + lyrics: lyrics, + lyricsScrapedAt: new Date(), + scrapingStatus: 'completed', + scrapingError: null + }); + + results.successful.push(songId); + console.log(`Successfully scraped lyrics for song ${songId}: ${songData.title}`); + + // Update artist cachedSongIds immediately for real-time access + await db.collection('artists').doc(artistUrlKey).update( { + cachedSongIds: FieldValue.arrayUnion(songId), + lyricsScraped: FieldValue.increment(1) + }); + } else { + throw new Error('No lyrics found or empty lyrics'); + } + + } catch (error) { + console.error(`Error scraping song ${songId}:`, error); + + // Update song document with error + await db.collection('songs').doc(songId).update( { + scrapingStatus: 'failed', + scrapingError: error.message + }); + + results.failed.push({ songId, error: error.message }); + } + + // Small delay between songs + await new Promise(resolve => setTimeout(resolve, 300)); + } + + // Artist cachedSongIds are now updated individually after each successful scrape + + console.log(`Lyrics scraping completed: ${results.successful.length} successful, ${results.failed.length} failed, ${results.skipped.length} skipped`); + + return { + success: true, + results: results, + scrapedCount: results.successful.length + }; +} + +/** + * Scrape lyrics for specified songs using existing scraping logic + * Batch operation for efficiency with retry logic + */ +export const scrapeSongLyrics = onCall({ + 
timeoutSeconds: 300, // 5 minutes for batch scraping + minInstances: 0, + maxInstances: 20, + region: 'us-central1' +}, async (request, context) => { + const { songIds, artistUrlKey } = request.data; + + if (!songIds || !Array.isArray(songIds) || songIds.length === 0) { + throw new HttpsError('invalid-argument', 'Song IDs array is required'); + } + + if (!artistUrlKey) { + throw new HttpsError('invalid-argument', 'Artist URL key is required'); + } + + try { + return await scrapeSongLyricsCore(songIds, artistUrlKey); + } catch (error) { + console.error('Error in scrapeSongLyrics:', error); + throw new HttpsError('internal', `Failed to scrape lyrics: ${error.message}`); + } +}); + +// Note: This function scrapes the complete lyrics for each song + +/** + * Scrape only the actual lyrics from a Genius song URL + * This function extracts ONLY the actual song lyrics, avoiding annotations, + * descriptions, and other non-lyrical content from the start. + * + * @param {string} songUrl - The Genius song URL + * @returns {Promise} The complete extracted lyrics + */ +async function scrapeLyricsFromUrl(songUrl) { + try { + console.log(`Attempting to scrape lyrics from: ${songUrl}`); + + // Validate URL format + if (!songUrl || typeof songUrl !== 'string') { + throw new Error(`Invalid song URL: ${songUrl}`); + } + + if (!songUrl.includes('genius.com')) { + throw new Error(`URL does not appear to be a Genius URL: ${songUrl}`); + } + + // Fetch the song page with proper error handling + const songPageResponse = await fetchWithTimeout(songUrl, { + headers: { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + } + }); + + if (!songPageResponse.ok) { + throw new Error(`HTTP ${songPageResponse.status}: ${songPageResponse.statusText} for URL: ${songUrl}`); + } + + const songPageHtml = await songPageResponse.text(); + + if (!songPageHtml || songPageHtml.length < 100) { + throw new Error('Received 
empty or invalid HTML response'); + } + + // Parse the page with cheerio + const $ = cheerio.load(songPageHtml); + + // Target ALL lyrics containers - Genius often splits lyrics across multiple divs + const lyricsContainers = $('div[data-lyrics-container="true"]'); + + if (lyricsContainers.length === 0) { + throw new Error('No lyrics containers found'); + } + + console.log(`Found ${lyricsContainers.length} lyrics container(s)`); + + let allLyricsText = ''; + + // Process each lyrics container + lyricsContainers.each((index, container) => { + const $container = $(container); + + // Remove elements that should be excluded from lyrics + $container.find('[data-exclude-from-selection="true"]').remove(); + + // Remove headers, footers, and annotation elements + $container.find('.LyricsHeader__Container, .LyricsFooter__Container').remove(); + $container.find('a[href*="/annotations/"]').remove(); + + // Get the raw text content, preserving line breaks + let containerText = $container.html() || ''; + + // Convert HTML to clean text + containerText = containerText + // Convert
tags to newlines + .replace(//gi, '\n') + // Remove all HTML tags completely (including , section headers, etc.) + .replace(/<[^>]*>/gi, '') + // Decode HTML entities + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, ' ') + // Clean up whitespace + .split('\n') + .map(line => line.trim()) + .filter(line => { + // Filter out section headers and empty lines + if (!line) return false; + if (line.match(/^\[.*\]$/)) return false; // Remove [Intro], [Verse], etc. + if (line.match(/^(Intro|Verse|Chorus|Bridge|Outro|Pre-Chorus|Post-Chorus|Hook|Refrain)(\s|\d|$)/i)) return false; + return true; + }) + .join('\n'); + + if (containerText.trim()) { + if (allLyricsText) allLyricsText += '\n\n'; + allLyricsText += containerText.trim(); + } + }); + + // Final cleanup + let lyrics = allLyricsText + // Remove multiple consecutive newlines + .replace(/\n{3,}/g, '\n\n') + // Remove any remaining section markers that might have slipped through + .replace(/^\[.*\]$/gm, '') + // Clean up any remaining whitespace issues + .split('\n') + .map(line => line.trim()) + .filter(line => line.length > 0) + .join('\n') + .trim(); + + if (!lyrics || lyrics.length < 10) { + throw new Error('Extracted lyrics are too short or empty'); + } + + console.log(`Successfully scraped ${lyrics.length} characters of clean lyrics`); + return lyrics; + + } catch (error) { + console.error(`Error scraping lyrics from ${songUrl}:`, error); + // Include more context in the error for debugging + throw new Error(`Failed to scrape lyrics from ${songUrl}: ${error.message}`); + } +} + +//TODO: Push songs to db when they are scraped instead of waiting for all songs to be scraped +/** + * Core logic for loading songs around a specific position (without Firebase Functions wrapper) + * @param {string} songId - The song ID to start from + * @param {boolean} shouldReverse - Whether to load previous songs + * @param {string} artistUrlKey - Artist 
document ID + * @returns {Promise} Result object + */ +async function loadStartingFromIdCore(songId, shouldReverse = false, artistUrlKey, rangeSize = 10) { + console.log(`Loading songs starting from ${songId} for artist ${artistUrlKey}, reverse: ${shouldReverse}, rangeSize: ${rangeSize}`); + + // Get artist document to find songIds array + const artistDoc = await db.collection('artists').doc(artistUrlKey).get(); + if (!artistDoc.exists) { + throw new Error('Artist not found'); + } + + const artistData = artistDoc.data(); + const songIds = artistData.songIds || []; + const cachedSongIds = artistData.cachedSongIds || []; + + console.log(`Artist has ${songIds.length} total songs, ${cachedSongIds.length} cached`); + + // Find position of songId in the songIds array + const currentPosition = songIds.indexOf(songId.toString()); + if (currentPosition === -1) { + throw new Error('Song not found in artist song list'); + } + + console.log(`Found song at position ${currentPosition}`); + + // Determine target range (configurable number of songs in the specified direction) + const windowSize = Math.max(1, Number(rangeSize) || 10); + let startPos, endPos; + if (shouldReverse) { + // Load the current song and up to (windowSize-1) previous songs + startPos = Math.max(0, currentPosition - (windowSize - 1)); + endPos = currentPosition + 1; // end is non-inclusive, so +1 to include current + } else { + // Load the current song and up to (windowSize-1) next songs + startPos = currentPosition; + endPos = Math.min(songIds.length, currentPosition + windowSize); + } + + const targetSongIds = songIds.slice(startPos, endPos); + + // Filter out songs that already have lyrics cached + const songsNeedingLyrics = targetSongIds.filter(id => !cachedSongIds.includes(id)); + + console.log(`Found ${songsNeedingLyrics.length} songs needing lyrics out of ${targetSongIds.length} target songs`); + + // Scrape missing lyrics if any + let scrapingResults = null; + if (songsNeedingLyrics.length > 0) { + 
try { + console.log(`Attempting to scrape lyrics for ${songsNeedingLyrics.length} songs`); + + // Call core scraping function directly + const scrapingResponse = await scrapeSongLyricsCore(songsNeedingLyrics, artistUrlKey); + scrapingResults = scrapingResponse.results; + + console.log(`Scraping completed: ${scrapingResults.successful.length} successful, ${scrapingResults.failed.length} failed`); + + } catch (scrapingError) { + console.error('Error during lyrics scraping:', scrapingError); + // Don't fail the entire function if scraping fails + // Just log the error and continue with what we have + scrapingResults = { + successful: [], + failed: songsNeedingLyrics.map(id => ({ songId: id, error: scrapingError.message })), + skipped: [] + }; + } + } else { + console.log('All target songs already have cached lyrics'); + } + + // Fetch and return the loaded songs + const loadedSongs = {}; + let songsLoadedCount = 0; + + for (const songId of targetSongIds) { + try { + const songDoc = await db.collection('songs').doc(songId).get(); + if (songDoc.exists) { + loadedSongs[songId] = { id: songId, ...songDoc.data() }; + songsLoadedCount++; + } else { + console.warn(`Song document not found for ID: ${songId}`); + } + } catch (songError) { + console.error(`Error loading song ${songId}:`, songError); + // Continue with other songs even if one fails + } + } + + console.log(`Successfully loaded ${songsLoadedCount} songs`); + + return { + success: true, + queuePosition: currentPosition, + loadedSongs: loadedSongs, + scrapingResults: scrapingResults, + targetRange: { start: startPos, end: endPos }, + songsScraped: scrapingResults ? 
scrapingResults.successful.length : 0, + songsLoaded: songsLoadedCount, + totalTargetSongs: targetSongIds.length + }; +} + +/** + * Intelligently loads songs around a specific position in the queue + * Handles forward/backward navigation efficiently + */ +export const loadStartingFromId = onCall({ + timeoutSeconds: 120, + minInstances: 0, + maxInstances: 20, + region: 'us-central1' +}, async (request, context) => { + const { songId, shouldReverse = false, artistUrlKey, rangeSize = 10 } = request.data; + + if (!songId || !artistUrlKey) { + throw new HttpsError('invalid-argument', 'Song ID and artist URL key are required'); + } + + try { + return await loadStartingFromIdCore(songId, shouldReverse, artistUrlKey, rangeSize); + } catch (error) { + console.error(`Error in loadStartingFromId for song ${songId}:`, error); + + // Provide more detailed error information + if (error instanceof HttpsError) { + throw error; // Re-throw HttpsError as-is + } else { + throw new HttpsError('internal', `Failed to load songs: ${error.message}`, { + songId, + artistUrlKey, + shouldReverse, + originalError: error.message + }); + } + } +}); + +// ======================================== +// LEGACY FUNCTIONS (Keep for backward compatibility during transition) +// ======================================== + +/** + * Legacy function - keep for backward compatibility + * TODO: Gradually migrate clients to use new system + */ +export const initialArtistSearch = onCall({ + timeoutSeconds: 60, + minInstances: 0, + maxInstances: 100, + region: 'us-central1' +}, async (request, context) => { + // TODO: Implement legacy compatibility or redirect to new system + console.log('Legacy initialArtistSearch called - consider migrating to new system'); + throw new HttpsError('unimplemented', 'This function is being migrated to the new caching system'); +}); + +/** + * Legacy function - keep for backward compatibility + * TODO: Gradually migrate clients to use new system + */ +export const searchByArtistId 
= onCall({ + timeoutSeconds: 60, + minInstances: 0, + maxInstances: 100, + region: 'us-central1' +}, async (request, context) => { + // TODO: Implement legacy compatibility or redirect to new system + console.log('Legacy searchByArtistId called - consider migrating to new system'); + throw new HttpsError('unimplemented', 'This function is being migrated to the new caching system'); +}); + +// ======================================== +// UTILITY FUNCTIONS FOR TESTING +// ======================================== + +/** + * Test function to validate the new caching system + */ +export const testCacheSystem = onCall({ + timeoutSeconds: 120, + region: 'us-central1' +}, async (request, context) => { + const { artistUrlKey, testType = 'populate' } = request.data; + + if (!artistUrlKey) { + throw new HttpsError('invalid-argument', 'Artist URL key is required for testing'); + } + + // Initialize results at the beginning to avoid reference errors + const results = { + testType: testType, + artistUrlKey: artistUrlKey, + timestamp: new Date(), + steps: [] + }; + + try { + console.log(`Running cache system test for artist: ${artistUrlKey}`); + + // Test song population + if (testType === 'full' || testType === 'populate') { + console.log('Testing song population...'); + try { + const populateResult = await populateArtistSongsCore(artistUrlKey); + results.steps.push({ + step: 'populate', + success: populateResult.success, + data: populateResult + }); + } catch (error) { + console.error('Error in populate step:', error); + results.steps.push({ + step: 'populate', + success: false, + error: error.message + }); + } + } + + // Test lyrics scraping + if (testType === 'full' || testType === 'scrape') { + console.log('Testing lyrics scraping...'); + + try { + // Get first few song IDs from artist for testing + const artistDoc = await db.collection('artists').doc(artistUrlKey).get(); + if (artistDoc.exists) { + const artistData = artistDoc.data(); + const testSongIds = 
(artistData.songIds || []).slice(0, 2); // Test with first 2 songs + + if (testSongIds.length > 0) { + const scrapeResult = await scrapeSongLyricsCore(testSongIds, artistUrlKey); + results.steps.push({ + step: 'scrape', + success: scrapeResult.success, + data: scrapeResult + }); + } else { + results.steps.push({ + step: 'scrape', + success: false, + error: 'No songs available for testing' + }); + } + } else { + results.steps.push({ + step: 'scrape', + success: false, + error: 'Artist not found' + }); + } + } catch (error) { + console.error('Error in scrape step:', error); + results.steps.push({ + step: 'scrape', + success: false, + error: error.message + }); + } + } + + // Test smart loading + if (testType === 'full' || testType === 'load') { + console.log('Testing smart loading...'); + + try { + const artistDoc = await db.collection('artists').doc(artistUrlKey).get(); + if (artistDoc.exists) { + const artistData = artistDoc.data(); + const songIds = artistData.songIds || []; + + if (songIds.length > 5) { + const testSongId = songIds[2]; // Test with 3rd song + const loadResult = await loadStartingFromIdCore(testSongId, false, artistUrlKey); + results.steps.push({ + step: 'load', + success: loadResult.success, + data: loadResult + }); + } else { + results.steps.push({ + step: 'load', + success: false, + error: 'Not enough songs for testing smart loading' + }); + } + } else { + results.steps.push({ + step: 'load', + success: false, + error: 'Artist not found' + }); + } + } catch (error) { + console.error('Error in load step:', error); + results.steps.push({ + step: 'load', + success: false, + error: error.message + }); + } + } + + // Determine overall success + const allStepsSuccessful = results.steps.every(step => step.success); + + return { + success: allStepsSuccessful, + testResults: results, + summary: { + totalSteps: results.steps.length, + successfulSteps: results.steps.filter(step => step.success).length, + failedSteps: results.steps.filter(step => 
!step.success).length + } + }; + + } catch (error) { + console.error('Error in cache system test:', error); + + // Add the error as a failed step + results.steps.push({ + step: 'error', + success: false, + error: error.message + }); + + return { + success: false, + error: error.message, + testResults: results, + summary: { + totalSteps: results.steps.length, + successfulSteps: results.steps.filter(step => step.success).length, + failedSteps: results.steps.filter(step => !step.success).length + } + }; + } +}); + +/** + * Helper function to get artist information for testing + */ +export const getArtistInfo = onCall({ + timeoutSeconds: 30, + region: 'us-central1' +}, async (request, context) => { + const { artistUrlKey } = request.data; + + if (!artistUrlKey) { + throw new HttpsError('invalid-argument', 'Artist URL key is required'); + } + + try { + const artistDoc = await db.collection('artists').doc(artistUrlKey).get(); + if (!artistDoc.exists) { + throw new HttpsError('not-found', 'Artist not found'); + } + + const artistData = artistDoc.data(); + + return { + success: true, + artist: { + name: artistData.name, + geniusId: artistData.geniusId, + urlKey: artistUrlKey, // Add the URL key to the response + totalSongs: (artistData.songIds || []).length, + cachedSongs: (artistData.cachedSongIds || []).length, + songIds: artistData.songIds || [], // Include songIds array + lastUpdated: artistData.songsLastUpdated, + isFullyCached: artistData.isFullyCached || false + } + }; + + } catch (error) { + console.error(`Error getting artist info for ${artistUrlKey}:`, error); + throw new HttpsError('internal', `Failed to get artist info: ${error.message}`); + } +}); + +/** + * Diagnostic function to inspect song data and URLs + */ +export const diagnoseSongData = onCall({ + timeoutSeconds: 60, + region: 'us-central1' +}, async (request, context) => { + const { artistUrlKey, songId } = request.data; + + if (!artistUrlKey) { + throw new HttpsError('invalid-argument', 'Artist URL 
key is required'); + } + + try { + const results = { + timestamp: new Date(), + artistUrlKey: artistUrlKey, + diagnostics: {} + }; + + // Get artist info + const artistDoc = await db.collection('artists').doc(artistUrlKey).get(); + if (!artistDoc.exists) { + throw new HttpsError('not-found', 'Artist not found'); + } + + const artistData = artistDoc.data(); + results.diagnostics.artist = { + name: artistData.name, + totalSongs: (artistData.songIds || []).length, + cachedSongs: (artistData.cachedSongIds || []).length, + firstFewSongIds: (artistData.songIds || []).slice(0, 5) + }; + + // If specific song ID provided, examine it + if (songId) { + const songDoc = await db.collection('songs').doc(songId).get(); + if (songDoc.exists) { + const songData = songDoc.data(); + results.diagnostics.specificSong = { + id: songId, + title: songData.title, + url: songData.url, + urlValid: songData.url && songData.url.includes('genius.com'), + hasLyrics: !!songData.lyrics, + scrapingStatus: songData.scrapingStatus, + scrapingAttempts: songData.scrapingAttempts, + scrapingError: songData.scrapingError + }; + } else { + results.diagnostics.specificSong = { + id: songId, + exists: false + }; + } + } else { + // Examine first few songs + const songIds = (artistData.songIds || []).slice(0, 3); + results.diagnostics.sampleSongs = []; + + for (const id of songIds) { + const songDoc = await db.collection('songs').doc(id).get(); + if (songDoc.exists) { + const songData = songDoc.data(); + results.diagnostics.sampleSongs.push({ + id: id, + title: songData.title, + url: songData.url, + urlValid: songData.url && songData.url.includes('genius.com'), + hasLyrics: !!songData.lyrics, + scrapingStatus: songData.scrapingStatus + }); + } else { + results.diagnostics.sampleSongs.push({ + id: id, + exists: false + }); + } + } + } + + return { + success: true, + diagnostics: results + }; + + } catch (error) { + console.error(`Error in diagnoseSongData:`, error); + throw new HttpsError('internal', `Failed 
to diagnose song data: ${error.message}`); + } +}); + +/** + * Test lyrics scraping for a specific song URL + */ +export const testLyricsScraping = onCall({ + timeoutSeconds: 60, + region: 'us-central1' +}, async (request, context) => { + const { songUrl, songId } = request.data; + + if (!songUrl && !songId) { + throw new HttpsError('invalid-argument', 'Either songUrl or songId is required'); + } + + try { + let testUrl = songUrl; + + // If songId provided, get URL from database + if (songId && !songUrl) { + const songDoc = await db.collection('songs').doc(songId).get(); + if (!songDoc.exists) { + throw new HttpsError('not-found', 'Song not found'); + } + testUrl = songDoc.data().url; + } + + console.log(`Testing lyrics scraping for URL: ${testUrl}`); + + const startTime = Date.now(); + const lyrics = await scrapeLyricsFromUrl(testUrl); + const duration = Date.now() - startTime; + + return { + success: true, + url: testUrl, + lyrics: lyrics, + lyricsLength: lyrics.length, + lyricsLines: lyrics.split('\n').length, + scrapingDuration: duration + }; + + } catch (error) { + console.error(`Error testing lyrics scraping:`, error); + return { + success: false, + url: songUrl || 'unknown', + error: error.message, + errorType: error.constructor.name + }; + } +}); diff --git a/functions/package-lock.json b/functions/package-lock.json index 43ff98c..afe181b 100644 --- a/functions/package-lock.json +++ b/functions/package-lock.json @@ -716,11 +716,13 @@ } }, "node_modules/@emnapi/runtime": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.8.1.tgz", - "integrity": "sha512-mehfKSMWjjNol8659Z8KxEMrdSJDDot5SXMq00dM8BN4o+CLNXQ0xH2V7EchNHV4RmbZLmmPdEaXZc5H2FXmDg==", + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.5.0.tgz", + "integrity": "sha512-97/BJ3iXHww3djw6hYIfErCZFee7qCtrneuLa20UXFCOTCfBM2cvQHjWJ2EG0s0MtdNwInarqCTz35i4wWXHsQ==", + "dev": true, "license": "MIT", "optional": true, + "peer": 
true, "dependencies": { "tslib": "^2.4.0" } @@ -1116,471 +1118,6 @@ "node": ">=6" } }, - "node_modules/@img/colour": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.0.0.tgz", - "integrity": "sha512-A5P/LfWGFSl6nsckYtjw9da+19jB8hkJ6ACTGcDfEJ0aE+l2n2El7dsVM7UVHZQ9s2lmYMWlrS21YLy2IR1LUw==", - "license": "MIT", - "engines": { - "node": ">=18" - } - }, - "node_modules/@img/sharp-darwin-arm64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz", - "integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==", - "cpu": [ - "arm64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-darwin-arm64": "1.2.4" - } - }, - "node_modules/@img/sharp-darwin-x64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz", - "integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==", - "cpu": [ - "x64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-darwin-x64": "1.2.4" - } - }, - "node_modules/@img/sharp-libvips-darwin-arm64": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz", - "integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==", - "cpu": [ - "arm64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "darwin" - ], - "funding": { - "url": 
"https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-darwin-x64": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz", - "integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==", - "cpu": [ - "x64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "darwin" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linux-arm": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz", - "integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==", - "cpu": [ - "arm" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linux-arm64": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz", - "integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==", - "cpu": [ - "arm64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linux-ppc64": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.4.tgz", - "integrity": "sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==", - "cpu": [ - "ppc64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linux-riscv64": { - "version": 
"1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-riscv64/-/sharp-libvips-linux-riscv64-1.2.4.tgz", - "integrity": "sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==", - "cpu": [ - "riscv64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linux-s390x": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.4.tgz", - "integrity": "sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==", - "cpu": [ - "s390x" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linux-x64": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz", - "integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==", - "cpu": [ - "x64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linuxmusl-arm64": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz", - "integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==", - "cpu": [ - "arm64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-libvips-linuxmusl-x64": { - "version": "1.2.4", - "resolved": 
"https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz", - "integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==", - "cpu": [ - "x64" - ], - "license": "LGPL-3.0-or-later", - "optional": true, - "os": [ - "linux" - ], - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-linux-arm": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz", - "integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==", - "cpu": [ - "arm" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linux-arm": "1.2.4" - } - }, - "node_modules/@img/sharp-linux-arm64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz", - "integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==", - "cpu": [ - "arm64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linux-arm64": "1.2.4" - } - }, - "node_modules/@img/sharp-linux-ppc64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.5.tgz", - "integrity": "sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==", - "cpu": [ - "ppc64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - 
"funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linux-ppc64": "1.2.4" - } - }, - "node_modules/@img/sharp-linux-riscv64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-riscv64/-/sharp-linux-riscv64-0.34.5.tgz", - "integrity": "sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==", - "cpu": [ - "riscv64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linux-riscv64": "1.2.4" - } - }, - "node_modules/@img/sharp-linux-s390x": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.5.tgz", - "integrity": "sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==", - "cpu": [ - "s390x" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linux-s390x": "1.2.4" - } - }, - "node_modules/@img/sharp-linux-x64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz", - "integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==", - "cpu": [ - "x64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linux-x64": "1.2.4" - } - }, - "node_modules/@img/sharp-linuxmusl-arm64": { - "version": "0.34.5", - "resolved": 
"https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz", - "integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==", - "cpu": [ - "arm64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linuxmusl-arm64": "1.2.4" - } - }, - "node_modules/@img/sharp-linuxmusl-x64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz", - "integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==", - "cpu": [ - "x64" - ], - "license": "Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-libvips-linuxmusl-x64": "1.2.4" - } - }, - "node_modules/@img/sharp-wasm32": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.5.tgz", - "integrity": "sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==", - "cpu": [ - "wasm32" - ], - "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", - "optional": true, - "dependencies": { - "@emnapi/runtime": "^1.7.0" - }, - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-win32-arm64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz", - "integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==", - "cpu": [ - "arm64" - ], - "license": "Apache-2.0 
AND LGPL-3.0-or-later", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-win32-ia32": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.5.tgz", - "integrity": "sha512-FV9m/7NmeCmSHDD5j4+4pNI8Cp3aW+JvLoXcTUo0IqyjSfAZJ8dIUmijx1qaJsIiU+Hosw6xM5KijAWRJCSgNg==", - "cpu": [ - "ia32" - ], - "license": "Apache-2.0 AND LGPL-3.0-or-later", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - } - }, - "node_modules/@img/sharp-win32-x64": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz", - "integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==", - "cpu": [ - "x64" - ], - "license": "Apache-2.0 AND LGPL-3.0-or-later", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - } - }, "node_modules/@inquirer/checkbox": { "version": "4.2.2", "resolved": "https://registry.npmjs.org/@inquirer/checkbox/-/checkbox-4.2.2.tgz", @@ -6202,9 +5739,9 @@ } }, "node_modules/detect-libc": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", - "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==", + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.4.tgz", + "integrity": "sha512-3UDv+G9CsCKO1WKMGw9fwq/SWJYbI0c5Y7LU1AXYoDdbhE2AHQ6N6Nb34sG8Fj7T5APy8qXDCKuuIHd1BR0tVA==", "license": "Apache-2.0", "engines": { "node": ">=8" @@ -12398,9 +11935,9 @@ "license": "MIT" }, 
"node_modules/semver": { - "version": "7.7.3", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", - "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", + "version": "7.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", + "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", "license": "ISC", "bin": { "semver": "bin/semver.js" @@ -12510,50 +12047,6 @@ "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==", "license": "ISC" }, - "node_modules/sharp": { - "version": "0.34.5", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz", - "integrity": "sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==", - "hasInstallScript": true, - "license": "Apache-2.0", - "dependencies": { - "@img/colour": "^1.0.0", - "detect-libc": "^2.1.2", - "semver": "^7.7.3" - }, - "engines": { - "node": "^18.17.0 || ^20.3.0 || >=21.0.0" - }, - "funding": { - "url": "https://opencollective.com/libvips" - }, - "optionalDependencies": { - "@img/sharp-darwin-arm64": "0.34.5", - "@img/sharp-darwin-x64": "0.34.5", - "@img/sharp-libvips-darwin-arm64": "1.2.4", - "@img/sharp-libvips-darwin-x64": "1.2.4", - "@img/sharp-libvips-linux-arm": "1.2.4", - "@img/sharp-libvips-linux-arm64": "1.2.4", - "@img/sharp-libvips-linux-ppc64": "1.2.4", - "@img/sharp-libvips-linux-riscv64": "1.2.4", - "@img/sharp-libvips-linux-s390x": "1.2.4", - "@img/sharp-libvips-linux-x64": "1.2.4", - "@img/sharp-libvips-linuxmusl-arm64": "1.2.4", - "@img/sharp-libvips-linuxmusl-x64": "1.2.4", - "@img/sharp-linux-arm": "0.34.5", - "@img/sharp-linux-arm64": "0.34.5", - "@img/sharp-linux-ppc64": "0.34.5", - "@img/sharp-linux-riscv64": "0.34.5", - "@img/sharp-linux-s390x": "0.34.5", - "@img/sharp-linux-x64": "0.34.5", - "@img/sharp-linuxmusl-arm64": "0.34.5", - 
"@img/sharp-linuxmusl-x64": "0.34.5", - "@img/sharp-wasm32": "0.34.5", - "@img/sharp-win32-arm64": "0.34.5", - "@img/sharp-win32-ia32": "0.34.5", - "@img/sharp-win32-x64": "0.34.5" - } - }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", diff --git a/genius-scraper.js b/genius-scraper.js new file mode 100644 index 0000000..313446e --- /dev/null +++ b/genius-scraper.js @@ -0,0 +1,468 @@ +import axios from 'axios'; +import * as cheerio from 'cheerio'; +import fs from 'fs'; +import path from 'path'; + +class GeniusArtistScraper { + constructor() { + this.baseUrl = 'https://genius.com/artists-index/'; + this.results = { + popularArtists: [], + regularArtists: [] + }; + this.requestDelay = 500; // 500ms delay between requests to be respectful + } + + /** + * Extract artist ID from iOS app link on artist page + * @param {string} artistUrl - The URL of the artist page + * @returns {string|null} The artist ID or null if not found + */ + async extractArtistId(artistUrl) { + try { + // Add delay to be respectful + await new Promise(resolve => setTimeout(resolve, this.requestDelay)); + + const response = await axios.get(artistUrl, { + headers: { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + } + }); + + const $ = cheerio.load(response.data); + + // Look for iOS app link: + const iosAppLink = $('link[rel="alternate"][href*="ios-app://"]').attr('href'); + + if (iosAppLink) { + // Extract the ID from the end of the URL: ios-app://709482991/genius/artists/673285 + const match = iosAppLink.match(/\/artists\/(\d+)$/); + if (match) { + return match[1]; + } + } + + return null; + } catch (error) { + console.warn(`Failed to extract ID for ${artistUrl}:`, error.message); + return null; + } + } + + /** + * Add progress indicator for long-running operations + */ + logProgress(current, total, type) { + const 
percentage = ((current / total) * 100).toFixed(1); + const progressBar = '█'.repeat(Math.floor(percentage / 5)) + '░'.repeat(20 - Math.floor(percentage / 5)); + process.stdout.write(`\r${type}: [${progressBar}] ${percentage}% (${current}/${total})`); + if (current === total) console.log(); // New line when complete + } + + /** + * Scrape artist links from a specific letter page + * @param {string} letter - The letter to scrape (e.g., 'j', 'a', 'b') + * @param {boolean} includeIds - Whether to fetch artist IDs (slower) + * @returns {Object} Object containing popularArtists and regularArtists arrays + */ + async scrapeArtistsByLetter(letter, includeIds = true) { + try { + console.log(`Scraping artists for letter: ${letter.toUpperCase()}`); + + const url = `${this.baseUrl}${letter.toLowerCase()}`; + const response = await axios.get(url, { + headers: { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + } + }); + + const $ = cheerio.load(response.data); + + // Reset results for this letter + this.results = { + popularArtists: [], + regularArtists: [] + }; + + // Extract popular artists + console.log('Extracting popular artists...'); + $('li.artists_index_list-popular_artist').each((index, element) => { + const artistLink = $(element).find('a.artists_index_list-artist_name'); + const name = artistLink.text().trim(); + const url = artistLink.attr('href'); + + if (name && url) { + this.results.popularArtists.push({ + name: name, + url: url, + type: 'popular', + id: null // Will be populated later if includeIds is true + }); + } + }); + + // Extract regular artists + console.log('Extracting regular artists...'); + // Look for ul.artists_index_list that comes after popular artists + const regularArtistLists = $('ul.artists_index_list').not(':has(.artists_index_list-popular_artist)'); + + regularArtistLists.each((listIndex, listElement) => { + $(listElement).find('li').each((index, element) 
=> { + const artistLink = $(element).find('a').first(); + const name = artistLink.text().trim(); + const url = artistLink.attr('href'); + + // Only include links that point to artist pages + if (name && url && url.includes('/artists/')) { + this.results.regularArtists.push({ + name: name, + url: url, + type: 'regular', + id: null // Will be populated later if includeIds is true + }); + } + }); + }); + + console.log(`Found ${this.results.popularArtists.length} popular artists`); + console.log(`Found ${this.results.regularArtists.length} regular artists`); + + // Extract artist IDs if requested + if (includeIds) { + console.log('\n🔍 Extracting artist IDs from individual pages...'); + console.log('⚠️ This may take several minutes due to rate limiting'); + + // Process popular artists + if (this.results.popularArtists.length > 0) { + console.log('\nFetching popular artist IDs:'); + for (let i = 0; i < this.results.popularArtists.length; i++) { + const artist = this.results.popularArtists[i]; + this.logProgress(i + 1, this.results.popularArtists.length, 'Popular Artists'); + artist.id = await this.extractArtistId(artist.url); + } + } + + // Process regular artists + if (this.results.regularArtists.length > 0) { + console.log('\nFetching regular artist IDs:'); + for (let i = 0; i < this.results.regularArtists.length; i++) { + const artist = this.results.regularArtists[i]; + this.logProgress(i + 1, this.results.regularArtists.length, 'Regular Artists'); + artist.id = await this.extractArtistId(artist.url); + } + } + + // Count successful ID extractions + const popularWithIds = this.results.popularArtists.filter(a => a.id !== null).length; + const regularWithIds = this.results.regularArtists.filter(a => a.id !== null).length; + + console.log(`\n✅ Successfully extracted ${popularWithIds}/${this.results.popularArtists.length} popular artist IDs`); + console.log(`✅ Successfully extracted ${regularWithIds}/${this.results.regularArtists.length} regular artist IDs`); + } + + 
return this.results; + + } catch (error) { + console.error(`Error scraping letter ${letter}:`, error.message); + throw error; + } + } + + /** + * Save results to JSON file + * @param {string} letter - The letter that was scraped + * @param {Object} data - The scraped data + * @param {string} outputDir - Directory to save the file (optional) + */ + saveToFile(letter, data, outputDir = '.') { + const filename = `genius-artists-${letter.toLowerCase()}.json`; + const filepath = path.join(outputDir, filename); + const output = { + letter: letter.toUpperCase(), + timestamp: new Date().toISOString(), + totalArtists: data.popularArtists.length + data.regularArtists.length, + popularCount: data.popularArtists.length, + regularCount: data.regularArtists.length, + artists: { + popular: data.popularArtists, + regular: data.regularArtists + } + }; + + fs.writeFileSync(filepath, JSON.stringify(output, null, 2)); + console.log(`Results saved to ${filepath}`); + return filepath; + } + + /** + * Display summary of scraped data + */ + displaySummary(data) { + console.log('\n=== SCRAPING SUMMARY ==='); + console.log(`Popular Artists: ${data.popularArtists.length}`); + console.log(`Regular Artists: ${data.regularArtists.length}`); + console.log(`Total Artists: ${data.popularArtists.length + data.regularArtists.length}`); + + // Show ID extraction summary if IDs were fetched + const popularWithIds = data.popularArtists.filter(a => a.id !== null).length; + const regularWithIds = data.regularArtists.filter(a => a.id !== null).length; + const totalWithIds = popularWithIds + regularWithIds; + + if (data.popularArtists.length > 0 && data.popularArtists[0].id !== undefined) { + console.log(`\nArtist IDs Extracted: ${totalWithIds}/${data.popularArtists.length + data.regularArtists.length} (${((totalWithIds / (data.popularArtists.length + data.regularArtists.length)) * 100).toFixed(1)}%)`); + } + + if (data.popularArtists.length > 0) { + console.log('\nFirst 5 Popular Artists:'); + 
data.popularArtists.slice(0, 5).forEach((artist, index) => { + const idDisplay = artist.id ? ` (ID: ${artist.id})` : ''; + console.log(` ${index + 1}. ${artist.name}${idDisplay} - ${artist.url}`); + }); + } + + if (data.regularArtists.length > 0) { + console.log('\nFirst 5 Regular Artists:'); + data.regularArtists.slice(0, 5).forEach((artist, index) => { + const idDisplay = artist.id ? ` (ID: ${artist.id})` : ''; + console.log(` ${index + 1}. ${artist.name}${idDisplay} - ${artist.url}`); + }); + } + } + + /** + * Prepare data for Firebase Firestore (for future use) + * @param {string} letter - The letter that was scraped + * @param {Object} data - The scraped data + * @returns {Array} Array of artist documents ready for Firestore + */ + /** + * Create output directory with timestamp + * @returns {string} The created directory path + */ + createOutputDirectory() { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-').split('T')[0]; + const dirname = `genius-artists-${timestamp}`; + + if (!fs.existsSync(dirname)) { + fs.mkdirSync(dirname, { recursive: true }); + } + + return dirname; + } + + /** + * Scrape all letters (a-z) and save results to a folder + * @param {boolean} includeIds - Whether to fetch artist IDs + * @returns {Object} Summary of bulk scraping results + */ + async scrapeAllLetters(includeIds = true) { + console.log('🎵 Starting bulk scraping for all letters (A-Z)...\n'); + + const outputDir = this.createOutputDirectory(); + console.log(`📁 Results will be saved to: ${outputDir}/\n`); + + const letters = 'abcdefghijklmnopqrstuvwxyz'.split(''); + const results = { + successful: [], + failed: [], + totalArtists: 0, + totalPopular: 0, + totalRegular: 0, + totalWithIds: 0, + startTime: new Date(), + outputDirectory: outputDir + }; + + for (let i = 0; i < letters.length; i++) { + const letter = letters[i]; + const progress = `[${i + 1}/${letters.length}]`; + + try { + console.log(`\n${progress} 🔄 Processing letter: ${letter.toUpperCase()}`); + + 
const letterResults = await this.scrapeArtistsByLetter(letter, includeIds); + + // Save to file + const filepath = this.saveToFile(letter, letterResults, outputDir); + + // Update summary stats + const letterTotal = letterResults.popularArtists.length + letterResults.regularArtists.length; + const letterWithIds = includeIds ? + letterResults.popularArtists.filter(a => a.id !== null).length + + letterResults.regularArtists.filter(a => a.id !== null).length : 0; + + results.successful.push({ + letter: letter.toUpperCase(), + popular: letterResults.popularArtists.length, + regular: letterResults.regularArtists.length, + total: letterTotal, + withIds: letterWithIds, + filepath: filepath + }); + + results.totalArtists += letterTotal; + results.totalPopular += letterResults.popularArtists.length; + results.totalRegular += letterResults.regularArtists.length; + results.totalWithIds += letterWithIds; + + console.log(`✅ Letter ${letter.toUpperCase()}: ${letterTotal} artists processed`); + + } catch (error) { + console.error(`❌ Failed to process letter ${letter.toUpperCase()}:`, error.message); + results.failed.push({ + letter: letter.toUpperCase(), + error: error.message + }); + } + + // Small delay between letters to be extra respectful + if (i < letters.length - 1) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + results.endTime = new Date(); + results.duration = Math.round((results.endTime - results.startTime) / 1000 / 60); // minutes + + // Save summary file + const summaryPath = path.join(outputDir, 'scraping-summary.json'); + fs.writeFileSync(summaryPath, JSON.stringify(results, null, 2)); + + return results; + } + + /** + * Display bulk scraping summary + */ + displayBulkSummary(results) { + console.log('\n' + '='.repeat(60)); + console.log('🎯 BULK SCRAPING COMPLETE'); + console.log('='.repeat(60)); + + console.log(`📊 Overall Statistics:`); + console.log(` • Letters processed: ${results.successful.length}/${results.successful.length + 
results.failed.length}`); + console.log(` • Total artists: ${results.totalArtists.toLocaleString()}`); + console.log(` • Popular artists: ${results.totalPopular.toLocaleString()}`); + console.log(` • Regular artists: ${results.totalRegular.toLocaleString()}`); + + if (results.totalWithIds > 0) { + const idSuccessRate = ((results.totalWithIds / results.totalArtists) * 100).toFixed(1); + console.log(` • Artists with IDs: ${results.totalWithIds.toLocaleString()} (${idSuccessRate}%)`); + } + + console.log(` • Duration: ${results.duration} minutes`); + console.log(` • Output directory: ${results.outputDirectory}/`); + + if (results.failed.length > 0) { + console.log(`\n❌ Failed letters (${results.failed.length}):`); + results.failed.forEach(fail => { + console.log(` • ${fail.letter}: ${fail.error}`); + }); + } + + console.log(`\n✅ Results saved to: ${results.outputDirectory}/`); + console.log(`📄 Summary saved to: ${path.join(results.outputDirectory, 'scraping-summary.json')}`); + } + + prepareForFirestore(letter, data) { + const firestoreData = []; + + // Add popular artists + data.popularArtists.forEach(artist => { + firestoreData.push({ + name: artist.name, + url: artist.url, + id: artist.id, + type: 'popular', + letter: letter.toLowerCase(), + scrapedAt: new Date() + }); + }); + + // Add regular artists + data.regularArtists.forEach(artist => { + firestoreData.push({ + name: artist.name, + url: artist.url, + id: artist.id, + type: 'regular', + letter: letter.toLowerCase(), + scrapedAt: new Date() + }); + }); + + return firestoreData; + } +} + +// Main execution function +async function main() { + const scraper = new GeniusArtistScraper(); + + // Parse command line arguments + const args = process.argv.slice(2); + const firstArg = args[0] || 'j'; + const includeIds = !args.includes('--no-ids'); // Include IDs by default, unless --no-ids flag is passed + + try { + console.log('🎵 Genius Artist Scraper Starting...\n'); + + if (!includeIds) { + console.log('⚡ Fast mode: 
Skipping artist ID extraction\n'); + } + + // Check if bulk scraping all letters + if (firstArg.toLowerCase() === 'all') { + console.log('🌟 Bulk mode: Scraping all letters A-Z'); + + if (includeIds) { + console.log('⚠️ This will take several hours with ID extraction enabled'); + console.log('💡 Consider using --no-ids flag for much faster bulk scraping\n'); + } + + // Perform bulk scraping + const bulkResults = await scraper.scrapeAllLetters(includeIds); + + // Display bulk summary + scraper.displayBulkSummary(bulkResults); + + console.log('\n🎉 Bulk scraping completed successfully!'); + + } else { + // Single letter scraping (original functionality) + const letter = firstArg; + + // Scrape the specified letter + const results = await scraper.scrapeArtistsByLetter(letter, includeIds); + + // Display summary + scraper.displaySummary(results); + + // Save to file + const filename = scraper.saveToFile(letter, results); + + // Prepare for Firestore (just showing the structure for now) + const firestoreData = scraper.prepareForFirestore(letter, results); + console.log(`\nPrepared ${firestoreData.length} documents for Firestore`); + + console.log('\n✅ Scraping completed successfully!'); + console.log(`📁 Data saved to: ${filename}`); + + if (includeIds) { + console.log('\n💡 Tip: Use --no-ids flag for faster scraping without artist IDs'); + console.log('💡 Tip: Use "all" to scrape all letters A-Z at once'); + } + } + + } catch (error) { + console.error('❌ Scraping failed:', error.message); + process.exit(1); + } +} + +// Run the scraper if this file is executed directly +if (import.meta.url === `file://${process.argv[1]}`) { + main(); +} + +export default GeniusArtistScraper; \ No newline at end of file diff --git a/lookup-song.js b/lookup-song.js new file mode 100644 index 0000000..9d19fc5 --- /dev/null +++ b/lookup-song.js @@ -0,0 +1,112 @@ +/** + * Quick Song Lookup by ID + * Usage: node lookup-song.js [song-id2] [song-id3] ... 
+ */ + +import { initializeApp } from 'firebase/app'; +import { getFirestore, doc, getDoc } from 'firebase/firestore'; +import { firebaseConfig } from './src/lib/services/initFirebase.js'; + +// Initialize Firebase +const app = initializeApp(firebaseConfig); +const db = getFirestore(app); + +async function lookupSong(songId) { + try { + const songIdStr = songId.toString().trim(); + + console.log(`\n🔍 Looking up song ID: ${songIdStr}...`); + + const songRef = doc(db, 'songs', songIdStr); + const songSnap = await getDoc(songRef); + + if (!songSnap.exists()) { + console.log(`❌ Song not found in database\n`); + return; + } + + const songData = songSnap.data(); + + console.log(`\n${'='.repeat(80)}`); + console.log(`✅ SONG FOUND`); + console.log(`${'='.repeat(80)}\n`); + + console.log(`ID: ${songIdStr}`); + console.log(`Title: "${songData.title || 'Unknown'}"`); + console.log(`Artist: ${songData.artistNames || songData.primaryArtist?.name || 'Unknown'}`); + console.log(`URL: ${songData.url || 'N/A'}`); + + if (songData.albumName) { + console.log(`Album: ${songData.albumName}`); + } + + // Check lyrics + const hasLyrics = songData.lyrics && + songData.lyrics !== 'null' && + songData.lyrics !== null && + typeof songData.lyrics === 'string' && + songData.lyrics.trim().length > 0; + + console.log(`\n🎵 Lyrics:`); + if (hasLyrics) { + const lyricsLength = songData.lyrics.length; + const wordCount = songData.lyrics.split(/\s+/).length; + + console.log(` ✅ HAS LYRICS (${lyricsLength.toLocaleString()} chars, ~${wordCount} words)`); + + // Show first few lines + const lines = songData.lyrics.split('\n').filter(l => l.trim()).slice(0, 3); + console.log(` First lines:`); + lines.forEach(line => console.log(` ${line}`)); + if (songData.lyrics.split('\n').length > 3) { + console.log(` ...`); + } + } else { + console.log(` ❌ NO LYRICS`); + } + + console.log(`\n🔧 Status: ${songData.scrapingStatus || 'none'} (${songData.scrapingAttempts || 0} attempts)`); + + if (songData.scrapingError) 
{ + console.log(`⚠️ Error: ${songData.scrapingError}`); + } + + console.log(`\n${'='.repeat(80)}\n`); + + } catch (error) { + console.error(`❌ Error: ${error.message}\n`); + } +} + +async function main() { + const songIds = process.argv.slice(2); + + if (songIds.length === 0) { + console.log(`\n${'='.repeat(80)}`); + console.log(`🔍 FIRESTORE SONG LOOKUP`); + console.log(`${'='.repeat(80)}\n`); + console.log(`Usage: node lookup-song.js [song-id2] [song-id3] ...\n`); + console.log(`Examples:`); + console.log(` node lookup-song.js 10000344`); + console.log(` node lookup-song.js 10000344 9592352 7470207\n`); + process.exit(0); + } + + console.log(`\n🔍 Searching for ${songIds.length} song(s)...`); + + for (let i = 0; i < songIds.length; i++) { + await lookupSong(songIds[i]); + + if (i < songIds.length - 1) { + console.log(`${'─'.repeat(80)}\n`); + } + } + + console.log(`✅ Done!\n`); +} + +main().catch(error => { + console.error('❌ Fatal error:', error); + process.exit(1); +}); + diff --git a/lyrictypesk@0.0.1 b/lyrictypesk@0.0.1 new file mode 100644 index 0000000..e69de29 diff --git a/package-lock.json b/package-lock.json index f17fc9d..624af3c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,10 +9,7 @@ "version": "0.0.1", "license": "MIT", "dependencies": { - "axios": "^1.13.2", - "chalk": "^5.6.2", - "cheerio": "^1.1.2", - "cli-progress": "^3.12.0", + "cheerio": "^1.0.0-rc.12", "firebase": "^12.2.1", "firebase-admin": "^12.1.0", "js-cookie": "^3.0.5", @@ -4946,35 +4943,9 @@ "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "devOptional": true, "license": "MIT" }, - "node_modules/axios": { - "version": "1.13.2", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.13.2.tgz", - "integrity": "sha512-VPk9ebNqPcy5lRGuSlKx752IlDatOjT9paPlm8A7yOuW2Fbvp4X3JznJtT4f0GzGLLiWE9W8onz51SqLYwzGaA==", - "license": 
"MIT", - "dependencies": { - "follow-redirects": "^1.15.6", - "form-data": "^4.0.4", - "proxy-from-env": "^1.1.0" - } - }, - "node_modules/axios/node_modules/form-data": { - "version": "4.0.5", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", - "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", - "license": "MIT", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.8", - "es-set-tostringtag": "^2.1.0", - "hasown": "^2.0.2", - "mime-types": "^2.1.12" - }, - "engines": { - "node": ">= 6" - } - }, "node_modules/axobject-query": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-4.1.0.tgz", @@ -5403,6 +5374,7 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "devOptional": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0", @@ -5453,6 +5425,7 @@ "version": "5.6.2", "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.6.2.tgz", "integrity": "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA==", + "dev": true, "license": "MIT", "engines": { "node": "^12.17.0 || ^14.13 || >=16.0.0" @@ -5706,18 +5679,6 @@ "node": ">=10" } }, - "node_modules/cli-progress": { - "version": "3.12.0", - "resolved": "https://registry.npmjs.org/cli-progress/-/cli-progress-3.12.0.tgz", - "integrity": "sha512-tRkV3HJ1ASwm19THiiLIXLO7Im7wlTuKnvkYaTkyoAPefqjNg7W7DHKUlGRxy9vxDvbyCYQkQozvptuMkGCg8A==", - "license": "MIT", - "dependencies": { - "string-width": "^4.2.3" - }, - "engines": { - "node": ">=4" - } - }, "node_modules/cli-spinners": { "version": "2.9.2", "resolved": "https://registry.npmjs.org/cli-spinners/-/cli-spinners-2.9.2.tgz", @@ -5884,6 +5845,7 @@ "version": "1.0.8", "resolved": 
"https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "devOptional": true, "license": "MIT", "dependencies": { "delayed-stream": "~1.0.0" @@ -6482,6 +6444,7 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "devOptional": true, "license": "MIT", "engines": { "node": ">=0.4.0" @@ -6614,6 +6577,7 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "devOptional": true, "license": "MIT", "dependencies": { "call-bind-apply-helpers": "^1.0.1", @@ -6772,6 +6736,7 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "devOptional": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -6781,6 +6746,7 @@ "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "devOptional": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -6790,6 +6756,7 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "devOptional": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0" @@ -6802,6 +6769,7 @@ "version": "2.1.0", "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", "integrity": 
"sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "devOptional": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0", @@ -7776,26 +7744,6 @@ "dev": true, "license": "MIT" }, - "node_modules/follow-redirects": { - "version": "1.15.11", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz", - "integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==", - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], - "license": "MIT", - "engines": { - "node": ">=4.0" - }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } - } - }, "node_modules/foreground-child": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", @@ -7919,6 +7867,7 @@ "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "devOptional": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/ljharb" @@ -7999,6 +7948,7 @@ "version": "1.3.0", "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "devOptional": true, "license": "MIT", "dependencies": { "call-bind-apply-helpers": "^1.0.2", @@ -8023,6 +7973,7 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "devOptional": true, "license": "MIT", "dependencies": { "dunder-proto": "^1.0.1", @@ -8370,6 +8321,7 @@ "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", "integrity": 
"sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "devOptional": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -8413,6 +8365,7 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "devOptional": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -8425,6 +8378,7 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "devOptional": true, "license": "MIT", "dependencies": { "has-symbols": "^1.0.3" @@ -8450,6 +8404,7 @@ "version": "2.0.2", "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "devOptional": true, "license": "MIT", "dependencies": { "function-bind": "^1.1.2" @@ -9714,6 +9669,7 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "devOptional": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -9773,6 +9729,7 @@ "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "devOptional": true, "license": "MIT", "engines": { "node": ">= 0.6" @@ -9782,6 +9739,7 @@ "version": "2.1.35", "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "devOptional": true, "license": "MIT", "dependencies": 
{ "mime-db": "1.52.0" @@ -11197,6 +11155,7 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==", + "dev": true, "license": "MIT" }, "node_modules/pupa": { diff --git a/package.json b/package.json index a3f9dec..d2034b1 100644 --- a/package.json +++ b/package.json @@ -5,8 +5,6 @@ "scripts": { "build": "vite build", "dev": "vite dev", - "scrape": "npm run update:scrape && npm run update:compare && npm run update:prescrape && npm run update:upload-artists", - "upload": "npm run update:upload-songs && npm run update:update-popular", "dev:vite": "vite dev", "preview": "vite preview", "check": "svelte-kit sync && svelte-check --tsconfig ./jsconfig.json", @@ -43,10 +41,7 @@ "vite": "^5.0.3" }, "dependencies": { - "axios": "^1.13.2", - "chalk": "^5.6.2", - "cheerio": "^1.1.2", - "cli-progress": "^3.12.0", + "cheerio": "^1.0.0-rc.12", "firebase": "^12.2.1", "firebase-admin": "^12.1.0", "js-cookie": "^3.0.5", diff --git a/prescraper.js b/prescraper.js new file mode 100644 index 0000000..849fd82 --- /dev/null +++ b/prescraper.js @@ -0,0 +1,920 @@ +#!/usr/bin/env node + +import fs from 'fs/promises'; +import path from 'path'; +import { fileURLToPath } from 'url'; +import * as cheerio from 'cheerio'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +/** + * Prescraper Configuration + */ +const config = { + // Number of songs to scrape lyrics for per artist (configurable via CLI) + maxSongsToScrape: 10, + + // Which artists to process + artistFilters: { + letters: ['all'], // Specific letters like ['a', 'b'] or 'all' + types: ['popular', 'regular'], // Include popular, regular, or both + maxArtistsPerLetter: null, // Limit artists per letter (for testing) + skipExisting: true // Skip artists already in output files + }, + + // Rate limiting to be respectful to 
Genius + delays: { + betweenArtists: 1000, // 1 second + betweenSongs: 500, // 0.5 seconds + betweenPages: 200 // 0.2 seconds + }, + + // Output configuration + output: { + directory: `./prescraped-data-${new Date().toISOString().split('T')[0]}/`, + filePerLetter: true, // One file per letter vs one big file + resumable: true // Save progress and allow resuming + }, + + // API configuration + api: { + timeout: 10000, // 10 second timeout + maxRetries: 3, + userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' + } +}; + +/** + * Global state tracking + */ +const state = { + processed: { + artists: 0, + songs: 0, + lyrics: 0 + }, + errors: { + artists: 0, + songs: 0, + lyrics: 0 + }, + startTime: null +}; + +/** + * Utility function for delays + */ +const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + +/** + * Enhanced fetch with timeout and retries + */ +async function fetchWithTimeout(url, options = {}) { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), options.timeout || config.api.timeout); + + let lastError; + for (let attempt = 1; attempt <= config.api.maxRetries; attempt++) { + try { + const response = await fetch(url, { + ...options, + signal: controller.signal, + headers: { + 'User-Agent': config.api.userAgent, + ...options.headers + } + }); + + clearTimeout(timeoutId); + + if (!response.ok && response.status >= 500) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + return response; + } catch (error) { + lastError = error; + clearTimeout(timeoutId); + + if (error.name === 'AbortError') { + console.warn(`⏰ Request timeout (attempt ${attempt}/${config.api.maxRetries}): ${url}`); + } else { + console.warn(`🔄 Request failed (attempt ${attempt}/${config.api.maxRetries}): ${error.message}`); + } + + if (attempt < config.api.maxRetries) { + const delayMs = Math.pow(2, attempt) * 1000; // 
Exponential backoff + console.log(`⏳ Retrying in ${delayMs}ms...`); + await delay(delayMs); + } + } + } + + throw lastError || new Error(`Failed after ${config.api.maxRetries} attempts`); +} + +/** + * Load artist data from genius-artists JSON files + */ +async function loadArtistData() { + console.log('📂 Loading artist data from genius-artists files...'); + + const artistsDir = path.join(__dirname, 'genius-artists-2025-07-11'); + const artists = []; + + try { + const files = await fs.readdir(artistsDir); + const jsonFiles = files.filter(file => file.startsWith('genius-artists-') && file.endsWith('.json')); + + console.log(`Found ${jsonFiles.length} artist files`); + + for (const file of jsonFiles) { + const letter = file.replace('genius-artists-', '').replace('.json', ''); + + // Skip if not in target letters + if (config.artistFilters.letters[0] !== 'all' && !config.artistFilters.letters.includes(letter)) { + continue; + } + + const filePath = path.join(artistsDir, file); + const content = await fs.readFile(filePath, 'utf8'); + const data = JSON.parse(content); + + // Combine popular and regular artists based on filter + let letterArtists = []; + + if (config.artistFilters.types.includes('popular')) { + letterArtists.push(...(data.artists.popular || [])); + } + + if (config.artistFilters.types.includes('regular')) { + letterArtists.push(...(data.artists.regular || [])); + } + + // Apply max artists per letter limit if specified + if (config.artistFilters.maxArtistsPerLetter) { + letterArtists = letterArtists.slice(0, config.artistFilters.maxArtistsPerLetter); + } + + // Add letter info to each artist + letterArtists.forEach(artist => { + artist.letter = letter; + artist.urlKey = artist.url.split('/').pop(); // Extract URL key for Firebase compatibility + }); + + artists.push(...letterArtists); + console.log(`📄 Loaded ${letterArtists.length} artists from ${file}`); + } + + console.log(`✅ Total artists loaded: ${artists.length}`); + return artists; + + } catch 
(error) { + console.error('❌ Error loading artist data:', error); + throw error; + } +} + +/** + * Get Genius API key from local config + */ +async function getGeniusApiKey() { + try { + const configPath = path.join(__dirname, 'functions', 'local-config.json'); + const localConfig = JSON.parse(await fs.readFile(configPath, 'utf8')); + return localConfig.genius.key; + } catch (error) { + console.error('❌ Error loading Genius API key from functions/local-config.json'); + console.error('Please ensure you have a valid local-config.json file with your Genius API key'); + throw error; + } +} + +/** + * Fetch song metadata from Genius API for a specific artist page + * (Ported from Firebase Functions) + */ +async function getSongsByArtist(artistId, geniusApiKey, page = 1) { + console.log(` 📀 Fetching songs for artist ${artistId}, page ${page}`); + + try { + const headers = { "Authorization": `Bearer ${geniusApiKey}` }; + + // Fetch 50 songs per page, sorted by popularity + const response = await fetchWithTimeout( + `https://api.genius.com/artists/${artistId}/songs?per_page=50&page=${page}&sort=popularity`, + { headers } + ); + + if (!response.ok) { + throw new Error(`Genius API error: ${response.status} ${response.statusText}`); + } + + const data = await response.json(); + + if (!data.response || !data.response.songs) { + throw new Error('Invalid API response structure'); + } + + const songs = data.response.songs; + + // Transform songs to our schema format + const transformedSongs = songs.map(song => ({ + id: song.id.toString(), + title: cleanUnicodeText(song.title), + url: song.url, + songArtImageUrl: song.song_art_image_url, + artistNames: cleanUnicodeText(song.artist_names), + primaryArtist: { + id: song.primary_artist.id, + name: cleanUnicodeText(song.primary_artist.name), + url: song.primary_artist.url + }, + // Album art ID extraction (same logic as Firebase Functions) + albumArtId: extractGeniusImageHash(song.song_art_image_url) + })); + + const hasMore = 
songs.length === 50; // If we got a full page, there might be more + + return { + songs: transformedSongs, + rawSongs: songs, // Include raw API response for image URL extraction + hasMore, + pageNumber: page + }; + + } catch (error) { + console.error(`❌ Error fetching songs for artist ${artistId}, page ${page}:`, error); + throw error; + } +} + +/** + * Extract the hash/ID from a Genius image URL for album art deduplication + * (Ported from Firebase Functions) + */ +function extractGeniusImageHash(imageUrl) { + try { + if (!imageUrl) return null; + + // Extract the filename from the URL + const filename = imageUrl.split('/').pop(); + + // Extract the hash (everything before the first dot) + const hash = filename.split('.')[0]; + + // Validate it looks like a hash (32 character hex string) + if (hash && /^[a-f0-9]{32}$/i.test(hash)) { + return hash.toLowerCase(); + } + + // Fallback: use the full filename if it doesn't match expected pattern + console.warn(`Unexpected Genius URL format: ${imageUrl}, using filename as ID`); + return filename.replace(/[^a-zA-Z0-9]/g, '-').toLowerCase(); + + } catch (error) { + console.error('Error extracting hash from Genius URL:', error); + return null; + } +} + +/** + * Extract artist image URL from songs data + * (Ported from Firebase Functions) + */ +function extractArtistImageUrl(songs, targetArtistId, maxSongsToCheck = 11) { + const targetId = typeof targetArtistId === 'string' ? 
parseInt(targetArtistId, 10) : targetArtistId; + + for (const song of songs.slice(0, maxSongsToCheck)) { + // Check primary artist first + if (song.primary_artist && song.primary_artist.id === targetId) { + const imageUrl = song.primary_artist.image_url; + if (imageUrl) { + return imageUrl; + } + } + + // Check featured artists if primary artist doesn't match + if (song.featured_artists && Array.isArray(song.featured_artists)) { + for (const featuredArtist of song.featured_artists) { + if (featuredArtist.id === targetId) { + const imageUrl = featuredArtist.image_url; + if (imageUrl) { + return imageUrl; + } + } + } + } + } + + return null; +} + +/** + * Get all songs for an artist (up to 1000 songs) + * (Ported from Firebase Functions populateArtistSongsCore logic) + */ +async function getAllSongsForArtist(artist, geniusApiKey) { + console.log(`🎵 Fetching all songs for: ${artist.name}`); + + let page = 1; + let allSongs = []; + const maxSongs = 1000; + let artistImageUrl = null; + + while (allSongs.length < maxSongs) { + try { + const result = await getSongsByArtist(artist.id, geniusApiKey, page); + + if (result.songs.length === 0) { + console.log(` ✅ No more songs available (${allSongs.length} total)`); + break; + } + + // Extract artist image URL from first page if not found yet + if (!artistImageUrl && page === 1) { + artistImageUrl = extractArtistImageUrl(result.rawSongs, artist.id); + } + + allSongs.push(...result.songs); + console.log(` 📄 Page ${page}: ${result.songs.length} songs (${allSongs.length} total)`); + + // Break if no more pages + if (!result.hasMore) { + console.log(` ✅ Reached end of songs (${allSongs.length} total)`); + break; + } + + page++; + + // Rate limiting + await delay(config.delays.betweenPages); + + } catch (error) { + console.error(` ❌ Error fetching page ${page}:`, error); + state.errors.songs++; + break; + } + } + + return { + songs: allSongs, + artistImageUrl: artistImageUrl + }; +} + +/** + * Parse letter range input and return 
array of letters + * Supports: 'a,b,c', 'a-c', 'a c', 'all', or mixed formats + */ +function parseLetterRange(input) { + if (!input || input.toLowerCase() === 'all') { + return ['all']; + } + + const letters = new Set(); + + // Split by commas and spaces, then process each part + const parts = input.toLowerCase().split(/[,\s]+/).filter(part => part.length > 0); + + for (const part of parts) { + if (part === 'all') { + return ['all']; + } else if (part.includes('-')) { + // Handle range like 'a-g' or 'c-f' + const [start, end] = part.split('-'); + if (start && end && start.length === 1 && end.length === 1) { + const startCode = start.charCodeAt(0); + const endCode = end.charCodeAt(0); + + if (startCode >= 97 && startCode <= 122 && endCode >= 97 && endCode <= 122) { + // Valid range (a-z) + for (let code = startCode; code <= endCode; code++) { + letters.add(String.fromCharCode(code)); + } + } else { + console.warn(`⚠️ Invalid letter range: ${part} (must be a-z)`); + } + } else { + console.warn(`⚠️ Invalid range format: ${part} (use format like 'a-g')`); + } + } else if ((part.length === 1 && part >= 'a' && part <= 'z') || part === '0') { + // Single letter or number indicator + letters.add(part); + } else { + console.warn(`⚠️ Invalid letter: ${part} (must be a-z or 0 for numbers)`); + } + } + + // Convert to sorted array, with '0' first + return Array.from(letters).sort((a, b) => { + if (a === '0') return -1; + if (b === '0') return 1; + return a.localeCompare(b); + }); +} + +/** + * Print configuration summary + */ +function printConfig() { + console.log('\n' + '='.repeat(60)); + console.log('🚀 LYRICTYPE PRESCRAPER STARTING'); + console.log('='.repeat(60)); + console.log(`📋 Configuration:`); + console.log(` Max songs to scrape per artist: ${config.maxSongsToScrape}`); + const letterDisplay = config.artistFilters.letters[0] === 'all' ? 
'all (0, a-z)' : config.artistFilters.letters.join(', '); + console.log(` Target letters: ${letterDisplay}`); + console.log(` Artist types: ${config.artistFilters.types.join(', ')}`); + if (config.artistFilters.maxArtistsPerLetter) { + console.log(` Max artists per letter: ${config.artistFilters.maxArtistsPerLetter}`); + } + console.log(` Output directory: ${config.output.directory}`); + console.log(` Resumable: ${config.output.resumable}`); + console.log('='.repeat(60) + '\n'); +} + +/** + * Clean problematic Unicode characters from text + */ +function cleanUnicodeText(text) { + if (!text || typeof text !== 'string') { + return text; + } + + // Define problematic Unicode characters to remove or replace + const replacements = { + // Zero-width and invisible characters (remove) + '\u200B': '', // ZERO WIDTH SPACE + '\u200C': '', // ZERO WIDTH NON-JOINER + '\u200D': '', // ZERO WIDTH JOINER + '\u200E': '', // LEFT-TO-RIGHT MARK + '\u200F': '', // RIGHT-TO-LEFT MARK + '\u00AD': '', // SOFT HYPHEN + '\uFEFF': '', // BYTE ORDER MARK + + // Line separators (replace with regular newlines) + '\u2028': '\n', // LINE SEPARATOR + '\u2029': '\n', // PARAGRAPH SEPARATOR + '\u0085': '\n', // NEXT LINE + + // Other problematic characters + '\u000B': '\n', // VERTICAL TAB + '\u000C': '\n', // FORM FEED + '\u001C': '', // FILE SEPARATOR + '\u001D': '', // GROUP SEPARATOR + '\u001E': '', // RECORD SEPARATOR + '\u001F': '', // UNIT SEPARATOR + }; + + // Apply replacements + let cleaned = text; + for (const [oldChar, newChar] of Object.entries(replacements)) { + cleaned = cleaned.replace(new RegExp(oldChar, 'g'), newChar); + } + + // Clean up multiple consecutive newlines + cleaned = cleaned.replace(/\n{3,}/g, '\n\n'); + + return cleaned; +} + +/** + * Scrape lyrics from a Genius song URL + * (Ported from Firebase Functions scrapeLyricsFromUrl) + */ +async function scrapeLyricsFromUrl(songUrl) { + try { + // Validate URL format + if (!songUrl || typeof songUrl !== 'string') { + throw 
/** A whole line that is a bracketed section marker such as "[Intro]" or "[Verse 1]". */
const BRACKETED_SECTION_LINE = /^\[.*\]$/;

/** A line that starts with a bare section label ("Verse 2", "Chorus", ...). */
const BARE_SECTION_LABEL = /^(Intro|Verse|Chorus|Bridge|Outro|Pre-Chorus|Post-Chorus|Hook|Refrain)(\s|\d|$)/i;

/**
 * Decide whether a trimmed line should be kept as lyric text.
 *
 * @param {string} line - A single, already trimmed line.
 * @returns {boolean} false for empty lines and section headers, true otherwise.
 */
function isLyricLine(line) {
  if (!line) return false;
  if (BRACKETED_SECTION_LINE.test(line)) return false; // Remove [Intro], [Verse], etc.
  // NOTE(review): this also drops genuine lyric lines that merely begin with one
  // of these words followed by whitespace (e.g. "Hook me up"). Kept as-is because
  // callers may rely on the existing filtering.
  return !BARE_SECTION_LABEL.test(line);
}

/**
 * Convert the inner HTML of one lyrics container into plain lyric lines.
 *
 * @param {string} html - Inner HTML of a lyrics container.
 * @returns {string} Newline-joined lyric lines with tags removed, entities
 *   decoded, each line trimmed, and blank lines / section headers dropped.
 */
function lyricsHtmlToText(html) {
  return html
    // Convert <br> tags to newlines
    .replace(/<br\s*\/?>/gi, '\n')
    // Remove all remaining HTML tags completely
    .replace(/<[^>]*>/gi, '')
    // Decode HTML entities. "&amp;" MUST be decoded last: decoding it first
    // turns "&amp;lt;" into "&lt;" and then into "<" (double decoding).
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&(?:#0*39|#x0*27|apos);/gi, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&')
    // Clean up whitespace and filter out section headers
    .split('\n')
    .map((line) => line.trim())
    .filter(isLyricLine)
    .join('\n');
}

/**
 * Scrape lyrics from a Genius song URL
 * (Ported from Firebase Functions scrapeLyricsFromUrl)
 *
 * Relies on `fetchWithTimeout`, `cheerio` and `cleanUnicodeText` being in scope
 * in this module.
 *
 * @param {string} songUrl - URL of the song page; must be a non-empty string
 *   containing "genius.com".
 * @returns {Promise<string>} The extracted lyrics, passed through cleanUnicodeText.
 * @throws {Error} When the URL is invalid, the response is not OK, the HTML is
 *   shorter than 100 characters, no lyrics container is found, or the extracted
 *   lyrics are shorter than 10 characters. Every error is logged and rethrown.
 */
async function scrapeLyricsFromUrl(songUrl) {
  try {
    // Validate URL format
    if (!songUrl || typeof songUrl !== 'string') {
      throw new Error(`Invalid song URL: ${songUrl}`);
    }

    if (!songUrl.includes('genius.com')) {
      throw new Error(`URL does not appear to be a Genius URL: ${songUrl}`);
    }

    // Fetch the song page with proper error handling
    const songPageResponse = await fetchWithTimeout(songUrl);

    if (!songPageResponse.ok) {
      throw new Error(`HTTP ${songPageResponse.status}: ${songPageResponse.statusText} for URL: ${songUrl}`);
    }

    const songPageHtml = await songPageResponse.text();

    if (!songPageHtml || songPageHtml.length < 100) {
      throw new Error('Received empty or invalid HTML response');
    }

    // Parse the page with cheerio
    const $ = cheerio.load(songPageHtml);

    // Target ALL lyrics containers - Genius often splits lyrics across multiple divs
    const lyricsContainers = $('div[data-lyrics-container="true"]');

    if (lyricsContainers.length === 0) {
      throw new Error('No lyrics containers found');
    }

    const sections = [];

    lyricsContainers.each((index, container) => {
      const $container = $(container);

      // Remove elements that should be excluded from lyrics
      $container.find('[data-exclude-from-selection="true"]').remove();

      // Remove headers, footers, and annotation elements
      $container.find('.LyricsHeader__Container, .LyricsFooter__Container').remove();
      $container.find('a[href*="/annotations/"]').remove();

      const containerText = lyricsHtmlToText($container.html() || '').trim();
      if (containerText) {
        sections.push(containerText);
      }
    });

    // Final cleanup. Blank lines are removed by the filter below, so the
    // "\n\n" section separators do not survive into the result.
    const lyrics = sections
      .join('\n\n')
      // Remove multiple consecutive newlines
      .replace(/\n{3,}/g, '\n\n')
      // Remove any remaining section markers that might have slipped through
      .replace(/^\[.*\]$/gm, '')
      // Clean up any remaining whitespace issues
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .join('\n')
      .trim();

    if (!lyrics || lyrics.length < 10) {
      throw new Error('Extracted lyrics are too short or empty');
    }

    // Clean problematic Unicode characters
    return cleanUnicodeText(lyrics);
  } catch (error) {
    console.error(` ❌ Error scraping lyrics from ${songUrl}:`, error.message);
    throw error;
  }
}
/**
 * Scrape lyrics for selected songs of an artist
 *
 * Songs are scraped sequentially. A failed scrape does not abort the run: the
 * song is still recorded, with `lyrics: null` and the error message.
 *
 * Relies on `scrapeLyricsFromUrl`, `delay`, `state` and `config` being in scope
 * in this module.
 *
 * @param {object} artist - Artist record. Not read by this function; kept so
 *   the signature stays compatible with existing callers.
 * @param {Array<object>} songs - Songs to choose from; each needs `title` and `url`.
 * @param {number} maxSongs - Upper bound on how many songs (from the start of
 *   `songs`) are scraped.
 * @returns {Promise<Array<object>>} One entry per attempted song: the original
 *   song fields plus `lyrics`, `scrapedAt`, and either `scrapingDuration`
 *   (success) or `scrapingError` (failure).
 */
async function scrapeLyricsForArtist(artist, songs, maxSongs) {
  console.log(` 🎤 Scraping lyrics for top ${Math.min(maxSongs, songs.length)} songs...`);

  const songsToScrape = songs.slice(0, maxSongs);
  const scrapedSongs = [];

  for (const [index, song] of songsToScrape.entries()) {
    try {
      console.log(` 📝 [${index + 1}/${songsToScrape.length}] ${song.title}`);
      const startTime = Date.now();

      const lyrics = await scrapeLyricsFromUrl(song.url);
      const scrapingDuration = Date.now() - startTime;

      scrapedSongs.push({
        ...song,
        lyrics,
        scrapedAt: new Date().toISOString(),
        scrapingDuration,
      });

      state.processed.lyrics++;
      console.log(` ✅ Success (${scrapingDuration}ms, ${lyrics.length} chars)`);
    } catch (error) {
      console.error(` ❌ Failed to scrape "${song.title}": ${error.message}`);
      state.errors.lyrics++;

      // Still add the song without lyrics for metadata completeness
      scrapedSongs.push({
        ...song,
        lyrics: null,
        scrapingError: error.message,
        scrapedAt: new Date().toISOString(),
      });
    }

    // Rate limiting between songs. This runs after failures too: a failed
    // request still hit the server, and skipping the pause would fire the next
    // request immediately, which is the worst moment if we are being throttled.
    if (index < songsToScrape.length - 1) {
      await delay(config.delays.betweenSongs);
    }
  }

  return scrapedSongs;
}
/**
 * Process a single artist: fetch songs and scrape lyrics
 *
 * Relies on `getAllSongsForArtist`, `scrapeLyricsForArtist`, `state` and
 * `config` being in scope in this module.
 *
 * @param {object} artist - Artist record (`name`, `id`, `url`, `urlKey`, `letter`, `type`).
 * @param {string} geniusApiKey - API key forwarded to getAllSongsForArtist.
 * @param {number} artistIndex - Zero-based position, used only for the progress log.
 * @param {number} totalArtists - Total count, used only for the progress log.
 * @returns {Promise<object|null>} The structured artist record, or null when the
 *   artist has no songs or processing failed (both are counted in state.errors.artists).
 */
async function processArtist(artist, geniusApiKey, artistIndex, totalArtists) {
  const startedAt = Date.now();

  const bannerLines = [
    `\n[${artistIndex + 1}/${totalArtists}] 🎨 Processing: ${artist.name}`,
    ` 🔗 URL: ${artist.url}`,
    ` 🆔 Genius ID: ${artist.id}`,
  ];
  for (const bannerLine of bannerLines) {
    console.log(bannerLine);
  }

  try {
    // Get all songs for the artist
    const fetched = await getAllSongsForArtist(artist, geniusApiKey);
    const songs = fetched.songs;
    const artistImageUrl = fetched.artistImageUrl;

    if (songs.length === 0) {
      console.log(` ⚠️ No songs found for ${artist.name}`);
      state.errors.artists++;
      return null;
    }

    console.log(` 📚 Found ${songs.length} songs total`);
    state.processed.songs += songs.length;

    // Scrape lyrics for top N songs
    const scrapedSongs = await scrapeLyricsForArtist(artist, songs, config.maxSongsToScrape);
    const elapsedMs = Date.now() - startedAt;

    let lyricsCount = 0;
    for (const scrapedSong of scrapedSongs) {
      if (scrapedSong.lyrics) {
        lyricsCount += 1;
      }
    }

    console.log(` ✅ Completed: ${lyricsCount}/${scrapedSongs.length} lyrics scraped (${elapsedMs}ms)`);
    state.processed.artists++;

    const processingStats = {
      totalSongsFound: songs.length,
      songsScraped: scrapedSongs.length,
      lyricsScraped: lyricsCount,
      scrapingErrors: scrapedSongs.length - lyricsCount,
      processingTime: elapsedMs,
    };

    // Key order below is the order the fields appear in the saved JSON.
    return {
      // Original artist data
      name: artist.name,
      geniusId: artist.id,
      url: artist.url,
      urlKey: artist.urlKey,
      letter: artist.letter,
      type: artist.type,

      // Scraped metadata
      imageUrl: artistImageUrl,
      totalSongs: songs.length,

      // Complete song list (metadata only)
      allSongs: songs,

      // Songs with scraped lyrics
      scrapedSongs,

      // Processing metadata
      processingStats,

      processedAt: new Date().toISOString(),
    };
  } catch (error) {
    console.error(` 💥 Failed to process ${artist.name}: ${error.message}`);
    state.errors.artists++;
    return null;
  }
}

/**
 * Save processed data to JSON file
 *
 * Writes `prescraped-<letter>.json` into `config.output.directory`. Relies on
 * `fs`, `path` and `config` being in scope in this module.
 *
 * @param {Array<object|null>} data - Per-artist results; null marks a failed artist.
 * @param {string} letter - Letter the results belong to; used in the file name.
 * @returns {Promise<object>} The object that was written (letter, timestamp,
 *   successful artists and a summary).
 */
async function saveToFile(data, letter) {
  const filename = `prescraped-${letter}.json`;
  const filepath = path.join(config.output.directory, filename);

  // Remove failed artists
  const succeeded = data.filter((entry) => entry !== null);

  let songTotal = 0;
  let lyricsTotal = 0;
  for (const entry of data) {
    songTotal += entry?.totalSongs || 0;
    lyricsTotal += entry?.processingStats?.lyricsScraped || 0;
  }

  const output = {
    letter,
    processedAt: new Date().toISOString(),
    artists: succeeded,
    summary: {
      totalProcessed: succeeded.length,
      totalFailed: data.length - succeeded.length,
      totalSongs: songTotal,
      totalLyrics: lyricsTotal,
    },
  };

  await fs.writeFile(filepath, JSON.stringify(output, null, 2));
  console.log(`💾 Saved results to: ${filename}`);

  return output;
}
${config.maxSongsToScrape}) + --letters Letters/ranges to process (default: all) + --test Test mode: limit artists per letter + --help, -h Show this help message + +Letter Formats: + all Process all letters (0, a-z) + 0 Process artists starting with numbers + a,b,c Process specific letters + a-g Process range from a to g + c g Process letters c and g (space-separated) + 0,a-c,x,z Mixed: numbers, range a-c, plus letters x and z + +Examples: + node prescraper.js --songs 5 --letters a,b,c + node prescraper.js --letters 0 # Process artists starting with numbers + node prescraper.js --letters a-g # Process letters a through g + node prescraper.js --letters "c g" # Process letters c and g + node prescraper.js --letters 0,a-c,x-z # Process numbers, a-c and x-z ranges + node prescraper.js --test 2 # Process only 2 artists per letter + `); + return; + } + + // Parse CLI arguments + const argsIndex = process.argv.indexOf('--songs'); + if (argsIndex !== -1 && process.argv[argsIndex + 1]) { + config.maxSongsToScrape = parseInt(process.argv[argsIndex + 1], 10); + } + + const lettersIndex = process.argv.indexOf('--letters'); + if (lettersIndex !== -1 && process.argv[lettersIndex + 1]) { + const lettersArg = process.argv[lettersIndex + 1]; + config.artistFilters.letters = parseLetterRange(lettersArg); + console.log(`📝 Parsed letters: ${lettersArg} → ${config.artistFilters.letters.join(', ')}`); + } + + const testIndex = process.argv.indexOf('--test'); + if (testIndex !== -1 && process.argv[testIndex + 1]) { + config.artistFilters.maxArtistsPerLetter = parseInt(process.argv[testIndex + 1], 10); + console.log(`🧪 TEST MODE: Processing max ${config.artistFilters.maxArtistsPerLetter} artists per letter`); + } + + // Show configuration after parsing CLI arguments + printConfig(); + + // Load Genius API key + console.log('🔑 Loading Genius API key...'); + const geniusApiKey = await getGeniusApiKey(); + console.log('✅ API key loaded'); + + // Load artist data + const artists = await 
loadArtistData(); + + if (artists.length === 0) { + console.log('⚠️ No artists found matching criteria'); + return; + } + + // Create output directory + await fs.mkdir(config.output.directory, { recursive: true }); + console.log(`📁 Created output directory: ${config.output.directory}`); + + console.log(`\n🎯 Processing ${artists.length} artists...`); + console.log('Press Ctrl+C to stop gracefully\n'); + + // Group artists by letter for organized processing + const artistsByLetter = artists.reduce((acc, artist) => { + if (!acc[artist.letter]) acc[artist.letter] = []; + acc[artist.letter].push(artist); + return acc; + }, {}); + + const letters = Object.keys(artistsByLetter).sort(); + console.log(`📋 Processing ${letters.length} letters: ${letters.join(', ')}`); + + // Process each letter + for (const letter of letters) { + const letterArtists = artistsByLetter[letter]; + console.log(`\n${'='.repeat(50)}`); + console.log(`📖 LETTER: ${letter.toUpperCase()} (${letterArtists.length} artists)`); + console.log('='.repeat(50)); + + const letterResults = []; + + // Process each artist in the letter + for (let i = 0; i < letterArtists.length; i++) { + const artist = letterArtists[i]; + + try { + const result = await processArtist(artist, geniusApiKey, i, letterArtists.length); + letterResults.push(result); + + // Rate limiting between artists + if (i < letterArtists.length - 1) { + await delay(config.delays.betweenArtists); + } + + } catch (error) { + console.error(`💥 Critical error processing ${artist.name}:`, error); + letterResults.push(null); + state.errors.artists++; + } + } + + // Save results for this letter + try { + await saveToFile(letterResults, letter); + + const successful = letterResults.filter(r => r !== null).length; + const failed = letterResults.filter(r => r === null).length; + const totalLyrics = letterResults.reduce((sum, r) => sum + (r?.processingStats?.lyricsScraped || 0), 0); + + console.log(`\n✅ Letter ${letter.toUpperCase()} completed:`); + 
console.log(` Artists: ${successful} successful, ${failed} failed`); + console.log(` Lyrics scraped: ${totalLyrics}`); + + } catch (saveError) { + console.error(`❌ Error saving results for letter ${letter}:`, saveError); + } + } + + // Final summary + const totalTime = Date.now() - state.startTime; + console.log(`\n${'='.repeat(60)}`); + console.log('🎉 PRESCRAPING COMPLETED!'); + console.log('='.repeat(60)); + console.log(`⏱️ Total time: ${Math.round(totalTime / 1000)}s`); + console.log(`📊 Final stats:`); + console.log(` Artists processed: ${state.processed.artists}`); + console.log(` Songs fetched: ${state.processed.songs}`); + console.log(` Lyrics scraped: ${state.processed.lyrics}`); + console.log(` Total errors: ${state.errors.artists + state.errors.songs + state.errors.lyrics}`); + console.log(`📁 Output saved to: ${config.output.directory}`); + console.log('='.repeat(60)); + + } catch (error) { + console.error('💥 Fatal error:', error); + process.exit(1); + } +} + +// Handle graceful shutdown +process.on('SIGINT', () => { + console.log('\n\n🛑 Received SIGINT, shutting down gracefully...'); + console.log(`📊 Final stats:`); + console.log(` Artists processed: ${state.processed.artists}`); + console.log(` Songs processed: ${state.processed.songs}`); + console.log(` Lyrics scraped: ${state.processed.lyrics}`); + console.log(` Errors: ${state.errors.artists + state.errors.songs + state.errors.lyrics}`); + process.exit(0); +}); + +// Run the script +if (import.meta.url === `file://${process.argv[1]}`) { + main(); +} + +export { + config, + loadArtistData, + getAllSongsForArtist, + extractGeniusImageHash, + extractArtistImageUrl +}; diff --git a/scripts/ARTIST_UPDATE_SYSTEM_README.md b/scripts/ARTIST_UPDATE_SYSTEM_README.md deleted file mode 100644 index 8e2836d..0000000 --- a/scripts/ARTIST_UPDATE_SYSTEM_README.md +++ /dev/null @@ -1,562 +0,0 @@ -# Artist Update System - -A comprehensive, automated system for scraping, comparing, and uploading artist and song data 
from Genius to Firestore. - -## Quick Start - -### Test the System (2 minutes) -```bash -npm run test:all -``` - -### Full Update (30-60 minutes) -```bash -# Scrape and prepare data -npm run scrape - -# Review data in scraping-data/, then upload -npm run upload -``` - -## Overview - -This system provides a complete workflow for: -1. **Scraping** artist lists from Genius -2. **Comparing** with existing Firestore data -3. **Prescraping** songs for new artists -4. **Uploading** new artists and songs to Firestore -5. **Updating** popular artist flags - -### Key Features - -- ✅ **Incremental Updates**: Only processes new/changed artists -- ✅ **Safe by Default**: Dry-run mode, skip existing data -- ✅ **Testing Built-In**: `--limit` option on all scripts -- ✅ **Progress Tracking**: TUI with progress bars and ETAs -- ✅ **Workflow Timer**: Cumulative elapsed time tracked across all scripts -- ✅ **Error Handling**: Comprehensive error logging -- ✅ **Manual Control**: Each step is a separate command -- ✅ **Timestamped Data**: All data saved with timestamps -- ✅ **Search Tokens**: Automatic generation for autocomplete - -### Two-Phase Workflow - -The system is split into two main commands for safety: - -1. **`npm run scrape`** - Data gathering phase (20-45 min) - - Scrapes artist lists from Genius - - Compares with Firestore to find new artists - - Scrapes songs for new artists - - Uploads new artist documents (with search tokens) - - **Stops and prompts you to review data** - -2. 
**`npm run upload`** - Data upload phase (2-6 min) - - Uploads songs to Firestore - - Updates popular artist flags - -This separation allows you to: -- Review scraped data before committing to Firestore -- Catch any issues before modifying song data -- Re-run the upload phase if needed without re-scraping - -## Scripts - -| Script | Purpose | Input | Output | -|--------|---------|-------|--------| -| `scrape-artists.js` | Scrape Genius artist lists | Genius website | `artist-lists/` | -| `compare-artists.js` | Identify new artists | `artist-lists/` + Firestore | `new-artists/` | -| `prescrape-new-artists.js` | Scrape songs for new artists | `new-artists/` | `song-data/` | -| `upload-artists.js` | Upload artists to Firestore | `new-artists/` | Firestore `artists` | -| `upload-songs.js` | Upload songs to Firestore | `song-data/` | Firestore `songs` | -| `update-popular-flags.js` | Update popular flags | `new-artists/` | Firestore `artists` | - -## NPM Commands - -### Production Workflow - -```bash -# Recommended: Two-phase workflow -npm run scrape # 20-45 min: Scrape, compare, prescrape, upload artists -npm run upload # 2-6 min: Upload songs and update flags - -# Individual steps (for debugging/manual control) -npm run update:scrape # 15-30 min: Scrape all letters -npm run update:compare # ~1 min: Compare with Firestore -npm run update:prescrape # 5-15 min: Scrape songs -npm run update:upload-artists # ~1 min: Upload artists -npm run update:upload-songs # 1-5 min: Upload songs -npm run update:update-popular # ~1 min: Update flags -``` - -### Test Workflow - -```bash -# Individual tests -npm run test:scrape # 30 sec: 10 artists from letter J -npm run test:compare # 15 sec: Compare -npm run test:prescrape # 10 sec: 5 artists, 2 songs each -npm run test:upload-artists # 5 sec: Dry-run 5 artists -npm run test:upload-songs # 3 sec: Dry-run 5 songs -npm run test:update-popular # 2 sec: Dry-run flag updates - -# Full test -npm run test:all # ~2 min: Complete workflow test 
-``` - -## Workflow Details - -### Step 1: Scrape Artists - -**Command**: `npm run update:scrape` -**Script**: `scripts/scrape-artists.js` -**Time**: 15-30 minutes (all letters) - -Scrapes artist lists from Genius by letter (0, a-z), including: -- Popular artists (top 20 per letter) -- Regular artists (all others) -- Genius IDs (extracted from iOS app links) - -**Output**: -``` -scraping-data/artist-lists/2026-01-04-20-22/ -├── artists-0.json -├── artists-a.json -├── ... -├── artists-z.json -├── summary.json -├── errors.json -└── .complete -``` - -**Options**: -```bash -# Specific letters -node scripts/scrape-artists.js --letters a,b,c - -# Test with limited data -node scripts/scrape-artists.js --letters j --limit 10 - -# Skip ID extraction (faster) -node scripts/scrape-artists.js --no-ids -``` - -### Step 2: Compare with Firestore - -**Command**: `npm run update:compare` -**Script**: `scripts/compare-artists.js` -**Time**: ~1 minute - -Compares scraped artists with existing Firestore data to identify: -- New artists (not in Firestore) -- Popular status changes (add/remove) - -**Output**: -``` -scraping-data/new-artists/2026-01-04-21-01/ -├── new-artists-0.json -├── new-artists-a.json -├── ... -├── new-artists-z.json -├── comparison-report.json ← Summary of changes -├── errors.json -└── .complete -``` - -**Options**: -```bash -# Use specific timestamp -node scripts/compare-artists.js --date 2026-01-04-20-22 - -# Preview only -node scripts/compare-artists.js --dry-run -``` - -### Step 3: Prescrape Songs - -**Command**: `npm run update:prescrape` -**Script**: `scripts/prescrape-new-artists.js` -**Time**: 5-15 minutes (depends on new artist count) - -Scrapes songs and lyrics for newly identified artists: -- Up to 10 songs per artist (configurable) -- Full lyrics extraction -- Robust error handling with retries - -**Output**: -``` -scraping-data/song-data/2026-01-04-20-48/ -├── songs-0.json -├── songs-a.json -├── ... 
-├── songs-z.json -├── prescrape-summary.json -├── errors.json -└── .complete -``` - -**Options**: -```bash -# Test with limited data -node scripts/prescrape-new-artists.js --limit 5 --max-songs 2 - -# Specific letters -node scripts/prescrape-new-artists.js --letters a,b,c - -# More songs per artist -node scripts/prescrape-new-artists.js --max-songs 20 -``` - -### Step 4: Upload Artists - -**Command**: `npm run update:upload-artists` -**Script**: `scripts/upload-artists.js` -**Time**: ~1 minute - -Uploads new artists to Firestore with: -- Search tokens for autocomplete -- Sanitized field values -- Validated slugs -- Batch processing (500 per batch) - -**Options**: -```bash -# Dry run (recommended first) -node scripts/upload-artists.js --dry-run - -# Test with limited data -node scripts/upload-artists.js --limit 10 --dry-run - -# Overwrite existing (careful!) -node scripts/upload-artists.js --no-skip -``` - -### Step 5: Upload Songs - -**Command**: `npm run update:upload-songs` -**Script**: `scripts/upload-songs.js` -**Time**: 1-5 minutes (depends on song count) - -Uploads songs to Firestore with: -- Song ID extraction -- Artist slug linking -- Sanitized lyrics -- Batch processing (500 per batch) - -**Options**: -```bash -# Dry run -node scripts/upload-songs.js --dry-run - -# Test with limited data -node scripts/upload-songs.js --limit 10 --dry-run - -# Overwrite existing -node scripts/upload-songs.js --no-skip -``` - -### Step 6: Update Popular Flags - -**Command**: `npm run update:update-popular` -**Script**: `scripts/update-popular-flags.js` -**Time**: ~1 minute - -Updates popular artist flags based on comparison: -- Adds popular flag to new popular artists -- Removes popular flag from artists no longer popular -- Maintains exactly 20 popular artists per letter - -**Options**: -```bash -# Dry run -node scripts/update-popular-flags.js --dry-run - -# Use specific comparison -node scripts/update-popular-flags.js --date 2026-01-04-21-01 -``` - -## Data Structure - 
-### Directory Layout - -``` -scraping-data/ -├── artist-lists/ ← Step 1: Scraped artist lists -│ └── YYYY-MM-DD-HH-MM/ -├── new-artists/ ← Step 2: Comparison results -│ └── YYYY-MM-DD-HH-MM/ -├── song-data/ ← Step 3: Prescraped songs -│ └── YYYY-MM-DD-HH-MM/ -└── upload-data/ ← Steps 4-6: Upload summaries - └── YYYY-MM-DD-HH-MM/ -``` - -### Firestore Collections - -#### `artists` Collection - -```javascript -// Document ID: artist-slug (e.g., "kendrick-lamar") -{ - name: "Kendrick Lamar", - url: "https://genius.com/artists/Kendrick-lamar", - geniusId: 1421, - type: "popular", // or "regular" - isPopular: true, - searchTokens: ["k", "ke", "ken", "kend", ...], - nameForSorting: "kendrick lamar", - firstLetter: "k", - uploadedAt: "2026-01-04T21:10:00.000Z", - updatedAt: "2026-01-04T21:20:00.000Z" -} -``` - -#### `songs` Collection - -```javascript -// Document ID: song-slug (e.g., "kendrick-lamar-humble-lyrics") -{ - title: "HUMBLE.", - url: "https://genius.com/Kendrick-lamar-humble-lyrics", - artist: "Kendrick Lamar", - artistSlug: "kendrick-lamar", - lyrics: "Nobody pray for me...", - uploadedAt: "2026-01-04T21:15:00.000Z", - scrapedAt: "2026-01-04T20:48:00.000Z" -} -``` - -## Common Workflows - -### Monthly Update - -```bash -# 1. Scrape latest data (30 min) -npm run update:scrape - -# 2. Compare with database (1 min) -npm run update:compare - -# 3. Review comparison-report.json -cat scraping-data/new-artists/$(ls -t scraping-data/new-artists | head -1)/comparison-report.json - -# 4. Prescrape songs (10 min) -npm run update:prescrape - -# 5. 
Upload everything (5 min) -npm run update:upload-artists -npm run update:upload-songs -npm run update:update-popular -``` - -### Quick Test Before Production - -```bash -# Test entire workflow with limited data -npm run test:all - -# If successful, run production -npm run update:all -``` - -### Scrape Specific Letters Only - -```bash -# Scrape only letters J and K -node scripts/scrape-artists.js --letters j,k - -# Compare (will only process these letters) -npm run update:compare - -# Prescrape only these letters -node scripts/prescrape-new-artists.js --letters j,k - -# Upload -npm run update:upload-artists -npm run update:upload-songs -npm run update:update-popular -``` - -### Re-upload from Existing Data - -```bash -# Upload from specific timestamp (no re-scraping) -node scripts/upload-artists.js --date 2026-01-04-21-01 --dry-run -node scripts/upload-songs.js --date 2026-01-04-20-48 --dry-run - -# If looks good, remove --dry-run -node scripts/upload-artists.js --date 2026-01-04-21-01 -node scripts/upload-songs.js --date 2026-01-04-20-48 -``` - -## Error Handling - -### Error Logging - -All scripts log errors to `errors.json` files: - -```json -{ - "totalErrors": 15, - "errorsByType": { - "network_timeout": 8, - "invalid_slug": 5, - "sanitization_failed": 2 - }, - "detailedErrors": [ - { - "timestamp": "2026-01-04T20:22:15.123Z", - "type": "network_timeout", - "message": "Request timeout", - "details": { - "artist": "Artist Name", - "url": "https://..." - } - } - ] -} -``` - -### Handling Errors - -1. **Check error counts** in summary files -2. **Review errors.json** for details -3. **Re-run specific letters** if needed -4. 
**Most errors are graceful** - process continues - -### Common Issues - -**Too many network timeouts?** -- Increase delays in script config -- Run specific letters separately -- Use `--no-ids` to skip ID extraction - -**Slug extraction failures?** -- Review invalid URLs in errors.json -- Usually rare, won't affect most artists - -**Upload failures?** -- Check Firestore permissions -- Verify Firebase config -- Check network connection - -## Best Practices - -### Safety - -1. **Always test first**: `npm run test:all` -2. **Use dry-run**: `--dry-run` on upload commands -3. **Review reports**: Check comparison-report.json -4. **Check errors**: Review error counts in summaries -5. **Manual steps**: Upload is separate from scraping - -### Performance - -1. **Batch processing**: Default 500 items per batch -2. **Rate limiting**: Built-in delays between requests -3. **Skip existing**: Default behavior, use `--no-skip` carefully -4. **Parallel testing**: Test multiple letters at once - -### Monitoring - -1. **Progress bars**: Real-time status during execution -2. **Summary files**: Statistics for each run -3. **Error logs**: Categorized error tracking -4. **Timestamps**: Easy to track runs over time - -## Troubleshooting - -### Script fails to find latest timestamp - -**Problem**: "No new-artists data found" -**Solution**: Run the previous step first or specify `--date` - -### Firestore permission denied - -**Problem**: Upload fails with permission error -**Solution**: Check Firebase config and Firestore rules - -### Rate limited by Genius - -**Problem**: Many 429 errors -**Solution**: Increase delays or run specific letters - -### Out of memory - -**Problem**: Node runs out of memory -**Solution**: Reduce batch size or process fewer letters - -## Development - -### Adding New Features - -1. **Utility modules**: Add to `scripts/utils/` -2. **Error types**: Use `errorLogger.logError(type, details, message)` -3. **TUI**: Use `tui.printInfo`, `tui.createProgressBar`, etc. -4. 
**Testing**: Add `--limit` and `--dry-run` options - -### Code Style - -- No emojis in logs or comments -- Categorized error logging -- Progress bars for long operations -- Helpful CLI help messages -- Timestamped output directories - -## Workflow Timer - -The system tracks elapsed time across all scripts using the `artist-lists` directory timestamp: - -- **Automatic Tracking**: Uses directory creation time from first script -- **No Extra Files**: Leverages existing timestamped directories -- **Live Display**: Progress bars show "Total: 2m 34s" that updates every second -- **Per-Script Summary**: Each script displays current elapsed time upon completion -- **Final Total**: `upload-artists.js` (last script) shows the total workflow time - -Example output: -``` -Workflow Elapsed Time: 2m 34s -──────────────────────────────────────────────────────── -TOTAL WORKFLOW TIME: 23m 47s -──────────────────────────────────────────────────────── -``` - -The timer helps you: -- Track actual processing time across multiple scripts -- Estimate how long future runs will take -- Identify bottlenecks in the workflow - -## Implementation Timeline - -- **Week 1**: Utility modules (timestamp, paths, TUI, error logging) -- **Week 2-3**: Scraping scripts (scrape, compare, prescrape) -- **Week 4**: Upload scripts (artists, songs, popular flags) -- **Total**: 4 weeks of implementation - -## System Requirements - -- Node.js >= 18 -- Firebase project with Firestore -- Network access to Genius.com -- ~1GB disk space for cached data - -## Dependencies - -- `firebase`: Firestore integration -- `cheerio`: HTML parsing -- `axios`: HTTP requests -- `unidecode`: Text normalization -- `cli-progress`: Progress bars -- `chalk`: Colored terminal output - -## License - -MIT - -## Support - -For issues, questions, or contributions, please refer to the implementation plan and week-specific completion documents: -- `ARTIST_UPDATE_SYSTEM_PLAN.md` -- `WEEK_1_COMPLETE.md` -- `WEEK_2_3_COMPLETE.md` -- 
`WEEK_4_COMPLETE.md` - diff --git a/scripts/compare-artists.js b/scripts/compare-artists.js deleted file mode 100644 index a932156..0000000 --- a/scripts/compare-artists.js +++ /dev/null @@ -1,492 +0,0 @@ -#!/usr/bin/env node - -/** - * Artist Comparison Script - * Compares scraped artist lists with Firestore to identify new artists - */ - -import fs from 'fs/promises'; -import { initializeApp } from 'firebase/app'; -import { getFirestore, collection, getDocs, query, orderBy, limit, startAfter } from 'firebase/firestore'; -import { firebaseConfig } from '../src/lib/services/initFirebase.js'; -import * as tui from './utils/tui.js'; -import * as paths from './utils/paths.js'; -import { generateTimestamp, getCurrentISO, calculateETA, getWorkflowElapsed } from './utils/timestamp.js'; -import { createErrorLogger } from './utils/error-logger.js'; - -class ArtistComparator { - constructor(options = {}) { - this.inputTimestamp = options.timestamp || null; - this.outputDir = options.outputDir || null; - this.errorLogger = createErrorLogger('artist-comparison'); - this.stats = { - geniusTotal: 0, - firestoreTotal: 0, - newArtists: 0, - existingArtists: 0, - popularInGenius: 0, - popularInFirestore: 0, - popularToAdd: 0, - popularToRemove: 0 - }; - this.firestoreArtists = new Map(); // Map of slug -> artist data - } - - /** - * Extract slug from Genius artist URL - * @param {string} url - The Genius artist URL - * @returns {string|null} The artist slug - */ - extractSlug(url) { - if (!url) return null; - const match = url.match(/\/artists\/([^/?#]+)/); - return match ? match[1].toLowerCase() : null; - } - - /** - * Load artist lists from scraping-data - */ - async loadArtistLists() { - const timestamp = this.inputTimestamp || await paths.findLatestTimestamp('artist-lists'); - - if (!timestamp) { - throw new Error('No artist lists found. 
Run scrape-artists.js first.'); - } - - const inputDir = paths.getArtistListsDir(timestamp); - const isComplete = await paths.isDirectoryComplete(inputDir); - - if (!isComplete) { - tui.printWarning(`Artist list directory not marked complete: ${inputDir}`); - } - - tui.printInfo(`Loading artist lists from: ${timestamp}`); - - const allArtists = []; - const letters = paths.getAllLetters(); - - for (const letter of letters) { - const filePath = paths.getLetterFilePath(inputDir, letter, 'artists'); - - try { - const content = await fs.readFile(filePath, 'utf8'); - const data = JSON.parse(content); - - // Add artists with letter metadata - if (data.artists && data.artists.popular) { - data.artists.popular.forEach(artist => { - allArtists.push({ ...artist, letter, source: 'popular' }); - }); - } - - if (data.artists && data.artists.regular) { - data.artists.regular.forEach(artist => { - allArtists.push({ ...artist, letter, source: 'regular' }); - }); - } - } catch (error) { - if (error.code !== 'ENOENT') { - this.errorLogger.logError('file_read_error', { - file: filePath, - letter - }, error.message); - } - } - } - - this.stats.geniusTotal = allArtists.length; - this.stats.popularInGenius = allArtists.filter(a => a.type === 'popular').length; - - tui.printInfo(`Loaded ${tui.formatNumber(allArtists.length)} artists from Genius`); - - return { allArtists, timestamp, inputDir }; - } - - /** - * Fetch all artists from Firestore - */ - async fetchFirestoreArtists() { - tui.printInfo('Fetching artists from Firestore...'); - - const app = initializeApp(firebaseConfig); - const db = getFirestore(app); - - const artistsRef = collection(db, 'artists'); - let lastDoc = null; - let batchCount = 0; - const batchSize = 1000; - - const progressBar = tui.createProgressBar('Fetching from Firestore', 100); - - while (true) { - batchCount++; - - let q = query(artistsRef, orderBy('name'), limit(batchSize)); - if (lastDoc) { - q = query(artistsRef, orderBy('name'), startAfter(lastDoc), 
limit(batchSize)); - } - - try { - const snapshot = await getDocs(q); - - if (snapshot.empty) { - break; - } - - snapshot.docs.forEach(doc => { - const data = doc.data(); - // Use document ID (slug) as the key for reliable comparison - this.firestoreArtists.set(doc.id.toLowerCase(), { - slug: doc.id, - name: data.name, - geniusId: data.geniusId || null, - type: data.type || 'regular', - url: data.url || null - }); - }); - - lastDoc = snapshot.docs[snapshot.docs.length - 1]; - - // Update progress (estimate based on batch count) - const progress = Math.min(95, batchCount * 5); - progressBar.update(progress); - - } catch (error) { - this.errorLogger.logError('firestore_fetch_error', { - batch: batchCount - }, error.message); - break; - } - } - - progressBar.update(100); - progressBar.stop(); - - this.stats.firestoreTotal = this.firestoreArtists.size; - this.stats.popularInFirestore = Array.from(this.firestoreArtists.values()) - .filter(a => a.type === 'popular').length; - - tui.printInfo(`Loaded ${tui.formatNumber(this.firestoreArtists.size)} artists from Firestore`); - } - - /** - * Compare and identify new artists - */ - compareArtists(geniusArtists) { - tui.printInfo('Comparing artist lists...'); - - const newArtists = []; - const existingArtists = []; - const popularUpdates = { - toAdd: [], - toRemove: [] - }; - - // Identify new artists by comparing slugs - for (const artist of geniusArtists) { - const slug = this.extractSlug(artist.url); - - if (!slug) { - // Can't extract slug, treat as new - newArtists.push(artist); - continue; - } - - if (this.firestoreArtists.has(slug.toLowerCase())) { - existingArtists.push(artist); - } else { - newArtists.push(artist); - } - } - - // Identify popular status changes by comparing slugs - // First, find all current popular artists in Firestore (by slug) - const currentPopularSlugs = new Set( - Array.from(this.firestoreArtists.entries()) - .filter(([slug, data]) => data.type === 'popular') - .map(([slug, data]) => 
slug.toLowerCase()) - ); - - // Find artists that should be popular (from Genius, by slug) - const shouldBePopularSlugs = new Set(); - for (const artist of geniusArtists) { - if (artist.type === 'popular') { - const slug = this.extractSlug(artist.url); - if (slug) { - shouldBePopularSlugs.add(slug.toLowerCase()); - } - } - } - - // Artists to add popular flag (in Genius popular but not in Firestore popular) - for (const artist of geniusArtists) { - if (artist.type === 'popular') { - const slug = this.extractSlug(artist.url); - if (slug) { - const slugLower = slug.toLowerCase(); - if (this.firestoreArtists.has(slugLower) && !currentPopularSlugs.has(slugLower)) { - popularUpdates.toAdd.push({ - name: artist.name, - slug: slug, - action: 'add_popular', - reason: 'now_in_genius_popular_top_20' - }); - } - } - } - } - - // Artists to remove popular flag (in Firestore popular but not in Genius popular) - for (const [slug, firestoreArtist] of this.firestoreArtists.entries()) { - if (firestoreArtist.type === 'popular' && !shouldBePopularSlugs.has(slug.toLowerCase())) { - popularUpdates.toRemove.push({ - name: firestoreArtist.name, - slug: slug, - action: 'remove_popular', - reason: 'no_longer_in_genius_popular_top_20' - }); - } - } - - this.stats.newArtists = newArtists.length; - this.stats.existingArtists = existingArtists.length; - this.stats.popularToAdd = popularUpdates.toAdd.length; - this.stats.popularToRemove = popularUpdates.toRemove.length; - - return { newArtists, popularUpdates }; - } - - /** - * Save new artists by letter - */ - async saveNewArtistsByLetter(newArtists, outputDir, sourceTimestamp) { - const artistsByLetter = {}; - - // Group by letter - for (const artist of newArtists) { - const letter = artist.letter || '0'; - if (!artistsByLetter[letter]) { - artistsByLetter[letter] = []; - } - artistsByLetter[letter].push(artist); - } - - // Save each letter file - const letters = paths.getAllLetters(); - for (const letter of letters) { - const artists = 
artistsByLetter[letter] || []; - const filePath = paths.getLetterFilePath(outputDir, letter, 'new-artists'); - - const data = { - letter: letter.toUpperCase(), - comparisonDate: getCurrentISO(), - sourceTimestamp, - newArtists: artists, - count: artists.length - }; - - await fs.writeFile(filePath, JSON.stringify(data, null, 2)); - } - } - - /** - * Save comparison report - */ - async saveComparisonReport(outputDir, sourceTimestamp, popularUpdates) { - const perLetter = {}; - const newArtistsByLetter = {}; - - // Calculate per-letter statistics - const letters = paths.getAllLetters(); - for (const letter of letters) { - const filePath = paths.getLetterFilePath(outputDir, letter, 'new-artists'); - - try { - const content = await fs.readFile(filePath, 'utf8'); - const data = JSON.parse(content); - newArtistsByLetter[letter] = data.count; - perLetter[letter] = { - newCount: data.count - }; - } catch { - newArtistsByLetter[letter] = 0; - } - } - - const report = { - timestamp: getCurrentISO(), - sourceDirectory: `scraping-data/artist-lists/${sourceTimestamp}`, - statistics: { - totalGeniusArtists: this.stats.geniusTotal, - totalFirestoreArtists: this.stats.firestoreTotal, - newArtists: this.stats.newArtists, - existingArtists: this.stats.existingArtists, - popularChanges: { - addedToPopular: this.stats.popularToAdd, - removedFromPopular: this.stats.popularToRemove, - totalChanges: this.stats.popularToAdd + this.stats.popularToRemove - } - }, - newArtistsByLetter, - popularUpdates, - errors: this.errorLogger.getSummary() - }; - - const reportPath = `${outputDir}/comparison-report.json`; - await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); - } - - /** - * Run comparison - */ - async compare() { - tui.printHeader('ARTIST COMPARISON'); - - // Load Genius artists - const { allArtists, timestamp: sourceTimestamp, inputDir } = await this.loadArtistLists(); - - // Fetch Firestore artists - await this.fetchFirestoreArtists(); - - // Compare - const { newArtists, 
popularUpdates } = this.compareArtists(allArtists); - - // Prepare output directory - const outputTimestamp = generateTimestamp(); - const outputDir = this.outputDir || await paths.createTimestampedDir('new-artists', outputTimestamp); - - tui.printInfo(`Saving results to: ${outputTimestamp}`); - - // Save filtered lists - await this.saveNewArtistsByLetter(newArtists, outputDir, sourceTimestamp); - - // Save comparison report - await this.saveComparisonReport(outputDir, sourceTimestamp, popularUpdates); - - // Save errors if any - await this.errorLogger.saveToFile(outputDir); - - // Mark complete - await paths.markDirectoryComplete(outputDir); - - return { outputDir, outputTimestamp }; - } - - /** - * Display results - */ - displayResults() { - console.log(''); - tui.printStats('Comparison Results', { - 'Genius Artists': tui.formatNumber(this.stats.geniusTotal), - 'Firestore Artists': tui.formatNumber(this.stats.firestoreTotal), - 'New Artists': tui.formatNumber(this.stats.newArtists), - 'Existing Artists': tui.formatNumber(this.stats.existingArtists) - }); - - if (this.stats.popularToAdd > 0 || this.stats.popularToRemove > 0) { - console.log(''); - tui.printStats('Popular Status Changes', { - 'To Add': tui.formatNumber(this.stats.popularToAdd), - 'To Remove': tui.formatNumber(this.stats.popularToRemove), - 'Total Changes': tui.formatNumber(this.stats.popularToAdd + this.stats.popularToRemove) - }); - } - - if (this.errorLogger.hasErrors()) { - tui.printErrorSummary(this.errorLogger.getErrorCounts()); - } - } -} - -/** - * Parse CLI arguments - */ -function parseArgs() { - const args = process.argv.slice(2); - const options = { - timestamp: null, - outputDir: null, - dryRun: false, - quiet: false - }; - - for (let i = 0; i < args.length; i++) { - const arg = args[i]; - - if (arg === '--date' && args[i + 1]) { - options.timestamp = args[i + 1]; - i++; - } else if (arg === '--output-dir' && args[i + 1]) { - options.outputDir = args[i + 1]; - i++; - } else if (arg === 
'--dry-run') { - options.dryRun = true; - } else if (arg === '--quiet') { - options.quiet = true; - } else if (arg === '--help' || arg === '-h') { - console.log(` -Usage: node scripts/compare-artists.js [options] - -Options: - --date Use specific artist list timestamp (YYYY-MM-DD-HH-MM) - Default: use latest - --output-dir Custom output directory - --dry-run Preview only, don't save files - --quiet Minimal output - --help, -h Show this help message - -Examples: - node scripts/compare-artists.js - node scripts/compare-artists.js --date 2026-01-04-18-30 - node scripts/compare-artists.js --dry-run -`); - process.exit(0); - } - } - - return options; -} - -/** - * Main execution - */ -async function main() { - const options = parseArgs(); - - if (options.dryRun) { - tui.printWarning('DRY RUN MODE: No files will be saved'); - } - - const comparator = new ArtistComparator({ - timestamp: options.timestamp, - outputDir: options.outputDir - }); - - try { - const { outputDir, outputTimestamp } = await comparator.compare(); - - comparator.displayResults(); - - const workflowElapsed = await getWorkflowElapsed(); - if (workflowElapsed !== null) { - tui.printWorkflowTime(workflowElapsed); - } - - tui.printSuccess('Comparison complete!'); - tui.printInfo(`Output: ${outputDir}`); - tui.printFooter(); - - } catch (error) { - tui.printError(`Comparison failed: ${error.message}`); - console.error(error); - process.exit(1); - } -} - -if (import.meta.url === `file://${process.argv[1]}`) { - main(); -} - -export default ArtistComparator; - diff --git a/scripts/prescrape-new-artists.js b/scripts/prescrape-new-artists.js deleted file mode 100644 index f296bc0..0000000 --- a/scripts/prescrape-new-artists.js +++ /dev/null @@ -1,551 +0,0 @@ -#!/usr/bin/env node - -/** - * Prescrape New Artists - * Scrapes song data for newly identified artists from comparison step - */ - -import fs from 'fs/promises'; -import * as cheerio from 'cheerio'; -import * as tui from './utils/tui.js'; -import * as 
paths from './utils/paths.js'; -import { generateTimestamp, getCurrentISO, getWorkflowElapsed } from './utils/timestamp.js'; -import { createErrorLogger } from './utils/error-logger.js'; - -class NewArtistPrescraper { - constructor(options = {}) { - this.inputTimestamp = options.timestamp || null; - this.outputDir = options.outputDir || null; - this.maxSongsPerArtist = options.maxSongsPerArtist || 10; - this.maxArtists = options.maxArtists || null; // Limit total artists for testing - this.letters = options.letters || paths.getAllLetters(); - this.dryRun = options.dryRun || false; - this.delays = { - betweenArtists: options.delayBetweenArtists || 1000, - betweenSongs: options.delayBetweenSongs || 500, - betweenPages: options.delayBetweenPages || 200 - }; - this.api = { - timeout: options.timeout || 10000, - maxRetries: options.maxRetries || 3, - userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' - }; - this.errorLogger = createErrorLogger('prescraper'); - this.stats = { - totalArtists: 0, - processedArtists: 0, - skippedArtists: 0, - totalSongs: 0, - processedSongs: 0, - scrapedLyrics: 0, - failedLyrics: 0 - }; - this.currentProgress = { - letter: '', - artist: '', - song: '' - }; - } - - /** - * Delay utility - */ - delay(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); - } - - /** - * Fetch with timeout and retries - */ - async fetchWithTimeout(url, options = {}) { - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), this.api.timeout); - - let lastError; - for (let attempt = 1; attempt <= this.api.maxRetries; attempt++) { - try { - const response = await fetch(url, { - ...options, - signal: controller.signal, - headers: { - 'User-Agent': this.api.userAgent, - ...options.headers - } - }); - - clearTimeout(timeoutId); - - if (!response.ok && response.status >= 500) { - throw new Error(`HTTP ${response.status}: 
${response.statusText}`); - } - - return response; - } catch (error) { - lastError = error; - clearTimeout(timeoutId); - - if (attempt < this.api.maxRetries) { - const delayMs = Math.pow(2, attempt) * 1000; - await this.delay(delayMs); - } - } - } - - throw lastError || new Error(`Failed after ${this.api.maxRetries} attempts`); - } - - /** - * Load new artists from comparison output - */ - async loadNewArtists() { - const timestamp = this.inputTimestamp || await paths.findLatestTimestamp('new-artists'); - - if (!timestamp) { - throw new Error('No new-artists data found. Run compare-artists.js first.'); - } - - const inputDir = paths.getNewArtistsDir(timestamp); - const isComplete = await paths.isDirectoryComplete(inputDir); - - if (!isComplete) { - tui.printWarning(`New-artists directory not marked complete: ${inputDir}`); - } - - tui.printInfo(`Loading new artists from: ${timestamp}`); - - const artistsByLetter = {}; - let totalArtists = 0; - - for (const letter of this.letters) { - const filePath = paths.getLetterFilePath(inputDir, letter, 'new-artists'); - - try { - const content = await fs.readFile(filePath, 'utf8'); - const data = JSON.parse(content); - - if (data.newArtists && data.newArtists.length > 0) { - artistsByLetter[letter] = data.newArtists; - totalArtists += data.newArtists.length; - } - } catch (error) { - if (error.code !== 'ENOENT') { - this.errorLogger.logError('file_read_error', { - file: filePath, - letter - }, error.message); - } - } - } - - this.stats.totalArtists = totalArtists; - // Apply limit if specified - if (this.maxArtists && this.maxArtists < totalArtists) { - tui.printWarning(`Limiting to first ${this.maxArtists} artists for testing`); - let remainingLimit = this.maxArtists; - - for (const letter of Object.keys(artistsByLetter)) { - if (remainingLimit <= 0) { - delete artistsByLetter[letter]; - } else if (artistsByLetter[letter].length > remainingLimit) { - artistsByLetter[letter] = artistsByLetter[letter].slice(0, remainingLimit); 
- remainingLimit = 0; - } else { - remainingLimit -= artistsByLetter[letter].length; - } - } - - // Recalculate total - totalArtists = Object.values(artistsByLetter).reduce((sum, arr) => sum + arr.length, 0); - this.stats.totalArtists = totalArtists; - } - - tui.printInfo(`Loaded ${tui.formatNumber(totalArtists)} new artists across ${Object.keys(artistsByLetter).length} letters`); - - return { artistsByLetter, timestamp }; - } - - /** - * Scrape songs for a single artist - */ - async scrapeArtistSongs(artist) { - this.currentProgress.artist = artist.name; - - try { - const response = await this.fetchWithTimeout(artist.url); - const html = await response.text(); - const $ = cheerio.load(html); - - const songs = []; - const seenUrls = new Set(); - - // Find all links containing "-lyrics" in the href - $('a[href*="-lyrics"]').each((i, el) => { - if (songs.length >= this.maxSongsPerArtist) return false; // Stop when we have enough - - const songUrl = $(el).attr('href'); - if (!songUrl) return; - - // Build full URL - const fullUrl = songUrl.startsWith('http') ? 
songUrl : `https://genius.com${songUrl}`; - - // Skip if we've already seen this URL - if (seenUrls.has(fullUrl)) return; - seenUrls.add(fullUrl); - - // Extract title - try multiple methods - let title = $(el).text().trim(); - - // If no text, try getting it from the URL - if (!title || title.length === 0) { - const urlMatch = fullUrl.match(/genius\.com\/(.+)-lyrics/); - if (urlMatch) { - title = urlMatch[1].replace(/-/g, ' '); - } - } - - // Only add if we have a valid title and URL looks like a song page - if (title && title.length > 0 && fullUrl.includes('genius.com/') && fullUrl.includes('-lyrics')) { - songs.push({ - title, - url: fullUrl, - artist: artist.name, - artistUrl: artist.url - }); - } - }); - - return songs; - } catch (error) { - this.errorLogger.logError('artist_scrape_failed', { - artist: artist.name, - url: artist.url - }, error.message); - return []; - } - } - - /** - * Scrape lyrics for a single song - */ - async scrapeSongLyrics(song) { - this.currentProgress.song = song.title; - - try { - await this.delay(this.delays.betweenSongs); - const response = await this.fetchWithTimeout(song.url); - const html = await response.text(); - const $ = cheerio.load(html); - - let lyrics = ''; - const lyricsContainers = $('[data-lyrics-container="true"]'); - - if (lyricsContainers.length > 0) { - lyricsContainers.each((i, container) => { - const text = $(container).text().trim(); - if (text) { - lyrics += text + '\n\n'; - } - }); - lyrics = lyrics.trim(); - } - - if (!lyrics || lyrics.length === 0) { - this.errorLogger.logError('empty_lyrics', { - song: song.title, - artist: song.artist, - url: song.url - }, 'No lyrics found'); - this.stats.failedLyrics++; - return null; - } - - this.stats.scrapedLyrics++; - return lyrics; - } catch (error) { - this.errorLogger.logError('lyrics_scrape_failed', { - song: song.title, - artist: song.artist, - url: song.url - }, error.message); - this.stats.failedLyrics++; - return null; - } - } - - /** - * Process a single 
letter - */ - async processLetter(letter, artists, outputDir, progressBar) { - this.currentProgress.letter = letter.toUpperCase(); - const processedSongs = []; - - for (const artist of artists) { - this.currentProgress.artist = artist.name; - this.currentProgress.song = ''; - - // Update progress - progressBar.increment({ - status: `Letter ${letter.toUpperCase()}: ${artist.name}` - }); - - // Scrape artist's songs - const songs = await this.scrapeArtistSongs(artist); - this.stats.processedArtists++; - - if (songs.length === 0) { - this.stats.skippedArtists++; - await this.delay(this.delays.betweenArtists); - continue; - } - - this.stats.totalSongs += songs.length; - - // Scrape lyrics for each song - for (const song of songs) { - const lyrics = await this.scrapeSongLyrics(song); - - if (lyrics) { - processedSongs.push({ - ...song, - lyrics, - scrapedAt: getCurrentISO() - }); - } - - this.stats.processedSongs++; - - // Update progress with song info - progressBar.update({ - status: `Letter ${letter.toUpperCase()}: ${artist.name} - ${song.title}` - }); - } - - await this.delay(this.delays.betweenArtists); - } - - // Save letter file - if (!this.dryRun) { - const filePath = paths.getLetterFilePath(outputDir, letter, 'songs'); - const data = { - letter: letter.toUpperCase(), - scrapedAt: getCurrentISO(), - totalSongs: processedSongs.length, - songs: processedSongs - }; - - await fs.writeFile(filePath, JSON.stringify(data, null, 2)); - } - - return processedSongs.length; - } - - /** - * Run prescraping - */ - async prescrape() { - tui.printHeader('PRESCRAPE NEW ARTISTS'); - - if (this.dryRun) { - tui.printWarning('DRY RUN MODE: No files will be saved'); - } - - // Load new artists - const { artistsByLetter, timestamp: sourceTimestamp } = await this.loadNewArtists(); - - if (this.stats.totalArtists === 0) { - tui.printInfo('No new artists to prescrape!'); - return null; - } - - // Prepare output directory - const outputTimestamp = generateTimestamp(); - const outputDir = 
this.outputDir || await paths.createTimestampedDir('song-data', outputTimestamp); - - tui.printInfo(`Saving results to: ${outputTimestamp}`); - tui.printInfo(`Max songs per artist: ${this.maxSongsPerArtist}`); - - // Create progress bar - const progressBar = tui.createProgressBar( - 'Prescraping', - this.stats.totalArtists, - 'Initializing...' - ); - - // Process each letter - for (const letter of this.letters) { - const artists = artistsByLetter[letter]; - if (!artists || artists.length === 0) continue; - - await this.processLetter(letter, artists, outputDir, progressBar); - } - - progressBar.stop(); - - // Save summary - if (!this.dryRun) { - const summaryPath = `${outputDir}/prescrape-summary.json`; - const summary = { - timestamp: getCurrentISO(), - sourceDirectory: `scraping-data/new-artists/${sourceTimestamp}`, - configuration: { - maxSongsPerArtist: this.maxSongsPerArtist, - letters: this.letters - }, - statistics: { - totalArtists: this.stats.totalArtists, - processedArtists: this.stats.processedArtists, - skippedArtists: this.stats.skippedArtists, - totalSongs: this.stats.totalSongs, - processedSongs: this.stats.processedSongs, - scrapedLyrics: this.stats.scrapedLyrics, - failedLyrics: this.stats.failedLyrics - }, - errors: this.errorLogger.getSummary() - }; - - await fs.writeFile(summaryPath, JSON.stringify(summary, null, 2)); - - // Save errors - await this.errorLogger.saveToFile(outputDir); - - // Mark complete - await paths.markDirectoryComplete(outputDir); - } - - return { outputDir, outputTimestamp }; - } - - /** - * Display results - */ - displayResults() { - console.log(''); - tui.printStats('Prescraping Results', { - 'Total Artists': tui.formatNumber(this.stats.totalArtists), - 'Processed Artists': tui.formatNumber(this.stats.processedArtists), - 'Skipped Artists': tui.formatNumber(this.stats.skippedArtists), - 'Total Songs': tui.formatNumber(this.stats.totalSongs), - 'Scraped Lyrics': tui.formatNumber(this.stats.scrapedLyrics), - 'Failed Lyrics': 
tui.formatNumber(this.stats.failedLyrics), - 'Success Rate': `${((this.stats.scrapedLyrics / this.stats.totalSongs) * 100).toFixed(1)}%` - }); - - if (this.errorLogger.hasErrors()) { - tui.printErrorSummary(this.errorLogger.getErrorCounts()); - } - } -} - -/** - * Parse CLI arguments - */ -function parseArgs() { - const args = process.argv.slice(2); - const options = { - timestamp: null, - outputDir: null, - maxSongsPerArtist: 10, - maxArtists: null, - letters: null, - dryRun: false, - quiet: false - }; - - for (let i = 0; i < args.length; i++) { - const arg = args[i]; - - if (arg === '--date' && args[i + 1]) { - options.timestamp = args[i + 1]; - i++; - } else if (arg === '--output-dir' && args[i + 1]) { - options.outputDir = args[i + 1]; - i++; - } else if (arg === '--max-songs' && args[i + 1]) { - options.maxSongsPerArtist = parseInt(args[i + 1], 10); - i++; - } else if (arg === '--limit' && args[i + 1]) { - options.maxArtists = parseInt(args[i + 1], 10); - i++; - } else if (arg === '--letters' && args[i + 1]) { - options.letters = args[i + 1].split(',').map(l => l.trim().toLowerCase()); - i++; - } else if (arg === '--dry-run') { - options.dryRun = true; - } else if (arg === '--quiet') { - options.quiet = true; - } else if (arg === '--help' || arg === '-h') { - console.log(` -Usage: node scripts/prescrape-new-artists.js [options] - -Options: - --date Use specific new-artists data (YYYY-MM-DD-HH-MM) - Default: use latest - --output-dir Custom output directory - --max-songs Max songs per artist (default: 10) - --limit Max artists to process (for testing) - --letters Comma-separated letters to process (e.g., 'a,b,c') - Default: all letters - --dry-run Preview only, don't save files - --quiet Minimal output - --help, -h Show this help message - -Examples: - node scripts/prescrape-new-artists.js - node scripts/prescrape-new-artists.js --limit 10 --max-songs 2 - node scripts/prescrape-new-artists.js --max-songs 20 - node scripts/prescrape-new-artists.js --letters 
a,b,c - node scripts/prescrape-new-artists.js --date 2026-01-04-20-26 -`); - process.exit(0); - } - } - - return options; -} - -/** - * Main execution - */ -async function main() { - const options = parseArgs(); - - const prescraper = new NewArtistPrescraper({ - timestamp: options.timestamp, - outputDir: options.outputDir, - maxSongsPerArtist: options.maxSongsPerArtist, - maxArtists: options.maxArtists, - letters: options.letters, - dryRun: options.dryRun - }); - - try { - const result = await prescraper.prescrape(); - - if (result) { - prescraper.displayResults(); - - const workflowElapsed = await getWorkflowElapsed(); - if (workflowElapsed !== null) { - tui.printWorkflowTime(workflowElapsed); - } - - tui.printSuccess('Prescraping complete!'); - tui.printInfo(`Output: ${result.outputDir}`); - } else { - tui.printInfo('No work to do.'); - } - - tui.printFooter(); - } catch (error) { - tui.printError(`Prescraping failed: ${error.message}`); - console.error(error); - process.exit(1); - } -} - -if (import.meta.url === `file://${process.argv[1]}`) { - main(); -} - -export default NewArtistPrescraper; - diff --git a/scripts/scrape-artists.js b/scripts/scrape-artists.js deleted file mode 100644 index 1be4afb..0000000 --- a/scripts/scrape-artists.js +++ /dev/null @@ -1,413 +0,0 @@ -#!/usr/bin/env node - -/** - * Artist List Scraper - * Scrapes artist lists from Genius and saves to timestamped directories - */ - -import axios from 'axios'; -import * as cheerio from 'cheerio'; -import fs from 'fs/promises'; -import * as tui from './utils/tui.js'; -import * as paths from './utils/paths.js'; -import { generateTimestamp, getCurrentISO, calculateETA, getWorkflowElapsed } from './utils/timestamp.js'; -import { createErrorLogger } from './utils/error-logger.js'; - -class ArtistScraper { - constructor(options = {}) { - this.baseUrl = 'https://genius.com/artists-index/'; - this.requestDelay = options.requestDelay || 500; - this.includeIds = options.includeIds !== false; - 
this.maxArtistsPerLetter = options.maxArtistsPerLetter || null; // Limit for testing - this.outputDir = options.outputDir || null; - this.errorLogger = createErrorLogger('artist-scraping'); - this.stats = { - totalArtists: 0, - popularArtists: 0, - regularArtists: 0, - artistsWithIds: 0, - idExtractionFailed: 0, - networkErrors: 0 - }; - } - - /** - * Extract artist ID from iOS app link - */ - async extractArtistId(artistUrl, artistName) { - try { - await new Promise(resolve => setTimeout(resolve, this.requestDelay)); - - const response = await axios.get(artistUrl, { - headers: { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' - }, - timeout: 10000 - }); - - const $ = cheerio.load(response.data); - const iosAppLink = $('link[rel="alternate"][href*="ios-app://"]').attr('href'); - - if (iosAppLink) { - const match = iosAppLink.match(/\/artists\/(\d+)$/); - if (match) { - this.stats.artistsWithIds++; - return match[1]; - } - } - - this.stats.idExtractionFailed++; - this.errorLogger.logError('id_extraction_failed', { - artist: artistName, - url: artistUrl - }, 'iOS app link not found'); - - return null; - } catch (error) { - this.stats.idExtractionFailed++; - - if (error.code === 'ECONNABORTED' || error.code === 'ETIMEDOUT') { - this.errorLogger.logError('network_timeout', { - artist: artistName, - url: artistUrl - }, `Request timeout: ${error.message}`); - } else { - this.errorLogger.logError('network_error', { - artist: artistName, - url: artistUrl - }, error.message); - } - - return null; - } - } - - /** - * Scrape artists for a specific letter - */ - async scrapeArtistsByLetter(letter) { - const url = `${this.baseUrl}${letter.toLowerCase()}`; - - try { - const response = await axios.get(url, { - headers: { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36' - }, - timeout: 15000 - }); - - const $ 
= cheerio.load(response.data); - const results = { - popular: [], - regular: [] - }; - - // Extract popular artists - $('li.artists_index_list-popular_artist').each((index, element) => { - const artistLink = $(element).find('a.artists_index_list-artist_name'); - const name = artistLink.text().trim(); - const url = artistLink.attr('href'); - - if (name && url) { - results.popular.push({ - name, - url, - type: 'popular', - id: null - }); - } - }); - - // Extract regular artists - const regularArtistLists = $('ul.artists_index_list').not(':has(.artists_index_list-popular_artist)'); - regularArtistLists.each((listIndex, listElement) => { - $(listElement).find('li').each((index, element) => { - const artistLink = $(element).find('a').first(); - const name = artistLink.text().trim(); - const url = artistLink.attr('href'); - - if (name && url && url.includes('/artists/')) { - results.regular.push({ - name, - url, - type: 'regular', - id: null - }); - } - }); - }); - - // Apply limit if specified (for testing) - if (this.maxArtistsPerLetter) { - const totalArtists = results.popular.length + results.regular.length; - if (totalArtists > this.maxArtistsPerLetter) { - // Prioritize popular artists, then regular - if (results.popular.length >= this.maxArtistsPerLetter) { - results.popular = results.popular.slice(0, this.maxArtistsPerLetter); - results.regular = []; - } else { - const remainingSlots = this.maxArtistsPerLetter - results.popular.length; - results.regular = results.regular.slice(0, remainingSlots); - } - } - } - - this.stats.popularArtists += results.popular.length; - this.stats.regularArtists += results.regular.length; - this.stats.totalArtists += results.popular.length + results.regular.length; - - return results; - - } catch (error) { - this.stats.networkErrors++; - this.errorLogger.logError('network_error', { - letter, - url - }, error.message); - - return { popular: [], regular: [] }; - } - } - - /** - * Extract IDs for all artists in results - */ - async 
extractIds(results, letter, progressBar) { - const allArtists = [...results.popular, ...results.regular]; - const total = allArtists.length; - - for (let i = 0; i < total; i++) { - const artist = allArtists[i]; - artist.id = await this.extractArtistId(artist.url, artist.name); - - if (progressBar) { - progressBar.update(i + 1, { - info: `Letter ${letter.toUpperCase()} - ${artist.name.substring(0, 30)}` - }); - } - } - } - - /** - * Save letter results to file - */ - async saveLetterFile(letter, results, outputDir) { - const filePath = paths.getLetterFilePath(outputDir, letter, 'artists'); - const data = { - letter: letter.toUpperCase(), - scrapedAt: getCurrentISO(), - totalArtists: results.popular.length + results.regular.length, - popularCount: results.popular.length, - regularCount: results.regular.length, - artists: results - }; - - await fs.writeFile(filePath, JSON.stringify(data, null, 2)); - } - - /** - * Scrape all letters - */ - async scrapeAll(lettersToScrape = null) { - const timestamp = generateTimestamp(); - const outputDir = this.outputDir || await paths.createTimestampedDir('artist-lists', timestamp); - - tui.printHeader('ARTIST LIST SCRAPER'); - tui.printInfo(`Output directory: ${outputDir}`); - tui.printInfo(`Include IDs: ${this.includeIds ? 'Yes' : 'No (faster)'}`); - - if (this.maxArtistsPerLetter) { - tui.printWarning(`Limiting to ${this.maxArtistsPerLetter} artists per letter (testing mode)`); - } - - const allLetters = paths.getAllLetters(); - const letters = lettersToScrape || allLetters; - - tui.printInfo(`Letters to scrape: ${letters.length} (${letters.join(', ')})`); - console.log(''); - - const startTime = Date.now(); - const progressBar = tui.createProgressBar('Scraping Progress', letters.length); - - for (let i = 0; i < letters.length; i++) { - const letter = letters[i]; - const displayLetter = letter === '0' ? 
'Numbers' : letter.toUpperCase(); - - progressBar.update(i, { - info: `Current: Letter ${displayLetter}` - }); - - // Scrape artists for this letter - const results = await this.scrapeArtistsByLetter(letter); - - // Extract IDs if requested - if (this.includeIds && (results.popular.length > 0 || results.regular.length > 0)) { - const idBar = tui.createProgressBar(` Extracting IDs (${displayLetter})`, - results.popular.length + results.regular.length); - await this.extractIds(results, letter, idBar); - idBar.stop(); - } - - // Save to file - await this.saveLetterFile(letter, results, outputDir); - - // Update progress - const elapsed = (Date.now() - startTime) / 1000; - const eta = calculateETA(i + 1, letters.length, elapsed); - progressBar.update(i + 1, { - info: `Completed: ${displayLetter} | ETA: ${eta}` - }); - - // Delay between letters - if (i < letters.length - 1) { - await new Promise(resolve => setTimeout(resolve, 1000)); - } - } - - progressBar.stop(); - - // Save summary and errors - await this.saveSummary(outputDir, timestamp, letters); - await this.errorLogger.saveToFile(outputDir); - await paths.markDirectoryComplete(outputDir); - - return { outputDir, timestamp }; - } - - /** - * Save summary.json - */ - async saveSummary(outputDir, timestamp, letters) { - const summary = { - timestamp: getCurrentISO(), - timestampDir: timestamp, - lettersScraped: letters, - statistics: { - totalArtists: this.stats.totalArtists, - popularArtists: this.stats.popularArtists, - regularArtists: this.stats.regularArtists, - artistsWithIds: this.stats.artistsWithIds, - idExtractionFailed: this.stats.idExtractionFailed - }, - errors: this.errorLogger.getSummary() - }; - - const summaryPath = `${outputDir}/summary.json`; - await fs.writeFile(summaryPath, JSON.stringify(summary, null, 2)); - } - - /** - * Display final results - */ - displayResults() { - console.log(''); - tui.printStats('Statistics', { - 'Total Artists': tui.formatNumber(this.stats.totalArtists), - 'Popular 
Artists': tui.formatNumber(this.stats.popularArtists), - 'Regular Artists': tui.formatNumber(this.stats.regularArtists), - 'Artists with IDs': tui.formatNumber(this.stats.artistsWithIds), - 'ID Extraction Failed': tui.formatNumber(this.stats.idExtractionFailed) - }); - - if (this.errorLogger.hasErrors()) { - tui.printErrorSummary(this.errorLogger.getErrorCounts()); - } - } -} - -/** - * Parse CLI arguments - */ -function parseArgs() { - const args = process.argv.slice(2); - const options = { - letters: null, - includeIds: true, - maxArtistsPerLetter: null, - outputDir: null, - quiet: false - }; - - for (let i = 0; i < args.length; i++) { - const arg = args[i]; - - if (arg === '--letters' && args[i + 1]) { - const letterArg = args[i + 1]; - options.letters = letterArg.split(',').map(l => l.trim().toLowerCase()); - i++; - } else if (arg === '--no-ids') { - options.includeIds = false; - } else if (arg === '--limit' && args[i + 1]) { - options.maxArtistsPerLetter = parseInt(args[i + 1], 10); - i++; - } else if (arg === '--output-dir' && args[i + 1]) { - options.outputDir = args[i + 1]; - i++; - } else if (arg === '--quiet') { - options.quiet = true; - } else if (arg === '--help' || arg === '-h') { - console.log(` -Usage: node scripts/scrape-artists.js [options] - -Options: - --letters Comma-separated letters to scrape (e.g., "a,b,c" or "j,k") - Default: all letters (0, a-z) - --no-ids Skip artist ID extraction (much faster) - --limit Max artists per letter (for testing) - --output-dir Custom output directory - --quiet Minimal output (no TUI) - --help, -h Show this help message - -Examples: - node scripts/scrape-artists.js - node scripts/scrape-artists.js --letters j --limit 10 - node scripts/scrape-artists.js --letters j,k - node scripts/scrape-artists.js --no-ids - node scripts/scrape-artists.js --letters a,b,c --no-ids -`); - process.exit(0); - } - } - - return options; -} - -/** - * Main execution - */ -async function main() { - const options = parseArgs(); - - 
const scraper = new ArtistScraper({ - includeIds: options.includeIds, - maxArtistsPerLetter: options.maxArtistsPerLetter, - outputDir: options.outputDir - }); - - try { - const { outputDir, timestamp } = await scraper.scrapeAll(options.letters); - - scraper.displayResults(); - - const workflowElapsed = await getWorkflowElapsed(); - if (workflowElapsed !== null) { - tui.printWorkflowTime(workflowElapsed); - } - - tui.printSuccess(`Scraping complete!`); - tui.printInfo(`Output: ${outputDir}`); - tui.printFooter(); - - } catch (error) { - tui.printError(`Scraping failed: ${error.message}`); - console.error(error); - process.exit(1); - } -} - -if (import.meta.url === `file://${process.argv[1]}`) { - main(); -} - -export default ArtistScraper; - diff --git a/scripts/utils/error-logger.js b/scripts/utils/error-logger.js deleted file mode 100644 index 4d51493..0000000 --- a/scripts/utils/error-logger.js +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Error Logger - * Handles error collection and logging to errors.json - */ - -import fs from 'fs/promises'; -import path from 'path'; -import { getCurrentISO } from './timestamp.js'; - -/** - * Error Logger class - */ -export class ErrorLogger { - constructor(phase) { - this.phase = phase; - this.errors = []; - this.errorCounts = {}; - } - - /** - * Log an error - * @param {string} type - Error type (e.g., 'network_timeout', 'parsing_failed') - * @param {object} context - Error context (artist, song, url, etc.) 
- * @param {string} message - Error message - */ - logError(type, context, message) { - const error = { - type, - timestamp: getCurrentISO(), - message, - ...context - }; - - this.errors.push(error); - - // Update counts - this.errorCounts[type] = (this.errorCounts[type] || 0) + 1; - } - - /** - * Get total error count - * @returns {number} Total errors - */ - getTotalErrors() { - return this.errors.length; - } - - /** - * Get error counts by type - * @returns {object} Error counts - */ - getErrorCounts() { - return { ...this.errorCounts }; - } - - /** - * Get all errors - * @returns {array} All error objects - */ - getAllErrors() { - return [...this.errors]; - } - - /** - * Check if there are any errors - * @returns {boolean} True if errors exist - */ - hasErrors() { - return this.errors.length > 0; - } - - /** - * Save errors to errors.json file - * @param {string} outputDir - Directory to save errors.json - * @returns {Promise} - */ - async saveToFile(outputDir) { - if (!this.hasErrors()) { - return; - } - - const errorData = { - phase: this.phase, - timestamp: getCurrentISO(), - totalErrors: this.getTotalErrors(), - errorsByType: this.getErrorCounts(), - errors: this.getAllErrors() - }; - - const filePath = path.join(outputDir, 'errors.json'); - await fs.writeFile(filePath, JSON.stringify(errorData, null, 2)); - } - - /** - * Create a summary object - * @returns {object} Error summary - */ - getSummary() { - return { - totalErrors: this.getTotalErrors(), - errorsByType: this.getErrorCounts() - }; - } - - /** - * Clear all errors - */ - clear() { - this.errors = []; - this.errorCounts = {}; - } -} - -/** - * Create a new error logger - * @param {string} phase - Phase name (e.g., 'scraping', 'prescraping', 'uploading') - * @returns {ErrorLogger} Error logger instance - */ -export function createErrorLogger(phase) { - return new ErrorLogger(phase); -} - diff --git a/scripts/utils/timestamp.js b/scripts/utils/timestamp.js deleted file mode 100644 index 
49236ff..0000000 --- a/scripts/utils/timestamp.js +++ /dev/null @@ -1,163 +0,0 @@ -/** - * Timestamp Utilities - * Handles timestamp generation and parsing for directory naming - */ - -import fs from 'fs/promises'; -import path from 'path'; -import { fileURLToPath } from 'url'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); -const WORKSPACE_ROOT = path.resolve(__dirname, '../..'); - -/** - * Generate a timestamp string for directory naming - * Format: YYYY-MM-DD-HH-MM - * @returns {string} Timestamp string - */ -export function generateTimestamp() { - const now = new Date(); - const year = now.getFullYear(); - const month = String(now.getMonth() + 1).padStart(2, '0'); - const day = String(now.getDate()).padStart(2, '0'); - const hours = String(now.getHours()).padStart(2, '0'); - const minutes = String(now.getMinutes()).padStart(2, '0'); - - return `${year}-${month}-${day}-${hours}-${minutes}`; -} - -/** - * Parse a timestamp string into a Date object - * @param {string} timestamp - Timestamp in format YYYY-MM-DD-HH-MM - * @returns {Date} Date object - */ -export function parseTimestamp(timestamp) { - const parts = timestamp.split('-'); - if (parts.length !== 5) { - throw new Error(`Invalid timestamp format: ${timestamp}. 
Expected YYYY-MM-DD-HH-MM`); - } - - const [year, month, day, hours, minutes] = parts.map(Number); - return new Date(year, month - 1, day, hours, minutes); -} - -/** - * Validate a timestamp string - * @param {string} timestamp - Timestamp to validate - * @returns {boolean} True if valid - */ -export function isValidTimestamp(timestamp) { - try { - const parts = timestamp.split('-'); - if (parts.length !== 5) return false; - - const date = parseTimestamp(timestamp); - return !isNaN(date.getTime()); - } catch { - return false; - } -} - -/** - * Format a Date object as ISO string - * @param {Date} date - Date to format - * @returns {string} ISO string - */ -export function toISOString(date) { - return date.toISOString(); -} - -/** - * Get current ISO timestamp - * @returns {string} Current time as ISO string - */ -export function getCurrentISO() { - return new Date().toISOString(); -} - -/** - * Format duration in seconds to human readable string - * @param {number} seconds - Duration in seconds - * @returns {string} Formatted duration (e.g., "2h 15m 30s") - */ -export function formatDuration(seconds) { - if (seconds < 60) { - return `${Math.round(seconds)}s`; - } - - const hours = Math.floor(seconds / 3600); - const minutes = Math.floor((seconds % 3600) / 60); - const secs = Math.floor(seconds % 60); - - const parts = []; - if (hours > 0) parts.push(`${hours}h`); - if (minutes > 0) parts.push(`${minutes}m`); - if (secs > 0 || parts.length === 0) parts.push(`${secs}s`); - - return parts.join(' '); -} - -/** - * Calculate estimated time remaining - * @param {number} completed - Items completed - * @param {number} total - Total items - * @param {number} elapsedSeconds - Elapsed time in seconds - * @returns {string} Formatted ETA - */ -export function calculateETA(completed, total, elapsedSeconds) { - if (completed === 0) return 'Calculating...'; - if (completed >= total) return '0s'; - - const rate = completed / elapsedSeconds; - const remaining = total - completed; - 
const etaSeconds = remaining / rate; - - return formatDuration(etaSeconds); -} - -/** - * Get elapsed time from a directory's creation timestamp - * @param {string} dirPath - Path to the directory - * @returns {Promise} Elapsed seconds, or null if directory doesn't exist - */ -export async function getWorkflowElapsedFromDir(dirPath) { - try { - const stats = await fs.stat(dirPath); - const elapsed = (Date.now() - stats.birthtimeMs) / 1000; - return elapsed; - } catch (error) { - return null; - } -} - -/** - * Get elapsed time from the latest artist-lists directory - * @returns {Promise} Elapsed seconds, or null if no directory exists - */ -export async function getWorkflowElapsed() { - try { - const artistListsBase = path.join(WORKSPACE_ROOT, 'scraping-data', 'artist-lists'); - const entries = await fs.readdir(artistListsBase, { withFileTypes: true }); - - // Find all timestamp directories - const timestampDirs = entries - .filter(entry => entry.isDirectory() && isValidTimestamp(entry.name)) - .map(entry => ({ - name: entry.name, - path: path.join(artistListsBase, entry.name) - })) - .sort((a, b) => b.name.localeCompare(a.name)); // Most recent first - - if (timestampDirs.length === 0) { - return null; - } - - // Use the most recent directory - const latestDir = timestampDirs[0]; - return await getWorkflowElapsedFromDir(latestDir.path); - } catch (error) { - return null; - } -} - diff --git a/scripts/utils/tui.js b/scripts/utils/tui.js deleted file mode 100644 index 9e45dc7..0000000 --- a/scripts/utils/tui.js +++ /dev/null @@ -1,256 +0,0 @@ -/** - * TUI (Terminal User Interface) Utilities - * Provides progress bars and formatted output - * NO EMOJIS - Clean professional output only - */ - -import cliProgress from 'cli-progress'; -import chalk from 'chalk'; -import * as timestamp from './timestamp.js'; - -/** - * Create a new progress bar - * @param {string} title - Title for the progress bar - * @param {number} total - Total items - * @param {boolean} 
showWorkflowTime - Show workflow elapsed time - * @returns {object} Progress bar instance - */ -export function createProgressBar(title, total, showWorkflowTime = true) { - const format = showWorkflowTime - ? `${title}: [{bar}] {percentage}% | {value}/{total} | ETA: {eta_formatted} | Total: {workflow_time}` - : `${title}: [{bar}] {percentage}% | {value}/{total} | ETA: {eta_formatted}`; - - const bar = new cliProgress.SingleBar({ - format, - barCompleteChar: '\u2588', - barIncompleteChar: '\u2591', - hideCursor: true, - clearOnComplete: false, - stopOnComplete: true - }); - - bar.start(total, 0, { - eta_formatted: 'Calculating...', - workflow_time: '0s' - }); - - // Override update to include workflow time - if (showWorkflowTime) { - const originalUpdate = bar.update.bind(bar); - bar.update = async function(value, payload = {}) { - const elapsed = await timestamp.getWorkflowElapsed(); - if (elapsed !== null) { - payload.workflow_time = timestamp.formatDuration(elapsed); - } - originalUpdate(value, payload); - }; - } - - return bar; -} - -/** - * Create a multi-bar progress container - * @returns {object} MultiBar instance - */ -export function createMultiBar() { - return new cliProgress.MultiBar({ - clearOnComplete: false, - hideCursor: true, - format: '{title}: [{bar}] {percentage}% | {value}/{total} | {info}' - }); -} - -/** - * Print section header - * @param {string} title - Header title - */ -export function printHeader(title) { - const width = 80; - const line = '='.repeat(width); - console.log('\n' + line); - console.log(title); - console.log(line + '\n'); -} - -/** - * Print section footer - */ -export function printFooter() { - const width = 80; - console.log('='.repeat(width) + '\n'); -} - -/** - * Print info message - * @param {string} message - Message to print - */ -export function printInfo(message) { - console.log(`[INFO] ${message}`); -} - -/** - * Print success message - * @param {string} message - Message to print - */ -export function 
printSuccess(message) { - console.log(chalk.green(`[SUCCESS] ${message}`)); -} - -/** - * Print warning message - * @param {string} message - Message to print - */ -export function printWarning(message) { - console.log(chalk.yellow(`[WARN] ${message}`)); -} - -/** - * Print error message - * @param {string} message - Message to print - */ -export function printError(message) { - console.log(chalk.red(`[ERROR] ${message}`)); -} - -/** - * Print statistics table - * @param {string} title - Table title - * @param {object} stats - Statistics object - */ -export function printStats(title, stats) { - console.log(`\n${title}:`); - for (const [key, value] of Object.entries(stats)) { - const formattedKey = key.replace(/([A-Z])/g, ' $1').trim(); - const capitalizedKey = formattedKey.charAt(0).toUpperCase() + formattedKey.slice(1); - console.log(` ${capitalizedKey}: ${value}`); - } -} - -/** - * Print error summary - * @param {object} errorCounts - Error counts by type - */ -export function printErrorSummary(errorCounts) { - if (Object.keys(errorCounts).length === 0) { - return; - } - - console.log('\nErrors:'); - for (const [type, count] of Object.entries(errorCounts)) { - if (count > 0) { - const formattedType = type.replace(/_/g, ' '); - console.log(` ${formattedType}: ${count}`); - } - } -} - -/** - * Format number with commas - * @param {number} num - Number to format - * @returns {string} Formatted number - */ -export function formatNumber(num) { - return num.toLocaleString(); -} - -/** - * Create a status display that updates in place - * @returns {object} Status display object - */ -export function createStatusDisplay() { - let lastLine = ''; - - return { - /** - * Update the status line - * @param {string} message - Status message - */ - update(message) { - if (lastLine) { - process.stdout.write('\r' + ' '.repeat(lastLine.length) + '\r'); - } - process.stdout.write(message); - lastLine = message; - }, - - /** - * Clear the status line - */ - clear() { - if (lastLine) 
{ - process.stdout.write('\r' + ' '.repeat(lastLine.length) + '\r'); - lastLine = ''; - } - }, - - /** - * Finish with a newline - */ - finish() { - if (lastLine) { - process.stdout.write('\n'); - lastLine = ''; - } - } - }; -} - -/** - * Ask for user confirmation - * @param {string} question - Question to ask - * @returns {Promise} True if user confirms - */ -export async function confirm(question) { - const readline = await import('readline'); - const rl = readline.createInterface({ - input: process.stdin, - output: process.stdout - }); - - return new Promise((resolve) => { - rl.question(`${question} (Y/n): `, (answer) => { - rl.close(); - const normalized = answer.toLowerCase().trim(); - resolve(normalized === 'y' || normalized === 'yes' || normalized === ''); - }); - }); -} - -/** - * Print progress info (current item being processed) - * @param {string} label - Label (e.g., "Current Letter") - * @param {string} value - Value to display - */ -export function printProgressInfo(label, value) { - console.log(`${label}: ${value}`); -} - -/** - * Clear console (use sparingly) - */ -export function clearConsole() { - console.clear(); -} - -/** - * Print workflow elapsed time - * @param {number} seconds - Elapsed seconds - * @param {string} label - Optional label - */ -export function printWorkflowTime(seconds, label = 'Workflow Elapsed Time') { - console.log(`${label}: ${chalk.cyan(timestamp.formatDuration(seconds))}`); -} - -/** - * Print total workflow time at completion - * @param {number} seconds - Total elapsed seconds - */ -export async function printTotalWorkflowTime(seconds) { - console.log(''); - console.log(chalk.bold.cyan('─'.repeat(60))); - console.log(chalk.bold.cyan(`TOTAL WORKFLOW TIME: ${timestamp.formatDuration(seconds)}`)); - console.log(chalk.bold.cyan('─'.repeat(60))); - console.log(''); -} - diff --git a/scripts/search-songs-by-id.js b/search-songs-by-id.js similarity index 98% rename from scripts/search-songs-by-id.js rename to 
search-songs-by-id.js index 851516c..e5c7bd2 100644 --- a/scripts/search-songs-by-id.js +++ b/search-songs-by-id.js @@ -5,7 +5,7 @@ import { initializeApp } from 'firebase/app'; import { getFirestore, doc, getDoc } from 'firebase/firestore'; -import { firebaseConfig } from '../src/lib/services/initFirebase.js'; +import { firebaseConfig } from './src/lib/services/initFirebase.js'; import * as readline from 'readline'; // Initialize Firebase diff --git a/src/lib/components/GrayscaleImageRenderer.svelte b/src/lib/components/GrayscaleImageRenderer.svelte index 763975a..33d1e96 100644 --- a/src/lib/components/GrayscaleImageRenderer.svelte +++ b/src/lib/components/GrayscaleImageRenderer.svelte @@ -1,24 +1,7 @@
{#if selectedSong.imageUrl} - {selectedSong.title} + {#key selectedSong.imageUrl} + {#if $ditherImages && grayscaleImageData && imageMetadata && !useFallback} + + {:else} + {selectedSong.title} + {/if} + {/key} {:else}
@@ -274,6 +400,18 @@
Completed: {new Date(selectedSong.completedAt).toLocaleDateString()}
+ + +
{:else}
@@ -580,8 +718,31 @@ .completion-date { text-align: center; opacity: 0.6; + color: var(--primary-color); + } + + .replay-button { + display: flex; + align-items: center; + justify-content: center; + gap: 6px; margin-top: auto; + padding: 8px 16px; + background-color: var(--secondary-color); color: var(--primary-color); + border: 1px solid var(--primary-color); + cursor: pointer; + font-family: inherit; + transition: background-color 0.15s ease; + } + + .replay-button:hover { + background-color: var(--primary-color); + color: var(--secondary-color); + } + + .replay-button:hover svg path { + fill: var(--secondary-color); } .no-selection { diff --git a/src/lib/components/TypingTest.svelte b/src/lib/components/TypingTest.svelte index 927f695..9760c1e 100644 --- a/src/lib/components/TypingTest.svelte +++ b/src/lib/components/TypingTest.svelte @@ -1,5 +1,5 @@ + + + diff --git a/tooltip-styles.css b/tooltip-styles.css new file mode 100644 index 0000000..9325fdf --- /dev/null +++ b/tooltip-styles.css @@ -0,0 +1,173 @@ +/* Tooltip styling matching the website's topbar design */ + +.topbar-tooltip { + position: fixed; + z-index: 9999; + pointer-events: none; + background-color: var(--secondary-color); + border: 2px solid var(--primary-color); + min-width: 120px; + max-width: 300px; + padding: 8px 12px; + font-family: 'SysFont', sans-serif; + font-size: 14px; + line-height: 1.2; + display: flex; + justify-content: center; + align-items: center; + position: relative; + box-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3); + /* Background lines pattern */ + background-image: + linear-gradient(to right, var(--primary-color) 0%, var(--primary-color) 100%), + linear-gradient(to right, var(--primary-color) 0%, var(--primary-color) 100%), + linear-gradient(to right, var(--primary-color) 0%, var(--primary-color) 100%), + linear-gradient(to right, var(--primary-color) 0%, var(--primary-color) 100%), + linear-gradient(to right, var(--primary-color) 0%, var(--primary-color) 100%), + 
linear-gradient(to right, var(--primary-color) 0%, var(--primary-color) 100%); + background-size: + 100% 1.5px, + 100% 1.5px, + 100% 1.5px, + 100% 1.5px, + 100% 1.5px, + 100% 1.5px; + background-position: + 0 2px, + 0 6px, + 0 10px, + 0 14px, + 0 18px, + 0 22px; + background-repeat: no-repeat; +} + +.topbar-tooltip-text { + background-color: var(--secondary-color); + color: var(--primary-color); + padding: 2px 6px; + z-index: 1; + position: relative; + text-align: center; + white-space: nowrap; + /* White outline effect around text */ + text-shadow: + -1px -1px 0 var(--secondary-color), + 1px -1px 0 var(--secondary-color), + -1px 1px 0 var(--secondary-color), + 1px 1px 0 var(--secondary-color), + -2px 0 0 var(--secondary-color), + 2px 0 0 var(--secondary-color), + 0 -2px 0 var(--secondary-color), + 0 2px 0 var(--secondary-color); + user-select: none; +} + +/* Arrow pointing up (tooltip appears below cursor) */ +.topbar-tooltip::before { + content: ''; + position: absolute; + top: -8px; + left: 50%; + transform: translateX(-50%); + width: 0; + height: 0; + border-left: 6px solid transparent; + border-right: 6px solid transparent; + border-bottom: 6px solid var(--primary-color); + z-index: 2; +} + +.topbar-tooltip::after { + content: ''; + position: absolute; + top: -6px; + left: 50%; + transform: translateX(-50%); + width: 0; + height: 0; + border-left: 4px solid transparent; + border-right: 4px solid transparent; + border-bottom: 4px solid var(--secondary-color); + z-index: 3; +} + +/* Alternative version with arrow pointing down (tooltip appears above cursor) */ +.topbar-tooltip.arrow-down::before { + top: auto; + bottom: -8px; + border-bottom: none; + border-top: 6px solid var(--primary-color); +} + +.topbar-tooltip.arrow-down::after { + top: auto; + bottom: -6px; + border-bottom: none; + border-top: 4px solid var(--secondary-color); +} + +/* Alternative version with arrow pointing left (tooltip appears to the right) */ +.topbar-tooltip.arrow-left::before { + 
top: 50%; + left: -8px; + transform: translateY(-50%); + border-bottom: 6px solid transparent; + border-top: 6px solid transparent; + border-right: 6px solid var(--primary-color); + border-left: none; +} + +.topbar-tooltip.arrow-left::after { + top: 50%; + left: -6px; + transform: translateY(-50%); + border-bottom: 4px solid transparent; + border-top: 4px solid transparent; + border-right: 4px solid var(--secondary-color); + border-left: none; +} + +/* Alternative version with arrow pointing right (tooltip appears to the left) */ +.topbar-tooltip.arrow-right::before { + top: 50%; + right: -8px; + left: auto; + transform: translateY(-50%); + border-bottom: 6px solid transparent; + border-top: 6px solid transparent; + border-left: 6px solid var(--primary-color); + border-right: none; +} + +.topbar-tooltip.arrow-right::after { + top: 50%; + right: -6px; + left: auto; + transform: translateY(-50%); + border-bottom: 4px solid transparent; + border-top: 4px solid transparent; + border-left: 4px solid var(--secondary-color); + border-right: none; +} + +/* Fade in/out animation */ +.topbar-tooltip { + opacity: 0; + transition: opacity 0.2s ease-in-out; +} + +.topbar-tooltip.visible { + opacity: 1; +} + +/* Responsive sizing */ +@media (max-width: 768px) { + .topbar-tooltip { + font-size: 12px; + padding: 6px 10px; + min-width: 100px; + max-width: 250px; + } +} + diff --git a/ui-debug.log b/ui-debug.log new file mode 100644 index 0000000..edaaf3e --- /dev/null +++ b/ui-debug.log @@ -0,0 +1,2 @@ +Web / API server started at 127.0.0.1:4000 +Web / API server started at ::1:4000 diff --git a/upload-remaining-artists.js b/upload-remaining-artists.js new file mode 100644 index 0000000..4a395cb --- /dev/null +++ b/upload-remaining-artists.js @@ -0,0 +1,469 @@ +import { initializeApp } from 'firebase/app'; +import { getFirestore, collection, writeBatch, doc } from 'firebase/firestore'; +import fs from 'fs'; +import path from 'path'; +import unidecode from 'unidecode'; +import { 
firebaseConfig } from './src/lib/services/initFirebase.js'; + +// Initialize Firebase using centralized config +const app = initializeApp(firebaseConfig); +const db = getFirestore(app); + +class RemainingArtistUploader { + constructor() { + this.batchSize = 50; // Much smaller batches to avoid rate limits + this.collectionName = 'artists'; + this.delayBetweenBatches = 5000; // 5 seconds between batches + this.maxRetries = 5; + this.uploadStats = { + totalProcessed: 0, + successful: 0, + failed: 0, + skipped: 0, + retried: 0, + startTime: new Date() + }; + } + + sanitizeFieldValue(text) { + if (!text) return text; + + // Remove null bytes and other problematic control characters + let sanitized = text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, ''); + + // Remove Unicode tag characters and other problematic invisible characters + // Use proper Unicode regex patterns for JavaScript + sanitized = sanitized.replace(/[\uFE00-\uFE0F]/g, ''); // Variation selectors + sanitized = sanitized.replace(/[\u200B-\u200F]/g, ''); // Zero-width characters + sanitized = sanitized.replace(/[\u2060-\u206F]/g, ''); // Additional invisible characters + sanitized = sanitized.replace(/[\uFEFF]/g, ''); // Byte order mark + + // Remove Unicode tag characters (which require surrogate pairs in JavaScript) + // Tag characters are in the range U+E0000-U+E007F, represented as surrogate pairs + sanitized = sanitized.replace(/\uDB40[\uDC00-\uDC7F]/g, ''); // Tag characters + + // Also remove any orphaned high surrogates that might cause encoding issues + sanitized = sanitized.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g, ''); // Orphaned high surrogates + sanitized = sanitized.replace(/(? 
text.replace(/[.,\-_'"!?&@#$%^*()+=\[\]{};:|<>\/\\`~]/g, '').replace(/\s+/g, ' ').trim(); + const cleanNameNoPunct = removePunctuation(cleanName); + const normalizedNameNoPunct = removePunctuation(normalizedName); + + // Generate tokens for all versions: original, normalized, and punctuation-free + const versions = [cleanName]; + if (normalizedName !== cleanName) { + versions.push(normalizedName); + } + if (cleanNameNoPunct !== cleanName && cleanNameNoPunct.length > 0) { + versions.push(cleanNameNoPunct); + } + if (normalizedNameNoPunct !== normalizedName && normalizedNameNoPunct !== cleanNameNoPunct && normalizedNameNoPunct.length > 0) { + versions.push(normalizedNameNoPunct); + } + + for (const version of versions) { + for (let i = 1; i <= version.length; i++) { + tokens.add(version.substring(0, i)); + } + + const words = version.split(/\s+/); + for (const word of words) { + if (word.length > 0) { + for (let i = 1; i <= word.length; i++) { + tokens.add(word.substring(0, i)); + } + } + } + tokens.add(version); + } + + tokens.add(name); + + return Array.from(tokens) + .filter(token => token.length > 0 && token.length <= 100) + .map(token => this.sanitizeFieldValue(token)) + .filter(token => token && token.length > 0); + } + + extractSlug(url) { + const match = url.match(/\/artists\/(.+)$/); + if (!match) return null; + + let slug = match[1]; + slug = unidecode(slug); + slug = slug.replace(/[\/]/g, '-'); + slug = slug.replace(/[.#$\[\]]/g, '-'); + slug = slug.replace(/[^\w\-_.~]/g, '-'); + slug = slug.replace(/-+/g, '-'); + slug = slug.replace(/^-+|-+$/g, ''); + + if (slug.length > 800) { + slug = slug.substring(0, 800).replace(/-+$/, ''); + } + + if (!slug || slug.trim() === '') { + return null; + } + + return slug; + } + + validateFirestoreData(data, path = '') { + const issues = []; + + if (data === null || data === undefined) { + return issues; + } + + if (typeof data === 'string') { + // Check for invalid UTF-8 sequences + try { + encodeURIComponent(data); + } 
catch (e) { + issues.push(`Invalid UTF-8 in string at ${path}`); + } + + // Check for control characters + if (/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(data)) { + issues.push(`Control characters in string at ${path}`); + } + + // Check for extremely long strings + if (data.length > 1048487) { // Firestore's 1MB limit for strings + issues.push(`String too long at ${path}: ${data.length} characters`); + } + } else if (Array.isArray(data)) { + // Check array size + if (data.length > 20000) { // Firestore has array size limits + issues.push(`Array too large at ${path}: ${data.length} items`); + } + + data.forEach((item, index) => { + issues.push(...this.validateFirestoreData(item, `${path}[${index}]`)); + }); + } else if (typeof data === 'object') { + Object.entries(data).forEach(([key, value]) => { + // Check field name validity + if (key.includes('.') || key.includes('/') || key.startsWith('__')) { + issues.push(`Invalid field name: "${key}"`); + } + + issues.push(...this.validateFirestoreData(value, path ? 
`${path}.${key}` : key)); + }); + } + + return issues; + } + + transformArtist(artist) { + if (!artist.name || !artist.url) { + return null; + } + + const slug = this.extractSlug(artist.url); + if (!slug) { + return null; + } + + const sanitizedName = this.sanitizeFieldValue(artist.name); + const sanitizedUrl = this.sanitizeFieldValue(artist.url); + + // Additional validation for problematic cases + if (!sanitizedName || sanitizedName.trim() === '' || !sanitizedUrl || sanitizedUrl.trim() === '') { + console.warn(`❌ Failed to sanitize artist: ${artist.name} - ${artist.url}`); + return null; + } + + const nameForSorting = sanitizedName.toLowerCase().replace(/^(the |a |an )/, ''); + const firstLetter = nameForSorting.charAt(0).toLowerCase(); + + // Generate search tokens with extra validation + let searchTokens; + try { + searchTokens = this.generateSearchTokens(sanitizedName); + + // Limit search tokens to prevent oversized arrays + if (searchTokens.length > 5000) { + console.warn(`⚠️ Too many search tokens for ${sanitizedName}, limiting to 5000`); + searchTokens = searchTokens.slice(0, 5000); + } + } catch (error) { + console.warn(`❌ Failed to generate search tokens for ${sanitizedName}: ${error.message}`); + searchTokens = [sanitizedName.toLowerCase()]; + } + + const transformedArtist = { + id: slug, + name: sanitizedName, + url: sanitizedUrl, + geniusId: artist.id || null, + type: artist.type || 'regular', + searchTokens: searchTokens, + nameForSorting: nameForSorting, + uploadedAt: new Date().toISOString(), + firstLetter: firstLetter + }; + + // Final validation check + const validationIssues = this.validateFirestoreData(transformedArtist); + if (validationIssues.length > 0) { + console.warn(`❌ Validation failed for ${sanitizedName}: ${validationIssues.join(', ')}`); + return null; + } + + return transformedArtist; + } + + getRemainingArtists(dataDir) { + const allArtists = []; + const fileKeys = ['0', ...('abcdefghijklmnopqrstuvwxyz'.split(''))]; + + for (const 
key of fileKeys) { + const filename = `genius-artists-${key}.json`; + const filepath = path.join(dataDir, filename); + + if (fs.existsSync(filepath)) { + try { + const fileContent = fs.readFileSync(filepath, 'utf8'); + const letterData = JSON.parse(fileContent); + + if (letterData.artists?.popular) { + allArtists.push(...letterData.artists.popular); + } + + if (letterData.artists?.regular) { + allArtists.push(...letterData.artists.regular); + } + } catch (error) { + console.error(`❌ Error reading ${filename}:`, error.message); + } + } + } + + const transformedArtists = allArtists + .map(artist => this.transformArtist(artist)) + .filter(artist => artist !== null); + + // Get artists from the problematic batches (8 and 33) + const problematicBatches = [8, 33]; + const problematicArtists = []; + + for (const batchNum of problematicBatches) { + const startIdx = (batchNum - 1) * 500; // Original batch size was 500 + const endIdx = Math.min(startIdx + 500, transformedArtists.length); + const batchArtists = transformedArtists.slice(startIdx, endIdx); + problematicArtists.push(...batchArtists); + } + + console.log(`📊 Found ${problematicArtists.length} artists from problematic batches`); + return problematicArtists; + } + + async uploadBatchWithRateLimit(batchArtists, batchNum, totalBatches, attempt = 1) { + try { + const batch = writeBatch(db); + const artistsCollection = collection(db, this.collectionName); + + for (const artist of batchArtists) { + const docRef = doc(artistsCollection, artist.id); + const { id, ...documentData } = artist; + batch.set(docRef, documentData); + } + + await batch.commit(); + + const retryText = attempt > 1 ? 
` (retry ${attempt})` : ''; + console.log(`✅ Batch ${batchNum}/${totalBatches} uploaded (${batchArtists.length} artists)${retryText}`); + + return { success: true, uploaded: batchArtists.length, failed: 0 }; + } catch (error) { + // If batch fails, try uploading documents individually to identify problematic ones + if (attempt === 1) { + console.warn(`⚠️ Batch ${batchNum} failed, trying individual uploads...`); + return await this.uploadIndividually(batchArtists, batchNum, totalBatches); + } + throw error; + } + } + + async uploadIndividually(batchArtists, batchNum, totalBatches) { + const artistsCollection = collection(db, this.collectionName); + let uploaded = 0; + let failed = 0; + + for (const artist of batchArtists) { + try { + const docRef = doc(artistsCollection, artist.id); + const { id, ...documentData } = artist; + + const batch = writeBatch(db); + batch.set(docRef, documentData); + await batch.commit(); + + uploaded++; + } catch (error) { + console.warn(`❌ Failed to upload individual artist: ${artist.name} - ${error.message}`); + failed++; + + // Log problematic artist for analysis + fs.appendFileSync('failed_artists.json', JSON.stringify({ + name: artist.name, + url: artist.url, + error: error.message, + timestamp: new Date().toISOString() + }) + '\n'); + } + } + + if (uploaded > 0) { + console.log(`✅ Batch ${batchNum}/${totalBatches} individual upload: ${uploaded} successful, ${failed} failed`); + } + + return { success: uploaded > 0, uploaded, failed }; + } + + async uploadRemainingArtists(dataDir) { + console.log('🚀 Rate-Limited Upload of Remaining Artists...\n'); + + const remainingArtists = this.getRemainingArtists(dataDir); + const totalBatches = Math.ceil(remainingArtists.length / this.batchSize); + + console.log(`📦 Uploading ${remainingArtists.length} artists in ${totalBatches} small batches`); + console.log(`⏱️ Batch size: ${this.batchSize} (reduced from 500)`); + console.log(`⏰ Delay between batches: ${this.delayBetweenBatches}ms`); + 
console.log(''); + + for (let i = 0; i < totalBatches; i++) { + const startIdx = i * this.batchSize; + const endIdx = Math.min(startIdx + this.batchSize, remainingArtists.length); + const batchArtists = remainingArtists.slice(startIdx, endIdx); + + let success = false; + let lastError = null; + + // Retry logic with exponential backoff + for (let attempt = 1; attempt <= this.maxRetries; attempt++) { + try { + const result = await this.uploadBatchWithRateLimit(batchArtists, i + 1, totalBatches, attempt); + + if (result.success) { + this.uploadStats.successful += result.uploaded; + this.uploadStats.failed += result.failed; + success = true; + break; + } else { + throw new Error('Upload failed'); + } + } catch (error) { + lastError = error; + + if (attempt < this.maxRetries) { + const backoffDelay = Math.min(1000 * Math.pow(2, attempt), 30000); // Max 30 seconds + console.warn(`⚠️ Batch ${i + 1} attempt ${attempt} failed, retrying in ${backoffDelay}ms... (${error.message})`); + this.uploadStats.retried++; + await new Promise(resolve => setTimeout(resolve, backoffDelay)); + } + } + } + + if (!success) { + console.error(`❌ Batch ${i + 1} failed after ${this.maxRetries} attempts:`, lastError.message); + this.uploadStats.failed += batchArtists.length; + } + + this.uploadStats.totalProcessed += batchArtists.length; + + // Progress update + const progress = ((i + 1) / totalBatches * 100).toFixed(1); + console.log(`📈 Progress: ${progress}% (${this.uploadStats.successful}/${remainingArtists.length} successful)`); + + // Rate limiting delay between batches (except for the last batch) + if (i < totalBatches - 1) { + console.log(`⏸️ Waiting ${this.delayBetweenBatches}ms before next batch...`); + await new Promise(resolve => setTimeout(resolve, this.delayBetweenBatches)); + } + } + + this.uploadStats.endTime = new Date(); + this.displaySummary(remainingArtists.length); + } + + displaySummary(totalArtists) { + const duration = Math.round((this.uploadStats.endTime - 
this.uploadStats.startTime) / 1000); + + console.log('\n' + '='.repeat(60)); + console.log('🎯 RATE-LIMITED UPLOAD COMPLETE'); + console.log('='.repeat(60)); + console.log(`📊 Upload Statistics:`); + console.log(` • Total processed: ${this.uploadStats.totalProcessed.toLocaleString()}`); + console.log(` • Successful: ${this.uploadStats.successful.toLocaleString()}`); + console.log(` • Failed: ${this.uploadStats.failed.toLocaleString()}`); + console.log(` • Retried: ${this.uploadStats.retried.toLocaleString()}`); + console.log(` • Success rate: ${((this.uploadStats.successful / totalArtists) * 100).toFixed(1)}%`); + console.log(` • Duration: ${duration} seconds`); + console.log(` • Collection: ${this.collectionName}`); + + if (this.uploadStats.successful > 0) { + console.log(`\n✅ Successfully uploaded ${this.uploadStats.successful} remaining artists!`); + } + + if (this.uploadStats.failed > 0) { + console.log(`\n⚠️ ${this.uploadStats.failed} artists still failed due to rate limits.`); + console.log(`💡 Consider running this script again later or upgrading Firebase plan.`); + } + } +} + +// Run the uploader +const uploader = new RemainingArtistUploader(); +const dataDir = process.argv[2] || 'genius-artists-2025-07-11'; + +uploader.uploadRemainingArtists(dataDir) + .then(() => { + console.log('\n🏁 Rate-limited upload complete!'); + process.exit(0); + }) + .catch(error => { + console.error('❌ Upload failed:', error); + process.exit(1); + }); \ No newline at end of file diff --git a/upload-to-firestore.js b/upload-to-firestore.js new file mode 100644 index 0000000..9048dce --- /dev/null +++ b/upload-to-firestore.js @@ -0,0 +1,526 @@ +import { initializeApp } from 'firebase/app'; +import { getFirestore, collection, writeBatch, doc } from 'firebase/firestore'; +import fs from 'fs'; +import path from 'path'; +import unidecode from 'unidecode'; +import { firebaseConfig } from './src/lib/services/initFirebase.js'; + +// Initialize Firebase using centralized config +const app = 
initializeApp(firebaseConfig); +const db = getFirestore(app); + +class FirestoreUploader { + constructor() { + this.batchSize = 500; // Firestore batch limit + this.collectionName = 'artists'; + this.maxRetries = 3; // Maximum retry attempts for failed batches + this.uploadStats = { + totalProcessed: 0, + successful: 0, + failed: 0, + skipped: 0, + retried: 0, + startTime: new Date() + }; + } + + /** + * Extract artist slug from Genius URL and validate for Firestore + * @param {string} url - The full Genius artist URL + * @returns {string} The artist slug (validated for Firestore) + */ + extractSlug(url) { + // Extract everything after "/artists/" + const match = url.match(/\/artists\/(.+)$/); + if (!match) return null; + + let slug = match[1]; + + // CRITICAL: Apply Unicode normalization first to handle non-ASCII characters + // This is the main fix for the batch failure issue + slug = unidecode(slug); + + // Firestore document ID validation and sanitization + // Document IDs must be valid UTF-8 characters + // Cannot contain forward slashes, and must be <= 1500 bytes + + // Replace invalid characters for Firestore document IDs + slug = slug.replace(/[\/]/g, '-'); // Replace forward slashes + slug = slug.replace(/[.#$\[\]]/g, '-'); // Replace other problematic characters + slug = slug.replace(/[^\w\-_.~]/g, '-'); // Replace any other non-URL-safe characters + + // Clean up multiple consecutive dashes + slug = slug.replace(/-+/g, '-'); + + // Remove leading/trailing dashes + slug = slug.replace(/^-+|-+$/g, ''); + + // Ensure it's not too long (Firestore limit is 1500 bytes) + if (slug.length > 800) { // More conservative limit + slug = slug.substring(0, 800).replace(/-+$/, ''); // Remove trailing dash if cut mid-word + } + + // Ensure it's not empty after sanitization + if (!slug || slug.trim() === '') { + return null; + } + + return slug; + } + + /** + * Sanitize field value for Firestore storage + * @param {string} text - Text to sanitize + * @returns {string} 
Sanitized text safe for Firestore + */ + sanitizeFieldValue(text) { + if (!text) return text; + + // Remove null bytes and other problematic control characters + let sanitized = text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, ''); + + // Remove Unicode tag characters and other problematic invisible characters + // Use proper Unicode regex patterns for JavaScript + sanitized = sanitized.replace(/[\uFE00-\uFE0F]/g, ''); // Variation selectors + sanitized = sanitized.replace(/[\u200B-\u200F]/g, ''); // Zero-width characters + sanitized = sanitized.replace(/[\u2060-\u206F]/g, ''); // Additional invisible characters + sanitized = sanitized.replace(/[\uFEFF]/g, ''); // Byte order mark + + // Remove Unicode tag characters (which require surrogate pairs in JavaScript) + // Tag characters are in the range U+E0000-U+E007F, represented as surrogate pairs + sanitized = sanitized.replace(/\uDB40[\uDC00-\uDC7F]/g, ''); // Tag characters + + // Also remove any orphaned high surrogates that might cause encoding issues + sanitized = sanitized.replace(/[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g, ''); // Orphaned high surrogates + sanitized = sanitized.replace(/(? 
text.replace(/[.,\-_'"!?&@#$%^*()+=\[\]{};:|<>\/\\`~]/g, '').replace(/\s+/g, ' ').trim(); + const cleanNameNoPunct = removePunctuation(cleanName); + const normalizedNameNoPunct = removePunctuation(normalizedName); + + // Generate tokens for all versions: original, normalized, and punctuation-free + const versions = [cleanName]; + if (normalizedName !== cleanName) { + versions.push(normalizedName); + } + if (cleanNameNoPunct !== cleanName && cleanNameNoPunct.length > 0) { + versions.push(cleanNameNoPunct); + } + if (normalizedNameNoPunct !== normalizedName && normalizedNameNoPunct !== cleanNameNoPunct && normalizedNameNoPunct.length > 0) { + versions.push(normalizedNameNoPunct); + } + + for (const version of versions) { + // Add progressively longer substrings starting from the beginning + for (let i = 1; i <= version.length; i++) { + tokens.add(version.substring(0, i)); + } + + // Add tokens for each word (useful for multi-word artists) + const words = version.split(/\s+/); + for (const word of words) { + if (word.length > 0) { + for (let i = 1; i <= word.length; i++) { + tokens.add(word.substring(0, i)); + } + } + } + + // Add the full name + tokens.add(version); + } + + // Add original case version + tokens.add(name); + + // Filter and sanitize tokens to ensure Firestore compatibility + return Array.from(tokens) + .filter(token => token.length > 0 && token.length <= 100) // Limit token length + .map(token => this.sanitizeFieldValue(token)) // Sanitize each token + .filter(token => token && token.length > 0); // Remove any tokens that became empty after sanitization + } + + /** + * Transform artist data for Firestore + * @param {Object} artist - Artist data from scraped JSON + * @returns {Object|null} Transformed artist document or null if invalid + */ + transformArtist(artist) { + if (!artist.name || !artist.url) { + return null; + } + + const slug = this.extractSlug(artist.url); + if (!slug) { + console.warn(`Could not extract slug from URL: ${artist.url}`); + 
return null; + } + + // Sanitize all string fields for Firestore compatibility + const sanitizedName = this.sanitizeFieldValue(artist.name); + const sanitizedUrl = this.sanitizeFieldValue(artist.url); + + const nameForSorting = sanitizedName.toLowerCase().replace(/^(the |a |an )/, ''); + const firstLetter = nameForSorting.charAt(0).toLowerCase(); + const searchTokens = this.generateSearchTokens(sanitizedName); + + return { + id: slug, + name: sanitizedName, + url: sanitizedUrl, + geniusId: artist.id || null, + type: artist.type || 'regular', + searchTokens: searchTokens, + nameForSorting: nameForSorting, + uploadedAt: new Date().toISOString(), // Use ISO string for consistency + firstLetter: firstLetter + }; + } + + /** + * Read all JSON files from the scraped data directory + * @param {string} dataDir - Path to the directory containing scraped JSON files + * @returns {Array} Array of all artists from all files + */ + readAllArtistFiles(dataDir) { + console.log(`📁 Reading artist data from: ${dataDir}`); + + const allArtists = []; + // Include numbers/symbols file (0) and all letters (a-z) + const fileKeys = ['0', ...('abcdefghijklmnopqrstuvwxyz'.split(''))]; + + for (const key of fileKeys) { + const filename = `genius-artists-${key}.json`; + const filepath = path.join(dataDir, filename); + + if (fs.existsSync(filepath)) { + try { + const fileContent = fs.readFileSync(filepath, 'utf8'); + const letterData = JSON.parse(fileContent); + + // Add popular artists + if (letterData.artists?.popular) { + allArtists.push(...letterData.artists.popular); + } + + // Add regular artists + if (letterData.artists?.regular) { + allArtists.push(...letterData.artists.regular); + } + + const displayKey = key === '0' ? 'Numbers/Symbols' : key.toUpperCase(); + console.log(`✅ ${displayKey}: ${letterData.totalArtists || 0} artists loaded`); + + } catch (error) { + console.error(`❌ Error reading ${filename}:`, error.message); + } + } else { + const displayKey = key === '0' ? 
'Numbers/Symbols' : key.toUpperCase(); + console.warn(`⚠️ File not found: ${filename} (${displayKey})`); + } + } + + console.log(`\n📊 Total artists loaded: ${allArtists.length}`); + return allArtists; + } + + /** + * Upload artists to Firestore in batches + * @param {Array} artists - Array of artist objects + */ + async uploadToFirestore(artists) { + console.log(`\n🔥 Starting Firestore upload...`); + console.log(`📝 Collection: ${this.collectionName}`); + console.log(`📦 Batch size: ${this.batchSize}`); + + const transformedArtists = []; + const duplicateSlugs = new Set(); + const slugCounts = new Map(); + + // Transform and deduplicate artists + console.log('\n🔄 Transforming artist data...'); + for (const artist of artists) { + const transformed = this.transformArtist(artist); + if (transformed) { + // Check for duplicate slugs + const slug = transformed.id; + if (slugCounts.has(slug)) { + slugCounts.set(slug, slugCounts.get(slug) + 1); + duplicateSlugs.add(slug); + + // Modify slug to make it unique + transformed.id = `${slug}-${slugCounts.get(slug)}`; + console.warn(`⚠️ Duplicate slug detected: ${slug} -> ${transformed.id}`); + } else { + slugCounts.set(slug, 1); + } + + transformedArtists.push(transformed); + } else { + this.uploadStats.skipped++; + } + } + + console.log(`\n📊 Transformation complete:`); + console.log(` • Valid artists: ${transformedArtists.length}`); + console.log(` • Skipped: ${this.uploadStats.skipped}`); + console.log(` • Duplicate slugs found: ${duplicateSlugs.size}`); + + // Upload in batches + const totalBatches = Math.ceil(transformedArtists.length / this.batchSize); + console.log(`\n📤 Uploading ${transformedArtists.length} artists in ${totalBatches} batches...`); + + for (let i = 0; i < totalBatches; i++) { + const startIdx = i * this.batchSize; + const endIdx = Math.min(startIdx + this.batchSize, transformedArtists.length); + const batchArtists = transformedArtists.slice(startIdx, endIdx); + + let success = false; + let lastError = null; 
+ + // Retry logic for failed batches + for (let attempt = 1; attempt <= this.maxRetries; attempt++) { + try { + await this.uploadBatch(batchArtists, i + 1, totalBatches, attempt); + this.uploadStats.successful += batchArtists.length; + success = true; + break; + } catch (error) { + lastError = error; + if (attempt < this.maxRetries) { + console.warn(`⚠️ Batch ${i + 1} attempt ${attempt} failed, retrying... (${error.message})`); + this.uploadStats.retried++; + // Wait longer between retry attempts + await new Promise(resolve => setTimeout(resolve, 1000 * attempt)); + } + } + } + + if (!success) { + console.error(`❌ Batch ${i + 1} failed after ${this.maxRetries} attempts:`, lastError.message); + + // Try to identify problematic documents + await this.diagnoseBatchFailure(batchArtists, i + 1); + + this.uploadStats.failed += batchArtists.length; + } + + this.uploadStats.totalProcessed += batchArtists.length; + + // Small delay between batches to be respectful + if (i < totalBatches - 1) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + } + + this.uploadStats.endTime = new Date(); + this.displayUploadSummary(); + } + + /** + * Diagnose batch failure by testing individual documents + * @param {Array} batchArtists - Artists that failed to upload + * @param {number} batchNum - Batch number for logging + */ + async diagnoseBatchFailure(batchArtists, batchNum) { + console.log(`🔍 Diagnosing batch ${batchNum} failure...`); + const problematicDocs = []; + + // Test a few documents individually to identify issues + const sampleSize = Math.min(5, batchArtists.length); + for (let i = 0; i < sampleSize; i++) { + const artist = batchArtists[i]; + try { + // Validate document ID + if (!artist.id || artist.id.includes('/') || artist.id.includes('.')) { + problematicDocs.push(`Invalid ID: "${artist.id}" for artist "${artist.name}"`); + continue; + } + + // Check for field validation issues + const { id, ...documentData } = artist; + + // Check for overly large fields + 
const dataSize = JSON.stringify(documentData).length; + if (dataSize > 1000000) { // 1MB limit + problematicDocs.push(`Document too large: ${artist.name} (${dataSize} bytes)`); + } + + // Check search tokens array size + if (documentData.searchTokens && documentData.searchTokens.length > 1000) { + problematicDocs.push(`Too many search tokens: ${artist.name} (${documentData.searchTokens.length} tokens)`); + } + + } catch (error) { + problematicDocs.push(`Validation error for "${artist.name}": ${error.message}`); + } + } + + if (problematicDocs.length > 0) { + console.log(`🚨 Found potential issues:`); + problematicDocs.forEach(issue => console.log(` • ${issue}`)); + } else { + console.log(`🤔 No obvious issues found in sample. May be a temporary network error.`); + } + } + + /** + * Upload a single batch to Firestore + * @param {Array} batchArtists - Artists for this batch + * @param {number} batchNum - Current batch number + * @param {number} totalBatches - Total number of batches + * @param {number} attempt - Current attempt number (for retry logic) + */ + async uploadBatch(batchArtists, batchNum, totalBatches, attempt = 1) { + const batch = writeBatch(db); + const artistsCollection = collection(db, this.collectionName); + + for (const artist of batchArtists) { + const docRef = doc(artistsCollection, artist.id); + // Remove the id from the document data since it's used as the document ID + const { id, ...documentData } = artist; + batch.set(docRef, documentData); + } + + await batch.commit(); + + const retryText = attempt > 1 ? 
` (retry ${attempt})` : ''; + const progress = ((this.uploadStats.totalProcessed + batchArtists.length) / (this.uploadStats.totalProcessed + this.uploadStats.successful + this.uploadStats.failed + (totalBatches - batchNum) * this.batchSize)) * 100; + console.log(`✅ Batch ${batchNum}/${totalBatches} uploaded (${batchArtists.length} artists)${retryText} - ${progress.toFixed(1)}%`); + } + + /** + * Display upload summary statistics + */ + displayUploadSummary() { + const duration = Math.round((this.uploadStats.endTime - this.uploadStats.startTime) / 1000); + + console.log('\n' + '='.repeat(60)); + console.log('🎯 FIRESTORE UPLOAD COMPLETE'); + console.log('='.repeat(60)); + + console.log(`📊 Upload Statistics:`); + console.log(` • Total processed: ${this.uploadStats.totalProcessed.toLocaleString()}`); + console.log(` • Successful: ${this.uploadStats.successful.toLocaleString()}`); + console.log(` • Failed: ${this.uploadStats.failed.toLocaleString()}`); + console.log(` • Skipped: ${this.uploadStats.skipped.toLocaleString()}`); + console.log(` • Retried: ${this.uploadStats.retried.toLocaleString()}`); + console.log(` • Duration: ${duration} seconds`); + console.log(` • Collection: ${this.collectionName}`); + + if (this.uploadStats.failed > 0) { + console.log(`\n⚠️ ${this.uploadStats.failed} artists failed to upload. 
Check logs above for details.`); + } + + if (this.uploadStats.successful > 0) { + console.log(`\n✅ Successfully uploaded ${this.uploadStats.successful} artists to Firestore!`); + console.log(`🔍 Search tokens generated for efficient autocomplete`); + console.log(`🔑 Document IDs use artist URL slugs for direct access`); + } + } + + /** + * Main execution method + * @param {string} dataDir - Directory containing scraped artist JSON files + */ + async run(dataDir) { + try { + console.log('🚀 Firestore Artist Uploader Starting...\n'); + + // Validate data directory + if (!fs.existsSync(dataDir)) { + throw new Error(`Data directory not found: ${dataDir}`); + } + + // Read all artist data + const allArtists = this.readAllArtistFiles(dataDir); + + if (allArtists.length === 0) { + throw new Error('No artist data found in the specified directory'); + } + + // Upload to Firestore + await this.uploadToFirestore(allArtists); + + console.log('\n🎉 Upload process completed successfully!'); + + } catch (error) { + console.error('\n❌ Upload failed:', error.message); + process.exit(1); + } + } +} + +// Main execution +async function main() { + const uploader = new FirestoreUploader(); + + // Get data directory from command line arguments + const dataDir = process.argv[2] || 'genius-artists-2025-07-11'; + + console.log(`📁 Using data directory: ${dataDir}`); + console.log(`🔍 Looking for files like: genius-artists-a.json, genius-artists-b.json, etc.\n`); + + await uploader.run(dataDir); +} + +// Run if this file is executed directly +if (import.meta.url === `file://${process.argv[1]}`) { + main(); +} + +export default FirestoreUploader; \ No newline at end of file diff --git a/vite b/vite new file mode 100644 index 0000000..e69de29