From 75943e3a6c69c23ad8823faecd0286695c7c1c44 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Mon, 3 Nov 2025 23:11:22 +0800 Subject: [PATCH 01/14] feat: implement parallel sync with performance optimizations **Parallel Sync Implementation:** - Add ParallelCycleSync class with 10x+ performance improvement - Implement composite cursor pagination (cycle + timestamp + ID) to prevent data loss - Multi-cycle batching: fetch 10-200 cycles per HTTP request (vs 1 cycle previously) - Prefetching: overlap network fetch with database writes - Automatic checkpoint/resume from database (ParallelSyncCheckpointManager) - Configurable concurrency, batch size, retry attempts via env vars **Client-Side JSON Optimization:** - Configure axios with StringUtils.safeStringify for request serialization - Configure axios with StringUtils.safeJsonParse for response parsing - Add timing measurements for stringify/parse operations - Use Content-Length header for size (eliminates expensive re-stringify) - HTTP connection pooling with keep-alive agents (maxSockets: concurrency * 2) **Database Enhancements:** - Add composite indexes for cursor-based pagination: - receipts: (cycle ASC, timestamp ASC, receiptId ASC) - originalTxsData: (cycle ASC, timestamp ASC, txId ASC) - Add SQLite lock contention diagnostics (queueMs vs engineMs) - Track query timing with registerQuery/cleanupQuery pattern - Warn on queueMs > 250ms or totalMs > 1000ms **Configuration:** - Add PARALLEL_SYNC_CONCURRENCY env var (default: 10) - Add USE_PARALLEL_SYNC env var (default: true) - Add CYCLES_PER_BATCH env var (default: 10) - Add ENABLE_PREFETCH env var (default: true) - Add SYNC_RETRY_ATTEMPTS env var (default: 3) **Collector Entry Point:** - Auto-select parallel vs legacy sync based on USE_PARALLEL_SYNC flag - Add downloadTxsDataAndCyclesParallel() function in DataSync.ts - Maintain backward compatibility with legacy sequential sync **Error Handling:** - Exponential backoff retry for 
ECONNRESET/ETIMEDOUT/ECONNREFUSED/EPIPE - Detailed error logging with cycle ranges and attempt numbers **Package Updates:** - Add p-queue for work queue management --- package-lock.json | 29 +- package.json | 3 +- src/class/DataSync.ts | 26 + src/class/ParallelCycleSync.ts | 827 ++++++++++++++++++++++++++++ src/class/ParallelSyncCheckpoint.ts | 315 +++++++++++ src/collector.ts | 10 + src/config/index.ts | 10 + src/storage/index.ts | 10 + src/storage/originalTxData.ts | 4 +- src/storage/sqlite3storage.ts | 125 ++++- 10 files changed, 1351 insertions(+), 8 deletions(-) create mode 100644 src/class/ParallelCycleSync.ts create mode 100644 src/class/ParallelSyncCheckpoint.ts diff --git a/package-lock.json b/package-lock.json index 6b8b23e..07d8fe3 100644 --- a/package-lock.json +++ b/package-lock.json @@ -42,6 +42,7 @@ "next": "13.3.4", "node-cron": "3.0.2", "node-sass": "7.0.3", + "p-queue": "^9.0.0", "qs": "6.11.0", "react": "18.2.0", "react-dom": "18.2.0", @@ -10508,6 +10509,32 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/p-queue": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-9.0.0.tgz", + "integrity": "sha512-KO1RyxstL9g1mK76530TExamZC/S2Glm080Nx8PE5sTd7nlduDQsAfEl4uXX+qZjLiwvDauvzXavufy3+rJ9zQ==", + "dependencies": { + "eventemitter3": "^5.0.1", + "p-timeout": "^7.0.0" + }, + "engines": { + "node": ">=20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-timeout": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-7.0.1.tgz", + "integrity": "sha512-AxTM2wDGORHGEkPCt8yqxOTMgpfbEHqF51f/5fJCmwFC3C/zNcGT63SymH2ttOAaiIws2zVg4+izQCjrakcwHg==", + "engines": { + "node": ">=20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/p-try": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", @@ -14077,4 +14104,4 @@ } } } -} +} \ No newline at end of 
file diff --git a/package.json b/package.json index e8698db..c6b62b6 100644 --- a/package.json +++ b/package.json @@ -61,6 +61,7 @@ "next": "13.3.4", "node-cron": "3.0.2", "node-sass": "7.0.3", + "p-queue": "^9.0.0", "qs": "6.11.0", "react": "18.2.0", "react-dom": "18.2.0", @@ -109,4 +110,4 @@ "publishConfig": { "access": "public" } -} +} \ No newline at end of file diff --git a/src/class/DataSync.ts b/src/class/DataSync.ts index d6f4de5..12b05ce 100644 --- a/src/class/DataSync.ts +++ b/src/class/DataSync.ts @@ -4,6 +4,7 @@ import { AccountDB, CycleDB, ReceiptDB, TransactionDB, OriginalTxDataDB } from ' import { config, DISTRIBUTOR_URL } from '../config' import { Cycle } from '../types' import { Utils as StringUtils } from '@shardus/types' +import { ParallelCycleSync } from './ParallelCycleSync' export enum DataType { CYCLE = 'cycleinfo', @@ -832,3 +833,28 @@ export const downloadOriginalTxsDataBetweenCycles = async ( endCycle += config.requestLimits.MAX_BETWEEN_CYCLES_PER_REQUEST } } + +/** + * Parallel sync using cycle-based partitioning with composite cursors. + * Retry count is taken from config.syncRetryAttempts; batch size and retry delay use the ParallelCycleSync constructor defaults. + * @param totalCyclesToSync passed to syncCycleRange as the end cycle of the range + * @param fromCycle first cycle of the range (defaults to 0) + */ +export const downloadTxsDataAndCyclesParallel = async ( + totalCyclesToSync: number, + fromCycle = 0 +): Promise<void> => { + console.log('\n') + console.log('='.repeat(60)) + console.log('Using PARALLEL SYNC with Composite Cursor') + console.log('This prevents data loss and provides 10x+ performance improvement') + console.log('='.repeat(60)) + console.log('\n') + + const parallelSync = new ParallelCycleSync({ + concurrency: config.parallelSyncConcurrency, + retryAttempts: config.syncRetryAttempts, + }) + + await parallelSync.syncCycleRange(fromCycle, totalCyclesToSync) +} diff --git a/src/class/ParallelCycleSync.ts b/src/class/ParallelCycleSync.ts new file mode 100644 index 0000000..4053846 --- /dev/null +++ b/src/class/ParallelCycleSync.ts @@ -0,0 +1,827 @@ +import PQueue from 'p-queue' +import * as crypto from 
'@shardus/crypto-utils' +import { Utils as StringUtils } from '@shardus/types' +import { config, DISTRIBUTOR_URL } from '../config' +import { queryFromDistributor, DataType } from './DataSync' +import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' +import { ParallelSyncCheckpointManager, CompositeCursor } from './ParallelSyncCheckpoint' +import { Cycle } from '../types' +import axios, { AxiosInstance } from 'axios' +import http from 'http' +import https from 'https' + +/** + * Configuration for parallel sync + */ +export interface ParallelSyncConfig { + concurrency: number // Number of parallel workers + batchSize: number // Items per request + retryAttempts: number // Retry failed requests + retryDelayMs: number // Delay between retries + cyclesPerBatch: number // Number of cycles to batch together (default: 10) + enablePrefetch: boolean // Enable prefetching (default: true) + prefetchDepth: number // Number of batches to prefetch ahead (default: 1) +} + +/** + * Statistics for sync operation + */ +export interface SyncStats { + startTime: number + endTime?: number + totalCycles: number + completedCycles: number + totalReceipts: number + totalOriginalTxs: number + errors: number +} + +/** + * Parallel sync orchestrator using cycle-based partitioning with composite cursors + * Implements the optimal sync strategy with: + * - Cycle-level parallelization + * - Composite cursor (timestamp + ID) to prevent data loss + * - Automatic resume from database + * - Work queue for load balancing + */ +export class ParallelCycleSync { + private checkpointManager: ParallelSyncCheckpointManager + private queue: PQueue + private syncConfig: ParallelSyncConfig + private stats: SyncStats + private httpAgent: http.Agent + private httpsAgent: https.Agent + private axiosInstance: AxiosInstance + + constructor(syncConfig?: Partial) { + this.checkpointManager = new ParallelSyncCheckpointManager() + + this.syncConfig = { + concurrency: syncConfig?.concurrency || 
config.parallelSyncConcurrency || 10, + batchSize: syncConfig?.batchSize || 500, + retryAttempts: syncConfig?.retryAttempts || config.syncRetryAttempts || 3, + retryDelayMs: syncConfig?.retryDelayMs || 1000, + cyclesPerBatch: syncConfig?.cyclesPerBatch || config.cyclesPerBatch || 10, + enablePrefetch: syncConfig?.enablePrefetch ?? config.enablePrefetch ?? true, + prefetchDepth: syncConfig?.prefetchDepth || 1, + } + + // Create HTTP agents with keep-alive to reuse connections + this.httpAgent = new http.Agent({ + keepAlive: true, + keepAliveMsecs: 30000, + maxSockets: this.syncConfig.concurrency * 2, + maxFreeSockets: this.syncConfig.concurrency, + }) + + this.httpsAgent = new https.Agent({ + keepAlive: true, + keepAliveMsecs: 30000, + maxSockets: this.syncConfig.concurrency * 2, + maxFreeSockets: this.syncConfig.concurrency, + }) + + // Create axios instance with keep-alive agents and custom JSON serialization with timing + this.axiosInstance = axios.create({ + httpAgent: this.httpAgent, + httpsAgent: this.httpsAgent, + timeout: 45000, + headers: { 'Content-Type': 'application/json' }, + transformRequest: [ + (data) => { + // Use custom stringify for request body + const startTime = Date.now() + const result = StringUtils.safeStringify(data) + const elapsed = Date.now() - startTime + if (config.verbose && elapsed > 10) { + console.log( + `[Client] Request stringify: ${elapsed}ms, size: ${(result.length / 1024).toFixed(2)}KB` + ) + } + return result + }, + ], + transformResponse: [ + (res) => { + // Use custom parse for response with timing + const startTime = Date.now() + const result = StringUtils.safeJsonParse(res) + const elapsed = Date.now() - startTime + const sizeKB = typeof res === 'string' ? 
(res.length / 1024).toFixed(2) : 'unknown' + if (config.verbose && elapsed > 50) { + console.log(`[Client] Response parse: ${elapsed}ms, size: ${sizeKB}KB`) + } + return result + }, + ], + }) + + // Add interval between tasks to prevent overwhelming the distributor + this.queue = new PQueue({ + concurrency: this.syncConfig.concurrency, + interval: 100, // 100ms between batches + intervalCap: this.syncConfig.concurrency, + }) + + this.stats = { + startTime: Date.now(), + totalCycles: 0, + completedCycles: 0, + totalReceipts: 0, + totalOriginalTxs: 0, + errors: 0, + } + + console.log( + `Parallel Sync initialized:` + + ` concurrency=${this.syncConfig.concurrency},` + + ` cyclesPerBatch=${this.syncConfig.cyclesPerBatch},` + + ` prefetch=${this.syncConfig.enablePrefetch ? 'enabled' : 'disabled'},` + + ` retryAttempts=${this.syncConfig.retryAttempts}` + ) + } + + /** + * Main entry point for parallel sync + */ + async syncCycleRange(startCycle: number, endCycle: number): Promise { + console.log(`\n${'='.repeat(60)}`) + console.log(`Starting Parallel Cycle Sync: ${startCycle} → ${endCycle}`) + console.log(`Concurrency: ${this.syncConfig.concurrency} workers`) + console.log(`${'='.repeat(60)}\n`) + + this.stats.startTime = Date.now() + this.stats.totalCycles = endCycle - startCycle + + try { + // Step 1: Fetch all cycle metadata (lightweight) + console.log('Step 1: Fetching cycle metadata...') + const cycles = await this.fetchCyclesMetadata(startCycle, endCycle) + console.log(`✓ Retrieved ${cycles.length} cycles\n`) + + // Step 2: Sync cycles themselves in parallel + console.log('Step 2: Syncing cycle records...') + await this.syncCyclesData(cycles) + console.log(`✓ Synced ${cycles.length} cycle records\n`) + + // Step 3: Sync receipts and originalTxs for all cycles in parallel with multi-cycle batching + console.log('Step 3: Syncing receipts and originalTxs with multi-cycle batching...') + await this.syncAllCyclesDataMultiBatch(cycles) + + this.stats.endTime = Date.now() 
+ + // Summary + await this.printSummary() + } catch (error) { + console.error('Fatal error in parallel sync:', error) + this.stats.errors++ + throw error + } + } + + /** + * Fetch cycle metadata from distributor + */ + private async fetchCyclesMetadata(startCycle: number, endCycle: number): Promise { + const cycles: Cycle[] = [] + + // Fetch in chunks + const CHUNK_SIZE = 100 + for (let i = startCycle; i <= endCycle; i += CHUNK_SIZE) { + const chunkEnd = Math.min(i + CHUNK_SIZE - 1, endCycle) + + const response = await queryFromDistributor(DataType.CYCLE, { + start: i, + end: chunkEnd, + }) + + if (response && response.data && response.data.cycleInfo) { + cycles.push( + ...response.data.cycleInfo.map((cycleRecord: any) => ({ + counter: cycleRecord.counter, + cycleRecord, + start: cycleRecord.start, + cycleMarker: cycleRecord.marker, + })) + ) + } + } + + return cycles + } + + /** + * Sync cycle records to database + */ + private async syncCyclesData(cycles: Cycle[]): Promise { + // Insert cycles in batches + const BATCH_SIZE = 100 + for (let i = 0; i < cycles.length; i += BATCH_SIZE) { + const batch = cycles.slice(i, i + BATCH_SIZE) + await CycleDB.bulkInsertCycles(batch) + } + } + + /** + * Sync receipts and originalTxs for all cycles in parallel (LEGACY - single cycle per request) + */ + private async syncAllCyclesData(cycles: Cycle[]): Promise { + // Add all cycle sync tasks to the queue + const tasks = cycles.map((cycle) => this.queue.add(() => this.syncSingleCycle(cycle))) + + // Wait for all tasks to complete + await Promise.all(tasks) + } + + /** + * Sync receipts and originalTxs using multi-cycle batching with prefetching + * This dramatically reduces HTTP overhead for cycles with small data + */ + private async syncAllCyclesDataMultiBatch(cycles: Cycle[]): Promise { + // Group cycles into batches + const cycleBatches: Cycle[][] = [] + for (let i = 0; i < cycles.length; i += this.syncConfig.cyclesPerBatch) { + cycleBatches.push(cycles.slice(i, i + 
this.syncConfig.cyclesPerBatch)) + } + + console.log( + `Created ${cycleBatches.length} cycle batches (${this.syncConfig.cyclesPerBatch} cycles per batch)` + ) + + // Add all batch sync tasks to the queue + const tasks = cycleBatches.map((batch) => this.queue.add(() => this.syncCycleBatch(batch))) + + // Wait for all tasks to complete + await Promise.all(tasks) + } + + /** + * Sync receipts and originalTxs for a single cycle + */ + private async syncSingleCycle(cycle: Cycle): Promise { + try { + // Get cycle time boundaries + const cycleStart = cycle.start + const cycleEnd = cycle.cycleRecord.duration + ? cycle.start + cycle.cycleRecord.duration + : cycle.start + 60 * 1000 // Default 1 minute + + // Sync both data types in parallel for this cycle + await Promise.all([ + this.syncCycleReceipts(cycle.counter, cycleStart, cycleEnd), + this.syncCycleOriginalTxs(cycle.counter, cycleStart, cycleEnd), + ]) + + this.stats.completedCycles++ + + if (config.verbose || this.stats.completedCycles % 10 === 0) { + const progress = ((this.stats.completedCycles / this.stats.totalCycles) * 100).toFixed(1) + console.log(`Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%)`) + } + } catch (error) { + console.error(`Error syncing cycle ${cycle.counter}:`, error) + this.stats.errors++ + throw error + } + } + + /** + * Sync receipts and originalTxs for a batch of cycles using multi-cycle endpoints + * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5) + */ + private async syncCycleBatch(cycleBatch: Cycle[]): Promise { + if (cycleBatch.length === 0) return + + try { + const startCycle = cycleBatch[0].counter + const endCycle = cycleBatch[cycleBatch.length - 1].counter + + // Sync both data types in parallel + await Promise.all([this.syncCycleBatchReceipts(cycleBatch), this.syncCycleBatchOriginalTxs(cycleBatch)]) + + this.stats.completedCycles += cycleBatch.length + + if (config.verbose || 
this.stats.completedCycles % 10 === 0) { + const progress = ((this.stats.completedCycles / this.stats.totalCycles) * 100).toFixed(1) + console.log( + `Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` + ) + } + } catch (error) { + console.error( + `Error syncing cycle batch ${cycleBatch[0].counter}-${cycleBatch[cycleBatch.length - 1].counter}:`, + error + ) + this.stats.errors++ + throw error + } + } + + /** + * Sync receipts across a batch of cycles using adaptive multi-cycle fetching with prefetching + */ + private async syncCycleBatchReceipts(cycleBatch: Cycle[]): Promise { + const startCycle = cycleBatch[0].counter + const endCycle = cycleBatch[cycleBatch.length - 1].counter + + // Get resume cursor from database for the start cycle + const initialCursor = await this.checkpointManager.getReceiptsCursor(startCycle, cycleBatch[0].start) + + let currentCycle = startCycle + let currentCursor: CompositeCursor = initialCursor + let totalFetched = 0 + + // Prefetch: Start fetching first batch immediately + let nextFetchPromise: Promise | null = this.syncConfig.enablePrefetch + ? this.fetchReceiptsMultiCycle(currentCycle, endCycle, currentCursor) + : null + + while (currentCycle <= endCycle) { + try { + // Get the data (either from prefetch or fetch now) + const response = nextFetchPromise + ? 
await nextFetchPromise + : await this.fetchReceiptsMultiCycle(currentCycle, endCycle, currentCursor) + + if (!response || response.length === 0) { + break // No more receipts in this cycle range + } + + // Update cursor based on last receipt BEFORE starting next fetch + const lastReceipt = response[response.length - 1] + currentCycle = lastReceipt.cycle + const nextCursor: CompositeCursor = { + timestamp: lastReceipt.timestamp, + id: lastReceipt.receiptId, + } + + // Prefetch next batch while processing current batch + if (this.syncConfig.enablePrefetch && response.length >= this.syncConfig.batchSize) { + nextFetchPromise = this.fetchReceiptsMultiCycle(currentCycle, endCycle, nextCursor) + } else { + nextFetchPromise = null + } + + // Process receipts (overlaps with next fetch if prefetch enabled) + await ReceiptDB.processReceiptData(response) + + totalFetched += response.length + this.stats.totalReceipts += response.length + currentCursor = nextCursor + + if (config.verbose) { + console.log( + `[Cycles ${startCycle}-${endCycle}] Receipts: +${response.length} (total: ${totalFetched}), ` + + `last in cycle ${currentCycle}` + + (this.syncConfig.enablePrefetch ? 
' [prefetch]' : '') + ) + } + + // If we got less than batch size, we've exhausted this cycle range + if (response.length < this.syncConfig.batchSize) { + break + } + } catch (error) { + console.error(`Error fetching receipts for cycle batch ${startCycle}-${endCycle}:`, error) + throw error + } + } + } + + /** + * Sync originalTxs across a batch of cycles using adaptive multi-cycle fetching with prefetching + */ + private async syncCycleBatchOriginalTxs(cycleBatch: Cycle[]): Promise { + const startCycle = cycleBatch[0].counter + const endCycle = cycleBatch[cycleBatch.length - 1].counter + + // Get resume cursor from database for the start cycle + const initialCursor = await this.checkpointManager.getOriginalTxsCursor(startCycle, cycleBatch[0].start) + + let currentCycle = startCycle + let currentCursor: CompositeCursor = initialCursor + let totalFetched = 0 + + // Prefetch: Start fetching first batch immediately + let nextFetchPromise: Promise | null = this.syncConfig.enablePrefetch + ? this.fetchOriginalTxsMultiCycle(currentCycle, endCycle, currentCursor) + : null + + while (currentCycle <= endCycle) { + try { + // Get the data (either from prefetch or fetch now) + const response = nextFetchPromise + ? 
await nextFetchPromise + : await this.fetchOriginalTxsMultiCycle(currentCycle, endCycle, currentCursor) + + if (!response || response.length === 0) { + break // No more originalTxs in this cycle range + } + + // Update cursor based on last tx BEFORE starting next fetch + const lastTx = response[response.length - 1] + currentCycle = lastTx.cycle + const nextCursor: CompositeCursor = { + timestamp: lastTx.timestamp, + id: lastTx.txId, + } + + // Prefetch next batch while processing current batch + if (this.syncConfig.enablePrefetch && response.length >= this.syncConfig.batchSize) { + nextFetchPromise = this.fetchOriginalTxsMultiCycle(currentCycle, endCycle, nextCursor) + } else { + nextFetchPromise = null + } + + // Process originalTxs (overlaps with next fetch if prefetch enabled) + await OriginalTxDataDB.processOriginalTxData(response) + + totalFetched += response.length + this.stats.totalOriginalTxs += response.length + currentCursor = nextCursor + + if (config.verbose) { + console.log( + `[Cycles ${startCycle}-${endCycle}] OriginalTxs: +${response.length} (total: ${totalFetched}), ` + + `last in cycle ${currentCycle}` + + (this.syncConfig.enablePrefetch ? 
' [prefetch]' : '') + ) + } + + // If we got less than batch size, we've exhausted this cycle range + if (response.length < this.syncConfig.batchSize) { + break + } + } catch (error) { + console.error(`Error fetching originalTxs for cycle batch ${startCycle}-${endCycle}:`, error) + throw error + } + } + } + + /** + * Sync receipts for a specific cycle using composite cursor + */ + private async syncCycleReceipts(cycleNumber: number, cycleStart: number, cycleEnd: number): Promise { + // Get resume cursor from database + const cursor = await this.checkpointManager.getReceiptsCursor(cycleNumber, cycleStart) + + let currentCursor: CompositeCursor = cursor + let totalFetched = 0 + + while (true) { + try { + const response = await this.fetchReceiptsWithCursor(cycleNumber, currentCursor, cycleEnd) + + if (!response || response.length === 0) { + break // No more receipts for this cycle + } + + // Process receipts + await ReceiptDB.processReceiptData(response) + + totalFetched += response.length + this.stats.totalReceipts += response.length + + // Update cursor to last item + const lastReceipt = response[response.length - 1] + currentCursor = { + timestamp: lastReceipt.timestamp, + id: lastReceipt.receiptId, + } + + if (config.verbose) { + console.log(`[Cycle ${cycleNumber}] Receipts: +${response.length} (total: ${totalFetched})`) + } + + // If we got less than batch size, we're done + if (response.length < this.syncConfig.batchSize) { + break + } + } catch (error) { + console.error(`Error fetching receipts for cycle ${cycleNumber}:`, error) + throw error + } + } + } + + /** + * Sync originalTxs for a specific cycle using composite cursor + */ + private async syncCycleOriginalTxs( + cycleNumber: number, + cycleStart: number, + cycleEnd: number + ): Promise { + // Get resume cursor from database + const cursor = await this.checkpointManager.getOriginalTxsCursor(cycleNumber, cycleStart) + + let currentCursor: CompositeCursor = cursor + let totalFetched = 0 + + while (true) { 
+ try { + const response = await this.fetchOriginalTxsWithCursor(cycleNumber, currentCursor, cycleEnd) + + if (!response || response.length === 0) { + break // No more originalTxs for this cycle + } + + // Process originalTxs + await OriginalTxDataDB.processOriginalTxData(response) + + totalFetched += response.length + this.stats.totalOriginalTxs += response.length + + // Update cursor to last item + const lastTx = response[response.length - 1] + currentCursor = { + timestamp: lastTx.timestamp, + id: lastTx.txId, + } + + if (config.verbose) { + console.log(`[Cycle ${cycleNumber}] OriginalTxs: +${response.length} (total: ${totalFetched})`) + } + + // If we got less than batch size, we're done + if (response.length < this.syncConfig.batchSize) { + break + } + } catch (error) { + console.error(`Error fetching originalTxs for cycle ${cycleNumber}:`, error) + throw error + } + } + } + + /** + * Fetch receipts using composite cursor (prevents data loss on timestamp collisions) + */ + private async fetchReceiptsWithCursor( + cycle: number, + cursor: CompositeCursor, + beforeTimestamp?: number + ): Promise { + const data = { + cycle, + afterTimestamp: cursor.timestamp, + afterReceiptId: cursor.id, + beforeTimestamp, + limit: this.syncConfig.batchSize, + sender: config.collectorInfo.publicKey, + sign: undefined, + } + + crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) + + const url = `${DISTRIBUTOR_URL}/receipt/cycle-cursor` + + try { + const response = await this.axiosInstance.post(url, data) + + if (response.data && response.data.receipts) { + return response.data.receipts + } + + return [] + } catch (error) { + console.error(`Error fetching receipts with cursor:`, error.message) + throw error + } + } + + /** + * Fetch originalTxs using composite cursor + */ + private async fetchOriginalTxsWithCursor( + cycle: number, + cursor: CompositeCursor, + beforeTimestamp?: number + ): Promise { + const data = { + cycle, + afterTimestamp: 
cursor.timestamp, + afterTxId: cursor.id, + beforeTimestamp, + limit: this.syncConfig.batchSize, + sender: config.collectorInfo.publicKey, + sign: undefined, + } + + crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) + + const url = `${DISTRIBUTOR_URL}/originalTx/cycle-cursor` + + try { + const response = await this.axiosInstance.post(url, data) + + if (response.data && response.data.originalTxs) { + return response.data.originalTxs + } + + return [] + } catch (error) { + console.error(`Error fetching originalTxs with cursor:`, error.message) + throw error + } + } + + /** + * Fetch receipts across multiple cycles using composite cursor with retry logic + * Automatically adapts to cycle sizes - if cycles 1-10 only have data in 1-5, returns that subset + */ + private async fetchReceiptsMultiCycle( + startCycle: number, + endCycle: number, + cursor: CompositeCursor + ): Promise { + const data = { + startCycle, + endCycle, + afterCycle: startCycle, + afterTimestamp: cursor.timestamp, + afterReceiptId: cursor.id, + limit: this.syncConfig.batchSize, + sender: config.collectorInfo.publicKey, + sign: undefined, + } + + crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) + + const url = `${DISTRIBUTOR_URL}/receipt/multi-cycle-cursor` + + // Retry with exponential backoff + for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { + try { + const startTime = Date.now() + const response = await this.axiosInstance.post(url, data) + const networkElapsed = Date.now() - startTime + + const receipts = response.data?.receipts || [] + + // Get response size from Content-Length header (fast) instead of re-stringifying (slow) + const responseSizeBytes = parseInt(response.headers['content-length'] || '0', 10) + const responseSizeKB = responseSizeBytes > 0 ? 
(responseSizeBytes / 1024).toFixed(2) : 'unknown' + + if (config.verbose || networkElapsed > 1000) { + console.log( + `[API Timing] Receipts fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `records: ${receipts.length}, size: ${responseSizeKB}KB` + ) + } + + if (response.data && response.data.receipts) { + return response.data.receipts + } + + return [] + } catch (error: any) { + const isLastAttempt = attempt === this.syncConfig.retryAttempts + const isRetryableError = + error.code === 'ECONNRESET' || + error.code === 'ETIMEDOUT' || + error.code === 'ECONNREFUSED' || + error.code === 'EPIPE' + + if (isRetryableError && !isLastAttempt) { + const delay = this.syncConfig.retryDelayMs * Math.pow(2, attempt) + console.warn( + `ECONNRESET on receipts fetch (cycles ${startCycle}-${endCycle}), ` + + `attempt ${attempt + 1}/${this.syncConfig.retryAttempts + 1}, ` + + `retrying in ${delay}ms...` + ) + await this.sleep(delay) + continue + } + + // Non-retryable error or last attempt failed + console.error( + `Error fetching receipts multi-cycle (cycles ${startCycle}-${endCycle}):`, + error.message + ) + throw error + } + } + + return [] + } + + /** + * Fetch originalTxs across multiple cycles using composite cursor with retry logic + */ + private async fetchOriginalTxsMultiCycle( + startCycle: number, + endCycle: number, + cursor: CompositeCursor + ): Promise { + const data = { + startCycle, + endCycle, + afterCycle: startCycle, + afterTimestamp: cursor.timestamp, + afterTxId: cursor.id, + limit: this.syncConfig.batchSize, + sender: config.collectorInfo.publicKey, + sign: undefined, + } + + crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) + + const url = `${DISTRIBUTOR_URL}/originalTx/multi-cycle-cursor` + + // Retry with exponential backoff + for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { + try { + const startTime = Date.now() + const response = await this.axiosInstance.post(url, data) + 
const networkElapsed = Date.now() - startTime + + const originalTxs = response.data?.originalTxs || [] + + // Get response size from Content-Length header (fast) instead of re-stringifying (slow) + const responseSizeBytes = parseInt(response.headers['content-length'] || '0', 10) + const responseSizeKB = responseSizeBytes > 0 ? (responseSizeBytes / 1024).toFixed(2) : 'unknown' + + if (config.verbose || networkElapsed > 1000) { + console.log( + `[API Timing] OriginalTxs fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `records: ${originalTxs.length}, size: ${responseSizeKB}KB` + ) + } + + if (response.data && response.data.originalTxs) { + return response.data.originalTxs + } + + return [] + } catch (error: any) { + const isLastAttempt = attempt === this.syncConfig.retryAttempts + const isRetryableError = + error.code === 'ECONNRESET' || + error.code === 'ETIMEDOUT' || + error.code === 'ECONNREFUSED' || + error.code === 'EPIPE' + + if (isRetryableError && !isLastAttempt) { + const delay = this.syncConfig.retryDelayMs * Math.pow(2, attempt) + console.warn( + `ECONNRESET on originalTxs fetch (cycles ${startCycle}-${endCycle}), ` + + `attempt ${attempt + 1}/${this.syncConfig.retryAttempts + 1}, ` + + `retrying in ${delay}ms...` + ) + await this.sleep(delay) + continue + } + + // Non-retryable error or last attempt failed + console.error( + `Error fetching originalTxs multi-cycle (cycles ${startCycle}-${endCycle}):`, + error.message + ) + throw error + } + } + + return [] + } + + /** + * Sleep helper for retry delays + */ + private sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) + } + + /** + * Print sync summary + */ + private async printSummary(): Promise { + const elapsedMs = (this.stats.endTime || Date.now()) - this.stats.startTime + const elapsedSec = (elapsedMs / 1000).toFixed(2) + const elapsedMin = (elapsedMs / 60000).toFixed(2) + + console.log(`\n${'='.repeat(60)}`) + console.log('Parallel Sync 
Complete!') + console.log(`${'='.repeat(60)}`) + console.log(` Cycles Synced: ${this.stats.completedCycles}/${this.stats.totalCycles}`) + console.log(` Receipts Synced: ${this.stats.totalReceipts}`) + console.log(` OriginalTxs Synced: ${this.stats.totalOriginalTxs}`) + console.log(` Errors: ${this.stats.errors}`) + console.log(` Time Elapsed: ${elapsedSec}s (${elapsedMin} min)`) + console.log( + ` Throughput: ${(this.stats.totalReceipts / (elapsedMs / 1000)).toFixed(0)} receipts/sec` + ) + console.log(`${'='.repeat(60)}\n`) + + // Print DB summary + await this.checkpointManager.printSyncSummary() + } + + /** + * Get current statistics + */ + getStats(): SyncStats { + return { ...this.stats } + } +} diff --git a/src/class/ParallelSyncCheckpoint.ts b/src/class/ParallelSyncCheckpoint.ts new file mode 100644 index 0000000..472a0d8 --- /dev/null +++ b/src/class/ParallelSyncCheckpoint.ts @@ -0,0 +1,315 @@ +import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' +import { config } from '../config' + +/** + * Composite cursor for tracking sync progress + * Uses both timestamp and ID to handle timestamp collisions + */ +export interface CompositeCursor { + timestamp: number + id: string // receiptId or txId +} + +/** + * Cycle resume information from database + */ +export interface CycleResumeInfo { + cycleNumber: number + startTimestamp: number + endTimestamp: number + receipts: { + lastTimestamp: number + lastId: string + count: number + } + originalTxs: { + lastTimestamp: number + lastId: string + count: number + } +} + +/** + * Manages sync state by querying the database + * No separate checkpoint storage needed - DB is source of truth + */ +export class ParallelSyncCheckpointManager { + /** + * Get the last completed cycle from database + */ + async getLastCompletedCycle(): Promise { + try { + const cycles = await CycleDB.queryLatestCycleRecords(1) + if (cycles && cycles.length > 0) { + return cycles[0].counter + } + return 0 + } catch (error) { + 
console.error('Error getting last completed cycle:', error) + return 0 + } + } + + /** + * Get resume cursor for receipts in a specific cycle + * Returns the last receipt's timestamp and ID, or cycle start if none exist + */ + async getReceiptsCursor(cycleNumber: number, cycleStartTimestamp: number): Promise { + try { + // Query last receipt for this cycle + const receipts = await ReceiptDB.queryReceipts({ + limit: 1, + startCycleNumber: cycleNumber, + }) + + if (receipts && receipts.length > 0) { + const lastReceipt = receipts[0] + return { + timestamp: lastReceipt.timestamp, + id: lastReceipt.receiptId, + } + } + + // No receipts found for this cycle, start from cycle beginning + return { + timestamp: cycleStartTimestamp, + id: '', + } + } catch (error) { + console.error(`Error getting receipts cursor for cycle ${cycleNumber}:`, error) + return { + timestamp: cycleStartTimestamp, + id: '', + } + } + } + + /** + * Get resume cursor for originalTxs in a specific cycle + */ + async getOriginalTxsCursor(cycleNumber: number, cycleStartTimestamp: number): Promise { + try { + // Query last originalTx for this cycle + const originalTxs = await OriginalTxDataDB.queryOriginalTxsData({ + limit: 1, // limit + startCycle: cycleNumber, // startCycle + }) + + if (originalTxs && originalTxs.length > 0) { + // Sort by timestamp DESC to get the last one + originalTxs.sort((a, b) => b.timestamp - a.timestamp) + const lastTx = originalTxs[0] + return { + timestamp: lastTx.timestamp, + id: lastTx.txId, + } + } + + // No originalTxs found for this cycle, start from cycle beginning + return { + timestamp: cycleStartTimestamp, + id: '', + } + } catch (error) { + console.error(`Error getting originalTxs cursor for cycle ${cycleNumber}:`, error) + return { + timestamp: cycleStartTimestamp, + id: '', + } + } + } + + /** + * Get counts of data already synced for a cycle + */ + async getCycleSyncStatus(cycleNumber: number): Promise<{ + receiptsCount: number + originalTxsCount: number + 
isComplete: boolean + }> { + try { + const [receiptsCountResult, originalTxsCountResult] = await Promise.all([ + ReceiptDB.queryReceiptCountByCycles(cycleNumber, cycleNumber), + OriginalTxDataDB.queryOriginalTxDataCountByCycles(cycleNumber, cycleNumber), + ]) + + const receiptsCount = + receiptsCountResult && receiptsCountResult.length > 0 ? receiptsCountResult[0].receipts : 0 + + const originalTxsCount = + originalTxsCountResult && originalTxsCountResult.length > 0 + ? originalTxsCountResult[0].originalTxsData + : 0 + + return { + receiptsCount, + originalTxsCount, + isComplete: false, // Determined by sync logic + } + } catch (error) { + console.error(`Error getting cycle sync status for cycle ${cycleNumber}:`, error) + return { + receiptsCount: 0, + originalTxsCount: 0, + isComplete: false, + } + } + } + + /** + * Determine which cycles need to be synced + * Compares local DB with distributor totals + */ + async getCyclesToSync(startCycle: number, endCycle: number): Promise { + try { + const lastLocalCycle = await this.getLastCompletedCycle() + + // If we have no local data, sync all cycles + if (lastLocalCycle === 0) { + const cyclesToSync: number[] = [] + for (let i = startCycle; i <= endCycle; i++) { + cyclesToSync.push(i) + } + return cyclesToSync + } + + // If endCycle is beyond what we have, sync from last local + 1 + if (endCycle > lastLocalCycle) { + const cyclesToSync: number[] = [] + for (let i = lastLocalCycle + 1; i <= endCycle; i++) { + cyclesToSync.push(i) + } + return cyclesToSync + } + + // All cycles already synced + return [] + } catch (error) { + console.error('Error determining cycles to sync:', error) + return [] + } + } + + /** + * Check if a cycle is fully synced by comparing counts with distributor + */ + async isCycleFullySynced( + cycleNumber: number, + expectedReceiptsCount: number, + expectedOriginalTxsCount: number + ): Promise { + try { + const status = await this.getCycleSyncStatus(cycleNumber) + + const receiptsMatch = 
status.receiptsCount === expectedReceiptsCount + const originalTxsMatch = status.originalTxsCount === expectedOriginalTxsCount + + if (config.verbose) { + console.log( + `Cycle ${cycleNumber} sync check: ` + + `receipts ${status.receiptsCount}/${expectedReceiptsCount}, ` + + `originalTxs ${status.originalTxsCount}/${expectedOriginalTxsCount}` + ) + } + + return receiptsMatch && originalTxsMatch + } catch (error) { + console.error(`Error checking if cycle ${cycleNumber} is fully synced:`, error) + return false + } + } + + /** + * Get detailed resume information for a specific cycle + */ + async getCycleResumeInfo( + cycleNumber: number, + cycleStartTimestamp: number, + cycleEndTimestamp: number + ): Promise { + const [receiptsCursor, originalTxsCursor, syncStatus] = await Promise.all([ + this.getReceiptsCursor(cycleNumber, cycleStartTimestamp), + this.getOriginalTxsCursor(cycleNumber, cycleStartTimestamp), + this.getCycleSyncStatus(cycleNumber), + ]) + + return { + cycleNumber, + startTimestamp: cycleStartTimestamp, + endTimestamp: cycleEndTimestamp, + receipts: { + lastTimestamp: receiptsCursor.timestamp, + lastId: receiptsCursor.id, + count: syncStatus.receiptsCount, + }, + originalTxs: { + lastTimestamp: originalTxsCursor.timestamp, + lastId: originalTxsCursor.id, + count: syncStatus.originalTxsCount, + }, + } + } + + /** + * Log sync progress + */ + logProgress( + cycleNumber: number, + dataType: 'receipts' | 'originalTxs', + itemsFetched: number, + totalItems: number + ): void { + const percentage = totalItems > 0 ? 
((totalItems / totalItems) * 100).toFixed(1) : '0.0' + console.log( + `[Cycle ${cycleNumber}] ${dataType}: +${itemsFetched} items (total: ${totalItems}, ${percentage}%)` + ) + } + + /** + * Get overall sync statistics from database + */ + async getSyncStats(): Promise<{ + totalCycles: number + totalReceipts: number + totalOriginalTxs: number + lastCycleNumber: number + }> { + try { + const [cycleCount, receiptCount, originalTxCount, lastCycle] = await Promise.all([ + CycleDB.queryCycleCount(), + ReceiptDB.queryReceiptCount(), + OriginalTxDataDB.queryOriginalTxDataCount(), + this.getLastCompletedCycle(), + ]) + + return { + totalCycles: cycleCount || 0, + totalReceipts: receiptCount || 0, + totalOriginalTxs: originalTxCount || 0, + lastCycleNumber: lastCycle, + } + } catch (error) { + console.error('Error getting sync stats:', error) + return { + totalCycles: 0, + totalReceipts: 0, + totalOriginalTxs: 0, + lastCycleNumber: 0, + } + } + } + + /** + * Print sync summary + */ + async printSyncSummary(): Promise { + const stats = await this.getSyncStats() + console.log('='.repeat(60)) + console.log('Sync Summary:') + console.log(` Total Cycles: ${stats.totalCycles}`) + console.log(` Total Receipts: ${stats.totalReceipts}`) + console.log(` Total OriginalTxs: ${stats.totalOriginalTxs}`) + console.log(` Last Cycle: ${stats.lastCycleNumber}`) + console.log('='.repeat(60)) + } +} diff --git a/src/collector.ts b/src/collector.ts index 1232126..76cb0b2 100644 --- a/src/collector.ts +++ b/src/collector.ts @@ -9,6 +9,7 @@ import * as Crypto from './utils/crypto' import { CycleDB, ReceiptDB, OriginalTxDataDB } from './storage' import { downloadTxsDataAndCycles, + downloadTxsDataAndCyclesParallel, compareWithOldReceiptsData, compareWithOldCyclesData, downloadAndSyncGenesisAccounts, @@ -224,6 +225,15 @@ export const checkAndSyncData = async (): Promise<() => Promise> => { const syncData = async (): Promise => { // If there is already some data in the db, we can assume that the 
genesis accounts data has been synced already if (lastStoredCycleCount === 0) await downloadAndSyncGenesisAccounts() // To sync accounts data that are from genesis accounts/accounts data that the network start with + + // Use parallel sync if enabled (default) + if (config.useParallelSync) { + console.log('Using optimized parallel sync strategy') + await downloadTxsDataAndCyclesParallel(totalCyclesToSync, lastStoredCycleCount) + return + } + + console.log('Using legacy sequential sync strategy') // Sync receipts and originalTxsData data first if there is old data if ( lastStoredReceiptCycle > 0 && diff --git a/src/config/index.ts b/src/config/index.ts index 2ddb2d5..b602842 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -94,6 +94,11 @@ export interface Config { MAX_ACCOUNT_HISTORY_STATES_PER_REQUEST: number MAX_STATS_PER_REQUEST: number } + parallelSyncConcurrency: number // Number of parallel workers for cycle sync + useParallelSync: boolean // Enable parallel sync with composite cursor + cyclesPerBatch: number // Number of cycles to batch together in multi-cycle requests (default: 10) + enablePrefetch: boolean // Enable prefetching of next batch while processing current batch (default: true) + syncRetryAttempts: number // Number of retry attempts for failed requests (default: 3) dexScreenerAPI: string // Dex Screener API URL for Liberdus token dexScreenerLink: string // Dex Screener Link for Liberdus token } @@ -184,6 +189,11 @@ let config: Config = { MAX_ACCOUNT_HISTORY_STATES_PER_REQUEST: 100, MAX_STATS_PER_REQUEST: 1000000, }, + parallelSyncConcurrency: Number(process.env.PARALLEL_SYNC_CONCURRENCY) || 10, // 10 parallel workers + useParallelSync: process.env.USE_PARALLEL_SYNC !== 'false', // Enable by default + cyclesPerBatch: Number(process.env.CYCLES_PER_BATCH) || 10, // Batch 10 cycles together + enablePrefetch: process.env.ENABLE_PREFETCH !== 'false', // Enable prefetch by default + syncRetryAttempts: Number(process.env.SYNC_RETRY_ATTEMPTS) 
|| 3, // Retry failed requests 3 times dexScreenerAPI: 'https://api.dexscreener.com/latest/dex/search?q=0x693ed886545970F0a3ADf8C59af5cCdb6dDF0a76', dexScreenerLink: 'https://dexscreener.com/polygon/0x041e48a5b11c29fdbd92498eb05573c52728398c', diff --git a/src/storage/index.ts b/src/storage/index.ts index cc314d6..92f0793 100644 --- a/src/storage/index.ts +++ b/src/storage/index.ts @@ -147,6 +147,11 @@ export const initializeDB = async (): Promise => { receiptDatabase, 'CREATE INDEX if not exists `receipts_cycle_timestamp` ON `receipts` (`cycle` DESC, `timestamp` DESC)' ) + // Composite index for cursor-based pagination (optimal for parallel sync) + await runCreate( + receiptDatabase, + 'CREATE INDEX if not exists `receipts_cycle_timestamp_receiptId` ON `receipts` (`cycle` ASC, `timestamp` ASC, `receiptId` ASC)' + ) // be sure to adjust the data types of `transactionType`, `txFrom`, `txTo` as needed await runCreate( originalTxDataDatabase, @@ -173,6 +178,11 @@ export const initializeDB = async (): Promise => { originalTxDataDatabase, 'CREATE INDEX if not exists `originalTxsData_cycle_timestamp` ON `originalTxsData` (`cycle` DESC, `timestamp` DESC)' ) + // Composite index for cursor-based pagination (optimal for parallel sync) + await runCreate( + originalTxDataDatabase, + 'CREATE INDEX if not exists `originalTxsData_cycle_timestamp_txId` ON `originalTxsData` (`cycle` ASC, `timestamp` ASC, `txId` ASC)' + ) await runCreate( originalTxDataDatabase, 'CREATE INDEX if not exists `originalTxsData_txType` ON `originalTxsData` (`transactionType`)' diff --git a/src/storage/originalTxData.ts b/src/storage/originalTxData.ts index 137a5a1..d5bcd45 100644 --- a/src/storage/originalTxData.ts +++ b/src/storage/originalTxData.ts @@ -142,7 +142,7 @@ export async function queryOriginalTxDataCount( } if (startCycle || endCycle) { sql = db.updateSqlStatementClause(sql, values) - sql += `cycleNumber BETWEEN ? AND ?` + sql += `cycle BETWEEN ? 
AND ?` values.push(startCycle, endCycle) } if (afterTimestamp) { @@ -176,7 +176,7 @@ export async function queryOriginalTxsData(query: QueryOriginalTxsDataParams): P } if (startCycle || endCycle) { sql = db.updateSqlStatementClause(sql, values) - sql += `cycleNumber BETWEEN ? AND ?` + sql += `cycle BETWEEN ? AND ?` values.push(startCycle, endCycle) } if (afterTimestamp) { diff --git a/src/storage/sqlite3storage.ts b/src/storage/sqlite3storage.ts index 36988e4..6c68b85 100644 --- a/src/storage/sqlite3storage.ts +++ b/src/storage/sqlite3storage.ts @@ -1,6 +1,73 @@ import { Utils as StringUtils } from '@shardus/types' import { Database } from 'sqlite3' +interface QueryTiming { + id: number + sql: string + startMs: number + engineMs?: number +} + +const SQL_LOG_MAX_LENGTH = 200 +const SQL_ENGINE_WARN_THRESHOLD_MS = 500 +const SQL_QUEUE_WARN_THRESHOLD_MS = 250 +const SQL_TOTAL_WARN_THRESHOLD_MS = 1000 + +let queryIdSequence = 0 +const pendingQueries = new Map() +const queuedBySql = new Map() + +function formatSqlForLog(sql: string): string { + const normalized = sql.replace(/\s+/g, ' ').trim() + if (normalized.length <= SQL_LOG_MAX_LENGTH) return normalized + return `${normalized.slice(0, SQL_LOG_MAX_LENGTH - 3)}...` +} + +function registerQuery(sql: string): QueryTiming { + const entry: QueryTiming = { + id: ++queryIdSequence, + sql, + startMs: Date.now(), + } + pendingQueries.set(entry.id, entry) + let queue = queuedBySql.get(sql) + if (!queue) { + queue = [] + queuedBySql.set(sql, queue) + } + queue.push(entry.id) + return entry +} + +function cleanupQuery(entry: QueryTiming): void { + pendingQueries.delete(entry.id) + const queue = queuedBySql.get(entry.sql) + if (!queue) return + const index = queue.indexOf(entry.id) + if (index !== -1) queue.splice(index, 1) + if (queue.length === 0) queuedBySql.delete(entry.sql) +} + +function logTiming(operation: string, entry: QueryTiming, rows?: number): void { + const totalMs = Date.now() - entry.startMs + const engineMs = 
entry.engineMs ?? 0 + const queueMs = Math.max(0, totalMs - engineMs) + const payload = { + operation, + totalMs: Number(totalMs.toFixed(2)), + queueMs: Number(queueMs.toFixed(2)), + engineMs: Number(engineMs.toFixed(2)), + sql: formatSqlForLog(entry.sql), + rows, + } + + if (totalMs > SQL_TOTAL_WARN_THRESHOLD_MS || queueMs > SQL_QUEUE_WARN_THRESHOLD_MS) { + console.warn('[DB Timing]', payload) + } else { + console.log('[DB Timing]', payload) + } +} + export const createDB = async (dbPath: string, dbName: string): Promise => { console.log('dbName', dbName, 'dbPath', dbPath) const db = new Database(dbPath, (err) => { @@ -15,10 +82,33 @@ export const createDB = async (dbPath: string, dbName: string): Promise { - if (time > 500 && time < 1000) { - console.log('SLOW QUERY', process.pid, sql, time) - } else if (time > 1000) { - console.log('VERY SLOW QUERY', process.pid, sql, time) + const engineMs = typeof time === 'number' ? time : Number(time) + const queue = queuedBySql.get(sql) + const id = queue && queue.length > 0 ? 
queue[0] : undefined + if (id === undefined) { + console.warn('[DB Timing] profile event without pending query', { + pid: process.pid, + engineMs, + sql: formatSqlForLog(sql), + }) + return + } + const entry = pendingQueries.get(id) + if (!entry) { + console.warn('[DB Timing] profile missing pending entry', { + pid: process.pid, + engineMs, + sql: formatSqlForLog(sql), + }) + return + } + entry.engineMs = engineMs + if (engineMs > SQL_ENGINE_WARN_THRESHOLD_MS) { + console.warn('[DB Engine] Slow engine execution detected', { + pid: process.pid, + engineMs: Number(engineMs.toFixed(2)), + sql: formatSqlForLog(sql), + }) } }) console.log(`Database ${dbName} Initialized!`) @@ -58,12 +148,21 @@ export async function run( params: unknown[] | object = [] ): Promise<{ id: number }> { return new Promise((resolve, reject) => { + const entry = registerQuery(sql) + const finalize = (): void => { + setImmediate(() => { + logTiming('run', entry) + cleanupQuery(entry) + }) + } db.run(sql, params, function (err: Error) { if (err) { console.log('Error running sql ' + sql) console.log(err) + finalize() reject(err) } else { + finalize() resolve({ id: this.lastID }) } }) @@ -72,12 +171,21 @@ export async function run( export async function get(db: Database, sql: string, params = []): Promise { return new Promise((resolve, reject) => { + const entry = registerQuery(sql) + const finalize = (rows?: number): void => { + setImmediate(() => { + logTiming('get', entry, rows) + cleanupQuery(entry) + }) + } db.get(sql, params, (err: Error, result: T) => { if (err) { console.log('Error running sql: ' + sql) console.log(err) + finalize() reject(err) } else { + finalize(result ? 
1 : 0) resolve(result) } }) @@ -86,12 +194,21 @@ export async function get(db: Database, sql: string, params = []): Promise export async function all(db: Database, sql: string, params = []): Promise { return new Promise((resolve, reject) => { + const entry = registerQuery(sql) + const finalize = (rowsCount?: number): void => { + setImmediate(() => { + logTiming('all', entry, rowsCount) + cleanupQuery(entry) + }) + } db.all(sql, params, (err: Error, rows: T[]) => { if (err) { console.log('Error running sql: ' + sql) console.log(err) + finalize() reject(err) } else { + finalize(rows ? rows.length : 0) resolve(rows) } }) From e2126ce111f566822bbd45c2d809b82403fd04a2 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Tue, 4 Nov 2025 16:22:52 +0800 Subject: [PATCH 02/14] feat: enhance API request handling with gzip support and improved response size logging --- src/class/ParallelCycleSync.ts | 35 +++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/class/ParallelCycleSync.ts b/src/class/ParallelCycleSync.ts index 4053846..e25bfd9 100644 --- a/src/class/ParallelCycleSync.ts +++ b/src/class/ParallelCycleSync.ts @@ -86,7 +86,10 @@ export class ParallelCycleSync { httpAgent: this.httpAgent, httpsAgent: this.httpsAgent, timeout: 45000, - headers: { 'Content-Type': 'application/json' }, + headers: { + 'Content-Type': 'application/json', + 'Accept-Encoding': 'gzip, deflate', // Request compressed responses + }, transformRequest: [ (data) => { // Use custom stringify for request body @@ -660,14 +663,19 @@ export class ParallelCycleSync { const receipts = response.data?.receipts || [] - // Get response size from Content-Length header (fast) instead of re-stringifying (slow) - const responseSizeBytes = parseInt(response.headers['content-length'] || '0', 10) - const responseSizeKB = responseSizeBytes > 0 ? 
(responseSizeBytes / 1024).toFixed(2) : 'unknown' + // Get response size - with compression, Content-Length might not be accurate + const contentLength = response.headers['content-length'] + const contentEncoding = response.headers['content-encoding'] + const responseSizeBytes = contentLength ? parseInt(contentLength, 10) : 0 + const responseSizeKB = responseSizeBytes > 0 ? (responseSizeBytes / 1024).toFixed(2) : '0.00' - if (config.verbose || networkElapsed > 1000) { + if (config.verbose || networkElapsed > 1000 || receipts.length === 0) { console.log( `[API Timing] Receipts fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + - `records: ${receipts.length}, size: ${responseSizeKB}KB` + `records: ${receipts.length}, size: ${responseSizeKB}KB` + + (contentEncoding ? `, encoding: ${contentEncoding}` : '') + + (receipts.length === 0 && response.data ? ', response.data exists but empty' : '') + + (!response.data ? ', response.data is null/undefined!' : '') ) } @@ -739,14 +747,19 @@ export class ParallelCycleSync { const originalTxs = response.data?.originalTxs || [] - // Get response size from Content-Length header (fast) instead of re-stringifying (slow) - const responseSizeBytes = parseInt(response.headers['content-length'] || '0', 10) - const responseSizeKB = responseSizeBytes > 0 ? (responseSizeBytes / 1024).toFixed(2) : 'unknown' + // Get response size - with compression, Content-Length might not be accurate + const contentLength = response.headers['content-length'] + const contentEncoding = response.headers['content-encoding'] + const responseSizeBytes = contentLength ? parseInt(contentLength, 10) : 0 + const responseSizeKB = responseSizeBytes > 0 ? 
(responseSizeBytes / 1024).toFixed(2) : '0.00' - if (config.verbose || networkElapsed > 1000) { + if (config.verbose || networkElapsed > 1000 || originalTxs.length === 0) { console.log( `[API Timing] OriginalTxs fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + - `records: ${originalTxs.length}, size: ${responseSizeKB}KB` + `records: ${originalTxs.length}, size: ${responseSizeKB}KB` + + (contentEncoding ? `, encoding: ${contentEncoding}` : '') + + (originalTxs.length === 0 && response.data ? ', response.data exists but empty' : '') + + (!response.data ? ', response.data is null/undefined!' : '') ) } From c5e283c5acd79c0660c026af4823e90d927faa73 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Wed, 5 Nov 2025 23:26:24 +0800 Subject: [PATCH 03/14] Enhance API timing logs with compression metrics and standardize naming - Track response payload sizes (compressed and uncompressed) via socket bytesRead - Calculate compression ratio and savings for API responses - Standardize log naming to match Fastify: payload (compressed), payloadUncompressed (uncompressed) - Add response interceptor to capture actual bytes transferred over network - Update receipts and originalTxs fetch logs with consistent format --- src/class/ParallelCycleSync.ts | 179 ++++++++++++++++++++++++++++----- 1 file changed, 155 insertions(+), 24 deletions(-) diff --git a/src/class/ParallelCycleSync.ts b/src/class/ParallelCycleSync.ts index e25bfd9..55a8722 100644 --- a/src/class/ParallelCycleSync.ts +++ b/src/class/ParallelCycleSync.ts @@ -36,6 +36,23 @@ export interface SyncStats { errors: number } +/** + * Response size metadata attached by transformResponse and interceptor + */ +interface ResponseSizeMetadata { + decompressedBytes: number + decompressedKB: string + compressedBytes?: number + compressedKB?: string + compressionRatio?: number + compressionSavings?: string +} + +interface ResponseDataWithMetadata { + __responseSize?: ResponseSizeMetadata + [key: string]: unknown +} + /** * 
Parallel sync orchestrator using cycle-based partitioning with composite cursors * Implements the optimal sync strategy with: @@ -108,9 +125,25 @@ export class ParallelCycleSync { (res) => { // Use custom parse for response with timing const startTime = Date.now() - const result = StringUtils.safeJsonParse(res) + const result = typeof res === 'string' ? StringUtils.safeJsonParse(res) : res const elapsed = Date.now() - startTime - const sizeKB = typeof res === 'string' ? (res.length / 1024).toFixed(2) : 'unknown' + + // Calculate decompressed size from raw response string + const decompressedBytes = typeof res === 'string' ? Buffer.byteLength(res) : 0 + const sizeKB = (decompressedBytes / 1024).toFixed(2) + + // Attach size metadata to result for later use + if (result && typeof result === 'object') { + Object.defineProperty(result, '__responseSize', { + value: { + decompressedBytes, + decompressedKB: sizeKB, + }, + enumerable: false, // Hidden from JSON.stringify and iteration + configurable: true, + }) + } + if (config.verbose && elapsed > 50) { console.log(`[Client] Response parse: ${elapsed}ms, size: ${sizeKB}KB`) } @@ -119,6 +152,80 @@ export class ParallelCycleSync { ], }) + // Add response interceptor to capture compressed size from socket bytesRead + this.axiosInstance.interceptors.response.use( + (response) => { + // Get Content-Length header for fallback + const contentLength = response.headers['content-length'] + + // Get socket from the request object + const socket = response.request?.socket + + let compressedBytes: number | undefined + + // Try to calculate compressed size from socket bytesRead (most accurate) + // We track cumulative bytesRead on the socket across requests (due to keep-alive) + if (socket && typeof socket.bytesRead === 'number') { + const currentBytesRead = socket.bytesRead + const lastBytesRead = (socket as { _lastBytesRead?: number })._lastBytesRead + + if (lastBytesRead !== undefined) { + const rawBytes = currentBytesRead - 
lastBytesRead + + // Subtract estimated header size (HTTP response headers + status line) + // Typical: "HTTP/1.1 200 OK\r\n" + headers + "\r\n\r\n" ≈ 200-400 bytes + const estimatedHeaderSize = 250 + if (rawBytes > estimatedHeaderSize) { + compressedBytes = rawBytes - estimatedHeaderSize + } + } + + // Update last bytesRead for next request on this socket + ;(socket as { _lastBytesRead?: number })._lastBytesRead = currentBytesRead + } + + // Fallback: Use Content-Length header if socket method didn't work + if (!compressedBytes && contentLength) { + compressedBytes = parseInt(contentLength, 10) + } + + // Get existing metadata from transformResponse + const existingMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize + + // Merge compressed size with existing decompressed size metadata + if (existingMetadata && response.data && typeof response.data === 'object') { + const decompressedBytes = existingMetadata.decompressedBytes + + // Calculate compression metrics if both sizes are available + const compressionRatio = + compressedBytes && decompressedBytes > 0 + ? +(compressedBytes / decompressedBytes).toFixed(3) + : undefined + + const compressionSavings = + compressionRatio && compressionRatio < 1 + ? `${((1 - compressionRatio) * 100).toFixed(1)}%` + : undefined + + // Update the metadata with compressed size info + Object.defineProperty(response.data, '__responseSize', { + value: { + ...existingMetadata, + compressedBytes, + compressedKB: compressedBytes ? 
(compressedBytes / 1024).toFixed(2) : undefined, + compressionRatio, + compressionSavings, + }, + enumerable: false, + configurable: true, + }) + } + + return response + }, + (error) => Promise.reject(error) + ) + // Add interval between tasks to prevent overwhelming the distributor this.queue = new PQueue({ concurrency: this.syncConfig.concurrency, @@ -663,20 +770,32 @@ export class ParallelCycleSync { const receipts = response.data?.receipts || [] - // Get response size - with compression, Content-Length might not be accurate - const contentLength = response.headers['content-length'] - const contentEncoding = response.headers['content-encoding'] - const responseSizeBytes = contentLength ? parseInt(contentLength, 10) : 0 - const responseSizeKB = responseSizeBytes > 0 ? (responseSizeBytes / 1024).toFixed(2) : '0.00' + // Get size metadata from transformResponse and interceptor + const sizeMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize + const decompressedKB = sizeMetadata?.decompressedKB || '0.00' + const compressedKB = sizeMetadata?.compressedKB + const compressionRatio = sizeMetadata?.compressionRatio + const compressionSavings = sizeMetadata?.compressionSavings if (config.verbose || networkElapsed > 1000 || receipts.length === 0) { - console.log( + // Build log message with compression info if available + let logMessage = `[API Timing] Receipts fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + - `records: ${receipts.length}, size: ${responseSizeKB}KB` + - (contentEncoding ? `, encoding: ${contentEncoding}` : '') + - (receipts.length === 0 && response.data ? ', response.data exists but empty' : '') + - (!response.data ? ', response.data is null/undefined!' 
: '') - ) + `records: ${receipts.length}` + + // Only show compression metrics if compression actually reduced the size (ratio < 1) + if (compressedKB !== undefined && compressionRatio !== undefined && compressionRatio < 1) { + logMessage += `, payload: ${compressedKB}KB, payloadUncompressed: ${decompressedKB}KB, ratio: ${compressionRatio}, savings: ${compressionSavings}` + } else { + // No compression or not effective, just show uncompressed size + logMessage += `, payload: ${decompressedKB}KB` + } + + logMessage += + (receipts.length === 0 && response.data ? ', response.data exists but empty' : '') + + (!response.data ? ', response.data is null/undefined!' : '') + + console.log(logMessage) } if (response.data && response.data.receipts) { @@ -747,20 +866,32 @@ export class ParallelCycleSync { const originalTxs = response.data?.originalTxs || [] - // Get response size - with compression, Content-Length might not be accurate - const contentLength = response.headers['content-length'] - const contentEncoding = response.headers['content-encoding'] - const responseSizeBytes = contentLength ? parseInt(contentLength, 10) : 0 - const responseSizeKB = responseSizeBytes > 0 ? (responseSizeBytes / 1024).toFixed(2) : '0.00' + // Get size metadata from transformResponse and interceptor + const sizeMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize + const decompressedKB = sizeMetadata?.decompressedKB || '0.00' + const compressedKB = sizeMetadata?.compressedKB + const compressionRatio = sizeMetadata?.compressionRatio + const compressionSavings = sizeMetadata?.compressionSavings if (config.verbose || networkElapsed > 1000 || originalTxs.length === 0) { - console.log( + // Build log message with compression info if available + let logMessage = `[API Timing] OriginalTxs fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + - `records: ${originalTxs.length}, size: ${responseSizeKB}KB` + - (contentEncoding ? 
`, encoding: ${contentEncoding}` : '') + - (originalTxs.length === 0 && response.data ? ', response.data exists but empty' : '') + - (!response.data ? ', response.data is null/undefined!' : '') - ) + `records: ${originalTxs.length}` + + // Only show compression metrics if compression actually reduced the size (ratio < 1) + if (compressedKB !== undefined && compressionRatio !== undefined && compressionRatio < 1) { + logMessage += `, payload: ${compressedKB}KB, payloadUncompressed: ${decompressedKB}KB, ratio: ${compressionRatio}, savings: ${compressionSavings}` + } else { + // No compression or not effective, just show uncompressed size + logMessage += `, payload: ${decompressedKB}KB` + } + + logMessage += + (originalTxs.length === 0 && response.data ? ', response.data exists but empty' : '') + + (!response.data ? ', response.data is null/undefined!' : '') + + console.log(logMessage) } if (response.data && response.data.originalTxs) { From 87e7c6054d43be225eca4aef171b035095859459 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Fri, 7 Nov 2025 00:46:19 +0800 Subject: [PATCH 04/14] refactor: rename ParallelCycleSync to ParallelDataSync and simplify sync architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename ParallelCycleSync class to ParallelDataSync for better clarity - Remove batchSize from ParallelSyncConfig, use config limits instead - Replace composite cursor approach with simpler timestamp + txId pagination - Simplify sync flow by removing separate cycle metadata fetching step - Update method names: syncCycleRange → startSyncing for better semantics - Remove legacy downloadTxsDataAndCyclesParallel function from DataSync - Streamline API endpoints to use /cycle instead of /multi-cycle-cursor - Add SyncTxDataByCycleRange interface for cleaner parameter passing --- src/class/DataSync.ts | 26 - ...rallelCycleSync.ts => ParallelDataSync.ts} | 527 ++++++------------ src/collector.ts | 18 +- 3 files changed, 198 
insertions(+), 373 deletions(-) rename src/class/{ParallelCycleSync.ts => ParallelDataSync.ts} (62%) diff --git a/src/class/DataSync.ts b/src/class/DataSync.ts index 12b05ce..d6f4de5 100644 --- a/src/class/DataSync.ts +++ b/src/class/DataSync.ts @@ -4,7 +4,6 @@ import { AccountDB, CycleDB, ReceiptDB, TransactionDB, OriginalTxDataDB } from ' import { config, DISTRIBUTOR_URL } from '../config' import { Cycle } from '../types' import { Utils as StringUtils } from '@shardus/types' -import { ParallelCycleSync } from './ParallelCycleSync' export enum DataType { CYCLE = 'cycleinfo', @@ -833,28 +832,3 @@ export const downloadOriginalTxsDataBetweenCycles = async ( endCycle += config.requestLimits.MAX_BETWEEN_CYCLES_PER_REQUEST } } - -/** - * NEW: Parallel sync using cycle-based partitioning with composite cursors - * This is the optimal sync strategy with 10x+ performance improvement - */ -export const downloadTxsDataAndCyclesParallel = async ( - totalCyclesToSync: number, - fromCycle = 0 -): Promise => { - console.log('\n') - console.log('='.repeat(60)) - console.log('Using PARALLEL SYNC with Composite Cursor') - console.log('This prevents data loss and provides 10x+ performance improvement') - console.log('='.repeat(60)) - console.log('\n') - - const parallelSync = new ParallelCycleSync({ - concurrency: config.parallelSyncConcurrency, - batchSize: 500, - retryAttempts: 3, - retryDelayMs: 1000, - }) - - await parallelSync.syncCycleRange(fromCycle, totalCyclesToSync) -} diff --git a/src/class/ParallelCycleSync.ts b/src/class/ParallelDataSync.ts similarity index 62% rename from src/class/ParallelCycleSync.ts rename to src/class/ParallelDataSync.ts index 55a8722..4f90513 100644 --- a/src/class/ParallelCycleSync.ts +++ b/src/class/ParallelDataSync.ts @@ -4,7 +4,7 @@ import { Utils as StringUtils } from '@shardus/types' import { config, DISTRIBUTOR_URL } from '../config' import { queryFromDistributor, DataType } from './DataSync' import { CycleDB, ReceiptDB, OriginalTxDataDB } 
from '../storage' -import { ParallelSyncCheckpointManager, CompositeCursor } from './ParallelSyncCheckpoint' +import { ParallelSyncCheckpointManager } from './ParallelSyncCheckpoint' import { Cycle } from '../types' import axios, { AxiosInstance } from 'axios' import http from 'http' @@ -15,7 +15,6 @@ import https from 'https' */ export interface ParallelSyncConfig { concurrency: number // Number of parallel workers - batchSize: number // Items per request retryAttempts: number // Retry failed requests retryDelayMs: number // Delay between retries cyclesPerBatch: number // Number of cycles to batch together (default: 10) @@ -54,14 +53,26 @@ interface ResponseDataWithMetadata { } /** - * Parallel sync orchestrator using cycle-based partitioning with composite cursors + * Sync receipts and originalTxs data by cycle range with timestamp pagination + * Uses both timestamp and ID to handle timestamp collisions and prevent data loss + */ +export interface SyncTxDataByCycleRange { + startCycle: number + endCycle: number + afterTimestamp?: number + afterTxId?: string // receiptId or txId + limit?: number +} + +/** + * Parallel sync orchestrator using cycle-based partitioning with timestamp + txId pagination * Implements the optimal sync strategy with: * - Cycle-level parallelization - * - Composite cursor (timestamp + ID) to prevent data loss + * - Composite cursor (timestamp + txId ) to prevent data loss * - Automatic resume from database * - Work queue for load balancing */ -export class ParallelCycleSync { +export class ParallelDataSync { private checkpointManager: ParallelSyncCheckpointManager private queue: PQueue private syncConfig: ParallelSyncConfig @@ -72,10 +83,8 @@ export class ParallelCycleSync { constructor(syncConfig?: Partial) { this.checkpointManager = new ParallelSyncCheckpointManager() - this.syncConfig = { concurrency: syncConfig?.concurrency || config.parallelSyncConcurrency || 10, - batchSize: syncConfig?.batchSize || 500, retryAttempts: 
syncConfig?.retryAttempts || config.syncRetryAttempts || 3, retryDelayMs: syncConfig?.retryDelayMs || 1000, cyclesPerBatch: syncConfig?.cyclesPerBatch || config.cyclesPerBatch || 10, @@ -254,7 +263,7 @@ export class ParallelCycleSync { /** * Main entry point for parallel sync */ - async syncCycleRange(startCycle: number, endCycle: number): Promise { + async startSyncing(startCycle: number, endCycle: number): Promise { console.log(`\n${'='.repeat(60)}`) console.log(`Starting Parallel Cycle Sync: ${startCycle} → ${endCycle}`) console.log(`Concurrency: ${this.syncConfig.concurrency} workers`) @@ -264,20 +273,29 @@ export class ParallelCycleSync { this.stats.totalCycles = endCycle - startCycle try { - // Step 1: Fetch all cycle metadata (lightweight) - console.log('Step 1: Fetching cycle metadata...') - const cycles = await this.fetchCyclesMetadata(startCycle, endCycle) - console.log(`✓ Retrieved ${cycles.length} cycles\n`) + // Split cycles into batches + const cycleBatches: { startCycle: number; endCycle: number }[] = [] + + for (let i = startCycle; i <= endCycle; ) { + let batchEnd = i + this.syncConfig.cyclesPerBatch + if (batchEnd > endCycle) { + batchEnd = endCycle + } + cycleBatches.push({ startCycle: i, endCycle: batchEnd }) + i = batchEnd + 1 + } - // Step 2: Sync cycles themselves in parallel - console.log('Step 2: Syncing cycle records...') - await this.syncCyclesData(cycles) - console.log(`✓ Synced ${cycles.length} cycle records\n`) + console.log( + `Created ${cycleBatches.length} cycle batches (${this.syncConfig.cyclesPerBatch} cycles per batch)` + ) - // Step 3: Sync receipts and originalTxs for all cycles in parallel with multi-cycle batching - console.log('Step 3: Syncing receipts and originalTxs with multi-cycle batching...') - await this.syncAllCyclesDataMultiBatch(cycles) + // Add all batch sync tasks to the queue + const tasks = cycleBatches.map((batch) => + this.queue.add(() => this.syncDataByCycleRange(batch.startCycle, batch.endCycle)) + ) + // 
Wait for all tasks to complete + await Promise.all(tasks) this.stats.endTime = Date.now() // Summary @@ -290,139 +308,55 @@ export class ParallelCycleSync { } /** - * Fetch cycle metadata from distributor - */ - private async fetchCyclesMetadata(startCycle: number, endCycle: number): Promise { - const cycles: Cycle[] = [] - - // Fetch in chunks - const CHUNK_SIZE = 100 - for (let i = startCycle; i <= endCycle; i += CHUNK_SIZE) { - const chunkEnd = Math.min(i + CHUNK_SIZE - 1, endCycle) - - const response = await queryFromDistributor(DataType.CYCLE, { - start: i, - end: chunkEnd, - }) - - if (response && response.data && response.data.cycleInfo) { - cycles.push( - ...response.data.cycleInfo.map((cycleRecord: any) => ({ - counter: cycleRecord.counter, - cycleRecord, - start: cycleRecord.start, - cycleMarker: cycleRecord.marker, - })) - ) - } - } - - return cycles - } - - /** - * Sync cycle records to database - */ - private async syncCyclesData(cycles: Cycle[]): Promise { - // Insert cycles in batches - const BATCH_SIZE = 100 - for (let i = 0; i < cycles.length; i += BATCH_SIZE) { - const batch = cycles.slice(i, i + BATCH_SIZE) - await CycleDB.bulkInsertCycles(batch) - } - } - - /** - * Sync receipts and originalTxs for all cycles in parallel (LEGACY - single cycle per request) - */ - private async syncAllCyclesData(cycles: Cycle[]): Promise { - // Add all cycle sync tasks to the queue - const tasks = cycles.map((cycle) => this.queue.add(() => this.syncSingleCycle(cycle))) - - // Wait for all tasks to complete - await Promise.all(tasks) - } - - /** - * Sync receipts and originalTxs using multi-cycle batching with prefetching - * This dramatically reduces HTTP overhead for cycles with small data - */ - private async syncAllCyclesDataMultiBatch(cycles: Cycle[]): Promise { - // Group cycles into batches - const cycleBatches: Cycle[][] = [] - for (let i = 0; i < cycles.length; i += this.syncConfig.cyclesPerBatch) { - cycleBatches.push(cycles.slice(i, i + 
this.syncConfig.cyclesPerBatch)) - } - - console.log( - `Created ${cycleBatches.length} cycle batches (${this.syncConfig.cyclesPerBatch} cycles per batch)` - ) - - // Add all batch sync tasks to the queue - const tasks = cycleBatches.map((batch) => this.queue.add(() => this.syncCycleBatch(batch))) - - // Wait for all tasks to complete - await Promise.all(tasks) - } - - /** - * Sync receipts and originalTxs for a single cycle + * Sync data in parallel using adaptive multi-cycle fetching with prefetching on endpoints + * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5) */ - private async syncSingleCycle(cycle: Cycle): Promise { + private async syncDataByCycleRange(startCycle: number, endCycle: number): Promise { try { - // Get cycle time boundaries - const cycleStart = cycle.start - const cycleEnd = cycle.cycleRecord.duration - ? cycle.start + cycle.cycleRecord.duration - : cycle.start + 60 * 1000 // Default 1 minute - - // Sync both data types in parallel for this cycle + // Sync all data types in parallel await Promise.all([ - this.syncCycleReceipts(cycle.counter, cycleStart, cycleEnd), - this.syncCycleOriginalTxs(cycle.counter, cycleStart, cycleEnd), + this.syncCyclesByCycleRange(startCycle, endCycle), + this.syncReceiptsByCycleRange(startCycle, endCycle), + this.syncOriginalTxsByCycleRange(startCycle, endCycle), ]) - this.stats.completedCycles++ + this.stats.completedCycles += endCycle - startCycle + 1 if (config.verbose || this.stats.completedCycles % 10 === 0) { const progress = ((this.stats.completedCycles / this.stats.totalCycles) * 100).toFixed(1) - console.log(`Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%)`) + console.log( + `Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` + ) } } catch (error) { - console.error(`Error syncing cycle ${cycle.counter}:`, error) + console.error(`Error 
syncing cycle batch ${startCycle}-${endCycle}:`, error) this.stats.errors++ throw error } } /** - * Sync receipts and originalTxs for a batch of cycles using multi-cycle endpoints - * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5) + * Sync cycles across a batch of cycles using multi-cycle fetching */ - private async syncCycleBatch(cycleBatch: Cycle[]): Promise { - if (cycleBatch.length === 0) return - + private async syncCyclesByCycleRange(startCycle: number, endCycle: number): Promise { try { - const startCycle = cycleBatch[0].counter - const endCycle = cycleBatch[cycleBatch.length - 1].counter + const response = await this.fetchCyclesByCycleRange(startCycle, endCycle) - // Sync both data types in parallel - await Promise.all([this.syncCycleBatchReceipts(cycleBatch), this.syncCycleBatchOriginalTxs(cycleBatch)]) + if (!response || response.length === 0) { + if (config.verbose) { + console.log(`[Cycles ${startCycle}-${endCycle}] No cycle data returned`) + } + return + } - this.stats.completedCycles += cycleBatch.length + // Process cycles using bulkInsertCycles + await CycleDB.bulkInsertCycles(response) - if (config.verbose || this.stats.completedCycles % 10 === 0) { - const progress = ((this.stats.completedCycles / this.stats.totalCycles) * 100).toFixed(1) - console.log( - `Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` - ) + if (config.verbose) { + console.log(`[Cycles ${startCycle}-${endCycle}] Cycles: +${response.length}`) } } catch (error) { - console.error( - `Error syncing cycle batch ${cycleBatch[0].counter}-${cycleBatch[cycleBatch.length - 1].counter}:`, - error - ) - this.stats.errors++ + console.error(`Error fetching cycles for cycle batch ${startCycle}-${endCycle}:`, error) throw error } } @@ -430,20 +364,15 @@ export class ParallelCycleSync { /** * Sync receipts across a batch of cycles using adaptive multi-cycle 
fetching with prefetching */ - private async syncCycleBatchReceipts(cycleBatch: Cycle[]): Promise { - const startCycle = cycleBatch[0].counter - const endCycle = cycleBatch[cycleBatch.length - 1].counter - - // Get resume cursor from database for the start cycle - const initialCursor = await this.checkpointManager.getReceiptsCursor(startCycle, cycleBatch[0].start) - + private async syncReceiptsByCycleRange(startCycle: number, endCycle: number): Promise { let currentCycle = startCycle - let currentCursor: CompositeCursor = initialCursor + let afterTimestamp = 0 + let afterTxId = '' let totalFetched = 0 // Prefetch: Start fetching first batch immediately let nextFetchPromise: Promise | null = this.syncConfig.enablePrefetch - ? this.fetchReceiptsMultiCycle(currentCycle, endCycle, currentCursor) + ? this.fetchReceiptsByCycleRange({ startCycle: currentCycle, endCycle, afterTimestamp, afterTxId }) : null while (currentCycle <= endCycle) { @@ -451,23 +380,34 @@ export class ParallelCycleSync { // Get the data (either from prefetch or fetch now) const response = nextFetchPromise ? 
await nextFetchPromise - : await this.fetchReceiptsMultiCycle(currentCycle, endCycle, currentCursor) + : await this.fetchReceiptsByCycleRange({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + }) if (!response || response.length === 0) { break // No more receipts in this cycle range } - // Update cursor based on last receipt BEFORE starting next fetch + // Update after timestamp and txId based on last receipt BEFORE starting next fetch const lastReceipt = response[response.length - 1] currentCycle = lastReceipt.cycle - const nextCursor: CompositeCursor = { - timestamp: lastReceipt.timestamp, - id: lastReceipt.receiptId, - } + afterTimestamp = lastReceipt.timestamp + afterTxId = lastReceipt.receiptId // Prefetch next batch while processing current batch - if (this.syncConfig.enablePrefetch && response.length >= this.syncConfig.batchSize) { - nextFetchPromise = this.fetchReceiptsMultiCycle(currentCycle, endCycle, nextCursor) + if ( + this.syncConfig.enablePrefetch && + response.length >= config.requestLimits.MAX_RECEIPTS_PER_REQUEST + ) { + nextFetchPromise = this.fetchReceiptsByCycleRange({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + }) } else { nextFetchPromise = null } @@ -477,7 +417,6 @@ export class ParallelCycleSync { totalFetched += response.length this.stats.totalReceipts += response.length - currentCursor = nextCursor if (config.verbose) { console.log( @@ -487,8 +426,8 @@ export class ParallelCycleSync { ) } - // If we got less than batch size, we've exhausted this cycle range - if (response.length < this.syncConfig.batchSize) { + // If we got less than the max response size, we've exhausted this cycle range + if (response.length < config.requestLimits.MAX_RECEIPTS_PER_REQUEST) { break } } catch (error) { @@ -501,20 +440,20 @@ export class ParallelCycleSync { /** * Sync originalTxs across a batch of cycles using adaptive multi-cycle fetching with prefetching */ - private async 
syncCycleBatchOriginalTxs(cycleBatch: Cycle[]): Promise { - const startCycle = cycleBatch[0].counter - const endCycle = cycleBatch[cycleBatch.length - 1].counter - - // Get resume cursor from database for the start cycle - const initialCursor = await this.checkpointManager.getOriginalTxsCursor(startCycle, cycleBatch[0].start) - + private async syncOriginalTxsByCycleRange(startCycle: number, endCycle: number): Promise { let currentCycle = startCycle - let currentCursor: CompositeCursor = initialCursor + let afterTimestamp = 0 + let afterTxId = '' let totalFetched = 0 // Prefetch: Start fetching first batch immediately let nextFetchPromise: Promise | null = this.syncConfig.enablePrefetch - ? this.fetchOriginalTxsMultiCycle(currentCycle, endCycle, currentCursor) + ? this.fetchOriginalTxsByCycleRange({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + }) : null while (currentCycle <= endCycle) { @@ -522,23 +461,34 @@ export class ParallelCycleSync { // Get the data (either from prefetch or fetch now) const response = nextFetchPromise ? 
await nextFetchPromise - : await this.fetchOriginalTxsMultiCycle(currentCycle, endCycle, currentCursor) + : await this.fetchOriginalTxsByCycleRange({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + }) if (!response || response.length === 0) { break // No more originalTxs in this cycle range } - // Update cursor based on last tx BEFORE starting next fetch + // Update after timestamp and txId based on last tx BEFORE starting next fetch const lastTx = response[response.length - 1] currentCycle = lastTx.cycle - const nextCursor: CompositeCursor = { - timestamp: lastTx.timestamp, - id: lastTx.txId, - } + afterTimestamp = lastTx.timestamp + afterTxId = lastTx.txId // Prefetch next batch while processing current batch - if (this.syncConfig.enablePrefetch && response.length >= this.syncConfig.batchSize) { - nextFetchPromise = this.fetchOriginalTxsMultiCycle(currentCycle, endCycle, nextCursor) + if ( + this.syncConfig.enablePrefetch && + response.length >= config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST + ) { + nextFetchPromise = this.fetchOriginalTxsByCycleRange({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + }) } else { nextFetchPromise = null } @@ -548,7 +498,6 @@ export class ParallelCycleSync { totalFetched += response.length this.stats.totalOriginalTxs += response.length - currentCursor = nextCursor if (config.verbose) { console.log( @@ -558,8 +507,8 @@ export class ParallelCycleSync { ) } - // If we got less than batch size, we've exhausted this cycle range - if (response.length < this.syncConfig.batchSize) { + // If we got less than the max response size, we've exhausted this cycle range + if (response.length < config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST) { break } } catch (error) { @@ -570,196 +519,86 @@ export class ParallelCycleSync { } /** - * Sync receipts for a specific cycle using composite cursor + * Fetch cycles by cycle range with retry logic */ - private async syncCycleReceipts(cycleNumber: 
number, cycleStart: number, cycleEnd: number): Promise { - // Get resume cursor from database - const cursor = await this.checkpointManager.getReceiptsCursor(cycleNumber, cycleStart) - - let currentCursor: CompositeCursor = cursor - let totalFetched = 0 - - while (true) { - try { - const response = await this.fetchReceiptsWithCursor(cycleNumber, currentCursor, cycleEnd) - - if (!response || response.length === 0) { - break // No more receipts for this cycle - } - - // Process receipts - await ReceiptDB.processReceiptData(response) - - totalFetched += response.length - this.stats.totalReceipts += response.length - - // Update cursor to last item - const lastReceipt = response[response.length - 1] - currentCursor = { - timestamp: lastReceipt.timestamp, - id: lastReceipt.receiptId, - } - - if (config.verbose) { - console.log(`[Cycle ${cycleNumber}] Receipts: +${response.length} (total: ${totalFetched})`) - } - - // If we got less than batch size, we're done - if (response.length < this.syncConfig.batchSize) { - break - } - } catch (error) { - console.error(`Error fetching receipts for cycle ${cycleNumber}:`, error) - throw error - } - } - } - - /** - * Sync originalTxs for a specific cycle using composite cursor - */ - private async syncCycleOriginalTxs( - cycleNumber: number, - cycleStart: number, - cycleEnd: number - ): Promise { - // Get resume cursor from database - const cursor = await this.checkpointManager.getOriginalTxsCursor(cycleNumber, cycleStart) - - let currentCursor: CompositeCursor = cursor - let totalFetched = 0 - - while (true) { + private async fetchCyclesByCycleRange(startCycle: number, endCycle: number): Promise { + // Retry with exponential backoff + for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { try { - const response = await this.fetchOriginalTxsWithCursor(cycleNumber, currentCursor, cycleEnd) - - if (!response || response.length === 0) { - break // No more originalTxs for this cycle - } - - // Process originalTxs - 
await OriginalTxDataDB.processOriginalTxData(response) + const startTime = Date.now() + const response = await queryFromDistributor(DataType.CYCLE, { + start: startCycle, + end: endCycle, + }) + const networkElapsed = Date.now() - startTime - totalFetched += response.length - this.stats.totalOriginalTxs += response.length + if (response && response.data && response.data.cycleInfo) { + const cycleRecords = response.data.cycleInfo.map((cycleRecord: any) => ({ + counter: cycleRecord.counter, + cycleRecord, + start: cycleRecord.start, + cycleMarker: cycleRecord.marker, + })) - // Update cursor to last item - const lastTx = response[response.length - 1] - currentCursor = { - timestamp: lastTx.timestamp, - id: lastTx.txId, + if (config.verbose) { + console.log( + `[API Timing] Cycles fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `records: ${cycleRecords.length}` + ) + } + return cycleRecords } + } catch (error: any) { + const isLastAttempt = attempt === this.syncConfig.retryAttempts + const isRetryableError = + error.code === 'ECONNRESET' || + error.code === 'ETIMEDOUT' || + error.code === 'ECONNREFUSED' || + error.code === 'EPIPE' - if (config.verbose) { - console.log(`[Cycle ${cycleNumber}] OriginalTxs: +${response.length} (total: ${totalFetched})`) + if (isRetryableError && !isLastAttempt) { + const delay = this.syncConfig.retryDelayMs * Math.pow(2, attempt) + console.warn( + `Error on cycles fetch (cycles ${startCycle}-${endCycle}), ` + + `attempt ${attempt + 1}/${this.syncConfig.retryAttempts + 1}, ` + + `retrying in ${delay}ms...` + ) + await this.sleep(delay) + continue } - // If we got less than batch size, we're done - if (response.length < this.syncConfig.batchSize) { - break - } - } catch (error) { - console.error(`Error fetching originalTxs for cycle ${cycleNumber}:`, error) + // Non-retryable error or last attempt failed + console.error(`Error fetching cycles (cycles ${startCycle}-${endCycle}):`, error.message) throw error } } - } - - 
/** - * Fetch receipts using composite cursor (prevents data loss on timestamp collisions) - */ - private async fetchReceiptsWithCursor( - cycle: number, - cursor: CompositeCursor, - beforeTimestamp?: number - ): Promise { - const data = { - cycle, - afterTimestamp: cursor.timestamp, - afterReceiptId: cursor.id, - beforeTimestamp, - limit: this.syncConfig.batchSize, - sender: config.collectorInfo.publicKey, - sign: undefined, - } - - crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) - - const url = `${DISTRIBUTOR_URL}/receipt/cycle-cursor` - - try { - const response = await this.axiosInstance.post(url, data) - - if (response.data && response.data.receipts) { - return response.data.receipts - } - - return [] - } catch (error) { - console.error(`Error fetching receipts with cursor:`, error.message) - throw error - } - } - - /** - * Fetch originalTxs using composite cursor - */ - private async fetchOriginalTxsWithCursor( - cycle: number, - cursor: CompositeCursor, - beforeTimestamp?: number - ): Promise { - const data = { - cycle, - afterTimestamp: cursor.timestamp, - afterTxId: cursor.id, - beforeTimestamp, - limit: this.syncConfig.batchSize, - sender: config.collectorInfo.publicKey, - sign: undefined, - } - crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) - - const url = `${DISTRIBUTOR_URL}/originalTx/cycle-cursor` - - try { - const response = await this.axiosInstance.post(url, data) - - if (response.data && response.data.originalTxs) { - return response.data.originalTxs - } - - return [] - } catch (error) { - console.error(`Error fetching originalTxs with cursor:`, error.message) - throw error - } + return [] } /** - * Fetch receipts across multiple cycles using composite cursor with retry logic + * Fetch receipts by multi-cycle range with retry logic * Automatically adapts to cycle sizes - if cycles 1-10 only have data in 1-5, returns that subset */ - private async fetchReceiptsMultiCycle( - 
startCycle: number, - endCycle: number, - cursor: CompositeCursor - ): Promise { + private async fetchReceiptsByCycleRange({ + startCycle, + endCycle, + afterTimestamp, + afterTxId, + }: SyncTxDataByCycleRange): Promise { const data = { startCycle, endCycle, - afterCycle: startCycle, - afterTimestamp: cursor.timestamp, - afterReceiptId: cursor.id, - limit: this.syncConfig.batchSize, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_RECEIPTS_PER_REQUEST, sender: config.collectorInfo.publicKey, sign: undefined, } crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) - const url = `${DISTRIBUTOR_URL}/receipt/multi-cycle-cursor` + const url = `${DISTRIBUTOR_URL}/receipt/cycle` // Retry with exponential backoff for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { @@ -835,27 +674,27 @@ export class ParallelCycleSync { } /** - * Fetch originalTxs across multiple cycles using composite cursor with retry logic + * Fetch originalTxs by multi-cycle range with retry logic */ - private async fetchOriginalTxsMultiCycle( - startCycle: number, - endCycle: number, - cursor: CompositeCursor - ): Promise { + private async fetchOriginalTxsByCycleRange({ + startCycle, + endCycle, + afterTimestamp, + afterTxId, + }: SyncTxDataByCycleRange): Promise { const data = { startCycle, endCycle, - afterCycle: startCycle, - afterTimestamp: cursor.timestamp, - afterTxId: cursor.id, - limit: this.syncConfig.batchSize, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST, sender: config.collectorInfo.publicKey, sign: undefined, } crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) - const url = `${DISTRIBUTOR_URL}/originalTx/multi-cycle-cursor` + const url = `${DISTRIBUTOR_URL}/originalTx/cycle` // Retry with exponential backoff for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { diff --git a/src/collector.ts b/src/collector.ts index 
76cb0b2..dfdb04d 100644 --- a/src/collector.ts +++ b/src/collector.ts @@ -9,7 +9,6 @@ import * as Crypto from './utils/crypto' import { CycleDB, ReceiptDB, OriginalTxDataDB } from './storage' import { downloadTxsDataAndCycles, - downloadTxsDataAndCyclesParallel, compareWithOldReceiptsData, compareWithOldCyclesData, downloadAndSyncGenesisAccounts, @@ -29,6 +28,7 @@ import RMQCyclesConsumer from './collectors/rmq/cycles' import RMQOriginalTxsConsumer from './collectors/rmq/original_txs' import RMQReceiptsConsumer from './collectors/rmq/receipts' import { setupCollectorSocketServer } from './collectorServer' +import { ParallelDataSync } from './class/ParallelDataSync' const DistributorFirehoseEvent = 'FIREHOSE' let ws: WebSocket @@ -228,8 +228,20 @@ export const checkAndSyncData = async (): Promise<() => Promise> => { // Use parallel sync if enabled (default) if (config.useParallelSync) { - console.log('Using optimized parallel sync strategy') - await downloadTxsDataAndCyclesParallel(totalCyclesToSync, lastStoredCycleCount) + console.log('\n') + console.log('='.repeat(60)) + console.log('Using NEW EFFICIENT PARALLEL SYNC STRATEGY based on cycle batches!') + console.log('This strategy is more robust and provides 10x+ performance improvement') + console.log('='.repeat(60)) + console.log('\n') + + const parallelDataSync = new ParallelDataSync({ + concurrency: config.parallelSyncConcurrency, + retryAttempts: 3, + retryDelayMs: 1000, + }) + + await parallelDataSync.startSyncing(lastStoredCycleCount, totalCyclesToSync) return } From a9e2fb9c944c0b20d61527bb5e62b177ec2da63b Mon Sep 17 00:00:00 2001 From: jairajdev Date: Fri, 7 Nov 2025 17:51:19 +0800 Subject: [PATCH 05/14] Add DataSyncManager for intelligent cycle-based data synchronization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements DataSyncManager to handle missing data identification and recovery for parallel sync operations with comprehensive gap detection and 
verification. Key features: - Automatic gap detection across cycle ranges - Data anomaly detection (validates last 15 cycles before sync) - Lookback verification window (cyclesPerBatch × parallelSyncConcurrency) - Recovery orchestration using ParallelDataSync for all scenarios - Fail-fast validation before websocket connection - Fresh start vs resume from interruption routing Handles complex scenarios: - Multiple interruption points during parallel sync - Incremental data (received through websocket) gaps during process restarts - Data integrity verification and mismatch detection - Unified recovery strategy using cycle-batch parallel sync --- src/class/DataSyncManager.ts | 727 +++++++++++++++++++++++++++++++++++ src/collector.ts | 49 +-- 2 files changed, 754 insertions(+), 22 deletions(-) create mode 100644 src/class/DataSyncManager.ts diff --git a/src/class/DataSyncManager.ts b/src/class/DataSyncManager.ts new file mode 100644 index 0000000..b91b27a --- /dev/null +++ b/src/class/DataSyncManager.ts @@ -0,0 +1,727 @@ +import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' +import { config } from '../config' +import { queryFromDistributor, DataType, downloadAndSyncGenesisAccounts } from './DataSync' +import { ParallelDataSync } from './ParallelDataSync' + +/** + * Represents a gap in cycle sequence + */ +export interface CycleGap { + startCycle: number + endCycle: number + gapSize: number +} + +/** + * Represents a cycle with mismatched transaction data + */ +export interface MismatchedCycle { + cycle: number + localReceipts: number + distributorReceipts: number + localOriginalTxs: number + distributorOriginalTxs: number + receiptsMismatch: boolean + originalTxsMismatch: boolean +} + +/** + * Comprehensive recovery plan for data synchronization + */ +export interface DataSyncRecoveryPlan { + currentDistributorCycle: number + lastLocalCycle: number + missingCycleRanges: CycleGap[] + mismatchedCycles: MismatchedCycle[] + lookbackVerificationRanges:
CycleGap[] + totalMissingCycles: number + totalMismatchedCycles: number + recoveryNeeded: boolean +} + +/** + * DataSyncManager + * + * Orchestrates intelligent data synchronization with automatic gap detection and recovery. + * + * Key Features: + * - Anomaly detection: Validates data integrity before sync + * - Gap identification: Detects missing cycle ranges in local database + * - Data reconciliation: Compares local vs distributor data + * - Recovery orchestration: Patches gaps and mismatched cycles + * - Intelligent routing: Fresh start vs resume from interruption + * + * Example Scenario: + * 1. Parallel sync stops at cycle 150000 (target was 300000) + * 2. WebSocket saves incremental data from 300001 to 300100 + * 3. Process restarts at cycle 300105 + * + * Manager identifies and recovers: + * - Missing range: 150000 to 300001 (parallel sync interruption) + * - Missing range: 300100 to 300105 (websocket gap during restart) + * - Mismatched data in lookback window (e.g., 149900-150000) + * + * Handles multiple interruption points automatically. + */ +export class DataSyncManager { + private lookbackCycles: number + + constructor() { + // Calculate lookback window: cyclesPerBatch * parallelSyncConcurrency + const cyclesPerBatch = config.cyclesPerBatch || 10 + const concurrency = config.parallelSyncConcurrency || 10 + this.lookbackCycles = cyclesPerBatch * concurrency + + console.log(`DataSyncManager initialized with lookback window: ${this.lookbackCycles} cycles`) + } + + /** + * Main entry point for intelligent data synchronization + * Handles both fresh start and recovery from interruptions + */ + async syncData(): Promise { + const response = await this.getTotalDataFromDistributor() + if (!response) { + throw new Error('Failed to fetch total data from distributor') + } + const { totalCycles } = response + const lastLocalCycles = await CycleDB.queryLatestCycleRecords(1) + const lastLocalCycle = lastLocalCycles.length > 0 ? 
lastLocalCycles[0].counter : -1 + + // Always sync genesis accounts first + if (lastLocalCycle === 0) { + console.log('Syncing genesis accounts...') + await downloadAndSyncGenesisAccounts() + } + + // Check if this is a fresh start + const isFreshStart = lastLocalCycle === -1 || lastLocalCycle === 0 + + if (isFreshStart) { + // Fresh start - no checkpoint needed, just sync from beginning + console.log('🆕 Fresh start detected - syncing from cycle 0') + const parallelDataSync = new ParallelDataSync({ + concurrency: config.parallelSyncConcurrency, + retryAttempts: 3, + retryDelayMs: 1000, + }) + + await parallelDataSync.startSyncing(0, totalCycles - 1) + } else { + // Existing data - use DataSyncManager to identify and patch gaps/mismatches + console.log('📊 Existing data detected - running recovery analysis') + const recoveryPlan = await this.generateRecoveryPlan(totalCycles) + + // Execute the complete sync (recovery + normal sync) + await this.executeSyncWithRecovery(recoveryPlan) + } + } + + /** + * Detect data anomalies by verifying last 10-15 cycles against distributor + * Throws error if critical anomalies are found + * Fetches local cycle data internally + */ + async detectDataAnomalies(): Promise<{ lastLocalCycle: number; currentDistributorCycle: number }> { + console.log('\n📊 Running data anomaly detection...') + + // Fetch local and distributor cycle info + const lastLocalCycles = await CycleDB.queryLatestCycleRecords(1) + const lastLocalCycle = lastLocalCycles.length > 0 ? 
lastLocalCycles[0].counter : -1 + + const response = await this.getTotalDataFromDistributor() + if (!response) { + throw new Error('Failed to fetch distributor cycle info') + } + const currentDistributorCycle = response.totalCycles + + console.log(`Last local cycle: ${lastLocalCycle}`) + console.log(`Current distributor cycle: ${currentDistributorCycle}`) + + const anomalies: string[] = [] + + // Anomaly 1: Local DB has more cycles than distributor + if (lastLocalCycle > currentDistributorCycle) { + anomalies.push( + `Local DB has newer cycle than distributor (Local: ${lastLocalCycle}, Distributor: ${currentDistributorCycle})` + ) + } + + // Anomaly 2: Verify last 10-15 cycles match with distributor + if (lastLocalCycle >= 15) { + const verificationCycles = 15 + const startCycle = lastLocalCycle - verificationCycles + 1 + const endCycle = lastLocalCycle + + console.log( + `Verifying last ${verificationCycles} cycles (${startCycle} to ${endCycle}) against distributor...` + ) + + try { + // Compare cycles data + const localCycles = await CycleDB.queryCycleRecordsBetween(startCycle, endCycle) + const distributorResponse = await queryFromDistributor(DataType.CYCLE, { + start: startCycle, + end: endCycle, + }) + + if (distributorResponse?.data?.cycleInfo) { + const distributorCycles = distributorResponse.data.cycleInfo + + // Check if cycle counts match + if (localCycles.length !== distributorCycles.length) { + anomalies.push( + `Cycle count mismatch in range ${startCycle}-${endCycle}: ` + + `Local has ${localCycles.length}, Distributor has ${distributorCycles.length}` + ) + } else { + // Verify each cycle's marker matches + for (let i = 0; i < localCycles.length; i++) { + /* eslint-disable security/detect-object-injection */ + const localCycle = localCycles[i] + /* eslint-enable security/detect-object-injection */ + const distributorCycle = distributorCycles.find( + (c: { counter: number; marker: string }) => c.counter === localCycle.counter + ) + + if 
(!distributorCycle) { + anomalies.push(`Cycle ${localCycle.counter} exists locally but not in distributor`) + } else if (localCycle.cycleMarker !== distributorCycle.marker) { + anomalies.push( + `Cycle ${localCycle.counter} marker mismatch: ` + + `Local ${localCycle.cycleMarker} vs Distributor ${distributorCycle.marker}` + ) + } + } + } + } + + // Compare receipts count + const receiptsResponse = await queryFromDistributor(DataType.RECEIPT, { + startCycle, + endCycle, + type: 'tally', + }) + + if (receiptsResponse?.data?.receipts) { + const distributorReceipts: { cycle: number; receipts: number }[] = receiptsResponse.data.receipts + const localReceiptsCount = await ReceiptDB.queryReceiptCountByCycles(startCycle, endCycle) + + for (const distReceipt of distributorReceipts) { + const localReceipt = localReceiptsCount.find((r) => r.cycle === distReceipt.cycle) + if (localReceipt && localReceipt.receipts !== distReceipt.receipts) { + anomalies.push( + `Receipts count mismatch in cycle ${distReceipt.cycle}: ` + + `Local has ${localReceipt.receipts}, Distributor has ${distReceipt.receipts}` + ) + } + } + } + + // Compare originalTxs count + const originalTxsResponse = await queryFromDistributor(DataType.ORIGINALTX, { + startCycle, + endCycle, + type: 'tally', + }) + + if (originalTxsResponse?.data?.originalTxs) { + const distributorOriginalTxs: { cycle: number; originalTxsData: number }[] = + originalTxsResponse.data.originalTxs + const localOriginalTxsCount = await OriginalTxDataDB.queryOriginalTxDataCountByCycles( + startCycle, + endCycle + ) + + for (const distTx of distributorOriginalTxs) { + const localTx = localOriginalTxsCount.find((t) => t.cycle === distTx.cycle) + if (localTx && localTx.originalTxsData !== distTx.originalTxsData) { + anomalies.push( + `OriginalTxs count mismatch in cycle ${distTx.cycle}: ` + + `Local has ${localTx.originalTxsData}, Distributor has ${distTx.originalTxsData}` + ) + } + } + } + } catch (error) { + console.warn('Warning: Could not 
complete anomaly verification:', error) + // Don't fail on verification errors, just warn + } + } + + if (anomalies.length > 0) { + console.error('\n❌ DATA ANOMALIES DETECTED:') + anomalies.forEach((anomaly) => console.error(` - ${anomaly}`)) + throw new Error( + 'Data anomalies detected! Local database may be corrupted or out of sync. ' + + 'Please clear the database and restart the server.' + ) + } + + console.log('✅ No data anomalies detected') + + return { lastLocalCycle, currentDistributorCycle } + } + + /** + * Fetch total data count from distributor + */ + private async getTotalDataFromDistributor(): Promise<{ + totalCycles: number + totalAccounts: number + totalReceipts: number + totalOriginalTxs: number + } | null> { + const response = await queryFromDistributor(DataType.TOTALDATA, {}) + if (!response?.data || response.data.totalCycles === undefined) { + return null + } + return response.data + } + + /** + * Identify all missing cycle ranges by finding gaps in the cycles DB + * + * Example: + * - DB has cycles: 0-149999, 300001-300099, 300106-300200 + * - Returns gaps: [{150000, 300000}, {300100, 300105}] + */ + private async identifyMissingCycleRanges(targetCycle: number): Promise { + try { + console.log(`\n${'='.repeat(60)}`) + console.log(`Identifying missing cycle ranges up to cycle ${targetCycle}`) + console.log(`${'='.repeat(60)}`) + + // Get all cycles from DB ordered by counter + const allCycles = await CycleDB.queryCycleRecordsBetween(0, targetCycle) + + if (!allCycles || allCycles.length === 0) { + // No cycles in DB, everything from 0 to targetCycle is missing + console.log('No cycles found in DB, entire range is missing') + return [ + { + startCycle: 0, + endCycle: targetCycle, + gapSize: targetCycle + 1, + }, + ] + } + + const gaps: CycleGap[] = [] + const cycleNumbers = allCycles.map((c) => c.counter).sort((a, b) => a - b) + + console.log(`Found ${cycleNumbers.length} cycles in DB`) + console.log(`First cycle: ${cycleNumbers[0]}, Last cycle: 
${cycleNumbers[cycleNumbers.length - 1]}`) + + // Check if there's a gap at the beginning + if (cycleNumbers[0] > 0) { + gaps.push({ + startCycle: 0, + endCycle: cycleNumbers[0] - 1, + gapSize: cycleNumbers[0], + }) + console.log(`Gap found at beginning: 0 to ${cycleNumbers[0] - 1}`) + } + + // Find gaps in the middle + for (let i = 0; i < cycleNumbers.length - 1; i++) { + const currentCycle = cycleNumbers[i] + const nextCycle = cycleNumbers[i + 1] + + // If next cycle is not immediately after current, there's a gap + if (nextCycle - currentCycle > 1) { + const gapStart = currentCycle + 1 + const gapEnd = nextCycle - 1 + gaps.push({ + startCycle: gapStart, + endCycle: gapEnd, + gapSize: gapEnd - gapStart + 1, + }) + console.log(`Gap found: ${gapStart} to ${gapEnd} (${gapEnd - gapStart + 1} cycles)`) + } + } + + // Check if there's a gap at the end + const lastLocalCycle = cycleNumbers[cycleNumbers.length - 1] + if (lastLocalCycle < targetCycle) { + gaps.push({ + startCycle: lastLocalCycle + 1, + endCycle: targetCycle, + gapSize: targetCycle - lastLocalCycle, + }) + console.log(`Gap found at end: ${lastLocalCycle + 1} to ${targetCycle}`) + } + + console.log(`\nTotal gaps found: ${gaps.length}`) + const totalMissing = gaps.reduce((sum, gap) => sum + gap.gapSize, 0) + console.log(`Total missing cycles: ${totalMissing}`) + + return gaps + } catch (error) { + console.error('Error identifying missing cycle ranges:', error) + throw error + } + } + + /** + * Verify data integrity with lookback window before each gap + * + * For each gap, check cyclesPerBatch * parallelSyncConcurrency cycles before the gap + * to ensure transaction data matches the distributor. 
+ * + * Example: Gap at 150000, lookback 100 cycles -> verify 149900-150000 + */ + private async verifyDataIntegrityWithLookback(gaps: CycleGap[]): Promise { + try { + console.log(`\n${'='.repeat(60)}`) + console.log(`Verifying data integrity with ${this.lookbackCycles}-cycle lookback window`) + console.log(`${'='.repeat(60)}`) + + const allMismatchedCycles: MismatchedCycle[] = [] + const verificationRanges: CycleGap[] = [] + + // Build verification ranges for each gap + for (const gap of gaps) { + const lookbackStart = Math.max(0, gap.startCycle - this.lookbackCycles) + const lookbackEnd = gap.startCycle - 1 + + // Only verify if there's a valid lookback range + if (lookbackEnd >= lookbackStart && lookbackEnd >= 0) { + verificationRanges.push({ + startCycle: lookbackStart, + endCycle: lookbackEnd, + gapSize: lookbackEnd - lookbackStart + 1, + }) + console.log( + `Verification range for gap at ${gap.startCycle}: cycles ${lookbackStart}-${lookbackEnd}` + ) + } + } + + // Deduplicate overlapping verification ranges + const mergedRanges = this.mergeOverlappingRanges(verificationRanges) + console.log(`Merged into ${mergedRanges.length} verification ranges`) + + // Verify each range + for (const range of mergedRanges) { + console.log(`\nVerifying cycles ${range.startCycle} to ${range.endCycle}...`) + + const mismatched = await this.compareCycleDataWithDistributor(range.startCycle, range.endCycle) + allMismatchedCycles.push(...mismatched) + } + + if (allMismatchedCycles.length > 0) { + console.log(`\n⚠️ Found ${allMismatchedCycles.length} cycles with mismatched data:`) + for (const mismatch of allMismatchedCycles) { + console.log( + ` Cycle ${mismatch.cycle}: ` + + `Receipts (local: ${mismatch.localReceipts}, distributor: ${mismatch.distributorReceipts}), ` + + `OriginalTxs (local: ${mismatch.localOriginalTxs}, distributor: ${mismatch.distributorOriginalTxs})` + ) + } + } else { + console.log(`\n✅ All verified cycles have matching data`) + } + + return 
allMismatchedCycles + } catch (error) { + console.error('Error verifying data integrity:', error) + throw error + } + } + + /** + * Compare cycle data counts between local DB and distributor + */ + private async compareCycleDataWithDistributor( + startCycle: number, + endCycle: number + ): Promise { + const mismatched: MismatchedCycle[] = [] + + try { + // Fetch counts from distributor + const [receiptsResponse, originalTxsResponse] = await Promise.all([ + queryFromDistributor(DataType.RECEIPT, { startCycle, endCycle, type: 'tally' }), + queryFromDistributor(DataType.ORIGINALTX, { startCycle, endCycle, type: 'tally' }), + ]) + + if (!receiptsResponse?.data?.receipts || !originalTxsResponse?.data?.originalTxs) { + console.warn(`Failed to fetch distributor data for cycles ${startCycle}-${endCycle}`) + return mismatched + } + + const distributorReceipts: { cycle: number; receipts: number }[] = receiptsResponse.data.receipts + const distributorOriginalTxs: { cycle: number; originalTxsData: number }[] = + originalTxsResponse.data.originalTxs + + // Fetch counts from local DB + const [localReceipts, localOriginalTxs] = await Promise.all([ + ReceiptDB.queryReceiptCountByCycles(startCycle, endCycle), + OriginalTxDataDB.queryOriginalTxDataCountByCycles(startCycle, endCycle), + ]) + + // Create maps for easier lookup + const localReceiptsMap = new Map(localReceipts.map((r) => [r.cycle, r.receipts])) + const localOriginalTxsMap = new Map(localOriginalTxs.map((t) => [t.cycle, t.originalTxsData])) + + // Compare each cycle + const allCycles = new Set([ + ...distributorReceipts.map((r) => r.cycle), + ...distributorOriginalTxs.map((t) => t.cycle), + ]) + + for (const cycle of allCycles) { + const distReceipts = distributorReceipts.find((r) => r.cycle === cycle)?.receipts || 0 + const distOriginalTxs = distributorOriginalTxs.find((t) => t.cycle === cycle)?.originalTxsData || 0 + + const localReceiptsCount = localReceiptsMap.get(cycle) || 0 + const localOriginalTxsCount = 
localOriginalTxsMap.get(cycle) || 0 + + const receiptsMismatch = localReceiptsCount !== distReceipts + const originalTxsMismatch = localOriginalTxsCount !== distOriginalTxs + + if (receiptsMismatch || originalTxsMismatch) { + mismatched.push({ + cycle, + localReceipts: localReceiptsCount, + distributorReceipts: distReceipts, + localOriginalTxs: localOriginalTxsCount, + distributorOriginalTxs: distOriginalTxs, + receiptsMismatch, + originalTxsMismatch, + }) + } + } + + return mismatched + } catch (error) { + console.error(`Error comparing data for cycles ${startCycle}-${endCycle}:`, error) + return mismatched + } + } + + /** + * Merge overlapping or adjacent ranges to minimize API calls + */ + private mergeOverlappingRanges(ranges: CycleGap[]): CycleGap[] { + if (ranges.length === 0) return [] + + // Sort by start cycle + const sorted = [...ranges].sort((a, b) => a.startCycle - b.startCycle) + const merged: CycleGap[] = [sorted[0]] + + for (let i = 1; i < sorted.length; i++) { + const current = sorted[i] + const last = merged[merged.length - 1] + + // If current range overlaps or is adjacent to last range, merge them + if (current.startCycle <= last.endCycle + 1) { + last.endCycle = Math.max(last.endCycle, current.endCycle) + last.gapSize = last.endCycle - last.startCycle + 1 + } else { + merged.push(current) + } + } + + return merged + } + + /** + * Generate comprehensive recovery plan + * + * Orchestrates gap detection and data verification to create a complete recovery strategy. + * NOTE: This should only be called when there's existing data in DB (not fresh start) + */ + async generateRecoveryPlan(currentDistributorCycle: number): Promise { + try { + const lastLocalCycles = await CycleDB.queryLatestCycleRecords(1) + const lastLocalCycle = lastLocalCycles.length > 0 ? 
lastLocalCycles[0].counter : -1 + + console.log(`\n${'='.repeat(70)}`) + console.log(`GENERATING DATA SYNC RECOVERY PLAN`) + console.log(`${'='.repeat(70)}`) + console.log(`Current distributor cycle: ${currentDistributorCycle}`) + console.log(`Last local cycle: ${lastLocalCycle}`) + + // Step 1: Identify missing cycle ranges + const missingCycleRanges = await this.identifyMissingCycleRanges(currentDistributorCycle) + + // Step 2: Verify data integrity with lookback (only if there are gaps) + const mismatchedCycles = + missingCycleRanges.length > 0 ? await this.verifyDataIntegrityWithLookback(missingCycleRanges) : [] + + // Calculate lookback ranges for reporting + const lookbackVerificationRanges: CycleGap[] = [] + for (const gap of missingCycleRanges) { + const lookbackStart = Math.max(0, gap.startCycle - this.lookbackCycles) + const lookbackEnd = gap.startCycle - 1 + if (lookbackEnd >= lookbackStart && lookbackEnd >= 0) { + lookbackVerificationRanges.push({ + startCycle: lookbackStart, + endCycle: lookbackEnd, + gapSize: lookbackEnd - lookbackStart + 1, + }) + } + } + + const totalMissingCycles = missingCycleRanges.reduce((sum, gap) => sum + gap.gapSize, 0) + const recoveryNeeded = missingCycleRanges.length > 0 || mismatchedCycles.length > 0 + + const plan: DataSyncRecoveryPlan = { + currentDistributorCycle, + lastLocalCycle, + missingCycleRanges, + mismatchedCycles, + lookbackVerificationRanges, + totalMissingCycles, + totalMismatchedCycles: mismatchedCycles.length, + recoveryNeeded, + } + + this.printRecoveryPlan(plan) + + return plan + } catch (error) { + console.error('Error generating recovery plan:', error) + throw error + } + } + + /** + * Execute comprehensive sync with recovery + * + * Combines all sync needs (mismatched cycles + missing ranges) and uses ParallelDataSync + * for everything. No distinction between "patching" and "syncing" - both use the same mechanism. 
+ */ + async executeSyncWithRecovery(recoveryPlan: DataSyncRecoveryPlan): Promise { + console.log(`\n${'='.repeat(70)}`) + console.log(`EXECUTING DATA SYNC WITH RECOVERY`) + console.log(`${'='.repeat(70)}`) + + try { + // Combine mismatched cycles and missing ranges into unified sync plan + const allRangesToSync: CycleGap[] = [] + + // Step 1: Add mismatched cycles (convert to ranges) + if (recoveryPlan.mismatchedCycles.length > 0) { + console.log(`\n📝 Identified ${recoveryPlan.mismatchedCycles.length} mismatched cycles to patch`) + const patchRanges = this.groupCyclesIntoRanges(recoveryPlan.mismatchedCycles.map((m) => m.cycle)) + allRangesToSync.push(...patchRanges) + } + + // Step 2: Add missing cycle ranges + if (recoveryPlan.missingCycleRanges.length > 0) { + console.log(`\n📥 Identified ${recoveryPlan.missingCycleRanges.length} missing cycle ranges to sync`) + allRangesToSync.push(...recoveryPlan.missingCycleRanges) + } + + // Step 3: Merge and deduplicate ranges + const mergedRanges = this.mergeOverlappingRanges(allRangesToSync) + console.log(`\nMerged into ${mergedRanges.length} sync ranges`) + + // Step 4: Execute ParallelDataSync for all ranges + if (mergedRanges.length > 0) { + for (const range of mergedRanges) { + console.log(`\nSyncing range: ${range.startCycle} to ${range.endCycle} (${range.gapSize} cycles)`) + + const parallelSync = new ParallelDataSync({ + concurrency: config.parallelSyncConcurrency || 10, + retryAttempts: 3, + retryDelayMs: 1000, + }) + + await parallelSync.startSyncing(range.startCycle, range.endCycle) + console.log(`✅ Completed range ${range.startCycle} to ${range.endCycle}`) + } + } else { + console.log('\n✅ No data to sync, database is up to date') + } + + console.log(`\n${'='.repeat(70)}`) + console.log(`✅ DATA SYNC COMPLETED SUCCESSFULLY`) + console.log(`${'='.repeat(70)}\n`) + } catch (error) { + console.error('Error executing sync with recovery:', error) + throw error + } + } + + /** + * Group individual cycles into 
consecutive ranges + */ + private groupCyclesIntoRanges(cycles: number[]): CycleGap[] { + if (cycles.length === 0) return [] + + const sorted = [...cycles].sort((a, b) => a - b) + const ranges: CycleGap[] = [] + let rangeStart = sorted[0] + let rangeEnd = sorted[0] + + for (let i = 1; i < sorted.length; i++) { + if (sorted[i] === rangeEnd + 1) { + // Consecutive cycle, extend range + rangeEnd = sorted[i] + } else { + // Gap found, save current range and start new one + ranges.push({ + startCycle: rangeStart, + endCycle: rangeEnd, + gapSize: rangeEnd - rangeStart + 1, + }) + rangeStart = sorted[i] + rangeEnd = sorted[i] + } + } + + // Add last range + ranges.push({ + startCycle: rangeStart, + endCycle: rangeEnd, + gapSize: rangeEnd - rangeStart + 1, + }) + + return ranges + } + + /** + * Print recovery plan summary + */ + private printRecoveryPlan(plan: DataSyncRecoveryPlan): void { + console.log(`\n${'='.repeat(70)}`) + console.log(`RECOVERY PLAN SUMMARY`) + console.log(`${'='.repeat(70)}`) + console.log(`Current distributor cycle: ${plan.currentDistributorCycle}`) + console.log(`Last local cycle: ${plan.lastLocalCycle}`) + console.log(`Recovery needed: ${plan.recoveryNeeded ? 
'⚠️ YES' : '✅ NO'}`) + console.log(``) + console.log(`Missing Cycle Ranges: ${plan.missingCycleRanges.length}`) + console.log(`Total missing cycles: ${plan.totalMissingCycles}`) + if (plan.missingCycleRanges.length > 0) { + for (const gap of plan.missingCycleRanges) { + console.log(` - Cycles ${gap.startCycle} to ${gap.endCycle} (${gap.gapSize} cycles)`) + } + } + console.log(``) + console.log(`Mismatched Cycles: ${plan.totalMismatchedCycles}`) + if (plan.mismatchedCycles.length > 0) { + for (const mismatch of plan.mismatchedCycles.slice(0, 10)) { + // Show first 10 + console.log( + ` - Cycle ${mismatch.cycle}: ` + + `Receipts ${mismatch.localReceipts}→${mismatch.distributorReceipts}, ` + + `OriginalTxs ${mismatch.localOriginalTxs}→${mismatch.distributorOriginalTxs}` + ) + } + if (plan.mismatchedCycles.length > 10) { + console.log(` ... and ${plan.mismatchedCycles.length - 10} more`) + } + } + console.log(``) + console.log(`Lookback Verification:`) + for (const range of plan.lookbackVerificationRanges) { + console.log(` - Verified cycles ${range.startCycle} to ${range.endCycle}`) + } + console.log(`${'='.repeat(70)}\n`) + } +} diff --git a/src/collector.ts b/src/collector.ts index dfdb04d..0cf573e 100644 --- a/src/collector.ts +++ b/src/collector.ts @@ -28,7 +28,7 @@ import RMQCyclesConsumer from './collectors/rmq/cycles' import RMQOriginalTxsConsumer from './collectors/rmq/original_txs' import RMQReceiptsConsumer from './collectors/rmq/receipts' import { setupCollectorSocketServer } from './collectorServer' -import { ParallelDataSync } from './class/ParallelDataSync' +import { DataSyncManager } from './class/DataSyncManager' const DistributorFirehoseEvent = 'FIREHOSE' let ws: WebSocket @@ -79,6 +79,7 @@ if (config.env == envEnum.DEV) { } export const checkAndSyncData = async (): Promise<() => Promise> => { + console.log('Using legacy sequential sync strategy') // Check if there is any existing data in the db let lastStoredReceiptCount = await 
ReceiptDB.queryReceiptCount() let lastStoredOriginalTxDataCount = await OriginalTxDataDB.queryOriginalTxDataCount() @@ -226,26 +227,6 @@ export const checkAndSyncData = async (): Promise<() => Promise> => { // If there is already some data in the db, we can assume that the genesis accounts data has been synced already if (lastStoredCycleCount === 0) await downloadAndSyncGenesisAccounts() // To sync accounts data that are from genesis accounts/accounts data that the network start with - // Use parallel sync if enabled (default) - if (config.useParallelSync) { - console.log('\n') - console.log('='.repeat(60)) - console.log('Using NEW EFFICIENT PARALLEL SYNC STRATEGY based on cycle batches!') - console.log('This strategy is more robust and provides 10x+ performance improvement') - console.log('='.repeat(60)) - console.log('\n') - - const parallelDataSync = new ParallelDataSync({ - concurrency: config.parallelSyncConcurrency, - retryAttempts: 3, - retryDelayMs: 1000, - }) - - await parallelDataSync.startSyncing(lastStoredCycleCount, totalCyclesToSync) - return - } - - console.log('Using legacy sequential sync strategy') // Sync receipts and originalTxsData data first if there is old data if ( lastStoredReceiptCycle > 0 && @@ -275,6 +256,30 @@ export const checkAndSyncData = async (): Promise<() => Promise> => { return syncData } +export const startDataSyncManager = async (): Promise<() => Promise> => { + console.log('\n') + console.log('='.repeat(60)) + console.log('INITIALIZING DATA SYNC MANAGER') + console.log('='.repeat(60)) + console.log('DataSyncManager provides intelligent data synchronization with:') + console.log(' • Early data anomaly detection before sync operations') + console.log(' • Automatic gap detection and recovery') + console.log(' • Lookback verification window for data integrity') + console.log(' • Parallel batch-cycle-based sync (10x+ performance improvement)') + console.log('='.repeat(60)) + console.log('\n') + + // Run anomaly detection BEFORE 
connecting to websocket + // This fails fast if there are data corruption issues + const syncManager = new DataSyncManager() + await syncManager.detectDataAnomalies() + + console.log('✅ Data anomaly check passed - proceeding with sync') + + // Return the sync function to be executed after WS connection + return syncManager.syncData +} + const attemptReconnection = (): void => { console.log(`Re-connecting Distributor in ${config.DISTRIBUTOR_RECONNECT_INTERVAL / 1000}s...`) reconnecting = true @@ -394,7 +399,7 @@ const startServer = async (): Promise => { await Storage.initializeDB() addExitListeners() - const syncData = await checkAndSyncData() + const syncData = config.useParallelSync ? await startDataSyncManager() : await checkAndSyncData() if (config.dataLogWrite) await initDataLogWriter() addSigListeners() From 176febf323d2d9144bd466bd0eab533b202ed234 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Fri, 7 Nov 2025 21:24:27 +0800 Subject: [PATCH 06/14] refactor: optimize data sync with improved anomaly detection and gap finding - Move CycleGap interface to storage/cycle.ts and implement efficient SQL-based gap detection - Enhance anomaly detection with better error handling and validation logic - Add sync summary functionality to DataSyncManager with database statistics - Remove ParallelSyncCheckpoint dependency and simplify ParallelDataSync - Improve logging and error messages throughout sync process --- src/class/DataSyncManager.ts | 330 ++++++++++++++-------------- src/class/ParallelDataSync.ts | 12 +- src/class/ParallelSyncCheckpoint.ts | 315 -------------------------- src/collector.ts | 6 +- src/storage/cycle.ts | 92 ++++++++ 5 files changed, 262 insertions(+), 493 deletions(-) delete mode 100644 src/class/ParallelSyncCheckpoint.ts diff --git a/src/class/DataSyncManager.ts b/src/class/DataSyncManager.ts index b91b27a..f62ba52 100644 --- a/src/class/DataSyncManager.ts +++ b/src/class/DataSyncManager.ts @@ -1,17 +1,9 @@ import { CycleDB, ReceiptDB, 
OriginalTxDataDB } from '../storage' +import { CycleGap } from '../storage/cycle' import { config } from '../config' import { queryFromDistributor, DataType, downloadAndSyncGenesisAccounts } from './DataSync' import { ParallelDataSync } from './ParallelDataSync' -/** - * Represents a gap in cycle sequence - */ -export interface CycleGap { - startCycle: number - endCycle: number - gapSize: number -} - /** * Represents a cycle with mismatched transaction data */ @@ -107,6 +99,9 @@ export class DataSyncManager { }) await parallelDataSync.startSyncing(0, totalCycles - 1) + + // Print final database summary + await this.printSyncSummary() } else { // Existing data - use DataSyncManager to identify and patch gaps/mismatches console.log('📊 Existing data detected - running recovery analysis') @@ -122,12 +117,16 @@ export class DataSyncManager { * Throws error if critical anomalies are found * Fetches local cycle data internally */ - async detectDataAnomalies(): Promise<{ lastLocalCycle: number; currentDistributorCycle: number }> { - console.log('\n📊 Running data anomaly detection...') - - // Fetch local and distributor cycle info + async detectDataAnomalies(): Promise { + // Fetch local cycle data const lastLocalCycles = await CycleDB.queryLatestCycleRecords(1) const lastLocalCycle = lastLocalCycles.length > 0 ? 
lastLocalCycles[0].counter : -1 + if (lastLocalCycle === -1) { + console.log('No local data found, skipping anomaly detection') + return + } + + console.log('\n📊 Running data anomaly detection...') const response = await this.getTotalDataFromDistributor() if (!response) { @@ -138,129 +137,113 @@ export class DataSyncManager { console.log(`Last local cycle: ${lastLocalCycle}`) console.log(`Current distributor cycle: ${currentDistributorCycle}`) - const anomalies: string[] = [] - // Anomaly 1: Local DB has more cycles than distributor if (lastLocalCycle > currentDistributorCycle) { - anomalies.push( + throw new Error( `Local DB has newer cycle than distributor (Local: ${lastLocalCycle}, Distributor: ${currentDistributorCycle})` ) } - // Anomaly 2: Verify last 10-15 cycles match with distributor - if (lastLocalCycle >= 15) { - const verificationCycles = 15 - const startCycle = lastLocalCycle - verificationCycles + 1 - const endCycle = lastLocalCycle + const verificationCycles = 15 - console.log( - `Verifying last ${verificationCycles} cycles (${startCycle} to ${endCycle}) against distributor...` - ) + // Anomaly 2: Verify last 15 cycles match with distributor + let startCycle = lastLocalCycle - verificationCycles + 1 + if (startCycle < 0) { + startCycle = 0 + } + const endCycle = lastLocalCycle - try { - // Compare cycles data - const localCycles = await CycleDB.queryCycleRecordsBetween(startCycle, endCycle) - const distributorResponse = await queryFromDistributor(DataType.CYCLE, { - start: startCycle, - end: endCycle, - }) + console.log( + `Verifying last ${verificationCycles} cycles (${startCycle} to ${endCycle}) against distributor...` + ) + + try { + // Compare cycles data + const localCycles = await CycleDB.queryCycleRecordsBetween(startCycle, endCycle) + const distributorResponse = await queryFromDistributor(DataType.CYCLE, { + start: startCycle, + end: endCycle, + }) - if (distributorResponse?.data?.cycleInfo) { - const distributorCycles = 
distributorResponse.data.cycleInfo + if (distributorResponse?.data?.cycleInfo) { + const distributorCycles = distributorResponse.data.cycleInfo - // Check if cycle counts match - if (localCycles.length !== distributorCycles.length) { - anomalies.push( - `Cycle count mismatch in range ${startCycle}-${endCycle}: ` + - `Local has ${localCycles.length}, Distributor has ${distributorCycles.length}` + // Verify each cycle's marker matches + for (let i = 0; i < localCycles.length; i++) { + /* eslint-disable security/detect-object-injection */ + const localCycle = localCycles[i] + /* eslint-enable security/detect-object-injection */ + const distributorCycle = distributorCycles.find( + (c: { counter: number; marker: string }) => c.counter === localCycle.counter + ) + + if (!distributorCycle) { + throw new Error(`Cycle ${localCycle.counter} exists locally but not in distributor`) + } else if (localCycle.cycleMarker !== distributorCycle.marker) { + throw new Error( + `Cycle ${localCycle.counter} marker mismatch: ` + + `Local ${localCycle.cycleMarker} vs Distributor ${distributorCycle.marker}` ) - } else { - // Verify each cycle's marker matches - for (let i = 0; i < localCycles.length; i++) { - /* eslint-disable security/detect-object-injection */ - const localCycle = localCycles[i] - /* eslint-enable security/detect-object-injection */ - const distributorCycle = distributorCycles.find( - (c: { counter: number; marker: string }) => c.counter === localCycle.counter - ) - - if (!distributorCycle) { - anomalies.push(`Cycle ${localCycle.counter} exists locally but not in distributor`) - } else if (localCycle.cycleMarker !== distributorCycle.marker) { - anomalies.push( - `Cycle ${localCycle.counter} marker mismatch: ` + - `Local ${localCycle.cycleMarker} vs Distributor ${distributorCycle.marker}` - ) - } - } } } + } - // Compare receipts count - const receiptsResponse = await queryFromDistributor(DataType.RECEIPT, { - startCycle, - endCycle, - type: 'tally', - }) + // Compare 
receipts count + const receiptsResponse = await queryFromDistributor(DataType.RECEIPT, { + startCycle, + endCycle, + type: 'tally', + }) + + if (receiptsResponse?.data?.receipts) { + const distributorReceipts: { cycle: number; receipts: number }[] = receiptsResponse.data.receipts + const localReceiptsCount = await ReceiptDB.queryReceiptCountByCycles(startCycle, endCycle) - if (receiptsResponse?.data?.receipts) { - const distributorReceipts: { cycle: number; receipts: number }[] = receiptsResponse.data.receipts - const localReceiptsCount = await ReceiptDB.queryReceiptCountByCycles(startCycle, endCycle) - - for (const distReceipt of distributorReceipts) { - const localReceipt = localReceiptsCount.find((r) => r.cycle === distReceipt.cycle) - if (localReceipt && localReceipt.receipts !== distReceipt.receipts) { - anomalies.push( - `Receipts count mismatch in cycle ${distReceipt.cycle}: ` + - `Local has ${localReceipt.receipts}, Distributor has ${distReceipt.receipts}` - ) - } + for (const distReceipt of distributorReceipts) { + const localReceipt = localReceiptsCount.find((r) => r.cycle === distReceipt.cycle) + if (localReceipt && localReceipt.receipts > distReceipt.receipts) { + throw new Error( + `Receipts count in local DB has more in cycle ${distReceipt.cycle}: ` + + `Local has ${localReceipt.receipts}, Distributor has ${distReceipt.receipts}` + ) } } + } - // Compare originalTxs count - const originalTxsResponse = await queryFromDistributor(DataType.ORIGINALTX, { - startCycle, - endCycle, - type: 'tally', - }) + // Compare originalTxs count + const originalTxsResponse = await queryFromDistributor(DataType.ORIGINALTX, { + startCycle, + endCycle, + type: 'tally', + }) - if (originalTxsResponse?.data?.originalTxs) { - const distributorOriginalTxs: { cycle: number; originalTxsData: number }[] = - originalTxsResponse.data.originalTxs - const localOriginalTxsCount = await OriginalTxDataDB.queryOriginalTxDataCountByCycles( - startCycle, - endCycle - ) + if 
(originalTxsResponse?.data?.originalTxs) { + const distributorOriginalTxs: { cycle: number; originalTxsData: number }[] = + originalTxsResponse.data.originalTxs + const localOriginalTxsCount = await OriginalTxDataDB.queryOriginalTxDataCountByCycles( + startCycle, + endCycle + ) - for (const distTx of distributorOriginalTxs) { - const localTx = localOriginalTxsCount.find((t) => t.cycle === distTx.cycle) - if (localTx && localTx.originalTxsData !== distTx.originalTxsData) { - anomalies.push( - `OriginalTxs count mismatch in cycle ${distTx.cycle}: ` + - `Local has ${localTx.originalTxsData}, Distributor has ${distTx.originalTxsData}` - ) - } + for (const distTx of distributorOriginalTxs) { + const localTx = localOriginalTxsCount.find((t) => t.cycle === distTx.cycle) + if (localTx && localTx.originalTxsData > distTx.originalTxsData) { + throw new Error( + `OriginalTxs count mismatch in cycle ${distTx.cycle}: ` + + `Local has ${localTx.originalTxsData}, Distributor has ${distTx.originalTxsData}` + ) } } - } catch (error) { - console.warn('Warning: Could not complete anomaly verification:', error) - // Don't fail on verification errors, just warn } - } - - if (anomalies.length > 0) { - console.error('\n❌ DATA ANOMALIES DETECTED:') - anomalies.forEach((anomaly) => console.error(` - ${anomaly}`)) - throw new Error( - 'Data anomalies detected! Local database may be corrupted or out of sync. ' + - 'Please clear the database and restart the server.' + } catch (error) { + throw Error( + `Data anomalies detected! Local database may be corrupted or out of sync. ` + + `Please patch the database or clear the database and restart the server. 
` + + `Error: ${error}` ) } console.log('✅ No data anomalies detected') - - return { lastLocalCycle, currentDistributorCycle } } /** @@ -281,9 +264,11 @@ export class DataSyncManager { /** * Identify all missing cycle ranges by finding gaps in the cycles DB + * Uses efficient LEFT JOIN-based SQL query to find ranges directly - O(N) complexity * * Example: * - DB has cycles: 0-149999, 300001-300099, 300106-300200 + * - Missing ranges: 150000-300000, 300100-300105 * - Returns gaps: [{150000, 300000}, {300100, 300105}] */ private async identifyMissingCycleRanges(targetCycle: number): Promise { @@ -292,67 +277,34 @@ export class DataSyncManager { console.log(`Identifying missing cycle ranges up to cycle ${targetCycle}`) console.log(`${'='.repeat(60)}`) - // Get all cycles from DB ordered by counter - const allCycles = await CycleDB.queryCycleRecordsBetween(0, targetCycle) - - if (!allCycles || allCycles.length === 0) { - // No cycles in DB, everything from 0 to targetCycle is missing - console.log('No cycles found in DB, entire range is missing') - return [ - { - startCycle: 0, - endCycle: targetCycle, - gapSize: targetCycle + 1, - }, - ] - } - - const gaps: CycleGap[] = [] - const cycleNumbers = allCycles.map((c) => c.counter).sort((a, b) => a - b) - - console.log(`Found ${cycleNumbers.length} cycles in DB`) - console.log(`First cycle: ${cycleNumbers[0]}, Last cycle: ${cycleNumbers[cycleNumbers.length - 1]}`) - - // Check if there's a gap at the beginning - if (cycleNumbers[0] > 0) { - gaps.push({ - startCycle: 0, - endCycle: cycleNumbers[0] - 1, - gapSize: cycleNumbers[0], - }) - console.log(`Gap found at beginning: 0 to ${cycleNumbers[0] - 1}`) - } - - // Find gaps in the middle - for (let i = 0; i < cycleNumbers.length - 1; i++) { - const currentCycle = cycleNumbers[i] - const nextCycle = cycleNumbers[i + 1] - - // If next cycle is not immediately after current, there's a gap - if (nextCycle - currentCycle > 1) { - const gapStart = currentCycle + 1 - const gapEnd = 
nextCycle - 1 - gaps.push({ - startCycle: gapStart, - endCycle: gapEnd, - gapSize: gapEnd - gapStart + 1, - }) - console.log(`Gap found: ${gapStart} to ${gapEnd} (${gapEnd - gapStart + 1} cycles)`) + // Get missing cycle ranges directly from SQL using LEFT JOIN + const gaps = await CycleDB.queryMissingCycleRanges(targetCycle) + + // Handle case where no cycles exist in DB + if (gaps.length === 0) { + const cycleCount = await CycleDB.queryCycleCount() + if (cycleCount === 0) { + // No cycles in DB, entire range is missing + console.log('No cycles found in DB, entire range is missing') + return [ + { + startCycle: 0, + endCycle: targetCycle, + gapSize: targetCycle + 1, + }, + ] + } else { + // All cycles present + console.log('✅ No missing cycles - database is complete up to target cycle') + return [] } } - // Check if there's a gap at the end - const lastLocalCycle = cycleNumbers[cycleNumbers.length - 1] - if (lastLocalCycle < targetCycle) { - gaps.push({ - startCycle: lastLocalCycle + 1, - endCycle: targetCycle, - gapSize: targetCycle - lastLocalCycle, - }) - console.log(`Gap found at end: ${lastLocalCycle + 1} to ${targetCycle}`) - } - + // Log results console.log(`\nTotal gaps found: ${gaps.length}`) + for (const gap of gaps) { + console.log(` Gap: ${gap.startCycle} to ${gap.endCycle} (${gap.gapSize} cycles)`) + } const totalMissing = gaps.reduce((sum, gap) => sum + gap.gapSize, 0) console.log(`Total missing cycles: ${totalMissing}`) @@ -641,6 +593,9 @@ export class DataSyncManager { console.log(`\n${'='.repeat(70)}`) console.log(`✅ DATA SYNC COMPLETED SUCCESSFULLY`) console.log(`${'='.repeat(70)}\n`) + + // Print final database summary + await this.printSyncSummary() } catch (error) { console.error('Error executing sync with recovery:', error) throw error @@ -724,4 +679,47 @@ export class DataSyncManager { } console.log(`${'='.repeat(70)}\n`) } + + /** + * Get overall sync statistics from database + */ + async getSyncStats(): Promise<{ + totalCycles: number + 
totalReceipts: number + totalOriginalTxs: number + }> { + try { + const [cycleCount, receiptCount, originalTxCount] = await Promise.all([ + CycleDB.queryCycleCount(), + ReceiptDB.queryReceiptCount(), + OriginalTxDataDB.queryOriginalTxDataCount(), + ]) + + return { + totalCycles: cycleCount || 0, + totalReceipts: receiptCount || 0, + totalOriginalTxs: originalTxCount || 0, + } + } catch (error) { + console.error('Error getting sync stats:', error) + return { + totalCycles: 0, + totalReceipts: 0, + totalOriginalTxs: 0, + } + } + } + + /** + * Print sync summary + */ + async printSyncSummary(): Promise { + const stats = await this.getSyncStats() + console.log('='.repeat(60)) + console.log('Sync Summary:') + console.log(` Total Cycles: ${stats.totalCycles}`) + console.log(` Total Receipts: ${stats.totalReceipts}`) + console.log(` Total OriginalTxs: ${stats.totalOriginalTxs}`) + console.log('='.repeat(60)) + } } diff --git a/src/class/ParallelDataSync.ts b/src/class/ParallelDataSync.ts index 4f90513..f012aa3 100644 --- a/src/class/ParallelDataSync.ts +++ b/src/class/ParallelDataSync.ts @@ -4,7 +4,6 @@ import { Utils as StringUtils } from '@shardus/types' import { config, DISTRIBUTOR_URL } from '../config' import { queryFromDistributor, DataType } from './DataSync' import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' -import { ParallelSyncCheckpointManager } from './ParallelSyncCheckpoint' import { Cycle } from '../types' import axios, { AxiosInstance } from 'axios' import http from 'http' @@ -69,11 +68,9 @@ export interface SyncTxDataByCycleRange { * Implements the optimal sync strategy with: * - Cycle-level parallelization * - Composite cursor (timestamp + txId ) to prevent data loss - * - Automatic resume from database * - Work queue for load balancing */ export class ParallelDataSync { - private checkpointManager: ParallelSyncCheckpointManager private queue: PQueue private syncConfig: ParallelSyncConfig private stats: SyncStats @@ -82,7 +79,6 @@ export 
class ParallelDataSync { private axiosInstance: AxiosInstance constructor(syncConfig?: Partial) { - this.checkpointManager = new ParallelSyncCheckpointManager() this.syncConfig = { concurrency: syncConfig?.concurrency || config.parallelSyncConcurrency || 10, retryAttempts: syncConfig?.retryAttempts || config.syncRetryAttempts || 3, @@ -299,7 +295,7 @@ export class ParallelDataSync { this.stats.endTime = Date.now() // Summary - await this.printSummary() + await this.printSummary(startCycle, endCycle) } catch (error) { console.error('Fatal error in parallel sync:', error) this.stats.errors++ @@ -779,7 +775,7 @@ export class ParallelDataSync { /** * Print sync summary */ - private async printSummary(): Promise { + private async printSummary(startCycle: number, endCycle: number): Promise { const elapsedMs = (this.stats.endTime || Date.now()) - this.stats.startTime const elapsedSec = (elapsedMs / 1000).toFixed(2) const elapsedMin = (elapsedMs / 60000).toFixed(2) @@ -787,6 +783,7 @@ export class ParallelDataSync { console.log(`\n${'='.repeat(60)}`) console.log('Parallel Sync Complete!') console.log(`${'='.repeat(60)}`) + console.log(` Cycle Range: ${startCycle} → ${endCycle}`) console.log(` Cycles Synced: ${this.stats.completedCycles}/${this.stats.totalCycles}`) console.log(` Receipts Synced: ${this.stats.totalReceipts}`) console.log(` OriginalTxs Synced: ${this.stats.totalOriginalTxs}`) @@ -796,9 +793,6 @@ export class ParallelDataSync { ` Throughput: ${(this.stats.totalReceipts / (elapsedMs / 1000)).toFixed(0)} receipts/sec` ) console.log(`${'='.repeat(60)}\n`) - - // Print DB summary - await this.checkpointManager.printSyncSummary() } /** diff --git a/src/class/ParallelSyncCheckpoint.ts b/src/class/ParallelSyncCheckpoint.ts deleted file mode 100644 index 472a0d8..0000000 --- a/src/class/ParallelSyncCheckpoint.ts +++ /dev/null @@ -1,315 +0,0 @@ -import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' -import { config } from '../config' - -/** - * Composite 
cursor for tracking sync progress - * Uses both timestamp and ID to handle timestamp collisions - */ -export interface CompositeCursor { - timestamp: number - id: string // receiptId or txId -} - -/** - * Cycle resume information from database - */ -export interface CycleResumeInfo { - cycleNumber: number - startTimestamp: number - endTimestamp: number - receipts: { - lastTimestamp: number - lastId: string - count: number - } - originalTxs: { - lastTimestamp: number - lastId: string - count: number - } -} - -/** - * Manages sync state by querying the database - * No separate checkpoint storage needed - DB is source of truth - */ -export class ParallelSyncCheckpointManager { - /** - * Get the last completed cycle from database - */ - async getLastCompletedCycle(): Promise { - try { - const cycles = await CycleDB.queryLatestCycleRecords(1) - if (cycles && cycles.length > 0) { - return cycles[0].counter - } - return 0 - } catch (error) { - console.error('Error getting last completed cycle:', error) - return 0 - } - } - - /** - * Get resume cursor for receipts in a specific cycle - * Returns the last receipt's timestamp and ID, or cycle start if none exist - */ - async getReceiptsCursor(cycleNumber: number, cycleStartTimestamp: number): Promise { - try { - // Query last receipt for this cycle - const receipts = await ReceiptDB.queryReceipts({ - limit: 1, - startCycleNumber: cycleNumber, - }) - - if (receipts && receipts.length > 0) { - const lastReceipt = receipts[0] - return { - timestamp: lastReceipt.timestamp, - id: lastReceipt.receiptId, - } - } - - // No receipts found for this cycle, start from cycle beginning - return { - timestamp: cycleStartTimestamp, - id: '', - } - } catch (error) { - console.error(`Error getting receipts cursor for cycle ${cycleNumber}:`, error) - return { - timestamp: cycleStartTimestamp, - id: '', - } - } - } - - /** - * Get resume cursor for originalTxs in a specific cycle - */ - async getOriginalTxsCursor(cycleNumber: number, 
cycleStartTimestamp: number): Promise { - try { - // Query last originalTx for this cycle - const originalTxs = await OriginalTxDataDB.queryOriginalTxsData({ - limit: 1, // limit - startCycle: cycleNumber, // startCycle - }) - - if (originalTxs && originalTxs.length > 0) { - // Sort by timestamp DESC to get the last one - originalTxs.sort((a, b) => b.timestamp - a.timestamp) - const lastTx = originalTxs[0] - return { - timestamp: lastTx.timestamp, - id: lastTx.txId, - } - } - - // No originalTxs found for this cycle, start from cycle beginning - return { - timestamp: cycleStartTimestamp, - id: '', - } - } catch (error) { - console.error(`Error getting originalTxs cursor for cycle ${cycleNumber}:`, error) - return { - timestamp: cycleStartTimestamp, - id: '', - } - } - } - - /** - * Get counts of data already synced for a cycle - */ - async getCycleSyncStatus(cycleNumber: number): Promise<{ - receiptsCount: number - originalTxsCount: number - isComplete: boolean - }> { - try { - const [receiptsCountResult, originalTxsCountResult] = await Promise.all([ - ReceiptDB.queryReceiptCountByCycles(cycleNumber, cycleNumber), - OriginalTxDataDB.queryOriginalTxDataCountByCycles(cycleNumber, cycleNumber), - ]) - - const receiptsCount = - receiptsCountResult && receiptsCountResult.length > 0 ? receiptsCountResult[0].receipts : 0 - - const originalTxsCount = - originalTxsCountResult && originalTxsCountResult.length > 0 - ? 
originalTxsCountResult[0].originalTxsData - : 0 - - return { - receiptsCount, - originalTxsCount, - isComplete: false, // Determined by sync logic - } - } catch (error) { - console.error(`Error getting cycle sync status for cycle ${cycleNumber}:`, error) - return { - receiptsCount: 0, - originalTxsCount: 0, - isComplete: false, - } - } - } - - /** - * Determine which cycles need to be synced - * Compares local DB with distributor totals - */ - async getCyclesToSync(startCycle: number, endCycle: number): Promise { - try { - const lastLocalCycle = await this.getLastCompletedCycle() - - // If we have no local data, sync all cycles - if (lastLocalCycle === 0) { - const cyclesToSync: number[] = [] - for (let i = startCycle; i <= endCycle; i++) { - cyclesToSync.push(i) - } - return cyclesToSync - } - - // If endCycle is beyond what we have, sync from last local + 1 - if (endCycle > lastLocalCycle) { - const cyclesToSync: number[] = [] - for (let i = lastLocalCycle + 1; i <= endCycle; i++) { - cyclesToSync.push(i) - } - return cyclesToSync - } - - // All cycles already synced - return [] - } catch (error) { - console.error('Error determining cycles to sync:', error) - return [] - } - } - - /** - * Check if a cycle is fully synced by comparing counts with distributor - */ - async isCycleFullySynced( - cycleNumber: number, - expectedReceiptsCount: number, - expectedOriginalTxsCount: number - ): Promise { - try { - const status = await this.getCycleSyncStatus(cycleNumber) - - const receiptsMatch = status.receiptsCount === expectedReceiptsCount - const originalTxsMatch = status.originalTxsCount === expectedOriginalTxsCount - - if (config.verbose) { - console.log( - `Cycle ${cycleNumber} sync check: ` + - `receipts ${status.receiptsCount}/${expectedReceiptsCount}, ` + - `originalTxs ${status.originalTxsCount}/${expectedOriginalTxsCount}` - ) - } - - return receiptsMatch && originalTxsMatch - } catch (error) { - console.error(`Error checking if cycle ${cycleNumber} is fully 
synced:`, error) - return false - } - } - - /** - * Get detailed resume information for a specific cycle - */ - async getCycleResumeInfo( - cycleNumber: number, - cycleStartTimestamp: number, - cycleEndTimestamp: number - ): Promise { - const [receiptsCursor, originalTxsCursor, syncStatus] = await Promise.all([ - this.getReceiptsCursor(cycleNumber, cycleStartTimestamp), - this.getOriginalTxsCursor(cycleNumber, cycleStartTimestamp), - this.getCycleSyncStatus(cycleNumber), - ]) - - return { - cycleNumber, - startTimestamp: cycleStartTimestamp, - endTimestamp: cycleEndTimestamp, - receipts: { - lastTimestamp: receiptsCursor.timestamp, - lastId: receiptsCursor.id, - count: syncStatus.receiptsCount, - }, - originalTxs: { - lastTimestamp: originalTxsCursor.timestamp, - lastId: originalTxsCursor.id, - count: syncStatus.originalTxsCount, - }, - } - } - - /** - * Log sync progress - */ - logProgress( - cycleNumber: number, - dataType: 'receipts' | 'originalTxs', - itemsFetched: number, - totalItems: number - ): void { - const percentage = totalItems > 0 ? 
((totalItems / totalItems) * 100).toFixed(1) : '0.0' - console.log( - `[Cycle ${cycleNumber}] ${dataType}: +${itemsFetched} items (total: ${totalItems}, ${percentage}%)` - ) - } - - /** - * Get overall sync statistics from database - */ - async getSyncStats(): Promise<{ - totalCycles: number - totalReceipts: number - totalOriginalTxs: number - lastCycleNumber: number - }> { - try { - const [cycleCount, receiptCount, originalTxCount, lastCycle] = await Promise.all([ - CycleDB.queryCycleCount(), - ReceiptDB.queryReceiptCount(), - OriginalTxDataDB.queryOriginalTxDataCount(), - this.getLastCompletedCycle(), - ]) - - return { - totalCycles: cycleCount || 0, - totalReceipts: receiptCount || 0, - totalOriginalTxs: originalTxCount || 0, - lastCycleNumber: lastCycle, - } - } catch (error) { - console.error('Error getting sync stats:', error) - return { - totalCycles: 0, - totalReceipts: 0, - totalOriginalTxs: 0, - lastCycleNumber: 0, - } - } - } - - /** - * Print sync summary - */ - async printSyncSummary(): Promise { - const stats = await this.getSyncStats() - console.log('='.repeat(60)) - console.log('Sync Summary:') - console.log(` Total Cycles: ${stats.totalCycles}`) - console.log(` Total Receipts: ${stats.totalReceipts}`) - console.log(` Total OriginalTxs: ${stats.totalOriginalTxs}`) - console.log(` Last Cycle: ${stats.lastCycleNumber}`) - console.log('='.repeat(60)) - } -} diff --git a/src/collector.ts b/src/collector.ts index 0cf573e..836d737 100644 --- a/src/collector.ts +++ b/src/collector.ts @@ -271,13 +271,13 @@ export const startDataSyncManager = async (): Promise<() => Promise> => { // Run anomaly detection BEFORE connecting to websocket // This fails fast if there are data corruption issues - const syncManager = new DataSyncManager() - await syncManager.detectDataAnomalies() + const dataSyncManager = new DataSyncManager() + await dataSyncManager.detectDataAnomalies() console.log('✅ Data anomaly check passed - proceeding with sync') // Return the sync function 
to be executed after WS connection - return syncManager.syncData + return dataSyncManager.syncData } const attemptReconnection = (): void => { diff --git a/src/storage/cycle.ts b/src/storage/cycle.ts index 6ddc964..d295f7e 100644 --- a/src/storage/cycle.ts +++ b/src/storage/cycle.ts @@ -234,3 +234,95 @@ export async function queryCycleRecordsByTimestamp( return [] } } + +export interface CycleGap { + startCycle: number + endCycle: number + gapSize: number +} + +/** + * Efficiently query for missing cycle ranges + * Returns ranges of missing cycles from 0 to targetCycle + * Uses LEFT JOIN to find gaps between consecutive cycles - O(N) complexity + */ +export async function queryMissingCycleRanges(targetCycle: number): Promise { + try { + + // Get first and last cycle for edge gap detection + const firstCycleResult = (await db.get( + cycleDatabase, + 'SELECT MIN(counter) as first_cycle FROM cycles', + [] + )) as { first_cycle: number } | undefined + + const lastCycleResult = (await db.get( + cycleDatabase, + 'SELECT MAX(counter) as last_cycle FROM cycles WHERE counter <= ?', + [targetCycle] + )) as { last_cycle: number } | undefined + + const firstCycle = firstCycleResult?.first_cycle ?? 0 + const lastCycle = lastCycleResult?.last_cycle ?? -1 + + const ranges: CycleGap[] = [] + + // Check for gap at the beginning (0 to firstCycle - 1) + if (firstCycle > 0) { + ranges.push({ + startCycle: 0, + endCycle: firstCycle - 1, + gapSize: firstCycle, + }) + } + + // Find gaps in the middle using LEFT JOIN + // For each cycle c1, check if the next cycle (c1.counter + 1) exists + // If not, find where the gap ends by looking for the next existing cycle + const sql = ` + SELECT + c1.counter + 1 AS startCycle, + (SELECT MIN(c2.counter) - 1 + FROM cycles c2 + WHERE c2.counter > c1.counter AND c2.counter <= ?) AS endCycle + FROM cycles c1 + WHERE NOT EXISTS ( + SELECT 1 FROM cycles c3 + WHERE c3.counter = c1.counter + 1 + ) + AND c1.counter < ? 
+ ORDER BY c1.counter + ` + + const middleGaps = (await db.all(cycleDatabase, sql, [targetCycle, targetCycle])) as { + startCycle: number + endCycle: number + }[] + + // Add middle gaps with calculated gapSize (filter out null endCycle values) + for (const gap of middleGaps) { + if (gap.endCycle !== null && gap.endCycle >= gap.startCycle) { + ranges.push({ + startCycle: gap.startCycle, + endCycle: gap.endCycle, + gapSize: gap.endCycle - gap.startCycle + 1, + }) + } + } + + // Check for gap at the end (lastCycle + 1 to targetCycle) + if (lastCycle >= 0 && lastCycle < targetCycle) { + ranges.push({ + startCycle: lastCycle + 1, + endCycle: targetCycle, + gapSize: targetCycle - lastCycle, + }) + } + + if (config.verbose) console.log(`Found ${ranges.length} missing cycle ranges`) + return ranges + } catch (e) { + console.log('Error querying missing cycle ranges:', e) + throw e + } +} From eb68f9c9178b5e57382053a70801cf7691ffcb76 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Tue, 11 Nov 2025 00:34:33 +0800 Subject: [PATCH 07/14] Refactor ParallelDataSync to use unified data fetching and improve batch processing - Replace separate fetch methods with unified fetchDataFromDistributor method - Refactor startSyncing to accept pre-created cycle batches instead of range parameters - Add createCycleBatches method for better batch management separation - Optimize account/transaction processing by removing individual existence checks - Add batch querying for account timestamps to reduce database calls - Improve SQLite performance with increased cache size and memory-mapped I/O - Add network timing and compression metrics logging for better observability - Update DataSyncManager to use new batch-based sync approach - Increase default cyclesPerBatch from 10 to 100 for better throughput --- src/class/DataSync.ts | 2 + src/class/DataSyncManager.ts | 184 ++++++++---- src/class/ParallelDataSync.ts | 550 +++++++++++++++++----------------- src/collector.ts | 41 +-- src/config/index.ts | 
2 +- src/storage/account.ts | 41 +++ src/storage/receipt.ts | 65 ++-- src/storage/sqlite3storage.ts | 7 +- 8 files changed, 509 insertions(+), 383 deletions(-) diff --git a/src/class/DataSync.ts b/src/class/DataSync.ts index d6f4de5..4ad76cf 100644 --- a/src/class/DataSync.ts +++ b/src/class/DataSync.ts @@ -15,6 +15,7 @@ export enum DataType { } interface queryFromDistributorParameters { + count?: number start?: number end?: number page?: number @@ -64,6 +65,7 @@ export const queryFromDistributor = async ( const response = await axios.post(url, data, { headers: { 'Content-Type': 'application/json', + 'Accept-Encoding': 'gzip, deflate', // Request compressed responses }, timeout: 45000, transformResponse: (res) => { diff --git a/src/class/DataSyncManager.ts b/src/class/DataSyncManager.ts index f62ba52..eea0504 100644 --- a/src/class/DataSyncManager.ts +++ b/src/class/DataSyncManager.ts @@ -1,4 +1,5 @@ -import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' +import { P2P } from '@shardus/types' +import { CycleDB, ReceiptDB, OriginalTxDataDB, AccountDB, TransactionDB } from '../storage' import { CycleGap } from '../storage/cycle' import { config } from '../config' import { queryFromDistributor, DataType, downloadAndSyncGenesisAccounts } from './DataSync' @@ -59,6 +60,18 @@ export class DataSyncManager { private lookbackCycles: number constructor() { + console.log('\n') + console.log('='.repeat(60)) + console.log('INITIALIZING DATA SYNC MANAGER') + console.log('='.repeat(60)) + console.log('DataSyncManager provides intelligent data synchronization with:') + console.log(' • Early data anomaly detection before sync operations') + console.log(' • Automatic gap detection and recovery') + console.log(' • Lookback verification window for data integrity') + console.log(' • Parallel multi-cycle-based sync (10x+ performance improvement)') + console.log('='.repeat(60)) + console.log('\n') + // Calculate lookback window: cyclesPerBatch * parallelSyncConcurrency const 
cyclesPerBatch = config.cyclesPerBatch || 10 const concurrency = config.parallelSyncConcurrency || 10 @@ -72,40 +85,39 @@ export class DataSyncManager { * Handles both fresh start and recovery from interruptions */ async syncData(): Promise { - const response = await this.getTotalDataFromDistributor() - if (!response) { - throw new Error('Failed to fetch total data from distributor') + const latestDistributorCycle = await this.getLatestCycleFromDistributor() + if (!latestDistributorCycle) { + throw new Error('Failed to fetch latest cycle from distributor') } - const { totalCycles } = response const lastLocalCycles = await CycleDB.queryLatestCycleRecords(1) const lastLocalCycle = lastLocalCycles.length > 0 ? lastLocalCycles[0].counter : -1 - // Always sync genesis accounts first - if (lastLocalCycle === 0) { - console.log('Syncing genesis accounts...') - await downloadAndSyncGenesisAccounts() - } - // Check if this is a fresh start const isFreshStart = lastLocalCycle === -1 || lastLocalCycle === 0 if (isFreshStart) { // Fresh start - no checkpoint needed, just sync from beginning console.log('🆕 Fresh start detected - syncing from cycle 0') + // Always sync genesis accounts first + console.log('Syncing genesis accounts...') + await downloadAndSyncGenesisAccounts() + const parallelDataSync = new ParallelDataSync({ concurrency: config.parallelSyncConcurrency, retryAttempts: 3, retryDelayMs: 1000, }) - await parallelDataSync.startSyncing(0, totalCycles - 1) + const cycleBatches = await parallelDataSync.createCycleBatches(0, latestDistributorCycle) + + await parallelDataSync.startSyncing(cycleBatches) // Print final database summary await this.printSyncSummary() } else { // Existing data - use DataSyncManager to identify and patch gaps/mismatches console.log('📊 Existing data detected - running recovery analysis') - const recoveryPlan = await this.generateRecoveryPlan(totalCycles) + const recoveryPlan = await this.generateRecoveryPlan(latestDistributorCycle) // Execute 
the complete sync (recovery + normal sync) await this.executeSyncWithRecovery(recoveryPlan) @@ -128,11 +140,10 @@ export class DataSyncManager { console.log('\n📊 Running data anomaly detection...') - const response = await this.getTotalDataFromDistributor() - if (!response) { - throw new Error('Failed to fetch distributor cycle info') + const currentDistributorCycle = await this.getLatestCycleFromDistributor() + if (!currentDistributorCycle) { + throw new Error('Failed to fetch latest cycle from distributor') } - const currentDistributorCycle = response.totalCycles console.log(`Last local cycle: ${lastLocalCycle}`) console.log(`Current distributor cycle: ${currentDistributorCycle}`) @@ -235,6 +246,7 @@ export class DataSyncManager { } } } + console.log('✅ No data anomalies detected') } catch (error) { throw Error( `Data anomalies detected! Local database may be corrupted or out of sync. ` + @@ -243,7 +255,21 @@ export class DataSyncManager { ) } - console.log('✅ No data anomalies detected') + console.log('✅ Data anomaly check passed - proceeding with sync') + } + + /** + * Fetch latest cycle from distributor + */ + private async getLatestCycleFromDistributor(): Promise { + const response: { data: { cycleInfo: P2P.CycleCreatorTypes.CycleRecord[] } } = await queryFromDistributor( + DataType.CYCLE, + { count: 1 } + ) + if (!response?.data || response.data?.cycleInfo?.[0]?.counter === undefined) { + return null + } + return response.data.cycleInfo[0].counter } /** @@ -384,6 +410,7 @@ export class DataSyncManager { /** * Compare cycle data counts between local DB and distributor + * Queries in batches to respect MAX_CYCLES_PER_REQUEST limit */ private async compareCycleDataWithDistributor( startCycle: number, @@ -392,43 +419,61 @@ export class DataSyncManager { const mismatched: MismatchedCycle[] = [] try { - // Fetch counts from distributor - const [receiptsResponse, originalTxsResponse] = await Promise.all([ - queryFromDistributor(DataType.RECEIPT, { startCycle, 
endCycle, type: 'tally' }), - queryFromDistributor(DataType.ORIGINALTX, { startCycle, endCycle, type: 'tally' }), - ]) + // Split into batches if range is larger than max allowed + const batches: { start: number; end: number }[] = [] + for (let i = startCycle; i <= endCycle; i += config.requestLimits.MAX_CYCLES_PER_REQUEST) { + const batchEnd = Math.min(i + config.requestLimits.MAX_CYCLES_PER_REQUEST, endCycle) + batches.push({ start: i, end: batchEnd }) + } - if (!receiptsResponse?.data?.receipts || !originalTxsResponse?.data?.originalTxs) { - console.warn(`Failed to fetch distributor data for cycles ${startCycle}-${endCycle}`) - return mismatched + // Fetch all distributor data in batches + const allDistributorReceipts: { cycle: number; receipts: number }[] = [] + const allDistributorOriginalTxs: { cycle: number; originalTxsData: number }[] = [] + + for (const batch of batches) { + const [receiptsResponse, originalTxsResponse] = await Promise.all([ + queryFromDistributor(DataType.RECEIPT, { + startCycle: batch.start, + endCycle: batch.end, + type: 'tally', + }), + queryFromDistributor(DataType.ORIGINALTX, { + startCycle: batch.start, + endCycle: batch.end, + type: 'tally', + }), + ]) + + if (receiptsResponse?.data?.receipts) { + allDistributorReceipts.push(...receiptsResponse.data.receipts) + } + if (originalTxsResponse?.data?.originalTxs) { + allDistributorOriginalTxs.push(...originalTxsResponse.data.originalTxs) + } } - const distributorReceipts: { cycle: number; receipts: number }[] = receiptsResponse.data.receipts - const distributorOriginalTxs: { cycle: number; originalTxsData: number }[] = - originalTxsResponse.data.originalTxs + // Sort distributor data by cycle + allDistributorReceipts.sort((a, b) => a.cycle - b.cycle) + allDistributorOriginalTxs.sort((a, b) => a.cycle - b.cycle) - // Fetch counts from local DB + // Fetch counts from local DB (single query for entire range) const [localReceipts, localOriginalTxs] = await Promise.all([ 
ReceiptDB.queryReceiptCountByCycles(startCycle, endCycle), OriginalTxDataDB.queryOriginalTxDataCountByCycles(startCycle, endCycle), ]) - // Create maps for easier lookup - const localReceiptsMap = new Map(localReceipts.map((r) => [r.cycle, r.receipts])) - const localOriginalTxsMap = new Map(localOriginalTxs.map((t) => [t.cycle, t.originalTxsData])) - - // Compare each cycle - const allCycles = new Set([ - ...distributorReceipts.map((r) => r.cycle), - ...distributorOriginalTxs.map((t) => t.cycle), - ]) + console.log( + `Comparing cycles ${startCycle} to ${endCycle} with ${allDistributorReceipts.length} distributor receipts and ${allDistributorOriginalTxs.length} distributor originalTxs` + ) + console.log(allDistributorReceipts, localReceipts) + console.log(allDistributorOriginalTxs, localOriginalTxs) - for (const cycle of allCycles) { - const distReceipts = distributorReceipts.find((r) => r.cycle === cycle)?.receipts || 0 - const distOriginalTxs = distributorOriginalTxs.find((t) => t.cycle === cycle)?.originalTxsData || 0 + for (let cycle = startCycle; cycle <= endCycle; cycle++) { + const distReceipts = allDistributorReceipts.find((r) => r.cycle === cycle)?.receipts || 0 + const distOriginalTxs = allDistributorOriginalTxs.find((t) => t.cycle === cycle)?.originalTxsData || 0 - const localReceiptsCount = localReceiptsMap.get(cycle) || 0 - const localOriginalTxsCount = localOriginalTxsMap.get(cycle) || 0 + const localReceiptsCount = localReceipts.find((r) => r.cycle === cycle)?.receipts || 0 + const localOriginalTxsCount = localOriginalTxs.find((t) => t.cycle === cycle)?.originalTxsData || 0 const receiptsMismatch = localReceiptsCount !== distReceipts const originalTxsMismatch = localOriginalTxsCount !== distOriginalTxs @@ -574,18 +619,25 @@ export class DataSyncManager { // Step 4: Execute ParallelDataSync for all ranges if (mergedRanges.length > 0) { - for (const range of mergedRanges) { - console.log(`\nSyncing range: ${range.startCycle} to ${range.endCycle} 
(${range.gapSize} cycles)`) + console.log('\n📡 Starting data sync with recovery plan') - const parallelSync = new ParallelDataSync({ - concurrency: config.parallelSyncConcurrency || 10, - retryAttempts: 3, - retryDelayMs: 1000, - }) + const parallelDataSync = new ParallelDataSync({ + concurrency: config.parallelSyncConcurrency, + retryAttempts: 3, + retryDelayMs: 1000, + }) - await parallelSync.startSyncing(range.startCycle, range.endCycle) - console.log(`✅ Completed range ${range.startCycle} to ${range.endCycle}`) + const cycleBatches = [] + // For each range, create cycle batches and merge them into one + for (const range of mergedRanges) { + console.log(`\nFor range: ${range.startCycle} to ${range.endCycle} (${range.gapSize} cycles)`) + const cycleBatch = parallelDataSync.createCycleBatches(range.startCycle, range.endCycle) + cycleBatches.push(...cycleBatch) } + + await parallelDataSync.startSyncing(cycleBatches) + + console.log('\n✅ Data sync with recovery completed successfully') } else { console.log('\n✅ No data to sync, database is up to date') } @@ -685,27 +737,35 @@ export class DataSyncManager { */ async getSyncStats(): Promise<{ totalCycles: number + totalAccounts: number totalReceipts: number totalOriginalTxs: number + totalTransactions: number }> { try { - const [cycleCount, receiptCount, originalTxCount] = await Promise.all([ + const [cycleCount, accountCount, receiptCount, originalTxCount, transactionCount] = await Promise.all([ CycleDB.queryCycleCount(), + AccountDB.queryAccountCount(), ReceiptDB.queryReceiptCount(), OriginalTxDataDB.queryOriginalTxDataCount(), + TransactionDB.queryTransactionCount(), ]) return { totalCycles: cycleCount || 0, + totalAccounts: accountCount || 0, totalReceipts: receiptCount || 0, totalOriginalTxs: originalTxCount || 0, + totalTransactions: transactionCount || 0, } } catch (error) { console.error('Error getting sync stats:', error) return { totalCycles: 0, + totalAccounts: 0, totalReceipts: 0, totalOriginalTxs: 0, + 
totalTransactions: 0, } } } @@ -715,11 +775,27 @@ export class DataSyncManager { */ async printSyncSummary(): Promise { const stats = await this.getSyncStats() + const distributorData = await this.getTotalDataFromDistributor() + console.log('='.repeat(60)) console.log('Sync Summary:') + console.log('\nLocal Database:') console.log(` Total Cycles: ${stats.totalCycles}`) + console.log(` Total Accounts: ${stats.totalAccounts}`) console.log(` Total Receipts: ${stats.totalReceipts}`) console.log(` Total OriginalTxs: ${stats.totalOriginalTxs}`) + console.log(` Total Transactions: ${stats.totalTransactions}`) + + if (distributorData) { + console.log('\nDistributor:') + console.log(` Total Cycles: ${distributorData.totalCycles}`) + console.log(` Total Accounts: ${distributorData.totalAccounts}`) + console.log(` Total Receipts: ${distributorData.totalReceipts}`) + console.log(` Total OriginalTxs: ${distributorData.totalOriginalTxs}`) + } else { + console.log('\nDistributor: Failed to fetch data') + } + console.log('='.repeat(60)) } } diff --git a/src/class/ParallelDataSync.ts b/src/class/ParallelDataSync.ts index f012aa3..510260c 100644 --- a/src/class/ParallelDataSync.ts +++ b/src/class/ParallelDataSync.ts @@ -1,8 +1,8 @@ import PQueue from 'p-queue' import * as crypto from '@shardus/crypto-utils' -import { Utils as StringUtils } from '@shardus/types' +import { P2P, Utils as StringUtils } from '@shardus/types' import { config, DISTRIBUTOR_URL } from '../config' -import { queryFromDistributor, DataType } from './DataSync' +import { DataType } from './DataSync' import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' import { Cycle } from '../types' import axios, { AxiosInstance } from 'axios' @@ -48,6 +48,7 @@ interface ResponseSizeMetadata { interface ResponseDataWithMetadata { __responseSize?: ResponseSizeMetadata + __networkElapsed?: number [key: string]: unknown } @@ -256,10 +257,39 @@ export class ParallelDataSync { ) } + /** + * Creates batches of cycles for 
parallel processing. + * This is a preparatory step before calling startSyncing, which expects these batches. + * @param startCycle The starting cycle number. + * @param endCycle The ending cycle number. + * @returns An array of cycle batches, each with a start and end cycle. + */ + public createCycleBatches( + startCycle: number, + endCycle: number + ): { startCycle: number; endCycle: number }[] { + const cycleBatches: { startCycle: number; endCycle: number }[] = [] + + for (let i = startCycle; i <= endCycle; i += this.syncConfig.cyclesPerBatch) { + const batchEndCycle = Math.min(i + this.syncConfig.cyclesPerBatch - 1, endCycle) + cycleBatches.push({ startCycle: i, endCycle: batchEndCycle }) + } + + return cycleBatches + } + /** * Main entry point for parallel sync */ - async startSyncing(startCycle: number, endCycle: number): Promise { + async startSyncing(cycleBatches: { startCycle: number; endCycle: number }[]): Promise { + if (!cycleBatches || cycleBatches.length === 0) { + console.log('No cycle batches provided for syncing.') + return + } + + const startCycle = cycleBatches[0].startCycle + const endCycle = cycleBatches[cycleBatches.length - 1].endCycle + console.log(`\n${'='.repeat(60)}`) console.log(`Starting Parallel Cycle Sync: ${startCycle} → ${endCycle}`) console.log(`Concurrency: ${this.syncConfig.concurrency} workers`) @@ -269,20 +299,8 @@ export class ParallelDataSync { this.stats.totalCycles = endCycle - startCycle try { - // Split cycles into batches - const cycleBatches: { startCycle: number; endCycle: number }[] = [] - - for (let i = startCycle; i <= endCycle; ) { - let batchEnd = i + this.syncConfig.cyclesPerBatch - if (batchEnd > endCycle) { - batchEnd = endCycle - } - cycleBatches.push({ startCycle: i, endCycle: batchEnd }) - i = batchEnd + 1 - } - console.log( - `Created ${cycleBatches.length} cycle batches (${this.syncConfig.cyclesPerBatch} cycles per batch)` + `Syncing ${cycleBatches.length} cycle batches created with 
${this.syncConfig.cyclesPerBatch} cycles per batch` ) // Add all batch sync tasks to the queue @@ -305,7 +323,7 @@ export class ParallelDataSync { /** * Sync data in parallel using adaptive multi-cycle fetching with prefetching on endpoints - * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5) + * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5, then sends next request for 5-10) */ private async syncDataByCycleRange(startCycle: number, endCycle: number): Promise { try { @@ -318,12 +336,10 @@ export class ParallelDataSync { this.stats.completedCycles += endCycle - startCycle + 1 - if (config.verbose || this.stats.completedCycles % 10 === 0) { - const progress = ((this.stats.completedCycles / this.stats.totalCycles) * 100).toFixed(1) - console.log( - `Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` - ) - } + const progress = ((this.stats.completedCycles / this.stats.totalCycles) * 100).toFixed(1) + console.log( + `Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` + ) } catch (error) { console.error(`Error syncing cycle batch ${startCycle}-${endCycle}:`, error) this.stats.errors++ @@ -336,17 +352,61 @@ export class ParallelDataSync { */ private async syncCyclesByCycleRange(startCycle: number, endCycle: number): Promise { try { - const response = await this.fetchCyclesByCycleRange(startCycle, endCycle) + const response = await this.fetchDataFromDistributor( + DataType.CYCLE, + startCycle, + endCycle, + this.signData({ start: startCycle, end: endCycle }) + ) - if (!response || response.length === 0) { - if (config.verbose) { - console.log(`[Cycles ${startCycle}-${endCycle}] No cycle data returned`) + const cycles = response?.data?.cycleInfo || [] + + // Get size metadata from transformResponse and interceptor + 
const sizeMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize + const decompressedKB = sizeMetadata?.decompressedKB || '0.00' + const compressedKB = sizeMetadata?.compressedKB + const compressionRatio = sizeMetadata?.compressionRatio + const compressionSavings = sizeMetadata?.compressionSavings + const networkElapsed = (response.data as ResponseDataWithMetadata)?.__networkElapsed || 0 + + if (config.verbose || networkElapsed > 1000) { + // Build log message with compression info if available + let logMessage = + `[API Timing] Cycles fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `records: ${cycles.length}` + + // Only show compression metrics if compression actually reduced the size (ratio < 1) + if (compressedKB !== undefined && compressionRatio !== undefined && compressionRatio < 1) { + logMessage += `, payload: ${compressedKB}KB, payloadUncompressed: ${decompressedKB}KB, ratio: ${compressionRatio}, savings: ${compressionSavings}` + } else { + // No compression or not effective, just show uncompressed size + logMessage += `, payload: ${decompressedKB}KB` } - return + + logMessage += + (cycles.length === 0 && response.data ? ', response.data exists but empty' : '') + + (!response.data ? ', response.data is null/undefined!' 
: '') + + console.log(logMessage) } + if (!response || !response.data || !response.data.cycleInfo) { + console.error(`Error fetching cycles for cycle batch ${startCycle}-${endCycle}:`, response) + return // Couldn't fetch any cycles + } + + if (cycles.length === 0) { + return // No more originalTxs in this cycle range + } + const cycleRecords = cycles.map((cycleRecord: Cycle['cycleRecord']) => ({ + counter: cycleRecord.counter, + cycleRecord, + start: cycleRecord.start, + cycleMarker: cycleRecord.marker, + })) + // Process cycles using bulkInsertCycles - await CycleDB.bulkInsertCycles(response) + await CycleDB.bulkInsertCycles(cycleRecords) if (config.verbose) { console.log(`[Cycles ${startCycle}-${endCycle}] Cycles: +${response.length}`) @@ -359,6 +419,7 @@ export class ParallelDataSync { /** * Sync receipts across a batch of cycles using adaptive multi-cycle fetching with prefetching + * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5, then sends next request for 5-10) */ private async syncReceiptsByCycleRange(startCycle: number, endCycle: number): Promise { let currentCycle = startCycle @@ -366,9 +427,22 @@ export class ParallelDataSync { let afterTxId = '' let totalFetched = 0 + const route = `receipt/cycle` + // Prefetch: Start fetching first batch immediately let nextFetchPromise: Promise | null = this.syncConfig.enablePrefetch - ? this.fetchReceiptsByCycleRange({ startCycle: currentCycle, endCycle, afterTimestamp, afterTxId }) + ? this.fetchDataFromDistributor( + route, + currentCycle, + endCycle, + this.signData({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_RECEIPTS_PER_REQUEST, + }) + ) : null while (currentCycle <= endCycle) { @@ -376,19 +450,61 @@ export class ParallelDataSync { // Get the data (either from prefetch or fetch now) const response = nextFetchPromise ? 
await nextFetchPromise - : await this.fetchReceiptsByCycleRange({ - startCycle: currentCycle, + : await this.fetchDataFromDistributor( + route, + currentCycle, endCycle, - afterTimestamp, - afterTxId, - }) + this.signData({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_RECEIPTS_PER_REQUEST, + }) + ) + + const receipts = response?.data?.receipts || [] + + // Get size metadata from transformResponse and interceptor + const sizeMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize + const decompressedKB = sizeMetadata?.decompressedKB || '0.00' + const compressedKB = sizeMetadata?.compressedKB + const compressionRatio = sizeMetadata?.compressionRatio + const compressionSavings = sizeMetadata?.compressionSavings + const networkElapsed = (response.data as ResponseDataWithMetadata)?.__networkElapsed || 0 + + if (config.verbose || networkElapsed > 1000) { + // Build log message with compression info if available + let logMessage = + `[API Timing] Receipts fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `records: ${receipts.length}` - if (!response || response.length === 0) { - break // No more receipts in this cycle range + // Only show compression metrics if compression actually reduced the size (ratio < 1) + if (compressedKB !== undefined && compressionRatio !== undefined && compressionRatio < 1) { + logMessage += `, payload: ${compressedKB}KB, payloadUncompressed: ${decompressedKB}KB, ratio: ${compressionRatio}, savings: ${compressionSavings}` + } else { + // No compression or not effective, just show uncompressed size + logMessage += `, payload: ${decompressedKB}KB` + } + + logMessage += + (receipts.length === 0 && response.data ? ', response.data exists but empty' : '') + + (!response.data ? ', response.data is null/undefined!' 
: '') + + console.log(logMessage) + } + + if (!response || !response.data || !response.data.receipts) { + console.error(`Error fetching receipts for cycle batch ${startCycle}-${endCycle}:`, response) + break // Couldn't fetch any receipts + } + + if (receipts.length === 0) { + break // No more originalTxs in this cycle range } // Update after timestamp and txId based on last receipt BEFORE starting next fetch - const lastReceipt = response[response.length - 1] + const lastReceipt = receipts[receipts.length - 1] currentCycle = lastReceipt.cycle afterTimestamp = lastReceipt.timestamp afterTxId = lastReceipt.receiptId @@ -396,34 +512,40 @@ export class ParallelDataSync { // Prefetch next batch while processing current batch if ( this.syncConfig.enablePrefetch && - response.length >= config.requestLimits.MAX_RECEIPTS_PER_REQUEST + receipts.length >= config.requestLimits.MAX_RECEIPTS_PER_REQUEST ) { - nextFetchPromise = this.fetchReceiptsByCycleRange({ - startCycle: currentCycle, + nextFetchPromise = this.fetchDataFromDistributor( + route, + currentCycle, endCycle, - afterTimestamp, - afterTxId, - }) + this.signData({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_RECEIPTS_PER_REQUEST, + }) + ) } else { nextFetchPromise = null } // Process receipts (overlaps with next fetch if prefetch enabled) - await ReceiptDB.processReceiptData(response) + await ReceiptDB.processReceiptData(receipts) - totalFetched += response.length - this.stats.totalReceipts += response.length + totalFetched += receipts.length + this.stats.totalReceipts += receipts.length if (config.verbose) { console.log( - `[Cycles ${startCycle}-${endCycle}] Receipts: +${response.length} (total: ${totalFetched}), ` + + `[Cycles ${startCycle}-${endCycle}] Receipts: +${receipts.length} (total: ${totalFetched}), ` + `last in cycle ${currentCycle}` + (this.syncConfig.enablePrefetch ? 
' [prefetch]' : '') ) } - // If we got less than the max response size, we've exhausted this cycle range - if (response.length < config.requestLimits.MAX_RECEIPTS_PER_REQUEST) { + // If we got less than the max receipts size, we've exhausted this cycle range + if (receipts.length < config.requestLimits.MAX_RECEIPTS_PER_REQUEST) { break } } catch (error) { @@ -435,6 +557,7 @@ export class ParallelDataSync { /** * Sync originalTxs across a batch of cycles using adaptive multi-cycle fetching with prefetching + * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5, then sends next request for 5-10) */ private async syncOriginalTxsByCycleRange(startCycle: number, endCycle: number): Promise { let currentCycle = startCycle @@ -442,14 +565,22 @@ export class ParallelDataSync { let afterTxId = '' let totalFetched = 0 + const route = `originalTx/cycle` + // Prefetch: Start fetching first batch immediately let nextFetchPromise: Promise | null = this.syncConfig.enablePrefetch - ? this.fetchOriginalTxsByCycleRange({ - startCycle: currentCycle, + ? this.fetchDataFromDistributor( + route, + currentCycle, endCycle, - afterTimestamp, - afterTxId, - }) + this.signData({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST, + }) + ) : null while (currentCycle <= endCycle) { @@ -457,19 +588,61 @@ export class ParallelDataSync { // Get the data (either from prefetch or fetch now) const response = nextFetchPromise ? 
await nextFetchPromise - : await this.fetchOriginalTxsByCycleRange({ - startCycle: currentCycle, + : await this.fetchDataFromDistributor( + route, + currentCycle, endCycle, - afterTimestamp, - afterTxId, - }) + this.signData({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST, + }) + ) + + const originalTxs = response?.data?.originalTxs || [] + + // Get size metadata from transformResponse and interceptor + const sizeMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize + const decompressedKB = sizeMetadata?.decompressedKB || '0.00' + const compressedKB = sizeMetadata?.compressedKB + const compressionRatio = sizeMetadata?.compressionRatio + const compressionSavings = sizeMetadata?.compressionSavings + const networkElapsed = (response.data as ResponseDataWithMetadata)?.__networkElapsed || 0 + + if (config.verbose || networkElapsed > 1000) { + // Build log message with compression info if available + let logMessage = + `[API Timing] OriginalTxs fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `records: ${originalTxs.length}` + + // Only show compression metrics if compression actually reduced the size (ratio < 1) + if (compressedKB !== undefined && compressionRatio !== undefined && compressionRatio < 1) { + logMessage += `, payload: ${compressedKB}KB, payloadUncompressed: ${decompressedKB}KB, ratio: ${compressionRatio}, savings: ${compressionSavings}` + } else { + // No compression or not effective, just show uncompressed size + logMessage += `, payload: ${decompressedKB}KB` + } + + logMessage += + (originalTxs.length === 0 && response.data ? ', response.data exists but empty' : '') + + (!response.data ? ', response.data is null/undefined!' 
: '') + + console.log(logMessage) + } - if (!response || response.length === 0) { + if (!response || !response.data || !response.data.originalTxs) { + console.error(`Error fetching originalTxs for cycle batch ${startCycle}-${endCycle}:`, response) + break // Couldn't fetch any originalTxs + } + + if (originalTxs.length === 0) { break // No more originalTxs in this cycle range } // Update after timestamp and txId based on last tx BEFORE starting next fetch - const lastTx = response[response.length - 1] + const lastTx = originalTxs[originalTxs.length - 1] currentCycle = lastTx.cycle afterTimestamp = lastTx.timestamp afterTxId = lastTx.txId @@ -479,32 +652,38 @@ export class ParallelDataSync { this.syncConfig.enablePrefetch && response.length >= config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST ) { - nextFetchPromise = this.fetchOriginalTxsByCycleRange({ - startCycle: currentCycle, + nextFetchPromise = this.fetchDataFromDistributor( + route, + currentCycle, endCycle, - afterTimestamp, - afterTxId, - }) + this.signData({ + startCycle: currentCycle, + endCycle, + afterTimestamp, + afterTxId, + limit: config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST, + }) + ) } else { nextFetchPromise = null } // Process originalTxs (overlaps with next fetch if prefetch enabled) - await OriginalTxDataDB.processOriginalTxData(response) + await OriginalTxDataDB.processOriginalTxData(originalTxs) - totalFetched += response.length - this.stats.totalOriginalTxs += response.length + totalFetched += originalTxs.length + this.stats.totalOriginalTxs += originalTxs.length if (config.verbose) { console.log( - `[Cycles ${startCycle}-${endCycle}] OriginalTxs: +${response.length} (total: ${totalFetched}), ` + + `[Cycles ${startCycle}-${endCycle}] OriginalTxs: +${originalTxs.length} (total: ${totalFetched}), ` + `last in cycle ${currentCycle}` + (this.syncConfig.enablePrefetch ? 
' [prefetch]' : '') ) } - // If we got less than the max response size, we've exhausted this cycle range - if (response.length < config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST) { + // If we got less than the max originalTxs size, we've exhausted this cycle range + if (originalTxs.length < config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST) { break } } catch (error) { @@ -515,86 +694,15 @@ export class ParallelDataSync { } /** - * Fetch cycles by cycle range with retry logic + * Fetch data by multi-cycle range with retry logic */ - private async fetchCyclesByCycleRange(startCycle: number, endCycle: number): Promise { - // Retry with exponential backoff - for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { - try { - const startTime = Date.now() - const response = await queryFromDistributor(DataType.CYCLE, { - start: startCycle, - end: endCycle, - }) - const networkElapsed = Date.now() - startTime - - if (response && response.data && response.data.cycleInfo) { - const cycleRecords = response.data.cycleInfo.map((cycleRecord: any) => ({ - counter: cycleRecord.counter, - cycleRecord, - start: cycleRecord.start, - cycleMarker: cycleRecord.marker, - })) - - if (config.verbose) { - console.log( - `[API Timing] Cycles fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + - `records: ${cycleRecords.length}` - ) - } - return cycleRecords - } - } catch (error: any) { - const isLastAttempt = attempt === this.syncConfig.retryAttempts - const isRetryableError = - error.code === 'ECONNRESET' || - error.code === 'ETIMEDOUT' || - error.code === 'ECONNREFUSED' || - error.code === 'EPIPE' - - if (isRetryableError && !isLastAttempt) { - const delay = this.syncConfig.retryDelayMs * Math.pow(2, attempt) - console.warn( - `Error on cycles fetch (cycles ${startCycle}-${endCycle}), ` + - `attempt ${attempt + 1}/${this.syncConfig.retryAttempts + 1}, ` + - `retrying in ${delay}ms...` - ) - await this.sleep(delay) - continue - } - - // Non-retryable error 
or last attempt failed - console.error(`Error fetching cycles (cycles ${startCycle}-${endCycle}):`, error.message) - throw error - } - } - - return [] - } - - /** - * Fetch receipts by multi-cycle range with retry logic - * Automatically adapts to cycle sizes - if cycles 1-10 only have data in 1-5, returns that subset - */ - private async fetchReceiptsByCycleRange({ - startCycle, - endCycle, - afterTimestamp, - afterTxId, - }: SyncTxDataByCycleRange): Promise { - const data = { - startCycle, - endCycle, - afterTimestamp, - afterTxId, - limit: config.requestLimits.MAX_RECEIPTS_PER_REQUEST, - sender: config.collectorInfo.publicKey, - sign: undefined, - } - - crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) - - const url = `${DISTRIBUTOR_URL}/receipt/cycle` + private async fetchDataFromDistributor( + route: string, + startCycle: number, + endCycle: number, + data: any + ): Promise { + const url = `${DISTRIBUTOR_URL}/${route}` // Retry with exponential backoff for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { @@ -602,42 +710,10 @@ export class ParallelDataSync { const startTime = Date.now() const response = await this.axiosInstance.post(url, data) const networkElapsed = Date.now() - startTime - - const receipts = response.data?.receipts || [] - - // Get size metadata from transformResponse and interceptor - const sizeMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize - const decompressedKB = sizeMetadata?.decompressedKB || '0.00' - const compressedKB = sizeMetadata?.compressedKB - const compressionRatio = sizeMetadata?.compressionRatio - const compressionSavings = sizeMetadata?.compressionSavings - - if (config.verbose || networkElapsed > 1000 || receipts.length === 0) { - // Build log message with compression info if available - let logMessage = - `[API Timing] Receipts fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + - `records: ${receipts.length}` - - // Only show 
compression metrics if compression actually reduced the size (ratio < 1) - if (compressedKB !== undefined && compressionRatio !== undefined && compressionRatio < 1) { - logMessage += `, payload: ${compressedKB}KB, payloadUncompressed: ${decompressedKB}KB, ratio: ${compressionRatio}, savings: ${compressionSavings}` - } else { - // No compression or not effective, just show uncompressed size - logMessage += `, payload: ${decompressedKB}KB` - } - - logMessage += - (receipts.length === 0 && response.data ? ', response.data exists but empty' : '') + - (!response.data ? ', response.data is null/undefined!' : '') - - console.log(logMessage) - } - - if (response.data && response.data.receipts) { - return response.data.receipts + if (response && response.data) { + ;(response.data as ResponseDataWithMetadata).__networkElapsed = networkElapsed } - - return [] + return response } catch (error: any) { const isLastAttempt = attempt === this.syncConfig.retryAttempts const isRetryableError = @@ -649,7 +725,7 @@ export class ParallelDataSync { if (isRetryableError && !isLastAttempt) { const delay = this.syncConfig.retryDelayMs * Math.pow(2, attempt) console.warn( - `ECONNRESET on receipts fetch (cycles ${startCycle}-${endCycle}), ` + + `ECONNRESET on ${route} fetch (cycles ${startCycle}-${endCycle}), ` + `attempt ${attempt + 1}/${this.syncConfig.retryAttempts + 1}, ` + `retrying in ${delay}ms...` ) @@ -658,111 +734,25 @@ export class ParallelDataSync { } // Non-retryable error or last attempt failed - console.error( - `Error fetching receipts multi-cycle (cycles ${startCycle}-${endCycle}):`, - error.message - ) + console.error(`Error fetching ${route} for (cycles ${startCycle}-${endCycle}):`, error.message) throw error } } - return [] + return null } /** - * Fetch originalTxs by multi-cycle range with retry logic + * Sign data */ - private async fetchOriginalTxsByCycleRange({ - startCycle, - endCycle, - afterTimestamp, - afterTxId, - }: SyncTxDataByCycleRange): Promise { + private 
signData(obj: SyncTxDataByCycleRange | { start: number; end: number }): P2P.P2PTypes.SignedObject { const data = { - startCycle, - endCycle, - afterTimestamp, - afterTxId, - limit: config.requestLimits.MAX_ORIGINAL_TXS_PER_REQUEST, + ...obj, sender: config.collectorInfo.publicKey, sign: undefined, } - crypto.signObj(data, config.collectorInfo.secretKey, config.collectorInfo.publicKey) - - const url = `${DISTRIBUTOR_URL}/originalTx/cycle` - - // Retry with exponential backoff - for (let attempt = 0; attempt <= this.syncConfig.retryAttempts; attempt++) { - try { - const startTime = Date.now() - const response = await this.axiosInstance.post(url, data) - const networkElapsed = Date.now() - startTime - - const originalTxs = response.data?.originalTxs || [] - - // Get size metadata from transformResponse and interceptor - const sizeMetadata = (response.data as ResponseDataWithMetadata)?.__responseSize - const decompressedKB = sizeMetadata?.decompressedKB || '0.00' - const compressedKB = sizeMetadata?.compressedKB - const compressionRatio = sizeMetadata?.compressionRatio - const compressionSavings = sizeMetadata?.compressionSavings - - if (config.verbose || networkElapsed > 1000 || originalTxs.length === 0) { - // Build log message with compression info if available - let logMessage = - `[API Timing] OriginalTxs fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + - `records: ${originalTxs.length}` - - // Only show compression metrics if compression actually reduced the size (ratio < 1) - if (compressedKB !== undefined && compressionRatio !== undefined && compressionRatio < 1) { - logMessage += `, payload: ${compressedKB}KB, payloadUncompressed: ${decompressedKB}KB, ratio: ${compressionRatio}, savings: ${compressionSavings}` - } else { - // No compression or not effective, just show uncompressed size - logMessage += `, payload: ${decompressedKB}KB` - } - - logMessage += - (originalTxs.length === 0 && response.data ? 
', response.data exists but empty' : '') + - (!response.data ? ', response.data is null/undefined!' : '') - - console.log(logMessage) - } - - if (response.data && response.data.originalTxs) { - return response.data.originalTxs - } - - return [] - } catch (error: any) { - const isLastAttempt = attempt === this.syncConfig.retryAttempts - const isRetryableError = - error.code === 'ECONNRESET' || - error.code === 'ETIMEDOUT' || - error.code === 'ECONNREFUSED' || - error.code === 'EPIPE' - - if (isRetryableError && !isLastAttempt) { - const delay = this.syncConfig.retryDelayMs * Math.pow(2, attempt) - console.warn( - `ECONNRESET on originalTxs fetch (cycles ${startCycle}-${endCycle}), ` + - `attempt ${attempt + 1}/${this.syncConfig.retryAttempts + 1}, ` + - `retrying in ${delay}ms...` - ) - await this.sleep(delay) - continue - } - - // Non-retryable error or last attempt failed - console.error( - `Error fetching originalTxs multi-cycle (cycles ${startCycle}-${endCycle}):`, - error.message - ) - throw error - } - } - - return [] + return data } /** diff --git a/src/collector.ts b/src/collector.ts index 836d737..59b29fd 100644 --- a/src/collector.ts +++ b/src/collector.ts @@ -256,30 +256,6 @@ export const checkAndSyncData = async (): Promise<() => Promise> => { return syncData } -export const startDataSyncManager = async (): Promise<() => Promise> => { - console.log('\n') - console.log('='.repeat(60)) - console.log('INITIALIZING DATA SYNC MANAGER') - console.log('='.repeat(60)) - console.log('DataSyncManager provides intelligent data synchronization with:') - console.log(' • Early data anomaly detection before sync operations') - console.log(' • Automatic gap detection and recovery') - console.log(' • Lookback verification window for data integrity') - console.log(' • Parallel batch-cycle-based sync (10x+ performance improvement)') - console.log('='.repeat(60)) - console.log('\n') - - // Run anomaly detection BEFORE connecting to websocket - // This fails fast if there 
are data corruption issues - const dataSyncManager = new DataSyncManager() - await dataSyncManager.detectDataAnomalies() - - console.log('✅ Data anomaly check passed - proceeding with sync') - - // Return the sync function to be executed after WS connection - return dataSyncManager.syncData -} - const attemptReconnection = (): void => { console.log(`Re-connecting Distributor in ${config.DISTRIBUTOR_RECONNECT_INTERVAL / 1000}s...`) reconnecting = true @@ -298,7 +274,7 @@ const connectToDistributor = (): void => { ws = new WebSocket(URL) ws.onopen = () => { console.log( - `✅ Socket connected to the Distributor @ ${config.distributorInfo.ip}:${config.distributorInfo.port}}` + `✅ Socket connected to the Distributor @ ${config.distributorInfo.ip}:${config.distributorInfo.port}` ) connected = true reconnecting = false @@ -399,7 +375,15 @@ const startServer = async (): Promise => { await Storage.initializeDB() addExitListeners() - const syncData = config.useParallelSync ? await startDataSyncManager() : await checkAndSyncData() + let dataSyncManager = null + + if (config.useParallelSync) { + // Run anomaly detection BEFORE connecting to websocket + // This fails fast if there are data corruption issues + dataSyncManager = new DataSyncManager() + await dataSyncManager.detectDataAnomalies() + } + const syncData = !config.useParallelSync && (await checkAndSyncData()) if (config.dataLogWrite) await initDataLogWriter() addSigListeners() @@ -421,6 +405,11 @@ const startServer = async (): Promise => { } } + if (config.useParallelSync) { + await dataSyncManager.syncData() + return + } + await syncData() } diff --git a/src/config/index.ts b/src/config/index.ts index b602842..0e1eb50 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -191,7 +191,7 @@ let config: Config = { }, parallelSyncConcurrency: Number(process.env.PARALLEL_SYNC_CONCURRENCY) || 10, // 10 parallel workers useParallelSync: process.env.USE_PARALLEL_SYNC !== 'false', // Enable by default - cyclesPerBatch: 
Number(process.env.CYCLES_PER_BATCH) || 10, // Batch 10 cycles together + cyclesPerBatch: Number(process.env.CYCLES_PER_BATCH) || 100, // Batch 100 cycles together ( matching MAX_BETWEEN_CYCLES_PER_REQUEST, can be lower if needed ) enablePrefetch: process.env.ENABLE_PREFETCH !== 'false', // Enable prefetch by default syncRetryAttempts: Number(process.env.SYNC_RETRY_ATTEMPTS) || 3, // Retry failed requests 3 times dexScreenerAPI: diff --git a/src/storage/account.ts b/src/storage/account.ts index d189fa9..02d2cb9 100644 --- a/src/storage/account.ts +++ b/src/storage/account.ts @@ -204,6 +204,47 @@ export async function queryAccountByAccountId(accountId: string): Promise { + try { + const sql = `SELECT timestamp, createdTimestamp FROM accounts WHERE accountId=?` + const dbAccount = (await db.get(accountDatabase, sql, [accountId])) as DbAccount + if (dbAccount) return { timestamp: dbAccount.timestamp, createdTimestamp: dbAccount.createdTimestamp } + return null + } catch (e) { + console.log(e) + return null + } +} + +export async function queryAccountTimestampsBatch( + accountIds: string[] +): Promise> { + const resultMap = new Map() + if (accountIds.length === 0) return resultMap + + try { + // Create placeholders for IN clause + const placeholders = accountIds.map(() => '?').join(', ') + const sql = `SELECT accountId, timestamp, createdTimestamp FROM accounts WHERE accountId IN (${placeholders})` + const accounts = (await db.all(accountDatabase, sql, accountIds)) as DbAccount[] + + for (const account of accounts) { + resultMap.set(account.accountId, { + timestamp: account.timestamp, + createdTimestamp: account.createdTimestamp, + }) + } + + if (config.verbose) console.log('Batch queried accounts', accounts.length, 'of', accountIds.length) + } catch (e) { + console.log('Error in queryAccountTimestampsBatch', e) + } + + return resultMap +} + export async function processAccountData(accounts: AccountsCopy[]): Promise { console.log('accounts size', accounts.length) if 
(accounts && accounts.length <= 0) return [] diff --git a/src/storage/receipt.ts b/src/storage/receipt.ts index f3425eb..8e6bbfb 100644 --- a/src/storage/receipt.ts +++ b/src/storage/receipt.ts @@ -158,18 +158,19 @@ export async function processReceiptData( combineAccounts.push(accObj) } } else { - const accountExist = await AccountDB.queryAccountByAccountId(accObj.accountId) - if (config.verbose) console.log('accountExist', accountExist) - if (!accountExist) { - combineAccounts.push(accObj) - } else { - if (accountExist.timestamp < accObj.timestamp) { - await AccountDB.updateAccount(accObj) - } - if (accObj.createdTimestamp < accountExist.createdTimestamp) { - await AccountDB.updateCreatedTimestamp(accObj.accountId, accObj.createdTimestamp) - } - } + // const accountExist = await AccountDB.queryAccountTimestamp(accObj.accountId) + // if (config.verbose) console.log('accountExist', accountExist) + // if (accountExist) { + // if (accountExist.timestamp < accObj.timestamp) { + // await AccountDB.updateAccount(accObj) + // // combineAccounts.push(accObj) + // } + // if (accObj.createdTimestamp < accountExist.createdTimestamp) { + // await AccountDB.updateCreatedTimestamp(accObj.accountId, accObj.createdTimestamp) + // } + // } else { + combineAccounts.push(accObj) + // } } // if tx receipt is saved as an account, create tx object from the account and save it @@ -229,13 +230,13 @@ export async function processReceiptData( } txObj.data = {} } - const transactionExist = await TransactionDB.queryTransactionByTxId(tx.txId) - if (config.verbose) console.log('transactionExist', transactionExist) - if (!transactionExist) { - combineTransactions.push(txObj) - } else if (transactionExist.timestamp < txObj.timestamp) { - await TransactionDB.insertTransaction(txObj) - } + // const transactionExist = await TransactionDB.queryTransactionByTxId(tx.txId) + // if (config.verbose) console.log('transactionExist', transactionExist) + // if (!transactionExist) { + 
combineTransactions.push(txObj) + // } else if (transactionExist.timestamp < txObj.timestamp) { + // await TransactionDB.insertTransaction(txObj) + // } if (config.saveAccountHistoryState) { // Note: This has to be changed once we change the way the global modification tx consensus is updated if ( @@ -286,8 +287,32 @@ export async function processReceiptData( accountHistoryStateList = [] } } + + // Batch query all collected account IDs once + const accountIdsToQuery = combineAccounts.map((acc) => acc.accountId) + const existingAccounts = await AccountDB.queryAccountTimestampsBatch(accountIdsToQuery) + for (const accObj of combineAccounts) { + const accountExist = existingAccounts.get(accObj.accountId) + if (accountExist) { + if (accountExist.timestamp > accObj.timestamp) { + // await AccountDB.updateAccount(accObj) + // Remove the account from the list + combineAccounts = combineAccounts.filter((acc) => acc.accountId !== accObj.accountId) + } + if (accountExist.createdTimestamp > accObj.createdTimestamp) { + await AccountDB.updateCreatedTimestamp(accObj.accountId, accObj.createdTimestamp) + } + } + } + // Insert the combined accounts in bucketSize + if (combineAccounts.length > 0) { + for (let i = 0; i < combineAccounts.length; i += bucketSize) { + const accounts = combineAccounts.slice(i, i + bucketSize) + await AccountDB.bulkInsertAccounts(accounts) + } + } + if (combineReceipts.length > 0) await bulkInsertReceipts(combineReceipts) - if (combineAccounts.length > 0) await AccountDB.bulkInsertAccounts(combineAccounts) if (combineTransactions.length > 0) await TransactionDB.bulkInsertTransactions(combineTransactions) if (accountHistoryStateList.length > 0) await AccountHistoryStateDB.bulkInsertAccountHistoryStates(accountHistoryStateList) diff --git a/src/storage/sqlite3storage.ts b/src/storage/sqlite3storage.ts index 6c68b85..27acc6c 100644 --- a/src/storage/sqlite3storage.ts +++ b/src/storage/sqlite3storage.ts @@ -79,8 +79,11 @@ export const createDB = async 
(dbPath: string, dbName: string): Promise { const engineMs = typeof time === 'number' ? time : Number(time) const queue = queuedBySql.get(sql) From bb8717d2894df5cae1d9c102df4e3854a865069f Mon Sep 17 00:00:00 2001 From: jairajdev Date: Wed, 12 Nov 2025 14:05:02 +0800 Subject: [PATCH 08/14] fix: improve sync statistics and throughput calculation - Separate totalCyclesToSync from totalCycles in stats - Calculate throughput based on all record types (cycles + receipts + originalTxs) - Track actual cycle records inserted, not just cycle ranges processed - Update progress reporting to show data cycles vs total records - Change throughput label from "receipts/sec" to "records/sec" --- src/class/ParallelDataSync.ts | 28 ++++++++++++++++++---------- src/collector.ts | 2 +- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/class/ParallelDataSync.ts b/src/class/ParallelDataSync.ts index 510260c..e7fe05a 100644 --- a/src/class/ParallelDataSync.ts +++ b/src/class/ParallelDataSync.ts @@ -27,8 +27,9 @@ export interface ParallelSyncConfig { export interface SyncStats { startTime: number endTime?: number - totalCycles: number + totalCyclesToSync: number completedCycles: number + totalCycles: number totalReceipts: number totalOriginalTxs: number errors: number @@ -241,8 +242,9 @@ export class ParallelDataSync { this.stats = { startTime: Date.now(), - totalCycles: 0, + totalCyclesToSync: 0, completedCycles: 0, + totalCycles: 0, totalReceipts: 0, totalOriginalTxs: 0, errors: 0, @@ -296,7 +298,7 @@ export class ParallelDataSync { console.log(`${'='.repeat(60)}\n`) this.stats.startTime = Date.now() - this.stats.totalCycles = endCycle - startCycle + this.stats.totalCyclesToSync = endCycle - startCycle try { console.log( @@ -336,9 +338,9 @@ export class ParallelDataSync { this.stats.completedCycles += endCycle - startCycle + 1 - const progress = ((this.stats.completedCycles / this.stats.totalCycles) * 100).toFixed(1) + const progress = ((this.stats.completedCycles / 
this.stats.totalCyclesToSync) * 100).toFixed(1) console.log( - `Progress: ${this.stats.completedCycles}/${this.stats.totalCycles} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` + `Progress: ${this.stats.completedCycles}/${this.stats.totalCyclesToSync} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` ) } catch (error) { console.error(`Error syncing cycle batch ${startCycle}-${endCycle}:`, error) @@ -405,9 +407,12 @@ export class ParallelDataSync { cycleMarker: cycleRecord.marker, })) - // Process cycles using bulkInsertCycles + // Bulk insert cycles await CycleDB.bulkInsertCycles(cycleRecords) + // Update stats + this.stats.totalCycles += cycleRecords.length + if (config.verbose) { console.log(`[Cycles ${startCycle}-${endCycle}] Cycles: +${response.length}`) } @@ -770,18 +775,21 @@ export class ParallelDataSync { const elapsedSec = (elapsedMs / 1000).toFixed(2) const elapsedMin = (elapsedMs / 60000).toFixed(2) + const totalRecords = this.stats.totalCycles + this.stats.totalReceipts + this.stats.totalOriginalTxs + const throughput = (totalRecords / (elapsedMs / 1000)).toFixed(0) + console.log(`\n${'='.repeat(60)}`) console.log('Parallel Sync Complete!') console.log(`${'='.repeat(60)}`) console.log(` Cycle Range: ${startCycle} → ${endCycle}`) - console.log(` Cycles Synced: ${this.stats.completedCycles}/${this.stats.totalCycles}`) + console.log(` Data Cycles Synced: ${this.stats.completedCycles}/${this.stats.totalCyclesToSync}`) + console.log(` Cycles Synced: ${this.stats.totalCycles}`) console.log(` Receipts Synced: ${this.stats.totalReceipts}`) console.log(` OriginalTxs Synced: ${this.stats.totalOriginalTxs}`) + console.log(` Total Records: ${totalRecords}`) console.log(` Errors: ${this.stats.errors}`) console.log(` Time Elapsed: ${elapsedSec}s (${elapsedMin} min)`) - console.log( - ` Throughput: ${(this.stats.totalReceipts / (elapsedMs / 1000)).toFixed(0)} receipts/sec` - ) + console.log(` Throughput: ${throughput} records/sec`) 
console.log(`${'='.repeat(60)}\n`) } diff --git a/src/collector.ts b/src/collector.ts index 59b29fd..c27f139 100644 --- a/src/collector.ts +++ b/src/collector.ts @@ -285,7 +285,7 @@ const connectToDistributor = (): void => { try { validateData(StringUtils.safeJsonParse(data)) } catch (e) { - console.log('Error in processing received data!', e) + console.log('Error in processing received data!', data, e) } }) ws.onerror = (error) => { From 54f6fd9cab2e44dc57059304d918b6cfe09eeff8 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Wed, 12 Nov 2025 14:06:58 +0800 Subject: [PATCH 09/14] feat: add parallel fetching to data_sync_checker script - Implement parallel batch fetching with configurable concurrency (up to 100) - Fix batch boundaries to prevent overlapping cycles and double-counting - Add detailed mismatch reporting with formatted table output - Improve type safety with TallyItem and MismatchResult interfaces feat: add distributor_tally_verifier script Create comprehensive verification tool to compare distributor endpoints: - Tally mode: Compare tally endpoint vs cycle-based pagination - Full mode: Compare full data endpoint vs cycle-based pagination - Track transaction IDs to debug count discrepancies - Support page-based pagination for full data endpoint - Display detailed mismatch analysis with ID-level comparison --- scripts/data_sync_checker.ts | 235 ++++++++++--- scripts/distributor_tally_verifier.ts | 470 ++++++++++++++++++++++++++ 2 files changed, 662 insertions(+), 43 deletions(-) create mode 100644 scripts/distributor_tally_verifier.ts diff --git a/scripts/data_sync_checker.ts b/scripts/data_sync_checker.ts index 8d21656..78a471d 100644 --- a/scripts/data_sync_checker.ts +++ b/scripts/data_sync_checker.ts @@ -13,53 +13,201 @@ const endCycle = 0 const saveToFile = false const data_type: any = DataType.RECEIPT // DataType.RECEIPT // DataType.CYCLE // DataType.ORIGINALTX -const api_url = data_type === DataType.RECEIPT ? 
'receipt' : data_type === DataType.CYCLE ? 'cycleinfo' : 'originalTx' +const api_url = + data_type === DataType.RECEIPT ? 'receipt' : data_type === DataType.CYCLE ? 'cycleinfo' : 'originalTx' -const runProgram = async (): Promise => { - const limit = 100 - let distributor_responses: any = [] - let api_responses: any = [] - let nextEnd = startCycle + limit - for (let i = startCycle; i < endCycle;) { - console.log(`Start Cycle ${i} End Cycle ${nextEnd}`) - const distributor_data = data_type === DataType.CYCLE ? { - start: i, - end: nextEnd - } : { - startCycle: i, - endCycle: nextEnd, - type: 'tally' +interface MismatchResult { + cycle: number + distributorCount: number + collectorCount: number +} + +interface TallyItem { + cycle: number + receipts?: number + originalTxsData?: number + originalTxs?: number +} + +const fetchBatch = async ( + cycleStart: number, + cycleEnd: number +): Promise<{ distributor: TallyItem[]; api: TallyItem[] }> => { + const distributor_data = + data_type === DataType.CYCLE + ? { + start: cycleStart, + end: cycleEnd, } - const api_data = data_type === DataType.CYCLE ? 
`?start=${i}&end=${nextEnd}` : `?startCycle=${i}&endCycle=${nextEnd}&tally=true` - - const res1 = await queryFromDistributor(data_type, distributor_data) - // console.log(res1.data) - - const res2 = await axios.get(`${API_SERVER_URL}/api/${api_url}${api_data}`) - // console.log(res2.data) - - switch (data_type) { - case DataType.RECEIPT: - distributor_responses = [...distributor_responses, ...res1.data.receipts] - api_responses = [...api_responses, ...res2.data.totalReceipts] - break - case DataType.CYCLE: - distributor_responses = [...distributor_responses, ...res1.data.cycleInfo] - api_responses = [...api_responses, ...res2.data.cycles] - break - case DataType.ORIGINALTX: - distributor_responses = [...distributor_responses, ...res1.data.originalTxs] - api_responses = [...api_responses, ...res2.data.totalOriginalTxs] - break + : { + startCycle: cycleStart, + endCycle: cycleEnd, + type: 'tally', } - i = nextEnd + 1 - nextEnd += limit + const api_data = + data_type === DataType.CYCLE + ? 
`?start=${cycleStart}&end=${cycleEnd}` + : `?startCycle=${cycleStart}&endCycle=${cycleEnd}&tally=true` + + const [res1, res2] = await Promise.all([ + queryFromDistributor(data_type, distributor_data), + axios.get(`${API_SERVER_URL}/api/${api_url}${api_data}`), + ]) + + let distributorData: TallyItem[] = [] + let apiData: TallyItem[] = [] + + switch (data_type) { + case DataType.RECEIPT: + distributorData = res1.data.receipts || [] + apiData = res2.data.totalReceipts || [] + break + case DataType.CYCLE: + distributorData = res1.data.cycleInfo || [] + apiData = res2.data.cycles || [] + break + case DataType.ORIGINALTX: + distributorData = res1.data.originalTxs || [] + apiData = res2.data.totalOriginalTxs || [] + break + } + + return { distributor: distributorData, api: apiData } +} + +const chunkArray = (array: T[], chunkSize: number): T[][] => { + const chunks: T[][] = [] + for (let i = 0; i < array.length; i += chunkSize) { + chunks.push(array.slice(i, i + chunkSize)) + } + return chunks +} + +const runProgram = async (): Promise => { + const limit = 100 + const concurrency = 100 + + const batches: Array<{ start: number; end: number }> = [] + + // Create batches without overlapping boundaries + let currentStart = startCycle + while (currentStart <= endCycle) { + const batchEnd = Math.min(currentStart + limit - 1, endCycle) + batches.push({ start: currentStart, end: batchEnd }) + currentStart = batchEnd + 1 + } + + console.log(`Fetching ${batches.length} batches in parallel (concurrency: ${concurrency})...`) + + // Process batches in chunks to limit concurrency + const batchChunks = chunkArray(batches, concurrency) + const allResults: Array<{ distributor: TallyItem[]; api: TallyItem[] }> = [] + + for (const chunk of batchChunks) { + console.log(`Processing ${chunk.length} batches in parallel...`) + const chunkResults = await Promise.all( + chunk.map((batch) => { + console.log(`Fetching cycles ${batch.start} to ${batch.end}`) + return fetchBatch(batch.start, 
batch.end) + }) + ) + allResults.push(...chunkResults) + } + + // Combine results + let distributor_responses: TallyItem[] = [] + let api_responses: TallyItem[] = [] + + for (const result of allResults) { + distributor_responses = [...distributor_responses, ...result.distributor] + api_responses = [...api_responses, ...result.api] + } + + console.log( + '\nDISTRIBUTOR RESPONSES:', + distributor_responses.length, + 'API SERVER RESPONSES:', + api_responses.length + ) + + // Compare and find mismatches + const mismatches: MismatchResult[] = [] + + if (data_type === DataType.RECEIPT || data_type === DataType.ORIGINALTX) { + // Create maps for easy lookup + const distributorMap = new Map() + const apiMap = new Map() + + for (const item of distributor_responses) { + const count = + data_type === DataType.RECEIPT + ? item.receipts ?? 0 + : data_type === DataType.ORIGINALTX + ? item.originalTxsData ?? item.originalTxs ?? 0 + : 0 + distributorMap.set(item.cycle, count) + } + + for (const item of api_responses) { + const count = + data_type === DataType.RECEIPT + ? item.receipts ?? 0 + : data_type === DataType.ORIGINALTX + ? item.originalTxsData ?? item.originalTxs ?? 
0 + : 0 + apiMap.set(item.cycle, count) } - console.log('DISTRIBUTOR RESPONSES', distributor_responses.length, 'API SERVER RESPONSES', api_responses.length) - console.log(isDeepStrictEqual(distributor_responses, api_responses)) - // console.dir(distributor_responses, { depth: null }) - // console.dir(api_responses, { depth: null }) - // save to file + + // Find all unique cycles + const allCycles = new Set([...distributorMap.keys(), ...apiMap.keys()]) + + for (const cycle of allCycles) { + const distributorCount = distributorMap.get(cycle) || 0 + const apiCount = apiMap.get(cycle) || 0 + + if (distributorCount !== apiCount) { + mismatches.push({ + cycle, + distributorCount, + collectorCount: apiCount, + }) + } + } + + // Sort mismatches by cycle + mismatches.sort((a, b) => a.cycle - b.cycle) + } + + // Print mismatches + if (mismatches.length > 0) { + console.log(`\n${'='.repeat(70)}`) + console.log(`Found ${mismatches.length} mismatched cycles:`) + console.log(`${'='.repeat(70)}`) + console.log( + `${'Cycle'.padEnd(10)} | ${'Distributor'.padEnd(15)} | ${'Collector'.padEnd(15)} | ${'Difference'}` + ) + console.log(`${'-'.repeat(70)}`) + + for (const mismatch of mismatches) { + const diff = mismatch.collectorCount - mismatch.distributorCount + console.log( + `${String(mismatch.cycle).padEnd(10)} | ${String(mismatch.distributorCount).padEnd(15)} | ${String( + mismatch.collectorCount + ).padEnd(15)} | ${diff > 0 ? '+' : ''}${diff}` + ) + } + console.log(`${'='.repeat(70)}\n`) + } else { + console.log('\n✅ No mismatches found! All cycles match.') + } + + // Deep comparison for cycles + if (data_type === DataType.CYCLE) { + const isEqual = isDeepStrictEqual(distributor_responses, api_responses) + console.log('\nDeep comparison result:', isEqual ? 
'✅ MATCH' : '❌ MISMATCH') + } + + // Save to file if (saveToFile) { writeFileSync( `distributor_${data_type}_${startCycle}_${endCycle}.json`, @@ -69,6 +217,7 @@ const runProgram = async (): Promise => { `api_server_${data_type}_${startCycle}_${endCycle}.json`, JSON.stringify(api_responses, null, 4) ) + console.log('\n📁 Results saved to files') } } runProgram() diff --git a/scripts/distributor_tally_verifier.ts b/scripts/distributor_tally_verifier.ts new file mode 100644 index 0000000..be0f2a5 --- /dev/null +++ b/scripts/distributor_tally_verifier.ts @@ -0,0 +1,470 @@ +import axios from 'axios' +import * as crypto from '@shardus/crypto-utils' +import { config, DISTRIBUTOR_URL } from '../src/config' +import { queryFromDistributor, DataType } from '../src/class/DataSync' +crypto.init(config.hashKey) + +const startCycle = 0 +const endCycle = 0 + +// Choose data type to verify +const data_type: DataType = DataType.RECEIPT // DataType.RECEIPT or DataType.ORIGINALTX + +// Choose comparison mode: +// 'tally' - Compare tally endpoint vs cycle-based pagination +// 'full' - Compare full data endpoint vs cycle-based pagination +const comparisonMode: 'tally' | 'full' = 'tally' + +interface TallyItem { + cycle: number + receipts?: number + originalTxsData?: number + originalTxs?: number +} + +interface MismatchResult { + cycle: number + tallyCount: number + actualCount: number +} + +interface TransactionIdDetails { + cycle: number + fullDataIds: string[] + cycleBasedIds: string[] +} + +/** + * Fetch tally counts from distributor (aggregated counts per cycle) + */ +const fetchTallyCounts = async ( + cycleStart: number, + cycleEnd: number +): Promise> => { + const tallyMap = new Map() + + const response = await queryFromDistributor(data_type, { + startCycle: cycleStart, + endCycle: cycleEnd, + type: 'tally', + }) + + if (!response?.data) { + console.warn(`No tally data returned for cycles ${cycleStart}-${cycleEnd}`) + return tallyMap + } + + const tallyData: TallyItem[] = + 
data_type === DataType.RECEIPT ? response.data.receipts || [] : response.data.originalTxs || [] + + for (const item of tallyData) { + const count = + data_type === DataType.RECEIPT + ? item.receipts ?? 0 + : item.originalTxsData ?? item.originalTxs ?? 0 + tallyMap.set(item.cycle, count) + } + + return tallyMap +} + +/** + * Fetch full data from distributor without tally (fetches actual records and counts them) + * Uses pagination to fetch all data across multiple pages + */ +const fetchFullDataCounts = async ( + cycleStart: number, + cycleEnd: number +): Promise<{ counts: Map; ids: Map }> => { + const countsMap = new Map() + const idsMap = new Map() + + let page = 1 + let hasMorePages = true + const maxLimit = config.requestLimits.MAX_RECEIPTS_PER_REQUEST + + while (hasMorePages) { + const response = await queryFromDistributor(data_type, { + startCycle: cycleStart, + endCycle: cycleEnd, + page: page, + // No 'type: tally' - fetch actual data + }) + + if (!response?.data) { + console.warn(`No data returned for cycles ${cycleStart}-${cycleEnd} page ${page}`) + break + } + + const items = + data_type === DataType.RECEIPT ? response.data.receipts || [] : response.data.originalTxs || [] + + if (items.length === 0) { + break // No more data + } + + // Count items per cycle and collect IDs + for (const item of items) { + const cycle = item.cycle + const txId = data_type === DataType.RECEIPT ? 
item.receiptId : item.txId + + countsMap.set(cycle, (countsMap.get(cycle) || 0) + 1) + + if (!idsMap.has(cycle)) { + idsMap.set(cycle, []) + } + idsMap.get(cycle)!.push(txId) + } + + console.log( + `Fetched page ${page} for cycles ${cycleStart}-${cycleEnd}: ${items.length} items, total cycles tracked: ${countsMap.size}` + ) + + // Check if we need to fetch more pages + if (items.length < maxLimit) { + hasMorePages = false + } else { + page++ + } + } + + return { counts: countsMap, ids: idsMap } +} + +/** + * Fetch actual data from distributor using cycle-based pagination + * (Same method used in ParallelDataSync) + */ +const fetchActualDataCounts = async ( + cycleStart: number, + cycleEnd: number +): Promise<{ counts: Map; ids: Map }> => { + const actualCountsMap = new Map() + const actualIdsMap = new Map() + + let currentCycle = cycleStart + let afterTimestamp = 0 + let afterTxId = '' + const limit = config.requestLimits.MAX_RECEIPTS_PER_REQUEST + + const url = + data_type === DataType.RECEIPT + ? `${DISTRIBUTOR_URL}/receipt/cycle` + : `${DISTRIBUTOR_URL}/originalTx/cycle` + + while (currentCycle <= cycleEnd) { + const requestData = { + startCycle: currentCycle, + endCycle: cycleEnd, + afterTimestamp, + afterTxId, + limit, + sender: config.collectorInfo.publicKey, + sign: undefined, + } + + crypto.signObj(requestData, config.collectorInfo.secretKey, config.collectorInfo.publicKey) + + const response = await axios.post(url, requestData) + + const items = + data_type === DataType.RECEIPT + ? response.data?.receipts || [] + : response.data?.originalTxs || [] + + if (items.length === 0) { + break // No more data + } + + // Count items per cycle and collect IDs + for (const item of items) { + const cycle = item.cycle + const txId = data_type === DataType.RECEIPT ? 
item.receiptId : item.txId + + actualCountsMap.set(cycle, (actualCountsMap.get(cycle) || 0) + 1) + + if (!actualIdsMap.has(cycle)) { + actualIdsMap.set(cycle, []) + } + actualIdsMap.get(cycle)!.push(txId) + } + + // Update pagination cursors + const lastItem = items[items.length - 1] + currentCycle = lastItem.cycle + afterTimestamp = lastItem.timestamp + afterTxId = data_type === DataType.RECEIPT ? lastItem.receiptId : lastItem.txId + + console.log( + `Fetched ${items.length} items, last in cycle ${currentCycle}, total cycles tracked: ${actualCountsMap.size}` + ) + + // If we got less than limit, we've exhausted the range + if (items.length < limit) { + break + } + } + + return { counts: actualCountsMap, ids: actualIdsMap } +} + +const chunkArray = (array: T[], chunkSize: number): T[][] => { + const chunks: T[][] = [] + for (let i = 0; i < array.length; i += chunkSize) { + chunks.push(array.slice(i, i + chunkSize)) + } + return chunks +} + +const runProgram = async (): Promise => { + const limit = 100 + const concurrency = 10 + + const batches: Array<{ start: number; end: number }> = [] + + // Create batches without overlapping boundaries + let currentStart = startCycle + while (currentStart <= endCycle) { + const batchEnd = Math.min(currentStart + limit - 1, endCycle) + batches.push({ start: currentStart, end: batchEnd }) + currentStart = batchEnd + 1 + } + + const dataTypeName = data_type === DataType.RECEIPT ? 'Receipts' : 'OriginalTxs' + const modeName = + comparisonMode === 'tally' + ? 
'Tally vs Cycle-Based Pagination' + : 'Full Data vs Cycle-Based Pagination' + + console.log(`\n${'='.repeat(70)}`) + console.log(`Distributor Verifier - ${dataTypeName}`) + console.log(`${'='.repeat(70)}`) + console.log(`Comparison Mode: ${modeName}`) + console.log(`Cycle Range: ${startCycle} to ${endCycle}`) + console.log(`Batches: ${batches.length}`) + console.log(`Concurrency: ${concurrency}`) + console.log(`${'='.repeat(70)}\n`) + + const batchChunks = chunkArray(batches, concurrency) + + // Step 1: Fetch first dataset (tally or full data) + const firstDataLabel = comparisonMode === 'tally' ? 'tally' : 'full data' + console.log(`Fetching ${firstDataLabel} counts from distributor...`) + + const firstDataCountsMap = new Map() + const firstDataIdsMap = new Map() + + if (comparisonMode === 'tally') { + // Tally mode: only fetch counts (no IDs available) + const tallyMaps: Map[] = [] + for (const chunk of batchChunks) { + const chunkResults = await Promise.all( + chunk.map((batch) => { + console.log(`Fetching ${firstDataLabel} for cycles ${batch.start} to ${batch.end}`) + return fetchTallyCounts(batch.start, batch.end) + }) + ) + tallyMaps.push(...chunkResults) + } + // Merge tally counts + for (const map of tallyMaps) { + for (const [cycle, count] of map.entries()) { + firstDataCountsMap.set(cycle, (firstDataCountsMap.get(cycle) || 0) + count) + } + } + } else { + // Full data mode: fetch counts and IDs + const fullDataResults: Array<{ counts: Map; ids: Map }> = [] + for (const chunk of batchChunks) { + const chunkResults = await Promise.all( + chunk.map((batch) => { + console.log(`Fetching ${firstDataLabel} for cycles ${batch.start} to ${batch.end}`) + return fetchFullDataCounts(batch.start, batch.end) + }) + ) + fullDataResults.push(...chunkResults) + } + // Merge full data counts and IDs + for (const result of fullDataResults) { + for (const [cycle, count] of result.counts.entries()) { + firstDataCountsMap.set(cycle, (firstDataCountsMap.get(cycle) || 0) + count) 
+ } + for (const [cycle, ids] of result.ids.entries()) { + if (!firstDataIdsMap.has(cycle)) { + firstDataIdsMap.set(cycle, []) + } + firstDataIdsMap.get(cycle)!.push(...ids) + } + } + } + + console.log(`\n${firstDataLabel} counts fetched: ${firstDataCountsMap.size} cycles\n`) + + // Step 2: Fetch cycle-based pagination data (always with IDs) + console.log('Fetching data using cycle-based pagination...') + const cycleBasedResults: Array<{ counts: Map; ids: Map }> = [] + + for (const chunk of batchChunks) { + const chunkResults = await Promise.all( + chunk.map((batch) => { + console.log(`Fetching cycle-based data for cycles ${batch.start} to ${batch.end}`) + return fetchActualDataCounts(batch.start, batch.end) + }) + ) + cycleBasedResults.push(...chunkResults) + } + + // Merge cycle-based counts and IDs + const cycleBasedCountsMap = new Map() + const cycleBasedIdsMap = new Map() + + for (const result of cycleBasedResults) { + for (const [cycle, count] of result.counts.entries()) { + cycleBasedCountsMap.set(cycle, (cycleBasedCountsMap.get(cycle) || 0) + count) + } + for (const [cycle, ids] of result.ids.entries()) { + if (!cycleBasedIdsMap.has(cycle)) { + cycleBasedIdsMap.set(cycle, []) + } + cycleBasedIdsMap.get(cycle)!.push(...ids) + } + } + + console.log(`\nCycle-based pagination counts fetched: ${cycleBasedCountsMap.size} cycles\n`) + + // Compare first dataset vs cycle-based counts + const mismatches: MismatchResult[] = [] + const allCycles = new Set([...firstDataCountsMap.keys(), ...cycleBasedCountsMap.keys()]) + + for (const cycle of allCycles) { + const firstDataCount = firstDataCountsMap.get(cycle) || 0 + const cycleBasedCount = cycleBasedCountsMap.get(cycle) || 0 + + if (firstDataCount !== cycleBasedCount) { + mismatches.push({ + cycle, + tallyCount: firstDataCount, + actualCount: cycleBasedCount, + }) + } + } + + // Sort mismatches by cycle + mismatches.sort((a, b) => a.cycle - b.cycle) + + // Print results + const firstColumnLabel = comparisonMode === 
'tally' ? 'Tally Count' : 'Full Data Count' + const secondColumnLabel = 'Cycle-Based Count' + + console.log(`\n${'='.repeat(70)}`) + console.log(`Verification Results - ${dataTypeName}`) + console.log(`${'='.repeat(70)}`) + console.log(`Comparison Mode: ${modeName}`) + console.log(`Total cycles checked: ${allCycles.size}`) + console.log(`Cycles with ${firstDataLabel} data: ${firstDataCountsMap.size}`) + console.log(`Cycles with cycle-based data: ${cycleBasedCountsMap.size}`) + console.log(`Mismatches found: ${mismatches.length}`) + console.log(`${'='.repeat(70)}\n`) + + if (mismatches.length > 0) { + console.log(`\n${'='.repeat(70)}`) + console.log(`Mismatched Cycles:`) + console.log(`${'='.repeat(70)}`) + console.log( + `${'Cycle'.padEnd(10)} | ${firstColumnLabel.padEnd(18)} | ${secondColumnLabel.padEnd(18)} | ${'Difference'}` + ) + console.log(`${'-'.repeat(70)}`) + + for (const mismatch of mismatches) { + const diff = mismatch.actualCount - mismatch.tallyCount + console.log( + `${String(mismatch.cycle).padEnd(10)} | ${String(mismatch.tallyCount).padEnd(18)} | ${String( + mismatch.actualCount + ).padEnd(18)} | ${diff > 0 ? '+' : ''}${diff}` + ) + } + console.log(`${'='.repeat(70)}\n`) + } else { + console.log(`✅ All cycles match! 
${firstDataLabel} and cycle-based data are consistent.\n`) + } + + // Calculate total counts + let totalFirstData = 0 + let totalCycleBased = 0 + for (const count of firstDataCountsMap.values()) { + totalFirstData += count + } + for (const count of cycleBasedCountsMap.values()) { + totalCycleBased += count + } + + console.log(`Total ${dataTypeName} from ${firstDataLabel}: ${totalFirstData}`) + console.log(`Total ${dataTypeName} from cycle-based: ${totalCycleBased}`) + console.log(`Difference: ${totalCycleBased - totalFirstData}`) + + // Display transaction IDs for mismatched cycles (if available in full mode) + if (mismatches.length > 0 && firstDataIdsMap.size > 0) { + console.log(`\n${'='.repeat(70)}`) + console.log(`Transaction IDs for Mismatched Cycles:`) + console.log(`${'='.repeat(70)}\n`) + + for (const mismatch of mismatches.slice(0, 10)) { + // Show first 10 mismatches + console.log(`Cycle ${mismatch.cycle}:`) + + const fullDataIds = firstDataIdsMap.get(mismatch.cycle) || [] + const cycleBasedIds = cycleBasedIdsMap.get(mismatch.cycle) || [] + + console.log(` Full Data IDs (${fullDataIds.length}):`) + if (fullDataIds.length > 0) { + fullDataIds.slice(0, 5).forEach((id) => console.log(` - ${id}`)) + if (fullDataIds.length > 5) { + console.log(` ... and ${fullDataIds.length - 5} more`) + } + } else { + console.log(` (none)`) + } + + console.log(` Cycle-Based IDs (${cycleBasedIds.length}):`) + if (cycleBasedIds.length > 0) { + cycleBasedIds.slice(0, 5).forEach((id) => console.log(` - ${id}`)) + if (cycleBasedIds.length > 5) { + console.log(` ... 
and ${cycleBasedIds.length - 5} more`) + } + } else { + console.log(` (none)`) + } + + // Find IDs that are in one set but not the other + const fullDataSet = new Set(fullDataIds) + const cycleBasedSet = new Set(cycleBasedIds) + + const onlyInFullData = fullDataIds.filter((id) => !cycleBasedSet.has(id)) + const onlyInCycleBased = cycleBasedIds.filter((id) => !fullDataSet.has(id)) + + if (onlyInFullData.length > 0) { + console.log(` Only in Full Data (${onlyInFullData.length}):`) + onlyInFullData.slice(0, 3).forEach((id) => console.log(` - ${id}`)) + if (onlyInFullData.length > 3) { + console.log(` ... and ${onlyInFullData.length - 3} more`) + } + } + + if (onlyInCycleBased.length > 0) { + console.log(` Only in Cycle-Based (${onlyInCycleBased.length}):`) + onlyInCycleBased.slice(0, 3).forEach((id) => console.log(` - ${id}`)) + if (onlyInCycleBased.length > 3) { + console.log(` ... and ${onlyInCycleBased.length - 3} more`) + } + } + + console.log() + } + + if (mismatches.length > 10) { + console.log(`... 
and ${mismatches.length - 10} more mismatched cycles\n`) + } + } +} + +runProgram() From 797a53eedd8ef9764e43270af5b2e7a29510ede0 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Wed, 12 Nov 2025 18:39:28 +0800 Subject: [PATCH 10/14] Add deserialization timing tracking and improve error handling in parallel sync - Track deserialization time for API responses and log when > 50ms - Add explicit deserialization timing for receipts and originalTxs processing - Improve parallel sync error handling with Promise.allSettled for better failure reporting - Fix cycle count calculation (add +1 for inclusive range) - Rename syncCyclesByCycleRange to syncCycleRecordsByCycleRange for clarity - Refactor database timing functions and move to bottom of file - Export deserializeDbReceipt and deserializeDbOriginalTxData functions - Optimize deserialization calls using forEach instead of for loops --- src/class/ParallelDataSync.ts | 137 +++++++++++++++++++++++++++------- src/storage/originalTxData.ts | 12 +-- src/storage/receipt.ts | 2 +- src/storage/sqlite3storage.ts | 122 +++++++++++++++--------------- 4 files changed, 176 insertions(+), 97 deletions(-) diff --git a/src/class/ParallelDataSync.ts b/src/class/ParallelDataSync.ts index e7fe05a..7825818 100644 --- a/src/class/ParallelDataSync.ts +++ b/src/class/ParallelDataSync.ts @@ -50,6 +50,7 @@ interface ResponseSizeMetadata { interface ResponseDataWithMetadata { __responseSize?: ResponseSizeMetadata __networkElapsed?: number + _deserializedTime?: number [key: string]: unknown } @@ -133,7 +134,7 @@ export class ParallelDataSync { // Use custom parse for response with timing const startTime = Date.now() const result = typeof res === 'string' ? StringUtils.safeJsonParse(res) : res - const elapsed = Date.now() - startTime + const deserializedTime = Date.now() - startTime // Calculate decompressed size from raw response string const decompressedBytes = typeof res === 'string' ? 
Buffer.byteLength(res) : 0 @@ -149,10 +150,12 @@ export class ParallelDataSync { enumerable: false, // Hidden from JSON.stringify and iteration configurable: true, }) + // Attach deserialization time + ;(result as ResponseDataWithMetadata)._deserializedTime = deserializedTime } - if (config.verbose && elapsed > 50) { - console.log(`[Client] Response parse: ${elapsed}ms, size: ${sizeKB}KB`) + if (config.verbose && deserializedTime > 50) { + console.log(`[Client] Response deserialization: ${deserializedTime}ms, size: ${sizeKB}KB`) } return result }, @@ -298,7 +301,7 @@ export class ParallelDataSync { console.log(`${'='.repeat(60)}\n`) this.stats.startTime = Date.now() - this.stats.totalCyclesToSync = endCycle - startCycle + this.stats.totalCyclesToSync = endCycle - startCycle + 1 try { console.log( @@ -310,12 +313,45 @@ export class ParallelDataSync { this.queue.add(() => this.syncDataByCycleRange(batch.startCycle, batch.endCycle)) ) - // Wait for all tasks to complete - await Promise.all(tasks) + console.log(`Waiting for ${tasks.length} tasks to complete...`) + + // Wait for all tasks to complete (even if some fail) + const results = await Promise.allSettled(tasks) + + console.log('All tasks completed, setting end time...') this.stats.endTime = Date.now() + // Count successful and failed tasks + const successful = results.filter((r) => r.status === 'fulfilled').length + const failed = results.filter((r) => r.status === 'rejected').length + + console.log(`Tasks completed: ${successful} successful, ${failed} failed`) + + // Log failed task errors + if (failed > 0) { + console.error(`\n${failed} tasks failed with errors:`) + results.forEach((result, index) => { + if (result.status === 'rejected') { + const batch = cycleBatches[index] + console.error( + ` Batch ${index} (cycles ${batch.startCycle}-${batch.endCycle}): ${ + result.reason?.message || result.reason + }` + ) + } + }) + } + + console.log('Printing summary...') // Summary await this.printSummary(startCycle, 
endCycle) + + console.log('Summary printed successfully') + + // Throw if there were any failures so the caller knows sync was incomplete + if (failed > 0) { + throw new Error(`Parallel sync completed with ${failed} failed batches out of ${tasks.length} total`) + } } catch (error) { console.error('Fatal error in parallel sync:', error) this.stats.errors++ @@ -328,31 +364,52 @@ export class ParallelDataSync { * Adaptively handles partial cycle completion (e.g., if requesting cycles 1-10 but only get data from 1-5, then sends next request for 5-10) */ private async syncDataByCycleRange(startCycle: number, endCycle: number): Promise { - try { - // Sync all data types in parallel - await Promise.all([ - this.syncCyclesByCycleRange(startCycle, endCycle), - this.syncReceiptsByCycleRange(startCycle, endCycle), - this.syncOriginalTxsByCycleRange(startCycle, endCycle), - ]) - - this.stats.completedCycles += endCycle - startCycle + 1 + // Sync all data types in parallel with individual error tracking + const results = await Promise.allSettled([ + this.syncCycleRecordsByCycleRange(startCycle, endCycle), + this.syncReceiptsByCycleRange(startCycle, endCycle), + this.syncOriginalTxsByCycleRange(startCycle, endCycle), + ]) + + const dataTypes = ['Cycle Records', 'Receipts', 'OriginalTxs'] + const failedTypes: string[] = [] + const errors: unknown[] = [] + + results.forEach((result, index) => { + if (result.status === 'rejected') { + failedTypes.push(dataTypes[index]) + errors.push(result.reason) + } + }) - const progress = ((this.stats.completedCycles / this.stats.totalCyclesToSync) * 100).toFixed(1) - console.log( - `Progress: ${this.stats.completedCycles}/${this.stats.totalCyclesToSync} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` + if (failedTypes.length > 0) { + console.error( + `Error syncing cycle batch ${startCycle}-${endCycle}: Failed data types: ${failedTypes.join(', ')}` ) - } catch (error) { - console.error(`Error syncing cycle batch 
${startCycle}-${endCycle}:`, error) + errors.forEach((error, index) => { + const errorMessage = error instanceof Error ? error.message : String(error) + console.error(` ${failedTypes[index]}: ${errorMessage}`) + }) this.stats.errors++ - throw error + throw new Error( + `Failed to sync ${ + failedTypes.length + } data type(s) for batch ${startCycle}-${endCycle}: ${failedTypes.join(', ')}` + ) } + + this.stats.completedCycles += endCycle - startCycle + 1 + + const progress = ((this.stats.completedCycles / this.stats.totalCyclesToSync) * 100).toFixed(1) + console.log( + `Progress: ${this.stats.completedCycles}/${this.stats.totalCyclesToSync} cycles (${progress}%) [batch: ${startCycle}-${endCycle}]` + ) } /** - * Sync cycles across a batch of cycles using multi-cycle fetching + * Sync cycle records across a batch of cycles using multi-cycle fetching */ - private async syncCyclesByCycleRange(startCycle: number, endCycle: number): Promise { + private async syncCycleRecordsByCycleRange(startCycle: number, endCycle: number): Promise { try { const response = await this.fetchDataFromDistributor( DataType.CYCLE, @@ -370,11 +427,13 @@ export class ParallelDataSync { const compressionRatio = sizeMetadata?.compressionRatio const compressionSavings = sizeMetadata?.compressionSavings const networkElapsed = (response.data as ResponseDataWithMetadata)?.__networkElapsed || 0 + const deserializedTime = (response.data as ResponseDataWithMetadata)?._deserializedTime || 0 if (config.verbose || networkElapsed > 1000) { // Build log message with compression info if available let logMessage = - `[API Timing] Cycles fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `[API Timing] Cycle Records fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `deserialization: ${deserializedTime}ms, ` + `records: ${cycles.length}` // Only show compression metrics if compression actually reduced the size (ratio < 1) @@ -393,7 +452,7 @@ export class ParallelDataSync { } if 
(!response || !response.data || !response.data.cycleInfo) { - console.error(`Error fetching cycles for cycle batch ${startCycle}-${endCycle}:`, response) + console.error(`Error fetching cycle records for cycle batch ${startCycle}-${endCycle}:`, response) return // Couldn't fetch any cycles } @@ -414,10 +473,10 @@ export class ParallelDataSync { this.stats.totalCycles += cycleRecords.length if (config.verbose) { - console.log(`[Cycles ${startCycle}-${endCycle}] Cycles: +${response.length}`) + console.log(`[Cycles ${startCycle}-${endCycle}] Cycle Records: +${cycleRecords.length}`) } } catch (error) { - console.error(`Error fetching cycles for cycle batch ${startCycle}-${endCycle}:`, error) + console.error(`Error fetching cycle records for cycle batch ${startCycle}-${endCycle}:`, error) throw error } } @@ -477,11 +536,13 @@ export class ParallelDataSync { const compressionRatio = sizeMetadata?.compressionRatio const compressionSavings = sizeMetadata?.compressionSavings const networkElapsed = (response.data as ResponseDataWithMetadata)?.__networkElapsed || 0 + const deserializedTime = (response.data as ResponseDataWithMetadata)?._deserializedTime || 0 if (config.verbose || networkElapsed > 1000) { // Build log message with compression info if available let logMessage = `[API Timing] Receipts fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `deserialization: ${deserializedTime}ms, ` + `records: ${receipts.length}` // Only show compression metrics if compression actually reduced the size (ratio < 1) @@ -535,6 +596,16 @@ export class ParallelDataSync { nextFetchPromise = null } + const startTime = Date.now() + // Deserialize receipts + receipts.forEach((receipt) => { + ReceiptDB.deserializeDbReceipt(receipt) + }) + const elapsed = Date.now() - startTime + if (elapsed > 100) { + console.log(`Deserializing ${receipts.length} receipts took: ${elapsed}ms`) + } + // Process receipts (overlaps with next fetch if prefetch enabled) await 
ReceiptDB.processReceiptData(receipts) @@ -615,11 +686,13 @@ export class ParallelDataSync { const compressionRatio = sizeMetadata?.compressionRatio const compressionSavings = sizeMetadata?.compressionSavings const networkElapsed = (response.data as ResponseDataWithMetadata)?.__networkElapsed || 0 + const deserializedTime = (response.data as ResponseDataWithMetadata)?._deserializedTime || 0 if (config.verbose || networkElapsed > 1000) { // Build log message with compression info if available let logMessage = `[API Timing] OriginalTxs fetch (cycles ${startCycle}-${endCycle}): ${networkElapsed}ms, ` + + `deserialization: ${deserializedTime}ms, ` + `records: ${originalTxs.length}` // Only show compression metrics if compression actually reduced the size (ratio < 1) @@ -673,6 +746,16 @@ export class ParallelDataSync { nextFetchPromise = null } + const startTime = Date.now() + // Deserialize originalTxs + originalTxs.forEach((originalTx) => { + OriginalTxDataDB.deserializeDbOriginalTxData(originalTx) + }) + const elapsed = Date.now() - startTime + if (elapsed > 100) { + console.log(`Deserializing ${originalTxs.length} originalTxs took ${elapsed}ms`) + } + // Process originalTxs (overlaps with next fetch if prefetch enabled) await OriginalTxDataDB.processOriginalTxData(originalTxs) diff --git a/src/storage/originalTxData.ts b/src/storage/originalTxData.ts index d5bcd45..8134e23 100644 --- a/src/storage/originalTxData.ts +++ b/src/storage/originalTxData.ts @@ -196,9 +196,7 @@ export async function queryOriginalTxsData(query: QueryOriginalTxsDataParams): P sql += ` OFFSET ${skip}` } originalTxsData = (await db.all(originalTxDataDatabase, sql, values)) as DbOriginalTxData[] - for (const originalTxData of originalTxsData) { - originalTxData.originalTxData = StringUtils.safeJsonParse(originalTxData.originalTxData) - } + originalTxsData.forEach((originalTxData: DbOriginalTxData) => deserializeDbOriginalTxData(originalTxData)) } catch (e) { console.log(e) } @@ -210,9 +208,7 @@ 
export async function queryOriginalTxDataByTxId(txId: string): Promise() const queuedBySql = new Map() -function formatSqlForLog(sql: string): string { - const normalized = sql.replace(/\s+/g, ' ').trim() - if (normalized.length <= SQL_LOG_MAX_LENGTH) return normalized - return `${normalized.slice(0, SQL_LOG_MAX_LENGTH - 3)}...` -} - -function registerQuery(sql: string): QueryTiming { - const entry: QueryTiming = { - id: ++queryIdSequence, - sql, - startMs: Date.now(), - } - pendingQueries.set(entry.id, entry) - let queue = queuedBySql.get(sql) - if (!queue) { - queue = [] - queuedBySql.set(sql, queue) - } - queue.push(entry.id) - return entry -} - -function cleanupQuery(entry: QueryTiming): void { - pendingQueries.delete(entry.id) - const queue = queuedBySql.get(entry.sql) - if (!queue) return - const index = queue.indexOf(entry.id) - if (index !== -1) queue.splice(index, 1) - if (queue.length === 0) queuedBySql.delete(entry.sql) -} - -function logTiming(operation: string, entry: QueryTiming, rows?: number): void { - const totalMs = Date.now() - entry.startMs - const engineMs = entry.engineMs ?? 0 - const queueMs = Math.max(0, totalMs - engineMs) - const payload = { - operation, - totalMs: Number(totalMs.toFixed(2)), - queueMs: Number(queueMs.toFixed(2)), - engineMs: Number(engineMs.toFixed(2)), - sql: formatSqlForLog(entry.sql), - rows, - } - - if (totalMs > SQL_TOTAL_WARN_THRESHOLD_MS || queueMs > SQL_QUEUE_WARN_THRESHOLD_MS) { - console.warn('[DB Timing]', payload) - } else { - console.log('[DB Timing]', payload) - } -} - export const createDB = async (dbPath: string, dbName: string): Promise => { console.log('dbName', dbName, 'dbPath', dbPath) const db = new Database(dbPath, (err) => { @@ -89,8 +38,7 @@ export const createDB = async (dbPath: string, dbName: string): Promise 0 ? 
queue[0] : undefined if (id === undefined) { - console.warn('[DB Timing] profile event without pending query', { - pid: process.pid, + printQueryTimingLog('profile event without pending query', { engineMs, sql: formatSqlForLog(sql), }) @@ -98,8 +46,7 @@ export const createDB = async (dbPath: string, dbName: string): Promise SQL_ENGINE_WARN_THRESHOLD_MS) { - console.warn('[DB Engine] Slow engine execution detected', { - pid: process.pid, - engineMs: Number(engineMs.toFixed(2)), - sql: formatSqlForLog(sql), - }) + console.warn(`[DB Engine] Slow Query: ${engineMs} ms for SQL: ${formatSqlForLog(sql)}`) } }) console.log(`Database ${dbName} Initialized!`) @@ -150,8 +93,8 @@ export async function run( sql: string, params: unknown[] | object = [] ): Promise<{ id: number }> { + const entry = registerQuery(sql) return new Promise((resolve, reject) => { - const entry = registerQuery(sql) const finalize = (): void => { setImmediate(() => { logTiming('run', entry) @@ -173,8 +116,8 @@ export async function run( } export async function get(db: Database, sql: string, params = []): Promise { + const entry = registerQuery(sql) return new Promise((resolve, reject) => { - const entry = registerQuery(sql) const finalize = (rows?: number): void => { setImmediate(() => { logTiming('get', entry, rows) @@ -196,8 +139,8 @@ export async function get(db: Database, sql: string, params = []): Promise } export async function all(db: Database, sql: string, params = []): Promise { + const entry = registerQuery(sql) return new Promise((resolve, reject) => { - const entry = registerQuery(sql) const finalize = (rowsCount?: number): void => { setImmediate(() => { logTiming('all', entry, rowsCount) @@ -254,3 +197,56 @@ export function updateSqlStatementClause(sql: string, inputs: any[]): string { else sql += ' WHERE ' return sql } + +function registerQuery(sql: string): QueryTiming { + const entry: QueryTiming = { + id: ++queryIdSequence, + sql, + startMs: Date.now(), + } + pendingQueries.set(entry.id, 
entry) + let queue = queuedBySql.get(sql) + if (!queue) { + queue = [] + queuedBySql.set(sql, queue) + } + queue.push(entry.id) + return entry +} + +function cleanupQuery(entry: QueryTiming): void { + pendingQueries.delete(entry.id) + const queue = queuedBySql.get(entry.sql) + if (!queue) return + const index = queue.indexOf(entry.id) + if (index !== -1) queue.splice(index, 1) + if (queue.length === 0) queuedBySql.delete(entry.sql) +} + +function printQueryTimingLog(message: string, payload: object): void { + console.warn(`[DB Timing] ${message}`, JSON.stringify(payload)) +} + +function logTiming(operation: string, entry: QueryTiming, rows?: number): void { + const totalMs = Date.now() - entry.startMs + const engineMs = entry.engineMs ?? 0 + const queueMs = Math.max(0, totalMs - engineMs) + const payload = { + operation, + totalMs: Number(totalMs.toFixed(2)), + queueMs: Number(queueMs.toFixed(2)), + engineMs: Number(engineMs.toFixed(2)), + sql: formatSqlForLog(entry.sql), + rows, + } + + if (totalMs > SQL_TOTAL_WARN_THRESHOLD_MS || queueMs > SQL_QUEUE_WARN_THRESHOLD_MS) { + printQueryTimingLog('', payload) + } +} + +function formatSqlForLog(sql: string): string { + const normalized = sql.replace(/\s+/g, ' ').trim() + if (normalized.length <= SQL_LOG_MAX_LENGTH) return normalized + return `${normalized.slice(0, SQL_LOG_MAX_LENGTH - 3)}...` +} From 1e218211b0021ace5549f99bc27c5aba10e5dd20 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Fri, 14 Nov 2025 00:27:44 +0800 Subject: [PATCH 11/14] feat: implement buffered database writes with serialized queue for parallel sync - Add accumulation buffers (1000 record threshold) to batch DB writes and reduce contention - Implement serialized write queue with transaction support to prevent concurrent write conflicts - Optimize receipt processing by pre-fetching existing IDs to avoid N+1 query problem - Increase retry attempts to 5 with exponential backoff for better collector recovery - Add mutex locks to prevent race conditions 
during buffer flushes - Configure WAL checkpoint frequency and database pragmas for high-throughput operations --- src/class/DataSyncManager.ts | 10 +- src/class/ParallelDataSync.ts | 327 +++++++++++++++++++++++++++-- src/config/index.ts | 2 +- src/storage/account.ts | 3 +- src/storage/accountHistoryState.ts | 3 +- src/storage/cycle.ts | 4 +- src/storage/originalTxData.ts | 3 +- src/storage/receipt.ts | 67 ++++-- src/storage/sqlite3storage.ts | 149 ++++++++++++- src/storage/transaction.ts | 7 +- 10 files changed, 519 insertions(+), 56 deletions(-) diff --git a/src/class/DataSyncManager.ts b/src/class/DataSyncManager.ts index eea0504..565c534 100644 --- a/src/class/DataSyncManager.ts +++ b/src/class/DataSyncManager.ts @@ -104,8 +104,9 @@ export class DataSyncManager { const parallelDataSync = new ParallelDataSync({ concurrency: config.parallelSyncConcurrency, - retryAttempts: 3, - retryDelayMs: 1000, + cyclesPerBatch: config.cyclesPerBatch, + retryAttempts: config.syncRetryAttempts, + enablePrefetch: config.enablePrefetch, }) const cycleBatches = await parallelDataSync.createCycleBatches(0, latestDistributorCycle) @@ -623,8 +624,9 @@ export class DataSyncManager { const parallelDataSync = new ParallelDataSync({ concurrency: config.parallelSyncConcurrency, - retryAttempts: 3, - retryDelayMs: 1000, + cyclesPerBatch: config.cyclesPerBatch, + retryAttempts: config.syncRetryAttempts, + enablePrefetch: config.enablePrefetch, }) const cycleBatches = [] diff --git a/src/class/ParallelDataSync.ts b/src/class/ParallelDataSync.ts index 7825818..ae812ff 100644 --- a/src/class/ParallelDataSync.ts +++ b/src/class/ParallelDataSync.ts @@ -3,11 +3,22 @@ import * as crypto from '@shardus/crypto-utils' import { P2P, Utils as StringUtils } from '@shardus/types' import { config, DISTRIBUTOR_URL } from '../config' import { DataType } from './DataSync' -import { CycleDB, ReceiptDB, OriginalTxDataDB } from '../storage' -import { Cycle } from '../types' +import { + CycleDB, + ReceiptDB, + 
OriginalTxDataDB, + // receiptDatabase, + // originalTxDataDatabase, + // cycleDatabase, +} from '../storage' +import { Cycle, Receipt, OriginalTxData } from '../types' import axios, { AxiosInstance } from 'axios' import http from 'http' import https from 'https' +// import { checkpointWAL } from '../storage/sqlite3storage' + +// For Debugging Purpose - Set to false to skip processing data and saving to DB +const processData = true /** * Configuration for parallel sync @@ -81,12 +92,39 @@ export class ParallelDataSync { private httpsAgent: https.Agent private axiosInstance: AxiosInstance + // Accumulation buffers for batching DB writes - only write when threshold is reached + private receiptBuffer: Receipt[] = [] + private originalTxBuffer: OriginalTxData[] = [] + private cycleBuffer: Cycle[] = [] + private readonly ACCUMULATION_THRESHOLD = 1000 // Write to DB when buffer reaches this size + + // Mutex locks to prevent concurrent buffer access (race conditions) + private receiptBufferLock = false + private originalTxBufferLock = false + private cycleBufferLock = false + + // // WAL checkpoint tracking + // private flushCount = 0 // Total number of buffer flushes + // private readonly CHECKPOINT_FREQUENCY = 10 // Run WAL checkpoint every N flushes to prevent WAL from growing too large + + // // Flush pending flag to prevent multiple workers from waiting to flush + // private receiptFlushPending = false + + // // Adaptive flush delay system - adds delays before DB writes to prevent overload + // private flushTimestamps: number[] = [] // Timestamps of recent flushes + // private readonly FLUSH_WINDOW_MS = 10000 // Track flushes in last 10 seconds + // private readonly FAST_FLUSH_THRESHOLD = 5 // If 5+ flushes in window, system is overloaded + // private minFlushDelay = 200 // Min delay before flush (ms) + // private maxFlushDelay = 1000 // Max delay before flush (ms) + // private readonly OVERLOAD_MIN_DELAY = 3000 // When overloaded, min delay increases to 3s + // 
private readonly OVERLOAD_MAX_DELAY = 5000 // When overloaded, max delay increases to 5s + constructor(syncConfig?: Partial) { this.syncConfig = { concurrency: syncConfig?.concurrency || config.parallelSyncConcurrency || 10, - retryAttempts: syncConfig?.retryAttempts || config.syncRetryAttempts || 3, + cyclesPerBatch: syncConfig?.cyclesPerBatch || config.cyclesPerBatch || 100, + retryAttempts: syncConfig?.retryAttempts || config.syncRetryAttempts || 5, retryDelayMs: syncConfig?.retryDelayMs || 1000, - cyclesPerBatch: syncConfig?.cyclesPerBatch || config.cyclesPerBatch || 10, enablePrefetch: syncConfig?.enablePrefetch ?? config.enablePrefetch ?? true, prefetchDepth: syncConfig?.prefetchDepth || 1, } @@ -308,7 +346,11 @@ export class ParallelDataSync { `Syncing ${cycleBatches.length} cycle batches created with ${this.syncConfig.cyclesPerBatch} cycles per batch` ) - // Add all batch sync tasks to the queue + // Three-phase approach for optimal performance: + // Phase 1: Use main queue (concurrency: 5) for parallel API fetching + // Phase 2: Buffer data in memory until ACCUMULATION_THRESHOLD (1000) is reached + // Phase 3: DB writes are batched and serialized via storage-level queue + // This combines parallel I/O with batched, serialized DB writes to minimize contention const tasks = cycleBatches.map((batch) => this.queue.add(() => this.syncDataByCycleRange(batch.startCycle, batch.endCycle)) ) @@ -318,7 +360,11 @@ export class ParallelDataSync { // Wait for all tasks to complete (even if some fail) const results = await Promise.allSettled(tasks) - console.log('All tasks completed, setting end time...') + console.log('All tasks completed, flushing remaining buffers...') + + // Flush any remaining buffered data to database + await this.flushAllBuffers() + this.stats.endTime = Date.now() // Count successful and failed tasks @@ -355,6 +401,12 @@ export class ParallelDataSync { } catch (error) { console.error('Fatal error in parallel sync:', error) this.stats.errors++ + // 
Try to flush buffers even on error to preserve data + try { + await this.flushAllBuffers() + } catch (flushError) { + console.error('Error flushing buffers during error handling:', flushError) + } throw error } } @@ -466,8 +518,8 @@ export class ParallelDataSync { cycleMarker: cycleRecord.marker, })) - // Bulk insert cycles - await CycleDB.bulkInsertCycles(cycleRecords) + // Add cycles to buffer - will flush to DB when buffer reaches threshold + await this.addToBuffer('cycle', cycleRecords) // Update stats this.stats.totalCycles += cycleRecords.length @@ -606,8 +658,8 @@ export class ParallelDataSync { console.log(`Deserializing ${receipts.length} receipts took: ${elapsed}ms`) } - // Process receipts (overlaps with next fetch if prefetch enabled) - await ReceiptDB.processReceiptData(receipts) + // Add receipts to buffer - will flush to DB when buffer reaches threshold + await this.addToBuffer('receipt', receipts) totalFetched += receipts.length this.stats.totalReceipts += receipts.length @@ -756,8 +808,8 @@ export class ParallelDataSync { console.log(`Deserializing ${originalTxs.length} originalTxs took ${elapsed}ms`) } - // Process originalTxs (overlaps with next fetch if prefetch enabled) - await OriginalTxDataDB.processOriginalTxData(originalTxs) + // Add originalTxs to buffer - will flush to DB when buffer reaches threshold + await this.addToBuffer('originalTx', originalTxs) totalFetched += originalTxs.length this.stats.totalOriginalTxs += originalTxs.length @@ -804,25 +856,30 @@ export class ParallelDataSync { return response } catch (error: any) { const isLastAttempt = attempt === this.syncConfig.retryAttempts - const isRetryableError = - error.code === 'ECONNRESET' || - error.code === 'ETIMEDOUT' || - error.code === 'ECONNREFUSED' || - error.code === 'EPIPE' - if (isRetryableError && !isLastAttempt) { + // Retry ALL errors (network errors, socket hang up, timeouts, etc.) 
+ // This gives the collector time to recover when overloaded + if (!isLastAttempt) { + // Exponential backoff with longer delays to give collector time to recover const delay = this.syncConfig.retryDelayMs * Math.pow(2, attempt) + const errorCode = error.code || error.cause?.code || 'UNKNOWN' + const errorMsg = error.message || 'Unknown error' console.warn( - `ECONNRESET on ${route} fetch (cycles ${startCycle}-${endCycle}), ` + - `attempt ${attempt + 1}/${this.syncConfig.retryAttempts + 1}, ` + - `retrying in ${delay}ms...` + `Error (${errorCode}: ${errorMsg}) on ${route} fetch (cycles ${startCycle}-${endCycle}), ` + + `attempt ${attempt + 1}/${this.syncConfig.retryAttempts}, ` + + `retrying in ${delay}ms... (Giving collector time to process DB writes)` ) await this.sleep(delay) continue } - // Non-retryable error or last attempt failed - console.error(`Error fetching ${route} for (cycles ${startCycle}-${endCycle}):`, error.message) + // Last attempt failed - throw error + console.error( + `Error fetching ${route} for (cycles ${startCycle}-${endCycle}) after ${ + this.syncConfig.retryAttempts + 1 + } attempts:`, + error.message + ) throw error } } @@ -876,6 +933,230 @@ export class ParallelDataSync { console.log(`${'='.repeat(60)}\n`) } + /** + * Generic function to add data to buffer and flush if threshold reached + * Handles all buffer types (receipts, originalTxs, cycles) + */ + private async addToBuffer( + type: 'receipt' | 'originalTx' | 'cycle', + data: Receipt[] | OriginalTxData[] | Cycle[] + ): Promise { + if (type === 'receipt') { + // Wait for lock to be released (prevents concurrent modification during flush) + while (this.receiptBufferLock) { + await new Promise((resolve) => setTimeout(resolve, 10)) + } + + // Add data to buffer + this.receiptBuffer.push(...(data as Receipt[])) + + // Check if buffer reached threshold + if (this.receiptBuffer.length >= this.ACCUMULATION_THRESHOLD) { + await this.flushBuffer('receipt') + } + } else if (type === 
'originalTx') { + // Wait for lock to be released (prevents concurrent modification during flush) + while (this.originalTxBufferLock) { + await new Promise((resolve) => setTimeout(resolve, 10)) + } + + // Add data to buffer + this.originalTxBuffer.push(...(data as OriginalTxData[])) + + // Check if buffer reached threshold + if (this.originalTxBuffer.length >= this.ACCUMULATION_THRESHOLD) { + await this.flushBuffer('originalTx') + } + } else { + // Wait for lock to be released (prevents concurrent modification during flush) + while (this.cycleBufferLock) { + await new Promise((resolve) => setTimeout(resolve, 10)) + } + + // Add data to buffer + this.cycleBuffer.push(...(data as Cycle[])) + + // Check if buffer reached threshold + if (this.cycleBuffer.length >= this.ACCUMULATION_THRESHOLD) { + await this.flushBuffer('cycle') + } + } + } + + /** + * Generic function to flush buffer to database + * Handles all buffer types with adaptive delay and locking (adaptive cooling only for receipts) + */ + private async flushBuffer(type: 'receipt' | 'originalTx' | 'cycle'): Promise { + if (type === 'receipt') { + if (this.receiptBuffer.length === 0) return + + // // If another worker is already flushing, return immediately (it will flush our data too) + // if (this.receiptFlushPending) { + // return + // } + + // // Mark flush as pending + // this.receiptFlushPending = true + + // // Apply adaptive delay BEFORE acquiring lock to spread out DB writes (receipts only) + // const delay = this.getAdaptiveFlushDelay() + // if (delay > 0) { + // const recentFlushCount = this.flushTimestamps.length + // const delayRange = `${this.minFlushDelay}-${this.maxFlushDelay}ms` + // console.log( + // `[Adaptive Cooling] Receipts - Waiting ${delay}ms before flush ` + + // `(recent flushes: ${recentFlushCount}, range: ${delayRange})` + // ) + // await new Promise((resolve) => setTimeout(resolve, delay)) + // } + + // // If another worker is already locking, return immediately (it will flush our 
data too) + // if (this.receiptBufferLock) { + // return + // } + + this.receiptBufferLock = true + try { + const toFlush = [...this.receiptBuffer] + this.receiptBuffer = [] + console.log(`[Buffer Flush] Flushing ${toFlush.length} receipts to database`) + if (processData) await ReceiptDB.processReceiptData(toFlush, false, false) + + // // Track flush timestamp for adaptive delay system (receipts only) + // this.recordFlushTimestamp() + } finally { + this.receiptBufferLock = false + + // // Clear flush pending flag + // this.receiptFlushPending = false + } + } else if (type === 'originalTx') { + if (this.originalTxBuffer.length === 0) return + + // If another worker is already locking, return immediately (it will flush our data too) + if (this.originalTxBufferLock) { + return + } + + this.originalTxBufferLock = true + try { + const toFlush = [...this.originalTxBuffer] + this.originalTxBuffer = [] + console.log(`[Buffer Flush] Flushing ${toFlush.length} originaltxs to database`) + if (processData) await OriginalTxDataDB.processOriginalTxData(toFlush) + } finally { + this.originalTxBufferLock = false + } + } else { + if (this.cycleBuffer.length === 0) return + + // If another worker is already locking, return immediately (it will flush our data too) + if (this.cycleBufferLock) { + return + } + + this.cycleBufferLock = true + try { + const toFlush = [...this.cycleBuffer] + this.cycleBuffer = [] + console.log(`[Buffer Flush] Flushing ${toFlush.length} cycles to database`) + if (processData) await CycleDB.bulkInsertCycles(toFlush) + } finally { + this.cycleBufferLock = false + } + } + } + + /** + * Flush all buffers (call at end of sync) + */ + private async flushAllBuffers(): Promise { + await this.flushBuffer('receipt') + await this.flushBuffer('originalTx') + await this.flushBuffer('cycle') + } + + // /** + // * Conditionally checkpoint WAL files if enough flushes have occurred + // * This prevents WAL files from growing too large during long sync operations + // */ + 
// private async maybeCheckpointWAL(): Promise { + // if (this.flushCount % this.CHECKPOINT_FREQUENCY === 0) { + // console.log( + // `[WAL Checkpoint] Running periodic checkpoint after ${this.flushCount} buffer flushes (~${ + // this.flushCount * this.ACCUMULATION_THRESHOLD + // } records)` + // ) + // // Run checkpoints on all three databases in parallel + // // Use PASSIVE mode to avoid blocking readers + // await Promise.all([ + // checkpointWAL(receiptDatabase, 'PASSIVE'), + // checkpointWAL(originalTxDataDatabase, 'PASSIVE'), + // checkpointWAL(cycleDatabase, 'PASSIVE'), + // ]) + // } + // } + + // /** + // * Record flush timestamp and clean up old timestamps + // * Used to track flush frequency and detect system overload + // */ + // private recordFlushTimestamp(): void { + // const now = Date.now() + // this.flushTimestamps.push(now) + + // // Clean up old timestamps outside the tracking window + // this.flushTimestamps = this.flushTimestamps.filter((timestamp) => now - timestamp < this.FLUSH_WINDOW_MS) + // } + + // /** + // * Calculate adaptive flush delay based on recent flush frequency + // * Returns a random delay within a range that adapts to system load + // */ + // private getAdaptiveFlushDelay(): number { + // // Clean up old timestamps first + // const now = Date.now() + // this.flushTimestamps = this.flushTimestamps.filter((timestamp) => now - timestamp < this.FLUSH_WINDOW_MS) + + // // Check if system is overloaded (too many flushes in recent window) + // const recentFlushCount = this.flushTimestamps.length + // const isOverloaded = recentFlushCount >= this.FAST_FLUSH_THRESHOLD + // const wasOverloaded = this.minFlushDelay === this.OVERLOAD_MIN_DELAY + + // // Adjust delay range based on system load + // if (isOverloaded) { + // // System overloaded - use longer delays + // const wasNormal = this.minFlushDelay === 200 + // this.minFlushDelay = this.OVERLOAD_MIN_DELAY + // this.maxFlushDelay = this.OVERLOAD_MAX_DELAY + // if (wasNormal) { + // 
// Log when transitioning from normal to overloaded + // console.log( + // `[Adaptive Cooling] ⚠️ OVERLOAD DETECTED! ${recentFlushCount} flushes in last ${ + // this.FLUSH_WINDOW_MS / 1000 + // }s. ` + `Increasing cooling delay: ${this.minFlushDelay}-${this.maxFlushDelay}ms` + // ) + // } + // } else if (recentFlushCount < this.FAST_FLUSH_THRESHOLD / 2) { + // // System healthy - reduce delays back to normal + // if (wasOverloaded) { + // // Log when recovering from overload + // console.log( + // `[Adaptive Cooling] ✓ System recovered! ${recentFlushCount} flushes in last ${ + // this.FLUSH_WINDOW_MS / 1000 + // }s. ` + `Reducing cooling delay: 200-1000ms` + // ) + // } + // this.minFlushDelay = 200 + // this.maxFlushDelay = 1000 + // } + + // // Return random delay within current range to stagger DB writes + // const delay = this.minFlushDelay + Math.floor(Math.random() * (this.maxFlushDelay - this.minFlushDelay)) + // return delay + // } + /** * Get current statistics */ diff --git a/src/config/index.ts b/src/config/index.ts index 0e1eb50..fae05f3 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -193,7 +193,7 @@ let config: Config = { useParallelSync: process.env.USE_PARALLEL_SYNC !== 'false', // Enable by default cyclesPerBatch: Number(process.env.CYCLES_PER_BATCH) || 100, // Batch 100 cycles together ( matching MAX_BETWEEN_CYCLES_PER_REQUEST, can be lower if needed ) enablePrefetch: process.env.ENABLE_PREFETCH !== 'false', // Enable prefetch by default - syncRetryAttempts: Number(process.env.SYNC_RETRY_ATTEMPTS) || 3, // Retry failed requests 3 times + syncRetryAttempts: Number(process.env.SYNC_RETRY_ATTEMPTS) || 5, // Retry failed requests 5 times dexScreenerAPI: 'https://api.dexscreener.com/latest/dex/search?q=0x693ed886545970F0a3ADf8C59af5cCdb6dDF0a76', dexScreenerLink: 'https://dexscreener.com/polygon/0x041e48a5b11c29fdbd92498eb05573c52728398c', diff --git a/src/storage/account.ts b/src/storage/account.ts index 02d2cb9..2d1fbef 100644 --- 
a/src/storage/account.ts +++ b/src/storage/account.ts @@ -79,7 +79,8 @@ export async function bulkInsertAccounts(accounts: Account[]): Promise { ${keepNewerData('accountType')}, ${keepNewerData('isGlobal')}, createdTimestamp = MIN(accounts.createdTimestamp, excluded.createdTimestamp)` - await db.run(accountDatabase, sql, values) + // Serialize write through storage-level queue + transaction for atomicity + await db.executeDbWriteWithTransaction(accountDatabase, sql, values) console.log('Successfully bulk inserted Accounts', accounts.length) } catch (e) { console.log(e) diff --git a/src/storage/accountHistoryState.ts b/src/storage/accountHistoryState.ts index c517687..b0e1723 100644 --- a/src/storage/accountHistoryState.ts +++ b/src/storage/accountHistoryState.ts @@ -70,7 +70,8 @@ export async function bulkInsertAccountHistoryStates( ) const sql = `INSERT OR REPLACE INTO accountHistoryState ${fields} VALUES ${allPlaceholders}` - await db.run(accountHistoryStateDatabase, sql, values) + // Serialize write through storage-level queue + transaction for atomicity + await db.executeDbWriteWithTransaction(accountHistoryStateDatabase, sql, values) console.log('Successfully bulk inserted AccountHistoryStates', accountHistoryStates.length) } catch (e) { console.log(e) diff --git a/src/storage/cycle.ts b/src/storage/cycle.ts index d295f7e..1bb1f43 100644 --- a/src/storage/cycle.ts +++ b/src/storage/cycle.ts @@ -62,7 +62,8 @@ export async function bulkInsertCycles(cycles: Cycle[]): Promise { ) const sql = `INSERT OR REPLACE INTO cycles ${fields} VALUES ${allPlaceholders}` - await db.run(cycleDatabase, sql, values) + // Serialize write through storage-level queue + transaction for atomicity + await db.executeDbWriteWithTransaction(cycleDatabase, sql, values) console.log('Successfully bulk inserted Cycles', cycles.length) } catch (e) { console.log(e) @@ -248,7 +249,6 @@ export interface CycleGap { */ export async function queryMissingCycleRanges(targetCycle: number): Promise { 
try { - // Get first and last cycle for edge gap detection const firstCycleResult = (await db.get( cycleDatabase, diff --git a/src/storage/originalTxData.ts b/src/storage/originalTxData.ts index 8134e23..50f37b2 100644 --- a/src/storage/originalTxData.ts +++ b/src/storage/originalTxData.ts @@ -60,7 +60,8 @@ export async function bulkInsertOriginalTxsData(originalTxsData: OriginalTxData[ ) const sql = `INSERT OR REPLACE INTO originalTxsData ${fields} VALUES ${allPlaceholders}` - await db.run(originalTxDataDatabase, sql, values) + // Serialize write through storage-level queue + transaction for atomicity + await db.executeDbWriteWithTransaction(originalTxDataDatabase, sql, values) console.log(`Successfully bulk inserted OriginalTxsData`, originalTxsData.length) } catch (e) { console.log(e) diff --git a/src/storage/receipt.ts b/src/storage/receipt.ts index 796fa48..b05d103 100644 --- a/src/storage/receipt.ts +++ b/src/storage/receipt.ts @@ -72,7 +72,8 @@ export async function bulkInsertReceipts(receipts: Receipt[]): Promise { ) const sql = `INSERT OR REPLACE INTO receipts ${fields} VALUES ${allPlaceholders}` - await db.run(receiptDatabase, sql, values) + // Serialize write through storage-level queue + transaction for atomicity + await db.executeDbWriteWithTransaction(receiptDatabase, sql, values) console.log('Successfully bulk inserted receipts', receipts.length) } catch (e) { console.log(e) @@ -83,14 +84,28 @@ export async function bulkInsertReceipts(receipts: Receipt[]): Promise { export async function processReceiptData( receipts: Receipt[], saveOnlyNewData = false, + filterExistingAccounts = true, // When true, queries DB to filter out older account data before insert forwardToSubscribers = false ): Promise { if (receipts && receipts.length <= 0) return - const bucketSize = 1000 + const bucketSize = 2000 + const bucketSizeForReceipts = 1000 // Receipts size can be big, better to save less than the bucket size let combineReceipts: Receipt[] = [] let 
combineAccounts: Account[] = [] // For new accounts to bulk insert; Not for accounts that are already stored in database let combineTransactions: Transaction[] = [] let accountHistoryStateList: AccountHistoryStateDB.AccountHistoryState[] = [] + + // Optimization: If saveOnlyNewData is true, batch query existing receipt IDs BEFORE the loop + // to avoid N+1 query problem (individual SELECTs for each receipt) + let existingReceiptIds: Set = new Set() + if (saveOnlyNewData && receipts.length > 0) { + const receiptIds = receipts.map((r) => r.tx.txId) + const placeholders = receiptIds.map(() => '?').join(', ') + const sql = `SELECT receiptId FROM receipts WHERE receiptId IN (${placeholders})` + const existingReceipts = (await db.all(receiptDatabase, sql, receiptIds)) as { receiptId: string }[] + existingReceiptIds = new Set(existingReceipts.map((r) => r.receiptId)) + } + for (const receiptObj of receipts) { const { afterStates, @@ -118,8 +133,10 @@ export async function processReceiptData( applyTimestamp: applyTimestamp ?? 
calculatedApplyTimestamp, } if (saveOnlyNewData) { - const receiptExist = await queryReceiptByReceiptId(tx.txId) - if (!receiptExist) combineReceipts.push(modifiedReceiptObj as unknown as Receipt) + // Check against pre-fetched set instead of querying database for each receipt + if (!existingReceiptIds.has(tx.txId)) { + combineReceipts.push(modifiedReceiptObj as unknown as Receipt) + } } else combineReceipts.push(modifiedReceiptObj as unknown as Receipt) const txReceipt = appReceiptData receiptsMap.set(tx.txId, tx.timestamp) @@ -128,8 +145,7 @@ export async function processReceiptData( forwardData(receiptObj) } - // Receipts size can be big, better to save per 100 - if (combineReceipts.length >= 100) { + if (combineReceipts.length >= bucketSizeForReceipts) { await bulkInsertReceipts(combineReceipts) combineReceipts = [] } @@ -288,22 +304,30 @@ export async function processReceiptData( } } - // Batch query all collected account IDs once - const accountIdsToQuery = combineAccounts.map((acc) => acc.accountId) - const existingAccounts = await AccountDB.queryAccountTimestampsBatch(accountIdsToQuery) - for (const accObj of combineAccounts) { - const accountExist = existingAccounts.get(accObj.accountId) - if (accountExist) { - if (accountExist.timestamp > accObj.timestamp) { - // await AccountDB.updateAccount(accObj) - // Remove the account from the list - combineAccounts = combineAccounts.filter((acc) => acc.accountId !== accObj.accountId) - } - if (accountExist.createdTimestamp > accObj.createdTimestamp) { - await AccountDB.updateCreatedTimestamp(accObj.accountId, accObj.createdTimestamp) + // Optimization: The bulkInsertAccounts SQL already handles: + // 1. Keeping newer data via CASE WHEN excluded.timestamp > accounts.timestamp + // 2. 
Preserving oldest createdTimestamp via MIN(accounts.createdTimestamp, excluded.createdTimestamp) + // When filterExistingAccounts is false, we skip the batch query and individual updates - just bulk insert everything (the parameter defaults to true, i.e. the legacy path below) + + if (filterExistingAccounts) { + // Legacy path: Batch query all collected account IDs once and filter before insert + const accountIdsToQuery = combineAccounts.map((acc) => acc.accountId) + const existingAccounts = await AccountDB.queryAccountTimestampsBatch(accountIdsToQuery) + for (const accObj of combineAccounts) { + const accountExist = existingAccounts.get(accObj.accountId) + if (accountExist) { + if (accountExist.timestamp > accObj.timestamp) { + // await AccountDB.updateAccount(accObj) + // Remove the account from the list + combineAccounts = combineAccounts.filter((acc) => acc.accountId !== accObj.accountId) + } + if (accountExist.createdTimestamp > accObj.createdTimestamp) { + await AccountDB.updateCreatedTimestamp(accObj.accountId, accObj.createdTimestamp) + } } } } + // Insert the combined accounts in bucketSize if (combineAccounts.length > 0) { for (let i = 0; i < combineAccounts.length; i += bucketSize) { @@ -399,7 +423,10 @@ export async function queryReceiptCountByCycles( let receipts: { cycle: number; 'COUNT(*)': number }[] = [] try { const sql = `SELECT cycle, COUNT(*) FROM receipts GROUP BY cycle HAVING cycle BETWEEN ? AND ? 
ORDER BY cycle ASC` - receipts = (await db.all(receiptDatabase, sql, [start, end])) as { cycle: number; 'COUNT(*)': number }[] + receipts = (await db.all(receiptDatabase, sql, [start, end])) as { + cycle: number + 'COUNT(*)': number + }[] } catch (e) { console.log(e) } diff --git a/src/storage/sqlite3storage.ts b/src/storage/sqlite3storage.ts index 4cd94ee..68ba51a 100644 --- a/src/storage/sqlite3storage.ts +++ b/src/storage/sqlite3storage.ts @@ -1,6 +1,11 @@ import { Utils as StringUtils } from '@shardus/types' import { Database } from 'sqlite3' +// Simple write queue using Promise chain - serializes all database writes +// This prevents write contention while allowing parallel reads (SELECTs) +// Only INSERT/UPDATE/DELETE operations should use this queue +let writeQueueTail: Promise = Promise.resolve() + interface QueryTiming { id: number sql: string @@ -29,7 +34,7 @@ export const createDB = async (dbPath: string, dbName: string): Promise => { + console.log('dbName (Read)', dbName, 'dbPath', dbPath) + const db = new Database(dbPath, (err) => { + if (err) { + console.log('Error opening read database:', err) + throw err + } + }) + await run(db, 'PRAGMA journal_mode=WAL') // WAL mode allows concurrent reads with writes + await run(db, 'PRAGMA synchronous = OFF') // Read-only connection doesn't need sync + await run(db, 'PRAGMA temp_store = MEMORY') + await run(db, 'PRAGMA cache_size = -128000') // 128MB cache (smaller than write connection) + await run(db, 'PRAGMA mmap_size = 536870912') // 512MB memory-mapped I/O for faster reads + await run(db, 'PRAGMA busy_timeout = 5000') // Shorter timeout - reads shouldn't block in WAL mode + await run(db, 'PRAGMA threads = 4') // Use up to 4 threads for parallel operations + await run(db, 'PRAGMA query_only = ON') // Enforce read-only mode at SQLite level + db.on('profile', (sql, time) => { + const engineMs = typeof time === 'number' ? 
time : Number(time) + const queue = queuedBySql.get(sql) + const id = queue && queue.length > 0 ? queue[0] : undefined + if (id === undefined) { + printQueryTimingLog('profile event without pending query (read)', { + engineMs, + sql: formatSqlForLog(sql), + }) + return + } + const entry = pendingQueries.get(id) + if (!entry) { + printQueryTimingLog('profile missing pending entry (read)', { + engineMs, + sql: formatSqlForLog(sql), + }) + return + } + entry.engineMs = engineMs + if (engineMs > SQL_ENGINE_WARN_THRESHOLD_MS) { + console.warn(`[DB Engine Read] Slow Query: ${engineMs} ms for SQL: ${formatSqlForLog(sql)}`) + } + }) + console.log(`Read Database ${dbName} Initialized!`) + return db +} + +/** + * Manually checkpoint the WAL file to prevent it from growing too large + * Uses PASSIVE mode which won't block readers + * Call this periodically during long-running sync operations + */ +export async function checkpointWAL( + db: Database, + mode: 'PASSIVE' | 'FULL' | 'RESTART' = 'PASSIVE' +): Promise { + try { + await run(db, `PRAGMA wal_checkpoint(${mode})`) + console.log(`[WAL Checkpoint] Executed ${mode} checkpoint`) + } catch (error) { + console.error('[WAL Checkpoint] Failed to checkpoint WAL:', error) + } +} + /** * Close Database Connections Gracefully */ @@ -161,6 +233,81 @@ export async function all(db: Database, sql: string, params = []): Promise(writeOperation: () => Promise): Promise { + const enqueuedAt = Date.now() + + // Wait for previous write to finish, ignoring errors to prevent propagation + const myTurn = writeQueueTail.catch(() => undefined) + + // Create and chain the new write operation + const currentWrite = myTurn.then(async () => { + const startedAt = Date.now() + const promiseQueueMs = startedAt - enqueuedAt + + // Log if we waited a long time in the Promise queue + if (promiseQueueMs > 100) { + console.log(`[Promise Queue] Waited ${promiseQueueMs}ms in Promise queue before starting DB operation`) + } + + const value = await 
writeOperation() + const completedAt = Date.now() + const executionMs = completedAt - startedAt + + // Log slow DB operations (includes transaction + SQLite busy_timeout) + if (executionMs > 500) { + console.log( + `[DB Operation] Total: ${executionMs}ms (Promise queue: ${promiseQueueMs}ms, DB execution+waiting: ${executionMs}ms)` + ) + } + + return value + }) + + // Update queue tail to current write (for next operation to wait on) + writeQueueTail = currentWrite.catch(() => undefined) + + // Return the actual operation result + return currentWrite +} + +/** + * Execute work within a database transaction + * Uses BEGIN (deferred) since our write queue already serializes writes + * This reduces lock contention compared to BEGIN IMMEDIATE + * @param db Database instance + * @param work Async function containing the work to execute within the transaction + * @returns Result of the work function + */ +export async function executeInTransaction(db: Database, work: () => Promise): Promise { + await run(db, 'BEGIN') // Deferred transaction - acquires RESERVED lock on first write, not at BEGIN + try { + const result = await work() + await run(db, 'COMMIT') + return result + } catch (error) { + await run(db, 'ROLLBACK') + throw error + } +} + +export async function executeDbWriteWithTransaction( + db: Database, + sql: string, + params: unknown[] | object = [] +): Promise { + // Serialize write through storage-level queue + transaction for atomicity + await executeDbWrite(() => + executeInTransaction(db, async () => { + await run(db, sql, params) + }) + ) +} + export function extractValues(object: object): string[] { try { const inputs: string[] = [] diff --git a/src/storage/transaction.ts b/src/storage/transaction.ts index f46238b..5779cfa 100644 --- a/src/storage/transaction.ts +++ b/src/storage/transaction.ts @@ -61,7 +61,8 @@ export async function bulkInsertTransactions(transactions: Transaction[]): Promi ) const sql = `INSERT OR REPLACE INTO transactions ${fields} 
VALUES ${allPlaceholders}` - await db.run(transactionDatabase, sql, values) + // Serialize write through storage-level queue + transaction for atomicity + await db.executeDbWriteWithTransaction(transactionDatabase, sql, values) console.log('Successfully bulk inserted transactions', transactions.length) } catch (e) { console.log(e) @@ -378,7 +379,9 @@ export async function queryActiveAccountsCountByTxFee( WHERE timestamp < ? AND timestamp > ? ${excludeZeroFeeTxs ? ' AND txFee > 0' : ''} ` const values = [beforeTimestamp, afterTimestamp] - activeAccounts = (await db.get(transactionDatabase, sql, values)) as { 'COUNT(DISTINCT txFrom)': number } + activeAccounts = (await db.get(transactionDatabase, sql, values)) as { + 'COUNT(DISTINCT txFrom)': number + } } catch (e) { console.log('Error querying active accounts by txFee:', e) } From 0f4c91acbd963f054ac6108191d97857a5cd4a9b Mon Sep 17 00:00:00 2001 From: jairajdev Date: Sat, 15 Nov 2025 00:31:48 +0800 Subject: [PATCH 12/14] feat: optimize receipt deserialization by processing in chunks to prevent event loop blocking --- src/class/ParallelDataSync.ts | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/src/class/ParallelDataSync.ts b/src/class/ParallelDataSync.ts index ae812ff..5332a12 100644 --- a/src/class/ParallelDataSync.ts +++ b/src/class/ParallelDataSync.ts @@ -648,16 +648,6 @@ export class ParallelDataSync { nextFetchPromise = null } - const startTime = Date.now() - // Deserialize receipts - receipts.forEach((receipt) => { - ReceiptDB.deserializeDbReceipt(receipt) - }) - const elapsed = Date.now() - startTime - if (elapsed > 100) { - console.log(`Deserializing ${receipts.length} receipts took: ${elapsed}ms`) - } - // Add receipts to buffer - will flush to DB when buffer reaches threshold await this.addToBuffer('receipt', receipts) @@ -1018,8 +1008,28 @@ export class ParallelDataSync { this.receiptBufferLock = true try { - const toFlush = [...this.receiptBuffer] + 
const toFlush = [...this.receiptBuffer] as any this.receiptBuffer = [] + + const startTime = Date.now() + // Deserialize receipts in chunks to prevent event loop blocking + const CHUNK_SIZE = 20 + for (let i = 0; i < toFlush.length; i += CHUNK_SIZE) { + const end = Math.min(i + CHUNK_SIZE, toFlush.length) + // Deserialize chunk of receipts + for (let j = i; j < end; j++) { + // eslint-disable-next-line security/detect-object-injection + ReceiptDB.deserializeDbReceipt(toFlush[j]) + } + // Yield to event loop after each chunk (except the last one) + if (end < toFlush.length) { + await new Promise((resolve) => setImmediate(resolve)) + } + } + const elapsed = Date.now() - startTime + if (elapsed > 100) { + console.log(`Deserializing ${toFlush.length} receipts took: ${elapsed}ms`) + } console.log(`[Buffer Flush] Flushing ${toFlush.length} receipts to database`) if (processData) await ReceiptDB.processReceiptData(toFlush, false, false) From 0e88eea9287bd5e717c92177de969c83e7d69e64 Mon Sep 17 00:00:00 2001 From: jairajdev Date: Mon, 17 Nov 2025 21:16:17 +0800 Subject: [PATCH 13/14] refactor: Adjust Parallel Sync configurations for optimal performance - Set as a config for manual WAL checkpointing every 10 buffer flushes to prevent WAL growth - Reduce concurrency from 10 to 5 workers to balance throughput with DB pressure - Add chunked receipt deserialization (20 per chunk) to prevent event loop blocking - Put as a config for write queue infrastructure for serialized database operations - Add processData flag for debugging and performance testing --- src/class/ParallelDataSync.ts | 176 ++++++++-------------------------- src/config/index.ts | 2 +- src/storage/sqlite3storage.ts | 32 +++++-- 3 files changed, 67 insertions(+), 143 deletions(-) diff --git a/src/class/ParallelDataSync.ts b/src/class/ParallelDataSync.ts index 5332a12..1af311f 100644 --- a/src/class/ParallelDataSync.ts +++ b/src/class/ParallelDataSync.ts @@ -7,19 +7,21 @@ import { CycleDB, ReceiptDB, 
OriginalTxDataDB, - // receiptDatabase, - // originalTxDataDatabase, - // cycleDatabase, + receiptDatabase, + originalTxDataDatabase, + cycleDatabase, } from '../storage' import { Cycle, Receipt, OriginalTxData } from '../types' import axios, { AxiosInstance } from 'axios' import http from 'http' import https from 'https' -// import { checkpointWAL } from '../storage/sqlite3storage' +import { useManualCheckPoint, checkpointWAL } from '../storage/sqlite3storage' // For Debugging Purpose - Set to false to skip processing data and saving to DB const processData = true +const DESERIALIZE_RECEIPTS_CHUNK_SIZE = 20 // Number of receipts to deserialize at a time + /** * Configuration for parallel sync */ @@ -103,25 +105,13 @@ export class ParallelDataSync { private originalTxBufferLock = false private cycleBufferLock = false - // // WAL checkpoint tracking - // private flushCount = 0 // Total number of buffer flushes - // private readonly CHECKPOINT_FREQUENCY = 10 // Run WAL checkpoint every N flushes to prevent WAL from growing too large - - // // Flush pending flag to prevent multiple workers from waiting to flush - // private receiptFlushPending = false - - // // Adaptive flush delay system - adds delays before DB writes to prevent overload - // private flushTimestamps: number[] = [] // Timestamps of recent flushes - // private readonly FLUSH_WINDOW_MS = 10000 // Track flushes in last 10 seconds - // private readonly FAST_FLUSH_THRESHOLD = 5 // If 5+ flushes in window, system is overloaded - // private minFlushDelay = 200 // Min delay before flush (ms) - // private maxFlushDelay = 1000 // Max delay before flush (ms) - // private readonly OVERLOAD_MIN_DELAY = 3000 // When overloaded, min delay increases to 3s - // private readonly OVERLOAD_MAX_DELAY = 5000 // When overloaded, max delay increases to 5s + // WAL checkpoint tracking + private flushCount = 0 // Total number of buffer flushes + private readonly CHECKPOINT_FREQUENCY = 10 // Run WAL checkpoint every N flushes 
to prevent WAL from growing too large constructor(syncConfig?: Partial) { this.syncConfig = { - concurrency: syncConfig?.concurrency || config.parallelSyncConcurrency || 10, + concurrency: syncConfig?.concurrency || config.parallelSyncConcurrency || 5, cyclesPerBatch: syncConfig?.cyclesPerBatch || config.cyclesPerBatch || 100, retryAttempts: syncConfig?.retryAttempts || config.syncRetryAttempts || 5, retryDelayMs: syncConfig?.retryDelayMs || 1000, @@ -349,7 +339,7 @@ export class ParallelDataSync { // Three-phase approach for optimal performance: // Phase 1: Use main queue (concurrency: 5) for parallel API fetching // Phase 2: Buffer data in memory until ACCUMULATION_THRESHOLD (1000) is reached - // Phase 3: DB writes are batched and serialized via storage-level queue + // Phase 3: DB writes are batched and serialized via write queue // This combines parallel I/O with batched, serialized DB writes to minimize contention const tasks = cycleBatches.map((batch) => this.queue.add(() => this.syncDataByCycleRange(batch.startCycle, batch.endCycle)) @@ -981,45 +971,19 @@ export class ParallelDataSync { if (type === 'receipt') { if (this.receiptBuffer.length === 0) return - // // If another worker is already flushing, return immediately (it will flush our data too) - // if (this.receiptFlushPending) { - // return - // } - - // // Mark flush as pending - // this.receiptFlushPending = true - - // // Apply adaptive delay BEFORE acquiring lock to spread out DB writes (receipts only) - // const delay = this.getAdaptiveFlushDelay() - // if (delay > 0) { - // const recentFlushCount = this.flushTimestamps.length - // const delayRange = `${this.minFlushDelay}-${this.maxFlushDelay}ms` - // console.log( - // `[Adaptive Cooling] Receipts - Waiting ${delay}ms before flush ` + - // `(recent flushes: ${recentFlushCount}, range: ${delayRange})` - // ) - // await new Promise((resolve) => setTimeout(resolve, delay)) - // } - - // // If another worker is already locking, return immediately 
(it will flush our data too) - // if (this.receiptBufferLock) { - // return - // } - this.receiptBufferLock = true try { - const toFlush = [...this.receiptBuffer] as any + const toFlush = [...this.receiptBuffer] this.receiptBuffer = [] const startTime = Date.now() // Deserialize receipts in chunks to prevent event loop blocking - const CHUNK_SIZE = 20 - for (let i = 0; i < toFlush.length; i += CHUNK_SIZE) { - const end = Math.min(i + CHUNK_SIZE, toFlush.length) + for (let i = 0; i < toFlush.length; i += DESERIALIZE_RECEIPTS_CHUNK_SIZE) { + const end = Math.min(i + DESERIALIZE_RECEIPTS_CHUNK_SIZE, toFlush.length) // Deserialize chunk of receipts for (let j = i; j < end; j++) { // eslint-disable-next-line security/detect-object-injection - ReceiptDB.deserializeDbReceipt(toFlush[j]) + ReceiptDB.deserializeDbReceipt(toFlush[j] as any) } // Yield to event loop after each chunk (except the last one) if (end < toFlush.length) { @@ -1033,13 +997,13 @@ export class ParallelDataSync { console.log(`[Buffer Flush] Flushing ${toFlush.length} receipts to database`) if (processData) await ReceiptDB.processReceiptData(toFlush, false, false) - // // Track flush timestamp for adaptive delay system (receipts only) - // this.recordFlushTimestamp() + if (useManualCheckPoint) { + // Increment flush count and potentially checkpoint WAL + this.flushCount++ + await this.maybeCheckpointWAL() + } } finally { this.receiptBufferLock = false - - // // Clear flush pending flag - // this.receiptFlushPending = false } } else if (type === 'originalTx') { if (this.originalTxBuffer.length === 0) return @@ -1087,85 +1051,27 @@ export class ParallelDataSync { await this.flushBuffer('cycle') } - // /** - // * Conditionally checkpoint WAL files if enough flushes have occurred - // * This prevents WAL files from growing too large during long sync operations - // */ - // private async maybeCheckpointWAL(): Promise { - // if (this.flushCount % this.CHECKPOINT_FREQUENCY === 0) { - // console.log( - // `[WAL 
Checkpoint] Running periodic checkpoint after ${this.flushCount} buffer flushes (~${ - // this.flushCount * this.ACCUMULATION_THRESHOLD - // } records)` - // ) - // // Run checkpoints on all three databases in parallel - // // Use PASSIVE mode to avoid blocking readers - // await Promise.all([ - // checkpointWAL(receiptDatabase, 'PASSIVE'), - // checkpointWAL(originalTxDataDatabase, 'PASSIVE'), - // checkpointWAL(cycleDatabase, 'PASSIVE'), - // ]) - // } - // } - - // /** - // * Record flush timestamp and clean up old timestamps - // * Used to track flush frequency and detect system overload - // */ - // private recordFlushTimestamp(): void { - // const now = Date.now() - // this.flushTimestamps.push(now) - - // // Clean up old timestamps outside the tracking window - // this.flushTimestamps = this.flushTimestamps.filter((timestamp) => now - timestamp < this.FLUSH_WINDOW_MS) - // } - - // /** - // * Calculate adaptive flush delay based on recent flush frequency - // * Returns a random delay within a range that adapts to system load - // */ - // private getAdaptiveFlushDelay(): number { - // // Clean up old timestamps first - // const now = Date.now() - // this.flushTimestamps = this.flushTimestamps.filter((timestamp) => now - timestamp < this.FLUSH_WINDOW_MS) - - // // Check if system is overloaded (too many flushes in recent window) - // const recentFlushCount = this.flushTimestamps.length - // const isOverloaded = recentFlushCount >= this.FAST_FLUSH_THRESHOLD - // const wasOverloaded = this.minFlushDelay === this.OVERLOAD_MIN_DELAY - - // // Adjust delay range based on system load - // if (isOverloaded) { - // // System overloaded - use longer delays - // const wasNormal = this.minFlushDelay === 200 - // this.minFlushDelay = this.OVERLOAD_MIN_DELAY - // this.maxFlushDelay = this.OVERLOAD_MAX_DELAY - // if (wasNormal) { - // // Log when transitioning from normal to overloaded - // console.log( - // `[Adaptive Cooling] ⚠️ OVERLOAD DETECTED! 
${recentFlushCount} flushes in last ${ - // this.FLUSH_WINDOW_MS / 1000 - // }s. ` + `Increasing cooling delay: ${this.minFlushDelay}-${this.maxFlushDelay}ms` - // ) - // } - // } else if (recentFlushCount < this.FAST_FLUSH_THRESHOLD / 2) { - // // System healthy - reduce delays back to normal - // if (wasOverloaded) { - // // Log when recovering from overload - // console.log( - // `[Adaptive Cooling] ✓ System recovered! ${recentFlushCount} flushes in last ${ - // this.FLUSH_WINDOW_MS / 1000 - // }s. ` + `Reducing cooling delay: 200-1000ms` - // ) - // } - // this.minFlushDelay = 200 - // this.maxFlushDelay = 1000 - // } - - // // Return random delay within current range to stagger DB writes - // const delay = this.minFlushDelay + Math.floor(Math.random() * (this.maxFlushDelay - this.minFlushDelay)) - // return delay - // } + /** + * Conditionally checkpoint WAL files if enough flushes have occurred + * This prevents WAL files from growing too large during long sync operations + */ + private async maybeCheckpointWAL(): Promise { + if (!useManualCheckPoint) return + if (this.flushCount % this.CHECKPOINT_FREQUENCY === 0) { + console.log( + `[WAL Checkpoint] Running periodic checkpoint after ${this.flushCount} buffer flushes (~${ + this.flushCount * this.ACCUMULATION_THRESHOLD + } records)` + ) + // Run checkpoints on all three databases in parallel + // Use PASSIVE mode to avoid blocking readers + await Promise.all([ + checkpointWAL(receiptDatabase, 'PASSIVE'), + checkpointWAL(originalTxDataDatabase, 'PASSIVE'), + checkpointWAL(cycleDatabase, 'PASSIVE'), + ]) + } + } /** * Get current statistics diff --git a/src/config/index.ts b/src/config/index.ts index fae05f3..76cb9ab 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -189,7 +189,7 @@ let config: Config = { MAX_ACCOUNT_HISTORY_STATES_PER_REQUEST: 100, MAX_STATS_PER_REQUEST: 1000000, }, - parallelSyncConcurrency: Number(process.env.PARALLEL_SYNC_CONCURRENCY) || 10, // 10 parallel workers + 
parallelSyncConcurrency: Number(process.env.PARALLEL_SYNC_CONCURRENCY) || 5, // 5 parallel sync fetches useParallelSync: process.env.USE_PARALLEL_SYNC !== 'false', // Enable by default cyclesPerBatch: Number(process.env.CYCLES_PER_BATCH) || 100, // Batch 100 cycles together ( matching MAX_BETWEEN_CYCLES_PER_REQUEST, can be lower if needed ) enablePrefetch: process.env.ENABLE_PREFETCH !== 'false', // Enable prefetch by default diff --git a/src/storage/sqlite3storage.ts b/src/storage/sqlite3storage.ts index 68ba51a..2b673de 100644 --- a/src/storage/sqlite3storage.ts +++ b/src/storage/sqlite3storage.ts @@ -1,11 +1,16 @@ import { Utils as StringUtils } from '@shardus/types' import { Database } from 'sqlite3' +const enableWritingQueue = false + // Simple write queue using Promise chain - serializes all database writes // This prevents write contention while allowing parallel reads (SELECTs) // Only INSERT/UPDATE/DELETE operations should use this queue let writeQueueTail: Promise<void> = Promise.resolve() +// Control whether to use manual WAL checkpoints +export const useManualCheckPoint = false + interface QueryTiming { id: number sql: string @@ -34,7 +39,11 @@ export const createDB = async (dbPath: string, dbName: string): Promise { - // Serialize write through storage-level queue + transaction for atomicity - await executeDbWrite(() => - executeInTransaction(db, async () => { - await run(db, sql, params) - }) - ) + // Use write queue if enabled + if (enableWritingQueue) { + // Serialize write through promise queue + await executeDbWrite(() => + executeInTransaction(db, async () => { + await run(db, sql, params) + }) + ) + return + } + + // Use transaction directly + await executeInTransaction(db, async () => { + await run(db, sql, params) + }) } export function extractValues(object: object): string[] { From cfa8da847afd7af398f9517b3e8888c4a4d9d0ec Mon Sep 17 00:00:00 2001 From: jairajdev Date: Mon, 17 Nov 2025 22:03:53 +0800 Subject: [PATCH 14/14] feat: enhance logging for
cycle and receipt comparisons in DataSyncManager --- src/class/DataSyncManager.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/class/DataSyncManager.ts b/src/class/DataSyncManager.ts index 565c534..e8077f3 100644 --- a/src/class/DataSyncManager.ts +++ b/src/class/DataSyncManager.ts @@ -171,6 +171,8 @@ export class DataSyncManager { try { // Compare cycles data + console.log('\nComparing cycles data...') + console.log('CycleNumber', 'Local-Marker', ' Distributor-Marker') const localCycles = await CycleDB.queryCycleRecordsBetween(startCycle, endCycle) const distributorResponse = await queryFromDistributor(DataType.CYCLE, { start: startCycle, @@ -188,7 +190,7 @@ export class DataSyncManager { const distributorCycle = distributorCycles.find( (c: { counter: number; marker: string }) => c.counter === localCycle.counter ) - + console.log(localCycle.counter, localCycle.cycleMarker, distributorCycle?.marker) if (!distributorCycle) { throw new Error(`Cycle ${localCycle.counter} exists locally but not in distributor`) } else if (localCycle.cycleMarker !== distributorCycle.marker) { @@ -201,6 +203,8 @@ export class DataSyncManager { } // Compare receipts count + console.log('\nComparing receipts count...') + console.log('CycleNumber', 'Local-Receipts', 'Distributor-Receipts') const receiptsResponse = await queryFromDistributor(DataType.RECEIPT, { startCycle, endCycle, @@ -210,9 +214,9 @@ export class DataSyncManager { if (receiptsResponse?.data?.receipts) { const distributorReceipts: { cycle: number; receipts: number }[] = receiptsResponse.data.receipts const localReceiptsCount = await ReceiptDB.queryReceiptCountByCycles(startCycle, endCycle) - for (const distReceipt of distributorReceipts) { const localReceipt = localReceiptsCount.find((r) => r.cycle === distReceipt.cycle) + console.log(distReceipt.cycle, localReceipt?.receipts, distReceipt.receipts) if (localReceipt && localReceipt.receipts > distReceipt.receipts) { throw new Error( 
`Receipts count in local DB has more in cycle ${distReceipt.cycle}: ` + @@ -223,6 +227,8 @@ export class DataSyncManager { } // Compare originalTxs count + console.log('\nComparing originalTxs count...') + console.log('CycleNumber', 'Local-OriginalTxs', 'Distributor-OriginalTxs') const originalTxsResponse = await queryFromDistributor(DataType.ORIGINALTX, { startCycle, endCycle, @@ -239,6 +245,7 @@ export class DataSyncManager { for (const distTx of distributorOriginalTxs) { const localTx = localOriginalTxsCount.find((t) => t.cycle === distTx.cycle) + console.log(distTx.cycle, localTx?.originalTxsData, distTx.originalTxsData) if (localTx && localTx.originalTxsData > distTx.originalTxsData) { throw new Error( `OriginalTxs count mismatch in cycle ${distTx.cycle}: ` + @@ -466,8 +473,6 @@ export class DataSyncManager { console.log( `Comparing cycles ${startCycle} to ${endCycle} with ${allDistributorReceipts.length} distributor receipts and ${allDistributorOriginalTxs.length} distributor originalTxs` ) - console.log(allDistributorReceipts, localReceipts) - console.log(allDistributorOriginalTxs, localOriginalTxs) for (let cycle = startCycle; cycle <= endCycle; cycle++) { const distReceipts = allDistributorReceipts.find((r) => r.cycle === cycle)?.receipts || 0 @@ -631,8 +636,9 @@ export class DataSyncManager { const cycleBatches = [] // For each range, create cycle batches and merge them into one + console.log('\nPreparing cycle batches for the following ranges:') for (const range of mergedRanges) { - console.log(`\nFor range: ${range.startCycle} to ${range.endCycle} (${range.gapSize} cycles)`) + console.log(` - range: ${range.startCycle} to ${range.endCycle} (${range.gapSize} cycles)`) const cycleBatch = parallelDataSync.createCycleBatches(range.startCycle, range.endCycle) cycleBatches.push(...cycleBatch) }