From d878af6d1a5cb633175515259e27a12c76c4609b Mon Sep 17 00:00:00 2001
From: Valentin Haudiquet
Date: Sat, 27 Jun 2026 12:33:36 +0200
Subject: [PATCH] feat/match_collector: batch match import

Reduce memory pressure while importing and aggregating matches:

- dev MongoDB: pin the image, halve the WiredTiger cache and cap the
  container memory.
- process-matches.js: only create the unique matchId index before the
  import, build the secondary indexes afterwards, and report how many
  batches were inserted.
- champion_stat.ts: iterate matches through a batched cursor and compact
  the per-platform matches collections once the stats are saved.

---
 dev/docker-compose.yml               | 21 ++++---
 dev/scripts/process-matches.js       | 38 +++++++++---
 match_collector/src/champion_stat.ts | 80 +++++++++++++++++++++++++---
 3 files changed, 116 insertions(+), 23 deletions(-)

diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml
index a75b02f..8742604 100644
--- a/dev/docker-compose.yml
+++ b/dev/docker-compose.yml
@@ -1,21 +1,26 @@
 services:
-  # Development MongoDB with performance optimizations
+  # Development MongoDB with memory optimizations
   mongodb:
-    image: mongo:latest
+    image: mongo:8.3.4
     container_name: buildpath-mongodb
     ports:
       - "27017:27017"
     environment:
       MONGO_INITDB_ROOT_USERNAME: ${MONGO_USER:-root}
       MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASS:-password}
+      # Disable glibc's rseq registration so mongod's TCMalloc can use per-CPU caches
+      GLIBC_TUNABLES: glibc.pthread.rseq=0
     volumes:
       - ./data/db:/data/db
-    command: mongod --wiredTigerCacheSizeGB 4 --quiet
-    healthcheck:
-      test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
-      interval: 5s
-      timeout: 2s
-      retries: 30
+    # Reduced cache size to leave more RAM for the import script
+    # WiredTiger cache is now 2GB (was 4GB) to prevent OOM during large imports
+    command: mongod --wiredTigerCacheSizeGB 2 --quiet
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 2G
 
   mongo-express:
     image: mongo-express
diff --git a/dev/scripts/process-matches.js b/dev/scripts/process-matches.js
index 7d3b6c0..e8134b0 100644
--- a/dev/scripts/process-matches.js
+++ b/dev/scripts/process-matches.js
@@ -31,19 +31,29 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
   const collection = db.collection(collectionName);
 
   try {
-    // Create indexes first for better performance
-    await collection.createIndex({ "metadata.matchId": 1 }, { unique: true });
-    await collection.createIndex({ "info.gameDuration": 1 });
-    await collection.createIndex({ "info.participants.championId": 1 });
-    await collection.createIndex({ "info.participants.win": 1 });
-
-    // Check file size
+    // Check file size first
     const fileStats = fs.statSync(filePath);
     const fileSize = (fileStats.size / (1024 * 1024 * 1024)).toFixed(2);
     console.log(` 📊 File size: ${fileSize} GB`);
 
+    // Defer index creation to after import to reduce memory pressure
+    // Only create the unique matchId index before import to prevent duplicates
+    console.log(` 📇 Creating unique matchId index...`);
+    await collection.createIndex({ "metadata.matchId": 1 }, { unique: true });
+
     await processLineDelimitedFormat(filePath, collection, batchSize, startTime);
 
+    // Create additional indexes after import to reduce memory pressure
+    console.log(`\n 📇 Creating additional indexes (this may take a while)...`);
+    try {
+      await collection.createIndex({ "info.gameDuration": 1 });
+      await collection.createIndex({ "info.participants.championId": 1 });
+      await collection.createIndex({ "info.participants.win": 1 });
+      console.log(` ✅ Indexes created successfully`);
+    } catch (indexError) {
+      console.log(` ⚠️ Warning: Could not create additional indexes: ${indexError.message}`);
+    }
+
     const totalTime = ((Date.now() - startTime) / 1000).toFixed(1);
     console.log(`🎉 Import complete in ${totalTime} seconds`);
     console.log(`✅ Processed: ${processed.toLocaleString()} matches`);
@@ -66,6 +76,7 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
 
     let batch = [];
     let lineCount = 0;
+    let batchCount = 0;
 
     for await (const line of rl) {
       lineCount++;
@@ -88,9 +99,16 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
         batch.push(match);
 
         if (batch.length >= batchSize) {
-          process.stdout.write(`\r Inserting batch into MongoDB... `);
+          batchCount++;
+          process.stdout.write(`\r Inserting batch #${batchCount} (${batch.length} matches)... `);
           await insertBatch(batch, collection);
           batch = [];
+
+          // Yield to the event loop every 10 batches so queued callbacks can run.
+          // This does not force a garbage collection; it only lets pending work drain.
+          if (batchCount % 10 === 0) {
+            await new Promise(resolve => setImmediate(resolve));
+          }
         }
       } catch (error) {
         skipped++;
@@ -99,8 +117,12 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
 
     // Insert remaining matches
     if (batch.length > 0) {
+      batchCount++;
+      process.stdout.write(`\r Inserting final batch (${batch.length} matches)... `);
       await insertBatch(batch, collection);
     }
+
+    console.log(`\n 📊 Total batches inserted: ${batchCount}`);
   }
 
   async function insertBatch(batch, collection) {
diff --git a/match_collector/src/champion_stat.ts b/match_collector/src/champion_stat.ts
index 21a3816..a0a1966 100644
--- a/match_collector/src/champion_stat.ts
+++ b/match_collector/src/champion_stat.ts
@@ -391,16 +391,35 @@ async function handleMatchList(
   const database = client.db('matches')
   const collectionName = platform ? `${patch}_${platform}` : patch
   const matches = database.collection(collectionName)
-  const allMatches = matches.find()
   const totalMatches: number = await matches.countDocuments()
 
+  // Number of documents requested from the server per cursor round trip
+  const BATCH_SIZE = 1000
   let currentMatch = 0
-  for await (const match of allMatches) {
-    process.stdout.write(
-      '\rComputing champion stats, game entry ' + currentMatch + '/' + totalMatches + ' ... '
-    )
-    currentMatch += 1
-    handleMatch(match as unknown as Match, champions, platform)
+  let processedInBatch = 0
+
+  // Use cursor with batch size to limit memory consumption
+  const cursor = matches.find().batchSize(BATCH_SIZE)
+
+  try {
+    for await (const match of cursor) {
+      currentMatch += 1
+      processedInBatch += 1
+      process.stdout.write(
+        '\rComputing champion stats, game entry ' + currentMatch + '/' + totalMatches + ' ... '
+      )
+      handleMatch(match as unknown as Match, champions, platform)
+
+      // Yield to the event loop once per batch so queued callbacks can run
+      if (processedInBatch >= BATCH_SIZE) {
+        processedInBatch = 0
+        // setImmediate resumes on the next loop iteration; it does not force a GC
+        await new Promise(resolve => setImmediate(resolve))
+      }
+    }
+  } finally {
+    // Ensure cursor is closed
+    await cursor.close()
   }
 
   return totalMatches
@@ -736,6 +755,50 @@ async function championList() {
   return list.slice(1)
 }
 
+/**
+ * Compact the per-platform matches collections to reclaim storage.
+ * Runs the MongoDB `compact` command, which rewrites and defragments the
+ * collection data and releases unneeded disk space to the OS.
+ */
+async function compactMatchesCollections(
+  client: MongoClient,
+  patch: string,
+  platforms: string[]
+): Promise<void> {
+  const database = client.db('matches')
+
+  console.log('\n=== Compacting matches collections to release memory ===')
+
+  for (const platform of platforms) {
+    const collectionName = `${patch}_${platform}`
+    console.log(`Compacting collection: ${collectionName}...`)
+
+    try {
+      // `compact` must be issued against the database that owns the collection.
+      // It reclaims disk space only; it does not shrink the WiredTiger cache.
+      const result = await database.command({
+        compact: collectionName,
+        force: true
+      })
+      console.log(`Compaction result for ${collectionName}:`, result)
+    } catch (error) {
+      // Compaction is best-effort: a missing collection or insufficient
+      // privileges must not fail the stats run, so log and continue.
+      const errorMsg = error instanceof Error ? error.message : String(error)
+      // Server errors carry the error name in `codeName`; the message text is
+      // not guaranteed to contain it, so check both.
+      const codeName = (error as { codeName?: unknown } | null)?.codeName
+      if (codeName === 'NamespaceNotFound' || errorMsg.includes('NamespaceNotFound')) {
+        console.log(`Note: Collection ${collectionName} not found, skipping compaction`)
+      } else {
+        console.log(`Note: Could not compact ${collectionName}:`, errorMsg)
+      }
+    }
+  }
+
+  console.log('Compaction complete.')
+}
+
 async function makeChampionsStats(client: MongoClient, patch: string, platforms: string[] = []) {
   const globalItems = await itemList()
   for (const item of globalItems) {
@@ -781,6 +844,9 @@ async function makeChampionsStats(client: MongoClient, patch: string, platforms:
   // Create alias-index for better key-find
   await collection.createIndex({ alias: 1 })
   console.log(`Stats saved to collection: ${patch}`)
+
+  // Compact matches collections to reclaim disk space once stats are saved
+  await compactMatchesCollections(client, patch, platforms)
 }
 
 export default { makeChampionsStats }