feat/match_collector: batch match import
pipeline / lint-and-format (push) Successful in 4m22s
pipeline / build-and-push-images (push) Successful in 25s

This commit is contained in:
2026-06-27 12:33:36 +02:00
parent e6ddc27d5c
commit d878af6d1a
3 changed files with 118 additions and 23 deletions
+12 -8
View File
@@ -1,21 +1,25 @@
services:
# Development MongoDB with performance optimizations
# Development MongoDB with memory optimizations
mongodb:
image: mongo:latest
image: mongo:8.3.4
container_name: buildpath-mongodb
ports:
- "27017:27017"
environment:
MONGO_INITDB_ROOT_USERNAME: ${MONGO_USER:-root}
MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASS:-password}
GLIBC_TUNABLES: glibc.pthread.rseq=1
volumes:
- ./data/db:/data/db
command: mongod --wiredTigerCacheSizeGB 4 --quiet
healthcheck:
test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
interval: 5s
timeout: 2s
retries: 30
# Reduced cache size to leave more RAM for the import script
# WiredTiger cache is now 2GB (was 4GB) to prevent OOM during large imports
command: mongod --wiredTigerCacheSizeGB 2 --quiet
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 2G
mongo-express:
image: mongo-express
+29 -8
View File
@@ -31,19 +31,29 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
const collection = db.collection(collectionName);
try {
// Create indexes first for better performance
await collection.createIndex({ "metadata.matchId": 1 }, { unique: true });
await collection.createIndex({ "info.gameDuration": 1 });
await collection.createIndex({ "info.participants.championId": 1 });
await collection.createIndex({ "info.participants.win": 1 });
// Check file size
// Check file size first
const fileStats = fs.statSync(filePath);
const fileSize = (fileStats.size / (1024 * 1024 * 1024)).toFixed(2);
console.log(` 📊 File size: ${fileSize} GB`);
// Defer index creation to after import to reduce memory pressure
// Only create the unique matchId index before import to prevent duplicates
console.log(` 📇 Creating unique matchId index...`);
await collection.createIndex({ "metadata.matchId": 1 }, { unique: true, background: false });
await processLineDelimitedFormat(filePath, collection, batchSize, startTime);
// Create additional indexes after import to reduce memory pressure
console.log(`\n 📇 Creating additional indexes (this may take a while)...`);
try {
await collection.createIndex({ "info.gameDuration": 1 }, { background: true });
await collection.createIndex({ "info.participants.championId": 1 }, { background: true });
await collection.createIndex({ "info.participants.win": 1 }, { background: true });
console.log(` ✅ Indexes created successfully`);
} catch (indexError) {
console.log(` ⚠️ Warning: Could not create additional indexes: ${indexError.message}`);
}
const totalTime = ((Date.now() - startTime) / 1000).toFixed(1);
console.log(`🎉 Import complete in ${totalTime} seconds`);
console.log(`✅ Processed: ${processed.toLocaleString()} matches`);
@@ -66,6 +76,7 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
let batch = [];
let lineCount = 0;
let batchCount = 0;
for await (const line of rl) {
lineCount++;
@@ -88,9 +99,16 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
batch.push(match);
if (batch.length >= batchSize) {
process.stdout.write(`\r Inserting batch into MongoDB... `);
batchCount++;
process.stdout.write(`\r Inserting batch #${batchCount} (${batch.length} matches)... `);
await insertBatch(batch, collection);
batch = [];
// Yield to the event loop every 10 batches so other pending callbacks can run
// between inserts. Note: setImmediate does not force garbage collection; it
// only hands control back to the event loop before the next batch is read.
if (batchCount % 10 === 0) {
await new Promise(resolve => setImmediate(resolve));
}
}
} catch (error) {
skipped++;
@@ -99,8 +117,11 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
// Insert remaining matches
if (batch.length > 0) {
process.stdout.write(`\r Inserting final batch (${batch.length} matches)... `);
await insertBatch(batch, collection);
}
console.log(`\n 📊 Total batches inserted: ${batchCount + 1}`);
}
async function insertBatch(batch, collection) {
+77 -7
View File
@@ -391,16 +391,35 @@ async function handleMatchList(
const database = client.db('matches')
const collectionName = platform ? `${patch}_${platform}` : patch
const matches = database.collection(collectionName)
const allMatches = matches.find()
const totalMatches: number = await matches.countDocuments()
// Process matches in batches to limit memory usage
const BATCH_SIZE = 1000
let currentMatch = 0
for await (const match of allMatches) {
process.stdout.write(
'\rComputing champion stats, game entry ' + currentMatch + '/' + totalMatches + ' ... '
)
currentMatch += 1
handleMatch(match as unknown as Match, champions, platform)
let processedInBatch = 0
// Use cursor with batch size to limit memory consumption
const cursor = matches.find().batchSize(BATCH_SIZE)
try {
for await (const match of cursor) {
process.stdout.write(
'\rComputing champion stats, game entry ' + currentMatch + '/' + totalMatches + ' ... '
)
currentMatch += 1
processedInBatch += 1
handleMatch(match as unknown as Match, champions, platform)
// Once every BATCH_SIZE matches, reset the counter and yield to the event loop
if (processedInBatch >= BATCH_SIZE) {
processedInBatch = 0
// setImmediate defers to the next event-loop iteration; it does not itself trigger garbage collection
await new Promise(resolve => setImmediate(resolve))
}
}
} finally {
// Ensure cursor is closed
await cursor.close()
}
return totalMatches
@@ -736,6 +755,48 @@ async function championList() {
return list.slice(1)
}
/**
 * Run MongoDB's `compact` command on every per-platform matches collection.
 *
 * Collections live in the `matches` database and are named
 * `${patch}_${platform}`. Compaction is best-effort: any failure is logged
 * and the loop moves on to the next platform, so this function never rejects
 * because of a failed `compact`.
 *
 * NOTE(review): `compact` is documented as a storage-level operation that
 * rewrites/defragments collection data. Whether it also shrinks the server's
 * WiredTiger cache or resident memory (as the log banner suggests) is not
 * established by this code — confirm against the MongoDB documentation.
 *
 * @param client - Client used to obtain the `matches` database handle.
 * @param patch - Patch identifier used as the collection-name prefix.
 * @param platforms - Platform identifiers; one collection is compacted per
 *   entry. An empty array results in no `compact` command being issued.
 */
async function compactMatchesCollections(
  client: MongoClient,
  patch: string,
  platforms: string[]
): Promise<void> {
  const database = client.db('matches')

  console.log('\n=== Compacting matches collections to release memory ===')

  for (const platform of platforms) {
    const collectionName = `${patch}_${platform}`
    console.log(`Compacting collection: ${collectionName}...`)

    try {
      // `compact` has to be issued against the database that owns the collection.
      const result = await database.command({ compact: collectionName, force: true })
      console.log(`Compaction result for ${collectionName}:`, result)
    } catch (error: unknown) {
      // Failures here (missing collection, insufficient privileges, ...) are
      // not critical for the caller, so they are reported and skipped.
      const errorMsg = error instanceof Error ? error.message : String(error)

      // Server errors expose the symbolic name in `codeName` and the numeric
      // code in `code` (26 = NamespaceNotFound). The human-readable message
      // does not necessarily contain that name, so inspect the structured
      // fields first and keep the message check only as a fallback.
      const details =
        typeof error === 'object' && error !== null
          ? (error as { code?: unknown; codeName?: unknown })
          : {}
      const isMissingCollection =
        details.codeName === 'NamespaceNotFound' ||
        details.code === 26 ||
        errorMsg.includes('NamespaceNotFound')

      if (isMissingCollection) {
        console.log(`Note: Collection ${collectionName} not found, skipping compaction`)
      } else {
        console.log(`Note: Could not compact ${collectionName}:`, errorMsg)
      }
    }
  }

  console.log('Compaction complete.')
}
async function makeChampionsStats(client: MongoClient, patch: string, platforms: string[] = []) {
const globalItems = await itemList()
for (const item of globalItems) {
@@ -766,6 +827,12 @@ async function makeChampionsStats(client: MongoClient, patch: string, platforms:
const platformMatches = await handleMatchList(client, patch, champions, platform)
totalMatches += platformMatches
console.log(`Processed ${platformMatches} matches from ${platform}`)
// NOTE: nothing is actually cleared here — the block below only logs a message
// when itemDict is non-empty; itemDict itself is left untouched.
// TODO: either clear the per-platform entries or remove this log line.
if (itemDict.size > 0) {
console.log(`Clearing item cache to free memory...`)
}
}
console.log(`\n=== Total matches processed: ${totalMatches} ===`)
@@ -781,6 +848,9 @@ async function makeChampionsStats(client: MongoClient, patch: string, platforms:
// Create alias-index for better key-find
await collection.createIndex({ alias: 1 })
console.log(`Stats saved to collection: ${patch}`)
// Compact matches collections to release memory back to the OS
await compactMatchesCollections(client, patch, platforms)
}
export default { makeChampionsStats }