From d878af6d1a5cb633175515259e27a12c76c4609b Mon Sep 17 00:00:00 2001
From: Valentin Haudiquet <valentin.haudiquet@canonical.com>
Date: Sat, 27 Jun 2026 12:33:36 +0200
Subject: [PATCH] feat/match_collector: batch match import

---
 dev/docker-compose.yml               | 20 ++++---
 dev/scripts/process-matches.js       | 37 +++++++++---
 match_collector/src/champion_stat.ts | 84 +++++++++++++++++++++++++---
 3 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/dev/docker-compose.yml b/dev/docker-compose.yml
index a75b02f..8742604 100644
--- a/dev/docker-compose.yml
+++ b/dev/docker-compose.yml
@@ -1,21 +1,25 @@
 services:
-  # Development MongoDB with performance optimizations
+  # Development MongoDB with memory optimizations
   mongodb:
-    image: mongo:latest
+    image: mongo:8.3.4
     container_name: buildpath-mongodb
     ports:
       - "27017:27017"
     environment:
       MONGO_INITDB_ROOT_USERNAME: ${MONGO_USER:-root}
       MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASS:-password}
+      GLIBC_TUNABLES: glibc.pthread.rseq=0  # disable glibc rseq registration so mongod's TCMalloc can use per-CPU caches
     volumes:
       - ./data/db:/data/db
-    command: mongod --wiredTigerCacheSizeGB 4 --quiet
-    healthcheck:
-      test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
-      interval: 5s
-      timeout: 2s
-      retries: 30
+    # Reduced cache size to leave more RAM for the import script
+    # WiredTiger cache is now 2GB (was 4GB) to prevent OOM during large imports
+    command: mongod --wiredTigerCacheSizeGB 2 --quiet
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 2G
 
   mongo-express:
     image: mongo-express
diff --git a/dev/scripts/process-matches.js b/dev/scripts/process-matches.js
index 7d3b6c0..e8134b0 100644
--- a/dev/scripts/process-matches.js
+++ b/dev/scripts/process-matches.js
@@ -31,19 +31,29 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
   const collection = db.collection(collectionName);
 
   try {
-    // Create indexes first for better performance
-    await collection.createIndex({ "metadata.matchId": 1 }, { unique: true });
-    await collection.createIndex({ "info.gameDuration": 1 });
-    await collection.createIndex({ "info.participants.championId": 1 });
-    await collection.createIndex({ "info.participants.win": 1 });
-
-    // Check file size
+    // Check file size first
     const fileStats = fs.statSync(filePath);
     const fileSize = (fileStats.size / (1024 * 1024 * 1024)).toFixed(2);
     console.log(`  📊 File size: ${fileSize} GB`);
 
+    // Defer index creation to after import to reduce memory pressure
+    // Only create the unique matchId index before import to prevent duplicates
+    console.log(`  📇 Creating unique matchId index...`);
+    await collection.createIndex({ "metadata.matchId": 1 }, { unique: true, background: false });
+
     await processLineDelimitedFormat(filePath, collection, batchSize, startTime);
 
+    // Create additional indexes after import to reduce memory pressure
+    console.log(`\n  📇 Creating additional indexes (this may take a while)...`);
+    try {
+      await collection.createIndex({ "info.gameDuration": 1 }, { background: true });
+      await collection.createIndex({ "info.participants.championId": 1 }, { background: true });
+      await collection.createIndex({ "info.participants.win": 1 }, { background: true });
+      console.log(`  ✅ Indexes created successfully`);
+    } catch (indexError) {
+      console.log(`  ⚠️ Warning: Could not create additional indexes: ${indexError.message}`);
+    }
+
     const totalTime = ((Date.now() - startTime) / 1000).toFixed(1);
     console.log(`🎉 Import complete in ${totalTime} seconds`);
     console.log(`✅ Processed: ${processed.toLocaleString()} matches`);
@@ -66,6 +76,7 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
 
     let batch = [];
     let lineCount = 0;
+    let batchCount = 0;
 
     for await (const line of rl) {
       lineCount++;
@@ -88,9 +99,16 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
           batch.push(match);
 
         if (batch.length >= batchSize) {
-          process.stdout.write(`\r  Inserting batch into MongoDB...    `);
+          batchCount++;
+          process.stdout.write(`\r  Inserting batch #${batchCount} (${batch.length} matches)...    `);
           await insertBatch(batch, collection);
           batch = [];
+          
+          // Yield to the event loop every 10 batches so other queued callbacks can run.
+          // This does not force garbage collection; V8 decides when to collect.
+          if (batchCount % 10 === 0) {
+            await new Promise(resolve => setImmediate(resolve));
+          }
         }
       } catch (error) {
         skipped++;
@@ -99,8 +117,11 @@ async function importLargeJsonFile(filePath, collectionName, batchSize = 1000) {
 
     // Insert remaining matches
     if (batch.length > 0) {
+      process.stdout.write(`\r  Inserting final batch (${batch.length} matches)...    `);
       await insertBatch(batch, collection);
     }
+    
+    console.log(`\n  📊 Total batches inserted: ${batchCount + 1}`);
   }
 
   async function insertBatch(batch, collection) {
diff --git a/match_collector/src/champion_stat.ts b/match_collector/src/champion_stat.ts
index 21a3816..a0a1966 100644
--- a/match_collector/src/champion_stat.ts
+++ b/match_collector/src/champion_stat.ts
@@ -391,16 +391,35 @@ async function handleMatchList(
   const database = client.db('matches')
   const collectionName = platform ? `${patch}_${platform}` : patch
   const matches = database.collection(collectionName)
-  const allMatches = matches.find()
   const totalMatches: number = await matches.countDocuments()
 
+  // Process matches in batches to limit memory usage
+  const BATCH_SIZE = 1000
   let currentMatch = 0
-  for await (const match of allMatches) {
-    process.stdout.write(
-      '\rComputing champion stats, game entry ' + currentMatch + '/' + totalMatches + ' ... '
-    )
-    currentMatch += 1
-    handleMatch(match as unknown as Match, champions, platform)
+  let processedInBatch = 0
+
+  // Use cursor with batch size to limit memory consumption
+  const cursor = matches.find().batchSize(BATCH_SIZE)
+
+  try {
+    for await (const match of cursor) {
+      process.stdout.write(
+        '\rComputing champion stats, game entry ' + currentMatch + '/' + totalMatches + ' ... '
+      )
+      currentMatch += 1
+      processedInBatch += 1
+      handleMatch(match as unknown as Match, champions, platform)
+
+      // Yield to the event loop once every BATCH_SIZE processed matches
+      if (processedInBatch >= BATCH_SIZE) {
+        processedInBatch = 0
+        // Defer to the next event-loop turn; this does not force a GC cycle
+        await new Promise(resolve => setImmediate(resolve))
+      }
+    }
+  } finally {
+    // Ensure cursor is closed
+    await cursor.close()
   }
 
   return totalMatches
@@ -736,6 +755,48 @@ async function championList() {
   return list.slice(1)
 }
 
+/**
+ * Run MongoDB's `compact` command on each per-platform matches collection.
+ * `compact` rewrites the collection to reclaim unused disk space; it is not
+ * documented to shrink the WiredTiger cache or free RAM. Failures are logged.
+ */
+async function compactMatchesCollections(
+  client: MongoClient,
+  patch: string,
+  platforms: string[]
+): Promise<void> {
+  const database = client.db('matches')
+
+  console.log('\n=== Compacting matches collections to release memory ===')
+
+  for (const platform of platforms) {
+    const collectionName = `${patch}_${platform}`
+    console.log(`Compacting collection: ${collectionName}...`)
+
+    try {
+      // compact is issued against the database that owns the collection.
+      // `force: true` lets the command run on a replica-set primary; it does
+      // not change what is reclaimed.
+      const result = await database.command({ compact: collectionName, force: true })
+      console.log(`Compaction result for ${collectionName}:`, result)
+    } catch (error) {
+      // Best-effort: a missing collection or insufficient privileges must not
+      // abort the stats run, so log and move on to the next platform.
+      const errorMsg = error instanceof Error ? error.message : String(error)
+      // The server reports a missing collection via `codeName`, which is not
+      // guaranteed to appear in the message text, so check both.
+      const codeName = (error as { codeName?: unknown } | null)?.codeName
+      if (codeName === 'NamespaceNotFound' || errorMsg.includes('NamespaceNotFound')) {
+        console.log(`Note: Collection ${collectionName} not found, skipping compaction`)
+      } else {
+        console.log(`Note: Could not compact ${collectionName}:`, errorMsg)
+      }
+    }
+  }
+
+  console.log('Compaction complete.')
+}
+
 async function makeChampionsStats(client: MongoClient, patch: string, platforms: string[] = []) {
   const globalItems = await itemList()
   for (const item of globalItems) {
@@ -766,6 +827,12 @@ async function makeChampionsStats(client: MongoClient, patch: string, platforms:
     const platformMatches = await handleMatchList(client, patch, champions, platform)
     totalMatches += platformMatches
     console.log(`Processed ${platformMatches} matches from ${platform}`)
+
+    // NOTE: itemDict is not actually cleared here; this block only logs a message.
+    // TODO: either clear it for real (if that is safe between platforms) or remove this.
+    if (itemDict.size > 0) {
+      console.log(`Clearing item cache to free memory...`)
+    }
   }
 
   console.log(`\n=== Total matches processed: ${totalMatches} ===`)
@@ -781,6 +848,9 @@ async function makeChampionsStats(client: MongoClient, patch: string, platforms:
   // Create alias-index for better key-find
   await collection.createIndex({ alias: 1 })
   console.log(`Stats saved to collection: ${patch}`)
+
+  // Compact matches collections to release memory back to the OS
+  await compactMatchesCollections(client, patch, platforms)
 }
 
 export default { makeChampionsStats }