Skip to content

Commit 761adeb

Browse files
authored
dev: add mongodb query to find large docs (#3790)
1 parent 96395be commit 761adeb

1 file changed

Lines changed: 83 additions & 0 deletions

File tree

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
// Writes a JSON file with a list of documents larger than 10MB across databases and collections, starting with largest.

// Databases to scan for oversized documents.
const databases = ["xforge_machine", "sf_jobs", "xforge"];

// Accumulates { _id, size, collection, database } entries from every scan.
let allResults = [];

// Zero-pads a date/time component to two digits (e.g. 7 -> "07").
const pad = (value) => String(value).padStart(2, "0");

// Timestamped output filename, e.g. "large-documents-2024-05-01-134502.json".
const now = new Date();
const outputFileName =
  `large-documents-${now.getFullYear()}-${pad(now.getMonth() + 1)}-${pad(now.getDate())}` +
  `-${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}.json`;
/**
 * Returns the current local time formatted as "YYYY-MM-DD HH:MM:SS",
 * used to prefix console log lines.
 * @returns {string} formatted local timestamp
 */
function getTimestamp() {
  // Zero-pad a component to two digits.
  const two = (value) => String(value).padStart(2, "0");
  const t = new Date();
  const datePart = `${t.getFullYear()}-${two(t.getMonth() + 1)}-${two(t.getDate())}`;
  const timePart = `${two(t.getHours())}:${two(t.getMinutes())}:${two(t.getSeconds())}`;
  return `${datePart} ${timePart}`;
}
// Scan every collection in each target database and collect documents whose
// BSON size exceeds 10MB into allResults.
for (const dbName of databases) {
  const database = db.getSiblingDB(dbName);
  const collectionNames = database.getCollectionNames();

  console.log(`${getTimestamp()} Checking ${collectionNames.length} collections in database "${dbName}"...`);

  for (const collectionName of collectionNames) {
    const pipeline = [
      // Compute each document's BSON size server-side.
      { $project: { _id: 1, size: { $bsonSize: "$$ROOT" } } },
      // Keep only documents larger than 10MB.
      { $match: { size: { $gt: 10_000_000 } } },
      // Tag each match with where it was found.
      { $project: { _id: 1, size: 1, collection: collectionName, database: dbName } }
    ];

    const oversized = database
      .getCollection(collectionName)
      .aggregate(pipeline)
      .toArray();

    allResults = allResults.concat(oversized);
  }
}
console.log(
  `${getTimestamp()} Found ${allResults.length} documents larger than 10MB across all databases and collections.`
);

// Largest documents first.
allResults.sort((first, second) => second.size - first.size);

// _id values will be normalized from both string and ObjectId types.
// "_id": { "$oid": "1234" } -> "id": "1234"
// "_id": "1234" -> "id": "1234"
allResults = allResults.map(({ _id, size, collection, database }) => ({
  id: _id.toString(),
  size,
  collection,
  database
}));

// Write results to file
const fs = require("fs");
fs.writeFileSync(outputFileName, EJSON.stringify(allResults, null, 2));
console.log(`${getTimestamp()} Wrote results to ${outputFileName}`);

0 commit comments

Comments
 (0)