This repository has been archived by the owner on Jan 18, 2024. It is now read-only.
forked from variety/variety
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmongoDBSchemaAnalyzer.js
153 lines (118 loc) · 4.71 KB
/
mongoDBSchemaAnalyzer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/* MongoDB Schema Analyzer
This tool helps you get a sense of your application's schema, as well as any
outliers to that schema. Particularly useful when you inherit a codebase with a
data dump and want to quickly learn how the data's structured. Also useful for
finding rare keys.
Please see https://github.com/JamesCropcho/mongodb-schema-analyzer for details.
Released by Maypop Inc, © 2012, under the MIT License. */
// The collection to analyze must be supplied by the caller through a
// `collection` variable; stop right away when it is missing.
if (typeof collection === "undefined") {
  throw "You have to supply a 'collection' variable, à la --eval 'var collection = \"animals\"'";
}

// When the caller did not provide a `limit`, default it to the number of
// documents in the collection, then report the value being used.
if (typeof limit === "undefined") {
  var limit = db[collection].count();
}
print("Using limit of " + limit);
// Tells whether a value can hold nested keys worth walking into.
//
// v - any value read from a document.
//
// Returns true for every non-null object (arrays included) and false for
// null, undefined and all primitives.
var schemaAnalyzerCanHaveChildren = function (v) {
  // Arrays report typeof 'object' too, so one check covers both cases.
  // null also reports 'object' but has nothing to enumerate, so it is excluded.
  return v !== null && typeof v === 'object';
};
// Save the helper into db.system.js under the name that map() and
// schemaAnalyzerMapRecursive() call it by.
db.system.js.save({
  _id: "schemaAnalyzerCanHaveChildren",
  value: schemaAnalyzerCanHaveChildren
});
// Walks every property of `keys` and emits one {key} -> {type} pair per
// property, descending into values that can themselves have children.
//
// parentKey - dotted path of the container being walked.
// keys      - the object or array whose properties are enumerated.
//
// Produces its output through emit(); returns nothing.
var schemaAnalyzerMapRecursive = function (parentKey, keys) {
  for (var childName in keys) {
    var value = keys[childName];
    // Collapse numeric path segments (array positions such as ".0", ".1")
    // into ".XX". Only whole segments are replaced, so a key that merely
    // starts with digits (e.g. "addr.2nd_line") keeps its real name.
    var path = (parentKey + "." + childName).replace(/\.\d+(?=\.|$)/g, '.XX');
    emit({key : path}, {type: schemaAnalyzerTypeOf(value)});
    if (schemaAnalyzerCanHaveChildren(value)) {
      schemaAnalyzerMapRecursive(path, value);
    }
  }
};
// Save the recursive walker into db.system.js under the name that map()
// (and the walker itself, when recursing) call it by.
db.system.js.save({
  _id: "schemaAnalyzerMapRecursive",
  value: schemaAnalyzerMapRecursive
});
// Names the type of a value, distinguishing arrays and null from plain objects.
//
// thing - the value to classify; must not be undefined.
//
// Returns "array", "null" or "object" for values whose typeof is "object",
// and the plain typeof result ("string", "number", "boolean", ...) otherwise.
// Throws a string message when called with undefined.
var schemaAnalyzerTypeOf = function (thing) {
  if (typeof thing === "undefined") { throw "schemaAnalyzerTypeOf() requires an argument"; }
  // Primitives (and functions) are described by typeof directly.
  if (typeof thing !== "object") {
    return typeof thing;
  }
  // typeof null is "object", so null has to be singled out explicitly.
  if (thing === null) {
    return "null";
  }
  // Arrays are recognized by their constructor.
  if (thing.constructor === Array) {
    return "array";
  }
  return "object";
};
// Save the type-naming helper into db.system.js under the name that map()
// and schemaAnalyzerMapRecursive() call it by.
db.system.js.save({
  _id: "schemaAnalyzerTypeOf",
  value: schemaAnalyzerTypeOf
});
// Map step: for the document bound to `this`, emits one {key} -> {type} pair
// per top-level field and hands nested objects/arrays to
// schemaAnalyzerMapRecursive so their keys are emitted as well.
var map = function () {
  var doc = this;
  for (var fieldName in doc) {
    var value = doc[fieldName];
    // Internally, Mongo uses keys like groceries.0, groceries.1, groceries.2 for
    // items in an array. -JC
    // Only whole numeric segments are rewritten, so a name that merely starts
    // with digits after a dot is left as it is.
    var path = fieldName.replace(/\.\d+(?=\.|$)/g, '.XX');
    emit({key : path}, {type: schemaAnalyzerTypeOf(value)});
    if (schemaAnalyzerCanHaveChildren(value)) {
      schemaAnalyzerMapRecursive(path, value);
    }
  }
};
// Reduce step: collapses all values seen for one key into the list of
// distinct type names, in first-seen order.
//
// key    - the emitted key (not used by the reduction itself).
// values - array of values for that key. Each is either a freshly emitted
//          {type: "..."} or an earlier result of this function,
//          {types: [...]}; both shapes are accepted so that feeding a
//          previous result back in does not lose or corrupt any types.
//
// Returns { types: [...] }.
var reduce = function (key, values) {
  var types = [];
  var addType = function (type) {
    if (types.indexOf(type) === -1) {
      // i.e. "if 'types' does not already have 'type', then insert it
      // into 'types'." -JC
      types.push(type);
    }
  };
  values.forEach(function (value) {
    if (value.types) {
      // Already-reduced value: merge every type it carries.
      for (var i = 0; i < value.types.length; i++) {
        addType(value.types[i]);
      }
    } else {
      addType(value.type);
    }
  });
  return { types: types };
};
// Results for collection "<name>" are written to a collection called
// "<name>Keys" inside the "schemaAnalyzerResults" database.
var resultsCollectionName = collection + "Keys";

// Run the analysis over at most `limit` documents, visited in descending
// _id order, replacing any earlier results collection of the same name.
db[collection].mapReduce(map, reduce, {
  out: {
    replace : resultsCollectionName,
    db : "schemaAnalyzerResults"
  },
  limit : limit,
  sort : {_id: -1},
  scope : { limit : limit }
});

// Handle on the results database, plus the collection's document count,
// taken after the map-reduce has run.
var resultsDB = db.getMongo().getDB("schemaAnalyzerResults");
var numDocuments = db[collection].count();
// Using our method of retrieving keys, Mongo gets confused about the following, and
// incorrectly thinks they are keys. -JC
var blackListKeys = ["_id.equals", "_id.getTimestamp", "_id.isObjectId", "_id.str","_id.tojson"];

// Walk every result entry: delete the ones that are not real keys, and add
// occurrence statistics to the rest before saving them back.
resultsDB[resultsCollectionName].find({}).forEach(function(key) {
  // Declared locally so the name does not leak into the global scope.
  var keyName = key["_id"].key;

  // We throw away keys which end in an array index, since they are not useful
  // for our analysis. (We still keep the key of their parent array, though.) -JC
  // Blacklisted pseudo-keys are thrown away the same way.
  if (keyName.match(/\.XX$/) || blackListKeys.indexOf(keyName) !== -1) {
    resultsDB[resultsCollectionName].remove({ "_id" : key["_id"]});
    return;
  }

  // Keys ending in ".XX" have already been handled above, so at this point a
  // key containing ".XX" always has it in the middle of its path.
  if (!keyName.match(/\.XX/)) {
    // i.e. "Unless the key's value is an array which contains arrays" -JC
    // ...we do not support totalOccurrences for these keys because it is
    // a bit too tricky for a 'version 1'. Perhaps we'll support in the future. -JC
    var existsQuery = {};
    existsQuery[keyName] = {$exists: true};
    key.totalOccurrences = db[collection].count(existsQuery);
    key.percentContaining = (key.totalOccurrences / numDocuments) * 100;
  }

  resultsDB[resultsCollectionName].save(key);
});
// Print every remaining result entry, ordered by totalOccurrences descending.
var sortedKeys = resultsDB[resultsCollectionName]
  .find({})
  .sort({totalOccurrences: -1});

sortedKeys.forEach(function (entry) {
  print(tojson(entry, '', true));
});