-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Items in cluster improvements #1
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
/** | ||
* Copyright 2014 BigML | ||
* Copyright 2014-2015 BigML | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. You may obtain | ||
|
@@ -24,7 +24,7 @@ var utils = require(PATH + 'utils'); | |
function cosineDistance2(terms, centroidTerms, scale) { | ||
/** | ||
* Returns the square of the distance defined by cosine similarity | ||
* | ||
* | ||
* @param {array} terms Array of input terms | ||
* @param {array} centroidTerms Array of terms used in the centroid field | ||
* @param {number} scale Scaling factor for the field | ||
|
@@ -56,6 +56,46 @@ function cosineDistance2(terms, centroidTerms, scale) { | |
} | ||
|
||
|
||
function cosineItemsDistance2(inputItems, centroidItems, scale) {
  /**
   * Returns the square of the distance defined by cosine similarity
   * between the input items and the centroid items of a field
   *
   * @param {array} inputItems Array of input items
   * @param {array} centroidItems Array of items used in the centroid field
   * @param {number} scale Scaling factor for the field
   * @return {number} Square of the scaled distance
   */

  var sharedCount = 0, cosineSimilarity, similarityDistance, i,
    inputItemsLength = inputItems.length,
    centroidItemsLength = centroidItems.length;

  // Centroid values for the field can be an empty list.
  // Two empty lists are identical, so their distance is 0.
  if (inputItemsLength === 0 && centroidItemsLength === 0) {
    return 0;
  }
  // When only one of the lists is empty nothing can be shared: the
  // distance is 1 (before applying the scale factor).
  if (inputItemsLength === 0 || centroidItemsLength === 0) {
    return Math.pow(scale, 2);
  }

  // Count the centroid items that also appear in the input. indexOf
  // compares with strict equality.
  //TODO: sorting arrays search is more efficient
  for (i = 0; i < centroidItemsLength; i++) {
    if (inputItems.indexOf(centroidItems[i]) !== -1) {
      sharedCount += 1;
    }
  }

  cosineSimilarity = (sharedCount /
                      Math.sqrt(inputItemsLength * centroidItemsLength));
  similarityDistance = scale * (1 - cosineSimilarity);
  return Math.pow(similarityDistance, 2);
}
|
||
|
||
|
||
/** | ||
* LocalCentroid | ||
* @constructor | ||
|
@@ -72,7 +112,7 @@ function LocalCentroid(centroidInfo) { | |
this.name = centroidInfo.name; | ||
} | ||
|
||
LocalCentroid.prototype.distance2 = function (inputData, termSets, | ||
LocalCentroid.prototype.distance2 = function (inputData, termSets, items, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add the corresponding docstring. |
||
scales, stopDistance2) { | ||
/** | ||
* Squared distance from the given input data to the centroid | ||
|
@@ -87,19 +127,24 @@ LocalCentroid.prototype.distance2 = function (inputData, termSets, | |
* squared distance | ||
*/ | ||
|
||
var distance2 = 0.0, fieldId, value, valueType, terms; | ||
var distance2 = 0.0, fieldId, value, valueType, dataIn; | ||
for (fieldId in this.center) { | ||
if (this.center.hasOwnProperty(fieldId)) { | ||
value = this.center[fieldId]; | ||
valueType = typeof value; | ||
if (utils.isArray(value)) { | ||
// text field | ||
if (termSets.hasOwnProperty(fieldId)) { | ||
terms = termSets[fieldId]; | ||
// text field | ||
dataIn = termSets[fieldId]; | ||
distance2 += cosineDistance2(dataIn, value, scales[fieldId]); | ||
} else if (items.hasOwnProperty(fieldId)) { | ||
//items field | ||
dataIn = items[fieldId]; | ||
distance2 += cosineItemsDistance2(dataIn, value, scales[fieldId]); | ||
} else { | ||
terms = []; | ||
dataIn = []; | ||
distance2 += cosineDistance2(dataIn, value, scales[fieldId]); | ||
} | ||
distance2 += cosineDistance2(terms, value, scales[fieldId]); | ||
} else { | ||
switch (valueType) { | ||
case 'string': | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
/** | ||
* Copyright 2014 BigML | ||
* Copyright 2014-2015 BigML | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. You may obtain | ||
|
@@ -27,6 +27,62 @@ var LocalCentroid = require(PATH + 'LocalCentroid'); | |
var constants = require(PATH + 'constants'); | ||
|
||
|
||
function countItemsMatches(text, items) {
  /**
   * Checks whether an item is contained in a list of items
   *
   * @param {string} text Input item (single token)
   * @param {array} items Array of entries to match; the first element of
   *                      each entry is compared to the input with ===
   * @return {number} 1 if the item is found, 0 otherwise
   */

  // Declared locally: without `var` the index leaks as a global and the
  // function throws a ReferenceError in strict mode.
  var indx, itemsLength = items.length;

  for (indx = 0; indx < itemsLength; indx++) {
    if (text === items[indx][0]) {
      return 1;
    }
  }
  return 0;
}
|
||
|
||
function itemMatches(text, summaryItems) {
  /**
   * Computes the input items that are found in the field's summary items
   *
   * NOTE(review): the pull request discussion on this function states that
   * an items field only holds text tokens, split by a user-given separator
   * or regular expression. The string case below tokenizes with a fixed
   * pattern instead -- confirm that this is the intended behavior.
   *
   * @param {string|array} text Input items, either as an array of tokens
   *                            or as a single string to be tokenized
   * @param {array} summaryItems Items to match (handed over to
   *                             countItemsMatches)
   * @return {array} Input items that matched, in input order
   */

  var inputItems,
    itemMatchesList = [],
    singleItem,
    firstItem,
    indx,
    regexp;

  // typeof null is 'object', so a missing value must be ruled out before
  // the array case. It is treated as "no items".
  if (text === null || typeof text === 'undefined') {
    return [];
  }

  if (typeof text === 'object') {
    //Array case: only read, the caller's array is never modified
    inputItems = text;
  } else {
    //Text string case: match() yields null when there are no matches
    regexp = new RegExp('(\\b|_)([^\\b_\\s]+?)(\\b|_)', 'g');
    inputItems = text.match(regexp) || [];
  }

  // An empty list, or a list whose first element is empty, has no items
  firstItem = inputItems[0];
  if (!firstItem || String(firstItem) === '') {
    return [];
  }

  //Text array here
  for (indx = 0; indx < inputItems.length; indx++) {
    singleItem = inputItems[indx];
    if (countItemsMatches(singleItem, summaryItems) > 0) {
      itemMatchesList.push(singleItem);
    }
  }
  return itemMatchesList;
}
|
||
|
||
|
||
|
||
function parseTerms(text, caseSensitive) { | ||
/** | ||
* Parses the text into words | ||
|
@@ -123,7 +179,7 @@ function LocalCluster(resource, connection) { | |
* @param {object} resource Model's resource info | ||
*/ | ||
var status, fields, field, fieldId, fieldInfo, clusters, i, clustersLength, | ||
centroid; | ||
centroid, summaryFields, index; | ||
if (error) { | ||
throw new Error('Cannot create the Cluster instance. Could not' + | ||
' retrieve the resource: ' + error); | ||
|
@@ -145,7 +201,12 @@ function LocalCluster(resource, connection) { | |
self.termForms = {}; | ||
self.tagClouds = {}; | ||
self.termAnalysis = {}; | ||
self.items = {}; | ||
fields = resource.clusters.fields; | ||
summaryFields = resource['summary_fields']; | ||
for (index = 0; index < summaryFields.length; index++) { | ||
delete fields[summaryFields[index]]; | ||
} | ||
for (fieldId in fields) { | ||
if (fields.hasOwnProperty(fieldId)) { | ||
field = fields[fieldId]; | ||
|
@@ -154,6 +215,9 @@ function LocalCluster(resource, connection) { | |
self.tagClouds[fieldId] = field.summary.tag_cloud; | ||
self.termAnalysis[fieldId] = field.term_analysis; | ||
} | ||
if (field.optype === 'items') { | ||
self.items[fieldId] = field.summary.items; | ||
} | ||
} | ||
} | ||
self.fields = fields; | ||
|
@@ -196,7 +260,7 @@ LocalCluster.prototype.computeNearest = function (inputData) { | |
* | ||
* @param {object} data Input data to predict from | ||
*/ | ||
var uniqueTerms = {}, terms, caseSensitive, tokenMode, inputDataField, | ||
var uniqueTerms = {}, terms, items, caseSensitive, tokenMode, inputDataField, | ||
fieldId, nearest, distance2, i, clustersLength, centroid; | ||
for (fieldId in this.tagClouds) { | ||
if (inputData.hasOwnProperty(fieldId)) { | ||
|
@@ -210,7 +274,7 @@ LocalCluster.prototype.computeNearest = function (inputData) { | |
} else { | ||
terms = []; | ||
} | ||
|
||
if (tokenMode !== constants.TM_TOKENS) { | ||
terms.push((caseSensitive) ? inputDataField : | ||
inputDataField.toLowerCase()); | ||
|
@@ -223,12 +287,24 @@ LocalCluster.prototype.computeNearest = function (inputData) { | |
} | ||
} | ||
|
||
//items fields | ||
items = []; | ||
for (fieldId in this.items) { | ||
if (inputData.hasOwnProperty(fieldId)) { | ||
if (inputData.hasOwnProperty(fieldId)) { | ||
inputDataField = inputData[fieldId]; | ||
items[fieldId] = itemMatches(inputDataField, this.items[fieldId]); | ||
delete inputData[fieldId]; | ||
} | ||
} | ||
} | ||
|
||
nearest = {'centroidId': null, 'centroidName': null, | ||
'distance': Infinity}; | ||
clustersLength = this.centroids.length; | ||
for (i = 0; i < clustersLength; i++) { | ||
centroid = this.centroids[i]; | ||
distance2 = centroid.distance2(inputData, uniqueTerms, this.scales, | ||
distance2 = centroid.distance2(inputData, uniqueTerms, items, this.scales, | ||
nearest.distance); | ||
if (distance2 < nearest.distance) { | ||
nearest = {'centroidId': centroid.centroidId, | ||
|
@@ -290,6 +366,7 @@ LocalCluster.prototype.validateInput = function (inputData, cb) { | |
if (this.fields.hasOwnProperty(fieldId)) { | ||
field = this.fields[fieldId]; | ||
if (field.optype !== "categorical" && field.optype !== "text" && | ||
field.optype !== "items" && | ||
!inputData.hasOwnProperty(fieldId) && | ||
!inputData.hasOwnProperty(field.name)) { | ||
throw new Error("The input data lacks some numeric fields values." + | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess you define a separate function in order not to mix with the existing one, but I think they can be merged into one. Maybe in a later PR...