Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Items in cluster improvements #1

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 53 additions & 8 deletions lib/LocalCentroid.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2014 BigML
* Copyright 2014-2015 BigML
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
Expand All @@ -24,7 +24,7 @@ var utils = require(PATH + 'utils');
function cosineDistance2(terms, centroidTerms, scale) {
/**
* Returns the square of the distance defined by cosine similarity
*
*
* @param {array} terms Array of input terms
* @param {array} centroidTerms Array of terms used in the centroid field
* @param {number} scale Scaling factor for the field
Expand Down Expand Up @@ -56,6 +56,46 @@ function cosineDistance2(terms, centroidTerms, scale) {
}


function cosineItemsDistance2(inputItems, centroidItems, scale) {
  /**
   * Returns the square of the distance defined by cosine similarity
   * between the input items and the items of a centroid field
   *
   * NOTE(review): the PR reviewer suggested this could be merged with
   * cosineDistance2 in a later change; it is kept separate here.
   *
   * @param {array} inputItems Array of input items
   * @param {array} centroidItems Array of items used in the centroid field
   * @param {number} scale Scaling factor for the field
   * @return {number} Squared, scaled distance
   */

  var inputCount = 0, cosineSimilarity, similarityDistance, i,
    inputItemsLength = inputItems.length,
    centroidItemsLength = centroidItems.length;

  // Centroid values for the field can be an empty list.
  // When the input is empty too, both sides agree and the distance is 0.
  if (inputItemsLength === 0 && centroidItemsLength === 0) {
    return 0;
  }
  // When exactly one side is empty nothing can match: the distance is 1
  // before applying the scale factor, so its square is scale^2.
  if (inputItemsLength === 0 || centroidItemsLength === 0) {
    return Math.pow(scale, 2);
  }

  //TODO: sorting arrays search is more efficient
  // Count the centroid items that are present in the input
  // (indexOf compares with strict equality).
  for (i = 0; i < centroidItemsLength; i++) {
    if (inputItems.indexOf(centroidItems[i]) !== -1) {
      inputCount += 1;
    }
  }
  cosineSimilarity = (inputCount /
                      Math.sqrt(inputItemsLength * centroidItemsLength));
  similarityDistance = scale * (1 - cosineSimilarity);
  return Math.pow(similarityDistance, 2);
}



/**
* LocalCentroid
* @constructor
Expand All @@ -72,7 +112,7 @@ function LocalCentroid(centroidInfo) {
this.name = centroidInfo.name;
}

LocalCentroid.prototype.distance2 = function (inputData, termSets,
LocalCentroid.prototype.distance2 = function (inputData, termSets, items,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add the corresponding docstring

scales, stopDistance2) {
/**
* Squared distance from the given input data to the centroid
Expand All @@ -87,19 +127,24 @@ LocalCentroid.prototype.distance2 = function (inputData, termSets,
* squared distance
*/

var distance2 = 0.0, fieldId, value, valueType, terms;
var distance2 = 0.0, fieldId, value, valueType, dataIn;
for (fieldId in this.center) {
if (this.center.hasOwnProperty(fieldId)) {
value = this.center[fieldId];
valueType = typeof value;
if (utils.isArray(value)) {
// text field
if (termSets.hasOwnProperty(fieldId)) {
terms = termSets[fieldId];
// text field
dataIn = termSets[fieldId];
distance2 += cosineDistance2(dataIn, value, scales[fieldId]);
} else if (items.hasOwnProperty(fieldId)) {
//items field
dataIn = items[fieldId];
distance2 += cosineItemsDistance2(dataIn, value, scales[fieldId]);
} else {
terms = [];
dataIn = [];
distance2 += cosineDistance2(dataIn, value, scales[fieldId]);
}
distance2 += cosineDistance2(terms, value, scales[fieldId]);
} else {
switch (valueType) {
case 'string':
Expand Down
87 changes: 82 additions & 5 deletions lib/LocalCluster.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2014 BigML
* Copyright 2014-2015 BigML
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
Expand Down Expand Up @@ -27,6 +27,62 @@ var LocalCentroid = require(PATH + 'LocalCentroid');
var constants = require(PATH + 'constants');


function countItemsMatches(text, items) {
  /**
   * Checks whether a single item is contained in the given items list
   *
   * @param {string} text Input item (single token)
   * @param {array} items Items to match against. Each entry is indexed
   *                      with [0] to get its text (presumably
   *                      [item, count] pairs from the field summary;
   *                      verify against the caller)
   * @return {number} 1 if the item is found, 0 otherwise
   */
  // The loop index is declared locally: the undeclared assignment used
  // before leaked a global and throws in strict mode.
  var indx, itemsLength = items.length;
  for (indx = 0; indx < itemsLength; indx++) {
    if (text === items[indx][0]) {
      return 1;
    }
  }
  return 0;
}


function itemMatches(text, summaryItems) {
  /**
   * Returns the input items that are also present in the summary items
   *
   * NOTE(review): per the PR discussion, an items field is expected to
   * hold only text tokens (either already a list, or text that has to be
   * tokenized); numerical segments are not handled here. Confirm against
   * the field's items analysis options.
   *
   * @param {string|array} text Input items: an array of tokens, or a
   *                            string to be split into tokens
   * @param {array} summaryItems Items to match against (passed on to
   *                             countItemsMatches)
   * @return {array} Input items found in summaryItems, in input order
   */

  var inputItems,
    itemMatchesList = [],
    singleItem,
    indx,
    regexp;

  // typeof null is 'object', so a missing value must be handled first
  if (text === null || typeof text === 'undefined') {
    return [];
  }
  if (typeof text === 'object') {
    //Array case: read-only, the caller's array is never modified
    inputItems = text;
  } else {
    //Text string case
    regexp = new RegExp('(\\b|_)([^\\b_\\s]+?)(\\b|_)', 'g');
    // match() returns null when the text contains no token
    inputItems = text.match(regexp) || [];
  }
  // As before, an input with no first item (or an empty one) has no matches
  if (!inputItems[0]) {
    return [];
  }

  //Text array here
  for (indx = 0; indx < inputItems.length; indx++) {
    singleItem = inputItems[indx];
    if (countItemsMatches(singleItem, summaryItems) > 0) {
      itemMatchesList.push(singleItem);
    }
  }
  return itemMatchesList;
}




function parseTerms(text, caseSensitive) {
/**
* Parses the text into words
Expand Down Expand Up @@ -123,7 +179,7 @@ function LocalCluster(resource, connection) {
* @param {object} resource Model's resource info
*/
var status, fields, field, fieldId, fieldInfo, clusters, i, clustersLength,
centroid;
centroid, summaryFields, index;
if (error) {
throw new Error('Cannot create the Cluster instance. Could not' +
' retrieve the resource: ' + error);
Expand All @@ -145,7 +201,12 @@ function LocalCluster(resource, connection) {
self.termForms = {};
self.tagClouds = {};
self.termAnalysis = {};
self.items = {};
fields = resource.clusters.fields;
summaryFields = resource['summary_fields'];
for (index = 0; index < summaryFields.length; index++) {
delete fields[summaryFields[index]];
}
for (fieldId in fields) {
if (fields.hasOwnProperty(fieldId)) {
field = fields[fieldId];
Expand All @@ -154,6 +215,9 @@ function LocalCluster(resource, connection) {
self.tagClouds[fieldId] = field.summary.tag_cloud;
self.termAnalysis[fieldId] = field.term_analysis;
}
if (field.optype === 'items') {
self.items[fieldId] = field.summary.items;
}
}
}
self.fields = fields;
Expand Down Expand Up @@ -196,7 +260,7 @@ LocalCluster.prototype.computeNearest = function (inputData) {
*
* @param {object} data Input data to predict from
*/
var uniqueTerms = {}, terms, caseSensitive, tokenMode, inputDataField,
var uniqueTerms = {}, terms, items, caseSensitive, tokenMode, inputDataField,
fieldId, nearest, distance2, i, clustersLength, centroid;
for (fieldId in this.tagClouds) {
if (inputData.hasOwnProperty(fieldId)) {
Expand All @@ -210,7 +274,7 @@ LocalCluster.prototype.computeNearest = function (inputData) {
} else {
terms = [];
}

if (tokenMode !== constants.TM_TOKENS) {
terms.push((caseSensitive) ? inputDataField :
inputDataField.toLowerCase());
Expand All @@ -223,12 +287,24 @@ LocalCluster.prototype.computeNearest = function (inputData) {
}
}

//items fields
items = [];
for (fieldId in this.items) {
if (inputData.hasOwnProperty(fieldId)) {
if (inputData.hasOwnProperty(fieldId)) {
inputDataField = inputData[fieldId];
items[fieldId] = itemMatches(inputDataField, this.items[fieldId]);
delete inputData[fieldId];
}
}
}

nearest = {'centroidId': null, 'centroidName': null,
'distance': Infinity};
clustersLength = this.centroids.length;
for (i = 0; i < clustersLength; i++) {
centroid = this.centroids[i];
distance2 = centroid.distance2(inputData, uniqueTerms, this.scales,
distance2 = centroid.distance2(inputData, uniqueTerms, items, this.scales,
nearest.distance);
if (distance2 < nearest.distance) {
nearest = {'centroidId': centroid.centroidId,
Expand Down Expand Up @@ -290,6 +366,7 @@ LocalCluster.prototype.validateInput = function (inputData, cb) {
if (this.fields.hasOwnProperty(fieldId)) {
field = this.fields[fieldId];
if (field.optype !== "categorical" && field.optype !== "text" &&
field.optype !== "items" &&
!inputData.hasOwnProperty(fieldId) &&
!inputData.hasOwnProperty(field.name)) {
throw new Error("The input data lacks some numeric fields values." +
Expand Down