Skip to content

Commit

Permalink
Implement SelectKBest algo for feature selection
Browse files Browse the repository at this point in the history
  • Loading branch information
akondas committed Feb 14, 2018
1 parent 52c9ba8 commit fbf84ca
Show file tree
Hide file tree
Showing 14 changed files with 389 additions and 18 deletions.
2 changes: 1 addition & 1 deletion src/FeatureExtraction/TfIdfTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public function __construct(?array $samples = null)
}
}

public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->countTokensFrequency($samples);

Expand Down
2 changes: 1 addition & 1 deletion src/FeatureExtraction/TokenCountVectorizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null,
$this->minDF = $minDF;
}

public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->buildVocabulary($samples);
}
Expand Down
10 changes: 10 additions & 0 deletions src/FeatureSelection/ScoringFunction.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?php

declare(strict_types=1);

namespace Phpml\FeatureSelection;

/**
* Contract for strategies that rate the features of a data set against its targets.
*/
interface ScoringFunction
{
/**
* Computes scores for the given samples and their targets.
*
* NOTE(review): SelectKBest ranks the returned values in descending order and
* uses their keys as sample column indexes, so implementations are presumably
* expected to return one score per feature keyed by column index, where a
* higher score means a more useful feature - confirm for new implementations.
*
* @param array $samples rows of feature values
* @param array $targets target value of each sample; the ANOVAFValue implementation
*                       looks targets up by the sample's array key
*
* @return array the computed scores
*/
public function score(array $samples, array $targets): array;
}
21 changes: 21 additions & 0 deletions src/FeatureSelection/ScoringFunction/ANOVAFValue.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?php

declare(strict_types=1);

namespace Phpml\FeatureSelection\ScoringFunction;

use Phpml\FeatureSelection\ScoringFunction;
use Phpml\Math\Statistic\ANOVA;

final class ANOVAFValue implements ScoringFunction
{
    /**
     * Scores features with the one-way ANOVA F statistic.
     *
     * Samples are bucketed by their target value (looked up under the same
     * array key as the sample) and the buckets are handed, re-indexed from
     * zero, to ANOVA::oneWayF().
     *
     * @param array $samples rows of feature values
     * @param array $targets target of each sample, keyed like $samples
     *
     * @return array the values returned by ANOVA::oneWayF()
     */
    public function score(array $samples, array $targets): array
    {
        $samplesByTarget = [];
        foreach (array_keys($samples) as $key) {
            $target = $targets[$key];
            $samplesByTarget[$target][] = $samples[$key];
        }

        // oneWayF() reads class 0 directly, so the target labels are dropped as keys.
        $classes = array_values($samplesByTarget);

        return ANOVA::oneWayF($classes);
    }
}
78 changes: 78 additions & 0 deletions src/FeatureSelection/SelectKBest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
<?php

declare(strict_types=1);

namespace Phpml\FeatureSelection;

use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use Phpml\Transformer;

final class SelectKBest implements Transformer
{
    /**
     * Strategy used to rate every feature.
     *
     * @var ScoringFunction
     */
    private $scoringFunction;

    /**
     * Number of top-scoring features to keep.
     *
     * @var int
     */
    private $k;

    /**
     * Scores produced by the last fit() call, null before the first fit.
     *
     * @var array|null
     */
    private $scores = null;

    /**
     * Scores of the k best features keyed by column index, or null when
     * no column has to be removed (not fitted yet, or k covers every feature).
     *
     * @var array|null
     */
    private $keepColumns = null;

    /**
     * @param ScoringFunction|null $scoringFunction scoring strategy; ANOVAFValue is used when null
     * @param int                  $k               number of features to keep
     */
    public function __construct(?ScoringFunction $scoringFunction = null, int $k = 10)
    {
        if ($scoringFunction === null) {
            $scoringFunction = new ANOVAFValue();
        }

        $this->scoringFunction = $scoringFunction;
        $this->k = $k;
    }

    /**
     * Scores the features and remembers which columns are the k best.
     *
     * @param array      $samples rows of feature values
     * @param array|null $targets target of each sample; required by this transformer
     *
     * @throws InvalidArgumentException when $targets is null or empty
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        if ($targets === null || $targets === []) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $ranked = $this->scoringFunction->score($samples, $targets);
        $this->scores = $ranked;

        // Always drop the selection of a previous fit; otherwise re-fitting on data
        // with k or fewer features would keep filtering by stale column keys.
        $this->keepColumns = null;

        if ($this->k >= count($ranked)) {
            return;
        }

        // Highest score first; keys (column indexes) are preserved by both calls.
        arsort($ranked);
        $this->keepColumns = array_slice($ranked, 0, $this->k, true);
    }

    /**
     * Removes every column that is not among the k best from each sample (in place).
     *
     * Does nothing when no selection is stored. Kept columns are re-indexed from zero.
     *
     * @param array $samples rows of feature values, modified by reference
     */
    public function transform(array &$samples): void
    {
        if ($this->keepColumns === null) {
            return;
        }

        foreach ($samples as &$sample) {
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }

        // Break the reference so later writes to $sample cannot touch the last row.
        unset($sample);
    }

    /**
     * Returns the scores computed by the last fit() call.
     *
     * @throws InvalidOperationException when fit() has not been called yet
     */
    public function scores(): array
    {
        if ($this->scores === null) {
            throw new InvalidOperationException('SelectKBest require to fit first to get scores');
        }

        return $this->scores;
    }
}
4 changes: 1 addition & 3 deletions src/FeatureSelection/VarianceThreshold.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,9 @@ public function __construct(float $threshold = 0.0)
}

$this->threshold = $threshold;
$this->variances = [];
$this->keepColumns = [];
}

public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->variances = array_map(function (array $column) {
return Variance::population($column);
Expand Down
137 changes: 137 additions & 0 deletions src/Math/Statistic/ANOVA.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
<?php

declare(strict_types=1);

namespace Phpml\Math\Statistic;

use Phpml\Exception\InvalidArgumentException;

/**
* Analysis of variance
* https://en.wikipedia.org/wiki/Analysis_of_variance
*/
final class ANOVA
{
    /**
     * Computes the one-way ANOVA F statistic of every feature.
     *
     * The one-way ANOVA tests the null hypothesis that two or more groups share
     * the same population mean. Groups may have different sizes.
     *
     * @param array|array[] $samples one entry per class, each holding that class' samples
     *
     * @return array|float[] F value per feature
     *
     * @throws InvalidArgumentException when fewer than two classes are given
     */
    public static function oneWayF(array $samples): array
    {
        $classCount = count($samples);
        if ($classCount < 2) {
            throw InvalidArgumentException::arraySizeToSmall(2);
        }

        $classSizes = array_map(static function (array $group): int {
            return count($group);
        }, $samples);
        $totalSamples = array_sum($classSizes);

        $totalSquares = self::featureSquareTotals($samples);
        $classSums = self::featureSumsByClass($samples);
        $squaredGrandSums = self::squaredColumnTotals($classSums);
        $squaredClassSums = self::squareEach($classSums);

        $between = self::betweenGroups(
            $samples,
            $squaredClassSums,
            $classSizes,
            $squaredGrandSums,
            $totalSamples
        );
        $within = self::withinGroups($between, $totalSquares, $squaredGrandSums, $totalSamples);

        // Degrees of freedom between and within the groups.
        $dfBetween = $classCount - 1;
        $dfWithin = $totalSamples - $classCount;

        $meanSquareWithin = [];
        foreach ($within as $feature => $value) {
            $meanSquareWithin[$feature] = $value / $dfWithin;
        }

        $fValues = [];
        foreach ($between as $feature => $value) {
            $fValues[$feature] = ($value / $dfBetween) / $meanSquareWithin[$feature];
        }

        return $fValues;
    }

    /**
     * Sums the squared feature values over all samples of all classes, per feature.
     */
    private static function featureSquareTotals(array $samples): array
    {
        $featureCount = count($samples[0][0]);
        $totals = array_fill(0, $featureCount, 0);

        foreach ($samples as $group) {
            foreach ($group as $row) {
                foreach ($row as $column => $value) {
                    $totals[$column] += $value ** 2;
                }
            }
        }

        return $totals;
    }

    /**
     * Sums the feature values of every class separately; class keys are kept.
     */
    private static function featureSumsByClass(array $samples): array
    {
        $sumsByClass = [];
        foreach ($samples as $classKey => $group) {
            $classTotals = array_fill(0, count($group[0]), 0);
            foreach ($group as $row) {
                foreach ($row as $column => $value) {
                    $classTotals[$column] += $value;
                }
            }

            $sumsByClass[$classKey] = $classTotals;
        }

        return $sumsByClass;
    }

    /**
     * Adds the per-class sums up per feature and squares each resulting total.
     */
    private static function squaredColumnTotals(array $sums): array
    {
        $totals = array_fill(0, count($sums[0]), 0);
        foreach ($sums as $classTotals) {
            foreach ($classTotals as $column => $value) {
                $totals[$column] += $value;
            }
        }

        foreach ($totals as $column => $total) {
            $totals[$column] = $total ** 2;
        }

        return $totals;
    }

    /**
     * Squares every per-class feature sum, keeping the two-level structure.
     */
    private static function squareEach(array $sums): array
    {
        return array_map(static function (array $classTotals): array {
            return array_map(static function ($value) {
                return $value ** 2;
            }, $classTotals);
        }, $sums);
    }

    /**
     * Between-group sum of squares per feature: the squared class sums divided by
     * the class size, added up, minus the squared grand sum divided by the sample count.
     */
    private static function betweenGroups(
        array $samples,
        array $squaredClassSums,
        array $classSizes,
        array $squaredGrandSums,
        int $totalSamples
    ): array {
        $between = array_fill(0, count($samples[0][0]), 0);

        foreach ($squaredClassSums as $classKey => $classTotals) {
            $size = $classSizes[$classKey];
            foreach ($classTotals as $column => $value) {
                $between[$column] += $value / $size;
            }
        }

        foreach ($squaredGrandSums as $column => $value) {
            $between[$column] -= $value / $totalSamples;
        }

        return $between;
    }

    /**
     * Within-group sum of squares per feature: the total sum of squares minus
     * the between-group part.
     */
    private static function withinGroups(
        array $between,
        array $totalSquares,
        array $squaredGrandSums,
        int $totalSamples
    ): array {
        $within = [];
        foreach ($totalSquares as $column => $squares) {
            $totalSumOfSquares = $squares - $squaredGrandSums[$column] / $totalSamples;
            $within[$column] = $totalSumOfSquares - $between[$column];
        }

        return $within;
    }
}
2 changes: 1 addition & 1 deletion src/Preprocessing/Imputer.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public function __construct($missingValue, Strategy $strategy, int $axis = self:
$this->samples = $samples;
}

public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->samples = $samples;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Preprocessing/Normalizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public function __construct(int $norm = self::NORM_L2)
$this->norm = $norm;
}

public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
if ($this->fitted) {
return;
Expand Down
9 changes: 3 additions & 6 deletions src/Transformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,9 @@
interface Transformer
{
/**
* @param array $samples
* Most transformers don't require targets to train, so a null default allows calling fit without passing targets.
*/
public function fit(array $samples);
public function fit(array $samples, ?array $targets = null): void;

/**
* @param array $samples
*/
public function transform(array &$samples);
public function transform(array &$samples): void;
}
10 changes: 5 additions & 5 deletions tests/Classification/MLPClassifierTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public function testSynapsesGeneration(): void
public function testBackpropagationLearning(): void
{
// Single layer 2 classes.
$network = new MLPClassifier(2, [2], ['a', 'b']);
$network = new MLPClassifier(2, [2], ['a', 'b'], 1000);
$network->train(
[[1, 0], [0, 1], [1, 1], [0, 0]],
['a', 'b', 'a', 'b']
Expand Down Expand Up @@ -118,7 +118,7 @@ public function testBackpropagationPartialTraining(): void
public function testBackpropagationLearningMultilayer(): void
{
// Multi-layer 2 classes.
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c']);
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c'], 2000);
$network->train(
[[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]],
['a', 'b', 'a', 'c']
Expand All @@ -133,7 +133,7 @@ public function testBackpropagationLearningMultilayer(): void
public function testBackpropagationLearningMulticlass(): void
{
// Multi-layer more than 2 classes.
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 4]);
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 4], 1000);
$network->train(
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]],
['a', 'b', 'a', 'a', 4]
Expand All @@ -151,7 +151,7 @@ public function testBackpropagationLearningMulticlass(): void
*/
public function testBackpropagationActivationFunctions(ActivationFunction $activationFunction): void
{
$network = new MLPClassifier(5, [3], ['a', 'b'], 10000, $activationFunction);
$network = new MLPClassifier(5, [3], ['a', 'b'], 1000, $activationFunction);
$network->train(
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1]],
['a', 'b', 'a', 'a']
Expand All @@ -178,7 +178,7 @@ public function testSaveAndRestore(): void
// Instantiate a new perceptron trained for the OR problem
$samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
$targets = [0, 1, 1, 1];
$classifier = new MLPClassifier(2, [2], [0, 1]);
$classifier = new MLPClassifier(2, [2], [0, 1], 1000);
$classifier->train($samples, $targets);
$testSamples = [[0, 0], [1, 0], [0, 1], [1, 1]];
$predicted = $classifier->predict($testSamples);
Expand Down
25 changes: 25 additions & 0 deletions tests/FeatureSelection/ScoringFunction/ANOVAFValueTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?php

declare(strict_types=1);

namespace Phpml\Tests\FeatureSelection\ScoringFunction;

use Phpml\Dataset\Demo\IrisDataset;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use PHPUnit\Framework\TestCase;

final class ANOVAFValueTest extends TestCase
{
    /**
     * Scores the Iris demo data set and compares the four resulting values with
     * the expected ones, allowing a difference of 0.0001.
     */
    public function testScoreForANOVAFValue(): void
    {
        $iris = new IrisDataset();
        $scorer = new ANOVAFValue();
        $expected = [119.2645, 47.3644, 1179.0343, 959.3244];

        $actual = $scorer->score($iris->getSamples(), $iris->getTargets());

        self::assertEquals($expected, $actual, '', 0.0001);
    }
}
Loading

0 comments on commit fbf84ca

Please sign in to comment.