MDL-65769 lib: update PHP-ML to 0.8.0

This commit is contained in:
Simey Lameze 2019-07-11 16:56:50 +08:00
parent f7e108438f
commit e6c25fb057
126 changed files with 3639 additions and 3753 deletions

View file

@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (c) 2016 Arkadiusz Kondas <arkadiusz.kondas[at]gmail>
Copyright (c) 2016-2018 Arkadiusz Kondas <arkadiusz.kondas[at]gmail>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View file

@ -1,4 +1,4 @@
Current version is 12b8b11
Current version is 0.8.0
# Download latest stable version from https://github.com/php-ai/php-ml
# Remove all files but:

View file

@ -9,15 +9,16 @@ use Phpml\Helper\Trainable;
class Apriori implements Associator
{
use Trainable, Predictable;
use Trainable;
use Predictable;
const ARRAY_KEY_ANTECEDENT = 'antecedent';
public const ARRAY_KEY_ANTECEDENT = 'antecedent';
const ARRAY_KEY_CONFIDENCE = 'confidence';
public const ARRAY_KEY_CONFIDENCE = 'confidence';
const ARRAY_KEY_CONSEQUENT = 'consequent';
public const ARRAY_KEY_CONSEQUENT = 'consequent';
const ARRAY_KEY_SUPPORT = 'support';
public const ARRAY_KEY_SUPPORT = 'support';
/**
* Minimum relative probability of frequent transactions.
@ -31,7 +32,7 @@ class Apriori implements Associator
*
* @var mixed[][][]
*/
private $large;
private $large = [];
/**
* Minimum relative frequency of transactions.
@ -45,13 +46,10 @@ class Apriori implements Associator
*
* @var mixed[][]
*/
private $rules;
private $rules = [];
/**
* Apriori constructor.
*
* @param float $support
* @param float $confidence
*/
public function __construct(float $support = 0.0, float $confidence = 0.0)
{
@ -64,13 +62,13 @@ class Apriori implements Associator
*
* @return mixed[][]
*/
public function getRules() : array
public function getRules(): array
{
if (!$this->large) {
if (count($this->large) === 0) {
$this->large = $this->apriori();
}
if ($this->rules) {
if (count($this->rules) > 0) {
return $this->rules;
}
@ -86,15 +84,14 @@ class Apriori implements Associator
*
* @return mixed[][][]
*/
public function apriori() : array
public function apriori(): array
{
$L = [];
$L[1] = $this->items();
$L[1] = $this->frequent($L[1]);
for ($k = 2; !empty($L[$k - 1]); ++$k) {
$L[$k] = $this->candidates($L[$k - 1]);
$L[$k] = $this->frequent($L[$k]);
$items = $this->frequent($this->items());
for ($k = 1; isset($items[0]); ++$k) {
$L[$k] = $items;
$items = $this->frequent($this->candidates($items));
}
return $L;
@ -105,7 +102,7 @@ class Apriori implements Associator
*
* @return mixed[][]
*/
protected function predictSample(array $sample) : array
protected function predictSample(array $sample): array
{
$predicts = array_values(array_filter($this->getRules(), function ($rule) use ($sample) {
return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample);
@ -119,9 +116,9 @@ class Apriori implements Associator
/**
* Generate rules for each k-length frequent item set.
*/
private function generateAllRules()
private function generateAllRules(): void
{
for ($k = 2; !empty($this->large[$k]); ++$k) {
for ($k = 2; isset($this->large[$k]); ++$k) {
foreach ($this->large[$k] as $frequent) {
$this->generateRules($frequent);
}
@ -133,15 +130,16 @@ class Apriori implements Associator
*
* @param mixed[] $frequent
*/
private function generateRules(array $frequent)
private function generateRules(array $frequent): void
{
foreach ($this->antecedents($frequent) as $antecedent) {
if ($this->confidence <= ($confidence = $this->confidence($frequent, $antecedent))) {
$confidence = $this->confidence($frequent, $antecedent);
if ($this->confidence <= $confidence) {
$consequent = array_values(array_diff($frequent, $antecedent));
$this->rules[] = [
self::ARRAY_KEY_ANTECEDENT => $antecedent,
self::ARRAY_KEY_CONSEQUENT => $consequent,
self::ARRAY_KEY_SUPPORT => $this->support($consequent),
self::ARRAY_KEY_SUPPORT => $this->support($frequent),
self::ARRAY_KEY_CONFIDENCE => $confidence,
];
}
@ -155,7 +153,7 @@ class Apriori implements Associator
*
* @return mixed[][]
*/
private function powerSet(array $sample) : array
private function powerSet(array $sample): array
{
$results = [[]];
foreach ($sample as $item) {
@ -174,7 +172,7 @@ class Apriori implements Associator
*
* @return mixed[][]
*/
private function antecedents(array $sample) : array
private function antecedents(array $sample): array
{
$cardinality = count($sample);
$antecedents = $this->powerSet($sample);
@ -189,7 +187,7 @@ class Apriori implements Associator
*
* @return mixed[][]
*/
private function items() : array
private function items(): array
{
$items = [];
@ -213,11 +211,11 @@ class Apriori implements Associator
*
* @return mixed[][]
*/
private function frequent(array $samples) : array
private function frequent(array $samples): array
{
return array_filter($samples, function ($entry) {
return array_values(array_filter($samples, function ($entry) {
return $this->support($entry) >= $this->support;
});
}));
}
/**
@ -227,7 +225,7 @@ class Apriori implements Associator
*
* @return mixed[][]
*/
private function candidates(array $samples) : array
private function candidates(array $samples): array
{
$candidates = [];
@ -237,15 +235,16 @@ class Apriori implements Associator
continue;
}
$candidate = array_unique(array_merge($p, $q));
$candidate = array_values(array_unique(array_merge($p, $q)));
if ($this->contains($candidates, $candidate)) {
continue;
}
foreach ((array) $this->samples as $sample) {
foreach ($this->samples as $sample) {
if ($this->subset($sample, $candidate)) {
$candidates[] = $candidate;
continue 2;
}
}
@ -261,10 +260,8 @@ class Apriori implements Associator
*
* @param mixed[] $set
* @param mixed[] $subset
*
* @return float
*/
private function confidence(array $set, array $subset) : float
private function confidence(array $set, array $subset): float
{
return $this->support($set) / $this->support($subset);
}
@ -276,10 +273,8 @@ class Apriori implements Associator
* @see \Phpml\Association\Apriori::samples
*
* @param mixed[] $sample
*
* @return float
*/
private function support(array $sample) : float
private function support(array $sample): float
{
return $this->frequency($sample) / count($this->samples);
}
@ -290,10 +285,8 @@ class Apriori implements Associator
* @see \Phpml\Association\Apriori::samples
*
* @param mixed[] $sample
*
* @return int
*/
private function frequency(array $sample) : int
private function frequency(array $sample): int
{
return count(array_filter($this->samples, function ($entry) use ($sample) {
return $this->subset($entry, $sample);
@ -307,10 +300,8 @@ class Apriori implements Associator
*
* @param mixed[][] $system
* @param mixed[] $set
*
* @return bool
*/
private function contains(array $system, array $set) : bool
private function contains(array $system, array $set): bool
{
return (bool) array_filter($system, function ($entry) use ($set) {
return $this->equals($entry, $set);
@ -322,12 +313,10 @@ class Apriori implements Associator
*
* @param mixed[] $set
* @param mixed[] $subset
*
* @return bool
*/
private function subset(array $set, array $subset) : bool
private function subset(array $set, array $subset): bool
{
return !array_diff($subset, array_intersect($subset, $set));
return count(array_diff($subset, array_intersect($subset, $set))) === 0;
}
/**
@ -335,10 +324,8 @@ class Apriori implements Associator
*
* @param mixed[] $set1
* @param mixed[] $set2
*
* @return bool
*/
private function equals(array $set1, array $set2) : bool
private function equals(array $set1, array $set2): bool
{
return array_diff($set1, $set2) == array_diff($set2, $set1);
}

View file

@ -4,23 +4,40 @@ declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
class DecisionTree implements Classifier
{
use Trainable, Predictable;
use Trainable;
use Predictable;
const CONTINUOUS = 1;
const NOMINAL = 2;
public const CONTINUOUS = 1;
public const NOMINAL = 2;
/**
* @var int
*/
public $actualDepth = 0;
/**
* @var array
*/
protected $columnTypes;
protected $columnTypes = [];
/**
* @var DecisionTreeLeaf
*/
protected $tree;
/**
* @var int
*/
protected $maxDepth;
/**
* @var array
@ -32,21 +49,6 @@ class DecisionTree implements Classifier
*/
private $featureCount = 0;
/**
* @var DecisionTreeLeaf
*/
protected $tree = null;
/**
* @var int
*/
protected $maxDepth;
/**
* @var int
*/
public $actualDepth = 0;
/**
* @var int
*/
@ -55,32 +57,24 @@ class DecisionTree implements Classifier
/**
* @var array
*/
private $selectedFeatures;
private $selectedFeatures = [];
/**
* @var array|null
*/
private $featureImportances;
/**
* @var array
*/
private $featureImportances = null;
private $columnNames = [];
/**
*
* @var array
*/
private $columnNames = null;
/**
* @param int $maxDepth
*/
public function __construct(int $maxDepth = 10)
{
$this->maxDepth = $maxDepth;
}
/**
* @param array $samples
* @param array $targets
*/
public function train(array $samples, array $targets)
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
@ -96,23 +90,19 @@ class DecisionTree implements Classifier
// If column names are given or computed before, then there is no
// need to init it and accidentally remove the previous given names
if ($this->columnNames === null) {
if ($this->columnNames === []) {
$this->columnNames = range(0, $this->featureCount - 1);
} elseif (count($this->columnNames) > $this->featureCount) {
$this->columnNames = array_slice($this->columnNames, 0, $this->featureCount);
} elseif (count($this->columnNames) < $this->featureCount) {
$this->columnNames = array_merge($this->columnNames,
$this->columnNames = array_merge(
$this->columnNames,
range(count($this->columnNames), $this->featureCount - 1)
);
}
}
/**
* @param array $samples
*
* @return array
*/
public static function getColumnTypes(array $samples) : array
public static function getColumnTypes(array $samples): array
{
$types = [];
$featureCount = count($samples[0]);
@ -126,12 +116,120 @@ class DecisionTree implements Classifier
}
/**
* @param array $records
* @param int $depth
*
* @return DecisionTreeLeaf
* @param mixed $baseValue
*/
protected function getSplitLeaf(array $records, int $depth = 0) : DecisionTreeLeaf
public function getGiniIndex($baseValue, array $colValues, array $targets): float
{
$countMatrix = [];
foreach ($this->labels as $label) {
$countMatrix[$label] = [0, 0];
}
foreach ($colValues as $index => $value) {
$label = $targets[$index];
$rowIndex = $value === $baseValue ? 0 : 1;
++$countMatrix[$label][$rowIndex];
}
$giniParts = [0, 0];
for ($i = 0; $i <= 1; ++$i) {
$part = 0;
$sum = array_sum(array_column($countMatrix, $i));
if ($sum > 0) {
foreach ($this->labels as $label) {
$part += ($countMatrix[$label][$i] / (float) $sum) ** 2;
}
}
$giniParts[$i] = (1 - $part) * $sum;
}
return array_sum($giniParts) / count($colValues);
}
/**
* This method is used to set number of columns to be used
* when deciding a split at an internal node of the tree. <br>
* If the value is given 0, then all features are used (default behaviour),
* otherwise the given value will be used as a maximum for number of columns
* randomly selected for each split operation.
*
* @return $this
*
* @throws InvalidArgumentException
*/
public function setNumFeatures(int $numFeatures)
{
if ($numFeatures < 0) {
throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
}
$this->numUsableFeatures = $numFeatures;
return $this;
}
/**
* A string array to represent columns. Useful when HTML output or
* column importances are desired to be inspected.
*
* @return $this
*
* @throws InvalidArgumentException
*/
public function setColumnNames(array $names)
{
if ($this->featureCount !== 0 && count($names) !== $this->featureCount) {
throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount));
}
$this->columnNames = $names;
return $this;
}
public function getHtml(): string
{
return $this->tree->getHTML($this->columnNames);
}
/**
* This will return an array including an importance value for
* each column in the given dataset. The importance values are
* normalized and their total makes 1.<br/>
*/
public function getFeatureImportances(): array
{
if ($this->featureImportances !== null) {
return $this->featureImportances;
}
$sampleCount = count($this->samples);
$this->featureImportances = [];
foreach ($this->columnNames as $column => $columnName) {
$nodes = $this->getSplitNodesByColumn($column, $this->tree);
$importance = 0;
foreach ($nodes as $node) {
$importance += $node->getNodeImpurityDecrease($sampleCount);
}
$this->featureImportances[$columnName] = $importance;
}
// Normalize & sort the importances
$total = array_sum($this->featureImportances);
if ($total > 0) {
array_walk($this->featureImportances, function (&$importance) use ($total): void {
$importance /= $total;
});
arsort($this->featureImportances);
}
return $this->featureImportances;
}
protected function getSplitLeaf(array $records, int $depth = 0): DecisionTreeLeaf
{
$split = $this->getBestSplit($records);
$split->level = $depth;
@ -143,7 +241,7 @@ class DecisionTree implements Classifier
// otherwise group the records so that we can classify the leaf
// in case maximum depth is reached
$leftRecords = [];
$rightRecords= [];
$rightRecords = [];
$remainingTargets = [];
$prevRecord = null;
$allSame = true;
@ -151,9 +249,10 @@ class DecisionTree implements Classifier
foreach ($records as $recordNo) {
// Check if the previous record is the same as the current one
$record = $this->samples[$recordNo];
if ($prevRecord && $prevRecord != $record) {
if ($prevRecord !== null && $prevRecord != $record) {
$allSame = false;
}
$prevRecord = $record;
// According to the split criterion, this record will
@ -161,7 +260,7 @@ class DecisionTree implements Classifier
if ($split->evaluate($record)) {
$leftRecords[] = $recordNo;
} else {
$rightRecords[]= $recordNo;
$rightRecords[] = $recordNo;
}
// Group remaining targets
@ -174,31 +273,29 @@ class DecisionTree implements Classifier
}
if ($allSame || $depth >= $this->maxDepth || count($remainingTargets) === 1) {
$split->isTerminal = 1;
$split->isTerminal = true;
arsort($remainingTargets);
$split->classValue = key($remainingTargets);
$split->classValue = (string) key($remainingTargets);
} else {
if ($leftRecords) {
if (isset($leftRecords[0])) {
$split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
}
if ($rightRecords) {
$split->rightLeaf= $this->getSplitLeaf($rightRecords, $depth + 1);
if (isset($rightRecords[0])) {
$split->rightLeaf = $this->getSplitLeaf($rightRecords, $depth + 1);
}
}
return $split;
}
/**
* @param array $records
*
* @return DecisionTreeLeaf
*/
protected function getBestSplit(array $records) : DecisionTreeLeaf
protected function getBestSplit(array $records): DecisionTreeLeaf
{
$targets = array_intersect_key($this->targets, array_flip($records));
$samples = array_intersect_key($this->samples, array_flip($records));
$samples = array_combine($records, $this->preprocess($samples));
$samples = (array) array_combine(
$records,
$this->preprocess(array_intersect_key($this->samples, array_flip($records)))
);
$bestGiniVal = 1;
$bestSplit = null;
$features = $this->getSelectedFeatures();
@ -207,26 +304,31 @@ class DecisionTree implements Classifier
foreach ($samples as $index => $row) {
$colValues[$index] = $row[$i];
}
$counts = array_count_values($colValues);
arsort($counts);
$baseValue = key($counts);
if ($baseValue === null) {
continue;
}
$gini = $this->getGiniIndex($baseValue, $colValues, $targets);
if ($bestSplit === null || $bestGiniVal > $gini) {
$split = new DecisionTreeLeaf();
$split->value = $baseValue;
$split->giniIndex = $gini;
$split->columnIndex = $i;
$split->isContinuous = $this->columnTypes[$i] == self::CONTINUOUS;
$split->isContinuous = $this->columnTypes[$i] === self::CONTINUOUS;
$split->records = $records;
// If a numeric column is to be selected, then
// the original numeric value and the selected operator
// will also be saved into the leaf for future access
if ($this->columnTypes[$i] == self::CONTINUOUS) {
if ($this->columnTypes[$i] === self::CONTINUOUS) {
$matches = [];
preg_match("/^([<>=]{1,2})\s*(.*)/", strval($split->value), $matches);
preg_match("/^([<>=]{1,2})\s*(.*)/", (string) $split->value, $matches);
$split->operator = $matches[1];
$split->numericValue = floatval($matches[2]);
$split->numericValue = (float) $matches[2];
}
$bestSplit = $split;
@ -249,17 +351,15 @@ class DecisionTree implements Classifier
*
* If any of above methods were not called beforehand, then all features
* are returned by default.
*
* @return array
*/
protected function getSelectedFeatures() : array
protected function getSelectedFeatures(): array
{
$allFeatures = range(0, $this->featureCount - 1);
if ($this->numUsableFeatures === 0 && !$this->selectedFeatures) {
if ($this->numUsableFeatures === 0 && count($this->selectedFeatures) === 0) {
return $allFeatures;
}
if ($this->selectedFeatures) {
if (count($this->selectedFeatures) > 0) {
return $this->selectedFeatures;
}
@ -267,55 +367,15 @@ class DecisionTree implements Classifier
if ($numFeatures > $this->featureCount) {
$numFeatures = $this->featureCount;
}
shuffle($allFeatures);
$selectedFeatures = array_slice($allFeatures, 0, $numFeatures, false);
$selectedFeatures = array_slice($allFeatures, 0, $numFeatures);
sort($selectedFeatures);
return $selectedFeatures;
}
/**
* @param mixed $baseValue
* @param array $colValues
* @param array $targets
*
* @return float
*/
public function getGiniIndex($baseValue, array $colValues, array $targets) : float
{
$countMatrix = [];
foreach ($this->labels as $label) {
$countMatrix[$label] = [0, 0];
}
foreach ($colValues as $index => $value) {
$label = $targets[$index];
$rowIndex = $value === $baseValue ? 0 : 1;
++$countMatrix[$label][$rowIndex];
}
$giniParts = [0, 0];
for ($i = 0; $i <= 1; ++$i) {
$part = 0;
$sum = array_sum(array_column($countMatrix, $i));
if ($sum > 0) {
foreach ($this->labels as $label) {
$part += pow($countMatrix[$label][$i] / floatval($sum), 2);
}
}
$giniParts[$i] = (1 - $part) * $sum;
}
return array_sum($giniParts) / count($colValues);
}
/**
* @param array $samples
*
* @return array
*/
protected function preprocess(array $samples) : array
protected function preprocess(array $samples): array
{
// Detect and convert continuous data column values into
// discrete values by using the median as a threshold value
@ -326,25 +386,22 @@ class DecisionTree implements Classifier
$median = Mean::median($values);
foreach ($values as &$value) {
if ($value <= $median) {
$value = "<= $median";
$value = "<= ${median}";
} else {
$value = "> $median";
$value = "> ${median}";
}
}
}
$columns[] = $values;
}
// The call below is an unusual yet very simple & efficient way
// to get the transpose of a 2D array
return array_map(null, ...$columns);
}
/**
* @param array $columnValues
*
* @return bool
*/
protected static function isCategoricalColumn(array $columnValues) : bool
protected static function isCategoricalColumn(array $columnValues): bool
{
$count = count($columnValues);
@ -355,7 +412,7 @@ class DecisionTree implements Classifier
// all values in that column (lower than or equal to 20% of all values)
$numericValues = array_filter($columnValues, 'is_numeric');
$floatValues = array_filter($columnValues, 'is_float');
if ($floatValues) {
if (count($floatValues) > 0) {
return false;
}
@ -368,119 +425,21 @@ class DecisionTree implements Classifier
return count($distinctValues) <= $count / 5;
}
/**
* This method is used to set number of columns to be used
* when deciding a split at an internal node of the tree. <br>
* If the value is given 0, then all features are used (default behaviour),
* otherwise the given value will be used as a maximum for number of columns
* randomly selected for each split operation.
*
* @param int $numFeatures
*
* @return $this
*
* @throws InvalidArgumentException
*/
public function setNumFeatures(int $numFeatures)
{
if ($numFeatures < 0) {
throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
}
$this->numUsableFeatures = $numFeatures;
return $this;
}
/**
* Used to set predefined features to consider while deciding which column to use for a split
*
* @param array $selectedFeatures
*/
protected function setSelectedFeatures(array $selectedFeatures)
protected function setSelectedFeatures(array $selectedFeatures): void
{
$this->selectedFeatures = $selectedFeatures;
}
/**
* A string array to represent columns. Useful when HTML output or
* column importances are desired to be inspected.
*
* @param array $names
*
* @return $this
*
* @throws InvalidArgumentException
*/
public function setColumnNames(array $names)
{
if ($this->featureCount !== 0 && count($names) !== $this->featureCount) {
throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount));
}
$this->columnNames = $names;
return $this;
}
/**
* @return string
*/
public function getHtml()
{
return $this->tree->getHTML($this->columnNames);
}
/**
* This will return an array including an importance value for
* each column in the given dataset. The importance values are
* normalized and their total makes 1.<br/>
*
* @return array
*/
public function getFeatureImportances()
{
if ($this->featureImportances !== null) {
return $this->featureImportances;
}
$sampleCount = count($this->samples);
$this->featureImportances = [];
foreach ($this->columnNames as $column => $columnName) {
$nodes = $this->getSplitNodesByColumn($column, $this->tree);
$importance = 0;
foreach ($nodes as $node) {
$importance += $node->getNodeImpurityDecrease($sampleCount);
}
$this->featureImportances[$columnName] = $importance;
}
// Normalize & sort the importances
$total = array_sum($this->featureImportances);
if ($total > 0) {
foreach ($this->featureImportances as &$importance) {
$importance /= $total;
}
arsort($this->featureImportances);
}
return $this->featureImportances;
}
/**
* Collects and returns an array of internal nodes that use the given
* column as a split criterion
*
* @param int $column
* @param DecisionTreeLeaf $node
*
* @return array
*/
protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node) : array
protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node): array
{
if (!$node || $node->isTerminal) {
if ($node->isTerminal) {
return [];
}
@ -491,22 +450,18 @@ class DecisionTree implements Classifier
$lNodes = [];
$rNodes = [];
if ($node->leftLeaf) {
if ($node->leftLeaf !== null) {
$lNodes = $this->getSplitNodesByColumn($column, $node->leftLeaf);
}
if ($node->rightLeaf) {
if ($node->rightLeaf !== null) {
$rNodes = $this->getSplitNodesByColumn($column, $node->rightLeaf);
}
$nodes = array_merge($nodes, $lNodes, $rNodes);
return $nodes;
return array_merge($nodes, $lNodes, $rNodes);
}
/**
* @param array $sample
*
* @return mixed
*/
protected function predictSample(array $sample)
@ -514,7 +469,7 @@ class DecisionTree implements Classifier
$node = $this->tree;
do {
if ($node->isTerminal) {
break;
return $node->classValue;
}
if ($node->evaluate($sample)) {
@ -524,6 +479,6 @@ class DecisionTree implements Classifier
}
} while ($node);
return $node ? $node->classValue : $this->labels[0];
return $this->labels[0];
}
}

View file

@ -4,10 +4,12 @@ declare(strict_types=1);
namespace Phpml\Classification\DecisionTree;
use Phpml\Math\Comparison;
class DecisionTreeLeaf
{
/**
* @var string
* @var string|int
*/
public $value;
@ -27,14 +29,14 @@ class DecisionTreeLeaf
public $columnIndex;
/**
* @var DecisionTreeLeaf
* @var DecisionTreeLeaf|null
*/
public $leftLeaf = null;
public $leftLeaf;
/**
* @var DecisionTreeLeaf
* @var DecisionTreeLeaf|null
*/
public $rightLeaf= null;
public $rightLeaf;
/**
* @var array
@ -70,19 +72,19 @@ class DecisionTreeLeaf
public $level = 0;
/**
* @param array $record
* @return bool
* HTML representation of the tree without column names
*/
public function evaluate($record)
public function __toString(): string
{
return $this->getHTML();
}
public function evaluate(array $record): bool
{
$recordField = $record[$this->columnIndex];
if ($this->isContinuous) {
$op = $this->operator;
$value= $this->numericValue;
$recordField = strval($recordField);
eval("\$result = $recordField $op $value;");
return $result;
return Comparison::compare((string) $recordField, $this->numericValue, $this->operator);
}
return $recordField == $this->value;
@ -91,27 +93,23 @@ class DecisionTreeLeaf
/**
* Returns Mean Decrease Impurity (MDI) in the node.
* For terminal nodes, this value is equal to 0
*
* @param int $parentRecordCount
*
* @return float
*/
public function getNodeImpurityDecrease(int $parentRecordCount)
public function getNodeImpurityDecrease(int $parentRecordCount): float
{
if ($this->isTerminal) {
return 0.0;
}
$nodeSampleCount = (float)count($this->records);
$nodeSampleCount = (float) count($this->records);
$iT = $this->giniIndex;
if ($this->leftLeaf) {
$pL = count($this->leftLeaf->records)/$nodeSampleCount;
if ($this->leftLeaf !== null) {
$pL = count($this->leftLeaf->records) / $nodeSampleCount;
$iT -= $pL * $this->leftLeaf->giniIndex;
}
if ($this->rightLeaf) {
$pR = count($this->rightLeaf->records)/$nodeSampleCount;
if ($this->rightLeaf !== null) {
$pR = count($this->rightLeaf->records) / $nodeSampleCount;
$iT -= $pR * $this->rightLeaf->giniIndex;
}
@ -120,14 +118,11 @@ class DecisionTreeLeaf
/**
* Returns HTML representation of the node including children nodes
*
* @param $columnNames
* @return string
*/
public function getHTML($columnNames = null)
public function getHTML(?array $columnNames = null): string
{
if ($this->isTerminal) {
$value = "<b>$this->classValue</b>";
$value = "<b>${this}->classValue</b>";
} else {
$value = $this->value;
if ($columnNames !== null) {
@ -135,39 +130,36 @@ class DecisionTreeLeaf
} else {
$col = "col_$this->columnIndex";
}
if (!preg_match("/^[<>=]{1,2}/", $value)) {
$value = "=$value";
if ((bool) preg_match('/^[<>=]{1,2}/', (string) $value) === false) {
$value = "=${value}";
}
$value = "<b>$col $value</b><br>Gini: ". number_format($this->giniIndex, 2);
$value = "<b>${col} ${value}</b><br>Gini: ".number_format($this->giniIndex, 2);
}
$str = "<table ><tr><td colspan=3 align=center style='border:1px solid;'>
$value</td></tr>";
if ($this->leftLeaf || $this->rightLeaf) {
$str .='<tr>';
if ($this->leftLeaf) {
$str .="<td valign=top><b>| Yes</b><br>" . $this->leftLeaf->getHTML($columnNames) . "</td>";
$str = "<table ><tr><td colspan=3 align=center style='border:1px solid;'>${value}</td></tr>";
if ($this->leftLeaf !== null || $this->rightLeaf !== null) {
$str .= '<tr>';
if ($this->leftLeaf !== null) {
$str .= '<td valign=top><b>| Yes</b><br>'.$this->leftLeaf->getHTML($columnNames).'</td>';
} else {
$str .='<td></td>';
$str .= '<td></td>';
}
$str .='<td>&nbsp;</td>';
if ($this->rightLeaf) {
$str .="<td valign=top align=right><b>No |</b><br>" . $this->rightLeaf->getHTML($columnNames) . "</td>";
$str .= '<td>&nbsp;</td>';
if ($this->rightLeaf !== null) {
$str .= '<td valign=top align=right><b>No |</b><br>'.$this->rightLeaf->getHTML($columnNames).'</td>';
} else {
$str .='<td></td>';
$str .= '<td></td>';
}
$str .= '</tr>';
}
$str .= '</table>';
return $str;
}
/**
* HTML representation of the tree without column names
*
* @return string
*/
public function __toString()
{
return $this->getHTML();
}
}

View file

@ -4,20 +4,24 @@ declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\Linear\DecisionStump;
use Phpml\Classification\WeightedClassifier;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Statistic\StandardDeviation;
use Phpml\Classification\Classifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Statistic\StandardDeviation;
use ReflectionClass;
class AdaBoost implements Classifier
{
use Predictable, Trainable;
use Predictable;
use Trainable;
/**
* Actual labels given in the targets array
*
* @var array
*/
protected $labels = [];
@ -74,8 +78,6 @@ class AdaBoost implements Classifier
* ADAptive BOOSTing (AdaBoost) is an ensemble algorithm to
* improve classification performance of 'weak' classifiers such as
* DecisionStump (default base classifier of AdaBoost).
*
* @param int $maxIterations
*/
public function __construct(int $maxIterations = 50)
{
@ -84,32 +86,29 @@ class AdaBoost implements Classifier
/**
* Sets the base classifier that will be used for boosting (default = DecisionStump)
*
* @param string $baseClassifier
* @param array $classifierOptions
*/
public function setBaseClassifier(string $baseClassifier = DecisionStump::class, array $classifierOptions = [])
public function setBaseClassifier(string $baseClassifier = DecisionStump::class, array $classifierOptions = []): void
{
$this->baseClassifier = $baseClassifier;
$this->classifierOptions = $classifierOptions;
}
/**
* @param array $samples
* @param array $targets
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function train(array $samples, array $targets)
public function train(array $samples, array $targets): void
{
// Initialize usual variables
$this->labels = array_keys(array_count_values($targets));
if (count($this->labels) != 2) {
throw new \Exception("AdaBoost is a binary classifier and can classify between two classes only");
if (count($this->labels) !== 2) {
throw new InvalidArgumentException('AdaBoost is a binary classifier and can classify between two classes only');
}
// Set all target values to either -1 or 1
$this->labels = [1 => $this->labels[0], -1 => $this->labels[1]];
$this->labels = [
1 => $this->labels[0],
-1 => $this->labels[1],
];
foreach ($targets as $target) {
$this->targets[] = $target == $this->labels[1] ? 1 : -1;
}
@ -140,25 +139,34 @@ class AdaBoost implements Classifier
}
/**
* Returns the classifier with the lowest error rate with the
* consideration of current sample weights
*
* @return Classifier
* @return mixed
*/
protected function getBestClassifier()
public function predictSample(array $sample)
{
$ref = new \ReflectionClass($this->baseClassifier);
if ($this->classifierOptions) {
$classifier = $ref->newInstanceArgs($this->classifierOptions);
} else {
$classifier = $ref->newInstance();
$sum = 0;
foreach ($this->alpha as $index => $alpha) {
$h = $this->classifiers[$index]->predict($sample);
$sum += $h * $alpha;
}
if (is_subclass_of($classifier, WeightedClassifier::class)) {
return $this->labels[$sum > 0 ? 1 : -1];
}
/**
* Returns the classifier with the lowest error rate with the
* consideration of current sample weights
*/
protected function getBestClassifier(): Classifier
{
$ref = new ReflectionClass($this->baseClassifier);
/** @var Classifier $classifier */
$classifier = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions);
if ($classifier instanceof WeightedClassifier) {
$classifier->setSampleWeights($this->weights);
$classifier->train($this->samples, $this->targets);
} else {
list($samples, $targets) = $this->resample();
[$samples, $targets] = $this->resample();
$classifier->train($samples, $targets);
}
@ -168,25 +176,24 @@ class AdaBoost implements Classifier
/**
* Resamples the dataset in accordance with the weights and
* returns the new dataset
*
* @return array
*/
protected function resample()
protected function resample(): array
{
$weights = $this->weights;
$std = StandardDeviation::population($weights);
$mean= Mean::arithmetic($weights);
$mean = Mean::arithmetic($weights);
$min = min($weights);
$minZ= (int)round(($min - $mean) / $std);
$minZ = (int) round(($min - $mean) / $std);
$samples = [];
$targets = [];
foreach ($weights as $index => $weight) {
$z = (int)round(($weight - $mean) / $std) - $minZ + 1;
$z = (int) round(($weight - $mean) / $std) - $minZ + 1;
for ($i = 0; $i < $z; ++$i) {
if (rand(0, 1) == 0) {
if (random_int(0, 1) == 0) {
continue;
}
$samples[] = $this->samples[$index];
$targets[] = $this->targets[$index];
}
@ -197,12 +204,8 @@ class AdaBoost implements Classifier
/**
* Evaluates the classifier and returns the classification error rate
*
* @param Classifier $classifier
*
* @return float
*/
protected function evaluateClassifier(Classifier $classifier)
protected function evaluateClassifier(Classifier $classifier): float
{
$total = (float) array_sum($this->weights);
$wrong = 0;
@ -218,25 +221,20 @@ class AdaBoost implements Classifier
/**
* Calculates alpha of a classifier
*
* @param float $errorRate
* @return float
*/
protected function calculateAlpha(float $errorRate)
protected function calculateAlpha(float $errorRate): float
{
    // alpha = 0.5 * ln((1 - e) / e). At e = 0 the ratio divides by zero and at
    // e = 1 it collapses to log(0) = -INF, so the rate is clamped just inside
    // the open interval (0, 1) to keep the returned value finite.
    $epsilon = 1e-10;
    if ($errorRate <= 0.0) {
        $errorRate = $epsilon;
    } elseif ($errorRate >= 1.0) {
        $errorRate = 1.0 - $epsilon;
    }

    return 0.5 * log((1 - $errorRate) / $errorRate);
}
/**
* Updates the sample weights
*
* @param Classifier $classifier
* @param float $alpha
*/
protected function updateWeights(Classifier $classifier, float $alpha)
protected function updateWeights(Classifier $classifier, float $alpha): void
{
$sumOfWeights = array_sum($this->weights);
$weightsT1 = [];
@ -251,19 +249,4 @@ class AdaBoost implements Classifier
$this->weights = $weightsT1;
}
/**
* @param array $sample
* @return mixed
*/
public function predictSample(array $sample)
{
    // Accumulate every stored classifier's prediction, scaled by its alpha.
    $weightedVote = 0;
    foreach ($this->alpha as $position => $weight) {
        $prediction = $this->classifiers[$position]->predict($sample);
        $weightedVote += $prediction * $weight;
    }

    // A strictly positive total selects the label stored under key 1;
    // zero or a negative total selects the label stored under key -1.
    if ($weightedVote > 0) {
        return $this->labels[1];
    }

    return $this->labels[-1];
}
}

View file

@ -4,25 +4,23 @@ declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use ReflectionClass;
class Bagging implements Classifier
{
use Trainable, Predictable;
use Trainable;
use Predictable;
/**
* @var int
*/
protected $numSamples;
/**
* @var array
*/
private $targets = [];
/**
* @var int
*/
@ -34,7 +32,7 @@ class Bagging implements Classifier
protected $numClassifier;
/**
* @var Classifier
* @var string
*/
protected $classifier = DecisionTree::class;
@ -46,24 +44,17 @@ class Bagging implements Classifier
/**
* @var array
*/
protected $classifiers;
protected $classifiers = [];
/**
* @var float
*/
protected $subsetRatio = 0.7;
/**
* @var array
*/
private $samples = [];
/**
* Creates an ensemble classifier with given number of base classifiers
* Default number of base classifiers is 50.
* The more base classifiers, the better the performance, but at the cost of processing time
*
* @param int $numClassifier
*/
public function __construct(int $numClassifier = 50)
{
@ -75,19 +66,18 @@ class Bagging implements Classifier
* e.g., random samples drawn from the original dataset with replacement (allow repeats),
* to train each base classifier.
*
* @param float $ratio
*
* @return $this
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function setSubsetRatio(float $ratio)
{
if ($ratio < 0.1 || $ratio > 1.0) {
throw new \Exception("Subset ratio should be between 0.1 and 1.0");
throw new InvalidArgumentException('Subset ratio should be between 0.1 and 1.0');
}
$this->subsetRatio = $ratio;
return $this;
}
@ -99,9 +89,6 @@ class Bagging implements Classifier
* given in the order they are in the constructor of the classifier and parameter
* names are neglected.
*
* @param string $classifier
* @param array $classifierOptions
*
* @return $this
*/
public function setClassifer(string $classifier, array $classifierOptions = [])
@ -112,11 +99,7 @@ class Bagging implements Classifier
return $this;
}
/**
* @param array $samples
* @param array $targets
*/
public function train(array $samples, array $targets)
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
@ -127,24 +110,20 @@ class Bagging implements Classifier
$this->classifiers = $this->initClassifiers();
$index = 0;
foreach ($this->classifiers as $classifier) {
list($samples, $targets) = $this->getRandomSubset($index);
[$samples, $targets] = $this->getRandomSubset($index);
$classifier->train($samples, $targets);
++$index;
}
}
/**
* @param int $index
* @return array
*/
protected function getRandomSubset(int $index)
protected function getRandomSubset(int $index): array
{
$samples = [];
$targets = [];
srand($index);
$bootstrapSize = $this->subsetRatio * $this->numSamples;
for ($i = 0; $i < $bootstrapSize; ++$i) {
$rand = rand(0, $this->numSamples - 1);
$rand = random_int(0, $this->numSamples - 1);
$samples[] = $this->samples[$rand];
$targets[] = $this->targets[$rand];
}
@ -152,50 +131,40 @@ class Bagging implements Classifier
return [$samples, $targets];
}
/**
* @return array
*/
protected function initClassifiers()
protected function initClassifiers(): array
{
$classifiers = [];
for ($i = 0; $i < $this->numClassifier; ++$i) {
$ref = new \ReflectionClass($this->classifier);
if ($this->classifierOptions) {
$obj = $ref->newInstanceArgs($this->classifierOptions);
} else {
$obj = $ref->newInstance();
}
$ref = new ReflectionClass($this->classifier);
/** @var Classifier $obj */
$obj = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions);
$classifiers[] = $this->initSingleClassifier($obj);
}
return $classifiers;
}
/**
* @param Classifier $classifier
*
* @return Classifier
*/
protected function initSingleClassifier($classifier)
protected function initSingleClassifier(Classifier $classifier): Classifier
{
// This implementation returns the given classifier unchanged.
return $classifier;
}
/**
* @param array $sample
* @return mixed
*/
protected function predictSample(array $sample)
{
    // Ask every base classifier for its prediction of the sample.
    $votes = [];
    foreach ($this->classifiers as $member) {
        /** @var Classifier $member */
        $votes[] = $member->predict($sample);
    }

    // Tally the votes and order the tally from most to least frequent.
    $tally = array_count_values($votes);
    arsort($tally);

    // After sorting, the first key is the most frequently predicted label.
    reset($tally);

    return key($tally);
}
}

View file

@ -4,7 +4,9 @@ declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Exception\InvalidArgumentException;
class RandomForest extends Bagging
{
@ -14,16 +16,14 @@ class RandomForest extends Bagging
protected $featureSubsetRatio = 'log';
/**
* @var array
* @var array|null
*/
protected $columnNames = null;
protected $columnNames;
/**
* Initializes RandomForest with the given number of trees. More trees
* may increase the prediction performance while it will also substantially
* increase the processing time and the required memory
*
* @param int $numClassifier
*/
public function __construct(int $numClassifier = 50)
{
@ -41,40 +41,36 @@ class RandomForest extends Bagging
* Default value for the ratio is 'log' which results in log(numFeatures, 2) + 1
* features to be taken into consideration while selecting subspace of features
*
* @param mixed $ratio string or float should be given
*
* @return $this
*
* @throws \Exception
* @param string|float $ratio
*/
public function setFeatureSubsetRatio($ratio)
public function setFeatureSubsetRatio($ratio): self
{
if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
throw new \Exception("When a float given, feature subset ratio should be between 0.1 and 1.0");
if (!is_string($ratio) && !is_float($ratio)) {
throw new InvalidArgumentException('Feature subset ratio must be a string or a float');
}
if (is_string($ratio) && $ratio != 'sqrt' && $ratio != 'log') {
throw new \Exception("When a string given, feature subset ratio can only be 'sqrt' or 'log' ");
if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
throw new InvalidArgumentException('When a float is given, feature subset ratio should be between 0.1 and 1.0');
}
if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') {
throw new InvalidArgumentException("When a string is given, feature subset ratio can only be 'sqrt' or 'log'");
}
$this->featureSubsetRatio = $ratio;
return $this;
}
/**
* RandomForest algorithm is usable *only* with DecisionTree
*
* @param string $classifier
* @param array $classifierOptions
*
* @return $this
*
* @throws \Exception
*/
public function setClassifer(string $classifier, array $classifierOptions = [])
{
if ($classifier != DecisionTree::class) {
throw new \Exception("RandomForest can only use DecisionTree as base classifier");
if ($classifier !== DecisionTree::class) {
throw new InvalidArgumentException('RandomForest can only use DecisionTree as base classifier');
}
return parent::setClassifer($classifier, $classifierOptions);
@ -84,15 +80,13 @@ class RandomForest extends Bagging
* This will return an array including an importance value for
* each column in the given dataset. Importance values for a column
* is the average importance of that column in all trees in the forest
*
* @return array
*/
public function getFeatureImportances()
public function getFeatureImportances(): array
{
// Traverse each tree and sum importance of the columns
$sum = [];
foreach ($this->classifiers as $tree) {
/* @var $tree DecisionTree */
/** @var DecisionTree $tree */
$importances = $tree->getFeatureImportances();
foreach ($importances as $column => $importance) {
@ -106,10 +100,9 @@ class RandomForest extends Bagging
// Normalize & sort the importance values
$total = array_sum($sum);
foreach ($sum as &$importance) {
array_walk($sum, function (&$importance) use ($total): void {
$importance /= $total;
}
});
arsort($sum);
return $sum;
@ -119,7 +112,6 @@ class RandomForest extends Bagging
* A string array to represent the columns is given. They are useful
* when trying to print some information about the trees such as feature importances
*
* @param array $names
* @return $this
*/
public function setColumnNames(array $names)
@ -134,14 +126,14 @@ class RandomForest extends Bagging
*
* @return DecisionTree
*/
protected function initSingleClassifier($classifier)
protected function initSingleClassifier(Classifier $classifier): Classifier
{
if (is_float($this->featureSubsetRatio)) {
$featureCount = (int)($this->featureSubsetRatio * $this->featureCount);
} elseif ($this->featureCount == 'sqrt') {
$featureCount = (int)sqrt($this->featureCount) + 1;
$featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
} elseif ($this->featureSubsetRatio === 'sqrt') {
$featureCount = (int) ($this->featureCount ** .5) + 1;
} else {
$featureCount = (int)log($this->featureCount, 2) + 1;
$featureCount = (int) log($this->featureCount, 2) + 1;
}
if ($featureCount >= $this->featureCount) {
@ -153,7 +145,7 @@ class RandomForest extends Bagging
}
return $classifier
->setColumnNames($this->columnNames)
->setNumFeatures($featureCount);
->setColumnNames($this->columnNames)
->setNumFeatures($featureCount);
}
}

View file

@ -11,7 +11,8 @@ use Phpml\Math\Distance\Euclidean;
class KNearestNeighbors implements Classifier
{
use Trainable, Predictable;
use Trainable;
use Predictable;
/**
* @var int
@ -24,12 +25,11 @@ class KNearestNeighbors implements Classifier
private $distanceMetric;
/**
* @param int $k
* @param Distance|null $distanceMetric (if null then Euclidean distance as default)
*/
public function __construct(int $k = 3, Distance $distanceMetric = null)
public function __construct(int $k = 3, ?Distance $distanceMetric = null)
{
if (null === $distanceMetric) {
if ($distanceMetric === null) {
$distanceMetric = new Euclidean();
}
@ -40,17 +40,14 @@ class KNearestNeighbors implements Classifier
}
/**
* @param array $sample
*
* @return mixed
*/
protected function predictSample(array $sample)
{
$distances = $this->kNeighborsDistances($sample);
$predictions = (array) array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0));
$predictions = array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0));
foreach ($distances as $index => $distance) {
foreach (array_keys($distances) as $index) {
++$predictions[$this->targets[$index]];
}
@ -61,13 +58,9 @@ class KNearestNeighbors implements Classifier
}
/**
* @param array $sample
*
* @return array
*
* @throws \Phpml\Exception\InvalidArgumentException
*/
private function kNeighborsDistances(array $sample)
private function kNeighborsDistances(array $sample): array
{
$distances = [];

View file

@ -4,22 +4,24 @@ declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Exception\InvalidArgumentException;
class Adaline extends Perceptron
{
/**
* Batch training is the default Adaline training algorithm
*/
const BATCH_TRAINING = 1;
public const BATCH_TRAINING = 1;
/**
* Online training: Stochastic gradient descent learning
*/
const ONLINE_TRAINING = 2;
public const ONLINE_TRAINING = 2;
/**
* Training type may be either 'Batch' or 'Online' learning
*
* @var string
* @var string|int
*/
protected $trainingType;
@ -32,18 +34,16 @@ class Adaline extends Perceptron
* If normalizeInputs is set to true, then every input given to the algorithm will be standardized
* by use of standard deviation and mean calculation
*
* @param float $learningRate
* @param int $maxIterations
* @param bool $normalizeInputs
* @param int $trainingType
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function __construct(float $learningRate = 0.001, int $maxIterations = 1000,
bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING)
{
if (!in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING])) {
throw new \Exception("Adaline can only be trained with batch and online/stochastic gradient descent algorithm");
public function __construct(
float $learningRate = 0.001,
int $maxIterations = 1000,
bool $normalizeInputs = true,
int $trainingType = self::BATCH_TRAINING
) {
if (!in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING], true)) {
throw new InvalidArgumentException('Adaline can only be trained with batch and online/stochastic gradient descent algorithm');
}
$this->trainingType = $trainingType;
@ -54,11 +54,8 @@ class Adaline extends Perceptron
/**
* Adapts the weights with respect to given samples and targets
* by use of gradient descent learning rule
*
* @param array $samples
* @param array $targets
*/
protected function runTraining(array $samples, array $targets)
protected function runTraining(array $samples, array $targets): void
{
// The cost function is the sum of squares
$callback = function ($weights, $sample, $target) {
@ -73,6 +70,6 @@ class Adaline extends Perceptron
$isBatch = $this->trainingType == self::BATCH_TRAINING;
return parent::runGradientDescent($samples, $targets, $callback, $isBatch);
parent::runGradientDescent($samples, $targets, $callback, $isBatch);
}
}

View file

@ -4,16 +4,19 @@ declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Helper\Predictable;
use Phpml\Helper\OneVsRest;
use Phpml\Classification\WeightedClassifier;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\WeightedClassifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\OneVsRest;
use Phpml\Helper\Predictable;
use Phpml\Math\Comparison;
class DecisionStump extends WeightedClassifier
{
use Predictable, OneVsRest;
use Predictable;
use OneVsRest;
const AUTO_SELECT = -1;
public const AUTO_SELECT = -1;
/**
* @var int
@ -23,7 +26,7 @@ class DecisionStump extends WeightedClassifier
/**
* @var array
*/
protected $binaryLabels;
protected $binaryLabels = [];
/**
* Lowest error rate obtained while training/optimizing the model
@ -50,7 +53,7 @@ class DecisionStump extends WeightedClassifier
/**
* @var array
*/
protected $columnTypes;
protected $columnTypes = [];
/**
* @var int
@ -67,7 +70,7 @@ class DecisionStump extends WeightedClassifier
*
* @var array
*/
protected $prob;
protected $prob = [];
/**
* A DecisionStump classifier is a one-level deep DecisionTree. It is generally
@ -76,22 +79,35 @@ class DecisionStump extends WeightedClassifier
* If columnIndex is given, then the stump tries to produce a decision node
* on this column, otherwise in cases given the value of -1, the stump itself
* decides which column to take for the decision (Default DecisionTree behaviour)
*
* @param int $columnIndex
*/
public function __construct(int $columnIndex = self::AUTO_SELECT)
{
// Store the requested column index; when omitted it is self::AUTO_SELECT.
$this->givenColumnIndex = $columnIndex;
}
/**
 * Renders the stump as text in the form
 * "IF <column> <operator> <value> THEN <first label> ELSE <second label>".
 */
public function __toString(): string
{
    // Properties must be interpolated with the {$this->prop} form. Writing
    // "${this}->column" interpolates $this itself (followed by the literal
    // text "->column"), which re-enters __toString() and never terminates.
    return "IF {$this->column} {$this->operator} {$this->value} ".
        'THEN '.$this->binaryLabels[0].' '.
        'ELSE '.$this->binaryLabels[1];
}
/**
* @param array $samples
* @param array $targets
* @param array $labels
*
* @throws \Exception
* While finding best split point for a numerical valued column,
* DecisionStump looks for equally distanced values between minimum and maximum
* values in the column. The given <i>$count</i> value determines how many split
* points are probed. The higher the split count, the better the performance but
* the longer the processing time (default value is 10.0)
*/
protected function trainBinary(array $samples, array $targets, array $labels)
public function setNumericalSplitCount(float $count): void
{
    // The count is the divisor used to derive the step between probed split
    // points, and that step drives the probing loop. Zero would divide by zero
    // and a negative value would step away from the maximum, so both are
    // rejected here rather than failing later during training.
    if ($count <= 0) {
        throw new InvalidArgumentException('Numerical split count must be greater than zero');
    }

    $this->numSplitCount = $count;
}
/**
* @throws InvalidArgumentException
*/
protected function trainBinary(array $samples, array $targets, array $labels): void
{
$this->binaryLabels = $labels;
$this->featureCount = count($samples[0]);
@ -103,13 +119,13 @@ class DecisionStump extends WeightedClassifier
// Check the size of the weights given.
// If none given, then assign 1 as a weight to each sample
if ($this->weights) {
$numWeights = count($this->weights);
if ($numWeights != count($samples)) {
throw new \Exception("Number of sample weights does not match with number of samples");
}
} else {
if (count($this->weights) === 0) {
$this->weights = array_fill(0, count($samples), 1);
} else {
$numWeights = count($this->weights);
if ($numWeights !== count($samples)) {
throw new InvalidArgumentException('Number of sample weights does not match with number of samples');
}
}
// Determine type of each column as either "continuous" or "nominal"
@ -118,14 +134,17 @@ class DecisionStump extends WeightedClassifier
// Try to find the best split in the columns of the dataset
// by calculating error rate for each split point in each column
$columns = range(0, count($samples[0]) - 1);
if ($this->givenColumnIndex != self::AUTO_SELECT) {
if ($this->givenColumnIndex !== self::AUTO_SELECT) {
$columns = [$this->givenColumnIndex];
}
$bestSplit = [
'value' => 0, 'operator' => '',
'prob' => [], 'column' => 0,
'trainingErrorRate' => 1.0];
'value' => 0,
'operator' => '',
'prob' => [],
'column' => 0,
'trainingErrorRate' => 1.0,
];
foreach ($columns as $col) {
if ($this->columnTypes[$col] == DecisionTree::CONTINUOUS) {
$split = $this->getBestNumericalSplit($samples, $targets, $col);
@ -144,30 +163,10 @@ class DecisionStump extends WeightedClassifier
}
}
/**
* While finding best split point for a numerical valued column,
* DecisionStump looks for equally distanced values between minimum and maximum
* values in the column. Given <i>$count</i> value determines how many split
* points to be probed. The more split counts, the better performance but
* worse processing time (Default value is 10.0)
*
* @param float $count
*/
public function setNumericalSplitCount(float $count)
{
// Stored as given; no validation is applied to the value here.
$this->numSplitCount = $count;
}
/**
* Determines best split point for the given column
*
* @param array $samples
* @param array $targets
* @param int $col
*
* @return array
*/
protected function getBestNumericalSplit(array $samples, array $targets, int $col)
protected function getBestNumericalSplit(array $samples, array $targets, int $col): array
{
$values = array_column($samples, $col);
// Trying all possible points may be accomplished in two general ways:
@ -178,27 +177,35 @@ class DecisionStump extends WeightedClassifier
$maxValue = max($values);
$stepSize = ($maxValue - $minValue) / $this->numSplitCount;
$split = null;
$split = [];
foreach (['<=', '>'] as $operator) {
// Before trying all possible split points, let's first try
// the average value for the cut point
$threshold = array_sum($values) / (float) count($values);
list($errorRate, $prob) = $this->calculateErrorRate($targets, $threshold, $operator, $values);
if ($split == null || $errorRate < $split['trainingErrorRate']) {
$split = ['value' => $threshold, 'operator' => $operator,
'prob' => $prob, 'column' => $col,
'trainingErrorRate' => $errorRate];
[$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values);
if (!isset($split['trainingErrorRate']) || $errorRate < $split['trainingErrorRate']) {
$split = [
'value' => $threshold,
'operator' => $operator,
'prob' => $prob,
'column' => $col,
'trainingErrorRate' => $errorRate,
];
}
// Try other possible points one by one
for ($step = $minValue; $step <= $maxValue; $step+= $stepSize) {
$threshold = (float)$step;
list($errorRate, $prob) = $this->calculateErrorRate($targets, $threshold, $operator, $values);
for ($step = $minValue; $step <= $maxValue; $step += $stepSize) {
$threshold = (float) $step;
[$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values);
if ($errorRate < $split['trainingErrorRate']) {
$split = ['value' => $threshold, 'operator' => $operator,
'prob' => $prob, 'column' => $col,
'trainingErrorRate' => $errorRate];
$split = [
'value' => $threshold,
'operator' => $operator,
'prob' => $prob,
'column' => $col,
'trainingErrorRate' => $errorRate,
];
}
}// for
}
@ -206,29 +213,25 @@ class DecisionStump extends WeightedClassifier
return $split;
}
/**
* @param array $samples
* @param array $targets
* @param int $col
*
* @return array
*/
protected function getBestNominalSplit(array $samples, array $targets, int $col): array
{
    // Determines the best split for a nominal column: every distinct value in
    // the column is tried with both the '=' and '!=' operators, and the
    // candidate with the lowest error rate from calculateErrorRate() is kept.
    // Returns an array with the keys 'value', 'operator', 'prob', 'column' and
    // 'trainingErrorRate', or an empty array when the column has no values.
    $values = array_column($samples, $col);
    $valueCounts = array_count_values($values);
    $distinctVals = array_keys($valueCounts);

    $split = [];

    foreach (['=', '!='] as $operator) {
        foreach ($distinctVals as $val) {
            [$errorRate, $prob] = $this->calculateErrorRate($targets, $val, $operator, $values);

            // Keep the candidate only when it improves on (is lower than) the
            // best error rate so far, in line with the numerical split search.
            // The comparison was previously inverted and kept the worst split.
            if (!isset($split['trainingErrorRate']) || $errorRate < $split['trainingErrorRate']) {
                $split = [
                    'value' => $val,
                    'operator' => $operator,
                    'prob' => $prob,
                    'column' => $col,
                    'trainingErrorRate' => $errorRate,
                ];
            }
        }
    }

    return $split;
}
/**
*
* @param mixed $leftValue
* @param string $operator
* @param mixed $rightValue
*
* @return boolean
*/
protected function evaluate($leftValue, string $operator, $rightValue)
{
    // Ordering operators use PHP's ordinary comparison semantics.
    if ($operator === '>') {
        return $leftValue > $rightValue;
    }
    if ($operator === '>=') {
        return $leftValue >= $rightValue;
    }
    if ($operator === '<') {
        return $leftValue < $rightValue;
    }
    if ($operator === '<=') {
        return $leftValue <= $rightValue;
    }

    // Equality operators are strict: both type and value must match.
    if ($operator === '=') {
        return $leftValue === $rightValue;
    }
    if ($operator === '!=' || $operator === '<>') {
        return $leftValue !== $rightValue;
    }

    // Any unrecognised operator evaluates to false.
    return false;
}
/**
* Calculates the ratio of wrong predictions based on the new threshold
* value given as the parameter
*
* @param array $targets
* @param float $threshold
* @param string $operator
* @param array $values
*
* @return array
*/
protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values) : array
protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values): array
{
$wrong = 0.0;
$prob = [];
$leftLabel = $this->binaryLabels[0];
$rightLabel= $this->binaryLabels[1];
$rightLabel = $this->binaryLabels[1];
foreach ($values as $index => $value) {
if ($this->evaluate($value, $operator, $threshold)) {
if (Comparison::compare($value, $threshold, $operator)) {
$predicted = $leftLabel;
} else {
$predicted = $rightLabel;
}
$target = $targets[$index];
if (strval($predicted) != strval($targets[$index])) {
if ((string) $predicted != (string) $targets[$index]) {
$wrong += $this->weights[$index];
}
if (!isset($prob[$predicted][$target])) {
$prob[$predicted][$target] = 0;
}
++$prob[$predicted][$target];
}
// Calculate probabilities: Proportion of labels in each leaf
$dist = array_combine($this->binaryLabels, array_fill(0, 2, 0.0));
foreach ($prob as $leaf => $counts) {
$leafTotal = (float)array_sum($prob[$leaf]);
$leafTotal = (float) array_sum($prob[$leaf]);
foreach ($counts as $label => $count) {
if (strval($leaf) == strval($label)) {
if ((string) $leaf == (string) $label) {
$dist[$leaf] = $count / $leafTotal;
}
}
@ -316,15 +289,12 @@ class DecisionStump extends WeightedClassifier
* Probability of a sample is calculated as the proportion of the label
* within the labels of the training samples in the decision node
*
* @param array $sample
* @param mixed $label
*
* @return float
*/
protected function predictProbability(array $sample, $label) : float
protected function predictProbability(array $sample, $label): float
{
$predicted = $this->predictSampleBinary($sample);
if (strval($predicted) == strval($label)) {
if ((string) $predicted == (string) $label) {
return $this->prob[$label];
}
@ -332,33 +302,18 @@ class DecisionStump extends WeightedClassifier
}
/**
* @param array $sample
*
* @return mixed
*/
protected function predictSampleBinary(array $sample)
{
if ($this->evaluate($sample[$this->column], $this->operator, $this->value)) {
if (Comparison::compare($sample[$this->column], $this->value, $this->operator)) {
return $this->binaryLabels[0];
}
return $this->binaryLabels[1];
}
/**
* @return void
*/
protected function resetBinary()
protected function resetBinary(): void
{
// Intentionally empty: this implementation performs no work on reset.
}
/**
* @return string
*/
public function __toString()
{
    // Build the rule text in three parts: condition, then-branch, else-branch.
    $condition = "IF $this->column $this->operator $this->value ";
    $whenMatched = "THEN " . $this->binaryLabels[0] . " ";
    $otherwise = "ELSE " . $this->binaryLabels[1];

    return $condition . $whenMatched . $otherwise;
}
}

View file

@ -4,6 +4,9 @@ declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Closure;
use Exception;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Optimizer\ConjugateGradient;
class LogisticRegression extends Adaline
@ -11,17 +14,17 @@ class LogisticRegression extends Adaline
/**
* Batch training: Gradient descent algorithm (default)
*/
const BATCH_TRAINING = 1;
public const BATCH_TRAINING = 1;
/**
* Online training: Stochastic gradient descent learning
*/
const ONLINE_TRAINING = 2;
public const ONLINE_TRAINING = 2;
/**
* Conjugate Batch: Conjugate Gradient algorithm
*/
const CONJUGATE_GRAD_TRAINING = 3;
public const CONJUGATE_GRAD_TRAINING = 3;
/**
* Cost function to optimize: 'log' and 'sse' are supported <br>
@ -30,7 +33,7 @@ class LogisticRegression extends Adaline
*
* @var string
*/
protected $costFunction = 'sse';
protected $costFunction = 'log';
/**
* Regularization term: only 'L2' is supported
@ -59,32 +62,33 @@ class LogisticRegression extends Adaline
*
* Penalty (Regularization term) can be 'L2' or empty string to cancel penalty term
*
* @param int $maxIterations
* @param bool $normalizeInputs
* @param int $trainingType
* @param string $cost
* @param string $penalty
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function __construct(int $maxIterations = 500, bool $normalizeInputs = true,
int $trainingType = self::CONJUGATE_GRAD_TRAINING, string $cost = 'sse',
string $penalty = 'L2')
{
public function __construct(
int $maxIterations = 500,
bool $normalizeInputs = true,
int $trainingType = self::CONJUGATE_GRAD_TRAINING,
string $cost = 'log',
string $penalty = 'L2'
) {
$trainingTypes = range(self::BATCH_TRAINING, self::CONJUGATE_GRAD_TRAINING);
if (!in_array($trainingType, $trainingTypes)) {
throw new \Exception("Logistic regression can only be trained with " .
"batch (gradient descent), online (stochastic gradient descent) " .
"or conjugate batch (conjugate gradients) algorithms");
if (!in_array($trainingType, $trainingTypes, true)) {
throw new InvalidArgumentException(
'Logistic regression can only be trained with '.
'batch (gradient descent), online (stochastic gradient descent) '.
'or conjugate batch (conjugate gradients) algorithms'
);
}
if (!in_array($cost, ['log', 'sse'])) {
throw new \Exception("Logistic regression cost function can be one of the following: \n" .
"'log' for log-likelihood and 'sse' for sum of squared errors");
if (!in_array($cost, ['log', 'sse'], true)) {
throw new InvalidArgumentException(
"Logistic regression cost function can be one of the following: \n".
"'log' for log-likelihood and 'sse' for sum of squared errors"
);
}
if ($penalty != '' && strtoupper($penalty) !== 'L2') {
throw new \Exception("Logistic regression supports only 'L2' regularization");
if ($penalty !== '' && strtoupper($penalty) !== 'L2') {
throw new InvalidArgumentException('Logistic regression supports only \'L2\' regularization');
}
$this->learningRate = 0.001;
@ -99,10 +103,8 @@ class LogisticRegression extends Adaline
/**
* Sets the learning rate if gradient descent algorithm is
* selected for training
*
* @param float $learningRate
*/
public function setLearningRate(float $learningRate)
public function setLearningRate(float $learningRate): void
{
// Stored as given; no range check is applied here.
$this->learningRate = $learningRate;
}
@ -110,10 +112,8 @@ class LogisticRegression extends Adaline
/**
* Lambda (λ) parameter of regularization term. If 0 is given,
* then the regularization term is cancelled
*
* @param float $lambda
*/
public function setLambda(float $lambda)
public function setLambda(float $lambda): void
{
// Stored as given; getCostFunction() uses it as the penalty when the penalty is 'L2'.
$this->lambda = $lambda;
}
@ -122,40 +122,40 @@ class LogisticRegression extends Adaline
* Adapts the weights with respect to given samples and targets
* by use of selected solver
*
* @param array $samples
* @param array $targets
*
* @throws \Exception
*/
protected function runTraining(array $samples, array $targets)
protected function runTraining(array $samples, array $targets): void
{
$callback = $this->getCostFunction();
switch ($this->trainingType) {
case self::BATCH_TRAINING:
return $this->runGradientDescent($samples, $targets, $callback, true);
$this->runGradientDescent($samples, $targets, $callback, true);
return;
case self::ONLINE_TRAINING:
return $this->runGradientDescent($samples, $targets, $callback, false);
$this->runGradientDescent($samples, $targets, $callback, false);
return;
case self::CONJUGATE_GRAD_TRAINING:
return $this->runConjugateGradient($samples, $targets, $callback);
$this->runConjugateGradient($samples, $targets, $callback);
return;
default:
throw new \Exception('Logistic regression has invalid training type: %s.', $this->trainingType);
// Not reached
throw new Exception(sprintf('Logistic regression has invalid training type: %d.', $this->trainingType));
}
}
/**
* Executes Conjugate Gradient method to optimize the weights of the LogReg model
*
* @param array $samples
* @param array $targets
* @param \Closure $gradientFunc
*/
protected function runConjugateGradient(array $samples, array $targets, \Closure $gradientFunc)
protected function runConjugateGradient(array $samples, array $targets, Closure $gradientFunc): void
{
if (empty($this->optimizer)) {
if ($this->optimizer === null) {
$this->optimizer = (new ConjugateGradient($this->featureCount))
->setMaxIterations($this->maxIterations);
}
@ -167,14 +167,12 @@ class LogisticRegression extends Adaline
/**
* Returns the appropriate callback function for the selected cost function
*
* @return \Closure
*
* @throws \Exception
*/
protected function getCostFunction()
protected function getCostFunction(): Closure
{
$penalty = 0;
if ($this->penalty == 'L2') {
if ($this->penalty === 'L2') {
$penalty = $this->lambda;
}
@ -190,7 +188,7 @@ class LogisticRegression extends Adaline
* The gradient of the cost function to be used with gradient descent:
* ∇J(x) = -(y - h(x)) = (h(x) - y)
*/
$callback = function ($weights, $sample, $y) use ($penalty) {
return function ($weights, $sample, $y) use ($penalty) {
$this->weights = $weights;
$hX = $this->output($sample);
@ -199,17 +197,18 @@ class LogisticRegression extends Adaline
if ($hX == 1) {
$hX = 1 - 1e-10;
}
if ($hX == 0) {
$hX = 1e-10;
}
$y = $y < 0 ? 0 : 1;
$error = -$y * log($hX) - (1 - $y) * log(1 - $hX);
$gradient = $hX - $y;
return [$error, $gradient, $penalty];
};
return $callback;
case 'sse':
/*
* Sum of squared errors or least squared errors cost function:
@ -221,31 +220,27 @@ class LogisticRegression extends Adaline
* The gradient of the cost function:
* ∇J(x) = -(y - h(x)) . h(x) . (1 - h(x))
*/
$callback = function ($weights, $sample, $y) use ($penalty) {
return function ($weights, $sample, $y) use ($penalty) {
$this->weights = $weights;
$hX = $this->output($sample);
$y = $y < 0 ? 0 : 1;
$error = ($y - $hX) ** 2;
$gradient = -($y - $hX) * $hX * (1 - $hX);
return [$error, $gradient, $penalty];
};
return $callback;
default:
throw new \Exception(sprintf('Logistic regression has invalid cost function: %s.', $this->costFunction));
// Not reached
throw new Exception(sprintf('Logistic regression has invalid cost function: %s.', $this->costFunction));
}
}
/**
* Returns the output of the network, a float value between 0.0 and 1.0
*
* @param array $sample
*
* @return float
*/
protected function output(array $sample)
protected function output(array $sample): float
{
$sum = parent::output($sample);
@ -254,16 +249,12 @@ class LogisticRegression extends Adaline
/**
* Returns the class value (either -1 or 1) for the given input
*
* @param array $sample
*
* @return int
*/
protected function outputClass(array $sample)
protected function outputClass(array $sample): int
{
$output = $this->output($sample);
if (round($output) > 0.5) {
if ($output > 0.5) {
return 1;
}
@ -276,20 +267,17 @@ class LogisticRegression extends Adaline
* The probability is simply taken as the distance of the sample
* to the decision plane.
*
* @param array $sample
* @param mixed $label
*
* @return float
*/
protected function predictProbability(array $sample, $label)
protected function predictProbability(array $sample, $label): float
{
$predicted = $this->predictSampleBinary($sample);
$sample = $this->checkNormalizedSample($sample);
$probability = $this->output($sample);
if (strval($predicted) == strval($label)) {
$sample = $this->checkNormalizedSample($sample);
return abs($this->output($sample) - 0.5);
if (array_search($label, $this->labels, true) > 0) {
return $probability;
}
return 0.0;
return 1 - $probability;
}
}

View file

@ -4,20 +4,24 @@ declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Helper\Predictable;
use Phpml\Helper\OneVsRest;
use Phpml\Helper\Optimizer\StochasticGD;
use Phpml\Helper\Optimizer\GD;
use Closure;
use Phpml\Classification\Classifier;
use Phpml\Preprocessing\Normalizer;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\OneVsRest;
use Phpml\Helper\Optimizer\GD;
use Phpml\Helper\Optimizer\Optimizer;
use Phpml\Helper\Optimizer\StochasticGD;
use Phpml\Helper\Predictable;
use Phpml\IncrementalEstimator;
use Phpml\Preprocessing\Normalizer;
class Perceptron implements Classifier, IncrementalEstimator
{
use Predictable, OneVsRest;
use Predictable;
use OneVsRest;
/**
* @var \Phpml\Helper\Optimizer\Optimizer
* @var Optimizer|GD|StochasticGD|null
*/
protected $optimizer;
@ -34,7 +38,7 @@ class Perceptron implements Classifier, IncrementalEstimator
/**
* @var array
*/
protected $weights;
protected $weights = [];
/**
* @var float
@ -56,29 +60,23 @@ class Perceptron implements Classifier, IncrementalEstimator
*/
protected $enableEarlyStop = true;
/**
* @var array
*/
protected $costValues = [];
/**
* Initialize a perceptron classifier with given learning rate and maximum
* number of iterations used while training the perceptron
*
* @param float $learningRate Value between 0.0(exclusive) and 1.0(inclusive)
* @param int $maxIterations Must be at least 1
* @param bool $normalizeInputs
* @param float $learningRate Value between 0.0(exclusive) and 1.0(inclusive)
* @param int $maxIterations Must be at least 1
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, bool $normalizeInputs = true)
{
if ($learningRate <= 0.0 || $learningRate > 1.0) {
throw new \Exception("Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)");
throw new InvalidArgumentException('Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)');
}
if ($maxIterations <= 0) {
throw new \Exception("Maximum number of iterations must be an integer greater than 0");
throw new InvalidArgumentException('Maximum number of iterations must be an integer greater than 0');
}
if ($normalizeInputs) {
@ -89,31 +87,24 @@ class Perceptron implements Classifier, IncrementalEstimator
$this->maxIterations = $maxIterations;
}
/**
* @param array $samples
* @param array $targets
* @param array $labels
*/
public function partialTrain(array $samples, array $targets, array $labels = [])
public function partialTrain(array $samples, array $targets, array $labels = []): void
{
$this->trainByLabel($samples, $targets, $labels);
}
/**
* @param array $samples
* @param array $targets
* @param array $labels
*/
public function trainBinary(array $samples, array $targets, array $labels)
public function trainBinary(array $samples, array $targets, array $labels): void
{
if ($this->normalizer) {
if ($this->normalizer !== null) {
$this->normalizer->transform($samples);
}
// Set all target values to either -1 or 1
$this->labels = [1 => $labels[0], -1 => $labels[1]];
$this->labels = [
1 => $labels[0],
-1 => $labels[1],
];
foreach ($targets as $key => $target) {
$targets[$key] = strval($target) == strval($this->labels[1]) ? 1 : -1;
$targets[$key] = (string) $target == (string) $this->labels[1] ? 1 : -1;
}
// Set samples and feature count vars
@ -122,15 +113,6 @@ class Perceptron implements Classifier, IncrementalEstimator
$this->runTraining($samples, $targets);
}
protected function resetBinary()
{
$this->labels = [];
$this->optimizer = null;
$this->featureCount = 0;
$this->weights = null;
$this->costValues = [];
}
/**
* Normally enabling early stopping for the optimization procedure may
* help saving processing time while in some cases it may result in
@ -139,8 +121,6 @@ class Perceptron implements Classifier, IncrementalEstimator
* If "false" is given, the optimization procedure will always be executed
* for $maxIterations times
*
* @param bool $enable
*
* @return $this
*/
public function setEarlyStop(bool $enable = true)
@ -152,22 +132,26 @@ class Perceptron implements Classifier, IncrementalEstimator
/**
* Returns the cost values obtained during the training.
*
* @return array
*/
public function getCostValues()
public function getCostValues(): array
{
return $this->costValues;
}
/**
 * Puts the binary-training state back to its empty defaults: labels,
 * weights and recorded cost values become empty arrays, the optimizer is
 * dropped and the feature count is zeroed.
 */
protected function resetBinary(): void
{
    // Scalar/object state first
    $this->featureCount = 0;
    $this->optimizer = null;

    // Array state: each property receives its own empty array value
    $this->labels = $this->weights = $this->costValues = [];
}
/**
* Trains the perceptron model with Stochastic Gradient Descent optimization
* to get the correct set of weights
*
* @param array $samples
* @param array $targets
*/
protected function runTraining(array $samples, array $targets)
protected function runTraining(array $samples, array $targets): void
{
// The cost function is the sum of squares
$callback = function ($weights, $sample, $target) {
@ -175,7 +159,7 @@ class Perceptron implements Classifier, IncrementalEstimator
$prediction = $this->outputClass($sample);
$gradient = $prediction - $target;
$error = $gradient**2;
$error = $gradient ** 2;
return [$error, $gradient];
};
@ -186,17 +170,12 @@ class Perceptron implements Classifier, IncrementalEstimator
/**
* Executes a Gradient Descent algorithm for
* the given cost function
*
* @param array $samples
* @param array $targets
* @param \Closure $gradientFunc
* @param bool $isBatch
*/
protected function runGradientDescent(array $samples, array $targets, \Closure $gradientFunc, bool $isBatch = false)
protected function runGradientDescent(array $samples, array $targets, Closure $gradientFunc, bool $isBatch = false): void
{
$class = $isBatch ? GD::class : StochasticGD::class;
if (empty($this->optimizer)) {
if ($this->optimizer === null) {
$this->optimizer = (new $class($this->featureCount))
->setLearningRate($this->learningRate)
->setMaxIterations($this->maxIterations)
@ -211,14 +190,10 @@ class Perceptron implements Classifier, IncrementalEstimator
/**
* Checks if the sample should be normalized and if so, returns the
* normalized sample
*
* @param array $sample
*
* @return array
*/
protected function checkNormalizedSample(array $sample)
protected function checkNormalizedSample(array $sample): array
{
if ($this->normalizer) {
if ($this->normalizer !== null) {
$samples = [$sample];
$this->normalizer->transform($samples);
$sample = $samples[0];
@ -230,8 +205,7 @@ class Perceptron implements Classifier, IncrementalEstimator
/**
* Calculates net output of the network as a float value for the given input
*
* @param array $sample
* @return int
* @return int|float
*/
protected function output(array $sample)
{
@ -249,11 +223,8 @@ class Perceptron implements Classifier, IncrementalEstimator
/**
* Returns the class value (either -1 or 1) for the given input
*
* @param array $sample
* @return int
*/
protected function outputClass(array $sample)
protected function outputClass(array $sample): int
{
return $this->output($sample) > 0 ? 1 : -1;
}
@ -264,26 +235,22 @@ class Perceptron implements Classifier, IncrementalEstimator
* The probability is simply taken as the distance of the sample
* to the decision plane.
*
* @param array $sample
* @param mixed $label
*
* @return float
*/
protected function predictProbability(array $sample, $label)
protected function predictProbability(array $sample, $label): float
{
$predicted = $this->predictSampleBinary($sample);
if (strval($predicted) == strval($label)) {
if ((string) $predicted == (string) $label) {
$sample = $this->checkNormalizedSample($sample);
return abs($this->output($sample));
return (float) abs($this->output($sample));
}
return 0.0;
}
/**
* @param array $sample
*
* @return mixed
*/
protected function predictSampleBinary(array $sample)

View file

@ -9,22 +9,23 @@ use Phpml\NeuralNetwork\Network\MultilayerPerceptron;
class MLPClassifier extends MultilayerPerceptron implements Classifier
{
/**
* @param mixed $target
* @return int
* @param mixed $target
*
* @throws InvalidArgumentException
*/
public function getTargetClass($target): int
{
if (!in_array($target, $this->classes)) {
throw InvalidArgumentException::invalidTarget($target);
if (!in_array($target, $this->classes, true)) {
throw new InvalidArgumentException(
sprintf('Target with value "%s" is not part of the accepted classes', $target)
);
}
return array_search($target, $this->classes);
return array_search($target, $this->classes, true);
}
/**
* @param array $sample
*
* @return mixed
*/
protected function predictSample(array $sample)
@ -39,18 +40,17 @@ class MLPClassifier extends MultilayerPerceptron implements Classifier
$max = $value;
}
}
return $this->classes[$predictedClass];
return $predictedClass;
}
/**
* @param array $sample
* @param mixed $target
*/
protected function trainSample(array $sample, $target)
protected function trainSample(array $sample, $target): void
{
// Feed-forward.
$this->setInput($sample)->getOutput();
$this->setInput($sample);
// Back-propagate.
$this->backpropagation->backpropagate($this->getLayers(), $this->getTargetClass($target));

View file

@ -4,6 +4,7 @@ declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
@ -11,11 +12,14 @@ use Phpml\Math\Statistic\StandardDeviation;
class NaiveBayes implements Classifier
{
use Trainable, Predictable;
use Trainable;
use Predictable;
const CONTINUOS = 1;
const NOMINAL = 2;
const EPSILON = 1e-10;
public const CONTINUOS = 1;
public const NOMINAL = 2;
public const EPSILON = 1e-10;
/**
* @var array
@ -25,7 +29,7 @@ class NaiveBayes implements Classifier
/**
* @var array
*/
private $mean= [];
private $mean = [];
/**
* @var array
@ -57,19 +61,14 @@ class NaiveBayes implements Classifier
*/
private $labels = [];
/**
* @param array $samples
* @param array $targets
*/
public function train(array $samples, array $targets)
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
$this->sampleCount = count($this->samples);
$this->featureCount = count($this->samples[0]);
$labelCounts = array_count_values($this->targets);
$this->labels = array_keys($labelCounts);
$this->labels = array_map('strval', array_flip(array_flip($this->targets)));
foreach ($this->labels as $label) {
$samples = $this->getSamplesByLabel($label);
$this->p[$label] = count($samples) / $this->sampleCount;
@ -77,16 +76,39 @@ class NaiveBayes implements Classifier
}
}
/**
 * Chooses the most likely label for a single sample.
 *
 * For every known label a score is built by starting from the stored
 * prior ($this->p[$label]) and adding the value sampleProbability()
 * yields for each feature index. The scores are then sorted numerically
 * in descending order and the first key is returned.
 *
 * NOTE(review): the prior is added unchanged while the continuous branch
 * of sampleProbability() returns a log-density; confirm whether the prior
 * was meant to be log-transformed as well.
 *
 * @return mixed
 */
protected function predictSample(array $sample)
{
    $scores = [];

    foreach ($this->labels as $label) {
        $score = $this->p[$label];

        $feature = 0;
        while ($feature < $this->featureCount) {
            $score += $this->sampleProbability($sample, $feature, $label);
            ++$feature;
        }

        $scores[$label] = $score;
    }

    // Highest score first; the winning label is the first key
    arsort($scores, SORT_NUMERIC);
    reset($scores);

    return key($scores);
}
/**
* Calculates vital statistics for each label & feature. Stores these
* values in private array in order to avoid repeated calculation
* @param string $label
* @param array $samples
*/
private function calculateStatistics($label, $samples)
private function calculateStatistics(string $label, array $samples): void
{
$this->std[$label] = array_fill(0, $this->featureCount, 0);
$this->mean[$label]= array_fill(0, $this->featureCount, 0);
$this->mean[$label] = array_fill(0, $this->featureCount, 0);
$this->dataType[$label] = array_fill(0, $this->featureCount, self::CONTINUOS);
$this->discreteProb[$label] = array_fill(0, $this->featureCount, self::CONTINUOS);
for ($i = 0; $i < $this->featureCount; ++$i) {
@ -113,25 +135,25 @@ class NaiveBayes implements Classifier
/**
* Calculates the probability P(label|sample_n)
*
* @param array $sample
* @param int $feature
* @param string $label
*
* @return float
*/
private function sampleProbability($sample, $feature, $label)
private function sampleProbability(array $sample, int $feature, string $label): float
{
if (!isset($sample[$feature])) {
throw new InvalidArgumentException('Missing feature. All samples must have equal number of features');
}
$value = $sample[$feature];
if ($this->dataType[$label][$feature] == self::NOMINAL) {
if (!isset($this->discreteProb[$label][$feature][$value]) ||
$this->discreteProb[$label][$feature][$value] == 0) {
return self::EPSILON;
}
return $this->discreteProb[$label][$feature][$value];
}
$std = $this->std[$label][$feature] ;
$mean= $this->mean[$label][$feature];
$std = $this->std[$label][$feature];
$mean = $this->mean[$label][$feature];
// Calculate the probability density by use of normal/Gaussian distribution
// Ref: https://en.wikipedia.org/wiki/Normal_distribution
//
@ -139,19 +161,16 @@ class NaiveBayes implements Classifier
// some libraries adopt taking log of calculations such as
// scikit-learn did.
// (See : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py)
$pdf = -0.5 * log(2.0 * pi() * $std * $std);
$pdf -= 0.5 * pow($value - $mean, 2) / ($std * $std);
$pdf = -0.5 * log(2.0 * M_PI * $std * $std);
$pdf -= 0.5 * (($value - $mean) ** 2) / ($std * $std);
return $pdf;
}
/**
* Return samples belonging to specific label
*
* @param string $label
*
* @return array
*/
private function getSamplesByLabel($label)
private function getSamplesByLabel(string $label): array
{
$samples = [];
for ($i = 0; $i < $this->sampleCount; ++$i) {
@ -159,30 +178,7 @@ class NaiveBayes implements Classifier
$samples[] = $this->samples[$i];
}
}
return $samples;
}
/**
* @param array $sample
* @return mixed
*/
protected function predictSample(array $sample)
{
// Use NaiveBayes assumption for each label using:
// P(label|features) = P(label) * P(feature0|label) * P(feature1|label) .... P(featureN|label)
// Then compare probability for each class to determine which label is most likely
$predictions = [];
foreach ($this->labels as $label) {
$p = $this->p[$label];
for ($i = 0; $i<$this->featureCount; ++$i) {
$Plf = $this->sampleProbability($sample, $i, $label);
$p += $Plf;
}
$predictions[$label] = $p;
}
arsort($predictions, SORT_NUMERIC);
reset($predictions);
return key($predictions);
}
}

View file

@ -10,20 +10,15 @@ use Phpml\SupportVectorMachine\Type;
class SVC extends SupportVectorMachine implements Classifier
{
/**
* @param int $kernel
* @param float $cost
* @param int $degree
* @param float|null $gamma
* @param float $coef0
* @param float $tolerance
* @param int $cacheSize
* @param bool $shrinking
* @param bool $probabilityEstimates
*/
public function __construct(
int $kernel = Kernel::LINEAR, float $cost = 1.0, int $degree = 3, float $gamma = null, float $coef0 = 0.0,
float $tolerance = 0.001, int $cacheSize = 100, bool $shrinking = true,
int $kernel = Kernel::RBF,
float $cost = 1.0,
int $degree = 3,
?float $gamma = null,
float $coef0 = 0.0,
float $tolerance = 0.001,
int $cacheSize = 100,
bool $shrinking = true,
bool $probabilityEstimates = false
) {
parent::__construct(Type::C_SVC, $kernel, $cost, 0.5, $degree, $gamma, $coef0, 0.1, $tolerance, $cacheSize, $shrinking, $probabilityEstimates);

View file

@ -9,14 +9,12 @@ abstract class WeightedClassifier implements Classifier
/**
* @var array
*/
protected $weights;
protected $weights = [];
/**
* Sets the array including a weight for each sample
*
* @param array $weights
*/
public function setSampleWeights(array $weights)
public function setSampleWeights(array $weights): void
{
$this->weights = $weights;
}

View file

@ -6,10 +6,5 @@ namespace Phpml\Clustering;
interface Clusterer
{
/**
* @param array $samples
*
* @return array
*/
public function cluster(array $samples);
public function cluster(array $samples): array;
}

View file

@ -9,6 +9,8 @@ use Phpml\Math\Distance\Euclidean;
class DBSCAN implements Clusterer
{
private const NOISE = -1;
/**
* @var float
*/
@ -24,14 +26,9 @@ class DBSCAN implements Clusterer
*/
private $distanceMetric;
/**
* @param float $epsilon
* @param int $minSamples
* @param Distance $distanceMetric
*/
public function __construct($epsilon = 0.5, $minSamples = 3, Distance $distanceMetric = null)
public function __construct(float $epsilon = 0.5, int $minSamples = 3, ?Distance $distanceMetric = null)
{
if (null === $distanceMetric) {
if ($distanceMetric === null) {
$distanceMetric = new Euclidean();
}
@ -40,72 +37,84 @@ class DBSCAN implements Clusterer
$this->distanceMetric = $distanceMetric;
}
/**
* @param array $samples
*
* @return array
*/
public function cluster(array $samples)
public function cluster(array $samples): array
{
$clusters = [];
$visited = [];
$labels = [];
$n = 0;
foreach ($samples as $index => $sample) {
if (isset($visited[$index])) {
if (isset($labels[$index])) {
continue;
}
$visited[$index] = true;
$regionSamples = $this->getSamplesInRegion($sample, $samples);
if (count($regionSamples) >= $this->minSamples) {
$clusters[] = $this->expandCluster($regionSamples, $visited);
$neighborIndices = $this->getIndicesInRegion($sample, $samples);
if (count($neighborIndices) < $this->minSamples) {
$labels[$index] = self::NOISE;
continue;
}
$labels[$index] = $n;
$this->expandCluster($samples, $neighborIndices, $labels, $n);
++$n;
}
return $this->groupByCluster($samples, $labels, $n);
}
/**
 * Grows cluster $n outwards from the given seed indices.
 *
 * Seeds are taken from the end of the list one at a time. A seed that was
 * previously marked as noise is claimed by the cluster but not expanded;
 * a seed that already belongs to a cluster is left untouched. An
 * unlabelled seed joins the cluster and, when its neighbourhood holds at
 * least $this->minSamples indices, those indices are queued as new seeds.
 *
 * @param array $labels Index => cluster number map, updated in place
 */
private function expandCluster(array $samples, array $seeds, array &$labels, int $n): void
{
    while ($seeds !== []) {
        $current = array_pop($seeds);
        $alreadyLabelled = isset($labels[$current]);

        if ($alreadyLabelled && $labels[$current] !== self::NOISE) {
            continue;
        }

        $labels[$current] = $n;

        if ($alreadyLabelled) {
            // Former noise point: it is adopted by the cluster but never expanded
            continue;
        }

        $neighbours = $this->getIndicesInRegion($samples[$current], $samples);
        if (count($neighbours) < $this->minSamples) {
            continue;
        }

        $seeds = array_unique(array_merge($seeds, $neighbours));
    }
}
/**
 * Lists the keys of all samples whose distance to $center, measured with
 * the configured distance metric, is strictly smaller than $this->epsilon.
 * Keys are returned in the iteration order of $samples.
 */
private function getIndicesInRegion(array $center, array $samples): array
{
    $isWithinEpsilon = function ($candidate) use ($center): bool {
        return $this->distanceMetric->distance($center, $candidate) < $this->epsilon;
    };

    // array_filter keeps the original keys, which are exactly the indices wanted
    return array_keys(array_filter($samples, $isWithinEpsilon));
}
/**
 * Turns the index => label map into a list of clusters.
 *
 * Starts from $n empty clusters, places every sample that is not labelled
 * as noise into the cluster named by its label, and finally renumbers the
 * integer keys inside each cluster.
 */
private function groupByCluster(array $samples, array $labels, int $n): array
{
    $clusters = array_fill(0, $n, []);

    foreach ($samples as $index => $sample) {
        $label = $labels[$index];
        if ($label === self::NOISE) {
            continue;
        }

        $clusters[$label][$index] = $sample;
    }

    // Reindex (i.e. to 0, 1, 2, ...) integer indices for backward compatibility;
    // array_merge renumbers integer keys and leaves string keys as they are
    return array_map(
        static function (array $members): array {
            return array_merge($members, []);
        },
        $clusters
    );
}
/**
* @param array $localSample
* @param array $samples
*
* @return array
*/
private function getSamplesInRegion($localSample, $samples)
{
$region = [];
foreach ($samples as $index => $sample) {
if ($this->distanceMetric->distance($localSample, $sample) < $this->epsilon) {
$region[$index] = $sample;
}
}
return $region;
}
/**
* @param array $samples
* @param array $visited
*
* @return array
*/
private function expandCluster($samples, &$visited)
{
$cluster = [];
foreach ($samples as $index => $sample) {
if (!isset($visited[$index])) {
$visited[$index] = true;
$regionSamples = $this->getSamplesInRegion($sample, $samples);
if (count($regionSamples) > $this->minSamples) {
$cluster = array_merge($regionSamples, $cluster);
}
}
$cluster[] = $sample;
}
return $cluster;
}
}

View file

@ -4,8 +4,8 @@ declare(strict_types=1);
namespace Phpml\Clustering;
use Phpml\Clustering\KMeans\Point;
use Phpml\Clustering\KMeans\Cluster;
use Phpml\Clustering\KMeans\Point;
use Phpml\Clustering\KMeans\Space;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance\Euclidean;
@ -18,9 +18,9 @@ class FuzzyCMeans implements Clusterer
private $clustersNumber;
/**
* @var array|Cluster[]
* @var Cluster[]
*/
private $clusters = null;
private $clusters = [];
/**
* @var Space
@ -28,9 +28,9 @@ class FuzzyCMeans implements Clusterer
private $space;
/**
* @var array|float[][]
* @var float[][]
*/
private $membership;
private $membership = [];
/**
* @var float
@ -55,170 +55,36 @@ class FuzzyCMeans implements Clusterer
/**
* @var array
*/
private $samples;
private $samples = [];
/**
* @param int $clustersNumber
* @param float $fuzziness
* @param float $epsilon
* @param int $maxIterations
*
* @throws InvalidArgumentException
*/
public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100)
{
if ($clustersNumber <= 0) {
throw InvalidArgumentException::invalidClustersNumber();
throw new InvalidArgumentException('Invalid clusters number');
}
$this->clustersNumber = $clustersNumber;
$this->fuzziness = $fuzziness;
$this->epsilon = $epsilon;
$this->maxIterations = $maxIterations;
}
protected function initClusters()
{
// Membership array is a matrix of cluster number by sample counts
// We initilize the membership array with random values
$dim = $this->space->getDimension();
$this->generateRandomMembership($dim, $this->sampleCount);
$this->updateClusters();
}
/**
* @param int $rows
* @param int $cols
*/
protected function generateRandomMembership(int $rows, int $cols)
{
$this->membership = [];
for ($i = 0; $i < $rows; ++$i) {
$row = [];
$total = 0.0;
for ($k = 0; $k < $cols; ++$k) {
$val = rand(1, 5) / 10.0;
$row[] = $val;
$total += $val;
}
$this->membership[] = array_map(function ($val) use ($total) {
return $val / $total;
}, $row);
}
}
protected function updateClusters()
{
$dim = $this->space->getDimension();
if (!$this->clusters) {
$this->clusters = [];
for ($i = 0; $i < $this->clustersNumber; ++$i) {
$this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0));
}
}
for ($i = 0; $i < $this->clustersNumber; ++$i) {
$cluster = $this->clusters[$i];
$center = $cluster->getCoordinates();
for ($k = 0; $k < $dim; ++$k) {
$a = $this->getMembershipRowTotal($i, $k, true);
$b = $this->getMembershipRowTotal($i, $k, false);
$center[$k] = $a / $b;
}
$cluster->setCoordinates($center);
}
}
protected function getMembershipRowTotal(int $row, int $col, bool $multiply)
{
$sum = 0.0;
for ($k = 0; $k < $this->sampleCount; ++$k) {
$val = pow($this->membership[$row][$k], $this->fuzziness);
if ($multiply) {
$val *= $this->samples[$k][$col];
}
$sum += $val;
}
return $sum;
}
protected function updateMembershipMatrix()
{
for ($i = 0; $i < $this->clustersNumber; ++$i) {
for ($k = 0; $k < $this->sampleCount; ++$k) {
$distCalc = $this->getDistanceCalc($i, $k);
$this->membership[$i][$k] = 1.0 / $distCalc;
}
}
}
/**
*
* @param int $row
* @param int $col
* @return float
*/
protected function getDistanceCalc(int $row, int $col)
{
$sum = 0.0;
$distance = new Euclidean();
$dist1 = $distance->distance(
$this->clusters[$row]->getCoordinates(),
$this->samples[$col]
);
for ($j = 0; $j < $this->clustersNumber; ++$j) {
$dist2 = $distance->distance(
$this->clusters[$j]->getCoordinates(),
$this->samples[$col]
);
$val = pow($dist1 / $dist2, 2.0 / ($this->fuzziness - 1));
$sum += $val;
}
return $sum;
}
/**
* The objective is to minimize the distance between all data points
* and all cluster centers. This method returns the summation of all
* these distances
*/
protected function getObjective()
{
$sum = 0.0;
$distance = new Euclidean();
for ($i = 0; $i < $this->clustersNumber; ++$i) {
$clust = $this->clusters[$i]->getCoordinates();
for ($k = 0; $k < $this->sampleCount; ++$k) {
$point = $this->samples[$k];
$sum += $distance->distance($clust, $point);
}
}
return $sum;
}
/**
* @return array
*/
public function getMembershipMatrix()
public function getMembershipMatrix(): array
{
return $this->membership;
}
/**
* @param array|Point[] $samples
* @return array
* @param Point[]|int[][] $samples
*/
public function cluster(array $samples)
public function cluster(array $samples): array
{
// Initialize variables, clusters and membership matrix
$this->sampleCount = count($samples);
$this->samples =& $samples;
$this->samples = &$samples;
$this->space = new Space(count($samples[0]));
$this->initClusters();
@ -242,8 +108,7 @@ class FuzzyCMeans implements Clusterer
$column = array_column($this->membership, $k);
arsort($column);
reset($column);
$i = key($column);
$cluster = $this->clusters[$i];
$cluster = $this->clusters[key($column)];
$cluster->attach(new Point($this->samples[$k]));
}
@ -255,4 +120,120 @@ class FuzzyCMeans implements Clusterer
return $grouped;
}
/**
 * Seeds the membership matrix with random values and derives the initial
 * cluster centers from it.
 */
protected function initClusters(): void
{
    // The membership matrix is a matrix of cluster number by sample count:
    // every consumer addresses it as membership[cluster][sample], so the row
    // count must be the number of clusters (not the space dimension, which
    // left rows missing or stale whenever the two values differed).
    // We initialize the membership matrix with random values.
    $this->generateRandomMembership($this->clustersNumber, $this->sampleCount);
    $this->updateClusters();
}
/**
 * Replaces the membership matrix with $rows rows of $cols random weights.
 *
 * Each weight is drawn as random_int(1, 5) / 10.0 and every row is then
 * divided by its own total.
 */
protected function generateRandomMembership(int $rows, int $cols): void
{
    $this->membership = [];

    for ($r = 0; $r < $rows; ++$r) {
        $weights = [];
        for ($c = 0; $c < $cols; ++$c) {
            $weights[] = random_int(1, 5) / 10.0;
        }

        $rowTotal = array_sum($weights);
        foreach ($weights as $c => $weight) {
            $weights[$c] = $weight / $rowTotal;
        }

        $this->membership[] = $weights;
    }
}
/**
 * Recomputes every cluster center from the current membership matrix.
 *
 * When no clusters exist yet, $this->clustersNumber clusters are created
 * at the origin first. Each coordinate of a center is then set to the
 * membership-weighted sample total divided by the plain membership total,
 * both obtained from getMembershipRowTotal().
 */
protected function updateClusters(): void
{
    $dim = $this->space->getDimension();

    if ($this->clusters === []) {
        $origin = array_fill(0, $dim, 0.0);
        for ($c = 0; $c < $this->clustersNumber; ++$c) {
            $this->clusters[] = new Cluster($this->space, $origin);
        }
    }

    for ($c = 0; $c < $this->clustersNumber; ++$c) {
        $coordinates = $this->clusters[$c]->getCoordinates();

        for ($axis = 0; $axis < $dim; ++$axis) {
            $coordinates[$axis] = $this->getMembershipRowTotal($c, $axis, true)
                / $this->getMembershipRowTotal($c, $axis, false);
        }

        $this->clusters[$c]->setCoordinates($coordinates);
    }
}
/**
 * Sums, over all samples, the membership of row $row raised to the
 * fuzziness power. When $multiply is true each term is additionally
 * multiplied by the sample's value in column $col.
 */
protected function getMembershipRowTotal(int $row, int $col, bool $multiply): float
{
    $total = 0.0;

    $k = 0;
    while ($k < $this->sampleCount) {
        $weight = $this->membership[$row][$k] ** $this->fuzziness;
        $total += $multiply ? $weight * $this->samples[$k][$col] : $weight;
        ++$k;
    }

    return $total;
}
/**
 * Refreshes every membership[cluster][sample] entry with the reciprocal
 * of the value getDistanceCalc() returns for that cluster/sample pair.
 */
protected function updateMembershipMatrix(): void
{
    for ($cluster = 0; $cluster < $this->clustersNumber; ++$cluster) {
        $sample = 0;
        while ($sample < $this->sampleCount) {
            $this->membership[$cluster][$sample] = 1.0 / $this->getDistanceCalc($cluster, $sample);
            ++$sample;
        }
    }
}
/**
 * Computes the denominator of the fuzzy c-means membership update for
 * cluster $row and sample $col:
 *
 *   sum over all clusters j of (d(row, col) / d(j, col)) ^ (2 / (fuzziness - 1))
 *
 * The caller stores the reciprocal of the returned value as the membership.
 *
 * @param int $row Cluster index
 * @param int $col Sample index
 *
 * @return float 1.0 when the sample lies exactly on cluster $row (full
 *               membership), INF when it lies exactly on another cluster
 *               (reciprocal is 0.0), otherwise the sum described above
 */
protected function getDistanceCalc(int $row, int $col): float
{
    $distance = new Euclidean();
    $sample = $this->samples[$col];

    // The whole quotient 2 / (m - 1) is the exponent. Writing
    // "x ** 2.0 / (m - 1)" divides the square by (m - 1) instead, because
    // ** binds tighter than /.
    $exponent = 2.0 / ($this->fuzziness - 1);

    $dist1 = $distance->distance($this->clusters[$row]->getCoordinates(), $sample);
    if ($dist1 <= 0.0) {
        // Sample coincides with this cluster's center: avoid 0 / 0 below
        return 1.0;
    }

    $sum = 0.0;
    for ($j = 0; $j < $this->clustersNumber; ++$j) {
        $dist2 = $distance->distance($this->clusters[$j]->getCoordinates(), $sample);
        if ($dist2 <= 0.0) {
            // Sample coincides with another center: the ratio is unbounded
            return INF;
        }

        $sum += ($dist1 / $dist2) ** $exponent;
    }

    return $sum;
}
/**
 * Objective value of the current clustering: the Euclidean distance from
 * every cluster center to every sample, summed over all cluster/sample
 * pairs. The optimisation aims to make this total small.
 */
protected function getObjective(): float
{
    $metric = new Euclidean();
    $total = 0.0;

    for ($c = 0; $c < $this->clustersNumber; ++$c) {
        $center = $this->clusters[$c]->getCoordinates();

        for ($s = 0; $s < $this->sampleCount; ++$s) {
            $total += $metric->distance($center, $this->samples[$s]);
        }
    }

    return $total;
}
}

View file

@ -9,8 +9,9 @@ use Phpml\Exception\InvalidArgumentException;
class KMeans implements Clusterer
{
const INIT_RANDOM = 1;
const INIT_KMEANS_PLUS_PLUS = 2;
public const INIT_RANDOM = 1;
public const INIT_KMEANS_PLUS_PLUS = 2;
/**
* @var int
@ -22,32 +23,21 @@ class KMeans implements Clusterer
*/
private $initialization;
/**
* @param int $clustersNumber
* @param int $initialization
*
* @throws InvalidArgumentException
*/
public function __construct(int $clustersNumber, int $initialization = self::INIT_KMEANS_PLUS_PLUS)
{
if ($clustersNumber <= 0) {
throw InvalidArgumentException::invalidClustersNumber();
throw new InvalidArgumentException('Invalid clusters number');
}
$this->clustersNumber = $clustersNumber;
$this->initialization = $initialization;
}
/**
* @param array $samples
*
* @return array
*/
public function cluster(array $samples)
public function cluster(array $samples): array
{
$space = new Space(count($samples[0]));
foreach ($samples as $sample) {
$space->addPoint($sample);
$space = new Space(count(reset($samples)));
foreach ($samples as $key => $sample) {
$space->addPoint($sample, $key);
}
$clusters = [];

View file

@ -5,11 +5,10 @@ declare(strict_types=1);
namespace Phpml\Clustering\KMeans;
use IteratorAggregate;
use Countable;
use SplObjectStorage;
use LogicException;
use SplObjectStorage;
class Cluster extends Point implements IteratorAggregate, Countable
class Cluster extends Point implements IteratorAggregate
{
/**
* @var Space
@ -21,10 +20,6 @@ class Cluster extends Point implements IteratorAggregate, Countable
*/
protected $points;
/**
* @param Space $space
* @param array $coordinates
*/
public function __construct(Space $space, array $coordinates)
{
parent::__construct($coordinates);
@ -32,23 +27,21 @@ class Cluster extends Point implements IteratorAggregate, Countable
$this->points = new SplObjectStorage();
}
/**
* @return array
*/
public function getPoints()
public function getPoints(): array
{
$points = [];
foreach ($this->points as $point) {
$points[] = $point->toArray();
if ($point->label === null) {
$points[] = $point->toArray();
} else {
$points[$point->label] = $point->toArray();
}
}
return $points;
}
/**
* @return array
*/
public function toArray()
public function toArray(): array
{
return [
'centroid' => parent::toArray(),
@ -56,17 +49,10 @@ class Cluster extends Point implements IteratorAggregate, Countable
];
}
/**
* @param Point $point
*
* @return Point
*
* @throws \LogicException
*/
public function attach(Point $point)
public function attach(Point $point): Point
{
if ($point instanceof self) {
throw new LogicException('cannot attach a cluster to another');
throw new LogicException('Cannot attach a cluster to another');
}
$this->points->attach($point);
@ -74,37 +60,27 @@ class Cluster extends Point implements IteratorAggregate, Countable
return $point;
}
/**
* @param Point $point
*
* @return Point
*/
public function detach(Point $point)
public function detach(Point $point): Point
{
$this->points->detach($point);
return $point;
}
/**
* @param SplObjectStorage $points
*/
public function attachAll(SplObjectStorage $points)
public function attachAll(SplObjectStorage $points): void
{
$this->points->addAll($points);
}
/**
* @param SplObjectStorage $points
*/
public function detachAll(SplObjectStorage $points)
public function detachAll(SplObjectStorage $points): void
{
$this->points->removeAll($points);
}
public function updateCentroid()
public function updateCentroid(): void
{
if (!$count = count($this->points)) {
$count = count($this->points);
if ($count === 0) {
return;
}
@ -129,18 +105,12 @@ class Cluster extends Point implements IteratorAggregate, Countable
return $this->points;
}
/**
* @return mixed
*/
public function count()
public function count(): int
{
return count($this->points);
}
/**
* @param array $newCoordinates
*/
public function setCoordinates(array $newCoordinates)
public function setCoordinates(array $newCoordinates): void
{
$this->coordinates = $newCoordinates;
}

View file

@ -6,7 +6,7 @@ namespace Phpml\Clustering\KMeans;
use ArrayAccess;
class Point implements ArrayAccess
class Point implements ArrayAccess, \Countable
{
/**
* @var int
@ -16,32 +16,32 @@ class Point implements ArrayAccess
/**
* @var array
*/
protected $coordinates;
protected $coordinates = [];
/**
* @param array $coordinates
* @var mixed
*/
public function __construct(array $coordinates)
protected $label;
/**
* @param mixed $label
*/
public function __construct(array $coordinates, $label = null)
{
$this->dimension = count($coordinates);
$this->coordinates = $coordinates;
$this->label = $label;
}
/**
* @return array
*/
public function toArray()
public function toArray(): array
{
return $this->coordinates;
}
/**
* @param Point $point
* @param bool $precise
*
* @return int|mixed
* @return float|int
*/
public function getDistanceWith(self $point, $precise = true)
public function getDistanceWith(self $point, bool $precise = true)
{
$distance = 0;
for ($n = 0; $n < $this->dimension; ++$n) {
@ -49,22 +49,23 @@ class Point implements ArrayAccess
$distance += $difference * $difference;
}
return $precise ? sqrt((float) $distance) : $distance;
return $precise ? $distance ** .5 : $distance;
}
/**
* @param array $points
*
* @return mixed
* @param Point[] $points
*/
public function getClosest(array $points)
public function getClosest(array $points): ?self
{
$minPoint = null;
foreach ($points as $point) {
$distance = $this->getDistanceWith($point, false);
if (!isset($minDistance)) {
$minDistance = $distance;
$minPoint = $point;
continue;
}
@ -77,20 +78,15 @@ class Point implements ArrayAccess
return $minPoint;
}
/**
* @return array
*/
public function getCoordinates()
public function getCoordinates(): array
{
return $this->coordinates;
}
/**
* @param mixed $offset
*
* @return bool
*/
public function offsetExists($offset)
public function offsetExists($offset): bool
{
return isset($this->coordinates[$offset]);
}
@ -109,7 +105,7 @@ class Point implements ArrayAccess
* @param mixed $offset
* @param mixed $value
*/
public function offsetSet($offset, $value)
public function offsetSet($offset, $value): void
{
$this->coordinates[$offset] = $value;
}
@ -117,8 +113,13 @@ class Point implements ArrayAccess
/**
* @param mixed $offset
*/
public function offsetUnset($offset)
public function offsetUnset($offset): void
{
unset($this->coordinates[$offset]);
}
public function count(): int
{
return count($this->coordinates);
}
}

View file

@ -4,10 +4,10 @@ declare(strict_types=1);
namespace Phpml\Clustering\KMeans;
use InvalidArgumentException;
use LogicException;
use Phpml\Clustering\KMeans;
use SplObjectStorage;
use LogicException;
use InvalidArgumentException;
class Space extends SplObjectStorage
{
@ -16,10 +16,7 @@ class Space extends SplObjectStorage
*/
protected $dimension;
/**
* @param $dimension
*/
public function __construct($dimension)
public function __construct(int $dimension)
{
if ($dimension < 1) {
throw new LogicException('a space dimension cannot be null or negative');
@ -28,12 +25,11 @@ class Space extends SplObjectStorage
$this->dimension = $dimension;
}
/**
* @return array
*/
public function toArray()
public function toArray(): array
{
$points = [];
/** @var Point $point */
foreach ($this as $point) {
$points[] = $point->toArray();
}
@ -42,33 +38,31 @@ class Space extends SplObjectStorage
}
/**
* @param array $coordinates
*
* @return Point
* @param mixed $label
*/
public function newPoint(array $coordinates)
public function newPoint(array $coordinates, $label = null): Point
{
if (count($coordinates) != $this->dimension) {
if (count($coordinates) !== $this->dimension) {
throw new LogicException('('.implode(',', $coordinates).') is not a point of this space');
}
return new Point($coordinates);
return new Point($coordinates, $label);
}
/**
* @param array $coordinates
* @param null $data
* @param mixed $label
* @param mixed $data
*/
public function addPoint(array $coordinates, $data = null)
public function addPoint(array $coordinates, $label = null, $data = null): void
{
$this->attach($this->newPoint($coordinates), $data);
$this->attach($this->newPoint($coordinates, $label), $data);
}
/**
* @param Point $point
* @param null $data
* @param object $point
* @param mixed $data
*/
public function attach($point, $data = null)
public function attach($point, $data = null): void
{
if (!$point instanceof Point) {
throw new InvalidArgumentException('can only attach points to spaces');
@ -77,10 +71,7 @@ class Space extends SplObjectStorage
parent::attach($point, $data);
}
/**
* @return int
*/
public function getDimension()
public function getDimension(): int
{
return $this->dimension;
}
@ -90,30 +81,30 @@ class Space extends SplObjectStorage
*/
public function getBoundaries()
{
if (!count($this)) {
if (count($this) === 0) {
return false;
}
$min = $this->newPoint(array_fill(0, $this->dimension, null));
$max = $this->newPoint(array_fill(0, $this->dimension, null));
/** @var self $point */
foreach ($this as $point) {
for ($n = 0; $n < $this->dimension; ++$n) {
($min[$n] > $point[$n] || $min[$n] === null) && $min[$n] = $point[$n];
($max[$n] < $point[$n] || $max[$n] === null) && $max[$n] = $point[$n];
if ($min[$n] === null || $min[$n] > $point[$n]) {
$min[$n] = $point[$n];
}
if ($max[$n] === null || $max[$n] < $point[$n]) {
$max[$n] = $point[$n];
}
}
}
return [$min, $max];
}
/**
* @param Point $min
* @param Point $max
*
* @return Point
*/
public function getRandomPoint(Point $min, Point $max)
public function getRandomPoint(Point $min, Point $max): Point
{
$point = $this->newPoint(array_fill(0, $this->dimension, null));
@ -125,12 +116,9 @@ class Space extends SplObjectStorage
}
/**
* @param int $clustersNumber
* @param int $initMethod
*
* @return array|Cluster[]
* @return Cluster[]
*/
public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM)
public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM): array
{
$clusters = $this->initializeClusters($clustersNumber, $initMethod);
@ -141,20 +129,19 @@ class Space extends SplObjectStorage
}
/**
* @param $clustersNumber
* @param $initMethod
*
* @return array|Cluster[]
* @return Cluster[]
*/
protected function initializeClusters(int $clustersNumber, int $initMethod)
protected function initializeClusters(int $clustersNumber, int $initMethod): array
{
switch ($initMethod) {
case KMeans::INIT_RANDOM:
$clusters = $this->initializeRandomClusters($clustersNumber);
break;
case KMeans::INIT_KMEANS_PLUS_PLUS:
$clusters = $this->initializeKMPPClusters($clustersNumber);
break;
default:
@ -167,11 +154,9 @@ class Space extends SplObjectStorage
}
/**
* @param $clusters
*
* @return bool
* @param Cluster[] $clusters
*/
protected function iterate($clusters)
protected function iterate(array $clusters): bool
{
$convergence = true;
@ -183,8 +168,8 @@ class Space extends SplObjectStorage
$closest = $point->getClosest($clusters);
if ($closest !== $cluster) {
isset($attach[$closest]) || $attach[$closest] = new SplObjectStorage();
isset($detach[$cluster]) || $detach[$cluster] = new SplObjectStorage();
$attach[$closest] ?? $attach[$closest] = new SplObjectStorage();
$detach[$cluster] ?? $detach[$cluster] = new SplObjectStorage();
$attach[$closest]->attach($point);
$detach[$cluster]->attach($point);
@ -194,10 +179,12 @@ class Space extends SplObjectStorage
}
}
/** @var Cluster $cluster */
foreach ($attach as $cluster) {
$cluster->attachAll($attach[$cluster]);
}
/** @var Cluster $cluster */
foreach ($detach as $cluster) {
$cluster->detachAll($detach[$cluster]);
}
@ -210,14 +197,58 @@ class Space extends SplObjectStorage
}
/**
* @param int $clustersNumber
*
* @return array
* @return Cluster[]
*/
private function initializeRandomClusters(int $clustersNumber)
protected function initializeKMPPClusters(int $clustersNumber): array
{
$clusters = [];
list($min, $max) = $this->getBoundaries();
$this->rewind();
/** @var Point $current */
$current = $this->current();
$clusters[] = new Cluster($this, $current->getCoordinates());
$distances = new SplObjectStorage();
for ($i = 1; $i < $clustersNumber; ++$i) {
$sum = 0;
/** @var Point $point */
foreach ($this as $point) {
$closest = $point->getClosest($clusters);
if ($closest === null) {
continue;
}
$distance = $point->getDistanceWith($closest);
$sum += $distances[$point] = $distance;
}
$sum = random_int(0, (int) $sum);
/** @var Point $point */
foreach ($this as $point) {
$sum -= $distances[$point];
if ($sum > 0) {
continue;
}
$clusters[] = new Cluster($this, $point->getCoordinates());
break;
}
}
return $clusters;
}
/**
* @return Cluster[]
*/
private function initializeRandomClusters(int $clustersNumber): array
{
$clusters = [];
[$min, $max] = $this->getBoundaries();
for ($n = 0; $n < $clustersNumber; ++$n) {
$clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
@ -225,39 +256,4 @@ class Space extends SplObjectStorage
return $clusters;
}
/**
* @param int $clustersNumber
*
* @return array
*/
protected function initializeKMPPClusters(int $clustersNumber)
{
$clusters = [];
$this->rewind();
$clusters[] = new Cluster($this, $this->current()->getCoordinates());
$distances = new SplObjectStorage();
for ($i = 1; $i < $clustersNumber; ++$i) {
$sum = 0;
foreach ($this as $point) {
$distance = $point->getDistanceWith($point->getClosest($clusters));
$sum += $distances[$point] = $distance;
}
$sum = random_int(0, (int) $sum);
foreach ($this as $point) {
if (($sum -= $distances[$point]) > 0) {
continue;
}
$clusters[] = new Cluster($this, $point->getCoordinates());
break;
}
}
return $clusters;
}
}

View file

@ -8,11 +8,7 @@ use Phpml\Dataset\Dataset;
class RandomSplit extends Split
{
/**
* @param Dataset $dataset
* @param float $testSize
*/
protected function splitDataset(Dataset $dataset, float $testSize)
protected function splitDataset(Dataset $dataset, float $testSize): void
{
$samples = $dataset->getSamples();
$labels = $dataset->getTargets();

View file

@ -29,63 +29,42 @@ abstract class Split
*/
protected $testLabels = [];
/**
* @param Dataset $dataset
* @param float $testSize
* @param int $seed
*
* @throws InvalidArgumentException
*/
public function __construct(Dataset $dataset, float $testSize = 0.3, int $seed = null)
public function __construct(Dataset $dataset, float $testSize = 0.3, ?int $seed = null)
{
if (0 >= $testSize || 1 <= $testSize) {
throw InvalidArgumentException::percentNotInRange('testSize');
if ($testSize <= 0 || $testSize >= 1) {
throw new InvalidArgumentException('testsize must be between 0.0 and 1.0');
}
$this->seedGenerator($seed);
$this->splitDataset($dataset, $testSize);
}
abstract protected function splitDataset(Dataset $dataset, float $testSize);
/**
* @return array
*/
public function getTrainSamples()
public function getTrainSamples(): array
{
return $this->trainSamples;
}
/**
* @return array
*/
public function getTestSamples()
public function getTestSamples(): array
{
return $this->testSamples;
}
/**
* @return array
*/
public function getTrainLabels()
public function getTrainLabels(): array
{
return $this->trainLabels;
}
/**
* @return array
*/
public function getTestLabels()
public function getTestLabels(): array
{
return $this->testLabels;
}
/**
* @param int|null $seed
*/
protected function seedGenerator(int $seed = null)
abstract protected function splitDataset(Dataset $dataset, float $testSize): void;
protected function seedGenerator(?int $seed = null): void
{
if (null === $seed) {
if ($seed === null) {
mt_srand();
} else {
mt_srand($seed);

View file

@ -9,11 +9,7 @@ use Phpml\Dataset\Dataset;
class StratifiedRandomSplit extends RandomSplit
{
/**
* @param Dataset $dataset
* @param float $testSize
*/
protected function splitDataset(Dataset $dataset, float $testSize)
protected function splitDataset(Dataset $dataset, float $testSize): void
{
$datasets = $this->splitByTarget($dataset);
@ -23,9 +19,7 @@ class StratifiedRandomSplit extends RandomSplit
}
/**
* @param Dataset $dataset
*
* @return Dataset[]|array
* @return Dataset[]
*/
private function splitByTarget(Dataset $dataset): array
{
@ -33,23 +27,16 @@ class StratifiedRandomSplit extends RandomSplit
$samples = $dataset->getSamples();
$uniqueTargets = array_unique($targets);
/** @var array $split */
$split = array_combine($uniqueTargets, array_fill(0, count($uniqueTargets), []));
foreach ($samples as $key => $sample) {
$split[$targets[$key]][] = $sample;
}
$datasets = $this->createDatasets($uniqueTargets, $split);
return $datasets;
return $this->createDatasets($uniqueTargets, $split);
}
/**
* @param array $uniqueTargets
* @param array $split
*
* @return array
*/
private function createDatasets(array $uniqueTargets, array $split): array
{
$datasets = [];

View file

@ -19,34 +19,44 @@ class ArrayDataset implements Dataset
protected $targets = [];
/**
* @param array $samples
* @param array $targets
*
* @throws InvalidArgumentException
*/
public function __construct(array $samples, array $targets)
{
if (count($samples) != count($targets)) {
throw InvalidArgumentException::arraySizeNotMatch();
if (count($samples) !== count($targets)) {
throw new InvalidArgumentException('Size of given arrays does not match');
}
$this->samples = $samples;
$this->targets = $targets;
}
/**
* @return array
*/
public function getSamples(): array
{
return $this->samples;
}
/**
* @return array
*/
public function getTargets(): array
{
return $this->targets;
}
/**
* @param int[] $columns
*/
public function removeColumns(array $columns): void
{
foreach ($this->samples as &$sample) {
$this->removeColumnsFromSample($sample, $columns);
}
}
private function removeColumnsFromSample(array &$sample, array $columns): void
{
foreach ($columns as $index) {
unset($sample[$index]);
}
$sample = array_values($sample);
}
}

View file

@ -11,36 +11,32 @@ class CsvDataset extends ArrayDataset
/**
* @var array
*/
protected $columnNames;
protected $columnNames = [];
/**
* @param string $filepath
* @param int $features
* @param bool $headingRow
* @param string $delimiter
*
* @throws FileException
*/
public function __construct(string $filepath, int $features, bool $headingRow = true, string $delimiter = ',')
public function __construct(string $filepath, int $features, bool $headingRow = true, string $delimiter = ',', int $maxLineLength = 0)
{
if (!file_exists($filepath)) {
throw FileException::missingFile(basename($filepath));
throw new FileException(sprintf('File "%s" missing.', basename($filepath)));
}
if (false === $handle = fopen($filepath, 'rb')) {
throw FileException::cantOpenFile(basename($filepath));
$handle = fopen($filepath, 'rb');
if ($handle === false) {
throw new FileException(sprintf('File "%s" can\'t be open.', basename($filepath)));
}
if ($headingRow) {
$data = fgetcsv($handle, 1000, $delimiter);
$this->columnNames = array_slice($data, 0, $features);
$data = fgetcsv($handle, $maxLineLength, $delimiter);
$this->columnNames = array_slice((array) $data, 0, $features);
} else {
$this->columnNames = range(0, $features - 1);
}
$samples = $targets = [];
while (($data = fgetcsv($handle, 1000, $delimiter)) !== false) {
$samples[] = array_slice($data, 0, $features);
while (($data = fgetcsv($handle, $maxLineLength, $delimiter)) !== false) {
$samples[] = array_slice((array) $data, 0, $features);
$targets[] = $data[$features];
}
@ -49,10 +45,7 @@ class CsvDataset extends ArrayDataset
parent::__construct($samples, $targets);
}
/**
* @return array
*/
public function getColumnNames()
public function getColumnNames(): array
{
return $this->columnNames;
}

View file

@ -6,13 +6,7 @@ namespace Phpml\Dataset;
interface Dataset
{
/**
* @return array
*/
public function getSamples(): array;
/**
* @return array
*/
public function getTargets(): array;
}

View file

@ -22,7 +22,7 @@ class GlassDataset extends CsvDataset
{
public function __construct()
{
$filepath = __DIR__.'/../../../../data/glass.csv';
$filepath = __DIR__.'/../../../data/glass.csv';
parent::__construct($filepath, 9, true);
}
}

View file

@ -16,7 +16,7 @@ class IrisDataset extends CsvDataset
{
public function __construct()
{
$filepath = __DIR__.'/../../../../data/iris.csv';
$filepath = __DIR__.'/../../../data/iris.csv';
parent::__construct($filepath, 4, true);
}
}

View file

@ -16,7 +16,7 @@ class WineDataset extends CsvDataset
{
public function __construct()
{
$filepath = __DIR__.'/../../../../data/wine.csv';
$filepath = __DIR__.'/../../../data/wine.csv';
parent::__construct($filepath, 13, true);
}
}

View file

@ -8,39 +8,28 @@ use Phpml\Exception\DatasetException;
class FilesDataset extends ArrayDataset
{
/**
* @param string $rootPath
*
* @throws DatasetException
*/
public function __construct(string $rootPath)
{
if (!is_dir($rootPath)) {
throw DatasetException::missingFolder($rootPath);
throw new DatasetException(sprintf('Dataset root folder "%s" missing.', $rootPath));
}
$this->scanRootPath($rootPath);
}
/**
* @param string $rootPath
*/
private function scanRootPath(string $rootPath)
private function scanRootPath(string $rootPath): void
{
foreach (glob($rootPath.DIRECTORY_SEPARATOR.'*', GLOB_ONLYDIR) as $dir) {
$this->scanDir($dir);
}
}
/**
* @param string $dir
*/
private function scanDir(string $dir)
private function scanDir(string $dir): void
{
$target = basename($dir);
foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
$this->samples[] = [file_get_contents($file)];
$this->samples[] = file_get_contents($file);
$this->targets[] = $target;
}
}

View file

@ -0,0 +1,101 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\InvalidArgumentException;
/**
* MNIST dataset: http://yann.lecun.com/exdb/mnist/
* original mnist dataset reader: https://github.com/AndrewCarterUK/mnist-neural-network-plain-php
*/
final class MnistDataset extends ArrayDataset
{
    /** Expected value of the first header field of an image file. */
    private const MAGIC_IMAGE = 0x00000803;

    /** Expected value of the first header field of a label file. */
    private const MAGIC_LABEL = 0x00000801;

    /** Only images of exactly this height/width are accepted. */
    private const IMAGE_ROWS = 28;

    private const IMAGE_COLS = 28;

    /** Size in bytes of the image-file header (four 32-bit fields). */
    private const IMAGE_HEADER_BYTES = 16;

    /** Size in bytes of the label-file header (two 32-bit fields). */
    private const LABEL_HEADER_BYTES = 8;

    /**
     * Loads the images as samples and the labels as targets.
     *
     * @param string $imagePath path of the MNIST image file
     * @param string $labelPath path of the MNIST label file
     *
     * @throws InvalidArgumentException when a file cannot be opened, has an unexpected
     *                                  header, is truncated, or the two files do not
     *                                  hold the same number of entries
     */
    public function __construct(string $imagePath, string $labelPath)
    {
        $this->samples = $this->readImages($imagePath);
        $this->targets = $this->readLabels($labelPath);

        if (count($this->samples) !== count($this->targets)) {
            throw new InvalidArgumentException('Must have the same number of images and labels');
        }
    }

    /**
     * Reads every image of the file as a flat list of pixel values scaled to the 0..1 range.
     *
     * @return array[] one entry per image, each holding rows * cols pixel values
     *
     * @throws InvalidArgumentException
     */
    private function readImages(string $imagePath): array
    {
        $stream = fopen($imagePath, 'rb');
        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$imagePath);
        }

        $images = [];

        try {
            // stream_get_contents() keeps reading until the requested length or EOF,
            // so a shorter result always means the file is truncated.
            $header = stream_get_contents($stream, self::IMAGE_HEADER_BYTES);
            if ($header === false || strlen($header) !== self::IMAGE_HEADER_BYTES) {
                // A header too short to unpack was already reported as a bad magic number.
                throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
            }

            // Four unsigned 32-bit big-endian integers.
            $fields = unpack('Nmagic/Nsize/Nrows/Ncols', $header);

            if ($fields['magic'] !== self::MAGIC_IMAGE) {
                throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
            }

            if ($fields['rows'] !== self::IMAGE_ROWS) {
                throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath);
            }

            if ($fields['cols'] !== self::IMAGE_COLS) {
                throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath);
            }

            $imageLength = $fields['rows'] * $fields['cols'];

            for ($i = 0; $i < $fields['size']; ++$i) {
                $imageBytes = stream_get_contents($stream, $imageLength);
                if ($imageBytes === false || strlen($imageBytes) !== $imageLength) {
                    // Refuse to build a sample with fewer pixels than the header promises.
                    throw new InvalidArgumentException('Unexpected end of image data: '.$imagePath);
                }

                // Convert to float between 0 and 1
                $images[] = array_map(static function ($byte) {
                    return $byte / 255;
                }, array_values(unpack('C*', $imageBytes)));
            }
        } finally {
            fclose($stream);
        }

        return $images;
    }

    /**
     * Reads every label of the file as an integer (one unsigned byte per label).
     *
     * @return int[]
     *
     * @throws InvalidArgumentException
     */
    private function readLabels(string $labelPath): array
    {
        $stream = fopen($labelPath, 'rb');
        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$labelPath);
        }

        try {
            $header = stream_get_contents($stream, self::LABEL_HEADER_BYTES);
            if ($header === false || strlen($header) !== self::LABEL_HEADER_BYTES) {
                throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
            }

            // Two unsigned 32-bit big-endian integers.
            $fields = unpack('Nmagic/Nsize', $header);

            if ($fields['magic'] !== self::MAGIC_LABEL) {
                throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
            }

            if ($fields['size'] === 0) {
                // Nothing to read; a zero-length read is an error on recent PHP versions.
                return [];
            }

            $labels = stream_get_contents($stream, $fields['size']);
            if ($labels === false || strlen($labels) !== $fields['size']) {
                throw new InvalidArgumentException('Unexpected end of label data: '.$labelPath);
            }
        } finally {
            fclose($stream);
        }

        return array_values(unpack('C*', $labels));
    }
}

View file

@ -0,0 +1,131 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\DatasetException;
use Phpml\Exception\FileException;
/**
 * Dataset loaded from a text file with one sample per line:
 * a numeric target followed by space/tab separated "index:value" features.
 * Feature indexes in the file start at 1; text after a "#" on a line is ignored.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * @throws FileException    when the file is missing or cannot be opened
     * @throws DatasetException when a line holds an invalid target, index or value
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Parses the whole file and pads every sample to the width of the widest one.
     *
     * @return array[] [samples, targets]
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        try {
            while (($line = fgets($handle)) !== false) {
                [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            // Release the handle even when a malformed line makes processLine() throw.
            fclose($handle);
        }

        // Earlier lines may have been parsed before a larger index was seen.
        foreach ($samples as &$sample) {
            $sample = array_pad($sample, $maxIndex + 1, 0);
        }
        unset($sample); // drop the reference left behind by the by-reference loop

        return [$samples, $targets];
    }

    /**
     * @return resource
     *
     * @throws FileException
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw new FileException(sprintf('File "%s" missing.', basename($filePath)));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw new FileException(sprintf('File "%s" can\'t be open.', basename($filePath)));
        }

        return $handle;
    }

    /**
     * Converts one line into a zero-filled sample with the listed features set.
     *
     * @param int $maxIndex largest zero-based feature index seen so far
     *
     * @return array [sample, target, updated maxIndex]
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $columnCount = count($columns);
        for ($i = 1; $i < $columnCount; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);
            if ($index > $maxIndex) {
                // Grow the sample so the new index is addressable.
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the trailing "#" comment and whitespace, then splits the line on spaces
     * (tabs count as spaces).
     *
     * @return string[]
     */
    private static function parseLine(string $line): array
    {
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * @throws DatasetException when the column is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw new DatasetException(sprintf('Invalid target "%s".', $column));
        }

        return (float) $column;
    }

    /**
     * Splits an "index:value" column.
     *
     * @return array [zero-based index, value]
     *
     * @throws DatasetException when the column has no ":" separator
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw new DatasetException(sprintf('Invalid value "%s".', $column));
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Validates a one-based index from the file and returns it zero-based.
     *
     * @throws DatasetException when the index is not made of digits only or is below 1
     */
    private static function parseFeatureIndex(string $index): int
    {
        // A digits-only string is always numeric, so ctype_digit() alone covers both checks.
        if (!ctype_digit($index) || (int) $index < 1) {
            throw new DatasetException(sprintf('Invalid index "%s".', $index));
        }

        return (int) $index - 1;
    }

    /**
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw new DatasetException(sprintf('Invalid value "%s".', $value));
        }

        return (float) $value;
    }
}

View file

@ -47,14 +47,12 @@ abstract class EigenTransformerBase
* Calculates eigenValues and eigenVectors of the given matrix. Returns
* top eigenVectors along with the largest eigenValues. The total explained variance
* of these eigenVectors will be no less than desired $totalVariance value
*
* @param array $matrix
*/
protected function eigenDecomposition(array $matrix)
protected function eigenDecomposition(array $matrix): void
{
$eig = new EigenvalueDecomposition($matrix);
$eigVals = $eig->getRealEigenvalues();
$eigVects= $eig->getEigenvectors();
$eigVects = $eig->getEigenvectors();
$totalEigVal = array_sum($eigVals);
// Sort eigenvalues in descending order
@ -85,12 +83,8 @@ abstract class EigenTransformerBase
/**
* Returns the reduced data
*
* @param array $data
*
* @return array
*/
protected function reduce(array $data)
protected function reduce(array $data): array
{
$m1 = new Matrix($data);
$m2 = new Matrix($this->eigVectors);

View file

@ -4,16 +4,22 @@ declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Closure;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Distance\Euclidean;
use Phpml\Math\Distance\Manhattan;
use Phpml\Math\Matrix;
class KernelPCA extends PCA
{
const KERNEL_RBF = 1;
const KERNEL_SIGMOID = 2;
const KERNEL_LAPLACIAN = 3;
const KERNEL_LINEAR = 4;
public const KERNEL_RBF = 1;
public const KERNEL_SIGMOID = 2;
public const KERNEL_LAPLACIAN = 3;
public const KERNEL_LINEAR = 4;
/**
* Selected kernel function
@ -25,7 +31,7 @@ class KernelPCA extends PCA
/**
* Gamma value used by the kernel
*
* @var float
* @var float|null
*/
protected $gamma;
@ -34,7 +40,7 @@ class KernelPCA extends PCA
*
* @var array
*/
protected $data;
protected $data = [];
/**
* Kernel principal component analysis (KernelPCA) is an extension of PCA using
@ -44,18 +50,16 @@ class KernelPCA extends PCA
* will initialize the algorithm with an RBF kernel having the gamma parameter as 15,0. <br>
* This transformation will return the same number of rows with only <i>2</i> columns.
*
* @param int $kernel
* @param float $totalVariance Total variance to be preserved if numFeatures is not given
* @param int $numFeatures Number of columns to be returned
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
* @param int $numFeatures Number of columns to be returned
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function __construct(int $kernel = self::KERNEL_RBF, $totalVariance = null, $numFeatures = null, $gamma = null)
public function __construct(int $kernel = self::KERNEL_RBF, ?float $totalVariance = null, ?int $numFeatures = null, ?float $gamma = null)
{
$availableKernels = [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR];
if (!in_array($kernel, $availableKernels)) {
throw new \Exception("KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian");
if (!in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR], true)) {
throw new InvalidArgumentException('KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian');
}
parent::__construct($totalVariance, $numFeatures);
@ -69,12 +73,8 @@ class KernelPCA extends PCA
* of this data while preserving $totalVariance or $numFeatures. <br>
* $data is an n-by-m matrix and returned array is
* n-by-k matrix where k <= m
*
* @param array $data
*
* @return array
*/
public function fit(array $data)
public function fit(array $data): array
{
$numRows = count($data);
$this->data = $data;
@ -93,16 +93,33 @@ class KernelPCA extends PCA
return Matrix::transposeArray($this->eigVectors);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the variables obtained during the last run of <code>fit</code>.
*
* @throws InvalidArgumentException
* @throws InvalidOperationException
*/
public function transform(array $sample): array
{
if (!$this->fit) {
throw new InvalidOperationException('KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first');
}
if (is_array($sample[0])) {
throw new InvalidArgumentException('KernelPCA::transform() accepts only one-dimensional arrays');
}
$pairs = $this->getDistancePairs($sample);
return $this->projectSample($pairs);
}
/**
* Calculates similarity matrix by use of selected kernel function<br>
* An n-by-m matrix is given and an n-by-n matrix is returned
*
* @param array $data
* @param int $numRows
*
* @return array
*/
protected function calculateKernelMatrix(array $data, int $numRows)
protected function calculateKernelMatrix(array $data, int $numRows): array
{
$kernelFunc = $this->getKernel();
@ -125,15 +142,10 @@ class KernelPCA extends PCA
* conversion:
*
* K' = K - N.K - K.N + N.K.N where N is n-by-n matrix filled with 1/n
*
* @param array $matrix
* @param int $n
*
* @return array
*/
protected function centerMatrix(array $matrix, int $n)
protected function centerMatrix(array $matrix, int $n): array
{
$N = array_fill(0, $n, array_fill(0, $n, 1.0/$n));
$N = array_fill(0, $n, array_fill(0, $n, 1.0 / $n));
$N = new Matrix($N, false);
$K = new Matrix($matrix, false);
@ -145,19 +157,17 @@ class KernelPCA extends PCA
$N_K_N = $N->multiply($K_N);
return $K->subtract($N_K)
->subtract($K_N)
->add($N_K_N)
->toArray();
->subtract($K_N)
->add($N_K_N)
->toArray();
}
/**
* Returns the callable kernel function
*
* @return \Closure
*
* @throws \Exception
*/
protected function getKernel()
protected function getKernel(): Closure
{
switch ($this->kernel) {
case self::KERNEL_LINEAR:
@ -168,6 +178,7 @@ class KernelPCA extends PCA
case self::KERNEL_RBF:
// k(x,y)=exp(-γ.|x-y|²) where |..| is Euclidean distance (squared below)
$dist = new Euclidean();
return function ($x, $y) use ($dist) {
return exp(-$this->gamma * $dist->sqDistance($x, $y));
};
@ -176,27 +187,25 @@ class KernelPCA extends PCA
// k(x,y)=tanh(γ.xT.y+c0) where c0=1
return function ($x, $y) {
$res = Matrix::dot($x, $y)[0] + 1.0;
return tanh($this->gamma * $res);
return tanh((float) $this->gamma * $res);
};
case self::KERNEL_LAPLACIAN:
// k(x,y)=exp(-γ.|x-y|) where |..| is Manhattan distance
$dist = new Manhattan();
return function ($x, $y) use ($dist) {
return exp(-$this->gamma * $dist->distance($x, $y));
};
default:
throw new \Exception(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
// Not reached
throw new InvalidArgumentException(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
}
}
/**
* @param array $sample
*
* @return array
*/
protected function getDistancePairs(array $sample)
protected function getDistancePairs(array $sample): array
{
$kernel = $this->getKernel();
@ -208,12 +217,7 @@ class KernelPCA extends PCA
return $pairs;
}
/**
* @param array $pairs
*
* @return array
*/
protected function projectSample(array $pairs)
protected function projectSample(array $pairs): array
{
// Normalize eigenvectors by eig = eigVectors / eigValues
$func = function ($eigVal, $eigVect) {
@ -227,29 +231,4 @@ class KernelPCA extends PCA
// return k.dot(eig)
return Matrix::dot($pairs, $eig);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the variables obtained during the last run of <code>fit</code>.
*
* @param array $sample
*
* @return array
*
* @throws \Exception
*/
public function transform(array $sample)
{
if (!$this->fit) {
throw new \Exception("KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first");
}
if (is_array($sample[0])) {
throw new \Exception("KernelPCA::transform() accepts only one-dimensional arrays");
}
$pairs = $this->getDistancePairs($sample);
return $this->projectSample($pairs);
}
}

View file

@ -4,6 +4,8 @@ declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Matrix;
class LDA extends EigenTransformerBase
@ -16,22 +18,22 @@ class LDA extends EigenTransformerBase
/**
* @var array
*/
public $labels;
public $labels = [];
/**
* @var array
*/
public $means;
public $means = [];
/**
* @var array
*/
public $counts;
public $counts = [];
/**
* @var float[]
*/
public $overallMean;
public $overallMean = [];
/**
* Linear Discriminant Analysis (LDA) is used to reduce the dimensionality
@ -43,25 +45,28 @@ class LDA extends EigenTransformerBase
* or numFeatures (number of features in the dataset) to be preserved.
*
* @param float|null $totalVariance Total explained variance to be preserved
* @param int|null $numFeatures Number of features to be preserved
* @param int|null $numFeatures Number of features to be preserved
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function __construct($totalVariance = null, $numFeatures = null)
public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
{
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
throw new \Exception("Total variance can be a value between 0.1 and 0.99");
throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
}
if ($numFeatures !== null && $numFeatures <= 0) {
throw new \Exception("Number of features to be preserved should be greater than 0");
throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
}
if ($totalVariance !== null && $numFeatures !== null) {
throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
if (($totalVariance !== null) === ($numFeatures !== null)) {
throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
}
if ($numFeatures !== null) {
$this->numFeatures = $numFeatures;
}
if ($totalVariance !== null) {
$this->totalVariance = $totalVariance;
}
@ -69,16 +74,11 @@ class LDA extends EigenTransformerBase
/**
* Trains the algorithm to transform the given data to a lower dimensional space.
*
* @param array $data
* @param array $classes
*
* @return array
*/
public function fit(array $data, array $classes) : array
public function fit(array $data, array $classes): array
{
$this->labels = $this->getLabels($classes);
$this->means = $this->calculateMeans($data, $classes);
$this->means = $this->calculateMeans($data, $classes);
$sW = $this->calculateClassVar($data, $classes);
$sB = $this->calculateClassCov();
@ -91,12 +91,27 @@ class LDA extends EigenTransformerBase
return $this->reduce($data);
}
/**
 * Projects a sample, or a list of samples, into the reduced space using the
 * eigenvectors computed by the most recent call to <code>fit</code>.
 *
 * A flat (one-dimensional) sample is wrapped into a single-row matrix first,
 * so the result is always produced by reduce() from a list of rows.
 *
 * @throws InvalidOperationException when called before fit()
 */
public function transform(array $sample): array
{
    if (!$this->fit) {
        throw new InvalidOperationException('LDA has not been fitted with respect to original dataset, please run LDA::fit() first');
    }

    // Only the first element is inspected to tell a single vector from a matrix.
    $rows = is_array($sample[0]) ? $sample : [$sample];

    return $this->reduce($rows);
}
/**
* Returns unique labels in the dataset
*
* @param array $classes
*
* @return array
*/
protected function getLabels(array $classes): array
{
@ -105,29 +120,24 @@ class LDA extends EigenTransformerBase
return array_keys($counts);
}
/**
* Calculates mean of each column for each class and returns
* n by m matrix where n is number of labels and m is number of columns
*
* @param array $data
* @param array $classes
*
* @return array
*/
protected function calculateMeans(array $data, array $classes) : array
protected function calculateMeans(array $data, array $classes): array
{
$means = [];
$counts= [];
$counts = [];
$overallMean = array_fill(0, count($data[0]), 0.0);
foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels);
$label = array_search($classes[$index], $this->labels, true);
foreach ($row as $col => $val) {
if (!isset($means[$label][$col])) {
$means[$label][$col] = 0.0;
}
$means[$label][$col] += $val;
$overallMean[$col] += $val;
}
@ -156,25 +166,19 @@ class LDA extends EigenTransformerBase
return $means;
}
/**
* Returns in-class scatter matrix for each class, which
* is a n by m matrix where n is number of classes and
* m is number of columns
*
* @param array $data
* @param array $classes
*
* @return Matrix
*/
protected function calculateClassVar($data, $classes)
protected function calculateClassVar(array $data, array $classes): Matrix
{
// s is an n (number of classes) by m (number of column) matrix
$s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0));
$sW = new Matrix($s, false);
foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels);
$label = array_search($classes[$index], $this->labels, true);
$means = $this->means[$label];
$row = $this->calculateVar($row, $means);
@ -189,10 +193,8 @@ class LDA extends EigenTransformerBase
* Returns between-class scatter matrix for each class, which
* is an n by m matrix where n is number of classes and
* m is number of columns
*
* @return Matrix
*/
protected function calculateClassCov()
protected function calculateClassCov(): Matrix
{
// s is an n (number of classes) by m (number of column) matrix
$s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0));
@ -209,13 +211,8 @@ class LDA extends EigenTransformerBase
/**
* Returns the result of the calculation (x - m)T.(x - m)
*
* @param array $row
* @param array $means
*
* @return Matrix
*/
protected function calculateVar(array $row, array $means)
protected function calculateVar(array $row, array $means): Matrix
{
$x = new Matrix($row, false);
$m = new Matrix($means, false);
@ -223,27 +220,4 @@ class LDA extends EigenTransformerBase
return $diff->transpose()->multiply($diff);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.
*
* @param array $sample
*
* @return array
*
* @throws \Exception
*/
public function transform(array $sample)
{
if (!$this->fit) {
throw new \Exception("LDA has not been fitted with respect to original dataset, please run LDA::fit() first");
}
if (!is_array($sample[0])) {
$sample = [$sample];
}
return $this->reduce($sample);
}
}

View file

@ -4,6 +4,8 @@ declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Statistic\Covariance;
use Phpml\Math\Statistic\Mean;
@ -28,25 +30,28 @@ class PCA extends EigenTransformerBase
* within the data. It is a lossy data compression technique.<br>
*
* @param float $totalVariance Total explained variance to be preserved
* @param int $numFeatures Number of features to be preserved
* @param int $numFeatures Number of features to be preserved
*
* @throws \Exception
* @throws InvalidArgumentException
*/
public function __construct($totalVariance = null, $numFeatures = null)
public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
{
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
throw new \Exception("Total variance can be a value between 0.1 and 0.99");
throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
}
if ($numFeatures !== null && $numFeatures <= 0) {
throw new \Exception("Number of features to be preserved should be greater than 0");
throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
}
if ($totalVariance !== null && $numFeatures !== null) {
throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
if (($totalVariance !== null) === ($numFeatures !== null)) {
throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
}
if ($numFeatures !== null) {
$this->numFeatures = $numFeatures;
}
if ($totalVariance !== null) {
$this->totalVariance = $totalVariance;
}
@ -57,12 +62,8 @@ class PCA extends EigenTransformerBase
* of this data while preserving $totalVariance or $numFeatures. <br>
* $data is an n-by-m matrix and returned array is
* n-by-k matrix where k <= m
*
* @param array $data
*
* @return array
*/
public function fit(array $data)
public function fit(array $data): array
{
$n = count($data[0]);
@ -78,10 +79,27 @@ class PCA extends EigenTransformerBase
}
/**
* @param array $data
* @param int $n
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.
*
* @throws InvalidOperationException
*/
protected function calculateMeans(array $data, int $n)
public function transform(array $sample): array
{
    if (!$this->fit) {
        throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first');
    }

    // Only the first element is inspected to tell a single vector from a matrix;
    // a flat vector becomes a one-row matrix.
    $rows = is_array($sample[0]) ? $sample : [$sample];

    // Subtract the stored per-dimension means before projecting.
    $centered = $this->normalize($rows, count($rows[0]));

    return $this->reduce($centered);
}
protected function calculateMeans(array $data, int $n): void
{
// Calculate means for each dimension
$this->means = [];
@ -94,20 +112,15 @@ class PCA extends EigenTransformerBase
/**
* Normalization of the data includes subtracting mean from
* each dimension therefore dimensions will be centered to zero
*
* @param array $data
* @param int $n
*
* @return array
*/
protected function normalize(array $data, int $n)
protected function normalize(array $data, int $n): array
{
if (empty($this->means)) {
if (count($this->means) === 0) {
$this->calculateMeans($data, $n);
}
// Normalize data
foreach ($data as $i => $row) {
foreach (array_keys($data) as $i) {
for ($k = 0; $k < $n; ++$k) {
$data[$i][$k] -= $this->means[$k];
}
@ -115,29 +128,4 @@ class PCA extends EigenTransformerBase
return $data;
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.
*
* @param array $sample
*
* @return array
*
* @throws \Exception
*/
public function transform(array $sample)
{
if (!$this->fit) {
throw new \Exception("PCA has not been fitted with respect to original dataset, please run PCA::fit() first");
}
if (!is_array($sample[0])) {
$sample = [$sample];
}
$sample = $this->normalize($sample, count($sample[0]));
return $this->reduce($sample);
}
}

View file

@ -6,15 +6,9 @@ namespace Phpml;
interface Estimator
{
/**
* @param array $samples
* @param array $targets
*/
public function train(array $samples, array $targets);
public function train(array $samples, array $targets): void;
/**
* @param array $samples
*
* @return mixed
*/
public function predict(array $samples);

View file

@ -4,15 +4,8 @@ declare(strict_types=1);
namespace Phpml\Exception;
class DatasetException extends \Exception
use Exception;
class DatasetException extends Exception
{
/**
* @param string $path
*
* @return DatasetException
*/
public static function missingFolder(string $path)
{
return new self(sprintf('Dataset root folder "%s" missing.', $path));
}
}

View file

@ -4,35 +4,8 @@ declare(strict_types=1);
namespace Phpml\Exception;
class FileException extends \Exception
use Exception;
class FileException extends Exception
{
/**
* @param string $filepath
*
* @return FileException
*/
public static function missingFile(string $filepath)
{
return new self(sprintf('File "%s" missing.', $filepath));
}
/**
* @param string $filepath
*
* @return FileException
*/
public static function cantOpenFile(string $filepath)
{
return new self(sprintf('File "%s" can\'t be open.', $filepath));
}
/**
* @param string $filepath
*
* @return FileException
*/
public static function cantSaveFile(string $filepath)
{
return new self(sprintf('File "%s" can\'t be saved.', $filepath));
}
}

View file

@ -4,112 +4,8 @@ declare(strict_types=1);
namespace Phpml\Exception;
class InvalidArgumentException extends \Exception
use Exception;
class InvalidArgumentException extends Exception
{
/**
* @return InvalidArgumentException
*/
public static function arraySizeNotMatch()
{
return new self('Size of given arrays does not match');
}
/**
* @param $name
*
* @return InvalidArgumentException
*/
public static function percentNotInRange($name)
{
return new self(sprintf('%s must be between 0.0 and 1.0', $name));
}
/**
* @return InvalidArgumentException
*/
public static function arrayCantBeEmpty()
{
return new self('The array has zero elements');
}
/**
* @param int $minimumSize
*
* @return InvalidArgumentException
*/
public static function arraySizeToSmall($minimumSize = 2)
{
return new self(sprintf('The array must have at least %s elements', $minimumSize));
}
/**
* @return InvalidArgumentException
*/
public static function matrixDimensionsDidNotMatch()
{
return new self('Matrix dimensions did not match');
}
/**
* @return InvalidArgumentException
*/
public static function inconsistentMatrixSupplied()
{
return new self('Inconsistent matrix supplied');
}
/**
* @return InvalidArgumentException
*/
public static function invalidClustersNumber()
{
return new self('Invalid clusters number');
}
/**
* @return InvalidArgumentException
*/
public static function invalidTarget($target)
{
return new self('Target with value ' . $target . ' is not part of the accepted classes');
}
/**
* @param string $language
*
* @return InvalidArgumentException
*/
public static function invalidStopWordsLanguage(string $language)
{
return new self(sprintf('Can\'t find %s language for StopWords', $language));
}
/**
* @return InvalidArgumentException
*/
public static function invalidLayerNodeClass()
{
return new self('Layer node class must implement Node interface');
}
/**
* @return InvalidArgumentException
*/
public static function invalidLayersNumber()
{
return new self('Provide at least 1 hidden layer');
}
/**
* @return InvalidArgumentException
*/
public static function invalidClassesNumber()
{
return new self('Provide at least 2 different classes');
}
public static function inconsistentClasses()
{
return new self('The provided classes don\'t match the classes provided in the constructor');
}
}

View file

@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
/**
* Signals that an operation was requested on an object that is not in a state
* to perform it, e.g. calling transform() on a reducer before fit().
*
* Adds no members of its own; callers distinguish it from other failures
* purely by its type.
*/
class InvalidOperationException extends Exception
{
}

View file

@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
/**
* Marker exception type with no members of its own.
*
* NOTE(review): presumably raised when running an external libsvm command
* fails - the throwing code is not visible here, confirm against callers.
*/
class LibsvmCommandException extends Exception
{
}

View file

@ -4,29 +4,8 @@ declare(strict_types=1);
namespace Phpml\Exception;
class MatrixException extends \Exception
use Exception;
class MatrixException extends Exception
{
/**
* @return MatrixException
*/
public static function notSquareMatrix()
{
return new self('Matrix is not square matrix');
}
/**
* @return MatrixException
*/
public static function columnOutOfRange()
{
return new self('Column out of range');
}
/**
* @return MatrixException
*/
public static function singularMatrix()
{
return new self('Matrix is singular');
}
}

View file

@ -4,13 +4,8 @@ declare(strict_types=1);
namespace Phpml\Exception;
class NormalizerException extends \Exception
use Exception;
class NormalizerException extends Exception
{
/**
* @return NormalizerException
*/
public static function unknownNorm()
{
return new self('Unknown norm supplied.');
}
}

View file

@ -4,25 +4,8 @@ declare(strict_types=1);
namespace Phpml\Exception;
class SerializeException extends \Exception
{
/**
* @param string $filepath
*
* @return SerializeException
*/
public static function cantUnserialize(string $filepath)
{
return new self(sprintf('"%s" can not be unserialized.', $filepath));
}
use Exception;
/**
* @param string $classname
*
* @return SerializeException
*/
public static function cantSerialize(string $classname)
{
return new self(sprintf('Class "%s" can not be serialized.', $classname));
}
class SerializeException extends Exception
{
}

View file

@ -11,39 +11,24 @@ class StopWords
/**
* @var array
*/
protected $stopWords;
protected $stopWords = [];
/**
* @param array $stopWords
*/
public function __construct(array $stopWords)
{
$this->stopWords = array_fill_keys($stopWords, true);
}
/**
* @param string $token
*
* @return bool
*/
public function isStopWord(string $token): bool
{
return isset($this->stopWords[$token]);
}
/**
* @param string $language
*
* @return StopWords
*
* @throws InvalidArgumentException
*/
public static function factory($language = 'English'): StopWords
public static function factory(string $language = 'English'): self
{
$className = __NAMESPACE__."\\StopWords\\$language";
$className = __NAMESPACE__."\\StopWords\\${language}";
if (!class_exists($className)) {
throw InvalidArgumentException::invalidStopWordsLanguage($language);
throw new InvalidArgumentException(sprintf('Can\'t find "%s" language for StopWords', $language));
}
return new $className();

View file

@ -11,35 +11,26 @@ class TfIdfTransformer implements Transformer
/**
* @var array
*/
private $idf;
private $idf = [];
/**
* @param array $samples
*/
public function __construct(array $samples = null)
public function __construct(array $samples = [])
{
if ($samples) {
if (count($samples) > 0) {
$this->fit($samples);
}
}
/**
* @param array $samples
*/
public function fit(array $samples)
public function fit(array $samples, ?array $targets = null): void
{
$this->countTokensFrequency($samples);
$count = count($samples);
foreach ($this->idf as &$value) {
$value = log((float)($count / $value), 10.0);
$value = log((float) ($count / $value), 10.0);
}
}
/**
* @param array $samples
*/
public function transform(array &$samples)
public function transform(array &$samples): void
{
foreach ($samples as &$sample) {
foreach ($sample as $index => &$feature) {
@ -48,10 +39,7 @@ class TfIdfTransformer implements Transformer
}
}
/**
* @param array $samples
*/
private function countTokensFrequency(array $samples)
private function countTokensFrequency(array $samples): void
{
$this->idf = array_fill_keys(array_keys($samples[0]), 0);

View file

@ -15,7 +15,7 @@ class TokenCountVectorizer implements Transformer
private $tokenizer;
/**
* @var StopWords
* @var StopWords|null
*/
private $stopWords;
@ -27,62 +27,42 @@ class TokenCountVectorizer implements Transformer
/**
* @var array
*/
private $vocabulary;
private $vocabulary = [];
/**
* @var array
*/
private $frequencies;
private $frequencies = [];
/**
* @param Tokenizer $tokenizer
* @param StopWords $stopWords
* @param float $minDF
*/
public function __construct(Tokenizer $tokenizer, StopWords $stopWords = null, float $minDF = 0.0)
public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0)
{
$this->tokenizer = $tokenizer;
$this->stopWords = $stopWords;
$this->minDF = $minDF;
$this->vocabulary = [];
$this->frequencies = [];
}
/**
* @param array $samples
*/
public function fit(array $samples)
public function fit(array $samples, ?array $targets = null): void
{
$this->buildVocabulary($samples);
}
/**
* @param array $samples
*/
public function transform(array &$samples)
public function transform(array &$samples): void
{
foreach ($samples as &$sample) {
array_walk($samples, function (string &$sample): void {
$this->transformSample($sample);
}
});
$this->checkDocumentFrequency($samples);
}
/**
* @return array
*/
public function getVocabulary()
public function getVocabulary(): array
{
return array_flip($this->vocabulary);
}
/**
* @param array $samples
*/
private function buildVocabulary(array &$samples)
private function buildVocabulary(array &$samples): void
{
foreach ($samples as $index => $sample) {
foreach ($samples as $sample) {
$tokens = $this->tokenizer->tokenize($sample);
foreach ($tokens as $token) {
$this->addTokenToVocabulary($token);
@ -90,17 +70,14 @@ class TokenCountVectorizer implements Transformer
}
}
/**
* @param string $sample
*/
private function transformSample(string &$sample)
private function transformSample(string &$sample): void
{
$counts = [];
$tokens = $this->tokenizer->tokenize($sample);
foreach ($tokens as $token) {
$index = $this->getTokenIndex($token);
if (false !== $index) {
if ($index !== false) {
$this->updateFrequency($token);
if (!isset($counts[$index])) {
$counts[$index] = 0;
@ -122,8 +99,6 @@ class TokenCountVectorizer implements Transformer
}
/**
* @param string $token
*
* @return int|bool
*/
private function getTokenIndex(string $token)
@ -135,10 +110,7 @@ class TokenCountVectorizer implements Transformer
return $this->vocabulary[$token] ?? false;
}
/**
* @param string $token
*/
private function addTokenToVocabulary(string $token)
private function addTokenToVocabulary(string $token): void
{
if ($this->isStopWord($token)) {
return;
@ -149,20 +121,12 @@ class TokenCountVectorizer implements Transformer
}
}
/**
* @param string $token
*
* @return bool
*/
private function isStopWord(string $token): bool
{
return $this->stopWords && $this->stopWords->isStopWord($token);
return $this->stopWords !== null && $this->stopWords->isStopWord($token);
}
/**
* @param string $token
*/
private function updateFrequency(string $token)
private function updateFrequency(string $token): void
{
if (!isset($this->frequencies[$token])) {
$this->frequencies[$token] = 0;
@ -171,10 +135,7 @@ class TokenCountVectorizer implements Transformer
++$this->frequencies[$token];
}
/**
* @param array $samples
*/
private function checkDocumentFrequency(array &$samples)
private function checkDocumentFrequency(array &$samples): void
{
if ($this->minDF > 0) {
$beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
@ -184,28 +145,19 @@ class TokenCountVectorizer implements Transformer
}
}
/**
* @param array $sample
* @param array $beyondMinimum
*/
private function resetBeyondMinimum(array &$sample, array $beyondMinimum)
private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void
{
foreach ($beyondMinimum as $index) {
$sample[$index] = 0;
}
}
/**
* @param int $samplesCount
*
* @return array
*/
private function getBeyondMinimumIndexes(int $samplesCount)
private function getBeyondMinimumIndexes(int $samplesCount): array
{
$indexes = [];
foreach ($this->frequencies as $token => $frequency) {
if (($frequency / $samplesCount) < $this->minDF) {
$indexes[] = $this->getTokenIndex($token);
$indexes[] = $this->getTokenIndex((string) $token);
}
}

View file

@ -0,0 +1,10 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
/**
* Contract for functions that rate the features of a dataset against its targets.
*/
interface ScoringFunction
{
/**
* Computes the scores for the given samples and their targets.
*
* @param array $samples rows of feature values
* @param array $targets target value for each row of $samples
*
* @return array the computed scores; NOTE(review): implementations appear to
*               return one score per feature column, keyed by column - confirm
*/
public function score(array $samples, array $targets): array;
}

View file

@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection\ScoringFunction;
use Phpml\FeatureSelection\ScoringFunction;
use Phpml\Math\Statistic\ANOVA;
/**
 * Scoring function that groups the samples by target value and hands the
 * groups to ANOVA::oneWayF().
 */
final class ANOVAFValue implements ScoringFunction
{
    /**
     * Buckets the samples by their target and returns whatever
     * ANOVA::oneWayF() computes for those buckets.
     *
     * @param array $samples rows of feature values
     * @param array $targets target for each row, looked up by the row's key
     *
     * @return array result of ANOVA::oneWayF(); presumably one F-value per
     *               feature - confirm against the ANOVA implementation
     */
    public function score(array $samples, array $targets): array
    {
        $samplesByTarget = [];
        foreach (array_keys($samples) as $row) {
            $samplesByTarget[$targets[$row]][] = $samples[$row];
        }

        // The target values themselves are irrelevant from here on; only the grouping matters.
        return ANOVA::oneWayF(array_values($samplesByTarget));
    }
}

View file

@ -0,0 +1,81 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection\ScoringFunction;
use Phpml\FeatureSelection\ScoringFunction;
use Phpml\Math\Matrix;
use Phpml\Math\Statistic\Mean;
/**
 * Quick linear model that tests the effect of one regressor at a time,
 * repeated for every regressor.
 *
 * The score is obtained in two steps:
 *
 * 1. The cross correlation between each regressor and the target is computed,
 *    that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).
 * 2. The correlation is converted to an F score.
 *
 * Ported from scikit-learn f_regression function (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression)
 */
final class UnivariateLinearRegression implements ScoringFunction
{
    /**
     * Whether samples and targets are mean-centered before scoring.
     *
     * @var bool
     */
    private $center;

    /**
     * @param bool $center - if true samples and targets will be centered
     */
    public function __construct(bool $center = true)
    {
        $this->center = $center;
    }

    /**
     * Returns an F score per feature column, keyed like the columns of the first sample.
     *
     * Both arrays are received by value, so centering never touches the caller's data.
     */
    public function score(array $samples, array $targets): array
    {
        if ($this->center) {
            $this->centerTargets($targets);
            $this->centerSamples($samples);
        }

        $correlations = [];
        foreach (array_keys($samples[0]) as $index) {
            $featureColumn = array_column($samples, $index);

            // dot(y, x) / ||x|| / ||y||, evaluated in the same order as a single expression would.
            $dotProduct = Matrix::dot($targets, $featureColumn)[0];
            $featureNorm = (new Matrix($featureColumn, false))->transpose()->frobeniusNorm();
            $scaled = $dotProduct / $featureNorm;
            $targetNorm = (new Matrix($targets, false))->frobeniusNorm();

            $correlations[$index] = $scaled / $targetNorm;
        }

        // Centering costs one extra degree of freedom.
        if ($this->center) {
            $degreesOfFreedom = count($targets) - 2;
        } else {
            $degreesOfFreedom = count($targets) - 1;
        }

        return array_map(function (float $correlation) use ($degreesOfFreedom): float {
            $squared = $correlation ** 2;

            return $squared / (1 - $squared) * $degreesOfFreedom;
        }, $correlations);
    }

    /**
     * Subtracts the arithmetic mean of the targets from every target, in place.
     */
    private function centerTargets(array &$targets): void
    {
        $mean = Mean::arithmetic($targets);

        foreach ($targets as &$target) {
            $target -= $mean;
        }
        unset($target);
    }

    /**
     * Subtracts each column's arithmetic mean from that column, in place.
     * The columns are taken from the keys of the first sample.
     */
    private function centerSamples(array &$samples): void
    {
        $means = [];
        foreach (array_keys($samples[0]) as $index) {
            $means[$index] = Mean::arithmetic(array_column($samples, $index));
        }

        foreach ($samples as $rowKey => $row) {
            foreach ($row as $index => $value) {
                $samples[$rowKey][$index] = $value - $means[$index];
            }
        }
    }
}

View file

@ -0,0 +1,78 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use Phpml\Transformer;
/**
 * Transformer that keeps only the k feature columns with the highest scores,
 * as rated by a ScoringFunction (ANOVAFValue when none is supplied).
 */
final class SelectKBest implements Transformer
{
    /**
     * @var ScoringFunction
     */
    private $scoringFunction;

    /**
     * Number of columns to keep.
     *
     * @var int
     */
    private $k;

    /**
     * Scores from the most recent fit; null until fit() has run.
     *
     * @var array|null
     */
    private $scores = null;

    /**
     * Scores of the selected columns keyed by column; null means "keep every column".
     *
     * @var array|null
     */
    private $keepColumns = null;

    public function __construct(int $k = 10, ?ScoringFunction $scoringFunction = null)
    {
        $this->scoringFunction = $scoringFunction ?? new ANOVAFValue();
        $this->k = $k;
    }

    /**
     * Scores every column and remembers the k best ones.
     *
     * @throws InvalidArgumentException when $targets is null or empty
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        if ($targets === null || count($targets) === 0) {
            throw new InvalidArgumentException('The array has zero elements');
        }

        $scores = $this->scoringFunction->score($samples, $targets);
        $this->scores = $scores;

        // Forget the selection of any earlier fit; otherwise the early return
        // below would leave transform() filtering with stale columns.
        $this->keepColumns = null;

        if ($this->k >= count($scores)) {
            // Fewer columns than requested: nothing to drop.
            return;
        }

        arsort($scores);
        $this->keepColumns = array_slice($scores, 0, $this->k, true);
    }

    /**
     * Removes the non-selected columns from every sample, in place, and
     * reindexes the remaining values. Does nothing when no selection is stored.
     */
    public function transform(array &$samples): void
    {
        if ($this->keepColumns === null) {
            return;
        }

        foreach ($samples as &$sample) {
            // Intersecting on keys keeps the columns in their original order.
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }
        unset($sample);
    }

    /**
     * Returns the scores computed by the most recent fit.
     *
     * @throws InvalidOperationException when fit() has not been called yet
     */
    public function scores(): array
    {
        if ($this->scores === null) {
            throw new InvalidOperationException('SelectKBest require to fit first to get scores');
        }

        return $this->scores;
    }
}

View file

@ -0,0 +1,57 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Matrix;
use Phpml\Math\Statistic\Variance;
use Phpml\Transformer;
/**
 * Transformer that drops every feature column whose population variance does
 * not exceed a threshold.
 */
final class VarianceThreshold implements Transformer
{
    /**
     * @var float
     */
    private $threshold;

    /**
     * Variance of each column from the most recent fit.
     *
     * @var array
     */
    private $variances = [];

    /**
     * Set of columns to keep, stored as column => true.
     *
     * @var array
     */
    private $keepColumns = [];

    /**
     * @throws InvalidArgumentException when $threshold is negative
     */
    public function __construct(float $threshold = 0.0)
    {
        if ($threshold < 0) {
            throw new InvalidArgumentException('Threshold can\'t be lower than zero');
        }

        $this->threshold = $threshold;
    }

    /**
     * Computes the variance of each column and records which columns exceed
     * the threshold. $targets is accepted but not used.
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->variances = array_map(function (array $column) {
            return Variance::population($column);
        }, Matrix::transposeArray($samples));

        // Rebuild the selection from scratch so a refit cannot keep columns
        // that only passed the threshold in an earlier fit.
        $this->keepColumns = [];
        foreach ($this->variances as $column => $variance) {
            if ($variance > $this->threshold) {
                $this->keepColumns[$column] = true;
            }
        }
    }

    /**
     * Removes the non-selected columns from every sample, in place, and
     * reindexes the remaining values.
     */
    public function transform(array &$samples): void
    {
        foreach ($samples as &$sample) {
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }
        unset($sample);
    }
}

View file

@ -4,6 +4,8 @@ declare(strict_types=1);
namespace Phpml\Helper;
use Phpml\Classification\Classifier;
trait OneVsRest
{
/**
@ -25,39 +27,37 @@ trait OneVsRest
/**
* Train a binary classifier in the OvR style
*
* @param array $samples
* @param array $targets
*/
public function train(array $samples, array $targets)
public function train(array $samples, array $targets): void
{
// Clears previous stuff.
$this->reset();
$this->trainBylabel($samples, $targets);
$this->trainByLabel($samples, $targets);
}
/**
* @param array $samples
* @param array $targets
* @param array $allLabels All training set labels
*
* @return void
* Resets the classifier and the vars internally used by OneVsRest to create multiple classifiers.
*/
protected function trainByLabel(array $samples, array $targets, array $allLabels = [])
public function reset(): void
{
$this->classifiers = [];
$this->allLabels = [];
$this->costValues = [];
$this->resetBinary();
}
protected function trainByLabel(array $samples, array $targets, array $allLabels = []): void
{
// Overwrites the current value if it exist. $allLabels must be provided for each partialTrain run.
if (!empty($allLabels)) {
$this->allLabels = $allLabels;
} else {
$this->allLabels = array_keys(array_count_values($targets));
}
$this->allLabels = count($allLabels) === 0 ? array_keys(array_count_values($targets)) : $allLabels;
sort($this->allLabels, SORT_STRING);
// If there are only two targets, then there is no need to perform OvR
if (count($this->allLabels) == 2) {
if (count($this->allLabels) === 2) {
// Init classifier if required.
if (empty($this->classifiers)) {
if (count($this->classifiers) === 0) {
$this->classifiers[0] = $this->getClassifierCopy();
}
@ -67,11 +67,11 @@ trait OneVsRest
foreach ($this->allLabels as $label) {
// Init classifier if required.
if (empty($this->classifiers[$label])) {
if (!isset($this->classifiers[$label])) {
$this->classifiers[$label] = $this->getClassifierCopy();
}
list($binarizedTargets, $classifierLabels) = $this->binarizeTargets($targets, $label);
[$binarizedTargets, $classifierLabels] = $this->binarizeTargets($targets, $label);
$this->classifiers[$label]->trainBinary($samples, $binarizedTargets, $classifierLabels);
}
}
@ -85,64 +85,26 @@ trait OneVsRest
}
}
/**
* Resets the classifier and the vars internally used by OneVsRest to create multiple classifiers.
*/
public function reset()
{
$this->classifiers = [];
$this->allLabels = [];
$this->costValues = [];
$this->resetBinary();
}
/**
* Returns an instance of the current class after cleaning up OneVsRest stuff.
*
* @return \Phpml\Estimator
*/
protected function getClassifierCopy()
protected function getClassifierCopy(): Classifier
{
// Clone the current classifier, so that
// we don't mess up its variables while training
// multiple instances of this classifier
$classifier = clone $this;
$classifier->reset();
return $classifier;
}
/**
* Groups all targets into two groups: Targets equal to
* the given label and the others
*
* $targets is not passed by reference nor contains objects so this method
* changes will not affect the caller $targets array.
*
* @param array $targets
* @param mixed $label
* @return array Binarized targets and target's labels
*/
private function binarizeTargets($targets, $label)
{
$notLabel = "not_$label";
foreach ($targets as $key => $target) {
$targets[$key] = $target == $label ? $label : $notLabel;
}
$labels = [$label, $notLabel];
return [$targets, $labels];
}
/**
* @param array $sample
*
* @return mixed
*/
protected function predictSample(array $sample)
{
if (count($this->allLabels) == 2) {
if (count($this->allLabels) === 2) {
return $this->classifiers[0]->predictSampleBinary($sample);
}
@ -153,32 +115,24 @@ trait OneVsRest
}
arsort($probs, SORT_NUMERIC);
return key($probs);
}
/**
* Each classifier should implement this method instead of train(samples, targets)
*
* @param array $samples
* @param array $targets
* @param array $labels
*/
abstract protected function trainBinary(array $samples, array $targets, array $labels);
/**
* To be overwritten by OneVsRest classifiers.
*
* @return void
*/
abstract protected function resetBinary();
abstract protected function resetBinary(): void;
/**
* Each classifier that makes use of the OvR approach should be able to
* return a probability for a sample to belong to the given label.
*
* @param array $sample
* @param string $label
*
* @return mixed
*/
abstract protected function predictProbability(array $sample, string $label);
@ -186,9 +140,30 @@ trait OneVsRest
/**
* Each classifier should implement this method instead of predictSample()
*
* @param array $sample
*
* @return mixed
*/
abstract protected function predictSampleBinary(array $sample);
/**
 * Groups all targets into two groups: targets equal to the given label
 * and all the others.
 *
 * $targets is received by value and contains no objects, so the changes made
 * here do not affect the caller's $targets array.
 *
 * Matching uses loose comparison (==) on purpose, so that e.g. a numeric
 * string target still matches an integer label.
 *
 * @param array $targets Original target values, one per sample
 * @param mixed $label   The label treated as the "positive" class
 *
 * @return array Two-element array: [binarized targets, [label, "not_<label>"]]
 */
private function binarizeTargets(array $targets, $label): array
{
    // "{$var}" is used instead of "${var}": the latter interpolation form is
    // deprecated as of PHP 8.2, while both produce the same string.
    $notLabel = "not_{$label}";

    foreach ($targets as $key => $target) {
        $targets[$key] = $target == $label ? $label : $notLabel;
    }

    $labels = [$label, $notLabel];

    return [$targets, $labels];
}
}

View file

@ -4,6 +4,8 @@ declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
/**
* Conjugate Gradient method to solve a non-linear f(x) with respect to unknown x
* See https://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
@ -17,14 +19,7 @@ namespace Phpml\Helper\Optimizer;
*/
class ConjugateGradient extends GD
{
/**
* @param array $samples
* @param array $targets
* @param \Closure $gradientCb
*
* @return array
*/
public function runOptimization(array $samples, array $targets, \Closure $gradientCb)
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
{
$this->samples = $samples;
$this->targets = $targets;
@ -32,11 +27,11 @@ class ConjugateGradient extends GD
$this->sampleCount = count($samples);
$this->costValues = [];
$d = mp::muls($this->gradient($this->theta), -1);
$d = MP::muls($this->gradient($this->theta), -1);
for ($i = 0; $i < $this->maxIterations; ++$i) {
// Obtain α that minimizes f(θ + α.d)
$alpha = $this->getAlpha(array_sum($d));
$alpha = $this->getAlpha($d);
// θ(k+1) = θ(k) + α.d
$thetaNew = $this->getNewTheta($alpha, $d);
@ -65,30 +60,38 @@ class ConjugateGradient extends GD
/**
* Executes the callback function for the problem and returns
* sum of the gradient for all samples & targets.
*
* @param array $theta
*
* @return array
*/
protected function gradient(array $theta)
protected function gradient(array $theta): array
{
list(, $gradient) = parent::gradient($theta);
[, $updates, $penalty] = parent::gradient($theta);
// Calculate gradient for each dimension
$gradient = [];
for ($i = 0; $i <= $this->dimensions; ++$i) {
if ($i === 0) {
$gradient[$i] = array_sum($updates);
} else {
$col = array_column($this->samples, $i - 1);
$error = 0;
foreach ($col as $index => $val) {
$error += $val * $updates[$index];
}
$gradient[$i] = $error + $penalty * $theta[$i];
}
}
return $gradient;
}
/**
* Returns the value of f(x) for given solution
*
* @param array $theta
*
* @return float
*/
protected function cost(array $theta)
protected function cost(array $theta): float
{
list($cost) = parent::gradient($theta);
[$cost] = parent::gradient($theta);
return array_sum($cost) / $this->sampleCount;
return array_sum($cost) / (int) $this->sampleCount;
}
/**
@ -104,19 +107,15 @@ class ConjugateGradient extends GD
* b) Probe a larger alpha (0.01) and calculate cost function
* b-1) If cost function decreases, continue enlarging alpha
* b-2) If cost function increases, take the midpoint and try again
*
* @param float $d
*
* @return float
*/
protected function getAlpha(float $d)
protected function getAlpha(array $d): float
{
$small = 0.0001 * $d;
$large = 0.01 * $d;
$small = MP::muls($d, 0.0001);
$large = MP::muls($d, 0.01);
// Obtain θ + α.d for two initial values, x0 and x1
$x0 = mp::adds($this->theta, $small);
$x1 = mp::adds($this->theta, $large);
$x0 = MP::add($this->theta, $small);
$x1 = MP::add($this->theta, $large);
$epsilon = 0.0001;
$iteration = 0;
@ -132,20 +131,28 @@ class ConjugateGradient extends GD
if ($fx1 < $fx0) {
$x0 = $x1;
$x1 = mp::adds($x1, 0.01); // Enlarge second
$x1 = MP::adds($x1, 0.01); // Enlarge second
} else {
$x1 = mp::divs(mp::add($x1, $x0), 2.0);
$x1 = MP::divs(MP::add($x1, $x0), 2.0);
} // Get to the midpoint
$error = $fx1 / $this->dimensions;
} while ($error <= $epsilon || $iteration++ < 10);
// Return α = θ / d
if ($d == 0) {
return $x1[0] - $this->theta[0];
// Return α = θ / d
// For accuracy, choose a dimension which maximize |d[i]|
$imax = 0;
for ($i = 1; $i <= $this->dimensions; ++$i) {
if (abs($d[$i]) > abs($d[$imax])) {
$imax = $i;
}
}
return ($x1[0] - $this->theta[0]) / $d;
if ($d[$imax] == 0) {
return $x1[$imax] - $this->theta[$imax];
}
return ($x1[$imax] - $this->theta[$imax]) / $d[$imax];
}
/**
@ -153,30 +160,10 @@ class ConjugateGradient extends GD
* gradient direction.
*
* θ(k+1) = θ(k) + α.d
*
* @param float $alpha
* @param array $d
*
* @return array
*/
protected function getNewTheta(float $alpha, array $d)
protected function getNewTheta(float $alpha, array $d): array
{
$theta = $this->theta;
for ($i = 0; $i < $this->dimensions + 1; ++$i) {
if ($i === 0) {
$theta[$i] += $alpha * array_sum($d);
} else {
$sum = 0.0;
foreach ($this->samples as $si => $sample) {
$sum += $sample[$i - 1] * $d[$si] * $alpha;
}
$theta[$i] += $sum;
}
}
return $theta;
return MP::add($this->theta, MP::muls($d, $alpha));
}
/**
@ -187,35 +174,31 @@ class ConjugateGradient extends GD
*
* See:
* R. Fletcher and C. M. Reeves, "Function minimization by conjugate gradients", Comput. J. 7 (1964), 149–154.
*
* @param array $newTheta
*
* @return float
*/
protected function getBeta(array $newTheta)
protected function getBeta(array $newTheta): float
{
$dNew = array_sum($this->gradient($newTheta));
$dOld = array_sum($this->gradient($this->theta)) + 1e-100;
$gNew = $this->gradient($newTheta);
$gOld = $this->gradient($this->theta);
$dNew = 0;
$dOld = 1e-100;
for ($i = 0; $i <= $this->dimensions; ++$i) {
$dNew += $gNew[$i] ** 2;
$dOld += $gOld[$i] ** 2;
}
return $dNew ** 2 / $dOld ** 2;
return $dNew / $dOld;
}
/**
* Calculates the new conjugate direction
*
* d(k+1) = -∇f(x(k+1)) + β(k).d(k)
*
* @param array $theta
* @param float $beta
* @param array $d
*
* @return array
*/
protected function getNewDirection(array $theta, float $beta, array $d)
protected function getNewDirection(array $theta, float $beta, array $d): array
{
$grad = $this->gradient($theta);
return mp::add(mp::muls($grad, -1), mp::muls($d, $beta));
return MP::add(MP::muls($grad, -1), MP::muls($d, $beta));
}
}
@ -223,17 +206,12 @@ class ConjugateGradient extends GD
* Handles element-wise vector operations between vector-vector
* and vector-scalar variables
*/
class mp
class MP
{
/**
* Element-wise <b>multiplication</b> of two vectors of the same size
*
* @param array $m1
* @param array $m2
*
* @return array
*/
public static function mul(array $m1, array $m2)
public static function mul(array $m1, array $m2): array
{
$res = [];
foreach ($m1 as $i => $val) {
@ -245,13 +223,8 @@ class mp
/**
* Element-wise <b>division</b> of two vectors of the same size
*
* @param array $m1
* @param array $m2
*
* @return array
*/
public static function div(array $m1, array $m2)
public static function div(array $m1, array $m2): array
{
$res = [];
foreach ($m1 as $i => $val) {
@ -263,14 +236,8 @@ class mp
/**
* Element-wise <b>addition</b> of two vectors of the same size
*
* @param array $m1
* @param array $m2
* @param int $mag
*
* @return array
*/
public static function add(array $m1, array $m2, int $mag = 1)
public static function add(array $m1, array $m2, int $mag = 1): array
{
$res = [];
foreach ($m1 as $i => $val) {
@ -282,26 +249,16 @@ class mp
/**
* Element-wise <b>subtraction</b> of two vectors of the same size
*
* @param array $m1
* @param array $m2
*
* @return array
*/
public static function sub(array $m1, array $m2)
public static function sub(array $m1, array $m2): array
{
return self::add($m1, $m2, -1);
}
/**
* Element-wise <b>multiplication</b> of a vector with a scalar
*
* @param array $m1
* @param float $m2
*
* @return array
*/
public static function muls(array $m1, float $m2)
public static function muls(array $m1, float $m2): array
{
$res = [];
foreach ($m1 as $val) {
@ -313,13 +270,8 @@ class mp
/**
* Element-wise <b>division</b> of a vector with a scalar
*
* @param array $m1
* @param float $m2
*
* @return array
*/
public static function divs(array $m1, float $m2)
public static function divs(array $m1, float $m2): array
{
$res = [];
foreach ($m1 as $val) {
@ -331,14 +283,8 @@ class mp
/**
* Element-wise <b>addition</b> of a vector with a scalar
*
* @param array $m1
* @param float $m2
* @param int $mag
*
* @return array
*/
public static function adds(array $m1, float $m2, int $mag = 1)
public static function adds(array $m1, float $m2, int $mag = 1): array
{
$res = [];
foreach ($m1 as $val) {
@ -350,13 +296,8 @@ class mp
/**
* Element-wise <b>subtraction</b> of a vector with a scalar
*
* @param array $m1
* @param array $m2
*
* @return array
*/
public static function subs(array $m1, array $m2)
public static function subs(array $m1, float $m2): array
{
return self::adds($m1, $m2, -1);
}

View file

@ -4,6 +4,9 @@ declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
use Phpml\Exception\InvalidOperationException;
/**
* Batch version of Gradient Descent to optimize the weights
* of a classifier given samples, targets and the objective function to minimize
@ -13,18 +16,11 @@ class GD extends StochasticGD
/**
* Number of samples given
*
* @var int
* @var int|null
*/
protected $sampleCount = null;
protected $sampleCount;
/**
* @param array $samples
* @param array $targets
* @param \Closure $gradientCb
*
* @return array
*/
public function runOptimization(array $samples, array $targets, \Closure $gradientCb)
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
{
$this->samples = $samples;
$this->targets = $targets;
@ -38,11 +34,11 @@ class GD extends StochasticGD
$theta = $this->theta;
// Calculate update terms for each sample
list($errors, $updates, $totalPenalty) = $this->gradient($theta);
[$errors, $updates, $totalPenalty] = $this->gradient($theta);
$this->updateWeightsWithUpdates($updates, $totalPenalty);
$this->costValues[] = array_sum($errors)/$this->sampleCount;
$this->costValues[] = array_sum($errors) / $this->sampleCount;
if ($this->earlyStop($theta)) {
break;
@ -57,22 +53,22 @@ class GD extends StochasticGD
/**
* Calculates gradient, cost function and penalty term for each sample
* then returns them as an array of values
*
* @param array $theta
*
* @return array
*/
protected function gradient(array $theta)
protected function gradient(array $theta): array
{
$costs = [];
$gradient= [];
$gradient = [];
$totalPenalty = 0;
if ($this->gradientCb === null) {
throw new InvalidOperationException('Gradient callback is not defined');
}
foreach ($this->samples as $index => $sample) {
$target = $this->targets[$index];
$result = ($this->gradientCb)($theta, $sample, $target);
list($cost, $grad, $penalty) = array_pad($result, 3, 0);
[$cost, $grad, $penalty] = array_pad($result, 3, 0);
$costs[] = $cost;
$gradient[] = $grad;
@ -84,11 +80,7 @@ class GD extends StochasticGD
return [$costs, $gradient, $totalPenalty];
}
/**
* @param array $updates
* @param float $penalty
*/
protected function updateWeightsWithUpdates(array $updates, float $penalty)
protected function updateWeightsWithUpdates(array $updates, float $penalty): void
{
// Updates all weights at once
for ($i = 0; $i <= $this->dimensions; ++$i) {
@ -110,10 +102,8 @@ class GD extends StochasticGD
/**
* Clears the optimizer internal vars after the optimization process.
*
* @return void
*/
protected function clear()
protected function clear(): void
{
$this->sampleCount = null;
parent::clear();

View file

@ -4,6 +4,9 @@ declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
use Phpml\Exception\InvalidArgumentException;
abstract class Optimizer
{
/**
@ -11,7 +14,7 @@ abstract class Optimizer
*
* @var array
*/
protected $theta;
protected $theta = [];
/**
* Number of dimensions
@ -22,8 +25,6 @@ abstract class Optimizer
/**
* Inits a new instance of Optimizer for the given number of dimensions
*
* @param int $dimensions
*/
public function __construct(int $dimensions)
{
@ -32,23 +33,14 @@ abstract class Optimizer
// Inits the weights randomly
$this->theta = [];
for ($i = 0; $i < $this->dimensions; ++$i) {
$this->theta[] = rand() / (float) getrandmax();
$this->theta[] = (random_int(0, PHP_INT_MAX) / PHP_INT_MAX) + 0.1;
}
}
/**
* Sets the weights manually
*
* @param array $theta
*
* @return $this
*
* @throws \Exception
*/
public function setInitialTheta(array $theta)
public function setTheta(array $theta): self
{
if (count($theta) != $this->dimensions) {
throw new \Exception("Number of values in the weights array should be $this->dimensions");
if (count($theta) !== $this->dimensions) {
throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $this->dimensions));
}
$this->theta = $theta;
@ -59,10 +51,6 @@ abstract class Optimizer
/**
* Executes the optimization with the given samples & targets
* and returns the weights
*
* @param array $samples
* @param array $targets
* @param \Closure $gradientCb
*/
abstract protected function runOptimization(array $samples, array $targets, \Closure $gradientCb);
abstract public function runOptimization(array $samples, array $targets, Closure $gradientCb): array;
}

View file

@ -4,6 +4,10 @@ declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
/**
* Stochastic Gradient Descent optimization method
* to find a solution for the equation A.ϴ = y where
@ -29,9 +33,9 @@ class StochasticGD extends Optimizer
* Callback function to get the gradient and cost value
* for a specific set of theta (ϴ) and a pair of sample & target
*
* @var \Closure
* @var \Closure|null
*/
protected $gradientCb = null;
protected $gradientCb;
/**
* Maximum number of iterations used to train the model
@ -66,18 +70,17 @@ class StochasticGD extends Optimizer
* @var bool
*/
protected $enableEarlyStop = true;
/**
* List of values obtained by evaluating the cost function at each iteration
* of the algorithm
*
* @var array
*/
protected $costValues= [];
protected $costValues = [];
/**
* Initializes the SGD optimizer for the given number of dimensions
*
* @param int $dimensions
*/
public function __construct(int $dimensions)
{
@ -87,6 +90,17 @@ class StochasticGD extends Optimizer
$this->dimensions = $dimensions;
}
/**
 * Sets the weights manually.
 *
 * This optimizer expects one more value than the number of dimensions
 * (dimensions + 1 entries in total).
 *
 * @throws InvalidArgumentException when the number of given values differs from dimensions + 1
 */
public function setTheta(array $theta): Optimizer
{
    $expectedSize = $this->dimensions + 1;

    if (count($theta) === $expectedSize) {
        $this->theta = $theta;

        return $this;
    }

    throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $expectedSize));
}
/**
* Sets minimum value for the change in the theta values
* between iterations to continue the iterations.<br>
@ -94,8 +108,6 @@ class StochasticGD extends Optimizer
* If change in the theta is less than given value then the
* algorithm will stop training
*
* @param float $threshold
*
* @return $this
*/
public function setChangeThreshold(float $threshold = 1e-5)
@ -109,8 +121,6 @@ class StochasticGD extends Optimizer
* Enable/Disable early stopping by checking at each iteration
* whether changes in theta or cost value are not large enough
*
* @param bool $enable
*
* @return $this
*/
public function setEarlyStop(bool $enable = true)
@ -121,8 +131,6 @@ class StochasticGD extends Optimizer
}
/**
* @param float $learningRate
*
* @return $this
*/
public function setLearningRate(float $learningRate)
@ -133,8 +141,6 @@ class StochasticGD extends Optimizer
}
/**
* @param int $maxIterations
*
* @return $this
*/
public function setMaxIterations(int $maxIterations)
@ -150,14 +156,8 @@ class StochasticGD extends Optimizer
*
* The cost function to minimize and the gradient of the function are to be
* handled by the callback function provided as the third parameter of the method.
*
* @param array $samples
* @param array $targets
* @param \Closure $gradientCb
*
* @return array
*/
public function runOptimization(array $samples, array $targets, \Closure $gradientCb)
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
{
$this->samples = $samples;
$this->targets = $targets;
@ -176,7 +176,7 @@ class StochasticGD extends Optimizer
// Save the best theta in the "pocket" so that
// any future set of theta worse than this will be disregarded
if ($bestTheta == null || $cost <= $bestScore) {
if ($bestTheta === null || $cost <= $bestScore) {
$bestTheta = $theta;
$bestScore = $cost;
}
@ -194,23 +194,33 @@ class StochasticGD extends Optimizer
// Solution in the pocket is better than or equal to the last state
// so, we use this solution
return $this->theta = $bestTheta;
return $this->theta = (array) $bestTheta;
}
/**
* @return float
* Returns the list of cost values for each iteration executed in
* last run of the optimization
*/
protected function updateTheta()
public function getCostValues(): array
{
return $this->costValues;
}
protected function updateTheta(): float
{
$jValue = 0.0;
$theta = $this->theta;
if ($this->gradientCb === null) {
throw new InvalidOperationException('Gradient callback is not defined');
}
foreach ($this->samples as $index => $sample) {
$target = $this->targets[$index];
$result = ($this->gradientCb)($theta, $sample, $target);
list($error, $gradient, $penalty) = array_pad($result, 3, 0);
[$error, $gradient, $penalty] = array_pad($result, 3, 0);
// Update bias
$this->theta[0] -= $this->learningRate * $gradient;
@ -231,19 +241,17 @@ class StochasticGD extends Optimizer
/**
* Checks if the optimization is not effective enough and can be stopped
* in case large enough changes in the solution do not happen
*
* @param array $oldTheta
*
* @return boolean
*/
protected function earlyStop($oldTheta)
protected function earlyStop(array $oldTheta): bool
{
// Check for early stop: No change larger than threshold (default 1e-5)
$diff = array_map(
function ($w1, $w2) {
return abs($w1 - $w2) > $this->threshold ? 1 : 0;
},
$oldTheta, $this->theta);
$oldTheta,
$this->theta
);
if (array_sum($diff) == 0) {
return true;
@ -251,30 +259,17 @@ class StochasticGD extends Optimizer
// Check if the last two cost values are almost the same
$costs = array_slice($this->costValues, -2);
if (count($costs) == 2 && abs($costs[1] - $costs[0]) < $this->threshold) {
if (count($costs) === 2 && abs($costs[1] - $costs[0]) < $this->threshold) {
return true;
}
return false;
}
/**
* Returns the list of cost values for each iteration executed in
* last run of the optimization
*
* @return array
*/
public function getCostValues()
{
return $this->costValues;
}
/**
* Clears the optimizer internal vars after the optimization process.
*
* @return void
*/
protected function clear()
protected function clear(): void
{
$this->samples = [];
$this->targets = [];

View file

@ -7,8 +7,6 @@ namespace Phpml\Helper;
trait Predictable
{
/**
* @param array $samples
*
* @return mixed
*/
public function predict(array $samples)
@ -26,8 +24,6 @@ trait Predictable
}
/**
* @param array $sample
*
* @return mixed
*/
abstract protected function predictSample(array $sample);

View file

@ -16,11 +16,7 @@ trait Trainable
*/
private $targets = [];
/**
* @param array $samples
* @param array $targets
*/
public function train(array $samples, array $targets)
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);

View file

@ -6,10 +6,5 @@ namespace Phpml;
interface IncrementalEstimator
{
/**
* @param array $samples
* @param array $targets
* @param array $labels
*/
public function partialTrain(array $samples, array $targets, array $labels = []);
public function partialTrain(array $samples, array $targets, array $labels = []): void;
}

View file

@ -0,0 +1,42 @@
<?php
declare(strict_types=1);
namespace Phpml\Math;
use Phpml\Exception\InvalidArgumentException;
class Comparison
{
    /**
     * Evaluates "$a <operator> $b" for an operator supplied as a string.
     *
     * Supported operators: >, >=, = (alias of ==), ==, ===, <=, <,
     * != and <> (aliases of each other), !==.
     *
     * @param mixed $a Left-hand operand
     * @param mixed $b Right-hand operand
     *
     * @throws InvalidArgumentException when $operator is not one of the supported strings
     */
    public static function compare($a, $b, string $operator): bool
    {
        if ($operator === '>') {
            return $a > $b;
        }

        if ($operator === '>=') {
            return $a >= $b;
        }

        // '=' is accepted as a synonym for loose equality.
        if ($operator === '=' || $operator === '==') {
            return $a == $b;
        }

        if ($operator === '===') {
            return $a === $b;
        }

        if ($operator === '<=') {
            return $a <= $b;
        }

        if ($operator === '<') {
            return $a < $b;
        }

        // Both spellings of loose inequality are accepted.
        if ($operator === '!=' || $operator === '<>') {
            return $a != $b;
        }

        if ($operator === '!==') {
            return $a !== $b;
        }

        throw new InvalidArgumentException(sprintf('Invalid operator "%s" provided', $operator));
    }
}

View file

@ -6,11 +6,5 @@ namespace Phpml\Math;
interface Distance
{
/**
* @param array $a
* @param array $b
*
* @return float
*/
public function distance(array $a, array $b): float;
}

View file

@ -4,32 +4,16 @@ declare(strict_types=1);
namespace Phpml\Math\Distance;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance;
class Chebyshev implements Distance
/**
* Class Chebyshev
*/
class Chebyshev extends Distance
{
/**
* @param array $a
* @param array $b
*
* @return float
*
* @throws InvalidArgumentException
* {@inheritdoc}
*/
public function distance(array $a, array $b): float
{
if (count($a) !== count($b)) {
throw InvalidArgumentException::arraySizeNotMatch();
}
$differences = [];
$count = count($a);
for ($i = 0; $i < $count; ++$i) {
$differences[] = abs($a[$i] - $b[$i]);
}
return max($differences);
return max($this->deltas($a, $b));
}
}

View file

@ -0,0 +1,61 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Distance;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance as DistanceInterface;
/**
 * Class Distance
 *
 * Abstract base for distance metrics: sums the element-wise absolute
 * differences raised to the power $norm and takes the $norm-th root.
 */
abstract class Distance implements DistanceInterface
{
    /**
     * Exponent applied to each delta; its reciprocal is applied to the sum.
     *
     * @var float|int
     */
    public $norm;

    /**
     * Distance constructor.
     */
    public function __construct(float $norm = 3.0)
    {
        $this->norm = $norm;
    }

    /**
     * Returns (sum of |a[i] - b[i]| ^ norm) ^ (1 / norm).
     *
     * @throws InvalidArgumentException when the arrays differ in size
     */
    public function distance(array $a, array $b): float
    {
        $exponent = $this->norm;

        $powered = array_map(
            function ($delta) use ($exponent) {
                return $delta ** $exponent;
            },
            $this->deltas($a, $b)
        );

        return array_sum($powered) ** (1 / $exponent);
    }

    /**
     * Returns the absolute differences between $a and $b, read at
     * positions 0 .. count($a) - 1.
     *
     * @throws InvalidArgumentException when the arrays differ in size
     */
    protected function deltas(array $a, array $b): array
    {
        $size = count($a);

        if (count($b) !== $size) {
            throw new InvalidArgumentException('Size of given arrays does not match');
        }

        $result = [];
        $index = 0;
        while ($index < $size) {
            $result[] = abs($a[$index] - $b[$index]);
            ++$index;
        }

        return $result;
    }
}

View file

@ -4,41 +4,25 @@ declare(strict_types=1);
namespace Phpml\Math\Distance;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance;
class Euclidean implements Distance
/**
* Class Euclidean
*
* L^2 Metric.
*/
class Euclidean extends Distance
{
/**
* @param array $a
* @param array $b
*
* @return float
*
* @throws InvalidArgumentException
* Euclidean constructor.
*/
public function distance(array $a, array $b): float
public function __construct()
{
if (count($a) !== count($b)) {
throw InvalidArgumentException::arraySizeNotMatch();
}
$distance = 0;
foreach ($a as $i => $val) {
$distance += ($val - $b[$i]) ** 2;
}
return sqrt((float) $distance);
parent::__construct(2.0);
}
/**
* Square of Euclidean distance
*
* @param array $a
* @param array $b
*
* @return float
* @throws \Phpml\Exception\InvalidArgumentException
*/
public function sqDistance(array $a, array $b): float
{

View file

@ -4,32 +4,18 @@ declare(strict_types=1);
namespace Phpml\Math\Distance;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance;
class Manhattan implements Distance
/**
* Class Manhattan
*
* L^1 Metric.
*/
class Manhattan extends Distance
{
/**
* @param array $a
* @param array $b
*
* @return float
*
* @throws InvalidArgumentException
* Manhattan constructor.
*/
public function distance(array $a, array $b): float
public function __construct()
{
if (count($a) !== count($b)) {
throw InvalidArgumentException::arraySizeNotMatch();
}
$distance = 0;
$count = count($a);
for ($i = 0; $i < $count; ++$i) {
$distance += abs($a[$i] - $b[$i]);
}
return $distance;
parent::__construct(1.0);
}
}

View file

@ -4,45 +4,11 @@ declare(strict_types=1);
namespace Phpml\Math\Distance;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance;
class Minkowski implements Distance
/**
* Class Minkowski
*
* L^n Metric.
*/
class Minkowski extends Distance
{
/**
* @var float
*/
private $lambda;
/**
* @param float $lambda
*/
public function __construct(float $lambda = 3.0)
{
$this->lambda = $lambda;
}
/**
* @param array $a
* @param array $b
*
* @return float
*
* @throws InvalidArgumentException
*/
public function distance(array $a, array $b): float
{
if (count($a) !== count($b)) {
throw InvalidArgumentException::arraySizeNotMatch();
}
$distance = 0;
$count = count($a);
for ($i = 0; $i < $count; ++$i) {
$distance += pow(abs($a[$i] - $b[$i]), $this->lambda);
}
return (float)pow($distance, 1 / $this->lambda);
}
}

View file

@ -7,10 +7,10 @@ namespace Phpml\Math;
interface Kernel
{
/**
* @param float $a
* @param float $b
* @param float|array $a
* @param float|array $b
*
* @return float
* @return float|array
*/
public function compute($a, $b);
}

View file

@ -14,9 +14,6 @@ class RBF implements Kernel
*/
private $gamma;
/**
* @param float $gamma
*/
public function __construct(float $gamma)
{
$this->gamma = $gamma;
@ -25,15 +22,12 @@ class RBF implements Kernel
/**
* @param array $a
* @param array $b
*
* @return float
*/
public function compute($a, $b)
public function compute($a, $b): float
{
$score = 2 * Product::scalar($a, $b);
$squares = Product::scalar($a, $a) + Product::scalar($b, $b);
$result = exp(-$this->gamma * ($squares - $score));
return $result;
return exp(-$this->gamma * ($squares - $score));
}
}

View file

@ -1,68 +1,76 @@
<?php
declare(strict_types=1);
/**
* @package JAMA
* @package JAMA
*
* For an m-by-n matrix A with m >= n, the LU decomposition is an m-by-n
* unit lower triangular matrix L, an n-by-n upper triangular matrix U,
* and a permutation vector piv of length m so that A(piv,:) = L*U.
* If m < n, then L is m-by-m and U is m-by-n.
* For an m-by-n matrix A with m >= n, the LU decomposition is an m-by-n
* unit lower triangular matrix L, an n-by-n upper triangular matrix U,
* and a permutation vector piv of length m so that A(piv,:) = L*U.
* If m < n, then L is m-by-m and U is m-by-n.
*
* The LU decomposition with pivoting always exists, even if the matrix is
* singular, so the constructor will never fail. The primary use of the
* LU decomposition is in the solution of square systems of simultaneous
* linear equations. This will fail if isNonsingular() returns false.
* The LU decomposition with pivoting always exists, even if the matrix is
* singular, so the constructor will never fail. The primary use of the
* LU decomposition is in the solution of square systems of simultaneous
* linear equations. This will fail if isNonsingular() returns false.
*
* @author Paul Meagher
* @author Bartosz Matosiuk
* @author Michael Bommarito
* @version 1.1
* @license PHP v3.0
* @author Paul Meagher
* @author Bartosz Matosiuk
* @author Michael Bommarito
*
* @version 1.1
*
* @license PHP v3.0
*
* Slightly changed to adapt the original code to PHP-ML library
* @date 2017/04/24
*
* @author Mustafa Karabulut
*/
namespace Phpml\Math\LinearAlgebra;
use Phpml\Math\Matrix;
use Phpml\Exception\MatrixException;
use Phpml\Math\Matrix;
class LUDecomposition
{
/**
* Decomposition storage
* @var array
* Decomposition storage
*
* @var array
*/
private $LU = [];
/**
* Row dimension.
* @var int
* Row dimension.
*
* @var int
*/
private $m;
/**
* Column dimension.
* @var int
* Column dimension.
*
* @var int
*/
private $n;
/**
* Pivot sign.
* @var int
* Pivot sign.
*
* @var int
*/
private $pivsign;
/**
* Internal storage of pivot vector.
* @var array
* Internal storage of pivot vector.
*
* @var array
*/
private $piv = [];
/**
* Constructs Structure to access L, U and piv.
*
@ -72,17 +80,18 @@ class LUDecomposition
*/
public function __construct(Matrix $A)
{
if ($A->getRows() != $A->getColumns()) {
throw MatrixException::notSquareMatrix();
if ($A->getRows() !== $A->getColumns()) {
throw new MatrixException('Matrix is not square matrix');
}
// Use a "left-looking", dot-product, Crout/Doolittle algorithm.
$this->LU = $A->toArray();
$this->m = $A->getRows();
$this->n = $A->getColumns();
$this->m = $A->getRows();
$this->n = $A->getColumns();
for ($i = 0; $i < $this->m; ++$i) {
$this->piv[$i] = $i;
}
$this->pivsign = 1;
$LUcolj = [];
@ -92,6 +101,7 @@ class LUDecomposition
for ($i = 0; $i < $this->m; ++$i) {
$LUcolj[$i] = &$this->LU[$i][$j];
}
// Apply previous transformations.
for ($i = 0; $i < $this->m; ++$i) {
$LUrowi = $this->LU[$i];
@ -101,26 +111,31 @@ class LUDecomposition
for ($k = 0; $k < $kmax; ++$k) {
$s += $LUrowi[$k] * $LUcolj[$k];
}
$LUrowi[$j] = $LUcolj[$i] -= $s;
}
// Find pivot and exchange if necessary.
$p = $j;
for ($i = $j + 1; $i < $this->m; ++$i) {
if (abs($LUcolj[$i]) > abs($LUcolj[$p])) {
if (abs($LUcolj[$i] ?? 0) > abs($LUcolj[$p] ?? 0)) {
$p = $i;
}
}
if ($p != $j) {
for ($k = 0; $k < $this->n; ++$k) {
$t = $this->LU[$p][$k];
$this->LU[$p][$k] = $this->LU[$j][$k];
$this->LU[$j][$k] = $t;
}
$k = $this->piv[$p];
$this->piv[$p] = $this->piv[$j];
$this->piv[$j] = $k;
$this->pivsign = $this->pivsign * -1;
$this->pivsign *= -1;
}
// Compute multipliers.
if (($j < $this->m) && ($this->LU[$j][$j] != 0.0)) {
for ($i = $j + 1; $i < $this->m; ++$i) {
@ -128,15 +143,14 @@ class LUDecomposition
}
}
}
} // function __construct()
}
/**
* Get lower triangular factor.
*
* @return Matrix Lower triangular factor
*/
public function getL()
public function getL(): Matrix
{
$L = [];
for ($i = 0; $i < $this->m; ++$i) {
@ -150,16 +164,16 @@ class LUDecomposition
}
}
}
return new Matrix($L);
} // function getL()
return new Matrix($L);
}
/**
* Get upper triangular factor.
*
* @return Matrix Upper triangular factor
*/
public function getU()
public function getU(): Matrix
{
$U = [];
for ($i = 0; $i < $this->n; ++$i) {
@ -171,38 +185,36 @@ class LUDecomposition
}
}
}
return new Matrix($U);
} // function getU()
return new Matrix($U);
}
/**
* Return pivot permutation vector.
*
* @return array Pivot vector
*/
public function getPivot()
public function getPivot(): array
{
return $this->piv;
} // function getPivot()
}
/**
* Alias for getPivot
*
* @see getPivot
*/
public function getDoublePivot()
public function getDoublePivot(): array
{
return $this->getPivot();
} // function getDoublePivot()
}
/**
* Is the matrix nonsingular?
*
* @return true if U, and hence A, is nonsingular.
* @return bool true if U, and hence A, is nonsingular.
*/
public function isNonsingular()
public function isNonsingular(): bool
{
for ($j = 0; $j < $this->n; ++$j) {
if ($this->LU[$j][$j] == 0) {
@ -211,30 +223,17 @@ class LUDecomposition
}
return true;
} // function isNonsingular()
}
/**
* Count determinants
*
* @return float|int d matrix determinant
*
* @throws MatrixException
*/
public function det()
public function det(): float
{
if ($this->m !== $this->n) {
throw MatrixException::notSquareMatrix();
}
$d = $this->pivsign;
for ($j = 0; $j < $this->n; ++$j) {
$d *= $this->LU[$j][$j];
}
return $d;
} // function det()
return (float) $d;
}
/**
* Solve A*X = B
@ -245,19 +244,19 @@ class LUDecomposition
*
* @throws MatrixException
*/
public function solve(Matrix $B)
public function solve(Matrix $B): array
{
if ($B->getRows() != $this->m) {
throw MatrixException::notSquareMatrix();
throw new MatrixException('Matrix is not square matrix');
}
if (!$this->isNonsingular()) {
throw MatrixException::singularMatrix();
throw new MatrixException('Matrix is singular');
}
// Copy right hand side with pivoting
$nx = $B->getColumns();
$X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx - 1);
$X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx - 1);
// Solve L*Y = B(piv,:)
for ($k = 0; $k < $this->n; ++$k) {
for ($i = $k + 1; $i < $this->n; ++$i) {
@ -266,29 +265,24 @@ class LUDecomposition
}
}
}
// Solve U*X = Y;
for ($k = $this->n - 1; $k >= 0; --$k) {
for ($j = 0; $j < $nx; ++$j) {
$X[$k][$j] /= $this->LU[$k][$k];
}
for ($i = 0; $i < $k; ++$i) {
for ($j = 0; $j < $nx; ++$j) {
$X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k];
}
}
}
return $X;
} // function solve()
/**
* @param array $matrix
* @param array $RL
* @param int $j0
* @param int $jF
*
* @return array
*/
protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF)
return $X;
}
protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF): array
{
$m = count($RL);
$n = $jF - $j0;
@ -302,4 +296,4 @@ class LUDecomposition
return $R;
}
} // class LUDecomposition
}

View file

@ -4,16 +4,16 @@ declare(strict_types=1);
namespace Phpml\Math;
use Phpml\Math\LinearAlgebra\LUDecomposition;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\MatrixException;
use Phpml\Math\LinearAlgebra\LUDecomposition;
class Matrix
{
/**
* @var array
*/
private $matrix;
private $matrix = [];
/**
* @var int
@ -31,9 +31,6 @@ class Matrix
private $determinant;
/**
* @param array $matrix
* @param bool $validate
*
* @throws InvalidArgumentException
*/
public function __construct(array $matrix, bool $validate = true)
@ -51,7 +48,7 @@ class Matrix
if ($validate) {
for ($i = 0; $i < $this->rows; ++$i) {
if (count($matrix[$i]) !== $this->columns) {
throw InvalidArgumentException::matrixDimensionsDidNotMatch();
throw new InvalidArgumentException('Matrix dimensions did not match');
}
}
}
@ -59,12 +56,7 @@ class Matrix
$this->matrix = $matrix;
}
/**
* @param array $array
*
* @return Matrix
*/
public static function fromFlatArray(array $array)
public static function fromFlatArray(array $array): self
{
$matrix = [];
foreach ($array as $value) {
@ -74,55 +66,38 @@ class Matrix
return new self($matrix);
}
/**
* @return array
*/
public function toArray()
public function toArray(): array
{
return $this->matrix;
}
/**
* @return float
*/
public function toScalar()
public function toScalar(): float
{
return $this->matrix[0][0];
}
/**
* @return int
*/
public function getRows()
public function getRows(): int
{
return $this->rows;
}
/**
* @return int
*/
public function getColumns()
public function getColumns(): int
{
return $this->columns;
}
/**
* @param $column
*
* @return array
*
* @throws MatrixException
*/
public function getColumnValues($column)
public function getColumnValues(int $column): array
{
if ($column >= $this->columns) {
throw MatrixException::columnOutOfRange();
throw new MatrixException('Column out of range');
}
return array_column($this->matrix, $column);
}
/**
* @return float|int
*
@ -130,12 +105,12 @@ class Matrix
*/
public function getDeterminant()
{
if ($this->determinant) {
if ($this->determinant !== null) {
return $this->determinant;
}
if (!$this->isSquare()) {
throw MatrixException::notSquareMatrix();
throw new MatrixException('Matrix is not square matrix');
}
$lu = new LUDecomposition($this);
@ -143,20 +118,14 @@ class Matrix
return $this->determinant = $lu->det();
}
/**
* @return bool
*/
public function isSquare()
public function isSquare(): bool
{
return $this->columns === $this->rows;
}
/**
* @return Matrix
*/
public function transpose()
public function transpose(): self
{
if ($this->rows == 1) {
if ($this->rows === 1) {
$matrix = array_map(function ($el) {
return [$el];
}, $this->matrix[0]);
@ -167,28 +136,30 @@ class Matrix
return new self($matrix, false);
}
/**
* @param Matrix $matrix
*
* @return Matrix
*
* @throws InvalidArgumentException
*/
public function multiply(Matrix $matrix)
public function multiply(self $matrix): self
{
if ($this->columns != $matrix->getRows()) {
throw InvalidArgumentException::inconsistentMatrixSupplied();
if ($this->columns !== $matrix->getRows()) {
throw new InvalidArgumentException('Inconsistent matrix supplied');
}
$array1 = $this->toArray();
$array2 = $matrix->toArray();
$colCount = $matrix->columns;
/*
- To speed up multiplication, we need to avoid use of the array index operator [ ] as much as possible (see #255 for details)
- A combination of "foreach" and "array_column" works much faster than accessing the array via the index operator
*/
$product = [];
$multiplier = $matrix->toArray();
for ($i = 0; $i < $this->rows; ++$i) {
$columns = $matrix->getColumns();
for ($j = 0; $j < $columns; ++$j) {
$product[$i][$j] = 0;
for ($k = 0; $k < $this->columns; ++$k) {
$product[$i][$j] += $this->matrix[$i][$k] * $multiplier[$k][$j];
foreach ($array1 as $row => $rowData) {
for ($col = 0; $col < $colCount; ++$col) {
$columnData = array_column($array2, $col);
$sum = 0;
foreach ($rowData as $key => $valueData) {
$sum += $valueData * $columnData[$key];
}
$product[$row][$col] = $sum;
}
}
@ -196,11 +167,9 @@ class Matrix
}
/**
* @param $value
*
* @return Matrix
* @param float|int $value
*/
public function divideByScalar($value)
public function divideByScalar($value): self
{
$newMatrix = [];
for ($i = 0; $i < $this->rows; ++$i) {
@ -213,11 +182,9 @@ class Matrix
}
/**
* @param $value
*
* @return Matrix
* @param float|int $value
*/
public function multiplyByScalar($value)
public function multiplyByScalar($value): self
{
$newMatrix = [];
for ($i = 0; $i < $this->rows; ++$i) {
@ -231,37 +198,106 @@ class Matrix
/**
* Element-wise addition of the matrix with another one
*
* @param Matrix $other
*
* @return Matrix
*/
public function add(Matrix $other)
public function add(self $other): self
{
return $this->_add($other);
}
/**
* Element-wise subtracting of another matrix from this one
*
* @param Matrix $other
*
* @return Matrix
*/
public function subtract(Matrix $other)
public function subtract(self $other): self
{
return $this->_add($other, -1);
}
/**
 * Computes the inverse of this matrix.
 *
 * The matrix is LU-decomposed and the system A * X = I is solved, where I is
 * the identity matrix produced by getIdentity(). The solver itself rejects
 * singular matrices with a MatrixException.
 *
 * @throws MatrixException when the matrix is not square
 */
public function inverse(): self
{
    if ($this->isSquare() === false) {
        throw new MatrixException('Matrix is not square matrix');
    }

    $decomposition = new LUDecomposition($this);

    // The solver returns a plain array, so it is wrapped without re-validation.
    return new self($decomposition->solve($this->getIdentity()), false);
}
/**
 * Returns a new matrix with the given row and the given column removed.
 *
 * The remaining elements keep their relative order and are re-indexed from 0.
 * The result is built without dimension validation.
 */
public function crossOut(int $row, int $column): self
{
    $remaining = [];
    $targetRow = 0;

    for ($i = 0; $i < $this->rows; ++$i) {
        if ($i == $row) {
            // Skipped rows do not consume a target row index.
            continue;
        }

        $targetColumn = 0;
        for ($j = 0; $j < $this->columns; ++$j) {
            if ($j == $column) {
                continue;
            }

            $remaining[$targetRow][$targetColumn] = $this->matrix[$i][$j];
            ++$targetColumn;
        }

        ++$targetRow;
    }

    return new self($remaining, false);
}
/**
 * Tells whether this matrix is singular, i.e. whether its determinant
 * compares equal to zero.
 *
 * @throws MatrixException when the matrix is not square (raised by getDeterminant())
 */
public function isSingular(): bool
{
    $determinant = $this->getDeterminant();

    return $determinant == 0;
}
/**
 * Frobenius norm (Hilbert-Schmidt norm, Euclidean norm) (‖A‖F)
 * Square root of the sum of the squares of all elements.
 *
 * https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm
 *
 * ‖A‖F = sqrt( Σᵢ Σⱼ |aᵢⱼ|² ), with i running over the rows and j over the columns
 */
public function frobeniusNorm(): float
{
    $total = 0;

    for ($rowIndex = 0; $rowIndex < $this->rows; ++$rowIndex) {
        for ($columnIndex = 0; $columnIndex < $this->columns; ++$columnIndex) {
            $total += $this->matrix[$rowIndex][$columnIndex] ** 2;
        }
    }

    return $total ** 0.5;
}
/**
 * Transposes a plain array by wrapping it in a Matrix (without dimension
 * validation), transposing that matrix and unwrapping the result.
 */
public static function transposeArray(array $array): array
{
    $matrix = new self($array, false);
    $transposed = $matrix->transpose();

    return $transposed->toArray();
}
/**
 * Returns the dot product of two arrays.
 * Matrix::dot(x, y) ==> x.y'
 *
 * Both arrays are wrapped in matrices without dimension validation; only the
 * first row of the product x * transpose(y) is returned.
 */
public static function dot(array $array1, array $array2): array
{
    $left = new self($array1, false);
    $rightTransposed = (new self($array2, false))->transpose();

    $product = $left->multiply($rightTransposed)->toArray();

    return $product[0];
}
/**
* Element-wise addition or subtraction depending on the given sign parameter
*
* @param Matrix $other
* @param int $sign
*
* @return Matrix
*/
protected function _add(Matrix $other, $sign = 1)
private function _add(self $other, int $sign = 1): self
{
$a1 = $this->toArray();
$a2 = $other->toArray();
@ -276,30 +312,10 @@ class Matrix
return new self($newMatrix, false);
}
/**
* @return Matrix
*
* @throws MatrixException
*/
public function inverse()
{
if (!$this->isSquare()) {
throw MatrixException::notSquareMatrix();
}
$LU = new LUDecomposition($this);
$identity = $this->getIdentity();
$inverse = $LU->solve($identity);
return new self($inverse, false);
}
/**
* Returns diagonal identity matrix of the same size of this matrix
*
* @return Matrix
*/
protected function getIdentity()
private function getIdentity(): self
{
$array = array_fill(0, $this->rows, array_fill(0, $this->columns, 0));
for ($i = 0; $i < $this->rows; ++$i) {
@ -308,67 +324,4 @@ class Matrix
return new self($array, false);
}
/**
* @param int $row
* @param int $column
*
* @return Matrix
*/
public function crossOut(int $row, int $column)
{
$newMatrix = [];
$r = 0;
for ($i = 0; $i < $this->rows; ++$i) {
$c = 0;
if ($row != $i) {
for ($j = 0; $j < $this->columns; ++$j) {
if ($column != $j) {
$newMatrix[$r][$c] = $this->matrix[$i][$j];
++$c;
}
}
++$r;
}
}
return new self($newMatrix, false);
}
/**
* @return bool
*/
public function isSingular() : bool
{
return 0 == $this->getDeterminant();
}
/**
* Returns the transpose of given array
*
* @param array $array
*
* @return array
*/
public static function transposeArray(array $array)
{
return (new self($array, false))->transpose()->toArray();
}
/**
* Returns the dot product of two arrays<br>
* Matrix::dot(x, y) ==> x.y'
*
* @param array $array1
* @param array $array2
*
* @return array
*/
public static function dot(array $array1, array $array2)
{
$m1 = new self($array1, false);
$m2 = new self($array2, false);
return $m1->multiply($m2->transpose())->toArray()[0];
}
}

View file

@ -7,9 +7,6 @@ namespace Phpml\Math;
class Product
{
/**
* @param array $a
* @param array $b
*
* @return mixed
*/
public static function scalar(array $a, array $b)
@ -17,7 +14,7 @@ class Product
$product = 0;
foreach ($a as $index => $value) {
if (is_numeric($value) && is_numeric($b[$index])) {
$product += $value * $b[$index];
$product += (float) $value * (float) $b[$index];
}
}

View file

@ -4,15 +4,18 @@ declare(strict_types=1);
namespace Phpml\Math;
class Set implements \IteratorAggregate
use ArrayIterator;
use IteratorAggregate;
class Set implements IteratorAggregate
{
/**
* @var string[]|int[]|float[]
* @var string[]|int[]|float[]|bool[]
*/
private $elements;
private $elements = [];
/**
* @param string[]|int[]|float[] $elements
* @param string[]|int[]|float[]|bool[] $elements
*/
public function __construct(array $elements = [])
{
@ -21,39 +24,24 @@ class Set implements \IteratorAggregate
/**
* Creates the union of A and B.
*
* @param Set $a
* @param Set $b
*
* @return Set
*/
public static function union(Set $a, Set $b) : Set
public static function union(self $a, self $b): self
{
return new self(array_merge($a->toArray(), $b->toArray()));
}
/**
* Creates the intersection of A and B.
*
* @param Set $a
* @param Set $b
*
* @return Set
*/
public static function intersection(Set $a, Set $b) : Set
public static function intersection(self $a, self $b): self
{
return new self(array_intersect($a->toArray(), $b->toArray()));
}
/**
* Creates the difference of A and B.
*
* @param Set $a
* @param Set $b
*
* @return Set
*/
public static function difference(Set $a, Set $b) : Set
public static function difference(self $a, self $b): self
{
return new self(array_diff($a->toArray(), $b->toArray()));
}
@ -61,12 +49,9 @@ class Set implements \IteratorAggregate
/**
* Creates the Cartesian product of A and B.
*
* @param Set $a
* @param Set $b
*
* @return Set[]
*/
public static function cartesian(Set $a, Set $b) : array
public static function cartesian(self $a, self $b): array
{
$cartesian = [];
@ -82,11 +67,9 @@ class Set implements \IteratorAggregate
/**
* Creates the power set of A.
*
* @param Set $a
*
* @return Set[]
*/
public static function power(Set $a) : array
public static function power(self $a): array
{
$power = [new self()];
@ -100,35 +83,17 @@ class Set implements \IteratorAggregate
}
/**
* Removes duplicates and rewrites index.
*
* @param string[]|int[]|float[] $elements
*
* @return string[]|int[]|float[]
* @param string|int|float|bool $element
*/
private static function sanitize(array $elements) : array
{
sort($elements, SORT_ASC);
return array_values(array_unique($elements, SORT_ASC));
}
/**
* @param string|int|float $element
*
* @return Set
*/
public function add($element) : Set
public function add($element): self
{
return $this->addAll([$element]);
}
/**
* @param string[]|int[]|float[] $elements
*
* @return Set
* @param string[]|int[]|float[]|bool[] $elements
*/
public function addAll(array $elements) : Set
public function addAll(array $elements): self
{
$this->elements = self::sanitize(array_merge($this->elements, $elements));
@ -137,20 +102,16 @@ class Set implements \IteratorAggregate
/**
* @param string|int|float $element
*
* @return Set
*/
public function remove($element) : Set
public function remove($element): self
{
return $this->removeAll([$element]);
}
/**
* @param string[]|int[]|float[] $elements
*
* @return Set
*/
public function removeAll(array $elements) : Set
public function removeAll(array $elements): self
{
$this->elements = self::sanitize(array_diff($this->elements, $elements));
@ -159,53 +120,54 @@ class Set implements \IteratorAggregate
/**
* @param string|int|float $element
*
* @return bool
*/
public function contains($element) : bool
public function contains($element): bool
{
return $this->containsAll([$element]);
}
/**
* @param string[]|int[]|float[] $elements
*
* @return bool
*/
public function containsAll(array $elements) : bool
public function containsAll(array $elements): bool
{
return !array_diff($elements, $this->elements);
return count(array_diff($elements, $this->elements)) === 0;
}
/**
* @return string[]|int[]|float[]
* @return string[]|int[]|float[]|bool[]
*/
public function toArray() : array
public function toArray(): array
{
return $this->elements;
}
/**
* @return \ArrayIterator
*/
public function getIterator() : \ArrayIterator
public function getIterator(): ArrayIterator
{
return new \ArrayIterator($this->elements);
return new ArrayIterator($this->elements);
}
/**
* @return bool
*/
public function isEmpty() : bool
public function isEmpty(): bool
{
return $this->cardinality() == 0;
return $this->cardinality() === 0;
}
/**
* @return int
*/
public function cardinality() : int
public function cardinality(): int
{
return count($this->elements);
}
/**
 * Removes duplicates and rewrites index.
 *
 * The elements are sorted in ascending order with PHP's regular comparison,
 * de-duplicated with the same comparison, and re-keyed as a 0-based list.
 *
 * @param string[]|int[]|float[]|bool[] $elements
 *
 * @return string[]|int[]|float[]|bool[]
 */
private static function sanitize(array $elements): array
{
    // SORT_ASC is an ordering constant for array_multisort(), not a comparison
    // flag. sort() and array_unique() treated it as an unknown flag and fell back
    // to regular comparison, so SORT_REGULAR states the effective behaviour.
    sort($elements, SORT_REGULAR);

    return array_values(array_unique($elements, SORT_REGULAR));
}
}

View file

@ -0,0 +1,137 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
/**
 * Analysis of variance
 * https://en.wikipedia.org/wiki/Analysis_of_variance
 */
final class ANOVA
{
    /**
     * The one-way ANOVA tests the null hypothesis that 2 or more groups have
     * the same population mean. The test is applied to samples from two or
     * more groups, possibly with differing sizes.
     *
     * One F value is computed per feature: the between-group mean square
     * divided by the within-group mean square.
     *
     * @param array[] $samples - each row is class samples
     *
     * @return float[]
     *
     * @throws InvalidArgumentException when fewer than 2 classes are given
     */
    public static function oneWayF(array $samples): array
    {
        $classCount = count($samples);
        if ($classCount < 2) {
            throw new InvalidArgumentException('The array must have at least 2 elements');
        }

        $classSizes = array_map(static function (array $class): int {
            return count($class);
        }, $samples);
        $sampleCount = (int) array_sum($classSizes);

        $totalSquares = self::sumOfSquaresPerFeature($samples);
        $classSums = self::sumOfFeaturesPerClass($samples);
        $squaredGrandSums = self::squareOfSums($classSums);
        $squaredClassSums = self::squareEachSum($classSums);

        $ssbn = self::calculateSsbn($samples, $squaredClassSums, $classSizes, $squaredGrandSums, $sampleCount);
        $sswn = self::calculateSswn($ssbn, $totalSquares, $squaredGrandSums, $sampleCount);

        // Degrees of freedom between and within the groups.
        $dfbn = $classCount - 1;
        $dfwn = $sampleCount - $classCount;

        $meanSquareBetween = [];
        foreach ($ssbn as $index => $value) {
            $meanSquareBetween[$index] = $value / $dfbn;
        }

        $meanSquareWithin = [];
        foreach ($sswn as $index => $value) {
            $meanSquareWithin[$index] = $value / $dfwn;
        }

        $fValues = [];
        foreach ($meanSquareBetween as $index => $between) {
            $fValues[$index] = $between / $meanSquareWithin[$index];
        }

        return $fValues;
    }

    /**
     * Sums the squared feature values over every sample of every class,
     * producing one total per feature index.
     */
    private static function sumOfSquaresPerFeature(array $samples): array
    {
        // The feature count is taken from the first sample of the first class.
        $totals = array_fill(0, count($samples[0][0]), 0);

        foreach ($samples as $class) {
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $totals[$index] += $feature ** 2;
                }
            }
        }

        return $totals;
    }

    /**
     * Sums the feature values of each class separately; the result keeps the
     * class keys and holds one per-feature sum list per class.
     */
    private static function sumOfFeaturesPerClass(array $samples): array
    {
        $sumsPerClass = [];

        foreach ($samples as $classIndex => $class) {
            $featureSums = array_fill(0, count($class[0]), 0);
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $featureSums[$index] += $feature;
                }
            }

            $sumsPerClass[$classIndex] = $featureSums;
        }

        return $sumsPerClass;
    }

    /**
     * Adds the per-class sums together per feature and squares each total,
     * i.e. returns the square of the overall sum for every feature.
     */
    private static function squareOfSums(array $sums): array
    {
        $totals = array_fill(0, count($sums[0]), 0);

        foreach ($sums as $row) {
            foreach ($row as $index => $sum) {
                $totals[$index] += $sum;
            }
        }

        foreach ($totals as $index => $total) {
            $totals[$index] = $total ** 2;
        }

        return $totals;
    }

    /**
     * Squares every individual per-class, per-feature sum, keeping the layout
     * of the input.
     */
    private static function squareEachSum(array $sums): array
    {
        $squared = [];

        foreach ($sums as $classIndex => $row) {
            $squared[$classIndex] = [];
            foreach ($row as $index => $sum) {
                $squared[$classIndex][$index] = $sum ** 2;
            }
        }

        return $squared;
    }

    /**
     * Sum of squares between groups, per feature: the squared class sums
     * divided by their class sizes, minus the squared overall sum divided by
     * the total number of samples.
     */
    private static function calculateSsbn(array $samples, array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array
    {
        $between = array_fill(0, count($samples[0][0]), 0);

        foreach ($sumSamplesSquare as $classIndex => $classSquares) {
            $classSize = $samplesPerClass[$classIndex];
            foreach ($classSquares as $index => $square) {
                $between[$index] += $square / $classSize;
            }
        }

        foreach ($squareSumSamples as $index => $squaredTotal) {
            $between[$index] -= $squaredTotal / $allSamples;
        }

        return $between;
    }

    /**
     * Sum of squares within groups, per feature: the total sum of squares
     * minus the between-group sum of squares.
     */
    private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array
    {
        $within = [];

        foreach ($ssAllSamples as $index => $squares) {
            $totalSumOfSquares = $squares - $squareSumSamples[$index] / $allSamples;
            $within[$index] = $totalSumOfSquares - $ssbn[$index];
        }

        return $within;
    }
}

View file

@ -9,17 +9,15 @@ use Phpml\Exception\InvalidArgumentException;
class Correlation
{
/**
* @param array|int[]|float[] $x
* @param array|int[]|float[] $y
*
* @return float
* @param int[]|float[] $x
* @param int[]|float[] $y
*
* @throws InvalidArgumentException
*/
public static function pearson(array $x, array $y)
public static function pearson(array $x, array $y): float
{
if (count($x) !== count($y)) {
throw InvalidArgumentException::arraySizeNotMatch();
throw new InvalidArgumentException('Size of given arrays does not match');
}
$count = count($x);
@ -34,12 +32,10 @@ class Correlation
$a = $x[$i] - $meanX;
$b = $y[$i] - $meanY;
$axb += ($a * $b);
$a2 += pow($a, 2);
$b2 += pow($b, 2);
$a2 += $a ** 2;
$b2 += $b ** 2;
}
$corr = $axb / sqrt((float) ($a2 * $b2));
return $corr;
return $axb / ($a2 * $b2) ** .5;
}
}

View file

@ -11,25 +11,17 @@ class Covariance
/**
* Calculates covariance from two given arrays, x and y, respectively
*
* @param array $x
* @param array $y
* @param bool $sample
* @param float $meanX
* @param float $meanY
*
* @return float
*
* @throws InvalidArgumentException
*/
public static function fromXYArrays(array $x, array $y, $sample = true, float $meanX = null, float $meanY = null)
public static function fromXYArrays(array $x, array $y, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float
{
if (empty($x) || empty($y)) {
throw InvalidArgumentException::arrayCantBeEmpty();
$n = count($x);
if ($n === 0 || count($y) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
$n = count($x);
if ($sample && $n === 1) {
throw InvalidArgumentException::arraySizeToSmall(2);
throw new InvalidArgumentException('The array must have at least 2 elements');
}
if ($meanX === null) {
@ -56,31 +48,22 @@ class Covariance
/**
* Calculates covariance of two dimensions, i and k in the given data.
*
* @param array $data
* @param int $i
* @param int $k
* @param bool $sample
* @param float $meanX
* @param float $meanY
*
* @return float
*
* @throws InvalidArgumentException
* @throws \Exception
*/
public static function fromDataset(array $data, int $i, int $k, bool $sample = true, float $meanX = null, float $meanY = null)
public static function fromDataset(array $data, int $i, int $k, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float
{
if (empty($data)) {
throw InvalidArgumentException::arrayCantBeEmpty();
if (count($data) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
$n = count($data);
if ($sample && $n === 1) {
throw InvalidArgumentException::arraySizeToSmall(2);
throw new InvalidArgumentException('The array must have at least 2 elements');
}
if ($i < 0 || $k < 0 || $i >= $n || $k >= $n) {
throw new \Exception("Given indices i and k do not match with the dimensionality of data");
throw new InvalidArgumentException('Given indices i and k do not match with the dimensionality of data');
}
if ($meanX === null || $meanY === null) {
@ -104,15 +87,17 @@ class Covariance
// with a slight cost of CPU utilization.
$sum = 0.0;
foreach ($data as $row) {
$val = [];
$val = [0, 0];
foreach ($row as $index => $col) {
if ($index == $i) {
$val[0] = $col - $meanX;
}
if ($index == $k) {
$val[1] = $col - $meanY;
}
}
$sum += $val[0] * $val[1];
}
}
@ -127,12 +112,9 @@ class Covariance
/**
* Returns the covariance matrix of n-dimensional data
*
* @param array $data
* @param array|null $means
*
* @return array
*/
public static function covarianceMatrix(array $data, array $means = null)
public static function covarianceMatrix(array $data, ?array $means = null): array
{
$n = count($data[0]);
@ -150,7 +132,12 @@ class Covariance
$cov[$i][$k] = $cov[$k][$i];
} else {
$cov[$i][$k] = self::fromDataset(
$data, $i, $k, true, $means[$i], $means[$k]
$data,
$i,
$k,
true,
$means[$i],
$means[$k]
);
}
}

View file

@ -16,10 +16,6 @@ class Gaussian
*/
protected $std;
/**
* @param float $mean
* @param float $std
*/
public function __construct(float $mean, float $std)
{
$this->mean = $mean;
@ -29,8 +25,6 @@ class Gaussian
/**
* Returns probability density of the given <i>$value</i>
*
* @param float $value
*
* @return float|int
*/
public function pdf(float $value)
@ -39,22 +33,18 @@ class Gaussian
// Ref: https://en.wikipedia.org/wiki/Normal_distribution
$std2 = $this->std ** 2;
$mean = $this->mean;
return exp(- (($value - $mean) ** 2) / (2 * $std2)) / sqrt(2 * $std2 * pi());
return exp(-(($value - $mean) ** 2) / (2 * $std2)) / ((2 * $std2 * M_PI) ** .5);
}
/**
* Returns probability density value of the given <i>$value</i> based on
* given standard deviation and the mean
*
* @param float $mean
* @param float $std
* @param float $value
*
* @return float
*/
public static function distributionPdf(float $mean, float $std, float $value)
public static function distributionPdf(float $mean, float $std, float $value): float
{
$normal = new self($mean, $std);
return $normal->pdf($value);
}
}

View file

@ -9,13 +9,9 @@ use Phpml\Exception\InvalidArgumentException;
class Mean
{
/**
* @param array $numbers
*
* @return float
*
* @throws InvalidArgumentException
*/
public static function arithmetic(array $numbers)
public static function arithmetic(array $numbers): float
{
self::checkArrayLength($numbers);
@ -23,8 +19,6 @@ class Mean
}
/**
* @param array $numbers
*
* @return float|mixed
*
* @throws InvalidArgumentException
@ -34,11 +28,11 @@ class Mean
self::checkArrayLength($numbers);
$count = count($numbers);
$middleIndex = (int)floor($count / 2);
$middleIndex = (int) floor($count / 2);
sort($numbers, SORT_NUMERIC);
$median = $numbers[$middleIndex];
if (0 === $count % 2) {
if ($count % 2 === 0) {
$median = ($median + $numbers[$middleIndex - 1]) / 2;
}
@ -46,8 +40,6 @@ class Mean
}
/**
* @param array $numbers
*
* @return mixed
*
* @throws InvalidArgumentException
@ -58,18 +50,16 @@ class Mean
$values = array_count_values($numbers);
return array_search(max($values), $values);
return array_search(max($values), $values, true);
}
/**
* @param array $array
*
* @throws InvalidArgumentException
*/
private static function checkArrayLength(array $array)
private static function checkArrayLength(array $array): void
{
if (empty($array)) {
throw InvalidArgumentException::arrayCantBeEmpty();
if (count($array) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
}
}

View file

@ -9,36 +9,51 @@ use Phpml\Exception\InvalidArgumentException;
class StandardDeviation
{
/**
* @param array|float[] $a
* @param bool $sample
*
* @return float
*
* @throws InvalidArgumentException
* @param float[]|int[] $numbers
*/
public static function population(array $a, $sample = true)
public static function population(array $numbers, bool $sample = true): float
{
if (empty($a)) {
throw InvalidArgumentException::arrayCantBeEmpty();
$n = count($numbers);
if ($n === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
$n = count($a);
if ($sample && $n === 1) {
throw InvalidArgumentException::arraySizeToSmall(2);
throw new InvalidArgumentException('The array must have at least 2 elements');
}
$mean = Mean::arithmetic($a);
$mean = Mean::arithmetic($numbers);
$carry = 0.0;
foreach ($a as $val) {
$d = $val - $mean;
$carry += $d * $d;
foreach ($numbers as $val) {
$carry += ($val - $mean) ** 2;
}
if ($sample) {
--$n;
}
return sqrt((float) ($carry / $n));
return ($carry / $n) ** .5;
}
/**
 * Sum of squares deviations
 * ∑⟮xᵢ - μ⟯²
 *
 * Each value's deviation from the arithmetic mean is squared and the
 * squares are added up in input order.
 *
 * @param float[]|int[] $numbers
 *
 * @throws InvalidArgumentException when the array is empty
 */
public static function sumOfSquares(array $numbers): float
{
    if (count($numbers) === 0) {
        throw new InvalidArgumentException('The array has zero elements');
    }

    $mean = Mean::arithmetic($numbers);

    $squaredDeviations = [];
    foreach ($numbers as $key => $val) {
        $squaredDeviations[$key] = ($val - $mean) ** 2;
    }

    return array_sum($squaredDeviations);
}
}

View file

@ -0,0 +1,27 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
/**
 * In probability theory and statistics, variance is the expectation of the squared deviation of a random variable from its mean.
 * Informally, it measures how far a set of (random) numbers are spread out from their average value
 * https://en.wikipedia.org/wiki/Variance
 */
final class Variance
{
    /**
     * Population variance
     * Use when all possible observations of the system are present.
     * If used with a subset of data (sample variance), it will be a biased variance.
     *
     *      ∑⟮xᵢ - μ⟯²
     * σ² = ----------
     *          N
     *
     * The sum of squared deviations comes from StandardDeviation::sumOfSquares(),
     * which rejects an empty array before the division by N takes place.
     */
    public static function population(array $population): float
    {
        $sumOfSquaredDeviations = StandardDeviation::sumOfSquares($population);

        return $sumOfSquaredDeviations / count($population);
    }
}

View file

@ -9,10 +9,6 @@ use Phpml\Exception\InvalidArgumentException;
class Accuracy
{
/**
* @param array $actualLabels
* @param array $predictedLabels
* @param bool $normalize
*
* @return float|int
*
* @throws InvalidArgumentException
@ -20,7 +16,7 @@ class Accuracy
public static function score(array $actualLabels, array $predictedLabels, bool $normalize = true)
{
if (count($actualLabels) != count($predictedLabels)) {
throw InvalidArgumentException::arraySizeNotMatch();
throw new InvalidArgumentException('Size of given arrays does not match');
}
$score = 0;

View file

@ -4,8 +4,36 @@ declare(strict_types=1);
namespace Phpml\Metric;
use Phpml\Exception\InvalidArgumentException;
class ClassificationReport
{
public const MICRO_AVERAGE = 1;
public const MACRO_AVERAGE = 2;
public const WEIGHTED_AVERAGE = 3;
/**
* @var array
*/
private $truePositive = [];
/**
* @var array
*/
private $falsePositive = [];
/**
* @var array
*/
private $falseNegative = [];
/**
* @var array
*/
private $support = [];
/**
* @var array
*/
@ -21,27 +49,55 @@ class ClassificationReport
*/
private $f1score = [];
/**
* @var array
*/
private $support = [];
/**
* @var array
*/
private $average = [];
/**
* @param array $actualLabels
* @param array $predictedLabels
*/
public function __construct(array $actualLabels, array $predictedLabels)
public function __construct(array $actualLabels, array $predictedLabels, int $average = self::MACRO_AVERAGE)
{
$truePositive = $falsePositive = $falseNegative = $this->support = self::getLabelIndexedArray($actualLabels, $predictedLabels);
$averagingMethods = range(self::MICRO_AVERAGE, self::WEIGHTED_AVERAGE);
if (!in_array($average, $averagingMethods, true)) {
throw new InvalidArgumentException('Averaging method must be MICRO_AVERAGE, MACRO_AVERAGE or WEIGHTED_AVERAGE');
}
$this->aggregateClassificationResults($actualLabels, $predictedLabels);
$this->computeMetrics();
$this->computeAverage($average);
}
/**
 * Returns the precision computed for each label, keyed by label.
 */
public function getPrecision(): array
{
return $this->precision;
}
/**
 * Returns the recall computed for each label, keyed by label.
 */
public function getRecall(): array
{
return $this->recall;
}
/**
 * Returns the F1 score computed for each label, keyed by label.
 */
public function getF1score(): array
{
return $this->f1score;
}
/**
 * Returns the support of each label, keyed by label: the number of entries
 * in the actual labels that carry that label.
 */
public function getSupport(): array
{
return $this->support;
}
/**
 * Returns the averaged metrics under the keys 'precision', 'recall' and
 * 'f1score', computed with the averaging method passed to the constructor.
 */
public function getAverage(): array
{
return $this->average;
}
private function aggregateClassificationResults(array $actualLabels, array $predictedLabels): void
{
$truePositive = $falsePositive = $falseNegative = $support = self::getLabelIndexedArray($actualLabels, $predictedLabels);
foreach ($actualLabels as $index => $actual) {
$predicted = $predictedLabels[$index];
++$this->support[$actual];
++$support[$actual];
if ($actual === $predicted) {
++$truePositive[$actual];
@ -51,85 +107,92 @@ class ClassificationReport
}
}
$this->computeMetrics($truePositive, $falsePositive, $falseNegative);
$this->computeAverage();
$this->truePositive = $truePositive;
$this->falsePositive = $falsePositive;
$this->falseNegative = $falseNegative;
$this->support = $support;
}
/**
* @return array
*/
public function getPrecision()
private function computeMetrics(): void
{
return $this->precision;
}
/**
* @return array
*/
public function getRecall()
{
return $this->recall;
}
/**
* @return array
*/
public function getF1score()
{
return $this->f1score;
}
/**
* @return array
*/
public function getSupport()
{
return $this->support;
}
/**
* @return array
*/
public function getAverage()
{
return $this->average;
}
/**
* @param array $truePositive
* @param array $falsePositive
* @param array $falseNegative
*/
private function computeMetrics(array $truePositive, array $falsePositive, array $falseNegative)
{
foreach ($truePositive as $label => $tp) {
$this->precision[$label] = $this->computePrecision($tp, $falsePositive[$label]);
$this->recall[$label] = $this->computeRecall($tp, $falseNegative[$label]);
foreach ($this->truePositive as $label => $tp) {
$this->precision[$label] = $this->computePrecision($tp, $this->falsePositive[$label]);
$this->recall[$label] = $this->computeRecall($tp, $this->falseNegative[$label]);
$this->f1score[$label] = $this->computeF1Score((float) $this->precision[$label], (float) $this->recall[$label]);
}
}
private function computeAverage()
private function computeAverage(int $average): void
{
switch ($average) {
case self::MICRO_AVERAGE:
$this->computeMicroAverage();
return;
case self::MACRO_AVERAGE:
$this->computeMacroAverage();
return;
case self::WEIGHTED_AVERAGE:
$this->computeWeightedAverage();
return;
}
}
/**
 * Micro average: the true positive, false positive and false negative counts
 * are summed over all labels first, and precision, recall and F1 score are
 * then computed once from those totals.
 */
private function computeMicroAverage(): void
{
    $totalTruePositive = (int) array_sum($this->truePositive);
    $totalFalsePositive = (int) array_sum($this->falsePositive);
    $totalFalseNegative = (int) array_sum($this->falseNegative);

    $microPrecision = $this->computePrecision($totalTruePositive, $totalFalsePositive);
    $microRecall = $this->computeRecall($totalTruePositive, $totalFalseNegative);

    $this->average = [
        'precision' => $microPrecision,
        'recall' => $microRecall,
        'f1score' => $this->computeF1Score((float) $microPrecision, (float) $microRecall),
    ];
}
private function computeMacroAverage(): void
{
foreach (['precision', 'recall', 'f1score'] as $metric) {
$values = array_filter($this->{$metric});
if (empty($values)) {
$values = $this->{$metric};
if (count($values) == 0) {
$this->average[$metric] = 0.0;
continue;
}
$this->average[$metric] = array_sum($values) / count($values);
}
}
/**
 * Computes the support-weighted average of precision, recall and f1score.
 *
 * Every label's metric value is multiplied by that label's support and the
 * total is divided by the summed support. A metric without any values, or a
 * summed support of zero, results in 0.0 so that the division can never be
 * performed with a zero divisor.
 */
private function computeWeightedAverage(): void
{
    // The divisor is identical for all three metrics, so it is computed once.
    $totalSupport = array_sum($this->support);

    foreach (['precision', 'recall', 'f1score'] as $metric) {
        $values = $this->{$metric};
        if (count($values) === 0 || $totalSupport == 0) {
            $this->average[$metric] = 0.0;

            continue;
        }

        $weightedSum = 0.0;
        foreach ($values as $label => $value) {
            $weightedSum += $value * $this->support[$label];
        }

        $this->average[$metric] = $weightedSum / $totalSupport;
    }
}
/**
* @param int $truePositive
* @param int $falsePositive
*
* @return float|string
*/
private function computePrecision(int $truePositive, int $falsePositive)
{
if (0 == ($divider = $truePositive + $falsePositive)) {
$divider = $truePositive + $falsePositive;
if ($divider == 0) {
return 0.0;
}
@ -137,47 +200,33 @@ class ClassificationReport
}
/**
* @param int $truePositive
* @param int $falseNegative
*
* @return float|string
*/
private function computeRecall(int $truePositive, int $falseNegative)
{
if (0 == ($divider = $truePositive + $falseNegative)) {
$divider = $truePositive + $falseNegative;
if ($divider == 0) {
return 0.0;
}
return $truePositive / $divider;
}
/**
* @param float $precision
* @param float $recall
*
* @return float
*/
private function computeF1Score(float $precision, float $recall): float
{
if (0 == ($divider = $precision + $recall)) {
$divider = $precision + $recall;
if ($divider == 0) {
return 0.0;
}
return 2.0 * (($precision * $recall) / $divider);
}
/**
* @param array $actualLabels
* @param array $predictedLabels
*
* @return array
*/
private static function getLabelIndexedArray(array $actualLabels, array $predictedLabels): array
{
$labels = array_values(array_unique(array_merge($actualLabels, $predictedLabels)));
sort($labels);
$labels = array_combine($labels, array_fill(0, count($labels), 0));
return $labels;
return (array) array_combine($labels, array_fill(0, count($labels), 0));
}
}

View file

@ -6,22 +6,15 @@ namespace Phpml\Metric;
class ConfusionMatrix
{
/**
* @param array $actualLabels
* @param array $predictedLabels
* @param array $labels
*
* @return array
*/
public static function compute(array $actualLabels, array $predictedLabels, array $labels = null): array
public static function compute(array $actualLabels, array $predictedLabels, array $labels = []): array
{
$labels = $labels ? array_flip($labels) : self::getUniqueLabels($actualLabels);
$labels = count($labels) === 0 ? self::getUniqueLabels($actualLabels) : array_flip($labels);
$matrix = self::generateMatrixWithZeros($labels);
foreach ($actualLabels as $index => $actual) {
$predicted = $predictedLabels[$index];
if (!isset($labels[$actual]) || !isset($labels[$predicted])) {
if (!isset($labels[$actual], $labels[$predicted])) {
continue;
}
@ -32,17 +25,12 @@ class ConfusionMatrix
$column = $labels[$predicted];
}
$matrix[$row][$column] += 1;
++$matrix[$row][$column];
}
return $matrix;
}
/**
* @param array $labels
*
* @return array
*/
private static function generateMatrixWithZeros(array $labels): array
{
$count = count($labels);
@ -55,17 +43,11 @@ class ConfusionMatrix
return $matrix;
}
/**
* @param array $labels
*
* @return array
*/
private static function getUniqueLabels(array $labels): array
{
$labels = array_values(array_unique($labels));
sort($labels);
$labels = array_flip($labels);
return $labels;
return array_flip($labels);
}
}

View file

@ -4,52 +4,37 @@ declare(strict_types=1);
namespace Phpml;
use Phpml\Exception\SerializeException;
use Phpml\Exception\FileException;
use Phpml\Exception\SerializeException;
class ModelManager
{
/**
* @param Estimator $estimator
* @param string $filepath
*
* @throws FileException
* @throws SerializeException
*/
public function saveToFile(Estimator $estimator, string $filepath)
public function saveToFile(Estimator $estimator, string $filepath): void
{
if (!is_writable(dirname($filepath))) {
throw FileException::cantSaveFile(basename($filepath));
throw new FileException(sprintf('File "%s" can\'t be saved.', basename($filepath)));
}
$serialized = serialize($estimator);
if (empty($serialized)) {
throw SerializeException::cantSerialize(gettype($estimator));
if (!isset($serialized[0])) {
throw new SerializeException(sprintf('Class "%s" can not be serialized.', gettype($estimator)));
}
$result = file_put_contents($filepath, $serialized, LOCK_EX);
if ($result === false) {
throw FileException::cantSaveFile(basename($filepath));
throw new FileException(sprintf('File "%s" can\'t be saved.', basename($filepath)));
}
}
/**
* @param string $filepath
*
* @return Estimator
*
* @throws FileException
* @throws SerializeException
*/
public function restoreFromFile(string $filepath) : Estimator
public function restoreFromFile(string $filepath): Estimator
{
if (!file_exists($filepath) || !is_readable($filepath)) {
throw FileException::cantOpenFile(basename($filepath));
throw new FileException(sprintf('File "%s" can\'t be open.', basename($filepath)));
}
$object = unserialize(file_get_contents($filepath));
$object = unserialize((string) file_get_contents($filepath), [Estimator::class]);
if ($object === false) {
throw SerializeException::cantUnserialize(basename($filepath));
throw new SerializeException(sprintf('"%s" can not be unserialized.', basename($filepath)));
}
return $object;

View file

@ -8,8 +8,12 @@ interface ActivationFunction
{
/**
* @param float|int $value
*
* @return float
*/
public function compute($value): float;
/**
* Returns the slope of the activation function for the given input.
*
* NOTE(review): $computedvalue looks like the result compute() produced for
* the same $value, which lets implementations reuse it - confirm against callers.
*
* @param float|int $value
* @param float|int $computedvalue
*/
public function differentiate($value, $computedvalue): float;
}

View file

@ -10,11 +10,22 @@ class BinaryStep implements ActivationFunction
{
/**
* @param float|int $value
*
* @return float
*/
public function compute($value): float
{
return $value >= 0 ? 1.0 : 0.0;
}
/**
 * Reports 1.0 only when the input is exactly zero (as integer or as float);
 * every other input yields 0.0.
 *
 * @param float|int $value
 * @param float|int $computedvalue not read by this implementation
 */
public function differentiate($value, $computedvalue): float
{
    $isExactlyZero = $value === 0 || $value === 0.0;

    return $isExactlyZero ? 1.0 : 0.0;
}
}

View file

@ -10,11 +10,18 @@ class Gaussian implements ActivationFunction
{
/**
* @param float|int $value
*
* @return float
*/
public function compute($value): float
{
return exp(-pow($value, 2));
return exp(- $value ** 2);
}
/**
 * Multiplies the input by -2 and by the previously calculated value.
 *
 * When $calculatedvalue is what compute() returned for $value, i.e.
 * exp(-$value ** 2), the product is the derivative of that expression.
 *
 * @param float|int $value
 * @param float|int $calculatedvalue
 */
public function differentiate($value, $calculatedvalue): float
{
    $scaledInput = -2 * $value;

    return $scaledInput * $calculatedvalue;
}
}

View file

@ -13,21 +13,25 @@ class HyperbolicTangent implements ActivationFunction
*/
private $beta;
/**
* @param float $beta
*/
public function __construct($beta = 1.0)
public function __construct(float $beta = 1.0)
{
$this->beta = $beta;
}
/**
* @param float|int $value
*
* @return float
*/
public function compute($value): float
{
return tanh($this->beta * $value);
}
/**
 * Derivative of tanh(beta * x) with respect to x, expressed through the
 * already computed output.
 *
 * d/dx tanh(beta * x) = beta * (1 - tanh(beta * x) ** 2), so the slope
 * factor beta must be part of the result; leaving it out is only correct
 * for beta = 1.
 *
 * @param float|int $value not read; the output carries all information needed
 * @param float|int $computedvalue expected to be the compute() result for $value
 */
public function differentiate($value, $computedvalue): float
{
    return $this->beta * (1 - $computedvalue ** 2);
}
}

View file

@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace Phpml\NeuralNetwork\ActivationFunction;
use Phpml\NeuralNetwork\ActivationFunction;
/**
 * Parametric rectifier: passes non-negative inputs through unchanged and
 * scales negative inputs by the factor beta.
 */
class PReLU implements ActivationFunction
{
    /**
     * Factor applied to negative inputs.
     *
     * @var float
     */
    private $beta;

    public function __construct(float $beta = 0.01)
    {
        $this->beta = $beta;
    }

    /**
     * @param float|int $value
     */
    public function compute($value): float
    {
        return $value >= 0 ? $value : $this->beta * $value;
    }

    /**
     * Slope of the function at $value: 1.0 on the non-negative branch, beta otherwise.
     *
     * The branch is chosen from the input rather than from the computed
     * output, because the output does not always keep the sign of the input:
     * with beta = 0 a negative input produces -0.0 (which compares >= 0) and
     * with a negative beta it produces a positive number.
     *
     * @param float|int $value
     * @param float|int $computedvalue not read; kept for interface compatibility
     */
    public function differentiate($value, $computedvalue): float
    {
        return $value >= 0 ? 1.0 : $this->beta;
    }
}

View file

@ -13,21 +13,25 @@ class Sigmoid implements ActivationFunction
*/
private $beta;
/**
* @param float $beta
*/
public function __construct($beta = 1.0)
public function __construct(float $beta = 1.0)
{
$this->beta = $beta;
}
/**
* @param float|int $value
*
* @return float
*/
public function compute($value): float
{
return 1 / (1 + exp(-$this->beta * $value));
}
/**
 * Derivative of 1 / (1 + exp(-beta * x)) with respect to x, expressed
 * through the already computed output s.
 *
 * The derivative is beta * s * (1 - s); leaving out the slope factor beta is
 * only correct for beta = 1.
 *
 * @param float|int $value not read; the output carries all information needed
 * @param float|int $computedvalue expected to be the compute() result for $value
 */
public function differentiate($value, $computedvalue): float
{
    return $this->beta * $computedvalue * (1 - $computedvalue);
}
}

View file

@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace Phpml\NeuralNetwork\ActivationFunction;
use Phpml\NeuralNetwork\ActivationFunction;
/**
 * Rectifier with a threshold: inputs strictly above theta pass through
 * unchanged, all other inputs are clamped to 0.0.
 */
class ThresholdedReLU implements ActivationFunction
{
    /**
     * Threshold an input has to exceed in order to pass through.
     *
     * @var float
     */
    private $theta;

    public function __construct(float $theta = 0.0)
    {
        $this->theta = $theta;
    }

    /**
     * @param float|int $value
     */
    public function compute($value): float
    {
        return $value > $this->theta ? $value : 0.0;
    }

    /**
     * Slope of the function at $value: 1.0 where the input passes through, 0.0 where it is clamped.
     *
     * The decision uses the same "input > theta" condition as compute().
     * Comparing the computed output against theta instead misreports every
     * clamped input as active whenever theta <= 0, because the clamped
     * output 0.0 then satisfies "output >= theta".
     *
     * @param float|int $value
     * @param float|int $calculatedvalue not read; kept for interface compatibility
     */
    public function differentiate($value, $calculatedvalue): float
    {
        return $value > $this->theta ? 1.0 : 0.0;
    }
}

View file

@ -15,16 +15,12 @@ class Layer
private $nodes = [];
/**
* @param int $nodesNumber
* @param string $nodeClass
* @param ActivationFunction|null $activationFunction
*
* @throws InvalidArgumentException
*/
public function __construct(int $nodesNumber = 0, string $nodeClass = Neuron::class, ActivationFunction $activationFunction = null)
public function __construct(int $nodesNumber = 0, string $nodeClass = Neuron::class, ?ActivationFunction $activationFunction = null)
{
if (!in_array(Node::class, class_implements($nodeClass))) {
throw InvalidArgumentException::invalidLayerNodeClass();
if (!in_array(Node::class, class_implements($nodeClass), true)) {
throw new InvalidArgumentException('Layer node class must implement Node interface');
}
for ($i = 0; $i < $nodesNumber; ++$i) {
@ -32,25 +28,7 @@ class Layer
}
}
/**
* @param string $nodeClass
* @param ActivationFunction|null $activationFunction
*
* @return Neuron
*/
private function createNode(string $nodeClass, ActivationFunction $activationFunction = null)
{
if (Neuron::class == $nodeClass) {
return new Neuron($activationFunction);
}
return new $nodeClass();
}
/**
* @param Node $node
*/
public function addNode(Node $node)
public function addNode(Node $node): void
{
$this->nodes[] = $node;
}
@ -58,8 +36,17 @@ class Layer
/**
* @return Node[]
*/
public function getNodes()
public function getNodes(): array
{
return $this->nodes;
}
/**
 * Instantiates one node of the requested class.
 *
 * Only Neuron receives the activation function; any other node class is
 * constructed without arguments.
 */
private function createNode(string $nodeClass, ?ActivationFunction $activationFunction = null): Node
{
    $isNeuron = $nodeClass === Neuron::class;

    return $isNeuron ? new Neuron($activationFunction) : new $nodeClass();
}
}

View file

@ -8,20 +8,12 @@ interface Network
{
/**
* @param mixed $input
*
* @return self
*/
public function setInput($input);
public function setInput($input): self;
/**
* @return array
*/
public function getOutput(): array;
/**
* @param Layer $layer
*/
public function addLayer(Layer $layer);
public function addLayer(Layer $layer): void;
/**
* @return Layer[]

View file

@ -14,12 +14,9 @@ abstract class LayeredNetwork implements Network
/**
* @var Layer[]
*/
protected $layers;
protected $layers = [];
/**
* @param Layer $layer
*/
public function addLayer(Layer $layer)
public function addLayer(Layer $layer): void
{
$this->layers[] = $layer;
}
@ -32,25 +29,16 @@ abstract class LayeredNetwork implements Network
return $this->layers;
}
/**
* @return void
*/
public function removeLayers()
/**
 * Drops all layers from the network.
 *
 * The property is reset to an empty array instead of being unset: unsetting
 * removes the declared property itself, so any later read of it (for
 * example the count() in getOutputLayer()) would hit an undefined property.
 */
public function removeLayers(): void
{
    $this->layers = [];
}
/**
* @return Layer
*/
public function getOutputLayer(): Layer
{
return $this->layers[count($this->layers) - 1];
}
/**
* @return array
*/
public function getOutput(): array
{
$result = [];
@ -63,10 +51,8 @@ abstract class LayeredNetwork implements Network
/**
* @param mixed $input
*
* @return $this
*/
public function setInput($input)
public function setInput($input): Network
{
$firstLayer = $this->layers[0];

Some files were not shown because too many files have changed in this diff Show more