mirror of
https://github.com/moodle/moodle.git
synced 2025-08-06 01:16:44 +02:00
MDL-65769 lib: update PHP-ML to 0.8.0
This commit is contained in:
parent
f7e108438f
commit
e6c25fb057
126 changed files with 3639 additions and 3753 deletions
|
@ -1,6 +1,6 @@
|
|||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 Arkadiusz Kondas <arkadiusz.kondas[at]gmail>
|
||||
Copyright (c) 2016-2018 Arkadiusz Kondas <arkadiusz.kondas[at]gmail>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
Current version is 12b8b11
|
||||
Current version is 0.8.0
|
||||
|
||||
# Download latest stable version from https://github.com/php-ai/php-ml
|
||||
# Remove all files but:
|
||||
|
|
|
@ -9,15 +9,16 @@ use Phpml\Helper\Trainable;
|
|||
|
||||
class Apriori implements Associator
|
||||
{
|
||||
use Trainable, Predictable;
|
||||
use Trainable;
|
||||
use Predictable;
|
||||
|
||||
const ARRAY_KEY_ANTECEDENT = 'antecedent';
|
||||
public const ARRAY_KEY_ANTECEDENT = 'antecedent';
|
||||
|
||||
const ARRAY_KEY_CONFIDENCE = 'confidence';
|
||||
public const ARRAY_KEY_CONFIDENCE = 'confidence';
|
||||
|
||||
const ARRAY_KEY_CONSEQUENT = 'consequent';
|
||||
public const ARRAY_KEY_CONSEQUENT = 'consequent';
|
||||
|
||||
const ARRAY_KEY_SUPPORT = 'support';
|
||||
public const ARRAY_KEY_SUPPORT = 'support';
|
||||
|
||||
/**
|
||||
* Minimum relative probability of frequent transactions.
|
||||
|
@ -31,7 +32,7 @@ class Apriori implements Associator
|
|||
*
|
||||
* @var mixed[][][]
|
||||
*/
|
||||
private $large;
|
||||
private $large = [];
|
||||
|
||||
/**
|
||||
* Minimum relative frequency of transactions.
|
||||
|
@ -45,13 +46,10 @@ class Apriori implements Associator
|
|||
*
|
||||
* @var mixed[][]
|
||||
*/
|
||||
private $rules;
|
||||
private $rules = [];
|
||||
|
||||
/**
|
||||
* Apriori constructor.
|
||||
*
|
||||
* @param float $support
|
||||
* @param float $confidence
|
||||
*/
|
||||
public function __construct(float $support = 0.0, float $confidence = 0.0)
|
||||
{
|
||||
|
@ -64,13 +62,13 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][]
|
||||
*/
|
||||
public function getRules() : array
|
||||
public function getRules(): array
|
||||
{
|
||||
if (!$this->large) {
|
||||
if (count($this->large) === 0) {
|
||||
$this->large = $this->apriori();
|
||||
}
|
||||
|
||||
if ($this->rules) {
|
||||
if (count($this->rules) > 0) {
|
||||
return $this->rules;
|
||||
}
|
||||
|
||||
|
@ -86,15 +84,14 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][][]
|
||||
*/
|
||||
public function apriori() : array
|
||||
public function apriori(): array
|
||||
{
|
||||
$L = [];
|
||||
$L[1] = $this->items();
|
||||
$L[1] = $this->frequent($L[1]);
|
||||
|
||||
for ($k = 2; !empty($L[$k - 1]); ++$k) {
|
||||
$L[$k] = $this->candidates($L[$k - 1]);
|
||||
$L[$k] = $this->frequent($L[$k]);
|
||||
$items = $this->frequent($this->items());
|
||||
for ($k = 1; isset($items[0]); ++$k) {
|
||||
$L[$k] = $items;
|
||||
$items = $this->frequent($this->candidates($items));
|
||||
}
|
||||
|
||||
return $L;
|
||||
|
@ -105,7 +102,7 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][]
|
||||
*/
|
||||
protected function predictSample(array $sample) : array
|
||||
protected function predictSample(array $sample): array
|
||||
{
|
||||
$predicts = array_values(array_filter($this->getRules(), function ($rule) use ($sample) {
|
||||
return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample);
|
||||
|
@ -119,9 +116,9 @@ class Apriori implements Associator
|
|||
/**
|
||||
* Generate rules for each k-length frequent item set.
|
||||
*/
|
||||
private function generateAllRules()
|
||||
private function generateAllRules(): void
|
||||
{
|
||||
for ($k = 2; !empty($this->large[$k]); ++$k) {
|
||||
for ($k = 2; isset($this->large[$k]); ++$k) {
|
||||
foreach ($this->large[$k] as $frequent) {
|
||||
$this->generateRules($frequent);
|
||||
}
|
||||
|
@ -133,15 +130,16 @@ class Apriori implements Associator
|
|||
*
|
||||
* @param mixed[] $frequent
|
||||
*/
|
||||
private function generateRules(array $frequent)
|
||||
private function generateRules(array $frequent): void
|
||||
{
|
||||
foreach ($this->antecedents($frequent) as $antecedent) {
|
||||
if ($this->confidence <= ($confidence = $this->confidence($frequent, $antecedent))) {
|
||||
$confidence = $this->confidence($frequent, $antecedent);
|
||||
if ($this->confidence <= $confidence) {
|
||||
$consequent = array_values(array_diff($frequent, $antecedent));
|
||||
$this->rules[] = [
|
||||
self::ARRAY_KEY_ANTECEDENT => $antecedent,
|
||||
self::ARRAY_KEY_CONSEQUENT => $consequent,
|
||||
self::ARRAY_KEY_SUPPORT => $this->support($consequent),
|
||||
self::ARRAY_KEY_SUPPORT => $this->support($frequent),
|
||||
self::ARRAY_KEY_CONFIDENCE => $confidence,
|
||||
];
|
||||
}
|
||||
|
@ -155,7 +153,7 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][]
|
||||
*/
|
||||
private function powerSet(array $sample) : array
|
||||
private function powerSet(array $sample): array
|
||||
{
|
||||
$results = [[]];
|
||||
foreach ($sample as $item) {
|
||||
|
@ -174,7 +172,7 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][]
|
||||
*/
|
||||
private function antecedents(array $sample) : array
|
||||
private function antecedents(array $sample): array
|
||||
{
|
||||
$cardinality = count($sample);
|
||||
$antecedents = $this->powerSet($sample);
|
||||
|
@ -189,7 +187,7 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][]
|
||||
*/
|
||||
private function items() : array
|
||||
private function items(): array
|
||||
{
|
||||
$items = [];
|
||||
|
||||
|
@ -213,11 +211,11 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][]
|
||||
*/
|
||||
private function frequent(array $samples) : array
|
||||
private function frequent(array $samples): array
|
||||
{
|
||||
return array_filter($samples, function ($entry) {
|
||||
return array_values(array_filter($samples, function ($entry) {
|
||||
return $this->support($entry) >= $this->support;
|
||||
});
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -227,7 +225,7 @@ class Apriori implements Associator
|
|||
*
|
||||
* @return mixed[][]
|
||||
*/
|
||||
private function candidates(array $samples) : array
|
||||
private function candidates(array $samples): array
|
||||
{
|
||||
$candidates = [];
|
||||
|
||||
|
@ -237,15 +235,16 @@ class Apriori implements Associator
|
|||
continue;
|
||||
}
|
||||
|
||||
$candidate = array_unique(array_merge($p, $q));
|
||||
$candidate = array_values(array_unique(array_merge($p, $q)));
|
||||
|
||||
if ($this->contains($candidates, $candidate)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach ((array) $this->samples as $sample) {
|
||||
foreach ($this->samples as $sample) {
|
||||
if ($this->subset($sample, $candidate)) {
|
||||
$candidates[] = $candidate;
|
||||
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
|
@ -261,10 +260,8 @@ class Apriori implements Associator
|
|||
*
|
||||
* @param mixed[] $set
|
||||
* @param mixed[] $subset
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
private function confidence(array $set, array $subset) : float
|
||||
private function confidence(array $set, array $subset): float
|
||||
{
|
||||
return $this->support($set) / $this->support($subset);
|
||||
}
|
||||
|
@ -276,10 +273,8 @@ class Apriori implements Associator
|
|||
* @see \Phpml\Association\Apriori::samples
|
||||
*
|
||||
* @param mixed[] $sample
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
private function support(array $sample) : float
|
||||
private function support(array $sample): float
|
||||
{
|
||||
return $this->frequency($sample) / count($this->samples);
|
||||
}
|
||||
|
@ -290,10 +285,8 @@ class Apriori implements Associator
|
|||
* @see \Phpml\Association\Apriori::samples
|
||||
*
|
||||
* @param mixed[] $sample
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
private function frequency(array $sample) : int
|
||||
private function frequency(array $sample): int
|
||||
{
|
||||
return count(array_filter($this->samples, function ($entry) use ($sample) {
|
||||
return $this->subset($entry, $sample);
|
||||
|
@ -307,10 +300,8 @@ class Apriori implements Associator
|
|||
*
|
||||
* @param mixed[][] $system
|
||||
* @param mixed[] $set
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
private function contains(array $system, array $set) : bool
|
||||
private function contains(array $system, array $set): bool
|
||||
{
|
||||
return (bool) array_filter($system, function ($entry) use ($set) {
|
||||
return $this->equals($entry, $set);
|
||||
|
@ -322,12 +313,10 @@ class Apriori implements Associator
|
|||
*
|
||||
* @param mixed[] $set
|
||||
* @param mixed[] $subset
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
private function subset(array $set, array $subset) : bool
|
||||
private function subset(array $set, array $subset): bool
|
||||
{
|
||||
return !array_diff($subset, array_intersect($subset, $set));
|
||||
return count(array_diff($subset, array_intersect($subset, $set))) === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -335,10 +324,8 @@ class Apriori implements Associator
|
|||
*
|
||||
* @param mixed[] $set1
|
||||
* @param mixed[] $set2
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
private function equals(array $set1, array $set2) : bool
|
||||
private function equals(array $set1, array $set2): bool
|
||||
{
|
||||
return array_diff($set1, $set2) == array_diff($set2, $set1);
|
||||
}
|
||||
|
|
|
@ -4,23 +4,40 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification;
|
||||
|
||||
use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\Trainable;
|
||||
use Phpml\Math\Statistic\Mean;
|
||||
use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
|
||||
|
||||
class DecisionTree implements Classifier
|
||||
{
|
||||
use Trainable, Predictable;
|
||||
use Trainable;
|
||||
use Predictable;
|
||||
|
||||
const CONTINUOUS = 1;
|
||||
const NOMINAL = 2;
|
||||
public const CONTINUOUS = 1;
|
||||
|
||||
public const NOMINAL = 2;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
public $actualDepth = 0;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $columnTypes;
|
||||
protected $columnTypes = [];
|
||||
|
||||
/**
|
||||
* @var DecisionTreeLeaf
|
||||
*/
|
||||
protected $tree;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
protected $maxDepth;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
|
@ -32,21 +49,6 @@ class DecisionTree implements Classifier
|
|||
*/
|
||||
private $featureCount = 0;
|
||||
|
||||
/**
|
||||
* @var DecisionTreeLeaf
|
||||
*/
|
||||
protected $tree = null;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
protected $maxDepth;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
public $actualDepth = 0;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
|
@ -55,32 +57,24 @@ class DecisionTree implements Classifier
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $selectedFeatures;
|
||||
private $selectedFeatures = [];
|
||||
|
||||
/**
|
||||
* @var array|null
|
||||
*/
|
||||
private $featureImportances;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $featureImportances = null;
|
||||
private $columnNames = [];
|
||||
|
||||
/**
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $columnNames = null;
|
||||
|
||||
/**
|
||||
* @param int $maxDepth
|
||||
*/
|
||||
public function __construct(int $maxDepth = 10)
|
||||
{
|
||||
$this->maxDepth = $maxDepth;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
public function train(array $samples, array $targets)
|
||||
public function train(array $samples, array $targets): void
|
||||
{
|
||||
$this->samples = array_merge($this->samples, $samples);
|
||||
$this->targets = array_merge($this->targets, $targets);
|
||||
|
@ -96,23 +90,19 @@ class DecisionTree implements Classifier
|
|||
|
||||
// If column names are given or computed before, then there is no
|
||||
// need to init it and accidentally remove the previous given names
|
||||
if ($this->columnNames === null) {
|
||||
if ($this->columnNames === []) {
|
||||
$this->columnNames = range(0, $this->featureCount - 1);
|
||||
} elseif (count($this->columnNames) > $this->featureCount) {
|
||||
$this->columnNames = array_slice($this->columnNames, 0, $this->featureCount);
|
||||
} elseif (count($this->columnNames) < $this->featureCount) {
|
||||
$this->columnNames = array_merge($this->columnNames,
|
||||
$this->columnNames = array_merge(
|
||||
$this->columnNames,
|
||||
range(count($this->columnNames), $this->featureCount - 1)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function getColumnTypes(array $samples) : array
|
||||
public static function getColumnTypes(array $samples): array
|
||||
{
|
||||
$types = [];
|
||||
$featureCount = count($samples[0]);
|
||||
|
@ -126,12 +116,120 @@ class DecisionTree implements Classifier
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $records
|
||||
* @param int $depth
|
||||
*
|
||||
* @return DecisionTreeLeaf
|
||||
* @param mixed $baseValue
|
||||
*/
|
||||
protected function getSplitLeaf(array $records, int $depth = 0) : DecisionTreeLeaf
|
||||
public function getGiniIndex($baseValue, array $colValues, array $targets): float
|
||||
{
|
||||
$countMatrix = [];
|
||||
foreach ($this->labels as $label) {
|
||||
$countMatrix[$label] = [0, 0];
|
||||
}
|
||||
|
||||
foreach ($colValues as $index => $value) {
|
||||
$label = $targets[$index];
|
||||
$rowIndex = $value === $baseValue ? 0 : 1;
|
||||
++$countMatrix[$label][$rowIndex];
|
||||
}
|
||||
|
||||
$giniParts = [0, 0];
|
||||
for ($i = 0; $i <= 1; ++$i) {
|
||||
$part = 0;
|
||||
$sum = array_sum(array_column($countMatrix, $i));
|
||||
if ($sum > 0) {
|
||||
foreach ($this->labels as $label) {
|
||||
$part += ($countMatrix[$label][$i] / (float) $sum) ** 2;
|
||||
}
|
||||
}
|
||||
|
||||
$giniParts[$i] = (1 - $part) * $sum;
|
||||
}
|
||||
|
||||
return array_sum($giniParts) / count($colValues);
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is used to set number of columns to be used
|
||||
* when deciding a split at an internal node of the tree. <br>
|
||||
* If the value is given 0, then all features are used (default behaviour),
|
||||
* otherwise the given value will be used as a maximum for number of columns
|
||||
* randomly selected for each split operation.
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function setNumFeatures(int $numFeatures)
|
||||
{
|
||||
if ($numFeatures < 0) {
|
||||
throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
|
||||
}
|
||||
|
||||
$this->numUsableFeatures = $numFeatures;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* A string array to represent columns. Useful when HTML output or
|
||||
* column importances are desired to be inspected.
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function setColumnNames(array $names)
|
||||
{
|
||||
if ($this->featureCount !== 0 && count($names) !== $this->featureCount) {
|
||||
throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount));
|
||||
}
|
||||
|
||||
$this->columnNames = $names;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
public function getHtml(): string
|
||||
{
|
||||
return $this->tree->getHTML($this->columnNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* This will return an array including an importance value for
|
||||
* each column in the given dataset. The importance values are
|
||||
* normalized and their total makes 1.<br/>
|
||||
*/
|
||||
public function getFeatureImportances(): array
|
||||
{
|
||||
if ($this->featureImportances !== null) {
|
||||
return $this->featureImportances;
|
||||
}
|
||||
|
||||
$sampleCount = count($this->samples);
|
||||
$this->featureImportances = [];
|
||||
foreach ($this->columnNames as $column => $columnName) {
|
||||
$nodes = $this->getSplitNodesByColumn($column, $this->tree);
|
||||
|
||||
$importance = 0;
|
||||
foreach ($nodes as $node) {
|
||||
$importance += $node->getNodeImpurityDecrease($sampleCount);
|
||||
}
|
||||
|
||||
$this->featureImportances[$columnName] = $importance;
|
||||
}
|
||||
|
||||
// Normalize & sort the importances
|
||||
$total = array_sum($this->featureImportances);
|
||||
if ($total > 0) {
|
||||
array_walk($this->featureImportances, function (&$importance) use ($total): void {
|
||||
$importance /= $total;
|
||||
});
|
||||
arsort($this->featureImportances);
|
||||
}
|
||||
|
||||
return $this->featureImportances;
|
||||
}
|
||||
|
||||
protected function getSplitLeaf(array $records, int $depth = 0): DecisionTreeLeaf
|
||||
{
|
||||
$split = $this->getBestSplit($records);
|
||||
$split->level = $depth;
|
||||
|
@ -143,7 +241,7 @@ class DecisionTree implements Classifier
|
|||
// otherwise group the records so that we can classify the leaf
|
||||
// in case maximum depth is reached
|
||||
$leftRecords = [];
|
||||
$rightRecords= [];
|
||||
$rightRecords = [];
|
||||
$remainingTargets = [];
|
||||
$prevRecord = null;
|
||||
$allSame = true;
|
||||
|
@ -151,9 +249,10 @@ class DecisionTree implements Classifier
|
|||
foreach ($records as $recordNo) {
|
||||
// Check if the previous record is the same with the current one
|
||||
$record = $this->samples[$recordNo];
|
||||
if ($prevRecord && $prevRecord != $record) {
|
||||
if ($prevRecord !== null && $prevRecord != $record) {
|
||||
$allSame = false;
|
||||
}
|
||||
|
||||
$prevRecord = $record;
|
||||
|
||||
// According to the split criteron, this record will
|
||||
|
@ -161,7 +260,7 @@ class DecisionTree implements Classifier
|
|||
if ($split->evaluate($record)) {
|
||||
$leftRecords[] = $recordNo;
|
||||
} else {
|
||||
$rightRecords[]= $recordNo;
|
||||
$rightRecords[] = $recordNo;
|
||||
}
|
||||
|
||||
// Group remaining targets
|
||||
|
@ -174,31 +273,29 @@ class DecisionTree implements Classifier
|
|||
}
|
||||
|
||||
if ($allSame || $depth >= $this->maxDepth || count($remainingTargets) === 1) {
|
||||
$split->isTerminal = 1;
|
||||
$split->isTerminal = true;
|
||||
arsort($remainingTargets);
|
||||
$split->classValue = key($remainingTargets);
|
||||
$split->classValue = (string) key($remainingTargets);
|
||||
} else {
|
||||
if ($leftRecords) {
|
||||
if (isset($leftRecords[0])) {
|
||||
$split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
|
||||
}
|
||||
if ($rightRecords) {
|
||||
$split->rightLeaf= $this->getSplitLeaf($rightRecords, $depth + 1);
|
||||
|
||||
if (isset($rightRecords[0])) {
|
||||
$split->rightLeaf = $this->getSplitLeaf($rightRecords, $depth + 1);
|
||||
}
|
||||
}
|
||||
|
||||
return $split;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $records
|
||||
*
|
||||
* @return DecisionTreeLeaf
|
||||
*/
|
||||
protected function getBestSplit(array $records) : DecisionTreeLeaf
|
||||
protected function getBestSplit(array $records): DecisionTreeLeaf
|
||||
{
|
||||
$targets = array_intersect_key($this->targets, array_flip($records));
|
||||
$samples = array_intersect_key($this->samples, array_flip($records));
|
||||
$samples = array_combine($records, $this->preprocess($samples));
|
||||
$samples = (array) array_combine(
|
||||
$records,
|
||||
$this->preprocess(array_intersect_key($this->samples, array_flip($records)))
|
||||
);
|
||||
$bestGiniVal = 1;
|
||||
$bestSplit = null;
|
||||
$features = $this->getSelectedFeatures();
|
||||
|
@ -207,26 +304,31 @@ class DecisionTree implements Classifier
|
|||
foreach ($samples as $index => $row) {
|
||||
$colValues[$index] = $row[$i];
|
||||
}
|
||||
|
||||
$counts = array_count_values($colValues);
|
||||
arsort($counts);
|
||||
$baseValue = key($counts);
|
||||
if ($baseValue === null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$gini = $this->getGiniIndex($baseValue, $colValues, $targets);
|
||||
if ($bestSplit === null || $bestGiniVal > $gini) {
|
||||
$split = new DecisionTreeLeaf();
|
||||
$split->value = $baseValue;
|
||||
$split->giniIndex = $gini;
|
||||
$split->columnIndex = $i;
|
||||
$split->isContinuous = $this->columnTypes[$i] == self::CONTINUOUS;
|
||||
$split->isContinuous = $this->columnTypes[$i] === self::CONTINUOUS;
|
||||
$split->records = $records;
|
||||
|
||||
// If a numeric column is to be selected, then
|
||||
// the original numeric value and the selected operator
|
||||
// will also be saved into the leaf for future access
|
||||
if ($this->columnTypes[$i] == self::CONTINUOUS) {
|
||||
if ($this->columnTypes[$i] === self::CONTINUOUS) {
|
||||
$matches = [];
|
||||
preg_match("/^([<>=]{1,2})\s*(.*)/", strval($split->value), $matches);
|
||||
preg_match("/^([<>=]{1,2})\s*(.*)/", (string) $split->value, $matches);
|
||||
$split->operator = $matches[1];
|
||||
$split->numericValue = floatval($matches[2]);
|
||||
$split->numericValue = (float) $matches[2];
|
||||
}
|
||||
|
||||
$bestSplit = $split;
|
||||
|
@ -249,17 +351,15 @@ class DecisionTree implements Classifier
|
|||
*
|
||||
* If any of above methods were not called beforehand, then all features
|
||||
* are returned by default.
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getSelectedFeatures() : array
|
||||
protected function getSelectedFeatures(): array
|
||||
{
|
||||
$allFeatures = range(0, $this->featureCount - 1);
|
||||
if ($this->numUsableFeatures === 0 && !$this->selectedFeatures) {
|
||||
if ($this->numUsableFeatures === 0 && count($this->selectedFeatures) === 0) {
|
||||
return $allFeatures;
|
||||
}
|
||||
|
||||
if ($this->selectedFeatures) {
|
||||
if (count($this->selectedFeatures) > 0) {
|
||||
return $this->selectedFeatures;
|
||||
}
|
||||
|
||||
|
@ -267,55 +367,15 @@ class DecisionTree implements Classifier
|
|||
if ($numFeatures > $this->featureCount) {
|
||||
$numFeatures = $this->featureCount;
|
||||
}
|
||||
|
||||
shuffle($allFeatures);
|
||||
$selectedFeatures = array_slice($allFeatures, 0, $numFeatures, false);
|
||||
$selectedFeatures = array_slice($allFeatures, 0, $numFeatures);
|
||||
sort($selectedFeatures);
|
||||
|
||||
return $selectedFeatures;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param mixed $baseValue
|
||||
* @param array $colValues
|
||||
* @param array $targets
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
public function getGiniIndex($baseValue, array $colValues, array $targets) : float
|
||||
{
|
||||
$countMatrix = [];
|
||||
foreach ($this->labels as $label) {
|
||||
$countMatrix[$label] = [0, 0];
|
||||
}
|
||||
|
||||
foreach ($colValues as $index => $value) {
|
||||
$label = $targets[$index];
|
||||
$rowIndex = $value === $baseValue ? 0 : 1;
|
||||
++$countMatrix[$label][$rowIndex];
|
||||
}
|
||||
|
||||
$giniParts = [0, 0];
|
||||
for ($i = 0; $i <= 1; ++$i) {
|
||||
$part = 0;
|
||||
$sum = array_sum(array_column($countMatrix, $i));
|
||||
if ($sum > 0) {
|
||||
foreach ($this->labels as $label) {
|
||||
$part += pow($countMatrix[$label][$i] / floatval($sum), 2);
|
||||
}
|
||||
}
|
||||
|
||||
$giniParts[$i] = (1 - $part) * $sum;
|
||||
}
|
||||
|
||||
return array_sum($giniParts) / count($colValues);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function preprocess(array $samples) : array
|
||||
protected function preprocess(array $samples): array
|
||||
{
|
||||
// Detect and convert continuous data column values into
|
||||
// discrete values by using the median as a threshold value
|
||||
|
@ -326,25 +386,22 @@ class DecisionTree implements Classifier
|
|||
$median = Mean::median($values);
|
||||
foreach ($values as &$value) {
|
||||
if ($value <= $median) {
|
||||
$value = "<= $median";
|
||||
$value = "<= ${median}";
|
||||
} else {
|
||||
$value = "> $median";
|
||||
$value = "> ${median}";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$columns[] = $values;
|
||||
}
|
||||
|
||||
// Below method is a strange yet very simple & efficient method
|
||||
// to get the transpose of a 2D array
|
||||
return array_map(null, ...$columns);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $columnValues
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
protected static function isCategoricalColumn(array $columnValues) : bool
|
||||
protected static function isCategoricalColumn(array $columnValues): bool
|
||||
{
|
||||
$count = count($columnValues);
|
||||
|
||||
|
@ -355,7 +412,7 @@ class DecisionTree implements Classifier
|
|||
// all values in that column (Lower than or equal to %20 of all values)
|
||||
$numericValues = array_filter($columnValues, 'is_numeric');
|
||||
$floatValues = array_filter($columnValues, 'is_float');
|
||||
if ($floatValues) {
|
||||
if (count($floatValues) > 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -368,119 +425,21 @@ class DecisionTree implements Classifier
|
|||
return count($distinctValues) <= $count / 5;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is used to set number of columns to be used
|
||||
* when deciding a split at an internal node of the tree. <br>
|
||||
* If the value is given 0, then all features are used (default behaviour),
|
||||
* otherwise the given value will be used as a maximum for number of columns
|
||||
* randomly selected for each split operation.
|
||||
*
|
||||
* @param int $numFeatures
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function setNumFeatures(int $numFeatures)
|
||||
{
|
||||
if ($numFeatures < 0) {
|
||||
throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
|
||||
}
|
||||
|
||||
$this->numUsableFeatures = $numFeatures;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used to set predefined features to consider while deciding which column to use for a split
|
||||
*
|
||||
* @param array $selectedFeatures
|
||||
*/
|
||||
protected function setSelectedFeatures(array $selectedFeatures)
|
||||
protected function setSelectedFeatures(array $selectedFeatures): void
|
||||
{
|
||||
$this->selectedFeatures = $selectedFeatures;
|
||||
}
|
||||
|
||||
/**
|
||||
* A string array to represent columns. Useful when HTML output or
|
||||
* column importances are desired to be inspected.
|
||||
*
|
||||
* @param array $names
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function setColumnNames(array $names)
|
||||
{
|
||||
if ($this->featureCount !== 0 && count($names) !== $this->featureCount) {
|
||||
throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount));
|
||||
}
|
||||
|
||||
$this->columnNames = $names;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string
|
||||
*/
|
||||
public function getHtml()
|
||||
{
|
||||
return $this->tree->getHTML($this->columnNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* This will return an array including an importance value for
|
||||
* each column in the given dataset. The importance values are
|
||||
* normalized and their total makes 1.<br/>
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function getFeatureImportances()
|
||||
{
|
||||
if ($this->featureImportances !== null) {
|
||||
return $this->featureImportances;
|
||||
}
|
||||
|
||||
$sampleCount = count($this->samples);
|
||||
$this->featureImportances = [];
|
||||
foreach ($this->columnNames as $column => $columnName) {
|
||||
$nodes = $this->getSplitNodesByColumn($column, $this->tree);
|
||||
|
||||
$importance = 0;
|
||||
foreach ($nodes as $node) {
|
||||
$importance += $node->getNodeImpurityDecrease($sampleCount);
|
||||
}
|
||||
|
||||
$this->featureImportances[$columnName] = $importance;
|
||||
}
|
||||
|
||||
// Normalize & sort the importances
|
||||
$total = array_sum($this->featureImportances);
|
||||
if ($total > 0) {
|
||||
foreach ($this->featureImportances as &$importance) {
|
||||
$importance /= $total;
|
||||
}
|
||||
arsort($this->featureImportances);
|
||||
}
|
||||
|
||||
return $this->featureImportances;
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects and returns an array of internal nodes that use the given
|
||||
* column as a split criterion
|
||||
*
|
||||
* @param int $column
|
||||
* @param DecisionTreeLeaf $node
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node) : array
|
||||
protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node): array
|
||||
{
|
||||
if (!$node || $node->isTerminal) {
|
||||
if ($node->isTerminal) {
|
||||
return [];
|
||||
}
|
||||
|
||||
|
@ -491,22 +450,18 @@ class DecisionTree implements Classifier
|
|||
|
||||
$lNodes = [];
|
||||
$rNodes = [];
|
||||
if ($node->leftLeaf) {
|
||||
if ($node->leftLeaf !== null) {
|
||||
$lNodes = $this->getSplitNodesByColumn($column, $node->leftLeaf);
|
||||
}
|
||||
|
||||
if ($node->rightLeaf) {
|
||||
if ($node->rightLeaf !== null) {
|
||||
$rNodes = $this->getSplitNodesByColumn($column, $node->rightLeaf);
|
||||
}
|
||||
|
||||
$nodes = array_merge($nodes, $lNodes, $rNodes);
|
||||
|
||||
return $nodes;
|
||||
return array_merge($nodes, $lNodes, $rNodes);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
protected function predictSample(array $sample)
|
||||
|
@ -514,7 +469,7 @@ class DecisionTree implements Classifier
|
|||
$node = $this->tree;
|
||||
do {
|
||||
if ($node->isTerminal) {
|
||||
break;
|
||||
return $node->classValue;
|
||||
}
|
||||
|
||||
if ($node->evaluate($sample)) {
|
||||
|
@ -524,6 +479,6 @@ class DecisionTree implements Classifier
|
|||
}
|
||||
} while ($node);
|
||||
|
||||
return $node ? $node->classValue : $this->labels[0];
|
||||
return $this->labels[0];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,10 +4,12 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\DecisionTree;
|
||||
|
||||
use Phpml\Math\Comparison;
|
||||
|
||||
class DecisionTreeLeaf
|
||||
{
|
||||
/**
|
||||
* @var string
|
||||
* @var string|int
|
||||
*/
|
||||
public $value;
|
||||
|
||||
|
@ -27,14 +29,14 @@ class DecisionTreeLeaf
|
|||
public $columnIndex;
|
||||
|
||||
/**
|
||||
* @var DecisionTreeLeaf
|
||||
* @var DecisionTreeLeaf|null
|
||||
*/
|
||||
public $leftLeaf = null;
|
||||
public $leftLeaf;
|
||||
|
||||
/**
|
||||
* @var DecisionTreeLeaf
|
||||
* @var DecisionTreeLeaf|null
|
||||
*/
|
||||
public $rightLeaf= null;
|
||||
public $rightLeaf;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
|
@ -70,48 +72,44 @@ class DecisionTreeLeaf
|
|||
public $level = 0;
|
||||
|
||||
/**
|
||||
* @param array $record
|
||||
* @return bool
|
||||
* HTML representation of the tree without column names
|
||||
*/
|
||||
public function evaluate($record)
|
||||
public function __toString(): string
|
||||
{
|
||||
return $this->getHTML();
|
||||
}
|
||||
|
||||
public function evaluate(array $record): bool
|
||||
{
|
||||
$recordField = $record[$this->columnIndex];
|
||||
|
||||
if ($this->isContinuous) {
|
||||
$op = $this->operator;
|
||||
$value= $this->numericValue;
|
||||
$recordField = strval($recordField);
|
||||
eval("\$result = $recordField $op $value;");
|
||||
return $result;
|
||||
return Comparison::compare((string) $recordField, $this->numericValue, $this->operator);
|
||||
}
|
||||
|
||||
|
||||
return $recordField == $this->value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns Mean Decrease Impurity (MDI) in the node.
|
||||
* For terminal nodes, this value is equal to 0
|
||||
*
|
||||
* @param int $parentRecordCount
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
public function getNodeImpurityDecrease(int $parentRecordCount)
|
||||
public function getNodeImpurityDecrease(int $parentRecordCount): float
|
||||
{
|
||||
if ($this->isTerminal) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
$nodeSampleCount = (float)count($this->records);
|
||||
$nodeSampleCount = (float) count($this->records);
|
||||
$iT = $this->giniIndex;
|
||||
|
||||
if ($this->leftLeaf) {
|
||||
$pL = count($this->leftLeaf->records)/$nodeSampleCount;
|
||||
if ($this->leftLeaf !== null) {
|
||||
$pL = count($this->leftLeaf->records) / $nodeSampleCount;
|
||||
$iT -= $pL * $this->leftLeaf->giniIndex;
|
||||
}
|
||||
|
||||
if ($this->rightLeaf) {
|
||||
$pR = count($this->rightLeaf->records)/$nodeSampleCount;
|
||||
if ($this->rightLeaf !== null) {
|
||||
$pR = count($this->rightLeaf->records) / $nodeSampleCount;
|
||||
$iT -= $pR * $this->rightLeaf->giniIndex;
|
||||
}
|
||||
|
||||
|
@ -120,14 +118,11 @@ class DecisionTreeLeaf
|
|||
|
||||
/**
|
||||
* Returns HTML representation of the node including children nodes
|
||||
*
|
||||
* @param $columnNames
|
||||
* @return string
|
||||
*/
|
||||
public function getHTML($columnNames = null)
|
||||
public function getHTML(?array $columnNames = null): string
|
||||
{
|
||||
if ($this->isTerminal) {
|
||||
$value = "<b>$this->classValue</b>";
|
||||
$value = "<b>${this}->classValue</b>";
|
||||
} else {
|
||||
$value = $this->value;
|
||||
if ($columnNames !== null) {
|
||||
|
@ -135,39 +130,36 @@ class DecisionTreeLeaf
|
|||
} else {
|
||||
$col = "col_$this->columnIndex";
|
||||
}
|
||||
if (!preg_match("/^[<>=]{1,2}/", $value)) {
|
||||
$value = "=$value";
|
||||
|
||||
if ((bool) preg_match('/^[<>=]{1,2}/', (string) $value) === false) {
|
||||
$value = "=${value}";
|
||||
}
|
||||
$value = "<b>$col $value</b><br>Gini: ". number_format($this->giniIndex, 2);
|
||||
|
||||
$value = "<b>${col} ${value}</b><br>Gini: ".number_format($this->giniIndex, 2);
|
||||
}
|
||||
$str = "<table ><tr><td colspan=3 align=center style='border:1px solid;'>
|
||||
$value</td></tr>";
|
||||
if ($this->leftLeaf || $this->rightLeaf) {
|
||||
$str .='<tr>';
|
||||
if ($this->leftLeaf) {
|
||||
$str .="<td valign=top><b>| Yes</b><br>" . $this->leftLeaf->getHTML($columnNames) . "</td>";
|
||||
|
||||
$str = "<table ><tr><td colspan=3 align=center style='border:1px solid;'>${value}</td></tr>";
|
||||
|
||||
if ($this->leftLeaf !== null || $this->rightLeaf !== null) {
|
||||
$str .= '<tr>';
|
||||
if ($this->leftLeaf !== null) {
|
||||
$str .= '<td valign=top><b>| Yes</b><br>'.$this->leftLeaf->getHTML($columnNames).'</td>';
|
||||
} else {
|
||||
$str .='<td></td>';
|
||||
$str .= '<td></td>';
|
||||
}
|
||||
$str .='<td> </td>';
|
||||
if ($this->rightLeaf) {
|
||||
$str .="<td valign=top align=right><b>No |</b><br>" . $this->rightLeaf->getHTML($columnNames) . "</td>";
|
||||
|
||||
$str .= '<td> </td>';
|
||||
if ($this->rightLeaf !== null) {
|
||||
$str .= '<td valign=top align=right><b>No |</b><br>'.$this->rightLeaf->getHTML($columnNames).'</td>';
|
||||
} else {
|
||||
$str .='<td></td>';
|
||||
$str .= '<td></td>';
|
||||
}
|
||||
|
||||
$str .= '</tr>';
|
||||
}
|
||||
|
||||
$str .= '</table>';
|
||||
|
||||
return $str;
|
||||
}
|
||||
|
||||
/**
|
||||
* HTML representation of the tree without column names
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public function __toString()
|
||||
{
|
||||
return $this->getHTML();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,20 +4,24 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\Ensemble;
|
||||
|
||||
use Phpml\Classification\Classifier;
|
||||
use Phpml\Classification\Linear\DecisionStump;
|
||||
use Phpml\Classification\WeightedClassifier;
|
||||
use Phpml\Math\Statistic\Mean;
|
||||
use Phpml\Math\Statistic\StandardDeviation;
|
||||
use Phpml\Classification\Classifier;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\Trainable;
|
||||
use Phpml\Math\Statistic\Mean;
|
||||
use Phpml\Math\Statistic\StandardDeviation;
|
||||
use ReflectionClass;
|
||||
|
||||
class AdaBoost implements Classifier
|
||||
{
|
||||
use Predictable, Trainable;
|
||||
use Predictable;
|
||||
use Trainable;
|
||||
|
||||
/**
|
||||
* Actual labels given in the targets array
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $labels = [];
|
||||
|
@ -74,8 +78,6 @@ class AdaBoost implements Classifier
|
|||
* ADAptive BOOSTing (AdaBoost) is an ensemble algorithm to
|
||||
* improve classification performance of 'weak' classifiers such as
|
||||
* DecisionStump (default base classifier of AdaBoost).
|
||||
*
|
||||
* @param int $maxIterations
|
||||
*/
|
||||
public function __construct(int $maxIterations = 50)
|
||||
{
|
||||
|
@ -84,32 +86,29 @@ class AdaBoost implements Classifier
|
|||
|
||||
/**
 * Chooses the classifier class that will be boosted (DecisionStump by default)
 * and stores the options kept alongside it.
 *
 * @param string $baseClassifier    class name of the base classifier
 * @param array  $classifierOptions options stored for that classifier
 */
public function setBaseClassifier(string $baseClassifier = DecisionStump::class, array $classifierOptions = []): void
{
    [$this->baseClassifier, $this->classifierOptions] = [$baseClassifier, $classifierOptions];
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function train(array $samples, array $targets)
|
||||
public function train(array $samples, array $targets): void
|
||||
{
|
||||
// Initialize usual variables
|
||||
$this->labels = array_keys(array_count_values($targets));
|
||||
if (count($this->labels) != 2) {
|
||||
throw new \Exception("AdaBoost is a binary classifier and can classify between two classes only");
|
||||
if (count($this->labels) !== 2) {
|
||||
throw new InvalidArgumentException('AdaBoost is a binary classifier and can classify between two classes only');
|
||||
}
|
||||
|
||||
// Set all target values to either -1 or 1
|
||||
$this->labels = [1 => $this->labels[0], -1 => $this->labels[1]];
|
||||
$this->labels = [
|
||||
1 => $this->labels[0],
|
||||
-1 => $this->labels[1],
|
||||
];
|
||||
foreach ($targets as $target) {
|
||||
$this->targets[] = $target == $this->labels[1] ? 1 : -1;
|
||||
}
|
||||
|
@ -140,25 +139,34 @@ class AdaBoost implements Classifier
|
|||
}
|
||||
|
||||
/**
 * Predicts the label of a single sample.
 *
 * Every stored classifier's prediction is multiplied by its alpha and the
 * products are summed. A strictly positive sum selects $this->labels[1],
 * anything else selects $this->labels[-1].
 *
 * @return mixed
 */
public function predictSample(array $sample)
{
    $weightedVote = 0;
    foreach ($this->alpha as $position => $weight) {
        $weightedVote += $this->classifiers[$position]->predict($sample) * $weight;
    }

    $labelKey = $weightedVote > 0 ? 1 : -1;

    return $this->labels[$labelKey];
}
|
||||
|
||||
/**
|
||||
* Returns the classifier with the lowest error rate with the
|
||||
* consideration of current sample weights
|
||||
*/
|
||||
protected function getBestClassifier(): Classifier
|
||||
{
|
||||
$ref = new ReflectionClass($this->baseClassifier);
|
||||
/** @var Classifier $classifier */
|
||||
$classifier = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions);
|
||||
|
||||
if ($classifier instanceof WeightedClassifier) {
|
||||
$classifier->setSampleWeights($this->weights);
|
||||
$classifier->train($this->samples, $this->targets);
|
||||
} else {
|
||||
list($samples, $targets) = $this->resample();
|
||||
[$samples, $targets] = $this->resample();
|
||||
$classifier->train($samples, $targets);
|
||||
}
|
||||
|
||||
|
@ -168,25 +176,24 @@ class AdaBoost implements Classifier
|
|||
/**
|
||||
* Resamples the dataset in accordance with the weights and
|
||||
* returns the new dataset
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function resample()
|
||||
protected function resample(): array
|
||||
{
|
||||
$weights = $this->weights;
|
||||
$std = StandardDeviation::population($weights);
|
||||
$mean= Mean::arithmetic($weights);
|
||||
$mean = Mean::arithmetic($weights);
|
||||
$min = min($weights);
|
||||
$minZ= (int)round(($min - $mean) / $std);
|
||||
$minZ = (int) round(($min - $mean) / $std);
|
||||
|
||||
$samples = [];
|
||||
$targets = [];
|
||||
foreach ($weights as $index => $weight) {
|
||||
$z = (int)round(($weight - $mean) / $std) - $minZ + 1;
|
||||
$z = (int) round(($weight - $mean) / $std) - $minZ + 1;
|
||||
for ($i = 0; $i < $z; ++$i) {
|
||||
if (rand(0, 1) == 0) {
|
||||
if (random_int(0, 1) == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$samples[] = $this->samples[$index];
|
||||
$targets[] = $this->targets[$index];
|
||||
}
|
||||
|
@ -197,12 +204,8 @@ class AdaBoost implements Classifier
|
|||
|
||||
/**
|
||||
* Evaluates the classifier and returns the classification error rate
|
||||
*
|
||||
* @param Classifier $classifier
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function evaluateClassifier(Classifier $classifier)
|
||||
protected function evaluateClassifier(Classifier $classifier): float
|
||||
{
|
||||
$total = (float) array_sum($this->weights);
|
||||
$wrong = 0;
|
||||
|
@ -218,25 +221,20 @@ class AdaBoost implements Classifier
|
|||
|
||||
/**
 * Computes the alpha of a classifier from its error rate:
 * 0.5 * ln((1 - error) / error).
 *
 * @param float $errorRate error rate of the classifier
 *
 * @return float the resulting alpha
 */
protected function calculateAlpha(float $errorRate): float
{
    // An error rate of exactly zero would divide by zero; substitute a tiny value.
    $rate = $errorRate === 0.0 ? 1e-10 : $errorRate;

    $ratio = (1 - $rate) / $rate;

    return 0.5 * log($ratio);
}
|
||||
|
||||
/**
|
||||
* Updates the sample weights
|
||||
*
|
||||
* @param Classifier $classifier
|
||||
* @param float $alpha
|
||||
*/
|
||||
protected function updateWeights(Classifier $classifier, float $alpha)
|
||||
protected function updateWeights(Classifier $classifier, float $alpha): void
|
||||
{
|
||||
$sumOfWeights = array_sum($this->weights);
|
||||
$weightsT1 = [];
|
||||
|
@ -251,19 +249,4 @@ class AdaBoost implements Classifier
|
|||
|
||||
$this->weights = $weightsT1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
* @return mixed
|
||||
*/
|
||||
public function predictSample(array $sample)
|
||||
{
|
||||
$sum = 0;
|
||||
foreach ($this->alpha as $index => $alpha) {
|
||||
$h = $this->classifiers[$index]->predict($sample);
|
||||
$sum += $h * $alpha;
|
||||
}
|
||||
|
||||
return $this->labels[ $sum > 0 ? 1 : -1];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,25 +4,23 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\Ensemble;
|
||||
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\Trainable;
|
||||
use Phpml\Classification\Classifier;
|
||||
use Phpml\Classification\DecisionTree;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\Trainable;
|
||||
use ReflectionClass;
|
||||
|
||||
class Bagging implements Classifier
|
||||
{
|
||||
use Trainable, Predictable;
|
||||
use Trainable;
|
||||
use Predictable;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
protected $numSamples;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $targets = [];
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
|
@ -34,7 +32,7 @@ class Bagging implements Classifier
|
|||
protected $numClassifier;
|
||||
|
||||
/**
|
||||
* @var Classifier
|
||||
* @var string
|
||||
*/
|
||||
protected $classifier = DecisionTree::class;
|
||||
|
||||
|
@ -46,24 +44,17 @@ class Bagging implements Classifier
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $classifiers;
|
||||
protected $classifiers = [];
|
||||
|
||||
/**
|
||||
* @var float
|
||||
*/
|
||||
protected $subsetRatio = 0.7;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $samples = [];
|
||||
|
||||
/**
|
||||
* Creates an ensemble classifier with given number of base classifiers
|
||||
* Default number of base classifiers is 50.
|
||||
* The higher the number of base classifiers, the better the performance, but at the cost of processing time
|
||||
*
|
||||
* @param int $numClassifier
|
||||
*/
|
||||
public function __construct(int $numClassifier = 50)
|
||||
{
|
||||
|
@ -75,19 +66,18 @@ class Bagging implements Classifier
|
|||
* e.g., random samples drawn from the original dataset with replacement (allow repeats),
|
||||
* to train each base classifier.
|
||||
*
|
||||
* @param float $ratio
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function setSubsetRatio(float $ratio)
{
    // Anything below 0.1 or above 1.0 is rejected.
    $outOfRange = $ratio < 0.1 || $ratio > 1.0;
    if ($outOfRange) {
        throw new InvalidArgumentException('Subset ratio should be between 0.1 and 1.0');
    }

    $this->subsetRatio = $ratio;

    // Returns the same instance so calls can be chained.
    return $this;
}
|
||||
|
||||
|
@ -99,9 +89,6 @@ class Bagging implements Classifier
|
|||
* given in the order they are in the constructor of the classifier and parameter
|
||||
* names are neglected.
|
||||
*
|
||||
* @param string $classifier
|
||||
* @param array $classifierOptions
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setClassifer(string $classifier, array $classifierOptions = [])
|
||||
|
@ -112,11 +99,7 @@ class Bagging implements Classifier
|
|||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
public function train(array $samples, array $targets)
|
||||
public function train(array $samples, array $targets): void
|
||||
{
|
||||
$this->samples = array_merge($this->samples, $samples);
|
||||
$this->targets = array_merge($this->targets, $targets);
|
||||
|
@ -127,24 +110,20 @@ class Bagging implements Classifier
|
|||
$this->classifiers = $this->initClassifiers();
|
||||
$index = 0;
|
||||
foreach ($this->classifiers as $classifier) {
|
||||
list($samples, $targets) = $this->getRandomSubset($index);
|
||||
[$samples, $targets] = $this->getRandomSubset($index);
|
||||
$classifier->train($samples, $targets);
|
||||
++$index;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int $index
|
||||
* @return array
|
||||
*/
|
||||
protected function getRandomSubset(int $index)
|
||||
protected function getRandomSubset(int $index): array
|
||||
{
|
||||
$samples = [];
|
||||
$targets = [];
|
||||
srand($index);
|
||||
$bootstrapSize = $this->subsetRatio * $this->numSamples;
|
||||
for ($i = 0; $i < $bootstrapSize; ++$i) {
|
||||
$rand = rand(0, $this->numSamples - 1);
|
||||
$rand = random_int(0, $this->numSamples - 1);
|
||||
$samples[] = $this->samples[$rand];
|
||||
$targets[] = $this->targets[$rand];
|
||||
}
|
||||
|
@ -152,50 +131,40 @@ class Bagging implements Classifier
|
|||
return [$samples, $targets];
|
||||
}
|
||||
|
||||
/**
 * Builds $this->numClassifier instances of the configured classifier class.
 *
 * Each instance is created through reflection, with the stored constructor
 * options when any were given, and then passed through initSingleClassifier().
 *
 * @return Classifier[]
 */
protected function initClassifiers(): array
{
    $ensemble = [];

    for ($created = 0; $created < $this->numClassifier; ++$created) {
        $reflection = new ReflectionClass($this->classifier);

        /** @var Classifier $instance */
        if (count($this->classifierOptions) === 0) {
            $instance = $reflection->newInstance();
        } else {
            $instance = $reflection->newInstanceArgs($this->classifierOptions);
        }

        $ensemble[] = $this->initSingleClassifier($instance);
    }

    return $ensemble;
}
|
||||
|
||||
/**
 * Per-classifier initialisation step applied to each freshly created
 * base classifier. This implementation returns the instance untouched.
 */
protected function initSingleClassifier(Classifier $classifier): Classifier
{
    // Nothing to configure here.
    return $classifier;
}
|
||||
|
||||
/**
 * Predicts a sample by vote: every classifier in the ensemble predicts,
 * the predictions are counted, and the most frequent one is returned.
 *
 * @return mixed
 */
protected function predictSample(array $sample)
{
    $votes = array_map(
        static function ($member) use ($sample) {
            /** @var Classifier $member */
            return $member->predict($sample);
        },
        $this->classifiers
    );

    // Count identical predictions, then bring the highest count to the front.
    $tally = array_count_values($votes);
    arsort($tally);
    reset($tally);

    return key($tally);
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,9 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\Ensemble;
|
||||
|
||||
use Phpml\Classification\Classifier;
|
||||
use Phpml\Classification\DecisionTree;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
class RandomForest extends Bagging
|
||||
{
|
||||
|
@ -14,16 +16,14 @@ class RandomForest extends Bagging
|
|||
protected $featureSubsetRatio = 'log';
|
||||
|
||||
/**
|
||||
* @var array
|
||||
* @var array|null
|
||||
*/
|
||||
protected $columnNames = null;
|
||||
protected $columnNames;
|
||||
|
||||
/**
|
||||
* Initializes RandomForest with the given number of trees. More trees
|
||||
* may increase the prediction performance while it will also substantially
|
||||
* increase the processing time and the required memory
|
||||
*
|
||||
* @param int $numClassifier
|
||||
*/
|
||||
public function __construct(int $numClassifier = 50)
|
||||
{
|
||||
|
@ -41,40 +41,36 @@ class RandomForest extends Bagging
|
|||
* Default value for the ratio is 'log' which results in log(numFeatures, 2) + 1
|
||||
* features to be taken into consideration while selecting subspace of features
|
||||
*
|
||||
* @param mixed $ratio string or float should be given
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws \Exception
|
||||
* @param string|float $ratio
|
||||
*/
|
||||
public function setFeatureSubsetRatio($ratio): self
{
    // One branch per accepted type; any other type is rejected outright.
    if (is_float($ratio)) {
        if ($ratio < 0.1 || $ratio > 1.0) {
            throw new InvalidArgumentException('When a float is given, feature subset ratio should be between 0.1 and 1.0');
        }
    } elseif (is_string($ratio)) {
        if (!in_array($ratio, ['sqrt', 'log'], true)) {
            throw new InvalidArgumentException("When a string is given, feature subset ratio can only be 'sqrt' or 'log'");
        }
    } else {
        throw new InvalidArgumentException('Feature subset ratio must be a string or a float');
    }

    $this->featureSubsetRatio = $ratio;

    // Returns the same instance so calls can be chained.
    return $this;
}
|
||||
|
||||
/**
|
||||
* RandomForest algorithm is usable *only* with DecisionTree
|
||||
*
|
||||
* @param string $classifier
|
||||
* @param array $classifierOptions
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
public function setClassifer(string $classifier, array $classifierOptions = [])
|
||||
{
|
||||
if ($classifier != DecisionTree::class) {
|
||||
throw new \Exception("RandomForest can only use DecisionTree as base classifier");
|
||||
if ($classifier !== DecisionTree::class) {
|
||||
throw new InvalidArgumentException('RandomForest can only use DecisionTree as base classifier');
|
||||
}
|
||||
|
||||
return parent::setClassifer($classifier, $classifierOptions);
|
||||
|
@ -84,15 +80,13 @@ class RandomForest extends Bagging
|
|||
* This will return an array including an importance value for
|
||||
* each column in the given dataset. Importance values for a column
|
||||
* is the average importance of that column in all trees in the forest
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function getFeatureImportances()
|
||||
public function getFeatureImportances(): array
|
||||
{
|
||||
// Traverse each tree and sum importance of the columns
|
||||
$sum = [];
|
||||
foreach ($this->classifiers as $tree) {
|
||||
/* @var $tree DecisionTree */
|
||||
/** @var DecisionTree $tree */
|
||||
$importances = $tree->getFeatureImportances();
|
||||
|
||||
foreach ($importances as $column => $importance) {
|
||||
|
@ -106,10 +100,9 @@ class RandomForest extends Bagging
|
|||
|
||||
// Normalize & sort the importance values
|
||||
$total = array_sum($sum);
|
||||
foreach ($sum as &$importance) {
|
||||
array_walk($sum, function (&$importance) use ($total): void {
|
||||
$importance /= $total;
|
||||
}
|
||||
|
||||
});
|
||||
arsort($sum);
|
||||
|
||||
return $sum;
|
||||
|
@ -119,7 +112,6 @@ class RandomForest extends Bagging
|
|||
* A string array to represent the columns is given. They are useful
|
||||
* when trying to print some information about the trees such as feature importances
|
||||
*
|
||||
* @param array $names
|
||||
* @return $this
|
||||
*/
|
||||
public function setColumnNames(array $names)
|
||||
|
@ -134,14 +126,14 @@ class RandomForest extends Bagging
|
|||
*
|
||||
* @return DecisionTree
|
||||
*/
|
||||
protected function initSingleClassifier($classifier)
|
||||
protected function initSingleClassifier(Classifier $classifier): Classifier
|
||||
{
|
||||
if (is_float($this->featureSubsetRatio)) {
|
||||
$featureCount = (int)($this->featureSubsetRatio * $this->featureCount);
|
||||
} elseif ($this->featureCount == 'sqrt') {
|
||||
$featureCount = (int)sqrt($this->featureCount) + 1;
|
||||
$featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
|
||||
} elseif ($this->featureSubsetRatio === 'sqrt') {
|
||||
$featureCount = (int) ($this->featureCount ** .5) + 1;
|
||||
} else {
|
||||
$featureCount = (int)log($this->featureCount, 2) + 1;
|
||||
$featureCount = (int) log($this->featureCount, 2) + 1;
|
||||
}
|
||||
|
||||
if ($featureCount >= $this->featureCount) {
|
||||
|
@ -153,7 +145,7 @@ class RandomForest extends Bagging
|
|||
}
|
||||
|
||||
return $classifier
|
||||
->setColumnNames($this->columnNames)
|
||||
->setNumFeatures($featureCount);
|
||||
->setColumnNames($this->columnNames)
|
||||
->setNumFeatures($featureCount);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,8 @@ use Phpml\Math\Distance\Euclidean;
|
|||
|
||||
class KNearestNeighbors implements Classifier
|
||||
{
|
||||
use Trainable, Predictable;
|
||||
use Trainable;
|
||||
use Predictable;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
|
@ -24,12 +25,11 @@ class KNearestNeighbors implements Classifier
|
|||
private $distanceMetric;
|
||||
|
||||
/**
|
||||
* @param int $k
|
||||
* @param Distance|null $distanceMetric (if null then Euclidean distance as default)
|
||||
*/
|
||||
public function __construct(int $k = 3, Distance $distanceMetric = null)
|
||||
public function __construct(int $k = 3, ?Distance $distanceMetric = null)
|
||||
{
|
||||
if (null === $distanceMetric) {
|
||||
if ($distanceMetric === null) {
|
||||
$distanceMetric = new Euclidean();
|
||||
}
|
||||
|
||||
|
@ -40,17 +40,14 @@ class KNearestNeighbors implements Classifier
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
protected function predictSample(array $sample)
|
||||
{
|
||||
$distances = $this->kNeighborsDistances($sample);
|
||||
$predictions = (array) array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0));
|
||||
|
||||
$predictions = array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0));
|
||||
|
||||
foreach ($distances as $index => $distance) {
|
||||
foreach (array_keys($distances) as $index) {
|
||||
++$predictions[$this->targets[$index]];
|
||||
}
|
||||
|
||||
|
@ -61,13 +58,9 @@ class KNearestNeighbors implements Classifier
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return array
|
||||
*
|
||||
* @throws \Phpml\Exception\InvalidArgumentException
|
||||
*/
|
||||
private function kNeighborsDistances(array $sample)
|
||||
private function kNeighborsDistances(array $sample): array
|
||||
{
|
||||
$distances = [];
|
||||
|
||||
|
|
|
@ -4,22 +4,24 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\Linear;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
class Adaline extends Perceptron
|
||||
{
|
||||
/**
|
||||
* Batch training is the default Adaline training algorithm
|
||||
*/
|
||||
const BATCH_TRAINING = 1;
|
||||
public const BATCH_TRAINING = 1;
|
||||
|
||||
/**
|
||||
* Online training: Stochastic gradient descent learning
|
||||
*/
|
||||
const ONLINE_TRAINING = 2;
|
||||
public const ONLINE_TRAINING = 2;
|
||||
|
||||
/**
|
||||
* Training type may be either 'Batch' or 'Online' learning
|
||||
*
|
||||
* @var string
|
||||
* @var string|int
|
||||
*/
|
||||
protected $trainingType;
|
||||
|
||||
|
@ -32,18 +34,16 @@ class Adaline extends Perceptron
|
|||
* If normalizeInputs is set to true, then every input given to the algorithm will be standardized
|
||||
* by use of standard deviation and mean calculation
|
||||
*
|
||||
* @param float $learningRate
|
||||
* @param int $maxIterations
|
||||
* @param bool $normalizeInputs
|
||||
* @param int $trainingType
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(float $learningRate = 0.001, int $maxIterations = 1000,
|
||||
bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING)
|
||||
{
|
||||
if (!in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING])) {
|
||||
throw new \Exception("Adaline can only be trained with batch and online/stochastic gradient descent algorithm");
|
||||
public function __construct(
|
||||
float $learningRate = 0.001,
|
||||
int $maxIterations = 1000,
|
||||
bool $normalizeInputs = true,
|
||||
int $trainingType = self::BATCH_TRAINING
|
||||
) {
|
||||
if (!in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING], true)) {
|
||||
throw new InvalidArgumentException('Adaline can only be trained with batch and online/stochastic gradient descent algorithm');
|
||||
}
|
||||
|
||||
$this->trainingType = $trainingType;
|
||||
|
@ -54,11 +54,8 @@ class Adaline extends Perceptron
|
|||
/**
|
||||
* Adapts the weights with respect to given samples and targets
|
||||
* by use of gradient descent learning rule
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
protected function runTraining(array $samples, array $targets)
|
||||
protected function runTraining(array $samples, array $targets): void
|
||||
{
|
||||
// The cost function is the sum of squares
|
||||
$callback = function ($weights, $sample, $target) {
|
||||
|
@ -73,6 +70,6 @@ class Adaline extends Perceptron
|
|||
|
||||
$isBatch = $this->trainingType == self::BATCH_TRAINING;
|
||||
|
||||
return parent::runGradientDescent($samples, $targets, $callback, $isBatch);
|
||||
parent::runGradientDescent($samples, $targets, $callback, $isBatch);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,16 +4,19 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\Linear;
|
||||
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\OneVsRest;
|
||||
use Phpml\Classification\WeightedClassifier;
|
||||
use Phpml\Classification\DecisionTree;
|
||||
use Phpml\Classification\WeightedClassifier;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Helper\OneVsRest;
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Math\Comparison;
|
||||
|
||||
class DecisionStump extends WeightedClassifier
|
||||
{
|
||||
use Predictable, OneVsRest;
|
||||
use Predictable;
|
||||
use OneVsRest;
|
||||
|
||||
const AUTO_SELECT = -1;
|
||||
public const AUTO_SELECT = -1;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
|
@ -23,7 +26,7 @@ class DecisionStump extends WeightedClassifier
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $binaryLabels;
|
||||
protected $binaryLabels = [];
|
||||
|
||||
/**
|
||||
* Lowest error rate obtained while training/optimizing the model
|
||||
|
@ -50,7 +53,7 @@ class DecisionStump extends WeightedClassifier
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $columnTypes;
|
||||
protected $columnTypes = [];
|
||||
|
||||
/**
|
||||
* @var int
|
||||
|
@ -67,7 +70,7 @@ class DecisionStump extends WeightedClassifier
|
|||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $prob;
|
||||
protected $prob = [];
|
||||
|
||||
/**
|
||||
* A DecisionStump classifier is a one-level deep DecisionTree. It is generally
|
||||
|
@ -76,22 +79,35 @@ class DecisionStump extends WeightedClassifier
|
|||
* If columnIndex is given, then the stump tries to produce a decision node
|
||||
* on this column, otherwise in cases given the value of -1, the stump itself
|
||||
* decides which column to take for the decision (Default DecisionTree behaviour)
|
||||
*
|
||||
* @param int $columnIndex
|
||||
*/
|
||||
public function __construct(int $columnIndex = self::AUTO_SELECT)
{
    // Remember the caller's column choice; AUTO_SELECT (-1) is the default.
    $requestedColumn = $columnIndex;
    $this->givenColumnIndex = $requestedColumn;
}
|
||||
|
||||
/**
 * Renders the stump as a readable rule of the form
 * "IF <column> <operator> <value> THEN <first binary label> ELSE <second binary label>".
 */
public function __toString(): string
{
    // Each property access is wrapped in {$...}. The form "${this}->column"
    // interpolates $this itself (leaving "->column" as literal text), which
    // casts the object to string and re-enters this method without end.
    return "IF {$this->column} {$this->operator} {$this->value} ".
        'THEN '.$this->binaryLabels[0].' '.
        'ELSE '.$this->binaryLabels[1];
}
|
||||
|
||||
/**
 * Sets how many split points are probed for a numerical column.
 *
 * While finding the best split point for a numerical valued column,
 * DecisionStump looks at equally distanced values between the minimum and
 * maximum values in the column; $count determines how many of them are tried.
 * More split points give better performance but a longer processing time
 * (the original note gives the default as 10.0).
 */
public function setNumericalSplitCount(float $count): void
{
    $splitPoints = $count;
    $this->numSplitCount = $splitPoints;
}
|
||||
|
||||
/**
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
protected function trainBinary(array $samples, array $targets, array $labels): void
|
||||
{
|
||||
$this->binaryLabels = $labels;
|
||||
$this->featureCount = count($samples[0]);
|
||||
|
@ -103,13 +119,13 @@ class DecisionStump extends WeightedClassifier
|
|||
|
||||
// Check the size of the weights given.
|
||||
// If none given, then assign 1 as a weight to each sample
|
||||
if ($this->weights) {
|
||||
$numWeights = count($this->weights);
|
||||
if ($numWeights != count($samples)) {
|
||||
throw new \Exception("Number of sample weights does not match with number of samples");
|
||||
}
|
||||
} else {
|
||||
if (count($this->weights) === 0) {
|
||||
$this->weights = array_fill(0, count($samples), 1);
|
||||
} else {
|
||||
$numWeights = count($this->weights);
|
||||
if ($numWeights !== count($samples)) {
|
||||
throw new InvalidArgumentException('Number of sample weights does not match with number of samples');
|
||||
}
|
||||
}
|
||||
|
||||
// Determine type of each column as either "continuous" or "nominal"
|
||||
|
@ -118,14 +134,17 @@ class DecisionStump extends WeightedClassifier
|
|||
// Try to find the best split in the columns of the dataset
|
||||
// by calculating error rate for each split point in each column
|
||||
$columns = range(0, count($samples[0]) - 1);
|
||||
if ($this->givenColumnIndex != self::AUTO_SELECT) {
|
||||
if ($this->givenColumnIndex !== self::AUTO_SELECT) {
|
||||
$columns = [$this->givenColumnIndex];
|
||||
}
|
||||
|
||||
$bestSplit = [
|
||||
'value' => 0, 'operator' => '',
|
||||
'prob' => [], 'column' => 0,
|
||||
'trainingErrorRate' => 1.0];
|
||||
'value' => 0,
|
||||
'operator' => '',
|
||||
'prob' => [],
|
||||
'column' => 0,
|
||||
'trainingErrorRate' => 1.0,
|
||||
];
|
||||
foreach ($columns as $col) {
|
||||
if ($this->columnTypes[$col] == DecisionTree::CONTINUOUS) {
|
||||
$split = $this->getBestNumericalSplit($samples, $targets, $col);
|
||||
|
@ -144,30 +163,10 @@ class DecisionStump extends WeightedClassifier
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* While finding best split point for a numerical valued column,
|
||||
* DecisionStump looks for equally distanced values between minimum and maximum
|
||||
* values in the column. Given <i>$count</i> value determines how many split
|
||||
* points to be probed. The more split counts, the better performance but
|
||||
* worse processing time (Default value is 10.0)
|
||||
*
|
||||
* @param float $count
|
||||
*/
|
||||
public function setNumericalSplitCount(float $count)
|
||||
{
|
||||
$this->numSplitCount = $count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines best split point for the given column
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param int $col
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getBestNumericalSplit(array $samples, array $targets, int $col)
|
||||
protected function getBestNumericalSplit(array $samples, array $targets, int $col): array
|
||||
{
|
||||
$values = array_column($samples, $col);
|
||||
// Trying all possible points may be accomplished in two general ways:
|
||||
|
@ -178,27 +177,35 @@ class DecisionStump extends WeightedClassifier
|
|||
$maxValue = max($values);
|
||||
$stepSize = ($maxValue - $minValue) / $this->numSplitCount;
|
||||
|
||||
$split = null;
|
||||
$split = [];
|
||||
|
||||
foreach (['<=', '>'] as $operator) {
|
||||
// Before trying all possible split points, let's first try
|
||||
// the average value for the cut point
|
||||
$threshold = array_sum($values) / (float) count($values);
|
||||
list($errorRate, $prob) = $this->calculateErrorRate($targets, $threshold, $operator, $values);
|
||||
if ($split == null || $errorRate < $split['trainingErrorRate']) {
|
||||
$split = ['value' => $threshold, 'operator' => $operator,
|
||||
'prob' => $prob, 'column' => $col,
|
||||
'trainingErrorRate' => $errorRate];
|
||||
[$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values);
|
||||
if (!isset($split['trainingErrorRate']) || $errorRate < $split['trainingErrorRate']) {
|
||||
$split = [
|
||||
'value' => $threshold,
|
||||
'operator' => $operator,
|
||||
'prob' => $prob,
|
||||
'column' => $col,
|
||||
'trainingErrorRate' => $errorRate,
|
||||
];
|
||||
}
|
||||
|
||||
// Try other possible points one by one
|
||||
for ($step = $minValue; $step <= $maxValue; $step+= $stepSize) {
|
||||
$threshold = (float)$step;
|
||||
list($errorRate, $prob) = $this->calculateErrorRate($targets, $threshold, $operator, $values);
|
||||
for ($step = $minValue; $step <= $maxValue; $step += $stepSize) {
|
||||
$threshold = (float) $step;
|
||||
[$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values);
|
||||
if ($errorRate < $split['trainingErrorRate']) {
|
||||
$split = ['value' => $threshold, 'operator' => $operator,
|
||||
'prob' => $prob, 'column' => $col,
|
||||
'trainingErrorRate' => $errorRate];
|
||||
$split = [
|
||||
'value' => $threshold,
|
||||
'operator' => $operator,
|
||||
'prob' => $prob,
|
||||
'column' => $col,
|
||||
'trainingErrorRate' => $errorRate,
|
||||
];
|
||||
}
|
||||
}// for
|
||||
}
|
||||
|
@ -206,29 +213,25 @@ class DecisionStump extends WeightedClassifier
|
|||
return $split;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param int $col
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getBestNominalSplit(array $samples, array $targets, int $col) : array
|
||||
protected function getBestNominalSplit(array $samples, array $targets, int $col): array
|
||||
{
|
||||
$values = array_column($samples, $col);
|
||||
$valueCounts = array_count_values($values);
|
||||
$distinctVals= array_keys($valueCounts);
|
||||
$distinctVals = array_keys($valueCounts);
|
||||
|
||||
$split = null;
|
||||
$split = [];
|
||||
|
||||
foreach (['=', '!='] as $operator) {
|
||||
foreach ($distinctVals as $val) {
|
||||
list($errorRate, $prob) = $this->calculateErrorRate($targets, $val, $operator, $values);
|
||||
|
||||
if ($split == null || $split['trainingErrorRate'] < $errorRate) {
|
||||
$split = ['value' => $val, 'operator' => $operator,
|
||||
'prob' => $prob, 'column' => $col,
|
||||
'trainingErrorRate' => $errorRate];
|
||||
[$errorRate, $prob] = $this->calculateErrorRate($targets, $val, $operator, $values);
|
||||
if (!isset($split['trainingErrorRate']) || $split['trainingErrorRate'] < $errorRate) {
|
||||
$split = [
|
||||
'value' => $val,
|
||||
'operator' => $operator,
|
||||
'prob' => $prob,
|
||||
'column' => $col,
|
||||
'trainingErrorRate' => $errorRate,
|
||||
];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -236,72 +239,42 @@ class DecisionStump extends WeightedClassifier
|
|||
return $split;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param mixed $leftValue
|
||||
* @param string $operator
|
||||
* @param mixed $rightValue
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
protected function evaluate($leftValue, string $operator, $rightValue)
|
||||
{
|
||||
switch ($operator) {
|
||||
case '>': return $leftValue > $rightValue;
|
||||
case '>=': return $leftValue >= $rightValue;
|
||||
case '<': return $leftValue < $rightValue;
|
||||
case '<=': return $leftValue <= $rightValue;
|
||||
case '=': return $leftValue === $rightValue;
|
||||
case '!=':
|
||||
case '<>': return $leftValue !== $rightValue;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the ratio of wrong predictions based on the new threshold
|
||||
* value given as the parameter
|
||||
*
|
||||
* @param array $targets
|
||||
* @param float $threshold
|
||||
* @param string $operator
|
||||
* @param array $values
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values) : array
|
||||
protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values): array
|
||||
{
|
||||
$wrong = 0.0;
|
||||
$prob = [];
|
||||
$leftLabel = $this->binaryLabels[0];
|
||||
$rightLabel= $this->binaryLabels[1];
|
||||
$rightLabel = $this->binaryLabels[1];
|
||||
|
||||
foreach ($values as $index => $value) {
|
||||
if ($this->evaluate($value, $operator, $threshold)) {
|
||||
if (Comparison::compare($value, $threshold, $operator)) {
|
||||
$predicted = $leftLabel;
|
||||
} else {
|
||||
$predicted = $rightLabel;
|
||||
}
|
||||
|
||||
$target = $targets[$index];
|
||||
if (strval($predicted) != strval($targets[$index])) {
|
||||
if ((string) $predicted != (string) $targets[$index]) {
|
||||
$wrong += $this->weights[$index];
|
||||
}
|
||||
|
||||
if (!isset($prob[$predicted][$target])) {
|
||||
$prob[$predicted][$target] = 0;
|
||||
}
|
||||
|
||||
++$prob[$predicted][$target];
|
||||
}
|
||||
|
||||
// Calculate probabilities: Proportion of labels in each leaf
|
||||
$dist = array_combine($this->binaryLabels, array_fill(0, 2, 0.0));
|
||||
foreach ($prob as $leaf => $counts) {
|
||||
$leafTotal = (float)array_sum($prob[$leaf]);
|
||||
$leafTotal = (float) array_sum($prob[$leaf]);
|
||||
foreach ($counts as $label => $count) {
|
||||
if (strval($leaf) == strval($label)) {
|
||||
if ((string) $leaf == (string) $label) {
|
||||
$dist[$leaf] = $count / $leafTotal;
|
||||
}
|
||||
}
|
||||
|
@ -316,15 +289,12 @@ class DecisionStump extends WeightedClassifier
|
|||
* Probability of a sample is calculated as the proportion of the label
|
||||
* within the labels of the training samples in the decision node
|
||||
*
|
||||
* @param array $sample
|
||||
* @param mixed $label
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function predictProbability(array $sample, $label) : float
|
||||
protected function predictProbability(array $sample, $label): float
|
||||
{
|
||||
$predicted = $this->predictSampleBinary($sample);
|
||||
if (strval($predicted) == strval($label)) {
|
||||
if ((string) $predicted == (string) $label) {
|
||||
return $this->prob[$label];
|
||||
}
|
||||
|
||||
|
@ -332,33 +302,18 @@ class DecisionStump extends WeightedClassifier
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
protected function predictSampleBinary(array $sample)
|
||||
{
|
||||
if ($this->evaluate($sample[$this->column], $this->operator, $this->value)) {
|
||||
if (Comparison::compare($sample[$this->column], $this->value, $this->operator)) {
|
||||
return $this->binaryLabels[0];
|
||||
}
|
||||
|
||||
return $this->binaryLabels[1];
|
||||
}
|
||||
|
||||
/**
|
||||
* @return void
|
||||
*/
|
||||
protected function resetBinary()
|
||||
protected function resetBinary(): void
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string
|
||||
*/
|
||||
public function __toString()
|
||||
{
|
||||
return "IF $this->column $this->operator $this->value " .
|
||||
"THEN " . $this->binaryLabels[0] . " ".
|
||||
"ELSE " . $this->binaryLabels[1];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,9 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\Linear;
|
||||
|
||||
use Closure;
|
||||
use Exception;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Helper\Optimizer\ConjugateGradient;
|
||||
|
||||
class LogisticRegression extends Adaline
|
||||
|
@ -11,17 +14,17 @@ class LogisticRegression extends Adaline
|
|||
/**
|
||||
* Batch training: Gradient descent algorithm (default)
|
||||
*/
|
||||
const BATCH_TRAINING = 1;
|
||||
public const BATCH_TRAINING = 1;
|
||||
|
||||
/**
|
||||
* Online training: Stochastic gradient descent learning
|
||||
*/
|
||||
const ONLINE_TRAINING = 2;
|
||||
public const ONLINE_TRAINING = 2;
|
||||
|
||||
/**
|
||||
* Conjugate Batch: Conjugate Gradient algorithm
|
||||
*/
|
||||
const CONJUGATE_GRAD_TRAINING = 3;
|
||||
public const CONJUGATE_GRAD_TRAINING = 3;
|
||||
|
||||
/**
|
||||
* Cost function to optimize: 'log' and 'sse' are supported <br>
|
||||
|
@ -30,7 +33,7 @@ class LogisticRegression extends Adaline
|
|||
*
|
||||
* @var string
|
||||
*/
|
||||
protected $costFunction = 'sse';
|
||||
protected $costFunction = 'log';
|
||||
|
||||
/**
|
||||
* Regularization term: only 'L2' is supported
|
||||
|
@ -59,32 +62,33 @@ class LogisticRegression extends Adaline
|
|||
*
|
||||
* Penalty (Regularization term) can be 'L2' or empty string to cancel penalty term
|
||||
*
|
||||
* @param int $maxIterations
|
||||
* @param bool $normalizeInputs
|
||||
* @param int $trainingType
|
||||
* @param string $cost
|
||||
* @param string $penalty
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(int $maxIterations = 500, bool $normalizeInputs = true,
|
||||
int $trainingType = self::CONJUGATE_GRAD_TRAINING, string $cost = 'sse',
|
||||
string $penalty = 'L2')
|
||||
{
|
||||
public function __construct(
|
||||
int $maxIterations = 500,
|
||||
bool $normalizeInputs = true,
|
||||
int $trainingType = self::CONJUGATE_GRAD_TRAINING,
|
||||
string $cost = 'log',
|
||||
string $penalty = 'L2'
|
||||
) {
|
||||
$trainingTypes = range(self::BATCH_TRAINING, self::CONJUGATE_GRAD_TRAINING);
|
||||
if (!in_array($trainingType, $trainingTypes)) {
|
||||
throw new \Exception("Logistic regression can only be trained with " .
|
||||
"batch (gradient descent), online (stochastic gradient descent) " .
|
||||
"or conjugate batch (conjugate gradients) algorithms");
|
||||
if (!in_array($trainingType, $trainingTypes, true)) {
|
||||
throw new InvalidArgumentException(
|
||||
'Logistic regression can only be trained with '.
|
||||
'batch (gradient descent), online (stochastic gradient descent) '.
|
||||
'or conjugate batch (conjugate gradients) algorithms'
|
||||
);
|
||||
}
|
||||
|
||||
if (!in_array($cost, ['log', 'sse'])) {
|
||||
throw new \Exception("Logistic regression cost function can be one of the following: \n" .
|
||||
"'log' for log-likelihood and 'sse' for sum of squared errors");
|
||||
if (!in_array($cost, ['log', 'sse'], true)) {
|
||||
throw new InvalidArgumentException(
|
||||
"Logistic regression cost function can be one of the following: \n".
|
||||
"'log' for log-likelihood and 'sse' for sum of squared errors"
|
||||
);
|
||||
}
|
||||
|
||||
if ($penalty != '' && strtoupper($penalty) !== 'L2') {
|
||||
throw new \Exception("Logistic regression supports only 'L2' regularization");
|
||||
if ($penalty !== '' && strtoupper($penalty) !== 'L2') {
|
||||
throw new InvalidArgumentException('Logistic regression supports only \'L2\' regularization');
|
||||
}
|
||||
|
||||
$this->learningRate = 0.001;
|
||||
|
@ -99,10 +103,8 @@ class LogisticRegression extends Adaline
|
|||
/**
|
||||
* Sets the learning rate if gradient descent algorithm is
|
||||
* selected for training
|
||||
*
|
||||
* @param float $learningRate
|
||||
*/
|
||||
public function setLearningRate(float $learningRate)
|
||||
public function setLearningRate(float $learningRate): void
|
||||
{
|
||||
$this->learningRate = $learningRate;
|
||||
}
|
||||
|
@ -110,10 +112,8 @@ class LogisticRegression extends Adaline
|
|||
/**
|
||||
* Lambda (λ) parameter of regularization term. If 0 is given,
|
||||
* then the regularization term is cancelled
|
||||
*
|
||||
* @param float $lambda
|
||||
*/
|
||||
public function setLambda(float $lambda)
|
||||
public function setLambda(float $lambda): void
|
||||
{
|
||||
$this->lambda = $lambda;
|
||||
}
|
||||
|
@ -122,40 +122,40 @@ class LogisticRegression extends Adaline
|
|||
* Adapts the weights with respect to given samples and targets
|
||||
* by use of selected solver
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
protected function runTraining(array $samples, array $targets)
|
||||
protected function runTraining(array $samples, array $targets): void
|
||||
{
|
||||
$callback = $this->getCostFunction();
|
||||
|
||||
switch ($this->trainingType) {
|
||||
case self::BATCH_TRAINING:
|
||||
return $this->runGradientDescent($samples, $targets, $callback, true);
|
||||
$this->runGradientDescent($samples, $targets, $callback, true);
|
||||
|
||||
return;
|
||||
|
||||
case self::ONLINE_TRAINING:
|
||||
return $this->runGradientDescent($samples, $targets, $callback, false);
|
||||
$this->runGradientDescent($samples, $targets, $callback, false);
|
||||
|
||||
return;
|
||||
|
||||
case self::CONJUGATE_GRAD_TRAINING:
|
||||
return $this->runConjugateGradient($samples, $targets, $callback);
|
||||
$this->runConjugateGradient($samples, $targets, $callback);
|
||||
|
||||
return;
|
||||
|
||||
default:
|
||||
throw new \Exception('Logistic regression has invalid training type: %s.', $this->trainingType);
|
||||
// Not reached
|
||||
throw new Exception(sprintf('Logistic regression has invalid training type: %d.', $this->trainingType));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Executes Conjugate Gradient method to optimize the weights of the LogReg model
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param \Closure $gradientFunc
|
||||
*/
|
||||
protected function runConjugateGradient(array $samples, array $targets, \Closure $gradientFunc)
|
||||
protected function runConjugateGradient(array $samples, array $targets, Closure $gradientFunc): void
|
||||
{
|
||||
if (empty($this->optimizer)) {
|
||||
if ($this->optimizer === null) {
|
||||
$this->optimizer = (new ConjugateGradient($this->featureCount))
|
||||
->setMaxIterations($this->maxIterations);
|
||||
}
|
||||
|
@ -167,14 +167,12 @@ class LogisticRegression extends Adaline
|
|||
/**
|
||||
* Returns the appropriate callback function for the selected cost function
|
||||
*
|
||||
* @return \Closure
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
protected function getCostFunction()
|
||||
protected function getCostFunction(): Closure
|
||||
{
|
||||
$penalty = 0;
|
||||
if ($this->penalty == 'L2') {
|
||||
if ($this->penalty === 'L2') {
|
||||
$penalty = $this->lambda;
|
||||
}
|
||||
|
||||
|
@ -190,7 +188,7 @@ class LogisticRegression extends Adaline
|
|||
* The gradient of the cost function to be used with gradient descent:
|
||||
* ∇J(x) = -(y - h(x)) = (h(x) - y)
|
||||
*/
|
||||
$callback = function ($weights, $sample, $y) use ($penalty) {
|
||||
return function ($weights, $sample, $y) use ($penalty) {
|
||||
$this->weights = $weights;
|
||||
$hX = $this->output($sample);
|
||||
|
||||
|
@ -199,17 +197,18 @@ class LogisticRegression extends Adaline
|
|||
if ($hX == 1) {
|
||||
$hX = 1 - 1e-10;
|
||||
}
|
||||
|
||||
if ($hX == 0) {
|
||||
$hX = 1e-10;
|
||||
}
|
||||
|
||||
$y = $y < 0 ? 0 : 1;
|
||||
|
||||
$error = -$y * log($hX) - (1 - $y) * log(1 - $hX);
|
||||
$gradient = $hX - $y;
|
||||
|
||||
return [$error, $gradient, $penalty];
|
||||
};
|
||||
|
||||
return $callback;
|
||||
|
||||
case 'sse':
|
||||
/*
|
||||
* Sum of squared errors or least squared errors cost function:
|
||||
|
@ -221,31 +220,27 @@ class LogisticRegression extends Adaline
|
|||
* The gradient of the cost function:
|
||||
* ∇J(x) = -(h(x) - y) . h(x) . (1 - h(x))
|
||||
*/
|
||||
$callback = function ($weights, $sample, $y) use ($penalty) {
|
||||
return function ($weights, $sample, $y) use ($penalty) {
|
||||
$this->weights = $weights;
|
||||
$hX = $this->output($sample);
|
||||
|
||||
$y = $y < 0 ? 0 : 1;
|
||||
|
||||
$error = ($y - $hX) ** 2;
|
||||
$gradient = -($y - $hX) * $hX * (1 - $hX);
|
||||
|
||||
return [$error, $gradient, $penalty];
|
||||
};
|
||||
|
||||
return $callback;
|
||||
|
||||
default:
|
||||
throw new \Exception(sprintf('Logistic regression has invalid cost function: %s.', $this->costFunction));
|
||||
// Not reached
|
||||
throw new Exception(sprintf('Logistic regression has invalid cost function: %s.', $this->costFunction));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the output of the network, a float value between 0.0 and 1.0
|
||||
*
|
||||
* @param array $sample
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function output(array $sample)
|
||||
protected function output(array $sample): float
|
||||
{
|
||||
$sum = parent::output($sample);
|
||||
|
||||
|
@ -254,16 +249,12 @@ class LogisticRegression extends Adaline
|
|||
|
||||
/**
|
||||
* Returns the class value (either -1 or 1) for the given input
|
||||
*
|
||||
* @param array $sample
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
protected function outputClass(array $sample)
|
||||
protected function outputClass(array $sample): int
|
||||
{
|
||||
$output = $this->output($sample);
|
||||
|
||||
if (round($output) > 0.5) {
|
||||
if ($output > 0.5) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -276,20 +267,17 @@ class LogisticRegression extends Adaline
|
|||
* The probability is simply taken as the distance of the sample
|
||||
* to the decision plane.
|
||||
*
|
||||
* @param array $sample
|
||||
* @param mixed $label
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function predictProbability(array $sample, $label)
|
||||
protected function predictProbability(array $sample, $label): float
|
||||
{
|
||||
$predicted = $this->predictSampleBinary($sample);
|
||||
$sample = $this->checkNormalizedSample($sample);
|
||||
$probability = $this->output($sample);
|
||||
|
||||
if (strval($predicted) == strval($label)) {
|
||||
$sample = $this->checkNormalizedSample($sample);
|
||||
return abs($this->output($sample) - 0.5);
|
||||
if (array_search($label, $this->labels, true) > 0) {
|
||||
return $probability;
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
return 1 - $probability;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,20 +4,24 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification\Linear;
|
||||
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\OneVsRest;
|
||||
use Phpml\Helper\Optimizer\StochasticGD;
|
||||
use Phpml\Helper\Optimizer\GD;
|
||||
use Closure;
|
||||
use Phpml\Classification\Classifier;
|
||||
use Phpml\Preprocessing\Normalizer;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Helper\OneVsRest;
|
||||
use Phpml\Helper\Optimizer\GD;
|
||||
use Phpml\Helper\Optimizer\Optimizer;
|
||||
use Phpml\Helper\Optimizer\StochasticGD;
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\IncrementalEstimator;
|
||||
use Phpml\Preprocessing\Normalizer;
|
||||
|
||||
class Perceptron implements Classifier, IncrementalEstimator
|
||||
{
|
||||
use Predictable, OneVsRest;
|
||||
use Predictable;
|
||||
use OneVsRest;
|
||||
|
||||
/**
|
||||
* @var \Phpml\Helper\Optimizer\Optimizer
|
||||
* @var Optimizer|GD|StochasticGD|null
|
||||
*/
|
||||
protected $optimizer;
|
||||
|
||||
|
@ -34,7 +38,7 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $weights;
|
||||
protected $weights = [];
|
||||
|
||||
/**
|
||||
* @var float
|
||||
|
@ -56,29 +60,23 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
*/
|
||||
protected $enableEarlyStop = true;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $costValues = [];
|
||||
|
||||
/**
|
||||
* Initalize a perceptron classifier with given learning rate and maximum
|
||||
* number of iterations used while training the perceptron
|
||||
*
|
||||
* @param float $learningRate Value between 0.0(exclusive) and 1.0(inclusive)
|
||||
* @param int $maxIterations Must be at least 1
|
||||
* @param bool $normalizeInputs
|
||||
* @param float $learningRate Value between 0.0(exclusive) and 1.0(inclusive)
|
||||
* @param int $maxIterations Must be at least 1
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, bool $normalizeInputs = true)
|
||||
{
|
||||
if ($learningRate <= 0.0 || $learningRate > 1.0) {
|
||||
throw new \Exception("Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)");
|
||||
throw new InvalidArgumentException('Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)');
|
||||
}
|
||||
|
||||
if ($maxIterations <= 0) {
|
||||
throw new \Exception("Maximum number of iterations must be an integer greater than 0");
|
||||
throw new InvalidArgumentException('Maximum number of iterations must be an integer greater than 0');
|
||||
}
|
||||
|
||||
if ($normalizeInputs) {
|
||||
|
@ -89,31 +87,24 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
$this->maxIterations = $maxIterations;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param array $labels
|
||||
*/
|
||||
public function partialTrain(array $samples, array $targets, array $labels = [])
|
||||
public function partialTrain(array $samples, array $targets, array $labels = []): void
|
||||
{
|
||||
$this->trainByLabel($samples, $targets, $labels);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param array $labels
|
||||
*/
|
||||
public function trainBinary(array $samples, array $targets, array $labels)
|
||||
public function trainBinary(array $samples, array $targets, array $labels): void
|
||||
{
|
||||
if ($this->normalizer) {
|
||||
if ($this->normalizer !== null) {
|
||||
$this->normalizer->transform($samples);
|
||||
}
|
||||
|
||||
// Set all target values to either -1 or 1
|
||||
$this->labels = [1 => $labels[0], -1 => $labels[1]];
|
||||
$this->labels = [
|
||||
1 => $labels[0],
|
||||
-1 => $labels[1],
|
||||
];
|
||||
foreach ($targets as $key => $target) {
|
||||
$targets[$key] = strval($target) == strval($this->labels[1]) ? 1 : -1;
|
||||
$targets[$key] = (string) $target == (string) $this->labels[1] ? 1 : -1;
|
||||
}
|
||||
|
||||
// Set samples and feature count vars
|
||||
|
@ -122,15 +113,6 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
$this->runTraining($samples, $targets);
|
||||
}
|
||||
|
||||
protected function resetBinary()
|
||||
{
|
||||
$this->labels = [];
|
||||
$this->optimizer = null;
|
||||
$this->featureCount = 0;
|
||||
$this->weights = null;
|
||||
$this->costValues = [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Normally enabling early stopping for the optimization procedure may
|
||||
* help saving processing time while in some cases it may result in
|
||||
|
@ -139,8 +121,6 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
* If "false" is given, the optimization procedure will always be executed
|
||||
* for $maxIterations times
|
||||
*
|
||||
* @param bool $enable
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setEarlyStop(bool $enable = true)
|
||||
|
@ -152,22 +132,26 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
|
||||
/**
|
||||
* Returns the cost values obtained during the training.
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function getCostValues()
|
||||
public function getCostValues(): array
|
||||
{
|
||||
return $this->costValues;
|
||||
}
|
||||
|
||||
protected function resetBinary(): void
|
||||
{
|
||||
$this->labels = [];
|
||||
$this->optimizer = null;
|
||||
$this->featureCount = 0;
|
||||
$this->weights = [];
|
||||
$this->costValues = [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Trains the perceptron model with Stochastic Gradient Descent optimization
|
||||
* to get the correct set of weights
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
protected function runTraining(array $samples, array $targets)
|
||||
protected function runTraining(array $samples, array $targets): void
|
||||
{
|
||||
// The cost function is the sum of squares
|
||||
$callback = function ($weights, $sample, $target) {
|
||||
|
@ -175,7 +159,7 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
|
||||
$prediction = $this->outputClass($sample);
|
||||
$gradient = $prediction - $target;
|
||||
$error = $gradient**2;
|
||||
$error = $gradient ** 2;
|
||||
|
||||
return [$error, $gradient];
|
||||
};
|
||||
|
@ -186,17 +170,12 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
/**
|
||||
* Executes a Gradient Descent algorithm for
|
||||
* the given cost function
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param \Closure $gradientFunc
|
||||
* @param bool $isBatch
|
||||
*/
|
||||
protected function runGradientDescent(array $samples, array $targets, \Closure $gradientFunc, bool $isBatch = false)
|
||||
protected function runGradientDescent(array $samples, array $targets, Closure $gradientFunc, bool $isBatch = false): void
|
||||
{
|
||||
$class = $isBatch ? GD::class : StochasticGD::class;
|
||||
|
||||
if (empty($this->optimizer)) {
|
||||
if ($this->optimizer === null) {
|
||||
$this->optimizer = (new $class($this->featureCount))
|
||||
->setLearningRate($this->learningRate)
|
||||
->setMaxIterations($this->maxIterations)
|
||||
|
@ -211,14 +190,10 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
/**
|
||||
* Checks if the sample should be normalized and if so, returns the
|
||||
* normalized sample
|
||||
*
|
||||
* @param array $sample
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function checkNormalizedSample(array $sample)
|
||||
protected function checkNormalizedSample(array $sample): array
|
||||
{
|
||||
if ($this->normalizer) {
|
||||
if ($this->normalizer !== null) {
|
||||
$samples = [$sample];
|
||||
$this->normalizer->transform($samples);
|
||||
$sample = $samples[0];
|
||||
|
@ -230,8 +205,7 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
/**
|
||||
* Calculates net output of the network as a float value for the given input
|
||||
*
|
||||
* @param array $sample
|
||||
* @return int
|
||||
* @return int|float
|
||||
*/
|
||||
protected function output(array $sample)
|
||||
{
|
||||
|
@ -249,11 +223,8 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
|
||||
/**
|
||||
* Returns the class value (either -1 or 1) for the given input
|
||||
*
|
||||
* @param array $sample
|
||||
* @return int
|
||||
*/
|
||||
protected function outputClass(array $sample)
|
||||
protected function outputClass(array $sample): int
|
||||
{
|
||||
return $this->output($sample) > 0 ? 1 : -1;
|
||||
}
|
||||
|
@ -264,26 +235,22 @@ class Perceptron implements Classifier, IncrementalEstimator
|
|||
* The probability is simply taken as the distance of the sample
|
||||
* to the decision plane.
|
||||
*
|
||||
* @param array $sample
|
||||
* @param mixed $label
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function predictProbability(array $sample, $label)
|
||||
protected function predictProbability(array $sample, $label): float
|
||||
{
|
||||
$predicted = $this->predictSampleBinary($sample);
|
||||
|
||||
if (strval($predicted) == strval($label)) {
|
||||
if ((string) $predicted == (string) $label) {
|
||||
$sample = $this->checkNormalizedSample($sample);
|
||||
return abs($this->output($sample));
|
||||
|
||||
return (float) abs($this->output($sample));
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
protected function predictSampleBinary(array $sample)
|
||||
|
|
|
@ -9,22 +9,23 @@ use Phpml\NeuralNetwork\Network\MultilayerPerceptron;
|
|||
|
||||
class MLPClassifier extends MultilayerPerceptron implements Classifier
|
||||
{
|
||||
|
||||
/**
|
||||
* @param mixed $target
|
||||
* @return int
|
||||
* @param mixed $target
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function getTargetClass($target): int
|
||||
{
|
||||
if (!in_array($target, $this->classes)) {
|
||||
throw InvalidArgumentException::invalidTarget($target);
|
||||
if (!in_array($target, $this->classes, true)) {
|
||||
throw new InvalidArgumentException(
|
||||
sprintf('Target with value "%s" is not part of the accepted classes', $target)
|
||||
);
|
||||
}
|
||||
return array_search($target, $this->classes);
|
||||
|
||||
return array_search($target, $this->classes, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
protected function predictSample(array $sample)
|
||||
|
@ -39,18 +40,17 @@ class MLPClassifier extends MultilayerPerceptron implements Classifier
|
|||
$max = $value;
|
||||
}
|
||||
}
|
||||
return $this->classes[$predictedClass];
|
||||
|
||||
return $predictedClass;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
* @param mixed $target
|
||||
*/
|
||||
protected function trainSample(array $sample, $target)
|
||||
protected function trainSample(array $sample, $target): void
|
||||
{
|
||||
|
||||
// Feed-forward.
|
||||
$this->setInput($sample)->getOutput();
|
||||
$this->setInput($sample);
|
||||
|
||||
// Back-propagate.
|
||||
$this->backpropagation->backpropagate($this->getLayers(), $this->getTargetClass($target));
|
||||
|
|
|
@ -4,6 +4,7 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Classification;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\Trainable;
|
||||
use Phpml\Math\Statistic\Mean;
|
||||
|
@ -11,11 +12,14 @@ use Phpml\Math\Statistic\StandardDeviation;
|
|||
|
||||
class NaiveBayes implements Classifier
|
||||
{
|
||||
use Trainable, Predictable;
|
||||
use Trainable;
|
||||
use Predictable;
|
||||
|
||||
const CONTINUOS = 1;
|
||||
const NOMINAL = 2;
|
||||
const EPSILON = 1e-10;
|
||||
public const CONTINUOS = 1;
|
||||
|
||||
public const NOMINAL = 2;
|
||||
|
||||
public const EPSILON = 1e-10;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
|
@ -25,7 +29,7 @@ class NaiveBayes implements Classifier
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $mean= [];
|
||||
private $mean = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
|
@ -57,19 +61,14 @@ class NaiveBayes implements Classifier
|
|||
*/
|
||||
private $labels = [];
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
public function train(array $samples, array $targets)
|
||||
public function train(array $samples, array $targets): void
|
||||
{
|
||||
$this->samples = array_merge($this->samples, $samples);
|
||||
$this->targets = array_merge($this->targets, $targets);
|
||||
$this->sampleCount = count($this->samples);
|
||||
$this->featureCount = count($this->samples[0]);
|
||||
|
||||
$labelCounts = array_count_values($this->targets);
|
||||
$this->labels = array_keys($labelCounts);
|
||||
$this->labels = array_map('strval', array_flip(array_flip($this->targets)));
|
||||
foreach ($this->labels as $label) {
|
||||
$samples = $this->getSamplesByLabel($label);
|
||||
$this->p[$label] = count($samples) / $this->sampleCount;
|
||||
|
@ -77,16 +76,39 @@ class NaiveBayes implements Classifier
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Chooses the best-scoring label for one sample.
 *
 * Each known label starts from its stored prior ($this->p[$label]) and is
 * increased by the value sampleProbability() returns for every feature.
 * The label with the highest total is returned.
 *
 * @return mixed key of the highest-scoring label (null when no labels are known)
 */
protected function predictSample(array $sample)
{
    $scores = [];
    foreach ($this->labels as $label) {
        $score = $this->p[$label];

        $feature = 0;
        while ($feature < $this->featureCount) {
            $score += $this->sampleProbability($sample, $feature, $label);
            ++$feature;
        }

        $scores[$label] = $score;
    }

    // Sort descending while keeping the label keys, then read the first key.
    arsort($scores, SORT_NUMERIC);
    reset($scores);

    return key($scores);
}
|
||||
|
||||
/**
|
||||
* Calculates vital statistics for each label & feature. Stores these
|
||||
* values in private array in order to avoid repeated calculation
|
||||
* @param string $label
|
||||
* @param array $samples
|
||||
*/
|
||||
private function calculateStatistics($label, $samples)
|
||||
private function calculateStatistics(string $label, array $samples): void
|
||||
{
|
||||
$this->std[$label] = array_fill(0, $this->featureCount, 0);
|
||||
$this->mean[$label]= array_fill(0, $this->featureCount, 0);
|
||||
$this->mean[$label] = array_fill(0, $this->featureCount, 0);
|
||||
$this->dataType[$label] = array_fill(0, $this->featureCount, self::CONTINUOS);
|
||||
$this->discreteProb[$label] = array_fill(0, $this->featureCount, self::CONTINUOS);
|
||||
for ($i = 0; $i < $this->featureCount; ++$i) {
|
||||
|
@ -113,25 +135,25 @@ class NaiveBayes implements Classifier
|
|||
|
||||
/**
|
||||
* Calculates the probability P(label|sample_n)
|
||||
*
|
||||
* @param array $sample
|
||||
* @param int $feature
|
||||
* @param string $label
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
private function sampleProbability($sample, $feature, $label)
|
||||
private function sampleProbability(array $sample, int $feature, string $label): float
|
||||
{
|
||||
if (!isset($sample[$feature])) {
|
||||
throw new InvalidArgumentException('Missing feature. All samples must have equal number of features');
|
||||
}
|
||||
|
||||
$value = $sample[$feature];
|
||||
if ($this->dataType[$label][$feature] == self::NOMINAL) {
|
||||
if (!isset($this->discreteProb[$label][$feature][$value]) ||
|
||||
$this->discreteProb[$label][$feature][$value] == 0) {
|
||||
return self::EPSILON;
|
||||
}
|
||||
|
||||
return $this->discreteProb[$label][$feature][$value];
|
||||
}
|
||||
$std = $this->std[$label][$feature] ;
|
||||
$mean= $this->mean[$label][$feature];
|
||||
|
||||
$std = $this->std[$label][$feature];
|
||||
$mean = $this->mean[$label][$feature];
|
||||
// Calculate the probability density by use of normal/Gaussian distribution
|
||||
// Ref: https://en.wikipedia.org/wiki/Normal_distribution
|
||||
//
|
||||
|
@ -139,19 +161,16 @@ class NaiveBayes implements Classifier
|
|||
// some libraries adopt taking log of calculations such as
|
||||
// scikit-learn did.
|
||||
// (See : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py)
|
||||
$pdf = -0.5 * log(2.0 * pi() * $std * $std);
|
||||
$pdf -= 0.5 * pow($value - $mean, 2) / ($std * $std);
|
||||
$pdf = -0.5 * log(2.0 * M_PI * $std * $std);
|
||||
$pdf -= 0.5 * (($value - $mean) ** 2) / ($std * $std);
|
||||
|
||||
return $pdf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return samples belonging to specific label
|
||||
*
|
||||
* @param string $label
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private function getSamplesByLabel($label)
|
||||
private function getSamplesByLabel(string $label): array
|
||||
{
|
||||
$samples = [];
|
||||
for ($i = 0; $i < $this->sampleCount; ++$i) {
|
||||
|
@ -159,30 +178,7 @@ class NaiveBayes implements Classifier
|
|||
$samples[] = $this->samples[$i];
|
||||
}
|
||||
}
|
||||
|
||||
return $samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
* @return mixed
|
||||
*/
|
||||
protected function predictSample(array $sample)
|
||||
{
|
||||
// Use NaiveBayes assumption for each label using:
|
||||
// P(label|features) = P(label) * P(feature0|label) * P(feature1|label) .... P(featureN|label)
|
||||
// Then compare probability for each class to determine which label is most likely
|
||||
$predictions = [];
|
||||
foreach ($this->labels as $label) {
|
||||
$p = $this->p[$label];
|
||||
for ($i = 0; $i<$this->featureCount; ++$i) {
|
||||
$Plf = $this->sampleProbability($sample, $i, $label);
|
||||
$p += $Plf;
|
||||
}
|
||||
$predictions[$label] = $p;
|
||||
}
|
||||
|
||||
arsort($predictions, SORT_NUMERIC);
|
||||
reset($predictions);
|
||||
return key($predictions);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,20 +10,15 @@ use Phpml\SupportVectorMachine\Type;
|
|||
|
||||
class SVC extends SupportVectorMachine implements Classifier
|
||||
{
|
||||
/**
|
||||
* @param int $kernel
|
||||
* @param float $cost
|
||||
* @param int $degree
|
||||
* @param float|null $gamma
|
||||
* @param float $coef0
|
||||
* @param float $tolerance
|
||||
* @param int $cacheSize
|
||||
* @param bool $shrinking
|
||||
* @param bool $probabilityEstimates
|
||||
*/
|
||||
public function __construct(
|
||||
int $kernel = Kernel::LINEAR, float $cost = 1.0, int $degree = 3, float $gamma = null, float $coef0 = 0.0,
|
||||
float $tolerance = 0.001, int $cacheSize = 100, bool $shrinking = true,
|
||||
int $kernel = Kernel::RBF,
|
||||
float $cost = 1.0,
|
||||
int $degree = 3,
|
||||
?float $gamma = null,
|
||||
float $coef0 = 0.0,
|
||||
float $tolerance = 0.001,
|
||||
int $cacheSize = 100,
|
||||
bool $shrinking = true,
|
||||
bool $probabilityEstimates = false
|
||||
) {
|
||||
parent::__construct(Type::C_SVC, $kernel, $cost, 0.5, $degree, $gamma, $coef0, 0.1, $tolerance, $cacheSize, $shrinking, $probabilityEstimates);
|
||||
|
|
|
@ -9,14 +9,12 @@ abstract class WeightedClassifier implements Classifier
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $weights;
|
||||
protected $weights = [];
|
||||
|
||||
/**
|
||||
* Sets the array including a weight for each sample
|
||||
*
|
||||
* @param array $weights
|
||||
*/
|
||||
public function setSampleWeights(array $weights)
|
||||
public function setSampleWeights(array $weights): void
|
||||
{
|
||||
$this->weights = $weights;
|
||||
}
|
||||
|
|
|
@ -6,10 +6,5 @@ namespace Phpml\Clustering;
|
|||
|
||||
interface Clusterer
|
||||
{
|
||||
/**
|
||||
* @param array $samples
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function cluster(array $samples);
|
||||
public function cluster(array $samples): array;
|
||||
}
|
||||
|
|
|
@ -9,6 +9,8 @@ use Phpml\Math\Distance\Euclidean;
|
|||
|
||||
class DBSCAN implements Clusterer
|
||||
{
|
||||
private const NOISE = -1;
|
||||
|
||||
/**
|
||||
* @var float
|
||||
*/
|
||||
|
@ -24,14 +26,9 @@ class DBSCAN implements Clusterer
|
|||
*/
|
||||
private $distanceMetric;
|
||||
|
||||
/**
|
||||
* @param float $epsilon
|
||||
* @param int $minSamples
|
||||
* @param Distance $distanceMetric
|
||||
*/
|
||||
public function __construct($epsilon = 0.5, $minSamples = 3, Distance $distanceMetric = null)
|
||||
public function __construct(float $epsilon = 0.5, int $minSamples = 3, ?Distance $distanceMetric = null)
|
||||
{
|
||||
if (null === $distanceMetric) {
|
||||
if ($distanceMetric === null) {
|
||||
$distanceMetric = new Euclidean();
|
||||
}
|
||||
|
||||
|
@ -40,72 +37,84 @@ class DBSCAN implements Clusterer
|
|||
$this->distanceMetric = $distanceMetric;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function cluster(array $samples)
|
||||
public function cluster(array $samples): array
|
||||
{
|
||||
$clusters = [];
|
||||
$visited = [];
|
||||
$labels = [];
|
||||
$n = 0;
|
||||
|
||||
foreach ($samples as $index => $sample) {
|
||||
if (isset($visited[$index])) {
|
||||
if (isset($labels[$index])) {
|
||||
continue;
|
||||
}
|
||||
$visited[$index] = true;
|
||||
|
||||
$regionSamples = $this->getSamplesInRegion($sample, $samples);
|
||||
if (count($regionSamples) >= $this->minSamples) {
|
||||
$clusters[] = $this->expandCluster($regionSamples, $visited);
|
||||
$neighborIndices = $this->getIndicesInRegion($sample, $samples);
|
||||
|
||||
if (count($neighborIndices) < $this->minSamples) {
|
||||
$labels[$index] = self::NOISE;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
$labels[$index] = $n;
|
||||
|
||||
$this->expandCluster($samples, $neighborIndices, $labels, $n);
|
||||
|
||||
++$n;
|
||||
}
|
||||
|
||||
return $this->groupByCluster($samples, $labels, $n);
|
||||
}
|
||||
|
||||
/**
 * Grows cluster $n outwards from the given seed indices.
 *
 * Seeds are consumed from the end of the list. A seed that already carries
 * a label is left alone unless that label is NOISE, in which case it is
 * claimed for this cluster. An unlabelled seed joins the cluster and, when
 * its own neighbourhood holds at least $this->minSamples points, contributes
 * those neighbours as further seeds.
 *
 * @param array $samples all samples, keyed by index
 * @param array $seeds   indices still to be examined
 * @param array $labels  index => cluster label map, updated in place
 * @param int   $n       label of the cluster being expanded
 */
private function expandCluster(array $samples, array $seeds, array &$labels, int $n): void
{
    for ($current = array_pop($seeds); $current !== null; $current = array_pop($seeds)) {
        if (isset($labels[$current])) {
            // Only points previously marked as noise change owner.
            if ($labels[$current] === self::NOISE) {
                $labels[$current] = $n;
            }

            continue;
        }

        $labels[$current] = $n;

        $neighbours = $this->getIndicesInRegion($samples[$current], $samples);
        if (count($neighbours) < $this->minSamples) {
            continue;
        }

        $seeds = array_unique(array_merge($seeds, $neighbours));
    }
}
|
||||
|
||||
/**
 * Lists the keys of all samples lying strictly closer than $this->epsilon
 * to $center, according to the configured distance metric.
 *
 * @param array $center  coordinates of the reference sample
 * @param array $samples all samples, keyed by index
 *
 * @return array keys of the matching samples, in their original order
 */
private function getIndicesInRegion(array $center, array $samples): array
{
    $withinRadius = array_filter($samples, function ($candidate) use ($center) {
        return $this->distanceMetric->distance($center, $candidate) < $this->epsilon;
    });

    // array_filter keeps the original keys, which are exactly the indices wanted.
    return array_keys($withinRadius);
}
|
||||
|
||||
/**
 * Turns the index => label map into a list of clusters.
 *
 * @param array $samples all samples, keyed by index
 * @param array $labels  index => cluster label (or NOISE) for every sample
 * @param int   $n       number of clusters that were found
 *
 * @return array one entry per cluster, each holding that cluster's samples
 */
private function groupByCluster(array $samples, array $labels, int $n): array
{
    $grouped = array_fill(0, $n, []);

    foreach ($samples as $key => $sample) {
        $label = $labels[$key];
        if ($label === self::NOISE) {
            // Noise points belong to no cluster.
            continue;
        }

        $grouped[$label][$key] = $sample;
    }

    // Renumber integer keys (0, 1, 2, ...) for backward compatibility;
    // array_merge leaves string keys as they are.
    return array_map(static function ($members) {
        return array_merge($members, []);
    }, $grouped);
}
|
||||
|
||||
/**
|
||||
* @param array $localSample
|
||||
* @param array $samples
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private function getSamplesInRegion($localSample, $samples)
|
||||
{
|
||||
$region = [];
|
||||
|
||||
foreach ($samples as $index => $sample) {
|
||||
if ($this->distanceMetric->distance($localSample, $sample) < $this->epsilon) {
|
||||
$region[$index] = $sample;
|
||||
}
|
||||
}
|
||||
|
||||
return $region;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $visited
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private function expandCluster($samples, &$visited)
|
||||
{
|
||||
$cluster = [];
|
||||
|
||||
foreach ($samples as $index => $sample) {
|
||||
if (!isset($visited[$index])) {
|
||||
$visited[$index] = true;
|
||||
$regionSamples = $this->getSamplesInRegion($sample, $samples);
|
||||
if (count($regionSamples) > $this->minSamples) {
|
||||
$cluster = array_merge($regionSamples, $cluster);
|
||||
}
|
||||
}
|
||||
|
||||
$cluster[] = $sample;
|
||||
}
|
||||
|
||||
return $cluster;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,8 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Clustering;
|
||||
|
||||
use Phpml\Clustering\KMeans\Point;
|
||||
use Phpml\Clustering\KMeans\Cluster;
|
||||
use Phpml\Clustering\KMeans\Point;
|
||||
use Phpml\Clustering\KMeans\Space;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Distance\Euclidean;
|
||||
|
@ -18,9 +18,9 @@ class FuzzyCMeans implements Clusterer
|
|||
private $clustersNumber;
|
||||
|
||||
/**
|
||||
* @var array|Cluster[]
|
||||
* @var Cluster[]
|
||||
*/
|
||||
private $clusters = null;
|
||||
private $clusters = [];
|
||||
|
||||
/**
|
||||
* @var Space
|
||||
|
@ -28,9 +28,9 @@ class FuzzyCMeans implements Clusterer
|
|||
private $space;
|
||||
|
||||
/**
|
||||
* @var array|float[][]
|
||||
* @var float[][]
|
||||
*/
|
||||
private $membership;
|
||||
private $membership = [];
|
||||
|
||||
/**
|
||||
* @var float
|
||||
|
@ -55,170 +55,36 @@ class FuzzyCMeans implements Clusterer
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $samples;
|
||||
private $samples = [];
|
||||
|
||||
/**
|
||||
* @param int $clustersNumber
|
||||
* @param float $fuzziness
|
||||
* @param float $epsilon
|
||||
* @param int $maxIterations
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100)
|
||||
{
|
||||
if ($clustersNumber <= 0) {
|
||||
throw InvalidArgumentException::invalidClustersNumber();
|
||||
throw new InvalidArgumentException('Invalid clusters number');
|
||||
}
|
||||
|
||||
$this->clustersNumber = $clustersNumber;
|
||||
$this->fuzziness = $fuzziness;
|
||||
$this->epsilon = $epsilon;
|
||||
$this->maxIterations = $maxIterations;
|
||||
}
|
||||
|
||||
protected function initClusters()
|
||||
{
|
||||
// Membership array is a matrix of cluster number by sample counts
|
||||
// We initilize the membership array with random values
|
||||
$dim = $this->space->getDimension();
|
||||
$this->generateRandomMembership($dim, $this->sampleCount);
|
||||
$this->updateClusters();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int $rows
|
||||
* @param int $cols
|
||||
*/
|
||||
protected function generateRandomMembership(int $rows, int $cols)
|
||||
{
|
||||
$this->membership = [];
|
||||
for ($i = 0; $i < $rows; ++$i) {
|
||||
$row = [];
|
||||
$total = 0.0;
|
||||
for ($k = 0; $k < $cols; ++$k) {
|
||||
$val = rand(1, 5) / 10.0;
|
||||
$row[] = $val;
|
||||
$total += $val;
|
||||
}
|
||||
|
||||
$this->membership[] = array_map(function ($val) use ($total) {
|
||||
return $val / $total;
|
||||
}, $row);
|
||||
}
|
||||
}
|
||||
|
||||
protected function updateClusters()
|
||||
{
|
||||
$dim = $this->space->getDimension();
|
||||
if (!$this->clusters) {
|
||||
$this->clusters = [];
|
||||
for ($i = 0; $i < $this->clustersNumber; ++$i) {
|
||||
$this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0));
|
||||
}
|
||||
}
|
||||
|
||||
for ($i = 0; $i < $this->clustersNumber; ++$i) {
|
||||
$cluster = $this->clusters[$i];
|
||||
$center = $cluster->getCoordinates();
|
||||
for ($k = 0; $k < $dim; ++$k) {
|
||||
$a = $this->getMembershipRowTotal($i, $k, true);
|
||||
$b = $this->getMembershipRowTotal($i, $k, false);
|
||||
$center[$k] = $a / $b;
|
||||
}
|
||||
|
||||
$cluster->setCoordinates($center);
|
||||
}
|
||||
}
|
||||
|
||||
protected function getMembershipRowTotal(int $row, int $col, bool $multiply)
|
||||
{
|
||||
$sum = 0.0;
|
||||
for ($k = 0; $k < $this->sampleCount; ++$k) {
|
||||
$val = pow($this->membership[$row][$k], $this->fuzziness);
|
||||
if ($multiply) {
|
||||
$val *= $this->samples[$k][$col];
|
||||
}
|
||||
|
||||
$sum += $val;
|
||||
}
|
||||
|
||||
return $sum;
|
||||
}
|
||||
|
||||
protected function updateMembershipMatrix()
|
||||
{
|
||||
for ($i = 0; $i < $this->clustersNumber; ++$i) {
|
||||
for ($k = 0; $k < $this->sampleCount; ++$k) {
|
||||
$distCalc = $this->getDistanceCalc($i, $k);
|
||||
$this->membership[$i][$k] = 1.0 / $distCalc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param int $row
|
||||
* @param int $col
|
||||
* @return float
|
||||
*/
|
||||
protected function getDistanceCalc(int $row, int $col)
|
||||
{
|
||||
$sum = 0.0;
|
||||
$distance = new Euclidean();
|
||||
$dist1 = $distance->distance(
|
||||
$this->clusters[$row]->getCoordinates(),
|
||||
$this->samples[$col]
|
||||
);
|
||||
|
||||
for ($j = 0; $j < $this->clustersNumber; ++$j) {
|
||||
$dist2 = $distance->distance(
|
||||
$this->clusters[$j]->getCoordinates(),
|
||||
$this->samples[$col]
|
||||
);
|
||||
|
||||
$val = pow($dist1 / $dist2, 2.0 / ($this->fuzziness - 1));
|
||||
$sum += $val;
|
||||
}
|
||||
return $sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* The objective is to minimize the distance between all data points
|
||||
* and all cluster centers. This method returns the summation of all
|
||||
* these distances
|
||||
*/
|
||||
protected function getObjective()
|
||||
{
|
||||
$sum = 0.0;
|
||||
$distance = new Euclidean();
|
||||
for ($i = 0; $i < $this->clustersNumber; ++$i) {
|
||||
$clust = $this->clusters[$i]->getCoordinates();
|
||||
for ($k = 0; $k < $this->sampleCount; ++$k) {
|
||||
$point = $this->samples[$k];
|
||||
$sum += $distance->distance($clust, $point);
|
||||
}
|
||||
}
|
||||
|
||||
return $sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getMembershipMatrix()
|
||||
public function getMembershipMatrix(): array
|
||||
{
|
||||
return $this->membership;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array|Point[] $samples
|
||||
* @return array
|
||||
* @param Point[]|int[][] $samples
|
||||
*/
|
||||
public function cluster(array $samples)
|
||||
public function cluster(array $samples): array
|
||||
{
|
||||
// Initialize variables, clusters and membership matrix
|
||||
$this->sampleCount = count($samples);
|
||||
$this->samples =& $samples;
|
||||
$this->samples = &$samples;
|
||||
$this->space = new Space(count($samples[0]));
|
||||
$this->initClusters();
|
||||
|
||||
|
@ -242,8 +108,7 @@ class FuzzyCMeans implements Clusterer
|
|||
$column = array_column($this->membership, $k);
|
||||
arsort($column);
|
||||
reset($column);
|
||||
$i = key($column);
|
||||
$cluster = $this->clusters[$i];
|
||||
$cluster = $this->clusters[key($column)];
|
||||
$cluster->attach(new Point($this->samples[$k]));
|
||||
}
|
||||
|
||||
|
@ -255,4 +120,120 @@ class FuzzyCMeans implements Clusterer
|
|||
|
||||
return $grouped;
|
||||
}
|
||||
|
||||
/**
 * Prepares the first iteration: fills the membership matrix with random
 * values and derives the initial cluster centres from it.
 */
protected function initClusters(): void
{
    // The membership matrix holds one row per cluster and one column per
    // sample; every reader indexes it as membership[cluster][sample]. It is
    // therefore seeded with clustersNumber rows, not with the space
    // dimension, which only matched when both numbers happened to be equal.
    $this->generateRandomMembership($this->clustersNumber, $this->sampleCount);
    $this->updateClusters();
}
|
||||
|
||||
/**
 * Replaces the membership matrix with $rows x $cols random values.
 *
 * Every entry is drawn from {0.1, 0.2, 0.3, 0.4, 0.5} and each row is then
 * scaled so that its entries add up to one.
 */
protected function generateRandomMembership(int $rows, int $cols): void
{
    $this->membership = [];

    for ($r = 0; $r < $rows; ++$r) {
        $raw = [];
        $rowSum = 0.0;
        for ($c = 0; $c < $cols; ++$c) {
            $draw = random_int(1, 5) / 10.0;
            $raw[] = $draw;
            $rowSum += $draw;
        }

        // Normalise the row by its own sum.
        $normalised = [];
        foreach ($raw as $draw) {
            $normalised[] = $draw / $rowSum;
        }

        $this->membership[] = $normalised;
    }
}
|
||||
|
||||
/**
 * Recomputes every cluster centre from the current membership matrix.
 *
 * On the first call the clusters do not exist yet, so $this->clustersNumber
 * clusters are created with all coordinates set to 0.0. Afterwards each
 * coordinate of each centre becomes the membership-weighted sum of that
 * coordinate over all samples divided by the sum of the weights.
 */
protected function updateClusters(): void
{
    $dimension = $this->space->getDimension();

    if ($this->clusters === []) {
        $created = 0;
        while ($created < $this->clustersNumber) {
            $this->clusters[] = new Cluster($this->space, array_fill(0, $dimension, 0.0));
            ++$created;
        }
    }

    for ($clusterIndex = 0; $clusterIndex < $this->clustersNumber; ++$clusterIndex) {
        $cluster = $this->clusters[$clusterIndex];
        $centre = $cluster->getCoordinates();

        $axis = 0;
        while ($axis < $dimension) {
            $weightedSum = $this->getMembershipRowTotal($clusterIndex, $axis, true);
            $weightSum = $this->getMembershipRowTotal($clusterIndex, $axis, false);
            $centre[$axis] = $weightedSum / $weightSum;
            ++$axis;
        }

        $cluster->setCoordinates($centre);
    }
}
|
||||
|
||||
/**
 * Sums one membership row after raising each entry to the fuzziness power.
 *
 * @param int  $row      row of the membership matrix to read
 * @param int  $col      coordinate of the samples used when $multiply is true
 * @param bool $multiply when true, each powered membership is multiplied by
 *                       the sample's value at $col before being added
 */
protected function getMembershipRowTotal(int $row, int $col, bool $multiply): float
{
    $total = 0.0;

    $sampleIndex = 0;
    while ($sampleIndex < $this->sampleCount) {
        $weight = $this->membership[$row][$sampleIndex] ** $this->fuzziness;
        $total += $multiply
            ? $weight * $this->samples[$sampleIndex][$col]
            : $weight;
        ++$sampleIndex;
    }

    return $total;
}
|
||||
|
||||
/**
 * Refreshes every membership entry as the reciprocal of the value that
 * getDistanceCalc() returns for the matching cluster/sample pair.
 */
protected function updateMembershipMatrix(): void
{
    $cluster = 0;
    while ($cluster < $this->clustersNumber) {
        $sample = 0;
        while ($sample < $this->sampleCount) {
            $this->membership[$cluster][$sample] = 1.0 / $this->getDistanceCalc($cluster, $sample);
            ++$sample;
        }

        ++$cluster;
    }
}
|
||||
|
||||
/**
 * Computes the denominator of the fuzzy c-means membership update for one
 * cluster/sample pair:
 *
 *     sum over all clusters j of (d(row, col) / d(j, col)) ^ (2 / (fuzziness - 1))
 *
 * where d(x, col) is the Euclidean distance between the centre of cluster x
 * and sample col.
 *
 * @param int $row index of the cluster the membership belongs to
 * @param int $col index of the sample
 */
protected function getDistanceCalc(int $row, int $col): float
{
    $distance = new Euclidean();
    $sample = $this->samples[$col];

    // The whole quotient 2 / (fuzziness - 1) is the exponent. It is computed
    // on its own because `**` binds tighter than `/`: writing
    // `$ratio ** 2.0 / ($this->fuzziness - 1)` would square the ratio and
    // then divide it, which is only correct when fuzziness is exactly 2.
    $exponent = 2.0 / ($this->fuzziness - 1);

    $dist1 = $distance->distance($this->clusters[$row]->getCoordinates(), $sample);

    $sum = 0.0;
    for ($j = 0; $j < $this->clustersNumber; ++$j) {
        $dist2 = $distance->distance($this->clusters[$j]->getCoordinates(), $sample);
        $sum += ($dist1 / $dist2) ** $exponent;
    }

    return $sum;
}
|
||||
|
||||
/**
 * Value of the objective the algorithm tries to minimise: the Euclidean
 * distance from every cluster centre to every sample, added together.
 */
protected function getObjective(): float
{
    $metric = new Euclidean();
    $total = 0.0;

    $clusterIndex = 0;
    while ($clusterIndex < $this->clustersNumber) {
        $centre = $this->clusters[$clusterIndex]->getCoordinates();

        $sampleIndex = 0;
        while ($sampleIndex < $this->sampleCount) {
            $total += $metric->distance($centre, $this->samples[$sampleIndex]);
            ++$sampleIndex;
        }

        ++$clusterIndex;
    }

    return $total;
}
|
||||
}
|
||||
|
|
|
@ -9,8 +9,9 @@ use Phpml\Exception\InvalidArgumentException;
|
|||
|
||||
class KMeans implements Clusterer
|
||||
{
|
||||
const INIT_RANDOM = 1;
|
||||
const INIT_KMEANS_PLUS_PLUS = 2;
|
||||
public const INIT_RANDOM = 1;
|
||||
|
||||
public const INIT_KMEANS_PLUS_PLUS = 2;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
|
@ -22,32 +23,21 @@ class KMeans implements Clusterer
|
|||
*/
|
||||
private $initialization;
|
||||
|
||||
/**
|
||||
* @param int $clustersNumber
|
||||
* @param int $initialization
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(int $clustersNumber, int $initialization = self::INIT_KMEANS_PLUS_PLUS)
|
||||
{
|
||||
if ($clustersNumber <= 0) {
|
||||
throw InvalidArgumentException::invalidClustersNumber();
|
||||
throw new InvalidArgumentException('Invalid clusters number');
|
||||
}
|
||||
|
||||
$this->clustersNumber = $clustersNumber;
|
||||
$this->initialization = $initialization;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function cluster(array $samples)
|
||||
public function cluster(array $samples): array
|
||||
{
|
||||
$space = new Space(count($samples[0]));
|
||||
foreach ($samples as $sample) {
|
||||
$space->addPoint($sample);
|
||||
$space = new Space(count(reset($samples)));
|
||||
foreach ($samples as $key => $sample) {
|
||||
$space->addPoint($sample, $key);
|
||||
}
|
||||
|
||||
$clusters = [];
|
||||
|
|
|
@ -5,11 +5,10 @@ declare(strict_types=1);
|
|||
namespace Phpml\Clustering\KMeans;
|
||||
|
||||
use IteratorAggregate;
|
||||
use Countable;
|
||||
use SplObjectStorage;
|
||||
use LogicException;
|
||||
use SplObjectStorage;
|
||||
|
||||
class Cluster extends Point implements IteratorAggregate, Countable
|
||||
class Cluster extends Point implements IteratorAggregate
|
||||
{
|
||||
/**
|
||||
* @var Space
|
||||
|
@ -21,10 +20,6 @@ class Cluster extends Point implements IteratorAggregate, Countable
|
|||
*/
|
||||
protected $points;
|
||||
|
||||
/**
|
||||
* @param Space $space
|
||||
* @param array $coordinates
|
||||
*/
|
||||
public function __construct(Space $space, array $coordinates)
|
||||
{
|
||||
parent::__construct($coordinates);
|
||||
|
@ -32,23 +27,21 @@ class Cluster extends Point implements IteratorAggregate, Countable
|
|||
$this->points = new SplObjectStorage();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getPoints()
|
||||
public function getPoints(): array
|
||||
{
|
||||
$points = [];
|
||||
foreach ($this->points as $point) {
|
||||
$points[] = $point->toArray();
|
||||
if ($point->label === null) {
|
||||
$points[] = $point->toArray();
|
||||
} else {
|
||||
$points[$point->label] = $point->toArray();
|
||||
}
|
||||
}
|
||||
|
||||
return $points;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function toArray()
|
||||
public function toArray(): array
|
||||
{
|
||||
return [
|
||||
'centroid' => parent::toArray(),
|
||||
|
@ -56,17 +49,10 @@ class Cluster extends Point implements IteratorAggregate, Countable
|
|||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param Point $point
|
||||
*
|
||||
* @return Point
|
||||
*
|
||||
* @throws \LogicException
|
||||
*/
|
||||
public function attach(Point $point)
|
||||
public function attach(Point $point): Point
|
||||
{
|
||||
if ($point instanceof self) {
|
||||
throw new LogicException('cannot attach a cluster to another');
|
||||
throw new LogicException('Cannot attach a cluster to another');
|
||||
}
|
||||
|
||||
$this->points->attach($point);
|
||||
|
@ -74,37 +60,27 @@ class Cluster extends Point implements IteratorAggregate, Countable
|
|||
return $point;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param Point $point
|
||||
*
|
||||
* @return Point
|
||||
*/
|
||||
public function detach(Point $point)
|
||||
public function detach(Point $point): Point
|
||||
{
|
||||
$this->points->detach($point);
|
||||
|
||||
return $point;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param SplObjectStorage $points
|
||||
*/
|
||||
public function attachAll(SplObjectStorage $points)
|
||||
public function attachAll(SplObjectStorage $points): void
|
||||
{
|
||||
$this->points->addAll($points);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param SplObjectStorage $points
|
||||
*/
|
||||
public function detachAll(SplObjectStorage $points)
|
||||
public function detachAll(SplObjectStorage $points): void
|
||||
{
|
||||
$this->points->removeAll($points);
|
||||
}
|
||||
|
||||
public function updateCentroid()
|
||||
public function updateCentroid(): void
|
||||
{
|
||||
if (!$count = count($this->points)) {
|
||||
$count = count($this->points);
|
||||
if ($count === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -129,18 +105,12 @@ class Cluster extends Point implements IteratorAggregate, Countable
|
|||
return $this->points;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return mixed
|
||||
*/
|
||||
public function count()
|
||||
public function count(): int
|
||||
{
|
||||
return count($this->points);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $newCoordinates
|
||||
*/
|
||||
public function setCoordinates(array $newCoordinates)
|
||||
|
||||
public function setCoordinates(array $newCoordinates): void
|
||||
{
|
||||
$this->coordinates = $newCoordinates;
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@ namespace Phpml\Clustering\KMeans;
|
|||
|
||||
use ArrayAccess;
|
||||
|
||||
class Point implements ArrayAccess
|
||||
class Point implements ArrayAccess, \Countable
|
||||
{
|
||||
/**
|
||||
* @var int
|
||||
|
@ -16,32 +16,32 @@ class Point implements ArrayAccess
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $coordinates;
|
||||
protected $coordinates = [];
|
||||
|
||||
/**
|
||||
* @param array $coordinates
|
||||
* @var mixed
|
||||
*/
|
||||
public function __construct(array $coordinates)
|
||||
protected $label;
|
||||
|
||||
/**
|
||||
* @param mixed $label
|
||||
*/
|
||||
public function __construct(array $coordinates, $label = null)
|
||||
{
|
||||
$this->dimension = count($coordinates);
|
||||
$this->coordinates = $coordinates;
|
||||
$this->label = $label;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function toArray()
|
||||
public function toArray(): array
|
||||
{
|
||||
return $this->coordinates;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param Point $point
|
||||
* @param bool $precise
|
||||
*
|
||||
* @return int|mixed
|
||||
* @return float|int
|
||||
*/
|
||||
public function getDistanceWith(self $point, $precise = true)
|
||||
public function getDistanceWith(self $point, bool $precise = true)
|
||||
{
|
||||
$distance = 0;
|
||||
for ($n = 0; $n < $this->dimension; ++$n) {
|
||||
|
@ -49,22 +49,23 @@ class Point implements ArrayAccess
|
|||
$distance += $difference * $difference;
|
||||
}
|
||||
|
||||
return $precise ? sqrt((float) $distance) : $distance;
|
||||
return $precise ? $distance ** .5 : $distance;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $points
|
||||
*
|
||||
* @return mixed
|
||||
* @param Point[] $points
|
||||
*/
|
||||
public function getClosest(array $points)
|
||||
public function getClosest(array $points): ?self
|
||||
{
|
||||
$minPoint = null;
|
||||
|
||||
foreach ($points as $point) {
|
||||
$distance = $this->getDistanceWith($point, false);
|
||||
|
||||
if (!isset($minDistance)) {
|
||||
$minDistance = $distance;
|
||||
$minPoint = $point;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -77,20 +78,15 @@ class Point implements ArrayAccess
|
|||
return $minPoint;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getCoordinates()
|
||||
public function getCoordinates(): array
|
||||
{
|
||||
return $this->coordinates;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param mixed $offset
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function offsetExists($offset)
|
||||
public function offsetExists($offset): bool
|
||||
{
|
||||
return isset($this->coordinates[$offset]);
|
||||
}
|
||||
|
@ -109,7 +105,7 @@ class Point implements ArrayAccess
|
|||
* @param mixed $offset
|
||||
* @param mixed $value
|
||||
*/
|
||||
public function offsetSet($offset, $value)
|
||||
public function offsetSet($offset, $value): void
|
||||
{
|
||||
$this->coordinates[$offset] = $value;
|
||||
}
|
||||
|
@ -117,8 +113,13 @@ class Point implements ArrayAccess
|
|||
/**
|
||||
* @param mixed $offset
|
||||
*/
|
||||
public function offsetUnset($offset)
|
||||
public function offsetUnset($offset): void
|
||||
{
|
||||
unset($this->coordinates[$offset]);
|
||||
}
|
||||
|
||||
/**
 * Reports how many coordinates this point holds.
 */
public function count(): int
{
    $size = count($this->coordinates);

    return $size;
}
|
||||
}
|
||||
|
|
|
@ -4,10 +4,10 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Clustering\KMeans;
|
||||
|
||||
use InvalidArgumentException;
|
||||
use LogicException;
|
||||
use Phpml\Clustering\KMeans;
|
||||
use SplObjectStorage;
|
||||
use LogicException;
|
||||
use InvalidArgumentException;
|
||||
|
||||
class Space extends SplObjectStorage
|
||||
{
|
||||
|
@ -16,10 +16,7 @@ class Space extends SplObjectStorage
|
|||
*/
|
||||
protected $dimension;
|
||||
|
||||
/**
|
||||
* @param $dimension
|
||||
*/
|
||||
public function __construct($dimension)
|
||||
public function __construct(int $dimension)
|
||||
{
|
||||
if ($dimension < 1) {
|
||||
throw new LogicException('a space dimension cannot be null or negative');
|
||||
|
@ -28,12 +25,11 @@ class Space extends SplObjectStorage
|
|||
$this->dimension = $dimension;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function toArray()
|
||||
public function toArray(): array
|
||||
{
|
||||
$points = [];
|
||||
|
||||
/** @var Point $point */
|
||||
foreach ($this as $point) {
|
||||
$points[] = $point->toArray();
|
||||
}
|
||||
|
@ -42,33 +38,31 @@ class Space extends SplObjectStorage
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $coordinates
|
||||
*
|
||||
* @return Point
|
||||
* @param mixed $label
|
||||
*/
|
||||
public function newPoint(array $coordinates)
|
||||
public function newPoint(array $coordinates, $label = null): Point
|
||||
{
|
||||
if (count($coordinates) != $this->dimension) {
|
||||
if (count($coordinates) !== $this->dimension) {
|
||||
throw new LogicException('('.implode(',', $coordinates).') is not a point of this space');
|
||||
}
|
||||
|
||||
return new Point($coordinates);
|
||||
return new Point($coordinates, $label);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $coordinates
|
||||
* @param null $data
|
||||
* @param mixed $label
|
||||
* @param mixed $data
|
||||
*/
|
||||
public function addPoint(array $coordinates, $data = null)
|
||||
public function addPoint(array $coordinates, $label = null, $data = null): void
|
||||
{
|
||||
$this->attach($this->newPoint($coordinates), $data);
|
||||
$this->attach($this->newPoint($coordinates, $label), $data);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param Point $point
|
||||
* @param null $data
|
||||
* @param object $point
|
||||
* @param mixed $data
|
||||
*/
|
||||
public function attach($point, $data = null)
|
||||
public function attach($point, $data = null): void
|
||||
{
|
||||
if (!$point instanceof Point) {
|
||||
throw new InvalidArgumentException('can only attach points to spaces');
|
||||
|
@ -77,10 +71,7 @@ class Space extends SplObjectStorage
|
|||
parent::attach($point, $data);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return int
|
||||
*/
|
||||
public function getDimension()
|
||||
public function getDimension(): int
|
||||
{
|
||||
return $this->dimension;
|
||||
}
|
||||
|
@ -90,30 +81,30 @@ class Space extends SplObjectStorage
|
|||
*/
|
||||
public function getBoundaries()
|
||||
{
|
||||
if (!count($this)) {
|
||||
if (count($this) === 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$min = $this->newPoint(array_fill(0, $this->dimension, null));
|
||||
$max = $this->newPoint(array_fill(0, $this->dimension, null));
|
||||
|
||||
/** @var self $point */
|
||||
foreach ($this as $point) {
|
||||
for ($n = 0; $n < $this->dimension; ++$n) {
|
||||
($min[$n] > $point[$n] || $min[$n] === null) && $min[$n] = $point[$n];
|
||||
($max[$n] < $point[$n] || $max[$n] === null) && $max[$n] = $point[$n];
|
||||
if ($min[$n] === null || $min[$n] > $point[$n]) {
|
||||
$min[$n] = $point[$n];
|
||||
}
|
||||
|
||||
if ($max[$n] === null || $max[$n] < $point[$n]) {
|
||||
$max[$n] = $point[$n];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [$min, $max];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param Point $min
|
||||
* @param Point $max
|
||||
*
|
||||
* @return Point
|
||||
*/
|
||||
public function getRandomPoint(Point $min, Point $max)
|
||||
public function getRandomPoint(Point $min, Point $max): Point
|
||||
{
|
||||
$point = $this->newPoint(array_fill(0, $this->dimension, null));
|
||||
|
||||
|
@ -125,12 +116,9 @@ class Space extends SplObjectStorage
|
|||
}
|
||||
|
||||
/**
|
||||
* @param int $clustersNumber
|
||||
* @param int $initMethod
|
||||
*
|
||||
* @return array|Cluster[]
|
||||
* @return Cluster[]
|
||||
*/
|
||||
public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM)
|
||||
public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM): array
|
||||
{
|
||||
$clusters = $this->initializeClusters($clustersNumber, $initMethod);
|
||||
|
||||
|
@ -141,20 +129,19 @@ class Space extends SplObjectStorage
|
|||
}
|
||||
|
||||
/**
|
||||
* @param $clustersNumber
|
||||
* @param $initMethod
|
||||
*
|
||||
* @return array|Cluster[]
|
||||
* @return Cluster[]
|
||||
*/
|
||||
protected function initializeClusters(int $clustersNumber, int $initMethod)
|
||||
protected function initializeClusters(int $clustersNumber, int $initMethod): array
|
||||
{
|
||||
switch ($initMethod) {
|
||||
case KMeans::INIT_RANDOM:
|
||||
$clusters = $this->initializeRandomClusters($clustersNumber);
|
||||
|
||||
break;
|
||||
|
||||
case KMeans::INIT_KMEANS_PLUS_PLUS:
|
||||
$clusters = $this->initializeKMPPClusters($clustersNumber);
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -167,11 +154,9 @@ class Space extends SplObjectStorage
|
|||
}
|
||||
|
||||
/**
|
||||
* @param $clusters
|
||||
*
|
||||
* @return bool
|
||||
* @param Cluster[] $clusters
|
||||
*/
|
||||
protected function iterate($clusters)
|
||||
protected function iterate(array $clusters): bool
|
||||
{
|
||||
$convergence = true;
|
||||
|
||||
|
@ -183,8 +168,8 @@ class Space extends SplObjectStorage
|
|||
$closest = $point->getClosest($clusters);
|
||||
|
||||
if ($closest !== $cluster) {
|
||||
isset($attach[$closest]) || $attach[$closest] = new SplObjectStorage();
|
||||
isset($detach[$cluster]) || $detach[$cluster] = new SplObjectStorage();
|
||||
$attach[$closest] ?? $attach[$closest] = new SplObjectStorage();
|
||||
$detach[$cluster] ?? $detach[$cluster] = new SplObjectStorage();
|
||||
|
||||
$attach[$closest]->attach($point);
|
||||
$detach[$cluster]->attach($point);
|
||||
|
@ -194,10 +179,12 @@ class Space extends SplObjectStorage
|
|||
}
|
||||
}
|
||||
|
||||
/** @var Cluster $cluster */
|
||||
foreach ($attach as $cluster) {
|
||||
$cluster->attachAll($attach[$cluster]);
|
||||
}
|
||||
|
||||
/** @var Cluster $cluster */
|
||||
foreach ($detach as $cluster) {
|
||||
$cluster->detachAll($detach[$cluster]);
|
||||
}
|
||||
|
@ -210,14 +197,58 @@ class Space extends SplObjectStorage
|
|||
}
|
||||
|
||||
/**
|
||||
* @param int $clustersNumber
|
||||
*
|
||||
* @return array
|
||||
* @return Cluster[]
|
||||
*/
|
||||
private function initializeRandomClusters(int $clustersNumber)
|
||||
protected function initializeKMPPClusters(int $clustersNumber): array
|
||||
{
|
||||
$clusters = [];
|
||||
list($min, $max) = $this->getBoundaries();
|
||||
$this->rewind();
|
||||
|
||||
/** @var Point $current */
|
||||
$current = $this->current();
|
||||
|
||||
$clusters[] = new Cluster($this, $current->getCoordinates());
|
||||
|
||||
$distances = new SplObjectStorage();
|
||||
|
||||
for ($i = 1; $i < $clustersNumber; ++$i) {
|
||||
$sum = 0;
|
||||
/** @var Point $point */
|
||||
foreach ($this as $point) {
|
||||
$closest = $point->getClosest($clusters);
|
||||
if ($closest === null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$distance = $point->getDistanceWith($closest);
|
||||
$sum += $distances[$point] = $distance;
|
||||
}
|
||||
|
||||
$sum = random_int(0, (int) $sum);
|
||||
/** @var Point $point */
|
||||
foreach ($this as $point) {
|
||||
$sum -= $distances[$point];
|
||||
|
||||
if ($sum > 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$clusters[] = new Cluster($this, $point->getCoordinates());
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return $clusters;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Cluster[]
|
||||
*/
|
||||
private function initializeRandomClusters(int $clustersNumber): array
|
||||
{
|
||||
$clusters = [];
|
||||
[$min, $max] = $this->getBoundaries();
|
||||
|
||||
for ($n = 0; $n < $clustersNumber; ++$n) {
|
||||
$clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
|
||||
|
@ -225,39 +256,4 @@ class Space extends SplObjectStorage
|
|||
|
||||
return $clusters;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int $clustersNumber
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function initializeKMPPClusters(int $clustersNumber)
|
||||
{
|
||||
$clusters = [];
|
||||
$this->rewind();
|
||||
|
||||
$clusters[] = new Cluster($this, $this->current()->getCoordinates());
|
||||
|
||||
$distances = new SplObjectStorage();
|
||||
|
||||
for ($i = 1; $i < $clustersNumber; ++$i) {
|
||||
$sum = 0;
|
||||
foreach ($this as $point) {
|
||||
$distance = $point->getDistanceWith($point->getClosest($clusters));
|
||||
$sum += $distances[$point] = $distance;
|
||||
}
|
||||
|
||||
$sum = random_int(0, (int) $sum);
|
||||
foreach ($this as $point) {
|
||||
if (($sum -= $distances[$point]) > 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$clusters[] = new Cluster($this, $point->getCoordinates());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return $clusters;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,11 +8,7 @@ use Phpml\Dataset\Dataset;
|
|||
|
||||
class RandomSplit extends Split
|
||||
{
|
||||
/**
|
||||
* @param Dataset $dataset
|
||||
* @param float $testSize
|
||||
*/
|
||||
protected function splitDataset(Dataset $dataset, float $testSize)
|
||||
protected function splitDataset(Dataset $dataset, float $testSize): void
|
||||
{
|
||||
$samples = $dataset->getSamples();
|
||||
$labels = $dataset->getTargets();
|
||||
|
|
|
@ -29,63 +29,42 @@ abstract class Split
|
|||
*/
|
||||
protected $testLabels = [];
|
||||
|
||||
/**
|
||||
* @param Dataset $dataset
|
||||
* @param float $testSize
|
||||
* @param int $seed
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(Dataset $dataset, float $testSize = 0.3, int $seed = null)
|
||||
public function __construct(Dataset $dataset, float $testSize = 0.3, ?int $seed = null)
|
||||
{
|
||||
if (0 >= $testSize || 1 <= $testSize) {
|
||||
throw InvalidArgumentException::percentNotInRange('testSize');
|
||||
if ($testSize <= 0 || $testSize >= 1) {
|
||||
throw new InvalidArgumentException('testsize must be between 0.0 and 1.0');
|
||||
}
|
||||
|
||||
$this->seedGenerator($seed);
|
||||
|
||||
$this->splitDataset($dataset, $testSize);
|
||||
}
|
||||
|
||||
abstract protected function splitDataset(Dataset $dataset, float $testSize);
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getTrainSamples()
|
||||
public function getTrainSamples(): array
|
||||
{
|
||||
return $this->trainSamples;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getTestSamples()
|
||||
public function getTestSamples(): array
|
||||
{
|
||||
return $this->testSamples;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getTrainLabels()
|
||||
public function getTrainLabels(): array
|
||||
{
|
||||
return $this->trainLabels;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getTestLabels()
|
||||
public function getTestLabels(): array
|
||||
{
|
||||
return $this->testLabels;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int|null $seed
|
||||
*/
|
||||
protected function seedGenerator(int $seed = null)
|
||||
abstract protected function splitDataset(Dataset $dataset, float $testSize): void;
|
||||
|
||||
protected function seedGenerator(?int $seed = null): void
|
||||
{
|
||||
if (null === $seed) {
|
||||
if ($seed === null) {
|
||||
mt_srand();
|
||||
} else {
|
||||
mt_srand($seed);
|
||||
|
|
|
@ -9,11 +9,7 @@ use Phpml\Dataset\Dataset;
|
|||
|
||||
class StratifiedRandomSplit extends RandomSplit
|
||||
{
|
||||
/**
|
||||
* @param Dataset $dataset
|
||||
* @param float $testSize
|
||||
*/
|
||||
protected function splitDataset(Dataset $dataset, float $testSize)
|
||||
protected function splitDataset(Dataset $dataset, float $testSize): void
|
||||
{
|
||||
$datasets = $this->splitByTarget($dataset);
|
||||
|
||||
|
@ -23,9 +19,7 @@ class StratifiedRandomSplit extends RandomSplit
|
|||
}
|
||||
|
||||
/**
|
||||
* @param Dataset $dataset
|
||||
*
|
||||
* @return Dataset[]|array
|
||||
* @return Dataset[]
|
||||
*/
|
||||
private function splitByTarget(Dataset $dataset): array
|
||||
{
|
||||
|
@ -33,23 +27,16 @@ class StratifiedRandomSplit extends RandomSplit
|
|||
$samples = $dataset->getSamples();
|
||||
|
||||
$uniqueTargets = array_unique($targets);
|
||||
/** @var array $split */
|
||||
$split = array_combine($uniqueTargets, array_fill(0, count($uniqueTargets), []));
|
||||
|
||||
foreach ($samples as $key => $sample) {
|
||||
$split[$targets[$key]][] = $sample;
|
||||
}
|
||||
|
||||
$datasets = $this->createDatasets($uniqueTargets, $split);
|
||||
|
||||
return $datasets;
|
||||
return $this->createDatasets($uniqueTargets, $split);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $uniqueTargets
|
||||
* @param array $split
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private function createDatasets(array $uniqueTargets, array $split): array
|
||||
{
|
||||
$datasets = [];
|
||||
|
|
|
@ -19,34 +19,44 @@ class ArrayDataset implements Dataset
|
|||
protected $targets = [];
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(array $samples, array $targets)
|
||||
{
|
||||
if (count($samples) != count($targets)) {
|
||||
throw InvalidArgumentException::arraySizeNotMatch();
|
||||
if (count($samples) !== count($targets)) {
|
||||
throw new InvalidArgumentException('Size of given arrays does not match');
|
||||
}
|
||||
|
||||
$this->samples = $samples;
|
||||
$this->targets = $targets;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getSamples(): array
|
||||
{
|
||||
return $this->samples;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getTargets(): array
|
||||
{
|
||||
return $this->targets;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int[] $columns
|
||||
*/
|
||||
public function removeColumns(array $columns): void
|
||||
{
|
||||
foreach ($this->samples as &$sample) {
|
||||
$this->removeColumnsFromSample($sample, $columns);
|
||||
}
|
||||
}
|
||||
|
||||
private function removeColumnsFromSample(array &$sample, array $columns): void
|
||||
{
|
||||
foreach ($columns as $index) {
|
||||
unset($sample[$index]);
|
||||
}
|
||||
|
||||
$sample = array_values($sample);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,36 +11,32 @@ class CsvDataset extends ArrayDataset
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $columnNames;
|
||||
protected $columnNames = [];
|
||||
|
||||
/**
|
||||
* @param string $filepath
|
||||
* @param int $features
|
||||
* @param bool $headingRow
|
||||
* @param string $delimiter
|
||||
*
|
||||
* @throws FileException
|
||||
*/
|
||||
public function __construct(string $filepath, int $features, bool $headingRow = true, string $delimiter = ',')
|
||||
public function __construct(string $filepath, int $features, bool $headingRow = true, string $delimiter = ',', int $maxLineLength = 0)
|
||||
{
|
||||
if (!file_exists($filepath)) {
|
||||
throw FileException::missingFile(basename($filepath));
|
||||
throw new FileException(sprintf('File "%s" missing.', basename($filepath)));
|
||||
}
|
||||
|
||||
if (false === $handle = fopen($filepath, 'rb')) {
|
||||
throw FileException::cantOpenFile(basename($filepath));
|
||||
$handle = fopen($filepath, 'rb');
|
||||
if ($handle === false) {
|
||||
throw new FileException(sprintf('File "%s" can\'t be open.', basename($filepath)));
|
||||
}
|
||||
|
||||
if ($headingRow) {
|
||||
$data = fgetcsv($handle, 1000, $delimiter);
|
||||
$this->columnNames = array_slice($data, 0, $features);
|
||||
$data = fgetcsv($handle, $maxLineLength, $delimiter);
|
||||
$this->columnNames = array_slice((array) $data, 0, $features);
|
||||
} else {
|
||||
$this->columnNames = range(0, $features - 1);
|
||||
}
|
||||
|
||||
$samples = $targets = [];
|
||||
while (($data = fgetcsv($handle, 1000, $delimiter)) !== false) {
|
||||
$samples[] = array_slice($data, 0, $features);
|
||||
while (($data = fgetcsv($handle, $maxLineLength, $delimiter)) !== false) {
|
||||
$samples[] = array_slice((array) $data, 0, $features);
|
||||
$targets[] = $data[$features];
|
||||
}
|
||||
|
||||
|
@ -49,10 +45,7 @@ class CsvDataset extends ArrayDataset
|
|||
parent::__construct($samples, $targets);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getColumnNames()
|
||||
public function getColumnNames(): array
|
||||
{
|
||||
return $this->columnNames;
|
||||
}
|
||||
|
|
|
@ -6,13 +6,7 @@ namespace Phpml\Dataset;
|
|||
|
||||
interface Dataset
|
||||
{
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getSamples(): array;
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getTargets(): array;
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ class GlassDataset extends CsvDataset
|
|||
{
|
||||
public function __construct()
|
||||
{
|
||||
$filepath = __DIR__.'/../../../../data/glass.csv';
|
||||
$filepath = __DIR__.'/../../../data/glass.csv';
|
||||
parent::__construct($filepath, 9, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ class IrisDataset extends CsvDataset
|
|||
{
|
||||
public function __construct()
|
||||
{
|
||||
$filepath = __DIR__.'/../../../../data/iris.csv';
|
||||
$filepath = __DIR__.'/../../../data/iris.csv';
|
||||
parent::__construct($filepath, 4, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ class WineDataset extends CsvDataset
|
|||
{
|
||||
public function __construct()
|
||||
{
|
||||
$filepath = __DIR__.'/../../../../data/wine.csv';
|
||||
$filepath = __DIR__.'/../../../data/wine.csv';
|
||||
parent::__construct($filepath, 13, true);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,39 +8,28 @@ use Phpml\Exception\DatasetException;
|
|||
|
||||
class FilesDataset extends ArrayDataset
|
||||
{
|
||||
/**
|
||||
* @param string $rootPath
|
||||
*
|
||||
* @throws DatasetException
|
||||
*/
|
||||
public function __construct(string $rootPath)
|
||||
{
|
||||
if (!is_dir($rootPath)) {
|
||||
throw DatasetException::missingFolder($rootPath);
|
||||
throw new DatasetException(sprintf('Dataset root folder "%s" missing.', $rootPath));
|
||||
}
|
||||
|
||||
$this->scanRootPath($rootPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $rootPath
|
||||
*/
|
||||
private function scanRootPath(string $rootPath)
|
||||
private function scanRootPath(string $rootPath): void
|
||||
{
|
||||
foreach (glob($rootPath.DIRECTORY_SEPARATOR.'*', GLOB_ONLYDIR) as $dir) {
|
||||
$this->scanDir($dir);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $dir
|
||||
*/
|
||||
private function scanDir(string $dir)
|
||||
private function scanDir(string $dir): void
|
||||
{
|
||||
$target = basename($dir);
|
||||
|
||||
foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
|
||||
$this->samples[] = [file_get_contents($file)];
|
||||
$this->samples[] = file_get_contents($file);
|
||||
$this->targets[] = $target;
|
||||
}
|
||||
}
|
||||
|
|
101
lib/mlbackend/php/phpml/src/Phpml/Dataset/MnistDataset.php
Normal file
101
lib/mlbackend/php/phpml/src/Phpml/Dataset/MnistDataset.php
Normal file
|
@ -0,0 +1,101 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Dataset;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
/**
 * MNIST dataset: http://yann.lecun.com/exdb/mnist/
 * original mnist dataset reader: https://github.com/AndrewCarterUK/mnist-neural-network-plain-php
 *
 * Loads an image file and a label file; every sample is a flat list of
 * IMAGE_ROWS * IMAGE_COLS pixel intensities scaled to the range 0..1 and every
 * target is the matching label byte.
 */
final class MnistDataset extends ArrayDataset
{
    private const MAGIC_IMAGE = 0x00000803;

    private const MAGIC_LABEL = 0x00000801;

    private const IMAGE_ROWS = 28;

    private const IMAGE_COLS = 28;

    /**
     * Size in bytes of the image file header (magic, size, rows, cols as 32-bit big-endian integers).
     */
    private const IMAGE_HEADER_SIZE = 16;

    /**
     * Size in bytes of the label file header (magic, size as 32-bit big-endian integers).
     */
    private const LABEL_HEADER_SIZE = 8;

    /**
     * @throws InvalidArgumentException when a file cannot be opened, has an invalid header,
     *                                  is truncated, or the image and label counts differ
     */
    public function __construct(string $imagePath, string $labelPath)
    {
        $this->samples = $this->readImages($imagePath);
        $this->targets = $this->readLabels($labelPath);

        if (count($this->samples) !== count($this->targets)) {
            throw new InvalidArgumentException('Must have the same number of images and labels');
        }
    }

    /**
     * Reads every image of the file as a list of pixel values divided by 255.
     *
     * @return array[] one flat pixel list per image
     *
     * @throws InvalidArgumentException
     */
    private function readImages(string $imagePath): array
    {
        $stream = fopen($imagePath, 'rb');

        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$imagePath);
        }

        $images = [];

        try {
            $header = $this->readExactly($stream, self::IMAGE_HEADER_SIZE);
            $fields = strlen($header) === self::IMAGE_HEADER_SIZE
                ? unpack('Nmagic/Nsize/Nrows/Ncols', $header)
                : false;

            // A header that is too short to parse cannot carry a valid magic number either.
            if ($fields === false || $fields['magic'] !== self::MAGIC_IMAGE) {
                throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
            }

            if ($fields['rows'] !== self::IMAGE_ROWS) {
                throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath);
            }

            if ($fields['cols'] !== self::IMAGE_COLS) {
                throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath);
            }

            $imageSize = self::IMAGE_ROWS * self::IMAGE_COLS;

            for ($i = 0; $i < $fields['size']; ++$i) {
                $imageBytes = $this->readExactly($stream, $imageSize);

                // Refuse truncated files instead of silently emitting short or empty samples.
                if (strlen($imageBytes) !== $imageSize) {
                    throw new InvalidArgumentException('Unexpected end of file: '.$imagePath);
                }

                // Convert to float between 0 and 1
                $images[] = array_map(static function ($b) {
                    return $b / 255;
                }, array_values(unpack('C*', $imageBytes)));
            }
        } finally {
            fclose($stream);
        }

        return $images;
    }

    /**
     * Reads every label of the file as an unsigned byte value.
     *
     * @return int[]
     *
     * @throws InvalidArgumentException
     */
    private function readLabels(string $labelPath): array
    {
        $stream = fopen($labelPath, 'rb');

        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$labelPath);
        }

        try {
            $header = $this->readExactly($stream, self::LABEL_HEADER_SIZE);
            $fields = strlen($header) === self::LABEL_HEADER_SIZE
                ? unpack('Nmagic/Nsize', $header)
                : false;

            if ($fields === false || $fields['magic'] !== self::MAGIC_LABEL) {
                throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
            }

            // readExactly() never calls fread() with a zero length, so an empty label file is fine.
            $labels = $this->readExactly($stream, $fields['size']);

            if (strlen($labels) !== $fields['size']) {
                throw new InvalidArgumentException('Unexpected end of file: '.$labelPath);
            }
        } finally {
            fclose($stream);
        }

        return $labels === '' ? [] : array_values(unpack('C*', $labels));
    }

    /**
     * Reads up to $length bytes, repeating fread() because a single call may
     * return fewer bytes than requested on streams that are not plain files.
     *
     * @param resource $stream
     *
     * @return string the bytes read; shorter than $length only when the stream ended early
     */
    private function readExactly($stream, int $length): string
    {
        $buffer = '';

        while (strlen($buffer) < $length && !feof($stream)) {
            $chunk = fread($stream, $length - strlen($buffer));

            // Stop on a read error or an empty read so a stalled stream cannot loop forever.
            if ($chunk === false || $chunk === '') {
                break;
            }

            $buffer .= $chunk;
        }

        return $buffer;
    }
}
|
131
lib/mlbackend/php/phpml/src/Phpml/Dataset/SvmDataset.php
Normal file
131
lib/mlbackend/php/phpml/src/Phpml/Dataset/SvmDataset.php
Normal file
|
@ -0,0 +1,131 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Dataset;
|
||||
|
||||
use Phpml\Exception\DatasetException;
|
||||
use Phpml\Exception\FileException;
|
||||
|
||||
/**
 * Dataset read from a text file where each line is "<target> <index>:<value> ...".
 *
 * Feature indexes in the file are 1-based and are stored 0-based in the samples;
 * features missing from a line are filled with 0 so that every sample has the
 * same length. Everything after a "#" on a line is ignored.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * @throws FileException    when the file is missing or cannot be opened
     * @throws DatasetException when a line contains an invalid target, index or value
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Parses the whole file.
     *
     * @return array[] pair of [samples, targets]
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        // Close the handle even when a malformed line makes processLine() throw.
        try {
            while (false !== $line = fgets($handle)) {
                [$sample, $target, $maxIndex] = self::processLine((string) $line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            fclose($handle);
        }

        // Earlier lines were built before later, larger indexes were seen: pad them to the final width.
        foreach ($samples as $key => $sample) {
            $samples[$key] = array_pad($sample, $maxIndex + 1, 0);
        }

        return [$samples, $targets];
    }

    /**
     * @return resource
     *
     * @throws FileException
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw new FileException(sprintf('File "%s" missing.', basename($filePath)));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw new FileException(sprintf('File "%s" can\'t be open.', basename($filePath)));
        }

        return $handle;
    }

    /**
     * Converts one line into a sample and a target.
     *
     * @param int $maxIndex highest 0-based feature index seen so far
     *
     * @return array triple of [sample, target, updated $maxIndex]
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);
            if ($index > $maxIndex) {
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the trailing comment and whitespace, then splits the line into columns.
     *
     * Columns are split on every single space or tab, so an empty line or
     * consecutive separators produce empty columns; those are rejected later
     * by the target and feature parsers.
     *
     * @return string[]
     */
    private static function parseLine(string $line): array
    {
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * @throws DatasetException when the target is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw new DatasetException(sprintf('Invalid target "%s".', $column));
        }

        return (float) $column;
    }

    /**
     * Splits an "<index>:<value>" column.
     *
     * @return array pair of [0-based index, value]
     *
     * @throws DatasetException when the column has no ":" or holds an invalid index or value
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw new DatasetException(sprintf('Invalid value "%s".', $column));
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Validates a 1-based feature index and returns it 0-based.
     *
     * @throws DatasetException when the index is not a positive decimal integer
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() accepts only non-empty strings of decimal digits, which are always numeric.
        if (!ctype_digit($index) || (int) $index < 1) {
            throw new DatasetException(sprintf('Invalid index "%s".', $index));
        }

        return (int) $index - 1;
    }

    /**
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw new DatasetException(sprintf('Invalid value "%s".', $value));
        }

        return (float) $value;
    }
}
|
|
@ -47,14 +47,12 @@ abstract class EigenTransformerBase
|
|||
* Calculates eigenValues and eigenVectors of the given matrix. Returns
|
||||
* top eigenVectors along with the largest eigenValues. The total explained variance
|
||||
* of these eigenVectors will be no less than desired $totalVariance value
|
||||
*
|
||||
* @param array $matrix
|
||||
*/
|
||||
protected function eigenDecomposition(array $matrix)
|
||||
protected function eigenDecomposition(array $matrix): void
|
||||
{
|
||||
$eig = new EigenvalueDecomposition($matrix);
|
||||
$eigVals = $eig->getRealEigenvalues();
|
||||
$eigVects= $eig->getEigenvectors();
|
||||
$eigVects = $eig->getEigenvectors();
|
||||
|
||||
$totalEigVal = array_sum($eigVals);
|
||||
// Sort eigenvalues in descending order
|
||||
|
@ -85,12 +83,8 @@ abstract class EigenTransformerBase
|
|||
|
||||
/**
|
||||
* Returns the reduced data
|
||||
*
|
||||
* @param array $data
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function reduce(array $data)
|
||||
protected function reduce(array $data): array
|
||||
{
|
||||
$m1 = new Matrix($data);
|
||||
$m2 = new Matrix($this->eigVectors);
|
||||
|
|
|
@ -4,16 +4,22 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\DimensionReduction;
|
||||
|
||||
use Closure;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
use Phpml\Math\Distance\Euclidean;
|
||||
use Phpml\Math\Distance\Manhattan;
|
||||
use Phpml\Math\Matrix;
|
||||
|
||||
class KernelPCA extends PCA
|
||||
{
|
||||
const KERNEL_RBF = 1;
|
||||
const KERNEL_SIGMOID = 2;
|
||||
const KERNEL_LAPLACIAN = 3;
|
||||
const KERNEL_LINEAR = 4;
|
||||
public const KERNEL_RBF = 1;
|
||||
|
||||
public const KERNEL_SIGMOID = 2;
|
||||
|
||||
public const KERNEL_LAPLACIAN = 3;
|
||||
|
||||
public const KERNEL_LINEAR = 4;
|
||||
|
||||
/**
|
||||
* Selected kernel function
|
||||
|
@ -25,7 +31,7 @@ class KernelPCA extends PCA
|
|||
/**
|
||||
* Gamma value used by the kernel
|
||||
*
|
||||
* @var float
|
||||
* @var float|null
|
||||
*/
|
||||
protected $gamma;
|
||||
|
||||
|
@ -34,7 +40,7 @@ class KernelPCA extends PCA
|
|||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $data;
|
||||
protected $data = [];
|
||||
|
||||
/**
|
||||
* Kernel principal component analysis (KernelPCA) is an extension of PCA using
|
||||
|
@ -44,18 +50,16 @@ class KernelPCA extends PCA
|
|||
* will initialize the algorithm with an RBF kernel having the gamma parameter as 15,0. <br>
|
||||
* This transformation will return the same number of rows with only <i>2</i> columns.
|
||||
*
|
||||
* @param int $kernel
|
||||
* @param float $totalVariance Total variance to be preserved if numFeatures is not given
|
||||
* @param int $numFeatures Number of columns to be returned
|
||||
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
|
||||
* @param int $numFeatures Number of columns to be returned
|
||||
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(int $kernel = self::KERNEL_RBF, $totalVariance = null, $numFeatures = null, $gamma = null)
|
||||
public function __construct(int $kernel = self::KERNEL_RBF, ?float $totalVariance = null, ?int $numFeatures = null, ?float $gamma = null)
|
||||
{
|
||||
$availableKernels = [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR];
|
||||
if (!in_array($kernel, $availableKernels)) {
|
||||
throw new \Exception("KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian");
|
||||
if (!in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR], true)) {
|
||||
throw new InvalidArgumentException('KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian');
|
||||
}
|
||||
|
||||
parent::__construct($totalVariance, $numFeatures);
|
||||
|
@ -69,12 +73,8 @@ class KernelPCA extends PCA
|
|||
* of this data while preserving $totalVariance or $numFeatures. <br>
|
||||
* $data is an n-by-m matrix and returned array is
|
||||
* n-by-k matrix where k <= m
|
||||
*
|
||||
* @param array $data
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function fit(array $data)
|
||||
public function fit(array $data): array
|
||||
{
|
||||
$numRows = count($data);
|
||||
$this->data = $data;
|
||||
|
@ -93,16 +93,33 @@ class KernelPCA extends PCA
|
|||
return Matrix::transposeArray($this->eigVectors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms the given sample to a lower dimensional vector by using
|
||||
* the variables obtained during the last run of <code>fit</code>.
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
* @throws InvalidOperationException
|
||||
*/
|
||||
public function transform(array $sample): array
|
||||
{
|
||||
if (!$this->fit) {
|
||||
throw new InvalidOperationException('KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first');
|
||||
}
|
||||
|
||||
if (is_array($sample[0])) {
|
||||
throw new InvalidArgumentException('KernelPCA::transform() accepts only one-dimensional arrays');
|
||||
}
|
||||
|
||||
$pairs = $this->getDistancePairs($sample);
|
||||
|
||||
return $this->projectSample($pairs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates similarity matrix by use of selected kernel function<br>
|
||||
* An n-by-m matrix is given and an n-by-n matrix is returned
|
||||
*
|
||||
* @param array $data
|
||||
* @param int $numRows
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function calculateKernelMatrix(array $data, int $numRows)
|
||||
protected function calculateKernelMatrix(array $data, int $numRows): array
|
||||
{
|
||||
$kernelFunc = $this->getKernel();
|
||||
|
||||
|
@ -125,15 +142,10 @@ class KernelPCA extends PCA
|
|||
* conversion:
|
||||
*
|
||||
* K′ = K − N.K − K.N + N.K.N where N is n-by-n matrix filled with 1/n
|
||||
*
|
||||
* @param array $matrix
|
||||
* @param int $n
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function centerMatrix(array $matrix, int $n)
|
||||
protected function centerMatrix(array $matrix, int $n): array
|
||||
{
|
||||
$N = array_fill(0, $n, array_fill(0, $n, 1.0/$n));
|
||||
$N = array_fill(0, $n, array_fill(0, $n, 1.0 / $n));
|
||||
$N = new Matrix($N, false);
|
||||
$K = new Matrix($matrix, false);
|
||||
|
||||
|
@ -145,19 +157,17 @@ class KernelPCA extends PCA
|
|||
$N_K_N = $N->multiply($K_N);
|
||||
|
||||
return $K->subtract($N_K)
|
||||
->subtract($K_N)
|
||||
->add($N_K_N)
|
||||
->toArray();
|
||||
->subtract($K_N)
|
||||
->add($N_K_N)
|
||||
->toArray();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the callable kernel function
|
||||
*
|
||||
* @return \Closure
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
protected function getKernel()
|
||||
protected function getKernel(): Closure
|
||||
{
|
||||
switch ($this->kernel) {
|
||||
case self::KERNEL_LINEAR:
|
||||
|
@ -168,6 +178,7 @@ class KernelPCA extends PCA
|
|||
case self::KERNEL_RBF:
|
||||
// k(x,y)=exp(-γ.|x-y|) where |..| is Euclidean distance
|
||||
$dist = new Euclidean();
|
||||
|
||||
return function ($x, $y) use ($dist) {
|
||||
return exp(-$this->gamma * $dist->sqDistance($x, $y));
|
||||
};
|
||||
|
@ -176,27 +187,25 @@ class KernelPCA extends PCA
|
|||
// k(x,y)=tanh(γ.xT.y+c0) where c0=1
|
||||
return function ($x, $y) {
|
||||
$res = Matrix::dot($x, $y)[0] + 1.0;
|
||||
return tanh($this->gamma * $res);
|
||||
|
||||
return tanh((float) $this->gamma * $res);
|
||||
};
|
||||
|
||||
case self::KERNEL_LAPLACIAN:
|
||||
// k(x,y)=exp(-γ.|x-y|) where |..| is Manhattan distance
|
||||
$dist = new Manhattan();
|
||||
|
||||
return function ($x, $y) use ($dist) {
|
||||
return exp(-$this->gamma * $dist->distance($x, $y));
|
||||
};
|
||||
|
||||
default:
|
||||
throw new \Exception(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
|
||||
// Not reached
|
||||
throw new InvalidArgumentException(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getDistancePairs(array $sample)
|
||||
protected function getDistancePairs(array $sample): array
|
||||
{
|
||||
$kernel = $this->getKernel();
|
||||
|
||||
|
@ -208,12 +217,7 @@ class KernelPCA extends PCA
|
|||
return $pairs;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $pairs
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function projectSample(array $pairs)
|
||||
protected function projectSample(array $pairs): array
|
||||
{
|
||||
// Normalize eigenvectors by eig = eigVectors / eigValues
|
||||
$func = function ($eigVal, $eigVect) {
|
||||
|
@ -227,29 +231,4 @@ class KernelPCA extends PCA
|
|||
// return k.dot(eig)
|
||||
return Matrix::dot($pairs, $eig);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms the given sample to a lower dimensional vector by using
|
||||
* the variables obtained during the last run of <code>fit</code>.
|
||||
*
|
||||
* @param array $sample
|
||||
*
|
||||
* @return array
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
public function transform(array $sample)
|
||||
{
|
||||
if (!$this->fit) {
|
||||
throw new \Exception("KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first");
|
||||
}
|
||||
|
||||
if (is_array($sample[0])) {
|
||||
throw new \Exception("KernelPCA::transform() accepts only one-dimensional arrays");
|
||||
}
|
||||
|
||||
$pairs = $this->getDistancePairs($sample);
|
||||
|
||||
return $this->projectSample($pairs);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\DimensionReduction;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
use Phpml\Math\Matrix;
|
||||
|
||||
class LDA extends EigenTransformerBase
|
||||
|
@ -16,22 +18,22 @@ class LDA extends EigenTransformerBase
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
public $labels;
|
||||
public $labels = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
public $means;
|
||||
public $means = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
public $counts;
|
||||
public $counts = [];
|
||||
|
||||
/**
|
||||
* @var float[]
|
||||
*/
|
||||
public $overallMean;
|
||||
public $overallMean = [];
|
||||
|
||||
/**
|
||||
* Linear Discriminant Analysis (LDA) is used to reduce the dimensionality
|
||||
|
@ -43,25 +45,28 @@ class LDA extends EigenTransformerBase
|
|||
* or numFeatures (number of features in the dataset) to be preserved.
|
||||
*
|
||||
* @param float|null $totalVariance Total explained variance to be preserved
|
||||
* @param int|null $numFeatures Number of features to be preserved
|
||||
* @param int|null $numFeatures Number of features to be preserved
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct($totalVariance = null, $numFeatures = null)
|
||||
public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
|
||||
{
|
||||
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
|
||||
throw new \Exception("Total variance can be a value between 0.1 and 0.99");
|
||||
throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
|
||||
}
|
||||
|
||||
if ($numFeatures !== null && $numFeatures <= 0) {
|
||||
throw new \Exception("Number of features to be preserved should be greater than 0");
|
||||
throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
|
||||
}
|
||||
if ($totalVariance !== null && $numFeatures !== null) {
|
||||
throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
|
||||
|
||||
if (($totalVariance !== null) === ($numFeatures !== null)) {
|
||||
throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
|
||||
}
|
||||
|
||||
if ($numFeatures !== null) {
|
||||
$this->numFeatures = $numFeatures;
|
||||
}
|
||||
|
||||
if ($totalVariance !== null) {
|
||||
$this->totalVariance = $totalVariance;
|
||||
}
|
||||
|
@ -69,16 +74,11 @@ class LDA extends EigenTransformerBase
|
|||
|
||||
/**
|
||||
* Trains the algorithm to transform the given data to a lower dimensional space.
|
||||
*
|
||||
* @param array $data
|
||||
* @param array $classes
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function fit(array $data, array $classes) : array
|
||||
public function fit(array $data, array $classes): array
|
||||
{
|
||||
$this->labels = $this->getLabels($classes);
|
||||
$this->means = $this->calculateMeans($data, $classes);
|
||||
$this->means = $this->calculateMeans($data, $classes);
|
||||
|
||||
$sW = $this->calculateClassVar($data, $classes);
|
||||
$sB = $this->calculateClassCov();
|
||||
|
@ -91,12 +91,27 @@ class LDA extends EigenTransformerBase
|
|||
return $this->reduce($data);
|
||||
}
|
||||
|
||||
/**
 * Maps the given sample(s) to a lower dimensional representation, using
 * the eigenVectors obtained in the last run of <code>fit</code>.
 *
 * @param array $sample Either a single flat feature vector or a list of such vectors
 *
 * @return array Output of reduce() for the (possibly wrapped) input
 *
 * @throws InvalidOperationException when the fit flag is not set
 */
public function transform(array $sample): array
{
    if (!$this->fit) {
        throw new InvalidOperationException('LDA has not been fitted with respect to original dataset, please run LDA::fit() first');
    }

    // A flat vector is wrapped so that reduce() always receives a list of rows.
    $rows = is_array($sample[0]) ? $sample : [$sample];

    return $this->reduce($rows);
}
|
||||
|
||||
/**
|
||||
* Returns unique labels in the dataset
|
||||
*
|
||||
* @param array $classes
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getLabels(array $classes): array
|
||||
{
|
||||
|
@ -105,29 +120,24 @@ class LDA extends EigenTransformerBase
|
|||
return array_keys($counts);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculates mean of each column for each class and returns
|
||||
* n by m matrix where n is number of labels and m is number of columns
|
||||
*
|
||||
* @param array $data
|
||||
* @param array $classes
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function calculateMeans(array $data, array $classes) : array
|
||||
protected function calculateMeans(array $data, array $classes): array
|
||||
{
|
||||
$means = [];
|
||||
$counts= [];
|
||||
$counts = [];
|
||||
$overallMean = array_fill(0, count($data[0]), 0.0);
|
||||
|
||||
foreach ($data as $index => $row) {
|
||||
$label = array_search($classes[$index], $this->labels);
|
||||
$label = array_search($classes[$index], $this->labels, true);
|
||||
|
||||
foreach ($row as $col => $val) {
|
||||
if (!isset($means[$label][$col])) {
|
||||
$means[$label][$col] = 0.0;
|
||||
}
|
||||
|
||||
$means[$label][$col] += $val;
|
||||
$overallMean[$col] += $val;
|
||||
}
|
||||
|
@ -156,25 +166,19 @@ class LDA extends EigenTransformerBase
|
|||
return $means;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns in-class scatter matrix for each class, which
|
||||
* is a n by m matrix where n is number of classes and
|
||||
* m is number of columns
|
||||
*
|
||||
* @param array $data
|
||||
* @param array $classes
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
protected function calculateClassVar($data, $classes)
|
||||
protected function calculateClassVar(array $data, array $classes): Matrix
|
||||
{
|
||||
// s is an n (number of classes) by m (number of column) matrix
|
||||
$s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0));
|
||||
$sW = new Matrix($s, false);
|
||||
|
||||
foreach ($data as $index => $row) {
|
||||
$label = array_search($classes[$index], $this->labels);
|
||||
$label = array_search($classes[$index], $this->labels, true);
|
||||
$means = $this->means[$label];
|
||||
|
||||
$row = $this->calculateVar($row, $means);
|
||||
|
@ -189,10 +193,8 @@ class LDA extends EigenTransformerBase
|
|||
* Returns between-class scatter matrix for each class, which
|
||||
* is an n by m matrix where n is number of classes and
|
||||
* m is number of columns
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
protected function calculateClassCov()
|
||||
protected function calculateClassCov(): Matrix
|
||||
{
|
||||
// s is an n (number of classes) by m (number of column) matrix
|
||||
$s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0));
|
||||
|
@ -209,13 +211,8 @@ class LDA extends EigenTransformerBase
|
|||
|
||||
/**
|
||||
* Returns the result of the calculation (x - m)T.(x - m)
|
||||
*
|
||||
* @param array $row
|
||||
* @param array $means
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
protected function calculateVar(array $row, array $means)
|
||||
protected function calculateVar(array $row, array $means): Matrix
|
||||
{
|
||||
$x = new Matrix($row, false);
|
||||
$m = new Matrix($means, false);
|
||||
|
@ -223,27 +220,4 @@ class LDA extends EigenTransformerBase
|
|||
|
||||
return $diff->transpose()->multiply($diff);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms the given sample to a lower dimensional vector by using
|
||||
* the eigenVectors obtained in the last run of <code>fit</code>.
|
||||
*
|
||||
* @param array $sample
|
||||
*
|
||||
* @return array
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
public function transform(array $sample)
|
||||
{
|
||||
if (!$this->fit) {
|
||||
throw new \Exception("LDA has not been fitted with respect to original dataset, please run LDA::fit() first");
|
||||
}
|
||||
|
||||
if (!is_array($sample[0])) {
|
||||
$sample = [$sample];
|
||||
}
|
||||
|
||||
return $this->reduce($sample);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\DimensionReduction;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
use Phpml\Math\Statistic\Covariance;
|
||||
use Phpml\Math\Statistic\Mean;
|
||||
|
||||
|
@ -28,25 +30,28 @@ class PCA extends EigenTransformerBase
|
|||
* within the data. It is a lossy data compression technique.<br>
|
||||
*
|
||||
* @param float $totalVariance Total explained variance to be preserved
|
||||
* @param int $numFeatures Number of features to be preserved
|
||||
* @param int $numFeatures Number of features to be preserved
|
||||
*
|
||||
* @throws \Exception
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct($totalVariance = null, $numFeatures = null)
|
||||
public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
|
||||
{
|
||||
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
|
||||
throw new \Exception("Total variance can be a value between 0.1 and 0.99");
|
||||
throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
|
||||
}
|
||||
|
||||
if ($numFeatures !== null && $numFeatures <= 0) {
|
||||
throw new \Exception("Number of features to be preserved should be greater than 0");
|
||||
throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
|
||||
}
|
||||
if ($totalVariance !== null && $numFeatures !== null) {
|
||||
throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
|
||||
|
||||
if (($totalVariance !== null) === ($numFeatures !== null)) {
|
||||
throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
|
||||
}
|
||||
|
||||
if ($numFeatures !== null) {
|
||||
$this->numFeatures = $numFeatures;
|
||||
}
|
||||
|
||||
if ($totalVariance !== null) {
|
||||
$this->totalVariance = $totalVariance;
|
||||
}
|
||||
|
@ -57,12 +62,8 @@ class PCA extends EigenTransformerBase
|
|||
* of this data while preserving $totalVariance or $numFeatures. <br>
|
||||
* $data is an n-by-m matrix and returned array is
|
||||
* n-by-k matrix where k <= m
|
||||
*
|
||||
* @param array $data
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function fit(array $data)
|
||||
public function fit(array $data): array
|
||||
{
|
||||
$n = count($data[0]);
|
||||
|
||||
|
@ -78,10 +79,27 @@ class PCA extends EigenTransformerBase
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $data
|
||||
* @param int $n
|
||||
* Transforms the given sample to a lower dimensional vector by using
|
||||
* the eigenVectors obtained in the last run of <code>fit</code>.
|
||||
*
|
||||
* @throws InvalidOperationException
|
||||
*/
|
||||
protected function calculateMeans(array $data, int $n)
|
||||
public function transform(array $sample): array
{
    if (!$this->fit) {
        throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first');
    }

    // A flat vector is wrapped so the steps below always work on a list of rows.
    $rows = is_array($sample[0]) ? $sample : [$sample];

    // normalize() subtracts the stored per-column means from every row;
    // the column count is taken from the first row.
    $centered = $this->normalize($rows, count($rows[0]));

    return $this->reduce($centered);
}
|
||||
|
||||
protected function calculateMeans(array $data, int $n): void
|
||||
{
|
||||
// Calculate means for each dimension
|
||||
$this->means = [];
|
||||
|
@ -94,20 +112,15 @@ class PCA extends EigenTransformerBase
|
|||
/**
|
||||
* Normalization of the data includes subtracting mean from
|
||||
* each dimension therefore dimensions will be centered to zero
|
||||
*
|
||||
* @param array $data
|
||||
* @param int $n
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function normalize(array $data, int $n)
|
||||
protected function normalize(array $data, int $n): array
|
||||
{
|
||||
if (empty($this->means)) {
|
||||
if (count($this->means) === 0) {
|
||||
$this->calculateMeans($data, $n);
|
||||
}
|
||||
|
||||
// Normalize data
|
||||
foreach ($data as $i => $row) {
|
||||
foreach (array_keys($data) as $i) {
|
||||
for ($k = 0; $k < $n; ++$k) {
|
||||
$data[$i][$k] -= $this->means[$k];
|
||||
}
|
||||
|
@ -115,29 +128,4 @@ class PCA extends EigenTransformerBase
|
|||
|
||||
return $data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms the given sample to a lower dimensional vector by using
|
||||
* the eigenVectors obtained in the last run of <code>fit</code>.
|
||||
*
|
||||
* @param array $sample
|
||||
*
|
||||
* @return array
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
public function transform(array $sample)
|
||||
{
|
||||
if (!$this->fit) {
|
||||
throw new \Exception("PCA has not been fitted with respect to original dataset, please run PCA::fit() first");
|
||||
}
|
||||
|
||||
if (!is_array($sample[0])) {
|
||||
$sample = [$sample];
|
||||
}
|
||||
|
||||
$sample = $this->normalize($sample, count($sample[0]));
|
||||
|
||||
return $this->reduce($sample);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,15 +6,9 @@ namespace Phpml;
|
|||
|
||||
interface Estimator
|
||||
{
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
public function train(array $samples, array $targets);
|
||||
public function train(array $samples, array $targets): void;
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
public function predict(array $samples);
|
||||
|
|
|
@ -4,15 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
class DatasetException extends \Exception
|
||||
use Exception;
|
||||
|
||||
class DatasetException extends Exception
|
||||
{
|
||||
/**
|
||||
* @param string $path
|
||||
*
|
||||
* @return DatasetException
|
||||
*/
|
||||
public static function missingFolder(string $path)
|
||||
{
|
||||
return new self(sprintf('Dataset root folder "%s" missing.', $path));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,35 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
class FileException extends \Exception
|
||||
use Exception;
|
||||
|
||||
class FileException extends Exception
|
||||
{
|
||||
/**
|
||||
* @param string $filepath
|
||||
*
|
||||
* @return FileException
|
||||
*/
|
||||
public static function missingFile(string $filepath)
|
||||
{
|
||||
return new self(sprintf('File "%s" missing.', $filepath));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $filepath
|
||||
*
|
||||
* @return FileException
|
||||
*/
|
||||
public static function cantOpenFile(string $filepath)
|
||||
{
|
||||
return new self(sprintf('File "%s" can\'t be open.', $filepath));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $filepath
|
||||
*
|
||||
* @return FileException
|
||||
*/
|
||||
public static function cantSaveFile(string $filepath)
|
||||
{
|
||||
return new self(sprintf('File "%s" can\'t be saved.', $filepath));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,112 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
class InvalidArgumentException extends \Exception
|
||||
use Exception;
|
||||
|
||||
class InvalidArgumentException extends Exception
|
||||
{
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function arraySizeNotMatch()
|
||||
{
|
||||
return new self('Size of given arrays does not match');
|
||||
}
|
||||
|
||||
/**
|
||||
* @param $name
|
||||
*
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function percentNotInRange($name)
|
||||
{
|
||||
return new self(sprintf('%s must be between 0.0 and 1.0', $name));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function arrayCantBeEmpty()
|
||||
{
|
||||
return new self('The array has zero elements');
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int $minimumSize
|
||||
*
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function arraySizeToSmall($minimumSize = 2)
|
||||
{
|
||||
return new self(sprintf('The array must have at least %s elements', $minimumSize));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function matrixDimensionsDidNotMatch()
|
||||
{
|
||||
return new self('Matrix dimensions did not match');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function inconsistentMatrixSupplied()
|
||||
{
|
||||
return new self('Inconsistent matrix supplied');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function invalidClustersNumber()
|
||||
{
|
||||
return new self('Invalid clusters number');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function invalidTarget($target)
|
||||
{
|
||||
return new self('Target with value ' . $target . ' is not part of the accepted classes');
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $language
|
||||
*
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function invalidStopWordsLanguage(string $language)
|
||||
{
|
||||
return new self(sprintf('Can\'t find %s language for StopWords', $language));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function invalidLayerNodeClass()
|
||||
{
|
||||
return new self('Layer node class must implement Node interface');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function invalidLayersNumber()
|
||||
{
|
||||
return new self('Provide at least 1 hidden layer');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return InvalidArgumentException
|
||||
*/
|
||||
public static function invalidClassesNumber()
|
||||
{
|
||||
return new self('Provide at least 2 different classes');
|
||||
}
|
||||
|
||||
public static function inconsistentClasses()
|
||||
{
|
||||
return new self('The provided classes don\'t match the classes provided in the constructor');
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
use Exception;
|
||||
|
||||
/**
 * Marker exception for operations invoked in an invalid state.
 *
 * Within this library it is thrown, for example, by the dimension reduction
 * classes (KernelPCA, LDA, PCA) when transform() is called before fit().
 * It adds nothing to the base Exception beyond its type.
 */
class InvalidOperationException extends Exception
{
}
|
|
@ -0,0 +1,11 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
use Exception;
|
||||
|
||||
/**
 * Marker exception type; it adds nothing to the base Exception beyond its type.
 *
 * NOTE(review): no throw site is visible in this part of the change set;
 * presumably raised when running a libsvm command fails - confirm against callers.
 */
class LibsvmCommandException extends Exception
{
}
|
|
@ -4,29 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
class MatrixException extends \Exception
|
||||
use Exception;
|
||||
|
||||
class MatrixException extends Exception
|
||||
{
|
||||
/**
|
||||
* @return MatrixException
|
||||
*/
|
||||
public static function notSquareMatrix()
|
||||
{
|
||||
return new self('Matrix is not square matrix');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return MatrixException
|
||||
*/
|
||||
public static function columnOutOfRange()
|
||||
{
|
||||
return new self('Column out of range');
|
||||
}
|
||||
|
||||
/**
|
||||
* @return MatrixException
|
||||
*/
|
||||
public static function singularMatrix()
|
||||
{
|
||||
return new self('Matrix is singular');
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,13 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
class NormalizerException extends \Exception
|
||||
use Exception;
|
||||
|
||||
class NormalizerException extends Exception
|
||||
{
|
||||
/**
|
||||
* @return NormalizerException
|
||||
*/
|
||||
public static function unknownNorm()
|
||||
{
|
||||
return new self('Unknown norm supplied.');
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,25 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Exception;
|
||||
|
||||
class SerializeException extends \Exception
|
||||
{
|
||||
/**
|
||||
* @param string $filepath
|
||||
*
|
||||
* @return SerializeException
|
||||
*/
|
||||
public static function cantUnserialize(string $filepath)
|
||||
{
|
||||
return new self(sprintf('"%s" can not be unserialized.', $filepath));
|
||||
}
|
||||
use Exception;
|
||||
|
||||
/**
|
||||
* @param string $classname
|
||||
*
|
||||
* @return SerializeException
|
||||
*/
|
||||
public static function cantSerialize(string $classname)
|
||||
{
|
||||
return new self(sprintf('Class "%s" can not be serialized.', $classname));
|
||||
}
|
||||
class SerializeException extends Exception
|
||||
{
|
||||
}
|
||||
|
|
|
@ -11,39 +11,24 @@ class StopWords
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $stopWords;
|
||||
protected $stopWords = [];
|
||||
|
||||
/**
|
||||
* @param array $stopWords
|
||||
*/
|
||||
public function __construct(array $stopWords)
|
||||
{
|
||||
$this->stopWords = array_fill_keys($stopWords, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $token
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function isStopWord(string $token): bool
|
||||
{
|
||||
return isset($this->stopWords[$token]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $language
|
||||
*
|
||||
* @return StopWords
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public static function factory($language = 'English'): StopWords
|
||||
public static function factory(string $language = 'English'): self
|
||||
{
|
||||
$className = __NAMESPACE__."\\StopWords\\$language";
|
||||
$className = __NAMESPACE__."\\StopWords\\${language}";
|
||||
|
||||
if (!class_exists($className)) {
|
||||
throw InvalidArgumentException::invalidStopWordsLanguage($language);
|
||||
throw new InvalidArgumentException(sprintf('Can\'t find "%s" language for StopWords', $language));
|
||||
}
|
||||
|
||||
return new $className();
|
||||
|
|
|
@ -11,35 +11,26 @@ class TfIdfTransformer implements Transformer
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $idf;
|
||||
private $idf = [];
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
public function __construct(array $samples = null)
|
||||
public function __construct(array $samples = [])
|
||||
{
|
||||
if ($samples) {
|
||||
if (count($samples) > 0) {
|
||||
$this->fit($samples);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
public function fit(array $samples)
|
||||
public function fit(array $samples, ?array $targets = null): void
|
||||
{
|
||||
$this->countTokensFrequency($samples);
|
||||
|
||||
$count = count($samples);
|
||||
foreach ($this->idf as &$value) {
|
||||
$value = log((float)($count / $value), 10.0);
|
||||
$value = log((float) ($count / $value), 10.0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
public function transform(array &$samples)
|
||||
public function transform(array &$samples): void
|
||||
{
|
||||
foreach ($samples as &$sample) {
|
||||
foreach ($sample as $index => &$feature) {
|
||||
|
@ -48,10 +39,7 @@ class TfIdfTransformer implements Transformer
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
private function countTokensFrequency(array $samples)
|
||||
private function countTokensFrequency(array $samples): void
|
||||
{
|
||||
$this->idf = array_fill_keys(array_keys($samples[0]), 0);
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ class TokenCountVectorizer implements Transformer
|
|||
private $tokenizer;
|
||||
|
||||
/**
|
||||
* @var StopWords
|
||||
* @var StopWords|null
|
||||
*/
|
||||
private $stopWords;
|
||||
|
||||
|
@ -27,62 +27,42 @@ class TokenCountVectorizer implements Transformer
|
|||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $vocabulary;
|
||||
private $vocabulary = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $frequencies;
|
||||
private $frequencies = [];
|
||||
|
||||
/**
|
||||
* @param Tokenizer $tokenizer
|
||||
* @param StopWords $stopWords
|
||||
* @param float $minDF
|
||||
*/
|
||||
public function __construct(Tokenizer $tokenizer, StopWords $stopWords = null, float $minDF = 0.0)
|
||||
public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0)
|
||||
{
|
||||
$this->tokenizer = $tokenizer;
|
||||
$this->stopWords = $stopWords;
|
||||
$this->minDF = $minDF;
|
||||
|
||||
$this->vocabulary = [];
|
||||
$this->frequencies = [];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
public function fit(array $samples)
|
||||
public function fit(array $samples, ?array $targets = null): void
|
||||
{
|
||||
$this->buildVocabulary($samples);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
public function transform(array &$samples)
|
||||
public function transform(array &$samples): void
|
||||
{
|
||||
foreach ($samples as &$sample) {
|
||||
array_walk($samples, function (string &$sample): void {
|
||||
$this->transformSample($sample);
|
||||
}
|
||||
});
|
||||
|
||||
$this->checkDocumentFrequency($samples);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getVocabulary()
|
||||
public function getVocabulary(): array
|
||||
{
|
||||
return array_flip($this->vocabulary);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
private function buildVocabulary(array &$samples)
|
||||
private function buildVocabulary(array &$samples): void
|
||||
{
|
||||
foreach ($samples as $index => $sample) {
|
||||
foreach ($samples as $sample) {
|
||||
$tokens = $this->tokenizer->tokenize($sample);
|
||||
foreach ($tokens as $token) {
|
||||
$this->addTokenToVocabulary($token);
|
||||
|
@ -90,17 +70,14 @@ class TokenCountVectorizer implements Transformer
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $sample
|
||||
*/
|
||||
private function transformSample(string &$sample)
|
||||
private function transformSample(string &$sample): void
|
||||
{
|
||||
$counts = [];
|
||||
$tokens = $this->tokenizer->tokenize($sample);
|
||||
|
||||
foreach ($tokens as $token) {
|
||||
$index = $this->getTokenIndex($token);
|
||||
if (false !== $index) {
|
||||
if ($index !== false) {
|
||||
$this->updateFrequency($token);
|
||||
if (!isset($counts[$index])) {
|
||||
$counts[$index] = 0;
|
||||
|
@ -122,8 +99,6 @@ class TokenCountVectorizer implements Transformer
|
|||
}
|
||||
|
||||
/**
|
||||
* @param string $token
|
||||
*
|
||||
* @return int|bool
|
||||
*/
|
||||
private function getTokenIndex(string $token)
|
||||
|
@ -135,10 +110,7 @@ class TokenCountVectorizer implements Transformer
|
|||
return $this->vocabulary[$token] ?? false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $token
|
||||
*/
|
||||
private function addTokenToVocabulary(string $token)
|
||||
private function addTokenToVocabulary(string $token): void
|
||||
{
|
||||
if ($this->isStopWord($token)) {
|
||||
return;
|
||||
|
@ -149,20 +121,12 @@ class TokenCountVectorizer implements Transformer
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $token
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
private function isStopWord(string $token): bool
|
||||
{
|
||||
return $this->stopWords && $this->stopWords->isStopWord($token);
|
||||
return $this->stopWords !== null && $this->stopWords->isStopWord($token);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $token
|
||||
*/
|
||||
private function updateFrequency(string $token)
|
||||
private function updateFrequency(string $token): void
|
||||
{
|
||||
if (!isset($this->frequencies[$token])) {
|
||||
$this->frequencies[$token] = 0;
|
||||
|
@ -171,10 +135,7 @@ class TokenCountVectorizer implements Transformer
|
|||
++$this->frequencies[$token];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
private function checkDocumentFrequency(array &$samples)
|
||||
private function checkDocumentFrequency(array &$samples): void
|
||||
{
|
||||
if ($this->minDF > 0) {
|
||||
$beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
|
||||
|
@ -184,28 +145,19 @@ class TokenCountVectorizer implements Transformer
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
* @param array $beyondMinimum
|
||||
*/
|
||||
private function resetBeyondMinimum(array &$sample, array $beyondMinimum)
|
||||
/**
 * Zeroes the entries of one sample that sit at the given positions.
 *
 * @param array $sample        sample to modify in place (passed by reference)
 * @param array $beyondMinimum keys of $sample whose value is overwritten with 0
 */
private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void
{
    foreach ($beyondMinimum as $position) {
        // Assignment (not unset) keeps the slot present, so the sample keeps its length.
        $sample[$position] = 0;
    }
}
|
||||
|
||||
/**
|
||||
* @param int $samplesCount
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private function getBeyondMinimumIndexes(int $samplesCount)
|
||||
private function getBeyondMinimumIndexes(int $samplesCount): array
|
||||
{
|
||||
$indexes = [];
|
||||
foreach ($this->frequencies as $token => $frequency) {
|
||||
if (($frequency / $samplesCount) < $this->minDF) {
|
||||
$indexes[] = $this->getTokenIndex($token);
|
||||
$indexes[] = $this->getTokenIndex((string) $token);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection;
|
||||
|
||||
/**
 * Contract for strategies that score features against targets.
 *
 * NOTE(review): the implementations visible next to this interface return one
 * score per feature column - confirm for any other implementation.
 */
interface ScoringFunction
|
||||
{
|
||||
/**
 * Scores the features found in $samples against $targets.
 *
 * @param array $samples rows of feature values
 * @param array $targets target values, looked up by the same keys as $samples
 *
 * @return array the computed scores
 */
public function score(array $samples, array $targets): array;
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection\ScoringFunction;
|
||||
|
||||
use Phpml\FeatureSelection\ScoringFunction;
|
||||
use Phpml\Math\Statistic\ANOVA;
|
||||
|
||||
/**
 * Scoring function that delegates to ANOVA::oneWayF after grouping the samples by target.
 */
final class ANOVAFValue implements ScoringFunction
{
    /**
     * Buckets the samples by their target value and returns ANOVA::oneWayF() of the buckets.
     *
     * @param array $samples rows of feature values
     * @param array $targets target of each sample, looked up by the sample's key
     *
     * @return array whatever ANOVA::oneWayF() returns for the grouped samples
     */
    public function score(array $samples, array $targets): array
    {
        $samplesByTarget = [];
        foreach ($samples as $row => $features) {
            $label = $targets[$row];
            $samplesByTarget[$label][] = $features;
        }

        // The target labels themselves are irrelevant from here on; only the grouping is passed along.
        $groups = array_values($samplesByTarget);

        return ANOVA::oneWayF($groups);
    }
}
|
|
@ -0,0 +1,81 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection\ScoringFunction;
|
||||
|
||||
use Phpml\FeatureSelection\ScoringFunction;
|
||||
use Phpml\Math\Matrix;
|
||||
use Phpml\Math\Statistic\Mean;
|
||||
|
||||
/**
|
||||
* Quick linear model for testing the effect of a single regressor,
|
||||
* sequentially for many regressors.
|
||||
*
|
||||
* This is done in 2 steps:
|
||||
*
|
||||
* 1. The cross correlation between each regressor and the target is computed,
|
||||
* that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *std(y)).
|
||||
* 2. It is converted to an F score.
|
||||
*
|
||||
* Ported from scikit-learn f_regression function (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression)
|
||||
*/
|
||||
final class UnivariateLinearRegression implements ScoringFunction
{
    /**
     * Whether score() mean-centers the targets and every feature column first.
     *
     * @var bool
     */
    private $center;

    /**
     * @param bool $center when true, samples and targets are mean-centered before scoring
     */
    public function __construct(bool $center = true)
    {
        $this->center = $center;
    }

    /**
     * Computes one score per feature column.
     *
     * For every column the dot product with the targets is divided by the
     * Frobenius norm of the column and by the Frobenius norm of the targets;
     * that ratio r is then mapped to r^2 / (1 - r^2) * degreesOfFreedom.
     *
     * @param array $samples rows of feature values; the keys of the first row define the columns
     * @param array $targets one target value per sample
     *
     * @return array scores keyed by feature column
     */
    public function score(array $samples, array $targets): array
    {
        if ($this->center) {
            $this->centerTargets($targets);
            $this->centerSamples($samples);
        }

        $correlations = [];
        foreach (array_keys($samples[0]) as $column) {
            $values = array_column($samples, $column);

            $projection = Matrix::dot($targets, $values)[0]
                / (new Matrix($values, false))->transpose()->frobeniusNorm();
            $targetNorm = (new Matrix($targets, false))->frobeniusNorm();

            $correlations[$column] = $projection / $targetNorm;
        }

        // Centering removes one extra degree of freedom.
        if ($this->center) {
            $lostDegrees = 2;
        } else {
            $lostDegrees = 1;
        }

        $degreesOfFreedom = count($targets) - $lostDegrees;

        return array_map(static function (float $correlation) use ($degreesOfFreedom): float {
            $squared = $correlation ** 2;

            return $squared / (1 - $squared) * $degreesOfFreedom;
        }, $correlations);
    }

    /**
     * Subtracts the arithmetic mean of the targets from every target, in place.
     */
    private function centerTargets(array &$targets): void
    {
        $mean = Mean::arithmetic($targets);
        foreach ($targets as $key => $value) {
            $targets[$key] = $value - $mean;
        }
    }

    /**
     * Subtracts each column's arithmetic mean from every value in that column, in place.
     *
     * The columns are taken from the keys of the first sample.
     */
    private function centerSamples(array &$samples): void
    {
        $means = [];
        foreach ($samples[0] as $column => $ignored) {
            $means[$column] = Mean::arithmetic(array_column($samples, $column));
        }

        foreach ($samples as $row => $features) {
            foreach ($features as $column => $value) {
                $samples[$row][$column] = $value - $means[$column];
            }
        }
    }
}
|
|
@ -0,0 +1,78 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
|
||||
use Phpml\Transformer;
|
||||
|
||||
/**
 * Transformer that keeps only the k highest-scoring feature columns.
 */
final class SelectKBest implements Transformer
{
    /**
     * Strategy used to score the feature columns against the targets.
     *
     * @var ScoringFunction
     */
    private $scoringFunction;

    /**
     * Number of best-scoring columns that transform() keeps.
     *
     * @var int
     */
    private $k;

    /**
     * Scores produced by the most recent fit(); null until fit() has run.
     *
     * @var array|null
     */
    private $scores = null;

    /**
     * The k best scores keyed by feature column; null means "keep every column".
     *
     * @var array|null
     */
    private $keepColumns = null;

    /**
     * @param int                  $k               number of top-scoring columns to keep
     * @param ScoringFunction|null $scoringFunction scoring strategy; ANOVAFValue is used when null
     */
    public function __construct(int $k = 10, ?ScoringFunction $scoringFunction = null)
    {
        if ($scoringFunction === null) {
            $scoringFunction = new ANOVAFValue();
        }

        $this->scoringFunction = $scoringFunction;
        $this->k = $k;
    }

    /**
     * Scores the features and remembers which columns rank among the k best.
     *
     * @param array      $samples rows of feature values
     * @param array|null $targets target values; required although the interface allows null
     *
     * @throws InvalidArgumentException when $targets is null or empty
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        if ($targets === null || count($targets) === 0) {
            throw new InvalidArgumentException('The array has zero elements');
        }

        $ranked = $this->scoringFunction->score($samples, $targets);
        $this->scores = $ranked;

        // Forget the selection of any earlier fit(). Without this, refitting on data
        // with at most k features would take the early return below and transform()
        // would keep filtering by the columns chosen for the previous data set.
        $this->keepColumns = null;

        if ($this->k >= count($ranked)) {
            // Nothing to drop: every feature is among the k best.
            return;
        }

        // arsort keeps the keys, so the slice is still keyed by feature column.
        arsort($ranked);
        $this->keepColumns = array_slice($ranked, 0, $this->k, true);
    }

    /**
     * Removes every column that is not among the k best and reindexes each sample.
     *
     * Leaves $samples untouched when no selection is stored (before fit(), or when
     * fit() found no more than k features).
     *
     * @param array $samples rows of feature values, modified in place
     */
    public function transform(array &$samples): void
    {
        if ($this->keepColumns === null) {
            return;
        }

        foreach ($samples as &$sample) {
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }

        // Break the reference so the last row cannot be overwritten by accident later.
        unset($sample);
    }

    /**
     * Returns the scores computed by the last fit().
     *
     * @throws InvalidOperationException when fit() has not been called yet
     */
    public function scores(): array
    {
        if ($this->scores === null) {
            throw new InvalidOperationException('SelectKBest require to fit first to get scores');
        }

        return $this->scores;
    }
}
|
|
@ -0,0 +1,57 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Matrix;
|
||||
use Phpml\Math\Statistic\Variance;
|
||||
use Phpml\Transformer;
|
||||
|
||||
/**
 * Transformer that drops every feature column whose population variance
 * does not exceed a threshold.
 */
final class VarianceThreshold implements Transformer
{
    /**
     * Columns are kept only when their variance is strictly greater than this value.
     *
     * @var float
     */
    private $threshold;

    /**
     * Population variance of every column, as computed by the last fit().
     *
     * @var array
     */
    private $variances = [];

    /**
     * Set of columns to keep, stored as column => true.
     *
     * @var array
     */
    private $keepColumns = [];

    /**
     * @param float $threshold minimum variance (exclusive) a column needs in order to be kept
     *
     * @throws InvalidArgumentException when $threshold is negative
     */
    public function __construct(float $threshold = 0.0)
    {
        if ($threshold < 0) {
            throw new InvalidArgumentException('Threshold can\'t be lower than zero');
        }

        $this->threshold = $threshold;
    }

    /**
     * Computes the variance of each column and records which columns pass the threshold.
     *
     * @param array      $samples rows of feature values
     * @param array|null $targets not used by this transformer
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->variances = array_map(static function (array $column) {
            return Variance::population($column);
        }, Matrix::transposeArray($samples));

        // Start from an empty selection on every fit(). The loop below only ever adds
        // entries, so without this reset a column selected by an earlier fit() would
        // stay selected even when its variance in the new data is below the threshold.
        $this->keepColumns = [];
        foreach ($this->variances as $column => $variance) {
            if ($variance > $this->threshold) {
                $this->keepColumns[$column] = true;
            }
        }
    }

    /**
     * Removes every column that was not selected by fit() and reindexes each sample.
     *
     * With an empty selection (for example before fit() has run) every sample
     * becomes an empty array.
     *
     * @param array $samples rows of feature values, modified in place
     */
    public function transform(array &$samples): void
    {
        foreach ($samples as &$sample) {
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }

        // Break the reference so the last row cannot be overwritten by accident later.
        unset($sample);
    }
}
|
|
@ -4,6 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Helper;
|
||||
|
||||
use Phpml\Classification\Classifier;
|
||||
|
||||
trait OneVsRest
|
||||
{
|
||||
/**
|
||||
|
@ -25,39 +27,37 @@ trait OneVsRest
|
|||
|
||||
/**
|
||||
* Train a binary classifier in the OvR style
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
public function train(array $samples, array $targets)
|
||||
public function train(array $samples, array $targets): void
|
||||
{
|
||||
// Clears previous stuff.
|
||||
$this->reset();
|
||||
|
||||
$this->trainBylabel($samples, $targets);
|
||||
$this->trainByLabel($samples, $targets);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param array $allLabels All training set labels
|
||||
*
|
||||
* @return void
|
||||
* Resets the classifier and the vars internally used by OneVsRest to create multiple classifiers.
|
||||
*/
|
||||
protected function trainByLabel(array $samples, array $targets, array $allLabels = [])
|
||||
/**
 * Clears the state OneVsRest keeps (classifiers, labels, cost values) and then
 * lets the concrete classifier clear its own state through resetBinary().
 */
public function reset(): void
{
    // All three containers go back to an empty array.
    $this->classifiers = $this->allLabels = $this->costValues = [];

    $this->resetBinary();
}
|
||||
|
||||
protected function trainByLabel(array $samples, array $targets, array $allLabels = []): void
|
||||
{
|
||||
// Overwrites the current value if it exist. $allLabels must be provided for each partialTrain run.
|
||||
if (!empty($allLabels)) {
|
||||
$this->allLabels = $allLabels;
|
||||
} else {
|
||||
$this->allLabels = array_keys(array_count_values($targets));
|
||||
}
|
||||
$this->allLabels = count($allLabels) === 0 ? array_keys(array_count_values($targets)) : $allLabels;
|
||||
sort($this->allLabels, SORT_STRING);
|
||||
|
||||
// If there are only two targets, then there is no need to perform OvR
|
||||
if (count($this->allLabels) == 2) {
|
||||
if (count($this->allLabels) === 2) {
|
||||
// Init classifier if required.
|
||||
if (empty($this->classifiers)) {
|
||||
if (count($this->classifiers) === 0) {
|
||||
$this->classifiers[0] = $this->getClassifierCopy();
|
||||
}
|
||||
|
||||
|
@ -67,11 +67,11 @@ trait OneVsRest
|
|||
|
||||
foreach ($this->allLabels as $label) {
|
||||
// Init classifier if required.
|
||||
if (empty($this->classifiers[$label])) {
|
||||
if (!isset($this->classifiers[$label])) {
|
||||
$this->classifiers[$label] = $this->getClassifierCopy();
|
||||
}
|
||||
|
||||
list($binarizedTargets, $classifierLabels) = $this->binarizeTargets($targets, $label);
|
||||
[$binarizedTargets, $classifierLabels] = $this->binarizeTargets($targets, $label);
|
||||
$this->classifiers[$label]->trainBinary($samples, $binarizedTargets, $classifierLabels);
|
||||
}
|
||||
}
|
||||
|
@ -85,64 +85,26 @@ trait OneVsRest
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the classifier and the vars internally used by OneVsRest to create multiple classifiers.
|
||||
*/
|
||||
public function reset()
|
||||
{
|
||||
$this->classifiers = [];
|
||||
$this->allLabels = [];
|
||||
$this->costValues = [];
|
||||
|
||||
$this->resetBinary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an instance of the current class after cleaning up OneVsRest stuff.
|
||||
*
|
||||
* @return \Phpml\Estimator
|
||||
*/
|
||||
protected function getClassifierCopy()
|
||||
/**
 * Produces a reset clone of the current object.
 *
 * @return Classifier a clone of $this on which reset() has been called
 */
protected function getClassifierCopy(): Classifier
{
    // Work on a clone so that training several per-label classifiers
    // never touches the variables of this instance.
    $copy = clone $this;
    $copy->reset();

    return $copy;
}
|
||||
|
||||
/**
|
||||
* Groups all targets into two groups: Targets equal to
|
||||
* the given label and the others
|
||||
*
|
||||
* $targets is not passed by reference nor contains objects so this method
|
||||
* changes will not affect the caller $targets array.
|
||||
*
|
||||
* @param array $targets
|
||||
* @param mixed $label
|
||||
* @return array Binarized targets and target's labels
|
||||
*/
|
||||
private function binarizeTargets($targets, $label)
|
||||
{
|
||||
$notLabel = "not_$label";
|
||||
foreach ($targets as $key => $target) {
|
||||
$targets[$key] = $target == $label ? $label : $notLabel;
|
||||
}
|
||||
|
||||
$labels = [$label, $notLabel];
|
||||
return [$targets, $labels];
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
protected function predictSample(array $sample)
|
||||
{
|
||||
if (count($this->allLabels) == 2) {
|
||||
if (count($this->allLabels) === 2) {
|
||||
return $this->classifiers[0]->predictSampleBinary($sample);
|
||||
}
|
||||
|
||||
|
@ -153,32 +115,24 @@ trait OneVsRest
|
|||
}
|
||||
|
||||
arsort($probs, SORT_NUMERIC);
|
||||
|
||||
return key($probs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Each classifier should implement this method instead of train(samples, targets)
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param array $labels
|
||||
*/
|
||||
abstract protected function trainBinary(array $samples, array $targets, array $labels);
|
||||
|
||||
/**
|
||||
* To be overwritten by OneVsRest classifiers.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
abstract protected function resetBinary();
|
||||
abstract protected function resetBinary(): void;
|
||||
|
||||
/**
|
||||
* Each classifier that make use of OvR approach should be able to
|
||||
* return a probability for a sample to belong to the given label.
|
||||
*
|
||||
* @param array $sample
|
||||
* @param string $label
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
abstract protected function predictProbability(array $sample, string $label);
|
||||
|
@ -186,9 +140,30 @@ trait OneVsRest
|
|||
/**
|
||||
* Each classifier should implement this method instead of predictSample()
|
||||
*
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
abstract protected function predictSampleBinary(array $sample);
|
||||
|
||||
/**
|
||||
* Groups all targets into two groups: Targets equal to
|
||||
* the given label and the others
|
||||
*
|
||||
* $targets is not passed by reference nor contains objects so this method
|
||||
* changes will not affect the caller $targets array.
|
||||
*
|
||||
* @param mixed $label
|
||||
*
|
||||
* @return array Binarized targets and target's labels
|
||||
*/
|
||||
/**
 * Splits the targets into two groups: those equal to $label and all others.
 *
 * $targets is received by value, so the caller's array is not modified.
 *
 * @param array $targets original target values
 * @param mixed $label   the label that forms the positive group
 *
 * @return array two-element list: the binarized targets (same keys as the input)
 *               and the two labels [$label, "not_<label>"]
 */
private function binarizeTargets(array $targets, $label): array
{
    // "{$label}" replaces the "${label}" form, which is deprecated since PHP 8.2;
    // the resulting string is identical.
    $notLabel = "not_{$label}";
    foreach ($targets as $key => $target) {
        // Loose comparison on purpose: trainByLabel() may derive labels through
        // array_keys(), which turns numeric-string targets such as "1" into int 1.
        $targets[$key] = $target == $label ? $label : $notLabel;
    }

    $labels = [$label, $notLabel];

    return [$targets, $labels];
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,8 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Helper\Optimizer;
|
||||
|
||||
use Closure;
|
||||
|
||||
/**
|
||||
* Conjugate Gradient method to solve a non-linear f(x) with respect to unknown x
|
||||
* See https://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method)
|
||||
|
@ -17,14 +19,7 @@ namespace Phpml\Helper\Optimizer;
|
|||
*/
|
||||
class ConjugateGradient extends GD
|
||||
{
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param \Closure $gradientCb
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function runOptimization(array $samples, array $targets, \Closure $gradientCb)
|
||||
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
|
||||
{
|
||||
$this->samples = $samples;
|
||||
$this->targets = $targets;
|
||||
|
@ -32,11 +27,11 @@ class ConjugateGradient extends GD
|
|||
$this->sampleCount = count($samples);
|
||||
$this->costValues = [];
|
||||
|
||||
$d = mp::muls($this->gradient($this->theta), -1);
|
||||
$d = MP::muls($this->gradient($this->theta), -1);
|
||||
|
||||
for ($i = 0; $i < $this->maxIterations; ++$i) {
|
||||
// Obtain α that minimizes f(θ + α.d)
|
||||
$alpha = $this->getAlpha(array_sum($d));
|
||||
$alpha = $this->getAlpha($d);
|
||||
|
||||
// θ(k+1) = θ(k) + α.d
|
||||
$thetaNew = $this->getNewTheta($alpha, $d);
|
||||
|
@ -65,30 +60,38 @@ class ConjugateGradient extends GD
|
|||
/**
|
||||
* Executes the callback function for the problem and returns
|
||||
* sum of the gradient for all samples & targets.
|
||||
*
|
||||
* @param array $theta
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function gradient(array $theta)
|
||||
protected function gradient(array $theta): array
|
||||
{
|
||||
list(, $gradient) = parent::gradient($theta);
|
||||
[, $updates, $penalty] = parent::gradient($theta);
|
||||
|
||||
// Calculate gradient for each dimension
|
||||
$gradient = [];
|
||||
for ($i = 0; $i <= $this->dimensions; ++$i) {
|
||||
if ($i === 0) {
|
||||
$gradient[$i] = array_sum($updates);
|
||||
} else {
|
||||
$col = array_column($this->samples, $i - 1);
|
||||
$error = 0;
|
||||
foreach ($col as $index => $val) {
|
||||
$error += $val * $updates[$index];
|
||||
}
|
||||
|
||||
$gradient[$i] = $error + $penalty * $theta[$i];
|
||||
}
|
||||
}
|
||||
|
||||
return $gradient;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of f(x) for given solution
|
||||
*
|
||||
* @param array $theta
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function cost(array $theta)
|
||||
protected function cost(array $theta): float
|
||||
{
|
||||
list($cost) = parent::gradient($theta);
|
||||
[$cost] = parent::gradient($theta);
|
||||
|
||||
return array_sum($cost) / $this->sampleCount;
|
||||
return array_sum($cost) / (int) $this->sampleCount;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -104,19 +107,15 @@ class ConjugateGradient extends GD
|
|||
* b) Probe a larger alpha (0.01) and calculate cost function
|
||||
* b-1) If cost function decreases, continue enlarging alpha
|
||||
* b-2) If cost function increases, take the midpoint and try again
|
||||
*
|
||||
* @param float $d
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function getAlpha(float $d)
|
||||
protected function getAlpha(array $d): float
|
||||
{
|
||||
$small = 0.0001 * $d;
|
||||
$large = 0.01 * $d;
|
||||
$small = MP::muls($d, 0.0001);
|
||||
$large = MP::muls($d, 0.01);
|
||||
|
||||
// Obtain θ + α.d for two initial values, x0 and x1
|
||||
$x0 = mp::adds($this->theta, $small);
|
||||
$x1 = mp::adds($this->theta, $large);
|
||||
$x0 = MP::add($this->theta, $small);
|
||||
$x1 = MP::add($this->theta, $large);
|
||||
|
||||
$epsilon = 0.0001;
|
||||
$iteration = 0;
|
||||
|
@ -132,20 +131,28 @@ class ConjugateGradient extends GD
|
|||
|
||||
if ($fx1 < $fx0) {
|
||||
$x0 = $x1;
|
||||
$x1 = mp::adds($x1, 0.01); // Enlarge second
|
||||
$x1 = MP::adds($x1, 0.01); // Enlarge second
|
||||
} else {
|
||||
$x1 = mp::divs(mp::add($x1, $x0), 2.0);
|
||||
$x1 = MP::divs(MP::add($x1, $x0), 2.0);
|
||||
} // Get to the midpoint
|
||||
|
||||
$error = $fx1 / $this->dimensions;
|
||||
} while ($error <= $epsilon || $iteration++ < 10);
|
||||
|
||||
// Return α = θ / d
|
||||
if ($d == 0) {
|
||||
return $x1[0] - $this->theta[0];
|
||||
// Return α = θ / d
|
||||
// For accuracy, choose a dimension which maximize |d[i]|
|
||||
$imax = 0;
|
||||
for ($i = 1; $i <= $this->dimensions; ++$i) {
|
||||
if (abs($d[$i]) > abs($d[$imax])) {
|
||||
$imax = $i;
|
||||
}
|
||||
}
|
||||
|
||||
return ($x1[0] - $this->theta[0]) / $d;
|
||||
if ($d[$imax] == 0) {
|
||||
return $x1[$imax] - $this->theta[$imax];
|
||||
}
|
||||
|
||||
return ($x1[$imax] - $this->theta[$imax]) / $d[$imax];
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -153,30 +160,10 @@ class ConjugateGradient extends GD
|
|||
* gradient direction.
|
||||
*
|
||||
* θ(k+1) = θ(k) + α.d
|
||||
*
|
||||
* @param float $alpha
|
||||
* @param array $d
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getNewTheta(float $alpha, array $d)
|
||||
protected function getNewTheta(float $alpha, array $d): array
|
||||
{
|
||||
$theta = $this->theta;
|
||||
|
||||
for ($i = 0; $i < $this->dimensions + 1; ++$i) {
|
||||
if ($i === 0) {
|
||||
$theta[$i] += $alpha * array_sum($d);
|
||||
} else {
|
||||
$sum = 0.0;
|
||||
foreach ($this->samples as $si => $sample) {
|
||||
$sum += $sample[$i - 1] * $d[$si] * $alpha;
|
||||
}
|
||||
|
||||
$theta[$i] += $sum;
|
||||
}
|
||||
}
|
||||
|
||||
return $theta;
|
||||
return MP::add($this->theta, MP::muls($d, $alpha));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -187,35 +174,31 @@ class ConjugateGradient extends GD
|
|||
*
|
||||
* See:
|
||||
* R. Fletcher and C. M. Reeves, "Function minimization by conjugate gradients", Comput. J. 7 (1964), 149–154.
|
||||
*
|
||||
* @param array $newTheta
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
protected function getBeta(array $newTheta)
|
||||
protected function getBeta(array $newTheta): float
|
||||
{
|
||||
$dNew = array_sum($this->gradient($newTheta));
|
||||
$dOld = array_sum($this->gradient($this->theta)) + 1e-100;
|
||||
$gNew = $this->gradient($newTheta);
|
||||
$gOld = $this->gradient($this->theta);
|
||||
$dNew = 0;
|
||||
$dOld = 1e-100;
|
||||
for ($i = 0; $i <= $this->dimensions; ++$i) {
|
||||
$dNew += $gNew[$i] ** 2;
|
||||
$dOld += $gOld[$i] ** 2;
|
||||
}
|
||||
|
||||
return $dNew ** 2 / $dOld ** 2;
|
||||
return $dNew / $dOld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the new conjugate direction
|
||||
*
|
||||
* d(k+1) =–∇f(x(k+1)) + β(k).d(k)
|
||||
*
|
||||
* @param array $theta
|
||||
* @param float $beta
|
||||
* @param array $d
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getNewDirection(array $theta, float $beta, array $d)
|
||||
protected function getNewDirection(array $theta, float $beta, array $d): array
|
||||
{
|
||||
$grad = $this->gradient($theta);
|
||||
|
||||
return mp::add(mp::muls($grad, -1), mp::muls($d, $beta));
|
||||
return MP::add(MP::muls($grad, -1), MP::muls($d, $beta));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -223,17 +206,12 @@ class ConjugateGradient extends GD
|
|||
* Handles element-wise vector operations between vector-vector
|
||||
* and vector-scalar variables
|
||||
*/
|
||||
class mp
|
||||
class MP
|
||||
{
|
||||
/**
|
||||
* Element-wise <b>multiplication</b> of two vectors of the same size
|
||||
*
|
||||
* @param array $m1
|
||||
* @param array $m2
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function mul(array $m1, array $m2)
|
||||
public static function mul(array $m1, array $m2): array
|
||||
{
|
||||
$res = [];
|
||||
foreach ($m1 as $i => $val) {
|
||||
|
@ -245,13 +223,8 @@ class mp
|
|||
|
||||
/**
|
||||
* Element-wise <b>division</b> of two vectors of the same size
|
||||
*
|
||||
* @param array $m1
|
||||
* @param array $m2
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function div(array $m1, array $m2)
|
||||
public static function div(array $m1, array $m2): array
|
||||
{
|
||||
$res = [];
|
||||
foreach ($m1 as $i => $val) {
|
||||
|
@ -263,14 +236,8 @@ class mp
|
|||
|
||||
/**
|
||||
* Element-wise <b>addition</b> of two vectors of the same size
|
||||
*
|
||||
* @param array $m1
|
||||
* @param array $m2
|
||||
* @param int $mag
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function add(array $m1, array $m2, int $mag = 1)
|
||||
public static function add(array $m1, array $m2, int $mag = 1): array
|
||||
{
|
||||
$res = [];
|
||||
foreach ($m1 as $i => $val) {
|
||||
|
@ -282,26 +249,16 @@ class mp
|
|||
|
||||
/**
|
||||
* Element-wise <b>subtraction</b> of two vectors of the same size
|
||||
*
|
||||
* @param array $m1
|
||||
* @param array $m2
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function sub(array $m1, array $m2)
|
||||
public static function sub(array $m1, array $m2): array
|
||||
{
|
||||
// Delegates to add() with $mag = -1.
// NOTE(review): presumably add() multiplies each $m2 element by $mag before summing - confirm against add()'s body.
return self::add($m1, $m2, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Element-wise <b>multiplication</b> of a vector with a scalar
|
||||
*
|
||||
* @param array $m1
|
||||
* @param float $m2
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function muls(array $m1, float $m2)
|
||||
public static function muls(array $m1, float $m2): array
|
||||
{
|
||||
$res = [];
|
||||
foreach ($m1 as $val) {
|
||||
|
@ -313,13 +270,8 @@ class mp
|
|||
|
||||
/**
|
||||
* Element-wise <b>division</b> of a vector with a scalar
|
||||
*
|
||||
* @param array $m1
|
||||
* @param float $m2
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function divs(array $m1, float $m2)
|
||||
public static function divs(array $m1, float $m2): array
|
||||
{
|
||||
$res = [];
|
||||
foreach ($m1 as $val) {
|
||||
|
@ -331,14 +283,8 @@ class mp
|
|||
|
||||
/**
|
||||
* Element-wise <b>addition</b> of a vector with a scalar
|
||||
*
|
||||
* @param array $m1
|
||||
* @param float $m2
|
||||
* @param int $mag
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function adds(array $m1, float $m2, int $mag = 1)
|
||||
public static function adds(array $m1, float $m2, int $mag = 1): array
|
||||
{
|
||||
$res = [];
|
||||
foreach ($m1 as $val) {
|
||||
|
@ -350,13 +296,8 @@ class mp
|
|||
|
||||
/**
|
||||
* Element-wise <b>subtraction</b> of a vector with a scalar
|
||||
*
|
||||
* @param array $m1
|
||||
* @param array $m2
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function subs(array $m1, array $m2)
|
||||
public static function subs(array $m1, float $m2): array
|
||||
{
|
||||
// Delegates to adds() with $mag = -1.
// NOTE(review): presumably adds() multiplies the scalar $m2 by $mag before adding it to each element - confirm against adds()'s body.
return self::adds($m1, $m2, -1);
|
||||
}
|
||||
|
|
|
@ -4,6 +4,9 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Helper\Optimizer;
|
||||
|
||||
use Closure;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
|
||||
/**
|
||||
* Batch version of Gradient Descent to optimize the weights
|
||||
* of a classifier given samples, targets and the objective function to minimize
|
||||
|
@ -13,18 +16,11 @@ class GD extends StochasticGD
|
|||
/**
|
||||
* Number of samples given
|
||||
*
|
||||
* @var int
|
||||
* @var int|null
|
||||
*/
|
||||
protected $sampleCount = null;
|
||||
protected $sampleCount;
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param \Closure $gradientCb
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function runOptimization(array $samples, array $targets, \Closure $gradientCb)
|
||||
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
|
||||
{
|
||||
$this->samples = $samples;
|
||||
$this->targets = $targets;
|
||||
|
@ -38,11 +34,11 @@ class GD extends StochasticGD
|
|||
$theta = $this->theta;
|
||||
|
||||
// Calculate update terms for each sample
|
||||
list($errors, $updates, $totalPenalty) = $this->gradient($theta);
|
||||
[$errors, $updates, $totalPenalty] = $this->gradient($theta);
|
||||
|
||||
$this->updateWeightsWithUpdates($updates, $totalPenalty);
|
||||
|
||||
$this->costValues[] = array_sum($errors)/$this->sampleCount;
|
||||
$this->costValues[] = array_sum($errors) / $this->sampleCount;
|
||||
|
||||
if ($this->earlyStop($theta)) {
|
||||
break;
|
||||
|
@ -57,22 +53,22 @@ class GD extends StochasticGD
|
|||
/**
|
||||
* Calculates gradient, cost function and penalty term for each sample
|
||||
* then returns them as an array of values
|
||||
*
|
||||
* @param array $theta
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function gradient(array $theta)
|
||||
protected function gradient(array $theta): array
|
||||
{
|
||||
$costs = [];
|
||||
$gradient= [];
|
||||
$gradient = [];
|
||||
$totalPenalty = 0;
|
||||
|
||||
if ($this->gradientCb === null) {
|
||||
throw new InvalidOperationException('Gradient callback is not defined');
|
||||
}
|
||||
|
||||
foreach ($this->samples as $index => $sample) {
|
||||
$target = $this->targets[$index];
|
||||
|
||||
$result = ($this->gradientCb)($theta, $sample, $target);
|
||||
list($cost, $grad, $penalty) = array_pad($result, 3, 0);
|
||||
[$cost, $grad, $penalty] = array_pad($result, 3, 0);
|
||||
|
||||
$costs[] = $cost;
|
||||
$gradient[] = $grad;
|
||||
|
@ -84,11 +80,7 @@ class GD extends StochasticGD
|
|||
return [$costs, $gradient, $totalPenalty];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $updates
|
||||
* @param float $penalty
|
||||
*/
|
||||
protected function updateWeightsWithUpdates(array $updates, float $penalty)
|
||||
protected function updateWeightsWithUpdates(array $updates, float $penalty): void
|
||||
{
|
||||
// Updates all weights at once
|
||||
for ($i = 0; $i <= $this->dimensions; ++$i) {
|
||||
|
@ -110,10 +102,8 @@ class GD extends StochasticGD
|
|||
|
||||
/**
|
||||
* Clears the optimizer internal vars after the optimization process.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
protected function clear()
|
||||
protected function clear(): void
|
||||
{
|
||||
$this->sampleCount = null;
|
||||
parent::clear();
|
||||
|
|
|
@ -4,6 +4,9 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Helper\Optimizer;
|
||||
|
||||
use Closure;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
abstract class Optimizer
|
||||
{
|
||||
/**
|
||||
|
@ -11,7 +14,7 @@ abstract class Optimizer
|
|||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $theta;
|
||||
protected $theta = [];
|
||||
|
||||
/**
|
||||
* Number of dimensions
|
||||
|
@ -22,8 +25,6 @@ abstract class Optimizer
|
|||
|
||||
/**
|
||||
* Inits a new instance of Optimizer for the given number of dimensions
|
||||
*
|
||||
* @param int $dimensions
|
||||
*/
|
||||
public function __construct(int $dimensions)
|
||||
{
|
||||
|
@ -32,23 +33,14 @@ abstract class Optimizer
|
|||
// Inits the weights randomly
|
||||
$this->theta = [];
|
||||
for ($i = 0; $i < $this->dimensions; ++$i) {
|
||||
$this->theta[] = rand() / (float) getrandmax();
|
||||
$this->theta[] = (random_int(0, PHP_INT_MAX) / PHP_INT_MAX) + 0.1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the weights manually
|
||||
*
|
||||
* @param array $theta
|
||||
*
|
||||
* @return $this
|
||||
*
|
||||
* @throws \Exception
|
||||
*/
|
||||
public function setInitialTheta(array $theta)
|
||||
public function setTheta(array $theta): self
|
||||
{
|
||||
if (count($theta) != $this->dimensions) {
|
||||
throw new \Exception("Number of values in the weights array should be $this->dimensions");
|
||||
if (count($theta) !== $this->dimensions) {
|
||||
throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $this->dimensions));
|
||||
}
|
||||
|
||||
$this->theta = $theta;
|
||||
|
@ -59,10 +51,6 @@ abstract class Optimizer
|
|||
/**
|
||||
* Executes the optimization with the given samples & targets
|
||||
* and returns the weights
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param \Closure $gradientCb
|
||||
*/
|
||||
abstract protected function runOptimization(array $samples, array $targets, \Closure $gradientCb);
|
||||
abstract public function runOptimization(array $samples, array $targets, Closure $gradientCb): array;
|
||||
}
|
||||
|
|
|
@ -4,6 +4,10 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Helper\Optimizer;
|
||||
|
||||
use Closure;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
|
||||
/**
|
||||
* Stochastic Gradient Descent optimization method
|
||||
* to find a solution for the equation A.ϴ = y where
|
||||
|
@ -29,9 +33,9 @@ class StochasticGD extends Optimizer
|
|||
* Callback function to get the gradient and cost value
|
||||
* for a specific set of theta (ϴ) and a pair of sample & target
|
||||
*
|
||||
* @var \Closure
|
||||
* @var \Closure|null
|
||||
*/
|
||||
protected $gradientCb = null;
|
||||
protected $gradientCb;
|
||||
|
||||
/**
|
||||
* Maximum number of iterations used to train the model
|
||||
|
@ -66,18 +70,17 @@ class StochasticGD extends Optimizer
|
|||
* @var bool
|
||||
*/
|
||||
protected $enableEarlyStop = true;
|
||||
|
||||
/**
|
||||
* List of values obtained by evaluating the cost function at each iteration
|
||||
* of the algorithm
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $costValues= [];
|
||||
protected $costValues = [];
|
||||
|
||||
/**
|
||||
* Initializes the SGD optimizer for the given number of dimensions
|
||||
*
|
||||
* @param int $dimensions
|
||||
*/
|
||||
public function __construct(int $dimensions)
|
||||
{
|
||||
|
@ -87,6 +90,17 @@ class StochasticGD extends Optimizer
|
|||
$this->dimensions = $dimensions;
|
||||
}
|
||||
|
||||
/**
 * Sets the weights (theta) manually.
 *
 * The array must hold exactly "dimensions + 1" values; any other size is rejected.
 * NOTE(review): the extra slot is presumably the bias weight at index 0 that
 * updateTheta() adjusts - confirm against the callers.
 *
 * @throws InvalidArgumentException when the number of values is not dimensions + 1
 */
public function setTheta(array $theta): Optimizer
{
    $given = count($theta);

    if ($given === $this->dimensions + 1) {
        $this->theta = $theta;

        return $this;
    }

    throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $this->dimensions + 1));
}
|
||||
|
||||
/**
|
||||
* Sets minimum value for the change in the theta values
|
||||
* between iterations to continue the iterations.<br>
|
||||
|
@ -94,8 +108,6 @@ class StochasticGD extends Optimizer
|
|||
* If change in the theta is less than given value then the
|
||||
* algorithm will stop training
|
||||
*
|
||||
* @param float $threshold
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setChangeThreshold(float $threshold = 1e-5)
|
||||
|
@ -109,8 +121,6 @@ class StochasticGD extends Optimizer
|
|||
* Enable/Disable early stopping by checking at each iteration
|
||||
* whether changes in theta or cost value are not large enough
|
||||
*
|
||||
* @param bool $enable
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setEarlyStop(bool $enable = true)
|
||||
|
@ -121,8 +131,6 @@ class StochasticGD extends Optimizer
|
|||
}
|
||||
|
||||
/**
|
||||
* @param float $learningRate
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setLearningRate(float $learningRate)
|
||||
|
@ -133,8 +141,6 @@ class StochasticGD extends Optimizer
|
|||
}
|
||||
|
||||
/**
|
||||
* @param int $maxIterations
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setMaxIterations(int $maxIterations)
|
||||
|
@ -150,14 +156,8 @@ class StochasticGD extends Optimizer
|
|||
*
|
||||
* The cost function to minimize and the gradient of the function are to be
|
||||
* handled by the callback function provided as the third parameter of the method.
|
||||
*
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param \Closure $gradientCb
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function runOptimization(array $samples, array $targets, \Closure $gradientCb)
|
||||
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
|
||||
{
|
||||
$this->samples = $samples;
|
||||
$this->targets = $targets;
|
||||
|
@ -176,7 +176,7 @@ class StochasticGD extends Optimizer
|
|||
|
||||
// Save the best theta in the "pocket" so that
|
||||
// any future set of theta worse than this will be disregarded
|
||||
if ($bestTheta == null || $cost <= $bestScore) {
|
||||
if ($bestTheta === null || $cost <= $bestScore) {
|
||||
$bestTheta = $theta;
|
||||
$bestScore = $cost;
|
||||
}
|
||||
|
@ -194,23 +194,33 @@ class StochasticGD extends Optimizer
|
|||
|
||||
// Solution in the pocket is better than or equal to the last state
|
||||
// so, we use this solution
|
||||
return $this->theta = $bestTheta;
|
||||
return $this->theta = (array) $bestTheta;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return float
|
||||
* Returns the list of cost values for each iteration executed in
|
||||
* last run of the optimization
|
||||
*/
|
||||
protected function updateTheta()
|
||||
public function getCostValues(): array
|
||||
{
|
||||
return $this->costValues;
|
||||
}
|
||||
|
||||
protected function updateTheta(): float
|
||||
{
|
||||
$jValue = 0.0;
|
||||
$theta = $this->theta;
|
||||
|
||||
if ($this->gradientCb === null) {
|
||||
throw new InvalidOperationException('Gradient callback is not defined');
|
||||
}
|
||||
|
||||
foreach ($this->samples as $index => $sample) {
|
||||
$target = $this->targets[$index];
|
||||
|
||||
$result = ($this->gradientCb)($theta, $sample, $target);
|
||||
|
||||
list($error, $gradient, $penalty) = array_pad($result, 3, 0);
|
||||
[$error, $gradient, $penalty] = array_pad($result, 3, 0);
|
||||
|
||||
// Update bias
|
||||
$this->theta[0] -= $this->learningRate * $gradient;
|
||||
|
@ -231,19 +241,17 @@ class StochasticGD extends Optimizer
|
|||
/**
|
||||
* Checks if the optimization is not effective enough and can be stopped
|
||||
* in case large enough changes in the solution do not happen
|
||||
*
|
||||
* @param array $oldTheta
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
protected function earlyStop($oldTheta)
|
||||
protected function earlyStop(array $oldTheta): bool
|
||||
{
|
||||
// Check for early stop: No change larger than threshold (default 1e-5)
|
||||
$diff = array_map(
|
||||
function ($w1, $w2) {
|
||||
return abs($w1 - $w2) > $this->threshold ? 1 : 0;
|
||||
},
|
||||
$oldTheta, $this->theta);
|
||||
$oldTheta,
|
||||
$this->theta
|
||||
);
|
||||
|
||||
if (array_sum($diff) == 0) {
|
||||
return true;
|
||||
|
@ -251,30 +259,17 @@ class StochasticGD extends Optimizer
|
|||
|
||||
// Check if the last two cost values are almost the same
|
||||
$costs = array_slice($this->costValues, -2);
|
||||
if (count($costs) == 2 && abs($costs[1] - $costs[0]) < $this->threshold) {
|
||||
if (count($costs) === 2 && abs($costs[1] - $costs[0]) < $this->threshold) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the list of cost values for each iteration executed in
|
||||
* last run of the optimization
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public function getCostValues()
|
||||
{
|
||||
return $this->costValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears the optimizer internal vars after the optimization process.
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
protected function clear()
|
||||
protected function clear(): void
|
||||
{
|
||||
$this->samples = [];
|
||||
$this->targets = [];
|
||||
|
|
|
@ -7,8 +7,6 @@ namespace Phpml\Helper;
|
|||
trait Predictable
|
||||
{
|
||||
/**
|
||||
* @param array $samples
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
public function predict(array $samples)
|
||||
|
@ -26,8 +24,6 @@ trait Predictable
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $sample
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
abstract protected function predictSample(array $sample);
|
||||
|
|
|
@ -16,11 +16,7 @@ trait Trainable
|
|||
*/
|
||||
private $targets = [];
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
*/
|
||||
public function train(array $samples, array $targets)
|
||||
public function train(array $samples, array $targets): void
|
||||
{
|
||||
$this->samples = array_merge($this->samples, $samples);
|
||||
$this->targets = array_merge($this->targets, $targets);
|
||||
|
|
|
@ -6,10 +6,5 @@ namespace Phpml;
|
|||
|
||||
interface IncrementalEstimator
|
||||
{
|
||||
/**
|
||||
* @param array $samples
|
||||
* @param array $targets
|
||||
* @param array $labels
|
||||
*/
|
||||
public function partialTrain(array $samples, array $targets, array $labels = []);
|
||||
public function partialTrain(array $samples, array $targets, array $labels = []): void;
|
||||
}
|
||||
|
|
42
lib/mlbackend/php/phpml/src/Phpml/Math/Comparison.php
Normal file
42
lib/mlbackend/php/phpml/src/Phpml/Math/Comparison.php
Normal file
|
@ -0,0 +1,42 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Math;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
class Comparison
{
    /**
     * Evaluates "$a <operator> $b", where the operator is supplied as a string.
     *
     * Supported operators: '>', '>=', '=', '==', '===', '<=', '<', '!=', '<>', '!=='.
     * '=' is an alias of the loose '==' comparison and '<>' an alias of the loose '!='.
     *
     * @param mixed $a left-hand operand
     * @param mixed $b right-hand operand
     *
     * @throws InvalidArgumentException when the operator is not one of the supported strings
     */
    public static function compare($a, $b, string $operator): bool
    {
        // Loose (in)equality and their aliases.
        if ($operator === '=' || $operator === '==') {
            return $a == $b;
        }

        if ($operator === '!=' || $operator === '<>') {
            return $a != $b;
        }

        // Strict (in)equality.
        if ($operator === '===') {
            return $a === $b;
        }

        if ($operator === '!==') {
            return $a !== $b;
        }

        // Ordering comparisons.
        if ($operator === '>') {
            return $a > $b;
        }

        if ($operator === '>=') {
            return $a >= $b;
        }

        if ($operator === '<=') {
            return $a <= $b;
        }

        if ($operator === '<') {
            return $a < $b;
        }

        throw new InvalidArgumentException(sprintf('Invalid operator "%s" provided', $operator));
    }
}
|
|
@ -6,11 +6,5 @@ namespace Phpml\Math;
|
|||
|
||||
interface Distance
|
||||
{
|
||||
/**
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
public function distance(array $a, array $b): float;
|
||||
}
|
||||
|
|
|
@ -4,32 +4,16 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Math\Distance;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Distance;
|
||||
|
||||
class Chebyshev implements Distance
|
||||
/**
|
||||
* Class Chebyshev
|
||||
*/
|
||||
class Chebyshev extends Distance
|
||||
{
|
||||
/**
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
* {@inheritdoc}
|
||||
*/
|
||||
public function distance(array $a, array $b): float
|
||||
{
|
||||
if (count($a) !== count($b)) {
|
||||
throw InvalidArgumentException::arraySizeNotMatch();
|
||||
}
|
||||
|
||||
$differences = [];
|
||||
$count = count($a);
|
||||
|
||||
for ($i = 0; $i < $count; ++$i) {
|
||||
$differences[] = abs($a[$i] - $b[$i]);
|
||||
}
|
||||
|
||||
return max($differences);
|
||||
return max($this->deltas($a, $b));
|
||||
}
|
||||
}
|
||||
|
|
61
lib/mlbackend/php/phpml/src/Phpml/Math/Distance/Distance.php
Normal file
61
lib/mlbackend/php/phpml/src/Phpml/Math/Distance/Distance.php
Normal file
|
@ -0,0 +1,61 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Math\Distance;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Distance as DistanceInterface;
|
||||
|
||||
/**
|
||||
* Class Distance
|
||||
*/
|
||||
abstract class Distance implements DistanceInterface
{
    /**
     * Exponent applied to every coordinate difference; its reciprocal is applied
     * to the accumulated sum in distance().
     *
     * @var float|int
     */
    public $norm;

    /**
     * Stores the exponent used by distance(); defaults to 3.0.
     */
    public function __construct(float $norm = 3.0)
    {
        $this->norm = $norm;
    }

    /**
     * Raises each absolute coordinate difference to the power of $norm, sums the
     * results and returns that sum raised to the power of 1 / $norm.
     *
     * @throws InvalidArgumentException when the arrays differ in size
     */
    public function distance(array $a, array $b): float
    {
        $deltas = $this->deltas($a, $b);

        $total = array_sum(array_map(function ($delta) {
            return $delta ** $this->norm;
        }, $deltas));

        return $total ** (1 / $this->norm);
    }

    /**
     * Returns the absolute differences |a[i] - b[i]| for i = 0 .. count(a) - 1.
     *
     * Elements are read by integer offset, so both arrays are expected to be
     * zero-based lists.
     *
     * @throws InvalidArgumentException when the arrays differ in size
     */
    protected function deltas(array $a, array $b): array
    {
        $size = count($a);

        if (count($b) !== $size) {
            throw new InvalidArgumentException('Size of given arrays does not match');
        }

        $differences = [];
        $index = 0;

        while ($index < $size) {
            $differences[] = abs($a[$index] - $b[$index]);
            ++$index;
        }

        return $differences;
    }
}
|
|
@ -4,41 +4,25 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Math\Distance;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Distance;
|
||||
|
||||
class Euclidean implements Distance
|
||||
/**
|
||||
* Class Euclidean
|
||||
*
|
||||
* L^2 Metric.
|
||||
*/
|
||||
class Euclidean extends Distance
|
||||
{
|
||||
/**
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
* Euclidean constructor.
|
||||
*/
|
||||
public function distance(array $a, array $b): float
|
||||
public function __construct()
|
||||
{
|
||||
if (count($a) !== count($b)) {
|
||||
throw InvalidArgumentException::arraySizeNotMatch();
|
||||
}
|
||||
|
||||
$distance = 0;
|
||||
|
||||
foreach ($a as $i => $val) {
|
||||
$distance += ($val - $b[$i]) ** 2;
|
||||
}
|
||||
|
||||
return sqrt((float) $distance);
|
||||
parent::__construct(2.0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Square of Euclidean distance
|
||||
*
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return float
|
||||
* @throws \Phpml\Exception\InvalidArgumentException
|
||||
*/
|
||||
public function sqDistance(array $a, array $b): float
|
||||
{
|
||||
|
|
|
@ -4,32 +4,18 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Math\Distance;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Distance;
|
||||
|
||||
class Manhattan implements Distance
|
||||
/**
|
||||
* Class Manhattan
|
||||
*
|
||||
* L^1 Metric.
|
||||
*/
|
||||
class Manhattan extends Distance
|
||||
{
|
||||
/**
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
* Manhattan constructor.
|
||||
*/
|
||||
public function distance(array $a, array $b): float
|
||||
public function __construct()
|
||||
{
|
||||
if (count($a) !== count($b)) {
|
||||
throw InvalidArgumentException::arraySizeNotMatch();
|
||||
}
|
||||
|
||||
$distance = 0;
|
||||
$count = count($a);
|
||||
|
||||
for ($i = 0; $i < $count; ++$i) {
|
||||
$distance += abs($a[$i] - $b[$i]);
|
||||
}
|
||||
|
||||
return $distance;
|
||||
parent::__construct(1.0);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,45 +4,11 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Math\Distance;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Distance;
|
||||
|
||||
class Minkowski implements Distance
|
||||
/**
|
||||
* Class Minkowski
|
||||
*
|
||||
* L^n Metric.
|
||||
*/
|
||||
class Minkowski extends Distance
|
||||
{
|
||||
/**
|
||||
* @var float
|
||||
*/
|
||||
private $lambda;
|
||||
|
||||
/**
|
||||
* @param float $lambda
|
||||
*/
|
||||
public function __construct(float $lambda = 3.0)
|
||||
{
|
||||
$this->lambda = $lambda;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function distance(array $a, array $b): float
|
||||
{
|
||||
if (count($a) !== count($b)) {
|
||||
throw InvalidArgumentException::arraySizeNotMatch();
|
||||
}
|
||||
|
||||
$distance = 0;
|
||||
$count = count($a);
|
||||
|
||||
for ($i = 0; $i < $count; ++$i) {
|
||||
$distance += pow(abs($a[$i] - $b[$i]), $this->lambda);
|
||||
}
|
||||
|
||||
return (float)pow($distance, 1 / $this->lambda);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,10 +7,10 @@ namespace Phpml\Math;
|
|||
interface Kernel
|
||||
{
|
||||
/**
|
||||
* @param float $a
|
||||
* @param float $b
|
||||
* @param float|array $a
|
||||
* @param float|array $b
|
||||
*
|
||||
* @return float
|
||||
* @return float|array
|
||||
*/
|
||||
public function compute($a, $b);
|
||||
}
|
||||
|
|
|
@ -14,9 +14,6 @@ class RBF implements Kernel
|
|||
*/
|
||||
private $gamma;
|
||||
|
||||
/**
|
||||
* @param float $gamma
|
||||
*/
|
||||
public function __construct(float $gamma)
|
||||
{
|
||||
$this->gamma = $gamma;
|
||||
|
@ -25,15 +22,12 @@ class RBF implements Kernel
|
|||
/**
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
public function compute($a, $b)
|
||||
public function compute($a, $b): float
|
||||
{
|
||||
$score = 2 * Product::scalar($a, $b);
|
||||
$squares = Product::scalar($a, $a) + Product::scalar($b, $b);
|
||||
$result = exp(-$this->gamma * ($squares - $score));
|
||||
|
||||
return $result;
|
||||
return exp(-$this->gamma * ($squares - $score));
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,68 +1,76 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* @package JAMA
|
||||
* @package JAMA
|
||||
*
|
||||
* For an m-by-n matrix A with m >= n, the LU decomposition is an m-by-n
|
||||
* unit lower triangular matrix L, an n-by-n upper triangular matrix U,
|
||||
* and a permutation vector piv of length m so that A(piv,:) = L*U.
|
||||
* If m < n, then L is m-by-m and U is m-by-n.
|
||||
* For an m-by-n matrix A with m >= n, the LU decomposition is an m-by-n
|
||||
* unit lower triangular matrix L, an n-by-n upper triangular matrix U,
|
||||
* and a permutation vector piv of length m so that A(piv,:) = L*U.
|
||||
* If m < n, then L is m-by-m and U is m-by-n.
|
||||
*
|
||||
* The LU decomposition with pivoting always exists, even if the matrix is
|
||||
* singular, so the constructor will never fail. The primary use of the
|
||||
* LU decomposition is in the solution of square systems of simultaneous
|
||||
* linear equations. This will fail if isNonsingular() returns false.
|
||||
* The LU decomposition with pivoting always exists, even if the matrix is
|
||||
* singular, so the constructor will never fail. The primary use of the
|
||||
* LU decomposition is in the solution of square systems of simultaneous
|
||||
* linear equations. This will fail if isNonsingular() returns false.
|
||||
*
|
||||
* @author Paul Meagher
|
||||
* @author Bartosz Matosiuk
|
||||
* @author Michael Bommarito
|
||||
* @version 1.1
|
||||
* @license PHP v3.0
|
||||
* @author Paul Meagher
|
||||
* @author Bartosz Matosiuk
|
||||
* @author Michael Bommarito
|
||||
*
|
||||
* @version 1.1
|
||||
*
|
||||
* @license PHP v3.0
|
||||
*
|
||||
* Slightly changed to adapt the original code to PHP-ML library
|
||||
* @date 2017/04/24
|
||||
*
|
||||
* @author Mustafa Karabulut
|
||||
*/
|
||||
|
||||
namespace Phpml\Math\LinearAlgebra;
|
||||
|
||||
use Phpml\Math\Matrix;
|
||||
use Phpml\Exception\MatrixException;
|
||||
use Phpml\Math\Matrix;
|
||||
|
||||
class LUDecomposition
|
||||
{
|
||||
/**
|
||||
* Decomposition storage
|
||||
* @var array
|
||||
* Decomposition storage
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $LU = [];
|
||||
|
||||
/**
|
||||
* Row dimension.
|
||||
* @var int
|
||||
* Row dimension.
|
||||
*
|
||||
* @var int
|
||||
*/
|
||||
private $m;
|
||||
|
||||
/**
|
||||
* Column dimension.
|
||||
* @var int
|
||||
* Column dimension.
|
||||
*
|
||||
* @var int
|
||||
*/
|
||||
private $n;
|
||||
|
||||
/**
|
||||
* Pivot sign.
|
||||
* @var int
|
||||
* Pivot sign.
|
||||
*
|
||||
* @var int
|
||||
*/
|
||||
private $pivsign;
|
||||
|
||||
/**
|
||||
* Internal storage of pivot vector.
|
||||
* @var array
|
||||
* Internal storage of pivot vector.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $piv = [];
|
||||
|
||||
|
||||
/**
|
||||
* Constructs Structure to access L, U and piv.
|
||||
*
|
||||
|
@ -72,17 +80,18 @@ class LUDecomposition
|
|||
*/
|
||||
public function __construct(Matrix $A)
|
||||
{
|
||||
if ($A->getRows() != $A->getColumns()) {
|
||||
throw MatrixException::notSquareMatrix();
|
||||
if ($A->getRows() !== $A->getColumns()) {
|
||||
throw new MatrixException('Matrix is not square matrix');
|
||||
}
|
||||
|
||||
// Use a "left-looking", dot-product, Crout/Doolittle algorithm.
|
||||
$this->LU = $A->toArray();
|
||||
$this->m = $A->getRows();
|
||||
$this->n = $A->getColumns();
|
||||
$this->m = $A->getRows();
|
||||
$this->n = $A->getColumns();
|
||||
for ($i = 0; $i < $this->m; ++$i) {
|
||||
$this->piv[$i] = $i;
|
||||
}
|
||||
|
||||
$this->pivsign = 1;
|
||||
$LUcolj = [];
|
||||
|
||||
|
@ -92,6 +101,7 @@ class LUDecomposition
|
|||
for ($i = 0; $i < $this->m; ++$i) {
|
||||
$LUcolj[$i] = &$this->LU[$i][$j];
|
||||
}
|
||||
|
||||
// Apply previous transformations.
|
||||
for ($i = 0; $i < $this->m; ++$i) {
|
||||
$LUrowi = $this->LU[$i];
|
||||
|
@ -101,26 +111,31 @@ class LUDecomposition
|
|||
for ($k = 0; $k < $kmax; ++$k) {
|
||||
$s += $LUrowi[$k] * $LUcolj[$k];
|
||||
}
|
||||
|
||||
$LUrowi[$j] = $LUcolj[$i] -= $s;
|
||||
}
|
||||
|
||||
// Find pivot and exchange if necessary.
|
||||
$p = $j;
|
||||
for ($i = $j + 1; $i < $this->m; ++$i) {
|
||||
if (abs($LUcolj[$i]) > abs($LUcolj[$p])) {
|
||||
if (abs($LUcolj[$i] ?? 0) > abs($LUcolj[$p] ?? 0)) {
|
||||
$p = $i;
|
||||
}
|
||||
}
|
||||
|
||||
if ($p != $j) {
|
||||
for ($k = 0; $k < $this->n; ++$k) {
|
||||
$t = $this->LU[$p][$k];
|
||||
$this->LU[$p][$k] = $this->LU[$j][$k];
|
||||
$this->LU[$j][$k] = $t;
|
||||
}
|
||||
|
||||
$k = $this->piv[$p];
|
||||
$this->piv[$p] = $this->piv[$j];
|
||||
$this->piv[$j] = $k;
|
||||
$this->pivsign = $this->pivsign * -1;
|
||||
$this->pivsign *= -1;
|
||||
}
|
||||
|
||||
// Compute multipliers.
|
||||
if (($j < $this->m) && ($this->LU[$j][$j] != 0.0)) {
|
||||
for ($i = $j + 1; $i < $this->m; ++$i) {
|
||||
|
@ -128,15 +143,14 @@ class LUDecomposition
|
|||
}
|
||||
}
|
||||
}
|
||||
} // function __construct()
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Get lower triangular factor.
|
||||
*
|
||||
* @return Matrix Lower triangular factor
|
||||
*/
|
||||
public function getL()
|
||||
public function getL(): Matrix
|
||||
{
|
||||
$L = [];
|
||||
for ($i = 0; $i < $this->m; ++$i) {
|
||||
|
@ -150,16 +164,16 @@ class LUDecomposition
|
|||
}
|
||||
}
|
||||
}
|
||||
return new Matrix($L);
|
||||
} // function getL()
|
||||
|
||||
return new Matrix($L);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get upper triangular factor.
|
||||
*
|
||||
* @return Matrix Upper triangular factor
|
||||
*/
|
||||
public function getU()
|
||||
public function getU(): Matrix
|
||||
{
|
||||
$U = [];
|
||||
for ($i = 0; $i < $this->n; ++$i) {
|
||||
|
@ -171,38 +185,36 @@ class LUDecomposition
|
|||
}
|
||||
}
|
||||
}
|
||||
return new Matrix($U);
|
||||
} // function getU()
|
||||
|
||||
return new Matrix($U);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return pivot permutation vector.
|
||||
*
|
||||
* @return array Pivot vector
|
||||
*/
|
||||
public function getPivot()
|
||||
public function getPivot(): array
|
||||
{
|
||||
return $this->piv;
|
||||
} // function getPivot()
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Alias for getPivot
|
||||
*
|
||||
* @see getPivot
|
||||
*/
|
||||
public function getDoublePivot()
|
||||
public function getDoublePivot(): array
|
||||
{
|
||||
return $this->getPivot();
|
||||
} // function getDoublePivot()
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Is the matrix nonsingular?
|
||||
*
|
||||
* @return true if U, and hence A, is nonsingular.
|
||||
* @return bool true if U, and hence A, is nonsingular.
|
||||
*/
|
||||
public function isNonsingular()
|
||||
public function isNonsingular(): bool
|
||||
{
|
||||
for ($j = 0; $j < $this->n; ++$j) {
|
||||
if ($this->LU[$j][$j] == 0) {
|
||||
|
@ -211,30 +223,17 @@ class LUDecomposition
|
|||
}
|
||||
|
||||
return true;
|
||||
} // function isNonsingular()
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Count determinants
|
||||
*
|
||||
* @return float|int d matrix determinant
|
||||
*
|
||||
* @throws MatrixException
|
||||
*/
|
||||
public function det()
|
||||
public function det(): float
|
||||
{
|
||||
if ($this->m !== $this->n) {
|
||||
throw MatrixException::notSquareMatrix();
|
||||
}
|
||||
|
||||
$d = $this->pivsign;
|
||||
for ($j = 0; $j < $this->n; ++$j) {
|
||||
$d *= $this->LU[$j][$j];
|
||||
}
|
||||
|
||||
return $d;
|
||||
} // function det()
|
||||
|
||||
return (float) $d;
|
||||
}
|
||||
|
||||
/**
|
||||
* Solve A*X = B
|
||||
|
@ -245,19 +244,19 @@ class LUDecomposition
|
|||
*
|
||||
* @throws MatrixException
|
||||
*/
|
||||
public function solve(Matrix $B)
|
||||
public function solve(Matrix $B): array
|
||||
{
|
||||
if ($B->getRows() != $this->m) {
|
||||
throw MatrixException::notSquareMatrix();
|
||||
throw new MatrixException('Matrix is not square matrix');
|
||||
}
|
||||
|
||||
if (!$this->isNonsingular()) {
|
||||
throw MatrixException::singularMatrix();
|
||||
throw new MatrixException('Matrix is singular');
|
||||
}
|
||||
|
||||
// Copy right hand side with pivoting
|
||||
$nx = $B->getColumns();
|
||||
$X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx - 1);
|
||||
$X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx - 1);
|
||||
// Solve L*Y = B(piv,:)
|
||||
for ($k = 0; $k < $this->n; ++$k) {
|
||||
for ($i = $k + 1; $i < $this->n; ++$i) {
|
||||
|
@ -266,29 +265,24 @@ class LUDecomposition
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Solve U*X = Y;
|
||||
for ($k = $this->n - 1; $k >= 0; --$k) {
|
||||
for ($j = 0; $j < $nx; ++$j) {
|
||||
$X[$k][$j] /= $this->LU[$k][$k];
|
||||
}
|
||||
|
||||
for ($i = 0; $i < $k; ++$i) {
|
||||
for ($j = 0; $j < $nx; ++$j) {
|
||||
$X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return $X;
|
||||
} // function solve()
|
||||
|
||||
/**
|
||||
* @param array $matrix
|
||||
* @param array $RL
|
||||
* @param int $j0
|
||||
* @param int $jF
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF)
|
||||
return $X;
|
||||
}
|
||||
|
||||
protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF): array
|
||||
{
|
||||
$m = count($RL);
|
||||
$n = $jF - $j0;
|
||||
|
@ -302,4 +296,4 @@ class LUDecomposition
|
|||
|
||||
return $R;
|
||||
}
|
||||
} // class LUDecomposition
|
||||
}
|
||||
|
|
|
@ -4,16 +4,16 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Math;
|
||||
|
||||
use Phpml\Math\LinearAlgebra\LUDecomposition;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\MatrixException;
|
||||
use Phpml\Math\LinearAlgebra\LUDecomposition;
|
||||
|
||||
class Matrix
|
||||
{
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $matrix;
|
||||
private $matrix = [];
|
||||
|
||||
/**
|
||||
* @var int
|
||||
|
@ -31,9 +31,6 @@ class Matrix
|
|||
private $determinant;
|
||||
|
||||
/**
|
||||
* @param array $matrix
|
||||
* @param bool $validate
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(array $matrix, bool $validate = true)
|
||||
|
@ -51,7 +48,7 @@ class Matrix
|
|||
if ($validate) {
|
||||
for ($i = 0; $i < $this->rows; ++$i) {
|
||||
if (count($matrix[$i]) !== $this->columns) {
|
||||
throw InvalidArgumentException::matrixDimensionsDidNotMatch();
|
||||
throw new InvalidArgumentException('Matrix dimensions did not match');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -59,12 +56,7 @@ class Matrix
|
|||
$this->matrix = $matrix;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $array
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
public static function fromFlatArray(array $array)
|
||||
public static function fromFlatArray(array $array): self
|
||||
{
|
||||
$matrix = [];
|
||||
foreach ($array as $value) {
|
||||
|
@ -74,55 +66,38 @@ class Matrix
|
|||
return new self($matrix);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function toArray()
|
||||
public function toArray(): array
|
||||
{
|
||||
return $this->matrix;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return float
|
||||
*/
|
||||
public function toScalar()
|
||||
public function toScalar(): float
|
||||
{
|
||||
return $this->matrix[0][0];
|
||||
}
|
||||
|
||||
/**
|
||||
* @return int
|
||||
*/
|
||||
public function getRows()
|
||||
public function getRows(): int
|
||||
{
|
||||
return $this->rows;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return int
|
||||
*/
|
||||
public function getColumns()
|
||||
public function getColumns(): int
|
||||
{
|
||||
return $this->columns;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param $column
|
||||
*
|
||||
* @return array
|
||||
*
|
||||
* @throws MatrixException
|
||||
*/
|
||||
public function getColumnValues($column)
|
||||
public function getColumnValues(int $column): array
|
||||
{
|
||||
if ($column >= $this->columns) {
|
||||
throw MatrixException::columnOutOfRange();
|
||||
throw new MatrixException('Column out of range');
|
||||
}
|
||||
|
||||
return array_column($this->matrix, $column);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return float|int
|
||||
*
|
||||
|
@ -130,12 +105,12 @@ class Matrix
|
|||
*/
|
||||
public function getDeterminant()
|
||||
{
|
||||
if ($this->determinant) {
|
||||
if ($this->determinant !== null) {
|
||||
return $this->determinant;
|
||||
}
|
||||
|
||||
if (!$this->isSquare()) {
|
||||
throw MatrixException::notSquareMatrix();
|
||||
throw new MatrixException('Matrix is not square matrix');
|
||||
}
|
||||
|
||||
$lu = new LUDecomposition($this);
|
||||
|
@ -143,20 +118,14 @@ class Matrix
|
|||
return $this->determinant = $lu->det();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return bool
|
||||
*/
|
||||
public function isSquare()
|
||||
public function isSquare(): bool
|
||||
{
|
||||
return $this->columns === $this->rows;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Matrix
|
||||
*/
|
||||
public function transpose()
|
||||
public function transpose(): self
|
||||
{
|
||||
if ($this->rows == 1) {
|
||||
if ($this->rows === 1) {
|
||||
$matrix = array_map(function ($el) {
|
||||
return [$el];
|
||||
}, $this->matrix[0]);
|
||||
|
@ -167,28 +136,30 @@ class Matrix
|
|||
return new self($matrix, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param Matrix $matrix
|
||||
*
|
||||
* @return Matrix
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function multiply(Matrix $matrix)
|
||||
public function multiply(self $matrix): self
|
||||
{
|
||||
if ($this->columns != $matrix->getRows()) {
|
||||
throw InvalidArgumentException::inconsistentMatrixSupplied();
|
||||
if ($this->columns !== $matrix->getRows()) {
|
||||
throw new InvalidArgumentException('Inconsistent matrix supplied');
|
||||
}
|
||||
|
||||
$array1 = $this->toArray();
|
||||
$array2 = $matrix->toArray();
|
||||
$colCount = $matrix->columns;
|
||||
|
||||
/*
|
||||
- To speed-up multiplication, we need to avoid use of array index operator [ ] as much as possible( See #255 for details)
|
||||
- A combination of "foreach" and "array_column" works much faster than accessing the array via index operator
|
||||
*/
|
||||
$product = [];
|
||||
$multiplier = $matrix->toArray();
|
||||
for ($i = 0; $i < $this->rows; ++$i) {
|
||||
$columns = $matrix->getColumns();
|
||||
for ($j = 0; $j < $columns; ++$j) {
|
||||
$product[$i][$j] = 0;
|
||||
for ($k = 0; $k < $this->columns; ++$k) {
|
||||
$product[$i][$j] += $this->matrix[$i][$k] * $multiplier[$k][$j];
|
||||
foreach ($array1 as $row => $rowData) {
|
||||
for ($col = 0; $col < $colCount; ++$col) {
|
||||
$columnData = array_column($array2, $col);
|
||||
$sum = 0;
|
||||
foreach ($rowData as $key => $valueData) {
|
||||
$sum += $valueData * $columnData[$key];
|
||||
}
|
||||
|
||||
$product[$row][$col] = $sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -196,11 +167,9 @@ class Matrix
|
|||
}
|
||||
|
||||
/**
|
||||
* @param $value
|
||||
*
|
||||
* @return Matrix
|
||||
* @param float|int $value
|
||||
*/
|
||||
public function divideByScalar($value)
|
||||
public function divideByScalar($value): self
|
||||
{
|
||||
$newMatrix = [];
|
||||
for ($i = 0; $i < $this->rows; ++$i) {
|
||||
|
@ -213,11 +182,9 @@ class Matrix
|
|||
}
|
||||
|
||||
/**
|
||||
* @param $value
|
||||
*
|
||||
* @return Matrix
|
||||
* @param float|int $value
|
||||
*/
|
||||
public function multiplyByScalar($value)
|
||||
public function multiplyByScalar($value): self
|
||||
{
|
||||
$newMatrix = [];
|
||||
for ($i = 0; $i < $this->rows; ++$i) {
|
||||
|
@ -231,37 +198,106 @@ class Matrix
|
|||
|
||||
/**
|
||||
* Element-wise addition of the matrix with another one
|
||||
*
|
||||
* @param Matrix $other
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
public function add(Matrix $other)
|
||||
public function add(self $other): self
|
||||
{
|
||||
return $this->_add($other);
|
||||
}
|
||||
|
||||
/**
|
||||
* Element-wise subtracting of another matrix from this one
|
||||
*
|
||||
* @param Matrix $other
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
public function subtract(Matrix $other)
|
||||
public function subtract(self $other): self
|
||||
{
|
||||
return $this->_add($other, -1);
|
||||
}
|
||||
|
||||
/**
 * Returns the inverse of this matrix.
 *
 * The inverse is obtained by LU-decomposing this matrix and solving the
 * decomposition against the matrix returned by getIdentity().
 *
 * @throws MatrixException when the matrix is not square
 */
public function inverse(): self
{
    if (!$this->isSquare()) {
        throw new MatrixException('Matrix is not square matrix');
    }

    $decomposition = new LUDecomposition($this);

    return new self($decomposition->solve($this->getIdentity()), false);
}
|
||||
|
||||
/**
 * Builds a new matrix from this one with the given row and column left out.
 *
 * The remaining cells keep their relative order and are re-indexed from 0.
 *
 * @param int $row    zero-based index of the row to drop
 * @param int $column zero-based index of the column to drop
 */
public function crossOut(int $row, int $column): self
{
    $remaining = [];
    $targetRow = 0;
    for ($i = 0; $i < $this->rows; ++$i) {
        if ($row == $i) {
            // Crossed-out row: contributes nothing to the result.
            continue;
        }

        $targetColumn = 0;
        for ($j = 0; $j < $this->columns; ++$j) {
            if ($column == $j) {
                // Crossed-out column: skip this cell.
                continue;
            }

            $remaining[$targetRow][$targetColumn] = $this->matrix[$i][$j];
            ++$targetColumn;
        }

        ++$targetRow;
    }

    return new self($remaining, false);
}
|
||||
|
||||
/**
 * Tells whether this matrix is singular, i.e. whether the value returned by
 * getDeterminant() compares (loosely) equal to zero.
 */
public function isSingular(): bool
{
    $determinant = $this->getDeterminant();

    return $determinant == 0;
}
|
||||
|
||||
/**
 * Frobenius norm (Hilbert–Schmidt norm, Euclidean norm) (‖A‖F)
 * Square root of the sum of the squares of all elements.
 *
 * https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm
 *
 *          _______________
 *         / ᵐ   ⁿ
 * ‖A‖F = √  Σ   Σ  |aᵢⱼ|²
 *          ᵢ₌₁ ⱼ₌₁
 */
public function frobeniusNorm(): float
{
    $total = 0;
    for ($row = 0; $row < $this->rows; ++$row) {
        for ($col = 0; $col < $this->columns; ++$col) {
            $entry = $this->matrix[$row][$col];
            $total += $entry ** 2;
        }
    }

    return $total ** .5;
}
|
||||
|
||||
/**
 * Returns the transpose of the given array.
 *
 * The array is wrapped in a Matrix, transposed and converted back to a
 * plain array.
 */
public static function transposeArray(array $array): array
{
    $matrix = new self($array, false);
    $transposed = $matrix->transpose();

    return $transposed->toArray();
}
|
||||
|
||||
/**
 * Returns the dot product of two arrays<br>
 * Matrix::dot(x, y) ==> x.y'
 *
 * Both arrays are wrapped in matrices, the first is multiplied by the
 * transpose of the second, and the first row of the product is returned.
 */
public static function dot(array $array1, array $array2): array
{
    $left = new self($array1, false);
    $right = new self($array2, false);

    $product = $left->multiply($right->transpose())->toArray();

    return $product[0];
}
|
||||
|
||||
/**
|
||||
* Element-wise addition or subtraction depending on the given sign parameter
|
||||
*
|
||||
* @param Matrix $other
|
||||
* @param int $sign
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
protected function _add(Matrix $other, $sign = 1)
|
||||
private function _add(self $other, int $sign = 1): self
|
||||
{
|
||||
$a1 = $this->toArray();
|
||||
$a2 = $other->toArray();
|
||||
|
@ -276,30 +312,10 @@ class Matrix
|
|||
return new self($newMatrix, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Matrix
|
||||
*
|
||||
* @throws MatrixException
|
||||
*/
|
||||
public function inverse()
|
||||
{
|
||||
if (!$this->isSquare()) {
|
||||
throw MatrixException::notSquareMatrix();
|
||||
}
|
||||
|
||||
$LU = new LUDecomposition($this);
|
||||
$identity = $this->getIdentity();
|
||||
$inverse = $LU->solve($identity);
|
||||
|
||||
return new self($inverse, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns diagonal identity matrix of the same size of this matrix
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
protected function getIdentity()
|
||||
private function getIdentity(): self
|
||||
{
|
||||
$array = array_fill(0, $this->rows, array_fill(0, $this->columns, 0));
|
||||
for ($i = 0; $i < $this->rows; ++$i) {
|
||||
|
@ -308,67 +324,4 @@ class Matrix
|
|||
|
||||
return new self($array, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param int $row
|
||||
* @param int $column
|
||||
*
|
||||
* @return Matrix
|
||||
*/
|
||||
public function crossOut(int $row, int $column)
|
||||
{
|
||||
$newMatrix = [];
|
||||
$r = 0;
|
||||
for ($i = 0; $i < $this->rows; ++$i) {
|
||||
$c = 0;
|
||||
if ($row != $i) {
|
||||
for ($j = 0; $j < $this->columns; ++$j) {
|
||||
if ($column != $j) {
|
||||
$newMatrix[$r][$c] = $this->matrix[$i][$j];
|
||||
++$c;
|
||||
}
|
||||
}
|
||||
++$r;
|
||||
}
|
||||
}
|
||||
|
||||
return new self($newMatrix, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return bool
|
||||
*/
|
||||
public function isSingular() : bool
|
||||
{
|
||||
return 0 == $this->getDeterminant();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the transpose of given array
|
||||
*
|
||||
* @param array $array
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function transposeArray(array $array)
|
||||
{
|
||||
return (new self($array, false))->transpose()->toArray();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the dot product of two arrays<br>
|
||||
* Matrix::dot(x, y) ==> x.y'
|
||||
*
|
||||
* @param array $array1
|
||||
* @param array $array2
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function dot(array $array1, array $array2)
|
||||
{
|
||||
$m1 = new self($array1, false);
|
||||
$m2 = new self($array2, false);
|
||||
|
||||
return $m1->multiply($m2->transpose())->toArray()[0];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,9 +7,6 @@ namespace Phpml\Math;
|
|||
class Product
|
||||
{
|
||||
/**
|
||||
* @param array $a
|
||||
* @param array $b
|
||||
*
|
||||
* @return mixed
|
||||
*/
|
||||
public static function scalar(array $a, array $b)
|
||||
|
@ -17,7 +14,7 @@ class Product
|
|||
$product = 0;
|
||||
foreach ($a as $index => $value) {
|
||||
if (is_numeric($value) && is_numeric($b[$index])) {
|
||||
$product += $value * $b[$index];
|
||||
$product += (float) $value * (float) $b[$index];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -4,15 +4,18 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Math;
|
||||
|
||||
class Set implements \IteratorAggregate
|
||||
use ArrayIterator;
|
||||
use IteratorAggregate;
|
||||
|
||||
class Set implements IteratorAggregate
|
||||
{
|
||||
/**
|
||||
* @var string[]|int[]|float[]
|
||||
* @var string[]|int[]|float[]|bool[]
|
||||
*/
|
||||
private $elements;
|
||||
private $elements = [];
|
||||
|
||||
/**
|
||||
* @param string[]|int[]|float[] $elements
|
||||
* @param string[]|int[]|float[]|bool[] $elements
|
||||
*/
|
||||
public function __construct(array $elements = [])
|
||||
{
|
||||
|
@ -21,39 +24,24 @@ class Set implements \IteratorAggregate
|
|||
|
||||
/**
|
||||
* Creates the union of A and B.
|
||||
*
|
||||
* @param Set $a
|
||||
* @param Set $b
|
||||
*
|
||||
* @return Set
|
||||
*/
|
||||
public static function union(Set $a, Set $b) : Set
|
||||
public static function union(self $a, self $b): self
|
||||
{
|
||||
return new self(array_merge($a->toArray(), $b->toArray()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the intersection of A and B.
|
||||
*
|
||||
* @param Set $a
|
||||
* @param Set $b
|
||||
*
|
||||
* @return Set
|
||||
*/
|
||||
public static function intersection(Set $a, Set $b) : Set
|
||||
public static function intersection(self $a, self $b): self
|
||||
{
|
||||
return new self(array_intersect($a->toArray(), $b->toArray()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the difference of A and B.
|
||||
*
|
||||
* @param Set $a
|
||||
* @param Set $b
|
||||
*
|
||||
* @return Set
|
||||
*/
|
||||
public static function difference(Set $a, Set $b) : Set
|
||||
public static function difference(self $a, self $b): self
|
||||
{
|
||||
return new self(array_diff($a->toArray(), $b->toArray()));
|
||||
}
|
||||
|
@ -61,12 +49,9 @@ class Set implements \IteratorAggregate
|
|||
/**
|
||||
* Creates the Cartesian product of A and B.
|
||||
*
|
||||
* @param Set $a
|
||||
* @param Set $b
|
||||
*
|
||||
* @return Set[]
|
||||
*/
|
||||
public static function cartesian(Set $a, Set $b) : array
|
||||
public static function cartesian(self $a, self $b): array
|
||||
{
|
||||
$cartesian = [];
|
||||
|
||||
|
@ -82,11 +67,9 @@ class Set implements \IteratorAggregate
|
|||
/**
|
||||
* Creates the power set of A.
|
||||
*
|
||||
* @param Set $a
|
||||
*
|
||||
* @return Set[]
|
||||
*/
|
||||
public static function power(Set $a) : array
|
||||
public static function power(self $a): array
|
||||
{
|
||||
$power = [new self()];
|
||||
|
||||
|
@ -100,35 +83,17 @@ class Set implements \IteratorAggregate
|
|||
}
|
||||
|
||||
/**
|
||||
* Removes duplicates and rewrites index.
|
||||
*
|
||||
* @param string[]|int[]|float[] $elements
|
||||
*
|
||||
* @return string[]|int[]|float[]
|
||||
* @param string|int|float|bool $element
|
||||
*/
|
||||
private static function sanitize(array $elements) : array
|
||||
{
|
||||
sort($elements, SORT_ASC);
|
||||
|
||||
return array_values(array_unique($elements, SORT_ASC));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string|int|float $element
|
||||
*
|
||||
* @return Set
|
||||
*/
|
||||
public function add($element) : Set
|
||||
public function add($element): self
|
||||
{
|
||||
return $this->addAll([$element]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string[]|int[]|float[] $elements
|
||||
*
|
||||
* @return Set
|
||||
* @param string[]|int[]|float[]|bool[] $elements
|
||||
*/
|
||||
public function addAll(array $elements) : Set
|
||||
public function addAll(array $elements): self
|
||||
{
|
||||
$this->elements = self::sanitize(array_merge($this->elements, $elements));
|
||||
|
||||
|
@ -137,20 +102,16 @@ class Set implements \IteratorAggregate
|
|||
|
||||
/**
|
||||
* @param string|int|float $element
|
||||
*
|
||||
* @return Set
|
||||
*/
|
||||
public function remove($element) : Set
|
||||
public function remove($element): self
|
||||
{
|
||||
return $this->removeAll([$element]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string[]|int[]|float[] $elements
|
||||
*
|
||||
* @return Set
|
||||
*/
|
||||
public function removeAll(array $elements) : Set
|
||||
public function removeAll(array $elements): self
|
||||
{
|
||||
$this->elements = self::sanitize(array_diff($this->elements, $elements));
|
||||
|
||||
|
@ -159,53 +120,54 @@ class Set implements \IteratorAggregate
|
|||
|
||||
/**
|
||||
* @param string|int|float $element
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function contains($element) : bool
|
||||
public function contains($element): bool
|
||||
{
|
||||
return $this->containsAll([$element]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string[]|int[]|float[] $elements
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
public function containsAll(array $elements) : bool
|
||||
public function containsAll(array $elements): bool
|
||||
{
|
||||
return !array_diff($elements, $this->elements);
|
||||
return count(array_diff($elements, $this->elements)) === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return string[]|int[]|float[]
|
||||
* @return string[]|int[]|float[]|bool[]
|
||||
*/
|
||||
public function toArray() : array
|
||||
public function toArray(): array
|
||||
{
|
||||
return $this->elements;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return \ArrayIterator
|
||||
*/
|
||||
public function getIterator() : \ArrayIterator
|
||||
public function getIterator(): ArrayIterator
|
||||
{
|
||||
return new \ArrayIterator($this->elements);
|
||||
return new ArrayIterator($this->elements);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return bool
|
||||
*/
|
||||
public function isEmpty() : bool
|
||||
public function isEmpty(): bool
|
||||
{
|
||||
return $this->cardinality() == 0;
|
||||
return $this->cardinality() === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return int
|
||||
*/
|
||||
public function cardinality() : int
|
||||
public function cardinality(): int
|
||||
{
|
||||
return count($this->elements);
|
||||
}
|
||||
|
||||
/**
 * Removes duplicates and rewrites index.
 *
 * The elements are sorted ascending, de-duplicated, and re-indexed from 0.
 * Both steps use SORT_REGULAR: SORT_ASC is an ordering constant for
 * array_multisort(), not a comparison flag for sort()/array_unique().
 *
 * @param string[]|int[]|float[]|bool[] $elements
 *
 * @return string[]|int[]|float[]|bool[]
 */
private static function sanitize(array $elements): array
{
    sort($elements, SORT_REGULAR);

    // array_unique() keeps the original keys, so close the gaps afterwards.
    return array_values(array_unique($elements, SORT_REGULAR));
}
|
||||
}
|
||||
|
|
137
lib/mlbackend/php/phpml/src/Phpml/Math/Statistic/ANOVA.php
Normal file
137
lib/mlbackend/php/phpml/src/Phpml/Math/Statistic/ANOVA.php
Normal file
|
@ -0,0 +1,137 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Math\Statistic;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
/**
|
||||
* Analysis of variance
|
||||
* https://en.wikipedia.org/wiki/Analysis_of_variance
|
||||
*/
|
||||
final class ANOVA
{
    /**
     * One-way ANOVA F-test, evaluated independently for every feature.
     *
     * The one-way ANOVA tests the null hypothesis that 2 or more groups have
     * the same population mean. The groups may have differing sizes.
     *
     * @param array[] $samples - each row is class samples
     *
     * @return float[] F-value of every feature, keyed by feature index
     *
     * @throws InvalidArgumentException when fewer than two classes are given
     */
    public static function oneWayF(array $samples): array
    {
        $classCount = count($samples);
        if ($classCount < 2) {
            throw new InvalidArgumentException('The array must have at least 2 elements');
        }

        $classSizes = array_map(static function (array $class): int {
            return count($class);
        }, $samples);
        $sampleCount = (int) array_sum($classSizes);

        $featureSquares = self::sumOfSquaresPerFeature($samples);
        $classSums = self::sumOfFeaturesPerClass($samples);
        $squaredTotals = self::sumOfSquares($classSums);
        $squaredClassSums = self::squaresSum($classSums);

        $between = self::calculateSsbn($samples, $squaredClassSums, $classSizes, $squaredTotals, $sampleCount);
        $within = self::calculateSswn($between, $featureSquares, $squaredTotals, $sampleCount);

        // Degrees of freedom between groups and within groups.
        $dfBetween = $classCount - 1;
        $dfWithin = $sampleCount - $classCount;

        // F = (between-group mean square) / (within-group mean square).
        $f = [];
        foreach ($between as $index => $sumOfSquares) {
            $f[$index] = ($sumOfSquares / $dfBetween) / ($within[$index] / $dfWithin);
        }

        return $f;
    }

    /**
     * Sums the squared feature values over every sample of every class,
     * producing one total per feature index.
     */
    private static function sumOfSquaresPerFeature(array $samples): array
    {
        $totals = array_fill(0, count($samples[0][0]), 0);
        foreach ($samples as $class) {
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $totals[$index] += $feature ** 2;
                }
            }
        }

        return $totals;
    }

    /**
     * For every class, sums each feature over the samples of that class.
     * The result keeps the keys of $samples.
     */
    private static function sumOfFeaturesPerClass(array $samples): array
    {
        $perClass = [];
        foreach ($samples as $classIndex => $class) {
            $totals = array_fill(0, count($class[0]), 0);
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $totals[$index] += $feature;
                }
            }

            $perClass[$classIndex] = $totals;
        }

        return $perClass;
    }

    /**
     * Adds the rows of $sums together per feature and squares each total:
     * (sum over rows)² for every feature index.
     */
    private static function sumOfSquares(array $sums): array
    {
        $totals = array_fill(0, count($sums[0]), 0);
        foreach ($sums as $row) {
            foreach ($row as $index => $sum) {
                $totals[$index] += $sum;
            }
        }

        foreach ($totals as $index => $total) {
            $totals[$index] = $total ** 2;
        }

        return $totals;
    }

    /**
     * Squares every individual entry of $sums, keeping rows and keys intact.
     */
    private static function squaresSum(array $sums): array
    {
        $squared = [];
        foreach ($sums as $rowIndex => $row) {
            $squared[$rowIndex] = [];
            foreach ($row as $index => $sum) {
                $squared[$rowIndex][$index] = $sum ** 2;
            }
        }

        return $squared;
    }

    /**
     * Between-group sum of squares per feature:
     * Σ(class sum² / class size) − (grand sum)² / total sample count.
     */
    private static function calculateSsbn(array $samples, array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array
    {
        $between = array_fill(0, count($samples[0][0]), 0);
        foreach ($sumSamplesSquare as $classIndex => $class) {
            $classSize = $samplesPerClass[$classIndex];
            foreach ($class as $index => $feature) {
                $between[$index] += $feature / $classSize;
            }
        }

        foreach ($squareSumSamples as $index => $sum) {
            $between[$index] -= $sum / $allSamples;
        }

        return $between;
    }

    /**
     * Within-group sum of squares per feature: the total sum of squares
     * (Σx² − (grand sum)² / total sample count) minus the between-group part.
     */
    private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array
    {
        $within = [];
        foreach ($ssAllSamples as $index => $ss) {
            $total = $ss - $squareSumSamples[$index] / $allSamples;
            $within[$index] = $total - $ssbn[$index];
        }

        return $within;
    }
}
|
|
@ -9,17 +9,15 @@ use Phpml\Exception\InvalidArgumentException;
|
|||
class Correlation
|
||||
{
|
||||
/**
|
||||
* @param array|int[]|float[] $x
|
||||
* @param array|int[]|float[] $y
|
||||
*
|
||||
* @return float
|
||||
* @param int[]|float[] $x
|
||||
* @param int[]|float[] $y
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public static function pearson(array $x, array $y)
|
||||
public static function pearson(array $x, array $y): float
|
||||
{
|
||||
if (count($x) !== count($y)) {
|
||||
throw InvalidArgumentException::arraySizeNotMatch();
|
||||
throw new InvalidArgumentException('Size of given arrays does not match');
|
||||
}
|
||||
|
||||
$count = count($x);
|
||||
|
@ -34,12 +32,10 @@ class Correlation
|
|||
$a = $x[$i] - $meanX;
|
||||
$b = $y[$i] - $meanY;
|
||||
$axb += ($a * $b);
|
||||
$a2 += pow($a, 2);
|
||||
$b2 += pow($b, 2);
|
||||
$a2 += $a ** 2;
|
||||
$b2 += $b ** 2;
|
||||
}
|
||||
|
||||
$corr = $axb / sqrt((float) ($a2 * $b2));
|
||||
|
||||
return $corr;
|
||||
return $axb / ($a2 * $b2) ** .5;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,25 +11,17 @@ class Covariance
|
|||
/**
|
||||
* Calculates covariance from two given arrays, x and y, respectively
|
||||
*
|
||||
* @param array $x
|
||||
* @param array $y
|
||||
* @param bool $sample
|
||||
* @param float $meanX
|
||||
* @param float $meanY
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public static function fromXYArrays(array $x, array $y, $sample = true, float $meanX = null, float $meanY = null)
|
||||
public static function fromXYArrays(array $x, array $y, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float
|
||||
{
|
||||
if (empty($x) || empty($y)) {
|
||||
throw InvalidArgumentException::arrayCantBeEmpty();
|
||||
$n = count($x);
|
||||
if ($n === 0 || count($y) === 0) {
|
||||
throw new InvalidArgumentException('The array has zero elements');
|
||||
}
|
||||
|
||||
$n = count($x);
|
||||
if ($sample && $n === 1) {
|
||||
throw InvalidArgumentException::arraySizeToSmall(2);
|
||||
throw new InvalidArgumentException('The array must have at least 2 elements');
|
||||
}
|
||||
|
||||
if ($meanX === null) {
|
||||
|
@ -56,31 +48,22 @@ class Covariance
|
|||
/**
|
||||
* Calculates covariance of two dimensions, i and k in the given data.
|
||||
*
|
||||
* @param array $data
|
||||
* @param int $i
|
||||
* @param int $k
|
||||
* @param bool $sample
|
||||
* @param float $meanX
|
||||
* @param float $meanY
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
* @throws \Exception
|
||||
*/
|
||||
public static function fromDataset(array $data, int $i, int $k, bool $sample = true, float $meanX = null, float $meanY = null)
|
||||
public static function fromDataset(array $data, int $i, int $k, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float
|
||||
{
|
||||
if (empty($data)) {
|
||||
throw InvalidArgumentException::arrayCantBeEmpty();
|
||||
if (count($data) === 0) {
|
||||
throw new InvalidArgumentException('The array has zero elements');
|
||||
}
|
||||
|
||||
$n = count($data);
|
||||
if ($sample && $n === 1) {
|
||||
throw InvalidArgumentException::arraySizeToSmall(2);
|
||||
throw new InvalidArgumentException('The array must have at least 2 elements');
|
||||
}
|
||||
|
||||
if ($i < 0 || $k < 0 || $i >= $n || $k >= $n) {
|
||||
throw new \Exception("Given indices i and k do not match with the dimensionality of data");
|
||||
throw new InvalidArgumentException('Given indices i and k do not match with the dimensionality of data');
|
||||
}
|
||||
|
||||
if ($meanX === null || $meanY === null) {
|
||||
|
@ -104,15 +87,17 @@ class Covariance
|
|||
// with a slight cost of CPU utilization.
|
||||
$sum = 0.0;
|
||||
foreach ($data as $row) {
|
||||
$val = [];
|
||||
$val = [0, 0];
|
||||
foreach ($row as $index => $col) {
|
||||
if ($index == $i) {
|
||||
$val[0] = $col - $meanX;
|
||||
}
|
||||
|
||||
if ($index == $k) {
|
||||
$val[1] = $col - $meanY;
|
||||
}
|
||||
}
|
||||
|
||||
$sum += $val[0] * $val[1];
|
||||
}
|
||||
}
|
||||
|
@ -127,12 +112,9 @@ class Covariance
|
|||
/**
|
||||
* Returns the covariance matrix of n-dimensional data
|
||||
*
|
||||
* @param array $data
|
||||
* @param array|null $means
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function covarianceMatrix(array $data, array $means = null)
|
||||
public static function covarianceMatrix(array $data, ?array $means = null): array
|
||||
{
|
||||
$n = count($data[0]);
|
||||
|
||||
|
@ -150,7 +132,12 @@ class Covariance
|
|||
$cov[$i][$k] = $cov[$k][$i];
|
||||
} else {
|
||||
$cov[$i][$k] = self::fromDataset(
|
||||
$data, $i, $k, true, $means[$i], $means[$k]
|
||||
$data,
|
||||
$i,
|
||||
$k,
|
||||
true,
|
||||
$means[$i],
|
||||
$means[$k]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,10 +16,6 @@ class Gaussian
|
|||
*/
|
||||
protected $std;
|
||||
|
||||
/**
|
||||
* @param float $mean
|
||||
* @param float $std
|
||||
*/
|
||||
public function __construct(float $mean, float $std)
|
||||
{
|
||||
$this->mean = $mean;
|
||||
|
@ -29,8 +25,6 @@ class Gaussian
|
|||
/**
|
||||
* Returns probability density of the given <i>$value</i>
|
||||
*
|
||||
* @param float $value
|
||||
*
|
||||
* @return float|int
|
||||
*/
|
||||
public function pdf(float $value)
|
||||
|
@ -39,22 +33,18 @@ class Gaussian
|
|||
// Ref: https://en.wikipedia.org/wiki/Normal_distribution
|
||||
$std2 = $this->std ** 2;
|
||||
$mean = $this->mean;
|
||||
return exp(- (($value - $mean) ** 2) / (2 * $std2)) / sqrt(2 * $std2 * pi());
|
||||
|
||||
return exp(-(($value - $mean) ** 2) / (2 * $std2)) / ((2 * $std2 * M_PI) ** .5);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns probability density value of the given <i>$value</i> based on
|
||||
* given standard deviation and the mean
|
||||
*
|
||||
* @param float $mean
|
||||
* @param float $std
|
||||
* @param float $value
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
public static function distributionPdf(float $mean, float $std, float $value)
|
||||
public static function distributionPdf(float $mean, float $std, float $value): float
|
||||
{
|
||||
$normal = new self($mean, $std);
|
||||
|
||||
return $normal->pdf($value);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,13 +9,9 @@ use Phpml\Exception\InvalidArgumentException;
|
|||
class Mean
|
||||
{
|
||||
/**
|
||||
* @param array $numbers
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public static function arithmetic(array $numbers)
|
||||
public static function arithmetic(array $numbers): float
|
||||
{
|
||||
self::checkArrayLength($numbers);
|
||||
|
||||
|
@ -23,8 +19,6 @@ class Mean
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $numbers
|
||||
*
|
||||
* @return float|mixed
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
|
@ -34,11 +28,11 @@ class Mean
|
|||
self::checkArrayLength($numbers);
|
||||
|
||||
$count = count($numbers);
|
||||
$middleIndex = (int)floor($count / 2);
|
||||
$middleIndex = (int) floor($count / 2);
|
||||
sort($numbers, SORT_NUMERIC);
|
||||
$median = $numbers[$middleIndex];
|
||||
|
||||
if (0 === $count % 2) {
|
||||
if ($count % 2 === 0) {
|
||||
$median = ($median + $numbers[$middleIndex - 1]) / 2;
|
||||
}
|
||||
|
||||
|
@ -46,8 +40,6 @@ class Mean
|
|||
}
|
||||
|
||||
/**
|
||||
* @param array $numbers
|
||||
*
|
||||
* @return mixed
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
|
@ -58,18 +50,16 @@ class Mean
|
|||
|
||||
$values = array_count_values($numbers);
|
||||
|
||||
return array_search(max($values), $values);
|
||||
return array_search(max($values), $values, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $array
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
private static function checkArrayLength(array $array)
|
||||
private static function checkArrayLength(array $array): void
|
||||
{
|
||||
if (empty($array)) {
|
||||
throw InvalidArgumentException::arrayCantBeEmpty();
|
||||
if (count($array) === 0) {
|
||||
throw new InvalidArgumentException('The array has zero elements');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,36 +9,51 @@ use Phpml\Exception\InvalidArgumentException;
|
|||
class StandardDeviation
|
||||
{
|
||||
/**
|
||||
* @param array|float[] $a
|
||||
* @param bool $sample
|
||||
*
|
||||
* @return float
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
* @param float[]|int[] $numbers
|
||||
*/
|
||||
public static function population(array $a, $sample = true)
|
||||
public static function population(array $numbers, bool $sample = true): float
|
||||
{
|
||||
if (empty($a)) {
|
||||
throw InvalidArgumentException::arrayCantBeEmpty();
|
||||
$n = count($numbers);
|
||||
if ($n === 0) {
|
||||
throw new InvalidArgumentException('The array has zero elements');
|
||||
}
|
||||
|
||||
$n = count($a);
|
||||
|
||||
if ($sample && $n === 1) {
|
||||
throw InvalidArgumentException::arraySizeToSmall(2);
|
||||
throw new InvalidArgumentException('The array must have at least 2 elements');
|
||||
}
|
||||
|
||||
$mean = Mean::arithmetic($a);
|
||||
$mean = Mean::arithmetic($numbers);
|
||||
$carry = 0.0;
|
||||
foreach ($a as $val) {
|
||||
$d = $val - $mean;
|
||||
$carry += $d * $d;
|
||||
foreach ($numbers as $val) {
|
||||
$carry += ($val - $mean) ** 2;
|
||||
}
|
||||
|
||||
if ($sample) {
|
||||
--$n;
|
||||
}
|
||||
|
||||
return sqrt((float) ($carry / $n));
|
||||
return ($carry / $n) ** .5;
|
||||
}
|
||||
|
||||
/**
 * Sum of squares deviations
 * ∑⟮xᵢ - μ⟯²
 *
 * μ is the arithmetic mean of the given numbers.
 *
 * @param float[]|int[] $numbers
 *
 * @throws InvalidArgumentException when the array is empty
 */
public static function sumOfSquares(array $numbers): float
{
    if (count($numbers) === 0) {
        throw new InvalidArgumentException('The array has zero elements');
    }

    $mean = Mean::arithmetic($numbers);

    $total = 0;
    foreach ($numbers as $number) {
        $total += ($number - $mean) ** 2;
    }

    return $total;
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Math\Statistic;
|
||||
|
||||
/**
|
||||
* In probability theory and statistics, variance is the expectation of the squared deviation of a random variable from its mean.
|
||||
* Informally, it measures how far a set of (random) numbers are spread out from their average value
|
||||
* https://en.wikipedia.org/wiki/Variance
|
||||
*/
|
||||
final class Variance
{
    /**
     * Population variance
     * Use when all possible observations of the system are present.
     * If used with a subset of data (sample variance), it will be a biased variance.
     *
     *      ∑⟮xᵢ - μ⟯²
     * σ² = ----------
     *          N
     *
     * The numerator is delegated to StandardDeviation::sumOfSquares().
     */
    public static function population(array $population): float
    {
        $squaredDeviations = StandardDeviation::sumOfSquares($population);
        $size = count($population);

        return $squaredDeviations / $size;
    }
}
|
|
@ -9,10 +9,6 @@ use Phpml\Exception\InvalidArgumentException;
|
|||
class Accuracy
|
||||
{
|
||||
/**
|
||||
* @param array $actualLabels
|
||||
* @param array $predictedLabels
|
||||
* @param bool $normalize
|
||||
*
|
||||
* @return float|int
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
|
@ -20,7 +16,7 @@ class Accuracy
|
|||
public static function score(array $actualLabels, array $predictedLabels, bool $normalize = true)
|
||||
{
|
||||
if (count($actualLabels) != count($predictedLabels)) {
|
||||
throw InvalidArgumentException::arraySizeNotMatch();
|
||||
throw new InvalidArgumentException('Size of given arrays does not match');
|
||||
}
|
||||
|
||||
$score = 0;
|
||||
|
|
|
@ -4,8 +4,36 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml\Metric;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
class ClassificationReport
|
||||
{
|
||||
public const MICRO_AVERAGE = 1;
|
||||
|
||||
public const MACRO_AVERAGE = 2;
|
||||
|
||||
public const WEIGHTED_AVERAGE = 3;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $truePositive = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $falsePositive = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $falseNegative = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $support = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
|
@ -21,27 +49,55 @@ class ClassificationReport
|
|||
*/
|
||||
private $f1score = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $support = [];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $average = [];
|
||||
|
||||
/**
|
||||
* @param array $actualLabels
|
||||
* @param array $predictedLabels
|
||||
*/
|
||||
public function __construct(array $actualLabels, array $predictedLabels)
|
||||
public function __construct(array $actualLabels, array $predictedLabels, int $average = self::MACRO_AVERAGE)
|
||||
{
|
||||
$truePositive = $falsePositive = $falseNegative = $this->support = self::getLabelIndexedArray($actualLabels, $predictedLabels);
|
||||
$averagingMethods = range(self::MICRO_AVERAGE, self::WEIGHTED_AVERAGE);
|
||||
if (!in_array($average, $averagingMethods, true)) {
|
||||
throw new InvalidArgumentException('Averaging method must be MICRO_AVERAGE, MACRO_AVERAGE or WEIGHTED_AVERAGE');
|
||||
}
|
||||
|
||||
$this->aggregateClassificationResults($actualLabels, $predictedLabels);
|
||||
$this->computeMetrics();
|
||||
$this->computeAverage($average);
|
||||
}
|
||||
|
||||
/**
 * Returns the precision values filled in by computeMetrics(), keyed by label.
 */
public function getPrecision(): array
|
||||
{
|
||||
return $this->precision;
|
||||
}
|
||||
|
||||
/**
 * Returns the recall values filled in by computeMetrics(), keyed by label.
 */
public function getRecall(): array
|
||||
{
|
||||
return $this->recall;
|
||||
}
|
||||
|
||||
/**
 * Returns the F1 scores filled in by computeMetrics(), keyed by label.
 */
public function getF1score(): array
|
||||
{
|
||||
return $this->f1score;
|
||||
}
|
||||
|
||||
/**
 * Returns the support array stored by aggregateClassificationResults(),
 * which increments one counter per actual label.
 */
public function getSupport(): array
|
||||
{
|
||||
return $this->support;
|
||||
}
|
||||
|
||||
/**
 * Returns the averaged metrics computed for the averaging method chosen
 * in the constructor.
 *
 * NOTE(review): computeMicroAverage() stores the keys 'precision', 'recall'
 * and 'f1score'; presumably the macro and weighted variants use the same
 * keys - confirm against those methods.
 */
public function getAverage(): array
|
||||
{
|
||||
return $this->average;
|
||||
}
|
||||
|
||||
private function aggregateClassificationResults(array $actualLabels, array $predictedLabels): void
|
||||
{
|
||||
$truePositive = $falsePositive = $falseNegative = $support = self::getLabelIndexedArray($actualLabels, $predictedLabels);
|
||||
|
||||
foreach ($actualLabels as $index => $actual) {
|
||||
$predicted = $predictedLabels[$index];
|
||||
++$this->support[$actual];
|
||||
++$support[$actual];
|
||||
|
||||
if ($actual === $predicted) {
|
||||
++$truePositive[$actual];
|
||||
|
@ -51,85 +107,92 @@ class ClassificationReport
|
|||
}
|
||||
}
|
||||
|
||||
$this->computeMetrics($truePositive, $falsePositive, $falseNegative);
|
||||
$this->computeAverage();
|
||||
$this->truePositive = $truePositive;
|
||||
$this->falsePositive = $falsePositive;
|
||||
$this->falseNegative = $falseNegative;
|
||||
$this->support = $support;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getPrecision()
|
||||
private function computeMetrics(): void
|
||||
{
|
||||
return $this->precision;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getRecall()
|
||||
{
|
||||
return $this->recall;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getF1score()
|
||||
{
|
||||
return $this->f1score;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getSupport()
|
||||
{
|
||||
return $this->support;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getAverage()
|
||||
{
|
||||
return $this->average;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $truePositive
|
||||
* @param array $falsePositive
|
||||
* @param array $falseNegative
|
||||
*/
|
||||
private function computeMetrics(array $truePositive, array $falsePositive, array $falseNegative)
|
||||
{
|
||||
foreach ($truePositive as $label => $tp) {
|
||||
$this->precision[$label] = $this->computePrecision($tp, $falsePositive[$label]);
|
||||
$this->recall[$label] = $this->computeRecall($tp, $falseNegative[$label]);
|
||||
foreach ($this->truePositive as $label => $tp) {
|
||||
$this->precision[$label] = $this->computePrecision($tp, $this->falsePositive[$label]);
|
||||
$this->recall[$label] = $this->computeRecall($tp, $this->falseNegative[$label]);
|
||||
$this->f1score[$label] = $this->computeF1Score((float) $this->precision[$label], (float) $this->recall[$label]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fills $this->average using the averaging strategy selected by $average.
 *
 * A value matching none of the three averaging constants leaves
 * $this->average untouched.
 *
 * @param int $average one of MICRO_AVERAGE, MACRO_AVERAGE or WEIGHTED_AVERAGE
 */
private function computeAverage(int $average): void
{
    // Loose comparison on purpose: it is what the switch statement did.
    if ($average == self::MICRO_AVERAGE) {
        $this->computeMicroAverage();
    } elseif ($average == self::MACRO_AVERAGE) {
        $this->computeMacroAverage();
    } elseif ($average == self::WEIGHTED_AVERAGE) {
        $this->computeWeightedAverage();
    }
}
|
||||
|
||||
/**
 * Computes micro-averaged metrics: the per-label true positive, false
 * positive and false negative counts are summed first, then precision,
 * recall and F1 are derived once from those totals.
 */
private function computeMicroAverage(): void
{
    $tpTotal = (int) array_sum($this->truePositive);
    $fpTotal = (int) array_sum($this->falsePositive);
    $fnTotal = (int) array_sum($this->falseNegative);

    $precision = $this->computePrecision($tpTotal, $fpTotal);
    $recall = $this->computeRecall($tpTotal, $fnTotal);

    $this->average = [
        'precision' => $precision,
        'recall' => $recall,
        'f1score' => $this->computeF1Score((float) $precision, (float) $recall),
    ];
}
|
||||
|
||||
/**
 * Computes macro-averaged metrics: the unweighted mean of each per-label
 * metric. Labels whose metric is zero are part of the mean; only a metric
 * with no entries at all averages to 0.0.
 */
private function computeMacroAverage(): void
{
    foreach (['precision', 'recall', 'f1score'] as $metric) {
        $values = $this->{$metric};
        if (count($values) === 0) {
            // Nothing to average; avoid dividing by zero.
            $this->average[$metric] = 0.0;

            continue;
        }

        $this->average[$metric] = array_sum($values) / count($values);
    }
}
|
||||
|
||||
/**
 * Computes support-weighted averages: each per-label metric is multiplied
 * by that label's support, and the total is divided by the summed support.
 * A metric with no entries averages to 0.0.
 */
private function computeWeightedAverage(): void
{
    foreach (['precision', 'recall', 'f1score'] as $metric) {
        $perLabel = $this->{$metric};

        if (count($perLabel) == 0) {
            $this->average[$metric] = 0.0;
        } else {
            $weightedTotal = 0;
            foreach ($perLabel as $label => $score) {
                $weightedTotal += $score * $this->support[$label];
            }

            $this->average[$metric] = $weightedTotal / array_sum($this->support);
        }
    }
}
|
||||
|
||||
/**
|
||||
* @param int $truePositive
|
||||
* @param int $falsePositive
|
||||
*
|
||||
* @return float|string
|
||||
*/
|
||||
private function computePrecision(int $truePositive, int $falsePositive)
|
||||
{
|
||||
if (0 == ($divider = $truePositive + $falsePositive)) {
|
||||
$divider = $truePositive + $falsePositive;
|
||||
if ($divider == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
|
@ -137,47 +200,33 @@ class ClassificationReport
|
|||
}
|
||||
|
||||
/**
 * Recall: true positives divided by (true positives + false negatives).
 *
 * @return float|int 0.0 when the denominator is zero, otherwise the ratio
 */
private function computeRecall(int $truePositive, int $falseNegative)
{
    $divider = $truePositive + $falseNegative;
    if ($divider === 0) {
        // No actual positives at all; report zero instead of dividing by zero.
        return 0.0;
    }

    return $truePositive / $divider;
}
|
||||
|
||||
/**
 * F1 score: 2 * (precision * recall) / (precision + recall).
 *
 * @return float 0.0 when precision and recall sum to zero
 */
private function computeF1Score(float $precision, float $recall): float
{
    $divider = $precision + $recall;
    if ($divider === 0.0) {
        // Both inputs are zero; avoid dividing by zero.
        return 0.0;
    }

    return 2.0 * (($precision * $recall) / $divider);
}
|
||||
|
||||
/**
 * Builds a zero-initialised counter array keyed by every distinct label
 * found in either input, with the labels sorted ascending.
 *
 * @return array label => 0
 */
private static function getLabelIndexedArray(array $actualLabels, array $predictedLabels): array
{
    $labels = array_values(array_unique(array_merge($actualLabels, $predictedLabels)));
    sort($labels);

    // The cast keeps the declared array return type even if array_combine() yields a non-array.
    return (array) array_combine($labels, array_fill(0, count($labels), 0));
}
|
||||
}
|
||||
|
|
|
@ -6,22 +6,15 @@ namespace Phpml\Metric;
|
|||
|
||||
class ConfusionMatrix
|
||||
{
|
||||
/**
|
||||
* @param array $actualLabels
|
||||
* @param array $predictedLabels
|
||||
* @param array $labels
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
public static function compute(array $actualLabels, array $predictedLabels, array $labels = null): array
|
||||
public static function compute(array $actualLabels, array $predictedLabels, array $labels = []): array
|
||||
{
|
||||
$labels = $labels ? array_flip($labels) : self::getUniqueLabels($actualLabels);
|
||||
$labels = count($labels) === 0 ? self::getUniqueLabels($actualLabels) : array_flip($labels);
|
||||
$matrix = self::generateMatrixWithZeros($labels);
|
||||
|
||||
foreach ($actualLabels as $index => $actual) {
|
||||
$predicted = $predictedLabels[$index];
|
||||
|
||||
if (!isset($labels[$actual]) || !isset($labels[$predicted])) {
|
||||
if (!isset($labels[$actual], $labels[$predicted])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -32,17 +25,12 @@ class ConfusionMatrix
|
|||
$column = $labels[$predicted];
|
||||
}
|
||||
|
||||
$matrix[$row][$column] += 1;
|
||||
++$matrix[$row][$column];
|
||||
}
|
||||
|
||||
return $matrix;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array $labels
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private static function generateMatrixWithZeros(array $labels): array
|
||||
{
|
||||
$count = count($labels);
|
||||
|
@ -55,17 +43,11 @@ class ConfusionMatrix
|
|||
return $matrix;
|
||||
}
|
||||
|
||||
/**
 * Maps each distinct label to its position in the ascending-sorted list
 * of distinct labels.
 *
 * @return array label => zero-based index
 */
private static function getUniqueLabels(array $labels): array
{
    $labels = array_values(array_unique($labels));
    sort($labels);

    return array_flip($labels);
}
|
||||
}
|
||||
|
|
|
@ -4,52 +4,37 @@ declare(strict_types=1);
|
|||
|
||||
namespace Phpml;
|
||||
|
||||
use Phpml\Exception\SerializeException;
|
||||
use Phpml\Exception\FileException;
|
||||
use Phpml\Exception\SerializeException;
|
||||
|
||||
class ModelManager
|
||||
{
|
||||
/**
 * Serializes an estimator and writes it to $filepath, holding an exclusive
 * lock (LOCK_EX) for the duration of the write.
 *
 * @throws FileException      when the target directory is not writable or the write fails
 * @throws SerializeException when serialize() yields an empty string
 */
public function saveToFile(Estimator $estimator, string $filepath): void
{
    if (!is_writable(dirname($filepath))) {
        throw new FileException(sprintf('File "%s" can\'t be saved.', basename($filepath)));
    }

    $serialized = serialize($estimator);
    // isset on offset 0 is true only for a string with at least one character.
    if (!isset($serialized[0])) {
        throw new SerializeException(sprintf('Class "%s" can not be serialized.', gettype($estimator)));
    }

    $result = file_put_contents($filepath, $serialized, LOCK_EX);
    if ($result === false) {
        throw new FileException(sprintf('File "%s" can\'t be saved.', basename($filepath)));
    }
}
|
||||
|
||||
/**
|
||||
* @param string $filepath
|
||||
*
|
||||
* @return Estimator
|
||||
*
|
||||
* @throws FileException
|
||||
* @throws SerializeException
|
||||
*/
|
||||
public function restoreFromFile(string $filepath) : Estimator
|
||||
public function restoreFromFile(string $filepath): Estimator
|
||||
{
|
||||
if (!file_exists($filepath) || !is_readable($filepath)) {
|
||||
throw FileException::cantOpenFile(basename($filepath));
|
||||
throw new FileException(sprintf('File "%s" can\'t be open.', basename($filepath)));
|
||||
}
|
||||
|
||||
$object = unserialize(file_get_contents($filepath));
|
||||
$object = unserialize((string) file_get_contents($filepath), [Estimator::class]);
|
||||
if ($object === false) {
|
||||
throw SerializeException::cantUnserialize(basename($filepath));
|
||||
throw new SerializeException(sprintf('"%s" can not be unserialized.', basename($filepath)));
|
||||
}
|
||||
|
||||
return $object;
|
||||
|
|
|
@ -8,8 +8,12 @@ interface ActivationFunction
|
|||
{
|
||||
/**
|
||||
* @param float|int $value
|
||||
*
|
||||
* @return float
|
||||
*/
|
||||
public function compute($value): float;
|
||||
|
||||
/**
|
||||
* @param float|int $value
|
||||
* @param float|int $computedvalue
|
||||
*/
|
||||
public function differentiate($value, $computedvalue): float;
|
||||
}
|
||||
|
|
|
@ -10,11 +10,22 @@ class BinaryStep implements ActivationFunction
|
|||
{
|
||||
/**
 * Binary step activation: 1.0 for non-negative input, 0.0 otherwise.
 *
 * @param float|int $value
 */
public function compute($value): float
{
    if ($value >= 0) {
        return 1.0;
    }

    return 0.0;
}
|
||||
|
||||
/**
 * Derivative used for the binary step: 1.0 when the input is exactly zero
 * (integer or float), 0.0 for every other input.
 *
 * @param float|int $value
 * @param float|int $computedvalue not used by this activation
 */
public function differentiate($value, $computedvalue): float
{
    $isZero = $value === 0 || $value === 0.0;

    return $isZero ? 1.0 : 0.0;
}
|
||||
}
|
||||
|
|
|
@ -10,11 +10,18 @@ class Gaussian implements ActivationFunction
|
|||
{
|
||||
/**
 * Gaussian activation: exp(-value^2).
 *
 * @param float|int $value
 */
public function compute($value): float
{
    // Parentheses make explicit that the square is taken before the negation.
    return exp(-($value ** 2));
}
|
||||
|
||||
/**
 * Derivative of the Gaussian activation: -2 * value * exp(-value^2),
 * using the already computed activation for the exponential term.
 *
 * @param float|int $value
 * @param float|int $calculatedvalue result of compute() for $value
 */
public function differentiate($value, $calculatedvalue): float
{
    $slopeFactor = -2 * $value;

    return $slopeFactor * $calculatedvalue;
}
|
||||
}
|
||||
|
|
|
@ -13,21 +13,25 @@ class HyperbolicTangent implements ActivationFunction
|
|||
*/
|
||||
private $beta;
|
||||
|
||||
/**
 * @param float $beta multiplier applied to the input before tanh() is taken
 */
public function __construct(float $beta = 1.0)
{
    $this->beta = $beta;
}
|
||||
|
||||
/**
 * Hyperbolic tangent activation: tanh(beta * value).
 *
 * @param float|int $value
 */
public function compute($value): float
{
    $scaled = $this->beta * $value;

    return tanh($scaled);
}
|
||||
|
||||
/**
 * Derivative expressed through the activation output: 1 - output^2.
 *
 * NOTE(review): beta does not appear here, although compute() scales the
 * input by beta; confirm whether the missing factor is intentional.
 *
 * @param float|int $value         not used by this activation
 * @param float|int $computedvalue result of compute() for $value
 */
public function differentiate($value, $computedvalue): float
{
    $squaredOutput = $computedvalue ** 2;

    return 1 - $squaredOutput;
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\NeuralNetwork\ActivationFunction;
|
||||
|
||||
use Phpml\NeuralNetwork\ActivationFunction;
|
||||
|
||||
/**
 * Parametric ReLU: identity for non-negative input, input scaled by beta
 * for negative input.
 */
class PReLU implements ActivationFunction
{
    /**
     * @var float slope applied to negative inputs
     */
    private $beta;

    public function __construct(float $beta = 0.01)
    {
        $this->beta = $beta;
    }

    /**
     * @param float|int $value
     */
    public function compute($value): float
    {
        return $value >= 0 ? $value : $this->beta * $value;
    }

    /**
     * Slope of the activation at $value: 1.0 on the non-negative side,
     * beta on the negative side.
     *
     * The branch is chosen from the input, using the same condition as
     * compute(). Testing the activated value instead picks the wrong branch
     * when beta is zero or negative, because a negative input then produces
     * an output that is not negative.
     *
     * @param float|int $value
     * @param float|int $computedvalue result of compute() for $value (unused)
     */
    public function differentiate($value, $computedvalue): float
    {
        return $value >= 0 ? 1.0 : $this->beta;
    }
}
|
|
@ -13,21 +13,25 @@ class Sigmoid implements ActivationFunction
|
|||
*/
|
||||
private $beta;
|
||||
|
||||
/**
 * @param float $beta multiplier applied to the input inside the exponential
 */
public function __construct(float $beta = 1.0)
{
    $this->beta = $beta;
}
|
||||
|
||||
/**
 * Sigmoid activation: 1 / (1 + exp(-beta * value)).
 *
 * @param float|int $value
 */
public function compute($value): float
{
    $denominator = 1 + exp(-$this->beta * $value);

    return 1 / $denominator;
}
|
||||
|
||||
/**
 * Derivative expressed through the activation output: output * (1 - output).
 *
 * NOTE(review): beta does not appear here, although compute() scales the
 * input by beta; confirm whether the missing factor is intentional.
 *
 * @param float|int $value         not used by this activation
 * @param float|int $computedvalue result of compute() for $value
 */
public function differentiate($value, $computedvalue): float
{
    $complement = 1 - $computedvalue;

    return $computedvalue * $complement;
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\NeuralNetwork\ActivationFunction;
|
||||
|
||||
use Phpml\NeuralNetwork\ActivationFunction;
|
||||
|
||||
/**
 * Thresholded ReLU: passes the input through when it exceeds theta,
 * outputs 0.0 otherwise.
 */
class ThresholdedReLU implements ActivationFunction
{
    /**
     * @var float activation threshold
     */
    private $theta;

    public function __construct(float $theta = 0.0)
    {
        $this->theta = $theta;
    }

    /**
     * @param float|int $value
     */
    public function compute($value): float
    {
        return $value > $this->theta ? $value : 0.0;
    }

    /**
     * Gradient of the activation: 1.0 on the pass-through side of the
     * threshold, 0.0 below it.
     *
     * The decision is made on the input. Comparing the activated value with
     * theta reports a gradient of 1.0 for every sub-threshold input whenever
     * theta is zero or negative, because the clamped output 0.0 is then
     * itself >= theta. The original `>=` operator is kept, so an input
     * exactly at the threshold counts as active.
     *
     * @param float|int $value
     * @param float|int $calculatedvalue result of compute() for $value (unused)
     */
    public function differentiate($value, $calculatedvalue): float
    {
        return $value >= $this->theta ? 1.0 : 0.0;
    }
}
|
|
@ -15,16 +15,12 @@ class Layer
|
|||
private $nodes = [];
|
||||
|
||||
/**
|
||||
* @param int $nodesNumber
|
||||
* @param string $nodeClass
|
||||
* @param ActivationFunction|null $activationFunction
|
||||
*
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function __construct(int $nodesNumber = 0, string $nodeClass = Neuron::class, ActivationFunction $activationFunction = null)
|
||||
public function __construct(int $nodesNumber = 0, string $nodeClass = Neuron::class, ?ActivationFunction $activationFunction = null)
|
||||
{
|
||||
if (!in_array(Node::class, class_implements($nodeClass))) {
|
||||
throw InvalidArgumentException::invalidLayerNodeClass();
|
||||
if (!in_array(Node::class, class_implements($nodeClass), true)) {
|
||||
throw new InvalidArgumentException('Layer node class must implement Node interface');
|
||||
}
|
||||
|
||||
for ($i = 0; $i < $nodesNumber; ++$i) {
|
||||
|
@ -32,25 +28,7 @@ class Layer
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $nodeClass
|
||||
* @param ActivationFunction|null $activationFunction
|
||||
*
|
||||
* @return Neuron
|
||||
*/
|
||||
private function createNode(string $nodeClass, ActivationFunction $activationFunction = null)
|
||||
{
|
||||
if (Neuron::class == $nodeClass) {
|
||||
return new Neuron($activationFunction);
|
||||
}
|
||||
|
||||
return new $nodeClass();
|
||||
}
|
||||
|
||||
/**
 * Appends a node to the end of this layer's node list.
 */
public function addNode(Node $node): void
{
    $this->nodes[] = $node;
}
|
||||
|
@ -58,8 +36,17 @@ class Layer
|
|||
/**
 * Returns the nodes of this layer in insertion order.
 *
 * @return Node[]
 */
public function getNodes(): array
{
    return $this->nodes;
}
|
||||
|
||||
/**
 * Instantiates a node of the requested class. Only Neuron receives the
 * activation function; any other class is constructed without arguments.
 */
private function createNode(string $nodeClass, ?ActivationFunction $activationFunction = null): Node
{
    if ($nodeClass !== Neuron::class) {
        return new $nodeClass();
    }

    return new Neuron($activationFunction);
}
|
||||
}
|
||||
|
|
|
@ -8,20 +8,12 @@ interface Network
|
|||
{
|
||||
/**
|
||||
* @param mixed $input
|
||||
*
|
||||
* @return self
|
||||
*/
|
||||
public function setInput($input);
|
||||
public function setInput($input): self;
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getOutput(): array;
|
||||
|
||||
/**
|
||||
* @param Layer $layer
|
||||
*/
|
||||
public function addLayer(Layer $layer);
|
||||
public function addLayer(Layer $layer): void;
|
||||
|
||||
/**
|
||||
* @return Layer[]
|
||||
|
|
|
@ -14,12 +14,9 @@ abstract class LayeredNetwork implements Network
|
|||
/**
|
||||
* @var Layer[]
|
||||
*/
|
||||
protected $layers;
|
||||
protected $layers = [];
|
||||
|
||||
/**
 * Appends a layer to the end of the network's layer list.
 */
public function addLayer(Layer $layer): void
{
    $this->layers[] = $layer;
}
|
||||
|
@ -32,25 +29,16 @@ abstract class LayeredNetwork implements Network
|
|||
return $this->layers;
|
||||
}
|
||||
|
||||
/**
 * Removes every layer from the network.
 *
 * The property is reset to an empty array rather than unset: unsetting a
 * declared property removes it from the object, so any later read of
 * $this->layers would hit an undefined property instead of an empty list.
 */
public function removeLayers(): void
{
    $this->layers = [];
}
|
||||
|
||||
/**
 * Returns the layer stored at the last position (index count - 1).
 */
public function getOutputLayer(): Layer
{
    $lastIndex = count($this->layers) - 1;

    return $this->layers[$lastIndex];
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
public function getOutput(): array
|
||||
{
|
||||
$result = [];
|
||||
|
@ -63,10 +51,8 @@ abstract class LayeredNetwork implements Network
|
|||
|
||||
/**
|
||||
* @param mixed $input
|
||||
*
|
||||
* @return $this
|
||||
*/
|
||||
public function setInput($input)
|
||||
public function setInput($input): Network
|
||||
{
|
||||
$firstLayer = $this->layers[0];
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue