/**
 * Trains the SVM on the data source and stores the result on the model.
 *
 * Steps:
 *  1. Run the transform pipeline (TokenPreparation, TokenCountByDocument,
 *     DocumentCount, TokenAppearanceCount, TFIDF, DocumentLength) over the
 *     data returned by the data source.
 *  2. Give every category and every token a numeric index.
 *  3. When a threshold is in use, pad small categories by duplicating their
 *     documents so probability estimation has enough samples.
 *  4. Build one training row per document plus a weight per category and
 *     pass both to the SVM; the trained model and both maps are then set on
 *     the model, which is flagged as prepared.
 *
 * Categories without any documents keep their slot in the category map but
 * contribute no training rows, no padding and no class weight.
 *
 * @return void
 */
public function prepareModel()
{
    $data = $this->applyTransform(
        new Transform\TokenPreparation(
            $this->tokenizer,
            $this->documentNormalizer,
            $this->tokenNormalizer
        ),
        $this->dataSource->getData()
    );
    $tokenCountByDocument = $this->applyTransform(
        new Transform\TokenCountByDocument(),
        $data
    );
    $documentCount = $this->applyTransform(
        new Transform\DocumentCount(),
        $data
    );
    $tokenAppearanceCount = $this->applyTransform(
        new Transform\TokenAppearanceCount(),
        $tokenCountByDocument
    );
    $weightedTokens = $this->applyTransform(
        new Transform\TFIDF(),
        $tokenCountByDocument,
        $documentCount,
        $tokenAppearanceCount
    );
    // Iterated below as: category => list of documents => (token => value).
    $documentsByCategory = $this->applyTransform(
        new Transform\DocumentLength(),
        $weightedTokens
    );

    // Produce the token and category maps for the whole document set.
    $categoryMap = array();
    $tokenMap = array();
    // Token indexes start at 1 because index 0 of every training row holds
    // the category label.
    $nextTokenIndex = 1;
    foreach ($documentsByCategory as $category => $documents) {
        // foreach keys are unique, so the current map size is the next free
        // category index.
        $categoryMap[$category] = count($categoryMap);
        foreach ($documents as $document) {
            foreach (array_keys($document) as $token) {
                if (!isset($tokenMap[$token])) {
                    $tokenMap[$token] = $nextTokenIndex;
                    $nextTokenIndex++;
                }
            }
        }
    }

    // When using probabilities and our dataset is small we need to increase
    // its size by duplicating the data.
    // see: http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf section "8 Probability Estimates"
    if ($this->hasThreshold()) {
        foreach ($documentsByCategory as $category => $documents) {
            // An empty category can never grow by duplication; skipping it
            // keeps the doubling loop below from spinning forever.
            if (count($documents) === 0) {
                continue;
            }
            // Double the category until it holds more than five documents.
            while (count($documents) <= 5) {
                foreach ($documents as $document) {
                    $documents[] = $document;
                }
            }
            $documentsByCategory[$category] = $documents;
        }
    }

    // Prep the svm data set for use and count the documents of each class.
    $trainingRows = array();
    $weights = array();
    foreach ($documentsByCategory as $category => $documents) {
        $label = $categoryMap[$category];
        foreach ($documents as $document) {
            $row = array($label);
            foreach ($document as $token => $value) {
                $row[$tokenMap[$token]] = $value;
            }
            // Keep the label at index 0 followed by ascending token indexes.
            ksort($row, SORT_NUMERIC);
            $trainingRows[] = $row;
        }
        // A class with no documents has no training rows, and a zero count
        // would cause a division by zero when the weights are scaled.
        $documentTotal = count($documents);
        if ($documentTotal > 0) {
            $weights[$label] = $documentTotal;
        }
    }

    // Weight the data set by the number of docs that appear in each class:
    // the smallest class gets weight 1, larger classes proportionally less.
    if (count($weights) > 0) {
        $lowest = min($weights);
        foreach ($weights as $label => $documentTotal) {
            $weights[$label] = $lowest / $documentTotal;
        }
    }

    $this->model->setMaps(array_flip($categoryMap), $tokenMap);
    $this->model->setModel($this->svm->train($trainingRows, $weights));
    $this->model->setPrepared(true);
}