Camspiers\StatisticalClassifier\Classifier\SVM::prepareModel PHP Method

prepareModel() public method

public prepareModel ( )
    /**
     * Builds the SVM training set from the data source, trains the SVM and
     * stores the result on the model.
     *
     * The documents are run through the TokenPreparation, TokenCountByDocument,
     * TFIDF and DocumentLength transforms. Categories and tokens are then
     * replaced by integer ids (categories from 0, tokens from 1 so that index 0
     * of every training row can hold the category label), per-class weights are
     * computed, and the category/token maps plus the trained model are handed
     * to $this->model, which is finally flagged as prepared.
     *
     * Categories that contain no documents are ignored: they provide nothing
     * to train on and would otherwise cause an endless loop while padding small
     * classes and a division by zero while weighting.
     *
     * @throws \RuntimeException when there is not a single document to train on
     */
    public function prepareModel()
    {
        $data = $this->applyTransform(
            new Transform\TokenPreparation(
                $this->tokenizer,
                $this->documentNormalizer,
                $this->tokenNormalizer
            ),
            $this->dataSource->getData()
        );
        $tokenCountByDocument = $this->applyTransform(new Transform\TokenCountByDocument(), $data);
        $documentCount = $this->applyTransform(new Transform\DocumentCount(), $data);
        $tokenAppearanceCount = $this->applyTransform(
            new Transform\TokenAppearanceCount(),
            $tokenCountByDocument
        );
        $tfidf = $this->applyTransform(
            new Transform\TFIDF(),
            $tokenCountByDocument,
            $documentCount,
            $tokenAppearanceCount
        );
        $documentLength = $this->applyTransform(new Transform\DocumentLength(), $tfidf);

        // Keep only categories that actually have documents. An empty category
        // can never grow in the duplication loop below (nothing to copy) and
        // would produce a 0 / 0 when the class weights are normalised.
        $trainingSet = array();
        foreach ($documentLength as $category => $documents) {
            if (count($documents) > 0) {
                $trainingSet[$category] = $documents;
            }
        }
        if (count($trainingSet) === 0) {
            throw new \RuntimeException(
                'Cannot prepare the SVM model: the data source contains no documents'
            );
        }

        $categoryMap = array();
        $categoryCount = 0;
        $tokenMap = array();
        // Token ids start at 1 because index 0 of each row is the category label
        $tokenCount = 1;
        // Produce the token and category maps for the whole document set
        foreach ($trainingSet as $category => $documents) {
            $categoryMap[$category] = $categoryCount;
            $categoryCount++;
            foreach ($documents as $document) {
                foreach (array_keys($document) as $token) {
                    if (!isset($tokenMap[$token])) {
                        $tokenMap[$token] = $tokenCount;
                        $tokenCount++;
                    }
                }
            }
        }

        // When using probabilities and our dataset is small we need to increase its
        // size by duplicating the data
        // see: http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf section "8 Probability Estimates"
        if ($this->hasThreshold()) {
            foreach ($trainingSet as $category => $documents) {
                // Each pass appends a copy of every document currently in the
                // class, doubling it until it holds more than 5 documents.
                while (count($documents) <= 5) {
                    foreach ($documents as $document) {
                        $documents[] = $document;
                    }
                }
                $trainingSet[$category] = $documents;
            }
        }

        $transform = array();
        $weights = array();
        // Prep the svm data set for use and count the documents of each class
        foreach ($trainingSet as $category => $documents) {
            $label = $categoryMap[$category];
            foreach ($documents as $document) {
                $entry = array($label);
                foreach ($document as $token => $value) {
                    $entry[$tokenMap[$token]] = $value;
                }
                // Order the row by numeric index: label first, then token ids ascending
                ksort($entry, SORT_NUMERIC);
                $transform[] = $entry;
            }
            $weights[$label] = count($documents);
        }

        // Weight the data set by the number of docs that appear in each class:
        // the smallest class gets weight 1, larger classes proportionally less.
        $lowest = min($weights);
        foreach ($weights as $label => $weight) {
            $weights[$label] = $lowest / $weight;
        }

        $this->model->setMaps(array_flip($categoryMap), $tokenMap);
        $this->model->setModel($this->svm->train($transform, $weights));
        $this->model->setPrepared(true);
    }