/**
 * Score the words of a text with a TF-IDF style weight and return the best ones.
 *
 * The text is segmented by Jieba::cut(). Each token is trimmed and tokens
 * shorter than two UTF-8 characters are ignored. The relative frequency of
 * every remaining word is multiplied by its entry in self::$idf_freq, falling
 * back to self::$max_idf for words without an entry.
 *
 * @param mixed $content Text handed to Jieba::cut() for segmentation.
 * @param int   $top_k   Length passed to array_slice() when trimming the ranking.
 * @param array $options Merged over array('mode' => 'default'); the merged
 *                       value is not read anywhere else in this method.
 *
 * @return array Map of word => score, ordered by descending score.
 */
public static function extractTags($content, $top_k = 20, $options = array())
{
    // Kept for parity with the declared interface: caller options override the defaults.
    $options = array_merge(array('mode' => 'default'), $options);

    // Pass 1: count occurrences (as floats) of every token that is long enough.
    $counts = array();
    $kept = 0.0;
    foreach (Jieba::cut($content) as $token) {
        $token = trim($token);
        if (mb_strlen($token, 'UTF-8') >= 2) {
            if (!isset($counts[$token])) {
                $counts[$token] = 0.0;
            }
            $counts[$token] += 1.0;
            $kept += 1.0;
        }
    }

    // Pass 2: term frequency times IDF weight. This loop body only runs when
    // at least one token was kept, so $kept is never zero here.
    $scores = array();
    foreach ($counts as $word => $count) {
        $tf = $count / $kept;
        $idf = isset(self::$idf_freq[$word]) ? self::$idf_freq[$word] : self::$max_idf;
        $scores[$word] = $tf * $idf;
    }

    // Highest score first; keys (the words) are preserved by both calls.
    arsort($scores);

    return array_slice($scores, 0, $top_k, true);
}
/**
 * Checks that JiebaAnalyse::extractTags() returns the expected ten
 * word => weight pairs for the lyric.txt fixture.
 */
public function testExtractTags()
{
    $case_array = array(
        "所謂" => 1.1425214508493,
        "沒有" => 0.76168096723288,
        "是否" => 0.71841348115616,
        "一般" => 0.59095311682055,
        "肌迫" => 0.38084048361644,
        "雖然" => 0.38084048361644,
        "退縮" => 0.38084048361644,
        "矯作" => 0.38084048361644,
        "怯懦" => 0.26367154884822,
        "滿肚" => 0.19042024180822
    );
    $top_k = 10;

    // file_get_contents() takes a boolean $use_include_path as its second
    // argument, not an fopen()-style mode, so no "r" is passed here.
    $content = file_get_contents(dirname(dirname(__FILE__)) . "/src/dict/lyric.txt");

    // Fail with a clear message if the fixture is unreadable, instead of
    // reporting a confusing tag mismatch further down.
    $this->assertNotSame(false, $content, 'Unable to read the lyric.txt fixture');

    $tags = JiebaAnalyse::extractTags($content, $top_k);

    $this->assertEquals($case_array, $tags);
}