/**
 * Static method __cutDAG
 *
 * Segments $sentence by walking self::$route from character offset 0: at each
 * offset the first key of self::$route[$x] is taken as the index of the last
 * character of the word starting there. Multi-character words are emitted
 * as-is. Runs of consecutive single-character words are collected in a buffer
 * and, when the run is longer than one character, handed to Finalseg::cut().
 *
 * NOTE(review): self::$route is presumably populated by self::calc() — confirm.
 *
 * @param string $sentence # input sentence
 * @param array  $options  # other options (merged with defaults; none is read here)
 *
 * @return array $words
 */
public static function __cutDAG($sentence, $options = array())
{
    $defaults = array('mode' => 'default');
    $options = array_merge($defaults, $options);

    $words = array();
    $N = mb_strlen($sentence, 'UTF-8');
    $DAG = self::getDAG($sentence);
    self::calc($sentence, $DAG);

    $x = 0;
    $buf = '';
    while ($x < $N) {
        // The first key of the route entry is the index of the word's last
        // character; $y is therefore the exclusive end offset.
        $current_route_keys = array_keys(self::$route[$x]);
        $y = $current_route_keys[0] + 1;
        $l_word = mb_substr($sentence, $x, $y - $x, 'UTF-8');

        if ($y - $x == 1) {
            // Single character: hold it back so adjacent single characters
            // can be examined together.
            $buf = $buf . $l_word;
        } else {
            // A multi-character word ends the current run of single characters.
            self::flushSingleCharBuffer($buf, $words);
            $buf = '';
            $words[] = $l_word;
        }
        $x = $y;
    }

    // Emit whatever single-character run is left at the end of the sentence.
    self::flushSingleCharBuffer($buf, $words);

    return $words;
}

/**
 * Appends the contents of a single-character run to $words.
 *
 * An empty buffer adds nothing, a one-character buffer is appended unchanged,
 * and a longer buffer is split with Finalseg::cut() and each resulting word
 * is appended in order.
 *
 * @param string $buf   # buffered run of single characters
 * @param array  $words # word list to append to (taken by reference to avoid copying)
 *
 * @return void
 */
private static function flushSingleCharBuffer($buf, array &$words)
{
    $buf_len = mb_strlen($buf, 'UTF-8');
    if ($buf_len == 1) {
        $words[] = $buf;
    } elseif ($buf_len > 1) {
        foreach (Finalseg::cut($buf) as $word) {
            $words[] = $word;
        }
    }
}
/**
 * Static method cut
 *
 * Extracts from the sentence every maximal run of Han characters
 * (U+4E00..U+9FA5) and every maximal run of the characters a-z, A-Z, 0-9,
 * '+', '#' and newline, in order of appearance. Han runs are segmented with
 * Jieba::__cutAll() when $cut_all is true and with Jieba::__cutDAG()
 * otherwise; the other runs are appended to the result unchanged. Characters
 * matched by neither pattern do not appear in the result.
 *
 * @param string  $sentence # input sentence
 * @param boolean $cut_all  # cut_all or not
 * @param array   $options  # other options (merged with defaults; none is read here)
 *
 * @return array $seg_list
 */
public static function cut($sentence, $cut_all = false, $options = array())
{
    $defaults = array('mode' => 'default');
    $options = array_merge($defaults, $options);

    $seg_list = array();

    $re_han_pattern = '([\\x{4E00}-\\x{9FA5}]+)';
    $re_skip_pattern = '([a-zA-Z0-9+#\\n]+)';

    preg_match_all(
        '/(' . $re_han_pattern . '|' . $re_skip_pattern . ')/u',
        $sentence,
        $matches,
        PREG_PATTERN_ORDER
    );
    // Index 0 holds the full-pattern matches, i.e. the blocks in input order.
    $blocks = $matches[0];

    foreach ($blocks as $blk) {
        if (preg_match('/' . $re_han_pattern . '/u', $blk)) {
            // Han block: delegate to the selected segmentation routine.
            if ($cut_all) {
                $words = Jieba::__cutAll($blk);
            } else {
                $words = Jieba::__cutDAG($blk);
            }
            foreach ($words as $word) {
                $seg_list[] = $word;
            }
        } else {
            // Non-Han block (letters, digits, '+', '#', newline): keep whole.
            $seg_list[] = $blk;
        }
    }

    return $seg_list;
}