/**
 * Build a shortened version of an HTML fragment, limited by word count and
 * by the number of completed top-level elements.
 *
 * The markup is tokenized with HtmlTokenizer and copied token by token into
 * a new HTMLTokenSet. Text nodes consume the word budget; when the budget is
 * used up the current text node is cut, an ellipsis is appended to it, and
 * every element that is still open at that point is closed so that the
 * truncated output stays balanced.
 *
 * @param string $text           The HTML to summarize.
 * @param int    $count          Maximum number of words to keep. Values below
 *                               zero are treated as zero.
 * @param int    $max_paragraphs The summary stops once this many close tags
 *                               have been processed while no element remains
 *                               open.
 * @return string The summary, produced by casting the resulting HTMLTokenSet
 *                to a string.
 */
public static function summarize($text, $count = 100, $max_paragraphs = 1)
{
    $ellipsis = '…';

    $ht = new HtmlTokenizer($text, false);
    $set = $ht->parse();
    $summary = new HTMLTokenSet();

    // Open-element tokens that have been emitted but not yet closed, innermost last.
    $stack = array();
    $para = 0;
    // Kept as an integer so that the "* 2" slice length below is always exact.
    $remaining_words = max(0, (int) $count);
    // Set when the word budget ran out in the middle of the document.
    $truncated = false;

    $set->rewind();
    for ($token = $set->current(); $set->valid(); $token = $set->next()) {
        switch ($token['type']) {
            case HTMLTokenizer::NODE_TYPE_ELEMENT_OPEN:
                $stack[] = $token;
                $summary[] = $token;
                break;

            case HTMLTokenizer::NODE_TYPE_TEXT:
                // Words and the whitespace runs between them alternate in the result,
                // because PREG_SPLIT_DELIM_CAPTURE keeps the delimiters as elements.
                $pieces = preg_split('/(\\s+)/u', $token['value'], -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
                // 2N alternating pieces can never hold more than N words.
                $pieces = array_slice($pieces, 0, $remaining_words * 2);
                // Count only real words; whitespace pieces must not consume the budget.
                $remaining_words -= count(preg_grep('/\\S/u', $pieces));
                $token['value'] = implode('', $pieces);

                if ($remaining_words <= 0) {
                    $token['value'] .= $ellipsis;
                    $summary[] = $token;
                    $truncated = true;
                    // Leave the for loop, not just the switch.
                    break 2;
                }
                $summary[] = $token;
                break;

            case HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE:
                if (count($stack) > 0 && in_array($token['name'], Utils::array_map_field($stack, 'name'), true)) {
                    // Unwind to the matching open element, closing anything
                    // that was left open inside it along the way.
                    do {
                        $end = array_pop($stack);
                        $end['type'] = HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE;
                        $end['attrs'] = null;
                        $end['value'] = null;
                        $summary[] = $end;
                    } while ($end['name'] !== $token['name'] && count($stack) > 0);
                }
                // A close tag with no matching open element is dropped from the output.

                if (count($stack) == 0) {
                    $para++;
                }
                if ($para >= $max_paragraphs) {
                    break 2;
                }
                break;

            default:
                $summary[] = $token;
                break;
        }
    }

    if ($truncated) {
        // Close whatever is still open, innermost first. Doing this here rather
        // than waiting for the next close tag in the input guarantees balanced
        // output even when that tag belongs to an element opened after the
        // cut-off point, or when no further close tag exists.
        while (count($stack) > 0) {
            $end = array_pop($stack);
            $end['type'] = HTMLTokenizer::NODE_TYPE_ELEMENT_CLOSE;
            $end['attrs'] = null;
            $end['value'] = null;
            $summary[] = $end;
        }
    }

    return (string) $summary;
}