/**
 * Split a message into the list of words that should be indexed.
 *
 * HTML tags are stripped, [code]...[/code] blocks and BBCode tags are each
 * replaced by a single space, and the result is passed through
 * $this->cleanup() before being tokenized on spaces. Tokens are returned in
 * the order they occur; repeated words are not de-duplicated here.
 *
 * A token is dropped when it is
 *  - longer than 255 bytes,
 *  - shorter than the configured minimum when measured in bytes, or
 *  - shorter than the configured minimum when measured in characters, unless
 *    its leading bytes fall inside the Hangul, CJK or CJK-B ranges.
 *
 * NOTE(review): tokenizing relies on strtok(), which keeps a single
 * process-wide cursor; this assumes neither the caller nor anything invoked
 * inside the loop is running its own strtok() scan - confirm.
 *
 * @param string $text Raw message text
 * @return array List of words to index
 */
public function split_message($text)
{
    $words = array();

    /**
     * Taken from the original code
     */
    $match = array(
        // Do not index code
        '#\\[code(?:=.*?)?(\\:?[0-9a-z]{5,})\\].*?\\[\\/code(\\:?[0-9a-z]{5,})\\]#is',
        // BBcode
        '#\\[\\/?[a-z0-9\\*\\+\\-]+(?:=.*?)?(?::[a-z])?(\\:?[0-9a-z]{5,})\\]#',
    );

    $min = $this->word_length['min'];
    // Largest byte length that is still rejected outright as too short.
    $isset_min = $min - 1;

    /**
     * Clean up the string, remove HTML tags, remove BBCodes
     */
    $cleaned = $this->cleanup(preg_replace($match, ' ', strip_tags($text)), -1);

    // strtok() never yields an empty token and returns false once the input
    // is exhausted, so the end of input is tested explicitly instead of
    // feeding false to strlen(). The for-loop's advance expression also runs
    // on "continue", which keeps the token advance in exactly one place.
    for ($word = strtok($cleaned, ' '); $word !== false; $word = strtok(' ')) {
        $bytes = strlen($word);

        /**
         * Words longer than 255 bytes are ignored. This will have to be
         * changed whenever we change the length of search_wordlist.word_text
         *
         * Words of $isset_min bytes or fewer are ignored, too
         */
        if ($bytes > 255 || $bytes <= $isset_min) {
            continue;
        }

        /**
         * Test whether the word is too short (in characters) to be indexed.
         *
         * Note that this limit does NOT apply to CJK and Hangul
         *
         * Note: this could be optimized. If the codepoint is lower than Hangul's range
         * we know that it will also be lower than CJK ranges
         */
        if (utf8_strlen($word) < $min
            && (strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0)
            && (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0)
            && (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0)
        ) {
            continue;
        }

        $words[] = $word;
    }

    return $words;
}