phpbb\search\fulltext_native::split_message PHP Method

split_message() public method

The text is converted to UTF-8, cleaned up, and split. Then, words that conform to the defined length range are returned in an array. NOTE: duplicates are NOT removed from the returned array.
public split_message ( string $text ) : array
$text string Text to split, encoded in UTF-8
return array Array of UTF-8 words
    /**
     * Break a message into the words that qualify for indexing.
     *
     * HTML tags are stripped, spans matching the [code] and BBCode patterns
     * are replaced by a space, the result is passed through cleanup() and
     * then tokenised on spaces. Duplicate words are kept.
     *
     * Only the lower bound of $this->word_length and a fixed cap of 255 bytes
     * are applied here; the configured upper bound is not checked in this
     * method.
     *
     * @param string $text Text to split, encoded in UTF-8
     * @return array Array of UTF-8 words
     */
    public function split_message($text)
    {
        $patterns = array(
            // Do not index code
            '#\\[code(?:=.*?)?(\\:?[0-9a-z]{5,})\\].*?\\[\\/code(\\:?[0-9a-z]{5,})\\]#is',
            // BBcode
            '#\\[\\/?[a-z0-9\\*\\+\\-]+(?:=.*?)?(?::[a-z])?(\\:?[0-9a-z]{5,})\\]#',
        );

        $min = $this->word_length['min'];
        // Largest byte length that is still rejected outright
        $rejected_bytes = $min - 1;

        // Remove HTML tags and BBCodes, then normalise the string
        $stripped = preg_replace($patterns, ' ', strip_tags($text));
        $cleaned = $this->cleanup($stripped, -1);

        $result = array();

        // strtok() yields false once the input is exhausted; the loop
        // increment fetches the next token on every path, including `continue`
        for ($token = strtok($cleaned, ' '); $token !== false && $token !== ''; $token = strtok(' ')) {
            $bytes = strlen($token);

            // Words longer than 255 bytes are ignored; this has to change
            // whenever the length of search_wordlist.word_text changes.
            // Words of at most $rejected_bytes bytes are ignored, too.
            if ($bytes > 255 || $bytes <= $rejected_bytes) {
                continue;
            }

            // The minimum length in characters does NOT apply to words that
            // start inside the Hangul or CJK ranges
            if (utf8_strlen($token) < $min) {
                $in_hangul = strncmp($token, self::UTF8_HANGUL_FIRST, 3) >= 0
                    && strncmp($token, self::UTF8_HANGUL_LAST, 3) <= 0;
                $in_cjk = strncmp($token, self::UTF8_CJK_FIRST, 3) >= 0
                    && strncmp($token, self::UTF8_CJK_LAST, 3) <= 0;
                $in_cjk_b = strncmp($token, self::UTF8_CJK_B_FIRST, 4) >= 0
                    && strncmp($token, self::UTF8_CJK_B_LAST, 4) <= 0;

                if (!$in_hangul && !$in_cjk && !$in_cjk_b) {
                    continue;
                }
            }

            $result[] = $token;
        }

        return $result;
    }