phpbb\search\fulltext_native::cleanup PHP Method

cleanup() protected method

This method receives a UTF-8 string, normalizes and validates it, replaces all non-alphanumeric characters with spaces, then returns the result. Any number of "allowed chars" can be passed as a UTF-8 string in NFC.
protected cleanup ( string $text, string $allowed_chars = null, string $encoding = 'utf-8' ) : string
$text string Text to split, in UTF-8 (not normalized or sanitized)
$allowed_chars string String of special chars to allow
$encoding string Text encoding
return string Cleaned up text, only alphanumeric chars are left
    /**
     * Clean up a text so that only indexable characters are left.
     *
     * The text is recoded to UTF-8 if needed, HTML entities and NCRs are
     * decoded, and the result is normalized. ASCII letters are lowercased and
     * every other non-alphanumeric ASCII byte is replaced with a space.
     * Multibyte characters are kept when they fall in the Hangul/CJK ranges
     * (each one is surrounded by spaces), when they are listed in
     * $allowed_chars, or when a search indexer table maps them; anything else
     * becomes a space.
     *
     * @param string $text          Text to clean, in UTF-8 (not normalized or sanitized)
     * @param string $allowed_chars String of special chars to allow, UTF-8 in NFC
     * @param string $encoding      Encoding of $text
     *
     * @return string Cleaned up text
     */
    protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
    {
        // Conversion tables are cached across calls; $conv_loaded records the
        // block numbers whose table file has already been looked up.
        static $conv = array(), $conv_loaded = array();
        $allow = array();
        // Convert the text to UTF-8
        $encoding = strtolower($encoding);
        if ($encoding !== 'utf-8') {
            $text = utf8_recode($text, $encoding);
        }
        /**
         * Length of a UTF-8 sequence, keyed by the high nibble of its lead
         * byte (lead byte & 0xF0): 110x xxxx => 2, 1110 xxxx => 3,
         * 1111 0xxx => 4
         */
        $utf_len_mask = array("\xC0" => 2, "\xD0" => 2, "\xE0" => 3, "\xF0" => 4);
        $high_nibble = "\xF0";
        /**
         * Replace HTML entities and NCRs
         */
        $text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
        /**
         * Normalize to NFC
         */
        $text = \Normalizer::normalize($text);
        if ($text === false || $text === '') {
            // Normalization fails on invalid UTF-8; nothing can be indexed
            return '';
        }
        /**
         * The first thing we do is:
         *
         * - convert ASCII-7 letters to lowercase
         * - remove the ASCII-7 non-alpha characters
         * - remove the bytes that should not appear in a valid UTF-8 string: 0xC0,
         *   0xC1 and 0xF5-0xFF
         *
         * @todo in theory, the third one is already taken care of during normalization and those chars should have been replaced by Unicode replacement chars
         */
        $sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ"
            . "\r\n\t!\"#\$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
            . "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F"
            . "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F"
            . "\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
        // The 26 letters map to their lowercase form, every other byte to a
        // space. The padding is computed so both strings always have the same
        // length (strtr() ignores the excess of the longer one).
        $sb_replace = 'istcpamelrdojbnhfgvwuqkyxz' . str_repeat(' ', strlen($sb_match) - 26);
        /**
         * This is the list of legal ASCII chars, it is automatically extended
         * with ASCII chars from $allowed_chars
         */
        $legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';
        /**
         * Prepare an array containing the extra chars to allow
         */
        if (isset($allowed_chars[0])) {
            $len = strlen($allowed_chars);
            for ($pos = 0; $pos < $len;) {
                $c = $allowed_chars[$pos];
                if ($c < "\x80") {
                    /**
                     * ASCII char
                     */
                    $sb_pos = strpos($sb_match, $c);
                    if (is_int($sb_pos)) {
                        /**
                         * Remove the char from $sb_match and its corresponding
                         * replacement in $sb_replace
                         */
                        $sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
                        $sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
                        $legal_ascii .= $c;
                    }
                    ++$pos;
                    continue;
                }
                /**
                 * UTF-8 char
                 */
                $lead = $c & $high_nibble;
                if (!isset($utf_len_mask[$lead])) {
                    // Not a valid lead byte: skip it, otherwise $pos would
                    // never advance
                    ++$pos;
                    continue;
                }
                $utf_len = $utf_len_mask[$lead];
                $allow[substr($allowed_chars, $pos, $utf_len)] = 1;
                $pos += $utf_len;
            }
        }
        $text = strtr($text, $sb_match, $sb_replace);
        $ret = '';
        $pos = 0;
        $len = strlen($text);
        while (true) {
            /**
             * Do all consecutive ASCII chars at once
             */
            if ($spn = strspn($text, $legal_ascii, $pos)) {
                $ret .= substr($text, $pos, $spn);
                $pos += $spn;
            }
            if ($pos >= $len) {
                return $ret;
            }
            /**
             * Capture the UTF char. A byte that is not a valid lead byte is
             * consumed on its own so that the loop always makes progress
             */
            $lead = $text[$pos] & $high_nibble;
            $utf_len = isset($utf_len_mask[$lead]) ? $utf_len_mask[$lead] : 1;
            $utf_char = substr($text, $pos, $utf_len);
            $pos += $utf_len;
            if ($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST || $utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST || $utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST) {
                /**
                 * All characters within these ranges are valid
                 *
                 * We separate them with a space in order to index each character
                 * individually
                 */
                $ret .= ' ' . $utf_char . ' ';
                continue;
            }
            if (isset($allow[$utf_char])) {
                /**
                 * The char is explicitly allowed
                 */
                $ret .= $utf_char;
                continue;
            }
            if (isset($conv[$utf_char])) {
                /**
                 * The char is mapped to something, maybe to itself actually
                 */
                $ret .= $conv[$utf_char];
                continue;
            }
            /**
             * The char isn't mapped, but did we load its conversion table?
             *
             * The search indexer table is split into blocks. The block number of
             * each char is equal to its codepoint right-shifted for 11 bits. It
             * means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
             * 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
             * all UTF chars encoded in 2 bytes are in the same first block.
             */
            if (isset($utf_char[2])) {
                if (isset($utf_char[3])) {
                    /**
                     * 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
                     * 0000 0111 0011 1111 0010 0000
                     */
                    $idx = (ord($utf_char[0]) & 0x07) << 7 | (ord($utf_char[1]) & 0x3F) << 1 | (ord($utf_char[2]) & 0x20) >> 5;
                } else {
                    /**
                     * 1110 nnnn 10nx xxxx 10xx xxxx
                     * 0000 1111 0010 0000
                     *
                     * All four payload bits of the lead byte are needed to get
                     * the leftmost 5 bits of the codepoint
                     */
                    $idx = (ord($utf_char[0]) & 0x0F) << 1 | (ord($utf_char[1]) & 0x20) >> 5;
                }
            } else {
                /**
                 * 110x xxxx 10xx xxxx
                 * 0000 0000 0000 0000
                 */
                $idx = 0;
            }
            /**
             * Check if the required conv table has been loaded already
             */
            if (!isset($conv_loaded[$idx])) {
                $conv_loaded[$idx] = 1;
                $file = $this->phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $this->php_ext;
                if (file_exists($file)) {
                    $conv += (include $file);
                }
            }
            if (isset($conv[$utf_char])) {
                $ret .= $conv[$utf_char];
            } else {
                /**
                 * We add an entry to the conversion table so that we
                 * don't have to convert to codepoint and perform the checks
                 * that are above this block
                 */
                $conv[$utf_char] = ' ';
                $ret .= ' ';
            }
        }
    }