/**
 * Reduces a text to the characters that are kept for search indexing.
 *
 * The text is recoded to UTF-8 when needed, NCRs and HTML entities are
 * decoded and the result is normalized to NFC. ASCII letters are then
 * lowercased, every other ASCII punctuation/whitespace char is replaced by
 * a space, Hangul/CJK chars are surrounded with spaces so that each one is
 * indexed individually, and every remaining multibyte char is replaced
 * according to the conversion tables loaded on demand from
 * includes/utf/data/search_indexer_<block>.<php_ext>. Chars without any
 * mapping are replaced by a space.
 *
 * @param string      $text          Text to clean up
 * @param string|null $allowed_chars Chars (ASCII or UTF-8) that must be kept
 *                                   untouched instead of being replaced
 * @param string      $encoding      Encoding of $text; anything other than
 *                                   utf-8 is recoded through utf8_recode()
 *
 * @return string The cleaned up text
 */
protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
{
    // Conversion table and the list of blocks already loaded; both are
    // shared between calls so that each data file is included only once.
    static $conv = array(), $conv_loaded = array();

    $allow = array();

    // Convert the text to UTF-8
    $encoding = strtolower($encoding);
    if ($encoding != 'utf-8') {
        $text = utf8_recode($text, $encoding);
    }

    /**
     * Length of a UTF-8 sequence, indexed by the high nibble of its lead
     * byte. The keys are written as hex escapes on purpose: raw high bytes
     * in the source do not survive a re-encoding of the file.
     */
    $utf_len_mask = array(
        "\xC0" => 2,
        "\xD0" => 2,
        "\xE0" => 3,
        "\xF0" => 4,
    );

    /**
     * Replace HTML entities and NCRs
     */
    $text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);

    /**
     * Normalize to NFC
     */
    $text = \Normalizer::normalize($text);
    if (!is_string($text)) {
        // Normalizer::normalize() returns false on failure (e.g. invalid
        // UTF-8); there is nothing left to index in that case.
        return '';
    }

    /**
     * The first thing we do is:
     *
     * - convert ASCII-7 letters to lowercase
     * - replace the ASCII-7 non-alpha characters by spaces
     * - replace the bytes that should not appear in a valid UTF-8 string:
     *   0xC0, 0xC1 and 0xF5-0xFF
     *
     * @todo in theory, the third one is already taken care of during
     *       normalization and those chars should have been replaced by
     *       Unicode replacement chars
     */
    $sb_match = "ISTCPAMELRDOJBNHFGVWUQKYXZ\r\n\t!\"#\$%&'()*+,-./:;<=>?@[\\]^_`{|}~\v\f\xC0\xC1\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";

    // strtr() ignores the tail of the longer argument, so the replacement
    // string must be exactly as long as $sb_match: the 26 lowercase letters
    // followed by one space for every remaining char.
    $sb_replace = str_pad('istcpamelrdojbnhfgvwuqkyxz', strlen($sb_match), ' ');

    /**
     * This is the list of legal ASCII chars, it is automatically extended
     * with ASCII chars from $allowed_chars
     */
    $legal_ascii = ' eaisntroludcpmghbfvq10xy2j9kw354867z';

    /**
     * Prepare an array containing the extra chars to allow
     */
    if (isset($allowed_chars[0])) {
        $pos = 0;
        $len = strlen($allowed_chars);

        do {
            $c = $allowed_chars[$pos];

            if (ord($c) < 0x80) {
                /**
                 * ASCII char
                 */
                $sb_pos = strpos($sb_match, $c);
                if ($sb_pos !== false) {
                    /**
                     * Remove the char from $sb_match and its corresponding
                     * replacement in $sb_replace (both strings stay aligned)
                     */
                    $sb_match = substr($sb_match, 0, $sb_pos) . substr($sb_match, $sb_pos + 1);
                    $sb_replace = substr($sb_replace, 0, $sb_pos) . substr($sb_replace, $sb_pos + 1);
                    $legal_ascii .= $c;
                }
                ++$pos;
                continue;
            }

            /**
             * UTF-8 char
             */
            $lead = $c & "\xF0";
            if (!isset($utf_len_mask[$lead])) {
                // Not a lead byte (stray continuation byte): skip it,
                // otherwise $pos would never advance.
                ++$pos;
                continue;
            }

            $utf_len = $utf_len_mask[$lead];
            $allow[substr($allowed_chars, $pos, $utf_len)] = 1;
            $pos += $utf_len;
        } while ($pos < $len);
    }

    $text = strtr($text, $sb_match, $sb_replace);

    $ret = '';
    $pos = 0;
    $len = strlen($text);

    while ($pos < $len) {
        /**
         * Do all consecutive ASCII chars at once
         */
        $spn = strspn($text, $legal_ascii, $pos);
        if ($spn) {
            $ret .= substr($text, $pos, $spn);
            $pos += $spn;

            if ($pos >= $len) {
                break;
            }
        }

        /**
         * Capture the UTF char
         */
        $lead = $text[$pos] & "\xF0";
        if (!isset($utf_len_mask[$lead])) {
            // Neither a legal ASCII char nor a UTF-8 lead byte (e.g. an
            // ASCII control char): treat it like any unmapped char and make
            // sure the position advances.
            $ret .= ' ';
            ++$pos;
            continue;
        }

        $utf_len = $utf_len_mask[$lead];
        $utf_char = substr($text, $pos, $utf_len);
        $pos += $utf_len;

        if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST)
            || ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST)
            || ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST)) {
            /**
             * All characters within these ranges are valid
             *
             * We separate them with a space in order to index each character
             * individually
             */
            $ret .= ' ' . $utf_char . ' ';
            continue;
        }

        if (isset($allow[$utf_char])) {
            /**
             * The char is explicitly allowed
             */
            $ret .= $utf_char;
            continue;
        }

        if (isset($conv[$utf_char])) {
            /**
             * The char is mapped to something, maybe to itself actually
             */
            $ret .= $conv[$utf_char];
            continue;
        }

        /**
         * The char isn't mapped, but did we load its conversion table?
         *
         * The search indexer table is split into blocks. The block number of
         * each char is equal to its codepoint right-shifted for 11 bits. It
         * means that out of the 11, 16 or 21 meaningful bits of a 2-, 3- or
         * 4- byte sequence we only keep the leftmost 0, 5 or 10 bits. Thus,
         * all UTF chars encoded in 2 bytes are in the same first block.
         */
        if (isset($utf_char[3])) {
            /**
             * 1111 0nnn 10nn nnnn 10nx xxxx 10xx xxxx
             * 0000 0111 0011 1111 0010 0000
             */
            $idx = ((ord($utf_char[0]) & 0x7) << 7)
                | ((ord($utf_char[1]) & 0x3f) << 1)
                | ((ord($utf_char[2]) & 0x20) >> 5);
        } else if (isset($utf_char[2])) {
            /**
             * 1110 nnnn 10nx xxxx 10xx xxxx
             * 0000 0111 0010 0000
             *
             * NOTE(review): only three of the four `n` bits of the lead
             * byte are kept here, whereas "codepoint >> 11" would need all
             * four (mask 0x0F). Left unchanged; confirm against the
             * numbering of the search_indexer_* data files.
             */
            $idx = ((ord($utf_char[0]) & 0x7) << 1) | ((ord($utf_char[1]) & 0x20) >> 5);
        } else {
            /**
             * 110x xxxx 10xx xxxx
             * 0000 0000 0000 0000
             */
            $idx = 0;
        }

        /**
         * Check if the required conv table has been loaded already
         */
        if (!isset($conv_loaded[$idx])) {
            $conv_loaded[$idx] = 1;
            $file = $this->phpbb_root_path . 'includes/utf/data/search_indexer_' . $idx . '.' . $this->php_ext;

            if (file_exists($file)) {
                $table = include $file;

                // Only merge real tables; `array + non-array` is a fatal error
                if (is_array($table)) {
                    $conv += $table;
                }
            }
        }

        if (isset($conv[$utf_char])) {
            $ret .= $conv[$utf_char];
        } else {
            /**
             * We add an entry to the conversion table so that we
             * don't have to convert to codepoint and perform the checks
             * that are above this block
             */
            $conv[$utf_char] = ' ';
            $ret .= ' ';
        }
    }

    return $ret;
}