/**
 * Splits a selector-like string into a flat, ordered list of Token objects.
 *
 * Slash-star comments are removed first. The scanner then walks the string
 * byte by byte and emits, in source order:
 *  - 'Symbol' tokens for expressions such as "2n+1" or "-n" (a bare "n" is
 *    left to the generic symbol scanner),
 *  - 'Token' tokens for the two-character and one-character operators,
 *  - a 'Token' holding a single space in front of '.', '#' or '[' when
 *    whitespace (not at offset 0) preceded that character,
 *  - 'String' tokens for quoted strings,
 *  - 'Symbol' tokens for everything else.
 *
 * Token positions are byte offsets into the comment-stripped string.
 *
 * @param string $s The text to tokenize (presumably a CSS selector; confirm against callers)
 *
 * @return Token[] The tokens found, in source order
 *
 * @throws \Exception Whatever the scanning helpers or the Token constructor
 *                    throw is propagated unchanged, after the mbstring
 *                    encoding has been restored
 */
public function tokenize($s)
{
    // Bit 2 of mbstring.func_overload replaces strlen()/substr() with their
    // multibyte versions. The scanner works on byte offsets, so force a
    // single-byte encoding for the duration of the call.
    $mbEncoding = null;
    if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload') & 2)) {
        $mbEncoding = mb_internal_encoding();
        mb_internal_encoding('ASCII');
    }

    try {
        $tokens = array();
        $pos = 0;

        // Drop /* ... */ comments (non-greedy, may span several lines).
        $s = preg_replace('#/\\*.*?\\*/#s', '', $s);
        $length = strlen($s);

        while (true) {
            // Skip whitespace and remember where it started. 0 doubles as
            // "no whitespace", so whitespace at offset 0 never produces a
            // space token below.
            if (preg_match('#\\s+#A', $s, $match, 0, $pos)) {
                $preceding_whitespace_pos = $pos;
                $pos += strlen($match[0]);
            } else {
                $preceding_whitespace_pos = 0;
            }

            if ($pos >= $length) {
                break;
            }

            // Optional sign, optional digits, "n", optional signed offset.
            // The match is anchored at $pos, so $match[0] is the exact
            // source text of the expression.
            if (preg_match('#[+-]?\\d*n(?:[+-]\\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
                $tokens[] = new Token('Symbol', $match[0], $pos);
                $pos += strlen($match[0]);
                continue;
            }

            $c = $s[$pos];
            $c2 = substr($s, $pos, 2);

            // Two-character operators are tried before single characters so
            // that e.g. "~=" is not split into "~" and "=".
            if (in_array($c2, array('~=', '|=', '^=', '$=', '*=', '::', '!='), true)) {
                $tokens[] = new Token('Token', $c2, $pos);
                $pos += 2;
                continue;
            }

            if (in_array($c, array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#'), true)) {
                // Whitespace in front of '.', '#' or '[' is reported as its
                // own token; in front of the other operators it is dropped.
                if (in_array($c, array('.', '#', '['), true) && $preceding_whitespace_pos > 0) {
                    $tokens[] = new Token('Token', ' ', $preceding_whitespace_pos);
                }
                $tokens[] = new Token('Token', $c, $pos);
                ++$pos;
                continue;
            }

            if ($c === '"' || $c === "'") {
                // Quoted string: the helper returns the value and the
                // position at which scanning resumes.
                $old_pos = $pos;
                list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
                $tokens[] = new Token('String', $sym, $old_pos);
                continue;
            }

            // Anything else is handed to the generic symbol scanner, which
            // likewise returns the value and the next position.
            $old_pos = $pos;
            list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
            $tokens[] = new Token('Symbol', $sym, $old_pos);
        }
    } catch (\Exception $e) {
        // Never leave the process-wide encoding switched to ASCII when
        // scanning fails part-way through. Catch-and-rethrow is used instead
        // of "finally" to stay compatible with PHP versions before 5.5.
        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        throw $e;
    }

    if (null !== $mbEncoding) {
        mb_internal_encoding($mbEncoding);
    }

    return $tokens;
}