/**
 * Creates an array of tokens when given some PHP code.
 *
 * Starts from the output of token_get_all() and then post-processes it so
 * that the result is uniform and easier to analyse:
 *  - "\r" / "\n" pairs split over two raw tokens are merged;
 *  - doc comments are handed to PHP_CodeSniffer_Tokenizers_Comment;
 *  - double quoted strings and heredoc/nowdoc bodies are collapsed and then
 *    re-split into one token per line;
 *  - operators that older PHP versions do not know (..., **, **=, ??, <=>)
 *    are assembled from their parts;
 *  - traits, goto labels, "finally", array type hints, use groups and
 *    a few HHVM differences are normalised;
 *  - any remaining multi-line token is split into one token per line.
 *
 * The order of the checks inside the loop matters: each special case ends
 * with "continue", so earlier checks take precedence over later ones.
 *
 * @param string $string  The PHP source to tokenize.
 * @param string $eolChar The end-of-line sequence used to split multi-line
 *                        tokens. NOTE(review): the default is the
 *                        two-character string backslash + "n", not a real
 *                        newline; callers presumably always pass the
 *                        detected EOL - confirm.
 *
 * @return array A zero-indexed list of tokens. Tokens built in this method
 *               are arrays with 'code', 'type' and 'content' keys; doc
 *               comment tokens are whatever the comment tokenizer returns.
 */
public function tokenizeString($string, $eolChar = '\\n')
{
    if (PHP_CODESNIFFER_VERBOSITY > 1) {
        echo "\t*** START PHP TOKENIZING ***" . PHP_EOL;

        // Only needed for debug output: decides whether colour codes
        // are written when a merged newline is reported below.
        $isWin = false;
        if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
            $isWin = true;
        }
    }

    // Errors/warnings raised by the native tokenizer are deliberately
    // suppressed; whatever tokens it returns are processed.
    $tokens      = @token_get_all($string);
    $finalTokens = array();

    // $stackPtr indexes the raw $tokens, $newStackPtr indexes $finalTokens.
    $newStackPtr = 0;
    $numTokens   = count($tokens);

    // Index in $finalTokens of the most recently seen token that is not
    // T_WHITESPACE (updated lazily at the top of each iteration).
    $lastNotEmptyToken = 0;

    // Stack of raw positions of "?" tokens still waiting for their ":".
    $insideInlineIf = array();
    $insideUseGroup = false;

    $commentTokenizer = new PHP_CodeSniffer_Tokenizers_Comment();

    for ($stackPtr = 0; $stackPtr < $numTokens; $stackPtr++) {
        // Raw tokens are either array(id, content, line) or a plain
        // one-character string; the cast makes $token[0] valid for both.
        $token        = (array) $tokens[$stackPtr];
        $tokenIsArray = isset($token[1]);

        if (PHP_CODESNIFFER_VERBOSITY > 1) {
            if ($tokenIsArray === true) {
                $type    = token_name($token[0]);
                $content = PHP_CodeSniffer::prepareForOutput($token[1]);
            } else {
                $newToken = self::resolveSimpleToken($token[0]);
                $type     = $newToken['type'];
                $content  = PHP_CodeSniffer::prepareForOutput($token[0]);
            }

            echo "\tProcess token ";
            if ($tokenIsArray === true) {
                echo "[{$stackPtr}]";
            } else {
                echo " {$stackPtr} ";
            }

            echo ": {$type} => {$content}";
        }//end if

        if ($newStackPtr > 0 && $finalTokens[$newStackPtr - 1]['code'] !== T_WHITESPACE) {
            $lastNotEmptyToken = $newStackPtr - 1;
        }

        /*
            If we are using \r\n newline characters, the \r and \n are sometimes
            split over two tokens. This normally occurs after comments. We need
            to merge these two characters together so that our line endings are
            consistent for all lines.
        */

        if ($tokenIsArray === true && substr($token[1], -1) === "\r") {
            if (isset($tokens[$stackPtr + 1]) === true
                && is_array($tokens[$stackPtr + 1]) === true
                && $tokens[$stackPtr + 1][1][0] === "\n"
            ) {
                $token[1] .= "\n";
                if (PHP_CODESNIFFER_VERBOSITY > 1) {
                    if ($isWin === true) {
                        echo '\\n';
                    } else {
                        // ANSI SGR sequence: ESC[30;1m ... ESC[0m. The ESC
                        // byte is required or the terminal prints the
                        // bracket codes literally.
                        echo "\033[30;1m\\n\033[0m";
                    }
                }

                if ($tokens[$stackPtr + 1][1] === "\n") {
                    // This token's content has been merged into the previous,
                    // so we can skip it.
                    $tokens[$stackPtr + 1] = '';
                } else {
                    $tokens[$stackPtr + 1][1] = substr($tokens[$stackPtr + 1][1], 1);
                }
            }
        }//end if

        if (PHP_CODESNIFFER_VERBOSITY > 1) {
            echo PHP_EOL;
        }

        /*
            Parse doc blocks into something that can be easily iterated over.
        */

        if ($tokenIsArray === true && $token[0] === T_DOC_COMMENT) {
            $commentTokens = $commentTokenizer->tokenizeString($token[1], $eolChar, $newStackPtr);
            foreach ($commentTokens as $commentToken) {
                $finalTokens[$newStackPtr] = $commentToken;
                $newStackPtr++;
            }

            continue;
        }

        /*
            If this is a double quoted string, PHP will tokenize the whole
            thing which causes problems with the scope map when braces are
            within the string. So we need to merge the tokens together to
            provide a single string.
        */

        if ($tokenIsArray === false && ($token[0] === '"' || $token[0] === 'b"')) {
            // Binary casts need a special token.
            if ($token[0] === 'b"') {
                $finalTokens[$newStackPtr] = array(
                    'code'    => T_BINARY_CAST,
                    'type'    => 'T_BINARY_CAST',
                    'content' => 'b',
                );
                $newStackPtr++;
            }

            $tokenContent = '"';

            // Raw positions of "{" tokens opening an embedded expression;
            // a closing quote only ends the string when this is empty.
            $nestedVars = array();
            for ($i = $stackPtr + 1; $i < $numTokens; $i++) {
                $subToken        = (array) $tokens[$i];
                $subTokenIsArray = isset($subToken[1]);

                if ($subTokenIsArray === true) {
                    $tokenContent .= $subToken[1];
                    if ($subToken[1] === '{' && $subToken[0] !== T_ENCAPSED_AND_WHITESPACE) {
                        $nestedVars[] = $i;
                    }
                } else {
                    $tokenContent .= $subToken[0];
                    if ($subToken[0] === '}') {
                        array_pop($nestedVars);
                    }
                }

                if ($subTokenIsArray === false
                    && $subToken[0] === '"'
                    && empty($nestedVars) === true
                ) {
                    // We found the other end of the double quoted string.
                    break;
                }
            }//end for

            // Everything up to and including the closing quote is consumed.
            $stackPtr = $i;

            // Convert each line within the double quoted string to a
            // new token, so it conforms with other multiple line tokens.
            $tokenLines = explode($eolChar, $tokenContent);
            $numLines   = count($tokenLines);
            $newToken   = array();

            for ($j = 0; $j < $numLines; $j++) {
                $newToken['content'] = $tokenLines[$j];
                if ($j === $numLines - 1) {
                    // An empty last piece means the content ended with
                    // the EOL sequence; there is nothing left to emit.
                    if ($tokenLines[$j] === '') {
                        break;
                    }
                } else {
                    $newToken['content'] .= $eolChar;
                }

                $newToken['code']          = T_DOUBLE_QUOTED_STRING;
                $newToken['type']          = 'T_DOUBLE_QUOTED_STRING';
                $finalTokens[$newStackPtr] = $newToken;
                $newStackPtr++;
            }

            // Continue, as we're done with this token.
            continue;
        }//end if

        /*
            If this is a heredoc, PHP will tokenize the whole body, which
            causes problems because heredoc bodies usually do not contain
            real PHP code. The body is collapsed into line tokens, but the
            start and end heredoc tokens are left alone.
        */

        if ($tokenIsArray === true && $token[0] === T_START_HEREDOC) {
            // Add the start heredoc token to the final array.
            $finalTokens[$newStackPtr] = self::standardiseToken($token);

            // Check if this is actually a nowdoc and use a different token
            // to help the sniffs. A single quote can only appear in the
            // opener of a nowdoc, so search for it rather than relying on a
            // fixed offset: PHP accepts whitespace after "<<<" and a "b"
            // prefix, both of which shift the quote's position.
            $nowdoc = false;
            if (strpos($token[1], "'") !== false) {
                $finalTokens[$newStackPtr]['code'] = T_START_NOWDOC;
                $finalTokens[$newStackPtr]['type'] = 'T_START_NOWDOC';
                $nowdoc = true;
            }

            $tokenContent = '';
            for ($i = $stackPtr + 1; $i < $numTokens; $i++) {
                $subTokenIsArray = is_array($tokens[$i]);
                if ($subTokenIsArray === true && $tokens[$i][0] === T_END_HEREDOC) {
                    // We found the other end of the heredoc.
                    break;
                }

                if ($subTokenIsArray === true) {
                    $tokenContent .= $tokens[$i][1];
                } else {
                    $tokenContent .= $tokens[$i];
                }
            }

            if ($i === $numTokens) {
                // We got to the end of the file and never
                // found the closing token, so this probably wasn't
                // a heredoc.
                if (PHP_CODESNIFFER_VERBOSITY > 1) {
                    $type = $finalTokens[$newStackPtr]['type'];
                    echo "\t\t* failed to find the end of the here/nowdoc" . PHP_EOL;
                    echo "\t\t* token {$stackPtr} changed from {$type} to T_STRING" . PHP_EOL;
                }

                // $stackPtr is not advanced here, so the tokens that were
                // scanned above are processed normally on later iterations.
                $finalTokens[$newStackPtr]['code'] = T_STRING;
                $finalTokens[$newStackPtr]['type'] = 'T_STRING';
                $newStackPtr++;
                continue;
            }

            // $stackPtr now points at the T_END_HEREDOC token.
            $stackPtr = $i;
            $newStackPtr++;

            // Convert each line within the heredoc to a
            // new token, so it conforms with other multiple line tokens.
            $tokenLines = explode($eolChar, $tokenContent);
            $numLines   = count($tokenLines);
            $newToken   = array();

            for ($j = 0; $j < $numLines; $j++) {
                $newToken['content'] = $tokenLines[$j];
                if ($j === $numLines - 1) {
                    if ($tokenLines[$j] === '') {
                        break;
                    }
                } else {
                    $newToken['content'] .= $eolChar;
                }

                if ($nowdoc === true) {
                    $newToken['code'] = T_NOWDOC;
                    $newToken['type'] = 'T_NOWDOC';
                } else {
                    $newToken['code'] = T_HEREDOC;
                    $newToken['type'] = 'T_HEREDOC';
                }

                $finalTokens[$newStackPtr] = $newToken;
                $newStackPtr++;
            }//end for

            // Add the end heredoc token to the final array.
            $finalTokens[$newStackPtr] = self::standardiseToken($tokens[$stackPtr]);

            if ($nowdoc === true) {
                $finalTokens[$newStackPtr]['code'] = T_END_NOWDOC;
                $finalTokens[$newStackPtr]['type'] = 'T_END_NOWDOC';
            }

            $newStackPtr++;

            // Continue, as we're done with this token.
            continue;
        }//end if

        /*
            Before PHP 5.6, the ... operator was tokenized as three
            T_STRING_CONCAT tokens in a row. So look for and combine
            these tokens in earlier versions.
        */

        if ($tokenIsArray === false
            && $token[0] === '.'
            && isset($tokens[$stackPtr + 1]) === true
            && isset($tokens[$stackPtr + 2]) === true
            && $tokens[$stackPtr + 1] === '.'
            && $tokens[$stackPtr + 2] === '.'
        ) {
            $newToken            = array();
            $newToken['code']    = T_ELLIPSIS;
            $newToken['type']    = 'T_ELLIPSIS';
            $newToken['content'] = '...';
            $finalTokens[$newStackPtr] = $newToken;

            $newStackPtr++;
            $stackPtr += 2;
            continue;
        }

        /*
            Before PHP 5.6, the ** operator was tokenized as two
            T_MULTIPLY tokens in a row. So look for and combine
            these tokens in earlier versions.
        */

        if ($tokenIsArray === false
            && $token[0] === '*'
            && isset($tokens[$stackPtr + 1]) === true
            && $tokens[$stackPtr + 1] === '*'
        ) {
            $newToken            = array();
            $newToken['code']    = T_POW;
            $newToken['type']    = 'T_POW';
            $newToken['content'] = '**';
            $finalTokens[$newStackPtr] = $newToken;

            $newStackPtr++;
            $stackPtr++;
            continue;
        }

        /*
            Before PHP 5.6, the **= operator was tokenized as
            T_MULTIPLY followed by T_MUL_EQUAL. So look for and combine
            these tokens in earlier versions.
        */

        if ($tokenIsArray === false
            && $token[0] === '*'
            && isset($tokens[$stackPtr + 1]) === true
            && is_array($tokens[$stackPtr + 1]) === true
            && $tokens[$stackPtr + 1][1] === '*='
        ) {
            $newToken            = array();
            $newToken['code']    = T_POW_EQUAL;
            $newToken['type']    = 'T_POW_EQUAL';
            $newToken['content'] = '**=';
            $finalTokens[$newStackPtr] = $newToken;

            $newStackPtr++;
            $stackPtr++;
            continue;
        }

        /*
            Before PHP 7, the ?? operator was tokenized as
            T_INLINE_THEN followed by T_INLINE_THEN.
            So look for and combine these tokens in earlier versions.
        */

        if ($tokenIsArray === false
            && $token[0] === '?'
            && isset($tokens[$stackPtr + 1]) === true
            && $tokens[$stackPtr + 1][0] === '?'
        ) {
            $newToken            = array();
            $newToken['code']    = T_COALESCE;
            $newToken['type']    = 'T_COALESCE';
            $newToken['content'] = '??';
            $finalTokens[$newStackPtr] = $newToken;

            $newStackPtr++;
            $stackPtr++;
            continue;
        }

        /*
            Tokens after a double colon may look like scope openers,
            such as when writing code like Foo::NAMESPACE, but they are
            only ever variables or strings.
        */

        if ($stackPtr > 1
            && $tokens[$stackPtr - 1][0] === T_PAAMAYIM_NEKUDOTAYIM
            && $tokenIsArray === true
            && $token[0] !== T_STRING
            && $token[0] !== T_VARIABLE
            && $token[0] !== T_DOLLAR
            && isset(PHP_CodeSniffer_Tokens::$emptyTokens[$token[0]]) === false
        ) {
            $newToken            = array();
            $newToken['code']    = T_STRING;
            $newToken['type']    = 'T_STRING';
            $newToken['content'] = $token[1];
            $finalTokens[$newStackPtr] = $newToken;

            $newStackPtr++;
            continue;
        }

        /*
            Before PHP 7, the <=> operator was tokenized as
            T_IS_SMALLER_OR_EQUAL followed by T_GREATER_THAN.
            So look for and combine these tokens in earlier versions.
        */

        if ($tokenIsArray === true
            && $token[0] === T_IS_SMALLER_OR_EQUAL
            && isset($tokens[$stackPtr + 1]) === true
            && $tokens[$stackPtr + 1][0] === '>'
        ) {
            $newToken            = array();
            $newToken['code']    = T_SPACESHIP;
            $newToken['type']    = 'T_SPACESHIP';
            $newToken['content'] = '<=>';
            $finalTokens[$newStackPtr] = $newToken;

            $newStackPtr++;
            $stackPtr++;
            continue;
        }

        /*
            Emulate traits in PHP versions less than 5.4.
        */

        if ($tokenIsArray === true
            && $token[0] === T_STRING
            && strtolower($token[1]) === 'trait'
            && $tokens[$stackPtr - 1][0] !== T_OBJECT_OPERATOR
            && $tokens[$stackPtr - 1][0] !== T_PAAMAYIM_NEKUDOTAYIM
        ) {
            $finalTokens[$newStackPtr] = array(
                'content' => $token[1],
                'code'    => T_TRAIT,
                'type'    => 'T_TRAIT',
            );

            if (PHP_CODESNIFFER_VERBOSITY > 1) {
                echo "\t\t* token {$stackPtr} changed from T_STRING to T_TRAIT" . PHP_EOL;
            }

            $newStackPtr++;
            continue;
        }

        /*
            PHP doesn't assign a token to goto labels, so we have to.
            These are just string tokens with a single colon after them. Double
            colons are already tokenized and so don't interfere with this check.
            But we do have to account for CASE statements, that look just like
            goto labels.
        */

        if ($tokenIsArray === true
            && $token[0] === T_STRING
            && isset($tokens[$stackPtr + 1]) === true
            && $tokens[$stackPtr + 1] === ':'
            && $tokens[$stackPtr - 1][0] !== T_PAAMAYIM_NEKUDOTAYIM
        ) {
            $stopTokens = array(
                T_CASE               => true,
                T_SEMICOLON          => true,
                T_OPEN_CURLY_BRACKET => true,
                T_INLINE_THEN        => true,
            );

            // Walk back through the tokens already emitted to find what
            // started the current statement. The loop stops at index 0
            // without testing it against $stopTokens.
            for ($x = $newStackPtr - 1; $x > 0; $x--) {
                if (isset($stopTokens[$finalTokens[$x]['code']]) === true) {
                    break;
                }
            }

            // "case FOO:" and "$a ? FOO : ..." are not labels.
            if ($finalTokens[$x]['code'] !== T_CASE
                && $finalTokens[$x]['code'] !== T_INLINE_THEN
            ) {
                $finalTokens[$newStackPtr] = array(
                    'content' => $token[1] . ':',
                    'code'    => T_GOTO_LABEL,
                    'type'    => 'T_GOTO_LABEL',
                );

                if (PHP_CODESNIFFER_VERBOSITY > 1) {
                    echo "\t\t* token {$stackPtr} changed from T_STRING to T_GOTO_LABEL" . PHP_EOL;
                    echo "\t\t* skipping T_COLON token " . ($stackPtr + 1) . PHP_EOL;
                }

                $newStackPtr++;
                $stackPtr++;
                continue;
            }
        }//end if

        /*
            HHVM 3.5 tokenizes "else[\s]+if" as a T_ELSEIF token while PHP
            proper only tokenizes "elseif" as a T_ELSEIF token. So split
            up the HHVM token to make it look like proper PHP.
        */

        if ($tokenIsArray === true
            && $token[0] === T_ELSEIF
            && strtolower($token[1]) !== 'elseif'
        ) {
            // First four characters are "else", last two are "if" and
            // whatever lies between them is the whitespace.
            $finalTokens[$newStackPtr] = array(
                'content' => substr($token[1], 0, 4),
                'code'    => T_ELSE,
                'type'    => 'T_ELSE',
            );

            $newStackPtr++;
            $finalTokens[$newStackPtr] = array(
                'content' => substr($token[1], 4, -2),
                'code'    => T_WHITESPACE,
                'type'    => 'T_WHITESPACE',
            );

            $newStackPtr++;
            $finalTokens[$newStackPtr] = array(
                'content' => substr($token[1], -2),
                'code'    => T_IF,
                'type'    => 'T_IF',
            );

            if (PHP_CODESNIFFER_VERBOSITY > 1) {
                echo "\t\t* token {$stackPtr} changed from T_ELSEIF to T_ELSE/T_WHITESPACE/T_IF" . PHP_EOL;
            }

            $newStackPtr++;
            continue;
        }//end if

        /*
            HHVM 3.5 and 3.6 tokenizes a hashbang line such as #!/usr/bin/php
            as T_HASHBANG while PHP proper uses T_INLINE_HTML.
        */

        // Compared by name because the T_HASHBANG constant is not
        // referenced directly anywhere in this method.
        if ($tokenIsArray === true && token_name($token[0]) === 'T_HASHBANG') {
            $finalTokens[$newStackPtr] = array(
                'content' => $token[1],
                'code'    => T_INLINE_HTML,
                'type'    => 'T_INLINE_HTML',
            );

            if (PHP_CODESNIFFER_VERBOSITY > 1) {
                echo "\t\t* token {$stackPtr} changed from T_HASHBANG to T_INLINE_HTML" . PHP_EOL;
            }

            $newStackPtr++;
            continue;
        }//end if

        /*
            If this token has newlines in its content, split each line up
            and create a new token for each line. We do this so it's easier
            to ascertain where errors occur on a line.
            Note that $token[1] is the token's content.
        */

        if ($tokenIsArray === true && strpos($token[1], $eolChar) !== false) {
            $tokenLines = explode($eolChar, $token[1]);
            $numLines   = count($tokenLines);
            $newToken   = array(
                'type'    => token_name($token[0]),
                'code'    => $token[0],
                'content' => '',
            );

            for ($i = 0; $i < $numLines; $i++) {
                $newToken['content'] = $tokenLines[$i];
                if ($i === $numLines - 1) {
                    if ($tokenLines[$i] === '') {
                        break;
                    }
                } else {
                    $newToken['content'] .= $eolChar;
                }

                $finalTokens[$newStackPtr] = $newToken;
                $newStackPtr++;
            }
        } else {
            if ($tokenIsArray === true && $token[0] === T_STRING) {
                // Some T_STRING tokens should remain that way
                // due to their context.
                $context = array(
                    T_OBJECT_OPERATOR      => true,
                    T_FUNCTION             => true,
                    T_CLASS                => true,
                    T_EXTENDS              => true,
                    T_IMPLEMENTS           => true,
                    T_NEW                  => true,
                    T_CONST                => true,
                    T_NS_SEPARATOR         => true,
                    T_USE                  => true,
                    T_NAMESPACE            => true,
                    T_PAAMAYIM_NEKUDOTAYIM => true,
                );
                if (isset($context[$finalTokens[$lastNotEmptyToken]['code']]) === true) {
                    $finalTokens[$newStackPtr] = array(
                        'content' => $token[1],
                        'code'    => T_STRING,
                        'type'    => 'T_STRING',
                    );
                    $newStackPtr++;
                    continue;
                }
            }//end if

            // Resolve the token through the static cache first and fall
            // back to standardiseToken() when there is no cached entry.
            $newToken = null;
            if ($tokenIsArray === false) {
                if (isset(self::$_resolveTokenCache[$token[0]]) === true) {
                    $newToken = self::$_resolveTokenCache[$token[0]];
                }
            } else {
                // T_STRING is cached by lower-cased content, other tokens
                // by token id; T_CURLY_OPEN is never looked up in the cache.
                $cacheKey = null;
                if ($token[0] === T_STRING) {
                    $cacheKey = strtolower($token[1]);
                } else {
                    if ($token[0] !== T_CURLY_OPEN) {
                        $cacheKey = $token[0];
                    }
                }

                if ($cacheKey !== null && isset(self::$_resolveTokenCache[$cacheKey]) === true) {
                    $newToken            = self::$_resolveTokenCache[$cacheKey];
                    $newToken['content'] = $token[1];
                }
            }

            if ($newToken === null) {
                $newToken = self::standardiseToken($token);
            }

            // Convert colons that are actually the ELSE component of an
            // inline IF statement.
            if ($newToken['code'] === T_INLINE_THEN) {
                $insideInlineIf[] = $stackPtr;
            } else {
                if (empty($insideInlineIf) === false && $newToken['code'] === T_COLON) {
                    array_pop($insideInlineIf);
                    $newToken['code'] = T_INLINE_ELSE;
                    $newToken['type'] = 'T_INLINE_ELSE';
                }
            }

            // This is a special condition for T_ARRAY tokens used for
            // type hinting function arguments as being arrays. We want to keep
            // the parenthesis map clean, so let's tag these tokens as
            // T_ARRAY_HINT.
            if ($newToken['code'] === T_ARRAY) {
                // Scan forward from the array token itself: an opening
                // parenthesis first means array(...), a variable first
                // means a type hint.
                for ($i = $stackPtr; $i < $numTokens; $i++) {
                    if ($tokens[$i] === '(') {
                        break;
                    } else {
                        if ($tokens[$i][0] === T_VARIABLE) {
                            $newToken['code'] = T_ARRAY_HINT;
                            $newToken['type'] = 'T_ARRAY_HINT';
                            break;
                        }
                    }
                }
            }

            // This is a special case when checking PHP 5.5+ code in PHP < 5.5
            // where "finally" should be T_FINALLY instead of T_STRING.
            if ($newToken['code'] === T_STRING
                && strtolower($newToken['content']) === 'finally'
            ) {
                $newToken['code'] = T_FINALLY;
                $newToken['type'] = 'T_FINALLY';
            }

            // This is a special case for the PHP 5.5 classname::class syntax
            // where "class" should be T_STRING instead of T_CLASS.
            if (($newToken['code'] === T_CLASS || $newToken['code'] === T_FUNCTION)
                && $finalTokens[$newStackPtr - 1]['code'] === T_DOUBLE_COLON
            ) {
                $newToken['code'] = T_STRING;
                $newToken['type'] = 'T_STRING';
            }

            // This is a special case for PHP 5.6 use function and use const
            // where "function" and "const" should be T_STRING instead of T_FUNCTION
            // and T_CONST.
            if (($newToken['code'] === T_FUNCTION || $newToken['code'] === T_CONST)
                && $finalTokens[$lastNotEmptyToken]['code'] === T_USE
            ) {
                $newToken['code'] = T_STRING;
                $newToken['type'] = 'T_STRING';
            }

            // This is a special case for use groups in PHP 7+ where leaving
            // the curly braces as their normal tokens would confuse
            // the scope map and sniffs.
            if ($newToken['code'] === T_OPEN_CURLY_BRACKET
                && $finalTokens[$lastNotEmptyToken]['code'] === T_NS_SEPARATOR
            ) {
                $newToken['code'] = T_OPEN_USE_GROUP;
                $newToken['type'] = 'T_OPEN_USE_GROUP';
                $insideUseGroup   = true;
            }

            if ($insideUseGroup === true && $newToken['code'] === T_CLOSE_CURLY_BRACKET) {
                $newToken['code'] = T_CLOSE_USE_GROUP;
                $newToken['type'] = 'T_CLOSE_USE_GROUP';
                $insideUseGroup   = false;
            }

            $finalTokens[$newStackPtr] = $newToken;
            $newStackPtr++;
        }//end if
    }//end for

    if (PHP_CODESNIFFER_VERBOSITY > 1) {
        echo "\t*** END PHP TOKENIZING ***" . PHP_EOL;
    }

    return $finalTokens;
}