You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
337 lines
7.5 KiB
337 lines
7.5 KiB
<?php |
|
|
|
declare(strict_types=1); |
|
|
|
namespace Doctrine\Common\Lexer; |
|
|
|
use ReflectionClass; |
|
|
|
use function implode; |
|
use function in_array; |
|
use function preg_split; |
|
use function sprintf; |
|
use function substr; |
|
|
|
use const PREG_SPLIT_DELIM_CAPTURE; |
|
use const PREG_SPLIT_NO_EMPTY; |
|
use const PREG_SPLIT_OFFSET_CAPTURE; |
|
|
|
/** |
|
* Base class for writing simple lexers, i.e. for creating small DSLs. |
|
* |
|
* @psalm-type Token = array{value: int|string, type:string|int|null, position:int} |
|
*/ |
|
abstract class AbstractLexer |
|
{ |
|
/** |
|
* Lexer original input string. |
|
* |
|
* @var string |
|
*/ |
|
private $input; |
|
|
|
/** |
|
* Array of scanned tokens. |
|
* |
|
* Each token is an associative array containing three items: |
|
* - 'value' : the string value of the token in the input string |
|
* - 'type' : the type of the token (identifier, numeric, string, input |
|
* parameter, none) |
|
* - 'position' : the position of the token in the input string |
|
* |
|
* @var mixed[][] |
|
* @psalm-var list<Token> |
|
*/ |
|
private $tokens = []; |
|
|
|
/** |
|
* Current lexer position in input string. |
|
* |
|
* @var int |
|
*/ |
|
private $position = 0; |
|
|
|
/** |
|
* Current peek of current lexer position. |
|
* |
|
* @var int |
|
*/ |
|
private $peek = 0; |
|
|
|
/** |
|
* The next token in the input. |
|
* |
|
* @var mixed[]|null |
|
* @psalm-var Token|null |
|
*/ |
|
public $lookahead; |
|
|
|
/** |
|
* The last matched/seen token. |
|
* |
|
* @var mixed[]|null |
|
* @psalm-var Token|null |
|
*/ |
|
public $token; |
|
|
|
/** |
|
* Composed regex for input parsing. |
|
* |
|
* @var string|null |
|
*/ |
|
private $regex; |
|
|
|
/** |
|
* Sets the input data to be tokenized. |
|
* |
|
* The Lexer is immediately reset and the new input tokenized. |
|
* Any unprocessed tokens from any previous input are lost. |
|
* |
|
* @param string $input The input to be tokenized. |
|
* |
|
* @return void |
|
*/ |
|
public function setInput($input) |
|
{ |
|
$this->input = $input; |
|
$this->tokens = []; |
|
|
|
$this->reset(); |
|
$this->scan($input); |
|
} |
|
|
|
/** |
|
* Resets the lexer. |
|
* |
|
* @return void |
|
*/ |
|
public function reset() |
|
{ |
|
$this->lookahead = null; |
|
$this->token = null; |
|
$this->peek = 0; |
|
$this->position = 0; |
|
} |
|
|
|
/** |
|
* Resets the peek pointer to 0. |
|
* |
|
* @return void |
|
*/ |
|
public function resetPeek() |
|
{ |
|
$this->peek = 0; |
|
} |
|
|
|
/** |
|
* Resets the lexer position on the input to the given position. |
|
* |
|
* @param int $position Position to place the lexical scanner. |
|
* |
|
* @return void |
|
*/ |
|
public function resetPosition($position = 0) |
|
{ |
|
$this->position = $position; |
|
} |
|
|
|
/** |
|
* Retrieve the original lexer's input until a given position. |
|
* |
|
* @param int $position |
|
* |
|
* @return string |
|
*/ |
|
public function getInputUntilPosition($position) |
|
{ |
|
return substr($this->input, 0, $position); |
|
} |
|
|
|
/** |
|
* Checks whether a given token matches the current lookahead. |
|
* |
|
* @param int|string $type |
|
* |
|
* @return bool |
|
*/ |
|
public function isNextToken($type) |
|
{ |
|
return $this->lookahead !== null && $this->lookahead['type'] === $type; |
|
} |
|
|
|
/** |
|
* Checks whether any of the given tokens matches the current lookahead. |
|
* |
|
* @param list<int|string> $types |
|
* |
|
* @return bool |
|
*/ |
|
public function isNextTokenAny(array $types) |
|
{ |
|
return $this->lookahead !== null && in_array($this->lookahead['type'], $types, true); |
|
} |
|
|
|
/** |
|
* Moves to the next token in the input string. |
|
* |
|
* @return bool |
|
*/ |
|
public function moveNext() |
|
{ |
|
$this->peek = 0; |
|
$this->token = $this->lookahead; |
|
$this->lookahead = isset($this->tokens[$this->position]) |
|
? $this->tokens[$this->position++] : null; |
|
|
|
return $this->lookahead !== null; |
|
} |
|
|
|
/** |
|
* Tells the lexer to skip input tokens until it sees a token with the given value. |
|
* |
|
* @param string $type The token type to skip until. |
|
* |
|
* @return void |
|
*/ |
|
public function skipUntil($type) |
|
{ |
|
while ($this->lookahead !== null && $this->lookahead['type'] !== $type) { |
|
$this->moveNext(); |
|
} |
|
} |
|
|
|
/** |
|
* Checks if given value is identical to the given token. |
|
* |
|
* @param mixed $value |
|
* @param int|string $token |
|
* |
|
* @return bool |
|
*/ |
|
public function isA($value, $token) |
|
{ |
|
return $this->getType($value) === $token; |
|
} |
|
|
|
/** |
|
* Moves the lookahead token forward. |
|
* |
|
* @return mixed[]|null The next token or NULL if there are no more tokens ahead. |
|
* @psalm-return Token|null |
|
*/ |
|
public function peek() |
|
{ |
|
if (isset($this->tokens[$this->position + $this->peek])) { |
|
return $this->tokens[$this->position + $this->peek++]; |
|
} |
|
|
|
return null; |
|
} |
|
|
|
/** |
|
* Peeks at the next token, returns it and immediately resets the peek. |
|
* |
|
* @return mixed[]|null The next token or NULL if there are no more tokens ahead. |
|
* @psalm-return Token|null |
|
*/ |
|
public function glimpse() |
|
{ |
|
$peek = $this->peek(); |
|
$this->peek = 0; |
|
|
|
return $peek; |
|
} |
|
|
|
/** |
|
* Scans the input string for tokens. |
|
* |
|
* @param string $input A query string. |
|
* |
|
* @return void |
|
*/ |
|
protected function scan($input) |
|
{ |
|
if (! isset($this->regex)) { |
|
$this->regex = sprintf( |
|
'/(%s)|%s/%s', |
|
implode(')|(', $this->getCatchablePatterns()), |
|
implode('|', $this->getNonCatchablePatterns()), |
|
$this->getModifiers() |
|
); |
|
} |
|
|
|
$flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE; |
|
$matches = preg_split($this->regex, $input, -1, $flags); |
|
|
|
if ($matches === false) { |
|
// Work around https://bugs.php.net/78122 |
|
$matches = [[$input, 0]]; |
|
} |
|
|
|
foreach ($matches as $match) { |
|
// Must remain before 'value' assignment since it can change content |
|
$type = $this->getType($match[0]); |
|
|
|
$this->tokens[] = [ |
|
'value' => $match[0], |
|
'type' => $type, |
|
'position' => $match[1], |
|
]; |
|
} |
|
} |
|
|
|
/** |
|
* Gets the literal for a given token. |
|
* |
|
* @param int|string $token |
|
* |
|
* @return int|string |
|
*/ |
|
public function getLiteral($token) |
|
{ |
|
$className = static::class; |
|
$reflClass = new ReflectionClass($className); |
|
$constants = $reflClass->getConstants(); |
|
|
|
foreach ($constants as $name => $value) { |
|
if ($value === $token) { |
|
return $className . '::' . $name; |
|
} |
|
} |
|
|
|
return $token; |
|
} |
|
|
|
/** |
|
* Regex modifiers |
|
* |
|
* @return string |
|
*/ |
|
protected function getModifiers() |
|
{ |
|
return 'iu'; |
|
} |
|
|
|
/** |
|
* Lexical catchable patterns. |
|
* |
|
* @return string[] |
|
*/ |
|
abstract protected function getCatchablePatterns(); |
|
|
|
/** |
|
* Lexical non-catchable patterns. |
|
* |
|
* @return string[] |
|
*/ |
|
abstract protected function getNonCatchablePatterns(); |
|
|
|
/** |
|
* Retrieve token type. Also processes the token value if necessary. |
|
* |
|
* @param string $value |
|
* |
|
* @return int|string|null |
|
*/ |
|
abstract protected function getType(&$value); |
|
}
|
|
|