Augmentation vers version 3.3.0

This commit is contained in:
Gauvain Boiché
2020-03-31 15:31:03 +02:00
parent d926806907
commit a1864c0414
2618 changed files with 406015 additions and 31377 deletions

View File

@@ -0,0 +1,167 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder;
use s9e\RegexpBuilder\Input\InputInterface;
use s9e\RegexpBuilder\Output\OutputInterface;
use s9e\RegexpBuilder\Passes\CoalesceOptionalStrings;
use s9e\RegexpBuilder\Passes\CoalesceSingleCharacterPrefix;
use s9e\RegexpBuilder\Passes\GroupSingleCharacters;
use s9e\RegexpBuilder\Passes\MergePrefix;
use s9e\RegexpBuilder\Passes\MergeSuffix;
use s9e\RegexpBuilder\Passes\PromoteSingleStrings;
use s9e\RegexpBuilder\Passes\Recurse;
class Builder
{
    /**
     * @var InputInterface Input handler used to split strings into lists of values
     */
    protected $input;

    /**
     * @var MetaCharacters Registry of the configured meta-characters
     */
    protected $meta;

    /**
     * @var Runner Runs the configured passes over the list of strings
     */
    protected $runner;

    /**
     * @var Serializer Serializes the processed strings into an expression
     */
    protected $serializer;

    /**
     * Create a builder from an optional configuration
     *
     * Recognized keys are 'delimiter', 'input', 'inputOptions', 'meta', 'output' and
     * 'outputOptions'. Any key missing from $config takes the default listed below.
     *
     * @param array $config
     */
    public function __construct(array $config = [])
    {
        $defaults = [
            'delimiter'     => '/',
            'input'         => 'Bytes',
            'inputOptions'  => [],
            'meta'          => [],
            'output'        => 'Bytes',
            'outputOptions' => []
        ];
        // Array union keeps the caller's values and only fills in the missing keys
        $config = $config + $defaults;

        // Order matters: the meta-characters use the input, the serializer uses the meta-characters
        $this->setInput($config['input'], $config['inputOptions']);
        $this->setMeta($config['meta']);
        $this->setSerializer($config['output'], $config['outputOptions'], $config['delimiter']);
        $this->setRunner();
    }

    /**
     * Build and return a regular expression that matches all of the given strings
     *
     * @param  string[] $strings Literal strings to be matched
     * @return string            Regular expression (without delimiters)
     */
    public function build(array $strings)
    {
        // Deduplicate and sort the literals before doing anything else
        $literals = array_unique($strings);
        sort($literals);
        if ($this->isEmpty($literals))
        {
            return '';
        }

        $split    = $this->splitStrings($literals);
        $replaced = $this->meta->replaceMeta($split);
        $merged   = $this->runner->run($replaced);

        return $this->serializer->serializeStrings($merged);
    }

    /**
     * Test whether the list of strings is empty
     *
     * A list that only contains the empty string is considered empty too.
     *
     * @param  string[] $strings
     * @return bool
     */
    protected function isEmpty(array $strings)
    {
        if (empty($strings))
        {
            return true;
        }

        return ($strings === ['']);
    }

    /**
     * Set the InputInterface instance in $this->input
     *
     * The class is looked up in the Input sub-namespace using $inputType as its short name.
     *
     * @param  string $inputType
     * @param  array  $inputOptions
     * @return void
     */
    protected function setInput($inputType, array $inputOptions)
    {
        $inputClass  = __NAMESPACE__ . '\\Input\\' . $inputType;
        $this->input = new $inputClass($inputOptions);
    }

    /**
     * Set the MetaCharacters instance in $this->meta
     *
     * @param  array $map Map of meta-characters (keys) and their expression (values)
     * @return void
     */
    protected function setMeta(array $map)
    {
        $this->meta = new MetaCharacters($this->input);
        foreach ($map as $metaChar => $metaExpr)
        {
            $this->meta->add($metaChar, $metaExpr);
        }
    }

    /**
     * Set the Runner instance in $this->runner
     *
     * @return void
     */
    protected function setRunner()
    {
        $runner       = new Runner;
        $this->runner = $runner;

        // The passes are registered in the order they are meant to run. The Recurse pass
        // receives the runner itself
        $passes = [
            new MergePrefix,
            new GroupSingleCharacters,
            new Recurse($runner),
            new PromoteSingleStrings,
            new CoalesceOptionalStrings,
            new MergeSuffix,
            new CoalesceSingleCharacterPrefix
        ];
        foreach ($passes as $pass)
        {
            $runner->addPass($pass);
        }
    }

    /**
     * Set the Serializer instance in $this->serializer
     *
     * The output class is looked up in the Output sub-namespace using $outputType as its
     * short name.
     *
     * @param  string $outputType
     * @param  array  $outputOptions
     * @param  string $delimiter
     * @return void
     */
    protected function setSerializer($outputType, array $outputOptions, $delimiter)
    {
        $outputClass = __NAMESPACE__ . '\\Output\\' . $outputType;

        $this->serializer = new Serializer(
            new $outputClass($outputOptions),
            $this->meta,
            new Escaper($delimiter)
        );
    }

    /**
     * Split all given strings using the configured input
     *
     * @param  string[] $strings List of strings
     * @return array[]           List of arrays
     */
    protected function splitStrings(array $strings)
    {
        $split = [];
        foreach ($strings as $key => $string)
        {
            $split[$key] = $this->input->split($string);
        }

        return $split;
    }
}

View File

@@ -0,0 +1,59 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder;
class Escaper
{
    /**
     * @var array Characters to escape in a character class (character => escaped form)
     */
    public $inCharacterClass = ['-' => '\\-', '\\' => '\\\\', ']' => '\\]', '^' => '\\^'];

    /**
     * @var array Characters to escape outside of a character class (character => escaped form)
     */
    public $inLiteral = [
        '$'  => '\\$',  '(' => '\\(', ')' => '\\)', '*' => '\\*',
        '+'  => '\\+',  '.' => '\\.', '?' => '\\?', '[' => '\\[',
        '\\' => '\\\\', '^' => '\\^', '{' => '\\{', '|' => '\\|'
    ];

    /**
     * Register every character of the delimiter as a character that must be escaped
     *
     * @param string $delimiter Delimiter used in the final regexp
     */
    public function __construct($delimiter = '/')
    {
        // Each character of the delimiter is escaped with a backslash in both contexts
        foreach (str_split($delimiter, 1) as $char)
        {
            $this->inCharacterClass[$char] = '\\' . $char;
            $this->inLiteral[$char]        = '\\' . $char;
        }
    }

    /**
     * Escape given character to be used in a character class
     *
     * @param  string $char Original character
     * @return string       Escaped character, or the original if it needs no escaping
     */
    public function escapeCharacterClass($char)
    {
        return (isset($this->inCharacterClass[$char])) ? $this->inCharacterClass[$char] : $char;
    }

    /**
     * Escape given character to be used outside of a character class
     *
     * @param  string $char Original character
     * @return string       Escaped character, or the original if it needs no escaping
     */
    public function escapeLiteral($char)
    {
        return (isset($this->inLiteral[$char])) ? $this->inLiteral[$char] : $char;
    }
}

View File

@@ -0,0 +1,23 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Input;
abstract class BaseImplementation implements InputInterface
{
/**
* Accept the input options
*
* This base implementation does nothing with them.
*
* @param array $options
*/
public function __construct(array $options = [])
{
}
/**
* Split given string into a list of values
*
* Left abstract; each concrete input type provides its own implementation.
*
* @param string $string
* @return integer[]
*/
abstract public function split($string);
}

View File

@@ -0,0 +1,24 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Input;
class Bytes extends BaseImplementation
{
    /**
     * Split given string into the list of its byte values
     *
     * The empty string produces an empty list.
     *
     * {@inheritdoc}
     */
    public function split($string)
    {
        $values = [];
        if ($string !== '')
        {
            foreach (str_split($string, 1) as $byte)
            {
                $values[] = ord($byte);
            }
        }

        return $values;
    }
}

View File

@@ -0,0 +1,24 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Input;
interface InputInterface
{
/**
* Create the input handler
*
* @param array $options Implementation-specific options
*/
public function __construct(array $options = []);
/**
* Split given string into a list of values
*
* @param string $string String to be split
* @return integer[] List of integer values that represent the string
*/
public function split($string);
}

View File

@@ -0,0 +1,101 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Input;
use InvalidArgumentException;
class Utf8 extends BaseImplementation
{
    /**
     * @var bool Whether to use surrogates to represent higher codepoints
     */
    protected $useSurrogates;

    /**
     * Read the 'useSurrogates' option; surrogates are disabled unless it is set and non-empty
     *
     * {@inheritdoc}
     */
    public function __construct(array $options = [])
    {
        $this->useSurrogates = !empty($options['useSurrogates']);
    }

    /**
     * Split given UTF-8 string into a list of codepoints
     *
     * {@inheritdoc}
     *
     * @throws InvalidArgumentException if the string cannot be matched as UTF-8
     */
    public function split($string)
    {
        $result = preg_match_all('(.)us', $string, $matches);
        if ($result === false)
        {
            throw new InvalidArgumentException('Invalid UTF-8 string');
        }

        $chars = $matches[0];
        if ($this->useSurrogates)
        {
            return $this->charsToCodepointsWithSurrogates($chars);
        }

        return $this->charsToCodepoints($chars);
    }

    /**
     * Convert a list of UTF-8 characters into a list of Unicode codepoints
     *
     * @param  string[]  $chars
     * @return integer[]
     */
    protected function charsToCodepoints(array $chars)
    {
        $codepoints = [];
        foreach ($chars as $key => $char)
        {
            $codepoints[$key] = $this->cp($char);
        }

        return $codepoints;
    }

    /**
     * Convert a list of UTF-8 characters into a list of Unicode codepoints with surrogates
     *
     * Codepoints below 0x10000 are kept as-is, the others are replaced with two values.
     *
     * @param  string[]  $chars
     * @return integer[]
     */
    protected function charsToCodepointsWithSurrogates(array $chars)
    {
        $values = [];
        foreach ($chars as $char)
        {
            $codepoint = $this->cp($char);
            if ($codepoint >= 0x10000)
            {
                // 0xD7C0 is 0xD800 minus (0x10000 >> 10)
                $values[] = 0xD7C0 + ($codepoint >> 10);
                $values[] = 0xDC00 + ($codepoint & 0x3FF);

                continue;
            }
            $values[] = $codepoint;
        }

        return $values;
    }

    /**
     * Compute and return the Unicode codepoint for given UTF-8 char
     *
     * The length of the sequence is inferred from the value of the first byte. The constant
     * subtracted in each branch removes the marker bits of all the bytes in one operation.
     *
     * @param  string  $char UTF-8 char
     * @return integer
     */
    protected function cp($char)
    {
        $lead = ord($char[0]);
        if ($lead < 0xC0)
        {
            // Single byte, the value is the codepoint
            return $lead;
        }
        if ($lead < 0xE0)
        {
            // Two bytes
            return ($lead << 6) + ord($char[1]) - 0x3080;
        }
        if ($lead < 0xF0)
        {
            // Three bytes
            return ($lead << 12) + (ord($char[1]) << 6) + ord($char[2]) - 0xE2080;
        }

        // Four bytes
        return ($lead << 18) + (ord($char[1]) << 12) + (ord($char[2]) << 6) + ord($char[3]) - 0x3C82080;
    }
}

View File

@@ -0,0 +1,222 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder;
use InvalidArgumentException;
use s9e\RegexpBuilder\Input\InputInterface;
class MetaCharacters
{
    /**
     * @const Bit value that indicates whether a meta-character represents a single character usable
     *        in a character class
     */
    const IS_CHAR = 1;

    /**
     * @const Bit value that indicates whether a meta-character represents a quantifiable expression
     */
    const IS_QUANTIFIABLE = 2;

    /**
     * @var array Map of meta values and the expression they represent
     */
    protected $exprs = [];

    /**
     * @var InputInterface Used to split meta-characters into input values
     */
    protected $input;

    /**
     * @var array Map of meta-characters' input values and their meta value
     */
    protected $meta = [];

    /**
     * @param InputInterface $input
     */
    public function __construct(InputInterface $input)
    {
        $this->input = $input;
    }

    /**
     * Add a meta-character to the list
     *
     * @param  string $char Meta-character
     * @param  string $expr Regular expression
     * @return void
     *
     * @throws InvalidArgumentException if $char does not split into exactly one value or if
     *                                  $expr cannot be used as a regular expression
     */
    public function add($char, $expr)
    {
        $values = $this->input->split($char);
        if (count($values) !== 1)
        {
            throw new InvalidArgumentException('Meta-characters must be represented by exactly one character');
        }

        // Try the expression against an empty subject to find out whether it compiles
        $isValid = (@preg_match('(' . $expr . ')u', '') !== false);
        if (!$isValid)
        {
            throw new InvalidArgumentException("Invalid expression '" . $expr . "'");
        }

        $metaValue = $this->computeValue($expr);

        $this->exprs[$metaValue] = $expr;
        $this->meta[$values[0]]  = $metaValue;
    }

    /**
     * Get the expression associated with a meta value
     *
     * @param  integer $metaValue
     * @return string
     *
     * @throws InvalidArgumentException if the meta value is unknown
     */
    public function getExpression($metaValue)
    {
        if (isset($this->exprs[$metaValue]))
        {
            return $this->exprs[$metaValue];
        }

        throw new InvalidArgumentException('Invalid meta value ' . $metaValue);
    }

    /**
     * Return whether a given value represents a single character usable in a character class
     *
     * Any non-negative value qualifies. Negative values qualify if their IS_CHAR bit is set.
     *
     * @param  integer $value
     * @return bool
     */
    public static function isChar($value)
    {
        if ($value >= 0)
        {
            return true;
        }

        return (bool) ($value & self::IS_CHAR);
    }

    /**
     * Return whether a given value represents a quantifiable expression
     *
     * Any non-negative value qualifies. Negative values qualify if their IS_QUANTIFIABLE bit
     * is set.
     *
     * @param  integer $value
     * @return bool
     */
    public static function isQuantifiable($value)
    {
        if ($value >= 0)
        {
            return true;
        }

        return (bool) ($value & self::IS_QUANTIFIABLE);
    }

    /**
     * Replace values from meta-characters in a list of strings with their meta value
     *
     * @param  array[] $strings
     * @return array[]
     */
    public function replaceMeta(array $strings)
    {
        foreach ($strings as $i => $string)
        {
            foreach ($string as $j => $value)
            {
                if (isset($this->meta[$value]))
                {
                    $strings[$i][$j] = $this->meta[$value];
                }
            }
        }

        return $strings;
    }

    /**
     * Compute and return a value for given expression
     *
     * Values are meant to be a unique negative integer. The last 2 bits indicate whether the
     * expression is quantifiable and/or represents a single character.
     *
     * @param  string  $expr Regular expression
     * @return integer
     */
    protected function computeValue($expr)
    {
        // Name of the method that tests each property, and the bit that records its result
        $properties = [
            'exprIsChar'         => self::IS_CHAR,
            'exprIsQuantifiable' => self::IS_QUANTIFIABLE
        ];

        // Start from a negative multiple of 2^n, which leaves the n lowest bits unset
        $value = (1 + count($this->meta)) * -pow(2, count($properties));
        foreach ($properties as $methodName => $bitValue)
        {
            if (!$this->$methodName($expr))
            {
                continue;
            }
            $value |= $bitValue;
        }

        return $value;
    }

    /**
     * Test whether given expression represents a single character usable in a character class
     *
     * @param  string $expr
     * @return bool
     */
    protected function exprIsChar($expr)
    {
        return $this->matchesAny(
            $expr,
            [
                // Escaped literal or escape sequence such as \w but not \R
                '(^\\\\[adefhnrstvwDHNSVW\\W]$)D',

                // Unicode properties such as \pL or \p{Lu}
                '(^\\\\p(?:.|\\{[^}]+\\})$)Di',

                // An escape sequence such as \x1F or \x{2600}
                '(^\\\\x(?:[0-9a-f]{2}|\\{[^}]+\\})$)Di'
            ]
        );
    }

    /**
     * Test whether given expression is quantifiable
     *
     * Anything that represents a single character is quantifiable as well.
     *
     * @param  string $expr
     * @return bool
     */
    protected function exprIsQuantifiable($expr)
    {
        $regexps = [
            // A dot or \R
            '(^(?:\\.|\\\\R)$)D',

            // A character class
            '(^\\[\\^?(?:([^\\\\\\]]|\\\\.)(?:-(?-1))?)++\\]$)D'
        ];
        if ($this->matchesAny($expr, $regexps))
        {
            return true;
        }

        return $this->exprIsChar($expr);
    }

    /**
     * Test whether given expression matches any of the given regexps
     *
     * @param  string   $expr
     * @param  string[] $regexps
     * @return bool
     */
    protected function matchesAny($expr, array $regexps)
    {
        $matched = false;
        foreach ($regexps as $regexp)
        {
            if (preg_match($regexp, $expr))
            {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
}

View File

@@ -0,0 +1,62 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Output;
use InvalidArgumentException;
abstract class BaseImplementation implements OutputInterface
{
    /**
     * @var integer Highest value accepted by validate()
     */
    protected $maxValue = 0;

    /**
     * @var integer Lowest value accepted by validate()
     */
    protected $minValue = 0;

    /**
     * Accept the output options; this base implementation does nothing with them
     *
     * @param array $options
     */
    public function __construct(array $options = [])
    {
    }

    /**
     * Validate given value then serialize it
     *
     * {@inheritdoc}
     */
    public function output($value)
    {
        $this->validate($value);
        $serialized = $this->outputValidValue($value);

        return $serialized;
    }

    /**
     * Validate given value
     *
     * @param  integer $value
     * @return void
     *
     * @throws InvalidArgumentException if the value is lower than $minValue or greater than
     *                                  $maxValue
     */
    protected function validate($value)
    {
        $isTooLow  = ($value < $this->minValue);
        $isTooHigh = ($value > $this->maxValue);
        if ($isTooLow || $isTooHigh)
        {
            $message = 'Value ' . $value . ' is out of bounds (' . $this->minValue . '..' . $this->maxValue . ')';

            throw new InvalidArgumentException($message);
        }
    }

    /**
     * Serialize a valid value into a character
     *
     * @param  integer $value
     * @return string
     */
    abstract protected function outputValidValue($value);
}

View File

@@ -0,0 +1,22 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Output;
class Bytes extends BaseImplementation
{
    /**
     * {@inheritdoc}
     *
     * Highest value that fits in one byte.
     */
    protected $maxValue = 255;

    /**
     * Return the one-byte string for given value
     *
     * {@inheritdoc}
     */
    protected function outputValidValue($value)
    {
        $byte = chr($value);

        return $byte;
    }
}

View File

@@ -0,0 +1,24 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Output;
class JavaScript extends PrintableAscii
{
    /** {@inheritdoc} */
    protected $maxValue = 0x10FFFF;

    /**
     * Escape given codepoint as \uXXXX, or as \u{...} if it is greater than 0xFFFF
     *
     * {@inheritdoc}
     */
    protected function escapeUnicode($cp)
    {
        if ($cp > 0xFFFF)
        {
            // Braced form, no zero-padding
            return sprintf('\\u{%' . $this->hexCase . '}', $cp);
        }

        // Fixed-width form, zero-padded to 4 digits
        return sprintf('\\u%04' . $this->hexCase, $cp);
    }
}

View File

@@ -0,0 +1,19 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Output;
interface OutputInterface
{
/**
* Serialize a value into a character
*
* @param integer $value Value to be serialized
* @return string Serialized form of the value
*/
public function output($value);
}

View File

@@ -0,0 +1,22 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Output;
class PHP extends PrintableAscii
{
    /** {@inheritdoc} */
    protected $maxValue = 0x10FFFF;

    /**
     * Escape given codepoint as \x{...}, zero-padded to at least 4 hexadecimal digits
     *
     * {@inheritdoc}
     */
    protected function escapeUnicode($cp)
    {
        $format = '\\x{%04' . $this->hexCase . '}';

        return sprintf($format, $cp);
    }
}

View File

@@ -0,0 +1,74 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Output;
abstract class PrintableAscii extends BaseImplementation
{
    /**
     * @var string 'x' for lowercase hexadecimal symbols, 'X' for uppercase
     */
    protected $hexCase;

    /**
     * Read the 'case' option; hexadecimal symbols are uppercase unless it is set to 'lower'
     *
     * {@inheritdoc}
     */
    public function __construct(array $options = [])
    {
        $this->hexCase = 'X';
        if (isset($options['case']) && $options['case'] === 'lower')
        {
            $this->hexCase = 'x';
        }
    }

    /**
     * Escape given ASCII codepoint
     *
     * @param  integer $cp
     * @return string      \x followed by at least two hexadecimal digits
     */
    protected function escapeAscii($cp)
    {
        $hex = sprintf('%02' . $this->hexCase, $cp);

        return '\\x' . $hex;
    }

    /**
     * Escape given control code
     *
     * Tab, line feed and carriage return use their short escape sequence, everything else
     * is escaped via escapeAscii().
     *
     * @param  integer $cp
     * @return string
     */
    protected function escapeControlCode($cp)
    {
        $shortForms = [9 => '\\t', 10 => '\\n', 13 => '\\r'];
        if (!isset($shortForms[$cp]))
        {
            return $this->escapeAscii($cp);
        }

        return $shortForms[$cp];
    }

    /**
     * Output the representation of a unicode character
     *
     * @param  integer $cp Unicode codepoint
     * @return string
     */
    abstract protected function escapeUnicode($cp);

    /**
     * Output values from 32 to 126 literally and escape everything else
     *
     * {@inheritdoc}
     */
    protected function outputValidValue($value)
    {
        if ($value < 32)
        {
            $output = $this->escapeControlCode($value);
        }
        elseif ($value < 127)
        {
            $output = chr($value);
        }
        elseif ($value > 255)
        {
            $output = $this->escapeUnicode($value);
        }
        else
        {
            $output = $this->escapeAscii($value);
        }

        return $output;
    }
}

View File

@@ -0,0 +1,54 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Output;
use InvalidArgumentException;
class Utf8 extends BaseImplementation
{
    /** {@inheritdoc} */
    protected $maxValue = 0x10FFFF;

    /**
     * Encode given codepoint as a sequence of 1 to 4 bytes
     *
     * {@inheritdoc}
     */
    protected function outputValidValue($value)
    {
        if ($value < 0x80)
        {
            return chr($value);
        }

        // The trailing bytes are built from the right, 6 bits at a time
        $trail = chr(0x80 | ($value & 0x3F));
        if ($value < 0x800)
        {
            return chr(0xC0 | ($value >> 6)) . $trail;
        }

        $trail = chr(0x80 | (($value >> 6) & 0x3F)) . $trail;
        if ($value < 0x10000)
        {
            return chr(0xE0 | ($value >> 12)) . $trail;
        }

        $trail = chr(0x80 | (($value >> 12) & 0x3F)) . $trail;

        return chr(0xF0 | ($value >> 18)) . $trail;
    }

    /**
     * Reject surrogates before applying the parent's bounds check
     *
     * {@inheritdoc}
     *
     * @throws InvalidArgumentException if the value is in the 0xD800..0xDFFF range
     */
    protected function validate($value)
    {
        $isSurrogate = ($value >= 0xD800 && $value <= 0xDFFF);
        if ($isSurrogate)
        {
            $message = sprintf('Surrogate 0x%X is not a valid UTF-8 character', $value);

            throw new InvalidArgumentException($message);
        }

        parent::validate($value);
    }
}

View File

@@ -0,0 +1,148 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
abstract class AbstractPass implements PassInterface
{
    /**
     * @var bool Whether the current set of strings is optional
     */
    protected $isOptional;

    /**
     * Run this pass, wrapped between beforeRun() and afterRun()
     *
     * {@inheritdoc}
     */
    public function run(array $strings)
    {
        $strings = $this->beforeRun($strings);
        if ($this->canRun($strings))
        {
            $strings = $this->runPass($strings);
        }

        return $this->afterRun($strings);
    }

    /**
     * Process the list of strings after the pass is run
     *
     * Puts an empty string back at the start of the list if the set is optional and the
     * list does not already start with one.
     *
     * @param  array[] $strings
     * @return array[]
     */
    protected function afterRun(array $strings)
    {
        if (!$this->isOptional || $strings[0] === [])
        {
            return $strings;
        }
        array_unshift($strings, []);

        return $strings;
    }

    /**
     * Prepare the list of strings before the pass is run
     *
     * Records whether the list starts with an empty string and removes it if it does.
     *
     * @param  array[] $strings
     * @return array[]
     */
    protected function beforeRun(array $strings)
    {
        $startsWithEmpty  = (isset($strings[0]) && $strings[0] === []);
        $this->isOptional = $startsWithEmpty;
        if ($startsWithEmpty)
        {
            array_shift($strings);
        }

        return $strings;
    }

    /**
     * Test whether this pass can be run on a given list of strings
     *
     * Always true here; passes may override it.
     *
     * @param  array[] $strings
     * @return bool
     */
    protected function canRun(array $strings)
    {
        return true;
    }

    /**
     * Run this pass on a list of strings
     *
     * @param  array[] $strings
     * @return array[]
     */
    abstract protected function runPass(array $strings);

    /**
     * Test whether given string has an optional suffix
     *
     * That is, whether its last element is an alternation that starts with an empty string.
     *
     * @param  array $string
     * @return bool
     */
    protected function hasOptionalSuffix(array $string)
    {
        $last = end($string);
        if (!is_array($last))
        {
            return false;
        }

        return ($last[0] === []);
    }

    /**
     * Test whether given string contains a single alternation made of single values
     *
     * @param  array $string
     * @return bool
     */
    protected function isCharacterClassString(array $string)
    {
        if (!$this->isSingleAlternationString($string))
        {
            return false;
        }

        return $this->isSingleCharStringList($string[0]);
    }

    /**
     * Test whether given string contains one single element that is an alternation
     *
     * @param  array $string
     * @return bool
     */
    protected function isSingleAlternationString(array $string)
    {
        if (count($string) !== 1)
        {
            return false;
        }

        return is_array($string[0]);
    }

    /**
     * Test whether given string contains a single character value
     *
     * @param  array $string
     * @return bool
     */
    protected function isSingleCharString(array $string)
    {
        if (count($string) !== 1)
        {
            return false;
        }

        return !is_array($string[0]);
    }

    /**
     * Test whether given list of strings contains nothing but single-char strings
     *
     * An empty list passes the test.
     *
     * @param  array[] $strings
     * @return bool
     */
    protected function isSingleCharStringList(array $strings)
    {
        $allSingle = true;
        foreach ($strings as $string)
        {
            if (!$this->isSingleCharString($string))
            {
                $allSingle = false;
                break;
            }
        }

        return $allSingle;
    }
}

View File

@@ -0,0 +1,138 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
/**
* Replaces (?:ab?|b)? with a?b?
*/
class CoalesceOptionalStrings extends AbstractPass
{
    /**
     * Only run on optional sets that contain more than one string
     *
     * {@inheritdoc}
     */
    protected function canRun(array $strings)
    {
        if (!$this->isOptional)
        {
            return false;
        }

        return (count($strings) > 1);
    }

    /**
     * {@inheritdoc}
     */
    protected function runPass(array $strings)
    {
        foreach ($this->getPrefixGroups($strings) as $serializedSuffix => $prefixStrings)
        {
            $suffix = unserialize($serializedSuffix);

            // Every string outside the group must be accounted for by the group's optional suffix
            $otherStrings = array_diff_key($strings, $prefixStrings);
            if ($suffix !== $this->buildSuffix($otherStrings))
            {
                continue;
            }

            // The coalesced strings carry their own optionality
            $this->isOptional = false;

            return $this->buildCoalescedStrings($prefixStrings, $suffix);
        }

        return $strings;
    }

    /**
     * Build the final list of coalesced strings
     *
     * @param  array[] $prefixStrings
     * @param  array   $suffix
     * @return array[]
     */
    protected function buildCoalescedStrings(array $prefixStrings, array $suffix)
    {
        $prefix = $this->runPass($this->buildPrefix($prefixStrings));

        $isSingleOptional = (count($prefix) === 1 && $prefix[0][0][0] === []);
        if ($isSingleOptional)
        {
            // If the prefix has been remerged into a list of strings which contains only one
            // string of which the first element is an optional alternation, we only need to
            // append the suffix
            $prefix[0][] = $suffix;

            return $prefix;
        }

        // Put the current list of strings that form the prefix into a new list of strings, of
        // which the only string is composed of our optional prefix followed by the suffix
        array_unshift($prefix, []);

        return [[$prefix, $suffix]];
    }

    /**
     * Build the list of strings used as prefix
     *
     * @param  array[] $strings
     * @return array[]
     */
    protected function buildPrefix(array $strings)
    {
        $prefixes = [];
        foreach ($strings as $prefix)
        {
            // Drop the last element of the string, which is its suffix
            array_pop($prefix);
            $prefixes[] = $prefix;
        }

        return $prefixes;
    }

    /**
     * Build a list of strings that matches any given strings or nothing
     *
     * Will unpack groups of single characters
     *
     * @param  array[] $strings
     * @return array[]
     */
    protected function buildSuffix(array $strings)
    {
        // The leading empty string is what makes the suffix optional
        $suffix = [[]];
        foreach ($strings as $string)
        {
            $elements = ($this->isCharacterClassString($string)) ? $string[0] : [$string];
            foreach ($elements as $element)
            {
                $suffix[] = $element;
            }
        }

        return $suffix;
    }

    /**
     * Get the list of potential prefix strings grouped by identical suffix
     *
     * @param  array[] $strings
     * @return array            Serialized suffix as keys, strings indexed by original key as values
     */
    protected function getPrefixGroups(array $strings)
    {
        $groups = [];
        foreach ($strings as $key => $string)
        {
            if (!$this->hasOptionalSuffix($string))
            {
                continue;
            }
            $groupKey = serialize(end($string));

            $groups[$groupKey][$key] = $string;
        }

        return $groups;
    }
}

View File

@@ -0,0 +1,81 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
/**
* Replaces (?:ab|bb|c) with (?:[ab]b|c)
*/
class CoalesceSingleCharacterPrefix extends AbstractPass
{
    /**
     * {@inheritdoc}
     */
    protected function runPass(array $strings)
    {
        $merged = [];
        foreach ($this->getEligibleKeys($strings) as $groupKeys)
        {
            // Start from a copy of the group's first string, whose first element is replaced
            // with an empty character class
            $mergedString    = $strings[$groupKeys[0]];
            $mergedString[0] = [];

            // Move the first element of every string of the group into the character class
            // and remove the original string
            foreach ($groupKeys as $groupKey)
            {
                $mergedString[0][] = [$strings[$groupKey][0]];
                unset($strings[$groupKey]);
            }
            $merged[] = $mergedString;
        }

        // Merged strings come first, followed by whatever was left untouched
        return array_merge($merged, $strings);
    }

    /**
     * Filter the list of eligible keys and keep those that have at least two matches
     *
     * @param  array[] $eligibleKeys List of lists of keys
     * @return array[]
     */
    protected function filterEligibleKeys(array $eligibleKeys)
    {
        $kept = [];
        foreach ($eligibleKeys as $keys)
        {
            if (count($keys) <= 1)
            {
                continue;
            }
            $kept[] = $keys;
        }

        return $kept;
    }

    /**
     * Get a list of keys of strings eligible to be merged together, grouped by suffix
     *
     * A string is eligible if its first element is not an alternation and it has at least
     * one more element after it.
     *
     * @param  array[] $strings
     * @return array[]
     */
    protected function getEligibleKeys(array $strings)
    {
        $keysBySuffix = [];
        foreach ($strings as $key => $string)
        {
            if (is_array($string[0]) || !isset($string[1]))
            {
                continue;
            }

            // Everything after the first element is used to identify the group
            $groupKey = serialize(array_slice($string, 1));

            $keysBySuffix[$groupKey][] = $key;
        }

        return $this->filterEligibleKeys($keysBySuffix);
    }
}

View File

@@ -0,0 +1,42 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
/**
* Enables other passes to replace (?:[xy]|a[xy]) with a?[xy]
*/
class GroupSingleCharacters extends AbstractPass
{
    /**
     * {@inheritdoc}
     */
    protected function runPass(array $strings)
    {
        $singles     = $this->getSingleCharStrings($strings);
        $singleCount = count($singles);

        // Nothing to group unless there are several singles mixed with other strings
        if ($singleCount <= 1 || $singleCount >= count($strings))
        {
            return $strings;
        }

        // Remove the singles from the input, then prepend them as a new string
        $others = array_diff_key($strings, $singles);
        array_unshift($others, [array_values($singles)]);

        return $others;
    }

    /**
     * Return an array of every single-char string in given list of strings
     *
     * The original keys are preserved.
     *
     * @param  array[] $strings
     * @return array[]
     */
    protected function getSingleCharStrings(array $strings)
    {
        $singles = [];
        foreach ($strings as $key => $string)
        {
            if ($this->isSingleCharString($string))
            {
                $singles[$key] = $string;
            }
        }

        return $singles;
    }
}

View File

@@ -0,0 +1,104 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
/**
* Replaces (?:axx|ayy) with a(?:xx|yy)
*/
class MergePrefix extends AbstractPass
{
    /**
     * {@inheritdoc}
     */
    protected function runPass(array $strings)
    {
        $merged = [];
        foreach ($this->getStringsByPrefix($strings) as $group)
        {
            // Groups made of a single string are kept as-is
            if (isset($group[1]))
            {
                $merged[] = $this->mergeStrings($group);
            }
            else
            {
                $merged[] = $group[0];
            }
        }

        return $merged;
    }

    /**
     * Get the number of leading elements common to all given strings
     *
     * The first element is assumed to be common, so the result is at least 1.
     *
     * @param  array[] $strings
     * @return integer
     */
    protected function getPrefixLength(array $strings)
    {
        $max = count($strings[0]);
        for ($len = 1; $len < $max; ++$len)
        {
            if (!$this->stringsMatch($strings, $len))
            {
                break;
            }
        }

        return $len;
    }

    /**
     * Return given strings grouped by their first element
     *
     * NOTE: assumes that this pass is run before the first element of any string could be replaced
     *
     * @param  array[] $strings
     * @return array[]
     */
    protected function getStringsByPrefix(array $strings)
    {
        $groups = [];
        foreach ($strings as $string)
        {
            $first = $string[0];

            $groups[$first][] = $string;
        }

        return $groups;
    }

    /**
     * Merge given strings into a new single string
     *
     * The new string is made of the common prefix followed by an alternation of what
     * remains of each string.
     *
     * @param  array[] $strings
     * @return array
     */
    protected function mergeStrings(array $strings)
    {
        $len    = $this->getPrefixLength($strings);
        $merged = array_slice($strings[0], 0, $len);
        foreach ($strings as $string)
        {
            $remainder = array_slice($string, $len);

            $merged[$len][] = $remainder;
        }

        return $merged;
    }

    /**
     * Test whether all given strings' elements match at given position
     *
     * The element of the first string is used as reference.
     *
     * @param  array[] $strings
     * @param  integer $pos
     * @return bool
     */
    protected function stringsMatch(array $strings, $pos)
    {
        $expected = $strings[0][$pos];
        foreach ($strings as $string)
        {
            $matches = (isset($string[$pos]) && $string[$pos] === $expected);
            if (!$matches)
            {
                return false;
            }
        }

        return true;
    }
}

View File

@@ -0,0 +1,83 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
/**
* Replaces (?:aax|bbx) with (?:aa|bb)x
*/
class MergeSuffix extends AbstractPass
{
    /**
     * Only run if there are several strings and they all end with the same element
     *
     * {@inheritdoc}
     */
    protected function canRun(array $strings)
    {
        if (count($strings) <= 1)
        {
            return false;
        }

        return $this->hasMatchingSuffix($strings);
    }

    /**
     * {@inheritdoc}
     */
    protected function runPass(array $strings)
    {
        // Collect the common trailing elements, last one first, in their original order
        $suffix = [];
        while ($this->hasMatchingSuffix($strings))
        {
            array_unshift($suffix, end($strings[0]));
            $strings = $this->pop($strings);
        }

        // What remains of the strings becomes an alternation placed before the suffix
        $merged = array_merge([$strings], $suffix);

        return [$merged];
    }

    /**
     * Test whether all given strings have the same last element
     *
     * The last element of the second string is used as reference.
     *
     * @param  array[] $strings
     * @return bool
     */
    protected function hasMatchingSuffix(array $strings)
    {
        $expected = end($strings[1]);
        foreach ($strings as $string)
        {
            $last = end($string);
            if ($last !== $expected)
            {
                return false;
            }
        }

        // end() returns false on an empty array, in which case there is no suffix to match
        return ($expected !== false);
    }

    /**
     * Remove the last element of every string
     *
     * @param  array[] $strings Original strings
     * @return array[]          Processed strings
     */
    protected function pop(array $strings)
    {
        $total = count($strings);
        for ($i = $total - 1; $i >= 0; --$i)
        {
            array_pop($strings[$i]);
        }

        // Remove empty elements then prepend one back at the start of the array if applicable
        $remaining = array_filter($strings);
        if (count($remaining) < $total)
        {
            array_unshift($remaining, []);
        }

        return $remaining;
    }
}

View File

@@ -0,0 +1,19 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
interface PassInterface
{
/**
* Run this pass on a list of strings and return the resulting list
*
* @param array[] $strings Original strings
* @return array[] Modified strings
*/
public function run(array $strings);
}

View File

@@ -0,0 +1,47 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
/**
* Replaces alternations that only contain one string to allow other passes to replace
* (?:a0?x|bx) with (?:a0?|b)x
*/
class PromoteSingleStrings extends AbstractPass
{
    /**
     * {@inheritdoc}
     */
    protected function runPass(array $strings)
    {
        $promoted = [];
        foreach ($strings as $key => $string)
        {
            $promoted[$key] = $this->promoteSingleStrings($string);
        }

        return $promoted;
    }

    /**
     * Promote single strings found inside given string
     *
     * Any alternation that contains exactly one string is replaced with the elements of
     * that string.
     *
     * @param  array $string Original string
     * @return array         Modified string
     */
    protected function promoteSingleStrings(array $string)
    {
        $promoted = [];
        foreach ($string as $element)
        {
            $isSingleString = (is_array($element) && count($element) === 1);
            if (!$isSingleString)
            {
                $promoted[] = $element;

                continue;
            }

            // Splice the elements of the only string in place of the alternation
            $promoted = array_merge($promoted, $element[0]);
        }

        return $promoted;
    }
}

View File

@@ -0,0 +1,58 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Passes;
use s9e\RegexpBuilder\Runner;
/**
* Enables passes to be run recursively into alternations to replace a(?:x0|x1|y0|y1) with a[xy][01]
*/
class Recurse extends AbstractPass
{
    /**
     * @var Runner Runner used on each alternation
     */
    protected $runner;

    /**
     * @param Runner $runner
     */
    public function __construct(Runner $runner)
    {
        $this->runner = $runner;
    }

    /**
     * {@inheritdoc}
     */
    protected function runPass(array $strings)
    {
        $recursed = [];
        foreach ($strings as $key => $string)
        {
            $recursed[$key] = $this->recurseString($string);
        }

        return $recursed;
    }

    /**
     * Recurse into given string and run all passes on each element
     *
     * Only the elements that are arrays (alternations) are processed.
     *
     * @param  array $string
     * @return array
     */
    protected function recurseString(array $string)
    {
        // The runner may contain this very pass, in which case running it overwrites
        // $this->isOptional. Save it here and restore it afterwards
        $savedIsOptional = $this->isOptional;
        foreach ($string as $pos => $element)
        {
            if (!is_array($element))
            {
                continue;
            }
            $string[$pos] = $this->runner->run($element);
        }
        $this->isOptional = $savedIsOptional;

        return $string;
    }
}

View File

@@ -0,0 +1,45 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder;
use s9e\RegexpBuilder\Passes\PassInterface;
class Runner
{
	/**
	* @var PassInterface[] Passes in the order they were added, which is the order they run in
	*/
	protected $passes = [];

	/**
	* Append a pass to the end of the list
	*
	* @param  PassInterface $pass
	* @return void
	*/
	public function addPass(PassInterface $pass)
	{
		$this->passes[] = $pass;
	}

	/**
	* Apply every registered pass, in order, to the list of strings
	*
	* Each pass receives the list returned by the previous one. With no registered pass
	* the list is returned unchanged.
	*
	* @param  array[] $strings
	* @return array[]
	*/
	public function run(array $strings)
	{
		return array_reduce(
			$this->passes,
			static function ($current, $pass)
			{
				return $pass->run($current);
			},
			$strings
		);
	}
}

View File

@@ -0,0 +1,279 @@
<?php
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2018 The s9e Authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder;
use s9e\RegexpBuilder\MetaCharacters;
use s9e\RegexpBuilder\Output\OutputInterface;
class Serializer
{
	/**
	* @var Escaper Provides the escapeLiteral() and escapeCharacterClass() methods used on output
	*/
	protected $escaper;

	/**
	* @var MetaCharacters Provides the expression that corresponds to a negative (meta) value
	*/
	protected $meta;

	/**
	* @var OutputInterface Converts a non-negative value into its textual output
	*/
	protected $output;

	/**
	* @param OutputInterface $output
	* @param MetaCharacters  $meta
	* @param Escaper         $escaper
	*/
	public function __construct(OutputInterface $output, MetaCharacters $meta, Escaper $escaper)
	{
		$this->escaper = $escaper;
		$this->meta    = $meta;
		$this->output  = $output;
	}

	/**
	* Serialize given strings into a regular expression
	*
	* Single-character strings are grouped into one character class that is placed before
	* the other alternatives. The result is wrapped in a non-capturing group when
	* needsParentheses() says so, and the '?' quantifier is appended if the list contained
	* the empty string. An empty list serializes to an empty string.
	*
	* @param  array[] $strings
	* @return string
	*/
	public function serializeStrings(array $strings)
	{
		$info         = $this->analyzeStrings($strings);
		$alternations = array_map([$this, 'serializeString'], $info['strings']);
		if (!empty($info['chars']))
		{
			// Prepend the character class to the list of alternations
			array_unshift($alternations, $this->serializeCharacterClass($info['chars']));
		}

		$expr = implode('|', $alternations);
		if ($this->needsParentheses($info))
		{
			$expr = '(?:' . $expr . ')';
		}

		return $expr . $info['quantifier'];
	}

	/**
	* Analyze given strings to determine how to serialize them
	*
	* The returned array contains the following elements:
	*
	*  - (int)    alternationsCount Number of alternatives, the character class counting as one
	*  - (string) quantifier        Either '' or '?'
	*  - (array)  chars             List of values from single-char strings. Only present if
	*                               there is more than one single-char string
	*  - (array)  strings           List of the remaining strings
	*
	* @param  array[] $strings
	* @return array
	*/
	protected function analyzeStrings(array $strings)
	{
		$info = ['alternationsCount' => 0, 'quantifier' => ''];

		// isset() prevents an "undefined offset" error if the list is empty or has no key 0
		if (isset($strings[0]) && $strings[0] === [])
		{
			// A leading empty string makes the whole expression optional
			$info['quantifier'] = '?';
			unset($strings[0]);
		}

		$chars = $this->getChars($strings);
		if (count($chars) > 1)
		{
			// The character class counts as a single alternative
			++$info['alternationsCount'];
			$info['chars'] = array_values($chars);

			// $chars shares its keys with $strings, which removes the single-char strings
			$strings = array_diff_key($strings, $chars);
		}

		$info['strings']            = array_values($strings);
		$info['alternationsCount'] += count($strings);

		return $info;
	}

	/**
	* Return the portion of strings that are composed of a single character
	*
	* @param  array[] $strings
	* @return array            Key of the original string => value of its only element
	*/
	protected function getChars(array $strings)
	{
		$chars = [];
		foreach ($strings as $k => $string)
		{
			if ($this->isChar($string))
			{
				$chars[$k] = $string[0];
			}
		}

		return $chars;
	}

	/**
	* Get the list of ranges that cover all given values
	*
	* Consecutive values are folded into one range. A value with no neighbour produces a
	* range whose start and end are identical.
	*
	* @param  integer[] $values Ordered list of values, indexed from 0
	* @return array[]           List of ranges in the form [start, end], empty if there are
	*                           no values
	*/
	protected function getRanges(array $values)
	{
		$cnt = count($values);
		if ($cnt === 0)
		{
			// Nothing to cover; also avoids reading an undefined offset below
			return [];
		}

		$ranges = [];
		$start  = $end = $values[0];
		for ($i = 1; $i < $cnt; ++$i)
		{
			if ($values[$i] === $end + 1)
			{
				// Extend the current range
				++$end;
				continue;
			}

			// Close the current range and open a new one at this value
			$ranges[] = [$start, $end];
			$start    = $end = $values[$i];
		}
		$ranges[] = [$start, $end];

		return $ranges;
	}

	/**
	* Test whether given string represents a single character
	*
	* @param  array $string
	* @return bool
	*/
	protected function isChar(array $string)
	{
		return count($string) === 1 && is_int($string[0]) && MetaCharacters::isChar($string[0]);
	}

	/**
	* Test whether an expression is quantifiable based on the strings info
	*
	* The expression is quantifiable if it has no strings left (at most a character class)
	* or if its only string is a single quantifiable element.
	*
	* @param  array $info Return value of analyzeStrings()
	* @return bool
	*/
	protected function isQuantifiable(array $info)
	{
		$strings = $info['strings'];

		return empty($strings) || $this->isSingleQuantifiableString($strings);
	}

	/**
	* Test whether a list of strings contains only one single quantifiable string
	*
	* @param  array[] $strings List of strings, indexed from 0
	* @return bool
	*/
	protected function isSingleQuantifiableString(array $strings)
	{
		return count($strings) === 1 && count($strings[0]) === 1 && MetaCharacters::isQuantifiable($strings[0][0]);
	}

	/**
	* Test whether an expression needs parentheses based on the strings info
	*
	* Parentheses are needed if there is more than one alternative, or if a quantifier has
	* to be applied to an expression that is not quantifiable on its own.
	*
	* @param  array $info Return value of analyzeStrings()
	* @return bool
	*/
	protected function needsParentheses(array $info)
	{
		return ($info['alternationsCount'] > 1 || ($info['quantifier'] && !$this->isQuantifiable($info)));
	}

	/**
	* Serialize a given list of values into a character class
	*
	* @param  integer[] $values Ordered list of values
	* @return string
	*/
	protected function serializeCharacterClass(array $values)
	{
		$expr = '[';
		foreach ($this->getRanges($values) as list($start, $end))
		{
			$expr .= $this->serializeCharacterClassUnit($start);
			if ($end > $start)
			{
				// Two adjacent values are output side by side, the hyphen is only used for
				// ranges that span three values or more
				if ($end > $start + 1)
				{
					$expr .= '-';
				}
				$expr .= $this->serializeCharacterClassUnit($end);
			}
		}
		$expr .= ']';

		return $expr;
	}

	/**
	* Serialize a given value to be used in a character class
	*
	* @param  integer $value
	* @return string
	*/
	protected function serializeCharacterClassUnit($value)
	{
		return $this->serializeValue($value, 'escapeCharacterClass');
	}

	/**
	* Serialize an element from a string
	*
	* An array element is a nested list of strings and is serialized recursively, any other
	* element is serialized as a literal.
	*
	* @param  array|integer $element
	* @return string
	*/
	protected function serializeElement($element)
	{
		return (is_array($element)) ? $this->serializeStrings($element) : $this->serializeLiteral($element);
	}

	/**
	* Serialize a given value to be used as a literal
	*
	* @param  integer $value
	* @return string
	*/
	protected function serializeLiteral($value)
	{
		return $this->serializeValue($value, 'escapeLiteral');
	}

	/**
	* Serialize a given string into a regular expression
	*
	* @param  array  $string
	* @return string
	*/
	protected function serializeString(array $string)
	{
		return implode('', array_map([$this, 'serializeElement'], $string));
	}

	/**
	* Serialize a given value
	*
	* Negative values are replaced with the expression returned by the MetaCharacters
	* instance. Other values are converted by the output instance then escaped.
	*
	* @param  integer $value
	* @param  string  $escapeMethod Name of the Escaper method to call on the output
	* @return string
	*/
	protected function serializeValue($value, $escapeMethod)
	{
		return ($value < 0) ? $this->meta->getExpression($value) : $this->escaper->$escapeMethod($this->output->output($value));
	}
}