From 6249a966a97ffc231d743ea5c4ccb4e6797582f5 Mon Sep 17 00:00:00 2001 From: Michael Crumm Date: Tue, 25 Apr 2017 14:20:44 -0700 Subject: [PATCH] Initial release * Subclass Diff_SequenceMatcher for syntax simplicity * Implement Fuzz::ratio, Fuzz::partialRatio * Change namespace to FuzzyWuzzy * Add phpspec * Add Collection, Process, etc. * Add example bin * Add README * Add automated test config (#5) * Add Travis CI config * Simplify travis config * Drop spec coverage * Add build badge to README * Add docs to README (#6) --- .gitignore | 2 +- .travis.yml | 20 +++ README.md | 106 ++++++++++++++ bin/fuzz | 61 ++++++++ composer.json | 6 +- lib/Collection.php | 259 ++++++++++++++++++++++++++++++++ lib/Diff/SequenceMatcher.php | 24 +++ lib/Fuzz.php | 276 +++++++++++++++++++++++++++++++++++ lib/Process.php | 140 ++++++++++++++++++ lib/StringProcessor.php | 68 +++++++++ lib/Utils.php | 62 ++++++++ phpspec.yml | 5 + spec/FuzzSpec.php | 54 +++++++ 13 files changed, 1079 insertions(+), 4 deletions(-) create mode 100644 .travis.yml create mode 100644 README.md create mode 100755 bin/fuzz create mode 100644 lib/Collection.php create mode 100644 lib/Diff/SequenceMatcher.php create mode 100644 lib/Fuzz.php create mode 100644 lib/Process.php create mode 100644 lib/StringProcessor.php create mode 100644 lib/Utils.php create mode 100644 phpspec.yml create mode 100644 spec/FuzzSpec.php diff --git a/.gitignore b/.gitignore index 88e99d5..7579f74 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ vendor -composer.lock \ No newline at end of file +composer.lock diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..7d41f3c --- /dev/null +++ b/.travis.yml @@ -0,0 +1,20 @@ +language: php + +php: + - 5.4 + - 5.5 + - 5.6 + - 7 + - hhvm + +sudo: false + +cache: + directories: + - $HOME/.composer/cache + +install: + - if [ -n "$GITHUB_COMPOSER_AUTH" ]; then composer config github-oauth.github.com ${GITHUB_COMPOSER_AUTH}; fi; + - COMPOSER_ROOT_VERSION=`git describe 
--abbrev=0` composer install --no-interaction + +script: bin/phpspec run diff --git a/README.md b/README.md new file mode 100644 index 0000000..f18aeb2 --- /dev/null +++ b/README.md @@ -0,0 +1,106 @@ +# FuzzyWuzzy + +[![Build Status](https://travis-ci.org/wyndow/fuzzywuzzy.svg?branch=master)](https://travis-ci.org/wyndow/fuzzywuzzy) + +Fuzzy string matching for PHP, based on the [python library](https://github.com/seatgeek/fuzzywuzzy) of the same name. + +## Requirements + +* PHP 5.4 or higher + +## Installation + +Using [Composer](http://getcomposer.org/) + +``` +composer require wyndow/fuzzywuzzy +``` + +## Usage + +```php +use FuzzyWuzzy\Fuzz; +use FuzzyWuzzy\Process; + +$fuzz = new Fuzz(); +$process = new Process($fuzz); // $fuzz is optional here, and can be omitted. +``` + +### Simple Ratio + +```php +>>> $fuzz->ratio('this is a test', 'this is a test!') +=> 96 +``` + +### Partial Ratio + +```php +>>> $fuzz->partialRatio('this is a test', 'this is a test!') +=> 100 +``` + +### Token Sort Ratio + +```php +>>> $fuzz->ratio('fuzzy wuzzy was a bear', 'wuzzy fuzzy was a bear') +=> 90 +>>> $fuzz->tokenSortRatio('fuzzy wuzzy was a bear', 'wuzzy fuzzy was a bear') +=> 100 +``` + +### Token Set Ratio + +```php +>>> $fuzz->tokenSortRatio('fuzzy was a bear', 'fuzzy fuzzy was a bear') +=> 84 +>>> $fuzz->tokenSetRatio('fuzzy was a bear', 'fuzzy fuzzy was a bear') +=> 100 +``` + +### Process + +```php +>>> $choices = ['Atlanta Falcons', 'New York Jets', 'New York Giants', 'Dallas Cowboys'] +>>> $c = $process->extract('new york jets', $choices, null, null, 2) +=> FuzzyWuzzy\Collection {#205} +>>> $c->toArray() +=> [ + [ + "New York Jets", + 100, + ], + [ + "New York Giants", + 78, + ], + ] +>>> $process->extractOne('cowboys', $choices) +=> [ + "Dallas Cowboys", + 90, + ] +``` + +You can also pass additional parameters to `extractOne` to make it use a specific scorer. 
+ +```php +>>> $process->extractOne('cowbell', $choices, null, [$fuzz, 'ratio']) +=> [ + "Dallas Cowboys", + 38, + ] +>>> $process->extractOne('cowbell', $choices, null, [$fuzz, 'tokenSetRatio']) +=> [ + "Dallas Cowboys", + 57, + ] +``` + +## Caveats + +Unicode strings may produce unexpected results. We intend to correct this in future versions. + +## Further Reading + +* [Fuzzy String Matching in Python](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) diff --git a/bin/fuzz b/bin/fuzz new file mode 100755 index 0000000..e43dd94 --- /dev/null +++ b/bin/fuzz @@ -0,0 +1,61 @@ +#!/usr/bin/env php + ' . $s2 . PHP_EOL; +echo '------' . PHP_EOL; +echo 'Ratio: ' . $fuzz->ratio($s1, $s2) . PHP_EOL; +echo 'Partial Ratio: ' . $fuzz->partialRatio($s1, $s2) . PHP_EOL; +echo '------' . PHP_EOL; +echo 'Token Sort Ratio: ' . $fuzz->tokenSortRatio($s1, $s2) . PHP_EOL; +echo 'Token Sort Partial Ratio: ' . $fuzz->tokenSortPartialRatio($s1, $s2) . PHP_EOL; +echo '------' . PHP_EOL; +echo 'Token Set Ratio: ' . $fuzz->tokenSetRatio($s1, $s2) . PHP_EOL; +echo 'Token Set Partial Ratio: ' . $fuzz->tokenSetPartialRatio($s1, $s2) . PHP_EOL; + +echo '------' . PHP_EOL; +echo '------' . 
PHP_EOL; +echo PHP_EOL; \ No newline at end of file diff --git a/composer.json b/composer.json index d8c48a3..b09e545 100644 --- a/composer.json +++ b/composer.json @@ -1,5 +1,5 @@ { - "name": "gowili/wuzzy", + "name": "wyndow/fuzzywuzzy", "description": "Fuzzy string matching based on FuzzyWuzzy from Seatgeek", "type": "library", "license": "MIT", @@ -10,7 +10,7 @@ } ], "require": { - "ext-mbstring": "*", + "php": ">=5.4", "phpspec/php-diff": "^1.0" }, "require-dev": { @@ -18,7 +18,7 @@ }, "autoload": { "psr-4": { - "Gowili\\Wuzzy\\": "lib/" + "FuzzyWuzzy\\": "lib/" } }, "config": { diff --git a/lib/Collection.php b/lib/Collection.php new file mode 100644 index 0000000..380a5d5 --- /dev/null +++ b/lib/Collection.php @@ -0,0 +1,259 @@ + + */ +class Collection implements \ArrayAccess, \IteratorAggregate, \Countable +{ + /** @var array */ + private $elements; + + /** + * Collection Constructor. + * + * @param array $elements + */ + public function __construct(array $elements = [ ]) + { + $this->elements = $elements; + } + + /** + * Adds an element to this collection. + * + * @param mixed $element Elements can be of any type. + */ + public function add($element) + { + $this->elements[] = $element; + } + + /** + * Returns true if the given elements exists in this collection. + * + * @param mixed $element + * @return boolean + */ + public function contains($element) + { + return in_array($element, $this->elements, true); + } + + /** + * @return int + */ + public function count() + { + return count($this->elements); + } + + /** + * Returns the set difference of this Collection and another comparable. + * + * @param array|\Traversable $cmp Value to compare against. + * @return static + * @throws \InvalidArgumentException When $cmp is not a valid for + * difference. 
+ */ + public function difference($cmp) + { + return new static(array_diff($this->elements, static::coerce($cmp)->toArray())); + } + + /** + * @param \Closure $p Callback; elements for which it returns a truthy value are kept, with their original keys. + * @return static + */ + public function filter(\Closure $p) + { + return new static(array_filter($this->elements, $p)); + } + + /** + * @return \ArrayIterator + */ + public function getIterator() + { + return new \ArrayIterator($this->elements); + } + + /** + * Returns the set intersection of this Collection and another comparable. + * + * @param array|\Traversable $cmp Value to compare against. + * @return static + * @throws \InvalidArgumentException When $cmp is not valid for + * intersection. + */ + public function intersection($cmp) + { + return new static(array_intersect($this->elements, static::coerce($cmp)->toArray())); + } + + /** + * Checks whether or not this collection is empty. + * + * @return boolean + */ + public function isEmpty() + { + return empty($this->elements); + } + + /** + * Returns a string containing all elements of this collection with a + * glue string. + * + * @param string $glue + * @return string A string representation of all the array elements in the + * same order, with the glue string between each element. + */ + public function join($glue = ' ') + { + return implode((string) $glue, $this->elements); + } + + /** + * Returns a new collection, the values of which are the result of mapping + * the given function onto each element in this collection. + * + * @param \Closure $p Mapping function applied to each element. + * @return static + */ + public function map(\Closure $p) + { + return new static(array_map($p, $this->elements)); + } + + /** + * Apply a multisort to this collection of elements. 
+ * + * @param mixed $arg [optional] + * @param mixed $arg [optional] + * @param mixed $_ [optional] + * @return static + */ + public function multiSort() + { + if (func_num_args() < 1) { throw new \LogicException('multiSort requires at least one argument.'); } + + $elements = $this->elements; + $args = func_get_args(); + $args[] = &$elements; + + call_user_func_array('array_multisort', $args); + + return new static($elements); + } + + /** + * @param mixed $offset + * @return bool + */ + public function offsetExists($offset) + { + return isset ($this->elements[$offset]); + } + + /** + * @param mixed $offset + * @return mixed + */ + public function offsetGet($offset) + { + return isset ($this->elements[$offset]) ? $this->elements[$offset] : null; + } + + /** + * @param mixed $offset + * @param mixed $value + */ + public function offsetSet($offset, $value) + { + if (!is_null($offset)) { + $this->elements[$offset] = $value; + return; + } + + $this->elements[] = $value; + } + + /** + * @param mixed $offset + */ + public function offsetUnset($offset) + { + unset($this->elements[$offset]); + } + + /** + * Returns a new collection with the elements of this collection, reversed. + * + * @return static + */ + public function reverse() + { + return new static(array_reverse($this->elements)); + } + + /** + * @param $offset + * @param null $length + * @return static + */ + public function slice($offset, $length = null) + { + return new static(array_slice($this->elements, $offset, $length, true)); + } + + /** + * Returns a new collection with the elements of this collection, sorted. + * + * @return static + */ + public function sort() + { + $sorted = $this->elements; + + sort($sorted); + + return new static($sorted); + } + + /** + * Returns the elements in this collection as an array. + * + * @return array + */ + public function toArray() + { + return $this->elements; + } + + /** + * Coerce an array-like value into a Collection. 
+ * + * @param array|\Traversable $elements Value to coerce. + * @return Collection + * @throws \InvalidArgumentException When $elements is not an array or Traversable. + */ + public static function coerce($elements) + { + if ($elements instanceof Collection) { + return $elements; + } elseif ($elements instanceof \Traversable) { + $elements = iterator_to_array($elements); + } elseif (!is_array($elements)) { + throw new \InvalidArgumentException(sprintf( + 'coerce requires an array or \Traversable, %s given.', + is_object($elements) ? get_class($elements) : gettype($elements) + )); + } + + return new static($elements); + } +} diff --git a/lib/Diff/SequenceMatcher.php b/lib/Diff/SequenceMatcher.php new file mode 100644 index 0000000..e273c4f --- /dev/null +++ b/lib/Diff/SequenceMatcher.php @@ -0,0 +1,24 @@ + + */ +class SequenceMatcher extends \Diff_SequenceMatcher +{ + /** + * SequenceMatcher Constructor. + * + * @param array|string $a + * @param array|string $b + * @param \Closure|null $junkCallback + * @param array $options + */ + public function __construct($a, $b, $junkCallback = null, array $options = [ ]) + { + parent::__construct($a, $b, $junkCallback, $options); + } +} diff --git a/lib/Fuzz.php b/lib/Fuzz.php new file mode 100644 index 0000000..caa1f60 --- /dev/null +++ b/lib/Fuzz.php @@ -0,0 +1,276 @@ + + */ +class Fuzz +{ + /** + * Returns a basic ratio score between the two strings. + * + * @param string $s1 + * @param string $s2 + * + * @return integer + */ + public function ratio($s1, $s2) + { + if (null === $s1) { throw new \UnexpectedValueException('s1 is null'); } + if (null === $s2) { throw new \UnexpectedValueException('s2 is null'); } + + if (strlen($s1) === 0 || strlen($s2) === 0) { + return 0; + } + + $m = new Diff\SequenceMatcher($s1, $s2); + + return Utils::intr(100 * $m->Ratio()); + } + + /** + * Returns the best ratio score between the shorter string and an aligned substring of the longer string. + * @todo Skip duplicate indexes for a little more speed. 
+ * + * @param string $s1 + * @param string $s2 + * + * @return int Score between 0 and 100. + */ + public function partialRatio($s1, $s2) + { + if (null === $s1) { throw new \UnexpectedValueException('s1 is null'); } + if (null === $s2) { throw new \UnexpectedValueException('s2 is null'); } + + if (strlen($s1) === 0 || strlen($s2) === 0) { + return 0; + } + + if (strlen($s1) <= strlen($s2)) { + $shorter = $s1; + $longer = $s2; + } else { + $shorter = $s2; + $longer = $s1; + } + + $m = new Diff\SequenceMatcher($shorter, $longer); + + $blocks = $m->getMatchingBlocks(); + $scores = [ ]; + + foreach ($blocks as $block) { + $longStart = $block[1] - $block[0] > 0 ? $block[1] - $block[0] : 0; + $longEnd = $longStart + strlen($shorter); + $longSubstr = substr($longer, $longStart, $longEnd - $longStart); # substr() takes a length, not an end offset + + $m2 = new Diff\SequenceMatcher($shorter, $longSubstr); + $r = $m2->Ratio(); + + if ($r > .995) { + return 100; + } else { + $scores[] = $r; + } + } + + return Utils::intr(100 * max($scores)); + } + + /** + * Returns a measure of the sequences' similarity between 0 and 100, + * using different algorithms. + * + * @param string $s1 + * @param string $s2 + * @param boolean $forceAscii + * @return integer + */ + public function weightedRatio($s1, $s2, $forceAscii = true) + { + $p1 = Utils::fullProcess($s1, $forceAscii); + $p2 = Utils::fullProcess($s2, $forceAscii); + + if (! Utils::validateString($p1)) { + return 0; + } + if (! Utils::validateString($p2)) { + return 0; + } + + # should we look at partials? 
+ $try_partial = true; + $unbase_scale = .95; + $partial_scale = .90; + + $base = $this->ratio($p1, $p2); + $len_ratio = (float)((max(strlen($p1), strlen($p2))) / min(strlen($p1), strlen($p2))); + + # if strings are similar length, don't use partials + if ($len_ratio < 1.5) { + $try_partial = false; + } + + # if one string is much much shorter than the other + if ($len_ratio > 8) { + $partial_scale = .6; + } + + if ($try_partial) { + $partial = $this->partialRatio($p1, $p2) * $partial_scale; + $ptsor = $this->tokenSortPartialRatio($p1, $p2, $forceAscii) * $unbase_scale * $partial_scale; + $ptser = $this->tokenSetPartialRatio($p1, $p2, $forceAscii) * $unbase_scale * $partial_scale; + + return (int) max($base, $partial, $ptsor, $ptser); + } + + $tsor = $this->tokenSortRatio($p1, $p2, $forceAscii) * $unbase_scale; + $tser = $this->tokenSetRatio($p1, $p2, $forceAscii) * $unbase_scale; + + return (int) max($base, $tsor, $tser); + } + + /** + * @param $s1 + * @param $s2 + * @param bool $forceAscii + * @return int|mixed + */ + public function tokenSetPartialRatio($s1, $s2, $forceAscii = true) + { + return $this->tokenSet($s1, $s2, true, $forceAscii); + } + + /** + * @param $s1 + * @param $s2 + * @param bool $forceAscii + * @return int|mixed + */ + public function tokenSetRatio($s1, $s2, $forceAscii = true) + { + return $this->tokenSet($s1, $s2, false, $forceAscii); + } + + /** + * @param $s1 + * @param $s2 + * @param bool $forceAscii + * @return int + */ + public function tokenSortPartialRatio($s1, $s2, $forceAscii = true) + { + return $this->tokenSort($s1, $s2, true, $forceAscii); + } + + /** + * @param $s1 + * @param $s2 + * @param bool $forceAscii + * @return int + */ + public function tokenSortRatio($s1, $s2, $forceAscii = true) + { + return $this->tokenSort($s1, $s2, false, $forceAscii); + } + + /** + * Find all alphanumeric tokens in each string... 
+ * + * - treat them as a set + * - construct two strings of the form: + * - take ratios of those two strings + * - controls for unordered partial matches + * + * @param $s1 + * @param $s2 + * @param bool $partial + * @param bool $forceAscii + * @return int|mixed + */ + private function tokenSet($s1, $s2, $partial = true, $forceAscii = true) + { + if (null === $s1) { throw new \UnexpectedValueException('s1 is null'); } + if (null === $s2) { throw new \UnexpectedValueException('s2 is null'); } + + $p1 = Utils::fullProcess($s1, $forceAscii); + $p2 = Utils::fullProcess($s2, $forceAscii); + + if (!Utils::validateString($p1)) { + return 0; + } + if (!Utils::validateString($p2)) { + return 0; + } + + # pull tokens + $tokens1 = StringProcessor::split(Utils::fullProcess($p1)); + $tokens2 = StringProcessor::split(Utils::fullProcess($p2)); + + $intersection = $tokens1->intersection($tokens2); + $diff1to2 = $tokens1->difference($tokens2); + $diff2to1 = $tokens2->difference($tokens1); + + $sorted_sect = $intersection->sort()->join(); + $sorted_1to2 = $diff1to2->sort()->join(); + $sorted_2to1 = $diff2to1->sort()->join(); + + $combined_1to2 = $sorted_sect . ' ' . $sorted_1to2; + $combined_2to1 = $sorted_sect . ' ' . $sorted_2to1; + + # strip + $sorted_sect = trim($sorted_sect); + $combined_1to2 = trim($combined_1to2); + $combined_2to1 = trim($combined_2to1); + + $ratioFunc = (boolean) $partial ? 
'partialRatio' : 'ratio'; + + $pairwise = [ + call_user_func([$this, $ratioFunc], $sorted_sect, $combined_1to2), + call_user_func([$this, $ratioFunc], $sorted_sect, $combined_2to1), + call_user_func([$this, $ratioFunc], $combined_1to2, $combined_2to1) + ]; + + return max($pairwise); + } + + /** + * @param $s1 + * @param $s2 + * @param bool $partial + * @param bool $forceAscii + * @return int + */ + private function tokenSort($s1, $s2, $partial = true, $forceAscii = true) + { + if (null === $s1) { throw new \UnexpectedValueException('s1 is null'); } + if (null === $s2) { throw new \UnexpectedValueException('s2 is null'); } + + $sorted1 = $this->processAndSort($s1, $forceAscii); + $sorted2 = $this->processAndSort($s2, $forceAscii); + + if ((boolean) $partial) { + return $this->partialRatio($sorted1, $sorted2); + } + + return $this->ratio($sorted1, $sorted2); + } + + /** + * @param string $str + * @param boolean $forceAscii + * @return string + */ + private function processAndSort($str, $forceAscii = true) + { + $tokens = StringProcessor::split(Utils::fullProcess($str, $forceAscii)); + + return StringProcessor::strip($tokens->sort()->join()); + } +} diff --git a/lib/Process.php b/lib/Process.php new file mode 100644 index 0000000..a3f563f --- /dev/null +++ b/lib/Process.php @@ -0,0 +1,140 @@ +fuzz = $fuzz ?: new Fuzz(); + + $this->getString = function ($x) { return $x[0]; }; + $this->getScore = function ($x) { return $x[1]; }; + } + + /** + * Returns a Collection of matches for query, from a list of choices. + * + * @param string $query Query string. + * @param array|\Traversable $choices List of choices to match against. + * @param \Closure $processor Processing function (Default: {@link Utils::fullProcess}. + * @param \Closure $scorer Scoring function (Default: {@link Fuzz::weightedRatio}). + * @param integer $limit Limits the number of returned matches (Default: 5). 
+ * @return Collection + */ + public function extract($query, $choices = [ ], $processor = null, $scorer = null, $limit = 5) + { + $choices = Collection::coerce($choices); + + if ($choices->isEmpty()) { return $choices; } + + $processor = $processor ?: function ($str) { return Utils::fullProcess($str); }; + $scorer = $scorer ?: function ($s1, $s2) { return $this->fuzz->weightedRatio($s1, $s2); }; + + if (!is_callable($processor)) { throw new \InvalidArgumentException('processor is not callable.'); } + if (!is_callable($scorer)) { throw new \InvalidArgumentException('scorer is not callable.'); } + + $scored = new Collection(); + + foreach ($choices as $choice) { + $processed = $processor($choice); + $score = $scorer($query, $processed); + + $scored->add([$choice, $score]); + } + + return $scored + ->multiSort($scored->map($this->getScore)->toArray(), SORT_DESC, SORT_NUMERIC) + ->slice(0, $limit) + ; + } + + /** + * Returns a Collection of best matches to a collection of choices. + * + * @param string $query Query string. + * @param array|\Traversable $choices List of choices to match against. + * @param \Closure $processor Processing function (Default: {@link Utils::fullProcess}. + * @param \Closure $scorer Scoring function (Default: {@link Fuzz::weightedRatio}). + * @param integer $cutoff Score cutoff for returned matches. + * @param integer $limit Limits the number of returned matches (Default: 5). + * @return Collection + */ + public function extractBests($query, $choices = [ ], $processor = null, $scorer = null, $cutoff = 0, $limit = 5) + { + $bestList = $this->extract($query, $choices, $processor, $scorer, $limit); + return $bestList->filter(function ($x) use ($cutoff) { return $x[1] >= $cutoff; }); + } + + /** + * Returns the best match for query from a collection of choices. + * + * @param string $query Query string. + * @param array|\Traversable $choices List of choices to match against. 
+ * @param \Closure $processor Processing function (Default: {@link Utils::fullProcess}). + * @param \Closure $scorer Scoring function (Default: {@link Fuzz::weightedRatio}). + * @param integer $cutoff Score cutoff for returned matches. + * @return array|null The best [choice, score] pair, or null when there are no choices or the best score is below $cutoff. + */ + public function extractOne($query, $choices = [ ], $processor = null, $scorer = null, $cutoff = 0) + { + $bestList = $this->extract($query, $choices, $processor, $scorer, 1); + return !$bestList->isEmpty() && $bestList[0][1] >= $cutoff ? $bestList[0] : null; # inclusive cutoff, consistent with extractBests + } + + /** + * Returns a Collection that has been filtered for duplicates using fuzzy matching. + * + * @param array|\Traversable $containsDupes List containing duplicate strings. + * @param integer $threshold Match threshold. + * @param \Closure $scorer Scoring function (Default: {@link Fuzz::tokenSetRatio}). + * @return Collection The input collection when no duplicates were found, otherwise a new Collection of the unique representatives. + */ + public function dedupe($containsDupes = [ ], $threshold = 70, $scorer = null) + { + $containsDupes = Collection::coerce($containsDupes); + $scorer = $scorer ?: function ($s1, $s2) { return $this->fuzz->tokenSetRatio($s1, $s2); }; + $extractor = [ ]; + + # iterate over containsDupes + foreach ($containsDupes as $item) { + # return all duplicate matches found + $matches = $this->extract($item, $containsDupes, null, $scorer, null); + # filter matches based on threshold + $filtered = $matches->filter(function ($x) use ($threshold) { return $x[1] > $threshold; }); + # if there is only 1 item in *filtered*, no duplicates were found, so append to *extracted* + if ($filtered->count() === 1) { + $extractor[] = $filtered[0][0]; + } else { + # sort length DESC, score DESC, alpha ASC + $filtered = $filtered->multiSort( + $filtered->map(function ($x) { return strlen($x[0]); })->toArray(), SORT_DESC, SORT_NUMERIC, + $filtered->map($this->getScore)->toArray(), SORT_DESC, SORT_NUMERIC, + $filtered->map($this->getString)->toArray(), SORT_ASC, SORT_STRING | SORT_FLAG_CASE + ); + + $extractor[] = $filtered[0][0]; + } + } + + # "uniquify" *extractor* list + $keys = [ ]; + 
foreach ($extractor as $e) { + $keys[$e] = $e; # keyed by value to drop repeats; the value keeps the original string + } + + return count($keys) === count($containsDupes) ? $containsDupes : new Collection(array_values($keys)); + } +} diff --git a/lib/StringProcessor.php b/lib/StringProcessor.php new file mode 100644 index 0000000..ccf4fc9 --- /dev/null +++ b/lib/StringProcessor.php @@ -0,0 +1,68 @@ + + */ +class StringProcessor +{ + /** + * @param $str + * @return mixed + */ + public static function nonAlnumToWhitespace($str) + { + return preg_replace('/(?i)\W/u', ' ', $str); + } + + /** + * @param $str + * @return string + */ + public static function upcase($str) + { + return strtoupper($str); + } + + /** + * @param $str + * @return string + */ + public static function downcase($str) + { + return strtolower($str); + } + + /** + * @param $pieces + * @param string $glue + * @return string + */ + public static function join($pieces, $glue = ' ') + { + return Collection::coerce($pieces)->join($glue); + } + + /** + * @param $str + * @param string $delimiter + * @return Collection Non-empty tokens, in their original order. + */ + public static function split($str, $delimiter = ' ') + { + return new Collection(array_values(array_filter(explode($delimiter, $str), 'strlen'))); # drop empty tokens produced by consecutive delimiters + } + + /** + * @param $str + * @param string $chars + * @return string + */ + public static function strip($str, $chars = " \t\n\r\0\x0B") + { + return trim($str, $chars); + } +} diff --git a/lib/Utils.php b/lib/Utils.php new file mode 100644 index 0000000..a1e1798 --- /dev/null +++ b/lib/Utils.php @@ -0,0 +1,62 @@ + + */ +class Utils +{ + /** + * Rounds to 10 decimal places, then truncates to an integer (e.g. 96.55 becomes 96, not 97). + * + * @param float $num Numeric value to convert. + * @return int + */ + public static function intr($num) + { + return (int) round($num, 10, PHP_ROUND_HALF_DOWN); + } + + /** + * Returns a string after processing. + * + * - Replaces any non-alphanumeric characters with whitespace. + * - Converts the string to lowercase. + * - Trims leading/trailing whitespace from the string. + * + * @param string $str String to process. 
+ * @param boolean $forceAscii Reserved: ASCII conversion is not implemented yet, so this flag currently has no effect. + * @return string + */ + public static function fullProcess($str, $forceAscii = true) + { + if (empty ($str) && !is_numeric($str)) { return ''; } # '0' and 0 are valid input, not "empty" + + # TODO: Force ascii, if true + + # Keep only Letters and Numbers (see Unicode docs). + $stringOut = StringProcessor::nonAlnumToWhitespace($str); + # Force into lowercase. + $stringOut = StringProcessor::downcase($stringOut); + # Remove leading and trailing whitespaces. + $stringOut = StringProcessor::strip($stringOut); + + return $stringOut; + } + + /** + * Validates that a string is a string, and that it has a positive length. + * + * @param string $str String to validate. + * @return boolean + */ + public static function validateString($str) + { + if (!is_string($str)) { return false; } + + return strlen($str) > 0; + } +} diff --git a/phpspec.yml b/phpspec.yml new file mode 100644 index 0000000..cc8bcac --- /dev/null +++ b/phpspec.yml @@ -0,0 +1,5 @@ +suites: + fuzzywuzzy_suite: + namespace: FuzzyWuzzy + psr4_prefix: FuzzyWuzzy + src_path: lib diff --git a/spec/FuzzSpec.php b/spec/FuzzSpec.php new file mode 100644 index 0000000..b87e3ce --- /dev/null +++ b/spec/FuzzSpec.php @@ -0,0 +1,54 @@ +shouldHaveType('FuzzyWuzzy\Fuzz'); + } + + function it_returns_a_perfect_match_for_ratio() + { + $this->ratio(self::$s1, self::$s1a)->shouldBe(100); + } + + function it_is_case_sensitive_for_ratio() + { + $this->ratio(self::$s1, self::$s2)->shouldNotBe(100); + } + + function it_returns_a_perfect_match_for_partial_ratio() + { + $this->partialRatio(self::$s1, self::$s3)->shouldBe(100); + } +}