From 89d3ce0379b70885217e6c1eee59e378174602c5 Mon Sep 17 00:00:00 2001 From: Marien Fressinaud Date: Fri, 4 Oct 2024 15:16:31 +0200 Subject: [PATCH] new: Allow to search for tags and URL parts --- src/controllers/Links.php | 57 +++-- src/search_engine/LinksSearcher.php | 190 ++++++++++++++ src/search_engine/Query.php | 48 ++++ src/search_engine/Query/Condition.php | 70 ++++++ src/search_engine/Query/Parser.php | 141 +++++++++++ src/search_engine/Query/TokenType.php | 16 ++ src/search_engine/Query/Tokenizer.php | 149 +++++++++++ tests/search_engine/LinksSearcherTest.php | 259 ++++++++++++++++++++ tests/search_engine/Query/ParserTest.php | 79 ++++++ tests/search_engine/Query/TokenizerTest.php | 148 +++++++++++ 10 files changed, 1140 insertions(+), 17 deletions(-) create mode 100644 src/search_engine/LinksSearcher.php create mode 100644 src/search_engine/Query.php create mode 100644 src/search_engine/Query/Condition.php create mode 100644 src/search_engine/Query/Parser.php create mode 100644 src/search_engine/Query/TokenType.php create mode 100644 src/search_engine/Query/Tokenizer.php create mode 100644 tests/search_engine/LinksSearcherTest.php create mode 100644 tests/search_engine/Query/ParserTest.php create mode 100644 tests/search_engine/Query/TokenizerTest.php diff --git a/src/controllers/Links.php b/src/controllers/Links.php index 3f3bd5e4..24912086 100644 --- a/src/controllers/Links.php +++ b/src/controllers/Links.php @@ -6,6 +6,7 @@ use Minz\Response; use App\auth; use App\models; +use App\search_engine; use App\services; use App\utils; @@ -36,19 +37,30 @@ public function index(Request $request): Response return Response::redirect('login', ['redirect_to' => \Minz\Url::for('links')]); } + $beta_enabled = models\FeatureFlag::isEnabled('beta', $user->id); + $query = $request->param('q'); $pagination_page = $request->paramInteger('page', 1); if ($query) { - $number_links = models\Link::countByQueryAndUserId( - $query, - $user->id, - [ - 'exclude_never_only' => true, - ] - ); + if ($beta_enabled) { + $search_query = search_engine\Query::fromString($query); + + $number_links = search_engine\LinksSearcher::countLinks($user, $search_query); + } else { + $number_links = models\Link::countByQueryAndUserId( + $query, + $user->id, + [ + 'exclude_never_only' => true, + ] + ); + } + $number_per_page = 30; + $pagination = new utils\Pagination($number_links, $number_per_page, $pagination_page); + if ($pagination_page !== $pagination->currentPage()) { return Response::redirect('links', [ 'q' => $query, @@ -56,16 +68,27 @@ public function index(Request $request): Response ]); } - $links = models\Link::listComputedByQueryAndUserId( - $query, - $user->id, - ['published_at', 'number_comments'], - [ - 'exclude_never_only' => true, - 'offset' => $pagination->currentOffset(), - 'limit' => $pagination->numberPerPage(), - ] - ); + if ($beta_enabled) { + $links = search_engine\LinksSearcher::getLinks( + $user, + $search_query, + pagination: [ + 'offset' => $pagination->currentOffset(), + 'limit' => $pagination->numberPerPage(), + ] + ); + } else { + $links = models\Link::listComputedByQueryAndUserId( + $query, + $user->id, + ['published_at', 'number_comments'], + [ + 'exclude_never_only' => true, + 'offset' => $pagination->currentOffset(), + 'limit' => $pagination->numberPerPage(), + ] + ); + } return Response::ok('links/search.phtml', [ 'links' => $links, diff --git a/src/search_engine/LinksSearcher.php b/src/search_engine/LinksSearcher.php new file mode 100644 index 00000000..739f7242 --- /dev/null +++ b/src/search_engine/LinksSearcher.php @@ -0,0 +1,190 @@ + + * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL + */ +class LinksSearcher +{ + /** + * @param array{ + * 'offset'?: int, + * 'limit'?: int|'ALL', + * } $pagination + * + * @return models\Link[] + */ + public static function getLinks( + models\User $user, + Query $query, + array $pagination = [], + ): array { + $default_pagination = [ + 'offset' => 0, + 'limit' => 'ALL', + ]; + + $pagination = array_merge($default_pagination, $pagination); + + $parameters = [ + ':query' => '', + ':user_id' => $user->id, + ':offset' => $pagination['offset'], + ]; + + $limit_statement = ''; + if ($pagination['limit'] !== 'ALL') { + $limit_statement = 'LIMIT :limit'; + $parameters[':limit'] = $pagination['limit']; + } + + list($query_statement, $query_parameters) = self::buildWhereQuery($query); + $parameters = array_merge($parameters, $query_parameters); + + $sql = <<prepare($sql); + $statement->execute($parameters); + + return models\Link::fromDatabaseRows($statement->fetchAll()); + } + + public static function countLinks(models\User $user, Query $query): int + { + $parameters = [ + ':query' => '', + ':user_id' => $user->id, + ]; + + list($query_statement, $query_parameters) = self::buildWhereQuery($query); + $parameters = array_merge($parameters, $query_parameters); + + $sql = <<prepare($sql); + $statement->execute($parameters); + + return intval($statement->fetchColumn()); + } + + /** + * @return array{string, array} + */ + private static function buildWhereQuery(Query $query): array + { + $where_sql = ''; + $parameters = []; + + $textConditions = $query->getConditions('text'); + $textValues = array_map(function (Query\Condition $condition): string { + return $condition->getValue(); + }, $textConditions); + $textQuery = implode(' ', $textValues); + + if ($textQuery !== '') { + $where_sql .= ' AND search_index @@ query'; + $parameters['query'] = $textQuery; + } + + $qualifierConditions = $query->getConditions('qualifier'); + + foreach ($qualifierConditions as $condition) { + $qualifier = $condition->getQualifier(); + if ($qualifier === 'url') { + $value = $condition->getValue(); + + $parameter_name = ':url' . (count($parameters) + 1); + + $where_sql .= " AND l.url LIKE {$parameter_name}"; + + $parameters[$parameter_name] = "%{$value}%"; + } + } + + $tagConditions = $query->getConditions('tag'); + + $tags_parameters = []; + $not_tags_parameters = []; + + foreach ($tagConditions as $condition) { + $parameter_name = ':tag' . (count($parameters) + 1); + + $value = $condition->getValue(); + + $parameters[$parameter_name] = $value; + + if ($condition->not()) { + $not_tags_parameters[] = $parameter_name; + } else { + $tags_parameters[] = $parameter_name; + } + } + + if ($tags_parameters) { + $tags_statement = implode(',', $tags_parameters); + $where_sql .= " AND l.tags::jsonb ??& array[{$tags_statement}]"; + } + + if ($not_tags_parameters) { + $not_tags_statement = implode(',', $not_tags_parameters); + $where_sql .= " AND NOT (l.tags::jsonb ??| array[{$not_tags_statement}])"; + } + + return [$where_sql, $parameters]; + } +} diff --git a/src/search_engine/Query.php b/src/search_engine/Query.php new file mode 100644 index 00000000..f37b48b0 --- /dev/null +++ b/src/search_engine/Query.php @@ -0,0 +1,48 @@ + + * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL + */ +class Query +{ + /** @var Query\Condition[] */ + private array $conditions = []; + + public function addCondition(Query\Condition $condition): void + { + $this->conditions[] = $condition; + } + + /** + * @param 'text'|'qualifier'|'tag'|'any' $type + * + * @return Query\Condition[] + */ + public function getConditions(string $type = 'any'): array + { + if ($type === 'any') { + return $this->conditions; + } + + return array_filter($this->conditions, function ($condition) use ($type) { + if ($type === 'text') { + return $condition->isTextCondition(); + } elseif ($type === 'qualifier') { + return $condition->isQualifierCondition(); + } elseif ($type === 'tag') { + return $condition->isTagCondition(); + } + }); + } + + public static function fromString(string $queryString): Query + { + $tokenizer = new Query\Tokenizer(); + $parser = new Query\Parser(); + $tokens = $tokenizer->tokenize($queryString); + return $parser->parse($tokens); + } +} diff --git a/src/search_engine/Query/Condition.php b/src/search_engine/Query/Condition.php new file mode 100644 index 00000000..0e6d265a --- /dev/null +++ b/src/search_engine/Query/Condition.php @@ -0,0 +1,70 @@ + + * @author Probesys + * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL + */ +class Condition +{ + public const TYPES = ['text', 'qualifier', 'tag']; + + /** + * @param value-of $type + */ + private function __construct( + /** @var value-of */ + private string $type, + private string $value, + private ?string $qualifier, + private bool $not, + ) { + } + + public static function textCondition(string $value): self + { + return new self('text', $value, null, false); + } + + public static function qualifierCondition(string $qualifier, string $value): self + { + return new self('qualifier', $value, $qualifier, false); + } + + public static function tagCondition(string $value, bool $not): self + { + return new self('tag', $value, null, $not); + } + + public function isTextCondition(): bool + { + return $this->type === 'text'; + } + + public function isQualifierCondition(): bool + { + return $this->type === 'qualifier'; + } + + public function isTagCondition(): bool + { + return $this->type === 'tag'; + } + + public function getValue(): string + { + return $this->value; + } + + public function getQualifier(): string + { + return $this->qualifier ?? ''; + } + + public function not(): bool + { + return $this->not; + } +} diff --git a/src/search_engine/Query/Parser.php b/src/search_engine/Query/Parser.php new file mode 100644 index 00000000..7a8d88fa --- /dev/null +++ b/src/search_engine/Query/Parser.php @@ -0,0 +1,141 @@ + QUERY + * + * QUERY -> CRITERIA QUERY + * QUERY -> CRITERIA end_of_query + * + * CRITERIA -> text + * CRITERIA -> qualifier_url + * CRITERIA -> qualifier_url text + * CRITERIA -> tag + * CRITERIA -> not tag + * + * Each rule of the grammar is implemented by a method in the Parser class to + * make the code as clear as possible. + * + * @phpstan-import-type Token from Tokenizer + * + * @author Marien Fressinaud + * @author Probesys + * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL + */ +class Parser +{ + /** @var Token[] */ + private array $tokens; + + /** @param Token[] $tokens */ + public function parse(array $tokens): Query + { + if (empty($tokens)) { + throw new \LogicException('The parser cannot be called with an empty list of tokens.'); + } + + $this->tokens = $tokens; + + $query = new Query(); + $this->ruleQuery($query); + return $query; + } + + private function ruleQuery(Query $query): void + { + $this->ruleCriteria($query); + + $currentToken = $this->readToken(); + + if ($currentToken['type'] === TokenType::EndOfQuery) { + $this->consumeToken(TokenType::EndOfQuery); + } else { + $this->ruleQuery($query); + } + } + + private function ruleCriteria(Query $query): void + { + $currentToken = $this->readToken(); + + if ($currentToken['type'] === TokenType::Text) { + $this->consumeToken(TokenType::Text); + + $value = $currentToken['value'] ?? ''; + + $condition = Query\Condition::textCondition($value); + } elseif ($currentToken['type'] === TokenType::Qualifier) { + $this->consumeToken(TokenType::Qualifier); + + $qualifier = $currentToken['value'] ?? ''; + + $currentToken = $this->readToken(); + + if ($currentToken['type'] === TokenType::Text) { + $this->consumeToken(TokenType::Text); + + $value = $currentToken['value'] ?? ''; + + $condition = Query\Condition::qualifierCondition($qualifier, $value); + } else { + $condition = Query\Condition::textCondition($qualifier); + } + } elseif ($currentToken['type'] === TokenType::Tag) { + $this->consumeToken(TokenType::Tag); + + $tag = $currentToken['value'] ?? ''; + + $condition = Query\Condition::tagCondition($tag, not: false); + } elseif ($currentToken['type'] === TokenType::Not) { + $this->consumeToken(TokenType::Not); + + $currentToken = $this->readToken(); + + $this->consumeToken(TokenType::Tag); + + $tag = $currentToken['value'] ?? ''; + + $condition = Query\Condition::tagCondition($tag, not: true); + } else { + $type = $currentToken['type']->value; + $pos = $currentToken['position']; + + throw new \LogicException("Unexpected token {$type} at position {$pos}"); + } + + $query->addCondition($condition); + } + + /** + * @return Token + */ + private function readToken(): array + { + $token = reset($this->tokens); + + if ($token === false) { + throw new \LogicException('The parser expected a token to be present but the list is empty.'); + } + + return $token; + } + + private function consumeToken(TokenType $expectedTokenType): void + { + $currentToken = $this->readToken(); + + if ($currentToken['type'] !== $expectedTokenType) { + $type = $currentToken['type']->value; + $pos = $currentToken['position']; + + throw new \LogicException("Unexpected token {$type} at position {$pos}"); + } + + array_shift($this->tokens); + } +} diff --git a/src/search_engine/Query/TokenType.php b/src/search_engine/Query/TokenType.php new file mode 100644 index 00000000..5d90717d --- /dev/null +++ b/src/search_engine/Query/TokenType.php @@ -0,0 +1,16 @@ + + * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL + */ +enum TokenType: string +{ + case EndOfQuery = 'end of query'; + case Not = 'not'; + case Qualifier = 'qualifier'; + case Tag = 'tag'; + case Text = 'text'; +} diff --git a/src/search_engine/Query/Tokenizer.php b/src/search_engine/Query/Tokenizer.php new file mode 100644 index 00000000..ee4ec1e3 --- /dev/null +++ b/src/search_engine/Query/Tokenizer.php @@ -0,0 +1,149 @@ + + * @author Probesys + * @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL + */ +class Tokenizer +{ + public const VALID_QUALIFIERS = ['url']; + + /** + * @return Token[] + */ + public function tokenize(string $query): array + { + $tokens = []; + + // Add a final whitespace to simplify the foreach loop + $query = $query . ' '; + + $charPosition = 0; + $currentText = ''; + $quoteOpenPosition = 0; + $escaped = false; + + foreach (mb_str_split($query) as $char) { + $charPosition += 1; + $isWhitespace = \ctype_space($char); + $inQuotes = $quoteOpenPosition > 0; + + if ($escaped) { + // The current char is escaped, so we add it to the current + // text, even if it's a special char (e.g. whitespace, comma, + // quote, etc.) + $currentText = $currentText . $char; + $escaped = false; + } elseif ($char === '\\') { + // The current char is a (not escaped) backslash, so we set + // the variable $escaped to true to escape the next char. + $escaped = true; + } elseif ($char === '"') { + // The current char is a quote, so we change the quoteOpenPosition + // depending on the fact we're already in quotes or not. + $quoteOpenPosition = $inQuotes ? 0 : $charPosition; + } elseif ($inQuotes) { + // The current char is in quotes, so we add it to the current + // text, even if it's a special char. + $currentText = $currentText . $char; + } elseif ( + $char === ':' && + $currentText !== '' && + in_array($currentText, self::VALID_QUALIFIERS) + ) { + $tokens[] = [ + 'type' => TokenType::Qualifier, + 'value' => $currentText, + 'position' => $charPosition - mb_strlen($currentText), + ]; + + $currentText = ''; + } elseif (!$isWhitespace) { + // We are at the end of the possibilities. We just check that + // the current char is not a whitespace, and we add it to the + // currentText. + $currentText = $currentText . $char; + } elseif ($isWhitespace && $currentText !== '') { + // If the current char is a whitespace, the token is complete. + // We add it to the list of tokens. + $tokens = array_merge( + $tokens, + $this->textToTokens($currentText, $charPosition) + ); + $currentText = ''; + } + } + + if ($quoteOpenPosition > 0 && $currentText !== '') { + $tokens = array_merge( + $tokens, + $this->textToTokens($currentText, $charPosition) + ); + $currentText = ''; + } + + $tokens[] = [ + 'type' => TokenType::EndOfQuery, + 'position' => $charPosition + 1, + ]; + + return $tokens; + } + + /** + * @param non-empty-string $text + * + * @return Token[] + */ + private function textToTokens(string $text, int $positionEnd): array + { + $position = $positionEnd - mb_strlen($text); + + $tag_regex = '/^-?#[\pL\pN_]+/u'; + if (preg_match($tag_regex, $text) === 1) { + $tokens = []; + + if ($text[0] === '-') { + // If the qualifier starts with a "-", we transform this char + // into a "Not" token. + $tokens[] = [ + 'type' => TokenType::Not, + 'position' => $position, + ]; + + $text = mb_substr($text, 1); + $position += 1; + } + + // Remove the "#" char + $text = mb_substr($text, 1); + + assert($text !== ''); + + $tokens[] = [ + 'type' => TokenType::Tag, + 'value' => $text, + 'position' => $position, + ]; + + return $tokens; + } else { + return [ + [ + 'type' => TokenType::Text, + 'value' => $text, + 'position' => $position, + ], + ]; + } + } +} diff --git a/tests/search_engine/LinksSearcherTest.php b/tests/search_engine/LinksSearcherTest.php new file mode 100644 index 00000000..5e4a0763 --- /dev/null +++ b/tests/search_engine/LinksSearcherTest.php @@ -0,0 +1,259 @@ +fake('sentence', 10, false); + $user = UserFactory::create(); + $link = LinkFactory::create([ + 'user_id' => $user->id, + 'title' => $title, + ]); + $query = Query::fromString($title); + + $links = LinksSearcher::getLinks($user, $query); + + $this->assertSame(1, count($links)); + $this->assertSame($link->id, $links[0]->id); + } + + public function testGetLinksSearchesByUrl(): void + { + /** @var string */ + $url = $this->fake('url'); + $user = UserFactory::create(); + $link = LinkFactory::create([ + 'user_id' => $user->id, + 'url' => $url, + ]); + $query = Query::fromString("url: {$url}"); + + $links = LinksSearcher::getLinks($user, $query); + + $this->assertSame(1, count($links)); + $this->assertSame($link->id, $links[0]->id); + } + + public function testGetLinksSearchesByTag(): void + { + /** @var string[] */ + $tags = $this->fake('words', 5); + /** @var string */ + $tag = $this->fake('randomElement', $tags); + $user = UserFactory::create(); + $link = LinkFactory::create([ + 'user_id' => $user->id, + 'tags' => $tags, + ]); + $query = Query::fromString("#{$tag}"); + + $links = LinksSearcher::getLinks($user, $query); + + $this->assertSame(1, count($links)); + $this->assertSame($link->id, $links[0]->id); + } + + public function testGetLinksCanExcludeByTag(): void + { + /** @var string[] */ + $tags = $this->fake('words', 5); + /** @var string */ + $tag = $this->fake('randomElement', $tags); + $user = UserFactory::create(); + $link = LinkFactory::create([ + 'user_id' => $user->id, + 'tags' => $tags, + ]); + $query = Query::fromString("-#{$tag}"); + + $links = LinksSearcher::getLinks($user, $query); + + $this->assertSame(0, count($links)); + } + + public function testGetLinksCanExcludeByMultipleTags(): void + { + $tag_to_keep = 'foo'; + $tag_to_exclude1 = 'bar'; + $tag_to_exclude2 = 'baz'; + $user = UserFactory::create(); + $link1 = LinkFactory::create([ + 'user_id' => $user->id, + 'tags' => ['foo'], + ]); + $link2 = LinkFactory::create([ + 'user_id' => $user->id, + 'tags' => ['foo', 'bar'], + ]); + $link2 = LinkFactory::create([ + 'user_id' => $user->id, + 'tags' => ['foo', 'bar', 'baz'], + ]); + $query = Query::fromString("#{$tag_to_keep} -#{$tag_to_exclude1} -#{$tag_to_exclude2}"); + + $links = LinksSearcher::getLinks($user, $query); + + $this->assertSame(1, count($links)); + $this->assertSame($link1->id, $links[0]->id); + } + + public function testGetLinksSortsByCreatedAt(): void + { + /** @var string */ + $title = $this->fake('sentence', 10, false); + $user = UserFactory::create(); + $created_at_1 = \Minz\Time::ago(2, 'day'); + $created_at_2 = \Minz\Time::ago(1, 'day'); + $link_1 = LinkFactory::create([ + 'user_id' => $user->id, + 'title' => $title, + 'created_at' => $created_at_1, + ]); + $link_2 = LinkFactory::create([ + 'user_id' => $user->id, + 'title' => $title, + 'created_at' => $created_at_2, + ]); + $query = Query::fromString($title); + + $links = LinksSearcher::getLinks($user, $query); + + $this->assertSame(2, count($links)); + $this->assertSame($link_2->id, $links[0]->id); + $this->assertSame($link_1->id, $links[1]->id); + } + + public function testGetLinksCanLimitResults(): void + { + /** @var string */ + $title = $this->fake('sentence', 10, false); + $user = UserFactory::create(); + $link_1 = LinkFactory::create([ + 'user_id' => $user->id, + 'created_at' => \Minz\Time::ago(3, 'days'), + 'title' => $title, + ]); + $link_2 = LinkFactory::create([ + 'user_id' => $user->id, + 'created_at' => \Minz\Time::ago(2, 'days'), + 'title' => $title, + ]); + $link_3 = LinkFactory::create([ + 'user_id' => $user->id, + 'created_at' => \Minz\Time::ago(1, 'days'), + 'title' => $title, + ]); + $query = Query::fromString($title); + + $links = LinksSearcher::getLinks($user, $query, pagination: [ + 'limit' => 2, + ]); + + $this->assertSame(2, count($links)); + $this->assertSame($link_3->id, $links[0]->id); + $this->assertSame($link_2->id, $links[1]->id); + } + + public function testGetLinksCanOffsetResults(): void + { + /** @var string */ + $title = $this->fake('sentence', 10, false); + $user = UserFactory::create(); + $link_1 = LinkFactory::create([ + 'user_id' => $user->id, + 'created_at' => \Minz\Time::ago(3, 'days'), + 'title' => $title, + ]); + $link_2 = LinkFactory::create([ + 'user_id' => $user->id, + 'created_at' => \Minz\Time::ago(2, 'days'), + 'title' => $title, + ]); + $link_3 = LinkFactory::create([ + 'user_id' => $user->id, + 'created_at' => \Minz\Time::ago(1, 'days'), + 'title' => $title, + ]); + $query = Query::fromString($title); + + $links = LinksSearcher::getLinks($user, $query, pagination: [ + 'limit' => 2, + 'offset' => 1, + ]); + + $this->assertSame(2, count($links)); + $this->assertSame($link_2->id, $links[0]->id); + $this->assertSame($link_1->id, $links[1]->id); + } + + public function testGetLinksExcludesLinksOnlyInNeverCollection(): void + { + /** @var string */ + $title = $this->fake('sentence', 10, false); + $user = UserFactory::create(); + $never_list = $user->neverList(); + $link = LinkFactory::create([ + 'user_id' => $user->id, + 'title' => $title, + ]); + LinkToCollectionFactory::create([ + 'collection_id' => $never_list->id, + 'link_id' => $link->id, + ]); + $query = Query::fromString($title); + + $links = LinksSearcher::getLinks($user, $query); + + $this->assertSame(0, count($links)); + } + + public function testCountLinks(): void + { + /** @var string */ + $title = $this->fake('sentence', 10, false); + $user = UserFactory::create(); + $link = LinkFactory::create([ + 'user_id' => $user->id, + 'title' => $title, + ]); + $query = Query::fromString($title); + + $count = LinksSearcher::countLinks($user, $query); + + $this->assertSame(1, $count); + } + + public function testCountLinksExcludeLinksOnlyInNeverCollection(): void + { + /** @var string */ + $title = $this->fake('sentence', 10, false); + $user = UserFactory::create(); + $never_list = $user->neverList(); + $link = LinkFactory::create([ + 'user_id' => $user->id, + 'title' => $title, + ]); + LinkToCollectionFactory::create([ + 'collection_id' => $never_list->id, + 'link_id' => $link->id, + ]); + $query = Query::fromString($title); + + $count = LinksSearcher::countLinks($user, $query); + + $this->assertSame(0, $count); + } +} diff --git a/tests/search_engine/Query/ParserTest.php b/tests/search_engine/Query/ParserTest.php new file mode 100644 index 00000000..493036d2 --- /dev/null +++ b/tests/search_engine/Query/ParserTest.php @@ -0,0 +1,79 @@ +tokenize($stringQuery); + + $query = $parser->parse($tokens); + + $conditions = $query->getConditions(); + $this->assertSame(3, count($conditions)); + + $this->assertTrue($conditions[0]->isTextCondition()); + $this->assertSame('some', $conditions[0]->getValue()); + + $this->assertTrue($conditions[1]->isTextCondition()); + $this->assertSame('text', $conditions[1]->getValue()); + + $this->assertTrue($conditions[2]->isTextCondition()); + $this->assertSame('and more text', $conditions[2]->getValue()); + } + + public function testParseQualifierUrl(): void + { + $tokenizer = new Tokenizer(); + $parser = new Parser(); + $stringQuery = 'url: https://flus.fr'; + $tokens = $tokenizer->tokenize($stringQuery); + + $query = $parser->parse($tokens); + + $conditions = $query->getConditions(); + $this->assertSame(1, count($conditions)); + + $this->assertTrue($conditions[0]->isQualifierCondition()); + $this->assertSame('url', $conditions[0]->getQualifier()); + $this->assertSame('https://flus.fr', $conditions[0]->getValue()); + } + + public function testParseTag(): void + { + $tokenizer = new Tokenizer(); + $parser = new Parser(); + $stringQuery = '#tag'; + $tokens = $tokenizer->tokenize($stringQuery); + + $query = $parser->parse($tokens); + + $conditions = $query->getConditions(); + $this->assertSame(1, count($conditions)); + + $this->assertTrue($conditions[0]->isTagCondition()); + $this->assertSame('tag', $conditions[0]->getValue()); + $this->assertFalse($conditions[0]->not()); + } + + public function testParseNegativeTag(): void + { + $tokenizer = new Tokenizer(); + $parser = new Parser(); + $stringQuery = '-#tag'; + $tokens = $tokenizer->tokenize($stringQuery); + + $query = $parser->parse($tokens); + + $conditions = $query->getConditions(); + $this->assertSame(1, count($conditions)); + + $this->assertTrue($conditions[0]->isTagCondition()); + $this->assertSame('tag', $conditions[0]->getValue()); + $this->assertTrue($conditions[0]->not()); + } +} diff --git a/tests/search_engine/Query/TokenizerTest.php b/tests/search_engine/Query/TokenizerTest.php new file mode 100644 index 00000000..ebec94ae --- /dev/null +++ b/tests/search_engine/Query/TokenizerTest.php @@ -0,0 +1,148 @@ + TokenType::EndOfQuery]; + + $tokens = $tokenizer->tokenize($query); + + $this->assertSame(count($expectedTokens), count($tokens)); + for ($i = 0; $i < count($tokens); $i++) { + $token = $tokens[$i]; + $expectedToken = $expectedTokens[$i]; + $this->assertSame($expectedToken['type'], $token['type']); + if (isset($expectedToken['value'])) { + $this->assertTrue(isset($token['value'])); + $this->assertSame($expectedToken['value'], $token['value']); + } + } + } + + /** + * @return array + */ + public static function tokensProvider(): array + { + // Note that positions are wrong in the Tokens. This is because it + // would be too complicated to maintain correctly and efficiently. + // Also, they are not used during tests. It's mainly so that PHPStan + // doesn't scream too loud. + return [ + [ + 'some text', + [ + ['type' => TokenType::Text, 'value' => 'some', 'position' => 0], + ['type' => TokenType::Text, 'value' => 'text', 'position' => 0], + ], + ], + + [ + '"some text"', + [ + ['type' => TokenType::Text, 'value' => 'some text', 'position' => 0], + ], + ], + + [ + '\"some text', + [ + ['type' => TokenType::Text, 'value' => '"some', 'position' => 0], + ['type' => TokenType::Text, 'value' => 'text', 'position' => 0], + ], + ], + + [ + '\some text', + [ + ['type' => TokenType::Text, 'value' => 'some', 'position' => 0], + ['type' => TokenType::Text, 'value' => 'text', 'position' => 0], + ], + ], + + [ + 'some\ text', + [ + ['type' => TokenType::Text, 'value' => 'some text', 'position' => 0], + ], + ], + + [ + '\\\\', // equivalent to '\\' in the string passed to the tokenizer + [ + ['type' => TokenType::Text, 'value' => '\\', 'position' => 0], + ], + ], + + [ + '"some text', + [ + ['type' => TokenType::Text, 'value' => 'some text ', 'position' => 0], + ], + ], + + [ + 'url: https://flus.fr', + [ + ['type' => TokenType::Qualifier, 'value' => 'url', 'position' => 0], + ['type' => TokenType::Text, 'value' => 'https://flus.fr', 'position' => 0], + ], + ], + + [ + 'url:', + [ + ['type' => TokenType::Qualifier, 'value' => 'url', 'position' => 0], + ], + ], + + [ + '"url: https://flus.fr"', + [ + ['type' => TokenType::Text, 'value' => 'url: https://flus.fr', 'position' => 0], + ], + ], + + [ + '#tag', + [ + ['type' => TokenType::Tag, 'value' => 'tag', 'position' => 0], + ], + ], + + [ + '-#tag', + [ + ['type' => TokenType::Not, 'position' => 0], + ['type' => TokenType::Tag, 'value' => 'tag', 'position' => 0], + ], + ], + + [ + 'some text #tag url: https://flus.fr "and more text"', + [ + ['type' => TokenType::Text, 'value' => 'some', 'position' => 0], + ['type' => TokenType::Text, 'value' => 'text', 'position' => 0], + ['type' => TokenType::Tag, 'value' => 'tag', 'position' => 0], + ['type' => TokenType::Qualifier, 'value' => 'url', 'position' => 0], + ['type' => TokenType::Text, 'value' => 'https://flus.fr', 'position' => 0], + ['type' => TokenType::Text, 'value' => 'and more text', 'position' => 0], + ], + ], + ]; + } +}