From 7332fab8a091f3ae39f744bd711bae9f56287f02 Mon Sep 17 00:00:00 2001 From: Mark Wardle Date: Thu, 7 Nov 2024 12:18:09 +0000 Subject: [PATCH] Improve ranked-search particularly for empty or invalid input --- src/com/eldrix/hermes/core.clj | 3 +- src/com/eldrix/hermes/impl/search.clj | 61 ++++++++++++++++---------- test/com/eldrix/hermes/search_test.clj | 26 ++++++++++- 3 files changed, 63 insertions(+), 27 deletions(-) diff --git a/src/com/eldrix/hermes/core.clj b/src/com/eldrix/hermes/core.clj index 7b1c858..83ca268 100644 --- a/src/com/eldrix/hermes/core.clj +++ b/src/com/eldrix/hermes/core.clj @@ -619,7 +619,8 @@ [[search]], in which no results would be returned if there is a token that matches no results, [[ranked-search]] simply scores from best to worst. This function is most useful for finding best matches, while [[search]] - is best used for autocompletion." + is best used for autocompletion. Unlike [[search]], this function returns + no results if there is no search string, or no tokens in the search string." [^Svc svc params] (search/do-ranked-search (.-searcher svc) (make-search-params svc params))) diff --git a/src/com/eldrix/hermes/impl/search.clj b/src/com/eldrix/hermes/impl/search.clj index f024260..9ec86ed 100644 --- a/src/com/eldrix/hermes/impl/search.clj +++ b/src/com/eldrix/hermes/impl/search.clj @@ -218,10 +218,14 @@ (let [term (.toString termAtt)] (recur (.incrementToken tokenStream) (conj result term)))))))) -(defn- make-autocomplete-tokens-query +(defn make-autocomplete-tokens-query "Create a query that matches ALL tokens within `s` for field `field-name`. This is most appropriate for autocompletion, in which a user would expect - to type and have all tokens considered in search." + to type and have all tokens considered in search. If `s` is nil, or has + no valid tokens (e.g an empty string or only stop characters), then this + returns nil. This permits use as a filter in autocompletion such that an + unfiltered list could be returned. This behaviour is different to + [[make-ranked-search-tokens-query]]." (^BooleanQuery [field-name s] (make-autocomplete-tokens-query field-name s 0)) (^BooleanQuery [field-name s fuzzy] (with-open [analyzer (StandardAnalyzer.)] @@ -234,23 +238,27 @@ (.build builder)) (first qs))))))) -(defn- make-ranked-search-tokens-query +(defn make-ranked-search-tokens-query "Create a query that searches using the tokens in `s`. This should be used for 'ranked' search, in which one is looking for a best match. This means some tokens may be ignored, but results will be ranked higher if more - tokens match." + tokens match. Unlike [[make-autocomplete-tokens-query]] this will always + return a query, and will return a MatchNoDocsQuery if the `s` is nil, or + has no valid tokens." (^BooleanQuery [field-name s] (make-ranked-search-tokens-query field-name s 0)) (^BooleanQuery [field-name s fuzzy] (with-open [analyzer (StandardAnalyzer.)] - (when s - (let [qs (map #(make-token-query field-name % fuzzy) (tokenize analyzer field-name s))] - (if (pos-int? (count qs)) - (let [builder (BooleanQuery$Builder.)] - (doseq [q qs] - (.add builder q BooleanClause$Occur/SHOULD)) - (.build builder)) - (first qs))))))) + (or + (when s + (let [qs (map #(make-token-query field-name % fuzzy) (tokenize analyzer field-name s))] + (if (pos-int? (count qs)) + (let [builder (BooleanQuery$Builder.)] + (doseq [q qs] + (.add builder q BooleanClause$Occur/SHOULD)) + (.build builder)) + (first qs)))) + (MatchNoDocsQuery.))))) ;; fallback to always returning a query that will match no results (defn q-or [queries] (lucene/q-or queries)) @@ -288,12 +296,13 @@ ^Query [{:keys [s query fuzzy show-fsn? inactive-concepts? inactive-descriptions? boost-length? concept-refsets properties] :or {show-fsn? false, inactive-concepts? false, inactive-descriptions? true, boost-length? true}}] - (let [qb (cond-> (BooleanQuery$Builder.) + (let [s-query (make-autocomplete-tokens-query "nterm" s fuzzy) ;; doesn't matter if s is nil or blank + qb (cond-> (BooleanQuery$Builder.) - s - (.add (make-autocomplete-tokens-query "nterm" s fuzzy) BooleanClause$Occur/MUST) + s-query ;; if we have a search string AND it was turned into a non-nil query, use it + (.add s-query BooleanClause$Occur/MUST) - query + query ;; also bring in any other query (.add query BooleanClause$Occur/MUST) (not inactive-concepts?) @@ -418,6 +427,8 @@ items." (when (and (zero? fuzzy) (pos? fallback)) ; only fallback to fuzzy search if no fuzziness requested first time (do-search searcher (assoc params :fuzzy fallback))))))) +(s/fdef do-ranked-search + :args (s/cat :searcher ::searcher :params ::search-params)) (defn do-ranked-search "A modified [[do-search]] that returns results in ranked order. `max-hits` must be specified. @@ -427,16 +438,18 @@ items." ``` will return the 'best' match for the search tokens. Unlike [[do-search]], which is useful for autocompletion, this will not return zero results if - one or more token is not found. Instead, results will be ranked from 'best' - to 'worst'. An important design consideration here was to not alter the - functioning of [[do-search]] with conditionals." + one or more token is not found. Instead, results will be ranked from 'best' + to 'worst'. An important design consideration here was to not alter the + functioning of [[do-search]] with conditionals. Note: no results will be + returned if 's' is nil, or empty, or contains no valid word tokens." [searcher {:keys [s fuzzy query] :as params}] (let [q1 (make-ranked-search-tokens-query "nterm" s fuzzy) - q2 (if query (q-and [q1 query]) q1)] - (do-search searcher (-> params ;; alter original parameters to use internal API to do ranked search, not autocompletion - (dissoc :s) - (assoc :boost-length? false - :query q2))))) + q2 (if (and q1 query) (q-and [q1 query]) q1)] + (when q2 + (do-search searcher (-> params ;; alter original parameters to use internal API to do ranked search, not autocompletion + (dissoc :s) + (assoc :boost-length? false + :query q2)))))) (defn q-self "Returns a query that will only return documents for the concept specified." diff --git a/test/com/eldrix/hermes/search_test.clj b/test/com/eldrix/hermes/search_test.clj index 2b48546..e7d1cc4 100644 --- a/test/com/eldrix/hermes/search_test.clj +++ b/test/com/eldrix/hermes/search_test.clj @@ -1,10 +1,11 @@ (ns com.eldrix.hermes.search-test (:require [clojure.core.async :as async] [clojure.spec.gen.alpha :as gen] - [clojure.test :refer [deftest is]] + [clojure.test :refer [deftest is testing]] [com.eldrix.hermes.core :as hermes] [com.eldrix.hermes.impl.lucene :as lucene] - [com.eldrix.hermes.impl.search :as search])) + [com.eldrix.hermes.impl.search :as search]) + (:import (org.apache.lucene.search Query))) (def example-results-1 [{:id 464271012 @@ -70,6 +71,27 @@ (with-open [svc (hermes/open "snomed.db")] (is (empty? (hermes/search svc {:constraint "<24700007" :query (search/q-concept-id 24700007)}))))) +(deftest ^:live test-token-queries + (let [ss (gen/sample (gen/string) 1000)] + (doseq [s ss] + (let [q1 (search/make-autocomplete-tokens-query "nterm" s) + q2 (search/make-ranked-search-tokens-query "nterm" s)] + (is (or (nil? q1) (instance? Query q1)) "Autocomplete tokenisation should work with arbitrary string input") + (is (instance? Query q2) "Ranked search tokenisation should work with arbitrary string input and always return a query"))))) + +(deftest ^:live empty-search + (with-open [svc (hermes/open "snomed.db")] + (testing "autocompletion" + (is (seq (hermes/search svc {:max-hits 1 :constraint "<14679004"})) + "For autocompletion, with no search string, result should be unfiltered sequence") + (is (seq (hermes/search svc {:s " " :max-hits 1 :constraint "<14679004"})) + "For autocompletion, when search term resolves to zero tokens, result should be unfiltered sequence")) + (testing "ranked search" + (is (empty? (hermes/ranked-search svc {:max-hits 1 :constraint "<14679004"})) + "For ranked search, with no search string, result should be empty AND it must not throw an 'null query' exception") + (is (empty? (hermes/ranked-search svc {:s " " :max-hits 1 :constraint "<14679004"})) + "For ranked search, when search term resolves to zero tokens, result should be empty AND it must not throw an 'null query' exception")))) + (comment (def svc (hermes/open "snomed.db")) (def searcher (.-searcher svc))