From bd35e7e2a0c00a945721ab6ad673d68672b7db34 Mon Sep 17 00:00:00 2001 From: Luis Borjas Reyes Date: Sat, 30 Jan 2021 17:10:06 -0500 Subject: [PATCH 1/2] Use an actual full-text search index for autocomplete * Redefines the materialized view, using a stored weighted vector. * Updates the existing query, to use said materialized view. --- .../202101301530_full_text_autocomplete.sql | 39 +++++++++++++++++++ src/Database/Queries.hs | 27 +++++++++++-- 2 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 migrations/202101301530_full_text_autocomplete.sql diff --git a/migrations/202101301530_full_text_autocomplete.sql b/migrations/202101301530_full_text_autocomplete.sql new file mode 100644 index 0000000..1f41dea --- /dev/null +++ b/migrations/202101301530_full_text_autocomplete.sql @@ -0,0 +1,39 @@ +drop materialized view if exists geocode.city_autocomplete; + +-- we're using a precalculated to_tsvector column to pay some +-- storage in return for some performance (the vector doesn't +-- have to be calculated in every query.) +-- more about weights and ranks: +-- https://www.postgresql.org/docs/12/textsearch-controls.html#TEXTSEARCH-RANKING +-- https://rob.conery.io/2019/10/29/fine-tuning-full-text-search-with-postgresql-12/ +create materialized view if not exists geocode.city_autocomplete + as select geocode.city.geonameid, geocode.city.name, + location[0] as longitude, location[1] as latitude, + population, timezone, + geocode.country.iso as country_code, + geocode.country.name as country_name, + geocode.region.name as region_name, + geocode.district.name as district_name, + ( + setweight(to_tsvector('simple', geocode.city.name), 'A') || ' ' || + setweight(to_tsvector(coalesce(geocode.district.name, '')), 'C') || ' ' || + setweight(to_tsvector(coalesce(geocode.region.name, '')), 'C') || ' ' || + setweight(to_tsvector(coalesce(geocode.country.iso, '')), 'B') || ' ' || + setweight(to_tsvector(coalesce(geocode.country.name, '')), 'B') || ' ' || + setweight(to_tsvector(replace(coalesce(alternatenames, ''), ',', ' ')), 'D') + ) as autocomplete_doc + from geocode.city + left join geocode.country using(isocode) + left join geocode.region using(isocode, regcode) + left join geocode.district using(isocode, regcode, discode); + +-- cf. https://www.postgresql.org/docs/12/textsearch-tables.html +create index if not exists idx_full_text_autocomplete + on geocode.city_autocomplete using gin(autocomplete_doc); + +-- cf. https://www.postgresql.org/docs/12/sql-createfunction.html +create or replace function web_to_tsquery_prefix(text) returns tsquery + as 'select to_tsquery(quote_literal($1) || '':*'');' + language sql + immutable + returns null on null input; diff --git a/src/Database/Queries.hs b/src/Database/Queries.hs index 82b4dcf..b235eb9 100644 --- a/src/Database/Queries.hs +++ b/src/Database/Queries.hs @@ -33,6 +33,8 @@ cityCount :: Has Database sig m => m Int cityCount = do counts <- query_ "select count(geonameid) from geocode.city" pure $ maybe 0 fromOnly (listToMaybe counts) + + -- | Given an API Key, find out if it exists and is enabled; -- return status and current quota. findApiKey :: Has Database sig m => Text -> m (Bool, Maybe Integer) @@ -41,14 +43,30 @@ findApiKey key = do pure $ fromMaybe (False, Just 0) (listToMaybe exists) --- | Fast query for name autocomplete: biased towards more populous cities, --- doesn't order by how close the name is to the input; uses a denormalized --- materialized view. +-- | Fast query for name autocomplete: uses a denormalized, materialized view +-- that precomputes city, region, district, country and alternate names +-- in a weighted full-text search vector +-- see: +-- https://www.postgresql.org/docs/12/textsearch-controls.html#TEXTSEARCH-RANKING +-- https://www.postgresql.org/docs/12/textsearch-controls.html#TEXTSEARCH-RANKING +-- https://rob.conery.io/2019/10/29/fine-tuning-full-text-search-with-postgresql-12/ +-- Note that `web_to_tsquery_prefix` is a function we defined, to sanitize +-- input and interpret it as a simple query where each token is a prefix. cityAutoComplete :: Has Database sig m => Text -> Maybe Int -> m [CityQ] cityAutoComplete q limit' = do let limit = defaultLimit 5 limit' query - "select * from geocode.city_autocomplete where name %> ? order by name <-> ?, population desc limit ?" + [sql| + select geonameid, name, longitude, latitude, + population, timezone, country_code, + country_name, region_name, district_name + from geocode.city_autocomplete + where + web_to_tsquery_prefix(?) @@ autocomplete_doc + order by + ts_rank(autocomplete_doc, web_to_tsquery_prefix(?)) desc + limit ? + |] (q, q, limit) -- | Slightly slower, but more precise, query to find a city that @@ -108,6 +126,7 @@ defaultLimit def = -- NOTES {- to run queries manually: -- stack ghci +-- import qualified Database.PostgreSQL.Simple as PG λ> let (DatabaseUrl url) = defaultConfig & appDatabaseUrl λ> conn <- PG.connectPostgreSQL $ encodeUtf8 url λ> runDatabaseWithConnection conn $ Database.Queries.cityCount From 0ec166dd5c674918bebb8d9c2992662803df0681 Mon Sep 17 00:00:00 2001 From: Luis Borjas Reyes Date: Sat, 30 Jan 2021 17:28:28 -0500 Subject: [PATCH 2/2] update dockerfile to include migrations dir Also, mention it in the README --- Dockerfile | 1 + README.md | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/Dockerfile b/Dockerfile index 16a1d0e..3958380 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,6 +70,7 @@ RUN apt-get update && apt-get install -y libpq-dev postgresql-client-12 RUN adduser geocode-city-api USER geocode-city-api +COPY --from=build /opt/geocode-city-api/migrations /opt/geocode-city-api/migrations COPY --from=build /opt/geocode-city-api/bin /opt/geocode-city-api WORKDIR /opt/geocode-city-api diff --git a/README.md b/README.md index ff57b2d..d9961e1 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,31 @@ is that for the hyperloglog entries to "fall off" as we reach max memory, I'm us heroku redis:maxmemory REDIS-NAME --policy allkeys-lru -a geocode-city ``` +#### Database migrations (Heroku specific) + +I _should_ set up a [`release` phase](https://devcenter.heroku.com/articles/container-registry-and-runtime#release-phase) to [run migrations automatically](https://devcenter.heroku.com/articles/release-phase). However, I'd rather not have that overhead for something that +happens somewhat seldom, so I instead use a [one-off dyno](https://devcenter.heroku.com/articles/container-registry-and-runtime#one-off-dynos): + +```sh +> heroku run bash -a geocode-city +Running bash on ⬢ geocode-city... up, run.7528 (Hobby) +~ $ pwd +/opt/geocode-city-api +~ $ ls +geocode-city-api-exe migrations +~ $ ./geocode-city-api-exe --migrate +Initializing schema +NOTICE: relation "schema_migrations" already exists, skipping +Ok: 202101182025_geonames_raw.sql +Ok: 202101182130_non_city_tables.sql +Ok: 202101182140_city_table.sql +Ok: 202101182150_ft_and_indexes.sql +Ok: 202101191900_api_keys_table.sql +Ok: 202101201800_materialize_autocomplete.sql +Ok: 202101231600_api_quotas.sql +Execute: 202101301530_full_text_autocomplete.sql +All migrations ran. +``` ## Development