Skip to content

Commit

Permalink
Merge pull request #20 from geocode-city/full-text-autocomplete
Browse files Browse the repository at this point in the history
Use an actual full-text search index for autocomplete
  • Loading branch information
lfborjas authored Jan 30, 2021
2 parents 7f2fdb2 + 0ec166d commit f712916
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 4 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ RUN apt-get update && apt-get install -y libpq-dev postgresql-client-12
RUN adduser geocode-city-api
USER geocode-city-api

COPY --from=build /opt/geocode-city-api/migrations /opt/geocode-city-api/migrations
COPY --from=build /opt/geocode-city-api/bin /opt/geocode-city-api

WORKDIR /opt/geocode-city-api
Expand Down
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,31 @@ is that for the hyperloglog entries to "fall off" as we reach max memory, I'm us
heroku redis:maxmemory REDIS-NAME --policy allkeys-lru -a geocode-city
```

#### Database migrations (Heroku specific)

I _should_ set up a [`release` phase](https://devcenter.heroku.com/articles/container-registry-and-runtime#release-phase) to [run migrations automatically](https://devcenter.heroku.com/articles/release-phase). However, I'd rather not have that overhead for something that
happens somewhat seldom, so I instead use a [one-off dyno](https://devcenter.heroku.com/articles/container-registry-and-runtime#one-off-dynos):

```sh
> heroku run bash -a geocode-city
Running bash on ⬢ geocode-city... up, run.7528 (Hobby)
~ $ pwd
/opt/geocode-city-api
~ $ ls
geocode-city-api-exe migrations
~ $ ./geocode-city-api-exe --migrate
Initializing schema
NOTICE: relation "schema_migrations" already exists, skipping
Ok: 202101182025_geonames_raw.sql
Ok: 202101182130_non_city_tables.sql
Ok: 202101182140_city_table.sql
Ok: 202101182150_ft_and_indexes.sql
Ok: 202101191900_api_keys_table.sql
Ok: 202101201800_materialize_autocomplete.sql
Ok: 202101231600_api_quotas.sql
Execute: 202101301530_full_text_autocomplete.sql
All migrations ran.
```

## Development

Expand Down
39 changes: 39 additions & 0 deletions migrations/202101301530_full_text_autocomplete.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
drop materialized view if exists geocode.city_autocomplete;

-- we're using a precalculated to_tsvector column to pay some
-- storage in return for some performance (the vector doesn't
-- have to be calculated in every query.)
-- more about weights and ranks:
-- https://www.postgresql.org/docs/12/textsearch-controls.html#TEXTSEARCH-RANKING
-- https://rob.conery.io/2019/10/29/fine-tuning-full-text-search-with-postgresql-12/
create materialized view if not exists geocode.city_autocomplete
as select geocode.city.geonameid, geocode.city.name,
location[0] as longitude, location[1] as latitude,
population, timezone,
geocode.country.iso as country_code,
geocode.country.name as country_name,
geocode.region.name as region_name,
geocode.district.name as district_name,
(
setweight(to_tsvector('simple', geocode.city.name), 'A') || ' ' ||
setweight(to_tsvector(coalesce(geocode.district.name, '')), 'C') || ' ' ||
setweight(to_tsvector(coalesce(geocode.region.name, '')), 'C') || ' ' ||
setweight(to_tsvector(coalesce(geocode.country.iso, '')), 'B') || ' ' ||
setweight(to_tsvector(coalesce(geocode.country.name, '')), 'B') || ' ' ||
setweight(to_tsvector(replace(coalesce(alternatenames, ''), ',', ' ')), 'D')
) as autocomplete_doc
from geocode.city
left join geocode.country using(isocode)
left join geocode.region using(isocode, regcode)
left join geocode.district using(isocode, regcode, discode);

-- cf. https://www.postgresql.org/docs/12/textsearch-tables.html
create index if not exists idx_full_text_autocomplete
on geocode.city_autocomplete using gin(autocomplete_doc);

-- cf. https://www.postgresql.org/docs/12/sql-createfunction.html
create or replace function web_to_tsquery_prefix(text) returns tsquery
as 'select to_tsquery(quote_literal($1) || '':*'');'
language sql
immutable
returns null on null input;
27 changes: 23 additions & 4 deletions src/Database/Queries.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ cityCount :: Has Database sig m => m Int
cityCount = do
counts <- query_ "select count(geonameid) from geocode.city"
pure $ maybe 0 fromOnly (listToMaybe counts)


-- | Given an API Key, find out if it exists and is enabled;
-- return status and current quota.
findApiKey :: Has Database sig m => Text -> m (Bool, Maybe Integer)
Expand All @@ -41,14 +43,30 @@ findApiKey key = do
pure $ fromMaybe (False, Just 0) (listToMaybe exists)


-- | Fast query for name autocomplete: biased towards more populous cities,
-- doesn't order by how close the name is to the input; uses a denormalized
-- materialized view.
-- | Fast query for name autocomplete: uses a denormalized, materialized view
-- that precomputes city, region, district, country and alternate names
-- in a weighted full-text search vector
-- see:
-- https://www.postgresql.org/docs/12/textsearch-controls.html#TEXTSEARCH-RANKING
-- https://www.postgresql.org/docs/12/textsearch-controls.html#TEXTSEARCH-RANKING
-- https://rob.conery.io/2019/10/29/fine-tuning-full-text-search-with-postgresql-12/
-- Note that `web_to_tsquery_prefix` is a function we defined, to sanitize
-- input and interpret it as a simple query where each token is a prefix.
cityAutoComplete :: Has Database sig m => Text -> Maybe Int -> m [CityQ]
cityAutoComplete q limit' = do
let limit = defaultLimit 5 limit'
query
"select * from geocode.city_autocomplete where name %> ? order by name <-> ?, population desc limit ?"
[sql|
select geonameid, name, longitude, latitude,
population, timezone, country_code,
country_name, region_name, district_name
from geocode.city_autocomplete
where
web_to_tsquery_prefix(?) @@ autocomplete_doc
order by
ts_rank(autocomplete_doc, web_to_tsquery_prefix(?)) desc
limit ?
|]
(q, q, limit)

-- | Slightly slower, but more precise, query to find a city that
Expand Down Expand Up @@ -108,6 +126,7 @@ defaultLimit def =
-- NOTES
{- to run queries manually:
-- stack ghci
-- import qualified Database.PostgreSQL.Simple as PG
λ> let (DatabaseUrl url) = defaultConfig & appDatabaseUrl
λ> conn <- PG.connectPostgreSQL $ encodeUtf8 url
λ> runDatabaseWithConnection conn $ Database.Queries.cityCount
Expand Down

0 comments on commit f712916

Please sign in to comment.