Add PG extension AGE (#18)
* add extension AGE
* add notebooks
* update docker-compose files
haobibo authored Apr 15, 2024
1 parent 90320a8 commit 0312971
Showing 16 changed files with 1,852 additions and 191 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -19,5 +19,6 @@ Building blocks for the following big data project use cases are supported in th

- [Flink / Spark](https://hub.docker.com/r/qpod/bigdata)
- [Kafka](https://hub.docker.com/r/qpod/kafka)
- [Elasticsearch](https://hub.docker.com/r/qpod/elasticsearch)
- [Postgresql with Extensions](https://hub.docker.com/r/qpod/postgres-ext): including pgvector, age, and [many others](https://github.com/QPod/lab-data/blob/main/docker_postgres/rootfs/opt/utils/install_list_pgext.tpl.apt)
- [GreenplumDB](https://hub.docker.com/r/qpod/greenplum)
- [Elasticsearch](https://hub.docker.com/r/qpod/elasticsearch)
2 changes: 0 additions & 2 deletions docker_greenplum/example/gpdb-single-node/docker-compose.yml
@@ -1,5 +1,3 @@
version: "3"

networks:
greenplum:
ipam:
2 changes: 0 additions & 2 deletions docker_greenplum/example/gpdb-single-vm/docker-compose.yml
@@ -1,5 +1,3 @@
version: "3"

networks:
greenplum:
ipam:
@@ -1,5 +1,3 @@
version: "3"

# Standalone kraft mode, ref: https://github.com/bitnami/containers/tree/main/bitnami/kafka#kafka-without-zookeeper-kraft
# Note: the volume folder read/write permission is required for uid 1001 and gid 1000. use the command below for debug:
# `KAFKA_DATA_DIR="/data/database/kafka-bitnami/broker" && mkdir -pv $KAFKA_DATA_DIR && chmod -R ugo+rws $KAFKA_DATA_DIR`
@@ -1,5 +1,3 @@
version: '3'

## Standalone kraft mode - confluent kafka: https://github.com/confluentinc/cp-all-in-one/blob/7.3.0-post/cp-all-in-one-kraft/docker-compose.yml
# Note: the volume folder read/write permission is required for uid 1001 and gid 1000. use the command below for debug:
# `KAFKA_DATA_DIR="/data/database/kafka-bitnami/broker" && mkdir -pv $KAFKA_DATA_DIR && chmod -R ugo+rws $KAFKA_DATA_DIR`
308 changes: 142 additions & 166 deletions docker_postgres/README.md

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions docker_postgres/demo/docker-compose.yml
@@ -0,0 +1,26 @@
services:
  db-postgres:
    # To connect inside the container: su postgres && psql -d postgres -U postgres
    image: qpod0dev/postgres-ext
    container_name: db-postgres
    environment:
      POSTGRES_DB: postgres
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres

  svc-jupyter:
    image: quay.io/jupyter/minimal-notebook
    depends_on:
      - db-postgres
    container_name: dev-jupyter
    hostname: jupyter
    user: root
    environment:
      NB_USER: root
      NB_UID: 0
      NB_GID: 0
    ports:
      - 8888:8888
    volumes:
      - "./:/home/root/work"
    command: ["start-notebook.py", "--allow-root"]
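
To try the demo, the stack can be brought up from the `docker_postgres/demo` folder with Docker Compose; a minimal sketch (the service names come from the compose file above, the rest is standard Docker Compose usage):

```shell
# Start the Postgres + Jupyter demo in the background
docker compose up -d

# Follow the Jupyter logs to find the login URL with its access token
docker compose logs -f svc-jupyter
```

The notebooks below can then be opened at http://localhost:8888 and reach the database through the `db-postgres` hostname on the compose network.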
106 changes: 106 additions & 0 deletions docker_postgres/demo/test-apache-age.ipynb
@@ -0,0 +1,106 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0e5737ed-ad83-4902-b551-ddd413f13475",
"metadata": {},
"source": [
"# Example - Using Python to connect to PG Apache AGE\n",
"\n",
"- reference: https://age.apache.org/getstarted/quickstart\n",
"- to install `apache-age-python`, code compile is currently required:\n",
"\n",
"```shell\n",
"sudo apt-get -y install gcc libpq-dev \"postgresql-client-${VER_PG}\"\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0fa8e36-c263-4f95-b715-670ee760167d",
"metadata": {},
"outputs": [],
"source": [
"! pip install apache-age-python"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1700acf5-8cc6-495a-ab7d-44115cdc7b94",
"metadata": {},
"outputs": [],
"source": [
"import age\n",
"import psycopg2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dac7a486-6fc8-43f5-88f9-ed57d507becb",
"metadata": {},
"outputs": [],
"source": [
"conn = psycopg2.connect(host=\"db-postgres\", port=\"5432\", dbname=\"postgres\", user=\"postgres\", password=\"postgres\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "759fe54a-7897-4eaa-a304-ff9d084c0704",
"metadata": {},
"outputs": [],
"source": [
"DSN = \"host=db-postgres port=5432 dbname=postgres user=postgres password=postgres\"\n",
"TEST_GRAPH_NAME = 'agens'\n",
"\n",
"conn = age.connect(dsn=DSN, graph=TEST_GRAPH_NAME)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56c19b14-aaa5-4d73-b049-88f51fb983e6",
"metadata": {},
"outputs": [],
"source": [
"cursor = conn.execCypher(\"CREATE (n:Person {name: %s, title: 'Developer'}) RETURN n\", params=('Andy',))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "affae3fa-bad0-4685-a05b-fb689287fce3",
"metadata": {},
"outputs": [],
"source": [
"from age.models import Vertex\n",
"for row in cursor:\n",
" print(\"Vertex: %s , Type: %s \" % (Vertex, type(row[0])))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
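
For reference, the same graph can also be queried without the `apache-age-python` wrapper, using plain `psycopg2` and AGE's SQL-level `cypher()` function; a sketch following the Apache AGE quickstart (it reuses the notebook's connection parameters and assumes the `agens` graph created above already exists):

```python
import psycopg2

# Same connection parameters as the notebook above (demo docker-compose defaults)
conn = psycopg2.connect(host="db-postgres", port=5432, dbname="postgres",
                        user="postgres", password="postgres")
cur = conn.cursor()

# Load AGE into this session and put ag_catalog on the search_path
cur.execute("CREATE EXTENSION IF NOT EXISTS age;")
cur.execute("LOAD 'age';")
cur.execute('SET search_path = ag_catalog, "$user", public;')

# Run a Cypher query through the cypher() SQL function; results come back as agtype
cur.execute("""
    SELECT * FROM cypher('agens', $$
        MATCH (n:Person) RETURN n.name
    $$) AS (name agtype);
""")
print(cur.fetchall())
conn.commit()
```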
163 changes: 163 additions & 0 deletions docker_postgres/demo/test-pgvector.ipynb
@@ -0,0 +1,163 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0e5737ed-ad83-4902-b551-ddd413f13475",
"metadata": {},
"source": [
"# Example - Using Python to connect to PG vector\n",
"\n",
"https://github.com/pgvector/pgvector-python/blob/master/examples/openai_embeddings.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0fa8e36-c263-4f95-b715-670ee760167d",
"metadata": {},
"outputs": [],
"source": [
"! pip install psycopg[binary] pgvector"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1700acf5-8cc6-495a-ab7d-44115cdc7b94",
"metadata": {},
"outputs": [],
"source": [
"from pgvector.psycopg import register_vector\n",
"import psycopg"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dac7a486-6fc8-43f5-88f9-ed57d507becb",
"metadata": {},
"outputs": [],
"source": [
"conn = psycopg.connect(\n",
" conninfo='postgresql://postgres:postgres@db-postgres/postgres',\n",
" autocommit=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e399b5c-e11b-4fd9-b430-7774d2bb8457",
"metadata": {},
"outputs": [],
"source": [
"conn.execute('CREATE EXTENSION IF NOT EXISTS vector')\n",
"register_vector(conn)\n",
"conn.execute('DROP TABLE IF EXISTS documents')\n",
"conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding vector(1536))')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36c139e8",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def get_embeddings(inputs: list):\n",
" dimensions = 1536\n",
" embeddings = [np.random.rand(dimensions) for _ in inputs]\n",
" return embeddings\n",
"\n",
"# The real code should by something like:\n",
"# from openai import OpenAI\n",
"# OPENAI_API_KEY=\"*\"\n",
"# client = OpenAI(api_key=OPENAI_API_KEY)\n",
"# response = client.embeddings.create(input=input, model='text-embedding-3-small')\n",
"# embeddings = [v.embedding for v in response.data]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b52db809-d008-448a-99bd-9db57dc35218",
"metadata": {},
"outputs": [],
"source": [
"inputs = [\n",
" 'The dog is barking',\n",
" 'The cat is purring',\n",
" 'The bear is growling'\n",
"]\n",
"\n",
"embeddings = get_embeddings(inputs)"
]
},
{
"cell_type": "markdown",
"id": "dff8234a-4bf2-4abc-8d3f-68a233225792",
"metadata": {},
"source": [
"## Insert doc contents and embeddings into DB table"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b507cff0-dae1-42ea-b71b-c8f230474cb0",
"metadata": {},
"outputs": [],
"source": [
"for content, embedding in zip(inputs, embeddings):\n",
" conn.execute('INSERT INTO documents (content, embedding) VALUES (%s, %s)', (content, embedding))"
]
},
{
"cell_type": "markdown",
"id": "c7b240ae-1d24-45a7-b96a-de89f50f0c14",
"metadata": {},
"source": [
"## Query DB table using embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65391dfc",
"metadata": {},
"outputs": [],
"source": [
"document_id = 1\n",
"neighbors = conn.execute(\n",
" 'SELECT content FROM documents WHERE id != %(id)s ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 5',\n",
" {'id': document_id}\n",
").fetchall()\n",
"for neighbor in neighbors:\n",
" print(neighbor[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
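
With real embeddings in place, an approximate index keeps the nearest-neighbour query fast as the table grows; a sketch continuing from the cells above (the HNSW index and the `<=>` cosine-distance operator are standard pgvector features; the index name and the query text are made up for illustration):

```python
# Build an approximate HNSW index on the embedding column using cosine distance
# (the exact sequential scan used above keeps working without it)
conn.execute(
    'CREATE INDEX IF NOT EXISTS documents_embedding_idx '
    'ON documents USING hnsw (embedding vector_cosine_ops)'
)

# Embed an ad-hoc query; register_vector(conn) above lets numpy arrays be passed directly
query_embedding = get_embeddings(['An animal is making noise'])[0]

rows = conn.execute(
    'SELECT content, embedding <=> %s AS distance FROM documents ORDER BY distance LIMIT 3',
    (query_embedding,)
).fetchall()
for content, distance in rows:
    print(f'{distance:.4f}  {content}')
```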
8 changes: 4 additions & 4 deletions docker_postgres/postgres-ext.Dockerfile
@@ -9,14 +9,14 @@ LABEL maintainer="haobibo@gmail.com"
COPY rootfs /

RUN set -x && . /opt/utils/script-utils.sh && . /opt/utils/script-setup-pg_ext_mirror.sh \
## Generate a package list based on PG_MAJOR version
&& apt-get update && apt-get install -y gettext \
&& envsubst < /opt/utils/install_list_pgext.tpl.apt > /opt/utils/install_list_pgext.apt \
&& rm -rf /opt/utils/install_list_pgext.tpl.apt \
&& echo "To install PG extensions: $(cat /opt/utils/install_list_pgext.apt)" \
&& install_apt /opt/utils/install_list_pgext.apt \
## Install extensions
&& . /opt/utils/script-setup-pg_ext.sh \
&& ls -alh /usr/share/postgresql/*/extension/*.control | sort \
&& echo "include_dir='./conf.d'" >> /var/lib/postgresql/data/postgresql.conf \
## Hack: fix system python / conda python
&& cp -rf /opt/conda/lib/python3.11/platform.py.bak /opt/conda/lib/python3.11/platform.py \
&& echo "Clean up" && list_installed_packages && install__clean

USER postgres
@@ -0,0 +1,23 @@
CREATE OR REPLACE PROCEDURE enable_all_extensions()
LANGUAGE plpgsql
AS $$
DECLARE
    extension_name TEXT;
BEGIN
    FOR extension_name IN
        SELECT name FROM pg_available_extensions
        WHERE name NOT IN (SELECT extname AS name FROM pg_extension)
        ORDER BY name
    LOOP
        BEGIN
            EXECUTE format('CREATE EXTENSION IF NOT EXISTS %I', extension_name);
        EXCEPTION WHEN OTHERS THEN
            -- Optionally log the error or do nothing to continue with the next extension
            RAISE NOTICE 'Failed to create extension %: %', extension_name, SQLERRM;
        END;
    END LOOP;
END;
$$;

CREATE EXTENSION "hstore";
-- CALL enable_all_extensions();
24 changes: 24 additions & 0 deletions docker_postgres/rootfs/docker-entrypoint-initdb.d/01-init-db.sh
@@ -0,0 +1,24 @@
#!/bin/bash
set -Eeo pipefail

printenv | sort

ls -alh /usr/share/postgresql/${PG_MAJOR}/extension/*.control

mkdir -pv ${PGDATA}/conf.d
echo "include_dir='./conf.d'" >> ${PGDATA}/postgresql.conf
sudo mv /opt/utils/pg-ext.conf ${PGDATA}/conf.d/

cat ${PGDATA}/conf.d/*
tail ${PGDATA}/postgresql.conf


# The two helpers below come from the official image's entrypoint:
# https://github.com/docker-library/postgres/blob/master/docker-entrypoint.sh
# Restart the temporary init-time server so the new conf.d include takes effect.
docker_temp_server_stop
docker_temp_server_start

psql -v ON_ERROR_STOP=0 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL
CALL enable_all_extensions();
SELECT extname AS name, extversion AS ver FROM pg_extension ORDER BY extname;
EOSQL
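
Once the demo container has finished its first-time initialization, the effect of this script can be checked from the host; a sketch assuming the `db-postgres` container name from the demo compose file:

```shell
# Review the init-time output emitted by this script (extension list, config includes)
docker logs db-postgres | tail -n 50

# List the extensions that enable_all_extensions() managed to install
docker exec -it db-postgres psql -U postgres -d postgres \
  -c "SELECT extname, extversion FROM pg_extension ORDER BY extname;"
```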