diff --git a/.env.example b/.env.example index 928a277..f8b30f7 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,7 @@ MARBLE_APP_HOST=localhost:3000 GOOGLE_CLOUD_PROJECT=test-project # Required for authentication if you are running with a real firebase app and cloud storage bucket, outside of a GCP environment +# Also required if you are running in a GCP environment and want to use GCS as a document storage backend GOOGLE_APPLICATION_CREDENTIALS= SENTRY_DSN= @@ -56,17 +57,21 @@ PG_HOSTNAME=db PG_PORT=54321 PG_PASSWORD=marble -# configure the document storage backend with optional fake backends -GCS_INGESTION_BUCKET="data-ingestion-bucket" -GCS_CASE_MANAGER_BUCKET="case-manager-bucket" -FAKE_GCS=true - -# Configure the AWS S3 backend for sending decision files -FAKE_AWS_S3=true -# The 3 below are required if FAKE_AWS_S3 is false -AWS_REGION=eu-west-3 -AWS_ACCESS_KEY= -AWS_SECRET_KEY= +# Configure the document storage backend +# The options are +# - "file://{path}?create_dir=true" +# - for LOCAL TEST only, not fully compatible with all features (create_dir is not required if the folder exists) +# - see https://pkg.go.dev/gocloud.dev/blob/fileblob#URLOpener for details on the format +# - "gs://{bucket_name}" +# - "s3://{bucket_name}" +# - "azblob://{bucket_name}" +# See https://gocloud.dev/howto/blob/ for details. +# Using a Google Cloud Storage (gs://) bucket backend additionally requires that a service account key be present in the file pointed +# to by GOOGLE_APPLICATION_CREDENTIALS. This is because with GCS, a private key from a service account key must be present to return signed URLs. +# In all cases, credentials discovery is done automatically according to each provider's usual practices (env variables, +# credentials file, metadata server...) 
+INGESTION_BUCKET_URL="file:///tempFiles/data-ingestion-bucket?create_dir=true" +CASE_MANAGER_BUCKET_URL="file:///tempFiles/case-manager-bucket?create_dir=true" # Othe dependency configurations SEGMENT_WRITE_KEY_BACKEND=UgkImFmHmBZAWh5fxIKBY3QtvlcBrhqQ diff --git a/deployment.md b/deployment.md index 8f4bdfd..bf5aeda 100644 --- a/deployment.md +++ b/deployment.md @@ -1,16 +1,20 @@ # Deploying Marble + ## Marble architecture #### Components + At it's simplest, Marble's software can be described as below: Software architecture The Marble software is composed of 2 docker images and 3 parts: + 1. **[Docker 1]** a back-end API server (go + [gin](https://github.com/gin-gonic/gin)), that serves the internal and public REST APIs 2. **[Docker 1]** a set of cron job scripts (go, run from the same docker image with a different entrypoint as the back-end API server), meant to run periodically 3. **[Docker 2]** a front-end API server (typescript + [Remix](https://remix.run/)), that serves html and exposes actions to the browser It relies on the existence of a basic set of infrastructure to work: + 1. A Postgresql database 2. A scheduler for the cron jobs 3. A set of object storage buckets, to store documents uploaded in the case manager and csv files for batch data ingestion (currently, only Google Cloud Storage: compatibility for ASW S3 and Azure Blob Storage planned soon) @@ -18,19 +22,24 @@ It relies on the existence of a basic set of infrastructure to work: 5. A configured Firebase app for end user authentication #### Marble images + The docker images for Marble are stored on the registry `europe-west1-docker.pkg.dev/marble-infra/marble/marble-backend` and `europe-west1-docker.pkg.dev/marble-infra/marble/marble-frontend` for the API respectively. -#### Marble cloud +#### Marble cloud + For reference, below is a schematic of Marble's cloud offering architecture. 
It is essentially a variation of the architecture described above, with some infrastructure choices that work well for our cloud offering specifically. Software architecture (2) ## Deploy Marble serverless + Here at Marble, we choose to deploy our code on a serverless infrastructure. We do this to leverage the full ease of use, flexibility and scalability of GCP's Cloud Run offering. -Doing this, we choose to run the back-end API server and the cron jobs (which are run from the same docker image) separately: +Doing this, we choose to run the back-end API server and the cron jobs (which are run with different flags from the same docker image) separately: + - the API is a Cloud Run service - the scripts are run as distinct Cloud Run Jobs, scheduled by a Cloud Scheduler -_Technically_ we could have the cron jobs scheduled within our go code image and run from within the API service, but we avoid this for the following reasons: +_Technically_ we could have the cron jobs scheduled within the same container that is running the API service, but we avoid this for the following reasons: + - we don’t want a batch job to be stopped/to fail because an api instance is torn down - we don’t want the api to be impacted by a batch job’s cpu/memory usage - the cloud run API has limits in how long it can take at most to handle requests (which are shorter than the typical cron job execution time) @@ -40,16 +49,18 @@ However, running it all together could make sense if Marble is run in a VM, more Moreover, in our cloud deployment, we use Google Secret Manager (integrated with Cloud Run) to inject secrets as environment variables into the containers, and a GCP application load balancer for load balancing, TLS termination, DDoS protection and other security rules. In this repository, we provide an example set of terraform files that you can adapt to deploy Marble serverless in your own GCP environment. 
-It should also be reasonably simple to run an equivalent deployment on AWS using Fargate and AWS Scheduler, or Azure Container Instances + scheduler on Azure. +We have also received community contributions giving an example set of terraform files for deploying Marble in managed containers on AWS. +It should also be reasonably simple to run an equivalent deployment on Azure using Azure Container Instances + a scheduler. ## Deploy Marble on a VM -While Marble on a VM is not ideal for our cloud offering, it may make sense for an open-source or on-premise usecase. If you do so, it could make sense to run the back-end API and built-in go scheduler together by passing the `--server --cron` arguments to the docker container. + +While Marble on a VM is not ideal for our cloud offering, it may make sense for an open-source or on-premise usecase. If you do so, you can run the back-end API and built-in go scheduler together by passing the `--server --cron` arguments to the docker container. ## Deploy Marble on Kubernetes -While we do not provide support for deploying Marble on Kubernetes, it should work very similarly to a serverless deployment. You can schedule the cron jobs by using Kubernetes' built-in scheduling tool. + +While we do not currently provide support for deploying Marble on Kubernetes, it should work very similarly to a serverless deployment. You can schedule the cron jobs by using Kubernetes' built-in scheduling tool. # Deploy Marble outside of GCP -Currently (March 2024), Marble still has some requirements on GCP infrastructure: -- Cloud Storage for file storing -- Firebase authentication for authentication (nb: in practice, any usage of Marble should fall under the Firebase auth free plan) -We plan to propose alternatives soon, starting by proposing S3/Azure blob options for file storing. + +Marble relies on Firebase Authentication (a part of Google's web app deployment toolbox) for authentication. 
This means that, even if you are running Marble in another cloud provider's cloud, or in your own infrastructure, you need to create a Firebase app, as well as a service account key to access it from the backend container. +In practice, any usage of Marble should fall under the Firebase Auth free plan, though you may still need to provide a credit card number. diff --git a/docker-compose.yaml b/docker-compose.yaml index 4f65da2..e9270d3 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -47,8 +47,8 @@ services: PG_USER: postgres PG_PASSWORD: ${PG_PASSWORD:-} AUTHENTICATION_JWT_SIGNING_KEY: ${AUTHENTICATION_JWT_SIGNING_KEY:-} - GCS_INGESTION_BUCKET: ${GCS_INGESTION_BUCKET:-} - GCS_CASE_MANAGER_BUCKET: ${GCS_CASE_MANAGER_BUCKET:-} + INGESTION_BUCKET_URL: ${INGESTION_BUCKET_URL:-} + CASE_MANAGER_BUCKET_URL: ${CASE_MANAGER_BUCKET_URL:-} FIREBASE_AUTH_EMULATOR_HOST: ${FIREBASE_AUTH_EMULATOR_HOST_SERVER:-} MARBLE_APP_HOST: ${MARBLE_APP_HOST:-localhost:${HOST_APP_PORT:-3000}} # default value of GOOGLE_CLOUD_PROJECT must be kept if working with the emulator (and the emulator is running in the docker image below) @@ -59,7 +59,6 @@ services: CREATE_GLOBAL_ADMIN_EMAIL: ${CREATE_GLOBAL_ADMIN_EMAIL:-} CREATE_ORG_NAME: ${CREATE_ORG_NAME:-} CREATE_ORG_ADMIN_EMAIL: ${CREATE_ORG_ADMIN_EMAIL:-} - FAKE_GCS: ${FAKE_GCS:-} SEGMENT_WRITE_KEY: ${SEGMENT_WRITE_KEY_BACKEND:-UgkImFmHmBZAWh5fxIKBY3QtvlcBrhqQ} METABASE_SITE_URL: ${METABASE_SITE_URL:-} METABASE_JWT_SIGNING_KEY: ${METABASE_JWT_SIGNING_KEY:-} @@ -80,14 +79,9 @@ services: PG_PORT: ${PG_PORT:-} PG_USER: postgres PG_PASSWORD: ${PG_PASSWORD:-} - GCS_INGESTION_BUCKET: ${GCS_INGESTION_BUCKET:-} + INGESTION_BUCKET_URL: ${INGESTION_BUCKET_URL:-} REQUEST_LOGGING_LEVEL: ${REQUEST_LOGGING_LEVEL:-} LOGGING_FORMAT: ${LOGGING_FORMAT:-} - AWS_REGION: ${AWS_REGION:-} - AWS_ACCESS_KEY: ${AWS_ACCESS_KEY:-} - AWS_SECRET_KEY: ${AWS_SECRET_KEY:-} - FAKE_AWS_S3: ${FAKE_AWS_S3:-} - FAKE_GCS: ${FAKE_GCS:-} app: container_name: marble-app 
image: europe-west1-docker.pkg.dev/marble-infra/marble/marble-frontend:v0.1.22 diff --git a/installation.md b/installation.md index cf94503..a75da24 100644 --- a/installation.md +++ b/installation.md @@ -21,22 +21,21 @@ The first time you run the code, you should enter an organization name and organ **In a production setup:** -- set the `FIREBASE_AUTH_EMULATOR_HOST_SERVER` and `FIREBASE_AUTH_EMULATOR_HOST_CLIENT` env variables to empty strings in your .env file +- set the `FIREBASE_AUTH_EMULATOR_HOST_SERVER` and `FIREBASE_AUTH_EMULATOR_HOST_CLIENT` env variables to empty strings in your .env file, or remove them. - create a Firebase project and a Firebase app, and set the relevant env variables (`FIREBASE_API_KEY` to `FIREBASE_APP_ID` as well as `GOOGLE_CLOUD_PROJECT`) in your .env file -- if you plan to use the batch ingestion feature or the case manager with file storign feature, make sure you create the Google Cloud Storage buckets, set the corresponding env variables and run your code in a setup that will allow default application credentials detection +- create a service account json key in the GCP interface, mount it in the docker container and set the path to the key in the `GOOGLE_APPLICATION_CREDENTIALS` env variable (see below) +- if you plan to use the batch ingestion feature or the case manager with file storing feature, you need to create a pair of storage buckets (GCP cloud storage, AWS S3, Azure storage account), pass the url to the bucket in the `INGESTION_BUCKET_URL` and `CASE_MANAGER_BUCKET_URL` env variables (see details in the .env.example - you can use the same bucket for both), and make sure that any information necessary for the client to automatically authenticate is present (using env variables, credential files, or automatic credentials discovery from a metadata server) - create a Firebase user with the email you provided in the `CREATE_ORG_ADMIN_EMAIL` environment variable (you can do this on the Marble login page by using the SSO button 
or sign up with email) - if you have a license key, set it in the `LICENSE_KEY` env variable in your .env file -**Firebase authentication:** +**GCP & Firebase authentication:** -In a production setup, you need to authenticate to GCP to use Firebase and Cloud Storage. If you are not running the container directly in a GCP environment, here is how you could do this: +In a production setup, you need to authenticate to GCP to use Firebase (as well as Cloud Storage if you are using it as a storage bucket). If you are not running the container directly in a GCP environment, here is how you could do this: -- create a volume attached to the marble-api container (see the ) +- create a volume attached to the marble-api container - place the json service account key for GCP in the local shared folder (or otherwise inject it into the docker container, depending on how you run Marble) - set the `GOOGLE_APPLICATION_CREDENTIALS` variable equal to the path to the service account key -Open the Marble console by visiting `http://localhost:3000`, and interact with the Marble API at `http://localhost:8080` (assuming you use the default ports). Change those values accordingly if you configured a different port or if you are calling a specific host. - #### **How to upgrade your Marble version** You upgrade your Marble version by checking out the release of your choice, and simply running it. By running the docker image with the options `"--server", "--migrations"`, you execute the database migrations of the version and then start the server again. diff --git a/roadmap.md b/roadmap.md index f945d48..cfa04fc 100644 --- a/roadmap.md +++ b/roadmap.md @@ -1,7 +1,9 @@ ### :running: **Roadmap** --- + **Marble core features** + - On premise self installation - Create detection scenarios based on rules. - Run those scenarios in batch or real-time to generate decisions. 
@@ -33,14 +35,16 @@ - Supervised Machine Learning: use previous results to create a custom ML detection model - Advanced connectors for unstructured data: use documents, GPS points, images… in your rules.

---- + +--- + **High level roadmap** Opensource - self installation - [x] Docker for GCP - [x] Docker for AWS -- [ ] Docker for Azure +- [x] Docker for Azure **Rule Engine :** @@ -113,4 +117,3 @@ Opensource - self installation - [ ] Scenario A/B testing - [ ] Backtest
- diff --git a/terraform_templates/AWS/esc_tasks_definition.tf b/terraform_templates/AWS/esc_tasks_definition.tf index 4d750bf..7f0b4f7 100644 --- a/terraform_templates/AWS/esc_tasks_definition.tf +++ b/terraform_templates/AWS/esc_tasks_definition.tf @@ -32,7 +32,7 @@ resource "aws_ecs_task_definition" "app" { { name = "SESSION_SECRET", value = local.environment.session.secret }, { name = "SESSION_MAX_AGE", value = local.environment.session.max_age }, { name = "LICENSE_KEY", value = local.environment.licence_key }, - { name = "SENTRY_ENVIRONMENT", value = local.environment.sentry.frontend.env}, + { name = "SENTRY_ENVIRONMENT", value = local.environment.sentry.frontend.env }, { name = "SENTRY_DSN", value = local.environment.sentry.frontend.dsn }, ] @@ -70,10 +70,10 @@ resource "aws_ecs_task_definition" "app" { { name = "SESSION_SECRET", value = local.environment.session.secret }, { name = "SESSION_MAX_AGE", value = local.environment.session.max_age }, { name = "LICENSE_KEY", value = local.environment.licence_key }, - { name = "SENTRY_ENVIRONMENT", value = local.environment.sentry.backend.env}, + { name = "SENTRY_ENVIRONMENT", value = local.environment.sentry.backend.env }, { name = "SENTRY_DSN", value = local.environment.sentry.backend.dsn }, { name = "SEGMENT_WRITE_KEY", value = local.environment.segment_write_key.backend }, - { name = "AUTHENTICATION_JWT_SIGNING_KEY", value = "${file("config/private.key")}"} + { name = "AUTHENTICATION_JWT_SIGNING_KEY", value = "${file("config/private.key")}" } ] logConfiguration = { @@ -91,12 +91,12 @@ resource "aws_ecs_task_definition" "app" { depends_on = [aws_db_instance.rds-marble] - }, - { - name = "cron", - image = local.environment.backend.image, - essential = true, - + }, + { + name = "cron", + image = local.environment.backend.image, + essential = true, + entryPoint : ["./app", "--cron-scheduler"], environment = [ @@ -106,13 +106,12 @@ resource "aws_ecs_task_definition" "app" { { name = "PG_PORT", value = 
"${element(split(":", aws_db_instance.rds-marble.endpoint), 1)}" }, { name = "PG_USER", value = "postgres" }, { name = "PG_PASSWORD", value = "${random_string.rds-db-password.result}" }, - { name = "GCS_INGESTION_BUCKET", value = "data-ingestion-bucket" }, // Not Use for AWS ?? - { name = "AWS_REGION", value = var.aws_region}, + # { name = "INGESTION_BUCKET_URL", value = "s3://data-ingestion-bucket" }, + { name = "AWS_REGION", value = var.aws_region }, { name = "AWS_ACCESS_KEY", value = var.aws_access_key_id }, { name = "AWS_SECRET_KEY", value = var.aws_secret_access_key }, - { name = "FAKE_AWS_S3", value = local.environment.cron.s3 }, { name = "LICENSE_KEY", value = local.environment.licence_key }, - { name = "SENTRY_ENVIRONMENT", value = local.environment.sentry.backend.env}, + { name = "SENTRY_ENVIRONMENT", value = local.environment.sentry.backend.env }, { name = "SENTRY_DSN", value = local.environment.sentry.backend.dsn }, ] diff --git a/terraform_templates/AWS/rds.tf b/terraform_templates/AWS/rds.tf index e637c68..e3cb7a1 100644 --- a/terraform_templates/AWS/rds.tf +++ b/terraform_templates/AWS/rds.tf @@ -25,22 +25,22 @@ resource "aws_iam_role" "rds_marble_monitoring_role" { name = "rds-marble-monitoring-role" assume_role_policy = jsonencode({ - Version = "2012-10-17", - Statement = [ + Version = "2012-10-17", + Statement = [ { Action = "sts:AssumeRole", Effect = "Allow", Principal = { - Service = "monitoring.rds.amazonaws.com" + Service = "monitoring.rds.amazonaws.com" + } } - } - ] -}) + ] + }) } resource "aws_iam_policy_attachment" "rds_monitoring_attachment" { - name = "rds-monitoring-attachment" - roles = [aws_iam_role.rds_marble_monitoring_role.name] + name = "rds-monitoring-attachment" + roles = [aws_iam_role.rds_marble_monitoring_role.name] policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonRDSEnhancedMonitoringRole" } @@ -48,16 +48,16 @@ resource "aws_security_group" "rds" { vpc_id = aws_vpc.main.id name = "rds-sg" description = "Allow inbound for 
Postgres from EC2 SG" -ingress { - from_port = 5432 - to_port = 5432 - protocol = "tcp" - security_groups = [ aws_security_group.ecs_node_sg.id ] + ingress { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + security_groups = [aws_security_group.ecs_node_sg.id] } } resource "aws_db_subnet_group" "marble_rds_subnet_group" { - name = "marble_rds_subnet_group" + name = "marble_rds_subnet_group" subnet_ids = aws_subnet.public[*].id tags = { @@ -66,26 +66,25 @@ resource "aws_db_subnet_group" "marble_rds_subnet_group" { } resource "aws_db_instance" "rds-marble" { - - identifier = "rds-marble-${terraform.workspace}" - name = "marble" - instance_class = "db.t4g.small" - allocated_storage = 10 - engine = "postgres" - engine_version = "15" - publicly_accessible = true + + identifier = "rds-marble-${terraform.workspace}" + instance_class = "db.t4g.small" + allocated_storage = 10 + engine = "postgres" + engine_version = "15" + publicly_accessible = true allow_major_version_upgrade = true - + vpc_security_group_ids = [aws_security_group.rds.id] - db_subnet_group_name = aws_db_subnet_group.marble_rds_subnet_group.name - parameter_group_name = aws_db_parameter_group.pg-marble.name + db_subnet_group_name = aws_db_subnet_group.marble_rds_subnet_group.name + parameter_group_name = aws_db_parameter_group.pg-marble.name - username = "postgres" - password = random_string.rds-db-password.result + username = "postgres" + password = random_string.rds-db-password.result - skip_final_snapshot = false + skip_final_snapshot = false final_snapshot_identifier = "db-marble-snap" - + # Backup retention period (in days) backup_retention_period = 7 @@ -116,4 +115,4 @@ output "rds_username" { description = "RDS instance root username" value = aws_db_instance.rds-marble.username sensitive = true -} \ No newline at end of file +} diff --git a/terraform_templates/GCP/cloud_run_backend_service.tf b/terraform_templates/GCP/cloud_run_backend_service.tf index 10eb4f1..c72e202 100644 --- 
a/terraform_templates/GCP/cloud_run_backend_service.tf +++ b/terraform_templates/GCP/cloud_run_backend_service.tf @@ -61,13 +61,13 @@ resource "google_cloud_run_v2_service" "backend" { } env { - name = "GCS_INGESTION_BUCKET" - value = google_storage_bucket.data_ingestion.name + name = "INGESTION_BUCKET_URL" + value = "gs://${google_storage_bucket.data_ingestion.name}" } env { - name = "GCS_CASE_MANAGER_BUCKET" - value = google_storage_bucket.case_manager.name + name = "CASE_MANAGER_BUCKET_URL" + value = "gs://${google_storage_bucket.case_manager.name}" } env {