Merge pull request #60 from maciejzj/develop
Release changes v0.2.0 to master
maciejzj authored Jun 22, 2023
2 parents a567a08 + 9ddfb43 commit 3f0d68b
Showing 35 changed files with 941 additions and 297 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Python
*.json
# Project-specific
deployment/artifacts

# Byte-compiled / optimized / DLL files
__pycache__/
24 changes: 15 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -100,18 +100,20 @@ processed form (ready to be used by the dashboard later).

```
$ it-jobs-meta -h
usage: it-jobs-meta pipeline [-h] [-c CRON_EXPRESSION] (-r CONFIG_PATH | -b CONFIG_PATH) (-m CONFIG_PATH | -s CONFIG_PATH)
usage: it-jobs-meta pipeline [-h] [-c CRON_EXPRESSION] [-a URL] [-r CONFIG_PATH | -b CONFIG_PATH] (-m CONFIG_PATH | -s CONFIG_PATH)
Run data pipeline once or periodically: scrape data, store it in the data lake, load processed data to the data warehouse.
options:
-h, --help show this help message and exit
-c CRON_EXPRESSION, --schedule CRON_EXPRESSION
schedule pipeline to run periodically with a cron expression
-a URL, --from-archive URL
obtain postings data from an archive (URL must point to JSON in the data lake storage format)
-r CONFIG_PATH, --redis CONFIG_PATH
choose Redis as the data lake with the given config file
-b CONFIG_PATH, --s3-bucket CONFIG_PATH
choose S3 Bucket as the data lake with given config file
choose S3 Bucket as the data lake with the given config file
-m CONFIG_PATH, --mongodb CONFIG_PATH
choose MongoDB as the data warehouse with the given config file
-s CONFIG_PATH, --sql CONFIG_PATH
@@ -123,11 +125,13 @@ data after the data is scraped with the `pipeline` subcommand.

```
$ it-jobs-meta -h
usage: it-jobs-meta dashboard [-h] [-w] -m CONFIG_PATH
usage: it-jobs-meta dashboard [-h] [-w] [-l LABEL] -m CONFIG_PATH
options:
-h, --help show this help message and exit
-w, --with-wsgi run dashboard server with WSGI (in deployment mode)
-l LABEL, --label LABEL
extra label to be displayed in the top navbar
-m CONFIG_PATH, --mongodb CONFIG_PATH
choose MongoDB as the data provider with the given config file
```
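The option layout in the usage strings above (an optional `[-r | -b]` data-lake group and a required `(-m | -s)` warehouse group) maps naturally onto argparse mutually exclusive groups. A minimal sketch of that pattern follows; it is illustrative only, not the project's actual CLI code:

```python
import argparse

# Sketch of the documented option layout; names mirror the help text above,
# but this is an assumed reconstruction, not the project's real parser.
parser = argparse.ArgumentParser(prog="it-jobs-meta pipeline")
parser.add_argument("-c", "--schedule", metavar="CRON_EXPRESSION",
                    help="schedule pipeline to run periodically")
parser.add_argument("-a", "--from-archive", metavar="URL",
                    help="obtain postings data from an archive")

# [-r CONFIG_PATH | -b CONFIG_PATH]: optional, at most one data lake
lake = parser.add_mutually_exclusive_group()
lake.add_argument("-r", "--redis", metavar="CONFIG_PATH")
lake.add_argument("-b", "--s3-bucket", metavar="CONFIG_PATH")

# (-m CONFIG_PATH | -s CONFIG_PATH): exactly one warehouse is required
warehouse = parser.add_mutually_exclusive_group(required=True)
warehouse.add_argument("-m", "--mongodb", metavar="CONFIG_PATH")
warehouse.add_argument("-s", "--sql", metavar="CONFIG_PATH")

args = parser.parse_args(["-b", "s3.yml", "-m", "mongo.yml"])
print(args.s3_bucket, args.mongodb)  # → s3.yml mongo.yml
```

Passing both `-m` and `-s`, or neither, makes argparse exit with a usage error, matching the parenthesized group in the usage line.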
@@ -160,10 +164,12 @@ Install docker, docker-compose, and run `docker-compose up` in the project
directory to set up the services.

The application can be run with `python -m it_jobs_meta`. Since running the data
pipeline is going to download data from the web, it is not recommended to
run it as a whole during the development. Some modules include demo versions of
parts of the application, resort to using them and unit tests during the
development process.
pipeline is going to download data from the web, it is not recommended to run it
as a whole during the development. The run-from-archive option can be used with
the supplied data sample in the test directory
(`./it_jobs_meta/data_pipeline/test/1640874783_nofluffjobs.json`) to run the
pipeline offline. Some modules include demo versions of parts of the
application; prefer using them, along with the unit tests, during development.
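The supplied sample's filename appears to follow a `<unix_timestamp>_<source>.json` pattern. A small stdlib sketch (an assumption about the naming convention, not project code) to decode it:

```python
from datetime import datetime, timezone
from pathlib import Path

def describe_snapshot(path: str) -> tuple[str, datetime]:
    """Split a data-lake snapshot filename into source name and scrape time.

    Assumes the '<unix_timestamp>_<source>.json' naming seen in the test data.
    """
    stem = Path(path).stem  # e.g. "1640874783_nofluffjobs"
    timestamp, source = stem.split("_", 1)
    scraped_at = datetime.fromtimestamp(int(timestamp), tz=timezone.utc)
    return source, scraped_at

source, scraped_at = describe_snapshot(
    "it_jobs_meta/data_pipeline/test/1640874783_nofluffjobs.json"
)
print(source, scraped_at.date())  # nofluffjobs 2021-12-30
```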

### Development tools

Expand All @@ -182,8 +188,8 @@ Tools configuration is stored in the `pyproject.toml` file.

The application is not bound to any specific deployment environment; however,
AWS is used for running the main instance. The setup for creating AWS
infrastructure for the application using Terraform is placed int the
`deployment` directory.
infrastructure for the application using Terraform and Ansible is placed in the
`deployment` directory.

## License

71 changes: 59 additions & 12 deletions deployment/README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
# IT Jobs Meta deployment

## Prerequisites

Terraform and Ansible are required to run the deployment process.

## Infrastructure

The application is deployed through AWS. The infrastructure state is stored in a
separate S3 bucket. To setup Terraform with S3 Bucket backend run:
```sh
terraform init -backend-config="access_key=<your access key>" -backend-config="secret_key=<your secret key>"
separate S3 bucket.

To gain permission to deploy the app to AWS, use access keys. These can be
provided via environment variables:

```
export AWS_ACCESS_KEY_ID=<AWS_ACCESS_KEY_ID>
export AWS_SECRET_ACCESS_KEY=<AWS_SECRET_ACCESS_KEY>
```
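A quick pre-flight check that both credentials are actually exported before invoking Terraform can be sketched in stdlib Python (a convenience idea, not part of the repository):

```python
import os

# The two variables the deployment expects, as exported above.
REQUIRED_VARS = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")

def missing_aws_credentials(env=os.environ) -> list[str]:
    """Return the names of required AWS variables that are unset or empty."""
    return [name for name in REQUIRED_VARS if not env.get(name)]

# Example with only one key set (a dict stands in for the real environment):
missing = missing_aws_credentials({"AWS_ACCESS_KEY_ID": "AKIA..."})
print(missing)  # ['AWS_SECRET_ACCESS_KEY']
```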

> ❗️ **Warning:** When running the command above place space before typing it to
@@ -16,14 +25,52 @@ terraform init -backend-config="access_key=<your access key>" -backend-config="s
> is fine for a small project, but beware of manipulating infrastructure
> resources from multiple devices at once.
## Deploy to instance
> ⚠️ **Important:** Don't keep the keys in the open. It is most secure to destroy
> them after successful deployment.
Terraform workspaces are used for development setups. An arbitrary workspace
name can be used to set up a new environment. Infrastructure resources will be
named and tagged according to the workspace name, and the service will be
deployed to a `<workspace_name>.itjobsmeta.net` domain (e.g. the `dev` workspace
will be deployed under `dev.itjobsmeta.net`). The main (`default`) workspace is
the deployment one; it will be exposed at `itjobsmeta.net`.

Run `terraform apply` to set up the infrastructure.

## Setup DNS

DNS has to be set up for the next stage of the deployment to work. The DNS setup
is not automated. You have to create an `A` record for the domain (both
`www.itjobsmeta.net` and `itjobsmeta.net`). The IP address of the EC2 instance
is provided by the Terraform output (`terraform output`). If you use a
non-default workspace, use the appropriate subdomain (e.g. for `dev` it should
be `www.dev.itjobsmeta.net` and `dev.itjobsmeta.net`).

## Deploy to an instance

Application deployment to EC2 is set up via Ansible. Terraform automatically
sets up the Ansible inventory for the deployment.

The itjobsmeta applications run as systemd services. The app version (git commit
or tag) and launch parameters are specified in the `it_jobs_meta_vars.yml` file.
The CLI command parameters specified in the file are appended to the main
commands. Read the main README or use the `--help` option to see the available
options (e.g. enable/disable pipeline scheduling, archive mode, data lake
archival, etc.).

Run `ansible-playbook playbook.yml` to deploy the app to the instance. The
deployed app should become available under the specified domain shortly
afterwards.

> 📝 **Notice:** The deployment playbook sets up swap for EC2. When only the
> dashboard server is running, the EC2 T2 Micro instance has sufficient memory.
> However, when it runs the dashboard and the pipeline simultaneously, RAM
> becomes scarce. Swap helps to mitigate this, but it uses EBS disk space, which
> can generate costs. If the pipeline is run sparsely and swappiness is low, the
> EBS costs should be negligible.
After the infrastructure is set up download the source code (either via git or
by obtaining a release download). Then run: `./deployment/deploy`.
## Manual access

> 📝 **Notice:** The `deploy` script sets up cache for EC2. When the dashboard
> server is running the EC2 T2 Micro instance has sufficient memory limit.
> However, when it is to run the dashboard and the pipeline simultaneously there
> is not enough RAM. Cache helps to mitigate it, however it uses EBS disk space
> which can generate costs. If the pipeline is run sparsely and swappiness is
> low, the EBS costs should be negligible.
The EC2 instance can be directly accessed via ssh using the generated `.pem`
file in the artifacts directory. Run:
`ssh -i artifacts/it-jobs-meta-ec2-server.pem ubuntu@<EC2_INSTANCE_IP>`
to log into the server (use the IP address from Terraform output).
2 changes: 2 additions & 0 deletions deployment/ansible.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[defaults]
inventory = artifacts/hosts
54 changes: 0 additions & 54 deletions deployment/deploy

This file was deleted.

9 changes: 9 additions & 0 deletions deployment/it_jobs_meta_vars.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
it_jobs_meta_version: b962af2dc5a16548dec14f1a8c6e3b1803090505
# Add the following for weekly updates:
# '--schedule "0 0 * * 1"'
# Add the following to run from an archived snapshot:
# '--from-archive "https://github.com/maciejzj/it-jobs-meta/raw/master/it_jobs_meta/data_pipeline/test/1640874783_nofluffjobs.json"'
# Add the following to store data in the s3 bucket data lake:
# '--s3-bucket /home/{{ ansible_user }}/it-jobs-meta/s3_bucket_config.yml'
pipeline_command_params: '--from-archive "https://github.com/maciejzj/it-jobs-meta/raw/master/it_jobs_meta/data_pipeline/test/1640874783_nofluffjobs.json" -m /home/{{ ansible_user }}/it-jobs-meta/mongodb_config.yml'
dashboard_command_params: '--with-wsgi --mongodb /home/{{ ansible_user }}/it-jobs-meta/mongodb_config.yml --label ARCHIVE'
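The commented-out schedule above uses the cron expression `0 0 * * 1`, i.e. Mondays at midnight. The next trigger time for that particular expression can be sanity-checked with a stdlib sketch (illustrative only; the app presumably relies on a proper cron parser internally):

```python
from datetime import datetime, timedelta

def next_weekly_monday_run(after: datetime) -> datetime:
    """Next trigger of the cron expression '0 0 * * 1' strictly after `after`.

    Hard-coded to that one expression: minute 0, hour 0, weekday Monday.
    """
    candidate = (after + timedelta(days=1)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    while candidate.weekday() != 0:  # Monday is 0 in datetime.weekday()
        candidate += timedelta(days=1)
    return candidate

print(next_weekly_monday_run(datetime(2023, 6, 22, 12, 30)))  # 2023-06-26 00:00:00
```

Note that cron numbers weekdays with Monday as 1, while Python's `datetime.weekday()` uses Monday as 0, hence the comparison against 0 above.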
139 changes: 24 additions & 115 deletions deployment/main.tf
Original file line number Diff line number Diff line change
@@ -1,135 +1,44 @@
provider "aws" {
region = "eu-central-1"
access_key = var.aws_access_key
secret_key = var.aws_secret_key
region = "eu-central-1"
}

resource "aws_s3_bucket" "data_lake_bucket" {
bucket = "it-jobs-meta-data-lake"
resource "aws_s3_bucket" "data_lake_storage" {
bucket = "it-jobs-meta-data-lake-${terraform.workspace}"
force_destroy = true
}

resource "aws_s3_bucket_public_access_block" "data_lake_bucket_access" {
bucket = aws_s3_bucket.data_lake_bucket.id

# Keep the bucket private
block_public_acls = true
block_public_policy = true
ignore_public_acls = true
}

resource "aws_default_vpc" "default_vpc" {
tags = {
Name = "Default AWS VPC"
Name = "S3 bucket for it-jobs-meta data lake"
Project = "${var.project_name_tag}"
Environment = "${terraform.workspace}"
}
}

resource "aws_security_group" "allow_web" {
name = "allow_web_traffic"
description = "Allow web trafic for hosting a server"
vpc_id = aws_default_vpc.default_vpc.id
data "aws_ami" "ubuntu" {
most_recent = true

ingress {
description = "HTTPS"
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
ingress {
description = "HTTP"
from_port = 8080
to_port = 8080
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
ingress {
description = "HTTP"
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
filter {
name = "name"
values = ["ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-*"]
}
ingress {
description = "SSH"
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}

egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}

resource "aws_iam_policy" "allow_s3_bucket_access" {
name = "allow-s3-bucket-access"
path = "/"
description = "Allow "

policy = jsonencode({
"Version" : "2012-10-17",
"Statement" : [
{
"Sid" : "VisualEditor0",
"Effect" : "Allow",
"Action" : [
"s3:PutObject",
"s3:GetObject",
"s3:ListBucket",
"s3:DeleteObject"
],
"Resource" : [
"arn:aws:s3:::*/*",
aws_s3_bucket.data_lake_bucket.arn
]
}
]
})
# Canonical
owners = ["099720109477"]
}

resource "aws_iam_role" "iam_role_for_ec2" {
name = "iam_role_for_ec2_it_jobs_meta_server"

assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "ec2.amazonaws.com"
}
},
]
})
}

resource "aws_iam_instance_profile" "iam_profile_for_ec2" {
name = "some-profile"
role = aws_iam_role.iam_role_for_ec2.name
}
resource "aws_instance" "it_jobs_meta_server" {
ami = data.aws_ami.ubuntu.id
instance_type = "t2.micro"

subnet_id = aws_subnet.it_jobs_meta_public.id
vpc_security_group_ids = [aws_security_group.allow_web.id]
associate_public_ip_address = true

resource "aws_iam_role_policy_attachment" "data_lake_bucket_policy_attach" {
role = aws_iam_role.iam_role_for_ec2.name
policy_arn = aws_iam_policy.allow_s3_bucket_access.arn
}

resource "aws_instance" "it_jobs_meta_server" {
# eu-central-1, Ubuntu 20.04 LTS, amd64
ami = "ami-0498a49a15494604f"
instance_type = "t2.micro"
vpc_security_group_ids = [aws_security_group.allow_web.id]
key_name = aws_key_pair.it_jobs_meta_ec2_server.key_name
iam_instance_profile = aws_iam_instance_profile.iam_profile_for_ec2.id

tags = {
Name = "It Jobs Meta server"
Name = "EC2 instance for it-jobs-meta server"
Project = "${var.project_name_tag}"
Environment = "${terraform.workspace}"
}

iam_instance_profile = aws_iam_instance_profile.iam_profile_for_ec2.id
}
