update
mikeizbicki committed Apr 4, 2024
1 parent f7e97ef commit 8cf0cbb
Showing 11 changed files with 66 additions and 39 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests_denormalized.yml
@@ -1,4 +1,4 @@
name: tests_denormalized
name: tests_denormalized_sequential
on:
push:
branches: ['*']
@@ -20,4 +20,4 @@ jobs:
docker ps -a
sleep 20
sh load_tweets_sequential.sh
docker-compose exec -T pg_denormalized ./check_answers.sh
docker-compose exec -T pg_denormalized ./run_tests.sh
2 changes: 1 addition & 1 deletion .github/workflows/tests_denormalized_parallel.yml
@@ -20,5 +20,5 @@ jobs:
docker ps -a
sleep 20
sh load_tweets_parallel.sh
docker-compose exec -T pg_denormalized ./check_answers.sh
docker-compose exec -T pg_denormalized ./run_tests.sh
4 changes: 2 additions & 2 deletions .github/workflows/tests_normalized.yml
@@ -1,4 +1,4 @@
name: tests_normalized
name: tests_normalized_sequential
on:
push:
branches: ['*']
@@ -20,4 +20,4 @@ jobs:
docker ps -a
sleep 20
sh load_tweets_sequential.sh
docker-compose exec -T pg_normalized ./check_answers.sh
docker-compose exec -T pg_normalized ./run_tests.sh
4 changes: 2 additions & 2 deletions .github/workflows/tests_normalized_batch.yml
@@ -1,4 +1,4 @@
name: tests_normalized_batch
name: tests_normalizedbatch_sequential
on:
push:
branches: ['*']
@@ -20,5 +20,5 @@ jobs:
docker ps -a
sleep 20
sh load_tweets_sequential.sh
docker-compose exec -T pg_normalized_batch ./check_answers.sh
docker-compose exec -T pg_normalized_batch ./run_tests.sh
4 changes: 2 additions & 2 deletions .github/workflows/tests_normalized_batch_parallel.yml
@@ -1,4 +1,4 @@
name: tests_normalized_batch_parallel
name: tests_normalizedbatch_parallel
on:
push:
branches: ['*']
@@ -20,6 +20,6 @@ jobs:
docker ps -a
sleep 20
sh load_tweets_parallel.sh
docker-compose exec -T pg_normalized_batch ./check_answers.sh
docker-compose exec -T pg_normalized_batch ./run_tests.sh
2 changes: 1 addition & 1 deletion .github/workflows/tests_normalized_parallel.yml
@@ -20,5 +20,5 @@ jobs:
docker ps -a
sleep 20
sh load_tweets_parallel.sh
docker-compose exec -T pg_normalized ./check_answers.sh
docker-compose exec -T pg_normalized ./run_tests.sh
58 changes: 37 additions & 21 deletions README.md
@@ -1,27 +1,25 @@
# Parallel Twitter in Postgres

![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalized/badge.svg)

![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalized_parallel/badge.svg)
> **WARNING:**
> It is an academic integrity violation to begin this assignment before submitting the previous assignment.
> (This assignment contains a solution to the previous assignment.)
![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalized_batch/badge.svg)

![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalized_batch_parallel/badge.svg)

![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_denormalized/badge.svg)
# Parallel Twitter in Postgres

![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_denormalized_parallel/badge.svg)
| | sequential | parallel |
| --- | ---------- | -------- |
| normalized (unbatched) | ![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalized_sequential/badge.svg) | ![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalized_parallel/badge.svg) |
| normalized (batched) | ![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalizedbatch_sequential/badge.svg) | ![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_normalizedbatch_parallel/badge.svg) |
| denormalized | ![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_denormalized_sequential/badge.svg) | ![](https://github.com/mikeizbicki/twitter_postgres_parallel/workflows/tests_denormalized_parallel/badge.svg) |

In this assignment, you will make your data loading into postgres significantly faster using batch loading and parallel loading.
Notice that many of the test cases above are already passing;
you will have to ensure that they remain passing as you complete the tasks below.
In this assignment, you will learn how to load data into postgres much faster using two techniques:
1. batch loading (i.e. running the `INSERT` command on more than one row at a time), and
1. parallel loading (i.e. loading several files into the database at the same time).
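
For intuition, here is a minimal sketch of the difference between the two `INSERT` styles.
This is not the code from `load_tweets.py` or `load_tweets_batch.py`;
the table name, the rows, the connection string, and the use of the `psycopg2` driver are all hypothetical.
```
import psycopg2
import psycopg2.extras

# hypothetical rows destined for a hypothetical two column table
rows = [(1, 'hello'), (2, 'world'), (3, 'goodbye')]

connection = psycopg2.connect('postgresql://postgres:pass@localhost:5432/')
with connection:
    with connection.cursor() as cursor:
        # unbatched: one INSERT statement (and one round trip) per row
        for row in rows:
            cursor.execute('INSERT INTO example (id, message) VALUES (%s, %s)', row)

        # batched: a single INSERT statement that covers every row at once
        psycopg2.extras.execute_values(
            cursor,
            'INSERT INTO example (id, message) VALUES %s',
            rows,
        )
```
The batched version sends far fewer statements to postgres,
which is why the batched loader is faster than the unbatched one.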

## Tasks

### Setup

1. Fork this repo
1. Enable github action on your fork
1. Enable github actions on your fork
1. Clone the fork onto the lambda server
1. Modify the `README.md` file so that all the test case images point to your repo
1. Modify the `docker-compose.yml` to specify valid ports for each of the postgres services
@@ -32,13 +30,29 @@ you will have to ensure that they remain passing as you complete the tasks below
```
with no errors
### The Data
In this project, you will be using more data than in the last homework, but still only a small subset of the full twitter dataset.
The data is located in the `data` folder.
Familiarize yourself with the data by running the commands
```
$ ls data
$ du -h data
$ for file in data/*; do echo "$file" $(unzip -p "$file" | wc -l); done
```
### Sequential Data Loading
Notice that the test cases above for the sequential data loading are already passing.
This section walks you through how to run the sequential data loading code.
It is very similar to the code from the last homework assignment.
The main difference is that I've also added the `load_tweets_batch.py` file for you, which loads tweets in "batches" (instead of one at a time).
Bring up a fresh version of your containers by running the commands:
```
$ docker-compose down
$ docker volume prune
$ docker-compose up -d --build
$ docker-compose up -d
```
Run the following command to insert data into each of the containers sequentially.
@@ -89,13 +103,14 @@ Complete the following steps:
The script should then load this file into the database using the same technique as in the `load_tweets_sequential.sh` file for the denormalized database.
In particular, you know you've implemented this file correctly if the following bash code correctly loads the database.
```
for file in $(find data); do
for file in data/*; do
sh load_denormalized.sh $file
done
```
2. Call the `load_denormalized.sh` file using the `parallel` program from within the `load_tweets_parallel.sh` script.
You know you've completed this step correctly if the `check_answers.sh` script passes and the test badge turns green.
You know you've completed this step correctly if the `run_tests.sh` script passes (on the lambda server) and the test badge turns green (on github).
#### Normalized Data (unbatched)
@@ -105,15 +120,15 @@ Unfortunately, the code is extremely slow,
so even when run in parallel it is still slower than the batched code.
> **NOTE:**
> The `tests_normalized_batch_parallel` is currently failing because the `load_tweets_parallel.sh` script is not yet implemented.
> The `tests_normalizedbatch_parallel` is currently failing because the `load_tweets_parallel.sh` script is not yet implemented.
> After you use GNU parallel to implement this script, everything should pass.
#### Normalized Data (batched)
Parallel loading of the batched data will fail due to deadlocks.
These deadlocks will cause some of your parallel loading processes to crash.
As a result, not all of the data will get inserted,
and you will fail the `check_answers.sh` tests.
and you will fail the `run_tests.sh` tests.
There are two possible ways to fix this.
The most naive method is to catch the exceptions generated by the deadlocks in python and repeat the failed queries.
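As a rough sketch of that naive approach (not the actual solution, and assuming the `psycopg2` driver with hypothetical `sql` and `rows` values), the retry loop looks something like this:
```
import time
import psycopg2
import psycopg2.extras

def insert_with_retry(connection, sql, rows, max_retries=10):
    '''Re-run a batched INSERT whenever postgres aborts it with a deadlock.'''
    for attempt in range(max_retries):
        try:
            with connection.cursor() as cursor:
                psycopg2.extras.execute_values(cursor, sql, rows)
            connection.commit()
            return
        except psycopg2.errors.DeadlockDetected:
            # postgres aborted the whole transaction,
            # so roll back locally and repeat the entire batch
            connection.rollback()
            time.sleep(1)
    raise RuntimeError('giving up after repeated deadlocks')
```
Whatever driver you use, the important detail is that the failed transaction must be rolled back and then repeated in full.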
@@ -169,7 +184,7 @@ You should notice that parallelism achieves a nearly (but not quite) 10x speedup
Ensure that your runtimes on the lambda server are recorded below.
| | elapsed time (sequential) | elapsed time (parallel) |
| -----------------------| ------------------------- | ------------------------- |
| `pg_normalized` | | |
| `pg_normalized_batch` | | |
@@ -181,3 +196,4 @@ Then upload a link to your forked github repo on sakai.
> It is not enough to just get passing test cases for this assignment in order to get full credit.
> (It is easy to pass the test cases by just doing everything sequentially.)
> Instead, you must also implement the parallelism correctly so that the parallel runtimes above are about 10x faster than the sequential runtimes.
> (Again, they should be 10x faster because we are doing 10 files in parallel.)
3 changes: 3 additions & 0 deletions load_denormalized.sh
@@ -0,0 +1,3 @@
#!/bin/sh

unzip -p "$1" | sed 's/\\u0000//g' | psql postgresql://postgres:pass@localhost:15433/ -c "COPY tweets_jsonb (data) FROM STDIN csv quote e'\x01' delimiter e'\x02';"
10 changes: 7 additions & 3 deletions load_tweets_parallel.sh
@@ -5,14 +5,18 @@ files=$(find data/*)
echo '================================================================================'
echo 'load pg_denormalized'
echo '================================================================================'
# FIXME: implement this
#time for file in $files; do
# ./load_denormalized.sh "$file"
#unzip -p "$file" | sed 's/\\u0000//g' | psql postgresql://postgres:pass@localhost:15433/ -c "COPY tweets_jsonb (data) FROM STDIN csv quote e'\x01' delimiter e'\x02';"
#done
time echo "$files" | parallel ./load_denormalized.sh

echo '================================================================================'
echo 'load pg_normalized'
echo '================================================================================'
echo "$files" | time parallel python3 -u load_tweets.py --db=postgresql://postgres:pass@localhost:2/ --inputs
# FIXME: implement this with GNU parallel

echo '================================================================================'
echo 'load pg_normalized_batch'
echo '================================================================================'
echo "$files" | time parallel python3 -u load_tweets_batch.py --db=postgresql://postgres:pass@localhost:3/ --inputs
# FIXME: implement this with GNU parallel
14 changes: 9 additions & 5 deletions load_tweets_sequential.sh
@@ -1,20 +1,24 @@
#!/bin/sh
#!/bin/bash

files=$(find data/*)

echo '================================================================================'
echo 'load denormalized'
echo '================================================================================'
for file in $files; do
time unzip -p "$file" | sed 's/\\u0000//g' | psql postgresql://postgres:pass@localhost:1/ -c "COPY tweets_jsonb (data) FROM STDIN csv quote e'\x01' delimiter e'\x02';"
time for file in $files; do
unzip -p "$file" | sed 's/\\u0000//g' | psql postgresql://postgres:pass@localhost:1/ -c "COPY tweets_jsonb (data) FROM STDIN csv quote e'\x01' delimiter e'\x02';"
done

echo '================================================================================'
echo 'load pg_normalized'
echo '================================================================================'
time python3 -u load_tweets.py --db=postgresql://postgres:pass@localhost:2/ --inputs $files
time for file in $files; do
python3 -u load_tweets.py --db=postgresql://postgres:pass@localhost:2/ --inputs $file
done

echo '================================================================================'
echo 'load pg_normalized_batch'
echo '================================================================================'
time python3 -u load_tweets_batch.py --db=postgresql://postgres:pass@localhost:3/ --inputs $files
time for file in $files; do
python3 -u load_tweets_batch.py --db=postgresql://postgres:pass@localhost:3/ --inputs $file
done
File renamed without changes.
