Merge pull request #198 from DSC-McMaster-U/fix/upload-endpoint
Feat: BigQuery endpoint + Fix: upload-endpoint + fe-be-connection updates
rawanmahdi authored Feb 8, 2024
2 parents 07b59b7 + e2114a0 commit 49a4ed5
Showing 14 changed files with 244 additions and 167 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -1,3 +1,5 @@
.vscode/

app/src/logs.log
app/src/best_model.pkl
app/src/dataset.csv
@@ -8,4 +10,7 @@ __pycache__/

/package-lock.json

credentials.json
# ignore in all directories
**/credentials.json

backend/big_query.py
6 changes: 5 additions & 1 deletion backend/Dockerfile
@@ -18,8 +18,12 @@ COPY pyproject.toml poetry.lock ./
# Install the dependencies
RUN poetry config virtualenvs.create false && poetry install --no-dev

# Copy the credentials.json file and compute dir into the container
COPY credentials.json ./
COPY compute ./compute

# Copy the FastAPI application into the container
COPY . .
COPY main.py ./

# Specify the command to run the FastAPI application using uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
5 changes: 1 addition & 4 deletions backend/README.md
@@ -30,10 +30,7 @@ This will set up poetry. check installation using `poetry --version`
- The `pyproject.toml` file is a configuration file that specifies the dependencies and other settings for your Python project. It doesn't automatically track the imports or other changes in your Python files.
- If you add a new import to your Python files that requires a package not currently listed in your `pyproject.toml` file, you need to manually add that package to the `[tool.poetry.dependencies]` section of your `pyproject.toml` file.
- This can be done by running `poetry add package-name` which will add the package to the `[tool.poetry.dependencies]` section and update the `poetry.lock` file to include the new package and its dependencies.
- No need to run `poetry lock` or `poetry install` after this because `poetry add` already does the equivalent of both commands.

So, when you add a new dependency to the code, run `poetry add package-name`.
When you just want to run the project, run `poetry install`.
- Now, run `poetry lock` and then `poetry install` to install the new package and its dependencies.
</details>

Next, to install the dependencies, run: `poetry install`
5 changes: 4 additions & 1 deletion backend/api-tests/requests.http
@@ -10,8 +10,11 @@ GET http://127.0.0.1:8000/api/python
#### Get Datasets
GET http://127.0.0.1:8000/api/datasets

#### BigQuery test
GET http://127.0.0.1:8000/api/bq?filename=sample_contacts

#### Get Data
GET http://127.0.0.1:8000/api/data?filename=test
GET http://127.0.0.1:8000/api/data?filename=data

#### Get AutoML
GET http://127.0.0.1:8000/api/automl
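
A rough Python equivalent of these requests can serve as a local smoke test. This is a sketch, not part of the PR: it assumes the backend is running on 127.0.0.1:8000 and that `sample_contacts.csv` and `data.csv` already exist in the datasets bucket, mirroring the filenames used above.

```python
# Local smoke test mirroring the .http requests above (assumes the FastAPI
# backend is up on 127.0.0.1:8000 and the named CSVs exist in the bucket).
import requests

BASE = "http://127.0.0.1:8000"

# BigQuery endpoint: loads <filename>.csv into a BigQuery table and returns its rows.
bq = requests.get(f"{BASE}/api/bq", params={"filename": "sample_contacts"}, timeout=120)
bq.raise_for_status()
print(bq.json().get("message"))

# Data endpoint: returns the CSV contents converted to JSON rows.
data = requests.get(f"{BASE}/api/data", params={"filename": "data"}, timeout=60)
data.raise_for_status()
print(data.json())
```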
1 change: 1 addition & 0 deletions backend/backend/.dockerignore
@@ -0,0 +1 @@
credentials.json
4 changes: 3 additions & 1 deletion backend/data.csv
@@ -1,4 +1,6 @@
Id,name,age,height,weight
1,5,20,62,120.6
2,4,21,74,190.6
3,3,17,68,120.0
3,3,17,68,120.0
4,2,19,60,110.0
5,1,22,72,150.0
82 changes: 65 additions & 17 deletions backend/main.py
@@ -1,19 +1,19 @@
from io import BytesIO
import json
from google.cloud import storage
from google.cloud import bigquery
from starlette.responses import FileResponse
from io import BytesIO, StringIO
import pandas as pd
import os
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from google.cloud import storage
from fastapi.middleware.cors import CORSMiddleware
from compute.autoEDA import generate_eda
import csv

app = FastAPI()

DATA_BUCKET = "automate-ml-datasets"
BQ_DATASET = "automl_dataset_1"
GRAPH_BUCKET = "automate_ml_graphs"
origins = ["*"]

@@ -37,7 +37,7 @@ async def refreshDataSets():
global dataSetNames
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
"./credentials.json")

blobs = storage_client.list_blobs(DATA_BUCKET)
dataSetNames = [blob.name for blob in blobs]
@@ -61,17 +61,16 @@ async def root():
return {"message": "Hello World"}


# please don't remove this endpoint; it can be used to check that the frontend-backend connection is working
@app.get("/api/python")
async def root():
async def root_py():
return {"message": "Hello from fastAPI backend"}


# add a new dataset to the bucket
@app.put("/api/upload")
async def upload(file: UploadFile = File(...), fileName: str = Form(...)):
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
"./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
# Assuming fileName includes '.csv' extension
@@ -86,26 +85,28 @@ async def upload(file: UploadFile = File(...), fileName: str = Form(...)):
except Exception as e:
return JSONResponse(status_code=500, content={"error": f"An error occurred: {str(e)}"})


# list all the datasets in the bucket
@app.get("/api/datasets")
async def getDataSets():
if not dataSetNames:
return {"error": f"No DataSets in Bucket"}
return {"names": dataSetNames}


# get the data from the bucket and return it as a string
@app.get("/api/data")
async def getData(filename):
dataSetLines = ""
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
blob = bucket.blob(f"{filename}")
blob = bucket.blob(f"{filename}.csv")

with blob.open("r") as f:
dataSetLines = f.read()

# convert dataSetLines to be of type str | None (#TODO - check if this is necessary)
dataSetLines = str(dataSetLines) if dataSetLines else None

# convert csv string -> json (for frontend)
csv_reader = csv.DictReader(StringIO(dataSetLines))
@@ -120,13 +121,12 @@ async def getData(filename):
}



# Exploratory Data Analysis
@app.get("/api/eda")
async def eda(filename):
corrMatrix = ""
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
blob = bucket.blob(f"{filename}.csv")
@@ -155,13 +155,14 @@ async def eda(filename):

return {"data": corrMatrix, "graph_url": public_url}

# return the model as a file
@app.get("/api/automl")
async def getModel():
try:
#From #172 rawan/pandas-read-bucket

# storage_client = storage.Client.from_service_account_json(
# "../credentials.json")
# "./credentials.json")
# bucket = storage_client.get_bucket("data-test-automate-ml")
# blob = bucket.blob("fish_data.csv")
# byte_stream = BytesIO()
@@ -180,3 +181,50 @@ async def getModel():
# Return the placeholder file
return FileResponse(path=placeholder_model_path, filename=placeholder_model_path.split("/")[-1], media_type='application/octet-stream')


return {"data": corrMatrix}


# get file from bucket, load it to big query as a table & display the rows
@app.get("/api/bq")
async def bq(filename):
# construct client objects (authorized with the service account json file)
bq_client = bigquery.Client.from_service_account_json("./credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

uri = f"gs://{DATA_BUCKET}/{filename}.csv"
table_id = f"{BQ_DATASET}.{filename}_table"

# if file does not exist in the bucket, return an error
blob = storage_client.get_bucket(DATA_BUCKET).blob(filename + ".csv")
if not blob.exists():
return {"error": f"File {filename}.csv does not exist in the bucket."}

# if table does not exist, load it
try:
bq_client.get_table(table_id)
except:
job_config = bigquery.LoadJobConfig(
autodetect=True, # Automatically infer the schema.
source_format=bigquery.SourceFormat.CSV,
skip_leading_rows=1, # column headers
write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, # Overwrite the table
)

load_job = bq_client.load_table_from_uri(
uri, table_id, job_config=job_config
) # Make an API request.

load_job.result() # Waits for the job to complete.

# Query all rows from the table
query = f"SELECT * FROM `{table_id}`"
query_job = bq_client.query(query)
rows = query_job.result()

# display the rows
data = []
for row in rows:
data.append(dict(row))

return {"message": f"Loaded {table_id} with {rows.total_rows} rows.", "data": data}