Merge pull request #198 from DSC-McMaster-U/fix/upload-endpoint
Feat: BigQuery endpoint + Fix: upload-endpoint + fe-be-connection updates
rawanmahdi authored Feb 8, 2024
2 parents 07b59b7 + e2114a0 commit 49a4ed5
Showing 14 changed files with 244 additions and 167 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -1,3 +1,5 @@
.vscode/

app/src/logs.log
app/src/best_model.pkl
app/src/dataset.csv
@@ -8,4 +10,7 @@ __pycache__/

/package-lock.json

credentials.json
# ignore in all directories
**/credentials.json

backend/big_query.py
6 changes: 5 additions & 1 deletion backend/Dockerfile
@@ -18,8 +18,12 @@ COPY pyproject.toml poetry.lock ./
# Install the dependencies
RUN poetry config virtualenvs.create false && poetry install --no-dev

# Copy the credentials.json file and compute dir into the container
COPY credentials.json ./
COPY compute ./compute

# Copy the FastAPI application into the container
COPY . .
COPY main.py ./

# Specify the command to run the FastAPI application using uvicorn
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
5 changes: 1 addition & 4 deletions backend/README.md
@@ -30,10 +30,7 @@ This will set up poetry. check installation using `poetry --version`
- The `pyproject.toml` file is a configuration file that specifies the dependencies and other settings for your Python project. It doesn't automatically track the imports or other changes in your Python files.
- If you add a new import to your Python files that requires a package not currently listed in your `pyproject.toml` file, you need to manually add that package to the `[tool.poetry.dependencies]` section of your `pyproject.toml` file.
- This can be done by running `poetry add package-name` which will add the package to the `[tool.poetry.dependencies]` section and update the `poetry.lock` file to include the new package and its dependencies.
- No need to run `poetry lock` or `poetry install` after this because `poetry add` already does the equivalent of both commands.

So, when you add a new dependency to the code, run `poetry add package-name`.
When you just want to run the project, run `poetry install`.
- Now, run `poetry lock` and then `poetry install` to install the new package and its dependencies.
</details>

Next, to install the dependencies, run: `poetry install`
5 changes: 4 additions & 1 deletion backend/api-tests/requests.http
@@ -10,8 +10,11 @@ GET http://127.0.0.1:8000/api/python
#### Get Datasets
GET http://127.0.0.1:8000/api/datasets

#### BigQuery test
GET http://127.0.0.1:8000/api/bq?filename=sample_contacts

#### Get Data
GET http://127.0.0.1:8000/api/data?filename=test
GET http://127.0.0.1:8000/api/data?filename=data

#### Get AutoML
GET http://127.0.0.1:8000/api/automl
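
A rough Python equivalent of these requests can serve as a local smoke test. This is a sketch, not part of the PR: it assumes the backend is running on 127.0.0.1:8000 and that `sample_contacts.csv` and `data.csv` already exist in the datasets bucket, mirroring the filenames used above.

```python
# Local smoke test mirroring the .http requests above (assumes the FastAPI
# backend is up on 127.0.0.1:8000 and the named CSVs exist in the bucket).
import requests

BASE = "http://127.0.0.1:8000"

# BigQuery endpoint: loads <filename>.csv into a BigQuery table and returns its rows.
bq = requests.get(f"{BASE}/api/bq", params={"filename": "sample_contacts"}, timeout=120)
bq.raise_for_status()
print(bq.json().get("message"))

# Data endpoint: returns the CSV contents converted to JSON rows.
data = requests.get(f"{BASE}/api/data", params={"filename": "data"}, timeout=60)
data.raise_for_status()
print(data.json())
```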
1 change: 1 addition & 0 deletions backend/backend/.dockerignore
@@ -0,0 +1 @@
credentials.json
4 changes: 3 additions & 1 deletion backend/data.csv
@@ -1,4 +1,6 @@
Id,name,age,height,weight
1,5,20,62,120.6
2,4,21,74,190.6
3,3,17,68,120.0
3,3,17,68,120.0
4,2,19,60,110.0
5,1,22,72,150.0
82 changes: 65 additions & 17 deletions backend/main.py
@@ -1,19 +1,19 @@
from io import BytesIO
import json
from google.cloud import storage
from google.cloud import bigquery
from starlette.responses import FileResponse
from io import BytesIO, StringIO
import pandas as pd
import os
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from google.cloud import storage
from fastapi.middleware.cors import CORSMiddleware
from compute.autoEDA import generate_eda
import csv

app = FastAPI()

DATA_BUCKET = "automate-ml-datasets"
BQ_DATASET = "automl_dataset_1"
GRAPH_BUCKET = "automate_ml_graphs"
origins = ["*"]

@@ -37,7 +37,7 @@ async def refreshDataSets():
global dataSetNames
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
"./credentials.json")

blobs = storage_client.list_blobs(DATA_BUCKET)
dataSetNames = [blob.name for blob in blobs]
@@ -61,17 +61,16 @@ async def root():
return {"message": "Hello World"}


# please don't remove this endpoint; it can be used to check that the frontend-backend connection is working
@app.get("/api/python")
async def root():
async def root_py():
return {"message": "Hello from fastAPI backend"}


# add a new dataset to the bucket
@app.put("/api/upload")
async def upload(file: UploadFile = File(...), fileName: str = Form(...)):
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
"./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
# Assuming fileName includes '.csv' extension
@@ -86,26 +85,28 @@ async def upload(file: UploadFile = File(...), fileName: str = Form(...)):
except Exception as e:
return JSONResponse(status_code=500, content={"error": f"An error occurred: {str(e)}"})


# list all the datasets in the bucket
@app.get("/api/datasets")
async def getDataSets():
if not dataSetNames:
return {"error": f"No DataSets in Bucket"}
return {"names": dataSetNames}


# get the data from the bucket and return it as a string
@app.get("/api/data")
async def getData(filename):
dataSetLines = ""
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
blob = bucket.blob(f"{filename}")
blob = bucket.blob(f"{filename}.csv")

with blob.open("r") as f:
dataSetLines = f.read()

# convert dataSetLines to be of type str | None (#TODO - check if this is necessary)
dataSetLines = str(dataSetLines) if dataSetLines else None

# convert csv string -> json (for frontend)
csv_reader = csv.DictReader(StringIO(dataSetLines))
@@ -120,13 +121,12 @@ async def getData(filename):
}



# Exploratory Data Analysis
@app.get("/api/eda")
async def eda(filename):
corrMatrix = ""
try:
storage_client = storage.Client.from_service_account_json(
"../credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
blob = bucket.blob(f"{filename}.csv")
@@ -155,13 +155,14 @@ async def eda(filename):

return {"data": corrMatrix, "graph_url": public_url}

# return the model as a file
@app.get("/api/automl")
async def getModel():
try:
#From #172 rawan/pandas-read-bucket

# storage_client = storage.Client.from_service_account_json(
# "../credentials.json")
# "./credentials.json")
# bucket = storage_client.get_bucket("data-test-automate-ml")
# blob = bucket.blob("fish_data.csv")
# byte_stream = BytesIO()
@@ -180,3 +181,50 @@ async def getModel():
# Return the placeholder file
return FileResponse(path=placeholder_model_path, filename=placeholder_model_path.split("/")[-1], media_type='application/octet-stream')


return {"data": corrMatrix}


# get file from bucket, load it to big query as a table & display the rows
@app.get("/api/bq")
async def bq(filename):
# construct client objects (authorized with the service account json file)
bq_client = bigquery.Client.from_service_account_json("./credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

uri = f"gs://{DATA_BUCKET}/{filename}.csv"
table_id = f"{BQ_DATASET}.{filename}_table"

# if file does not exist in the bucket, return an error
blob = storage_client.get_bucket(DATA_BUCKET).blob(filename + ".csv")
if not blob.exists():
return {"error": f"File {filename}.csv does not exist in the bucket."}

# if table does not exist, load it
try:
bq_client.get_table(table_id)
except:
job_config = bigquery.LoadJobConfig(
autodetect=True, # Automatically infer the schema.
source_format=bigquery.SourceFormat.CSV,
skip_leading_rows=1, # column headers
write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, # Overwrite the table
)

load_job = bq_client.load_table_from_uri(
uri, table_id, job_config=job_config
) # Make an API request.

load_job.result() # Waits for the job to complete.

# Query all rows from the table
query = f"SELECT * FROM `{table_id}`"
query_job = bq_client.query(query)
rows = query_job.result()

# display the rows
data = []
for row in rows:
data.append(dict(row))

return {"message": f"Loaded {table_id} with {rows.total_rows} rows.", "data": data}