Commit da63590: Resolving lock conflicts
rawanmahdi committed Feb 21, 2024
2 parents 6878d2d + e1a2377 commit da63590
Showing 27 changed files with 6,038 additions and 1,553 deletions.
9 changes: 8 additions & 1 deletion .gitignore
@@ -4,6 +4,8 @@ app/src/logs.log
app/src/best_model.pkl
app/src/dataset.csv

frontend/.eslintrc.json

.DS_Store

__pycache__/
@@ -13,4 +15,9 @@ __pycache__/
# ignore in all directories
**/credentials.json

backend/big_query.py
*.pkl
backend/bash.exe.stackdump
backend/.python-version
backend/big_query.py
*.log
fish_data.csv
1 change: 1 addition & 0 deletions backend/.dockerignore
@@ -0,0 +1 @@
# credentials.json
5 changes: 4 additions & 1 deletion backend/Dockerfile
@@ -4,6 +4,9 @@ FROM python:3.11.6-slim
# Set the working directory in the container
WORKDIR /app

# Install build dependencies
RUN apt-get update && apt-get install -y gcc musl-dev python3-dev

# Install and cleanup
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
@@ -18,7 +21,7 @@ COPY pyproject.toml poetry.lock ./
# Install the dependencies
RUN poetry config virtualenvs.create false && poetry install --no-dev

# Copy the credentials.json file and compute dir into the container
# Copy the creds file and compute dir into the container
COPY credentials.json ./
COPY compute ./compute

20 changes: 12 additions & 8 deletions backend/api-tests/requests.http
@@ -11,24 +11,28 @@ GET http://127.0.0.1:8000/api/python
GET http://127.0.0.1:8000/api/datasets

#### BigQuery test
GET http://127.0.0.1:8000/api/bq?filename=sample_contacts
GET http://127.0.0.1:8000/api/bq?fileName=sample_contacts

#### Get Data
GET http://127.0.0.1:8000/api/data?filename=data
GET http://127.0.0.1:8000/api/data?fileName=data.csv

#### Get AutoML
GET http://127.0.0.1:8000/api/automl

#### Upload data set
PUT http://127.0.0.1:8000/api/upload?filename=test-data
Content-Type: multipart/form-data; boundary=MyBoundary
PUT http://127.0.0.1:8000/api/upload
Content-Type: multipart/form-data; boundary=yourBoundary

--MyBoundary
--yourBoundary
Content-Disposition: form-data; name="fileName"

mo11.csv
--yourBoundary
Content-Disposition: form-data; name="file"; filename="data.csv"
Content-Type: text/csv

< @./data.csv
--MyBoundary--
< ../data.csv
--yourBoundary--

#### Get eda
GET http://127.0.0.1:8000/api/eda?filename=data
GET http://127.0.0.1:8000/api/eda?fileName=data
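
The updated upload request now sends the target object name as a fileName form field alongside the file itself. A minimal Python sketch of the same call, assuming the backend is running locally on port 8000 and a data.csv file sits next to the script (the URL and file path are assumptions, not part of this commit):

import requests  # third-party HTTP client

# Multipart PUT matching the new /api/upload contract:
# a "fileName" form field plus the CSV file part.
with open("data.csv", "rb") as f:
    response = requests.put(
        "http://127.0.0.1:8000/api/upload",
        data={"fileName": "data.csv"},                # read by Form(...)
        files={"file": ("data.csv", f, "text/csv")},  # read by File(...)
    )
print(response.status_code, response.json())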
1 change: 0 additions & 1 deletion backend/backend/.dockerignore

This file was deleted.

43 changes: 43 additions & 0 deletions backend/compute/autoML.py
@@ -0,0 +1,43 @@
# from pycaret.classification import setup as classification_setup, compare_models as classification_compare_models, finalize_model
# from pycaret.regression import setup as regression_setup, compare_models as regression_compare_models, finalize_model

import pycaret.classification as pycaret_cl
import pycaret.regression as pycaret_rg

import pandas as pd
import joblib

def perform_classification(data, target_column):

pycaret_cl.setup(data = data, target = target_column)
best_model = pycaret_cl.compare_models()

model_file_path = 'classification_model.pkl'
joblib.dump(best_model, model_file_path)

return best_model, model_file_path

def perform_regression(data, target_column):

pycaret_rg.setup(data = data, target = target_column)
best_model = pycaret_rg.compare_models()

model_file_path = 'regression_model.pkl'
joblib.dump(best_model, model_file_path)

return best_model, model_file_path

def generate_model(data, target_column, task):

    df = pd.read_csv(data)  # Supplied dataset
    task = task.upper()  # Either 'R' (regression) or 'C' (classification)

    if task == 'C':
        final_model, model_file_path = perform_classification(df, target_column)
    elif task == 'R':
        final_model, model_file_path = perform_regression(df, target_column)
    else:
        raise ValueError(f"Unknown task '{task}': expected 'C' or 'R'")

    return final_model, model_file_path

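A minimal sketch of calling generate_model directly, mirroring the call made from main.py in this commit (the fish_data.csv path and the Species target column come from that call; running it assumes pycaret, pandas, and joblib are installed and the CSV is on disk):

from compute.autoML import generate_model

# 'C' selects the classification branch; perform_classification() also
# persists the winning pipeline to classification_model.pkl via joblib.
model, model_path = generate_model("fish_data.csv", "Species", "C")
print("Best model:", model)
print("Saved to:", model_path)
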
137 changes: 86 additions & 51 deletions backend/main.py
@@ -1,3 +1,4 @@
from fastapi import HTTPException  # FastAPI's HTTPException (supports status_code/detail), not http.client's
from google.cloud import storage
from google.cloud import bigquery
from starlette.responses import FileResponse
@@ -8,6 +9,8 @@
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from compute.autoEDA import generate_eda
from compute.autoML import generate_model

import csv

app = FastAPI()
@@ -36,8 +39,7 @@
async def refreshDataSets():
global dataSetNames
try:
storage_client = storage.Client.from_service_account_json(
"./credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

blobs = storage_client.list_blobs(DATA_BUCKET)
dataSetNames = [blob.name for blob in blobs]
@@ -65,48 +67,61 @@ async def root():
async def root_py():
return {"message": "Hello from fastAPI backend"}


# add a new dataset to the bucket
@app.put("/api/upload")
async def upload(file: UploadFile = File(...), fileName: str = Form(...)):
async def upload(fileName: str = Form(...), file: UploadFile = File(...)):
try:
storage_client = storage.Client.from_service_account_json(
"./credentials.json")
# Validate file type
if not file.filename.endswith(".csv"):
raise HTTPException(
status_code=400,
detail="Invalid file type. Only CSV files are accepted.",
)

storage_client = storage.Client.from_service_account_json("./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
# Assuming fileName includes '.csv' extension
blob = bucket.blob(f"{fileName}")
content = await file.read()
blob.upload_from_string(content, content_type=file.content_type)
blob.upload_from_file(file.file, content_type="text/csv")

await refreshDataSets() # Make sure this function is defined if you want to use it

return JSONResponse(status_code=200, content={"message": "Data uploaded to GCloud successfully"})
return JSONResponse(
status_code=200, content={"message": "Data uploaded to GCloud successfully"}
)

except Exception as e:
return JSONResponse(status_code=500, content={"error": f"An error occurred: {str(e)}"})
return JSONResponse(
status_code=500, content={"error": f"An error occurred: {str(e)}"}
)


# list all the datasets in the bucket
@app.get("/api/datasets")
async def getDataSets():
# have to refresh the state of the bucket since it may have changed
await refreshDataSets()
if not dataSetNames:
return {"error": f"No DataSets in Bucket"}
return {"names": dataSetNames}


# get the data from the bucket and return it as a string
@app.get("/api/data")
async def getData(filename):
async def getData(fileName):
dataSetLines = ""
try:
storage_client = storage.Client.from_service_account_json("./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
blob = bucket.blob(f"{filename}.csv")
blob = bucket.blob(fileName)

with blob.open("r") as f:
dataSetLines = f.read()

# convert dataSetLines to be of type str | None (#TODO - check if this is necessary)
dataSetLines = str(dataSetLines) if dataSetLines else None
dataSetLines = str(dataSetLines) if dataSetLines else None

# convert csv string -> json (for frontend)
csv_reader = csv.DictReader(StringIO(dataSetLines))
@@ -115,21 +130,18 @@ async def getData(filename):
except Exception as e:
return {"error": f"An error occurred: {str(e)}"}

return {
"data": dataSetLines,
"json": json_data
}
return {"data": dataSetLines, "json": json_data}


# Exploratory Data Analysis
@app.get("/api/eda")
async def eda(filename):
async def eda(fileName):
corrMatrix = ""
try:
storage_client = storage.Client.from_service_account_json("./credentials.json")

bucket = storage_client.get_bucket(DATA_BUCKET)
blob = bucket.blob(f"{filename}.csv")
blob = bucket.blob(f"{fileName}.csv")

byte_stream = BytesIO()
blob.download_to_file(byte_stream)
@@ -147,19 +159,20 @@ async def eda(filename):

except Exception as e:
return {"error": f"An error occurred: {str(e)}"}

finally:
# Delete the temporary file
if os.path.exists(f"tempImages/{uniqueFilename}"):
os.remove(f"tempImages/{uniqueFilename}")

return {"data": corrMatrix, "graph_url": public_url}


# return the model as a file
@app.get("/api/automl")
async def getModel():
try:
#From #172 rawan/pandas-read-bucket
# From #172 rawan/pandas-read-bucket

# storage_client = storage.Client.from_service_account_json(
# "./credentials.json")
@@ -169,59 +182,81 @@ async def getModel():
# blob.download_to_file(byte_stream)
# byte_stream.seek(0)
# df = pd.read_csv(byte_stream)
# model_path = automl(df)

model, model_path = generate_model("fish_data.csv","Species", "C")

# Use a placeholder file for testing download
placeholder_model_path = "./download_test_random_data.pickle"
# placeholder_model_path = "./download_test_random_data.pickle"

except Exception as e:
return {"error": f"An error occurred: {str(e)}"}

# Return the placeholder file
return FileResponse(path=placeholder_model_path, filename=placeholder_model_path.split("/")[-1], media_type='application/octet-stream')


return {"data": corrMatrix}
# return FileResponse(path=placeholder_model_path, filename=placeholder_model_path.split("/")[-1], media_type='application/octet-stream')
return FileResponse(path=model_path, filename=model_path.split("/")[-1], media_type='application/octet-stream')


# get file from bucket, load it to big query as a table & display the rows
@app.get("/api/bq")
async def bq(filename):
async def bq(fileName, query=None):

# construct client objects (authorized with the service account json file)
bq_client = bigquery.Client.from_service_account_json("./credentials.json")
storage_client = storage.Client.from_service_account_json("./credentials.json")

uri = f"gs://{DATA_BUCKET}/{filename}.csv"
table_id = f"{BQ_DATASET}.{filename}_table"
# check if the file name has .csv extension, if not, add it
# if not fileName.endswith('.csv'):
# fileName += '.csv'

uri = f"gs://{DATA_BUCKET}/{fileName}"

# if file does not exist in the bucket, return an error
blob = storage_client.get_bucket(DATA_BUCKET).blob(filename + ".csv")
blob = storage_client.get_bucket(DATA_BUCKET).blob(fileName)
if not blob.exists():
return {"error": f"File {filename}.csv does not exist in the bucket."}
return {"error": f"File {fileName} does not exist in the bucket."}

fileName = fileName.replace('.csv', '')
table_id = f"{BQ_DATASET}.{fileName}_table"

# if table does not exist, load it
try:
bq_client.get_table(table_id)
except:
job_config = bigquery.LoadJobConfig(
autodetect=True, # Automatically infer the schema.
source_format=bigquery.SourceFormat.CSV,
skip_leading_rows=1, # column headers
write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, # Overwrite the table
)
load_job = bq_client.load_table_from_uri(
uri, table_id, job_config=job_config
) # Make an API request.
load_job.result() # Waits for the job to complete.
# try:
# bq_client.get_table(table_id)
# except:
job_config = bigquery.LoadJobConfig(
autodetect=True, # Automatically infer the schema.
source_format=bigquery.SourceFormat.CSV,
skip_leading_rows=1, # column headers
write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, # Overwrite the table
)
# Make an API request
load_job = bq_client.load_table_from_uri(
uri, table_id, job_config=job_config
)
# Waits for the job to complete.
load_job.result()

# Query all rows from the table
query = f"SELECT * FROM `{table_id}`"
query_job = bq_client.query(query)
rows = query_job.result()
#------------------------------------------ Query ops ----------------------------------------#

query = query.upper() if query else None

# List of potentially harmful operations
harmful_ops = ['DROP', 'DELETE', 'INSERT', 'UPDATE']

# Check if the query contains any harmful operations
if query and any(op in query.upper() for op in harmful_ops):
print("\nQuery contains harmful operations!\nusing default query.\n")
final_query = f"SELECT * FROM `{table_id}`"
else:
print("\nQuery is safe to be passed.\n")
# remove everything before the `SELECT` keyword from the received query
query = query[query.find("SELECT"):] if query else None
final_query = query.replace("FROM TABLE", f"FROM `{table_id}`") if query else f"SELECT * FROM `{table_id}`"
print("Final Query:\n", final_query, "\n")

query_job = bq_client.query(final_query)
rows = query_job.result()

# display the rows
data = []
for row in rows:
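
The new query handling in /api/bq blocks destructive statements and substitutes the real BigQuery table id for the FROM TABLE placeholder. A standalone sketch of that rewrite logic, restated here only for illustration (the dataset and table names below are made up):

HARMFUL_OPS = ['DROP', 'DELETE', 'INSERT', 'UPDATE']

def build_query(user_query, table_id):
    # Mirrors the guard above: fall back to SELECT * on risky or empty input,
    # otherwise strip anything before SELECT and swap in the real table id.
    user_query = user_query.upper() if user_query else None
    if not user_query or any(op in user_query for op in HARMFUL_OPS):
        return f"SELECT * FROM `{table_id}`"
    user_query = user_query[user_query.find("SELECT"):]
    return user_query.replace("FROM TABLE", f"FROM `{table_id}`")

# Hypothetical table id, for illustration only:
print(build_query("SELECT SPECIES, WEIGHT FROM TABLE", "automl_dataset.fish_data_table"))
# SELECT SPECIES, WEIGHT FROM `automl_dataset.fish_data_table`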
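
Since /api/automl now streams back the pickled PyCaret pipeline instead of the placeholder file, a caller can save the response body and reload it with joblib. A minimal client-side sketch, assuming the backend is reachable on localhost port 8000 (the URL and output filename are assumptions):

import requests
import joblib

# Fetch the model file served by /api/automl and rehydrate it.
resp = requests.get("http://127.0.0.1:8000/api/automl", timeout=600)  # training can take a while
resp.raise_for_status()

with open("downloaded_model.pkl", "wb") as f:
    f.write(resp.content)

model = joblib.load("downloaded_model.pkl")
print(type(model))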