From 50909ed2e5ca5d5952fccf7974e347fabc36921e Mon Sep 17 00:00:00 2001 From: Kevin Donahue Date: Wed, 18 Sep 2024 17:15:04 -0400 Subject: [PATCH] use toml for config --- api/.idea/api.iml | 2 +- api/.idea/misc.xml | 2 +- api/Dockerfile | 3 +- api/README.md | 37 +++---- api/api/data/{sources.md => data.md} | 0 api/api/model/build.py | 140 +++++++++++++++------------ api/api/model/config.ini | 9 -- api/api/model/config.toml | 9 ++ api/api/model/train.py | 32 +++--- api/requirements.txt | 4 +- update-open-meteo.sh | 2 +- 11 files changed, 123 insertions(+), 117 deletions(-) rename api/api/data/{sources.md => data.md} (100%) delete mode 100644 api/api/model/config.ini create mode 100644 api/api/model/config.toml diff --git a/api/.idea/api.iml b/api/.idea/api.iml index e85cfa8..60bb2a1 100644 --- a/api/.idea/api.iml +++ b/api/.idea/api.iml @@ -4,7 +4,7 @@ - + diff --git a/api/.idea/misc.xml b/api/.idea/misc.xml index 6ae386b..62a009d 100644 --- a/api/.idea/misc.xml +++ b/api/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/api/Dockerfile b/api/Dockerfile index f2861b7..ff88472 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.11.7-slim-bullseye +ARG VERSION=3.12.6-slim-bookworm +FROM python:${VERSION} LABEL maintainer="Kevin Donahue " diff --git a/api/README.md b/api/README.md index 87988ac..dac0e96 100644 --- a/api/README.md +++ b/api/README.md @@ -1,24 +1,34 @@ # api -api server for sky brightness. +api server for [sky brightness](https://en.wikipedia.org/wiki/Sky_brightness) at valid coordinates. ## Building and training the sky brightness model +The api server depends on a model being trained from augmented csv data. + +These commands will generate a new `model.pth`. + - `python -m api.model.build` to write the csv that the model trains on - `python -m api.model.train` to train on the data in the csv ## HTTP APIs -### how to run locally +### endpoints + +#### `/api/v1/predict` -> Note: tested on python 3.11 +Gets the predicted sky brightness at `lat` and `lon` for the current time. ```sh -pip install -r requirements.txt -python -m api.main +curl "http://localhost:8000/api/v1/predict?lat=-30.2466&lon=-70.7494" + ``` -### endpoints +```json +{ + "mpsas": 22.0388 +} +``` #### `/api/v1/pollution` @@ -38,21 +48,6 @@ curl "localhost:8000/api/v1/pollution?lat=40.7277478&lon=-74.0000374" } ``` -#### `/api/v1/predict` - -Gets the predicted sky brightness at `lat` and `lon` for the current time. - -```sh -curl "http://localhost:8000/api/v1/predict?lat=-30.2466&lon=-70.7494" - -``` - -```json -{ - "mpsas": 22.0388 -} -``` - ### swagger ui Open the [ui](http://localhost:8000/docs) in a browser. diff --git a/api/api/data/sources.md b/api/api/data/data.md similarity index 100% rename from api/api/data/sources.md rename to api/api/data/data.md diff --git a/api/api/model/build.py b/api/api/model/build.py index ce40306..dfe6c81 100644 --- a/api/api/model/build.py +++ b/api/api/model/build.py @@ -1,5 +1,6 @@ +import typing from pathlib import Path -from configparser import ConfigParser +import tomllib import astropy.units as u import numpy as np @@ -8,42 +9,6 @@ from astropy.coordinates import EarthLocation from astropy.time import Time -config = ConfigParser() -config.read(Path(__file__).parent / "config.ini") - -max_sqm = config.getint("sqm", "max") -min_sqm = config.getint("sqm", "min") - -csv_filename = config.get("csv", "filename") - -path_to_data_dir = Path(__file__).parent.parent / "data" -path_to_preprocessed_csvs = path_to_data_dir / "globe_at_night" -csvs = path_to_preprocessed_csvs.glob("*.csv") - -dataframes = [ - pd.read_csv(path, on_bad_lines="skip") - for path in csvs -] - -columns_to_drop = ["ID", "SQMSerial", "Constellation", "SkyComment", "LocationComment", "Country"] -columns_that_must_not_be_na = ["ObsDateTime", "Latitude", "Longitude", "Elevation", "CloudCover", "SQMReading"] - -dataframes = [df for df in dataframes if all(c in df.columns for c in columns_to_drop + columns_that_must_not_be_na)] -df = pd.concat(dataframes, ignore_index=True) - -df = df.drop(columns=columns_to_drop) -df = df.dropna(subset=columns_that_must_not_be_na, how="any", axis=0) - -print("dropping rows outside of sqm range") -df = df[df["SQMReading"] <= max_sqm] -df = df[df["SQMReading"] >= min_sqm] -df = df.reset_index() - -print(f"building datetime mapping") -# create utdatetime column in order to form sine-mapped uttimehour -df["UTDatetime"] = pd.to_datetime(df["ObsDateTime"], utc=True) -df["UTTimeHour"] = np.sin(2 * np.pi * df["UTDatetime"].dt.hour / 24) - def get_moon_altaz(datetime, lat, lon): """get moon position in altitude/azimuth""" @@ -60,7 +25,6 @@ def get_moon_alt_for_row(row: pd.Series): lat, lon = row["Latitude"], row["Longitude"] altaz = get_moon_altaz(datetime, lat, lon) alt = altaz.alt.value - print(f"got moon altitude {alt} for row {row.name}") return alt @staticmethod @@ -69,32 +33,80 @@ def get_moon_az_for_row(row: pd.Series): lat, lon = row["Latitude"], row["Longitude"] altaz = get_moon_altaz(datetime, lat, lon) az = altaz.az.value - print(f"got moon azimuth {az} for row {row.name}") return az -print(f"applying moon data to {df.shape[0]} rows") -df["MoonAlt"] = df.apply(RowOps.get_moon_alt_for_row, axis=1) -df["MoonAz"] = df.apply(RowOps.get_moon_az_for_row, axis=1) - - -def get_oktas_from_description(description: str) -> int: - """map description of cloud coverage into int""" - match description: - case "0" | "clear": - return 0 - case "25" | "1/4 of sky": - return 2 - case "50" | "1/2 of sky": - return 4 - case "75" | "over 1/2 of sky": - return 6 - case _: - return 8 - - -print(f"mapping cloud cover to {df.shape[0]} rows") -df["CloudCover"] = df["CloudCover"].map(get_oktas_from_description) - -print(f"writing file to {path_to_data_dir.as_posix()}") -df.to_csv(path_to_data_dir / csv_filename, index=False) +def write_dataframe(csv_source_paths: typing.Generator[Path, None, None], write_path: Path): + """ingest csv sources into dataframe, adding columns where necessary, writing to new single csv file""" + print(f"reading csvs {csv_source_paths}") + dataframes = [ + pd.read_csv(path, on_bad_lines="skip") + for path in csv_source_paths + ] + + columns_to_drop = ["ID", "SQMSerial", "Constellation", "SkyComment", "LocationComment", "Country"] + columns_that_must_not_be_na = ["ObsDateTime", "Latitude", "Longitude", "Elevation", "CloudCover", "SQMReading"] + + dataframes = [df for df in dataframes if + all(c in df.columns for c in columns_to_drop + columns_that_must_not_be_na)] + df = pd.concat(dataframes, ignore_index=True) + + df = df.drop(columns=columns_to_drop) + df = df.dropna(subset=columns_that_must_not_be_na, how="any", axis=0) + + print("dropping rows outside of sqm range") + df = df[df["SQMReading"] <= max_sqm] + df = df[df["SQMReading"] >= min_sqm] + df = df.reset_index() + + print(f"building datetime mapping") + # create utdatetime column in order to form sine-mapped uttimehour + df["UTDatetime"] = pd.to_datetime(df["ObsDateTime"], utc=True) + df["UTTimeHour"] = np.sin(2 * np.pi * df["UTDatetime"].dt.hour / 24) + + print(f"applying moon altitude data to {df.shape[0]} rows") + df["MoonAlt"] = df.apply(RowOps.get_moon_alt_for_row, axis=1) + print(f"applying moon azimuth data to {df.shape[0]} rows") + df["MoonAz"] = df.apply(RowOps.get_moon_az_for_row, axis=1) + + def get_oktas_from_description(description: str) -> int: + """map description of cloud coverage into int""" + match description: + case "0" | "clear": + return 0 + case "25" | "1/4 of sky": + return 2 + case "50" | "1/2 of sky": + return 4 + case "75" | "over 1/2 of sky": + return 6 + case _: + return 8 + + print(f"mapping cloud cover to {df.shape[0]} rows") + df["CloudCover"] = df["CloudCover"].map(get_oktas_from_description) + + df.to_csv(write_path, index=False) + + +if __name__ == "__main__": + with open(Path(__file__).parent / "config.toml", "rb") as f: + config = tomllib.load(f) + + max_sqm = config["sqm"]["max"] + min_sqm = config["sqm"]["min"] + csv_filename = config["csv"]["filename"] + + path_to_data_dir = Path(__file__).parent.parent / "data" + + path_to_preprocessed_csvs = path_to_data_dir / "globe_at_night" + csv_sources = path_to_preprocessed_csvs.glob("*.csv") + + try: + print(f"attempting to write {csv_filename}") + write_dataframe(csv_sources, path_to_data_dir / csv_filename) + except Exception as e: + import traceback + + print(f"failed to write csv: {e}") + print(traceback.format_exc()) diff --git a/api/api/model/config.ini b/api/api/model/config.ini deleted file mode 100644 index 6c67335..0000000 --- a/api/api/model/config.ini +++ /dev/null @@ -1,9 +0,0 @@ -[sqm] -max = 22 -min = 16 - -[csv] -filename = globe_at_night.csv - -[train] -epochs = 100 \ No newline at end of file diff --git a/api/api/model/config.toml b/api/api/model/config.toml new file mode 100644 index 0000000..b940418 --- /dev/null +++ b/api/api/model/config.toml @@ -0,0 +1,9 @@ +[sqm] +max = 22 +min = 16 + +[csv] +filename = "globe_at_night.csv" + +[train] +epochs = 100 diff --git a/api/api/model/train.py b/api/api/model/train.py index 0fdf687..57fe23e 100644 --- a/api/api/model/train.py +++ b/api/api/model/train.py @@ -1,5 +1,5 @@ +import tomllib from pathlib import Path -from configparser import ConfigParser import numpy as np import pandas as pd @@ -7,16 +7,14 @@ import torch.nn as nn from torch.utils.data import DataLoader, TensorDataset, random_split -from ..prediction.constants import features +from api.prediction.constants import features from api.prediction.net.nn import NeuralNetwork -features_size = len(features) +with open(Path(__file__).parent / "config.toml", "rb") as f: + config = tomllib.load(f) -config = ConfigParser() -config.read(Path(__file__).parent / "config.ini") - -csv_filename = config.get("csv", "filename") -epochs = config.getint("train", "epochs") +csv_filename = config["csv"]["filename"] +epochs = config["train"]["epochs"] path_to_prediction_pkg = Path(__file__).parent.parent / "prediction" saved_model_path = path_to_prediction_pkg / "model.pth" @@ -27,7 +25,7 @@ raise FileNotFoundError() df = pd.read_csv(path_to_gan_dataframe) -print(f"read csv with {len(df)} rows") +print(f"consumed csv with {len(df)} rows") torch.set_printoptions(sci_mode=False) feature_tensor = torch.tensor(df[features].values.astype(np.float32)) @@ -59,10 +57,10 @@ def train_loop( - data_loader: DataLoader, - model: NeuralNetwork, - loss_fn: nn.HuberLoss, - optimizer: torch.optim.Adam, + data_loader: DataLoader, + model: NeuralNetwork, + loss_fn: nn.HuberLoss, + optimizer: torch.optim.Adam, ): model.train() for batch, (X, y) in enumerate(data_loader): @@ -78,7 +76,7 @@ def train_loop( print(f"loss: {loss:>7f} [{current:>5d}]") -def test_model(data_loader: DataLoader, model: NeuralNetwork, loss_fn: nn.HuberLoss): +def evaluate_model(data_loader: DataLoader, model: NeuralNetwork, loss_fn: nn.HuberLoss): model.eval() with torch.no_grad(): @@ -94,10 +92,10 @@ def test_model(data_loader: DataLoader, model: NeuralNetwork, loss_fn: nn.HuberL print(f"starting training with {epochs} epochs, and saving state dict to {saved_model_path}") for epoch in range(epochs): - print(f"epoch {epoch + 1}") + print(f"epoch {epoch + 1}/{epochs}") train_loop(train_dataloader, model, loss_fn, optimizer) - test_model(test_dataloader, model, loss_fn) + evaluate_model(test_dataloader, model, loss_fn) - print(f"saving to {saved_model_path}") + print(f"saving state dict to {saved_model_path}") torch.save(model.state_dict(), saved_model_path) diff --git a/api/requirements.txt b/api/requirements.txt index f427b11..5bc387f 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -1,11 +1,11 @@ astroplan==0.9.1 astropy==6.0.0 pandas==2.1.4 -torch==2.1.2 +torch~=2.2.2 requests==2.31.0 fastapi~=0.110.2 httpx==0.26.0 uvicorn==0.25.0 pytest==7.4.3 Pillow~=10.3.0 -numpy~=1.26.2 +numpy~=1.26.4 diff --git a/update-open-meteo.sh b/update-open-meteo.sh index b996ea7..7a3e586 100755 --- a/update-open-meteo.sh +++ b/update-open-meteo.sh @@ -3,7 +3,7 @@ volume_name="open-meteo-data" if docker volume ls -q | grep -q "^${volume_name}$"; then - echo "$volume_name exists; updating volume" + echo "volume $volume_name exists; updating volume" docker run -it --rm -v open-meteo-data:/app/data ghcr.io/open-meteo/open-meteo sync ecmwf_ifs04 cloud_cover,temperature_2m else echo "$volume_name does not exist"