From 50909ed2e5ca5d5952fccf7974e347fabc36921e Mon Sep 17 00:00:00 2001
From: Kevin Donahue <nonnontrivial@gmail.com>
Date: Wed, 18 Sep 2024 17:15:04 -0400
Subject: [PATCH] use toml for config

---
 api/.idea/api.iml                    |   2 +-
 api/.idea/misc.xml                   |   2 +-
 api/Dockerfile                       |   3 +-
 api/README.md                        |  37 +++----
 api/api/data/{sources.md => data.md} |   0
 api/api/model/build.py               | 140 +++++++++++++++------------
 api/api/model/config.ini             |   9 --
 api/api/model/config.toml            |   9 ++
 api/api/model/train.py               |  32 +++---
 api/requirements.txt                 |   4 +-
 update-open-meteo.sh                 |   2 +-
 11 files changed, 123 insertions(+), 117 deletions(-)
 rename api/api/data/{sources.md => data.md} (100%)
 delete mode 100644 api/api/model/config.ini
 create mode 100644 api/api/model/config.toml
diff --git a/api/.idea/api.iml b/api/.idea/api.iml
index e85cfa8..60bb2a1 100644
--- a/api/.idea/api.iml
+++ b/api/.idea/api.iml
@@ -4,7 +4,7 @@
     <content url="file://$MODULE_DIR$">
       <excludeFolder url="file://$MODULE_DIR$/.venv" />
     </content>
-    <orderEntry type="inheritedJdk" />
+    <orderEntry type="jdk" jdkName="Python 3.12 (api)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyDocumentationSettings">
diff --git a/api/.idea/misc.xml b/api/.idea/misc.xml
index 6ae386b..62a009d 100644
--- a/api/.idea/misc.xml
+++ b/api/.idea/misc.xml
@@ -3,5 +3,5 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.11 (api)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (api)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (api)" project-jdk-type="Python SDK" />
 </project>
\ No newline at end of file
diff --git a/api/Dockerfile b/api/Dockerfile
index f2861b7..ff88472 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -1,4 +1,5 @@
-FROM python:3.11.7-slim-bullseye
+ARG VERSION=3.12.6-slim-bookworm
+FROM python:${VERSION}
 
 LABEL maintainer="Kevin Donahue <nonnontrivial@gmail.com>"
 
diff --git a/api/README.md b/api/README.md
index 87988ac..dac0e96 100644
--- a/api/README.md
+++ b/api/README.md
@@ -1,24 +1,34 @@
 # api
 
-api server for sky brightness.
+api server for [sky brightness](https://en.wikipedia.org/wiki/Sky_brightness) at valid coordinates.
 
 ## Building and training the sky brightness model
 
+The api server depends on a model being trained from augmented csv data.
+
+These commands will generate a new `model.pth`.
+
 - `python -m api.model.build` to write the csv that the model trains on
 - `python -m api.model.train` to train on the data in the csv
 
 ## HTTP APIs
 
-### how to run locally
+### endpoints
+
+#### `/api/v1/predict`
 
-> Note: tested on python 3.11
+Gets the predicted sky brightness at `lat` and `lon` for the current time.
 
 ```sh
-pip install -r requirements.txt
-python -m api.main
+curl "http://localhost:8000/api/v1/predict?lat=-30.2466&lon=-70.7494"
+
 ```
 
-### endpoints
+```json
+{
+  "mpsas": 22.0388
+}
+```
 
 #### `/api/v1/pollution`
 
@@ -38,21 +48,6 @@ curl "localhost:8000/api/v1/pollution?lat=40.7277478&lon=-74.0000374"
 }
 ```
 
-#### `/api/v1/predict`
-
-Gets the predicted sky brightness at `lat` and `lon` for the current time.
-
-```sh
-curl "http://localhost:8000/api/v1/predict?lat=-30.2466&lon=-70.7494"
-
-```
-
-```json
-{
-  "mpsas": 22.0388
-}
-```
-
 ### swagger ui
 
 Open the [ui](http://localhost:8000/docs) in a browser.
diff --git a/api/api/data/sources.md b/api/api/data/data.md
similarity index 100%
rename from api/api/data/sources.md
rename to api/api/data/data.md
diff --git a/api/api/model/build.py b/api/api/model/build.py
index ce40306..dfe6c81 100644
--- a/api/api/model/build.py
+++ b/api/api/model/build.py
@@ -1,5 +1,6 @@
+import typing
 from pathlib import Path
-from configparser import ConfigParser
+import tomllib
 
 import astropy.units as u
 import numpy as np
@@ -8,42 +9,6 @@
 from astropy.coordinates import EarthLocation
 from astropy.time import Time
 
-config = ConfigParser()
-config.read(Path(__file__).parent / "config.ini")
-
-max_sqm = config.getint("sqm", "max")
-min_sqm = config.getint("sqm", "min")
-
-csv_filename = config.get("csv", "filename")
-
-path_to_data_dir = Path(__file__).parent.parent / "data"
-path_to_preprocessed_csvs = path_to_data_dir / "globe_at_night"
-csvs = path_to_preprocessed_csvs.glob("*.csv")
-
-dataframes = [
-    pd.read_csv(path, on_bad_lines="skip")
-    for path in csvs
-]
-
-columns_to_drop = ["ID", "SQMSerial", "Constellation", "SkyComment", "LocationComment", "Country"]
-columns_that_must_not_be_na = ["ObsDateTime", "Latitude", "Longitude", "Elevation", "CloudCover", "SQMReading"]
-
-dataframes = [df for df in dataframes if all(c in df.columns for c in columns_to_drop + columns_that_must_not_be_na)]
-df = pd.concat(dataframes, ignore_index=True)
-
-df = df.drop(columns=columns_to_drop)
-df = df.dropna(subset=columns_that_must_not_be_na, how="any", axis=0)
-
-print("dropping rows outside of sqm range")
-df = df[df["SQMReading"] <= max_sqm]
-df = df[df["SQMReading"] >= min_sqm]
-df = df.reset_index()
-
-print(f"building datetime mapping")
-# create utdatetime column in order to form sine-mapped uttimehour
-df["UTDatetime"] = pd.to_datetime(df["ObsDateTime"], utc=True)
-df["UTTimeHour"] = np.sin(2 * np.pi * df["UTDatetime"].dt.hour / 24)
-
 
 def get_moon_altaz(datetime, lat, lon):
     """get moon position in altitude/azimuth"""
@@ -60,7 +25,6 @@ def get_moon_alt_for_row(row: pd.Series):
         lat, lon = row["Latitude"], row["Longitude"]
         altaz = get_moon_altaz(datetime, lat, lon)
         alt = altaz.alt.value
-        print(f"got moon altitude {alt} for row {row.name}")
         return alt
 
     @staticmethod
@@ -69,32 +33,105 @@ def get_moon_az_for_row(row: pd.Series):
         lat, lon = row["Latitude"], row["Longitude"]
         altaz = get_moon_altaz(datetime, lat, lon)
         az = altaz.az.value
-        print(f"got moon azimuth {az} for row {row.name}")
         return az
 
 
-print(f"applying moon data to {df.shape[0]} rows")
-df["MoonAlt"] = df.apply(RowOps.get_moon_alt_for_row, axis=1)
-df["MoonAz"] = df.apply(RowOps.get_moon_az_for_row, axis=1)
-
-
-def get_oktas_from_description(description: str) -> int:
-    """map description of cloud coverage into int"""
-    match description:
-        case "0" | "clear":
-            return 0
-        case "25" | "1/4 of sky":
-            return 2
-        case "50" | "1/2 of sky":
-            return 4
-        case "75" | "over 1/2 of sky":
-            return 6
-        case _:
-            return 8
-
-
-print(f"mapping cloud cover to {df.shape[0]} rows")
-df["CloudCover"] = df["CloudCover"].map(get_oktas_from_description)
-
-print(f"writing file to {path_to_data_dir.as_posix()}")
-df.to_csv(path_to_data_dir / csv_filename, index=False)
+def write_dataframe(
+        csv_source_paths: typing.Iterable[Path],
+        write_path: Path,
+        *,
+        min_sqm: float = float("-inf"),
+        max_sqm: float = float("inf"),
+) -> None:
+    """Merge the source csvs into a single augmented csv for the model to train on.
+
+    Sources missing any expected column are skipped. Rows with a missing required
+    value, or with an SQMReading outside [min_sqm, max_sqm], are dropped. A
+    sine-mapped UT hour, moon altitude/azimuth and okta cloud cover are then added.
+
+    :param csv_source_paths: paths of the csv files to ingest
+    :param write_path: path of the csv file to write
+    :param min_sqm: lowest SQMReading to keep, inclusive (default: no lower bound)
+    :param max_sqm: highest SQMReading to keep, inclusive (default: no upper bound)
+    :raises ValueError: if no source csv has all of the expected columns
+    """
+    # materialize once: a generator can only be consumed once and has no useful repr
+    source_paths = list(csv_source_paths)
+    print(f"reading {len(source_paths)} csvs")
+    dataframes = [pd.read_csv(path, on_bad_lines="skip") for path in source_paths]
+
+    columns_to_drop = ["ID", "SQMSerial", "Constellation", "SkyComment", "LocationComment", "Country"]
+    columns_that_must_not_be_na = ["ObsDateTime", "Latitude", "Longitude", "Elevation", "CloudCover", "SQMReading"]
+    expected_columns = columns_to_drop + columns_that_must_not_be_na
+
+    dataframes = [df for df in dataframes if all(c in df.columns for c in expected_columns)]
+    if not dataframes:
+        # pd.concat would otherwise fail with an opaque "No objects to concatenate"
+        raise ValueError("no source csv contains all of the expected columns")
+    df = pd.concat(dataframes, ignore_index=True)
+
+    df = df.drop(columns=columns_to_drop)
+    df = df.dropna(subset=columns_that_must_not_be_na, how="any", axis=0)
+
+    print("dropping rows outside of sqm range")
+    df = df[df["SQMReading"] <= max_sqm]
+    df = df[df["SQMReading"] >= min_sqm]
+    # NOTE: without drop=True the old row labels are kept as an "index" column in the output
+    df = df.reset_index()
+
+    print("building datetime mapping")
+    # create utdatetime column in order to form sine-mapped uttimehour
+    df["UTDatetime"] = pd.to_datetime(df["ObsDateTime"], utc=True)
+    df["UTTimeHour"] = np.sin(2 * np.pi * df["UTDatetime"].dt.hour / 24)
+
+    print(f"applying moon altitude data to {df.shape[0]} rows")
+    df["MoonAlt"] = df.apply(RowOps.get_moon_alt_for_row, axis=1)
+    print(f"applying moon azimuth data to {df.shape[0]} rows")
+    df["MoonAz"] = df.apply(RowOps.get_moon_az_for_row, axis=1)
+
+    def get_oktas_from_description(description: str) -> int:
+        """map description of cloud coverage into int"""
+        match description:
+            case "0" | "clear":
+                return 0
+            case "25" | "1/4 of sky":
+                return 2
+            case "50" | "1/2 of sky":
+                return 4
+            case "75" | "over 1/2 of sky":
+                return 6
+            case _:
+                return 8
+
+    print(f"mapping cloud cover to {df.shape[0]} rows")
+    df["CloudCover"] = df["CloudCover"].map(get_oktas_from_description)
+
+    df.to_csv(write_path, index=False)
+
+
+if __name__ == "__main__":
+    with open(Path(__file__).parent / "config.toml", "rb") as f:
+        config = tomllib.load(f)
+
+    csv_filename = config["csv"]["filename"]
+
+    path_to_data_dir = Path(__file__).parent.parent / "data"
+
+    path_to_preprocessed_csvs = path_to_data_dir / "globe_at_night"
+    csv_sources = path_to_preprocessed_csvs.glob("*.csv")
+
+    try:
+        print(f"attempting to write {csv_filename}")
+        write_dataframe(
+            csv_sources,
+            path_to_data_dir / csv_filename,
+            min_sqm=config["sqm"]["min"],
+            max_sqm=config["sqm"]["max"],
+        )
+    except Exception as e:
+        import traceback
+
+        print(f"failed to write csv: {e}")
+        print(traceback.format_exc())
+        # exit non-zero so a chained build-then-train run stops after a failed build
+        raise SystemExit(1)
diff --git a/api/api/model/config.ini b/api/api/model/config.ini
deleted file mode 100644
index 6c67335..0000000
--- a/api/api/model/config.ini
+++ /dev/null
@@ -1,9 +0,0 @@
-[sqm]
-max = 22
-min = 16
-
-[csv]
-filename = globe_at_night.csv
-
-[train]
-epochs = 100
\ No newline at end of file
diff --git a/api/api/model/config.toml b/api/api/model/config.toml
new file mode 100644
index 0000000..b940418
--- /dev/null
+++ b/api/api/model/config.toml
@@ -0,0 +1,9 @@
+[sqm]
+max = 22
+min = 16
+
+[csv]
+filename = "globe_at_night.csv"
+
+[train]
+epochs = 100
diff --git a/api/api/model/train.py b/api/api/model/train.py
index 0fdf687..57fe23e 100644
--- a/api/api/model/train.py
+++ b/api/api/model/train.py
@@ -1,5 +1,5 @@
+import tomllib
 from pathlib import Path
-from configparser import ConfigParser
 
 import numpy as np
 import pandas as pd
@@ -7,16 +7,14 @@
 import torch.nn as nn
 from torch.utils.data import DataLoader, TensorDataset, random_split
 
-from ..prediction.constants import features
+from api.prediction.constants import features
 from api.prediction.net.nn import NeuralNetwork
 
-features_size = len(features)
+with open(Path(__file__).parent / "config.toml", "rb") as f:
+    config = tomllib.load(f)
 
-config = ConfigParser()
-config.read(Path(__file__).parent / "config.ini")
-
-csv_filename = config.get("csv", "filename")
-epochs = config.getint("train", "epochs")
+csv_filename = config["csv"]["filename"]
+epochs = config["train"]["epochs"]
 
 path_to_prediction_pkg = Path(__file__).parent.parent / "prediction"
 saved_model_path = path_to_prediction_pkg / "model.pth"
@@ -27,7 +25,7 @@
     raise FileNotFoundError()
 
 df = pd.read_csv(path_to_gan_dataframe)
-print(f"read csv with {len(df)} rows")
+print(f"consumed csv with {len(df)} rows")
 
 torch.set_printoptions(sci_mode=False)
 feature_tensor = torch.tensor(df[features].values.astype(np.float32))
@@ -59,10 +57,10 @@
 
 
 def train_loop(
-    data_loader: DataLoader,
-    model: NeuralNetwork,
-    loss_fn: nn.HuberLoss,
-    optimizer: torch.optim.Adam,
+        data_loader: DataLoader,
+        model: NeuralNetwork,
+        loss_fn: nn.HuberLoss,
+        optimizer: torch.optim.Adam,
 ):
     model.train()
     for batch, (X, y) in enumerate(data_loader):
@@ -78,7 +76,7 @@ def train_loop(
             print(f"loss: {loss:>7f} [{current:>5d}]")
 
 
-def test_model(data_loader: DataLoader, model: NeuralNetwork, loss_fn: nn.HuberLoss):
+def evaluate_model(data_loader: DataLoader, model: NeuralNetwork, loss_fn: nn.HuberLoss):
     model.eval()
 
     with torch.no_grad():
@@ -94,10 +92,10 @@ def test_model(data_loader: DataLoader, model: NeuralNetwork, loss_fn: nn.HuberL
     print(f"starting training with {epochs} epochs, and saving state dict to {saved_model_path}")
 
     for epoch in range(epochs):
-        print(f"epoch {epoch + 1}")
+        print(f"epoch {epoch + 1}/{epochs}")
         train_loop(train_dataloader, model, loss_fn, optimizer)
 
-    test_model(test_dataloader, model, loss_fn)
+    evaluate_model(test_dataloader, model, loss_fn)
 
-    print(f"saving to {saved_model_path}")
+    print(f"saving state dict to {saved_model_path}")
     torch.save(model.state_dict(), saved_model_path)
diff --git a/api/requirements.txt b/api/requirements.txt
index f427b11..5bc387f 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -1,11 +1,11 @@
 astroplan==0.9.1
 astropy==6.0.0
 pandas==2.1.4
-torch==2.1.2
+torch~=2.2.2
 requests==2.31.0
 fastapi~=0.110.2
 httpx==0.26.0
 uvicorn==0.25.0
 pytest==7.4.3
 Pillow~=10.3.0
-numpy~=1.26.2
+numpy~=1.26.4
diff --git a/update-open-meteo.sh b/update-open-meteo.sh
index b996ea7..7a3e586 100755
--- a/update-open-meteo.sh
+++ b/update-open-meteo.sh
@@ -3,7 +3,7 @@
 volume_name="open-meteo-data"
 
 if docker volume ls -q | grep -q "^${volume_name}$"; then
-    echo "$volume_name exists; updating volume"
+    echo "volume $volume_name exists; updating volume"
     docker run -it --rm -v open-meteo-data:/app/data ghcr.io/open-meteo/open-meteo sync ecmwf_ifs04 cloud_cover,temperature_2m 
 else
     echo "$volume_name does not exist"