Add missing EPFL readers (#155)

ltelab · Jan 26, 2023 · a967207 · a967207
1 parent a7db617
commit a967207
Show file tree

Hide file tree

Showing 7 changed files with 432 additions and 15 deletions.
diff --git a/disdrodb/L0/L0A_processing.py b/disdrodb/L0/L0A_processing.py
@@ -615,6 +615,10 @@ def read_L0A_raw_file_list(
             if df_sanitizer_fun is not None:
                 df = df_sanitizer_fun(df, lazy=lazy)
 
+            # Remove duplicated timesteps
+            # - TODO: Log info !!!
+            df = df.drop_duplicates(subset="time", keep="first")
+
             # ------------------------------------------------------.
             # Check column names met DISDRODB standards
             check_L0A_column_names(df, sensor_name=sensor_name)

diff --git a/disdrodb/L0/readers/EPFL/HYMEX_2012.py → disdrodb/L0/readers/EPFL/HYMEX_LTE_SOP2.py b/disdrodb/L0/readers/EPFL/HYMEX_2012.py → disdrodb/L0/readers/EPFL/HYMEX_LTE_SOP2.py
@@ -16,7 +16,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # -----------------------------------------------------------------------------.
-"""Reader for HYMEX campaign."""
+"""Reader for HYMEX SOP2 campaign."""
 from disdrodb.L0 import run_L0
 from disdrodb.L0.L0_processing import reader_generic_docstring, is_documented_by
 
@@ -101,6 +101,10 @@ def df_sanitizer_fun(df, lazy=False):
         # - Convert time column to datetime
         df["time"] = dd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")
 
+        # - Drop rows when "raw_drop_number" is "NA"
+        # --> This is used to drop all rows where all values are "NA"
+        df = df.dropna(subset="raw_drop_number", axis=0)
+
         # - Drop columns not agreeing with DISDRODB L0 standards
         columns_to_drop = [
             "datalogger_debug",

diff --git a/disdrodb/L0/readers/EPFL/HYMEX_LTE_SOP3.py b/disdrodb/L0/readers/EPFL/HYMEX_LTE_SOP3.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2022 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Reader for HYMEX SOP3 campaign."""
+from disdrodb.L0 import run_L0
+from disdrodb.L0.L0_processing import reader_generic_docstring, is_documented_by
+
+
+@is_documented_by(reader_generic_docstring)
+def reader(
+    raw_dir,
+    processed_dir,
+    l0a_processing=True,
+    l0b_processing=True,
+    keep_l0a=False,
+    force=False,
+    verbose=False,
+    debugging_mode=False,
+    lazy=True,
+    single_netcdf=True,
+):
+    # Define the raw file columns, the parsing options and the sanitizer, then call run_L0.
+    ##------------------------------------------------------------------------.
+    #### - Define column names
+    # - When no data are logged (every 30 seconds), all columns (except time) have "NA" values
+    column_names = [
+        "time",
+        "id",
+        "datalogger_temperature",
+        "datalogger_voltage",
+        "rainfall_rate_32bit",
+        "rainfall_accumulated_32bit",
+        "weather_code_synop_4680",
+        "weather_code_synop_4677",
+        "reflectivity_32bit",
+        "mor_visibility",
+        "laser_amplitude",
+        "number_particles",
+        "sensor_temperature",
+        "sensor_heating_current",
+        "sensor_battery_voltage",
+        "sensor_status",
+        "rainfall_amount_absolute_32bit",
+        "datalogger_debug",
+        "raw_drop_concentration",
+        "raw_drop_average_velocity",
+        "raw_drop_number",
+        "datalogger_error",
+    ]
+
+    ##------------------------------------------------------------------------.
+    #### - Define reader options
+    reader_kwargs = {}
+    # - Define delimiter
+    reader_kwargs["delimiter"] = ","
+    # - Prevent the first column from being used as the df index
+    reader_kwargs["index_col"] = False
+    # - Define behaviour when encountering bad lines
+    reader_kwargs["on_bad_lines"] = "skip"
+    # - Define reader engine
+    #   - C engine is faster
+    #   - Python engine is more feature-complete
+    reader_kwargs["engine"] = "python"
+    # - Define on-the-fly decompression of on-disk data
+    #   - Available: gzip, bz2, zip
+    reader_kwargs["compression"] = "infer"
+    # - Strings to recognize as NA/NaN and replace with standard NA flags
+    #   - Already included: ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’,
+    #                       ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘<NA>’, ‘N/A’,
+    #                       ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’
+    reader_kwargs["na_values"] = ["na", "", "error", "-.-", " NA"]
+    # - Define max size of dask dataframe chunks (if lazy=True)
+    #   - If None: use a single block for each file
+    #   - Otherwise: "<max_file_size>MB" by which to cut up larger files
+    reader_kwargs["blocksize"] = None  # "50MB"
+
+    ##------------------------------------------------------------------------.
+    #### - Define dataframe sanitizer function for L0 processing
+    def df_sanitizer_fun(df, lazy=False):
+        # Import dask or pandas (both expose `to_datetime` under the same alias)
+        if lazy:
+            import dask.dataframe as dd
+        else:
+            import pandas as dd
+
+        # - Convert time column to datetime
+        df["time"] = dd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")
+
+        # - Drop rows where "raw_drop_number" is NA (i.e. rows where all values are "NA")
+        # --> No `axis` kwarg (dask's dropna does not accept it); list `subset` also works on pandas < 1.5
+        df = df.dropna(subset=["raw_drop_number"])
+
+        # - Drop columns not agreeing with DISDRODB L0 standards
+        columns_to_drop = [
+            "datalogger_debug",
+            "datalogger_voltage",
+            "id",
+            "datalogger_temperature",
+            "datalogger_error",
+        ]
+        df = df.drop(columns=columns_to_drop)
+        return df
+
+    ##------------------------------------------------------------------------.
+    #### - Define glob pattern to search data files in <raw_dir>/data/<station_id>
+    files_glob_pattern = "*.dat*"
+
+    ####----------------------------------------------------------------------.
+    #### - Create L0 products
+    run_L0(
+        raw_dir=raw_dir,
+        processed_dir=processed_dir,
+        l0a_processing=l0a_processing,
+        l0b_processing=l0b_processing,
+        keep_l0a=keep_l0a,
+        force=force,
+        verbose=verbose,
+        debugging_mode=debugging_mode,
+        lazy=lazy,
+        single_netcdf=single_netcdf,
+        # Custom arguments of the reader
+        files_glob_pattern=files_glob_pattern,
+        column_names=column_names,
+        reader_kwargs=reader_kwargs,
+        df_sanitizer_fun=df_sanitizer_fun,
+    )
diff --git a/disdrodb/L0/readers/EPFL/HYMEX_LTE_SOP4.py b/disdrodb/L0/readers/EPFL/HYMEX_LTE_SOP4.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2022 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Reader for HYMEX SOP4 campaign."""
+from disdrodb.L0 import run_L0
+from disdrodb.L0.L0_processing import reader_generic_docstring, is_documented_by
+
+
+@is_documented_by(reader_generic_docstring)
+def reader(
+    raw_dir,
+    processed_dir,
+    l0a_processing=True,
+    l0b_processing=True,
+    keep_l0a=False,
+    force=False,
+    verbose=False,
+    debugging_mode=False,
+    lazy=True,
+    single_netcdf=True,
+):
+    # Define the raw file columns, the parsing options and the sanitizer, then call run_L0.
+    ##------------------------------------------------------------------------.
+    #### - Define column names
+    # - When no data are logged (every 30 seconds), all columns (except time) have "NA" values
+    column_names = [
+        "time",
+        "id",
+        "datalogger_temperature",
+        "datalogger_voltage",
+        "rainfall_rate_32bit",
+        "rainfall_accumulated_32bit",
+        "weather_code_synop_4680",
+        "weather_code_synop_4677",
+        "reflectivity_32bit",
+        "mor_visibility",
+        "laser_amplitude",
+        "number_particles",
+        "sensor_temperature",
+        "sensor_heating_current",
+        "sensor_battery_voltage",
+        "sensor_status",
+        "rainfall_amount_absolute_32bit",
+        "datalogger_debug",
+        "raw_drop_concentration",
+        "raw_drop_average_velocity",
+        "raw_drop_number",
+        "datalogger_error",
+    ]
+
+    ##------------------------------------------------------------------------.
+    #### - Define reader options
+    reader_kwargs = {}
+    # - Define delimiter
+    reader_kwargs["delimiter"] = ","
+    # - Prevent the first column from being used as the df index
+    reader_kwargs["index_col"] = False
+    # - Define behaviour when encountering bad lines
+    reader_kwargs["on_bad_lines"] = "skip"
+    # - Define reader engine
+    #   - C engine is faster
+    #   - Python engine is more feature-complete
+    reader_kwargs["engine"] = "python"
+    # - Define on-the-fly decompression of on-disk data
+    #   - Available: gzip, bz2, zip
+    reader_kwargs["compression"] = "infer"
+    # - Strings to recognize as NA/NaN and replace with standard NA flags
+    #   - Already included: ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’,
+    #                       ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘<NA>’, ‘N/A’,
+    #                       ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’
+    reader_kwargs["na_values"] = ["na", "", "error", "-.-", " NA"]
+    # - Define max size of dask dataframe chunks (if lazy=True)
+    #   - If None: use a single block for each file
+    #   - Otherwise: "<max_file_size>MB" by which to cut up larger files
+    reader_kwargs["blocksize"] = None  # "50MB"
+
+    ##------------------------------------------------------------------------.
+    #### - Define dataframe sanitizer function for L0 processing
+    def df_sanitizer_fun(df, lazy=False):
+        # Import dask or pandas (both expose `to_datetime` under the same alias)
+        if lazy:
+            import dask.dataframe as dd
+        else:
+            import pandas as dd
+
+        # - Convert time column to datetime
+        df["time"] = dd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")
+
+        # - Drop rows where "raw_drop_number" is NA (i.e. rows where all values are "NA")
+        # --> No `axis` kwarg (dask's dropna does not accept it); list `subset` also works on pandas < 1.5
+        df = df.dropna(subset=["raw_drop_number"])
+
+        # - Drop columns not agreeing with DISDRODB L0 standards
+        columns_to_drop = [
+            "datalogger_debug",
+            "datalogger_voltage",
+            "id",
+            "datalogger_temperature",
+            "datalogger_error",
+        ]
+        df = df.drop(columns=columns_to_drop)
+        return df
+
+    ##------------------------------------------------------------------------.
+    #### - Define glob pattern to search data files in <raw_dir>/data/<station_id>
+    files_glob_pattern = "*.dat*"
+
+    ####----------------------------------------------------------------------.
+    #### - Create L0 products
+    run_L0(
+        raw_dir=raw_dir,
+        processed_dir=processed_dir,
+        l0a_processing=l0a_processing,
+        l0b_processing=l0b_processing,
+        keep_l0a=keep_l0a,
+        force=force,
+        verbose=verbose,
+        debugging_mode=debugging_mode,
+        lazy=lazy,
+        single_netcdf=single_netcdf,
+        # Custom arguments of the reader
+        files_glob_pattern=files_glob_pattern,
+        column_names=column_names,
+        reader_kwargs=reader_kwargs,
+        df_sanitizer_fun=df_sanitizer_fun,
+    )