diff --git a/src/iblphotometry/io.py b/src/iblphotometry/io.py index 3318806..6a5dfd1 100644 --- a/src/iblphotometry/io.py +++ b/src/iblphotometry/io.py @@ -14,6 +14,16 @@ def from_raw_neurophotometrics_file_to_raw_df( path: str | Path, validate=True, version='new' ) -> pd.DataFrame: + """reads in a file as generated by the neurophotometrics FP3002 (both new and old versions) with validation + + Args: + path (str | Path): path to the file, can be in either .csv or .pqt format + validate (bool, optional): If True, validates the file. Defaults to True. + version (str, optional): 'new' or 'old' version of the neurophotometrics file format. Defaults to 'new'. + + Returns: + pd.DataFrame: the data as a raw dataframe format + """ path = Path(path) if isinstance(path, str) else path match path.suffix: case '.csv': @@ -30,6 +40,17 @@ def from_raw_neurophotometrics_file_to_raw_df( def from_raw_neurophotometrics_df_to_ibl_df( raw_df: pd.DataFrame, rois=None, drop_first=True ) -> pd.DataFrame: + """reads in a dataframe with the raw photometry data as generated by the neurophotometrics FP3002 into the ibl photometry data format. + + + Args: + raw_df (pd.DataFrame): as returned by `from_raw_neurophotometrics_file_to_raw_df` + rois (_type_, optional): names of the rois as selected by the user in the acquisition UI. If None, the names are inferred from the data. Defaults to None. + drop_first (bool, optional): Drop the first frame, which has all LEDs on by default. Defaults to True. + + Returns: + pd.DataFrame: the data in the ibl photometry data format + """ if rois is None: rois = infer_data_columns(raw_df) @@ -79,6 +100,17 @@ def from_raw_neurophotometrics_df_to_ibl_df( def from_raw_neurophotometrics_file_to_ibl_df( path: str | Path, drop_first=True, validate=True, version='new' ) -> pd.DataFrame: + """convenience function that chains `from_raw_neurophotometrics_file_to_raw_df` and `from_raw_neurophotometrics_df_to_ibl_df`. 
See the docstrings of those functions for details. + + Args: + path (str | Path): path to the file, can be in either .csv or .pqt format + drop_first (bool, optional): Drop the first frame, which has all LEDs on by default. Defaults to True. + validate (bool, optional): If True, validates the file. Defaults to True. + version (str, optional): 'new' or 'old' version of the neurophotometrics file format. Defaults to 'new'. + + Returns: + pd.DataFrame: the data in the ibl photometry data format + """ raw_df = from_raw_neurophotometrics_file_to_raw_df( path, validate=validate, version=version ) @@ -88,10 +120,10 @@ def from_raw_neurophotometrics_file_to_ibl_df( def from_ibl_pqt_to_ibl_df(path: str | Path, validate=False): + ibl_df = pd.read_parquet(path) if validate is True: - # TODO - raise NotImplementedError - return pd.read_parquet(path) + ibl_df = validate_ibl_dataframe(ibl_df) + return ibl_df def from_ibl_dataframe( @@ -101,13 +133,14 @@ def from_ibl_dataframe( channel_column: str = 'name', channel_names: list[str] | None = None, rename: dict | None = None, + validate: bool = True, ) -> dict: """main function to convert to analysis ready format Args: ibl_df (pd.DataFrame): the dataframe, as stored in the photometry.signal.pqt - data_columns (list[str], optional): The names of the columns in the dataframe that contain the signals of different fibers. By default, they are named RegionXX. If None is provided, All columns that start with `Region` are treated as data columns. Defaults to None. + data_columns (list[str], optional): The names of the columns in the dataframe that contain the signals of different fibers. By default, they are named RegionXX. If None is provided, all columns that start with `Region` or `G` are treated as data columns. Defaults to None. time_column (str, optional): The name of the column that contains the timestamps. If None is provided, it is assumed that `time` is in the name. Defaults to None. channel_column (str, optional): The name of the column that contains. Defaults to 'name'. 
channel_names (list[str], optional): The names of the acquisition channel / frequency bands that are acquired. Defaults to None. @@ -120,6 +153,9 @@ def from_ibl_dataframe( # data_columns is a list of str that specifies the names of the column that hold the actual data, like 'RegionXX' # channel_column is the column that specifies the temporally multiplexed acquisition channels + if validate: + ibl_df = validate_ibl_dataframe(ibl_df) + data_columns = infer_data_columns(ibl_df) if data_columns is None else data_columns # infer name of time column if not provided @@ -152,11 +188,19 @@ def from_ibl_dataframe( def from_ibl_pqt( signal_pqt_path: str | Path, locations_pqt_path: Optional[str | Path] = None, + validate=True, ): - # read from a single pqt - # if both are provided, do both + """reads in photometry data stored in the ibl format as a .pqt file. If provided, uses the metadata stored in the locations.pqt file as well. + + Args: + signal_pqt_path (str | Path): path to the .pqt file that stores the photometry data in the ibl format + locations_pqt_path (Optional[str | Path], optional): path to the locations.pqt file that stores the metadata. Defaults to None. - ibl_df = pd.read_parquet(signal_pqt_path) + Returns: + _type_: _description_ + """ + + ibl_df = from_ibl_pqt_to_ibl_df(signal_pqt_path, validate=validate) if locations_pqt_path is not None: locations_df = pd.read_parquet(locations_pqt_path) return from_ibl_dataframes(ibl_df, locations_df) @@ -195,7 +239,17 @@ def from_ibl_dataframes(ibl_df: pd.DataFrame, locations_df: pd.DataFrame): def from_raw_neurophotometrics_file( path: str | Path, drop_first=True, validate=True, version='new' ) -> dict: - # this one bypasses everything + """reads in a file generated by the neurophotometrics FP3002 into the analysis ready format + + Args: + path (str | Path): path to the file, can be in either .csv or .pqt format + drop_first (bool, optional): Drop the first frame, which has all LEDs on by default. Defaults to True. + validate (bool, optional): If True, validates the file. Defaults to True. 
+ version (str, optional): 'new' or 'old' version of the neurophotometrics file format. Defaults to 'new'. + + Returns: + dict: the data in the analysis ready format + """ ibl_df = from_raw_neurophotometrics_file_to_ibl_df( path, drop_first=drop_first, validate=validate, version=version ) @@ -233,7 +287,19 @@ def read_digital_inputs_csv(path: str | Path, validate=True) -> pd.DataFrame: """ -def validate_ibl_dataframe(df: pd.DataFrame) -> pd.DataFrame: ... +def validate_ibl_dataframe(ibl_df: pd.DataFrame, data_columns=None) -> pd.DataFrame: + data_columns = infer_data_columns(ibl_df) if data_columns is None else data_columns + schema_ibl_data = pandera.DataFrameSchema( + columns=dict( + times=pandera.Column(pandera.Float64), + # valid=pandera.Column(pandera.Bool), # optionally present + wavelength=pandera.Column(pandera.Float64, nullable=True), + name=pandera.Column(pandera.String), + color=pandera.Column(pandera.String), + **{k: pandera.Column(pandera.Float64) for k in data_columns}, + ) + ) + return schema_ibl_data.validate(ibl_df) def validate_neurophotometrics_df( diff --git a/src/iblphotometry_tests/test_loaders.py b/src/iblphotometry_tests/test_loaders.py index 35372d7..9fc66df 100644 --- a/src/iblphotometry_tests/test_loaders.py +++ b/src/iblphotometry_tests/test_loaders.py @@ -50,9 +50,13 @@ def test_from_raw_neurophotometrics_file(self): pd.testing.assert_frame_equal(dfs_a[key], dfs_b[key]) # from pqt files as they are returned from ONE by .load_dataset() - # def test_from_ibl_pqt(self): - # fpio.from_ibl_pqt(self.paths['photometry_signal_pqt']) - # fpio.from_ibl_pqt( - # self.paths['photometry_signal_pqt'], - # self.paths['photometryROI_locations_pqt'], - # ) + def test_from_ibl_pqt(self): + datasets = ['carolina', 'alejandro'] + + for dataset in datasets: + self.set_paths(dataset) + fpio.from_ibl_pqt(self.paths['photometry_signal_pqt']) + fpio.from_ibl_pqt( + self.paths['photometry_signal_pqt'], + self.paths['photometryROI_locations_pqt'], + )