-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
small fix for integrating with iblrig transfer_experiments and validation
- Loading branch information
Showing
2 changed files
with
288 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from pathlib import Path | ||
import warnings | ||
import pandera | ||
from typing import Optional | ||
|
||
from iblphotometry.neurophotometrics import ( | ||
LIGHT_SOURCE_MAP, | ||
LED_STATES, | ||
) | ||
|
||
|
||
def from_raw_neurophotometrics_file_to_raw_df( | ||
path: str | Path, | ||
validate=True, | ||
) -> pd.DataFrame: | ||
path = Path(path) if isinstance(path, str) else path | ||
match path.suffix: | ||
case '.csv': | ||
raw_df = pd.read_csv(path) | ||
case '.pqt': | ||
raw_df = pd.read_parquet(path) | ||
|
||
if validate: | ||
raw_df = validate_neurophotometrics_df(raw_df) | ||
|
||
return raw_df | ||
|
||
|
||
def from_raw_neurophotometrics_df_to_ibl_df(
    raw_df: pd.DataFrame, rois: list[str] | None = None, drop_first: bool = True
) -> pd.DataFrame:
    """Convert a raw neurophotometrics dataframe into the IBL photometry layout.

    The result has one row per acquired frame: the ROI signal columns plus
    'times' (acquisition timestamps), and 'wavelength' / 'name' / 'color'
    describing the acquisition channel, looked up from each frame's LedState.

    Args:
        raw_df (pd.DataFrame): raw dataframe as read from the acquisition file.
        rois (list[str], optional): names of the data columns. If None, they
            are inferred with infer_data_columns. Defaults to None.
        drop_first (bool, optional): drop the first acquired frame. Defaults to True.

    Returns:
        pd.DataFrame: dataframe in the layout of photometry.signal.pqt
    """
    if rois is None:
        rois = infer_data_columns(raw_df)

    # keep only the ROI columns, sorted by column name
    ibl_df = raw_df.filter(items=rois, axis=1).sort_index(axis=1)
    # some files use 'Timestamp' rather than 'SystemTimestamp'
    timestamp_name = (
        'SystemTimestamp' if 'SystemTimestamp' in raw_df.columns else 'Timestamp'
    )
    ibl_df['times'] = raw_df[timestamp_name]
    ibl_df['wavelength'] = np.nan
    ibl_df['name'] = ''
    ibl_df['color'] = ''

    # TODO the names column in channel_meta_map should actually be user defined (experiment description file?)
    channel_meta_map = pd.DataFrame(LIGHT_SOURCE_MAP)
    led_states = pd.DataFrame(LED_STATES).set_index('Condition')
    states = raw_df['LedState']

    for state in states.unique():
        # locate this LedState value in the lookup table; ic is empty if absent
        ir, ic = np.where(led_states == state)
        # if not present, multiple LEDs are active
        if ic.size == 0:
            # find row
            ir = np.argmax(led_states['No LED ON'] > state) - 1
            # find active combo
            # NOTE(review): assumes a multi-LED frame's LedState equals the sum
            # of the corresponding single-LED state values - TODO confirm
            possible_led_combos = [(1, 2), (1, 3), (2, 3), (1, 2, 3)]
            for combo in possible_led_combos:  # drop enumerate
                if state == sum([led_states.iloc[ir, c] for c in combo]):
                    # label combined channels by joining individual names/colors
                    name = '+'.join([channel_meta_map['name'][c] for c in combo])
                    color = '+'.join([channel_meta_map['color'][c] for c in combo])
                    wavelength = np.nan
                    ibl_df.loc[states == state, ['name', 'color', 'wavelength']] = (
                        name,
                        color,
                        wavelength,
                    )
        else:
            # single LED active: copy name/color/wavelength from the matching meta row
            for cn in ['name', 'color', 'wavelength']:
                ibl_df.loc[states == state, cn] = channel_meta_map.iloc[ic[0]][cn]

    # drop first frame
    # NOTE(review): reset_index() keeps the old index as an 'index' column -
    # confirm whether reset_index(drop=True) was intended
    if drop_first:
        ibl_df = ibl_df.iloc[1:].reset_index()

    return ibl_df
|
||
|
||
def from_raw_neurophotometrics_file_to_ibl_df(
    path: str | Path,
    drop_first=True,
    validate=True,
) -> pd.DataFrame:
    """Load a raw neurophotometrics file and convert it to the IBL layout.

    Args:
        path (str | Path): path to the raw acquisition file (.csv or .pqt).
        drop_first (bool, optional): drop the first acquired frame. Defaults to True.
        validate (bool, optional): validate the raw dataframe against the
            pandera schema before conversion. Defaults to True.

    Returns:
        pd.DataFrame: dataframe in the layout of photometry.signal.pqt
    """
    # read (and optionally schema-validate) the raw recording, then reshape it
    raw_df = from_raw_neurophotometrics_file_to_raw_df(path, validate=validate)
    return from_raw_neurophotometrics_df_to_ibl_df(raw_df, drop_first=drop_first)
|
||
|
||
def from_ibl_pqt_to_ibl_df(path: str | Path, validate=False): | ||
if validate is True: | ||
# TODO | ||
raise NotImplementedError | ||
return pd.read_parquet(path) | ||
|
||
|
||
def from_ibl_dataframe( | ||
ibl_df: pd.DataFrame, | ||
data_columns: list[str] | None = None, | ||
time_column: str | None = None, | ||
channel_column: str = 'name', | ||
channel_names: list[str] | None = None, | ||
rename: dict | None = None, | ||
) -> dict: | ||
"""main function to convert to analysis ready format | ||
Args: | ||
ibl_df (pd.DataFrame): the dataframe, as stored in the photometry.signal.pqt | ||
data_columns (list[str], optional): The names of the columns in the dataframe that contain the signals of different fibers. By default, they are named RegionXX. If None is provided, All columns that start with `Region` are treated as data columns. Defaults to None. | ||
time_column (str, optional): The name of the column that contains the timestamps. If None is provided, it is assumed that `time` is in the name. Defaults to None. | ||
channel_column (str, optional): The name of the column that contains. Defaults to 'name'. | ||
channel_names (list[str], optional): The names of the acquisition channel / frequency bands that are acquired. Defaults to None. | ||
rename (dict, optional): a renaming map that maps the names of the columns to brain areas. Example: {'RegionXX':'DMS'}. Defaults to None. | ||
Returns: | ||
dict: A dict with the keys being the names of the acquisition channels, the values being nap.TsdFrames with the columns containing the data of the different fibers | ||
""" | ||
# from a raw dataframe as it is stored in ONE (signal.pqt) | ||
# data_columns is a list of str that specifies the names of the column that hold the actual data, like 'RegionXX' | ||
# channel_column is the column that specifies the temporally multiplexed acquisition channels | ||
|
||
data_columns = infer_data_columns(ibl_df) if data_columns is None else data_columns | ||
|
||
# infer name of time column if not provided | ||
if time_column is None: | ||
time_columns = [col for col in ibl_df.columns if 'time' in col.lower()] | ||
assert len(time_columns) == 1 | ||
time_column = time_columns[0] | ||
|
||
# infer channel names if they are not explicitly provided | ||
if channel_names is None: | ||
channel_names = ibl_df[channel_column].unique() | ||
|
||
# drop empty acquisition channels | ||
to_drop = ['None', ''] | ||
channel_names = [ch for ch in channel_names if ch not in to_drop] | ||
|
||
dfs = {} | ||
for channel in channel_names: | ||
# get the data for the band | ||
df = ibl_df.groupby(channel_column).get_group(channel) | ||
# if rename dict is passed, rename Region0X to the corresponding brain region | ||
if rename is not None: | ||
df = df.rename(columns=rename) | ||
data_columns = rename.values() | ||
dfs[channel] = df.set_index(time_column)[data_columns] | ||
|
||
return dfs | ||
|
||
|
||
def from_ibl_pqt(
    signal_pqt_path: str | Path,
    locations_pqt_path: Optional[str | Path] = None,
):
    """Load photometry data from pqt files into the analysis-ready dict format.

    Args:
        signal_pqt_path (str | Path): path to photometry.signal.pqt.
        locations_pqt_path (str | Path, optional): path to the corresponding
            photometryROI.locations.pqt. If provided, data columns are renamed
            to brain regions; otherwise a warning is emitted and defaults are
            used. Defaults to None.

    Returns:
        dict: see from_ibl_dataframe.
    """
    ibl_df = pd.read_parquet(signal_pqt_path)

    if locations_pqt_path is None:
        # without ROI locations we cannot map columns to brain regions
        warnings.warn(
            'loading a photometry.signal.pqt file without its corresponding photometryROI.locations.pqt'
        )
        return from_ibl_dataframe(
            ibl_df,
            data_columns=None,
            time_column='times',
            channel_column='name',
            rename=None,
        )

    locations_df = pd.read_parquet(locations_pqt_path)
    return from_ibl_dataframes(ibl_df, locations_df)
|
||
|
||
def from_ibl_dataframes(ibl_df: pd.DataFrame, locations_df: pd.DataFrame):
    """Convert signal + ROI-locations dataframes into the analysis-ready dict.

    Args:
        ibl_df (pd.DataFrame): contents of photometry.signal.pqt.
        locations_df (pd.DataFrame): contents of photometryROI.locations.pqt,
            indexed by ROI/column name with a 'brain_region' column.

    Returns:
        dict: see from_ibl_dataframe.
    """
    # bug fix: a stray trailing comma previously wrapped the column list in a
    # one-element tuple - data_columns must be a plain list of column names
    data_columns = list(locations_df.index)
    # map ROI column names (index) to brain regions
    rename = locations_df['brain_region'].to_dict()

    read_config = dict(
        data_columns=data_columns,
        time_column='times',
        channel_column='name',
        rename=rename,
    )

    return from_ibl_dataframe(ibl_df, **read_config)
|
||
|
||
def from_raw_neurophotometrics_file(
    path: str | Path,
    drop_first=True,
    validate=True,
) -> dict:
    """One-stop loader: raw neurophotometrics file straight to the analysis-ready dict.

    Args:
        path (str | Path): path to the raw acquisition file (.csv or .pqt).
        drop_first (bool, optional): drop the first acquired frame. Defaults to True.
        validate (bool, optional): validate the raw dataframe against the
            pandera schema. Defaults to True.

    Returns:
        dict: see from_ibl_dataframe.
    """
    # convert the raw file to the IBL layout, then split by acquisition channel
    ibl_df = from_raw_neurophotometrics_file_to_ibl_df(
        path, drop_first=drop_first, validate=validate
    )
    return from_ibl_dataframe(ibl_df, time_column='times', channel_column='name')
|
||
def read_digital_inputs_csv(path: str | Path, | ||
validate=True) -> pd.DataFrame: | ||
|
||
df_digital_inputs = pd.read_csv(path, header=None) | ||
df_digital_inputs.columns = ['ChannelName', 'Channel', 'AlwaysTrue', 'SystemTimestamp', 'ComputerTimestamp'] | ||
if validate: | ||
df_digital_inputs = validate_neurophotometrics_digital_inputs(df_digital_inputs) | ||
return df_digital_inputs | ||
|
||
""" | ||
## ## ### ## #### ######## ### ######## #### ####### ## ## | ||
## ## ## ## ## ## ## ## ## ## ## ## ## ## ### ## | ||
## ## ## ## ## ## ## ## ## ## ## ## ## ## #### ## | ||
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## | ||
## ## ######### ## ## ## ## ######### ## ## ## ## ## #### | ||
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ### | ||
### ## ## ######## #### ######## ## ## ## #### ####### ## ## | ||
""" | ||
|
||
|
||
def validate_ibl_dataframe(df: pd.DataFrame) -> pd.DataFrame: ...  # TODO: stub - validation for IBL-format dataframes is not implemented yet
|
||
|
||
def validate_neurophotometrics_df(
    df: pd.DataFrame,
    data_columns=None,
) -> pd.DataFrame:
    """Validate a raw neurophotometrics dataframe against its pandera schema.

    Args:
        df (pd.DataFrame): raw dataframe as read from the acquisition file.
        data_columns (list[str], optional): names of the signal columns; if
            None they are inferred with infer_data_columns. Defaults to None.

    Returns:
        pd.DataFrame: the validated (and possibly coerced) dataframe.
    """
    if data_columns is None:
        data_columns = infer_data_columns(df)

    # fixed acquisition columns, plus one Float64 column per fiber signal
    columns = {
        'FrameCounter': pandera.Column(pandera.Int64),
        'SystemTimestamp': pandera.Column(pandera.Float64),
        'LedState': pandera.Column(pandera.Int16, coerce=True),
        'ComputerTimestamp': pandera.Column(pandera.Float64),
    }
    columns.update({col: pandera.Column(pandera.Float64) for col in data_columns})

    return pandera.DataFrameSchema(columns=columns).validate(df)
|
||
|
||
def validate_neurophotometrics_digital_inputs(df: pd.DataFrame) -> pd.DataFrame:
    """Validate a digital-inputs dataframe against its pandera schema.

    Args:
        df (pd.DataFrame): digital input events, see read_digital_inputs_csv.

    Returns:
        pd.DataFrame: the validated (and possibly coerced) dataframe.
    """
    column_schemas = {
        'ChannelName': pandera.Column(str, coerce=True),
        'Channel': pandera.Column(pandera.Int8, coerce=True),
        'AlwaysTrue': pandera.Column(bool, coerce=True),
        'SystemTimestamp': pandera.Column(pandera.Float64),
        'ComputerTimestamp': pandera.Column(pandera.Float64),
    }
    schema = pandera.DataFrameSchema(columns=column_schemas)
    return schema.validate(df)
|
||
|
||
def infer_data_columns(df: pd.DataFrame) -> list[str]:
    """Return the columns of *df* that hold fiber signal data.

    Hacky parser that currently deals with the naming inconsistency between the
    two extraction pipelines ('RegionXX' vs 'G...' columns), see
    https://github.com/int-brain-lab/ibl-photometry/issues/35
    """
    return [col for col in df.columns if col.startswith(('Region', 'G'))]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters