diff --git a/RCSdata/RCS_gcms_data_analysis.py b/RCSdata/RCS_gcms_data_analysis.py index 8425655..76333a4 100644 --- a/RCSdata/RCS_gcms_data_analysis.py +++ b/RCSdata/RCS_gcms_data_analysis.py @@ -5,6 +5,7 @@ @author Matteo Pecchi (mp933@cornell.edu). """ + # ============================================================================= # # necessary packages, install them using conda (not pip) # ============================================================================= @@ -17,35 +18,41 @@ from rdkit import Chem from rdkit.Chem import DataStructs from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect -from gcms_data_analysis import Project, figure_create, figure_save +from gcms_data_analysis import Project +from gcms_data_analysis.plotting import plot_ave_std, MyFigure + +folder_path = plib.Path(plib.Path(__file__).parent, "data") + -folder_path = plib.Path(plib.Path(__file__).parent, 'data') -#%% +# %% def get_calibration_error(name0, name1, calibration, xrange=[10, 200], steps=100): - cols_cal_area = [c for c in list(calibration) if 'Area' in c] - cols_cal_ppms = [c for c in list(calibration) if 'PPM' in c] - calibration[cols_cal_area + cols_cal_ppms] = \ - calibration[cols_cal_area + cols_cal_ppms].apply(pd.to_numeric, - errors='coerce') + cols_cal_area = [c for c in list(calibration) if "Area" in c] + cols_cal_ppms = [c for c in list(calibration) if "PPM" in c] + calibration[cols_cal_area + cols_cal_ppms] = calibration[ + cols_cal_area + cols_cal_ppms + ].apply(pd.to_numeric, errors="coerce") cal_areas0 = calibration.loc[name0, cols_cal_area].to_numpy(dtype=float) cal_ppms0 = calibration.loc[name0, cols_cal_ppms].to_numpy(dtype=float) # linear fit of calibration curve (exclude nan), get ppm from area - fit0 = np.polyfit(cal_areas0[~np.isnan(cal_areas0)], - cal_ppms0[~np.isnan(cal_ppms0)], 1) + fit0 = np.polyfit( + cal_areas0[~np.isnan(cal_areas0)], cal_ppms0[~np.isnan(cal_ppms0)], 1 + ) cal_areas1 = calibration.loc[name1, cols_cal_area].to_numpy(dtype=float) cal_ppms1 = calibration.loc[name1, cols_cal_ppms].to_numpy(dtype=float) # linear fit of calibration curve (exclude nan), get ppm from area - fit1 = np.polyfit(cal_areas1[~np.isnan(cal_areas1)], - cal_ppms1[~np.isnan(cal_ppms1)], 1) + fit1 = np.polyfit( + cal_areas1[~np.isnan(cal_areas1)], cal_ppms1[~np.isnan(cal_ppms1)], 1 + ) x = np.arange(xrange[0], xrange[1], steps) line0 = np.poly1d(fit0)(x) line1 = np.poly1d(fit1)(x) - mse = np.mean((line0 - line1)**2) - mse = np.average(abs(line0-line1)/line1)*100 + mse = np.mean((line0 - line1) ** 2) + mse = np.average(abs(line0 - line1) / line1) * 100 return mse -#%% + +# %% Project.set_folder_path(folder_path) # Set the base folder path for the project's data files @@ -65,7 +72,7 @@ def get_calibration_error(name0, name1, calibration, xrange=[10, 200], steps=100 # Load calibration data for standard and derivatized samples, and determine if they are derivatized calibrations, is_calibr_deriv = gcms.load_calibrations() -c1, c2 = calibrations['calibration88'], calibrations['deriv_calibration11'] +c1, c2 = calibrations["calibration88"], calibrations["deriv_calibration11"] # Generate a comprehensive list of all compounds found across samples list_of_all_compounds = gcms.create_list_of_all_compounds() @@ -87,7 +94,7 @@ def get_calibration_error(name0, name1, calibration, xrange=[10, 200], steps=100 files, is_files_deriv = gcms.apply_calibration_to_files() # Extract specific files for detailed analysis or further operations -f11, f22, f33 = files['A_1'], files['Ader_1'], files['B_1'] +f11, 
f22, f33 = files["A_1"], files["Ader_1"], files["B_1"] # Add statistical information to the files_info DataFrame, such as mean, median, and standard deviation for each file files_info = gcms.add_stats_to_files_info() @@ -97,105 +104,194 @@ def get_calibration_error(name0, name1, calibration, xrange=[10, 200], steps=100 # Create samples and their standard deviations from the files, storing the results in dictionaries samples, samples_std = gcms.create_samples_from_files() -s1, s2, s3 = samples['A'], samples['Ader'], samples['B'] -sd1, sd2, sd3 = samples_std['A'], samples_std['Ader'], samples_std['B'] - -# Add statistical information to the samples_info DataFrame, enhancing the initial analysis with statistical data -samples_info = gcms.add_stats_to_samples_info() +s1, s2, s3 = samples["A"], samples["Ader"], samples["B"] +sd1, sd2, sd3 = samples_std["A"], samples_std["Ader"], samples_std["B"] # Generate reports for specific parameters (e.g., concentration, mass fraction) for files and samples -rep_files_conc = gcms.create_files_param_report(param='conc_vial_mg_L') -rep_files_fr = gcms.create_files_param_report(param='fraction_of_sample_fr') -rep_samples_conc, rep_samples_conc_std = gcms.create_samples_param_report(param='conc_vial_mg_L') -rep_samples_fr, rep_samples_fr_std = gcms.create_samples_param_report(param='fraction_of_sample_fr') +rep_files_conc = gcms.create_files_param_report(param="conc_vial_mg_L") +rep_files_fr = gcms.create_files_param_report(param="fraction_of_sample_fr") +rep_samples_conc, rep_samples_conc_std = gcms.create_samples_param_report( + param="conc_vial_mg_L" +) +rep_samples_fr, rep_samples_fr_std = gcms.create_samples_param_report( + param="fraction_of_sample_fr" +) # Generate aggregated reports based on functional groups for files and samples, for specific parameters -agg_files_conc = gcms.create_files_param_aggrrep(param='conc_vial_mg_L') -agg_files_fr = gcms.create_files_param_aggrrep(param='fraction_of_sample_fr') -agg_samples_conc, agg_samples_conc_std = gcms.create_samples_param_aggrrep(param='conc_vial_mg_L') -agg_samples_fr, agg_samples_fr_std = gcms.create_samples_param_aggrrep(param='fraction_of_sample_fr') +agg_files_conc = gcms.create_files_param_aggrrep(param="conc_vial_mg_L") +agg_files_fr = gcms.create_files_param_aggrrep(param="fraction_of_sample_fr") +agg_samples_conc, agg_samples_conc_std = gcms.create_samples_param_aggrrep( + param="conc_vial_mg_L" +) +agg_samples_fr, agg_samples_fr_std = gcms.create_samples_param_aggrrep( + param="fraction_of_sample_fr" +) # %% Plotting results based on the generated reports, allowing for visual comparison of average values and standard deviations # Plot results for individual files or samples based -gcms.plot_ave_std(param='fraction_of_sample_fr', min_y_thresh=0.05, files_or_samples='files', - legend_location='outside', xlab_rot=30, filename='sample_fraction_files', +mf = plot_ave_std( + gcms, + width=8, + height=4.5, + param="fraction_of_sample_fr", + min_y_thresh=0.05, + files_or_samples="files", + legend_location="best", + x_label_rotation=30, + filename="sample_fraction_files", # only_samples_to_plot=['A_1', 'A_2', 'Ader_1', 'B_2'], - y_lim=[0, .3], annotate_lttrs='a' - ) -gcms.plot_ave_std(param='fraction_of_sample_fr', min_y_thresh=0.05, files_or_samples='samples', - legend_location='outside', xlab_rot=0, filename='sample_fraction_samples', + y_lim=[0, 0.4], + annotate_lttrs="a", + annotate_lttrs_xy=(-0.08, -0.08), +) +# %% +mf = plot_ave_std( + gcms, + width=7, + height=4.5, + 
param="fraction_of_sample_fr", + min_y_thresh=0.05, + files_or_samples="samples", + legend_location="outside", + filename="sample_fraction_samples", # only_samples_to_plot=['A_1', 'A_2', 'Ader_1', 'B_2'], - y_lim=[0, .3], annotate_lttrs='b' - ) -#%% + y_lim=[0, 0.3], + annotate_lttrs="b", + annotate_lttrs_xy=(-0.2, -0.05), +) +# %% # plot results bases on aggreport -gcms.plot_ave_std(param='fraction_of_sample_fr', aggr=True, files_or_samples='files', - filename='sample_fraction_aggr_files', xlab_rot=30, annotate_lttrs='c', - min_y_thresh=0.01, #yt_sum=True, - y_lim=[0, 1], color_palette='Set2') -gcms.plot_ave_std(param='fraction_of_sample_fr', aggr=True, files_or_samples='samples', - filename='sample_fraction_aggr_samples', annotate_lttrs='d', - min_y_thresh=0.01, #yt_sum=True, - y_lim=[0, 1], color_palette='Set2') -#%% -gcms.plot_ave_std(param='fraction_of_sample_fr', min_y_thresh=0.01, - legend_location='outside', only_samples_to_plot=['A', 'Ader', 'B'], - y_lim=[0, 0.3] - ) +mf = plot_ave_std( + gcms, + width=7, + height=4.5, + param="fraction_of_sample_fr", + aggr=True, + files_or_samples="files", + filename="sample_fraction_aggr_files", + x_label_rotation=30, + annotate_lttrs="c", + min_y_thresh=0.01, # yt_sum=True, + y_lim=[0, 1], + color_palette="Set2", + annotate_lttrs_xy=(-0.08, -0.08), +) +# %% +mf = plot_ave_std( + gcms, + width=4.5, + height=4.5, + param="fraction_of_sample_fr", + aggr=True, + files_or_samples="samples", + filename="sample_fraction_aggr_samples", + annotate_lttrs="d", + min_y_thresh=0.01, # yt_sum=True, + y_lim=[0, 1], + color_palette="Set2", + annotate_lttrs_xy=(-0.15, -0.05), +) +# %% +mf = plot_ave_std( + gcms, + width=8, + height=4.5, + param="fraction_of_sample_fr", + min_y_thresh=0.01, + legend_location="outside", + only_samples_to_plot=["A", "Ader", "B"], + y_lim=[0, 0.3], +) # %% plot results bases on aggreport -gcms.plot_ave_std(param='fraction_of_sample_fr', aggr=True, min_y_thresh=0.01, - y_lim=[0, .5], color_palette='Set2') +mf = plot_ave_std( + gcms, + width=4.5, + height=4.5, + param="fraction_of_sample_fr", + aggr=True, + min_y_thresh=0.01, + y_lim=[0, 0.5], + color_palette="Set2", +) -#%% +# %% run_tanimoto_analysis = True if run_tanimoto_analysis: in_path = folder_path - out_path_cal = plib.Path(folder_path, 'output_tanimoto') + out_path_cal = plib.Path(folder_path, "output_tanimoto") out_path_cal.mkdir(parents=True, exist_ok=True) - calibration = pd.read_excel(plib.Path(in_path, 'calibration88.xlsx'), - engine='openpyxl', index_col='Name') + calibration = pd.read_excel( + plib.Path(in_path, "calibration88.xlsx"), engine="openpyxl", index_col="Name" + ) combs = combinations(calibration.index.tolist(), 2) - tanimoto_error = pd.DataFrame(columns=['CalErr', 'DistMW', 'TanimS'], index=range(3915)) + tanimoto_error = pd.DataFrame( + columns=["CalErr", "DistMW", "TanimS"], index=range(3915) + ) for c, (name0, name1) in enumerate(combs): - tanimoto_error.loc[c, 'CalErr'] = get_calibration_error(name0, name1, calibration) - tanimoto_error.loc[c, 'DistMW'] = abs(calibration.loc[name0,'MW'] - calibration.loc[name1,'MW']) + tanimoto_error.loc[c, "CalErr"] = get_calibration_error( + name0, name1, calibration + ) + tanimoto_error.loc[c, "DistMW"] = abs( + calibration.loc[name0, "MW"] - calibration.loc[name1, "MW"] + ) try: - smis = [calibration.loc[name0, 'canonical_smiles'], - calibration.loc[name1, 'canonical_smiles']] + smis = [ + calibration.loc[name0, "canonical_smiles"], + calibration.loc[name1, "canonical_smiles"], + ] mols = [Chem.MolFromSmiles(smi) 
for smi in smis] fps = [GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in mols] # perform Tanimoto similarity - tanimoto_error.loc[c, 'TanimS'] = DataStructs.TanimotoSimilarity(fps[0], fps[1]) + tanimoto_error.loc[c, "TanimS"] = DataStructs.TanimotoSimilarity( + fps[0], fps[1] + ) except TypeError: - tanimoto_error.loc[c, 'TanimS'] = np.nan - tanimoto_error.to_excel(plib.Path(out_path_cal, 'tanimoto_error.xlsx')) - fig, ax, axt, fig_par = figure_create(rows=1, cols=1, plot_type=0, hgt_mltp=1.2, - paper_col=1.4) - - aa = ax[0].scatter(tanimoto_error['TanimS'].values, tanimoto_error['CalErr'].values, - c=tanimoto_error['DistMW'].values) - ax[0].set_yscale('log') - plt.colorbar(aa, label=r'$\Delta$MW [atomic mass unit]') - plt.hlines(y=100, xmin=0, xmax=1, color='grey', linestyle='dotted') - plt.vlines(x=.4, ymin=0, ymax=100, color='grey', linestyle='dashed') - ax[0].annotate('default\nsetting', ha='left', va='bottom', - xycoords='axes fraction', - xy=(0.3, .01)) - ax[0].annotate('Error = 100%', ha='left', va='bottom', - xycoords='axes fraction', - xy=(0.8, .6)) - figure_save('tanimoto_error', out_path_cal, fig, ax, axt, fig_par, - x_lab='Tanimoto Similarity [-]', x_lim=[0, 1], y_lab='Average error [%]', - legend=None, tight_layout=True) + tanimoto_error.loc[c, "TanimS"] = np.nan + tanimoto_error.to_excel(plib.Path(out_path_cal, "tanimoto_error.xlsx")) + myfig = MyFigure( + rows=1, + cols=1, + width=7, + height=6, + x_lab="Tanimoto Similarity [-]", + x_lim=[0, 1], + y_lab="Average error [%]", + ) + # fig, ax, axt, fig_par = figure_create(rows=1, cols=1, plot_type=0, hgt_mltp=1.2, + # paper_col=1.4) + + aa = myfig.axs[0].scatter( + tanimoto_error["TanimS"].values, + tanimoto_error["CalErr"].values, + c=tanimoto_error["DistMW"].values, + ) + myfig.axs[0].set_yscale("log") + plt.colorbar(aa, label=r"$\Delta$MW [atomic mass unit]") + plt.hlines(y=100, xmin=0, xmax=1, color="grey", linestyle="dotted") + plt.vlines(x=0.4, ymin=0, ymax=100, color="grey", linestyle="dashed") + myfig.axs[0].annotate( + "default\nsetting", + ha="left", + va="bottom", + xycoords="axes fraction", + xy=(0.3, 0.01), + ) + myfig.axs[0].annotate( + "Error = 100%", ha="left", va="bottom", xycoords="axes fraction", xy=(0.8, 0.6) + ) + myfig.save_figure(filename="tanimoto_error", out_path=out_path_cal) + # figure_save('tanimoto_error', out_path_cal, fig, ax, axt, fig_par, + # x_lab='Tanimoto Similarity [-]', x_lim=[0, 1], y_lab='Average error [%]', + # legend=None, tight_layout=True) # create and export the similarity table for tetradecanoic acid - cpmnds = gcms.compounds_properties.set_index('iupac_name') - cpmnds = cpmnds[~cpmnds.index.duplicated(keep='first')].copy() + cpmnds = gcms.compounds_properties.set_index("iupac_name") + cpmnds = cpmnds[~cpmnds.index.duplicated(keep="first")].copy() iupac = cpmnds.index[0] - mws = [cpmnds.loc[iupac, 'molecular_weight']] - smis = [cpmnds.loc[iupac, 'canonical_smiles']] + mws = [cpmnds.loc[iupac, "molecular_weight"]] + smis = [cpmnds.loc[iupac, "canonical_smiles"]] names_cal = [iupac] # then add all properties for all calibrated compounds # if the sample was not derivatized (default) @@ -203,28 +299,28 @@ def get_calibration_error(name0, name1, calibration, xrange=[10, 200], steps=100 for c in cpmnds.index.tolist()[1:6]: names_cal.append(c) # print(df_comps.index) - smis.append(cpmnds.loc[c, 'canonical_smiles']) - mws.append(cpmnds.loc[c, 'molecular_weight']) + smis.append(cpmnds.loc[c, "canonical_smiles"]) + mws.append(cpmnds.loc[c, "molecular_weight"]) # calculate the 
delta mw with all calib compounds - delta_mw = np.abs(np.asarray(mws)[0] - - np.asarray(mws)[1:]) + delta_mw = np.abs(np.asarray(mws)[0] - np.asarray(mws)[1:]) # get mols and fingerprints from rdkit for each comp mols = [Chem.MolFromSmiles(smi) for smi in smis] - fps = [GetMorganFingerprintAsBitVect(ml, 2, nBits=1024) - for ml in mols] + fps = [GetMorganFingerprintAsBitVect(ml, 2, nBits=1024) for ml in mols] # perform Tanimoto similarity betwenn the first and all # other compounds s = DataStructs.BulkTanimotoSimilarity(fps[0], fps[1:]) # create a df with results - df_sim = pd.DataFrame(data={'name': names_cal[1:], - 'smiles': smis[1:], 'Similarity': s, 'delta_mw': delta_mw}) + df_sim = pd.DataFrame( + data={ + "name": names_cal[1:], + "smiles": smis[1:], + "Similarity": s, + "delta_mw": delta_mw, + } + ) # put the index title as the comp - df_sim.set_index('name', inplace=True) + df_sim.set_index("name", inplace=True) df_sim.index.name = iupac # sort values based on similarity and delta mw - df_sim = df_sim.sort_values(['Similarity', 'delta_mw'], - ascending=[False, True]) - df_sim.to_excel(plib.Path(out_path_cal, 'similarity_table_tetradecanoic.xlsx')) - -# %% - + df_sim = df_sim.sort_values(["Similarity", "delta_mw"], ascending=[False, True]) + df_sim.to_excel(plib.Path(out_path_cal, "similarity_table_tetradecanoic.xlsx")) diff --git a/example/data/classifications_codes_fractions.xlsx b/example/data/classifications_codes_fractions.xlsx index 5f69cee..ab959e7 100644 Binary files a/example/data/classifications_codes_fractions.xlsx and b/example/data/classifications_codes_fractions.xlsx differ diff --git a/example/example_gcms_data_analysis.py b/example/example_gcms_data_analysis.py index de0124a..4a8131d 100644 --- a/example/example_gcms_data_analysis.py +++ b/example/example_gcms_data_analysis.py @@ -1,15 +1,16 @@ # Import necessary libraries import pathlib as plib # Used for handling file and directory paths -from gcms_data_analysis import ( - Project, -) # Import the Project class from the gcms_data_analysis package +from gcms_data_analysis import Project +from gcms_data_analysis.plotting import plot_ave_std # Define the folder path where your data is located. Change this path to where you've stored your data files. # folder_path = plib.Path(plib.Path(__file__).parent, "example\data") folder_path = plib.Path( r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\example\data" ) - +# folder_path: plib.Path = plib.Path( +# r"C:\Users\mp933\OneDrive - Cornell University\Python\GCMS\NNDNDD" +# ) # Set global configurations for the Project class. # These configurations affect all instances of the class. 
Project.set_folder_path( @@ -17,7 +18,7 @@ ) # Set the base folder path for the project's data files Project.set_plot_grid(False) # Disable grid lines in plots for a cleaner look Project.set_plot_font("Sans") # Set the font style for plots to 'Sans' - +Project.set_auto_save_to_excel(False) # Initialize a Project instance to manage and analyze GCMS data gcms = Project() @@ -41,8 +42,8 @@ list_of_all_deriv_compounds = gcms.create_list_of_all_deriv_compounds() # Load properties for standard and derivatized compounds from provided files -compounds_properties = gcms.load_compounds_properties() -deriv_compounds_properties = gcms.load_deriv_compounds_properties() +compounds_properties = gcms.create_compounds_properties() +deriv_compounds_properties = gcms.create_deriv_compounds_properties() # Flag indicating whether new compounds have been added, triggering a need to regenerate properties data new_files_with_new_compounds_added = False @@ -56,7 +57,7 @@ # Extract specific files for detailed analysis or further operations f11, f22, f33 = files["A_1"], files["Ader_1"], files["B_1"] -# Add statistical information to the files_info DataFrame, such as mean, median, and standard deviation for each file +# # Add statistical information to the files_info DataFrame, such as mean, median, and standard deviation for each file files_info = gcms.add_stats_to_files_info() # Create a samples_info DataFrame without applying calibration data, for initial analysis @@ -90,7 +91,8 @@ # Plotting results based on the generated reports, allowing for visual comparison of average values and standard deviations # Plot results for individual files or samples based -gcms.plot_ave_std( +plot_ave_std( + gcms, param="fraction_of_sample_fr", min_y_thresh=0, files_or_samples="files", @@ -98,7 +100,8 @@ only_samples_to_plot=["A_1", "A_2", "Ader_1", "Ader_2"], # y_lim=[0, 5000] ) # plot results bases on aggreport -gcms.plot_ave_std( +plot_ave_std( + gcms, param="fraction_of_sample_fr", aggr=True, files_or_samples="files", @@ -107,17 +110,61 @@ color_palette="Set2", ) -gcms.plot_ave_std( +plot_ave_std( + gcms, param="fraction_of_sample_fr", min_y_thresh=0, legend_location="outside", only_samples_to_plot=["A", "Ader"], # y_lim=[0, 5000] ) # plot results bases on aggreport -gcms.plot_ave_std( +plot_ave_std( + gcms, param="fraction_of_sample_fr", aggr=True, min_y_thresh=0.01, y_lim=[0, 0.5], color_palette="Set2", ) + +# %% +# import pickle + +# folder_path: plib.Path = plib.Path(r"C:\Users\mp933\Desktop\New folder") +# pickle_path: plib.Path = plib.Path(folder_path, "pickle_object.pkl") +# with open(pickle_path, "wb") as output_file: +# pickle.dump(gcms, output_file) +# %% +# import pickle +# import pathlib as plib # Used for handling file and directory paths +# from gcms_data_analysis import ( +# Project, +# ) # Import the Project class from the gcms_data_analysis package + +# folder_path: plib.Path = plib.Path(r"C:\Users\mp933\Desktop\New folder") +# pickle_path: plib.Path = plib.Path(folder_path, "pickle_object.pkl") +# with open(pickle_path, "rb") as input_file: +# gcms: Project = pickle.load(input_file) +# from gcms_data_analysis.plotting import plot_pave_std + +# # %% +# myfig = plot_pave_std( +# gcms, +# files_or_samples="files", +# width=12, +# height=5, +# legend_location="outside", +# y_lim=[0, 100], +# ) +# # %% +# myfig = plot_pave_std( +# gcms, +# files_or_samples="samples", +# width=6, +# height=6, +# legend_location="best", +# y_lim=[0, 100], +# min_y_thresh=10, +# ) + +# # %% diff --git 
a/example/example_minimal_case.py b/example/example_minimal_case.py new file mode 100644 index 0000000..ce81bd2 --- /dev/null +++ b/example/example_minimal_case.py @@ -0,0 +1,220 @@ +# %% Import necessary libraries +import pathlib as plib # Used for handling file and directory paths +from gcms_data_analysis import Project +from gcms_data_analysis.plotting import plot_ave_std + +# Define the folder path where your data is located. Change this path to where you've stored your data files. +# folder_path = plib.Path(plib.Path(__file__).parent, "example\data") +folder_path = plib.Path( + r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\tests\data_minimal_case" +) +# folder_path: plib.Path = plib.Path( +# r"C:\Users\mp933\OneDrive - Cornell University\Python\GCMS\NNDNDD" +# ) +# Set global configurations for the Project class. +# These configurations affect all instances of the class. +Project.set_folder_path( + folder_path +) # Set the base folder path for the project's data files +Project.set_plot_grid(False) # Disable grid lines in plots for a cleaner look +Project.set_plot_font("Sans") # Set the font style for plots to 'Sans' +Project.set_auto_save_to_excel(False) +# Initialize a Project instance to manage and analyze GCMS data +gcms = Project() + +# Load metadata from a user-provided 'files_info.xlsx' file, or generate it from .txt GC-MS files if not provided +files_info = gcms.load_files_info() +# Load individual GCMS .txt files as pandas DataFrames +files = gcms.load_all_files() +files = gcms.add_iupac_to_files() +list_of_all_compounds = gcms.create_list_of_all_compounds() +files, is_files_deriv = gcms.apply_calibration_to_files() +samples_info, samples_info_std = gcms.create_samples_info() +samples, samples_std = gcms.create_samples_from_files() + +params = [ + "height", + "area", + "area_if_undiluted", + "conc_vial_mg_L", + "conc_vial_if_undiluted_mg_L", + "fraction_of_sample_fr", + "fraction_of_feedstock_fr", +] +for param in params: + _ = gcms.create_files_param_report(param) + _ = gcms.create_files_param_aggrrep(param) + + _, _ = gcms.create_samples_param_report(param) + _, _ = gcms.create_samples_param_aggrrep(param) + +# %% +for param in params: + print(f"'{param}': ") + print_checked_df_to_script_text_with_arrays(gcms.files_reports[param]) +# %% + +for param in params: + print(f"'{param}': ") + print_checked_df_to_script_text_with_arrays(gcms.files_aggrreps[param]) +# %% +for param in params: + print(f"'{param}': ") + print_checked_df_to_script_text_with_arrays(gcms.samples_reports[param]) +# %% +for param in params: + print(f"'{param}': ") + print_checked_df_to_script_text_with_arrays(gcms.samples_reports_std[param]) +# %% + +for param in params: + print(f"'{param}': ") + print_checked_df_to_script_text_with_arrays(gcms.samples_aggrreps[param]) +# %% + +for param in params: + print(f"'{param}': ") + print_checked_df_to_script_text_with_arrays(gcms.samples_aggrreps_std[param]) +# %% + + +# Load classification codes and mass fractions for functional groups from a provided file +class_code_frac = gcms.load_class_code_frac() + +# Load calibration data for standard and derivatized samples, and determine if they are derivatized +calibrations, is_calibr_deriv = gcms.load_calibrations() +# c1, c2 = calibrations["calibration"], calibrations["deriv_calibration"] + +# Generate a comprehensive list of all compounds found across samples +list_of_all_compounds = gcms.create_list_of_all_compounds() + +# Similarly, create a list of all derivatized compounds found across 
samples +list_of_all_deriv_compounds = gcms.create_list_of_all_deriv_compounds() + +# Load properties for standard and derivatized compounds from provided files +compounds_properties = gcms.create_compounds_properties() +deriv_compounds_properties = gcms.create_deriv_compounds_properties() + +# Flag indicating whether new compounds have been added, triggering a need to regenerate properties data +new_files_with_new_compounds_added = False +if new_files_with_new_compounds_added: + compounds_properties = gcms.create_compounds_properties() + deriv_compounds_properties = gcms.create_deriv_compounds_properties() + +# Apply calibration data to all loaded files, adjusting compound concentrations based on calibration curves +files, is_files_deriv = gcms.apply_calibration_to_files() + +# Extract specific files for detailed analysis or further operations +f11, f22, f33 = files["A_1"], files["Ader_1"], files["B_1"] + +# # Add statistical information to the files_info DataFrame, such as mean, median, and standard deviation for each file +files_info = gcms.add_stats_to_files_info() + +# Create a samples_info DataFrame without applying calibration data, for initial analysis +samples_info_0 = gcms.create_samples_info() + +# Create samples and their standard deviations from the files, storing the results in dictionaries +samples, samples_std = gcms.create_samples_from_files() +s1, s2, s3 = samples["A"], samples["Ader"], samples["B"] +sd1, sd2, sd3 = samples_std["A"], samples_std["Ader"], samples_std["B"] + +# Generate reports for specific parameters (e.g., concentration, mass fraction) for files and samples +rep_files_conc = gcms.create_files_param_report(param="conc_vial_mg_L") +rep_files_fr = gcms.create_files_param_report(param="fraction_of_sample_fr") +rep_samples_conc, rep_samples_conc_std = gcms.create_samples_param_report( + param="conc_vial_mg_L" +) +rep_samples_fr, rep_samples_fr_std = gcms.create_samples_param_report( + param="fraction_of_sample_fr" +) + +# Generate aggregated reports based on functional groups for files and samples, for specific parameters +agg_files_conc = gcms.create_files_param_aggrrep(param="conc_vial_mg_L") +agg_files_fr = gcms.create_files_param_aggrrep(param="fraction_of_sample_fr") +agg_samples_conc, agg_samples_conc_std = gcms.create_samples_param_aggrrep( + param="conc_vial_mg_L" +) +agg_samples_fr, agg_samples_fr_std = gcms.create_samples_param_aggrrep( + param="fraction_of_sample_fr" +) + +# Plotting results based on the generated reports, allowing for visual comparison of average values and standard deviations +# Plot results for individual files or samples based + +plot_ave_std( + gcms, + param="fraction_of_sample_fr", + min_y_thresh=0, + files_or_samples="files", + legend_location="outside", + only_samples_to_plot=["A_1", "A_2", "Ader_1", "Ader_2"], # y_lim=[0, 5000] +) +# plot results bases on aggreport +plot_ave_std( + gcms, + param="fraction_of_sample_fr", + aggr=True, + files_or_samples="files", + min_y_thresh=0.01, + y_lim=[0, 0.5], + color_palette="Set2", +) + +plot_ave_std( + gcms, + param="fraction_of_sample_fr", + min_y_thresh=0, + legend_location="outside", + only_samples_to_plot=["A", "Ader"], # y_lim=[0, 5000] +) +# plot results bases on aggreport +plot_ave_std( + gcms, + param="fraction_of_sample_fr", + aggr=True, + min_y_thresh=0.01, + y_lim=[0, 0.5], + color_palette="Set2", +) + +# %% +# import pickle + +# folder_path: plib.Path = plib.Path(r"C:\Users\mp933\Desktop\New folder") +# pickle_path: plib.Path = plib.Path(folder_path, "pickle_object.pkl") 
+# with open(pickle_path, "wb") as output_file: +# pickle.dump(gcms, output_file) +# %% +# import pickle +# import pathlib as plib # Used for handling file and directory paths +# from gcms_data_analysis import ( +# Project, +# ) # Import the Project class from the gcms_data_analysis package + +# folder_path: plib.Path = plib.Path(r"C:\Users\mp933\Desktop\New folder") +# pickle_path: plib.Path = plib.Path(folder_path, "pickle_object.pkl") +# with open(pickle_path, "rb") as input_file: +# gcms: Project = pickle.load(input_file) +# from gcms_data_analysis.plotting import plot_pave_std + +# # %% +# myfig = plot_pave_std( +# gcms, +# files_or_samples="files", +# width=12, +# height=5, +# legend_location="outside", +# y_lim=[0, 100], +# ) +# # %% +# myfig = plot_pave_std( +# gcms, +# files_or_samples="samples", +# width=6, +# height=6, +# legend_location="best", +# y_lim=[0, 100], +# min_y_thresh=10, +# ) + +# # %% diff --git a/example/name_to_properties/data_name_to_properties/checked_compounds_properties.xlsx b/example/name_to_properties/data_name_to_properties/checked_compounds_properties.xlsx new file mode 100644 index 0000000..360c5b1 Binary files /dev/null and b/example/name_to_properties/data_name_to_properties/checked_compounds_properties.xlsx differ diff --git a/tests/minimal_set/classifications_codes_fractions.xlsx b/example/name_to_properties/data_name_to_properties/classifications_codes_fractions.xlsx similarity index 50% rename from tests/minimal_set/classifications_codes_fractions.xlsx rename to example/name_to_properties/data_name_to_properties/classifications_codes_fractions.xlsx index 5f69cee..f446d7e 100644 Binary files a/tests/minimal_set/classifications_codes_fractions.xlsx and b/example/name_to_properties/data_name_to_properties/classifications_codes_fractions.xlsx differ diff --git a/example/name_to_properties/example_name_to_properties.py b/example/name_to_properties/example_name_to_properties.py new file mode 100644 index 0000000..a55f707 --- /dev/null +++ b/example/name_to_properties/example_name_to_properties.py @@ -0,0 +1,86 @@ +import pathlib as plib +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.transforms import blended_transform_factory +import seaborn as sns +import ele +import pubchempy as pcp +from gcms_data_analysis.fragmenter import Fragmenter + +from gcms_data_analysis import name_to_properties + + +folder_path = plib.Path( + r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\tests\data_name_to_properties" +) +# %% +classifications_codes_fractions = pd.read_excel( + plib.Path( + folder_path, + "classifications_codes_fractions.xlsx", + ) +) +checked_compounds_properties = pd.read_excel( + plib.Path( + folder_path, + "checked_compounds_properties.xlsx", + ), + index_col="comp_name", +) +dict_cl_to_codes: dict[str, str] = dict( + zip( + classifications_codes_fractions.classes.tolist(), + classifications_codes_fractions.codes.tolist(), + ) +) +dict_cl_to_mass_fractions: dict[str, float] = dict( + zip( + classifications_codes_fractions.classes.tolist(), + classifications_codes_fractions.mfs.tolist(), + ) +) +# %% + +compounds = [ + "2-methylcyclopent-2-en-1-one", # small ketone + "hexadecanoic acid", + "n-hexadecanoic acid", # different names same compounds + "phenol", # ring + "phenol", # repeated compound + "carbolic acid", # same iupac but different comp_name + "2,4,5-trichlorophenol", # clorine (udentified) + "phenoxytrimethylsilane", # silane (not listed in fg) + "bromophenol", # Br not listed 
+ "9-octadecenoic acid, 1,2,3-propanetriyl ester, (e,e,e)-", # large compound + "name_not_on_pcp", # test for legit string that gives no pcp result + " ", # wrong entry or datatype + None, + False, + np.nan, +] + +list_of_compound_properties: list[pd.DataFrame] = [] +for compound in compounds: + print(compound) + n2p = name_to_properties(compound, dict_cl_to_codes, dict_cl_to_mass_fractions) + list_of_compound_properties.append(n2p) + +# %% +to_check = pd.DataFrame() +for compound in compounds: + print(compound) + to_check = name_to_properties( + compound, + dict_cl_to_codes, + dict_cl_to_mass_fractions, + to_check, + ) + +to_check.to_excel( + plib.Path( + r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\tests\data_name_to_properties", + "checked_compounds_properties.xlsx", + ) +) +# %% diff --git a/pyproject.toml b/pyproject.toml index c01bbcb..e716113 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ classifiers = [ "Operating System :: OS Independent", ] -requires-python = ">=3.6" +requires-python = ">=3.8" [project.urls] Homepage = "https://github.com/mpecchi/gcms_data_analysis/tree/main" diff --git a/pytest.ini b/pytest.ini index 4530dfc..e4fa8c1 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,4 +6,4 @@ filterwarnings = markers = slow: marks tests as slow (deselect with '-m "not slow"') -addopts = --cov=gcms_data_analysis --cov-report=html:docs/_coverage_html \ No newline at end of file +addopts = -s \ No newline at end of file diff --git a/scripts/utils.py b/scripts/utils.py index beae60e..12b08d9 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -13,6 +13,9 @@ # %% +from collections.abc import Iterable + + def print_checked_df_to_script_text_with_arrays(df): # Convert the DataFrame to a dictionary with 'list' orientation df_dict = df.to_dict(orient="list") diff --git a/src/gcms_data_analysis/__init__.py b/src/gcms_data_analysis/__init__.py index 4e8e883..aea19f1 100644 --- a/src/gcms_data_analysis/__init__.py +++ b/src/gcms_data_analysis/__init__.py @@ -1 +1 @@ -from .main import Fragmenter, Project, figure_create, figure_save, name_to_properties +from .main import Project, name_to_properties diff --git a/src/gcms_data_analysis/fragmenter.py b/src/gcms_data_analysis/fragmenter.py new file mode 100644 index 0000000..c89e20c --- /dev/null +++ b/src/gcms_data_analysis/fragmenter.py @@ -0,0 +1,776 @@ +import marshal +from rdkit import Chem +from rdkit.Chem import DataStructs +from rdkit.Chem import rdmolops +from rdkit.Chem.AllChem import ( + GetMorganFingerprintAsBitVect, +) # pylint: disable=no-name-in-module + + +class Fragmenter: + """ + Class taken from https://github.com/simonmb/fragmentation_algorithm. + The original version of this algorithm was published in: + "Flexible Heuristic Algorithm for Automatic Molecule Fragmentation: + Application to the UNIFAC Group Contribution Model + DOI: 10.1186/s13321-019-0382-39." + MIT License + + ... + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+ """ + + # tested with Python 3.8.8 and RDKit version 2021.09.4 + + # does a substructure match and then checks whether the match + # is adjacent to previous matches + @classmethod + def get_substruct_matches( + cls, + mol_searched_for, + mol_searched_in, + atomIdxs_to_which_new_matches_have_to_be_adjacent, + ): + + valid_matches = [] + + if mol_searched_in.GetNumAtoms() >= mol_searched_for.GetNumAtoms(): + matches = mol_searched_in.GetSubstructMatches(mol_searched_for) + + if matches: + for match in matches: + add_this_match = True + if len(atomIdxs_to_which_new_matches_have_to_be_adjacent) > 0: + add_this_match = False + + for i in match: + for neighbor in mol_searched_in.GetAtomWithIdx( + i + ).GetNeighbors(): + if ( + neighbor.GetIdx() + in atomIdxs_to_which_new_matches_have_to_be_adjacent + ): + add_this_match = True + break + + if add_this_match: + valid_matches.append(match) + + return valid_matches + + # count heavier isotopes of hydrogen correctly + @classmethod + def get_heavy_atom_count(cls, mol): + heavy_atom_count = 0 + for atom in mol.GetAtoms(): + if atom.GetAtomicNum() != 1: + heavy_atom_count += 1 + + return heavy_atom_count + + def __init__( + self, + fragmentation_scheme={}, + fragmentation_scheme_order=None, + match_hydrogens=False, + algorithm="", + n_atoms_cuttoff=-1, + function_to_choose_fragmentation=False, + n_max_fragmentations_to_find=-1, + ): + + if not type(fragmentation_scheme) is dict: + raise TypeError( + "fragmentation_scheme must be a dctionary with integers as keys and either strings or list of strings as values." + ) + + if len(fragmentation_scheme) == 0: + raise ValueError("fragmentation_scheme must be provided.") + + if not algorithm in ["simple", "complete", "combined"]: + raise ValueError("Algorithm must be either simple ,complete or combined.") + + if algorithm == "simple": + if n_max_fragmentations_to_find != -1: + raise ValueError( + "Setting n_max_fragmentations_to_find only makes sense with complete or combined algorithm." + ) + + self.algorithm = algorithm + + if algorithm in ["combined", "complete"]: + if n_atoms_cuttoff == -1: + raise ValueError( + "n_atoms_cuttoff needs to be specified for complete or combined algorithms." + ) + + if function_to_choose_fragmentation == False: + raise ValueError( + "function_to_choose_fragmentation needs to be specified for complete or combined algorithms." + ) + + if not callable(function_to_choose_fragmentation): + raise TypeError( + "function_to_choose_fragmentation needs to be a function." + ) + else: + if type(function_to_choose_fragmentation([{}, {}])) != dict: + raise TypeError( + "function_to_choose_fragmentation needs to take a list of fragmentations and choose one of it" + ) + + if n_max_fragmentations_to_find != -1: + if n_max_fragmentations_to_find < 1: + raise ValueError( + "n_max_fragmentations_to_find has to be 1 or higher." 
+ ) + + if fragmentation_scheme_order is None: + fragmentation_scheme_order = [] + + if algorithm in ["simple", "combined"]: + assert len(fragmentation_scheme) == len(fragmentation_scheme_order) + else: + fragmentation_scheme_order = [key for key in fragmentation_scheme.keys()] + + self.n_max_fragmentations_to_find = n_max_fragmentations_to_find + + self.n_atoms_cuttoff = n_atoms_cuttoff + + self.match_hydrogens = match_hydrogens + + self.fragmentation_scheme = fragmentation_scheme + + self.function_to_choose_fragmentation = function_to_choose_fragmentation + + # create a lookup dictionaries to faster finding a group number + self._fragmentation_scheme_group_number_lookup = {} + self._fragmentation_scheme_pattern_lookup = {} + self.fragmentation_scheme_order = fragmentation_scheme_order + + for group_number, list_SMARTS in fragmentation_scheme.items(): + + if type(list_SMARTS) is not list: + list_SMARTS = [list_SMARTS] + + for SMARTS in list_SMARTS: + if SMARTS != "": + self._fragmentation_scheme_group_number_lookup[SMARTS] = ( + group_number + ) + + mol_SMARTS = Chem.MolFromSmarts(SMARTS) + self._fragmentation_scheme_pattern_lookup[SMARTS] = mol_SMARTS + + def fragment(self, SMILES_or_molecule): + + if type(SMILES_or_molecule) is str: + mol_SMILES = Chem.MolFromSmiles(SMILES_or_molecule) + mol_SMILES = Chem.AddHs(mol_SMILES) if self.match_hydrogens else mol_SMILES + is_valid_SMILES = mol_SMILES is not None + + if not is_valid_SMILES: + raise ValueError("Following SMILES is not valid: " + SMILES_or_molecule) + + else: + mol_SMILES = SMILES_or_molecule + + # iterate over all separated molecules + success = [] + fragmentation = {} + fragmentation_matches = {} + for mol in rdmolops.GetMolFrags(mol_SMILES, asMols=True): + + this_mol_fragmentation, this_mol_success = self.__get_fragmentation(mol) + + for SMARTS, matches in this_mol_fragmentation.items(): + group_number = self._fragmentation_scheme_group_number_lookup[SMARTS] + + if not group_number in fragmentation: + fragmentation[group_number] = 0 + fragmentation_matches[group_number] = [] + + fragmentation[group_number] += len(matches) + fragmentation_matches[group_number].extend(matches) + + success.append(this_mol_success) + + return fragmentation, all(success), fragmentation_matches + + def fragment_complete(self, SMILES_or_molecule): + + if type(SMILES_or_molecule) is str: + mol_SMILES = Chem.MolFromSmiles(SMILES_or_molecule) + mol_SMILES = Chem.AddHs(mol_SMILES) if self.match_hydrogens else mol_SMILES + is_valid_SMILES = mol_SMILES is not None + + if not is_valid_SMILES: + raise ValueError("Following SMILES is not valid: " + SMILES_or_molecule) + + else: + mol_SMILES = SMILES_or_molecule + + if len(rdmolops.GetMolFrags(mol_SMILES)) != 1: + raise ValueError( + "fragment_complete does not accept multifragment molecules." 
+ ) + + temp_fragmentations, success = self.__complete_fragmentation(mol_SMILES) + + fragmentations = [] + fragmentations_matches = [] + for temp_fragmentation in temp_fragmentations: + fragmentation = {} + fragmentation_matches = {} + for SMARTS, matches in temp_fragmentation.items(): + group_number = self._fragmentation_scheme_group_number_lookup[SMARTS] + + fragmentation[group_number] = len(matches) + fragmentation_matches[group_number] = matches + + fragmentations.append(fragmentation) + fragmentations_matches.append(fragmentation_matches) + + return fragmentations, success, fragmentations_matches + + def __get_fragmentation(self, mol_SMILES): + + success = False + fragmentation = {} + if self.algorithm in ["simple", "combined"]: + fragmentation, success = self.__simple_fragmentation(mol_SMILES) + + if success: + return fragmentation, success + + if self.algorithm in ["combined", "complete"]: + fragmentations, success = self.__complete_fragmentation(mol_SMILES) + + if success: + fragmentation = self.function_to_choose_fragmentation(fragmentations) + + return fragmentation, success + + def __simple_fragmentation(self, mol_SMILES): + + if self.match_hydrogens: + target_atom_count = len(mol_SMILES.GetAtoms()) + else: + target_atom_count = Fragmenter.get_heavy_atom_count(mol_SMILES) + + success = False + fragmentation = {} + + fragmentation, atomIdxs_included_in_fragmentation = ( + self.__search_non_overlapping_solution(mol_SMILES, {}, set(), set()) + ) + success = len(atomIdxs_included_in_fragmentation) == target_atom_count + + # if not successful, clean up molecule and search again + level = 1 + while not success: + fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far = ( + Fragmenter.__clean_molecule_surrounding_unmatched_atoms( + mol_SMILES, fragmentation, atomIdxs_included_in_fragmentation, level + ) + ) + level += 1 + + if len(atomIdxs_included_in_fragmentation_so_far) == 0: + break + + fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far = ( + self.__search_non_overlapping_solution( + mol_SMILES, + fragmentation_so_far, + atomIdxs_included_in_fragmentation_so_far, + atomIdxs_included_in_fragmentation_so_far, + ) + ) + + success = ( + len(atomIdxs_included_in_fragmentation_so_far) == target_atom_count + ) + + if success: + fragmentation = fragmentation_so_far + + return fragmentation, success + + def __search_non_overlapping_solution( + self, + mol_searched_in, + fragmentation, + atomIdxs_included_in_fragmentation, + atomIdxs_to_which_new_matches_have_to_be_adjacent, + ): + + n_atomIdxs_included_in_fragmentation = ( + len(atomIdxs_included_in_fragmentation) - 1 + ) + + while n_atomIdxs_included_in_fragmentation != len( + atomIdxs_included_in_fragmentation + ): + n_atomIdxs_included_in_fragmentation = len( + atomIdxs_included_in_fragmentation + ) + + for group_number in self.fragmentation_scheme_order: + list_SMARTS = self.fragmentation_scheme[group_number] + if type(list_SMARTS) is not list: + list_SMARTS = [list_SMARTS] + + for SMARTS in list_SMARTS: + if SMARTS != "": + fragmentation, atomIdxs_included_in_fragmentation = ( + self.__get_next_non_overlapping_match( + mol_searched_in, + SMARTS, + fragmentation, + atomIdxs_included_in_fragmentation, + atomIdxs_to_which_new_matches_have_to_be_adjacent, + ) + ) + + return fragmentation, atomIdxs_included_in_fragmentation + + def __get_next_non_overlapping_match( + self, + mol_searched_in, + SMARTS, + fragmentation, + atomIdxs_included_in_fragmentation, + atomIdxs_to_which_new_matches_have_to_be_adjacent, + ): + + 
mol_searched_for = self._fragmentation_scheme_pattern_lookup[SMARTS] + + if atomIdxs_to_which_new_matches_have_to_be_adjacent: + matches = Fragmenter.get_substruct_matches( + mol_searched_for, + mol_searched_in, + atomIdxs_to_which_new_matches_have_to_be_adjacent, + ) + else: + matches = Fragmenter.get_substruct_matches( + mol_searched_for, mol_searched_in, set() + ) + + if matches: + for match in matches: + all_atoms_of_new_match_are_unassigned = ( + atomIdxs_included_in_fragmentation.isdisjoint(match) + ) + + if all_atoms_of_new_match_are_unassigned: + if not SMARTS in fragmentation: + fragmentation[SMARTS] = [] + + fragmentation[SMARTS].append(match) + atomIdxs_included_in_fragmentation.update(match) + + return fragmentation, atomIdxs_included_in_fragmentation + + @classmethod + def __clean_molecule_surrounding_unmatched_atoms( + cls, mol_searched_in, fragmentation, atomIdxs_included_in_fragmentation, level + ): + + for i in range(0, level): + + atoms_missing = set( + range(0, Fragmenter.get_heavy_atom_count(mol_searched_in)) + ).difference(atomIdxs_included_in_fragmentation) + + new_fragmentation = marshal.loads(marshal.dumps(fragmentation)) + + for atomIdx in atoms_missing: + for neighbor in mol_searched_in.GetAtomWithIdx(atomIdx).GetNeighbors(): + for smart, atoms_found in fragmentation.items(): + for atoms in atoms_found: + if neighbor.GetIdx() in atoms: + if smart in new_fragmentation: + if new_fragmentation[smart].count(atoms) > 0: + new_fragmentation[smart].remove(atoms) + + if smart in new_fragmentation: + if len(new_fragmentation[smart]) == 0: + new_fragmentation.pop(smart) + + new_atomIdxs_included_in_fragmentation = set() + for i in new_fragmentation.values(): + for j in i: + new_atomIdxs_included_in_fragmentation.update(j) + + atomIdxs_included_in_fragmentation = new_atomIdxs_included_in_fragmentation + fragmentation = new_fragmentation + + return fragmentation, atomIdxs_included_in_fragmentation + + def __complete_fragmentation(self, mol_SMILES): + + heavy_atom_count = Fragmenter.get_heavy_atom_count(mol_SMILES) + + if heavy_atom_count > self.n_atoms_cuttoff: + return {}, False + + completed_fragmentations = [] + groups_leading_to_incomplete_fragmentations = [] + ( + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + incomplete_fragmentation_found, + ) = self.__get_next_non_overlapping_adjacent_match_recursively( + mol_SMILES, + heavy_atom_count, + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + {}, + set(), + set(), + self.n_max_fragmentations_to_find, + ) + success = len(completed_fragmentations) > 0 + + return completed_fragmentations, success + + def __get_next_non_overlapping_adjacent_match_recursively( + self, + mol_searched_in, + heavy_atom_count, + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + fragmentation_so_far, + atomIdxs_included_in_fragmentation_so_far, + atomIdxs_to_which_new_matches_have_to_be_adjacent, + n_max_fragmentations_to_find=-1, + ): + + n_completed_fragmentations = len(completed_fragmentations) + incomplete_fragmentation_found = False + complete_fragmentation_found = False + + if len(completed_fragmentations) == n_max_fragmentations_to_find: + return ( + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + incomplete_fragmentation_found, + ) + + for group_number in self.fragmentation_scheme_order: + list_SMARTS = self.fragmentation_scheme[group_number] + + if complete_fragmentation_found: + break + + if type(list_SMARTS) is not list: + list_SMARTS 
= [list_SMARTS] + + for SMARTS in list_SMARTS: + if complete_fragmentation_found: + break + + if SMARTS != "": + matches = Fragmenter.get_substruct_matches( + self._fragmentation_scheme_pattern_lookup[SMARTS], + mol_searched_in, + atomIdxs_included_in_fragmentation_so_far, + ) + + for match in matches: + + # only allow non-overlapping matches + all_atoms_are_unassigned = ( + atomIdxs_included_in_fragmentation_so_far.isdisjoint(match) + ) + if not all_atoms_are_unassigned: + continue + + # only allow matches that do not contain groups leading to incomplete matches + for ( + groups_leading_to_incomplete_fragmentation + ) in groups_leading_to_incomplete_fragmentations: + if Fragmenter.__is_fragmentation_subset_of_other_fragmentation( + groups_leading_to_incomplete_fragmentation, + fragmentation_so_far, + ): + return ( + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + incomplete_fragmentation_found, + ) + + # only allow matches that will lead to new fragmentations + use_this_match = True + n_found_groups = len(fragmentation_so_far) + + for completed_fragmentation in completed_fragmentations: + + if not SMARTS in completed_fragmentation: + continue + + if n_found_groups == 0: + use_this_match = not Fragmenter.__is_match_contained_in_fragmentation( + match, SMARTS, completed_fragmentation + ) + else: + if Fragmenter.__is_fragmentation_subset_of_other_fragmentation( + fragmentation_so_far, completed_fragmentation + ): + use_this_match = not Fragmenter.__is_match_contained_in_fragmentation( + match, SMARTS, completed_fragmentation + ) + + if not use_this_match: + break + + if not use_this_match: + continue + + # make a deepcopy here, otherwise the variables are modified down the road + # marshal is used here because it works faster than copy.deepcopy + this_SMARTS_fragmentation_so_far = marshal.loads( + marshal.dumps(fragmentation_so_far) + ) + this_SMARTS_atomIdxs_included_in_fragmentation_so_far = ( + atomIdxs_included_in_fragmentation_so_far.copy() + ) + + if not SMARTS in this_SMARTS_fragmentation_so_far: + this_SMARTS_fragmentation_so_far[SMARTS] = [] + + this_SMARTS_fragmentation_so_far[SMARTS].append(match) + this_SMARTS_atomIdxs_included_in_fragmentation_so_far.update( + match + ) + + # only allow matches that do not contain groups leading to incomplete matches + for ( + groups_leading_to_incomplete_match + ) in groups_leading_to_incomplete_fragmentations: + if Fragmenter.__is_fragmentation_subset_of_other_fragmentation( + groups_leading_to_incomplete_match, + this_SMARTS_fragmentation_so_far, + ): + use_this_match = False + break + + if not use_this_match: + continue + + # if the complete molecule has not been fragmented, continue to do so + if ( + len(this_SMARTS_atomIdxs_included_in_fragmentation_so_far) + < heavy_atom_count + ): + ( + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + incomplete_fragmentation_found, + ) = self.__get_next_non_overlapping_adjacent_match_recursively( + mol_searched_in, + heavy_atom_count, + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + this_SMARTS_fragmentation_so_far, + this_SMARTS_atomIdxs_included_in_fragmentation_so_far, + this_SMARTS_atomIdxs_included_in_fragmentation_so_far, + n_max_fragmentations_to_find, + ) + break + + # if the complete molecule has been fragmented, save and return + if ( + len(this_SMARTS_atomIdxs_included_in_fragmentation_so_far) + == heavy_atom_count + ): + completed_fragmentations.append( + this_SMARTS_fragmentation_so_far + ) + 
complete_fragmentation_found = True + break + + # if until here no new fragmentation was found check whether an incomplete fragmentation was found + if n_completed_fragmentations == len(completed_fragmentations): + + if not incomplete_fragmentation_found: + + incomplete_matched_groups = {} + + if len(atomIdxs_included_in_fragmentation_so_far) > 0: + unassignes_atom_idx = set(range(0, heavy_atom_count)).difference( + atomIdxs_included_in_fragmentation_so_far + ) + for atom_idx in unassignes_atom_idx: + neighbor_atoms_idx = [ + i.GetIdx() + for i in mol_searched_in.GetAtomWithIdx( + atom_idx + ).GetNeighbors() + ] + + for neighbor_atom_idx in neighbor_atoms_idx: + for ( + found_smarts, + found_matches, + ) in fragmentation_so_far.items(): + for found_match in found_matches: + if neighbor_atom_idx in found_match: + if ( + not found_smarts + in incomplete_matched_groups + ): + incomplete_matched_groups[found_smarts] = [] + + if ( + found_match + not in incomplete_matched_groups[ + found_smarts + ] + ): + incomplete_matched_groups[ + found_smarts + ].append(found_match) + + is_subset_of_groups_already_found = False + indexes_to_remove = [] + + for idx, groups_leading_to_incomplete_match in enumerate( + groups_leading_to_incomplete_fragmentations + ): + is_subset_of_groups_already_found = ( + Fragmenter.__is_fragmentation_subset_of_other_fragmentation( + incomplete_matched_groups, + groups_leading_to_incomplete_match, + ) + ) + if is_subset_of_groups_already_found: + indexes_to_remove.append(idx) + + for index in sorted(indexes_to_remove, reverse=True): + del groups_leading_to_incomplete_fragmentations[index] + + groups_leading_to_incomplete_fragmentations.append( + incomplete_matched_groups + ) + groups_leading_to_incomplete_fragmentations = sorted( + groups_leading_to_incomplete_fragmentations, key=len + ) + + incomplete_fragmentation_found = True + + return ( + completed_fragmentations, + groups_leading_to_incomplete_fragmentations, + incomplete_fragmentation_found, + ) + + @classmethod + def __is_fragmentation_subset_of_other_fragmentation( + cls, fragmentation, other_fragmentation + ): + n_found_groups = len(fragmentation) + n_found_other_groups = len(other_fragmentation) + + if n_found_groups == 0: + return False + + if n_found_other_groups < n_found_groups: + return False + + n_found_SMARTS_that_are_subset = 0 + for found_SMARTS, _ in fragmentation.items(): + if found_SMARTS in other_fragmentation: + found_matches_set = set( + frozenset(i) for i in fragmentation[found_SMARTS] + ) + found_other_matches_set = set( + frozenset(i) for i in other_fragmentation[found_SMARTS] + ) + + if found_matches_set.issubset(found_other_matches_set): + n_found_SMARTS_that_are_subset += 1 + else: + return False + + return n_found_SMARTS_that_are_subset == n_found_groups + + @classmethod + def __is_match_contained_in_fragmentation(cls, match, SMARTS, fragmentation): + if not SMARTS in fragmentation: + return False + + found_matches_set = set(frozenset(i) for i in fragmentation[SMARTS]) + match_set = set(match) + + return match_set in found_matches_set + + +if __name__ == "__main__": + + smiles = ["CCCCO", "CCCO", "CCO", "CO"] + fragmentation_scheme = { + "CH2": "[CH2]", + "OH": "[OH]", + "CH3": "[CH3]", + "CH2-CH2": "[CH2][CH2]", + } + fragmentation_scheme_order1 = ["CH2-CH2", "CH3", "CH2", "OH"] + + print("simple algorithm 1") + frg = Fragmenter( + fragmentation_scheme, + fragmentation_scheme_order=fragmentation_scheme_order1, + algorithm="simple", + ) + for smi in smiles: + fragmentation, success, 
fragmentation_matches = frg.fragment(smi) + print(smi, fragmentation) + + print() + print("simple algorithm 2") + fragmentation_scheme_order2 = ["CH3", "CH2", "CH2-CH2", "OH"] + frg = Fragmenter( + fragmentation_scheme, + fragmentation_scheme_order=fragmentation_scheme_order2, + algorithm="simple", + ) + for smi in smiles: + fragmentation, success, fragmentation_matches = frg.fragment(smi) + print(smi, fragmentation) + + print() + print("complete algorithm 1") + frg = Fragmenter( + fragmentation_scheme, + algorithm="complete", + n_atoms_cuttoff=30, + function_to_choose_fragmentation=lambda x: x[0], + ) + for smi in smiles: + fragmentation, success, fragmentation_matches = frg.fragment(smi) + print(smi, fragmentation) + + print() + print("complete algorithm 2") + frg = Fragmenter( + fragmentation_scheme, + algorithm="complete", + n_atoms_cuttoff=30, + function_to_choose_fragmentation=lambda x: x[0], + ) + for smi in smiles: + fragmentations, success, fragmentations_matches = frg.fragment_complete(smi) + print(smi, fragmentations) + print( + fragmentations_matches + ) # some of the fragmentations are the same, but the found fragmentation_matches are different. diff --git a/src/gcms_data_analysis/main.py b/src/gcms_data_analysis/main.py index da8095b..7d3177f 100644 --- a/src/gcms_data_analysis/main.py +++ b/src/gcms_data_analysis/main.py @@ -5,9 +5,6 @@ @author: mp933 """ - -# %% -import marshal import pathlib as plib import numpy as np import pandas as pd @@ -22,504 +19,82 @@ from rdkit.Chem.AllChem import ( GetMorganFingerprintAsBitVect, ) # pylint: disable=no-name-in-module +from gcms_data_analysis.fragmenter import Fragmenter -def figure_create( - rows=1, - cols=1, - plot_type=0, - paper_col=1, - hgt_mltp=1, - font="Dejavu Sans", - sns_style="ticks", -): - """ - This function creates all the necessary objects to produce plots with - replicable characteristics. - - Parameters - ---------- - rows : int, optional - Number of plot rows in the grid. The default is 1. - cols : int, optional - Number of plot columns in the grid. The default is 1. - plot_type : int, optional - One of the different plot types available. The default is 0. - Plot types and their labels: - 0. Std: standard plot (single or grid rows x cols) - 1. Twin-x: secondary axis plot (single or grid rows x cols) - 5. Subplots with different heights - 6. Multiplot without internal x and y tick labels - 7. Multiplot without internal x tick labels - 8. Plot with specific distances between subplots and different heights - paper_col : int, optional - Single or double column size for the plot, meaning the actual space - it will fit in a paper. The default is 1. - hgt_mltp: float, optional - Multiplies the figure height. Default is 1. Best using values between - 0.65 and 2. May not work with multiplot and paper_col=1 or out of the - specified range. - font: str, optional - If the string 'Times' is given, it sets Times New Roman as the default - font for the plot, otherwise the default Dejavu Sans is maintained. - Default is 'Dejavu Sans'. - sns_style: str, optional - The style of the seaborn plot. The default is 'ticks'. - - Returns - ------- - fig : object - The figure object to be passed to figure_save. - lst_ax : list of axis - List of axis (it is a list even with 1 axis) on which to plot. - lst_axt : list of axis - List of secondary axis (it is a list even with 1 axis). - fig_par : list of float - List of parameters to reserve space around the plot canvas. - - Raises - ------ - ValueError - If cols > 2, which is not supported. 
- - """ - sns.set_palette("deep") - # set Times New Roman as the plot font fot text - if font == "Times" or font == "Times New Roman": - # this may require the installation of the font package - sns.set_style(sns_style, {"font.family": "Times New Roman"}) - else: # leave Dejavu Sans (default) as the plot font fot text - sns.set_style(sns_style) - # single or double column in paperthat the figure will occupy - if cols > 2: # numer of columns (thus of plots in the figure) - raise ValueError("\n figure_create: cols>2 not supported") - - # width of the figure in inches, it's fixed to keep the same text size - # is 6, 9, 12 for 1, 1.5, and 3 paper_col (columns in paper) - fig_wdt = 6 * paper_col # width of the plot in inches - fig_hgt = 4 * paper_col * rows / cols * hgt_mltp # heigth of the figure in inches - px = 0.06 * (6 / fig_wdt) * cols # set px so that (A) fits the square - py = px * fig_wdt / fig_hgt / cols * rows / hgt_mltp # set py so that (A) fits - # if more rows are added, it increases, but if cols areadded it decreases - # to maintain the plot ratio - # set plot margins - sp_lab_wdt = 0.156 / paper_col # hor. space for labels - sp_nar_wdt = 0.02294 / paper_col # space narrow no labels (horiz) - sp_lab_hgt = 0.147 / paper_col / rows * cols / hgt_mltp # space for labels (vert) - sp_nar_hgt = 0.02 / paper_col / rows * cols / hgt_mltp # space narrow no labels - # (vert) - # ========================================================================= - # # 0. Std: standard plot (single or grid rows x cols) - # ========================================================================= - if plot_type == 0: - fig, ax = plt.subplots(rows, cols, figsize=(fig_wdt, fig_hgt)) - if rows * cols == 1: # only 1 plot - lst_ax = [ax] # create ax list for uniform iterations over 1 obj. - elif rows * cols > 1: # more than one plot - lst_ax = [axs for axs in ax.flatten()] # create list of axis - lst_axt = None # no secondary axis in this plot_type - # horizontal space between plot in percentage - sp_btp_wdt = 0.26 * paper_col**2 - 1.09 * paper_col + 1.35 - # vertical space between plot in percentage !!! needs DEBUG - sp_btp_hgt = 0.2 / paper_col * cols / hgt_mltp - # left, bottom, right, top, widthspace, heightspace - fig_par = [ - sp_lab_wdt, - sp_lab_hgt, - 1 - sp_nar_wdt, - 1 - sp_nar_hgt, - sp_btp_wdt, - sp_btp_hgt, - px, - py, - ] - # ========================================================================= - # # 1. Twin-x: secondary axis plot (single or grid rows x cols) - # ========================================================================= - elif plot_type == 1: - fig, ax = plt.subplots(rows, cols, figsize=(fig_wdt, fig_hgt)) - if rows * cols == 1: # only 1 plot - lst_ax = [ax] # create ax list for uniform iterations over 1 obj. - lst_axt = [ax.twinx()] # create a list with secondary axis object - elif rows * cols > 1: # more than one plot - lst_ax = [axs for axs in ax.flatten()] # create list of axis - # create list of secondary twin axis - lst_axt = [axs.twinx() for axs in ax.flatten()] - # horizontal space between plot in percentage !!! needs DEBUG - sp_btp_wdt = 1.36 * paper_col**2 - 5.28 * paper_col + 5.57 - # vertical space between plot in percentage !!! 
needs DEBUG - sp_btp_hgt = 0.2 / paper_col * cols / hgt_mltp - # left, bottom, right(DIFFERENT FROM STD), top, widthspace, heightspace - fig_par = [ - sp_lab_wdt, - sp_lab_hgt, - 1 - sp_lab_wdt, - 1 - sp_nar_hgt, - sp_btp_wdt, - sp_btp_hgt, - px, - py, - ] - - return fig, lst_ax, lst_axt, fig_par - - -def figure_save( - filename, - out_path, - fig, - lst_ax, - lst_axt, - fig_par, - x_lab=None, - y_lab=None, - yt_lab=None, - x_lim=None, - y_lim=None, - yt_lim=None, - x_ticks=None, - y_ticks=None, - yt_ticks=None, - x_tick_labels=None, - y_tick_labels=None, - yt_tick_labels=None, - legend=None, - ncol_leg=1, - annotate_lttrs=False, - annotate_lttrs_loc="down", - pdf=False, - svg=False, - eps=False, - transparency=False, - subfolder=None, - tight_layout=False, - grid=False, - title=False, - set_size_inches=None, -): - """ - This function takes the objects created in figure_create and allows modifying - their appearance and saving the results. - - Parameters - ---------- - filename : str - Name of the figure. It is the name of the PNG or PDF file to be saved. - out_path : pathlib.Path object - Path to the output folder. - fig : figure object - Created in figure_save. - lst_ax : list of axis - Created in figure_create. - lst_axt : list of twin (secondary) axis - Created in figure_create. - fig_par : list - Figure parameters for space settings: left, bottom, right, top, widthspace, heightspace, px, py. Created in figure_create. - tight_layout : bool, optional - If True, ignores fig_par[0:6] and fits the figure to the tightest layout possible. Avoids losing part of the figure but loses control of margins. The default is False. - x_lab : str or list, optional - Label of the x-axis. The default is None. Can be given as: - - None: No axis gets an xlabel. - - 'label': A single string; all axes get the same xlabel. - - ['label1', None, 'Label2', ...]: A list matching the size of lst_ax containing labels and/or None values. Each axis is assigned its label; where None is given, no label is set. - y_lab : str, optional - Label of the y-axis. The default is None. Same options as x_lab. - yt_lab : str, optional - Label of the secondary y-axis. The default is None. Same options as x_lab. - x_lim : list, optional - Limits of the x-axis. The default is None. Can be given as: - - None: No axis gets an xlim. - - [a,b]: All axes get the same xlim. - - [[a,b], None, [c,d], ...]: A list matching the size of lst_ax containing [a,b] ranges and/or None values. Each axis is assigned its limit; where None is given, no limit is set. - y_lim : list, optional - Limits of the y-axis. The default is None. Same options as x_lim. - yt_lim : list, optional - Limits of the secondary y-axis. The default is None. Same options as x_lim. - x_ticks : list, optional - Ticks values to be shown on the x-axis. The default is None. - y_ticks : list, optional - Ticks values to be shown on the y-axis. The default is None. - yt_ticks : list, optional - Ticks values to be shown on the secondary y-axis. The default is None. - legend : str, optional - Contains info on the legend location. To avoid printing the legend (also in case it is empty), set it to None. The default is 'best'. - ncol_leg : int, optional - Number of columns in the legend. The default is 1. - annotate_lttrs : bool, optional - If True, each plot is assigned a letter in the lower left corner. The default is False. If a string is given, the string is used as the letter in the plot even for single plots. - annotate_lttrs_loc : str - Placement of annotation letters. 
'down' for bottom-left, 'up' for top-left. The default is 'down'. - pdf : bool, optional - If True, saves the figure also in PDF format in the output folder. The default is False, so only a PNG file with - """ - - fig_adj_par = fig_par[0:6] - if not any(fig_par[0:6]): # True if all element in fig_par[0:6] are False - tight_layout = True - px = fig_par[6] - py = fig_par[7] - n_ax = len(lst_ax) # number of ax objects - # for x_lab, y_lab, yt_lab creates a list with same length as n_ax. - # only one value is given all axis are given the same label - # if a list is given, each axis is given a different value, where False - # is specified, no value is given to that particular axis - vrbls = [x_lab, y_lab, yt_lab, legend] # collect variables for iteration - lst_x_lab, lst_y_lab, lst_yt_lab, lst_legend = ( - [], - [], - [], - [], - ) # create lists for iteration - lst_vrbls = [lst_x_lab, lst_y_lab, lst_yt_lab, lst_legend] # collect lists - for vrbl, lst_vrbl in zip(vrbls, lst_vrbls): - if vrbl is None: # label is not given for any axis - lst_vrbl[:] = [None] * n_ax - else: # label is given - if np.size(vrbl) == 1: # only one value is given - if isinstance(vrbl, str): # create a list before replicating it - lst_vrbl[:] = [vrbl] * n_ax # each axis gets same label - elif isinstance(vrbl, list): # replicate the list - lst_vrbl[:] = vrbl * n_ax # each axis gets same label - elif np.size(vrbl) == n_ax: # each axis has been assigned its lab - lst_vrbl[:] = vrbl # copy the label inside the list +def get_compound_from_pubchempy(comp_name: str) -> pcp.Compound: + if not isinstance(comp_name, str) or comp_name.isspace(): + print(f"WARNING get_compound_from_pubchempy got an invalid {comp_name =}") + return None + cond = True + while cond: # to deal with HTML issues on server sides (timeouts) + try: + # comp contains all info about the chemical from pubchem + try: + comp_inside_list = pcp.get_compounds(comp_name, "name") + except ValueError: + print(f"{comp_name = }") + return None + if comp_inside_list: + comp = comp_inside_list[0] else: - print(vrbl) - print("Labels/legend size does not match axes number") - # for x_lim, y_lim, yt_lim creates a list with same length as n_ax. - # If one list like [a,b] is given, all axis have the same limits, if a list - # of the same length of the axis is given, each axis has its lim. Where - # None is given, no lim is set on that axis - vrbls = [ - x_lim, - y_lim, - yt_lim, - x_ticks, - y_ticks, - yt_ticks, - x_tick_labels, - y_tick_labels, - yt_tick_labels, - ] # collect variables for iteration - ( - lst_x_lim, - lst_y_lim, - lst_yt_lim, - lst_x_ticks, - lst_y_ticks, - lst_yt_ticks, - lst_x_tick_labels, - lst_y_tick_labels, - lst_yt_tick_labels, - ) = ( - [], - [], - [], - [], - [], - [], - [], - [], - [], - ) # create lists for iteration - lst_vrbls = [ - lst_x_lim, - lst_y_lim, - lst_yt_lim, - lst_x_ticks, - lst_y_ticks, - lst_yt_ticks, - lst_x_tick_labels, - lst_y_tick_labels, - lst_yt_tick_labels, - ] # collect lists - for vrbl, lst_vrbl in zip(vrbls, lst_vrbls): - if vrbl is None: # limit is not given for any axis - lst_vrbl[:] = [None] * n_ax - else: - # if only list and None are in vrbl, it is [[], None, [], ..] - # each axis has been assigned its limits - if any([isinstance(v, (int, float, np.int32, str)) for v in vrbl]): - temporary = [] # necessary to allow append on [:] - for i in range(n_ax): - temporary.append(vrbl) # give it to all axis - lst_vrbl[:] = temporary - else: # x_lim=[[a,b], None, ...] 
= [list, bool] # no float - lst_vrbl[:] = vrbl # a lim for each axis is already given - # loops over each axs in the ax array and set the different properties - for i, axs in enumerate(lst_ax): - # for each property, if the variable is not false, it is set - if lst_x_lab[i] is not None: - axs.set_xlabel(lst_x_lab[i]) - if lst_y_lab[i] is not None: - axs.set_ylabel(lst_y_lab[i]) - if lst_x_lim[i] is not None: - axs.set_xlim( - [ - lst_x_lim[i][0] * (1 + px) - px * lst_x_lim[i][1], - lst_x_lim[i][1] * (1 + px) - px * lst_x_lim[i][0], - ] - ) - if lst_y_lim[i] is not None: - axs.set_ylim( - [ - lst_y_lim[i][0] * (1 + py) - py * lst_y_lim[i][1], - lst_y_lim[i][1] * (1 + py) - py * lst_y_lim[i][0], - ] - ) - if lst_x_ticks[i] is not None: - axs.set_xticks(lst_x_ticks[i]) - if lst_y_ticks[i] is not None: - axs.set_yticks(lst_y_ticks[i]) - if lst_x_tick_labels[i] is not None: - axs.set_xticklabels(lst_x_tick_labels[i]) - if lst_y_tick_labels[i] is not None: - axs.set_yticklabels(lst_y_tick_labels[i]) - if grid: - axs.grid(True) - if annotate_lttrs is not False: - if annotate_lttrs_loc == "down": - y_lttrs = py / px * 0.02 - elif annotate_lttrs_loc == "up": - y_lttrs = 1 - py - if n_ax == 1: # if only one plot is given, do not put the letters - axs.annotate( - "(" + annotate_lttrs + ")", - xycoords="axes fraction", - xy=(0, 0), - rotation=0, - size="large", - xytext=(0, y_lttrs), - weight="bold", + print( + f"WARNING: name_to_properties {comp_name=} does not find an entry in pcp", ) - elif n_ax > 1: # if only one plot is given, do not put the letters - try: # if specific letters are provided - axs.annotate( - "(" + annotate_lttrs[i] + ")", - xycoords="axes fraction", - xy=(0, 0), - rotation=0, - size="large", - xytext=(0, y_lttrs), - weight="bold", - ) - except TypeError: # if no specific letters, use lttrs - lttrs = [ - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - ] - axs.annotate( - "(" + lttrs[i] + ")", - xycoords="axes fraction", - xy=(0, 0), - rotation=0, - size="large", - xytext=(0, y_lttrs), - weight="bold", - ) + return None + cond = False + except pcp.PubChemHTTPError: # timeout error, simply try again + print("Caught: pcp.PubChemHTTPError (keep trying)") + return comp + + +def _order_columns_in_compounds_properties( + unsorted_df: pd.DataFrame | None, +) -> pd.DataFrame | None: + if unsorted_df is None: + return None + priority_cols: list[str] = [ + "iupac_name", + "underiv_comp_name", + "molecular_formula", + "canonical_smiles", + "molecular_weight", + "xlogp", + ] - # if secondary (twin) axis are given, set thier properties - if lst_axt is not None: - for i, axst in enumerate(lst_axt): - axst.grid(False) # grid is always false on secondaty axis - # for each property, if the variable is not false, it is set - if lst_yt_lab[i] is not None: - axst.set_ylabel(lst_yt_lab[i]) - if lst_yt_lim[i] is not None: - axst.set_ylim( - [ - lst_yt_lim[i][0] * (1 + py) - py * lst_yt_lim[i][1], - lst_yt_lim[i][1] * (1 + py) - py * lst_yt_lim[i][0], - ] - ) - if lst_yt_ticks[i] is not None: - axst.set_yticks(lst_yt_ticks[i]) - if lst_yt_tick_labels[i] is not None: - axst.set_yticklabels(lst_yt_tick_labels[i]) - # create a legend merging the entries for each couple of ax and axt - if any(lst_legend): - if lst_axt is None: # with no axt, only axs in ax needs a legend - for i, axs in enumerate(lst_ax): - axs.legend(loc=lst_legend[i], ncol=ncol_leg) - else: # merge the legend for each couple of ax and axt - i = 0 - for axs, axst 
in zip(lst_ax, lst_axt): - hnd_ax, lab_ax = axs.get_legend_handles_labels() - hnd_axt, lab_axt = axst.get_legend_handles_labels() - axs.legend( - hnd_ax + hnd_axt, lab_ax + lab_axt, loc=lst_legend[i], ncol=ncol_leg - ) - i += 1 - try: - fig.align_labels() # align labels of subplots, needed only for multi plot - except AttributeError: - print("align_labels not performed") - # if a subfolder is specified, create the subfolder inside the output - # folder if not already there and save the figure in it - if subfolder is not None: - out_path = plib.Path(out_path, subfolder) # update out_path - plib.Path(out_path).mkdir(parents=True, exist_ok=True) # check if - # folder is there, if not create it - # set figure margins and save the figure in the output folder - if set_size_inches: - fig.set_size_inches(set_size_inches) - if tight_layout is False: # if margins are given sets margins and save - fig.subplots_adjust(*fig_adj_par[0:6]) # set margins - plt.savefig( - plib.Path(out_path, filename + ".png"), dpi=300, transparent=transparency - ) - if pdf is not False: # save also as pdf - plt.savefig(plib.Path(out_path, filename + ".pdf")) - if svg is not False: # save also as pdf - plt.savefig(plib.Path(out_path, filename + ".svg")) - if eps is not False: # save also as pdf - plt.savefig(plib.Path(out_path, filename + ".eps")) - else: # margins are not given, use a tight layout option and save - plt.savefig( - plib.Path(out_path, filename + ".png"), - bbox_inches="tight", - dpi=300, - transparent=transparency, - ) - if pdf is not False: # save also as pdf - plt.savefig(plib.Path(out_path, filename + ".pdf"), bbox_inches="tight") - if svg is not False: # save also as pdf - plt.savefig(plib.Path(out_path, filename + ".svg"), bbox_inches="tight") - if eps is not False: # save also as pdf - plt.savefig(plib.Path(out_path, filename + ".eps"), bbox_inches="tight") - # add the title after saving, so it's only visible in the console - if title is True: - lst_ax[0].annotate( - filename, - xycoords="axes fraction", - size="small", - xy=(0, 0), - xytext=(0.05, 0.95), - clip_on=True, - ) + # Define a custom sort key function + def sort_key(col): + if col in priority_cols: + return (-1, priority_cols.index(col)) + if col.startswith("el_mf"): + return (2, col) + elif col.startswith("el_"): + return (1, col) + elif col.startswith("fg_mf_unclassified"): + return (5, col) + elif col.startswith("fg_mf"): + return (4, col) + elif col.startswith("fg_"): + return (3, col) + else: + return (0, col) + + # Sort columns using the custom key + sorted_columns = sorted(unsorted_df.columns, key=sort_key) + sorted_df = unsorted_df.reindex(sorted_columns, axis=1) + sorted_df.index.name = "comp_name" + # Reindex the DataFrame with the sorted columns + return sorted_df def name_to_properties( comp_name: str, - df: pd.DataFrame, dict_classes_to_codes: dict[str:str], dict_classes_to_mass_fractions: dict[str:float], -): + df: pd.DataFrame = pd.DataFrame(), + precision_sum_elements: float = 0.05, + precision_sum_functional_group: float = 0.05, +) -> pd.DataFrame: """ used to retrieve chemical properties of the compound indicated by the comp_name and to store those properties in the df @@ -546,36 +121,28 @@ def name_to_properties( if GCname did not yield anything CompNotFound=GCname. 
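
    Example (editor's illustrative sketch, not part of the original docstring):
    it assumes an internet connection, that PubChem resolves the name, and that
    the two dictionaries come from the project's class/code/mass-fraction file
    (see load_class_code_frac); calls can be chained on a growing DataFrame:

        props = pd.DataFrame()
        props = name_to_properties(
            "phenol",
            dict_classes_to_codes,            # class name -> SMARTS pattern
            dict_classes_to_mass_fractions,   # class name -> group mass
            df=props,
        )
        # props now holds one row indexed "phenol" with iupac_name,
        # molecular_formula, canonical_smiles, molecular_weight, xlogp,
        # el_*/el_mf_* element columns and fg_*/fg_mf_* functional group columns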
""" - # classes used to split compounds into functional groups - cond = True - while cond: # to deal with HTML issues on server sides (timeouts) - try: - # comp contains all info about the chemical from pubchem - try: - comp_inside_list = pcp.get_compounds(comp_name, "name") - except ValueError: - print(f"{comp_name = }") - if comp_inside_list: - comp = comp_inside_list[0] - else: - print( - "WARNING: name_to_properties ", - comp_name, - " does not find an entry in pcp", - ) - df.loc[comp_name, "iupac_name"] = "unidentified" - return df - cond = False - except pcp.PubChemHTTPError: # timeout error, simply try again - print("Caught: pcp.PubChemHTTPError") - # fill the df with the data - if df is None: - df = pd.DataFrame(dtype=float) + if not isinstance(df, pd.DataFrame): + raise TypeError("The argument df must be a pd.DataFrame.") + + if not isinstance(comp_name, str) or comp_name.isspace(): + return _order_columns_in_compounds_properties(df) + + if comp_name in df.index.tolist(): + return _order_columns_in_compounds_properties(df) + + comp = get_compound_from_pubchempy(comp_name) + + if comp is None: + df.loc[comp_name, "iupac_name"] = "unidentified" + return _order_columns_in_compounds_properties(df) + try: - df.loc[comp_name, "iupac_name"] = comp.iupac_name.lower() + valid_iupac_name = comp.iupac_name.lower() except AttributeError: # iupac_name not give - df.loc[comp_name, "iupac_name"] = comp_name.lower() + valid_iupac_name = comp_name.lower() + + df.loc[comp_name, "iupac_name"] = valid_iupac_name df.loc[comp_name, "molecular_formula"] = comp.molecular_formula df.loc[comp_name, "canonical_smiles"] = comp.canonical_smiles df.loc[comp_name, "molecular_weight"] = float(comp.molecular_weight) @@ -586,19 +153,36 @@ def name_to_properties( TypeError ): # float() argument must be a string or a real number, not 'NoneType' df.loc[comp_name, "xlogp"] = np.nan - # count all atoms presence and compoute mass percentage elements = set(comp.to_dict()["elements"]) + el_dict = {} + el_mf_dict = {} + for el in elements: el_count = comp.to_dict()["elements"].count(el) el_mass = ele.element_from_symbol(el).mass - if not "el_" + el in df: - df["el_" + el] = 0 - df["el_mf_" + el] = 0.0 - df.loc[comp_name, "el_" + el] = int(el_count) - df.loc[comp_name, "el_mf_" + el] = ( + + # Using similar logic as in the fg_dict example + if el not in el_dict: + el_dict[el] = 0 + el_mf_dict[el] = 0.0 + + el_dict[el] += int(el_count) + el_mf_dict[el] += ( float(el_count) * float(el_mass) / float(comp.molecular_weight) ) - + # Now, update the DataFrame in a similar way to the fg_dict example + for key, value in el_dict.items(): + df.at[comp_name, f"el_{key}"] = int(value) + + for key, value in el_mf_dict.items(): + df.at[comp_name, f"el_mf_{key}"] = float(value) + cols_el_mf = [col for col in df.columns if col.startswith("el_mf_")] + residual_els = df.loc[comp_name, cols_el_mf].sum() - 1 + # check element sum + try: + assert residual_els <= precision_sum_elements + except AssertionError: + print(f"the total mass fraction of elements in {comp_name =} is > 0.001") # apply fragmentation using the Fragmenter class (thanks simonmb) frg = Fragmenter( dict_classes_to_codes, @@ -606,108 +190,58 @@ def name_to_properties( algorithm="simple", ) fragmentation, _, _ = frg.fragment(comp.canonical_smiles) - classes = list(fragmentation.keys()) - classes_mf = ["mf_" + cl for cl in classes] - # df is the intermediate df for classes that helps with sums of - # similar classes (ex. 
-    # go in the same final class)
-
-    newdf = pd.DataFrame(
-        0, columns=classes + classes_mf, index=[comp_name], dtype=float
-    )
-
-    # print(f"{df.loc[comp_name, :]}")
-    # mol_weight = df.loc[comp_name, "molecular_weight"]
-
-    for cl in classes:  # get counts and mf of each class in compound
-        newdf.loc[comp_name, cl] = fragmentation[cl]  # counts in
+    fg_dict = {}
+    fg_mf_dict = {}
+    # Iterate over each (class, count) pair returned by the fragmenter
+    for key, value in fragmentation.items():
+        # Determine the root key (the part before an underscore, if present)
+        root_key = key.split("_")[0]
+        # if root_key in hetero_atoms:
+        #     pass
+        # Check if the root key is already in fg_dict; if not, initialize it
+        if root_key not in fg_dict:
+            fg_dict[root_key] = 0
+            fg_mf_dict[root_key] = 0
+        # Add the count to the corresponding root key in fg_dict and fg_mf_dict
+        fg_dict[root_key] += int(fragmentation[key])
+        fg_mf_dict[root_key] += (
+            float(fragmentation[key])
+            * float(dict_classes_to_mass_fractions[key])
+            / df.loc[comp_name, "molecular_weight"].astype(float)
+        )  # mass fraction of total
+
+    # Update df with fg_dict
+    for key, value in fg_dict.items():
+        df.at[comp_name, f"fg_{key}"] = int(value)  # Update the cell
+    # Update df with fg_mf_dict
+    for key, value in fg_mf_dict.items():
+        df.at[comp_name, f"fg_mf_{key}"] = float(value)  # Update the cell
+    cols_fg_mf = [col for col in df.columns if col.startswith("fg_mf")]
+    residual_fgs = df.loc[comp_name, cols_fg_mf].sum() - 1
     try:
-        for cl in classes:  # get counts and mf of each class in compound
-            newdf.loc[comp_name, "mf_" + cl] = (
-                float(fragmentation[cl])
-                * float(dict_classes_to_mass_fractions[cl])
-                / df.loc[comp_name, "molecular_weight"].astype(float)
-            )  # mass fraction of total
-    except ValueError:
-        print(f"{comp_name = }")
-    # classes that must be summed and considered a single one are identified
-    # by the same name followed by _#. if _ is in a class, its not unique
-    unique_classes = [c if "_" not in c else c.split("_")[0] for c in classes]
-    for unique_cl in unique_classes:  # sum classes that must be merged
-        sum_cls = [k for k in classes if unique_cl in k]  # classes to be summed
-        occurr = 0  # counts, or occurrencies
-        cl_mf = 0.0  # class mass fracations
-        for cl in sum_cls:  # for each class that must be summed
-            occurr += newdf.loc[comp_name, cl].astype(int)  # sum counts
-            cl_mf += newdf.loc[comp_name, "mf_" + cl].astype(
-                float
-            )  # sum mass fractions
-        if not "fg_" + unique_cl in df:  # create columns if missing
-            df["fg_" + unique_cl] = 0
-            df["fg_mf_" + unique_cl] = 0.0
-        df.loc[comp_name, "fg_" + unique_cl] = occurr  # put values in DF
-        df.loc[comp_name, "fg_mf_" + unique_cl] = float(cl_mf)
-    # heteroatoms and Si are considered functional groups as they usually
-    # enter the discussion in a similar way.
The atom count is used here - hetero_atoms = [e for e in elements if e not in ["H", "C", "O", "N", "Si"]] - - if hetero_atoms is not None: - for ha in hetero_atoms: - ha_col = "el_" + ha - ha_mf_col = "el_mf_" + ha - fg_col = "fg_" + ha - fg_mf_col = "fg_mf_" + ha - - # Initialize columns if they don't exist - if fg_col not in df.columns: - df[fg_col] = 0 - if fg_mf_col not in df.columns: - df[fg_mf_col] = 0.0 - - # Aggregate counts and mass fractions for hetero atoms - if ha in elements: # Ensure the element is present before processing - df.loc[comp_name, fg_col] = df.loc[comp_name, ha_col].astype(int) - df.loc[comp_name, fg_mf_col] = df.loc[comp_name, ha_mf_col] - # Handle hetero atoms sum separately if needed - if hetero_atoms: - fg_columns = ["fg_" + e for e in hetero_atoms] - fg_mf_columns = ["fg_mf_" + e for e in hetero_atoms] - - # Handle case when selection returns a Series or a single value - if isinstance(df.loc[comp_name, fg_columns], pd.Series): - fg_sum = df.loc[comp_name, fg_columns].astype(int).sum() - else: # If it's not a Series, it could be a single value (if only one column is selected) - fg_sum = df.loc[comp_name, fg_columns].astype(int) - - df.loc[comp_name, "fg_hetero_atoms"] = fg_sum - - # For 'fg_mf_hetero_atoms', assuming you want to assign the value directly - # Here, you might need to handle single/multiple selections differently based on your needs - if isinstance(df.loc[comp_name, fg_mf_columns], pd.Series): - # This assumes you want to somehow aggregate or select from the Series - # Example: selecting the first element if there are multiple. Adjust as needed. - df.loc[comp_name, "fg_mf_hetero_atoms"] = df.loc[ - comp_name, fg_mf_columns - ].iloc[0] - else: - # Direct assignment if it's a single value - df.loc[comp_name, "fg_mf_hetero_atoms"] = df.loc[ - comp_name, fg_mf_columns - ] - df["fg_hetero_atoms"] = df["fg_hetero_atoms"].fillna(0).astype("int64") - df["fg_mf_hetero_atoms"] = df["fg_mf_hetero_atoms"].fillna(0).astype(float) - # Ensure Si is handled correctly if present - if "Si" in elements: - df.loc[comp_name, "fg_Si"] = df.loc[comp_name, "el_Si"].astype(int) - df.loc[comp_name, "fg_mf_Si"] = df.loc[comp_name, "el_mf_Si"] - - fg_mf_cols = [c for c in list(df) if "fg_mf" in c and c != "fg_mf_total"] - df["fg_mf_total"] = df.loc[comp_name, fg_mf_cols].sum() - print("\tInfo: name_to_properties ", comp_name) - return df + assert residual_fgs <= precision_sum_functional_group + except AssertionError: + print(f"{df.loc[comp_name, cols_fg_mf].sum()=}") + print( + f"the total mass fraction of functional groups in {comp_name =} is > 0.05" + ) + if residual_fgs < -precision_sum_functional_group: + df.at[comp_name, "fg_mf_unclassified"] = abs(residual_fgs) + df.loc[df["iupac_name"] != "unidentified"] = df.loc[ + df["iupac_name"] != "unidentified" + ].fillna(0) + return _order_columns_in_compounds_properties(df) +# %% def get_iupac_from_pcp(comp_name: str) -> str: + """get iupac name for compound using pubchempy, needs internet connection + + :param comp_name: _description_ + :type comp_name: str + :return: lowercase iupac name for the compound + :rtype: str + """ cond = True while cond: # to deal with HTML issues on server sides (timeouts) try: @@ -786,821 +320,6 @@ def report_difference(rep1, rep2, diff_type="absolute"): return dif_ave, dif_std, dif_stdp -def _annotate_outliers_in_plot(ax, df_ave, df_std, y_lim): - """ - Annotates the bars in a bar plot with their average value and standard - deviation if these values exceed the specified y-axis limits. 
- The function iterates over the bars in the plot and checks if their average - values, considering their standard deviations, are outside the provided - y-axis limits. For such bars, it annotates the average and standard - deviation on the - plot, using a specific format for better visualization and understanding. - - Parameters - ---------- - ax : matplotlib.axes.Axes - The matplotlib Axes object where the plot is drawn. - df_ave : pandas.DataFrame - DataFrame containing the average values used in the plot. - df_std : pandas.DataFrame - DataFrame containing the standard deviation values corresponding - to df_ave. - y_lim : list of [float, float] - A list of two floats representing the minimum (y_lim[0]) and - maximum (y_lim[1]) limits of the y-axis. - - Returns - ------- - None - Modifies the provided Axes object (ax) by adding annotations. - - """ - dx = 0.15 * len(df_ave.index) - dy = 0.04 - tform = blended_transform_factory(ax.transData, ax.transAxes) - dfao = pd.DataFrame(columns=["H/L", "xpos", "ypos", "ave", "std", "text"]) - dfao["ave"] = df_ave.transpose().to_numpy().flatten().tolist() - if df_std.empty: - df_std = np.zeros(len(dfao["ave"])) - else: - dfao["std"] = df_std.transpose().to_numpy().flatten().tolist() - try: - dfao["xpos"] = [p.get_x() + p.get_width() / 2 for p in ax.patches] - except ValueError: # otherwise the masking adds twice the columns - dfao["xpos"] = [ - p.get_x() + p.get_width() / 2 for p in ax.patches[: len(ax.patches) // 2] - ] - cond = (dfao["ave"] < y_lim[0]) | (dfao["ave"] > y_lim[1]) - dfao = dfao.drop(dfao[~cond].index) - for ao in dfao.index.tolist(): # loop through bars - if dfao.loc[ao, "ave"] == float("inf"): - dfao.loc[ao, "text"] = "inf" - dfao.loc[ao, "H/L"] = "H" - elif dfao.loc[ao, "ave"] == float("-inf"): - dfao.loc[ao, "text"] = "-inf" - dfao.loc[ao, "H/L"] = "L" - elif dfao.loc[ao, "ave"] > y_lim[1]: - dfao.loc[ao, "H/L"] = "H" - dfao.loc[ao, "text"] = "{:.2f}".format( - round(dfao.loc[ao, "ave"], 2) - ).strip() - if (dfao.loc[ao, "std"] != 0) & (~np.isnan(dfao.loc[ao, "std"])): - dfao.loc[ao, "text"] += r"$\pm$" + "{:.2f}".format( - round(dfao.loc[ao, "std"], 2) - ) - elif dfao.loc[ao, "ave"] < y_lim[0]: - dfao.loc[ao, "H/L"] = "L" - dfao.loc[ao, "text"] = str(round(dfao.loc[ao, "ave"], 2)).strip() - if dfao.loc[ao, "std"] != 0: - dfao.loc[ao, "text"] += r"$\pm$" + "{:.2f}".format( - round(dfao.loc[ao, "std"], 2) - ) - else: - print("Something is wrong", dfao.loc[ao, "ave"]) - for hl, ypos, dy in zip(["L", "H"], [0.02, 0.98], [0.04, -0.04]): - dfao1 = dfao[dfao["H/L"] == hl] - dfao1["ypos"] = ypos - if not dfao1.empty: - dfao1 = dfao1.sort_values("xpos", ascending=True) - dfao1["diffx"] = ( - np.diff(dfao1["xpos"].values, prepend=dfao1["xpos"].values[0]) < dx - ) - dfao1.reset_index(inplace=True) - - for i in dfao1.index.tolist()[1:]: - dfao1.loc[i, "ypos"] = ypos - for e in range(i, 0, -1): - if dfao1.loc[e, "diffx"]: - dfao1.loc[e, "ypos"] += dy - else: - break - for ao in dfao1.index.tolist(): - ax.annotate( - dfao1.loc[ao, "text"], - xy=(dfao1.loc[ao, "xpos"], 0), - xycoords=tform, - textcoords=tform, - xytext=(dfao1.loc[ao, "xpos"], dfao1.loc[ao, "ypos"]), - fontsize=9, - ha="center", - va="center", - bbox={ - "boxstyle": "square,pad=0", - "edgecolor": None, - "facecolor": "white", - "alpha": 0.7, - }, - ) - - -class Fragmenter: - """ - Class taken from https://github.com/simonmb/fragmentation_algorithm. 
- The original version of this algorithm was published in: - "Flexible Heuristic Algorithm for Automatic Molecule Fragmentation: - Application to the UNIFAC Group Contribution Model - DOI: 10.1186/s13321-019-0382-39." - MIT License - - ... - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - """ - - # tested with Python 3.8.8 and RDKit version 2021.09.4 - - # does a substructure match and then checks whether the match - # is adjacent to previous matches - @classmethod - def get_substruct_matches( - cls, - mol_searched_for, - mol_searched_in, - atomIdxs_to_which_new_matches_have_to_be_adjacent, - ): - - valid_matches = [] - - if mol_searched_in.GetNumAtoms() >= mol_searched_for.GetNumAtoms(): - matches = mol_searched_in.GetSubstructMatches(mol_searched_for) - - if matches: - for match in matches: - add_this_match = True - if len(atomIdxs_to_which_new_matches_have_to_be_adjacent) > 0: - add_this_match = False - - for i in match: - for neighbor in mol_searched_in.GetAtomWithIdx( - i - ).GetNeighbors(): - if ( - neighbor.GetIdx() - in atomIdxs_to_which_new_matches_have_to_be_adjacent - ): - add_this_match = True - break - - if add_this_match: - valid_matches.append(match) - - return valid_matches - - # count heavier isotopes of hydrogen correctly - @classmethod - def get_heavy_atom_count(cls, mol): - heavy_atom_count = 0 - for atom in mol.GetAtoms(): - if atom.GetAtomicNum() != 1: - heavy_atom_count += 1 - - return heavy_atom_count - - def __init__( - self, - fragmentation_scheme={}, - fragmentation_scheme_order=None, - match_hydrogens=False, - algorithm="", - n_atoms_cuttoff=-1, - function_to_choose_fragmentation=False, - n_max_fragmentations_to_find=-1, - ): - - if not type(fragmentation_scheme) is dict: - raise TypeError( - "fragmentation_scheme must be a dctionary with integers as keys and either strings or list of strings as values." - ) - - if len(fragmentation_scheme) == 0: - raise ValueError("fragmentation_scheme must be provided.") - - if not algorithm in ["simple", "complete", "combined"]: - raise ValueError("Algorithm must be either simple ,complete or combined.") - - if algorithm == "simple": - if n_max_fragmentations_to_find != -1: - raise ValueError( - "Setting n_max_fragmentations_to_find only makes sense with complete or combined algorithm." - ) - - self.algorithm = algorithm - - if algorithm in ["combined", "complete"]: - if n_atoms_cuttoff == -1: - raise ValueError( - "n_atoms_cuttoff needs to be specified for complete or combined algorithms." - ) - - if function_to_choose_fragmentation == False: - raise ValueError( - "function_to_choose_fragmentation needs to be specified for complete or combined algorithms." - ) - - if not callable(function_to_choose_fragmentation): - raise TypeError( - "function_to_choose_fragmentation needs to be a function." 
- ) - else: - if type(function_to_choose_fragmentation([{}, {}])) != dict: - raise TypeError( - "function_to_choose_fragmentation needs to take a list of fragmentations and choose one of it" - ) - - if n_max_fragmentations_to_find != -1: - if n_max_fragmentations_to_find < 1: - raise ValueError( - "n_max_fragmentations_to_find has to be 1 or higher." - ) - - if fragmentation_scheme_order is None: - fragmentation_scheme_order = [] - - if algorithm in ["simple", "combined"]: - assert len(fragmentation_scheme) == len(fragmentation_scheme_order) - else: - fragmentation_scheme_order = [key for key in fragmentation_scheme.keys()] - - self.n_max_fragmentations_to_find = n_max_fragmentations_to_find - - self.n_atoms_cuttoff = n_atoms_cuttoff - - self.match_hydrogens = match_hydrogens - - self.fragmentation_scheme = fragmentation_scheme - - self.function_to_choose_fragmentation = function_to_choose_fragmentation - - # create a lookup dictionaries to faster finding a group number - self._fragmentation_scheme_group_number_lookup = {} - self._fragmentation_scheme_pattern_lookup = {} - self.fragmentation_scheme_order = fragmentation_scheme_order - - for group_number, list_SMARTS in fragmentation_scheme.items(): - - if type(list_SMARTS) is not list: - list_SMARTS = [list_SMARTS] - - for SMARTS in list_SMARTS: - if SMARTS != "": - self._fragmentation_scheme_group_number_lookup[SMARTS] = ( - group_number - ) - - mol_SMARTS = Chem.MolFromSmarts(SMARTS) - self._fragmentation_scheme_pattern_lookup[SMARTS] = mol_SMARTS - - def fragment(self, SMILES_or_molecule): - - if type(SMILES_or_molecule) is str: - mol_SMILES = Chem.MolFromSmiles(SMILES_or_molecule) - mol_SMILES = Chem.AddHs(mol_SMILES) if self.match_hydrogens else mol_SMILES - is_valid_SMILES = mol_SMILES is not None - - if not is_valid_SMILES: - raise ValueError("Following SMILES is not valid: " + SMILES_or_molecule) - - else: - mol_SMILES = SMILES_or_molecule - - # iterate over all separated molecules - success = [] - fragmentation = {} - fragmentation_matches = {} - for mol in rdmolops.GetMolFrags(mol_SMILES, asMols=True): - - this_mol_fragmentation, this_mol_success = self.__get_fragmentation(mol) - - for SMARTS, matches in this_mol_fragmentation.items(): - group_number = self._fragmentation_scheme_group_number_lookup[SMARTS] - - if not group_number in fragmentation: - fragmentation[group_number] = 0 - fragmentation_matches[group_number] = [] - - fragmentation[group_number] += len(matches) - fragmentation_matches[group_number].extend(matches) - - success.append(this_mol_success) - - return fragmentation, all(success), fragmentation_matches - - def fragment_complete(self, SMILES_or_molecule): - - if type(SMILES_or_molecule) is str: - mol_SMILES = Chem.MolFromSmiles(SMILES_or_molecule) - mol_SMILES = Chem.AddHs(mol_SMILES) if self.match_hydrogens else mol_SMILES - is_valid_SMILES = mol_SMILES is not None - - if not is_valid_SMILES: - raise ValueError("Following SMILES is not valid: " + SMILES_or_molecule) - - else: - mol_SMILES = SMILES_or_molecule - - if len(rdmolops.GetMolFrags(mol_SMILES)) != 1: - raise ValueError( - "fragment_complete does not accept multifragment molecules." 
- ) - - temp_fragmentations, success = self.__complete_fragmentation(mol_SMILES) - - fragmentations = [] - fragmentations_matches = [] - for temp_fragmentation in temp_fragmentations: - fragmentation = {} - fragmentation_matches = {} - for SMARTS, matches in temp_fragmentation.items(): - group_number = self._fragmentation_scheme_group_number_lookup[SMARTS] - - fragmentation[group_number] = len(matches) - fragmentation_matches[group_number] = matches - - fragmentations.append(fragmentation) - fragmentations_matches.append(fragmentation_matches) - - return fragmentations, success, fragmentations_matches - - def __get_fragmentation(self, mol_SMILES): - - success = False - fragmentation = {} - if self.algorithm in ["simple", "combined"]: - fragmentation, success = self.__simple_fragmentation(mol_SMILES) - - if success: - return fragmentation, success - - if self.algorithm in ["combined", "complete"]: - fragmentations, success = self.__complete_fragmentation(mol_SMILES) - - if success: - fragmentation = self.function_to_choose_fragmentation(fragmentations) - - return fragmentation, success - - def __simple_fragmentation(self, mol_SMILES): - - if self.match_hydrogens: - target_atom_count = len(mol_SMILES.GetAtoms()) - else: - target_atom_count = Fragmenter.get_heavy_atom_count(mol_SMILES) - - success = False - fragmentation = {} - - fragmentation, atomIdxs_included_in_fragmentation = ( - self.__search_non_overlapping_solution(mol_SMILES, {}, set(), set()) - ) - success = len(atomIdxs_included_in_fragmentation) == target_atom_count - - # if not successful, clean up molecule and search again - level = 1 - while not success: - fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far = ( - Fragmenter.__clean_molecule_surrounding_unmatched_atoms( - mol_SMILES, fragmentation, atomIdxs_included_in_fragmentation, level - ) - ) - level += 1 - - if len(atomIdxs_included_in_fragmentation_so_far) == 0: - break - - fragmentation_so_far, atomIdxs_included_in_fragmentation_so_far = ( - self.__search_non_overlapping_solution( - mol_SMILES, - fragmentation_so_far, - atomIdxs_included_in_fragmentation_so_far, - atomIdxs_included_in_fragmentation_so_far, - ) - ) - - success = ( - len(atomIdxs_included_in_fragmentation_so_far) == target_atom_count - ) - - if success: - fragmentation = fragmentation_so_far - - return fragmentation, success - - def __search_non_overlapping_solution( - self, - mol_searched_in, - fragmentation, - atomIdxs_included_in_fragmentation, - atomIdxs_to_which_new_matches_have_to_be_adjacent, - ): - - n_atomIdxs_included_in_fragmentation = ( - len(atomIdxs_included_in_fragmentation) - 1 - ) - - while n_atomIdxs_included_in_fragmentation != len( - atomIdxs_included_in_fragmentation - ): - n_atomIdxs_included_in_fragmentation = len( - atomIdxs_included_in_fragmentation - ) - - for group_number in self.fragmentation_scheme_order: - list_SMARTS = self.fragmentation_scheme[group_number] - if type(list_SMARTS) is not list: - list_SMARTS = [list_SMARTS] - - for SMARTS in list_SMARTS: - if SMARTS != "": - fragmentation, atomIdxs_included_in_fragmentation = ( - self.__get_next_non_overlapping_match( - mol_searched_in, - SMARTS, - fragmentation, - atomIdxs_included_in_fragmentation, - atomIdxs_to_which_new_matches_have_to_be_adjacent, - ) - ) - - return fragmentation, atomIdxs_included_in_fragmentation - - def __get_next_non_overlapping_match( - self, - mol_searched_in, - SMARTS, - fragmentation, - atomIdxs_included_in_fragmentation, - atomIdxs_to_which_new_matches_have_to_be_adjacent, - ): - - 
mol_searched_for = self._fragmentation_scheme_pattern_lookup[SMARTS] - - if atomIdxs_to_which_new_matches_have_to_be_adjacent: - matches = Fragmenter.get_substruct_matches( - mol_searched_for, - mol_searched_in, - atomIdxs_to_which_new_matches_have_to_be_adjacent, - ) - else: - matches = Fragmenter.get_substruct_matches( - mol_searched_for, mol_searched_in, set() - ) - - if matches: - for match in matches: - all_atoms_of_new_match_are_unassigned = ( - atomIdxs_included_in_fragmentation.isdisjoint(match) - ) - - if all_atoms_of_new_match_are_unassigned: - if not SMARTS in fragmentation: - fragmentation[SMARTS] = [] - - fragmentation[SMARTS].append(match) - atomIdxs_included_in_fragmentation.update(match) - - return fragmentation, atomIdxs_included_in_fragmentation - - @classmethod - def __clean_molecule_surrounding_unmatched_atoms( - cls, mol_searched_in, fragmentation, atomIdxs_included_in_fragmentation, level - ): - - for i in range(0, level): - - atoms_missing = set( - range(0, Fragmenter.get_heavy_atom_count(mol_searched_in)) - ).difference(atomIdxs_included_in_fragmentation) - - new_fragmentation = marshal.loads(marshal.dumps(fragmentation)) - - for atomIdx in atoms_missing: - for neighbor in mol_searched_in.GetAtomWithIdx(atomIdx).GetNeighbors(): - for smart, atoms_found in fragmentation.items(): - for atoms in atoms_found: - if neighbor.GetIdx() in atoms: - if smart in new_fragmentation: - if new_fragmentation[smart].count(atoms) > 0: - new_fragmentation[smart].remove(atoms) - - if smart in new_fragmentation: - if len(new_fragmentation[smart]) == 0: - new_fragmentation.pop(smart) - - new_atomIdxs_included_in_fragmentation = set() - for i in new_fragmentation.values(): - for j in i: - new_atomIdxs_included_in_fragmentation.update(j) - - atomIdxs_included_in_fragmentation = new_atomIdxs_included_in_fragmentation - fragmentation = new_fragmentation - - return fragmentation, atomIdxs_included_in_fragmentation - - def __complete_fragmentation(self, mol_SMILES): - - heavy_atom_count = Fragmenter.get_heavy_atom_count(mol_SMILES) - - if heavy_atom_count > self.n_atoms_cuttoff: - return {}, False - - completed_fragmentations = [] - groups_leading_to_incomplete_fragmentations = [] - ( - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - incomplete_fragmentation_found, - ) = self.__get_next_non_overlapping_adjacent_match_recursively( - mol_SMILES, - heavy_atom_count, - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - {}, - set(), - set(), - self.n_max_fragmentations_to_find, - ) - success = len(completed_fragmentations) > 0 - - return completed_fragmentations, success - - def __get_next_non_overlapping_adjacent_match_recursively( - self, - mol_searched_in, - heavy_atom_count, - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - fragmentation_so_far, - atomIdxs_included_in_fragmentation_so_far, - atomIdxs_to_which_new_matches_have_to_be_adjacent, - n_max_fragmentations_to_find=-1, - ): - - n_completed_fragmentations = len(completed_fragmentations) - incomplete_fragmentation_found = False - complete_fragmentation_found = False - - if len(completed_fragmentations) == n_max_fragmentations_to_find: - return ( - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - incomplete_fragmentation_found, - ) - - for group_number in self.fragmentation_scheme_order: - list_SMARTS = self.fragmentation_scheme[group_number] - - if complete_fragmentation_found: - break - - if type(list_SMARTS) is not list: - list_SMARTS 
= [list_SMARTS] - - for SMARTS in list_SMARTS: - if complete_fragmentation_found: - break - - if SMARTS != "": - matches = Fragmenter.get_substruct_matches( - self._fragmentation_scheme_pattern_lookup[SMARTS], - mol_searched_in, - atomIdxs_included_in_fragmentation_so_far, - ) - - for match in matches: - - # only allow non-overlapping matches - all_atoms_are_unassigned = ( - atomIdxs_included_in_fragmentation_so_far.isdisjoint(match) - ) - if not all_atoms_are_unassigned: - continue - - # only allow matches that do not contain groups leading to incomplete matches - for ( - groups_leading_to_incomplete_fragmentation - ) in groups_leading_to_incomplete_fragmentations: - if Fragmenter.__is_fragmentation_subset_of_other_fragmentation( - groups_leading_to_incomplete_fragmentation, - fragmentation_so_far, - ): - return ( - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - incomplete_fragmentation_found, - ) - - # only allow matches that will lead to new fragmentations - use_this_match = True - n_found_groups = len(fragmentation_so_far) - - for completed_fragmentation in completed_fragmentations: - - if not SMARTS in completed_fragmentation: - continue - - if n_found_groups == 0: - use_this_match = not Fragmenter.__is_match_contained_in_fragmentation( - match, SMARTS, completed_fragmentation - ) - else: - if Fragmenter.__is_fragmentation_subset_of_other_fragmentation( - fragmentation_so_far, completed_fragmentation - ): - use_this_match = not Fragmenter.__is_match_contained_in_fragmentation( - match, SMARTS, completed_fragmentation - ) - - if not use_this_match: - break - - if not use_this_match: - continue - - # make a deepcopy here, otherwise the variables are modified down the road - # marshal is used here because it works faster than copy.deepcopy - this_SMARTS_fragmentation_so_far = marshal.loads( - marshal.dumps(fragmentation_so_far) - ) - this_SMARTS_atomIdxs_included_in_fragmentation_so_far = ( - atomIdxs_included_in_fragmentation_so_far.copy() - ) - - if not SMARTS in this_SMARTS_fragmentation_so_far: - this_SMARTS_fragmentation_so_far[SMARTS] = [] - - this_SMARTS_fragmentation_so_far[SMARTS].append(match) - this_SMARTS_atomIdxs_included_in_fragmentation_so_far.update( - match - ) - - # only allow matches that do not contain groups leading to incomplete matches - for ( - groups_leading_to_incomplete_match - ) in groups_leading_to_incomplete_fragmentations: - if Fragmenter.__is_fragmentation_subset_of_other_fragmentation( - groups_leading_to_incomplete_match, - this_SMARTS_fragmentation_so_far, - ): - use_this_match = False - break - - if not use_this_match: - continue - - # if the complete molecule has not been fragmented, continue to do so - if ( - len(this_SMARTS_atomIdxs_included_in_fragmentation_so_far) - < heavy_atom_count - ): - ( - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - incomplete_fragmentation_found, - ) = self.__get_next_non_overlapping_adjacent_match_recursively( - mol_searched_in, - heavy_atom_count, - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - this_SMARTS_fragmentation_so_far, - this_SMARTS_atomIdxs_included_in_fragmentation_so_far, - this_SMARTS_atomIdxs_included_in_fragmentation_so_far, - n_max_fragmentations_to_find, - ) - break - - # if the complete molecule has been fragmented, save and return - if ( - len(this_SMARTS_atomIdxs_included_in_fragmentation_so_far) - == heavy_atom_count - ): - completed_fragmentations.append( - this_SMARTS_fragmentation_so_far - ) - 
complete_fragmentation_found = True - break - - # if until here no new fragmentation was found check whether an incomplete fragmentation was found - if n_completed_fragmentations == len(completed_fragmentations): - - if not incomplete_fragmentation_found: - - incomplete_matched_groups = {} - - if len(atomIdxs_included_in_fragmentation_so_far) > 0: - unassignes_atom_idx = set(range(0, heavy_atom_count)).difference( - atomIdxs_included_in_fragmentation_so_far - ) - for atom_idx in unassignes_atom_idx: - neighbor_atoms_idx = [ - i.GetIdx() - for i in mol_searched_in.GetAtomWithIdx( - atom_idx - ).GetNeighbors() - ] - - for neighbor_atom_idx in neighbor_atoms_idx: - for ( - found_smarts, - found_matches, - ) in fragmentation_so_far.items(): - for found_match in found_matches: - if neighbor_atom_idx in found_match: - if ( - not found_smarts - in incomplete_matched_groups - ): - incomplete_matched_groups[found_smarts] = [] - - if ( - found_match - not in incomplete_matched_groups[ - found_smarts - ] - ): - incomplete_matched_groups[ - found_smarts - ].append(found_match) - - is_subset_of_groups_already_found = False - indexes_to_remove = [] - - for idx, groups_leading_to_incomplete_match in enumerate( - groups_leading_to_incomplete_fragmentations - ): - is_subset_of_groups_already_found = ( - Fragmenter.__is_fragmentation_subset_of_other_fragmentation( - incomplete_matched_groups, - groups_leading_to_incomplete_match, - ) - ) - if is_subset_of_groups_already_found: - indexes_to_remove.append(idx) - - for index in sorted(indexes_to_remove, reverse=True): - del groups_leading_to_incomplete_fragmentations[index] - - groups_leading_to_incomplete_fragmentations.append( - incomplete_matched_groups - ) - groups_leading_to_incomplete_fragmentations = sorted( - groups_leading_to_incomplete_fragmentations, key=len - ) - - incomplete_fragmentation_found = True - - return ( - completed_fragmentations, - groups_leading_to_incomplete_fragmentations, - incomplete_fragmentation_found, - ) - - @classmethod - def __is_fragmentation_subset_of_other_fragmentation( - cls, fragmentation, other_fragmentation - ): - n_found_groups = len(fragmentation) - n_found_other_groups = len(other_fragmentation) - - if n_found_groups == 0: - return False - - if n_found_other_groups < n_found_groups: - return False - - n_found_SMARTS_that_are_subset = 0 - for found_SMARTS, _ in fragmentation.items(): - if found_SMARTS in other_fragmentation: - found_matches_set = set( - frozenset(i) for i in fragmentation[found_SMARTS] - ) - found_other_matches_set = set( - frozenset(i) for i in other_fragmentation[found_SMARTS] - ) - - if found_matches_set.issubset(found_other_matches_set): - n_found_SMARTS_that_are_subset += 1 - else: - return False - - return n_found_SMARTS_that_are_subset == n_found_groups - - @classmethod - def __is_match_contained_in_fragmentation(cls, match, SMARTS, fragmentation): - if not SMARTS in fragmentation: - return False - - found_matches_set = set(frozenset(i) for i in fragmentation[SMARTS]) - match_set = set(match) - - return match_set in found_matches_set - - class Project: """the class that contains all method and info to analyze the project (intended as a collection of GCMS files, calibrations, etc) @@ -1635,10 +354,9 @@ class Project: } acceptable_params: list[str] = list(param_to_axis_label.keys()) string_in_deriv_names: list[str] = [ - "deriv.", - "derivative", - "TMS", - "TBDMS", + "deriv", + "tms", + "tbms", "trimethylsilyl", ] string_in_deriv_names = [s.lower() for s in string_in_deriv_names] @@ -2093,20 
+811,22 @@ def load_compounds_properties(self): """Attempts to load the 'compounds_properties.xlsx' file containing physical and chemical properties of compounds. If not found, it creates a new properties DataFrame and updates the 'compounds_properties_created' attribute.""" - try: + compounds_properties_path = plib.Path( + Project.in_path, "compounds_properties.xlsx" + ) + if compounds_properties_path.exists(): cpdf = pd.read_excel( - plib.Path(Project.in_path, "compounds_properties.xlsx"), + compounds_properties_path, index_col="comp_name", ) - cpdf = self._order_columns_in_compounds_properties(cpdf) - cpdf = cpdf.fillna(0) + # cpdf = _order_columns_in_compounds_properties(cpdf) + # cpdf = cpdf.fillna(0) self.compounds_properties = cpdf self.compounds_properties_created = True print("Info: compounds_properties loaded") - except FileNotFoundError: + else: print("Warning: compounds_properties.xlsx not found, creating it") cpdf = self.create_compounds_properties() - return self.compounds_properties def load_deriv_compounds_properties(self): @@ -2114,17 +834,20 @@ def load_deriv_compounds_properties(self): for derivatized compounds. If not found, it creates a new properties DataFrame for derivatized compounds and updates the 'deriv_compounds_properties_created' attribute. """ - try: + compounds_deriv_properties_path = plib.Path( + Project.in_path, "deriv_compounds_properties.xlsx" + ) + if compounds_deriv_properties_path.exists(): dcpdf = pd.read_excel( - plib.Path(Project.in_path, "deriv_compounds_properties.xlsx"), + compounds_deriv_properties_path, index_col="comp_name", ) - dcpdf = self._order_columns_in_compounds_properties(dcpdf) - dcpdf = dcpdf.fillna(0) + # dcpdf = _order_columns_in_compounds_properties(dcpdf) + # cpdf = dcpdf.fillna(0) self.deriv_compounds_properties = dcpdf self.deriv_compounds_properties_created = True print("Info: deriv_compounds_properties loaded") - except FileNotFoundError: + else: print("Warning: deriv_compounds_properties.xlsx not found, creating it") dcpdf = self.create_deriv_compounds_properties() return self.deriv_compounds_properties @@ -2139,18 +862,20 @@ def create_compounds_properties(self): self.load_class_code_frac() if not self.list_of_all_compounds_created: self.create_list_of_all_compounds() - cpdf = pd.DataFrame(index=pd.Index(self.list_of_all_compounds)) - cpdf.index.name = "comp_name" + # cpdf = pd.DataFrame(index=pd.Index(self.list_of_all_compounds)) + # + cpdf = pd.DataFrame() print("Info: create_compounds_properties: looping over names") - for name in cpdf.index: + for name in self.list_of_all_compounds: cpdf = name_to_properties( - name, - cpdf, - self.dict_classes_to_codes, - self.dict_classes_to_mass_fractions, + comp_name=name, + dict_classes_to_codes=self.dict_classes_to_codes, + dict_classes_to_mass_fractions=self.dict_classes_to_mass_fractions, + df=cpdf, ) - cpdf = self._order_columns_in_compounds_properties(cpdf) - cpdf = cpdf.fillna(0) + # cpdf = self._order_columns_in_compounds_properties(cpdf) + # cpdf = cpdf.fillna(0) + cpdf.index.name = "comp_name" self.compounds_properties = cpdf self.compounds_properties_created = True # save db in the project folder in the input @@ -2169,82 +894,54 @@ def create_deriv_compounds_properties(self): self.load_class_code_frac() if not self.list_of_all_deriv_compounds_created: self.create_list_of_all_deriv_compounds() - - old_unique_deriv_compounds = self.list_of_all_deriv_compounds - # unique_underiv_compounds = [ - # ",".join(name.split(",")[:-1]) for name in unique_deriv_compounds - # ] - 
unique_deriv_compounds = [] - unique_underiv_compounds = [] - for name in old_unique_deriv_compounds: - underiv_name = ",".join(name.split(",")[:-1]) - deriv_string = name.split(",")[-1] - if underiv_name == "": - underiv_name = name + deriv_to_underiv = {} + for derivname in self.list_of_all_deriv_compounds: + parts = derivname.split(",") + is_der_str_in_part2: bool = any( + [ + der_str in parts[-1].strip() + for der_str in Project.string_in_deriv_names + ] + ) + if len(parts) > 1 and is_der_str_in_part2: + # If the suffix is a known derivatization, use the part before the comma + deriv_to_underiv[derivname] = ",".join(parts[:-1]) else: - if any([der in deriv_string for der in Project.string_in_deriv_names]): - unique_deriv_compounds.append(name) - unique_underiv_compounds.append(underiv_name) - dcpdf = pd.DataFrame(index=pd.Index(unique_underiv_compounds)) - dcpdf.index.name = "comp_name" - dcpdf["deriv_comp_name"] = unique_deriv_compounds + # In all other cases, mark as "unidentified" + deriv_to_underiv[derivname] = "unidentified" print("Info: create_deriv_compounds_properties: looping over names") - for name in dcpdf.index: + underiv_comps_to_search_for = [ + c for c in deriv_to_underiv.values() if c != "unidentified" + ] + dcpdf = pd.DataFrame() + for name in underiv_comps_to_search_for: dcpdf = name_to_properties( - name, - dcpdf, - self.dict_classes_to_codes, - self.dict_classes_to_mass_fractions, + comp_name=name, + dict_classes_to_codes=self.dict_classes_to_codes, + dict_classes_to_mass_fractions=self.dict_classes_to_mass_fractions, + df=dcpdf, ) - # remove duplicates that may come from the "made up" name in calibration - # dcpdf = dcpdf.drop_duplicates(subset='iupac_name') - dcpdf["underiv_comp_name"] = dcpdf.index - dcpdf.set_index("deriv_comp_name", inplace=True) - dcpdf.index.name = "comp_name" - dcpdf = self._order_columns_in_compounds_properties(dcpdf) - dcpdf = dcpdf.fillna(0) + dcpdf.index.name = "underiv_comp_name" + dcpdf.reset_index(inplace=True) + underiv_to_deriv = { + v: k for k, v in deriv_to_underiv.items() if v != "unidentified" + } + # Add a new column for the derivatized compound names + # If a name is not in the underiv_to_deriv (thus 'unidentified'), it will get a value of NaN + + dcpdf["comp_name"] = dcpdf["underiv_comp_name"].apply( + lambda x: underiv_to_deriv.get(x, "unidentified") + ) + dcpdf.set_index("comp_name", inplace=True) # save db in the project folder in the input self.deriv_compounds_properties = dcpdf dcpdf.to_excel(plib.Path(Project.in_path, "deriv_compounds_properties.xlsx")) self.compounds_properties_created = True print( - "Info: create_deriv_compounds_properties:" - + "deriv_compounds_properties created and saved" + "Info: create_deriv_compounds_properties: deriv_compounds_properties created and saved" ) return self.deriv_compounds_properties - def _order_columns_in_compounds_properties(self, comp_df): - ord_cols1, ord_cols2, ord_cols3, ord_cols4, ord_cols5, ord_cols6 = ( - [], - [], - [], - [], - [], - [], - ) - for c in comp_df.columns: - if not c.startswith(("el_", "fg_")): - ord_cols1.append(c) - elif c.startswith("el_mf"): - ord_cols3.append(c) - elif c.startswith("el_"): - ord_cols2.append(c) - elif c.startswith("fg_mf_total"): - ord_cols6.append(c) - elif c.startswith("fg_mf"): - ord_cols5.append(c) - elif c.startswith("fg_"): - ord_cols4.append(c) - comp_df = comp_df[ - ord_cols1 - + sorted(ord_cols2) - + sorted(ord_cols3) - + sorted(ord_cols4) - + sorted(ord_cols5) - + sorted(ord_cols6) - ] - return comp_df - # def 
add_iupac_to_calibrations(self): # """Adds the IUPAC name to each compound in the calibration data, # istinguishing between underivatized and derivatized calibrations, @@ -2862,128 +1559,6 @@ def create_samples_param_aggrrep(self, param: str = "conc_vial_mg_L"): self.save_samples_param_aggrrep(param=param) return self.samples_aggrreps[param], self.samples_aggrreps_std[param] - # def create_samples_param_report(self, param="conc_vial_mg_L"): - # """Creates a detailed report for each parameter across all SAMPLES, - # displaying the concentration of each compound in each sample. - # This report aids in the analysis and comparison of compound - # concentrations across SAMPLES.""" - # print("Info: create_param_report: ", param) - # if param not in Project.acceptable_params: - # raise ValueError(f"{param = } is not an acceptable param") - # if not self.samples_created: - # self.create_samples_from_files() - # _all_comps = self.compounds_properties["iupac_name"].tolist() - # if self.deriv_files_present: - # _all_comps += self.deriv_compounds_properties["iupac_name"].tolist() - # rep = pd.DataFrame( - # index=list(set(_all_comps)), - # columns=list(self.samples_info.index), - # dtype="float", - # ) - # rep_std = pd.DataFrame( - # index=list(set(_all_comps)), - # columns=list(self.samples_info.index), - # dtype="float", - # ) - # rep.index.name, rep_std.index.name = param, param - - # for comp in rep.index.tolist(): # add conc values - # for samplename in rep.columns.tolist(): - # smp = self.samples[samplename].set_index("iupac_name") - # try: - # ave = smp.loc[comp, param] - # except KeyError: - # ave = 0 - # smp_std = self.samples_std[samplename].set_index("iupac_name") - # try: - # std = smp_std.loc[comp, param] - # except KeyError: - # std = np.nan - # rep.loc[comp, samplename] = ave - # rep_std.loc[comp, samplename] = std - - # rep = rep.sort_index(key=rep.max(1).get, ascending=False) - # rep = rep.loc[:, rep.any(axis=0)] # drop columns with only 0s - # rep = rep.loc[rep.any(axis=1), :] # drop rows with only 0s - # rep_std = rep_std.reindex(rep.index) - # self.samples_reports[param] = rep - # self.samples_reports_std[param] = rep_std - # self.list_of_samples_param_reports.append(param) - # if Project.auto_save_to_excel: - # self.save_samples_param_report(param=param) - # return rep, rep_std - - # def create_samples_param_aggrrep(self, param="conc_vial_mg_L"): - # """Aggregates compound concentration data by functional group for each - # parameter across all SAMPLES, providing a summarized view of functional - # group concentrations. 
This aggregation facilitates the understanding - # of functional group distribution across SAMPLES.""" - # print("Info: create_param_aggrrep: ", param) - # if param not in Project.acceptable_params: - # raise ValueError(f"{param = } is not an acceptable param") - # if param not in self.list_of_samples_param_reports: - # self.create_samples_param_report(param) - # # fg = functional groups, mf = mass fraction - # samplenames = self.samples_info.index.tolist() - # _all_comps = self.samples_reports[param].index.tolist() - # cols_with_fg_mf_labs = list(self.compounds_properties) - # if self.deriv_files_present: - # for c in list(self.deriv_compounds_properties): - # if c not in cols_with_fg_mf_labs: - # cols_with_fg_mf_labs.append(c) - # fg_mf_labs = [ - # c - # for c in cols_with_fg_mf_labs - # if c.startswith("fg_mf_") - # if c != "fg_mf_total" - # ] - # fg_labs = [c[6:] for c in fg_mf_labs] - # # create a df with iupac name index and fg_mf columns (underiv and deriv) - # comps_df = self.compounds_properties.set_index("iupac_name") - # if self.deriv_files_present: - # deriv_comps_df = self.deriv_compounds_properties.set_index("iupac_name") - # all_comps_df = pd.concat([comps_df, deriv_comps_df]) - # else: - # all_comps_df = comps_df - # all_comps_df = all_comps_df[~all_comps_df.index.duplicated(keep="first")] - # fg_mf_all = pd.DataFrame(index=_all_comps, columns=fg_mf_labs) - # for idx in fg_mf_all.index.tolist(): - # fg_mf_all.loc[idx, fg_mf_labs] = all_comps_df.loc[idx, fg_mf_labs] - # # create the aggregated dataframes and compute aggregated results - # aggrrep = pd.DataFrame(columns=samplenames, index=fg_labs, dtype="float") - # aggrrep.index.name = param # is the parameter - # aggrrep.fillna(0, inplace=True) - # aggrrep_std = pd.DataFrame(columns=samplenames, index=fg_labs, dtype="float") - # aggrrep_std.index.name = param # is the parameter - # aggrrep_std.fillna(0, inplace=True) - # for col in samplenames: - # list_iupac = self.samples_reports[param].index - # signal = self.samples_reports[param].loc[:, col].values - # signal_std = self.samples_reports_std[param].loc[:, col].values - # for fg, fg_mf in zip(fg_labs, fg_mf_labs): - # # each compound contributes to the cumulative sum of each - # # functional group for the based on the mass fraction it has - # # of that functional group (fg_mf act as weights) - # # if fg_mf in subrep: multiply signal for weight and sum - # # to get aggregated - # weights = fg_mf_all.loc[list_iupac, fg_mf].astype(signal.dtype) - - # aggrrep.loc[fg, col] = (signal * weights).sum() - # aggrrep_std.loc[fg, col] = (signal_std * weights).sum() - # aggrrep = aggrrep.loc[(aggrrep != 0).any(axis=1), :] # drop rows with only 0 - # aggrrep_std = aggrrep_std.reindex(aggrrep.index) - # aggrrep = aggrrep.sort_index( - # key=aggrrep[samplenames].max(1).get, ascending=False - # ) - # aggrrep_std = aggrrep_std.reindex(aggrrep.index) - - # self.samples_aggrreps[param] = aggrrep - # self.samples_aggrreps_std[param] = aggrrep_std - # self.list_of_samples_param_aggrreps.append(param) - # if Project.auto_save_to_excel: - # self.save_samples_param_aggrrep(param=param) - # return aggrrep, aggrrep_std - def save_files_info(self): """Saves the 'files_info' DataFrame as an Excel file in a 'files' subfolder within the project's output path, @@ -3081,366 +1656,3 @@ def save_samples_param_aggrrep(self, param="conc_inj_mg_L"): plib.Path(out_path, name + "_std.xlsx") ) print("Info: save_samples_param_aggrrep: ", name, " saved") - - def plot_ave_std( - self, - filename: str = "plot", - 
files_or_samples: str = "samples", - param: str = "conc_vial_mg_L", - aggr: bool = False, - min_y_thresh: float | None = None, - only_samples_to_plot: list[str] = None, - rename_samples: list[str] = None, - reorder_samples: list[str] = None, - item_to_color_to_hatch: pd.DataFrame | None = None, - paper_col=0.8, - fig_hgt_mlt=1.5, - xlab_rot=0, - annotate_outliers=True, - color_palette="deep", - y_lab=None, - y_lim=None, - y_ticks=None, - yt_sum=False, - yt_lim=None, - yt_lab=None, - yt_ticks=None, - yt_sum_label="total\n(right axis)", - legend_location="best", - legend_columns=1, - legend_x_anchor=1, - legend_y_anchor=1.02, - legend_labelspacing=0.5, - annotate_lttrs=False, - note_plt=None, - ): - """ - Generates a bar plot displaying average values with optional standard deviation - bars for a specified parameter from either files or samples. This function allows - for detailed customization of the plot, including aggregation by functional groups, - filtering based on minimum thresholds, renaming and reordering samples, and applying - specific color schemes and hatching patterns to items. - Additionally, it supports adjusting plot aesthetics such as size, figure height multiplier, - x-label rotation, and outlier annotation. The plot can include a secondary y-axis - to display the sum of values, with customizable limits, labels, ticks, and sum label. - The legend can be placed inside or outside the plot area, with adjustable location, - columns, anchor points, and label spacing. An optional note can be added to the plot - for additional context. - - Parameters: - - filename (str): Name for the output plot file. Default is 'plot'. - - files_or_samples (str): Specifies whether to plot data from 'files' - or 'samples'. Default is 'samples'. - - param (str): The parameter to plot, such as 'conc_vial_mg_L'. - Default is 'conc_vial_mg_L'. - - aggr (bool): Boolean indicating whether to aggregate data by functional groups. - Default is False, meaning no aggregation. - - min_y_thresh (float, optional): Minimum y-value threshold for including data in the plot. - Default is None, including all data. - - only_samples_to_plot (list, optional): List of samples to include in the plot. - Default is None, including all samples. - - rename_samples (dict, optional): Dictionary to rename samples in the plot. - Default is None, using original names. - - reorder_samples (list, optional): List specifying the order of samples in the plot. - Default is None, using original order. - - item_to_color_to_hatch (DataFrame, optional): DataFrame mapping items to specific colors and hatching patterns. - Default is None, using default colors and no hatching. - - paper_col (float): Background color of the plot area. Default is .8, a light grey. - - fig_hgt_mlt (float): Multiplier for the figure height to adjust plot size. Default is 1.5. - - xlab_rot (int): Rotation angle for x-axis labels. Default is 0, meaning no rotation. - - annotate_outliers (bool): Boolean indicating whether to annotate outliers exceeding y_lim. - Default is True. - - color_palette (str): Color palette for the plot. Default is 'deep'. - - y_lab (str, optional): Label for the y-axis. Default is None, using parameter name as label. - - y_lim (tuple[float, float], optional): Limits for the y-axis. Default is None, automatically determined. - - y_ticks (list[float], optional): Custom tick marks for the y-axis. Default is None, automatically determined. - - yt_sum (bool): Boolean indicating whether to display a sum on a secondary y-axis. Default is False. 
- - yt_lim (tuple[float, float], optional): Limits for the secondary y-axis. Default is None, automatically determined. - - yt_lab (str, optional): Label for the secondary y-axis. Default is None, using parameter name as label. - - yt_ticks (list[float], optional): Custom tick marks for the secondary y-axis. Default is None, automatically determined. - - yt_sum_label (str): Label for the sum on the secondary y-axis. Default is 'total (right axis)'. - - legend_location (str): Location of the legend within or outside the plot area. Default is 'best'. - - legend_columns (int): Number of columns in the legend. Default is 1. - - legend_x_anchor (float): X-anchor for the legend when placed outside the plot area. Default is 1. - - legend_y_anchor (float): Y-anchor for the legend when placed outside the plot area. Default is 1.02. - - legend_labelspacing (float): Spacing between labels in the legend. Default is 0.5. - - annotate_lttrs (bool): Boolean indicating whether to annotate letters for statistical significance. Default is False. - - note_plt (str, optional): Optional note to add to the plot for additional context. Default is None. - - - """ - - # create folder where Plots are stored - out_path = plib.Path(Project.out_path, "plots") - out_path.mkdir(parents=True, exist_ok=True) - if not aggr: # then use compounds reports - if files_or_samples == "files": - df_ave = self.files_reports[param].T - df_std = pd.DataFrame() - elif files_or_samples == "samples": - df_ave = self.samples_reports[param].T - df_std = self.samples_reports_std[param].T - else: # use aggregated reports - if files_or_samples == "files": - df_ave = self.files_aggrreps[param].T - df_std = pd.DataFrame() - elif files_or_samples == "samples": - df_ave = self.samples_aggrreps[param].T - df_std = self.samples_aggrreps_std[param].T - - if only_samples_to_plot is not None: - df_ave = df_ave.loc[only_samples_to_plot, :].copy() - if files_or_samples == "samples": - df_std = df_std.loc[only_samples_to_plot, :].copy() - - if rename_samples is not None: - df_ave.index = rename_samples - if files_or_samples == "samples": - df_std.index = rename_samples - - if reorder_samples is not None: - filtered_reorder_samples = [ - idx for idx in reorder_samples if idx in df_ave.index - ] - df_ave = df_ave.reindex(filtered_reorder_samples) - if files_or_samples == "samples": - df_std = df_std.reindex(filtered_reorder_samples) - - if min_y_thresh is not None: - df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() - if files_or_samples == "samples": - df_std = df_std.loc[:, df_ave.columns].copy() - - if item_to_color_to_hatch is not None: # specific color and hatches to each fg - colors = [ - item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns - ] - htchs = [ - item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns - ] - else: # no specific colors and hatches specified - colors = sns.color_palette(color_palette, df_ave.shape[1]) - htchs = ( - None, - "//", - "...", - "--", - "O", - "\\\\", - "oo", - "\\\\\\", - "/////", - ".....", - "//", - "...", - "--", - "O", - "\\\\", - "oo", - "\\\\\\", - "/////", - ".....", - "//", - "...", - "--", - "O", - "\\\\", - "oo", - "\\\\\\", - "/////", - ".....", - "//", - "...", - "--", - "O", - "\\\\", - "oo", - "\\\\\\", - "/////", - ".....", - ) - if yt_sum: - plot_type = 1 - else: - plot_type = 0 - - fig, ax, axt, fig_par = figure_create( - rows=1, - cols=1, - plot_type=plot_type, - paper_col=paper_col, - hgt_mltp=fig_hgt_mlt, - font=Project.plot_font, - ) - if 
df_std.isna().all().all() or df_std.empty: # means that no std is provided - df_ave.plot( - ax=ax[0], - kind="bar", - rot=xlab_rot, - width=0.9, - edgecolor="k", - legend=False, - capsize=3, - color=colors, - ) - bars = ax[0].patches # needed to add patches to the bars - n_different_hatches = int(len(bars) / df_ave.shape[0]) - else: # no legend is represented but non-significant values are shaded - mask = (df_ave.abs() > df_std.abs()) | df_std.isna() - - df_ave[mask].plot( - ax=ax[0], - kind="bar", - rot=xlab_rot, - width=0.9, - edgecolor="k", - legend=False, - yerr=df_std[mask], - capsize=3, - color=colors, - label="_nolegend", - ) - df_ave[~mask].plot( - ax=ax[0], - kind="bar", - rot=xlab_rot, - width=0.9, - legend=False, - edgecolor="grey", - color=colors, - alpha=0.5, - label="_nolegend", - ) - bars = ax[0].patches # needed to add patches to the bars - n_different_hatches = int(len(bars) / df_ave.shape[0] / 2) - if yt_sum: - axt[0].scatter( - df_ave.index, - df_ave.sum(axis=1).values, - color="k", - linestyle="None", - edgecolor="k", - facecolor="grey", - s=100, - label=yt_sum_label, - alpha=0.5, - ) - if not df_std.empty: - axt[0].errorbar( - df_ave.index, - df_ave.sum(axis=1).values, - df_std.sum(axis=1).values, - capsize=3, - linestyle="None", - color="grey", - ecolor="k", - ) - bar_htchs = [] - # get a list with the htchs - for h in htchs[:n_different_hatches] + htchs[:n_different_hatches]: - for n in range(df_ave.shape[0]): # htcs repeated for samples - bar_htchs.append(h) # append based on samples number - for bar, hatch in zip(bars, bar_htchs): # assign htchs to each bar - bar.set_hatch(hatch) - ax[0].set(xlabel=None) - if y_lab is None: - y_lab = Project.param_to_axis_label[param] - if yt_sum: - legend_x_anchor += 0.14 - yt_lab = y_lab - if xlab_rot != 0: - ax[0].set_xticklabels( - df_ave.index, rotation=xlab_rot, ha="right", rotation_mode="anchor" - ) - if legend_location is not None: - hnd_ax, lab_ax = ax[0].get_legend_handles_labels() - if not df_std.empty: - hnd_ax = hnd_ax[: len(hnd_ax) // 2] - lab_ax = lab_ax[: len(lab_ax) // 2] - if legend_labelspacing > 0.5: # large legend spacing for molecules - ax[0].plot(np.nan, np.nan, "-", color="None", label=" ") - hhhh, aaaa = ax[0].get_legend_handles_labels() - hnd_ax.append(hhhh[0]) - lab_ax.append(aaaa[0]) - if yt_sum: - hnd_axt, lab_axt = axt[0].get_legend_handles_labels() - else: - hnd_axt, lab_axt = [], [] - if legend_location == "outside": # legend goes outside of plot area - ax[0].legend( - hnd_ax + hnd_axt, - lab_ax + lab_axt, - loc="upper left", - ncol=legend_columns, - bbox_to_anchor=(legend_x_anchor, legend_y_anchor), - labelspacing=legend_labelspacing, - ) - else: # legend is inside of plot area - ax[0].legend( - hnd_ax + hnd_axt, - lab_ax + lab_axt, - loc=legend_location, - ncol=legend_columns, - labelspacing=legend_labelspacing, - ) - # annotate ave+-std at the top of outliers bar (exceeding y_lim) - if annotate_outliers and (y_lim is not None): # and (not df_std.empty): - _annotate_outliers_in_plot(ax[0], df_ave, df_std, y_lim) - if note_plt: - ax[0].annotate( - note_plt, - ha="left", - va="bottom", - xycoords="axes fraction", - xy=(0.005, 0.945 + fig_hgt_mlt / 100), - ) - figure_save( - filename, - out_path, - fig, - ax, - axt, - fig_par, - y_lab=y_lab, - yt_lab=yt_lab, - y_lim=y_lim, - yt_lim=yt_lim, - legend=False, - y_ticks=y_ticks, - yt_ticks=yt_ticks, - tight_layout=True, - annotate_lttrs=annotate_lttrs, - grid=Project.plot_grid, - ) - - -# %% diff --git a/src/gcms_data_analysis/plotting.py 
b/src/gcms_data_analysis/plotting.py new file mode 100644 index 0000000..8ecee68 --- /dev/null +++ b/src/gcms_data_analysis/plotting.py @@ -0,0 +1,1249 @@ +from __future__ import annotations +from typing import Literal, Any, Dict +import string +import pathlib as plib +import pandas as pd +import seaborn as sns +import numpy as np +from matplotlib.transforms import blended_transform_factory +from matplotlib.figure import Figure +from matplotlib.axes import Axes +import matplotlib.pyplot as plt +from gcms_data_analysis.main import Project + + +lttrs: list[str] = list(string.ascii_lowercase) + +# list with colors +clrs: list[tuple] = sns.color_palette("deep", 30) + +# list with linestyles for plotting +lnstls: list[tuple] = [ + (0, ()), # solid + (0, (1, 1)), # 'densely dotted' + (0, (5, 1)), # 'densely dashed' + (0, (3, 1, 1, 1)), # 'densely dashdotted' + (0, (3, 1, 1, 1, 1, 1)), # 'densely dashdotdotted' + (0, (5, 5)), # 'dashed' + (0, (3, 5, 1, 5)), # 'dashdotted' + (0, (1, 5)), # dotted + (0, (3, 5, 1, 5, 1, 5)), # 'dashdotdotted' + (0, (1, 10)), # 'loosely dotted' + (0, (5, 10)), # 'loosely dashed' + (0, (3, 10, 1, 10)), # 'loosely dashdotted' + (0, (3, 10, 1, 10, 1, 10)), + (0, ()), # solid + (0, (1, 1)), # 'densely dotted' + (0, (5, 1)), # 'densely dashed' + (0, (3, 1, 1, 1)), # 'densely dashdotted' + (0, (3, 1, 1, 1, 1, 1)), # 'densely dashdotdotted' + (0, (5, 5)), # 'dashed' + (0, (3, 5, 1, 5)), # 'dashdotted' + (0, (1, 5)), # dotted + (0, (3, 5, 1, 5, 1, 5)), # 'dashdotdotted' + (0, (1, 10)), # 'loosely dotted' + (0, (5, 10)), # 'loosely dashed' + (0, (3, 10, 1, 10)), # 'loosely dashdotted' + (0, (3, 10, 1, 10, 1, 10)), +] # 'loosely dashdotdotted' + +# list with markers for plotting +mrkrs: list[str] = [ + "o", + "v", + "X", + "s", + "p", + "^", + "P", + "<", + ">", + "*", + "d", + "1", + "2", + "3", + "o", + "v", + "X", + "s", + "p", + "^", + "P", + "<", + ">", + "*", + "d", + "1", + "2", + "3", +] + +htchs: list[str] = [ + None, + "//", + "...", + "--", + "O", + "\\\\", + "oo", + "\\\\\\", + "/////", + ".....", + "//", + "...", + "--", + "O", + "\\\\", + "oo", + "\\\\\\", + "/////", + ".....", + "//", + "...", + "--", + "O", + "\\\\", + "oo", + "\\\\\\", + "/////", + ".....", + "//", + "...", + "--", + "O", + "\\\\", + "oo", + "\\\\\\", + "/////", + ".....", +] + + +def _annotate_outliers_in_plot(ax, df_ave, df_std, y_lim): + """ + Annotates the bars in a bar plot with their average value and standard + deviation if these values exceed the specified y-axis limits. + The function iterates over the bars in the plot and checks if their average + values, considering their standard deviations, are outside the provided + y-axis limits. For such bars, it annotates the average and standard + deviation on the + plot, using a specific format for better visualization and understanding. + + Parameters + ---------- + ax : matplotlib.axes.Axes + The matplotlib Axes object where the plot is drawn. + df_ave : pandas.DataFrame + DataFrame containing the average values used in the plot. + df_std : pandas.DataFrame + DataFrame containing the standard deviation values corresponding + to df_ave. + y_lim : list of [float, float] + A list of two floats representing the minimum (y_lim[0]) and + maximum (y_lim[1]) limits of the y-axis. + + Returns + ------- + None + Modifies the provided Axes object (ax) by adding annotations. 
+ + """ + dx = 0.15 * len(df_ave.index) + dy = 0.04 + tform = blended_transform_factory(ax.transData, ax.transAxes) + dfao = pd.DataFrame(columns=["H/L", "xpos", "ypos", "ave", "std", "text"]) + dfao["ave"] = df_ave.transpose().to_numpy().flatten().tolist() + if df_std.empty: + df_std = np.zeros(len(dfao["ave"])) + else: + dfao["std"] = df_std.transpose().to_numpy().flatten().tolist() + try: + dfao["xpos"] = [p.get_x() + p.get_width() / 2 for p in ax.patches] + except ValueError: # otherwise the masking adds twice the columns + dfao["xpos"] = [ + p.get_x() + p.get_width() / 2 for p in ax.patches[: len(ax.patches) // 2] + ] + cond = (dfao["ave"] < y_lim[0]) | (dfao["ave"] > y_lim[1]) + dfao = dfao.drop(dfao[~cond].index) + for ao in dfao.index.tolist(): # loop through bars + if dfao.loc[ao, "ave"] == float("inf"): + dfao.loc[ao, "text"] = "inf" + dfao.loc[ao, "H/L"] = "H" + elif dfao.loc[ao, "ave"] == float("-inf"): + dfao.loc[ao, "text"] = "-inf" + dfao.loc[ao, "H/L"] = "L" + elif dfao.loc[ao, "ave"] > y_lim[1]: + dfao.loc[ao, "H/L"] = "H" + dfao.loc[ao, "text"] = "{:.2f}".format( + round(dfao.loc[ao, "ave"], 2) + ).strip() + if (dfao.loc[ao, "std"] != 0) & (~np.isnan(dfao.loc[ao, "std"])): + dfao.loc[ao, "text"] += r"$\pm$" + "{:.2f}".format( + round(dfao.loc[ao, "std"], 2) + ) + elif dfao.loc[ao, "ave"] < y_lim[0]: + dfao.loc[ao, "H/L"] = "L" + dfao.loc[ao, "text"] = str(round(dfao.loc[ao, "ave"], 2)).strip() + if dfao.loc[ao, "std"] != 0: + dfao.loc[ao, "text"] += r"$\pm$" + "{:.2f}".format( + round(dfao.loc[ao, "std"], 2) + ) + else: + print("Something is wrong", dfao.loc[ao, "ave"]) + for hl, ypos, dy in zip(["L", "H"], [0.02, 0.98], [0.04, -0.04]): + dfao1 = dfao[dfao["H/L"] == hl] + dfao1["ypos"] = ypos + if not dfao1.empty: + dfao1 = dfao1.sort_values("xpos", ascending=True) + dfao1["diffx"] = ( + np.diff(dfao1["xpos"].values, prepend=dfao1["xpos"].values[0]) < dx + ) + dfao1.reset_index(inplace=True) + + for i in dfao1.index.tolist()[1:]: + dfao1.loc[i, "ypos"] = ypos + for e in range(i, 0, -1): + if dfao1.loc[e, "diffx"]: + dfao1.loc[e, "ypos"] += dy + else: + break + for ao in dfao1.index.tolist(): + ax.annotate( + dfao1.loc[ao, "text"], + xy=(dfao1.loc[ao, "xpos"], 0), + xycoords=tform, + textcoords=tform, + xytext=(dfao1.loc[ao, "xpos"], dfao1.loc[ao, "ypos"]), + fontsize=9, + ha="center", + va="center", + bbox={ + "boxstyle": "square,pad=0", + "edgecolor": None, + "facecolor": "white", + "alpha": 0.7, + }, + ) + + +class MyFigure: + """ + A class for creating and customizing figures using matplotlib and seaborn. + + MyFigure provides a structured way to create figures with multiple subplots, + allowing for detailed customization of each subplot. It supports features like + adjusting axis limits, adding legends, annotating, and creating inset plots, + all with an emphasis on easy configurability through keyword arguments. + + :ivar broad_props: A dictionary to store properties that are broadcasted across all axes. + :type broad_props: dict + :ivar kwargs: A dictionary to store all the configuration keyword arguments. + :type kwargs: dict + :ivar fig: The main figure object from matplotlib. + :type fig: matplotlib.figure.Figure + :ivar axs: A list of axes objects corresponding to the subplots in the figure. + :type axs: list[matplotlib.axes.Axes] + :ivar axts: A list of twin axes objects if 'twinx' is enabled, otherwise None. + :type axts: list[matplotlib.axes.Axes] or None + :ivar n_axs: The number of axes/subplots in the figure. 
+ :type n_axs: int + + The class is designed to work seamlessly with seaborn's styling features, + making it suitable for creating publication-quality figures with minimal code. + """ + + @staticmethod + def _adjust_lims(lims: tuple[float] | None, gap=0.05) -> tuple[float] | None: + """ + Adjusts the provided axis limits by a specified gap percentage to add padding + around the data. + + :param lims: _description_ + :type lims: tuple[float] | None + :param gap: _description_, defaults to 0.05 + :type gap: float, optional + :return: _description_ + :rtype: tuple[float] | None + """ + if lims is None: + return None + else: + new_lims = ( + lims[0] * (1 + gap) - gap * lims[1], + lims[1] * (1 + gap) - gap * lims[0], + ) + return new_lims + + def __init__(self, **kwargs: Any) -> None: + """ + Initializes a MyFigure object with custom or default settings for creating plots. + + :param kwargs: Keyword arguments to override default figure settings. + """ + self.broad_props: dict[str, list] = {} # broadcasted properties for each axis + self.kwargs = self.default_kwargs() + self.kwargs.update(kwargs) # Override defaults with any kwargs provided + self.process_kwargs() + + sns.set_palette(self.kwargs["color_palette"]) + sns.set_style( + self.kwargs["sns_style"], {"font.family": self.kwargs["text_font"]} + ) + + self.create_figure() + + self.update_axes_single_props() + + self.update_axes_list_props() + + def default_kwargs(self) -> Dict[str, Any]: + """ + Defines the default settings for the figure. + + :return: A dictionary of default settings. + """ + defaults = { + "rows": 1, + "cols": 1, + "width": 6.0, + "height": 6.0, + "x_lab": None, + "y_lab": None, + "x_lim": None, + "y_lim": None, + "x_ticks": None, + "y_ticks": None, + "x_ticklabels": None, + "y_ticklabels": None, + "twinx": False, + "yt_lab": None, + "yt_lim": None, + "yt_ticks": None, + "yt_ticklabels": None, + "legend": True, + "legend_loc": "best", + "legend_ncols": 1, + "annotate_lttrs": False, + "annotate_lttrs_xy": None, + "grid": False, + "color_palette": "deep", + "text_font": "Dejavu Sans", + "sns_style": "ticks", + } + return defaults + + def process_kwargs(self) -> None: + """ + Validates and processes the provided keyword arguments for figure configuration. + + + :raises ValueError: _description_ + :raises ValueError: _description_ + :raises ValueError: _description_ + :raises ValueError: _description_ + :raises ValueError: _description_ + """ + self.kwargs["rows"] = int(self.kwargs["rows"]) + self.kwargs["cols"] = int(self.kwargs["cols"]) + self.kwargs["width"] = float(self.kwargs["width"]) + self.kwargs["height"] = float(self.kwargs["height"]) + self.kwargs["legend_ncols"] = int(self.kwargs["legend_ncols"]) + + if self.kwargs["rows"] <= 0: + raise ValueError("Number of rows must be positive.") + if self.kwargs["cols"] <= 0: + raise ValueError("Number of cols must be positive.") + if self.kwargs["width"] <= 0: + raise ValueError("Width must be positive.") + if self.kwargs["height"] <= 0: + raise ValueError("Height must be positive.") + if self.kwargs["legend_ncols"] <= 0: + raise ValueError("Number of legend columns must be positive.") + + def create_figure(self) -> MyFigure: + """ + Creates the figure and its axes. 
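Because every aspect of the figure is driven by the kwargs dictionary declared in default_kwargs, a caller only overrides what differs from the defaults; scalar values are broadcast to all subplots while lists are matched per axis. A small usage sketch with made-up labels and data:

from gcms_data_analysis.plotting import MyFigure

# Two side-by-side axes: x_lab is given per axis, y_lab once and broadcast to both
fig = MyFigure(
    rows=1,
    cols=2,
    width=8.0,
    height=4.0,
    x_lab=["time / min", "mass / mg"],
    y_lab="signal / a.u.",
    grid=True,
)
fig.axs[0].plot([0, 1, 2], [0, 1, 4], label="run 1")
fig.axs[1].plot([0, 1, 2], [4, 1, 0], label="run 2")
fig.save_figure(filename="example_myfigure")  # saves a PNG in the current folder by default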
+ + :return: _description_ + :rtype: MyFigure + """ + self.fig: Figure + self.axs: Axes + self.axts: Axes | None + self.fig, axes = plt.subplots( + self.kwargs["rows"], + self.kwargs["cols"], + figsize=(self.kwargs["width"], self.kwargs["height"]), + constrained_layout=True, + ) + # Ensure ax is always an array, even if it's just one subplot + self.axs: list[Axes] = np.atleast_1d(axes).flatten().tolist() + if self.kwargs["twinx"]: + self.axts: list[Axes] = [a.twinx() for a in self.axs] + + self.n_axs = len(self.axs) + return self + + def save_figure( + self, + filename: str = "figure", + out_path: plib.Path | None = plib.Path("."), + tight_layout: bool = True, + save_as_png: bool = True, + save_as_pdf: bool = False, + save_as_svg: bool = False, + save_as_eps: bool = False, + png_transparency: bool = False, + ) -> None: + """_summary_ + + :param filename: _description_, defaults to "figure" + :type filename: str, optional + :param out_path: _description_, defaults to plib.Path(".") + :type out_path: plib.Path | None, optional + :param tight_layout: _description_, defaults to True + :type tight_layout: bool, optional + :param save_as_png: _description_, defaults to True + :type save_as_png: bool, optional + :param save_as_pdf: _description_, defaults to False + :type save_as_pdf: bool, optional + :param save_as_svg: _description_, defaults to False + :type save_as_svg: bool, optional + :param save_as_eps: _description_, defaults to False + :type save_as_eps: bool, optional + :param png_transparency: _description_, defaults to False + :type png_transparency: bool, optional + """ + self.update_axes_single_props() + + self.update_axes_list_props() + + self.add_legend() + try: + self.fig.align_labels() # align labels of subplots, needed only for multi plot + except AttributeError: + print("align_labels not performed") + self.annotate_letters() + # Saving the figure + formats = { + "png": save_as_png, + "pdf": save_as_pdf, + "svg": save_as_svg, + "eps": save_as_eps, + } + + for fmt, should_save in formats.items(): + if should_save: + full_path = plib.Path(out_path, f"{filename}.{fmt}") + self.fig.savefig( + full_path, + dpi=300, + transparent=png_transparency, + bbox_inches="tight" if tight_layout else None, + ) + + def add_legend(self) -> None: + """_summary_""" + for sprop in ["legend", "legend_loc", "legend_ncols"]: + self.broad_props[sprop] = self._broadcast_value_prop( + self.kwargs[sprop], sprop + ) + + if self.kwargs["twinx"] is False: + for i, ax in enumerate(self.axs): + if self.broad_props["legend"][i]: + ax.legend( + loc=self.broad_props["legend_loc"][i], + ncol=self.broad_props["legend_ncols"][i], + ) + else: + for i, (ax, axt) in enumerate(zip(self.axs, self.axts)): + if self.broad_props["legend"][i]: + hnd_ax, lab_ax = ax.get_legend_handles_labels() + hnd_axt, lab_axt = axt.get_legend_handles_labels() + ax.legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc=self.broad_props["legend_loc"][i], + ncol=self.broad_props["legend_ncols"][i], + ) + + def annotate_letters(self) -> None: + """_summary_""" + if ( + self.kwargs["annotate_lttrs_xy"] is not None + and isinstance(self.kwargs["annotate_lttrs_xy"], (list, tuple)) + and len(self.kwargs["annotate_lttrs_xy"]) >= 2 + ): + xylttrs: list | tuple = self.kwargs["annotate_lttrs_xy"] + x_lttrs = xylttrs[0] # pylint: disable=unsubscriptable-object + y_lttrs = xylttrs[1] # pylint: disable=unsubscriptable-object + else: + x_lttrs = -0.15 + y_lttrs = -0.15 + if self.kwargs["annotate_lttrs"] is not False: + if 
isinstance(self.kwargs["annotate_lttrs"], str): + letters_list = [self.kwargs["annotate_lttrs"]] + elif isinstance(self.kwargs["annotate_lttrs"], list, tuple): + letters_list = self.kwargs["annotate_lttrs"] + for i, ax in enumerate(self.axs): + ax.annotate( + f"({letters_list[i]})", + xycoords="axes fraction", + xy=(0, 0), + xytext=(x_lttrs, y_lttrs), + size="large", + weight="bold", + ) + + def create_inset( + self, + ax: Axes, + ins_x_loc: list[float, float], + ins_y_loc: list[float, float], + ins_x_lim: list[float, float], + ins_y_lim: list[float, float], + ) -> Axes: + """_summary_ + + :param ax: _description_ + :type ax: Axes + :param ins_x_loc: _description_ + :type ins_x_loc: list[float, float] + :param ins_y_loc: _description_ + :type ins_y_loc: list[float, float] + :param ins_x_lim: _description_ + :type ins_x_lim: list[float, float] + :param ins_y_lim: _description_ + :type ins_y_lim: list[float, float] + :return: _description_ + :rtype: Axes + """ + wdt = ins_x_loc[1] - ins_x_loc[0] + hgt = ins_y_loc[1] - ins_y_loc[0] + inset = ax.inset_axes([ins_x_loc[0], ins_y_loc[0], wdt, hgt]) + + inset.set_xlim(MyFigure._adjust_lims(ins_x_lim)) + inset.set_ylim(MyFigure._adjust_lims(ins_y_lim)) + return inset + + def update_axes_single_props(self): + """_summary_""" + for sprop in ["x_lab", "y_lab", "yt_lab", "grid"]: + self.broad_props[sprop] = self._broadcast_value_prop( + self.kwargs[sprop], sprop + ) + + # Update each axis with the respective properties + for i, ax in enumerate(self.axs): + ax.set_xlabel(self.broad_props["x_lab"][i]) + ax.set_ylabel(self.broad_props["y_lab"][i]) + if self.broad_props["grid"][i] is not None: + ax.grid(self.broad_props["grid"][i]) + + if self.kwargs["twinx"]: + for i, axt in enumerate(self.axts): + axt.set_ylabel(self.broad_props["yt_lab"][i]) + + def update_axes_list_props(self): + """_summary_""" + for lprop in [ + "x_lim", + "y_lim", + "yt_lim", + "x_ticks", + "y_ticks", + "yt_ticks", + "x_ticklabels", + "y_ticklabels", + "yt_ticklabels", + ]: + self.broad_props[lprop] = self._broadcast_list_prop( + self.kwargs[lprop], lprop + ) + + # Update each axis with the respective properties + for i, ax in enumerate(self.axs): + if self.broad_props["x_lim"][i] is not None: + ax.set_xlim(MyFigure._adjust_lims(self.broad_props["x_lim"][i])) + if self.broad_props["y_lim"][i] is not None: + ax.set_ylim(MyFigure._adjust_lims(self.broad_props["y_lim"][i])) + if self.broad_props["x_ticks"][i] is not None: + ax.set_xticks(self.broad_props["x_ticks"][i]) + if self.broad_props["y_ticks"][i] is not None: + ax.set_yticks(self.broad_props["y_ticks"][i]) + if self.broad_props["x_ticklabels"][i] is not None: + ax.set_xticklabels(self.broad_props["x_ticklabels"][i]) + if self.broad_props["y_ticklabels"][i] is not None: + ax.set_yticklabels(self.broad_props["y_ticklabels"][i]) + + if self.kwargs["twinx"]: + for i, axt in enumerate(self.axts): + if self.broad_props["yt_lim"][i] is not None: + axt.set_ylim(MyFigure._adjust_lims(self.broad_props["yt_lim"][i])) + if self.broad_props["yt_ticks"][i] is not None: + axt.set_yticks(self.broad_props["yt_ticks"][i]) + if self.broad_props["yt_ticklabels"][i] is not None: + axt.set_yticklabels(self.broad_props["yt_ticklabels"][i]) + + def _broadcast_value_prop( + self, prop: list | str | float | int | bool, prop_name: str + ) -> list: + """_summary_ + + :param prop: _description_ + :type prop: list | str | float | int | bool + :param prop_name: The name of the property for error messages. 
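The two broadcast helpers are what allow a single scalar kwarg to apply to every subplot while a list kwarg is matched one-to-one against the axes. A minimal, self-contained illustration of that behaviour (not the class methods themselves):

def broadcast_scalar_or_list(prop, n_axs: int) -> list:
    # None -> one None per axis; scalar -> repeated; list/tuple -> must match the axis count
    if prop is None:
        return [None] * n_axs
    if isinstance(prop, (list, tuple)):
        if len(prop) != n_axs:
            raise ValueError("property length does not match the number of axes")
        return list(prop)
    return [prop] * n_axs

# broadcast_scalar_or_list("signal / a.u.", 3) -> ["signal / a.u."] * 3
# broadcast_scalar_or_list(["a", "b", "c"], 3)  -> ["a", "b", "c"]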
+ :type prop_name: str + :raises ValueError: _description_ + :return: _description_ + :rtype: list + """ + if prop is None: + return [None] * self.n_axs + if isinstance(prop, (list, tuple)): + if len(prop) == self.n_axs: + return prop + else: + raise ValueError( + f"The size of the property '{prop_name}' does not match the number of axes." + ) + if isinstance(prop, (str, float, int, bool)): + return [prop] * self.n_axs + + def _broadcast_list_prop(self, prop: list | None, prop_name: str): + """_summary_ + + :param prop: _description_ + :type prop: list | None + :param prop_name: The name of the property for error messages. + :type prop_name: str + :raises ValueError: _description_ + :return: _description_ + :rtype: _type_ + """ + if prop is None: + return [None] * self.n_axs + + if ( + all(isinstance(item, (list, tuple)) for item in prop) + and len(prop) == self.n_axs + ): + return prop + elif isinstance(prop, (list, tuple)) and all( + isinstance(item, (int, float, str)) for item in prop + ): + return [prop] * self.n_axs + else: + raise ValueError( + f"The structure of '{prop_name = }' does not match expected pair-wise input." + ) + + +def plot_ave_std( + proj: Project, + filename: str = "plot", + files_or_samples: Literal["files", "samples"] = "samples", + param: str = "conc_vial_mg_L", + aggr: bool = False, + show_total_in_twinx: bool = False, + annotate_outliers: bool = True, + min_y_thresh: float | None = None, + only_samples_to_plot: list[str] | None = None, + rename_samples: list[str] | None = None, + reorder_samples: list[str] | None = None, + item_to_color_to_hatch: pd.DataFrame | None = None, + yt_sum_label: str = "total\n(right axis)", + y_lim: tuple[float] | None = None, + y_lab: str | None = None, + yt_lab: str | None = None, + color_palette: str = "deep", + x_label_rotation: int = 0, + legend_location: Literal["best", "outside"] = "best", + legend_columns: int = 1, + legend_x_anchor: float = 1, + legend_y_anchor: float = 1.02, + legend_labelspacing: float = 0.5, + **kwargs, +) -> MyFigure: + """ + Generates a bar plot displaying average values with optional standard deviation + bars for a specified parameter from either files or samples. This function allows + for detailed customization of the plot, including aggregation by functional groups, + filtering based on minimum thresholds, renaming and reordering samples, and applying + specific color schemes and hatching patterns to items. + Additionally, it supports adjusting plot aesthetics such as size, figure height multiplier, + x-label rotation, and outlier annotation. The plot can include a secondary y-axis + to display the sum of values, with customizable limits, labels, ticks, and sum label. + The legend can be placed inside or outside the plot area, with adjustable location, + columns, anchor points, and label spacing. An optional note can be added to the plot + for additional context. + + Parameters: + + filename (str): Name for the output plot file. Default is 'plot'. + + files_or_samples (str): Specifies whether to plot data from 'files' + or 'samples'. Default is 'samples'. + + param (str): The parameter to plot, such as 'conc_vial_mg_L'. + Default is 'conc_vial_mg_L'. + + aggr (bool): Boolean indicating whether to aggregate data by functional groups. + Default is False, meaning no aggregation. + + min_y_thresh (float, optional): Minimum y-value threshold for including data in the plot. + Default is None, including all data. + + only_samples_to_plot (list, optional): List of samples to include in the plot. 
+ Default is None, including all samples. + + rename_samples (dict, optional): Dictionary to rename samples in the plot. + Default is None, using original names. + + reorder_samples (list, optional): List specifying the order of samples in the plot. + Default is None, using original order. + + item_to_color_to_hatch (DataFrame, optional): DataFrame mapping items to specific colors and hatching patterns. + Default is None, using default colors and no hatching. + + paper_col (float): Background color of the plot area. Default is .8, a light grey. + + fig_hgt_mlt (float): Multiplier for the figure height to adjust plot size. Default is 1.5. + + x_label_rotation (int): Rotation angle for x-axis labels. Default is 0, meaning no rotation. + + annotate_outliers (bool): Boolean indicating whether to annotate outliers exceeding y_lim. + Default is True. + + color_palette (str): Color palette for the plot. Default is 'deep'. + + y_lab (str, optional): Label for the y-axis. Default is None, using parameter name as label. + + y_lim (tuple[float, float], optional): Limits for the y-axis. Default is None, automatically determined. + + y_ticks (list[float], optional): Custom tick marks for the y-axis. Default is None, automatically determined. + + yt_sum (bool): Boolean indicating whether to display a sum on a secondary y-axis. Default is False. + + yt_lim (tuple[float, float], optional): Limits for the secondary y-axis. Default is None, automatically determined. + + yt_lab (str, optional): Label for the secondary y-axis. Default is None, using parameter name as label. + + yt_ticks (list[float], optional): Custom tick marks for the secondary y-axis. Default is None, automatically determined. + + yt_sum_label (str): Label for the sum on the secondary y-axis. Default is 'total (right axis)'. + + legend_location (str): Location of the legend within or outside the plot area. Default is 'best'. + + legend_columns (int): Number of columns in the legend. Default is 1. + + legend_x_anchor (float): X-anchor for the legend when placed outside the plot area. Default is 1. + + legend_y_anchor (float): Y-anchor for the legend when placed outside the plot area. Default is 1.02. + + legend_labelspacing (float): Spacing between labels in the legend. Default is 0.5. + + annotate_lttrs (bool): Boolean indicating whether to annotate letters for statistical significance. Default is False. + + note_plt (str, optional): Optional note to add to the plot for additional context. Default is None. 
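After the refactor, plot_ave_std is a module-level function that receives the Project instance as its first argument rather than being a method on it. A hedged usage sketch; the parameter values are illustrative and assume the sample reports for the chosen parameter already exist:

from gcms_data_analysis import Project
from gcms_data_analysis.plotting import plot_ave_std

proj = Project()  # assumes Project.set_folder_path(...) was called beforehand
myfig = plot_ave_std(
    proj,
    filename="ave_std_conc",
    files_or_samples="samples",
    param="conc_vial_mg_L",
    aggr=False,                # plot individual compounds, not functional-group aggregates
    min_y_thresh=10.0,         # drop compounds that never exceed 10 mg/L (illustrative)
    x_label_rotation=30,
    legend_location="outside",
)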
+ + """ + + # create folder where Plots are stored + out_path = plib.Path(Project.out_path, "plots") + out_path.mkdir(parents=True, exist_ok=True) + if not aggr: # then use compounds reports + if files_or_samples == "files": + df_ave = proj.files_reports[param].T + df_std = pd.DataFrame() + elif files_or_samples == "samples": + df_ave = proj.samples_reports[param].T + df_std = proj.samples_reports_std[param].T + else: # use aggregated reports + if files_or_samples == "files": + df_ave = proj.files_aggrreps[param].T + df_std = pd.DataFrame() + elif files_or_samples == "samples": + df_ave = proj.samples_aggrreps[param].T + df_std = proj.samples_aggrreps_std[param].T + + if only_samples_to_plot is not None: + df_ave = df_ave.loc[only_samples_to_plot, :].copy() + if files_or_samples == "samples": + df_std = df_std.loc[only_samples_to_plot, :].copy() + + if rename_samples is not None: + df_ave.index = rename_samples + if files_or_samples == "samples": + df_std.index = rename_samples + + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if files_or_samples == "samples": + df_std = df_std.reindex(filtered_reorder_samples) + + if min_y_thresh is not None: + df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() + if files_or_samples == "samples": + df_std = df_std.loc[:, df_ave.columns].copy() + + if item_to_color_to_hatch is not None: # specific color and hatches to each fg + colors = [item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns] + hatches = [item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns] + else: # no specific colors and hatches specified + colors = sns.color_palette(color_palette, df_ave.shape[1]) + hatches = htchs + + if show_total_in_twinx: + plot_twinx: bool = True + else: + plot_twinx: bool = False + + if y_lab is None: + y_lab = Project.param_to_axis_label[param] + if show_total_in_twinx: + legend_x_anchor += 0.14 + yt_lab = y_lab + + myfig = MyFigure( + rows=1, + cols=1, + twinx=plot_twinx, + text_font=Project.plot_font, + y_lab=y_lab, + yt_lab=yt_lab, + y_lim=y_lim, + legend=False, + grid=Project.plot_grid, + **kwargs, + ) + if df_std.isna().all().all() or df_std.empty: # means that no std is provided + df_ave.plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + capsize=3, + color=colors, + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0]) + else: # no legend is represented but non-significant values are shaded + mask = (df_ave.abs() > df_std.abs()) | df_std.isna() + + df_ave[mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + yerr=df_std[mask], + capsize=3, + color=colors, + label="_nolegend", + ) + df_ave[~mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + legend=False, + edgecolor="grey", + color=colors, + alpha=0.5, + label="_nolegend", + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0] / 2) + if show_total_in_twinx: + myfig.axts[0].scatter( + df_ave.index, + df_ave.sum(axis=1).values, + color="k", + linestyle="None", + edgecolor="k", + facecolor="grey", + s=100, + label=yt_sum_label, + alpha=0.5, + ) + if not df_std.empty: + myfig.axts[0].errorbar( + df_ave.index, + df_ave.sum(axis=1).values, + 
df_std.sum(axis=1).values, + capsize=3, + linestyle="None", + color="grey", + ecolor="k", + ) + bar_hatches = [] + # get a list with the hatches + for h in hatches[:n_different_hatches] + hatches[:n_different_hatches]: + for n in range(df_ave.shape[0]): # htcs repeated for samples + bar_hatches.append(h) # append based on samples number + for bar, hatch in zip(bars, bar_hatches): # assign hatches to each bar + bar.set_hatch(hatch) + myfig.axs[0].set(xlabel=None) + if x_label_rotation != 0: + myfig.axs[0].set_xticklabels( + df_ave.index, rotation=x_label_rotation, ha="right", rotation_mode="anchor" + ) + if legend_location is not None: + hnd_ax, lab_ax = myfig.axs[0].get_legend_handles_labels() + if not df_std.empty: + hnd_ax = hnd_ax[: len(hnd_ax) // 2] + lab_ax = lab_ax[: len(lab_ax) // 2] + if legend_labelspacing > 0.5: # large legend spacing for molecules + myfig.axs[0].plot(np.nan, np.nan, "-", color="None", label=" ") + hhhh, aaaa = myfig.axs[0].get_legend_handles_labels() + hnd_ax.append(hhhh[0]) + lab_ax.append(aaaa[0]) + if show_total_in_twinx: + hnd_axt, lab_axt = myfig.axt[0].get_legend_handles_labels() + else: + hnd_axt, lab_axt = [], [] + if legend_location == "outside": # legend goes outside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc="upper left", + ncol=legend_columns, + bbox_to_anchor=(legend_x_anchor, legend_y_anchor), + labelspacing=legend_labelspacing, + ) + else: # legend is inside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc=legend_location, + ncol=legend_columns, + labelspacing=legend_labelspacing, + ) + # annotate ave+-std at the top of outliers bar (exceeding y_lim) + if annotate_outliers and (y_lim is not None): # and (not df_std.empty): + _annotate_outliers_in_plot(myfig.axs[0], df_ave, df_std, y_lim) + myfig.save_figure(filename, out_path) + return myfig + + +def plot_df_ave_std( + proj: Project, + df_ave: pd.DataFrame, + df_std: pd.DataFrame = pd.DataFrame(), + filename: str = "plot", + show_total_in_twinx: bool = False, + annotate_outliers: bool = True, + min_y_thresh: float | None = None, + only_samples_to_plot: list[str] | None = None, + rename_samples: list[str] | None = None, + reorder_samples: list[str] | None = None, + item_to_color_to_hatch: pd.DataFrame | None = None, + yt_sum_label: str = "total\n(right axis)", + y_lim: tuple[float] | None = None, + y_lab: str | None = None, + yt_lab: str | None = None, + color_palette: str = "deep", + x_label_rotation: int = 0, + legend_location: Literal["best", "outside"] = "best", + legend_columns: int = 1, + legend_x_anchor: float = 1, + legend_y_anchor: float = 1.02, + legend_labelspacing: float = 0.5, + **kwargs, +) -> MyFigure: + """ + Generates a bar plot displaying average values with optional standard deviation + bars for a specified parameter from either files or samples. This function allows + for detailed customization of the plot, including aggregation by functional groups, + filtering based on minimum thresholds, renaming and reordering samples, and applying + specific color schemes and hatching patterns to items. + Additionally, it supports adjusting plot aesthetics such as size, figure height multiplier, + x-label rotation, and outlier annotation. The plot can include a secondary y-axis + to display the sum of values, with customizable limits, labels, ticks, and sum label. + The legend can be placed inside or outside the plot area, with adjustable location, + columns, anchor points, and label spacing. 
An optional note can be added to the plot + for additional context. + + Parameters: + + filename (str): Name for the output plot file. Default is 'plot'. + + files_or_samples (str): Specifies whether to plot data from 'files' + or 'samples'. Default is 'samples'. + + param (str): The parameter to plot, such as 'conc_vial_mg_L'. + Default is 'conc_vial_mg_L'. + + aggr (bool): Boolean indicating whether to aggregate data by functional groups. + Default is False, meaning no aggregation. + + min_y_thresh (float, optional): Minimum y-value threshold for including data in the plot. + Default is None, including all data. + + only_samples_to_plot (list, optional): List of samples to include in the plot. + Default is None, including all samples. + + rename_samples (dict, optional): Dictionary to rename samples in the plot. + Default is None, using original names. + + reorder_samples (list, optional): List specifying the order of samples in the plot. + Default is None, using original order. + + item_to_color_to_hatch (DataFrame, optional): DataFrame mapping items to specific colors and hatching patterns. + Default is None, using default colors and no hatching. + + paper_col (float): Background color of the plot area. Default is .8, a light grey. + + fig_hgt_mlt (float): Multiplier for the figure height to adjust plot size. Default is 1.5. + + x_label_rotation (int): Rotation angle for x-axis labels. Default is 0, meaning no rotation. + + annotate_outliers (bool): Boolean indicating whether to annotate outliers exceeding y_lim. + Default is True. + + color_palette (str): Color palette for the plot. Default is 'deep'. + + y_lab (str, optional): Label for the y-axis. Default is None, using parameter name as label. + + y_lim (tuple[float, float], optional): Limits for the y-axis. Default is None, automatically determined. + + y_ticks (list[float], optional): Custom tick marks for the y-axis. Default is None, automatically determined. + + yt_sum (bool): Boolean indicating whether to display a sum on a secondary y-axis. Default is False. + + yt_lim (tuple[float, float], optional): Limits for the secondary y-axis. Default is None, automatically determined. + + yt_lab (str, optional): Label for the secondary y-axis. Default is None, using parameter name as label. + + yt_ticks (list[float], optional): Custom tick marks for the secondary y-axis. Default is None, automatically determined. + + yt_sum_label (str): Label for the sum on the secondary y-axis. Default is 'total (right axis)'. + + legend_location (str): Location of the legend within or outside the plot area. Default is 'best'. + + legend_columns (int): Number of columns in the legend. Default is 1. + + legend_x_anchor (float): X-anchor for the legend when placed outside the plot area. Default is 1. + + legend_y_anchor (float): Y-anchor for the legend when placed outside the plot area. Default is 1.02. + + legend_labelspacing (float): Spacing between labels in the legend. Default is 0.5. + + annotate_lttrs (bool): Boolean indicating whether to annotate letters for statistical significance. Default is False. + + note_plt (str, optional): Optional note to add to the plot for additional context. Default is None. 
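Unlike plot_ave_std, plot_df_ave_std takes the averaged and standard-deviation DataFrames directly, so data post-processed outside the Project reports can be plotted with the same styling. A short sketch with made-up values (proj is assumed to be an existing Project instance):

import pandas as pd
from gcms_data_analysis.plotting import plot_df_ave_std

df_ave = pd.DataFrame({"phenol": [1.0, 2.0], "naphthalene": [3.0, 4.0]}, index=["S", "T"])
df_std = pd.DataFrame({"phenol": [0.1, 0.2], "naphthalene": [0.3, 0.4]}, index=["S", "T"])
myfig = plot_df_ave_std(
    proj,                      # existing Project instance (assumed)
    df_ave,
    df_std,
    filename="custom_df_plot",
    y_lab="conc. / mg/L",
    x_label_rotation=30,
)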
+ + """ + + # create folder where Plots are stored + out_path = plib.Path(Project.out_path, "df_plots") + out_path.mkdir(parents=True, exist_ok=True) + if only_samples_to_plot is not None: + df_ave = df_ave.loc[only_samples_to_plot, :].copy() + if not df_std.empty: + df_std = df_std.loc[only_samples_to_plot, :].copy() + + if rename_samples is not None: + df_ave.index = rename_samples + if not df_std.empty: + df_std.index = rename_samples + + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if not df_std.empty: + df_std = df_std.reindex(filtered_reorder_samples) + if reorder_samples is not None: + filtered_reorder_samples = [ + idx for idx in reorder_samples if idx in df_ave.index + ] + df_ave = df_ave.reindex(filtered_reorder_samples) + if not df_std.empty: + df_std = df_std.reindex(filtered_reorder_samples) + + if min_y_thresh is not None: + df_ave = df_ave.loc[:, (df_ave > min_y_thresh).any(axis=0)].copy() + if not df_std.empty: + df_std = df_std.loc[:, df_ave.columns].copy() + + if item_to_color_to_hatch is not None: # specific color and hatches to each fg + colors = [item_to_color_to_hatch.loc[item, "clr"] for item in df_ave.columns] + hatches = [item_to_color_to_hatch.loc[item, "htch"] for item in df_ave.columns] + else: # no specific colors and hatches specified + colors = sns.color_palette(color_palette, df_ave.shape[1]) + hatches = htchs + + if show_total_in_twinx: + plot_twinx: bool = True + else: + plot_twinx: bool = False + + if show_total_in_twinx: + legend_x_anchor += 0.14 + yt_lab = y_lab + + myfig = MyFigure( + rows=1, + cols=1, + twinx=plot_twinx, + text_font=Project.plot_font, + y_lab=y_lab, + yt_lab=yt_lab, + y_lim=y_lim, + legend=False, + grid=Project.plot_grid, + **kwargs, + ) + if df_std.isna().all().all() or df_std.empty: # means that no std is provided + df_ave.plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + capsize=3, + color=colors, + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0]) + else: # no legend is represented but non-significant values are shaded + mask = (df_ave.abs() > df_std.abs()) | df_std.isna() + + df_ave[mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + edgecolor="k", + legend=False, + yerr=df_std[mask], + capsize=3, + color=colors, + label="_nolegend", + ) + df_ave[~mask].plot( + ax=myfig.axs[0], + kind="bar", + rot=x_label_rotation, + width=0.9, + legend=False, + edgecolor="grey", + color=colors, + alpha=0.5, + label="_nolegend", + ) + bars = myfig.axs[0].patches # needed to add patches to the bars + n_different_hatches = int(len(bars) / df_ave.shape[0] / 2) + if show_total_in_twinx: + myfig.axts[0].scatter( + df_ave.index, + df_ave.sum(axis=1).values, + color="k", + linestyle="None", + edgecolor="k", + facecolor="grey", + s=100, + label=yt_sum_label, + alpha=0.5, + ) + if not df_std.empty: + myfig.axts[0].errorbar( + df_ave.index, + df_ave.sum(axis=1).values, + df_std.sum(axis=1).values, + capsize=3, + linestyle="None", + color="grey", + ecolor="k", + ) + bar_hatches = [] + # get a list with the hatches + for h in hatches[:n_different_hatches] + hatches[:n_different_hatches]: + for n in range(df_ave.shape[0]): # htcs repeated for samples + bar_hatches.append(h) # append based on samples number + for bar, hatch in zip(bars, bar_hatches): # assign 
hatches to each bar + bar.set_hatch(hatch) + myfig.axs[0].set(xlabel=None) + if x_label_rotation != 0: + myfig.axs[0].set_xticklabels( + df_ave.index, rotation=x_label_rotation, ha="right", rotation_mode="anchor" + ) + if legend_location is not None: + hnd_ax, lab_ax = myfig.axs[0].get_legend_handles_labels() + if not df_std.empty: + hnd_ax = hnd_ax[: len(hnd_ax) // 2] + lab_ax = lab_ax[: len(lab_ax) // 2] + if legend_labelspacing > 0.5: # large legend spacing for molecules + myfig.axs[0].plot(np.nan, np.nan, "-", color="None", label=" ") + hhhh, aaaa = myfig.axs[0].get_legend_handles_labels() + hnd_ax.append(hhhh[0]) + lab_ax.append(aaaa[0]) + if show_total_in_twinx: + hnd_axt, lab_axt = myfig.axt[0].get_legend_handles_labels() + else: + hnd_axt, lab_axt = [], [] + if legend_location == "outside": # legend goes outside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc="upper left", + ncol=legend_columns, + bbox_to_anchor=(legend_x_anchor, legend_y_anchor), + labelspacing=legend_labelspacing, + ) + else: # legend is inside of plot area + myfig.axs[0].legend( + hnd_ax + hnd_axt, + lab_ax + lab_axt, + loc=legend_location, + ncol=legend_columns, + labelspacing=legend_labelspacing, + ) + # annotate ave+-std at the top of outliers bar (exceeding y_lim) + if annotate_outliers and (y_lim is not None): # and (not df_std.empty): + _annotate_outliers_in_plot(myfig.axs[0], df_ave, df_std, y_lim) + myfig.save_figure(filename, out_path) + return myfig + + +# if __file__ == "__main__": +# f = MyFigure( +# rows=4, +# cols=1, +# width=6, +# height=12, +# twinx=True, +# x_lab=["aaa", "qqq", "aa", "qq"], +# y_lab="bbb", +# yt_lab="ccc", +# x_lim=[0, 1], +# y_lim=[0, 1], +# yt_lim=[[0, 1], [0, 0.5], [0, 1], [0, 0.5]], +# x_ticks=[[0, 0.5, 1], [0, 0.5, 2], [0, 1], [0, 0.5]], +# # x_ticklabels=["a", "c", "d"], +# grid=True, +# annotate_lttrs=["a", "b", "a", "b"], +# annotate_lttrs_xy=[-0.11, -0.15], +# ) + +# f.axs[0].plot([0, 1], [0, 3], label="a") +# f.axts[0].plot([0, 2], [0, 4], label="b") +# f.axts[0].plot([0, 2], [0, 5], label="ccc") +# f.axs[1].plot([0, 1], [0, 3], label="aaa") +# ins = f.create_insex(f.axs[0], [0.6, 0.8], [0.4, 0.6], [0, 0.2], [0, 0.2]) +# ins.plot([0, 1], [0, 3], label="a") +# f.save_figure( +# filename="my_plot", out_path=plib.Path(r"C:\Users\mp933\Desktop\New folder") +# ) diff --git a/tests/test_gcms_data_analysis.py b/tests/AAAA_test_gcms_data_analysis.py similarity index 100% rename from tests/test_gcms_data_analysis.py rename to tests/AAAA_test_gcms_data_analysis.py diff --git a/tests/conftest.py b/tests/conftest.py index b9bcce0..62c24e6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,49 +2,97 @@ import pandas as pd import numpy as np import pytest -import rdkit from gcms_data_analysis.main import Project +test_dir: plib.Path = plib.Path(__file__).parent + + +# testing name_to_properties +name_to_properties_dir = test_dir / "data_name_to_properties" + @pytest.fixture -def gcms() -> Project: +def dicts_classifications_codes_fractions(): + ccf = pd.read_excel( + plib.Path( + name_to_properties_dir, + "classifications_codes_fractions.xlsx", + ) + ) + dict_class_to_code: dict[str, str] = dict( + zip( + ccf.classes.tolist(), + ccf.codes.tolist(), + ) + ) + dict_class_to_mass_fraction: dict[str, float] = dict( + zip( + ccf.classes.tolist(), + ccf.mfs.tolist(), + ) + ) + return dict_class_to_code, dict_class_to_mass_fraction + - folder_path: plib.Path = plib.Path( - plib.Path(__file__).parent.parent, "tests/data_for_testing/" 
+@pytest.fixture +def checked_n2p_compounds_properties(): + properties = pd.read_excel( + plib.Path( + name_to_properties_dir, + "checked_compounds_properties.xlsx", + ), + index_col="comp_name", ) - Project.set_folder_path(folder_path) + return properties + + +# test minimal_case +minimal_case_dir = test_dir / "data_minimal_case" + + +@pytest.fixture +def gcms() -> Project: + Project.set_folder_path(minimal_case_dir) + Project.set_auto_save_to_excel(False) return Project() -# fmt: off @pytest.fixture def checked_files_info(): files_info = pd.DataFrame( - index=pd.Index(['A_1', 'A_2', 'Ader_1', 'Ader_2', 'B_1', 'B_2'], name='filename'), + index=pd.Index(["S_1", "S_2", "T_1", "T_2", "T_3"], name="filename"), data={ - 'samplename': ['A', 'A', 'Ader', 'Ader', 'B', 'B'], - 'derivatized': [False, False, True, True, False, False], - 'dilution_factor': [25, 25, 125, 125, 1, 1], - 'total_sample_conc_in_vial_mg_L': [560.0000000000001, 560.0000000000001, 112.0, 112.0, 2800.0, 2800.0], - 'sample_yield_on_feedstock_basis_fr': [0.45, 0.46, 0.47, 0.48, 0.49, 0.5], - 'calibration_file': ['calibration', 'calibration', 'deriv_calibration', 'deriv_calibration', 'calibration', 'calibration'], - } + "samplename": ["S", "S", "T", "T", "T"], + "replicate_number": [1, 2, 1, 2, 3], + "derivatized": [False, False, False, False, False], + "calibration_file": [ + "cal_minimal", + "cal_minimal", + "cal_minimal", + "cal_minimal", + "cal_minimal", + ], + "dilution_factor": [1, 1, 1, 1, 1], + "total_sample_conc_in_vial_mg_L": [1, 1, 1, 1, 1], + "sample_yield_on_feedstock_basis_fr": [1, 1, 1, 1, 1], + }, ) return files_info + @pytest.fixture def checked_created_files_info(): created_files_info = pd.DataFrame( - index=pd.Index(['A_1', 'A_2', 'Ader_1', 'Ader_2', 'B_1', 'B_2'], name='filename'), + index=pd.Index(["S_1", "S_2", "T_1", "T_2", "T_3"], name="filename"), data={ - 'samplename': ['A', 'A', 'Ader', 'Ader', 'B', 'B'], - 'replicate_number': ['1', '2', '1', '2', '1', '2'], - 'derivatized': [False, False, False, False, False, False], - 'calibration_file': [False, False, False, False, False, False], - 'dilution_factor': [1, 1, 1, 1, 1, 1], - 'total_sample_conc_in_vial_mg_L': [1, 1, 1, 1, 1, 1], - 'sample_yield_on_feedstock_basis_fr': [1, 1, 1, 1, 1, 1], - } + "samplename": ["S", "S", "T", "T", "T"], + "replicate_number": ["1", "2", "1", "2", "3"], + "derivatized": [False, False, False, False, False], + "calibration_file": [False, False, False, False, False], + "dilution_factor": [1, 1, 1, 1, 1], + "total_sample_conc_in_vial_mg_L": [1, 1, 1, 1, 1], + "sample_yield_on_feedstock_basis_fr": [1, 1, 1, 1, 1], + }, ) return created_files_info @@ -52,79 +100,81 @@ def checked_created_files_info(): @pytest.fixture def checked_files(): files = { - 'A_1': pd.DataFrame( - index=pd.Index(['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_1'), - data={ - 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], - 'retention_time': [6.025, 36.163, 40.052, 40.492, 43.847, 43.986], - 'area': [23386, 44389, 15068, 1878180, 1456119, 6379752], - 'height': [24797, 15019, 5705, 493759, 339605, 1147599], - 'area_if_undiluted': [584650, 1109725, 376700, 46954500, 36402975, 159493800], - }), - 'A_2': pd.DataFrame( - index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_2'), + "S_1": pd.DataFrame( + 
index=pd.Index(["phenol", "naphthalene", "dodecane"], name="S_1"), data={ - 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], - 'retention_time': [6.025, 26.284, 36.158, 40.041, 40.494, 43.847, 43.988], - 'area': [25493, 10952, 50650, 21294, 1656756, 1371069, 6394708], - 'height': [25716, 4259, 14520, 6739, 461942, 324690, 1138647], - 'area_if_undiluted': [637325, 273800, 1266250, 532350, 41418900, 34276725, 159867700], - }), - 'Ader_1': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_1'), + "iupac_name": ["n.a.", "n.a.", "n.a."], + "retention_time": [13.703, 20.942, 21.426], + "area": [20, 200, 2000], + "height": [20, 200, 2000], + "area_if_undiluted": [20, 200, 2000], + }, + ), + "S_2": pd.DataFrame( + index=pd.Index(["phenol", "naphthalene", "dodecane"], name="S_2"), data={ - 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], - 'retention_time': [6.027, 38.123, 41.729, 42.157, 45.253, 45.369], - 'area': [16741, 49508, 27798, 1415205, 519476, 1724814], - 'height': [13451, 18415, 9132, 484890, 180850, 501749], - 'area_if_undiluted': [2092625, 6188500, 3474750, 176900625, 64934500, 215601750], - }), - 'Ader_2': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_2'), + "iupac_name": ["n.a.", "n.a.", "n.a."], + "retention_time": [13.703, 20.942, 21.426], + "area": [40, 400, 4000], + "height": [40, 400, 4000], + "area_if_undiluted": [40, 400, 4000], + }, + ), + "T_1": pd.DataFrame( + index=pd.Index(["phenol", "naphthalene", "dodecane"], name="T_1"), data={ - 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], - 'retention_time': [6.027, 38.125, 41.744, 42.161, 45.258, 45.37], - 'area': [14698, 53613, 25213, 1402990, 605137, 1956560], - 'height': [12802, 18373, 8775, 496504, 202599, 594688], - 'area_if_undiluted': [1837250, 6701625, 3151625, 175373750, 75642125, 244570000], - }), - 'B_1': pd.DataFrame( - index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol'], name='B_1'), + "iupac_name": ["n.a.", "n.a.", "n.a."], + "retention_time": [13.703, 20.942, 21.426], + "area": [20, 50, 500], + "height": [20, 50, 500], + "area_if_undiluted": [20, 50, 500], + }, + ), + "T_2": pd.DataFrame( + index=pd.Index(["phenol", "naphthalene", "dodecane"], name="T_2"), data={ - 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], - 'retention_time': [8.527, 10.507, 11.071, 11.486, 12.214, 13.687], - 'area': [147566, 69223, 40376, 441077, 19522, 200947], - 'height': [39393, 18515, 12132, 112797, 7194, 64421], - 'area_if_undiluted': [147566, 69223, 40376, 441077, 19522, 200947], - }), - 'B_2': pd.DataFrame( - index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B_2'), + "iupac_name": ["n.a.", "n.a.", "n.a."], + "retention_time": [13.703, 20.942, 21.426], + "area": [10, 100, 1000], + "height": [10, 1000, 1000], + "area_if_undiluted": [10, 100, 1000], + }, + ), + "T_3": pd.DataFrame( + index=pd.Index(["phenol", "naphthalene", "dodecane"], name="T_3"), data={ - 'iupac_name': ['n.a.', 
'n.a.', 'n.a.', 'n.a.', 'n.a.'], - 'retention_time': [8.502, 10.474, 11.027, 11.456, 13.661], - 'area': [181021, 64531, 35791, 472362, 228750], - 'height': [44551, 19823, 12737, 120142, 75153], - 'area_if_undiluted': [181021, 64531, 35791, 472362, 228750], - }) + "iupac_name": ["n.a.", "n.a.", "n.a."], + "retention_time": [13.703, 20.942, 21.426], + "area": [0, 150, 1500], + "height": [0, 150, 1500], + "area_if_undiluted": [0, 150, 1500], + }, + ), } return files + @pytest.fixture def checked_is_files_deriv(): is_files_deriv = { - 'A_1': False, 'A_2': False, 'Ader_1': True, - 'Ader_2': True, 'B_1': False, 'B_2': False + "S_1": False, + "S_2": False, + "T_1": False, + "T_2": False, + "T_3": False, } return is_files_deriv + +# fmt: off @pytest.fixture def checked_load_class_code_fractions(): class_code_fractions = pd.DataFrame( - index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79], + index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80], data={ - 'classes': ['ester', 'ester_1', 'ester_2', 'ester_3', 'ester_4', 'ester_5', 'ester_6', 'carboxyl', 'ketone', 'ketone_1', 'ketone_2', 'ketone_3', 'ketone_4', 'ketone_5', 'ketone_6', 'ketone_7', 'ketone_8', 'ketone_9', 'ketone_10', 'ketone_11', 'ketone_12', 'ketone_13', 'ketone_14', 'ketone_15', 'ketone_16', 'ketone_17', 'ketone_18', 'ketone_19', 'ketone_20', 'ketone_21', 'ketone_22', 'ketone_23', 'ketone_24', 'ketone_25', 'ketone_26', 'ketone_27', 'aldehyde', 'ether', 'ether_1', 'ether_2', 'ether_3', 'ether_4', 'ether_5', 'ether_6', 'ether_7', 'ether_8', 'ether_9', 'ether_10', 'ether_11', 'ether_12', 'ether_13', 'ether_14', 'ether_15', 'ether_16', 'ether_17', 'ether_18', 'ether_19', 'ether_20', 'ether_21', 'ether_22', 'ether_23', 'ether_24', 'ether_25', 'ether_26', 'ether_27', 'alcohol', 'C-aliph', 'C-aliph_1', 'C-aliph_2', 'C-aliph_3', 'C-arom', 'C-arom_1', 'C-arom_2', 'N-aliph', 'N-aliph_1', 'N-aliph_3', 'N-arom', 'N-arom_2', 'O-arom', 'O-aliph'], - 'codes': ['[CH0](=O)O[CH3]', '[CH0](=O)O[CH2]', '[CH0](=O)O[CH1]', '[CH0](=O)O[C]', '[CH0](=O)O[cH2]', '[CH0](=O)O[cH1]', '[CH0](=O)O[c]', '[CH0](=O)O', '[CH3]C(=O)[CH3]', '[CH3]C(=O)[CH2]', '[CH3]C(=O)[CH]', '[CH3]C(=O)[C]', '[CH3]C(=O)[cH2]', '[CH3]C(=O)[cH]', '[CH3]C(=O)[c]', '[CH2]C(=O)[CH2]', '[CH2]C(=O)[CH]', '[CH2]C(=O)[C]', '[CH2]C(=O)[cH2]', '[CH2]C(=O)[cH]', '[CH2]C(=O)[c]', '[CH]C(=O)[CH]', '[CH]C(=O)[C]', '[CH]C(=O)[cH2]', '[CH]C(=O)[cH]', '[CH]C(=O)[c]', '[C]C(=O)[C]', '[C]C(=O)[cH2]', '[C]C(=O)[cH]', '[C]C(=O)[c]', '[cH2]C(=O)[cH2]', '[cH2]C(=O)[cH]', '[cH2]C(=O)[c]', '[cH]C(=O)[cH]', '[cH]C(=O)[c]', '[c]C(=O)[c]', '[CH]=O', '[CH3]O[CH3]', '[CH3]O[CH2]', '[CH3]O[CH]', '[CH3]O[C]', '[CH3]O[cH2]', '[CH3]O[cH]', '[CH3]O[c]', '[CH2]O[CH2]', '[CH2]O[CH]', '[CH2]O[C]', '[CH2]O[cH2]', '[CH2]O[cH]', '[CH2]O[c]', '[CH]O[CH]', '[CH]O[C]', '[CH]O[cH2]', '[CH]O[cH]', '[CH]O[c]', '[C]O[C]', '[C]O[cH2]', '[C]O[cH]', '[C]O[c]', '[cH2]O[cH2]', '[cH2]O[cH]', '[cH2]O[c]', '[cH]O[cH]', '[cH]O[c]', '[c]O[c]', '[OH1]', '[CH3]', '[CH2]', '[CH1]', '[C]', '[cH2]', '[cH1]', '[c]', '[NH2]', 
'[NH1]', '[NH0]', '[nH1]', '[n]', '[o]', '[O]'], - 'mfs': [59.044, 58.035999999999994, 57.028, 56.019999999999996, 58.035999999999994, 57.028, 56.019999999999996, 45.017, 58.080000000000005, 57.072, 56.06400000000001, 55.056000000000004, 57.072, 56.06400000000001, 55.056000000000004, 56.06400000000001, 55.056000000000004, 57.072, 56.06400000000001, 55.056000000000004, 54.048, 54.048, 53.040000000000006, 55.056000000000004, 54.048, 53.040000000000006, 52.032000000000004, 54.048, 53.040000000000006, 52.032000000000004, 56.06400000000001, 55.056000000000004, 54.048, 54.048, 53.040000000000006, 52.032000000000004, 29.017999999999997, 46.069, 45.061, 44.053, 43.045, 45.061, 44.053, 43.045, 44.053, 43.045, 45.061, 44.053, 43.045, 42.037, 42.037, 41.029, 43.045, 42.037, 41.029, 40.021, 42.037, 41.029, 40.021, 44.053, 43.045, 42.037, 42.037, 41.029, 40.021, 17.007, 15.035, 14.027, 13.018999999999998, 12.011, 14.027, 13.018999999999998, 12.011, 16.023, 15.015, 14.007, 15.015, 14.007, 15.999, 15.999], + 'classes': ['ester', 'ester_1', 'ester_2', 'ester_3', 'ester_4', 'ester_5', 'ester_6', 'carboxyl', 'ketone', 'ketone_1', 'ketone_2', 'ketone_3', 'ketone_4', 'ketone_5', 'ketone_6', 'ketone_7', 'ketone_8', 'ketone_9', 'ketone_10', 'ketone_11', 'ketone_12', 'ketone_13', 'ketone_14', 'ketone_15', 'ketone_16', 'ketone_17', 'ketone_18', 'ketone_19', 'ketone_20', 'ketone_21', 'ketone_22', 'ketone_23', 'ketone_24', 'ketone_25', 'ketone_26', 'ketone_27', 'aldehyde', 'ether', 'ether_1', 'ether_2', 'ether_3', 'ether_4', 'ether_5', 'ether_6', 'ether_7', 'ether_8', 'ether_9', 'ether_10', 'ether_11', 'ether_12', 'ether_13', 'ether_14', 'ether_15', 'ether_16', 'ether_17', 'ether_18', 'ether_19', 'ether_20', 'ether_21', 'ether_22', 'ether_23', 'ether_24', 'ether_25', 'ether_26', 'ether_27', 'alcohol', 'C-aliph', 'C-aliph_1', 'C-aliph_2', 'C-aliph_3', 'C-arom', 'C-arom_1', 'C-arom_2', 'N-aliph', 'N-aliph_1', 'N-aliph_3', 'N-arom', 'N-arom_2', 'O-arom', 'O-aliph', 'Cl'], + 'codes': ['[CH0](=O)O[CH3]', '[CH0](=O)O[CH2]', '[CH0](=O)O[CH1]', '[CH0](=O)O[C]', '[CH0](=O)O[cH2]', '[CH0](=O)O[cH1]', '[CH0](=O)O[c]', '[CH0](=O)O', '[CH3]C(=O)[CH3]', '[CH3]C(=O)[CH2]', '[CH3]C(=O)[CH]', '[CH3]C(=O)[C]', '[CH3]C(=O)[cH2]', '[CH3]C(=O)[cH]', '[CH3]C(=O)[c]', '[CH2]C(=O)[CH2]', '[CH2]C(=O)[CH]', '[CH2]C(=O)[C]', '[CH2]C(=O)[cH2]', '[CH2]C(=O)[cH]', '[CH2]C(=O)[c]', '[CH]C(=O)[CH]', '[CH]C(=O)[C]', '[CH]C(=O)[cH2]', '[CH]C(=O)[cH]', '[CH]C(=O)[c]', '[C]C(=O)[C]', '[C]C(=O)[cH2]', '[C]C(=O)[cH]', '[C]C(=O)[c]', '[cH2]C(=O)[cH2]', '[cH2]C(=O)[cH]', '[cH2]C(=O)[c]', '[cH]C(=O)[cH]', '[cH]C(=O)[c]', '[c]C(=O)[c]', '[CH]=O', '[CH3]O[CH3]', '[CH3]O[CH2]', '[CH3]O[CH]', '[CH3]O[C]', '[CH3]O[cH2]', '[CH3]O[cH]', '[CH3]O[c]', '[CH2]O[CH2]', '[CH2]O[CH]', '[CH2]O[C]', '[CH2]O[cH2]', '[CH2]O[cH]', '[CH2]O[c]', '[CH]O[CH]', '[CH]O[C]', '[CH]O[cH2]', '[CH]O[cH]', '[CH]O[c]', '[C]O[C]', '[C]O[cH2]', '[C]O[cH]', '[C]O[c]', '[cH2]O[cH2]', '[cH2]O[cH]', '[cH2]O[c]', '[cH]O[cH]', '[cH]O[c]', '[c]O[c]', '[OH1]', '[CH3]', '[CH2]', '[CH1]', '[C]', '[cH2]', '[cH1]', '[c]', '[NH2]', '[NH1]', '[NH0]', '[nH1]', '[n]', '[o]', '[O]', '[Cl]'], + 'mfs': [59.044, 58.035999999999994, 57.028, 56.019999999999996, 58.035999999999994, 57.028, 56.019999999999996, 45.017, 58.080000000000005, 57.072, 56.06400000000001, 55.056000000000004, 57.072, 56.06400000000001, 55.056000000000004, 56.06400000000001, 55.056000000000004, 57.072, 56.06400000000001, 55.056000000000004, 54.048, 54.048, 53.040000000000006, 55.056000000000004, 54.048, 53.040000000000006, 
52.032000000000004, 54.048, 53.040000000000006, 52.032000000000004, 56.06400000000001, 55.056000000000004, 54.048, 54.048, 53.040000000000006, 52.032000000000004, 29.017999999999997, 46.069, 45.061, 44.053, 43.045, 45.061, 44.053, 43.045, 44.053, 43.045, 45.061, 44.053, 43.045, 42.037, 42.037, 41.029, 43.045, 42.037, 41.029, 40.021, 42.037, 41.029, 40.021, 44.053, 43.045, 42.037, 42.037, 41.029, 40.021, 17.007, 15.035, 14.027, 13.018999999999998, 12.011, 14.027, 13.018999999999998, 12.011, 16.023, 15.015, 14.007, 15.015, 14.007, 15.999, 15.999, 35.45], } ) return class_code_fractions @@ -150,604 +200,258 @@ def checked_load_calibrations(): 'Area 5': [np.nan, np.nan, np.nan, 2957268.0, 3164919.0, 741540.0, 5345977.0], 'Area 6': [np.nan, np.nan, np.nan, 11730886.0, 12451729.0, 3975200.0, 19779576.0], } - ), - 'deriv_calibration': pd.DataFrame( - index=pd.Index(['benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '9-octadecenoic acid, (e)-', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], name='comp_name'), - data={ - 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], - 'MW': [122.1213, 256.4241, 280.4455, 282.4614, 94.1112, 116.1152, 110.1106], - 'PPM 1': [np.nan, 5.0, 5.0, 5.0, np.nan, 5.0, 5.0], - 'PPM 2': [np.nan, 10.0, 10.0, 10.0, np.nan, 10.0, 10.0], - 'PPM 3': [np.nan, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0], - 'PPM 4': [np.nan, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0], - 'PPM 5': [30, 30, 30, 30, 25, 25, 25], - 'PPM 6': [50, 50, 50, 50, 30, 30, 30], - 'Area 1': [np.nan, 403058.0, 126644.0, 467088.0, np.nan, 48330.0, 184752.0], - 'Area 2': [np.nan, 570479.0, 183307.0, 741971.0, np.nan, 206224.0, 729379.0], - 'Area 3': [np.nan, 694901.0, 241591.0, 953554.0, 17168.0, 620353.0, 1607583.0], - 'Area 4': [np.nan, 936570.0, 350170.0, 1408563.0, 21329.0, 885337.0, 2232039.0], - 'Area 5': [73458, 1474014, 475205, 2476003, 21557, 1096645, 2972508], - 'Area 6': [113812, 2605959, 824267, 4300414, 71706, 1394486, 3629582], - 'CAS': ['65-85-0', '57-10-3', '60-33-3', '112-79-8', '108-95-2', '123-76-2', '120-80-9'], - } ) } return calibrations -@pytest.fixture -def checked_is_calibrations_deriv(): - is_calibrations_deriv = {'calibration': False, 'deriv_calibration': True} - return is_calibrations_deriv - @pytest.fixture def checked_list_of_all_compounds(): - list_of_all_compounds = ['tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', - '9,12-octadecadienoic acid (z,z)-', 'oleic acid', - 'n-decanoic acid', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', - 'trans-2-pentenoic acid', '2,5-hexanedione', - '1-hexene, 4,5-dimethyl-', 'phenol', - '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'hexadecanoic acid', - '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid' - ] + list_of_all_compounds = ['phenol', 'naphthalene', 'dodecane'] return list_of_all_compounds -@pytest.fixture -def checked_list_of_all_deriv_compounds(): - list_of_all_deriv_compounds = ['myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', - 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', - '9-octadecenoic acid, (z)-, tms derivative', 'benzoic acid, deriv.', - 'hexadecanoic acid, deriv.', '(9z,12z)-octadeca-9,12-dienoic acid, deriv.', - '9-octadecenoic acid, (e)-, deriv.', 'phenol, deriv.', - '4-oxopentanoic acid, deriv.', 'benzene-1,2-diol, deriv.' 
- ] - return list_of_all_deriv_compounds - @pytest.fixture def checked_compounds_properties(): compounds_properties = pd.DataFrame( - index=pd.Index(['tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid', 'n-decanoic acid', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], name='comp_name'), - data={ - 'iupac_name': ['tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'decanoic acid', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', '4,5-dimethylhex-1-ene', 'phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'molecular_formula': ['C14H28O2', 'C16H30O2', 'C16H32O2', 'C18H32O2', 'C18H34O2', 'C10H20O2', 'C4H8O', 'C6H8O', 'C5H8O2', 'C6H10O2', 'C8H16', 'C6H6O', 'C6H8O', 'C6H3Cl3O', 'C16H32O2', 'C18H32O2', 'C18H34O2'], - 'canonical_smiles': ['CCCCCCCCCCCCCC(=O)O', 'C1CCCCCCCC(=O)OCCCCCCC1', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O', 'CCCCCCCCCC(=O)O', 'CCC(=O)C', 'CC1=CCCC1=O', 'CCC=CC(=O)O', 'CC(=O)CCC(=O)C', 'CC(C)C(C)CC=C', 'C1=CC=C(C=C1)O', 'CC1=CCCC1=O', 'C1=C(C(=CC(=C1Cl)Cl)Cl)O', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O'], - 'molecular_weight': [228.37, 254.41, 256.42, 280.4, 282.5, 172.26, 72.11, 96.13, 100.12, 114.14, 112.21, 94.11, 96.13, 197.4, 256.42, 280.4, 282.5], - 'xlogp': [5.3, 6.3, 6.4, 6.8, 6.5, 4.1, 0.3, 0.9, 1.0, -0.3, 3.5, 1.5, 0.9, 3.7, 6.4, 6.8, 6.5], - 'el_C': [14, 16, 16, 18, 18, 10, 4, 6, 5, 6, 8, 6, 6, 6, 16, 18, 18], - 'el_Cl': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0], - 'el_H': [28, 30, 32, 32, 34, 20, 8, 8, 8, 10, 16, 6, 8, 3, 32, 32, 34], - 'el_O': [2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 0, 1, 1, 1, 2, 2, 2], - 'el_mf_C': [0.7363226343214958, 0.7553791124562713, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566, 0.6972599558806455, 0.6662598807377618, 0.7496723187350464, 0.5998302037554933, 0.6313825127036973, 0.8563229658675697, 0.765763468281798, 0.7496723187350464, 0.3650759878419453, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566], - 'el_mf_Cl': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5387537993920973, 0.0, 0.0, 0.0], - 'el_mf_H': [0.12358891272934273, 0.11886325223065132, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203, 0.11703239289446186, 0.11182914990985994, 0.08388640382814938, 0.08054334798242109, 0.08831259856316805, 0.1437305053025577, 0.06426522154925088, 0.08388640382814938, 0.015319148936170212, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203], - 'el_mf_O': [0.1401147261023777, 0.12577335796548877, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815, 0.18575409265064438, 0.22186936624601306, 0.16643087485696453, 0.31959648421893727, 0.28033993341510427, 0.0, 0.17000318775900541, 0.16643087485696453, 0.08104863221884498, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815], - 'fg_C-aliph': [13, 14, 15, 17, 17, 9, 1, 3, 4, 0, 8, 0, 3, 0, 15, 17, 17], - 'fg_C-arom': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 0, 0], - 'fg_Cl': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 3, 0, 0, 0], - 'fg_alcohol': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0], - 'fg_carboxyl': [1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1], - 'fg_ester': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - 'fg_hetero_atoms': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0], - 'fg_ketone': [0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0, 0], - 'fg_mf_C-aliph': [0.8029031834303979, 0.771895758814512, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363, 0.7387147335423198, 0.20850090140063793, 0.4377509622386352, 0.5503395924890131, 0.0, 1.0000534711701274, 0.0, 0.4377509622386352, 0.0, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363], - 'fg_mf_C-arom': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8193178195728402, 0.0, 0.3752887537993921, 0.0, 0.0, 0.0], - 'fg_mf_Cl': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5387537993920973, 0.0, 0.0, 0.0], - 'fg_mf_alcohol': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1807140580172139, 0.0, 0.0861550151975684, 0.0, 0.0, 0.0], - 'fg_mf_carboxyl': [0.19712308972281825, 0.0, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055, 0.2613317078834321, 0.0, 0.0, 0.4496304434678386, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055], - 'fg_mf_ester': [0.0, 0.22811996383789943, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - 'fg_mf_hetero_atoms': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5387537993920973, 0.0, 0.0, 0.0], - 'fg_mf_ketone': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7914574954929968, 0.5936960366170811, 0.0, 1.0000350446819695, 0.0, 0.0, 0.5936960366170811, 0.0, 0.0, 0.0, 0.0], - 'fg_mf_total': [0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168], + index=pd.Index(['phenol', 'naphthalene', 'dodecane'], name='comp_name'), + data={ + 'iupac_name': ['phenol', 'naphthalene', 'dodecane'], + 'molecular_formula': ['C6H6O', 'C10H8', 'C12H26'], + 'canonical_smiles': ['C1=CC=C(C=C1)O', 'C1=CC=C2C=CC=CC2=C1', 'CCCCCCCCCCCC'], + 'molecular_weight': [94.11, 128.17, 170.33], + 'xlogp': [1.5, 3.3, 6.1], + 'el_C': [6, 10, 12], + 'el_H': [6, 8, 26], + 'el_O': [1, 0, 0], + 'el_mf_C': [0.765763468281798, 0.9371147694468284, 0.846192684788352], + 'el_mf_H': [0.06426522154925088, 0.06291643910431459, 0.1538660247754359], + 'el_mf_O': [0.1700031877590054, 0.0, 0.0], + 'fg_C-aliph': [0, 0, 12], + 'fg_C-arom': [6, 10, 0], + 'fg_alcohol': [1, 0, 0], + 'fg_mf_C-aliph': [0.0, 0.0, 1.000058709563788], + 'fg_mf_C-arom': [0.8193178195728402, 1.000031208551143, 0.0], + 'fg_mf_alcohol': [0.1807140580172139, 0.0, 0.0], } ) return compounds_properties -@pytest.fixture -def checked_deriv_compounds_properties(): - deriv_compounds_properties = pd.DataFrame( - index=pd.Index(['myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative', 'benzoic acid, deriv.', 'hexadecanoic acid, deriv.', '(9z,12z)-octadeca-9,12-dienoic acid, deriv.', '9-octadecenoic acid, (e)-, deriv.', 'phenol, deriv.', '4-oxopentanoic acid, deriv.', 'benzene-1,2-diol, deriv.'], name='comp_name'), - data={ - 
'iupac_name': ['tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(e)-octadec-9-enoic acid', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], - 'molecular_formula': ['C14H28O2', 'C16H30O2', 'C16H32O2', 'C18H32O2', 'C18H34O2', 'C7H6O2', 'C16H32O2', 'C18H32O2', 'C18H34O2', 'C6H6O', 'C5H8O3', 'C6H6O2'], - 'canonical_smiles': ['CCCCCCCCCCCCCC(=O)O', 'CCCCCCC=CCCCCCCCC(=O)O', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O', 'C1=CC=C(C=C1)C(=O)O', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O', 'C1=CC=C(C=C1)O', 'CC(=O)CCC(=O)O', 'C1=CC=C(C(=C1)O)O'], - 'molecular_weight': [228.37, 254.41, 256.42, 280.4, 282.5, 122.12, 256.42, 280.4, 282.5, 94.11, 116.11, 110.11], - 'xlogp': [5.3, 6.4, 6.4, 6.8, 6.5, 1.9, 6.4, 6.8, 6.5, 1.5, -0.5, 0.9], - 'underiv_comp_name': ['myristic acid', 'palmitelaidic acid', 'palmitic acid', '9,12-octadecadienoic acid (z,z)-', '9-octadecenoic acid, (z)-', 'benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '9-octadecenoic acid, (e)-', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], - 'el_C': [14, 16, 16, 18, 18, 7, 16, 18, 18, 6, 5, 6], - 'el_H': [28, 30, 32, 32, 34, 6, 32, 32, 34, 6, 8, 6], - 'el_O': [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2], - 'el_mf_C': [0.7363226343214958, 0.7553791124562713, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566, 0.6884785456927611, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566, 0.765763468281798, 0.5172250452157436, 0.6544909635818728], - 'el_mf_H': [0.12358891272934273, 0.11886325223065132, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203, 0.04952505732066819, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203, 0.06426522154925088, 0.06945138230987856, 0.054926891290527656], - 'el_mf_O': [0.1401147261023777, 0.12577335796548877, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815, 0.26202096298722566, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815, 0.17000318775900541, 0.413375247610025, 0.29060030878212695], - 'fg_C-aliph': [13, 15, 15, 17, 17, 0, 15, 17, 17, 0, 1, 0], - 'fg_C-arom': [0, 0, 0, 0, 0, 6, 0, 0, 0, 6, 0, 6], - 'fg_alcohol': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2], - 'fg_carboxyl': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], - 'fg_ketone': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], - 'fg_mf_C-aliph': [0.8029031834303979, 0.8230690617507173, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363, 0.0, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363, 0.0, 0.12080785462061837, 0.0], - 'fg_mf_C-arom': [0.0, 0.0, 0.0, 0.0, 0.0, 0.6313953488372093, 0.0, 0.0, 0.0, 0.8193178195728402, 0.0, 0.6911088911088911], - 'fg_mf_alcohol': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1807140580172139, 0.0, 0.3089092725456362], - 'fg_mf_carboxyl': [0.19712308972281825, 0.17694666090169414, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055, 0.3686292171634458, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055, 0.0, 0.3877099302385669, 0.0], - 'fg_mf_ketone': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.491533890276462, 0.0], - 'fg_mf_total': [1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273], - } - 
) - return deriv_compounds_properties - -@pytest.fixture -def checked_calibrations_added_iupac_only_iupac_and_mw(): - calibrations = { - 'calibration': pd.DataFrame( - index=pd.Index(['phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'tetradecanoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], name='comp_name'), - data={ - 'iupac_name': ['phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'tetradecanoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'MW': [94.11, 96.1271, 197.4, 228.3709, 256.4241, 280.4455, 282.4614], - }), - 'deriv_calibration': pd.DataFrame( - index=pd.Index(['benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '9-octadecenoic acid, (e)-', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], name='comp_name'), - data={ - 'iupac_name': ['benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(e)-octadec-9-enoic acid', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], - 'MW': [122.1213, 256.4241, 280.4455, 282.4614, 94.1112, 116.1152, 110.1106], - }) - } - return calibrations - -@pytest.fixture -def checked_files_added_iupac_only_iupac_and_time(): - files = { - 'A_1': pd.DataFrame( - index=pd.Index(['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_1'), - data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.025, 36.163, 40.052, 40.492, 43.847, 43.986], - }), - 'A_2': pd.DataFrame( - index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_2'), - data={ - 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.025, 26.284, 36.158, 40.041, 40.494, 43.847, 43.988], - }), - 'Ader_1': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_1'), - data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.027, 38.123, 41.729, 42.157, 45.253, 45.369], - }), - 'Ader_2': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_2'), - data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.027, 38.125, 41.744, 42.161, 45.258, 45.37], - }), - 'B_1': pd.DataFrame( - index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol'], name='B_1'), - data={ - 'iupac_name': ['butan-2-one', '2-methylcyclopent-2-en-1-one', 
'(e)-pent-2-enoic acid', 'hexane-2,5-dione', '4,5-dimethylhex-1-ene', 'phenol'], - 'retention_time': [8.527, 10.507, 11.071, 11.486, 12.214, 13.687], - }), - 'B_2': pd.DataFrame( - index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B_2'), - data={ - 'iupac_name': ['butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], - 'retention_time': [8.502, 10.474, 11.027, 11.456, 13.661], - }) - } - return files - -@pytest.fixture -def checked_files_applied_calibration(): - files = { - 'A_1': pd.DataFrame( - index=pd.Index(['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_1'), - data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.025, 36.163, 40.052, 40.492, 43.847, 43.986], - 'area': [23386, 44389, 15068, 1878180, 1456119, 6379752], - 'height': [24797, 15019, 5705, 493759, 339605, 1147599], - 'area_if_undiluted': [584650, 1109725, 376700, 46954500, 36402975, 159493800], - 'conc_vial_mg_L': [np.nan, 23.581503644987627, np.nan, 66.05436178187291, 131.18800047103497, 113.61850020825628], - 'conc_vial_if_undiluted_mg_L': [np.nan, 589.5375911246907, np.nan, 1651.3590445468228, 3279.7000117758744, 2840.462505206407], - 'fraction_of_sample_fr': [np.nan, 0.042109827937477896, np.nan, 0.11795421746763018, 0.23426428655541953, 0.20289017894331474], - 'fraction_of_feedstock_fr': [np.nan, 0.018949422571865052, np.nan, 0.053079397860433586, 0.10541892894993879, 0.09130058052449164], - 'compound_used_for_calibration': ['n.a.', 'self', 'n.a.', 'self', 'self', 'self'], - }), - 'A_2': pd.DataFrame( - index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_2'), - data={ - 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.025, 26.284, 36.158, 40.041, 40.494, 43.847, 43.988], - 'area': [25493, 10952, 50650, 21294, 1656756, 1371069, 6394708], - 'height': [25716, 4259, 14520, 6739, 461942, 324690, 1138647], - 'area_if_undiluted': [637325, 273800, 1266250, 532350, 41418900, 34276725, 159867700], - 'conc_vial_mg_L': [np.nan, 22.78427785050836, 23.730782309318595, np.nan, 61.11672684588226, 125.38077898437679, 113.82730072166243], - 'conc_vial_if_undiluted_mg_L': [np.nan, 569.606946262709, 593.2695577329649, np.nan, 1527.9181711470565, 3134.51947460942, 2845.682518041561], - 'fraction_of_sample_fr': [np.nan, 0.04068621044733635, 0.04237639698092605, np.nan, 0.10913701222478973, 0.2238942481863871, 0.20326303700296858], - 'fraction_of_feedstock_fr': [np.nan, 0.018715656805774722, 0.019493142611225985, np.nan, 0.05020302562340328, 0.10299135416573807, 0.09350099702136555], - 'compound_used_for_calibration': ['n.a.', 'tetradecanoic acid (sim=1.0; dwt=56)', 'self', 'n.a.', 'self', 'self', 'self'], - }), - 'Ader_1': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_1'), - 
data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.027, 38.123, 41.729, 42.157, 45.253, 45.369], - 'area': [16741, 49508, 27798, 1415205, 519476, 1724814], - 'height': [13451, 18415, 9132, 484890, 180850, 501749], - 'area_if_undiluted': [2092625, 6188500, 3474750, 176900625, 64934500, 215601750], - 'conc_vial_mg_L': [np.nan, 0.600983241036704, 2.5980281295127825, 27.623189632994073, 31.36776718294773, 21.669084708496513], - 'conc_vial_if_undiluted_mg_L': [np.nan, 75.12290512958799, 324.7535161890978, 3452.898704124259, 3920.970897868466, 2708.635588562064], - 'fraction_of_sample_fr': [np.nan, 0.005365921794970571, 0.023196679727792702, 0.24663562172316136, 0.2800693498477476, 0.193473970611576], - 'fraction_of_feedstock_fr': [np.nan, 0.0025219832436361683, 0.01090243947206257, 0.11591874220988584, 0.13163259442844139, 0.09093276618744071], - 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', '(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], - }), - 'Ader_2': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_2'), - data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.027, 38.125, 41.744, 42.161, 45.258, 45.37], - 'area': [14698, 53613, 25213, 1402990, 605137, 1956560], - 'height': [12802, 18373, 8775, 496504, 202599, 594688], - 'area_if_undiluted': [1837250, 6701625, 3151625, 175373750, 75642125, 244570000], - 'conc_vial_mg_L': [np.nan, 0.6822063507301317, 2.5689779135709925, 27.38149894239597, 36.81298755438084, 24.27344499617392], - 'conc_vial_if_undiluted_mg_L': [np.nan, 85.27579384126646, 321.12223919637404, 3422.6873677994963, 4601.623444297605, 3034.1806245217404], - 'fraction_of_sample_fr': [np.nan, 0.006091128131519033, 0.022937302799741006, 0.24447766912853547, 0.3286873888784004, 0.21672718746583858], - 'fraction_of_feedstock_fr': [np.nan, 0.0029237415031291357, 0.011009905343875682, 0.11734928118169702, 0.1577699466616322, 0.10402904998360252], - 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', '(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], - }), - 'B_1': pd.DataFrame( - index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol'], name='B_1'), - data={ - 'iupac_name': ['butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', '4,5-dimethylhex-1-ene', 'phenol'], - 'retention_time': [8.527, 10.507, 11.071, 11.486, 12.214, 13.687], - 'area': [147566, 69223, 40376, 441077, 19522, 200947], - 'height': [39393, 18515, 12132, 112797, 7194, 64421], - 'area_if_undiluted': [147566, 69223, 40376, 441077, 19522, 200947], - 'conc_vial_mg_L': [np.nan, 6.243800844792131, np.nan, np.nan, np.nan, 7.167230535550548], - 'conc_vial_if_undiluted_mg_L': [np.nan, 6.243800844792131, np.nan, np.nan, np.nan, 7.167230535550548], - 'fraction_of_sample_fr': [np.nan, 0.0022299288731400468, np.nan, np.nan, np.nan, 
0.0025597251912680527], - 'fraction_of_feedstock_fr': [np.nan, 0.001092665147838623, np.nan, np.nan, np.nan, 0.0012542653437213457], - 'compound_used_for_calibration': ['n.a.', 'self', 'n.a.', 'n.a.', 'n.a.', 'self'], - }), - 'B_2': pd.DataFrame( - index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B_2'), - data={ - 'iupac_name': ['butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], - 'retention_time': [8.502, 10.474, 11.027, 11.456, 13.661], - 'area': [181021, 64531, 35791, 472362, 228750], - 'height': [44551, 19823, 12737, 120142, 75153], - 'area_if_undiluted': [181021, 64531, 35791, 472362, 228750], - 'conc_vial_mg_L': [np.nan, 6.134683722446865, np.nan, np.nan, 7.884941445329839], - 'conc_vial_if_undiluted_mg_L': [np.nan, 6.134683722446865, np.nan, np.nan, 7.884941445329839], - 'fraction_of_sample_fr': [np.nan, 0.0021909584723024517, np.nan, np.nan, 0.002816050516189228], - 'fraction_of_feedstock_fr': [np.nan, 0.0010954792361512259, np.nan, np.nan, 0.001408025258094614], - 'compound_used_for_calibration': ['n.a.', 'self', 'n.a.', 'n.a.', 'self'], - }) - } - return files - -@pytest.fixture -def checked_files_info_added_stats(): - files_info = pd.DataFrame( - index=pd.Index(['A_1', 'A_2', 'Ader_1', 'Ader_2', 'B_1', 'B_2'], name='filename'), - data={ - 'samplename': ['A', 'A', 'Ader', 'Ader', 'B', 'B'], - 'derivatized': [False, False, True, True, False, False], - 'dilution_factor': [25, 25, 125, 125, 1, 1], - 'total_sample_conc_in_vial_mg_L': [560.0000000000001, 560.0000000000001, 112.0, 112.0, 2800.0, 2800.0], - 'sample_yield_on_feedstock_basis_fr': [0.45, 0.46, 0.47, 0.48, 0.49, 0.5], - 'calibration_file': ['calibration', 'calibration', 'deriv_calibration', 'deriv_calibration', 'calibration', 'calibration'], - 'max_height': [1147599.0, 1138647.0, 501749.0, 594688.0, 112797.0, 120142.0], - 'total_height': [2026484.0, 1976513.0, 1208487.0, 1333741.0, 254452.0, 272406.0], - 'max_area': [6379752.0, 6394708.0, 1724814.0, 1956560.0, 441077.0, 472362.0], - 'total_area': [9796894.0, 9530922.0, 3753542.0, 4058211.0, 918711.0, 982455.0], - 'max_area_if_undiluted': [159493800.0, 159867700.0, 215601750.0, 244570000.0, 441077.0, 472362.0], - 'total_area_if_undiluted': [244922350.0, 238273050.0, 469192750.0, 507276375.0, 918711.0, 982455.0], - 'max_conc_vial_mg_L': [131.18800047103497, 125.38077898437679, 31.36776718294773, 36.81298755438084, 7.167230535550548, 7.884941445329839], - 'total_conc_vial_mg_L': [334.4423661061518, 346.8398667117484, 83.8590528949878, 91.71911575725186, 13.411031380342678, 14.019625167776704], - 'max_conc_vial_if_undiluted_mg_L': [3279.7000117758744, 3134.51947460942, 3920.970897868466, 4601.623444297605, 7.167230535550548, 7.884941445329839], - 'total_conc_vial_if_undiluted_mg_L': [8361.059152653796, 8670.996667793712, 10482.381611873476, 11464.889469656482, 13.411031380342678, 14.019625167776704], - 'max_fraction_of_sample_fr': [0.23426428655541953, 0.2238942481863871, 0.2800693498477476, 0.3286873888784004, 0.0025597251912680527, 0.002816050516189228], - 'total_fraction_of_sample_fr': [0.5972185109038424, 0.6193569048424078, 0.7487415437052483, 0.8189206764040344, 0.0047896540644080995, 0.005007008988491679], - 'max_fraction_of_feedstock_fr': [0.10541892894993879, 0.10299135416573807, 0.13163259442844139, 0.1577699466616322, 0.0012542653437213457, 0.001408025258094614], - 'total_fraction_of_feedstock_fr': [0.26874832990672903, 0.28490417622750763, 
0.3519085255414667, 0.39308192467393654, 0.0023469304915599686, 0.0025035044942458397], - 'compound_with_max_area': ['oleic acid', 'oleic acid', '9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative', '2,5-hexanedione', '2,5-hexanedione'], - 'compound_with_max_conc': ['9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', 'phenol', 'phenol'], - } - ) - return files_info - - @pytest.fixture def checked_samples_info(): samples_info = pd.DataFrame( - index=pd.Index(['A', 'Ader', 'B'], name='samplename'), + index=pd.Index(['S', 'T'], name='samplename'), data={ - 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], - 'derivatized': [(False, False), (True, True), (False, False)], - 'dilution_factor': [(25, 25), (125, 125), (1, 1)], - 'total_sample_conc_in_vial_mg_L': [(560.0000000000001, 560.0000000000001), (112.0, 112.0), (2800.0, 2800.0)], - 'sample_yield_on_feedstock_basis_fr': [(0.45, 0.46), (0.47, 0.48), (0.49, 0.5)], - 'calibration_file': [('calibration', 'calibration'), ('deriv_calibration', 'deriv_calibration'), ('calibration', 'calibration')], - 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], - 'compound_with_max_conc': [('9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-'), ('9,12-octadecadienoic acid (z,z)-, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative'), ('phenol', 'phenol')], - 'max_height': [1143123.0, 548218.5, 116469.5], - 'max_area': [6387230.0, 1840687.0, 456719.5], - 'max_area_if_undiluted': [159680750.0, 230085875.0, 456719.5], - 'max_conc_vial_mg_L': [128.28438972770587, 34.090377368664285, 7.526085990440194], - 'max_conc_vial_if_undiluted_mg_L': [3207.1097431926473, 4261.297171083035, 7.526085990440194], - 'max_fraction_of_sample_fr': [0.2290792673709033, 0.304378369363074, 0.0026878878537286406], - 'max_fraction_of_feedstock_fr': [0.10420514155783843, 0.1447012705450368, 0.00133114530090798], - 'total_height': [2001498.5, 1271114.0, 263429.0], - 'total_area': [9663908.0, 3905876.5, 950583.0], - 'total_area_if_undiluted': [241597700.0, 488234562.5, 950583.0], - 'total_conc_vial_mg_L': [340.6411164089501, 87.78908432611982, 13.71532827405969], - 'total_conc_vial_if_undiluted_mg_L': [8516.027910223755, 10973.635540764979, 13.71532827405969], - 'total_fraction_of_sample_fr': [0.608287707873125, 0.7838311100546413, 0.0048983315264498895], - 'total_fraction_of_feedstock_fr': [0.27682625306711833, 0.3724952251077016, 0.002425217492902904], + 'filename': [('S_1', 'S_2'), ('T_1', 'T_2', 'T_3')], + 'replicate_number': [(1, 2), (1, 2, 3)], + 'derivatized': [(False, False), (False, False, False)], + 'calibration_file': [('cal_minimal', 'cal_minimal'), ('cal_minimal', 'cal_minimal', 'cal_minimal')], + 'dilution_factor': [(1, 1), (1, 1, 1)], + 'total_sample_conc_in_vial_mg_L': [(1, 1), (1, 1, 1)], + 'sample_yield_on_feedstock_basis_fr': [(1, 1), (1, 1, 1)], + 'compound_with_max_area': [('dodecane', 'dodecane'), ('dodecane', 'dodecane', 'dodecane')], + 'compound_with_max_conc': [('naphthalene', 'dodecane'), ('phenol', 'dodecane', 'dodecane')], + 'max_height': [3000.0, 1000.0], + 'max_area': [3000.0, 1000.0], + 'max_area_if_undiluted': [3000.0, 1000.0], + 'max_conc_vial_mg_L': [6.000000000000002, 3.0000000000000013], + 
'max_conc_vial_if_undiluted_mg_L': [6.000000000000002, 3.0000000000000013], + 'max_fraction_of_sample_fr': [6.000000000000002, 3.0000000000000013], + 'max_fraction_of_feedstock_fr': [6.000000000000002, 3.0000000000000013], + 'total_height': [3330.0, 1410.0], + 'total_area': [3330.0, 1110.0], + 'total_area_if_undiluted': [3330.0, 1110.0], + 'total_conc_vial_mg_L': [18.0, 6.000000000000003], + 'total_conc_vial_if_undiluted_mg_L': [18.0, 6.000000000000003], + 'total_fraction_of_sample_fr': [18.0, 6.000000000000003], + 'total_fraction_of_feedstock_fr': [18.0, 6.000000000000003], } ) return samples_info @pytest.fixture def checked_samples_info_std(): - samples_info_std = pd.DataFrame( - index=pd.Index(['A', 'Ader', 'B'], name='samplename'), + samples_info = pd.DataFrame( + index=pd.Index(['S', 'T'], name='samplename'), data={ - 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], - 'derivatized': [(False, False), (True, True), (False, False)], - 'dilution_factor': [(25, 25), (125, 125), (1, 1)], - 'total_sample_conc_in_vial_mg_L': [(560.0000000000001, 560.0000000000001), (112.0, 112.0), (2800.0, 2800.0)], - 'sample_yield_on_feedstock_basis_fr': [(0.45, 0.46), (0.47, 0.48), (0.49, 0.5)], - 'calibration_file': [('calibration', 'calibration'), ('deriv_calibration', 'deriv_calibration'), ('calibration', 'calibration')], - 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], - 'compound_with_max_conc': [('9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-'), ('9,12-octadecadienoic acid (z,z)-, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative'), ('phenol', 'phenol')], - 'max_height': [6330.019905181974, 65717.79713669654, 5193.699307815192], - 'max_area': [10575.489019426004, 163869.16811285765, 22121.83564942114], - 'max_area_if_undiluted': [264387.2254856501, 20483646.014107205, 22121.83564942114], - 'max_conc_vial_mg_L': [4.106325693068217, 3.8503522496954825, 0.5074982512365029], - 'max_conc_vial_if_undiluted_mg_L': [102.65814232670576, 481.29403121193536, 0.5074982512365029], - 'max_fraction_of_sample_fr': [0.007332724451907525, 0.03437814508656681, 0.00018124937544160814], - 'max_fraction_of_feedstock_fr': [0.001716554591745799, 0.01848189900635057, 0.00010872467812800095], - 'total_height': [35334.83296267297, 88567.95277073982, 12695.395149423275], - 'total_area': [188070.60480574841, 215433.51591732426, 45073.814659955286], - 'total_area_if_undiluted': [4701765.120143711, 26929189.48966553, 45073.814659955286], - 'total_conc_vial_mg_L': [8.766356747981709, 5.557903750459465, 0.4303407940826049], - 'total_conc_vial_if_undiluted_mg_L': [219.1589186995424, 694.7379688074318, 0.4303407940826049], - 'total_fraction_of_sample_fr': [0.015654208478538812, 0.049624140629102274, 0.00015369314074378663], - 'total_fraction_of_feedstock_fr': [0.011423908489230281, 0.029113989731069757, 0.00011071453905670015], + 'filename': [('S_1', 'S_2'), ('T_1', 'T_2', 'T_3')], + 'replicate_number': [(1, 2), (1, 2, 3)], + 'derivatized': [(False, False), (False, False, False)], + 'calibration_file': [('cal_minimal', 'cal_minimal'), ('cal_minimal', 'cal_minimal', 'cal_minimal')], + 'dilution_factor': [(1, 1), (1, 1, 1)], + 'total_sample_conc_in_vial_mg_L': [(1, 1), (1, 1, 1)], + 'sample_yield_on_feedstock_basis_fr': [(1, 1), (1, 1, 1)], + 'compound_with_max_area': [('dodecane', 'dodecane'), ('dodecane', 'dodecane', 'dodecane')], + 
'compound_with_max_conc': [('naphthalene', 'dodecane'), ('phenol', 'dodecane', 'dodecane')], + 'max_height': [1414.213562373095, 500.0], + 'max_area': [1414.213562373095, 500.0], + 'max_area_if_undiluted': [1414.213562373095, 500.0], + 'max_conc_vial_mg_L': [2.8284271247461903, 0.9999999999999994], + 'max_conc_vial_if_undiluted_mg_L': [2.8284271247461903, 0.9999999999999994], + 'max_fraction_of_sample_fr': [2.8284271247461903, 0.9999999999999994], + 'max_fraction_of_feedstock_fr': [2.8284271247461903, 0.9999999999999994], + 'total_height': [1569.7770542341354, 749.3997598078078], + 'total_area': [1569.7770542341354, 540.0], + 'total_area_if_undiluted': [1569.7770542341354, 540.0], + 'total_conc_vial_mg_L': [8.48528137423857, 8.881784197001252e-16], + 'total_conc_vial_if_undiluted_mg_L': [8.48528137423857, 8.881784197001252e-16], + 'total_fraction_of_sample_fr': [8.48528137423857, 8.881784197001252e-16], + 'total_fraction_of_feedstock_fr': [8.48528137423857, 8.881784197001252e-16], } ) - return samples_info_std - -@pytest.fixture -def checked_samples_info_no_calibrations(): - samples_info = pd.DataFrame( - index=pd.Index(['A', 'Ader', 'B'], name='samplename'), - data={ - 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], - 'replicate_number': [('1', '2'), ('1', '2'), ('1', '2')], - 'derivatized': [(False, False), (False, False), (False, False)], - 'calibration_file': [(False, False), (False, False), (False, False)], - 'dilution_factor': [(1, 1), (1, 1), (1, 1)], - 'total_sample_conc_in_vial_mg_L': [(1, 1), (1, 1), (1, 1)], - 'sample_yield_on_feedstock_basis_fr': [(1, 1), (1, 1), (1, 1)], - 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], - 'max_height': [1143123.0, 548218.5, 116469.5], - 'max_area': [6387230.0, 1840687.0, 456719.5], - 'max_area_if_undiluted': [6387230.0, 1840687.0, 456719.5], - 'total_height': [2001498.5, 1271114.0, 263429.0], - 'total_area': [9663908.0, 3905876.5, 950583.0], - 'total_area_if_undiluted': [9663908.0, 3905876.5, 950583.0], - } -) return samples_info -@pytest.fixture -def checked_samples_info_no_calibrations_std(): - samples_info_std = pd.DataFrame( - index=pd.Index(['A', 'Ader', 'B'], name='samplename'), - data={ - 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], - 'replicate_number': [('1', '2'), ('1', '2'), ('1', '2')], - 'derivatized': [(False, False), (False, False), (False, False)], - 'calibration_file': [(False, False), (False, False), (False, False)], - 'dilution_factor': [(1, 1), (1, 1), (1, 1)], - 'total_sample_conc_in_vial_mg_L': [(1, 1), (1, 1), (1, 1)], - 'sample_yield_on_feedstock_basis_fr': [(1, 1), (1, 1), (1, 1)], - 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], - 'max_height': [6330.019905181974, 65717.79713669654, 5193.699307815192], - 'max_area': [10575.489019426004, 163869.16811285765, 22121.83564942114], - 'max_area_if_undiluted': [10575.489019426004, 163869.16811285765, 22121.83564942114], - 'total_height': [35334.83296267297, 88567.95277073982, 12695.395149423275], - 'total_area': [188070.60480574841, 215433.51591732426, 45073.814659955286], - 'total_area_if_undiluted': [188070.60480574841, 215433.51591732426, 45073.814659955286], - } -) - return samples_info_std - @pytest.fixture def checked_samples(): samples = { - 'A': 
pd.DataFrame( - index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A'), + 'S': pd.DataFrame( + index=pd.Index(['phenol', 'naphthalene', 'dodecane'], name='S'), data={ - 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.025, 26.284, 36.1605, 40.046499999999995, 40.492999999999995, 43.847, 43.986999999999995], - 'area': [24439.5, 10952.0, 47519.5, 18181.0, 1767468.0, 1413594.0, 6387230.0], - 'height': [25256.5, 4259.0, 14769.5, 6222.0, 477850.5, 332147.5, 1143123.0], - 'area_if_undiluted': [610987.5, 273800.0, 1187987.5, 454525.0, 44186700.0, 35339850.0, 159680750.0], - 'conc_vial_mg_L': [0.0, 22.78427785050836, 23.65614297715311, 0.0, 63.58554431387759, 128.28438972770587, 113.72290046495935], - 'conc_vial_if_undiluted_mg_L': [0.0, 569.606946262709, 591.4035744288278, 0.0, 1589.6386078469395, 3207.1097431926473, 2843.0725116239837], - 'fraction_of_sample_fr': [0.0, 0.04068621044733635, 0.042243112459201974, 0.0, 0.11354561484620995, 0.2290792673709033, 0.20307660797314164], - 'fraction_of_feedstock_fr': [0.0, 0.018715656805774722, 0.01922128259154552, 0.0, 0.051641211741918436, 0.10420514155783843, 0.0924007887729286], - 'compound_used_for_calibration': ['n.a.', 'tetradecanoic acid (sim=1.0; dwt=56)', 'self', 'n.a.', 'self', 'self', 'self'], - } + 'iupac_name': ['phenol', 'naphthalene', 'dodecane'], + 'retention_time': [13.703, 20.942, 21.426], + 'area': [30.0, 300.0, 3000.0], + 'height': [30.0, 300.0, 3000.0], + 'area_if_undiluted': [30.0, 300.0, 3000.0], + 'conc_vial_mg_L': [6.0, 6.0, 6.000000000000002], + 'conc_vial_if_undiluted_mg_L': [6.0, 6.0, 6.000000000000002], + 'fraction_of_sample_fr': [6.0, 6.0, 6.000000000000002], + 'fraction_of_feedstock_fr': [6.0, 6.0, 6.000000000000002], + 'compound_used_for_calibration': ['self', 'self', 'self'], + } ), - 'Ader': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader'), - data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.027, 38.123999999999995, 41.7365, 42.159, 45.2555, 45.3695], - 'area': [15719.5, 51560.5, 26505.5, 1409097.5, 562306.5, 1840687.0], - 'height': [13126.5, 18394.0, 8953.5, 490697.0, 191724.5, 548218.5], - 'area_if_undiluted': [1964937.5, 6445062.5, 3313187.5, 176137187.5, 70288312.5, 230085875.0], - 'conc_vial_mg_L': [0.0, 0.6415947958834178, 2.5835030215418877, 27.502344287695024, 34.090377368664285, 22.971264852335217], - 'conc_vial_if_undiluted_mg_L': [0.0, 80.19934948542723, 322.9378776927359, 3437.7930359618776, 4261.297171083035, 2871.4081065419023], - 'fraction_of_sample_fr': [0.0, 0.005728524963244802, 0.023066991263766854, 0.24555664542584843, 0.304378369363074, 0.2051005790387073], - 'fraction_of_feedstock_fr': [0.0, 0.002722862373382652, 0.010956172407969126, 0.11663401169579143, 0.1447012705450368, 0.09748090808552162], - 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', 
'(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], - } - ) -, - 'B': pd.DataFrame( - index=pd.Index(['1-hexene, 4,5-dimethyl-', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B'), - data={ - 'iupac_name': ['4,5-dimethylhex-1-ene', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], - 'retention_time': [6.107, 8.5145, 10.4905, 11.049, 11.471, 13.674], - 'area': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], - 'height': [3597.0, 41972.0, 19169.0, 12434.5, 116469.5, 69787.0], - 'area_if_undiluted': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], - 'conc_vial_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], - 'conc_vial_if_undiluted_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], - 'fraction_of_sample_fr': [0.0, 0.0, 0.0022104436727212492, 0.0, 0.0, 0.0026878878537286406], - 'fraction_of_feedstock_fr': [0.0, 0.0, 0.0010940721919949245, 0.0, 0.0, 0.00133114530090798], - 'compound_used_for_calibration': ['n.a.', 'n.a.', 'self', 'n.a.', 'n.a.', 'self'], - } - ) + 'T': pd.DataFrame( + index=pd.Index(['phenol', 'naphthalene', 'dodecane'], name='T'), + data={ + 'iupac_name': ['phenol', 'naphthalene', 'dodecane'], + 'retention_time': [13.702999999999998, 20.942, 21.426], + 'area': [10.0, 100.0, 1000.0], + 'height': [10.0, 400.0, 1000.0], + 'area_if_undiluted': [10.0, 100.0, 1000.0], + 'conc_vial_mg_L': [2.0000000000000004, 2.000000000000001, 2.0000000000000013], + 'conc_vial_if_undiluted_mg_L': [2.0000000000000004, 2.000000000000001, 2.0000000000000013], + 'fraction_of_sample_fr': [2.0000000000000004, 2.000000000000001, 2.0000000000000013], + 'fraction_of_feedstock_fr': [2.0000000000000004, 2.000000000000001, 2.0000000000000013], + 'compound_used_for_calibration': ['self', 'self', 'self'], + } + ) } return samples @pytest.fixture -def checked_samples_applied_calibration(): - samples = { - 'A': pd.DataFrame( - index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A'), - data={ - 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.025, 26.284, 36.1605, 40.046499999999995, 40.492999999999995, 43.847, 43.986999999999995], - 'area': [24439.5, 10952.0, 47519.5, 18181.0, 1767468.0, 1413594.0, 6387230.0], - 'height': [25256.5, 4259.0, 14769.5, 6222.0, 477850.5, 332147.5, 1143123.0], - 'area_if_undiluted': [610987.5, 273800.0, 1187987.5, 454525.0, 44186700.0, 35339850.0, 159680750.0], - 'conc_vial_mg_L': [0.0, 22.78427785050836, 23.65614297715311, 0.0, 63.58554431387759, 128.28438972770587, 113.72290046495935], - 'conc_vial_if_undiluted_mg_L': [0.0, 569.606946262709, 591.4035744288278, 0.0, 1589.6386078469395, 3207.1097431926473, 2843.0725116239837], - 'fraction_of_sample_fr': [0.0, 0.04068621044733635, 0.042243112459201974, 0.0, 0.11354561484620995, 0.2290792673709033, 0.20307660797314164], - 'fraction_of_feedstock_fr': [0.0, 0.018715656805774722, 0.01922128259154552, 0.0, 0.051641211741918436, 0.10420514155783843, 0.0924007887729286], - 'compound_used_for_calibration': ['n.a.', 'tetradecanoic acid (sim=1.0; dwt=56)', 'self', 'n.a.', 'self', 'self', 'self'], - } - ), - 'Ader': pd.DataFrame( - index=pd.Index(['unidentified', 'myristic acid, tms derivative', 
'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader'), - data={ - 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], - 'retention_time': [6.027, 38.123999999999995, 41.7365, 42.159, 45.2555, 45.3695], - 'area': [15719.5, 51560.5, 26505.5, 1409097.5, 562306.5, 1840687.0], - 'height': [13126.5, 18394.0, 8953.5, 490697.0, 191724.5, 548218.5], - 'area_if_undiluted': [1964937.5, 6445062.5, 3313187.5, 176137187.5, 70288312.5, 230085875.0], - 'conc_vial_mg_L': [0.0, 0.6415947958834178, 2.5835030215418877, 27.502344287695024, 34.090377368664285, 22.971264852335217], - 'conc_vial_if_undiluted_mg_L': [0.0, 80.19934948542723, 322.9378776927359, 3437.7930359618776, 4261.297171083035, 2871.4081065419023], - 'fraction_of_sample_fr': [0.0, 0.005728524963244802, 0.023066991263766854, 0.24555664542584843, 0.304378369363074, 0.2051005790387073], - 'fraction_of_feedstock_fr': [0.0, 0.002722862373382652, 0.010956172407969126, 0.11663401169579143, 0.1447012705450368, 0.09748090808552162], - 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', '(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], - } - ) -, - 'B': pd.DataFrame( - index=pd.Index(['1-hexene, 4,5-dimethyl-', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B'), - data={ - 'iupac_name': ['4,5-dimethylhex-1-ene', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], - 'retention_time': [6.107, 8.5145, 10.4905, 11.049, 11.471, 13.674], - 'area': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], - 'height': [3597.0, 41972.0, 19169.0, 12434.5, 116469.5, 69787.0], - 'area_if_undiluted': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], - 'conc_vial_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], - 'conc_vial_if_undiluted_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], - 'fraction_of_sample_fr': [0.0, 0.0, 0.0022104436727212492, 0.0, 0.0, 0.0026878878537286406], - 'fraction_of_feedstock_fr': [0.0, 0.0, 0.0010940721919949245, 0.0, 0.0, 0.00133114530090798], - 'compound_used_for_calibration': ['n.a.', 'n.a.', 'self', 'n.a.', 'n.a.', 'self'], - } - ) +def checked_samples_std(): + samples_std = { + 'S': pd.DataFrame( + index=pd.Index(['phenol', 'naphthalene', 'dodecane'], name='S'), + data={ + 'iupac_name': ['phenol', 'naphthalene', 'dodecane'], + 'retention_time': [0.0, 0.0, 0.0], + 'area': [14.142135623730951, 141.4213562373095, 1414.213562373095], + 'height': [14.142135623730951, 141.4213562373095, 1414.213562373095], + 'area_if_undiluted': [14.142135623730951, 141.4213562373095, 1414.213562373095], + 'conc_vial_mg_L': [2.8284271247461903, 2.82842712474619, 2.8284271247461903], + 'conc_vial_if_undiluted_mg_L': [2.8284271247461903, 2.82842712474619, 2.8284271247461903], + 'fraction_of_sample_fr': [2.8284271247461903, 2.82842712474619, 2.8284271247461903], + 'fraction_of_feedstock_fr': [2.8284271247461903, 2.82842712474619, 2.8284271247461903], + 'compound_used_for_calibration': ['self', 'self', 'self'], + } + ), + 'T': pd.DataFrame( + index=pd.Index(['phenol', 'naphthalene', 'dodecane'], name='T'), + data={ + 'iupac_name': ['phenol', 'naphthalene', 'dodecane'], + 
'retention_time': [0.0, 0.0, 0.0], + 'area': [10.0, 50.0, 500.0], + 'height': [10.0, 522.0153254455275, 500.0], + 'area_if_undiluted': [10.0, 50.0, 500.0], + 'conc_vial_mg_L': [1.9999999999999996, 1.0, 1.0], + 'conc_vial_if_undiluted_mg_L': [1.9999999999999996, 1.0, 1.0], + 'fraction_of_sample_fr': [1.9999999999999996, 1.0, 1.0], + 'fraction_of_feedstock_fr': [1.9999999999999996, 1.0, 1.0], + 'compound_used_for_calibration': ['self', 'self', 'self'], } - return samples +) + } + return samples_std @pytest.fixture def checked_files_param_reports(): reports = { - 'height': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', '4,5-dimethylhex-1-ene', 'oxacycloheptadecan-2-one', 'decanoic acid'], name='height'), - data={ - 'A_1': [1147599.0, 493759.0, 339605.0, 0.0, 0.0, 0.0, 0.0, 15019.0, 0.0, 0.0, 0.0, 5705.0, 0.0], - 'A_2': [1138647.0, 461942.0, 324690.0, 0.0, 0.0, 0.0, 0.0, 14520.0, 0.0, 0.0, 0.0, 6739.0, 4259.0], - 'Ader_1': [501749.0, 484890.0, 180850.0, 0.0, 0.0, 0.0, 0.0, 18415.0, 0.0, 9132.0, 0.0, 0.0, 0.0], - 'Ader_2': [594688.0, 496504.0, 202599.0, 0.0, 0.0, 0.0, 0.0, 18373.0, 0.0, 8775.0, 0.0, 0.0, 0.0], - 'B_1': [0.0, 0.0, 0.0, 112797.0, 64421.0, 39393.0, 18515.0, 0.0, 12132.0, 0.0, 7194.0, 0.0, 0.0], - 'B_2': [0.0, 0.0, 0.0, 120142.0, 75153.0, 44551.0, 19823.0, 0.0, 12737.0, 0.0, 0.0, 0.0, 0.0], - } - ), - 'area': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='area'), - data={ - 'A_1': [6379752.0, 1878180.0, 1456119.0, 0.0, 0.0, 0.0, 0.0, 44389.0, 0.0, 0.0, 15068.0, 0.0, 0.0], - 'A_2': [6394708.0, 1656756.0, 1371069.0, 0.0, 0.0, 0.0, 0.0, 50650.0, 0.0, 0.0, 21294.0, 0.0, 10952.0], - 'Ader_1': [1724814.0, 1415205.0, 519476.0, 0.0, 0.0, 0.0, 0.0, 49508.0, 0.0, 27798.0, 0.0, 0.0, 0.0], - 'Ader_2': [1956560.0, 1402990.0, 605137.0, 0.0, 0.0, 0.0, 0.0, 53613.0, 0.0, 25213.0, 0.0, 0.0, 0.0], - 'B_1': [0.0, 0.0, 0.0, 441077.0, 200947.0, 147566.0, 69223.0, 0.0, 40376.0, 0.0, 0.0, 19522.0, 0.0], - 'B_2': [0.0, 0.0, 0.0, 472362.0, 228750.0, 181021.0, 64531.0, 0.0, 35791.0, 0.0, 0.0, 0.0, 0.0], - } - ), - 'area_if_undiluted': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', 'hexane-2,5-dione', 'decanoic acid', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', '4,5-dimethylhex-1-ene'], name='area_if_undiluted'), - data={ - 'A_1': [159493800.0, 46954500.0, 36402975.0, 1109725.0, 0.0, 376700.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - 'A_2': [159867700.0, 41418900.0, 34276725.0, 1266250.0, 0.0, 532350.0, 0.0, 273800.0, 0.0, 0.0, 0.0, 0.0, 0.0], - 'Ader_1': [215601750.0, 176900625.0, 64934500.0, 6188500.0, 3474750.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - 'Ader_2': [244570000.0, 175373750.0, 75642125.0, 6701625.0, 3151625.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 441077.0, 0.0, 200947.0, 147566.0, 69223.0, 40376.0, 19522.0], - 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
472362.0, 0.0, 228750.0, 181021.0, 64531.0, 35791.0, 0.0], - } - ), - 'conc_vial_mg_L': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'hexadecanoic acid', 'tetradecanoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one', '(e)-hexadec-9-enoic acid'], name='conc_vial_mg_L'), - data={ - 'A_1': [131.18800047103497, 113.61850020825628, 66.05436178187291, 23.581503644987627, 0.0, 0.0, 0.0, 0.0], - 'A_2': [125.38077898437679, 113.82730072166243, 61.11672684588226, 23.730782309318595, 22.78427785050836, 0.0, 0.0, 0.0], - 'Ader_1': [31.36776718294773, 21.669084708496513, 27.623189632994073, 0.600983241036704, 0.0, 0.0, 0.0, 2.5980281295127825], - 'Ader_2': [36.81298755438084, 24.27344499617392, 27.38149894239597, 0.6822063507301317, 0.0, 0.0, 0.0, 2.5689779135709925], - 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 7.167230535550548, 6.243800844792131, 0.0], - 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 7.884941445329839, 6.134683722446865, 0.0], - } - ), - 'conc_vial_if_undiluted_mg_L': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', 'decanoic acid', '(e)-hexadec-9-enoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='conc_vial_if_undiluted_mg_L'), - data={ - 'A_1': [3279.7000117758744, 1651.3590445468228, 2840.462505206407, 589.5375911246907, 0.0, 0.0, 0.0, 0.0], - 'A_2': [3134.51947460942, 1527.9181711470565, 2845.682518041561, 593.2695577329649, 569.606946262709, 0.0, 0.0, 0.0], - 'Ader_1': [3920.970897868466, 3452.898704124259, 2708.635588562064, 75.12290512958799, 0.0, 324.7535161890978, 0.0, 0.0], - 'Ader_2': [4601.623444297605, 3422.6873677994963, 3034.1806245217404, 85.27579384126646, 0.0, 321.12223919637404, 0.0, 0.0], - 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.167230535550548, 6.243800844792131], - 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.884941445329839, 6.134683722446865], - } - ), - 'fraction_of_sample_fr': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', 'decanoic acid', '(e)-hexadec-9-enoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_sample_fr'), - data={ - 'A_1': [0.23426428655541953, 0.11795421746763018, 0.20289017894331474, 0.042109827937477896, 0.0, 0.0, 0.0, 0.0], - 'A_2': [0.2238942481863871, 0.10913701222478973, 0.20326303700296858, 0.04237639698092605, 0.04068621044733635, 0.0, 0.0, 0.0], - 'Ader_1': [0.2800693498477476, 0.24663562172316136, 0.193473970611576, 0.005365921794970571, 0.0, 0.023196679727792702, 0.0, 0.0], - 'Ader_2': [0.3286873888784004, 0.24447766912853547, 0.21672718746583858, 0.006091128131519033, 0.0, 0.022937302799741006, 0.0, 0.0], - 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0025597251912680527, 0.0022299288731400468], - 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002816050516189228, 0.0021909584723024517], - } - ), - 'fraction_of_feedstock_fr': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', 'decanoic acid', '(e)-hexadec-9-enoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_feedstock_fr'), - data={ - 'A_1': [0.10541892894993879, 0.053079397860433586, 0.09130058052449164, 0.018949422571865052, 0.0, 0.0, 0.0, 0.0], - 'A_2': [0.10299135416573807, 0.05020302562340328, 0.09350099702136555, 0.019493142611225985, 0.018715656805774722, 0.0, 0.0, 0.0], - 'Ader_1': [0.13163259442844139, 
0.11591874220988584, 0.09093276618744071, 0.0025219832436361683, 0.0, 0.01090243947206257, 0.0, 0.0], - 'Ader_2': [0.1577699466616322, 0.11734928118169702, 0.10402904998360252, 0.0029237415031291357, 0.0, 0.011009905343875682, 0.0, 0.0], - 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0012542653437213457, 0.001092665147838623], - 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.001408025258094614, 0.0010954792361512259], - } - ), + 'height': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='height'), + data={ + 'S_1': [2000.0, 200.0, 20.0], + 'S_2': [4000.0, 400.0, 40.0], + 'T_1': [500.0, 50.0, 20.0], + 'T_2': [1000.0, 1000.0, 10.0], + 'T_3': [1500.0, 150.0, 0.0], + } +), +'area': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='area'), + data={ + 'S_1': [2000.0, 200.0, 20.0], + 'S_2': [4000.0, 400.0, 40.0], + 'T_1': [500.0, 50.0, 20.0], + 'T_2': [1000.0, 100.0, 10.0], + 'T_3': [1500.0, 150.0, 0.0], + } +), +'area_if_undiluted': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='area_if_undiluted'), + data={ + 'S_1': [2000.0, 200.0, 20.0], + 'S_2': [4000.0, 400.0, 40.0], + 'T_1': [500.0, 50.0, 20.0], + 'T_2': [1000.0, 100.0, 10.0], + 'T_3': [1500.0, 150.0, 0.0], + } +), +'conc_vial_mg_L': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='conc_vial_mg_L'), + data={ + 'S_1': [4.000000000000001, 4.0, 4.000000000000001], + 'S_2': [8.000000000000002, 8.0, 8.0], + 'T_1': [1.0000000000000013, 4.0, 1.000000000000001], + 'T_2': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + 'T_3': [3.0000000000000013, 4.61113287893387e-16, 3.000000000000001], + } +), +'conc_vial_if_undiluted_mg_L': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='conc_vial_if_undiluted_mg_L'), + data={ + 'S_1': [4.000000000000001, 4.0, 4.000000000000001], + 'S_2': [8.000000000000002, 8.0, 8.0], + 'T_1': [1.0000000000000013, 4.0, 1.000000000000001], + 'T_2': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + 'T_3': [3.0000000000000013, 4.61113287893387e-16, 3.000000000000001], + } +), +'fraction_of_sample_fr': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='fraction_of_sample_fr'), + data={ + 'S_1': [4.000000000000001, 4.0, 4.000000000000001], + 'S_2': [8.000000000000002, 8.0, 8.0], + 'T_1': [1.0000000000000013, 4.0, 1.000000000000001], + 'T_2': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + 'T_3': [3.0000000000000013, 4.61113287893387e-16, 3.000000000000001], + } +), +'fraction_of_feedstock_fr': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='fraction_of_feedstock_fr'), + data={ + 'S_1': [4.000000000000001, 4.0, 4.000000000000001], + 'S_2': [8.000000000000002, 8.0, 8.0], + 'T_1': [1.0000000000000013, 4.0, 1.000000000000001], + 'T_2': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + 'T_3': [3.0000000000000013, 4.61113287893387e-16, 3.000000000000001], + } +), } return reports @@ -755,83 +459,83 @@ def checked_files_param_reports(): @pytest.fixture def checked_files_param_aggrreps(): reports = { - 'height': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='height'), - data={ - 'A_1': [1673299.018636137, 327039.28314786457, 0.0, 0.0, 0.0, 1301.4243936952162], - 'A_2': [1630562.5435198732, 318647.09040130535, 0.0, 0.0, 0.0, 1537.300436303604], - 'Ader_1': [995669.7397917995, 199362.50083044838, 0.0, 0.0, 0.0, 0.0], - 'Ader_2': [1101297.4487493301, 
219631.74381979596, 0.0, 0.0, 0.0, 0.0], - 'B_1': [30189.53968239826, 5454.916540151818, 154971.12017291295, 52781.27325470194, 11641.780331526938, 0.0], - 'B_2': [24976.13637228884, 5726.94295844986, 167175.26975375, 61574.19209435766, 13581.203602167678, 0.0], - } - ), - 'area': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='area'), - data={ - 'A_1': [8180808.865926539, 1588883.8460032765, 0.0, 0.0, 0.0, 3437.311615109468], - 'A_2': [7957332.218705356, 1542835.8875348074, 0.0, 0.0, 0.0, 4857.58650996423], - 'Ader_1': [3115375.5519706816, 621383.3360462904, 0.0, 0.0, 0.0, 0.0], - 'Ader_2': [3373187.7167576416, 670272.3970037951, 0.0, 0.0, 0.0, 0.0], - 'B_1': [102813.63412565118, 18154.278785457453, 598982.0949258526, 164639.4578897035, 36313.94781638508, 0.0], - 'B_2': [85688.75337144051, 16092.723202157411, 653960.780006639, 187418.9512272872, 41338.34077143768, 0.0], - } - ), - 'area_if_undiluted': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'ester', 'alcohol'], name='area_if_undiluted'), - data={ - 'A_1': [204520221.64816347, 39722096.15008192, 0.0, 0.0, 85932.7903777367, 0.0], - 'A_2': [198933305.4676339, 38570897.18837019, 0.0, 0.0, 121439.66274910574, 0.0], - 'Ader_1': [389421943.99633527, 77672917.00578631, 0.0, 0.0, 0.0, 0.0], - 'Ader_2': [421648464.59470516, 83784049.6254744, 0.0, 0.0, 0.0, 0.0], - 'B_1': [102813.63412565118, 18154.278785457453, 598982.0949258526, 164639.4578897035, 0.0, 36313.94781638508], - 'B_2': [85688.75337144051, 16092.723202157411, 653960.780006639, 187418.9512272872, 0.0, 41338.34077143768], - } - ), - 'conc_vial_mg_L': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_mg_L'), - data={ - 'A_1': [279.04506020687086, 55.41196015223182, 0.0, 0.0, 0.0], - 'A_2': [287.2245498713612, 59.62973999202588, 0.0, 0.0, 0.0], - 'Ader_1': [69.94687725385289, 13.916672123312216, 0.0, 0.0, 0.0], - 'Ader_2': [76.54999067692155, 15.17432861960896, 0.0, 0.0, 0.0], - 'B_1': [2.7332298278341587, 0.0, 5.872239694763155, 3.706919814979471, 1.2952193148242288], - 'B_2': [2.6854637025308077, 0.0, 6.460273032447163, 3.6421374119160257, 1.424919765813671], - } - ), - 'conc_vial_if_undiluted_mg_L': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), - data={ - 'A_1': [6976.126505171772, 1385.2990038057956, 0.0, 0.0, 0.0], - 'A_2': [7180.61374678403, 1490.7434998006472, 0.0, 0.0, 0.0], - 'Ader_1': [8743.35965673161, 1739.5840154140271, 0.0, 0.0, 0.0], - 'Ader_2': [9568.748834615195, 1896.79107745112, 0.0, 0.0, 0.0], - 'B_1': [2.7332298278341587, 0.0, 5.872239694763155, 3.706919814979471, 1.2952193148242288], - 'B_2': [2.6854637025308077, 0.0, 6.460273032447163, 3.6421374119160257, 1.424919765813671], - } - ), - 'fraction_of_sample_fr': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_sample_fr'), - data={ - 'A_1': [0.4982947503694122, 0.09894992884327108, 0.0, 0.0, 0.0], - 'A_2': [0.5129009819131448, 0.10648167855718907, 0.0, 0.0, 0.0], - 'Ader_1': [0.6245256897665437, 0.12425600110100192, 0.0, 0.0, 0.0], - 'Ader_2': [0.6834820596153709, 0.1354850769607943, 0.0, 0.0, 0.0], - 'B_1': [0.0009761535099407709, 0.0, 0.0020972284624154124, 0.0013238999339212397, 0.00046257832672293885], - 'B_2': [0.0009590941794752884, 0.0, 0.0023072403687311297, 0.0013007633613985805, 0.0005088999163620254], - } - ), - 
'fraction_of_feedstock_fr': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_feedstock_fr'), - data={ - 'A_1': [0.22423263766623547, 0.04452746797947199, 0.0, 0.0, 0.0], - 'A_2': [0.23593445168004665, 0.04898157213630697, 0.0, 0.0, 0.0], - 'Ader_1': [0.29352707419027546, 0.058400320517470905, 0.0, 0.0, 0.0], - 'Ader_2': [0.32807138861537805, 0.06503283694118125, 0.0, 0.0, 0.0], - 'B_1': [0.00047831521987097775, 0.0, 0.001027641946583552, 0.0006487109676214074, 0.00022666338009424], - 'B_2': [0.0004795470897376442, 0.0, 0.0011536201843655649, 0.0006503816806992902, 0.0002544499581810127], - } - ) + 'height': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='height'), + data={ + 'S_1': [2000.117419127576, 216.3925981016854, 3.6142811603442784], + 'S_2': [4000.234838255152, 432.7851962033708, 7.228562320688557], + 'T_1': [500.029354781894, 66.38791681901395, 3.6142811603442784], + 'T_2': [1000.058709563788, 1008.2243867468712, 1.8071405801721392], + 'T_3': [1500.0880643456821, 150.00468128267144, 0.0], + } +), +'area': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='area'), + data={ + 'S_1': [2000.117419127576, 216.3925981016854, 3.6142811603442784], + 'S_2': [4000.234838255152, 432.7851962033708, 7.228562320688557], + 'T_1': [500.029354781894, 66.38791681901395, 3.6142811603442784], + 'T_2': [1000.058709563788, 108.1962990508427, 1.8071405801721392], + 'T_3': [1500.0880643456821, 150.00468128267144, 0.0], + } +), +'area_if_undiluted': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='area_if_undiluted'), + data={ + 'S_1': [2000.117419127576, 216.3925981016854, 3.6142811603442784], + 'S_2': [4000.234838255152, 432.7851962033708, 7.228562320688557], + 'T_1': [500.029354781894, 66.38791681901395, 3.6142811603442784], + 'T_2': [1000.058709563788, 108.1962990508427, 1.8071405801721392], + 'T_3': [1500.0880643456821, 150.00468128267144, 0.0], + } +), +'conc_vial_mg_L': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='conc_vial_mg_L'), + data={ + 'S_1': [7.277396112495934, 4.000234838255153, 0.7228562320688556], + 'S_2': [14.554792224991864, 8.000469676510306, 1.4457124641377113], + 'T_1': [4.277302486842505, 1.0000587095637894, 0.7228562320688556], + 'T_2': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], + 'T_3': [3.00009362565343, 3.0001761286913657, 8.332965346087379e-17], + } +), +'conc_vial_if_undiluted_mg_L': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), + data={ + 'S_1': [7.277396112495934, 4.000234838255153, 0.7228562320688556], + 'S_2': [14.554792224991864, 8.000469676510306, 1.4457124641377113], + 'T_1': [4.277302486842505, 1.0000587095637894, 0.7228562320688556], + 'T_2': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], + 'T_3': [3.00009362565343, 3.0001761286913657, 8.332965346087379e-17], + } +), +'fraction_of_sample_fr': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='fraction_of_sample_fr'), + data={ + 'S_1': [7.277396112495934, 4.000234838255153, 0.7228562320688556], + 'S_2': [14.554792224991864, 8.000469676510306, 1.4457124641377113], + 'T_1': [4.277302486842505, 1.0000587095637894, 0.7228562320688556], + 'T_2': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], + 'T_3': [3.00009362565343, 3.0001761286913657, 8.332965346087379e-17], + } +), +'fraction_of_feedstock_fr': +pd.DataFrame( + index=pd.Index(['C-arom', 
'C-aliph', 'alcohol'], name='fraction_of_feedstock_fr'), + data={ + 'S_1': [7.277396112495934, 4.000234838255153, 0.7228562320688556], + 'S_2': [14.554792224991864, 8.000469676510306, 1.4457124641377113], + 'T_1': [4.277302486842505, 1.0000587095637894, 0.7228562320688556], + 'T_2': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], + 'T_3': [3.00009362565343, 3.0001761286913657, 8.332965346087379e-17], + } +) } return reports @@ -839,249 +543,1329 @@ def checked_files_param_aggrreps(): @pytest.fixture def checked_samples_param_reports(): reports = { - 'height': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='height'), - data={ - 'A': [1143123.0, 477850.5, 332147.5, 0.0, 0.0, 0.0, 0.0, 14769.5, 0.0, 0.0, 6222.0, 0.0, 2129.5], - 'Ader': [548218.5, 490697.0, 191724.5, 0.0, 0.0, 0.0, 0.0, 18394.0, 0.0, 8953.5, 0.0, 0.0, 0.0], - 'B': [0.0, 0.0, 0.0, 116469.5, 69787.0, 41972.0, 19169.0, 0.0, 12434.5, 0.0, 0.0, 3597.0, 0.0], - } - ), - 'area': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='area'), - data={ - 'A': [6387230.0, 1767468.0, 1413594.0, 0.0, 0.0, 0.0, 0.0, 47519.5, 0.0, 0.0, 18181.0, 0.0, 5476.0], - 'Ader': [1840687.0, 1409097.5, 562306.5, 0.0, 0.0, 0.0, 0.0, 51560.5, 0.0, 26505.5, 0.0, 0.0, 0.0], - 'B': [0.0, 0.0, 0.0, 456719.5, 214848.5, 164293.5, 66877.0, 0.0, 38083.5, 0.0, 0.0, 9761.0, 0.0], - } - ), - 'area_if_undiluted': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexane-2,5-dione', 'oxacycloheptadecan-2-one', 'phenol', 'butan-2-one', 'decanoic acid', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', '4,5-dimethylhex-1-ene'], name='area_if_undiluted'), - data={ - 'A': [159680750.0, 44186700.0, 35339850.0, 1187987.5, 0.0, 0.0, 454525.0, 0.0, 0.0, 136900.0, 0.0, 0.0, 0.0], - 'Ader': [230085875.0, 176137187.5, 70288312.5, 6445062.5, 3313187.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 456719.5, 0.0, 214848.5, 164293.5, 0.0, 66877.0, 38083.5, 9761.0], - } - ), - 'conc_vial_mg_L': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'hexadecanoic acid', 'tetradecanoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one', '(e)-hexadec-9-enoic acid'], name='conc_vial_mg_L'), - data={ - 'A': [128.28438972770587, 113.72290046495935, 63.58554431387759, 23.65614297715311, 11.39213892525418, 0.0, 0.0, 0.0], - 'Ader': [34.090377368664285, 22.971264852335217, 27.502344287695024, 0.6415947958834178, 0.0, 0.0, 0.0, 2.5835030215418877], - 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 7.526085990440194, 6.189242283619498, 0.0], - } - ), - 'conc_vial_if_undiluted_mg_L': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', 
'2-methylcyclopent-2-en-1-one'], name='conc_vial_if_undiluted_mg_L'), - data={ - 'A': [3207.1097431926473, 1589.6386078469395, 2843.0725116239837, 591.4035744288278, 0.0, 284.8034731313545, 0.0, 0.0], - 'Ader': [4261.297171083035, 3437.7930359618776, 2871.4081065419023, 80.19934948542723, 322.9378776927359, 0.0, 0.0, 0.0], - 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.526085990440194, 6.189242283619498], - } - ), - 'fraction_of_sample_fr': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_sample_fr'), - data={ - 'A': [0.2290792673709033, 0.11354561484620995, 0.20307660797314164, 0.042243112459201974, 0.0, 0.020343105223668174, 0.0, 0.0], - 'Ader': [0.304378369363074, 0.24555664542584843, 0.2051005790387073, 0.005728524963244802, 0.023066991263766854, 0.0, 0.0, 0.0], - 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0026878878537286406, 0.0022104436727212492], - } - ), - 'fraction_of_feedstock_fr': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_feedstock_fr'), - data={ - 'A': [0.10420514155783843, 0.051641211741918436, 0.0924007887729286, 0.01922128259154552, 0.0, 0.009357828402887361, 0.0, 0.0], - 'Ader': [0.1447012705450368, 0.11663401169579143, 0.09748090808552162, 0.002722862373382652, 0.010956172407969126, 0.0, 0.0, 0.0], - 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00133114530090798, 0.0010940721919949245], - } - ) + 'height': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='height'), + data={ + 'S': [3000.0, 300.0, 30.0], + 'T': [1000.0, 400.0, 10.0], + } +), +'area': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='area'), + data={ + 'S': [3000.0, 300.0, 30.0], + 'T': [1000.0, 100.0, 10.0], + } +), +'area_if_undiluted': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='area_if_undiluted'), + data={ + 'S': [3000.0, 300.0, 30.0], + 'T': [1000.0, 100.0, 10.0], + } +), +'conc_vial_mg_L': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='conc_vial_mg_L'), + data={ + 'S': [6.000000000000002, 6.0, 6.0], + 'T': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + } +), +'conc_vial_if_undiluted_mg_L': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='conc_vial_if_undiluted_mg_L'), + data={ + 'S': [6.000000000000002, 6.0, 6.0], + 'T': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + } +), +'fraction_of_sample_fr': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='fraction_of_sample_fr'), + data={ + 'S': [6.000000000000002, 6.0, 6.0], + 'T': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + } +), +'fraction_of_feedstock_fr': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='fraction_of_feedstock_fr'), + data={ + 'S': [6.000000000000002, 6.0, 6.0], + 'T': [2.0000000000000013, 2.0000000000000004, 2.000000000000001], + } +) } return reports @pytest.fixture def checked_samples_param_reports_std(): reports = { - 'height': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', 
'2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='height'), - data={ - 'A': [6330.019905181974, 22498.01645701238, 10546.497641397356, np.nan, np.nan, np.nan, np.nan, 352.8462838120872, np.nan, np.nan, 731.1484117468901, np.nan, 3011.567781073506], - 'Ader': [65717.79713669654, 8212.338156700564, 15378.865384026221, np.nan, np.nan, np.nan, np.nan, 29.698484809834994, np.nan, 252.43712088359746, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, 5193.699307815192, 7588.669975694028, 3647.2567773602123, 924.8956697920041, np.nan, 427.79960261786124, np.nan, np.nan, 5086.926183856023, np.nan], - } - ), - 'area': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='area'), - data={ - 'A': [10575.489019426004, 156570.4119174501, 60139.43173991587, np.nan, np.nan, np.nan, np.nan, 4427.195557008974, np.nan, np.nan, 4402.446819667445, np.nan, 7744.233467555068], - 'Ader': [163869.16811285765, 8637.309332193678, 60571.47398322085, np.nan, np.nan, np.nan, np.nan, 2902.6733367707775, np.nan, 1827.8710293672254, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, 22121.83564942114, 19659.68983732958, 23656.25736459595, 3317.745017327281, np.nan, 3242.0845917403203, np.nan, np.nan, 13804.138582323782, np.nan], - } - ), - 'area_if_undiluted': pd.DataFrame( - index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexane-2,5-dione', 'oxacycloheptadecan-2-one', 'phenol', 'butan-2-one', 'decanoic acid', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', '4,5-dimethylhex-1-ene'], name='area_if_undiluted'), - data={ - 'A': [264387.2254856501, 3914260.2979362523, 1503485.7934978968, 110679.88892522435, np.nan, np.nan, 110061.17049168612, np.nan, np.nan, 193605.8366888767, np.nan, np.nan, np.nan], - 'Ader': [20483646.014107205, 1079663.6665242098, 7571434.247902606, 362834.1670963472, 228483.87867090316, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, 22121.83564942114, np.nan, 19659.68983732958, 23656.25736459595, np.nan, 3317.745017327281, 3242.0845917403203, 13804.138582323782], - } - ), - 'conc_vial_mg_L': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'hexadecanoic acid', 'tetradecanoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one', '(e)-hexadec-9-enoic acid'], name='conc_vial_mg_L'), - data={ - 'A': [4.106325693068217, 0.14764425894471855, 3.4914351462625968, 0.10555595583489899, 16.110917372532917, np.nan, np.nan, np.nan], - 'Ader': [3.8503522496954825, 1.8415608200696434, 0.1709011262715786, 0.05743341165328154, np.nan, np.nan, np.nan, 0.02054160468737343], - 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, 0.5074982512365029, 0.07715745715389961, np.nan], - } - ), - 'conc_vial_if_undiluted_mg_L': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], 
name='conc_vial_if_undiluted_mg_L'), - data={ - 'A': [102.65814232670576, 87.28587865656482, 3.691106473618185, 2.638898895872451, np.nan, 402.77293431332293, np.nan, np.nan], - 'Ader': [481.29403121193536, 21.362640783947153, 230.19510250870547, 7.179176456660192, 2.5677005859216706, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0.5074982512365029, 0.07715745715389961], - } - ), - 'fraction_of_sample_fr': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_sample_fr'), - data={ - 'A': [0.007332724451907525, 0.006234705618326057, 0.00026365046240129915, 0.00018849277827660252, np.nan, 0.028769495308094487, np.nan, np.nan], - 'Ader': [0.03437814508656681, 0.001525902913139085, 0.01644250732205038, 0.0005127983183328709, 0.00018340718470868995, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0.00018124937544160814, 2.7556234697821363e-05], - } - ), - 'fraction_of_feedstock_fr': pd.DataFrame( - index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_feedstock_fr'), - data={ - 'A': [0.001716554591745799, 0.002033902314020852, 0.001555929426374292, 0.0003844681268991305, np.nan, 0.013233967841723464, np.nan, np.nan], - 'Ader': [0.01848189900635057, 0.0010115438077193133, 0.009260471080609506, 0.00028408598968518184, 7.598984670517598e-05, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0.00010872467812800095, 1.9898609286992866e-06], - } - ) + 'height': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='height'), + data={ + 'S': [1414.213562373095, 141.4213562373095, 14.142135623730951], + 'T': [500.0, 522.0153254455275, 10.0], + } +), +'area': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='area'), + data={ + 'S': [1414.213562373095, 141.4213562373095, 14.142135623730951], + 'T': [500.0, 50.0, 10.0], + } +), +'area_if_undiluted': +pd.DataFrame( + index=pd.Index(['dodecane', 'naphthalene', 'phenol'], name='area_if_undiluted'), + data={ + 'S': [1414.213562373095, 141.4213562373095, 14.142135623730951], + 'T': [500.0, 50.0, 10.0], + } +), +'conc_vial_mg_L': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='conc_vial_mg_L'), + data={ + 'S': [2.8284271247461903, 2.8284271247461903, 2.82842712474619], + 'T': [1.0, 1.9999999999999996, 1.0], + } +), +'conc_vial_if_undiluted_mg_L': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='conc_vial_if_undiluted_mg_L'), + data={ + 'S': [2.8284271247461903, 2.8284271247461903, 2.82842712474619], + 'T': [1.0, 1.9999999999999996, 1.0], + } +), +'fraction_of_sample_fr': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='fraction_of_sample_fr'), + data={ + 'S': [2.8284271247461903, 2.8284271247461903, 2.82842712474619], + 'T': [1.0, 1.9999999999999996, 1.0], + } +), +'fraction_of_feedstock_fr': +pd.DataFrame( + index=pd.Index(['dodecane', 'phenol', 'naphthalene'], name='fraction_of_feedstock_fr'), + data={ + 'S': [2.8284271247461903, 2.8284271247461903, 2.82842712474619], + 'T': [1.0, 1.9999999999999996, 1.0], + } +), } return reports - @pytest.fixture def 
checked_samples_param_aggrreps(): reports = { - 'height': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='height'), - data={ - 'A': [1651930.781078005, 322843.186774585, 0.0, 0.0, 0.0, 1419.36241499941], - 'Ader': [1048483.5942705647, 209497.12232512212, 0.0, 0.0, 0.0, 0.0], - 'B': [27582.83802734355, 5590.929749300839, 161073.19496333148, 57177.7326745298, 12611.491966847307, 0.0], - } - ), - 'area': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='area'), + 'height': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='height'), data={ - 'A': [8069070.542315948, 1565859.8667690419, 0.0, 0.0, 0.0, 4147.449062536849], - 'Ader': [3244281.634364161, 645827.8665250427, 0.0, 0.0, 0.0, 0.0], - 'B': [94251.19374854585, 17123.500993807433, 626471.4374662458, 176029.20455849537, 38826.144293911384, 0.0], + 'S': [3000.1761286913643, 324.5888971525281, 5.421421740516418], + 'T': [1000.058709563788, 408.2056616161856, 1.8071405801721392], } - ), - 'area_if_undiluted': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'ester', 'alcohol'], name='area_if_undiluted'), +), +'area': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='area'), data={ - 'A': [201726763.55789867, 39146496.66922605, 0.0, 0.0, 103686.22656342122, 0.0], - 'Ader': [405535204.2955202, 80728483.31563035, 0.0, 0.0, 0.0, 0.0], - 'B': [94251.19374854585, 17123.500993807433, 626471.4374662458, 176029.20455849537, 0.0, 38826.144293911384], + 'S': [3000.1761286913643, 324.5888971525281, 5.421421740516418], + 'T': [1000.058709563788, 108.1962990508427, 1.8071405801721392], } - ), - 'conc_vial_mg_L': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_mg_L'), +), +'area_if_undiluted': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='area_if_undiluted'), data={ - 'A': [283.134805039116, 57.52085007212886, 0.0, 0.0, 0.0], - 'Ader': [73.2484339653872, 14.545500371460587, 0.0, 0.0, 0.0], - 'B': [2.709346765182483, 0.0, 6.166256363605159, 3.674528613447748, 1.36006954031895], + 'S': [3000.1761286913643, 324.5888971525281, 5.421421740516418], + 'T': [1000.058709563788, 108.1962990508427, 1.8071405801721392], } - ), - 'conc_vial_if_undiluted_mg_L': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), +), +'conc_vial_mg_L': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='conc_vial_mg_L'), data={ - 'A': [7078.3701259779, 1438.0212518032215, 0.0, 0.0, 0.0], - 'Ader': [9156.054245673402, 1818.1875464325735, 0.0, 0.0, 0.0], - 'B': [2.709346765182483, 0.0, 6.166256363605159, 3.674528613447748, 1.36006954031895], + 'S': [10.9160941687439, 6.00035225738273, 1.0842843481032833], + 'T': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], } - ), - 'fraction_of_sample_fr': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_sample_fr'), +), +'conc_vial_if_undiluted_mg_L': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), data={ - 'A': [0.5055978661412784, 0.10271580370023008, 0.0, 0.0, 0.0], - 'Ader': [0.6540038746909574, 0.12987053903089812, 0.0, 0.0, 0.0], - 'B': [0.0009676238447080297, 0.0, 0.002202234415573271, 0.0013123316476599102, 0.00048573912154248214], + 'S': [10.9160941687439, 6.00035225738273, 
1.0842843481032833], + 'T': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], } - ), - 'fraction_of_feedstock_fr': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_feedstock_fr'), +), +'fraction_of_sample_fr': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='fraction_of_sample_fr'), data={ - 'A': [0.23008354467314104, 0.04675452005788948, 0.0, 0.0, 0.0], - 'Ader': [0.3107992314028268, 0.06171657872932608, 0.0, 0.0, 0.0], - 'B': [0.000478931154804311, 0.0, 0.0010906310654745584, 0.0006495463241603489, 0.00024055666913762634], + 'S': [10.9160941687439, 6.00035225738273, 1.0842843481032833], + 'T': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], } - ) +), +'fraction_of_feedstock_fr': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='fraction_of_feedstock_fr'), + data={ + 'S': [10.9160941687439, 6.00035225738273, 1.0842843481032833], + 'T': [3.638698056247968, 2.0001174191275775, 0.36142811603442787], + } +), } return reports @pytest.fixture def checked_samples_param_aggrreps_std(): reports = { - 'height': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='height'), + 'height': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='height'), data={ - 'A': [35797.371056783835, 7508.212703385821, 0.0, 0.0, 0.0, 166.78954924783815], - 'Ader': [75153.30566953593, 14433.563492713163, 0.0, 0.0, 0.0, 0.0], - 'B': [6487.963541864086, 192.35172504043408, 8629.636927224863, 6217.532537943509, 1371.3793462610597, 0.0], + 'S': [1414.2965902344451, 153.01267351627698, 2.5556827175942227], + 'T': [500.029354781894, 521.310649906016, 1.8071405801721392], } - ), - 'area': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='area'), +), +'area': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='area'), data={ - 'A': [201147.21731284214, 41724.311288199315, 0.0, 0.0, 0.0, 1004.2860093008128], - 'Ader': [199552.22488919983, 38249.3835185481, 0.0, 0.0, 0.0, 0.0], - 'B': [21973.821276880837, 1457.7399327444466, 42815.265175930195, 16107.534210999198, 3552.7823298636085, 0.0], + 'S': [1414.2965902344451, 153.01267351627698, 2.5556827175942227], + 'T': [500.029354781894, 41.808382231828745, 1.8071405801721392], } - ), - 'area_if_undiluted': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'ester', 'alcohol'], name='area_if_undiluted'), +), +'area_if_undiluted': +pd.DataFrame( + index=pd.Index(['C-aliph', 'C-arom', 'alcohol'], name='area_if_undiluted'), data={ - 'A': [5028680.432821053, 1043107.7822049828, 0.0, 0.0, 25107.15023252032, 0.0], - 'Ader': [24944028.111149978, 4781172.939818514, 0.0, 0.0, 0.0, 0.0], - 'B': [21973.821276880837, 1457.7399327444466, 42815.26517593019, 16107.534210999198, 0.0, 3552.7823298636085], + 'S': [1414.2965902344451, 153.01267351627698, 2.5556827175942227], + 'T': [500.029354781894, 41.808382231828745, 1.8071405801721392], } - ), - 'conc_vial_mg_L': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_mg_L'), +), +'conc_vial_mg_L': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='conc_vial_mg_L'), data={ - 'A': [18.436674076139045, 5.526836289719974, 0.0, 0.0, 0.0], - 'Ader': [4.984729502757059, 0.9565736502096863, 0.0, 0.0, 0.0], - 'B': [0.03377575111300582, 0.0, 0.41580236064012105, 0.04580807650772246, 0.09171206841758799], 
+ 'S': [5.1458961405264905, 2.8285931804688906, 0.5111365435188446], + 'T': [0.6386044305945372, 1.0000587095637883, 0.3614281160344278], } - ), - 'conc_vial_if_undiluted_mg_L': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), +), +'conc_vial_if_undiluted_mg_L': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), data={ - 'A': [460.9168519034765, 138.1709072429994, 0.0, 0.0, 0.0], - 'Ader': [623.0911878446323, 119.57170627621078, 0.0, 0.0, 0.0], - 'B': [0.03377575111300582, 0.0, 0.41580236064012105, 0.04580807650772246, 0.09171206841758799], + 'S': [5.1458961405264905, 2.8285931804688906, 0.5111365435188446], + 'T': [0.6386044305945372, 1.0000587095637883, 0.3614281160344278], } - ), - 'fraction_of_sample_fr': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_sample_fr'), +), +'fraction_of_sample_fr': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='fraction_of_sample_fr'), data={ - 'A': [0.03292263227881972, 0.009869350517357094, 0.0, 0.0, 0.0], - 'Ader': [0.04450651341747372, 0.00854083616258648, 0.0, 0.0, 0.0], - 'B': [1.2062768254644968e-05, 0.0, 0.0001485008430857575, 1.6360027324186634e-05, 3.275431014913856e-05], + 'S': [5.1458961405264905, 2.8285931804688906, 0.5111365435188446], + 'T': [0.6386044305945372, 1.0000587095637883, 0.3614281160344278], } - ), - 'fraction_of_feedstock_fr': pd.DataFrame( - index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_feedstock_fr'), +), +'fraction_of_feedstock_fr': +pd.DataFrame( + index=pd.Index(['C-arom', 'C-aliph', 'alcohol'], name='fraction_of_feedstock_fr'), data={ - 'A': [0.014510828146666238, 0.004414840265872383, 0.0, 0.0, 0.0], - 'Ader': [0.024426518981430268, 0.004689897339536736, 0.0, 0.0, 0.0], - 'B': [8.710635362591769e-07, 0.0, 8.908006621759261e-05, 1.1813725467879507e-06, 1.9648077791126472e-05], + 'S': [5.1458961405264905, 2.8285931804688906, 0.5111365435188446], + 'T': [0.6386044305945372, 1.0000587095637883, 0.3614281160344278], } - ) +), } return reports + + +# Project class testing +# @pytest.fixture +# def gcms() -> Project: + +# folder_path: plib.Path = plib.Path( +# plib.Path(__file__).parent.parent, "tests/data_for_testing/" +# ) +# Project.set_folder_path(folder_path) +# Project.set_auto_save_to_excel(False) +# return Project() + + +# @pytest.fixture +# def checked_files_info(): +# files_info = pd.DataFrame( +# index=pd.Index(['A_1', 'A_2', 'Ader_1', 'Ader_2', 'B_1', 'B_2'], name='filename'), +# data={ +# 'samplename': ['A', 'A', 'Ader', 'Ader', 'B', 'B'], +# 'derivatized': [False, False, True, True, False, False], +# 'dilution_factor': [25, 25, 125, 125, 1, 1], +# 'total_sample_conc_in_vial_mg_L': [560.0000000000001, 560.0000000000001, 112.0, 112.0, 2800.0, 2800.0], +# 'sample_yield_on_feedstock_basis_fr': [0.45, 0.46, 0.47, 0.48, 0.49, 0.5], +# 'calibration_file': ['calibration', 'calibration', 'deriv_calibration', 'deriv_calibration', 'calibration', 'calibration'], +# } +# ) +# return files_info + +# @pytest.fixture +# def checked_created_files_info(): +# created_files_info = pd.DataFrame( +# index=pd.Index(['A_1', 'A_2', 'Ader_1', 'Ader_2', 'B_1', 'B_2'], name='filename'), +# data={ +# 'samplename': ['A', 'A', 'Ader', 'Ader', 'B', 'B'], +# 'replicate_number': ['1', '2', '1', '2', '1', '2'], +# 'derivatized': [False, False, False, False, False, False], +# 'calibration_file': [False, 
False, False, False, False, False], +# 'dilution_factor': [1, 1, 1, 1, 1, 1], +# 'total_sample_conc_in_vial_mg_L': [1, 1, 1, 1, 1, 1], +# 'sample_yield_on_feedstock_basis_fr': [1, 1, 1, 1, 1, 1], +# } +# ) +# return created_files_info + + +# @pytest.fixture +# def checked_files(): +# files = { +# 'A_1': pd.DataFrame( +# index=pd.Index(['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_1'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'retention_time': [6.025, 36.163, 40.052, 40.492, 43.847, 43.986], +# 'area': [23386, 44389, 15068, 1878180, 1456119, 6379752], +# 'height': [24797, 15019, 5705, 493759, 339605, 1147599], +# 'area_if_undiluted': [584650, 1109725, 376700, 46954500, 36402975, 159493800], +# }), +# 'A_2': pd.DataFrame( +# index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_2'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'retention_time': [6.025, 26.284, 36.158, 40.041, 40.494, 43.847, 43.988], +# 'area': [25493, 10952, 50650, 21294, 1656756, 1371069, 6394708], +# 'height': [25716, 4259, 14520, 6739, 461942, 324690, 1138647], +# 'area_if_undiluted': [637325, 273800, 1266250, 532350, 41418900, 34276725, 159867700], +# }), +# 'Ader_1': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_1'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'retention_time': [6.027, 38.123, 41.729, 42.157, 45.253, 45.369], +# 'area': [16741, 49508, 27798, 1415205, 519476, 1724814], +# 'height': [13451, 18415, 9132, 484890, 180850, 501749], +# 'area_if_undiluted': [2092625, 6188500, 3474750, 176900625, 64934500, 215601750], +# }), +# 'Ader_2': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_2'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'retention_time': [6.027, 38.125, 41.744, 42.161, 45.258, 45.37], +# 'area': [14698, 53613, 25213, 1402990, 605137, 1956560], +# 'height': [12802, 18373, 8775, 496504, 202599, 594688], +# 'area_if_undiluted': [1837250, 6701625, 3151625, 175373750, 75642125, 244570000], +# }), +# 'B_1': pd.DataFrame( +# index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol'], name='B_1'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'retention_time': [8.527, 10.507, 11.071, 11.486, 12.214, 13.687], +# 'area': [147566, 69223, 40376, 441077, 19522, 200947], +# 'height': [39393, 18515, 12132, 112797, 7194, 64421], +# 'area_if_undiluted': [147566, 69223, 40376, 441077, 19522, 200947], +# }), +# 'B_2': pd.DataFrame( +# index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B_2'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'retention_time': [8.502, 10.474, 11.027, 11.456, 13.661], +# 
'area': [181021, 64531, 35791, 472362, 228750], +# 'height': [44551, 19823, 12737, 120142, 75153], +# 'area_if_undiluted': [181021, 64531, 35791, 472362, 228750], +# }) +# } +# return files + +# @pytest.fixture +# def checked_is_files_deriv(): +# is_files_deriv = { +# 'A_1': False, 'A_2': False, 'Ader_1': True, +# 'Ader_2': True, 'B_1': False, 'B_2': False +# } +# return is_files_deriv + +# @pytest.fixture +# def checked_load_class_code_fractions(): +# class_code_fractions = pd.DataFrame( +# index=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79], +# data={ +# 'classes': ['ester', 'ester_1', 'ester_2', 'ester_3', 'ester_4', 'ester_5', 'ester_6', 'carboxyl', 'ketone', 'ketone_1', 'ketone_2', 'ketone_3', 'ketone_4', 'ketone_5', 'ketone_6', 'ketone_7', 'ketone_8', 'ketone_9', 'ketone_10', 'ketone_11', 'ketone_12', 'ketone_13', 'ketone_14', 'ketone_15', 'ketone_16', 'ketone_17', 'ketone_18', 'ketone_19', 'ketone_20', 'ketone_21', 'ketone_22', 'ketone_23', 'ketone_24', 'ketone_25', 'ketone_26', 'ketone_27', 'aldehyde', 'ether', 'ether_1', 'ether_2', 'ether_3', 'ether_4', 'ether_5', 'ether_6', 'ether_7', 'ether_8', 'ether_9', 'ether_10', 'ether_11', 'ether_12', 'ether_13', 'ether_14', 'ether_15', 'ether_16', 'ether_17', 'ether_18', 'ether_19', 'ether_20', 'ether_21', 'ether_22', 'ether_23', 'ether_24', 'ether_25', 'ether_26', 'ether_27', 'alcohol', 'C-aliph', 'C-aliph_1', 'C-aliph_2', 'C-aliph_3', 'C-arom', 'C-arom_1', 'C-arom_2', 'N-aliph', 'N-aliph_1', 'N-aliph_3', 'N-arom', 'N-arom_2', 'O-arom', 'O-aliph'], +# 'codes': ['[CH0](=O)O[CH3]', '[CH0](=O)O[CH2]', '[CH0](=O)O[CH1]', '[CH0](=O)O[C]', '[CH0](=O)O[cH2]', '[CH0](=O)O[cH1]', '[CH0](=O)O[c]', '[CH0](=O)O', '[CH3]C(=O)[CH3]', '[CH3]C(=O)[CH2]', '[CH3]C(=O)[CH]', '[CH3]C(=O)[C]', '[CH3]C(=O)[cH2]', '[CH3]C(=O)[cH]', '[CH3]C(=O)[c]', '[CH2]C(=O)[CH2]', '[CH2]C(=O)[CH]', '[CH2]C(=O)[C]', '[CH2]C(=O)[cH2]', '[CH2]C(=O)[cH]', '[CH2]C(=O)[c]', '[CH]C(=O)[CH]', '[CH]C(=O)[C]', '[CH]C(=O)[cH2]', '[CH]C(=O)[cH]', '[CH]C(=O)[c]', '[C]C(=O)[C]', '[C]C(=O)[cH2]', '[C]C(=O)[cH]', '[C]C(=O)[c]', '[cH2]C(=O)[cH2]', '[cH2]C(=O)[cH]', '[cH2]C(=O)[c]', '[cH]C(=O)[cH]', '[cH]C(=O)[c]', '[c]C(=O)[c]', '[CH]=O', '[CH3]O[CH3]', '[CH3]O[CH2]', '[CH3]O[CH]', '[CH3]O[C]', '[CH3]O[cH2]', '[CH3]O[cH]', '[CH3]O[c]', '[CH2]O[CH2]', '[CH2]O[CH]', '[CH2]O[C]', '[CH2]O[cH2]', '[CH2]O[cH]', '[CH2]O[c]', '[CH]O[CH]', '[CH]O[C]', '[CH]O[cH2]', '[CH]O[cH]', '[CH]O[c]', '[C]O[C]', '[C]O[cH2]', '[C]O[cH]', '[C]O[c]', '[cH2]O[cH2]', '[cH2]O[cH]', '[cH2]O[c]', '[cH]O[cH]', '[cH]O[c]', '[c]O[c]', '[OH1]', '[CH3]', '[CH2]', '[CH1]', '[C]', '[cH2]', '[cH1]', '[c]', '[NH2]', '[NH1]', '[NH0]', '[nH1]', '[n]', '[o]', '[O]'], +# 'mfs': [59.044, 58.035999999999994, 57.028, 56.019999999999996, 58.035999999999994, 57.028, 56.019999999999996, 45.017, 58.080000000000005, 57.072, 56.06400000000001, 55.056000000000004, 57.072, 56.06400000000001, 55.056000000000004, 56.06400000000001, 55.056000000000004, 57.072, 56.06400000000001, 55.056000000000004, 54.048, 54.048, 53.040000000000006, 55.056000000000004, 54.048, 53.040000000000006, 52.032000000000004, 54.048, 53.040000000000006, 52.032000000000004, 56.06400000000001, 55.056000000000004, 54.048, 54.048, 53.040000000000006, 52.032000000000004, 29.017999999999997, 46.069, 45.061, 44.053, 
43.045, 45.061, 44.053, 43.045, 44.053, 43.045, 45.061, 44.053, 43.045, 42.037, 42.037, 41.029, 43.045, 42.037, 41.029, 40.021, 42.037, 41.029, 40.021, 44.053, 43.045, 42.037, 42.037, 41.029, 40.021, 17.007, 15.035, 14.027, 13.018999999999998, 12.011, 14.027, 13.018999999999998, 12.011, 16.023, 15.015, 14.007, 15.015, 14.007, 15.999, 15.999], +# } +# ) +# return class_code_fractions + +# @pytest.fixture +# def checked_load_calibrations(): +# calibrations = { +# 'calibration': pd.DataFrame( +# index=pd.Index(['phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'tetradecanoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], name='comp_name'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'MW': [94.11, 96.1271, 197.4, 228.3709, 256.4241, 280.4455, 282.4614], +# 'PPM 1': [5.0, 10.0, 5.0, 10.0, 10.0, np.nan, 10.0], +# 'PPM 2': [10, 20, 10, 20, 20, 20, 20], +# 'PPM 3': [20, 30, 20, 35, 35, 35, 35], +# 'PPM 4': [50.0, 50.0, 50.0, 50.0, 50.0, np.nan, 50.0], +# 'PPM 5': [np.nan, np.nan, np.nan, 100.0, 100.0, 100.0, 100.0], +# 'PPM 6': [np.nan, np.nan, np.nan, 300.0, 300.0, 300.0, 300.0], +# 'Area 1': [135884.0, 175083.0, 155710.0, 70675.0, 51545.0, np.nan, 31509.0], +# 'Area 2': [304546, 759316, 343277, 203215, 130834, 22338, 133847], +# 'Area 3': [678618, 1070146, 805095, 500430, 361070, 63841, 551470], +# 'Area 4': [1866918.0, 1928385.0, 2302730.0, 469543.0, 430809.0, np.nan, 494928.0], +# 'Area 5': [np.nan, np.nan, np.nan, 2957268.0, 3164919.0, 741540.0, 5345977.0], +# 'Area 6': [np.nan, np.nan, np.nan, 11730886.0, 12451729.0, 3975200.0, 19779576.0], +# } +# ), +# 'deriv_calibration': pd.DataFrame( +# index=pd.Index(['benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '9-octadecenoic acid, (e)-', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], name='comp_name'), +# data={ +# 'iupac_name': ['n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.', 'n.a.'], +# 'MW': [122.1213, 256.4241, 280.4455, 282.4614, 94.1112, 116.1152, 110.1106], +# 'PPM 1': [np.nan, 5.0, 5.0, 5.0, np.nan, 5.0, 5.0], +# 'PPM 2': [np.nan, 10.0, 10.0, 10.0, np.nan, 10.0, 10.0], +# 'PPM 3': [np.nan, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0], +# 'PPM 4': [np.nan, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0], +# 'PPM 5': [30, 30, 30, 30, 25, 25, 25], +# 'PPM 6': [50, 50, 50, 50, 30, 30, 30], +# 'Area 1': [np.nan, 403058.0, 126644.0, 467088.0, np.nan, 48330.0, 184752.0], +# 'Area 2': [np.nan, 570479.0, 183307.0, 741971.0, np.nan, 206224.0, 729379.0], +# 'Area 3': [np.nan, 694901.0, 241591.0, 953554.0, 17168.0, 620353.0, 1607583.0], +# 'Area 4': [np.nan, 936570.0, 350170.0, 1408563.0, 21329.0, 885337.0, 2232039.0], +# 'Area 5': [73458, 1474014, 475205, 2476003, 21557, 1096645, 2972508], +# 'Area 6': [113812, 2605959, 824267, 4300414, 71706, 1394486, 3629582], +# 'CAS': ['65-85-0', '57-10-3', '60-33-3', '112-79-8', '108-95-2', '123-76-2', '120-80-9'], +# } +# ) +# } +# return calibrations + +# @pytest.fixture +# def checked_is_calibrations_deriv(): +# is_calibrations_deriv = {'calibration': False, 'deriv_calibration': True} +# return is_calibrations_deriv + +# @pytest.fixture +# def checked_list_of_all_compounds(): +# list_of_all_compounds = ['tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', +# '9,12-octadecadienoic acid (z,z)-', 'oleic acid', +# 'n-decanoic acid', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', +# 'trans-2-pentenoic acid', '2,5-hexanedione', +# '1-hexene, 4,5-dimethyl-', 'phenol', +# 
'2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'hexadecanoic acid', +# '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid' +# ] +# return list_of_all_compounds + +# @pytest.fixture +# def checked_list_of_all_deriv_compounds(): +# list_of_all_deriv_compounds = ['myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', +# 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', +# '9-octadecenoic acid, (z)-, tms derivative', 'benzoic acid, deriv.', +# 'hexadecanoic acid, deriv.', '(9z,12z)-octadeca-9,12-dienoic acid, deriv.', +# '9-octadecenoic acid, (e)-, deriv.', 'phenol, deriv.', +# '4-oxopentanoic acid, deriv.', 'benzene-1,2-diol, deriv.' +# ] +# return list_of_all_deriv_compounds + +# @pytest.fixture +# def checked_compounds_properties(): +# compounds_properties = pd.DataFrame( +# index=pd.Index(['tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid', 'n-decanoic acid', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], name='comp_name'), +# data={ +# 'iupac_name': ['tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'decanoic acid', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', '4,5-dimethylhex-1-ene', 'phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'molecular_formula': ['C14H28O2', 'C16H30O2', 'C16H32O2', 'C18H32O2', 'C18H34O2', 'C10H20O2', 'C4H8O', 'C6H8O', 'C5H8O2', 'C6H10O2', 'C8H16', 'C6H6O', 'C6H8O', 'C6H3Cl3O', 'C16H32O2', 'C18H32O2', 'C18H34O2'], +# 'canonical_smiles': ['CCCCCCCCCCCCCC(=O)O', 'C1CCCCCCCC(=O)OCCCCCCC1', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O', 'CCCCCCCCCC(=O)O', 'CCC(=O)C', 'CC1=CCCC1=O', 'CCC=CC(=O)O', 'CC(=O)CCC(=O)C', 'CC(C)C(C)CC=C', 'C1=CC=C(C=C1)O', 'CC1=CCCC1=O', 'C1=C(C(=CC(=C1Cl)Cl)Cl)O', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O'], +# 'molecular_weight': [228.37, 254.41, 256.42, 280.4, 282.5, 172.26, 72.11, 96.13, 100.12, 114.14, 112.21, 94.11, 96.13, 197.4, 256.42, 280.4, 282.5], +# 'xlogp': [5.3, 6.3, 6.4, 6.8, 6.5, 4.1, 0.3, 0.9, 1.0, -0.3, 3.5, 1.5, 0.9, 3.7, 6.4, 6.8, 6.5], +# 'el_C': [14, 16, 16, 18, 18, 10, 4, 6, 5, 6, 8, 6, 6, 6, 16, 18, 18], +# 'el_Cl': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0], +# 'el_H': [28, 30, 32, 32, 34, 20, 8, 8, 8, 10, 16, 6, 8, 3, 32, 32, 34], +# 'el_O': [2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 0, 1, 1, 1, 2, 2, 2], +# 'el_mf_C': [0.7363226343214958, 0.7553791124562713, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566, 0.6972599558806455, 0.6662598807377618, 0.7496723187350464, 0.5998302037554933, 0.6313825127036973, 0.8563229658675697, 0.765763468281798, 0.7496723187350464, 0.3650759878419453, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566], +# 'el_mf_Cl': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5387537993920973, 0.0, 0.0, 0.0], +# 'el_mf_H': [0.12358891272934273, 0.11886325223065132, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203, 0.11703239289446186, 0.11182914990985994, 
0.08388640382814938, 0.08054334798242109, 0.08831259856316805, 0.1437305053025577, 0.06426522154925088, 0.08388640382814938, 0.015319148936170212, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203], +# 'el_mf_O': [0.1401147261023777, 0.12577335796548877, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815, 0.18575409265064438, 0.22186936624601306, 0.16643087485696453, 0.31959648421893727, 0.28033993341510427, 0.0, 0.17000318775900541, 0.16643087485696453, 0.08104863221884498, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815], +# 'fg_C-aliph': [13, 14, 15, 17, 17, 9, 1, 3, 4, 0, 8, 0, 3, 0, 15, 17, 17], +# 'fg_C-arom': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 6, 0, 0, 0], +# 'fg_Cl': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0], +# 'fg_alcohol': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0], +# 'fg_carboxyl': [1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1], +# 'fg_ester': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], +# 'fg_hetero_atoms': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0], +# 'fg_ketone': [0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0, 0], +# 'fg_mf_C-aliph': [0.8029031834303979, 0.771895758814512, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363, 0.7387147335423198, 0.20850090140063793, 0.4377509622386352, 0.5503395924890131, 0.0, 1.0000534711701274, 0.0, 0.4377509622386352, 0.0, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363], +# 'fg_mf_C-arom': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8193178195728402, 0.0, 0.3752887537993921, 0.0, 0.0, 0.0], +# 'fg_mf_Cl': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5387537993920973, 0.0, 0.0, 0.0], +# 'fg_mf_alcohol': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1807140580172139, 0.0, 0.0861550151975684, 0.0, 0.0, 0.0], +# 'fg_mf_carboxyl': [0.19712308972281825, 0.0, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055, 0.2613317078834321, 0.0, 0.0, 0.4496304434678386, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055], +# 'fg_mf_ester': [0.0, 0.22811996383789943, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], +# 'fg_mf_hetero_atoms': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5387537993920973, 0.0, 0.0, 0.0], +# 'fg_mf_ketone': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7914574954929968, 0.5936960366170811, 0.0, 1.0000350446819695, 0.0, 0.0, 0.5936960366170811, 0.0, 0.0, 0.0, 0.0], +# 'fg_mf_total': [0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168, 0.9998867256637168], +# } +# ) +# return compounds_properties + +# @pytest.fixture +# def checked_deriv_compounds_properties(): +# deriv_compounds_properties = pd.DataFrame( +# index=pd.Index(['myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative', 'benzoic acid, deriv.', 'hexadecanoic acid, deriv.', '(9z,12z)-octadeca-9,12-dienoic acid, deriv.', '9-octadecenoic acid, (e)-, deriv.', 'phenol, deriv.', '4-oxopentanoic acid, deriv.', 'benzene-1,2-diol, deriv.'], name='comp_name'), +# data={ +# 'iupac_name': ['tetradecanoic acid', '(e)-hexadec-9-enoic acid', 
'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(e)-octadec-9-enoic acid', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], +# 'molecular_formula': ['C14H28O2', 'C16H30O2', 'C16H32O2', 'C18H32O2', 'C18H34O2', 'C7H6O2', 'C16H32O2', 'C18H32O2', 'C18H34O2', 'C6H6O', 'C5H8O3', 'C6H6O2'], +# 'canonical_smiles': ['CCCCCCCCCCCCCC(=O)O', 'CCCCCCC=CCCCCCCCC(=O)O', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O', 'C1=CC=C(C=C1)C(=O)O', 'CCCCCCCCCCCCCCCC(=O)O', 'CCCCCC=CCC=CCCCCCCCC(=O)O', 'CCCCCCCCC=CCCCCCCCC(=O)O', 'C1=CC=C(C=C1)O', 'CC(=O)CCC(=O)O', 'C1=CC=C(C(=C1)O)O'], +# 'molecular_weight': [228.37, 254.41, 256.42, 280.4, 282.5, 122.12, 256.42, 280.4, 282.5, 94.11, 116.11, 110.11], +# 'xlogp': [5.3, 6.4, 6.4, 6.8, 6.5, 1.9, 6.4, 6.8, 6.5, 1.5, -0.5, 0.9], +# 'underiv_comp_name': ['myristic acid', 'palmitelaidic acid', 'palmitic acid', '9,12-octadecadienoic acid (z,z)-', '9-octadecenoic acid, (z)-', 'benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '9-octadecenoic acid, (e)-', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], +# 'el_C': [14, 16, 16, 18, 18, 7, 16, 18, 18, 6, 5, 6], +# 'el_H': [28, 30, 32, 32, 34, 6, 32, 32, 34, 6, 8, 6], +# 'el_O': [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2], +# 'el_mf_C': [0.7363226343214958, 0.7553791124562713, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566, 0.6884785456927611, 0.7494579205990172, 0.7710342368045648, 0.7653026548672566, 0.765763468281798, 0.5172250452157436, 0.6544909635818728], +# 'el_mf_H': [0.12358891272934273, 0.11886325223065132, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203, 0.04952505732066819, 0.12579361984244597, 0.11503566333808846, 0.12131681415929203, 0.06426522154925088, 0.06945138230987856, 0.054926891290527656], +# 'el_mf_O': [0.1401147261023777, 0.12577335796548877, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815, 0.26202096298722566, 0.12478745807659308, 0.11411554921540658, 0.11326725663716815, 0.17000318775900541, 0.413375247610025, 0.29060030878212695], +# 'fg_C-aliph': [13, 15, 15, 17, 17, 0, 15, 17, 17, 0, 1, 0], +# 'fg_C-arom': [0, 0, 0, 0, 0, 6, 0, 0, 0, 6, 0, 6], +# 'fg_alcohol': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2], +# 'fg_carboxyl': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], +# 'fg_ketone': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], +# 'fg_mf_C-aliph': [0.8029031834303979, 0.8230690617507173, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363, 0.0, 0.8244793697839481, 0.8396398002853066, 0.8405345132743363, 0.0, 0.12080785462061837, 0.0], +# 'fg_mf_C-arom': [0.0, 0.0, 0.0, 0.0, 0.0, 0.6313953488372093, 0.0, 0.0, 0.0, 0.8193178195728402, 0.0, 0.6911088911088911], +# 'fg_mf_alcohol': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1807140580172139, 0.0, 0.3089092725456362], +# 'fg_mf_carboxyl': [0.19712308972281825, 0.17694666090169414, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055, 0.3686292171634458, 0.1755596287341081, 0.16054564907275323, 0.15935221238938055, 0.0, 0.3877099302385669, 0.0], +# 'fg_mf_ketone': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.491533890276462, 0.0], +# 'fg_mf_total': [1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273, 1.0000181636545273], +# } +# ) +# return deriv_compounds_properties + 
+# @pytest.fixture +# def checked_calibrations_added_iupac_only_iupac_and_mw(): +# calibrations = { +# 'calibration': pd.DataFrame( +# index=pd.Index(['phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'tetradecanoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], name='comp_name'), +# data={ +# 'iupac_name': ['phenol', '2-methylcyclopent-2-en-1-one', '2,4,5-trichlorophenol', 'tetradecanoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'MW': [94.11, 96.1271, 197.4, 228.3709, 256.4241, 280.4455, 282.4614], +# }), +# 'deriv_calibration': pd.DataFrame( +# index=pd.Index(['benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '9-octadecenoic acid, (e)-', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], name='comp_name'), +# data={ +# 'iupac_name': ['benzoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(e)-octadec-9-enoic acid', 'phenol', '4-oxopentanoic acid', 'benzene-1,2-diol'], +# 'MW': [122.1213, 256.4241, 280.4455, 282.4614, 94.1112, 116.1152, 110.1106], +# }) +# } +# return calibrations + +# @pytest.fixture +# def checked_files_added_iupac_only_iupac_and_time(): +# files = { +# 'A_1': pd.DataFrame( +# index=pd.Index(['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_1'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.025, 36.163, 40.052, 40.492, 43.847, 43.986], +# }), +# 'A_2': pd.DataFrame( +# index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_2'), +# data={ +# 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.025, 26.284, 36.158, 40.041, 40.494, 43.847, 43.988], +# }), +# 'Ader_1': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_1'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.027, 38.123, 41.729, 42.157, 45.253, 45.369], +# }), +# 'Ader_2': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_2'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.027, 38.125, 41.744, 42.161, 45.258, 45.37], +# }), +# 'B_1': pd.DataFrame( +# index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol'], name='B_1'), +# data={ +# 'iupac_name': ['butan-2-one', 
'2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', '4,5-dimethylhex-1-ene', 'phenol'], +# 'retention_time': [8.527, 10.507, 11.071, 11.486, 12.214, 13.687], +# }), +# 'B_2': pd.DataFrame( +# index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B_2'), +# data={ +# 'iupac_name': ['butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], +# 'retention_time': [8.502, 10.474, 11.027, 11.456, 13.661], +# }) +# } +# return files + +# @pytest.fixture +# def checked_files_applied_calibration(): +# files = { +# 'A_1': pd.DataFrame( +# index=pd.Index(['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_1'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.025, 36.163, 40.052, 40.492, 43.847, 43.986], +# 'area': [23386, 44389, 15068, 1878180, 1456119, 6379752], +# 'height': [24797, 15019, 5705, 493759, 339605, 1147599], +# 'area_if_undiluted': [584650, 1109725, 376700, 46954500, 36402975, 159493800], +# 'conc_vial_mg_L': [np.nan, 23.581503644987627, np.nan, 66.05436178187291, 131.18800047103497, 113.61850020825628], +# 'conc_vial_if_undiluted_mg_L': [np.nan, 589.5375911246907, np.nan, 1651.3590445468228, 3279.7000117758744, 2840.462505206407], +# 'fraction_of_sample_fr': [np.nan, 0.042109827937477896, np.nan, 0.11795421746763018, 0.23426428655541953, 0.20289017894331474], +# 'fraction_of_feedstock_fr': [np.nan, 0.018949422571865052, np.nan, 0.053079397860433586, 0.10541892894993879, 0.09130058052449164], +# 'compound_used_for_calibration': ['n.a.', 'self', 'n.a.', 'self', 'self', 'self'], +# }), +# 'A_2': pd.DataFrame( +# index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A_2'), +# data={ +# 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.025, 26.284, 36.158, 40.041, 40.494, 43.847, 43.988], +# 'area': [25493, 10952, 50650, 21294, 1656756, 1371069, 6394708], +# 'height': [25716, 4259, 14520, 6739, 461942, 324690, 1138647], +# 'area_if_undiluted': [637325, 273800, 1266250, 532350, 41418900, 34276725, 159867700], +# 'conc_vial_mg_L': [np.nan, 22.78427785050836, 23.730782309318595, np.nan, 61.11672684588226, 125.38077898437679, 113.82730072166243], +# 'conc_vial_if_undiluted_mg_L': [np.nan, 569.606946262709, 593.2695577329649, np.nan, 1527.9181711470565, 3134.51947460942, 2845.682518041561], +# 'fraction_of_sample_fr': [np.nan, 0.04068621044733635, 0.04237639698092605, np.nan, 0.10913701222478973, 0.2238942481863871, 0.20326303700296858], +# 'fraction_of_feedstock_fr': [np.nan, 0.018715656805774722, 0.019493142611225985, np.nan, 0.05020302562340328, 0.10299135416573807, 0.09350099702136555], +# 'compound_used_for_calibration': ['n.a.', 'tetradecanoic acid (sim=1.0; dwt=56)', 'self', 'n.a.', 'self', 'self', 'self'], +# }), +# 'Ader_1': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms 
derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_1'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.027, 38.123, 41.729, 42.157, 45.253, 45.369], +# 'area': [16741, 49508, 27798, 1415205, 519476, 1724814], +# 'height': [13451, 18415, 9132, 484890, 180850, 501749], +# 'area_if_undiluted': [2092625, 6188500, 3474750, 176900625, 64934500, 215601750], +# 'conc_vial_mg_L': [np.nan, 0.600983241036704, 2.5980281295127825, 27.623189632994073, 31.36776718294773, 21.669084708496513], +# 'conc_vial_if_undiluted_mg_L': [np.nan, 75.12290512958799, 324.7535161890978, 3452.898704124259, 3920.970897868466, 2708.635588562064], +# 'fraction_of_sample_fr': [np.nan, 0.005365921794970571, 0.023196679727792702, 0.24663562172316136, 0.2800693498477476, 0.193473970611576], +# 'fraction_of_feedstock_fr': [np.nan, 0.0025219832436361683, 0.01090243947206257, 0.11591874220988584, 0.13163259442844139, 0.09093276618744071], +# 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', '(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], +# }), +# 'Ader_2': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader_2'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.027, 38.125, 41.744, 42.161, 45.258, 45.37], +# 'area': [14698, 53613, 25213, 1402990, 605137, 1956560], +# 'height': [12802, 18373, 8775, 496504, 202599, 594688], +# 'area_if_undiluted': [1837250, 6701625, 3151625, 175373750, 75642125, 244570000], +# 'conc_vial_mg_L': [np.nan, 0.6822063507301317, 2.5689779135709925, 27.38149894239597, 36.81298755438084, 24.27344499617392], +# 'conc_vial_if_undiluted_mg_L': [np.nan, 85.27579384126646, 321.12223919637404, 3422.6873677994963, 4601.623444297605, 3034.1806245217404], +# 'fraction_of_sample_fr': [np.nan, 0.006091128131519033, 0.022937302799741006, 0.24447766912853547, 0.3286873888784004, 0.21672718746583858], +# 'fraction_of_feedstock_fr': [np.nan, 0.0029237415031291357, 0.011009905343875682, 0.11734928118169702, 0.1577699466616322, 0.10402904998360252], +# 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', '(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], +# }), +# 'B_1': pd.DataFrame( +# index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', '1-hexene, 4,5-dimethyl-', 'phenol'], name='B_1'), +# data={ +# 'iupac_name': ['butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', '4,5-dimethylhex-1-ene', 'phenol'], +# 'retention_time': [8.527, 10.507, 11.071, 11.486, 12.214, 13.687], +# 'area': [147566, 69223, 40376, 441077, 19522, 200947], +# 'height': [39393, 18515, 12132, 112797, 7194, 64421], +# 'area_if_undiluted': [147566, 69223, 40376, 441077, 19522, 200947], +# 'conc_vial_mg_L': [np.nan, 6.243800844792131, np.nan, np.nan, np.nan, 7.167230535550548], +# 'conc_vial_if_undiluted_mg_L': [np.nan, 6.243800844792131, np.nan, 
np.nan, np.nan, 7.167230535550548], +# 'fraction_of_sample_fr': [np.nan, 0.0022299288731400468, np.nan, np.nan, np.nan, 0.0025597251912680527], +# 'fraction_of_feedstock_fr': [np.nan, 0.001092665147838623, np.nan, np.nan, np.nan, 0.0012542653437213457], +# 'compound_used_for_calibration': ['n.a.', 'self', 'n.a.', 'n.a.', 'n.a.', 'self'], +# }), +# 'B_2': pd.DataFrame( +# index=pd.Index(['2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B_2'), +# data={ +# 'iupac_name': ['butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], +# 'retention_time': [8.502, 10.474, 11.027, 11.456, 13.661], +# 'area': [181021, 64531, 35791, 472362, 228750], +# 'height': [44551, 19823, 12737, 120142, 75153], +# 'area_if_undiluted': [181021, 64531, 35791, 472362, 228750], +# 'conc_vial_mg_L': [np.nan, 6.134683722446865, np.nan, np.nan, 7.884941445329839], +# 'conc_vial_if_undiluted_mg_L': [np.nan, 6.134683722446865, np.nan, np.nan, 7.884941445329839], +# 'fraction_of_sample_fr': [np.nan, 0.0021909584723024517, np.nan, np.nan, 0.002816050516189228], +# 'fraction_of_feedstock_fr': [np.nan, 0.0010954792361512259, np.nan, np.nan, 0.001408025258094614], +# 'compound_used_for_calibration': ['n.a.', 'self', 'n.a.', 'n.a.', 'self'], +# }) +# } +# return files + +# @pytest.fixture +# def checked_files_info_added_stats(): +# files_info = pd.DataFrame( +# index=pd.Index(['A_1', 'A_2', 'Ader_1', 'Ader_2', 'B_1', 'B_2'], name='filename'), +# data={ +# 'samplename': ['A', 'A', 'Ader', 'Ader', 'B', 'B'], +# 'derivatized': [False, False, True, True, False, False], +# 'dilution_factor': [25, 25, 125, 125, 1, 1], +# 'total_sample_conc_in_vial_mg_L': [560.0000000000001, 560.0000000000001, 112.0, 112.0, 2800.0, 2800.0], +# 'sample_yield_on_feedstock_basis_fr': [0.45, 0.46, 0.47, 0.48, 0.49, 0.5], +# 'calibration_file': ['calibration', 'calibration', 'deriv_calibration', 'deriv_calibration', 'calibration', 'calibration'], +# 'max_height': [1147599.0, 1138647.0, 501749.0, 594688.0, 112797.0, 120142.0], +# 'total_height': [2026484.0, 1976513.0, 1208487.0, 1333741.0, 254452.0, 272406.0], +# 'max_area': [6379752.0, 6394708.0, 1724814.0, 1956560.0, 441077.0, 472362.0], +# 'total_area': [9796894.0, 9530922.0, 3753542.0, 4058211.0, 918711.0, 982455.0], +# 'max_area_if_undiluted': [159493800.0, 159867700.0, 215601750.0, 244570000.0, 441077.0, 472362.0], +# 'total_area_if_undiluted': [244922350.0, 238273050.0, 469192750.0, 507276375.0, 918711.0, 982455.0], +# 'max_conc_vial_mg_L': [131.18800047103497, 125.38077898437679, 31.36776718294773, 36.81298755438084, 7.167230535550548, 7.884941445329839], +# 'total_conc_vial_mg_L': [334.4423661061518, 346.8398667117484, 83.8590528949878, 91.71911575725186, 13.411031380342678, 14.019625167776704], +# 'max_conc_vial_if_undiluted_mg_L': [3279.7000117758744, 3134.51947460942, 3920.970897868466, 4601.623444297605, 7.167230535550548, 7.884941445329839], +# 'total_conc_vial_if_undiluted_mg_L': [8361.059152653796, 8670.996667793712, 10482.381611873476, 11464.889469656482, 13.411031380342678, 14.019625167776704], +# 'max_fraction_of_sample_fr': [0.23426428655541953, 0.2238942481863871, 0.2800693498477476, 0.3286873888784004, 0.0025597251912680527, 0.002816050516189228], +# 'total_fraction_of_sample_fr': [0.5972185109038424, 0.6193569048424078, 0.7487415437052483, 0.8189206764040344, 0.0047896540644080995, 0.005007008988491679], +# 'max_fraction_of_feedstock_fr': [0.10541892894993879, 0.10299135416573807, 
0.13163259442844139, 0.1577699466616322, 0.0012542653437213457, 0.001408025258094614], +# 'total_fraction_of_feedstock_fr': [0.26874832990672903, 0.28490417622750763, 0.3519085255414667, 0.39308192467393654, 0.0023469304915599686, 0.0025035044942458397], +# 'compound_with_max_area': ['oleic acid', 'oleic acid', '9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative', '2,5-hexanedione', '2,5-hexanedione'], +# 'compound_with_max_conc': ['9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', 'phenol', 'phenol'], +# } +# ) +# return files_info + + +# @pytest.fixture +# def checked_samples_info(): +# samples_info = pd.DataFrame( +# index=pd.Index(['A', 'Ader', 'B'], name='samplename'), +# data={ +# 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], +# 'derivatized': [(False, False), (True, True), (False, False)], +# 'dilution_factor': [(25, 25), (125, 125), (1, 1)], +# 'total_sample_conc_in_vial_mg_L': [(560.0000000000001, 560.0000000000001), (112.0, 112.0), (2800.0, 2800.0)], +# 'sample_yield_on_feedstock_basis_fr': [(0.45, 0.46), (0.47, 0.48), (0.49, 0.5)], +# 'calibration_file': [('calibration', 'calibration'), ('deriv_calibration', 'deriv_calibration'), ('calibration', 'calibration')], +# 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], +# 'compound_with_max_conc': [('9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-'), ('9,12-octadecadienoic acid (z,z)-, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative'), ('phenol', 'phenol')], +# 'max_height': [1143123.0, 548218.5, 116469.5], +# 'max_area': [6387230.0, 1840687.0, 456719.5], +# 'max_area_if_undiluted': [159680750.0, 230085875.0, 456719.5], +# 'max_conc_vial_mg_L': [128.28438972770587, 34.090377368664285, 7.526085990440194], +# 'max_conc_vial_if_undiluted_mg_L': [3207.1097431926473, 4261.297171083035, 7.526085990440194], +# 'max_fraction_of_sample_fr': [0.2290792673709033, 0.304378369363074, 0.0026878878537286406], +# 'max_fraction_of_feedstock_fr': [0.10420514155783843, 0.1447012705450368, 0.00133114530090798], +# 'total_height': [2001498.5, 1271114.0, 263429.0], +# 'total_area': [9663908.0, 3905876.5, 950583.0], +# 'total_area_if_undiluted': [241597700.0, 488234562.5, 950583.0], +# 'total_conc_vial_mg_L': [340.6411164089501, 87.78908432611982, 13.71532827405969], +# 'total_conc_vial_if_undiluted_mg_L': [8516.027910223755, 10973.635540764979, 13.71532827405969], +# 'total_fraction_of_sample_fr': [0.608287707873125, 0.7838311100546413, 0.0048983315264498895], +# 'total_fraction_of_feedstock_fr': [0.27682625306711833, 0.3724952251077016, 0.002425217492902904], +# } +# ) +# return samples_info + +# @pytest.fixture +# def checked_samples_info_std(): +# samples_info_std = pd.DataFrame( +# index=pd.Index(['A', 'Ader', 'B'], name='samplename'), +# data={ +# 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], +# 'derivatized': [(False, False), (True, True), (False, False)], +# 'dilution_factor': [(25, 25), (125, 125), (1, 1)], +# 'total_sample_conc_in_vial_mg_L': [(560.0000000000001, 560.0000000000001), (112.0, 112.0), (2800.0, 2800.0)], +# 'sample_yield_on_feedstock_basis_fr': [(0.45, 0.46), (0.47, 0.48), (0.49, 0.5)], +# 'calibration_file': [('calibration', 'calibration'), 
('deriv_calibration', 'deriv_calibration'), ('calibration', 'calibration')], +# 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], +# 'compound_with_max_conc': [('9,12-octadecadienoic acid (z,z)-', '9,12-octadecadienoic acid (z,z)-'), ('9,12-octadecadienoic acid (z,z)-, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative'), ('phenol', 'phenol')], +# 'max_height': [6330.019905181974, 65717.79713669654, 5193.699307815192], +# 'max_area': [10575.489019426004, 163869.16811285765, 22121.83564942114], +# 'max_area_if_undiluted': [264387.2254856501, 20483646.014107205, 22121.83564942114], +# 'max_conc_vial_mg_L': [4.106325693068217, 3.8503522496954825, 0.5074982512365029], +# 'max_conc_vial_if_undiluted_mg_L': [102.65814232670576, 481.29403121193536, 0.5074982512365029], +# 'max_fraction_of_sample_fr': [0.007332724451907525, 0.03437814508656681, 0.00018124937544160814], +# 'max_fraction_of_feedstock_fr': [0.001716554591745799, 0.01848189900635057, 0.00010872467812800095], +# 'total_height': [35334.83296267297, 88567.95277073982, 12695.395149423275], +# 'total_area': [188070.60480574841, 215433.51591732426, 45073.814659955286], +# 'total_area_if_undiluted': [4701765.120143711, 26929189.48966553, 45073.814659955286], +# 'total_conc_vial_mg_L': [8.766356747981709, 5.557903750459465, 0.4303407940826049], +# 'total_conc_vial_if_undiluted_mg_L': [219.1589186995424, 694.7379688074318, 0.4303407940826049], +# 'total_fraction_of_sample_fr': [0.015654208478538812, 0.049624140629102274, 0.00015369314074378663], +# 'total_fraction_of_feedstock_fr': [0.011423908489230281, 0.029113989731069757, 0.00011071453905670015], +# } +# ) +# return samples_info_std + +# @pytest.fixture +# def checked_samples_info_no_calibrations(): +# samples_info = pd.DataFrame( +# index=pd.Index(['A', 'Ader', 'B'], name='samplename'), +# data={ +# 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], +# 'replicate_number': [('1', '2'), ('1', '2'), ('1', '2')], +# 'derivatized': [(False, False), (False, False), (False, False)], +# 'calibration_file': [(False, False), (False, False), (False, False)], +# 'dilution_factor': [(1, 1), (1, 1), (1, 1)], +# 'total_sample_conc_in_vial_mg_L': [(1, 1), (1, 1), (1, 1)], +# 'sample_yield_on_feedstock_basis_fr': [(1, 1), (1, 1), (1, 1)], +# 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], +# 'max_height': [1143123.0, 548218.5, 116469.5], +# 'max_area': [6387230.0, 1840687.0, 456719.5], +# 'max_area_if_undiluted': [6387230.0, 1840687.0, 456719.5], +# 'total_height': [2001498.5, 1271114.0, 263429.0], +# 'total_area': [9663908.0, 3905876.5, 950583.0], +# 'total_area_if_undiluted': [9663908.0, 3905876.5, 950583.0], +# } +# ) +# return samples_info + +# @pytest.fixture +# def checked_samples_info_no_calibrations_std(): +# samples_info_std = pd.DataFrame( +# index=pd.Index(['A', 'Ader', 'B'], name='samplename'), +# data={ +# 'filename': [('A_1', 'A_2'), ('Ader_1', 'Ader_2'), ('B_1', 'B_2')], +# 'replicate_number': [('1', '2'), ('1', '2'), ('1', '2')], +# 'derivatized': [(False, False), (False, False), (False, False)], +# 'calibration_file': [(False, False), (False, False), (False, False)], +# 'dilution_factor': [(1, 1), (1, 1), (1, 1)], +# 'total_sample_conc_in_vial_mg_L': [(1, 1), (1, 1), (1, 1)], +# 
'sample_yield_on_feedstock_basis_fr': [(1, 1), (1, 1), (1, 1)], +# 'compound_with_max_area': [('oleic acid', 'oleic acid'), ('9-octadecenoic acid, (z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'), ('2,5-hexanedione', '2,5-hexanedione')], +# 'max_height': [6330.019905181974, 65717.79713669654, 5193.699307815192], +# 'max_area': [10575.489019426004, 163869.16811285765, 22121.83564942114], +# 'max_area_if_undiluted': [10575.489019426004, 163869.16811285765, 22121.83564942114], +# 'total_height': [35334.83296267297, 88567.95277073982, 12695.395149423275], +# 'total_area': [188070.60480574841, 215433.51591732426, 45073.814659955286], +# 'total_area_if_undiluted': [188070.60480574841, 215433.51591732426, 45073.814659955286], +# } +# ) +# return samples_info_std + + +# @pytest.fixture +# def checked_samples(): +# samples = { +# 'A': pd.DataFrame( +# index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A'), +# data={ +# 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.025, 26.284, 36.1605, 40.046499999999995, 40.492999999999995, 43.847, 43.986999999999995], +# 'area': [24439.5, 10952.0, 47519.5, 18181.0, 1767468.0, 1413594.0, 6387230.0], +# 'height': [25256.5, 4259.0, 14769.5, 6222.0, 477850.5, 332147.5, 1143123.0], +# 'area_if_undiluted': [610987.5, 273800.0, 1187987.5, 454525.0, 44186700.0, 35339850.0, 159680750.0], +# 'conc_vial_mg_L': [0.0, 22.78427785050836, 23.65614297715311, 0.0, 63.58554431387759, 128.28438972770587, 113.72290046495935], +# 'conc_vial_if_undiluted_mg_L': [0.0, 569.606946262709, 591.4035744288278, 0.0, 1589.6386078469395, 3207.1097431926473, 2843.0725116239837], +# 'fraction_of_sample_fr': [0.0, 0.04068621044733635, 0.042243112459201974, 0.0, 0.11354561484620995, 0.2290792673709033, 0.20307660797314164], +# 'fraction_of_feedstock_fr': [0.0, 0.018715656805774722, 0.01922128259154552, 0.0, 0.051641211741918436, 0.10420514155783843, 0.0924007887729286], +# 'compound_used_for_calibration': ['n.a.', 'tetradecanoic acid (sim=1.0; dwt=56)', 'self', 'n.a.', 'self', 'self', 'self'], +# } +# ), +# 'Ader': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.027, 38.123999999999995, 41.7365, 42.159, 45.2555, 45.3695], +# 'area': [15719.5, 51560.5, 26505.5, 1409097.5, 562306.5, 1840687.0], +# 'height': [13126.5, 18394.0, 8953.5, 490697.0, 191724.5, 548218.5], +# 'area_if_undiluted': [1964937.5, 6445062.5, 3313187.5, 176137187.5, 70288312.5, 230085875.0], +# 'conc_vial_mg_L': [0.0, 0.6415947958834178, 2.5835030215418877, 27.502344287695024, 34.090377368664285, 22.971264852335217], +# 'conc_vial_if_undiluted_mg_L': [0.0, 80.19934948542723, 322.9378776927359, 3437.7930359618776, 4261.297171083035, 2871.4081065419023], +# 'fraction_of_sample_fr': [0.0, 0.005728524963244802, 0.023066991263766854, 0.24555664542584843, 0.304378369363074, 0.2051005790387073], +# 
'fraction_of_feedstock_fr': [0.0, 0.002722862373382652, 0.010956172407969126, 0.11663401169579143, 0.1447012705450368, 0.09748090808552162], +# 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', '(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], +# } +# ) +# , +# 'B': pd.DataFrame( +# index=pd.Index(['1-hexene, 4,5-dimethyl-', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B'), +# data={ +# 'iupac_name': ['4,5-dimethylhex-1-ene', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], +# 'retention_time': [6.107, 8.5145, 10.4905, 11.049, 11.471, 13.674], +# 'area': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], +# 'height': [3597.0, 41972.0, 19169.0, 12434.5, 116469.5, 69787.0], +# 'area_if_undiluted': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], +# 'conc_vial_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], +# 'conc_vial_if_undiluted_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], +# 'fraction_of_sample_fr': [0.0, 0.0, 0.0022104436727212492, 0.0, 0.0, 0.0026878878537286406], +# 'fraction_of_feedstock_fr': [0.0, 0.0, 0.0010940721919949245, 0.0, 0.0, 0.00133114530090798], +# 'compound_used_for_calibration': ['n.a.', 'n.a.', 'self', 'n.a.', 'n.a.', 'self'], +# } +# ) +# } +# return samples + +# @pytest.fixture +# def checked_samples_applied_calibration(): +# samples = { +# 'A': pd.DataFrame( +# index=pd.Index(['unidentified', 'n-decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'n-hexadecanoic acid', '9,12-octadecadienoic acid (z,z)-', 'oleic acid'], name='A'), +# data={ +# 'iupac_name': ['unidentified', 'decanoic acid', 'tetradecanoic acid', 'oxacycloheptadecan-2-one', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.025, 26.284, 36.1605, 40.046499999999995, 40.492999999999995, 43.847, 43.986999999999995], +# 'area': [24439.5, 10952.0, 47519.5, 18181.0, 1767468.0, 1413594.0, 6387230.0], +# 'height': [25256.5, 4259.0, 14769.5, 6222.0, 477850.5, 332147.5, 1143123.0], +# 'area_if_undiluted': [610987.5, 273800.0, 1187987.5, 454525.0, 44186700.0, 35339850.0, 159680750.0], +# 'conc_vial_mg_L': [0.0, 22.78427785050836, 23.65614297715311, 0.0, 63.58554431387759, 128.28438972770587, 113.72290046495935], +# 'conc_vial_if_undiluted_mg_L': [0.0, 569.606946262709, 591.4035744288278, 0.0, 1589.6386078469395, 3207.1097431926473, 2843.0725116239837], +# 'fraction_of_sample_fr': [0.0, 0.04068621044733635, 0.042243112459201974, 0.0, 0.11354561484620995, 0.2290792673709033, 0.20307660797314164], +# 'fraction_of_feedstock_fr': [0.0, 0.018715656805774722, 0.01922128259154552, 0.0, 0.051641211741918436, 0.10420514155783843, 0.0924007887729286], +# 'compound_used_for_calibration': ['n.a.', 'tetradecanoic acid (sim=1.0; dwt=56)', 'self', 'n.a.', 'self', 'self', 'self'], +# } +# ), +# 'Ader': pd.DataFrame( +# index=pd.Index(['unidentified', 'myristic acid, tms derivative', 'palmitelaidic acid, tms derivative', 'palmitic acid, tms derivative', '9,12-octadecadienoic acid (z,z)-, tms derivative', '9-octadecenoic acid, (z)-, tms derivative'], name='Ader'), +# data={ +# 'iupac_name': ['unidentified', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid'], +# 'retention_time': [6.027, 38.123999999999995, 
41.7365, 42.159, 45.2555, 45.3695], +# 'area': [15719.5, 51560.5, 26505.5, 1409097.5, 562306.5, 1840687.0], +# 'height': [13126.5, 18394.0, 8953.5, 490697.0, 191724.5, 548218.5], +# 'area_if_undiluted': [1964937.5, 6445062.5, 3313187.5, 176137187.5, 70288312.5, 230085875.0], +# 'conc_vial_mg_L': [0.0, 0.6415947958834178, 2.5835030215418877, 27.502344287695024, 34.090377368664285, 22.971264852335217], +# 'conc_vial_if_undiluted_mg_L': [0.0, 80.19934948542723, 322.9378776927359, 3437.7930359618776, 4261.297171083035, 2871.4081065419023], +# 'fraction_of_sample_fr': [0.0, 0.005728524963244802, 0.023066991263766854, 0.24555664542584843, 0.304378369363074, 0.2051005790387073], +# 'fraction_of_feedstock_fr': [0.0, 0.002722862373382652, 0.010956172407969126, 0.11663401169579143, 0.1447012705450368, 0.09748090808552162], +# 'compound_used_for_calibration': ['n.a.', 'hexadecanoic acid (sim=1.0; dwt=28)', '(e)-octadec-9-enoic acid (sim=1.0; dwt=28)', 'self', 'self', '(e)-octadec-9-enoic acid (sim=1.0; dwt=0)'], +# } +# ) +# , +# 'B': pd.DataFrame( +# index=pd.Index(['1-hexene, 4,5-dimethyl-', '2-butanone', '2-cyclopenten-1-one, 2-methyl-', 'trans-2-pentenoic acid', '2,5-hexanedione', 'phenol'], name='B'), +# data={ +# 'iupac_name': ['4,5-dimethylhex-1-ene', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', 'hexane-2,5-dione', 'phenol'], +# 'retention_time': [6.107, 8.5145, 10.4905, 11.049, 11.471, 13.674], +# 'area': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], +# 'height': [3597.0, 41972.0, 19169.0, 12434.5, 116469.5, 69787.0], +# 'area_if_undiluted': [9761.0, 164293.5, 66877.0, 38083.5, 456719.5, 214848.5], +# 'conc_vial_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], +# 'conc_vial_if_undiluted_mg_L': [0.0, 0.0, 6.189242283619498, 0.0, 0.0, 7.526085990440194], +# 'fraction_of_sample_fr': [0.0, 0.0, 0.0022104436727212492, 0.0, 0.0, 0.0026878878537286406], +# 'fraction_of_feedstock_fr': [0.0, 0.0, 0.0010940721919949245, 0.0, 0.0, 0.00133114530090798], +# 'compound_used_for_calibration': ['n.a.', 'n.a.', 'self', 'n.a.', 'n.a.', 'self'], +# } +# ) +# } +# return samples + +# @pytest.fixture +# def checked_files_param_reports(): +# reports = { +# 'height': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', '4,5-dimethylhex-1-ene', 'oxacycloheptadecan-2-one', 'decanoic acid'], name='height'), +# data={ +# 'A_1': [1147599.0, 493759.0, 339605.0, 0.0, 0.0, 0.0, 0.0, 15019.0, 0.0, 0.0, 0.0, 5705.0, 0.0], +# 'A_2': [1138647.0, 461942.0, 324690.0, 0.0, 0.0, 0.0, 0.0, 14520.0, 0.0, 0.0, 0.0, 6739.0, 4259.0], +# 'Ader_1': [501749.0, 484890.0, 180850.0, 0.0, 0.0, 0.0, 0.0, 18415.0, 0.0, 9132.0, 0.0, 0.0, 0.0], +# 'Ader_2': [594688.0, 496504.0, 202599.0, 0.0, 0.0, 0.0, 0.0, 18373.0, 0.0, 8775.0, 0.0, 0.0, 0.0], +# 'B_1': [0.0, 0.0, 0.0, 112797.0, 64421.0, 39393.0, 18515.0, 0.0, 12132.0, 0.0, 7194.0, 0.0, 0.0], +# 'B_2': [0.0, 0.0, 0.0, 120142.0, 75153.0, 44551.0, 19823.0, 0.0, 12737.0, 0.0, 0.0, 0.0, 0.0], +# } +# ), +# 'area': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', 
'4,5-dimethylhex-1-ene', 'decanoic acid'], name='area'), +# data={ +# 'A_1': [6379752.0, 1878180.0, 1456119.0, 0.0, 0.0, 0.0, 0.0, 44389.0, 0.0, 0.0, 15068.0, 0.0, 0.0], +# 'A_2': [6394708.0, 1656756.0, 1371069.0, 0.0, 0.0, 0.0, 0.0, 50650.0, 0.0, 0.0, 21294.0, 0.0, 10952.0], +# 'Ader_1': [1724814.0, 1415205.0, 519476.0, 0.0, 0.0, 0.0, 0.0, 49508.0, 0.0, 27798.0, 0.0, 0.0, 0.0], +# 'Ader_2': [1956560.0, 1402990.0, 605137.0, 0.0, 0.0, 0.0, 0.0, 53613.0, 0.0, 25213.0, 0.0, 0.0, 0.0], +# 'B_1': [0.0, 0.0, 0.0, 441077.0, 200947.0, 147566.0, 69223.0, 0.0, 40376.0, 0.0, 0.0, 19522.0, 0.0], +# 'B_2': [0.0, 0.0, 0.0, 472362.0, 228750.0, 181021.0, 64531.0, 0.0, 35791.0, 0.0, 0.0, 0.0, 0.0], +# } +# ), +# 'area_if_undiluted': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', 'hexane-2,5-dione', 'decanoic acid', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', '4,5-dimethylhex-1-ene'], name='area_if_undiluted'), +# data={ +# 'A_1': [159493800.0, 46954500.0, 36402975.0, 1109725.0, 0.0, 376700.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], +# 'A_2': [159867700.0, 41418900.0, 34276725.0, 1266250.0, 0.0, 532350.0, 0.0, 273800.0, 0.0, 0.0, 0.0, 0.0, 0.0], +# 'Ader_1': [215601750.0, 176900625.0, 64934500.0, 6188500.0, 3474750.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], +# 'Ader_2': [244570000.0, 175373750.0, 75642125.0, 6701625.0, 3151625.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], +# 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 441077.0, 0.0, 200947.0, 147566.0, 69223.0, 40376.0, 19522.0], +# 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 472362.0, 0.0, 228750.0, 181021.0, 64531.0, 35791.0, 0.0], +# } +# ), +# 'conc_vial_mg_L': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'hexadecanoic acid', 'tetradecanoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one', '(e)-hexadec-9-enoic acid'], name='conc_vial_mg_L'), +# data={ +# 'A_1': [131.18800047103497, 113.61850020825628, 66.05436178187291, 23.581503644987627, 0.0, 0.0, 0.0, 0.0], +# 'A_2': [125.38077898437679, 113.82730072166243, 61.11672684588226, 23.730782309318595, 22.78427785050836, 0.0, 0.0, 0.0], +# 'Ader_1': [31.36776718294773, 21.669084708496513, 27.623189632994073, 0.600983241036704, 0.0, 0.0, 0.0, 2.5980281295127825], +# 'Ader_2': [36.81298755438084, 24.27344499617392, 27.38149894239597, 0.6822063507301317, 0.0, 0.0, 0.0, 2.5689779135709925], +# 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 7.167230535550548, 6.243800844792131, 0.0], +# 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 7.884941445329839, 6.134683722446865, 0.0], +# } +# ), +# 'conc_vial_if_undiluted_mg_L': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', 'decanoic acid', '(e)-hexadec-9-enoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='conc_vial_if_undiluted_mg_L'), +# data={ +# 'A_1': [3279.7000117758744, 1651.3590445468228, 2840.462505206407, 589.5375911246907, 0.0, 0.0, 0.0, 0.0], +# 'A_2': [3134.51947460942, 1527.9181711470565, 2845.682518041561, 593.2695577329649, 569.606946262709, 0.0, 0.0, 0.0], +# 'Ader_1': [3920.970897868466, 3452.898704124259, 2708.635588562064, 75.12290512958799, 0.0, 324.7535161890978, 0.0, 0.0], +# 'Ader_2': [4601.623444297605, 3422.6873677994963, 3034.1806245217404, 85.27579384126646, 0.0, 321.12223919637404, 0.0, 0.0], +# 'B_1': [0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 7.167230535550548, 6.243800844792131], +# 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.884941445329839, 6.134683722446865], +# } +# ), +# 'fraction_of_sample_fr': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', 'decanoic acid', '(e)-hexadec-9-enoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_sample_fr'), +# data={ +# 'A_1': [0.23426428655541953, 0.11795421746763018, 0.20289017894331474, 0.042109827937477896, 0.0, 0.0, 0.0, 0.0], +# 'A_2': [0.2238942481863871, 0.10913701222478973, 0.20326303700296858, 0.04237639698092605, 0.04068621044733635, 0.0, 0.0, 0.0], +# 'Ader_1': [0.2800693498477476, 0.24663562172316136, 0.193473970611576, 0.005365921794970571, 0.0, 0.023196679727792702, 0.0, 0.0], +# 'Ader_2': [0.3286873888784004, 0.24447766912853547, 0.21672718746583858, 0.006091128131519033, 0.0, 0.022937302799741006, 0.0, 0.0], +# 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0025597251912680527, 0.0022299288731400468], +# 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002816050516189228, 0.0021909584723024517], +# } +# ), +# 'fraction_of_feedstock_fr': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', 'decanoic acid', '(e)-hexadec-9-enoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_feedstock_fr'), +# data={ +# 'A_1': [0.10541892894993879, 0.053079397860433586, 0.09130058052449164, 0.018949422571865052, 0.0, 0.0, 0.0, 0.0], +# 'A_2': [0.10299135416573807, 0.05020302562340328, 0.09350099702136555, 0.019493142611225985, 0.018715656805774722, 0.0, 0.0, 0.0], +# 'Ader_1': [0.13163259442844139, 0.11591874220988584, 0.09093276618744071, 0.0025219832436361683, 0.0, 0.01090243947206257, 0.0, 0.0], +# 'Ader_2': [0.1577699466616322, 0.11734928118169702, 0.10402904998360252, 0.0029237415031291357, 0.0, 0.011009905343875682, 0.0, 0.0], +# 'B_1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0012542653437213457, 0.001092665147838623], +# 'B_2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.001408025258094614, 0.0010954792361512259], +# } +# ), + +# } +# return reports + +# @pytest.fixture +# def checked_files_param_aggrreps(): +# reports = { +# 'height': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='height'), +# data={ +# 'A_1': [1673299.018636137, 327039.28314786457, 0.0, 0.0, 0.0, 1301.4243936952162], +# 'A_2': [1630562.5435198732, 318647.09040130535, 0.0, 0.0, 0.0, 1537.300436303604], +# 'Ader_1': [995669.7397917995, 199362.50083044838, 0.0, 0.0, 0.0, 0.0], +# 'Ader_2': [1101297.4487493301, 219631.74381979596, 0.0, 0.0, 0.0, 0.0], +# 'B_1': [30189.53968239826, 5454.916540151818, 154971.12017291295, 52781.27325470194, 11641.780331526938, 0.0], +# 'B_2': [24976.13637228884, 5726.94295844986, 167175.26975375, 61574.19209435766, 13581.203602167678, 0.0], +# } +# ), +# 'area': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='area'), +# data={ +# 'A_1': [8180808.865926539, 1588883.8460032765, 0.0, 0.0, 0.0, 3437.311615109468], +# 'A_2': [7957332.218705356, 1542835.8875348074, 0.0, 0.0, 0.0, 4857.58650996423], +# 'Ader_1': [3115375.5519706816, 621383.3360462904, 0.0, 0.0, 0.0, 0.0], +# 'Ader_2': [3373187.7167576416, 670272.3970037951, 0.0, 0.0, 0.0, 0.0], +# 'B_1': [102813.63412565118, 18154.278785457453, 598982.0949258526, 164639.4578897035, 36313.94781638508, 0.0], +# 'B_2': [85688.75337144051, 
16092.723202157411, 653960.780006639, 187418.9512272872, 41338.34077143768, 0.0], +# } +# ), +# 'area_if_undiluted': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'ester', 'alcohol'], name='area_if_undiluted'), +# data={ +# 'A_1': [204520221.64816347, 39722096.15008192, 0.0, 0.0, 85932.7903777367, 0.0], +# 'A_2': [198933305.4676339, 38570897.18837019, 0.0, 0.0, 121439.66274910574, 0.0], +# 'Ader_1': [389421943.99633527, 77672917.00578631, 0.0, 0.0, 0.0, 0.0], +# 'Ader_2': [421648464.59470516, 83784049.6254744, 0.0, 0.0, 0.0, 0.0], +# 'B_1': [102813.63412565118, 18154.278785457453, 598982.0949258526, 164639.4578897035, 0.0, 36313.94781638508], +# 'B_2': [85688.75337144051, 16092.723202157411, 653960.780006639, 187418.9512272872, 0.0, 41338.34077143768], +# } +# ), +# 'conc_vial_mg_L': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_mg_L'), +# data={ +# 'A_1': [279.04506020687086, 55.41196015223182, 0.0, 0.0, 0.0], +# 'A_2': [287.2245498713612, 59.62973999202588, 0.0, 0.0, 0.0], +# 'Ader_1': [69.94687725385289, 13.916672123312216, 0.0, 0.0, 0.0], +# 'Ader_2': [76.54999067692155, 15.17432861960896, 0.0, 0.0, 0.0], +# 'B_1': [2.7332298278341587, 0.0, 5.872239694763155, 3.706919814979471, 1.2952193148242288], +# 'B_2': [2.6854637025308077, 0.0, 6.460273032447163, 3.6421374119160257, 1.424919765813671], +# } +# ), +# 'conc_vial_if_undiluted_mg_L': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), +# data={ +# 'A_1': [6976.126505171772, 1385.2990038057956, 0.0, 0.0, 0.0], +# 'A_2': [7180.61374678403, 1490.7434998006472, 0.0, 0.0, 0.0], +# 'Ader_1': [8743.35965673161, 1739.5840154140271, 0.0, 0.0, 0.0], +# 'Ader_2': [9568.748834615195, 1896.79107745112, 0.0, 0.0, 0.0], +# 'B_1': [2.7332298278341587, 0.0, 5.872239694763155, 3.706919814979471, 1.2952193148242288], +# 'B_2': [2.6854637025308077, 0.0, 6.460273032447163, 3.6421374119160257, 1.424919765813671], +# } +# ), +# 'fraction_of_sample_fr': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_sample_fr'), +# data={ +# 'A_1': [0.4982947503694122, 0.09894992884327108, 0.0, 0.0, 0.0], +# 'A_2': [0.5129009819131448, 0.10648167855718907, 0.0, 0.0, 0.0], +# 'Ader_1': [0.6245256897665437, 0.12425600110100192, 0.0, 0.0, 0.0], +# 'Ader_2': [0.6834820596153709, 0.1354850769607943, 0.0, 0.0, 0.0], +# 'B_1': [0.0009761535099407709, 0.0, 0.0020972284624154124, 0.0013238999339212397, 0.00046257832672293885], +# 'B_2': [0.0009590941794752884, 0.0, 0.0023072403687311297, 0.0013007633613985805, 0.0005088999163620254], +# } +# ), +# 'fraction_of_feedstock_fr': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_feedstock_fr'), +# data={ +# 'A_1': [0.22423263766623547, 0.04452746797947199, 0.0, 0.0, 0.0], +# 'A_2': [0.23593445168004665, 0.04898157213630697, 0.0, 0.0, 0.0], +# 'Ader_1': [0.29352707419027546, 0.058400320517470905, 0.0, 0.0, 0.0], +# 'Ader_2': [0.32807138861537805, 0.06503283694118125, 0.0, 0.0, 0.0], +# 'B_1': [0.00047831521987097775, 0.0, 0.001027641946583552, 0.0006487109676214074, 0.00022666338009424], +# 'B_2': [0.0004795470897376442, 0.0, 0.0011536201843655649, 0.0006503816806992902, 0.0002544499581810127], +# } +# ) + +# } +# return reports + +# @pytest.fixture +# def checked_samples_param_reports(): +# reports = { +# 'height': pd.DataFrame( +# 
index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='height'), +# data={ +# 'A': [1143123.0, 477850.5, 332147.5, 0.0, 0.0, 0.0, 0.0, 14769.5, 0.0, 0.0, 6222.0, 0.0, 2129.5], +# 'Ader': [548218.5, 490697.0, 191724.5, 0.0, 0.0, 0.0, 0.0, 18394.0, 0.0, 8953.5, 0.0, 0.0, 0.0], +# 'B': [0.0, 0.0, 0.0, 116469.5, 69787.0, 41972.0, 19169.0, 0.0, 12434.5, 0.0, 0.0, 3597.0, 0.0], +# } +# ), +# 'area': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='area'), +# data={ +# 'A': [6387230.0, 1767468.0, 1413594.0, 0.0, 0.0, 0.0, 0.0, 47519.5, 0.0, 0.0, 18181.0, 0.0, 5476.0], +# 'Ader': [1840687.0, 1409097.5, 562306.5, 0.0, 0.0, 0.0, 0.0, 51560.5, 0.0, 26505.5, 0.0, 0.0, 0.0], +# 'B': [0.0, 0.0, 0.0, 456719.5, 214848.5, 164293.5, 66877.0, 0.0, 38083.5, 0.0, 0.0, 9761.0, 0.0], +# } +# ), +# 'area_if_undiluted': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexane-2,5-dione', 'oxacycloheptadecan-2-one', 'phenol', 'butan-2-one', 'decanoic acid', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', '4,5-dimethylhex-1-ene'], name='area_if_undiluted'), +# data={ +# 'A': [159680750.0, 44186700.0, 35339850.0, 1187987.5, 0.0, 0.0, 454525.0, 0.0, 0.0, 136900.0, 0.0, 0.0, 0.0], +# 'Ader': [230085875.0, 176137187.5, 70288312.5, 6445062.5, 3313187.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], +# 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 456719.5, 0.0, 214848.5, 164293.5, 0.0, 66877.0, 38083.5, 9761.0], +# } +# ), +# 'conc_vial_mg_L': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'hexadecanoic acid', 'tetradecanoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one', '(e)-hexadec-9-enoic acid'], name='conc_vial_mg_L'), +# data={ +# 'A': [128.28438972770587, 113.72290046495935, 63.58554431387759, 23.65614297715311, 11.39213892525418, 0.0, 0.0, 0.0], +# 'Ader': [34.090377368664285, 22.971264852335217, 27.502344287695024, 0.6415947958834178, 0.0, 0.0, 0.0, 2.5835030215418877], +# 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 7.526085990440194, 6.189242283619498, 0.0], +# } +# ), +# 'conc_vial_if_undiluted_mg_L': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='conc_vial_if_undiluted_mg_L'), +# data={ +# 'A': [3207.1097431926473, 1589.6386078469395, 2843.0725116239837, 591.4035744288278, 0.0, 284.8034731313545, 0.0, 0.0], +# 'Ader': [4261.297171083035, 3437.7930359618776, 2871.4081065419023, 80.19934948542723, 322.9378776927359, 0.0, 0.0, 0.0], +# 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.526085990440194, 6.189242283619498], +# } +# ), +# 'fraction_of_sample_fr': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', 
'(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_sample_fr'), +# data={ +# 'A': [0.2290792673709033, 0.11354561484620995, 0.20307660797314164, 0.042243112459201974, 0.0, 0.020343105223668174, 0.0, 0.0], +# 'Ader': [0.304378369363074, 0.24555664542584843, 0.2051005790387073, 0.005728524963244802, 0.023066991263766854, 0.0, 0.0, 0.0], +# 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0026878878537286406, 0.0022104436727212492], +# } +# ), +# 'fraction_of_feedstock_fr': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_feedstock_fr'), +# data={ +# 'A': [0.10420514155783843, 0.051641211741918436, 0.0924007887729286, 0.01922128259154552, 0.0, 0.009357828402887361, 0.0, 0.0], +# 'Ader': [0.1447012705450368, 0.11663401169579143, 0.09748090808552162, 0.002722862373382652, 0.010956172407969126, 0.0, 0.0, 0.0], +# 'B': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00133114530090798, 0.0010940721919949245], +# } +# ) +# } +# return reports + +# @pytest.fixture +# def checked_samples_param_reports_std(): +# reports = { +# 'height': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='height'), +# data={ +# 'A': [6330.019905181974, 22498.01645701238, 10546.497641397356, np.nan, np.nan, np.nan, np.nan, 352.8462838120872, np.nan, np.nan, 731.1484117468901, np.nan, 3011.567781073506], +# 'Ader': [65717.79713669654, 8212.338156700564, 15378.865384026221, np.nan, np.nan, np.nan, np.nan, 29.698484809834994, np.nan, 252.43712088359746, np.nan, np.nan, np.nan], +# 'B': [np.nan, np.nan, np.nan, 5193.699307815192, 7588.669975694028, 3647.2567773602123, 924.8956697920041, np.nan, 427.79960261786124, np.nan, np.nan, 5086.926183856023, np.nan], +# } +# ), +# 'area': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'hexane-2,5-dione', 'phenol', 'butan-2-one', '2-methylcyclopent-2-en-1-one', 'tetradecanoic acid', '(e)-pent-2-enoic acid', '(e)-hexadec-9-enoic acid', 'oxacycloheptadecan-2-one', '4,5-dimethylhex-1-ene', 'decanoic acid'], name='area'), +# data={ +# 'A': [10575.489019426004, 156570.4119174501, 60139.43173991587, np.nan, np.nan, np.nan, np.nan, 4427.195557008974, np.nan, np.nan, 4402.446819667445, np.nan, 7744.233467555068], +# 'Ader': [163869.16811285765, 8637.309332193678, 60571.47398322085, np.nan, np.nan, np.nan, np.nan, 2902.6733367707775, np.nan, 1827.8710293672254, np.nan, np.nan, np.nan], +# 'B': [np.nan, np.nan, np.nan, 22121.83564942114, 19659.68983732958, 23656.25736459595, 3317.745017327281, np.nan, 3242.0845917403203, np.nan, np.nan, 13804.138582323782, np.nan], +# } +# ), +# 'area_if_undiluted': pd.DataFrame( +# index=pd.Index(['(z)-octadec-9-enoic acid', 'hexadecanoic acid', '(9z,12z)-octadeca-9,12-dienoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'hexane-2,5-dione', 'oxacycloheptadecan-2-one', 'phenol', 'butan-2-one', 'decanoic acid', '2-methylcyclopent-2-en-1-one', '(e)-pent-2-enoic acid', '4,5-dimethylhex-1-ene'], name='area_if_undiluted'), +# data={ +# 'A': [264387.2254856501, 
3914260.2979362523, 1503485.7934978968, 110679.88892522435, np.nan, np.nan, 110061.17049168612, np.nan, np.nan, 193605.8366888767, np.nan, np.nan, np.nan], +# 'Ader': [20483646.014107205, 1079663.6665242098, 7571434.247902606, 362834.1670963472, 228483.87867090316, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], +# 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, 22121.83564942114, np.nan, 19659.68983732958, 23656.25736459595, np.nan, 3317.745017327281, 3242.0845917403203, 13804.138582323782], +# } +# ), +# 'conc_vial_mg_L': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', '(z)-octadec-9-enoic acid', 'hexadecanoic acid', 'tetradecanoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one', '(e)-hexadec-9-enoic acid'], name='conc_vial_mg_L'), +# data={ +# 'A': [4.106325693068217, 0.14764425894471855, 3.4914351462625968, 0.10555595583489899, 16.110917372532917, np.nan, np.nan, np.nan], +# 'Ader': [3.8503522496954825, 1.8415608200696434, 0.1709011262715786, 0.05743341165328154, np.nan, np.nan, np.nan, 0.02054160468737343], +# 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, 0.5074982512365029, 0.07715745715389961, np.nan], +# } +# ), +# 'conc_vial_if_undiluted_mg_L': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='conc_vial_if_undiluted_mg_L'), +# data={ +# 'A': [102.65814232670576, 87.28587865656482, 3.691106473618185, 2.638898895872451, np.nan, 402.77293431332293, np.nan, np.nan], +# 'Ader': [481.29403121193536, 21.362640783947153, 230.19510250870547, 7.179176456660192, 2.5677005859216706, np.nan, np.nan, np.nan], +# 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0.5074982512365029, 0.07715745715389961], +# } +# ), +# 'fraction_of_sample_fr': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_sample_fr'), +# data={ +# 'A': [0.007332724451907525, 0.006234705618326057, 0.00026365046240129915, 0.00018849277827660252, np.nan, 0.028769495308094487, np.nan, np.nan], +# 'Ader': [0.03437814508656681, 0.001525902913139085, 0.01644250732205038, 0.0005127983183328709, 0.00018340718470868995, np.nan, np.nan, np.nan], +# 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0.00018124937544160814, 2.7556234697821363e-05], +# } +# ), +# 'fraction_of_feedstock_fr': pd.DataFrame( +# index=pd.Index(['(9z,12z)-octadeca-9,12-dienoic acid', 'hexadecanoic acid', '(z)-octadec-9-enoic acid', 'tetradecanoic acid', '(e)-hexadec-9-enoic acid', 'decanoic acid', 'phenol', '2-methylcyclopent-2-en-1-one'], name='fraction_of_feedstock_fr'), +# data={ +# 'A': [0.001716554591745799, 0.002033902314020852, 0.001555929426374292, 0.0003844681268991305, np.nan, 0.013233967841723464, np.nan, np.nan], +# 'Ader': [0.01848189900635057, 0.0010115438077193133, 0.009260471080609506, 0.00028408598968518184, 7.598984670517598e-05, np.nan, np.nan, np.nan], +# 'B': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 0.00010872467812800095, 1.9898609286992866e-06], +# } +# ) +# } +# return reports + + +# @pytest.fixture +# def checked_samples_param_aggrreps(): +# reports = { +# 'height': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='height'), +# data={ +# 
'A': [1651930.781078005, 322843.186774585, 0.0, 0.0, 0.0, 1419.36241499941], +# 'Ader': [1048483.5942705647, 209497.12232512212, 0.0, 0.0, 0.0, 0.0], +# 'B': [27582.83802734355, 5590.929749300839, 161073.19496333148, 57177.7326745298, 12611.491966847307, 0.0], +# } +# ), +# 'area': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='area'), +# data={ +# 'A': [8069070.542315948, 1565859.8667690419, 0.0, 0.0, 0.0, 4147.449062536849], +# 'Ader': [3244281.634364161, 645827.8665250427, 0.0, 0.0, 0.0, 0.0], +# 'B': [94251.19374854585, 17123.500993807433, 626471.4374662458, 176029.20455849537, 38826.144293911384, 0.0], +# } +# ), +# 'area_if_undiluted': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'ester', 'alcohol'], name='area_if_undiluted'), +# data={ +# 'A': [201726763.55789867, 39146496.66922605, 0.0, 0.0, 103686.22656342122, 0.0], +# 'Ader': [405535204.2955202, 80728483.31563035, 0.0, 0.0, 0.0, 0.0], +# 'B': [94251.19374854585, 17123.500993807433, 626471.4374662458, 176029.20455849537, 0.0, 38826.144293911384], +# } +# ), +# 'conc_vial_mg_L': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_mg_L'), +# data={ +# 'A': [283.134805039116, 57.52085007212886, 0.0, 0.0, 0.0], +# 'Ader': [73.2484339653872, 14.545500371460587, 0.0, 0.0, 0.0], +# 'B': [2.709346765182483, 0.0, 6.166256363605159, 3.674528613447748, 1.36006954031895], +# } +# ), +# 'conc_vial_if_undiluted_mg_L': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), +# data={ +# 'A': [7078.3701259779, 1438.0212518032215, 0.0, 0.0, 0.0], +# 'Ader': [9156.054245673402, 1818.1875464325735, 0.0, 0.0, 0.0], +# 'B': [2.709346765182483, 0.0, 6.166256363605159, 3.674528613447748, 1.36006954031895], +# } +# ), +# 'fraction_of_sample_fr': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_sample_fr'), +# data={ +# 'A': [0.5055978661412784, 0.10271580370023008, 0.0, 0.0, 0.0], +# 'Ader': [0.6540038746909574, 0.12987053903089812, 0.0, 0.0, 0.0], +# 'B': [0.0009676238447080297, 0.0, 0.002202234415573271, 0.0013123316476599102, 0.00048573912154248214], +# } +# ), +# 'fraction_of_feedstock_fr': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_feedstock_fr'), +# data={ +# 'A': [0.23008354467314104, 0.04675452005788948, 0.0, 0.0, 0.0], +# 'Ader': [0.3107992314028268, 0.06171657872932608, 0.0, 0.0, 0.0], +# 'B': [0.000478931154804311, 0.0, 0.0010906310654745584, 0.0006495463241603489, 0.00024055666913762634], +# } +# ) +# } +# return reports + +# @pytest.fixture +# def checked_samples_param_aggrreps_std(): +# reports = { +# 'height': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='height'), +# data={ +# 'A': [35797.371056783835, 7508.212703385821, 0.0, 0.0, 0.0, 166.78954924783815], +# 'Ader': [75153.30566953593, 14433.563492713163, 0.0, 0.0, 0.0, 0.0], +# 'B': [6487.963541864086, 192.35172504043408, 8629.636927224863, 6217.532537943509, 1371.3793462610597, 0.0], +# } +# ), +# 'area': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'alcohol', 'ester'], name='area'), +# data={ +# 'A': [201147.21731284214, 41724.311288199315, 0.0, 0.0, 0.0, 1004.2860093008128], +# 'Ader': [199552.22488919983, 38249.3835185481, 0.0, 0.0, 0.0, 0.0], +# 'B': [21973.821276880837, 
1457.7399327444466, 42815.265175930195, 16107.534210999198, 3552.7823298636085, 0.0], +# } +# ), +# 'area_if_undiluted': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'ketone', 'C-arom', 'ester', 'alcohol'], name='area_if_undiluted'), +# data={ +# 'A': [5028680.432821053, 1043107.7822049828, 0.0, 0.0, 25107.15023252032, 0.0], +# 'Ader': [24944028.111149978, 4781172.939818514, 0.0, 0.0, 0.0, 0.0], +# 'B': [21973.821276880837, 1457.7399327444466, 42815.26517593019, 16107.534210999198, 0.0, 3552.7823298636085], +# } +# ), +# 'conc_vial_mg_L': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_mg_L'), +# data={ +# 'A': [18.436674076139045, 5.526836289719974, 0.0, 0.0, 0.0], +# 'Ader': [4.984729502757059, 0.9565736502096863, 0.0, 0.0, 0.0], +# 'B': [0.03377575111300582, 0.0, 0.41580236064012105, 0.04580807650772246, 0.09171206841758799], +# } +# ), +# 'conc_vial_if_undiluted_mg_L': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='conc_vial_if_undiluted_mg_L'), +# data={ +# 'A': [460.9168519034765, 138.1709072429994, 0.0, 0.0, 0.0], +# 'Ader': [623.0911878446323, 119.57170627621078, 0.0, 0.0, 0.0], +# 'B': [0.03377575111300582, 0.0, 0.41580236064012105, 0.04580807650772246, 0.09171206841758799], +# } +# ), +# 'fraction_of_sample_fr': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_sample_fr'), +# data={ +# 'A': [0.03292263227881972, 0.009869350517357094, 0.0, 0.0, 0.0], +# 'Ader': [0.04450651341747372, 0.00854083616258648, 0.0, 0.0, 0.0], +# 'B': [1.2062768254644968e-05, 0.0, 0.0001485008430857575, 1.6360027324186634e-05, 3.275431014913856e-05], +# } +# ), +# 'fraction_of_feedstock_fr': pd.DataFrame( +# index=pd.Index(['C-aliph', 'carboxyl', 'C-arom', 'ketone', 'alcohol'], name='fraction_of_feedstock_fr'), +# data={ +# 'A': [0.014510828146666238, 0.004414840265872383, 0.0, 0.0, 0.0], +# 'Ader': [0.024426518981430268, 0.004689897339536736, 0.0, 0.0, 0.0], +# 'B': [8.710635362591769e-07, 0.0, 8.908006621759261e-05, 1.1813725467879507e-06, 1.9648077791126472e-05], +# } +# ) +# } +# return reports # fmt: on diff --git a/tests/data_for_testing/compounds_properties.xlsx b/tests/data_for_testing/compounds_properties.xlsx index b330fe3..8c18045 100644 Binary files a/tests/data_for_testing/compounds_properties.xlsx and b/tests/data_for_testing/compounds_properties.xlsx differ diff --git a/tests/minimal_set/S_1.txt b/tests/data_minimal_case/S_1.txt similarity index 100% rename from tests/minimal_set/S_1.txt rename to tests/data_minimal_case/S_1.txt diff --git a/tests/minimal_set/S_2.txt b/tests/data_minimal_case/S_2.txt similarity index 100% rename from tests/minimal_set/S_2.txt rename to tests/data_minimal_case/S_2.txt diff --git a/tests/minimal_set/T_1.txt b/tests/data_minimal_case/T_1.txt similarity index 100% rename from tests/minimal_set/T_1.txt rename to tests/data_minimal_case/T_1.txt diff --git a/tests/minimal_set/T_2.txt b/tests/data_minimal_case/T_2.txt similarity index 100% rename from tests/minimal_set/T_2.txt rename to tests/data_minimal_case/T_2.txt diff --git a/tests/minimal_set/T_3.txt b/tests/data_minimal_case/T_3.txt similarity index 100% rename from tests/minimal_set/T_3.txt rename to tests/data_minimal_case/T_3.txt diff --git a/tests/minimal_set/cal_minimal.xlsx b/tests/data_minimal_case/cal_minimal.xlsx similarity index 100% rename from tests/minimal_set/cal_minimal.xlsx rename to tests/data_minimal_case/cal_minimal.xlsx 
diff --git a/tests/data_minimal_case/classifications_codes_fractions.xlsx b/tests/data_minimal_case/classifications_codes_fractions.xlsx
new file mode 100644
index 0000000..f446d7e
Binary files /dev/null and b/tests/data_minimal_case/classifications_codes_fractions.xlsx differ
diff --git a/tests/data_minimal_case/compounds_properties.xlsx b/tests/data_minimal_case/compounds_properties.xlsx
new file mode 100644
index 0000000..0304844
Binary files /dev/null and b/tests/data_minimal_case/compounds_properties.xlsx differ
diff --git a/tests/data_minimal_case/files_info.xlsx b/tests/data_minimal_case/files_info.xlsx
new file mode 100644
index 0000000..687f990
Binary files /dev/null and b/tests/data_minimal_case/files_info.xlsx differ
diff --git a/tests/data_name_to_properties/checked_compounds_properties.xlsx b/tests/data_name_to_properties/checked_compounds_properties.xlsx
new file mode 100644
index 0000000..bd814d5
Binary files /dev/null and b/tests/data_name_to_properties/checked_compounds_properties.xlsx differ
diff --git a/tests/data_name_to_properties/classifications_codes_fractions.xlsx b/tests/data_name_to_properties/classifications_codes_fractions.xlsx
new file mode 100644
index 0000000..f446d7e
Binary files /dev/null and b/tests/data_name_to_properties/classifications_codes_fractions.xlsx differ
diff --git a/tests/example.py b/tests/example.py
new file mode 100644
index 0000000..4c44779
--- /dev/null
+++ b/tests/example.py
@@ -0,0 +1,37 @@
+import pathlib as plib
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib.transforms import blended_transform_factory
+import seaborn as sns
+import ele
+import pubchempy as pcp
+from gcms_data_analysis.fragmenter import Fragmenter
+
+from gcms_data_analysis import name_to_properties, Project
+
+
+folder_path = plib.Path(
+    r"C:\Users\mp933\OneDrive - Cornell University\Python\gcms_data_analysis\tests\data_minimal_case"
+)
+
+Project.set_folder_path(folder_path)
+Project.set_auto_save_to_excel(False)
+gcms = Project()
+# %%
+to_check = gcms.create_files_info()
+
+
+# %%
+def test_load_files_info(gcms, checked_files_info):
+    to_check = gcms.load_files_info()
+    assert_frame_equal(
+        to_check, checked_files_info, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+# %%
+to_check = gcms.create_files_info()
+to_check = gcms.create_list_of_all_compounds()
+
+# %%
diff --git a/tests/minimal_set/files_info.xlsx b/tests/minimal_set/files_info.xlsx
deleted file mode 100644
index c489ddc..0000000
Binary files a/tests/minimal_set/files_info.xlsx and /dev/null differ
diff --git a/tests/test_create_compounds_properties.py b/tests/test_create_compounds_properties.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_fragmenter.py b/tests/test_fragmenter.py
new file mode 100644
index 0000000..76653df
--- /dev/null
+++ b/tests/test_fragmenter.py
@@ -0,0 +1,50 @@
+import pytest
+from gcms_data_analysis.fragmenter import Fragmenter
+
+
+def test_fragmenter_simple():
+    """Test the simple fragmentation algorithm;
+    gcms_data_analysis only uses the simple fragmentation.
+    """
+    algorithm = "simple"
+    smiles = ["CCCCO", "CCCO", "CCO", "CO"]
+    fragmentation_scheme = {
+        "CH2": "[CH2]",
+        "OH": "[OH]",
+        "CH3": "[CH3]",
+        "CH2-CH2": "[CH2][CH2]",
+    }
+
+    checked_fragmentations_1 = {
+        "CCCCO": {"CH2-CH2": 1, "CH3": 1, "CH2": 1, "OH": 1},
+        "CCCO": {"CH2-CH2": 1, "CH3": 1, "OH": 1},
+        "CCO": {"CH3": 1, "CH2": 1, "OH": 1},
+        "CO": {"CH3": 1, "OH": 1},
+    }
+
+    fragmentation_scheme_order_1 = ["CH2-CH2", "CH3", "CH2", "OH"]
+
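+    # The fragmentation_scheme_order appears to set matching priority: with
+    # "CH2-CH2" tried before "CH2", butan-1-ol (CCCCO) is expected to give
+    # CH3 + CH2-CH2 + CH2 + OH, while the reversed order below gives CH3 + 3 CH2 + OH.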
+    for smi in smiles:
+        frg = Fragmenter(
+            fragmentation_scheme,
+            fragmentation_scheme_order=fragmentation_scheme_order_1,
+            algorithm=algorithm,
+        )
+        fragmentation, _, _ = frg.fragment(smi)
+        assert fragmentation == checked_fragmentations_1[smi]
+
+    fragmentation_scheme_order_2 = ["CH3", "CH2", "CH2-CH2", "OH"]
+    checked_fragmentations_2 = {
+        "CCCCO": {"CH3": 1, "CH2": 3, "OH": 1},
+        "CCCO": {"CH3": 1, "CH2": 2, "OH": 1},
+        "CCO": {"CH3": 1, "CH2": 1, "OH": 1},
+        "CO": {"CH3": 1, "OH": 1},
+    }
+    for smi in smiles:
+        frg = Fragmenter(
+            fragmentation_scheme,
+            fragmentation_scheme_order=fragmentation_scheme_order_2,
+            algorithm=algorithm,
+        )
+        fragmentation, _, _ = frg.fragment(smi)
+        assert fragmentation == checked_fragmentations_2[smi]
diff --git a/tests/test_minimal_case.py b/tests/test_minimal_case.py
new file mode 100644
index 0000000..a2f91ff
--- /dev/null
+++ b/tests/test_minimal_case.py
@@ -0,0 +1,250 @@
+import pytest
+from pandas.testing import assert_frame_equal
+
+
+def test_load_files_info(gcms, checked_files_info):
+    to_check = gcms.load_files_info()
+
+    assert_frame_equal(
+        to_check, checked_files_info, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+def test_create_files_info(gcms, checked_created_files_info):
+    to_check = gcms.create_files_info()
+    assert_frame_equal(
+        to_check, checked_created_files_info, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+def test_load_all_files(gcms, checked_files, checked_is_files_deriv):
+
+    files_to_check, is_deriv_files_to_check = gcms.load_all_files()
+    for filename_to_check, checked_filename in zip(files_to_check, checked_files):
+        assert filename_to_check == checked_filename
+
+    for file_to_check, checked_file in zip(
+        files_to_check.values(), checked_files.values()
+    ):
+        assert_frame_equal(
+            file_to_check, checked_file, check_exact=False, atol=1e-5, rtol=1e-5
+        )
+
+    assert is_deriv_files_to_check == checked_is_files_deriv
+
+
+def test_load_all_files_wrong_names(gcms):
+    wrong_files_info = gcms.create_files_info()
+    wrong_files_info.index = ["Wrong_filename"] + wrong_files_info.index.tolist()[1:]
+    gcms.files_info = wrong_files_info
+    with pytest.raises(FileNotFoundError):
+        gcms.load_all_files()
+
+
+def test_load_class_code_fractions(gcms, checked_load_class_code_fractions):
+    to_check = gcms.load_class_code_frac()
+    assert_frame_equal(
+        to_check,
+        checked_load_class_code_fractions,
+        check_exact=False,
+        atol=1e-5,
+        rtol=1e-5,
+    )
+
+
+def test_load_calibrations(
+    gcms,
+    checked_load_calibrations,
+):
+    files_info = gcms.create_files_info()
+    calib_to_check, is_calib_deriv_to_check = gcms.load_calibrations()
+    for to_check, checked in zip(calib_to_check, checked_load_calibrations):
+        assert to_check == checked
+
+    for to_check, checked in zip(
+        calib_to_check.values(), checked_load_calibrations.values()
+    ):
+        assert_frame_equal(
+            to_check, checked, check_exact=False, atol=1e-5, rtol=1e-5
+        )
+
+    assert is_calib_deriv_to_check == False
+
+
+def test_list_of_all_compounds(gcms, checked_list_of_all_compounds):
+    to_check = gcms.create_list_of_all_compounds()
+    assert sorted(to_check) == sorted(checked_list_of_all_compounds)
+
+
+# def test_list_of_all_deriv_compounds(gcms, checked_list_of_all_deriv_compounds):
+#     to_check = gcms.create_list_of_all_deriv_compounds()
+#     assert to_check.sort() == checked_list_of_all_deriv_compounds.sort()
+
+
+@pytest.mark.slow
+def test_create_compounds_properties(gcms, checked_compounds_properties):
+    to_check = gcms.create_compounds_properties()
+    assert_frame_equal(
+        to_check.sort_index(),
+        checked_compounds_properties.sort_index(),
+        check_exact=False,
+        check_dtype=False,
+        atol=1e-5,
+        rtol=1e-5,
+    )
+
+
+# @pytest.mark.slow
+# def test_create_deriv_compounds_properties(gcms, checked_deriv_compounds_properties):
+#     to_check = gcms.create_deriv_compounds_properties()
+#     assert_frame_equal(
+#         to_check.sort_index(),
+#         checked_deriv_compounds_properties.sort_index(),
+#         check_exact=False,
+#         atol=1e-3,
+#         rtol=1e-3,
+#     )
+
+
+def test_load_compounds_properties(gcms, checked_compounds_properties):
+    to_check = gcms.load_compounds_properties()
+    assert_frame_equal(
+        to_check.sort_index(),
+        checked_compounds_properties.sort_index(),
+        check_exact=False,
+        atol=1e-3,
+        rtol=1e-3,
+    )
+
+
+# def test_load_deriv_compounds_properties(gcms, checked_deriv_compounds_properties):
+#     to_check = gcms.load_deriv_compounds_properties()
+#     assert_frame_equal(
+#         to_check.sort_index(),
+#         checked_deriv_compounds_properties.sort_index(),
+#         check_exact=False,
+#         atol=1e-3,
+#         rtol=1e-3,
+#     )
+
+
+def test_create_samples_info(gcms, checked_samples_info, checked_samples_info_std):
+    to_check, to_check_std = gcms.create_samples_info()
+    assert_frame_equal(
+        to_check, checked_samples_info, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+    assert_frame_equal(
+        to_check_std, checked_samples_info_std, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+@pytest.mark.parametrize(
+    "parameter",
+    [
+        "height",
+        "area",
+        "area_if_undiluted",
+        "conc_vial_mg_L",
+        "conc_vial_if_undiluted_mg_L",
+        "fraction_of_sample_fr",
+        "fraction_of_feedstock_fr",
+    ],
+)
+def test_files_param_reports(gcms, checked_files_param_reports, parameter):
+    to_check = gcms.create_files_param_report(param=parameter)
+    checked_report = checked_files_param_reports[parameter]
+    assert_frame_equal(
+        to_check, checked_report, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+def test_files_param_reports_exception(gcms):
+    with pytest.raises(ValueError):
+        gcms.create_files_param_report(param="wrong_parameter")
+
+
+@pytest.mark.parametrize(
+    "parameter",
+    [
+        "height",
+        "area",
+        "area_if_undiluted",
+        "conc_vial_mg_L",
+        "conc_vial_if_undiluted_mg_L",
+        "fraction_of_sample_fr",
+        "fraction_of_feedstock_fr",
+    ],
+)
+def test_files_param_aggrreps(gcms, checked_files_param_aggrreps, parameter):
+    to_check = gcms.create_files_param_aggrrep(param=parameter)
+    checked_report = checked_files_param_aggrreps[parameter]
+    assert_frame_equal(
+        to_check, checked_report, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+def test_files_param_aggrreps_exception(gcms):
+    with pytest.raises(ValueError):
+        gcms.create_files_param_aggrrep(param="wrong_parameter")
+
+
+@pytest.mark.parametrize(
+    "parameter",
+    [
+        "height",
+        "area",
+        "area_if_undiluted",
+        "conc_vial_mg_L",
+        "conc_vial_if_undiluted_mg_L",
+        "fraction_of_sample_fr",
+        "fraction_of_feedstock_fr",
+    ],
+)
+def test_samples_param_reports(
+    gcms, checked_samples_param_reports, checked_samples_param_reports_std, parameter
+):
+    to_check, to_check_std = gcms.create_samples_param_report(param=parameter)
+    checked_report = checked_samples_param_reports[parameter]
+    checked_report_std = checked_samples_param_reports_std[parameter]
+    assert_frame_equal(
+        to_check, checked_report, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+    assert_frame_equal(
+        to_check_std, checked_report_std, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+def test_samples_param_reports_exception(gcms):
+    with pytest.raises(ValueError):
+        gcms.create_samples_param_report(param="wrong_parameter")
+
+
+@pytest.mark.parametrize(
+    "parameter",
+    [
+        "height",
+        "area",
+        "area_if_undiluted",
+        "conc_vial_mg_L",
+        "conc_vial_if_undiluted_mg_L",
+        "fraction_of_sample_fr",
+        "fraction_of_feedstock_fr",
+    ],
+)
+def test_samples_param_aggrreps(
+    gcms, checked_samples_param_aggrreps, checked_samples_param_aggrreps_std, parameter
+):
+    to_check, to_check_std = gcms.create_samples_param_aggrrep(param=parameter)
+    checked_report = checked_samples_param_aggrreps[parameter]
+    checked_report_std = checked_samples_param_aggrreps_std[parameter]
+    assert_frame_equal(
+        to_check, checked_report, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+    assert_frame_equal(
+        to_check_std, checked_report_std, check_exact=False, atol=1e-5, rtol=1e-5
+    )
+
+
+def test_samples_param_aggrreps_exception(gcms):
+    with pytest.raises(ValueError):
+        gcms.create_samples_param_aggrrep(param="wrong_parameter")
diff --git a/tests/test_name_to_properties.py b/tests/test_name_to_properties.py
new file mode 100644
index 0000000..44c654f
--- /dev/null
+++ b/tests/test_name_to_properties.py
@@ -0,0 +1,165 @@
+import pytest
+from gcms_data_analysis import name_to_properties
+from pandas.testing import assert_frame_equal
+import pandas as pd
+import numpy as np
+
+
+@pytest.mark.parametrize("compound_name", [" ", None, False, np.nan])
+def test_name_to_properties_wrong_input_df_empty(
+    compound_name, dicts_classifications_codes_fractions
+):
+    dict_class_to_code, dict_class_to_mass_fraction = (
+        dicts_classifications_codes_fractions
+    )
+    df = pd.DataFrame()
+    to_check = name_to_properties(
+        compound_name, dict_class_to_code, dict_class_to_mass_fraction, df
+    )
+    assert to_check.empty
+
+
+@pytest.mark.parametrize("compound_name", [" ", None, False, np.nan])
+def test_name_to_properties_wrong_input_df_not_empty(
+    compound_name,
+    dicts_classifications_codes_fractions,
+    checked_n2p_compounds_properties,
+):
+    dict_class_to_code, dict_class_to_mass_fraction = (
+        dicts_classifications_codes_fractions
+    )
+    to_check = name_to_properties(
+        compound_name,
+        dict_class_to_code,
+        dict_class_to_mass_fraction,
+        checked_n2p_compounds_properties,
+    )
+    assert_frame_equal(
+        to_check,
+        checked_n2p_compounds_properties,
+        check_exact=False,
+        atol=1e-3,
+        rtol=1e-3,
+    )
+
+
+def test_name_to_properties_name_not_on_pubchem_df_empty(
+    dicts_classifications_codes_fractions,
+):
+    dict_class_to_code, dict_class_to_mass_fraction = (
+        dicts_classifications_codes_fractions
+    )
+    df = pd.DataFrame()
+    to_check = name_to_properties(
+        "name_not_on_pcp", dict_class_to_code, dict_class_to_mass_fraction, df
+    )
+    df.loc["name_not_on_pcp", "iupac_name"] = "unidentified"
+    assert_frame_equal(
+        to_check,
+        df,
+        check_exact=False,
+        atol=1e-5,
+        rtol=1e-5,
+    )
+
+
+def test_name_to_properties_name_not_on_pubchem_df_not_empty(
+    dicts_classifications_codes_fractions,
+    checked_n2p_compounds_properties,
+):
+    dict_class_to_code, dict_class_to_mass_fraction = (
+        dicts_classifications_codes_fractions
+    )
+    to_check = name_to_properties(
+        "name_not_on_pcp",
+        dict_class_to_code,
+        dict_class_to_mass_fraction,
+        checked_n2p_compounds_properties,
+    )
+    checked_n2p_compounds_properties.loc["name_not_on_pcp", "iupac_name"] = (
+        "unidentified"
+    )
+    assert_frame_equal(
+        to_check,
+        checked_n2p_compounds_properties,
+        check_exact=False,
+        atol=1e-5,
+        rtol=1e-5,
+    )
+
+
+@pytest.mark.parametrize(
+    "compound",
+    [
+        "2-methylcyclopent-2-en-1-one",  # Comment: small ketone
+        "hexadecanoic acid",  # Comment: another compound
+        "n-hexadecanoic acid",  # Comment: different names, same compounds
+        "phenol",  # Comment: a ring structure
+        "phenol",  # Comment: repeated compound to test idempotency
+        "carbolic acid",  # same iupac but different comp_name
+        "2,4,5-trichlorophenol",  # Comment: chlorine, unidentified
+        "phenoxytrimethylsilane",  # Comment: silane, not listed in fg
+        "bromophenol",  # Comment: Br not listed
+        "9-octadecenoic acid, 1,2,3-propanetriyl ester, (e,e,e)-",  # Comment: large compound
+    ],
+)
+def test_name_to_properties_single_compounds(
+    compound, dicts_classifications_codes_fractions, checked_n2p_compounds_properties
+):
+    dict_class_to_code, dict_class_to_mass_fraction = (
+        dicts_classifications_codes_fractions
+    )
+
+    to_check = name_to_properties(
+        compound, dict_class_to_code, dict_class_to_mass_fraction
+    )
+    to_check = to_check.loc[[compound], :]
+    to_check = to_check.loc[:, (to_check != 0).any(axis=0)]
+    checked = checked_n2p_compounds_properties.loc[[compound], :]
+    checked = checked.loc[:, (checked != 0).any(axis=0)]
+    assert_frame_equal(
+        to_check,
+        checked,
+        check_exact=False,
+        atol=1e-5,
+        rtol=1e-5,
+    )
+
+
+def test_name_to_properties_all_compounds(
+    dicts_classifications_codes_fractions, checked_n2p_compounds_properties
+):
+    dict_class_to_code, dict_class_to_mass_fraction = (
+        dicts_classifications_codes_fractions
+    )
+
+    compounds = [
+        "2-methylcyclopent-2-en-1-one",  # Comment: small ketone
+        "hexadecanoic acid",  # Comment: another compound
+        "n-hexadecanoic acid",  # Comment: different names, same compounds
+        "phenol",  # Comment: a ring structure
+        "phenol",  # Comment: repeated compound to test idempotency
+        "carbolic acid",  # same iupac but different comp_name
+        "2,4,5-trichlorophenol",  # Comment: chlorine, unidentified
+        "phenoxytrimethylsilane",  # Comment: silane, not listed in fg
+        "bromophenol",  # Comment: Br not listed
+        "9-octadecenoic acid, 1,2,3-propanetriyl ester, (e,e,e)-",  # Comment: large compound
+        "name_not_on_pcp",  # test for legit string that gives no pcp result
+        " ",  # wrong entry or datatype
+        None,
+        False,
+        np.nan,
+    ]
+    to_check = pd.DataFrame()
+    for compound in compounds:
+        to_check = name_to_properties(
+            compound, dict_class_to_code, dict_class_to_mass_fraction, to_check
+        )
+    checked = checked_n2p_compounds_properties
+    assert_frame_equal(
+        to_check,
+        checked,
+        check_exact=False,
+        atol=1e-3,
+        rtol=1e-3,
+    )
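+
+
+# Note: as the checks above suggest, name_to_properties returns the DataFrame it
+# receives with one row appended per compound, sets iupac_name to "unidentified"
+# for names PubChem cannot resolve, and leaves the frame unchanged for invalid
+# inputs, which is why the cumulative loop above rebuilds the full checked table.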