-
Notifications
You must be signed in to change notification settings - Fork 1
/
liver_data_analysis.py
340 lines (321 loc) · 12.3 KB
/
liver_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
"""Liver data analysis
The script should be ran and will read imzML files, process data and create
relevant plot such as spatial distribution of feature counts.
"""
import os
import random
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from pathlib import Path
from matplotlib import pyplot as plt
from scipy.signal import find_peaks
from scipy.stats import sem, ttest_ind
from pyimzml.ImzMLParser import ImzMLParser
from processing import process, aligned_representation
from chip_types_data_analysis import (num_features_df, msi_spatial_num_features, msi_mean_num_features, plot_num_features_thresholds)
from utils import read_msi
def concatenate_images_array(
img_1: np.ndarray, img_2: np.ndarray
) -> np.ndarray:
"""
Concatenates two images horizontally with a separator and applies a
vertical shift to the second image.
Args:
img_1 (: np.ndarray): The first image to be concatenated.
It should be a 2D numpy array.
img_2 (: np.ndarray): The second image to be concatenated.
It should also be a 2D numpy array.
Returns:
np.ndarray: A new image created by horizontally concatenating `img_1`
and `img_2`, with a vertical shift applied to `img_2`.
"""
# Set the width of the separator between images
separator = 1
# Calculate the new shape for the concatenated image
# It's the height of the first image and the sum of widths of both
# images plus the separator
new_shape = (img_1.shape[0], img_1.shape[1] + img_2.shape[1] + separator)
# Initialize a new image with zeros, of the calculated shape
new_image = np.zeros(new_shape)
# Place the first image in the beginning part of the new image
new_image[:, :img_1.shape[1]] = img_1
# Set the separator area to NaN values
new_image[:, img_1.shape[1]:img_1.shape[1] + separator] = np.nan
# Set the amount of vertical shift for the second image
shift_y = 13
# Place the second image in the remaining part of the new image,
# applying the vertical shift to it
new_image[:, img_1.shape[1] + separator:] = np.roll(img_2, -shift_y, axis=0)
# Return the concatenated image
return new_image
def plot_spatial_num_features(
parsers: Dict[str, ImzMLParser], masks: Dict[str, np.ndarray],
metadata_df: pd.DataFrame, save_path: Path,
mz_range: Tuple[float, float] = (600, 900)
) -> None:
"""
Plots and saves an image that represents spatially the number of features
in each msi.
Args:
parsers (Dict[str, ImzMLParser]): A dictionary containing ImzMLParser
objects with keys as sample names.
masks (Dict[str, np.ndarray]): A dictionary containing masks for each
sample.
metadata_df (pd.DataFrame): DataFrame containing metadata for the
samples.
save_path (Path): Path object where the output image will be saved.
mz_range (Tuple[float,float]): Start and end m/z value to consider.
Defaults to (600, 900).
"""
# Dictionary to store spatial number of features for each sample
spatial_num_features = {}
# Loop through each parser and calculate spatial number of features
for name, p in parsers.items():
spatial_num_features[name] = msi_spatial_num_features(
p, masks[name], mz_range=mz_range
)
# List to store images for each group
images = []
# Group the metadata by file name and process each group
for _, group in metadata_df.groupby("file_name"):
# Initialize an empty image array
img = np.zeros((group["y_max"].max(), group["x_max"].max() + 1))
# Loop through each row in the group and update the image array
for _, row in group.iterrows():
img[:, row["x_min"]:row["x_max"] +
1] = spatial_num_features[row.sample_file_name]
# Append the processed image to the images list
images.append(img)
# Create a combined image from the first two images in the list
combined_img = concatenate_images_array(
np.rot90(images[1], 1), np.rot90(images[0], 1)
)[:-14, 14:-8]
# Create a figure for plotting
fig, ax = plt.subplots(1, 1, figsize=(5, 5), tight_layout=True)
# Show the combined image
shw = ax.imshow(
combined_img, cmap='magma',
vmax=np.percentile(combined_img[~np.isnan(combined_img)], 99)
)
# Add a color bar to the plot
bar = fig.colorbar(shw, orientation='vertical')
bar.set_label(
'Number of features', rotation=270, labelpad=15, fontweight='bold',
fontsize=14, color='0.2'
)
bar.outline.set_edgecolor('0.2')
bar.ax.tick_params(labelsize=14, width=2.5, color='0.2')
for l in bar.ax.get_yticklabels():
l.set_fontweight('bold')
l.set_color('0.2')
# Set the ticks for the color bar
ticks = np.linspace(
np.percentile(combined_img[~np.isnan(combined_img)], 99),
combined_img[~np.isnan(combined_img)].min(), 6
)
bar.set_ticks(ticks)
bar.set_ticklabels(['{:.0f}'.format(t) for t in ticks])
# Remove the axis for a cleaner look
plt.axis("off")
plt.tight_layout()
# Save the plot
plt.savefig(
save_path.joinpath("maps_of_the_number_of_features.png"),
bbox_inches='tight', dpi=1200, transparent=True
)
# Display the plot
plt.show()
def plot_num_features(
parsers: Dict[str, ImzMLParser], masks: Dict[str, np.ndarray],
metadata_df: pd.DataFrame, save_path: Path,
mz_range: Tuple[float, float] = (600, 900)
):
"""
Plots and saves a bar chart comparing the number of features between standard
and optimized conditions
Args:
parser (Dict[str, ImzMLParser]): A dictionary containing ImzMLParser
objects with keys as sample names.
masks (Dict[str, np.ndarray]): A dictionary containing masks for each
sample.
metadata_df (pd.DataFrame): DataFrame containing metadata for the
samples.
save_path (Path): Path object where the output image will be saved.
mz_range (Tuple[float,float]): Start and end m/z value to consider.
Defaults to (600, 900).
"""
# Create dict to store num features
num_features = {}
# Group the metadata by file name and process each group
for group_name, group in metadata_df.groupby("file_name"):
# Loop through each row in the group
for index, row in group.iterrows():
# Get MSO num features
msi_features = msi_mean_num_features(
parsers[row.sample_file_name], masks[row.sample_file_name],
mz_range=mz_range
)
# Update group number of features list
num_features[group_name] = num_features.get(group_name,
[]) + [msi_features]
# Get values for bar plot
means = {key: np.mean(value) for key, value in num_features.items()}
sems = {key: sem(value) for key, value in num_features.items()}
mean_values = list(means.values())[::-1]
sem_values = list(sems.values())[::-1]
# Extract the points for plotting on the bar chart
std_points = num_features['220224-optimization-liver-standard-1 Analyte 1_1']
opt_points = num_features['220224-optimization-liver-optimised-1 Analyte 1_1']
# Get ttest p value
_, pvalue = ttest_ind(std_points, opt_points)
if pvalue < 0.0001:
p_text = "* * * *"
elif 0.0001 <= pvalue < 0.001:
p_text = "* * *"
elif 0.001 <= pvalue < 0.01:
p_text = "* *"
elif 0.01 <= pvalue < 0.05:
p_text = "*"
else:
p_text = "ns"
# Define categories for the x-axis
categories = ['Std', 'Opt']
# Create the bar chart
_, ax = plt.subplots(1, 1, figsize=(3, 6), tight_layout=True)
ax.bar(
categories, mean_values, yerr=sem_values, color=['tab:red', 'tab:blue'],
capsize=10, error_kw=dict(ecolor='0.2', lw=2.5, capsize=10, capthick=2.5)
)
# Add individual points to the bars
ax.plot(
np.repeat(0, len(std_points)), std_points, 'o', markersize=7, color='0.2'
)
ax.plot(
np.repeat(1, len(opt_points)), opt_points, 's', markersize=7, color='0.2'
)
# Make ticks bold and font size 14
plt.xticks(fontsize=14, fontweight='bold', color='0.2')
plt.yticks(fontsize=14, fontweight='bold', color='0.2')
# Make the tick lines thicker
ax.tick_params(axis='both', which='major', width=2.5, color='0.2')
ax.tick_params(axis='both', which='minor', width=2.5, color='0.2')
# Set labels font size to 14
ax.set_ylabel('Number of Features', fontsize=14, weight='bold', color='0.2')
# Add a line to show the p-value
y, h = max(mean_values) + max(sem_values) + 60, 5
ax.plot([0, 0, 1, 1], [y, y + h, y + h, y], lw=2, c='0.2')
ax.text(
0.5, y + h + 5, p_text, ha='center', va='bottom', color='0.2',
fontweight="bold", fontsize=14
)
# Customize the spines
for axis in ['bottom', 'left']:
ax.spines[axis].set_linewidth(2.5)
ax.spines[axis].set_color('0.2')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Add space between subplots and show the plot
plt.tight_layout()
# Save the plot
plt.savefig(
save_path.joinpath("number_of_features_bar_graph.png"),
bbox_inches='tight', dpi=1200, transparent=True
)
# Display the plot
plt.show()
# Save number of features csv
pd.DataFrame(num_features).to_csv(
save_path.joinpath("number_of_features.csv"), index=False
)
def main():
"""Function containing main code"""
# Define current folder using this file
CWD = Path(os.path.dirname(os.path.abspath(__file__)))
# Define folder that contains the revision chip type dataset
LIVER_PATH = Path(os.path.join(CWD, "..", "data", "LIVER"))
# Define folder that contains raw data
RAW_DATA = LIVER_PATH.joinpath("raw")
# Define folder to save aligned data
ALIGNED_DATA = LIVER_PATH.joinpath("aligned")
# Define folder to save processed data
PROCESSED_DATA = LIVER_PATH.joinpath("processed")
# Define file that contains dhg metadata
METADATA_PATH = LIVER_PATH.joinpath("metadata.csv")
# Define mass range start value
MZ_START = 50
# Define mass range end value
MZ_END = 1200
# Define mass resolution of the data
MASS_RESOLUTION = 0.025
# Define lock mass reference peak
LOCK_MASS_PEAK = 885.5498
# Define lock mass tol
LOCK_MASK_TOL = 0.3
# Define mz where lipid start
MZ_LIPID_START = 600
# Define mz where lipid end
MZ_LIPID_END = 900
# Define representative peaks
representative_peaks_map = {
'std': [634.40, 794.54, 886.55], 'opt': [794.5, 834.5, 886.6]
}
# Define random seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Read metadata csv
metadata_df = pd.read_csv(METADATA_PATH)
"""
# Loop over each unique msi imzML file
for file_name in metadata_df.file_name.unique():
# Define path to msi imzML file
msi_path = os.path.join(RAW_DATA, f"{file_name}.imzML")
# Define path to new msi imzML file after alignment
output_path = os.path.join(ALIGNED_DATA, f"{file_name}.imzML")
# Align MSI
aligned_representation(msi_path, output_path, LOCK_MASS_PEAK, LOCK_MASK_TOL)
# Loop over each ROI in data frame
for _, roi in metadata_df.iterrows():
#
representative_peaks = next(
(
value for key, value in representative_peaks_map.items()
if key in roi.sample_file_name
), None
)
# Define path to msi imzML file
msi_path = os.path.join(ALIGNED_DATA, f"{roi.file_name}.imzML")
# Define path to new msi imzML file after processing
output_path = os.path.join(PROCESSED_DATA, f"{roi.sample_file_name}")
# Create output folder if doesn't exist
Path(output_path).mkdir(parents=True, exist_ok=True)
# Process msi
process(
msi_path, output_path, roi.x_min, roi.x_max, roi.y_min, roi.y_max,
MZ_START, MZ_END, MASS_RESOLUTION, representative_peaks
)
"""
# Define path to save figures
PLOT_PATH = Path(CWD / "liver/")
# Create dirs
PLOT_PATH.mkdir(parents=True, exist_ok=True)
# Create dict of msi parsers and masks
parsers = {}
masks = {}
for folder in PROCESSED_DATA.iterdir():
name = folder.stem
parsers[name] = ImzMLParser(folder.joinpath("meaningful_signal.imzML"))
masks[name] = np.load(folder.joinpath("segmentation.npy"), mmap_mode='r')
# Plot figures
plot_spatial_num_features(
parsers, masks, metadata_df, PLOT_PATH, (MZ_LIPID_START, MZ_LIPID_END)
)
plot_num_features(
parsers, masks, metadata_df, PLOT_PATH, (MZ_LIPID_START, MZ_LIPID_END)
)
num_features = num_features_df(parsers, masks, np.arange(1.0, 3.1, 0.2))
num_features.to_csv(PLOT_PATH.joinpath("num_features.csv"), index=False)
plot_num_features_thresholds(num_features, PLOT_PATH)
if __name__ == '__main__':
main()