-
Notifications
You must be signed in to change notification settings - Fork 0
/
sfp1_gfp_analysis.py
236 lines (188 loc) · 9.87 KB
/
sfp1_gfp_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import os
import sys
import time
import cv2
import traceback
import pandas as pd
import numpy as np
from scipy import interpolate
from skimage.filters import threshold_local
from skimage.morphology import remove_small_objects
from shared.shared_functions import round_up_to_odd, read_images, load_all_budj_data, get_whole_cell_mask, load_events,\
create_excel_dir, get_nuc_and_cyt_gfp_av_signal
from shared.signal_analysis import generate_plots # import file that allows for generating plots
#######################
### IMPORTANT PATHS ###
#######################
data_dir = "C:/Users/Olle de Jong/Documents/MSc Biology/MSB Research/SignalAnalysisData/Sfp1_GFP/"
tiff_files_dir = f"{data_dir}Processed_Tiffs/" # relative path from data directory to tiff directory
budj_data_dir = f"{data_dir}input/" # folder that holds the BudJ info on all cells
budding_data_path = f"{data_dir}input/buddings.txt" # budding events
kario_data_path = f"{data_dir}input/kariokinesis.txt" # kariokynesis events
output_dir = f"{data_dir}output/" # relative path from data directory to image output directory
###############
### GLOBALS ###
###############
bloc_size_frac_to_use = 0.095 # fraction of the total cell mask pixels that is used as local thresholding block size
offset_to_use = -15 # offset for local thresholding
pd.options.mode.chained_assignment = None # default='warn'
#################
### FUNCTIONS ###
#################
def get_nuc_thresh_mask(imageGFP_at_frame, whole_cell_mask):
imageGFP_cell_mask = imageGFP_at_frame * whole_cell_mask # keep only data within the whole cell mask
num_cell_pixels = np.count_nonzero(whole_cell_mask == True) # count number of pixels in that mask
bloc_size_cell_size_dependent = round_up_to_odd(bloc_size_frac_to_use * num_cell_pixels)
nucl_thresh_mask_local = threshold_local(
image=imageGFP_cell_mask,
block_size=bloc_size_cell_size_dependent,
offset=offset_to_use
)
imageGFP_nuc_mask_local = remove_small_objects(imageGFP_cell_mask > nucl_thresh_mask_local)
return imageGFP_nuc_mask_local
def get_data_for_single_cell(image_gfp, single_cell_data):
"""
This function, for the given cell, calculates the GFP values for every time-frame.
:param image_gfp:
:param single_cell_data:
:return:
"""
GFP_total, GFP_cyto, GFP_nucleus, GFP_ratio, GFP_std = [], [], [], [], [] # GFP lists
for t in single_cell_data.TimeID:
t_in_tiff = t - 1 # skew the time by one for tiff dataframe
imageGFP_at_frame = image_gfp[t_in_tiff, :, :] # get the GFP data
nrows, ncols = imageGFP_at_frame.shape
# WHOLE CELL #
# get whole-cell mask based on budj data
whole_cell_mask, x_pos, y_pos = get_whole_cell_mask(t, single_cell_data, imageGFP_at_frame.shape)
# get mean GFP signal and the standard deviation
mean_GFP = np.mean(imageGFP_at_frame[whole_cell_mask == True])
if mean_GFP < 5:
mean_GFP = np.nan
GFP_total.append(mean_GFP)
GFP_std.append(np.std(imageGFP_at_frame[whole_cell_mask == True]))
# NUCLEUS #
imageGFP_nuc_mask_local = get_nuc_thresh_mask(imageGFP_at_frame, whole_cell_mask)
# NUCLEAR / CYTO RFP CALCULATIONS #
cyto_mean, nucleus_mean = get_nuc_and_cyt_gfp_av_signal(imageGFP_at_frame, imageGFP_nuc_mask_local, ncols,
nrows, whole_cell_mask)
GFP_nucleus.append(nucleus_mean)
GFP_cyto.append(cyto_mean)
GFP_ratio.append(nucleus_mean / cyto_mean)
return GFP_total, GFP_std, GFP_cyto, GFP_nucleus, GFP_ratio
def get_data_for_all_cells():
"""
The most complex, and by far most compute intensive, function of this script. It goes through all tiff movies and
all cells, and for each cell it loops over all frames in the movie. For every frame, local thresholding is
performed. Based on that, the GFP datapoints are generated.
:return:
"""
budj_data = load_all_budj_data(data_dir)
tiff_images = read_images(tiff_files_dir)
individual_cells = sorted([x for x in budj_data["Cell_pos"].unique()])
nth_cell = 0
final_data = pd.DataFrame({})
# for every cell that was tracked using BudJ, we generate the data
for cell in individual_cells:
print(f"Generating data ( {round(nth_cell / len(individual_cells) * 100)}% )", end="\r", flush=True)
# get right image from images dictionary and select the right channel (GFP)
image = tiff_images[cell[3:5]]
image = cv2.convertScaleAbs(image, alpha=1.2, beta=0) # enhance contrast
image_gfp = image[:, :, :, 1]
# get this cell it's budj data
cell_data = budj_data[budj_data["Cell_pos"] == cell]
# get the GFP datapoints for every frame that this cell was tracked
GFP_total, GFP_std, GFP_cyto, GFP_nucleus, GFP_ratio = get_data_for_single_cell(image_gfp, cell_data)
# store data in this cell its dataframe
cell_data.loc[:, "GFP_total"] = pd.Series(GFP_total, index=cell_data.index)
cell_data.loc[:, "GFP_std"] = pd.Series(GFP_std, index=cell_data.index)
cell_data.loc[:, "GFP_nucleus"] = pd.Series(GFP_nucleus, index=cell_data.index)
cell_data.loc[:, "GFP_cyto"] = pd.Series(GFP_cyto, index=cell_data.index)
cell_data.loc[:, "GFP_ratio"] = pd.Series(GFP_ratio, index=cell_data.index)
# store this cell's data in the final dataframe which eventually contains data of all cells
final_data = pd.concat([final_data, cell_data])
nth_cell += 1
# reset the index and save the final dataframe
final_data = final_data.reset_index(drop=True)
final_data.to_excel(f"{output_dir}excel/sfp1_GFP_only_data.xlsx")
print("Generating data.. Done!")
return final_data
def split_cycles_and_interpolate(final_data, kario_events):
"""
Function responsible splitting the data of each cell on karyokinesis events, where after the data per
cycle is interpolated to 100 datapoints.
:param final_data:
:param kario_events:
:return interpolated_dataframes: a list with the interpolated dataframes
"""
print("Performing interpolation on data..", end="\r", flush=True)
individual_cells = sorted(list(set(final_data["Cell_pos"])))
cycle_durations = []
min_dp_for_int = 10
desired_datapoints = 100
cols = ["cell", "cycle", "start", "end"]
cols += range(desired_datapoints)
interpolated_dataframes = {}
tot_under = 0
for data_type in ["GFP_total", "GFP_nucleus", "GFP_cyto", "GFP_ratio"]:
data_interpolated = pd.DataFrame(columns=cols)
for cell in individual_cells:
single_cell_data = final_data[final_data.Cell_pos == cell]
single_cell_data = single_cell_data[~single_cell_data.GFP_nucleus.isnull()]
count = 0
while count < len(kario_events[cell]) - 1:
tp1, tp2 = kario_events[cell][count], kario_events[cell][count + 1]
cycle_durations.append(tp2 - tp1)
cell_cycle_dat = single_cell_data[single_cell_data.TimeID.between(tp1, tp2)]
# when there are not enough datapoints (because of outlier removal) for this cycle, ignore it
if len(cell_cycle_dat) < min_dp_for_int:
tot_under += 1
count += 1
continue
# get the original x and y data
x = np.linspace(0, len(cell_cycle_dat), len(cell_cycle_dat))
y = cell_cycle_dat[data_type]
# interpolate the data towards 100 datapoints
x_new = np.linspace(0, len(cell_cycle_dat), desired_datapoints)
f2 = interpolate.interp1d(x, y, kind='linear')
# add the interpolated data to a dataframe
data_interpolated.loc[len(data_interpolated)] = [cell, count + 1, tp1, tp2] + f2(x_new).tolist()
count += 1
data_interpolated.to_excel(f"{output_dir}excel/cycles_interpolated_{data_type}s.xlsx")
interpolated_dataframes[data_type] = data_interpolated
av_cycle_duration = np.average(cycle_durations)
print("Performing interpolation on data.. Done!")
print(f"There were {tot_under} cycles removed that had not enough datapoints (less than {min_dp_for_int}).")
print(f"Average cycle duration: {av_cycle_duration} frames, which is equal to {av_cycle_duration * 5} minutes.")
return interpolated_dataframes
def main():
print("Getting started!")
tic = time.perf_counter() # start counter
create_excel_dir(data_dir)
# if dataset already exists, prevent generating this again and load it
final_dataframe_path = f"{output_dir}excel/sfp1_GFP_only_data.xlsx"
if not os.path.exists(final_dataframe_path):
final_data = get_data_for_all_cells()
else:
print("Data has been generated already. The output file exists.")
final_data = pd.read_excel(final_dataframe_path)
# load event data
kario_events, budding_events = load_events(kario_data_path, budding_data_path)
# check if cycles have been split and interpolated, if not, do this
interpolated_dataframes = split_cycles_and_interpolate(final_data, kario_events)
# generate a big overview image that shows gfp intensities
generate_plots.dynamics_overview_all_cells(final_data, kario_events, budding_events, output_dir, "GFP")
# generate a plot per interpolated cycle
generate_plots.separate_interpolated_cycles(interpolated_dataframes, output_dir, "GFP")
generate_plots.averaged_plots(interpolated_dataframes, output_dir, "GFP")
toc = time.perf_counter()
secs = round(toc - tic, 4)
print(f"Done. Runtime was {secs} seconds ({round(secs / 60, 2)} minutes)")
# SCRIPT STARTS HERE
if __name__ == '__main__':
try:
main()
sys.exit(0)
except Exception:
print("\nSomething went wrong while running the model:\n", traceback.format_exc())
sys.exit(1)