forked from sjteresi/TE_Density_Old
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Discrete_Windows.py
139 lines (102 loc) · 5.54 KB
/
Discrete_Windows.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# By Scott Teresi
# =======================================================
import sys
import pandas as pd
import numpy as np
import os
import multiprocessing
from multiprocessing import Process
from threading import Thread
import time
from collections import defaultdict
def discrete(current_density, current_window, previous_density, step = 500):
'''Mathematical function to compute the change in density between windows'''
discrete_density = (current_density * current_window) / (current_window - step) - previous_density # formula for the density calculation to remove previous window
return discrete_density # return value
def set_up(file_name1, file_name2):
df1=pd.read_csv(file_name1, sep=',', delimiter = None, header='infer', engine='python') # read the data
df2=pd.read_csv(file_name2, sep=',', delimiter = None, header='infer', engine='python')
df1.replace('', np.NaN) # for objects that have no value, replace with NULL value in the PandaFrame
df2.replace('', np.NaN)
df1_length = len(df1.index) # get length for later iteration
df2_length = len(df2.index)
if df1_length != df2_length: # value check to makemsure PandaFrames are the same length. This is expected
raise ValueError
TEs = list(df1) # generate fieldnames list from PandaFrame
list_to_remove = ['maker_name','chromosome','start','stop','length','prox_left','prox_right','they_are_inside','number','window_size']
# fieldnames to remove from fieldnames list
for item in list_to_remove:
TEs.remove(item) # we will only run the calculations on the filtered fieldnames
for i in range(0,df1_length): # iterate over range of numbers to correspond to PandaFrame
for item in TEs: # iterate over acceptable fieldnames
new_col_name = 'Change_From_Last_Window '+ item # generate fieldname for new data column
current_density = np.float(df2.at[i,item]) # declare as Numpy object, density equal to fieldname (item) density at ith row
current_window = np.int(df2.at[i,'window_size']) # declare as Numpy object, window equal to fieldname (item) window size at ith row (same for whole file)
# TO DO:
#Could streamline to declare once
previous_density = np.float(df1.at[i,item]) # declare as Numpy object, previous density equal to item density at ith row
difference = discrete(current_density, current_window, previous_density) # calculate the difference from previous window and assign it to difference
df2.at[i,new_col_name] = difference # Add to df2 a new column, at the ith row, the value of difference
if df1.at[0,'window_size'] == 500: # Add the columns to 500 bp files, just to have the columns, and fill the values with NULL
for i in range(0,df1_length):
for item in TEs:
new_col_name = 'Change_From_Last_Window '+ item
df1.at[i,new_col_name] = np.nan
# TO DO
# drop the unneccessary columns since I only want the discrete jumps, to save space and RAM when computing
#list_to_remove_2 = list_to_remove.remove('maker_name')
#['maker_name','chromosome','start','stop','length','prox_left','prox_right','they_are_inside','number','window_size']
#df1.drop(columns = ['chromosome','start','stop','length','prox_left','prox_right','they_are_inside','number','window_size'])
#df2.drop(columns = ['chromosome','start','stop','length','prox_left','prox_right','they_are_inside','number','window_size'])
df1.to_csv('Discrete_' + file_name1) # write the PandaFrames to a csv
df2.to_csv('Discrete_' + file_name2)
def show_status(status_queue):
"""Print to screen the percent completion of each response."""
time.sleep(10) # MAGIC NUMBER wait for printouts from the process start
status_map = defaultdict(tuple)
while True: # MAGIC NUMBER make sure to use a daemon if no kill event
response = status_queue.get(block=True)
try:
percent, my_id = response
except ValueError as ve:
msg = ("show_status expecting 3 values in response but got {}"
.format(response))
logger.debug(msg)
continue
status_map[str(my_id)] = percent
status = "percent complete: "
for id, complete in status_map.items():
status += id + ": {:.2f}".format(complete) + ", "
sys.stdout.write('\r')
sys.stdout.write(status)
sys.stdout.flush()
def run_all():
os.chdir('CAMDATA')
my_inputs = ['xx00','xx01','xx02','xx03','xx04','xx05','xx06'] # determine the chromosome headings of files to loop over
count = 500 # the jump in windows of the file scheme
my_dict = {}
for item in my_inputs:
last = str(count+500) + '_' + item + '_density_data.csv'
while count <= 10000:
my_file = str(count) + '_' + item + '_density_data.csv'
my_dict[last] = str(count) + '_' + item + '_density_data.csv'
count += 500
last = my_file
count = 500
file_list = []
for item in my_inputs:
file_list.append(item[0])
status_queue = multiprocessing.Queue()
status_thread = Thread(target=show_status, args=(status_queue,))
status_thread.daemon = True
p_list = []
for f_name in my_dict.keys():
p = Process(target=set_up, args=(f_name, my_dict[f_name],))
p.start()
p_list.append(p)
status_thread.start()
for p in p_list:
current_pid = p.pid
p.join(None)
print("pid '{}' joined!".format(current_pid))
run_all()