forked from DarkMachines/collider-unsupervised-learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphenom_data_read_in.py
71 lines (56 loc) · 2.18 KB
/
phenom_data_read_in.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os

import numpy as np
import pandas as pd
def csv_to_df(directory):
    """Read every event file in *directory* into a single pandas DataFrame.

    Each line of each file becomes one row.  Both ``;`` and ``,`` are treated
    as field separators, and trailing separators / the newline are dropped
    before splitting.  No type conversion is done: every field stays a string.

    The first five columns are named ``event_ID``, ``process_ID``,
    ``event_weight``, ``MET`` and ``MET_Phi``; the remaining columns are named
    in groups of five (``obj<i>``, ``E<i>``, ``pt<i>``, ``eta<i>``,
    ``phi<i>``), with as many groups as the widest line requires.  Rows that
    are shorter than the widest line get NaN in the missing columns.

    Side effect: the resulting frame is also pickled to ``my_df.pkl`` in the
    current working directory.  Avoid pointing *directory* at the working
    directory itself, since a later call would then try to parse the pickle
    as an event file.

    Args:
        directory: Path of the directory containing the event files.
            Sub-directories are ignored.

    Returns:
        The DataFrame described above.

    Raises:
        ValueError: If no lines were found in *directory*.
    """
    base_columns = ['event_ID', 'process_ID', 'event_weight', 'MET', 'MET_Phi']
    object_fields = ('obj', 'E', 'pt', 'eta', 'phi')

    # os.scandir() yields entries in arbitrary order and holds an OS handle:
    # close it promptly and sort so the row order is reproducible.
    with os.scandir(directory) as entries:
        paths = sorted(entry.path for entry in entries if entry.is_file())

    rows = []
    for path in paths:
        with open(path, 'r') as file:
            for line in file:
                # Normalise ';' to ',' so one split handles both separators.
                fields = line.replace(';', ',').rstrip(',\n').split(',')
                rows.append(fields)

    if not rows:
        raise ValueError('no event lines found in {!r}'.format(directory))

    # The widest line determines how many per-object column groups exist.
    max_col_num = max(len(fields) for fields in rows)
    n_objects = (max_col_num - len(base_columns)) // len(object_fields)

    col_names = list(base_columns)
    for i in range(1, n_objects + 1):
        col_names.extend(field + str(i) for field in object_fields)

    # pandas pads short rows itself.  If the widest line does not follow the
    # 5 + 5*k layout there are fewer names than columns and pandas rejects it.
    df = pd.DataFrame(rows, columns=col_names)
    # Turn the padding placeholders into NaN.  (The old ``pd.np`` alias no
    # longer exists in current pandas, so use a plain float NaN.)
    df.fillna(value=float('nan'), inplace=True)

    # Pickle the dataframe so it can be reloaded without re-parsing.
    p_path = 'my_df.pkl'
    df.to_pickle(p_path)
    return df
##############################################################################################
def csv_to_np(directory):
    """Read every event file in *directory* into a 2-D NumPy string array.

    Each line of each file becomes one row.  Both ``;`` and ``,`` are treated
    as field separators, and trailing separators / the newline are dropped
    before splitting.  No type conversion is done: every field stays a string.

    Lines can have different numbers of fields, so rows shorter than the
    widest line are padded on the right with empty strings.  This keeps the
    result rectangular (NumPy refuses to build an array from ragged rows).

    Column names are built as ``event_ID``, ``process_ID``, ``event_weight``,
    ``MET``, ``MET_Phi`` followed by groups of five (``obj<i>``, ``E<i>``,
    ``pt<i>``, ``eta<i>``, ``phi<i>``), with as many groups as the widest
    line requires.

    Args:
        directory: Path of the directory containing the event files.
            Sub-directories are ignored.

    Returns:
        A tuple ``(data, col_names)``: ``data`` is the 2-D array of fields,
        ``col_names`` a 1-D array with the column names.

    Raises:
        ValueError: If no lines were found in *directory*.
    """
    base_columns = ['event_ID', 'process_ID', 'event_weight', 'MET', 'MET_Phi']
    object_fields = ('obj', 'E', 'pt', 'eta', 'phi')

    # os.scandir() yields entries in arbitrary order and holds an OS handle:
    # close it promptly and sort so the row order is reproducible.
    with os.scandir(directory) as entries:
        paths = sorted(entry.path for entry in entries if entry.is_file())

    rows = []
    for path in paths:
        with open(path, 'r') as file:
            for line in file:
                # Normalise ';' to ',' so one split handles both separators.
                fields = line.replace(';', ',').rstrip(',\n').split(',')
                rows.append(fields)

    if not rows:
        raise ValueError('no event lines found in {!r}'.format(directory))

    # The widest line determines how many per-object column groups exist.
    max_col_num = max(len(fields) for fields in rows)
    n_objects = (max_col_num - len(base_columns)) // len(object_fields)

    col_names = list(base_columns)
    for i in range(1, n_objects + 1):
        col_names.extend(field + str(i) for field in object_fields)

    # Pad short rows so every row has max_col_num fields before conversion.
    padded = [fields + [''] * (max_col_num - len(fields)) for fields in rows]

    # Convert everything into numpy arrays.
    data = np.array(padded)
    col_names = np.array(col_names)
    return data, col_names