analysis_of_cma_lcshs_final.py
# -*- coding: utf-8 -*-
"""analysis_of_cma_lcshs_final.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/gist/tiagochavo87/681030d1b4ae6b78c0c3f83525276767/analysis_of_cma_lcshs_final.ipynb
"""
import pandas as pd
# Read the data from the CSV file. Input is exported from ChAS (Chromosome Analysis Suite)
# with columns: File; type; Chrom; cytoband; Tamanho (size); Mark; Genes Q.; Genes Desc; OMIM; Microarray Nome..
data = pd.read_csv("lcsh_data.csv", sep=";")
# Apply a filter to exclude chromosomes X and Y
data = data.loc[(data['Chrom'] != 'X') & (data['Chrom'] != 'Y')]
# Remove the dot thousands separators and convert the Tamanho (size) column to integer
data['Tamanho'] = data['Tamanho'].str.replace('.', '', regex=False).astype(int)
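# e.g. a value exported as "12.345.678" becomes the integer 12345678
# (assumes dots are thousands separators, as in pt-BR formatted exports)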
# Filter data for autosomes with a size greater than 10,000,000
data_autosomes = data.loc[data['Chrom'].str.contains(r'^(?:[1-9]|1[0-9]|2[0-2])$')]
data_autosomes = data_autosomes[data_autosomes['Tamanho'] > 10000000]
# Write the autosomes data to a new CSV file
data_autosomes.to_csv('LCSHs_>_10.csv', sep=';', index=False)
# Filter data for regions with size greater than or equal to 3,000,000
data = data[data['Tamanho'] >= 3000000]
# Filter data for regions with sizes greater than 3,000,000 and at most 5,000,000
data_3_5 = data[(data['Tamanho'] > 3000000) & (data['Tamanho'] <= 5000000)]
# Write the 3-5 MB regions data to a new CSV file
data_3_5.to_csv('LCSHs_>_3_<_5.csv', sep=';', index=False)
# Filter data for 3-5 MB regions with a total size of at least 10,000,000 and only one region per file
data_3_5 = data_3_5.groupby('File').filter(lambda x: len(x) == 1 and x['Tamanho'].sum() >= 10000000)
# Filter data for regions with size greater than 5,000,000
data_5 = data[data['Tamanho'] > 5000000]
# Filter data for regions with a total size of at least 10,000,000 and only one chromosome per file
data_5 = data_5.groupby('File').filter(lambda x: len(x.groupby('Chrom')) == 1 and x['Tamanho'].sum() >= 10000000)
# Write the 5 MB regions data to a new CSV file
data_5.to_csv('possibles_UPDs.csv', sep=';', index=False)
# Filter data for cases with at least one LCSH greater than 5 MB
data_5_mb = data[data['Tamanho'] > 5000000]
data_5_mb = data_5_mb.groupby('File').filter(lambda x: x[x['Tamanho'] >= 5000000].shape[0] > 0)
# Print all cases with LCSHs greater than 5 MB to a CSV file
data_5_mb.to_csv('LCSHs_>_5.csv', sep=';', index=False)
# Sum LCSHs greater than 3 MB for each case
sum_lcshs = data[data['Tamanho'] >= 3000000].groupby('File')['Tamanho'].sum()
# Create a new DataFrame with case names and total LCSHs greater than 3 MB
output = pd.DataFrame({'File': sum_lcshs.index, 'Total LCSHs > 3MB': sum_lcshs.values})
# Write the output DataFrame to a CSV file
output.to_csv('total_lcshs.csv', sep=';', index=False)
# Total size of human autosomal DNA in base pairs (reference constant)
total_autosomal_dna = 2881000000
# Calculate the proportion of LCSHs greater than 3 MB for each case
proportions = []
for file, size in sum_lcshs.items():
    proportion = size / total_autosomal_dna
    proportions.append({'File': file, 'Total LCSHs > 3MB': size, 'Proportion of LCSHs > 3MB to Autosomal DNA': proportion})
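# Illustrative example: a case carrying 100,000,000 bp of LCSHs > 3 MB gives
# 100000000 / 2881000000 ≈ 0.0347, i.e. ~3.5% of the autosomal genome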
# Create a new DataFrame with the proportion for each case
proportions_df = pd.DataFrame(proportions)
# Write the output DataFrame to a CSV file
proportions_df.to_csv('lcsh_proportions.csv', sep=';', index=False)
"""This code imports the NumPy and Pandas libraries and uses them to calculate inbreeding coefficients based on certain proportions of data.
The first step is to define an array of values for F (inbreeding coefficient) that the code will use. These values are defined as percentages and then divided by 100.
Next, the code reads in a CSV file containing proportions of data and stores them in a Pandas DataFrame.
Then, the code iterates through each row of the DataFrame and calculates the F value that is closest to the proportion of LCSHs (Library of Congress Subject Headings) greater than 3MB to Autosomal DNA. This is done using NumPy's argmin() function to find the index of the closest F value in the array.
After calculating the closest F value for each row, the code creates a new Pandas DataFrame with the data transposed so that the F values are columns and the file names are rows.
Finally, the transposed DataFrame is written to a new CSV file along with the original proportions DataFrame.
Overall, this code is useful for calculating inbreeding coefficients based on specific proportions of data, which can be helpful in genetic research.
Output:
There are two output files: 'lcsh_transposed_proportions.csv' and 'inbreeding_coefficients.csv'. The first file contains the transposed proportions DataFrame, and the second file contains the original proportions DataFrame with an additional column for the closest F value.
"""
import numpy as np
# Define the F values
f_values = np.array([0.5, 1.5, 3, 6, 12.5, 25]) / 100
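# -> candidate coefficients F = 0.005, 0.015, 0.03, 0.06, 0.125, 0.25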
# Get the data proportions
proportions = pd.read_csv('lcsh_proportions.csv', sep=';')
# Calculate the F values closest to the proportions
for index, row in proportions.iterrows():
    proportion = row['Proportion of LCSHs > 3MB to Autosomal DNA']
    closest_f = f_values[np.abs(f_values - proportion).argmin()]
    proportions.at[index, 'Inbreeding Coefficient (F)'] = closest_f
# Transpose the cases so that each predefined F value gets its own column
transposed_proportions = pd.DataFrame(columns=['File', '0.5%', '1.5%', '3%', '6%', '12.5%', '25%'])
for index, row in proportions.iterrows():
    file = row['File']
    f_value = row['Inbreeding Coefficient (F)']
    transposed_proportions.at[index, 'File'] = file
    # Format without a trailing zero so the label matches the predefined
    # columns exactly (e.g. 0.03 -> '3%', not '3.0%')
    transposed_proportions.at[index, f'{f_value * 100:g}%'] = row['Proportion of LCSHs > 3MB to Autosomal DNA']
# Write the transposed DataFrame to a new CSV file
transposed_proportions.to_csv('lcsh_transposed_proportions.csv', sep=';', index=False)
# Write the output DataFrame to a CSV file
proportions.to_csv('inbreeding_coefficients.csv', sep=';', index=False)
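# A quick sanity check of the nearest-F lookup (illustrative only, not part of
# the original pipeline): a proportion of 0.04 lies between 0.03 and 0.06 and
# is closer to 0.03, so the lookup should return 0.03
assert f_values[np.abs(f_values - 0.04).argmin()] == 0.03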
import pandas as pd
# Load the CSV file
df = pd.read_csv("LCSHs_>_3_<_5.csv", sep=";")
# Extract the start and end coordinates from the "Microarray Nome.." column
df['start'] = df['Microarray Nome..'].str.extract(r'\((.*?)-', expand=False)
df['end'] = df['Microarray Nome..'].str.extract(r'-(.*?)\)', expand=False)
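# (assumes the coordinates appear as "(start-end)" inside that column,
# e.g. "... (12,345,678-15,987,654)")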
import re

# Alternative helper (defined but not used below) that pulls both
# coordinates out of a "(start-end)" string in a single pass
def extract_coordinates(row):
    pattern = r'\((\d+,?\d*)-(\d+,?\d*)\)'
    match = re.search(pattern, row)
    if match:
        start = match.group(1).replace(',', '.')
        end = match.group(2).replace(',', '.')
        return pd.Series([start, end])
    else:
        return pd.Series(['', ''])
# Remove the original column
df.drop('Microarray Nome..', axis=1, inplace=True)
# Create a new dataframe with the columns "Chrom", "cytoband", "start", and "end"
new_df = pd.DataFrame({
    'Chrom': df['Chrom'],
    'cytoband': df['cytoband'],
    'start': df['start'],
    'end': df['end']
})
# Strip the remaining comma separators from the coordinate strings
new_df['start'] = new_df['start'].replace(',', '', regex=True)
new_df['end'] = new_df['end'].replace(',', '', regex=True)
# Save the new dataframe to a CSV file
new_df.to_csv('nova_dataframe_fron.csv', sep=";", index=False, decimal='.')
import pandas as pd
# Read the CSV file
data = pd.read_csv('lcsh_data.csv', sep=';')
# Count the number of unique values in the "File" column
file_count = data['File'].nunique()
# Print the total number of cases in the "File" column
print(f"The total number of cases in the 'File' column is: {file_count}")
# Store the result in a variable
result0 = pd.DataFrame([file_count])
# Save the result to a CSV file
result0.to_csv('output.csv', sep=';', index=False)
# Print the result
print(result0)
import pandas as pd
# Read input file (assuming it is in csv format)
df = pd.read_csv('nova_dataframe_fron.csv', sep=";")
# Reference total used to normalize frequencies (assumed to be the number of cases in the cohort)
result = 917
# Group by chromosome and cytoband: median coordinates plus the number of
# rows (frequency) in each group
grouped_df = df.groupby(['Chrom', 'cytoband']).agg(
    start=('start', 'median'),
    end=('end', 'median'),
    frequency=('start', 'count'))
# Keep 'Chrom' and 'cytoband' as regular columns rather than an index
grouped_df = grouped_df.reset_index()
# Normalize frequency with respect to a reference value
grouped_df['normalized_frequency'] = grouped_df['frequency'] / result
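# e.g. a cytoband observed in 46 of the 917 reference cases would give
# 46 / 917 ≈ 0.050 (illustrative numbers)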
# Save result to a new csv file
grouped_df.to_csv('resultado.csv', index=False)
# Find cytobands whose absolute frequency is at least 0.5% of all rows
absolute_frequency = grouped_df[grouped_df['frequency'] >= 0.005 * len(df)]
# Save cases to a new csv file
absolute_frequency.to_csv('frequencia_absoluta.csv', sep=";", index=False, decimal='.')
import pandas as pd
# Read the input file (assumed to be in CSV format)
df = pd.read_csv('nova_dataframe_fron.csv', sep=";")
# Reference total used to normalize frequencies (assumed to be the number of cases in the cohort)
result = 917
# Group by chromosome and cytoband: median coordinates plus the number of
# rows (frequency) in each group
grouped_df = df.groupby(['Chrom', 'cytoband']).agg(
    start=('start', 'median'),
    end=('end', 'median'),
    frequency=('start', 'count'))
# Keep 'Chrom' and 'cytoband' as regular columns rather than an index
grouped_df = grouped_df.reset_index()
# Normalize the frequency against the reference value
grouped_df['normalized_frequency'] = grouped_df['frequency'] / result
# Zero-pad the 'start' and 'end' columns to nine digits
grouped_df[['start', 'end']] = grouped_df[['start', 'end']].astype(int).applymap('{:09d}'.format)
# Compute the size in kilobases (kb)
grouped_df['Size'] = ((grouped_df['end'].astype(int) - grouped_df['start'].astype(int)) / 1000).astype(int)
# Save the result to a new csv file
grouped_df.to_csv('resultado.csv', index=False)
# Find cytobands whose absolute frequency is at least 0.5% of all rows
frequencia_absoluta = grouped_df[grouped_df['frequency'] >= 0.005 * len(df)].copy()
# Round the normalized_frequency column to three decimal places
frequencia_absoluta['normalized_frequency'] = frequencia_absoluta['normalized_frequency'].round(decimals=3)
# Save cases to a new csv file
frequencia_absoluta.to_csv('frequencia_absoluta.csv', sep=";", index=False, decimal='.')
print(frequencia_absoluta)