analysis_of_cma_lcshs_final.py
# -*- coding: utf-8 -*-
"""analysis_of_cma_lcshs_final.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/gist/tiagochavo87/681030d1b4ae6b78c0c3f83525276767/analysis_of_cma_lcshs_final.ipynb
"""
import pandas as pd
# Read the data from the CSV file. Input is exported from ChAS (Chromosome Analysis Suite)
# with columns: File; type; Chrom; cytoband; Tamanho (size); Mark; Genes Q.; Genes Desc; OMIM; Microarray Nome..
data = pd.read_csv("lcsh_data.csv", sep=";")
# Apply a filter to exclude chromosomes X and Y
data = data.loc[(data['Chrom'] != 'X') & (data['Chrom'] != 'Y')]
# Remove the dot thousands separators and convert the Tamanho (size) column to integer
data['Tamanho'] = data['Tamanho'].str.replace('.', '', regex=False).astype(int)
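# e.g. a value exported as "12.345.678" becomes the integer 12345678
# (assumes dots are thousands separators, as in pt-BR formatted exports)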
# Filter data for autosomes with a size greater than 10,000,000
data_autosomes = data.loc[data['Chrom'].str.contains(r'^(?:[1-9]|1[0-9]|2[0-2])$')]
data_autosomes = data_autosomes[data_autosomes['Tamanho'] > 10000000]
# Write the autosomes data to a new CSV file
data_autosomes.to_csv('LCSHs_>_10.csv', sep=';', index=False)
# Filter data for regions with size greater than or equal to 3,000,000
data = data[data['Tamanho'] >= 3000000]
# Filter data for regions with sizes greater than 3,000,000 and at most 5,000,000
data_3_5 = data[(data['Tamanho'] > 3000000) & (data['Tamanho'] <= 5000000)]
# Write the 3-5 MB regions data to a new CSV file
data_3_5.to_csv('LCSHs_>_3_<_5.csv', sep=';', index=False)
# Filter data for 3-5 MB regions with a total size of at least 10,000,000 and only one region per file
data_3_5 = data_3_5.groupby('File').filter(lambda x: len(x) == 1 and x['Tamanho'].sum() >= 10000000)
# Filter data for regions with size greater than 5,000,000
data_5 = data[data['Tamanho'] > 5000000]
# Filter data for regions with a total size of at least 10,000,000 and only one chromosome per file
data_5 = data_5.groupby('File').filter(lambda x: len(x.groupby('Chrom')) == 1 and x['Tamanho'].sum() >= 10000000)
# Write the 5 MB regions data to a new CSV file
data_5.to_csv('possibles_UPDs.csv', sep=';', index=False)
# Filter data for cases with at least one LCSH greater than 5 MB
data_5_mb = data[data['Tamanho'] > 5000000]
data_5_mb = data_5_mb.groupby('File').filter(lambda x: x[x['Tamanho'] >= 5000000].shape[0] > 0)
# Print all cases with LCSHs greater than 5 MB to a CSV file
data_5_mb.to_csv('LCSHs_>_5.csv', sep=';', index=False)
# Sum LCSHs greater than 3 MB for each case
sum_lcshs = data[data['Tamanho'] >= 3000000].groupby('File')['Tamanho'].sum()
# Create a new DataFrame with case names and total LCSHs greater than 3 MB
output = pd.DataFrame({'File': sum_lcshs.index, 'Total LCSHs > 3MB': sum_lcshs.values})
# Write the output DataFrame to a CSV file
output.to_csv('total_lcshs.csv', sep=';', index=False)
# Total size of human autosomal DNA in base pairs (reference constant)
total_autosomal_dna = 2881000000
# Calculate the proportion of LCSHs greater than 3 MB for each case
proportions = []
for file, size in sum_lcshs.items():
    proportion = size / total_autosomal_dna
    proportions.append({'File': file, 'Total LCSHs > 3MB': size, 'Proportion of LCSHs > 3MB to Autosomal DNA': proportion})
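# Illustrative example: a case carrying 100,000,000 bp of LCSHs > 3 MB gives
# 100000000 / 2881000000 ≈ 0.0347, i.e. ~3.5% of the autosomal genome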
# Create a new DataFrame with the proportion for each case
proportions_df = pd.DataFrame(proportions)
# Write the output DataFrame to a CSV file
proportions_df.to_csv('lcsh_proportions.csv', sep=';', index=False)
"""This code imports the NumPy and Pandas libraries and uses them to calculate inbreeding coefficients based on certain proportions of data.
The first step is to define an array of values for F (inbreeding coefficient) that the code will use. These values are defined as percentages and then divided by 100.
Next, the code reads in a CSV file containing proportions of data and stores them in a Pandas DataFrame.
Then, the code iterates through each row of the DataFrame and calculates the F value that is closest to the proportion of LCSHs (Library of Congress Subject Headings) greater than 3MB to Autosomal DNA. This is done using NumPy's argmin() function to find the index of the closest F value in the array.
After calculating the closest F value for each row, the code creates a new Pandas DataFrame with the data transposed so that the F values are columns and the file names are rows.
Finally, the transposed DataFrame is written to a new CSV file along with the original proportions DataFrame.
Overall, this code is useful for calculating inbreeding coefficients based on specific proportions of data, which can be helpful in genetic research.
Output:
There are two output files: 'lcsh_transposed_proportions.csv' and 'inbreeding_coefficients.csv'. The first file contains the transposed proportions DataFrame, and the second file contains the original proportions DataFrame with an additional column for the closest F value.
"""
import numpy as np
# Define the F values
f_values = np.array([0.5, 1.5, 3, 6, 12.5, 25]) / 100
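# -> candidate coefficients F = 0.005, 0.015, 0.03, 0.06, 0.125, 0.25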
# Get the data proportions
proportions = pd.read_csv('lcsh_proportions.csv', sep=';')
# Calculate the F values closest to the proportions
for index, row in proportions.iterrows():
    proportion = row['Proportion of LCSHs > 3MB to Autosomal DNA']
    closest_f = f_values[np.abs(f_values - proportion).argmin()]
    proportions.at[index, 'Inbreeding Coefficient (F)'] = closest_f
# Transpose the cases so that each predefined F value gets its own column
transposed_proportions = pd.DataFrame(columns=['File', '0.5%', '1.5%', '3%', '6%', '12.5%', '25%'])
for index, row in proportions.iterrows():
    file = row['File']
    f_value = row['Inbreeding Coefficient (F)']
    transposed_proportions.at[index, 'File'] = file
    # Format without a trailing zero so the label matches the predefined
    # columns exactly (e.g. 0.03 -> '3%', not '3.0%')
    transposed_proportions.at[index, f'{f_value * 100:g}%'] = row['Proportion of LCSHs > 3MB to Autosomal DNA']
# Write the transposed DataFrame to a new CSV file
transposed_proportions.to_csv('lcsh_transposed_proportions.csv', sep=';', index=False)
# Write the output DataFrame to a CSV file
proportions.to_csv('inbreeding_coefficients.csv', sep=';', index=False)
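# A quick sanity check of the nearest-F lookup (illustrative only, not part of
# the original pipeline): a proportion of 0.04 lies between 0.03 and 0.06 and
# is closer to 0.03, so the lookup should return 0.03
assert f_values[np.abs(f_values - 0.04).argmin()] == 0.03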
import pandas as pd
# Load the CSV file
df = pd.read_csv("LCSHs_>_3_<_5.csv", sep=";")
# Extract the start and end coordinates from the "Microarray Nome.." column
df['start'] = df['Microarray Nome..'].str.extract(r'\((.*?)-', expand=False)
df['end'] = df['Microarray Nome..'].str.extract(r'-(.*?)\)', expand=False)
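# (assumes the coordinates appear as "(start-end)" inside that column,
# e.g. "... (12,345,678-15,987,654)")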
import re

# Alternative helper (defined but not used below) that pulls both
# coordinates out of a "(start-end)" string in a single pass
def extract_coordinates(row):
    pattern = r'\((\d+,?\d*)-(\d+,?\d*)\)'
    match = re.search(pattern, row)
    if match:
        start = match.group(1).replace(',', '.')
        end = match.group(2).replace(',', '.')
        return pd.Series([start, end])
    else:
        return pd.Series(['', ''])
# Remove the original column
df.drop('Microarray Nome..', axis=1, inplace=True)
# Create a new dataframe with the columns "Chrom", "cytoband", "start", and "end"
new_df = pd.DataFrame({
    'Chrom': df['Chrom'],
    'cytoband': df['cytoband'],
    'start': df['start'],
    'end': df['end']
})
# Strip the remaining comma separators from the coordinate strings
new_df['start'] = new_df['start'].replace(',', '', regex=True)
new_df['end'] = new_df['end'].replace(',', '', regex=True)
# Save the new dataframe to a CSV file
new_df.to_csv('nova_dataframe_fron.csv', sep=";", index=False, decimal='.')
import pandas as pd
# Read the CSV file
data = pd.read_csv('lcsh_data.csv', sep=';')
# Count the number of unique values in the "File" column
file_count = data['File'].nunique()
# Print the total number of cases in the "File" column
print(f"The total number of cases in the 'File' column is: {file_count}")
# Store the result in a variable
result0 = pd.DataFrame([file_count])
# Save the result to a CSV file
result0.to_csv('output.csv', sep=';', index=False)
# Print the result
print(result0)
import pandas as pd
# Read input file (assuming it is in csv format)
df = pd.read_csv('nova_dataframe_fron.csv', sep=";")
# Reference total used to normalize frequencies (assumed to be the number of cases in the cohort)
result = 917
# Group by chromosome and cytoband: median coordinates plus the number of
# rows (frequency) in each group
grouped_df = df.groupby(['Chrom', 'cytoband']).agg(
    start=('start', 'median'),
    end=('end', 'median'),
    frequency=('start', 'count'))
# Keep 'Chrom' and 'cytoband' as regular columns rather than an index
grouped_df = grouped_df.reset_index()
# Normalize frequency with respect to a reference value
grouped_df['normalized_frequency'] = grouped_df['frequency'] / result
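# e.g. a cytoband observed in 46 of the 917 reference cases would give
# 46 / 917 ≈ 0.050 (illustrative numbers)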
# Save result to a new csv file
grouped_df.to_csv('resultado.csv', index=False)
# Find cytobands whose absolute frequency is at least 0.5% of all rows
absolute_frequency = grouped_df[grouped_df['frequency'] >= 0.005 * len(df)]
# Save cases to a new csv file
absolute_frequency.to_csv('frequencia_absoluta.csv', sep=";", index=False, decimal='.')
import pandas as pd
# Read the input file (assumed to be in CSV format)
df = pd.read_csv('nova_dataframe_fron.csv', sep=";")
# Reference total used to normalize frequencies (assumed to be the number of cases in the cohort)
result = 917
# Group by chromosome and cytoband: median coordinates plus the number of
# rows (frequency) in each group
grouped_df = df.groupby(['Chrom', 'cytoband']).agg(
    start=('start', 'median'),
    end=('end', 'median'),
    frequency=('start', 'count'))
# Keep 'Chrom' and 'cytoband' as regular columns rather than an index
grouped_df = grouped_df.reset_index()
# Normalize the frequency against the reference value
grouped_df['normalized_frequency'] = grouped_df['frequency'] / result
# Zero-pad the 'start' and 'end' columns to nine digits
grouped_df[['start', 'end']] = grouped_df[['start', 'end']].astype(int).applymap('{:09d}'.format)
# Compute the size in kilobases (kb)
grouped_df['Size'] = ((grouped_df['end'].astype(int) - grouped_df['start'].astype(int)) / 1000).astype(int)
# Save the result to a new csv file
grouped_df.to_csv('resultado.csv', index=False)
# Find cytobands whose absolute frequency is at least 0.5% of all rows
frequencia_absoluta = grouped_df[grouped_df['frequency'] >= 0.005 * len(df)].copy()
# Round the normalized_frequency column to three decimal places
frequencia_absoluta['normalized_frequency'] = frequencia_absoluta['normalized_frequency'].round(decimals=3)
# Save cases to a new csv file
frequencia_absoluta.to_csv('frequencia_absoluta.csv', sep=";", index=False, decimal='.')
print(frequencia_absoluta)