-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathseparate_protein_files.py
More file actions
53 lines (39 loc) · 1.67 KB
/
separate_protein_files.py
File metadata and controls
53 lines (39 loc) · 1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 28 10:10:24 2025
@author: shay
"""
import os
import pandas as pd
def split_protein_data(file_path, subfolder):
    """
    Split a CSV file into one CSV per distinct 'TARGET_ID' value.

    Each output file is named ``<TARGET_ID>_AsmBatchNumber<ASMS_BATCH_NUM>.csv``,
    where the batch number is taken from the first row of the group.

    Args:
        file_path (str): Path to the input CSV file.
        subfolder (str): Path to the folder where separated files should be
            stored. It is created if it does not exist yet.

    Returns:
        list: List of file paths for the separated CSV files, one per
            TARGET_ID group, in the (sorted) order produced by ``groupby``.

    Raises:
        ValueError: If any of the columns 'PROTEIN_NUMBER', 'ASMS_BATCH_NUM'
            or 'TARGET_ID' is missing from the input file.

    Note:
        Rows whose 'TARGET_ID' is missing (NaN) do not belong to any group
        under pandas' default ``groupby`` settings and are therefore not
        written to any output file.
    """
    # Load the CSV file
    print(file_path)
    df = pd.read_csv(file_path)

    # Ensure necessary columns exist before touching the filesystem
    required_columns = {"PROTEIN_NUMBER", "ASMS_BATCH_NUM", "TARGET_ID"}
    if not required_columns.issubset(df.columns):
        raise ValueError(f"Missing required columns: {required_columns - set(df.columns)}")

    # Make sure the destination exists; to_csv does not create directories.
    os.makedirs(subfolder, exist_ok=True)

    # List of output file paths, in the order they are written
    separated_files = []

    # Group by TARGET_ID and write each group to its own file
    for target_id, group_df in df.groupby("TARGET_ID"):
        # A group may span several batch numbers; the first one names the file.
        batch_number = group_df["ASMS_BATCH_NUM"].iloc[0]

        # Construct the filename from the group key and its batch number
        file_name = f"{target_id}_AsmBatchNumber{batch_number}.csv"
        output_path = os.path.join(subfolder, file_name)

        # Save the separated file
        group_df.to_csv(output_path, index=False)
        separated_files.append(output_path)
        print(f" Saved separated file: {output_path}")

    return separated_files