-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_unique_proteins.py
More file actions
141 lines (79 loc) · 3.87 KB
/
extract_unique_proteins.py
File metadata and controls
141 lines (79 loc) · 3.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
"""
Created on Wed Mar 19 14:58:58 2025
@author: shagh
"""
"""
import os
import pandas as pd
def extract_unique_protein_values(folder_path):
# Get all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
if not csv_files:
print("No CSV files found in the folder.")
return
for file in csv_files:
file_path = os.path.join(folder_path, file)
try:
# Read the CSV file
df = pd.read_csv(file_path)
# Extract unique values if the columns exist
if 'PROTEIN_NUMBER' in df.columns:
unique_protein_numbers = df['PROTEIN_NUMBER'].dropna().unique().tolist()
# Count occurrences of each unique PROTEIN_NUMBER
protein_number_counts = df['PROTEIN_NUMBER'].value_counts().to_dict()
else:
unique_protein_numbers = []
protein_number_counts = {}
#unique_protein_ids = df['PROTEIN_ID'].dropna().unique().tolist() if 'PROTEIN_ID' in df.columns else []
unique_protein_ids = df['TARGET_ID'].dropna().unique().tolist() if 'TARGET_ID' in df.columns else []
# Print results
print(f"\nFile: {file}")
print(f"Unique PROTEIN_NUMBER values: {unique_protein_numbers}")
print(f"Number of rows per PROTEIN_NUMBER: {list(protein_number_counts.values())}")
print(f"Unique PROTEIN_ID values: {unique_protein_ids}")
except Exception as e:
print(f"Error reading {file}: {e}")
# Example usage
folder_path = r"D:\0000-UHN\03-DataAndCodes\Data\ASMS\EASMS-7March\RawData" # Replace with your actual folder path
extract_unique_protein_values(folder_path)
"""
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
import os
import pandas as pd
#### Check data
def extract_unique_protein_values(folder_path):
    """Print a TARGET_ID summary for every CSV file in a folder.

    For each CSV file directly inside ``folder_path`` (sub-folders are not
    searched) the function prints the file name, the number of rows per
    distinct ``TARGET_ID`` value and the list of distinct ``TARGET_ID``
    values.  Missing values are excluded from both.  A file without a
    ``TARGET_ID`` column is reported with an empty dict and an empty list.

    Files are processed in sorted name order so repeated runs produce the
    same report, and the ``.csv`` extension is matched case-insensitively.
    A file that cannot be read is reported and skipped; the remaining files
    are still processed.

    Args:
        folder_path: Path of the folder to scan.

    Returns:
        None.  All results are written to standard output.

    Raises:
        OSError: If ``folder_path`` cannot be listed (for example, it does
            not exist or is not a directory).
    """
    # os.listdir returns entries in arbitrary order; sort for a stable report.
    # Only regular files are kept so a directory named "*.csv" is not handed
    # to pandas.
    csv_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith('.csv')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    if not csv_files:
        print("No CSV files found in the folder.")
        return

    for file in csv_files:
        file_path = os.path.join(folder_path, file)

        # Best-effort: one unreadable file must not abort the whole scan.
        # Only the read is guarded so the message below stays accurate.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue

        if 'TARGET_ID' in df.columns:
            # Drop missing IDs once; value_counts() ignores them by default,
            # so the counts are the same as on the raw column.
            target_ids = df['TARGET_ID'].dropna()
            unique_target_ids = target_ids.unique().tolist()
            target_id_counts = target_ids.value_counts().to_dict()
        else:
            unique_target_ids = []
            target_id_counts = {}

        # Print results
        print(f"\nFile: {file}")
        print(f"Number of rows per TARGET_ID: {target_id_counts}")
        print(f"Unique TARGET_ID values: {unique_target_ids}")
# Example usage
folder_path = r"D:\0000-UHN\03-DataAndCodes\Data\ASMS\EASMS-7March\RawData"  # Replace with your actual folder path

# Run the scan only when this file is executed as a script, so importing the
# module for its function does not read from the hard-coded folder above.
if __name__ == "__main__":
    extract_unique_protein_values(folder_path)