-
Notifications
You must be signed in to change notification settings - Fork 0
/
GlycoShape_DB_bake.py
281 lines (230 loc) · 12.6 KB
/
GlycoShape_DB_bake.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
import config
from pathlib import Path
import shutil
import logging
from datetime import datetime
import zipfile
import glob
import json
import requests
# Module-wide logging: INFO level, named after this module so records are attributable.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Static FAQ content served by the GlycoShape website; dumped verbatim to
# faq.json by save_faq_json(). Keys are the displayed questions, values the
# answers (plain text, may contain URLs and contact addresses).
faq_dict = {
"What is GlycoShape?" : "GlycoShape is an OA database of glycans 3D structural data and information that can be downloaded or used with Re-Glyco to rebuild glycoproteins from the RCSB PDB or EMBL-EBI AlphaFold repositories.",
"How to search?":"You can search by GlyTouCan ID, IUPAC, GLYCAM, WURCS, SMILES or you can draw your own glycan using draw' button in search bar and search the closest match from our database.",
"What are clusters?":"Clusters and their centroids are crucial for understanding the dynamic behavior of glycan structures in our database. A cluster groups similar conformations of glycans based on molecular dynamics simulations, simplifying the complex data from these simulations. The cluster centroid represents the most typical conformation within its cluster, offering a clear view of probable glycan shapes and orientations. This approach helps us explore and quantify the glycan's conformational space, enhancing our understanding of its biological interactions and functions.",
"How are clusters calculated?" : "The conformational ensembles from multi-microsecond MD simulations are clustered into representative conformations. To do this, a principal component analysis (PCA) is conducted on the trajectory for dimensionality reduction. Clusters from the PCA are then identified using a Gaussian mixture model (GMM), with representative structures for each conformational cluster identified from the kernel density (more details at The conformational ensembles from multi-microsecond MD simulations are clustered into representative conformations. To do this, a principal component analysis (PCA) is conducted on the trajectory for dimensionality reduction. Clusters from the PCA are then identified using a Gaussian mixture model (GMM), with representative structures for each conformational cluster identified from the kernel density (more details at https://doi.org/10.1101/2023.12.11.571101 ). ",
"What is Re-Glyco?" : "Re-Glyco is a tool we designed to restore the missing glycosylation on glycoproteins deposited in the RCSB PDB, the EBI-EMBL AlphaFold protein structure database or on your own structure file in PDB format.",
"What is GlcNAc Scan?" : "The ability of Re-Glyco to resolve steric clashes can be used within GlycoShape also to assess the potential occupancy of N-glycosylation sites through an implementation we called ‘GlcNAc Scanning’. Where Re-Glyco will try to fit a single GlcNAc monosaccharide into all the NXS/T sequons in the protein. The process outputs a list of sequons that passed the test, marked with a simple ‘yes’ or ‘no’ label.",
"Having Performance Issue?" : "The website is optimized for the Chrome browser. If you are using a different browser and experiencing slowdowns while using Re-Glyco, we recommend trying Chrome.",
"Advanced (Site-by-Site) Glycosylation" : "This option allows users to select different glycans for different residues. It is intended for use with up to 5-10 residues at a time, as using it with more may cause your browser to slow down. If you wish to glycosylate more than 10 residues, we recommend using our API through a Python script (more information at https://glycoshape.org/api-docs).",
"What is the Re-Glyco Ensemble?" : "The Ensemble option becomes available only after a successful Re-Glyco run, appearing next to the download PDB button. It outputs a multiframe PDB of the glycoprotein, along with the SASA (Solvent Accessible Surface Area) of the protein, taking into account the influence of the glycans.",
"Experiencing timeout with Re-Glyco job?":"We have set a 4000-second hard timeout for any job with Re-Glyco. If you wish to process an extremely large system, we recommend contacting us directly via email at elisa.fadda@soton.ac.uk. We would love to help and build your system.",
"I would like a new feature!":"Please contact us at elisa.fadda@soton.ac.uk, we would be glad to hear any feedback or feature requests."
}
def create_glycoshape_archive(dir: Path, output_file: str = "GlycoShape.zip") -> None:
    """Create per-glycan zip files and a combined GlycoShape archive.

    Each immediate subdirectory of *dir* is treated as one glycan entry:
    its data/format files are zipped into ``<glycan>.zip`` inside that
    subdirectory, then every per-glycan zip is bundled flat into
    *output_file* at the top level of *dir*.

    Args:
        dir: Path to the database directory (one subdirectory per glycan).
        output_file: Name of the final combined zip file.

    Raises:
        Exception: Re-raised after logging if any step fails.
    """
    try:
        dir = Path(dir)
        output_path = dir / output_file

        # Remove a stale combined archive so it can never be bundled into itself.
        if output_path.exists():
            output_path.unlink()
            logger.info(f"Removed existing {output_file}")

        # Build one zip per glycan subdirectory.
        for directory in dir.glob("*/"):
            if not directory.is_dir():
                continue
            glycan = directory.name
            logger.info(f"Processing Zip for {glycan}")

            # Rebuild the per-glycan zip from scratch each run.
            glycan_zip = directory / f"{glycan}.zip"
            if glycan_zip.exists():
                glycan_zip.unlink()

            # Expected per-glycan payload: metadata, SNFG image and the
            # structure directories for each supported force-field format.
            files_to_zip = [
                directory / "data.json",
                directory / "snfg.svg",
                directory / "PDB_format_HETATM",
                directory / "CHARMM_format_HETATM",
                directory / "GLYCAM_format_HETATM",
                directory / "PDB_format_ATOM",
                directory / "CHARMM_format_ATOM",
                directory / "GLYCAM_format_ATOM",
            ]
            with zipfile.ZipFile(glycan_zip, 'w', zipfile.ZIP_DEFLATED) as zf:
                for file_path in files_to_zip:
                    if not file_path.exists():
                        # Missing entries are simply skipped (best effort).
                        continue
                    if file_path.is_dir():
                        # Archive directory contents with paths relative to
                        # the glycan directory, preserving structure.
                        for file in file_path.rglob('*'):
                            if file.is_file():
                                zf.write(file, file.relative_to(directory))
                    else:
                        zf.write(file_path, file_path.name)

        # Bundle every per-glycan zip directly into the final archive.
        # Writing directly (instead of copy-to-top-level, zip, delete)
        # avoids the shutil.SameFileError a leftover top-level zip would
        # trigger and skips the redundant disk round trip.
        logger.info("Creating final zip file")
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as final_zip:
            for zip_file in sorted(dir.rglob("*.zip")):
                if zip_file != output_path and zip_file.name != output_file:
                    final_zip.write(zip_file, zip_file.name)

        logger.info(f"Successfully created {output_file}")
    except Exception as e:
        logger.error(f"Failed to create archive: {str(e)}")
        raise
def create_glycoshape_json(dir: Path, output_file: str = "GLYCOSHAPE.json") -> None:
    """Create a consolidated JSON file from all data.json files in subdirectories.

    Each subdirectory of *dir* that contains a ``data.json`` contributes one
    entry to the output, keyed by the subdirectory (glycan) name; keys are
    written in sorted order.

    Args:
        dir: Path to the database directory.
        output_file: Name of the output JSON file (written inside *dir*).

    Raises:
        Exception: Re-raised after logging if reading or writing fails.
    """
    try:
        dir = Path(dir)
        output_path = dir / output_file
        consolidated_data = {}

        # Gather each glycan's data.json, keyed by its directory name.
        for directory in dir.glob("*/"):
            if not directory.is_dir():
                continue
            glycan = directory.name
            json_path = directory / "data.json"
            if json_path.exists():
                logger.info(f"Processing JSON for {glycan}")
                # Explicit UTF-8: ensure_ascii=False below writes raw
                # non-ASCII characters, so the platform default must not apply.
                with open(json_path, encoding="utf-8") as f:
                    consolidated_data[glycan] = json.load(f)

        logger.info(f"Writing consolidated JSON to {output_file}")
        with open(output_path, 'w', encoding="utf-8") as f:
            json.dump(dict(sorted(consolidated_data.items())), f, indent=2, ensure_ascii=False)
        logger.info("Successfully created consolidated JSON file")
    except Exception as e:
        logger.error(f"Failed to create consolidated JSON: {str(e)}")
        raise
def save_faq_json(dir: Path, faq_data: dict, output_file: str = "faq.json") -> None:
    """Save FAQ data to a JSON file.

    Args:
        dir: Path to the output directory.
        faq_data: Dictionary containing FAQ data (question -> answer).
        output_file: Name of the output JSON file (written inside *dir*).

    Raises:
        Exception: Re-raised after logging if writing fails.
    """
    try:
        dir = Path(dir)
        output_path = dir / output_file
        logger.info(f"Writing FAQ JSON to {output_file}")
        # Explicit UTF-8 keeps output byte-identical across platforms
        # regardless of the locale's default encoding.
        with open(output_path, 'w', encoding="utf-8") as f:
            json.dump(faq_data, f, indent=2)
        logger.info("Successfully created FAQ JSON file")
    except Exception as e:
        logger.error(f"Failed to create FAQ JSON: {str(e)}")
        raise
def wurcs_registration(dir: Path, file: str = "GLYCOSHAPE.json", output_file: str = "missing_glytoucan.txt") -> None:
    """Write WURCS strings that lack a GlyTouCan accession to a text file.

    Scans the consolidated GlycoShape JSON and collects the archetype WURCS
    string of every entry whose archetype has no GlyTouCan ID, producing a
    newline-separated list suitable for bulk registration via submit_wurcs().

    Args:
        dir: Directory containing the consolidated JSON file.
        file: Name of the consolidated JSON file to read.
        output_file: Name of the text file to write (inside *dir*).
    """
    # NOTE: the previous version shadowed the `file` and `output_file`
    # parameters with open() handles and the builtin `type` with a loop
    # variable; distinct local names avoid that.
    dir = Path(dir)
    with open(dir / file, 'r', encoding="utf-8") as fh:
        data = json.load(fh)

    # Collect WURCS strings whose entry has no GlyTouCan accession.
    missing_wurcs = []
    for entry in data.values():
        archetype = entry.get("archetype", {})
        wurcs = archetype.get("wurcs")
        if wurcs and not archetype.get("glytoucan"):
            missing_wurcs.append(wurcs)

    # Write one WURCS string per line.
    with open(dir / output_file, 'w', encoding="utf-8") as out:
        out.write("\n".join(missing_wurcs))
def submit_wurcs(contributor_id: str, api_key: str, file_path: Path) -> None:
    """Upload a WURCS list to the GlyTouCan bulk-load endpoint.

    Args:
        contributor_id: Contributor ID used for HTTP basic auth.
        api_key: API key used for HTTP basic auth.
        file_path: Path to the text file containing WURCS data.

    Raises:
        Exception: Re-raised after logging on any network or HTTP error.
    """
    url = "https://glytoucan.org/api/bulkload/wurcs"
    try:
        # Stream the file as a multipart upload, authenticating with
        # the contributor credentials.
        with open(file_path, 'rb') as payload:
            response = requests.post(
                url,
                files={"file": payload},
                auth=(contributor_id, api_key),
            )
            if response.status_code != 200:
                logger.error(f"Failed to submit WURCS data: {response.status_code} {response.text}")
                response.raise_for_status()
            else:
                logger.info("Successfully submitted WURCS data")
    except Exception as e:
        logger.error(f"Error during WURCS submission: {str(e)}")
        raise
def total_simulation_length(file_path: Path) -> float:
    """Sum the 'length' values from the archetype entries in a JSON file.

    Args:
        file_path: Path to the consolidated GlycoShape JSON file.

    Returns:
        The sum of all archetype 'length' values as a float (0.0 if none).

    Raises:
        Exception: Re-raised after logging if the file cannot be read/parsed.
    """
    try:
        with open(file_path, 'r', encoding="utf-8") as file:
            data = json.load(file)

        total_length = 0.0
        for entry in data.values():
            # Entries without an archetype or a length simply contribute 0.
            archetype = entry.get("archetype", {})
            length = archetype.get("length")
            if length is not None:
                # Lengths may be stored as strings or numbers; coerce to float.
                total_length += float(length)

        logger.info(f"Total simulation length: {total_length}")
        return total_length
    except Exception as e:
        logger.error(f"Failed to sum lengths in JSON: {str(e)}")
        raise
if __name__ == "__main__":
    # Bake pipeline: FAQ JSON, then the consolidated database JSON (which
    # the two following steps read), then the downloadable archive and the
    # list of WURCS strings still missing GlyTouCan accessions.
    save_faq_json(
        dir=config.output_path,
        faq_data=faq_dict,
    )
    create_glycoshape_json(
        dir=config.output_path,)
    # Requires GLYCOSHAPE.json produced by create_glycoshape_json above.
    total_simulation_length(file_path=config.output_path / "GLYCOSHAPE.json")
    create_glycoshape_archive(
        dir=config.output_path,
    )
    # Also reads GLYCOSHAPE.json; writes missing_glytoucan.txt for bulk upload.
    wurcs_registration(
        dir=config.output_path,
        file="GLYCOSHAPE.json",
        output_file="missing_glytoucan.txt"
    )
    # Manual step: uncomment to bulk-register the missing WURCS with GlyTouCan
    # using the credentials from config.
    # submit_wurcs(
    #     contributor_id=config.contributor_id,
    #     api_key=config.api_key, file_path=config.output_path / "missing_glytoucan.txt")