support fetch pdb method

Hekstra-Lab · Sep 26, 2023 · 8f3f936 · 8f3f936
1 parent ab7e57c
commit 8f3f936
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 2 deletions.
diff --git a/SFC_Torch/__init__.py b/SFC_Torch/__init__.py
@@ -1,6 +1,6 @@
 # Top level API
 from .Fmodel import SFcalculator
-from .io import PDBParser
+from .io import PDBParser, fetch_pdb
 
 # Submodules
 from . import utils

diff --git a/SFC_Torch/io.py b/SFC_Torch/io.py
@@ -1,6 +1,9 @@
 import gemmi
 import torch
 import numpy as np
+import urllib.request, os
+from tqdm import tqdm
+import pandas as pd
 
 from .utils import try_gpu
 
@@ -276,3 +279,66 @@ def from_atom_slices(self, atom_slices, inplace=False):
     def savePDB(self, savefilename, include_header=True):
         '''
         Write the current model to a PDB file

         Parameters
         ----------
         savefilename : str
             Path of the PDB file to write

         include_header : bool, default True
             Forwarded unchanged to `self.to_gemmi`
         '''
         # Convert via to_gemmi, then let the returned object write the file
         self.to_gemmi(include_header=include_header).write_pdb(savefilename)
+
+def _download(url, destination):
+    '''
+    Download `url` to the local file `destination`, best effort
+
+    Returns
+    -------
+    int, 1 if the download succeeded, 0 otherwise
+    '''
+    try:
+        urllib.request.urlretrieve(url, destination)
+    except Exception:
+        # A failed download (e.g. an entry without that file) is only recorded
+        # in the stats. Catching Exception instead of using a bare except lets
+        # KeyboardInterrupt and SystemExit propagate, so a long batch can
+        # still be aborted by the user.
+        return 0
+    return 1
+
+
+def fetch_pdb(idlist, outpath):
+    '''
+    Fetch pdb and mtz files from Protein Data Bank, with static urllib
+
+    Parameters
+    ----------
+    idlist : [str] or str
+        List of PDB ids, case insensitive. A single id given as a plain
+        string is treated as a one-element list.
+
+    outpath : str
+        Root output directory, created if it does not exist yet
+
+    Returns
+    -------
+    DataFrame of fetch stats, one row per id, with columns
+    `code` (lower-cased id), `with_pdb` and `with_mtz`
+    (1 if the file was downloaded, 0 if the download failed)
+
+    pdb files will be saved at outpath/models/
+    mtz files will be saved at outpath/reflections/
+    Record csv file will be saved at outpath/fetchpdb.csv
+    '''
+    if isinstance(idlist, str):
+        # Iterating a bare string would yield single characters, not ids
+        idlist = [idlist]
+
+    model_path = os.path.join(outpath, 'models/')
+    reflection_path = os.path.join(outpath, 'reflections/')
+    for folder in [model_path, reflection_path]:
+        if os.path.exists(folder):
+            print(f"{folder:<80}" + f"{'already exists': >20}")
+        else:
+            # exist_ok guards against the folder appearing between the
+            # existence check above and this call
+            os.makedirs(folder, exist_ok=True)
+            print(f"{folder:<80}" + f"{'created': >20}")
+
+    codes = []
+    with_pdb = []
+    with_mtz = []
+    for pdb_code in tqdm(idlist):
+        # Files are stored and recorded under the lower-cased id
+        valid_code = pdb_code.lower()
+
+        pdblink = "https://files.rcsb.org/download/" + valid_code.upper() + ".pdb"
+        mtzlink = "https://edmaps.rcsb.org/coefficients/" + valid_code + ".mtz"
+        codes.append(valid_code)
+        # The two downloads are independent: a missing mtz does not prevent
+        # the pdb from being kept, and vice versa
+        with_pdb.append(
+            _download(pdblink, os.path.join(model_path, valid_code + ".pdb"))
+        )
+        with_mtz.append(
+            _download(mtzlink, os.path.join(reflection_path, valid_code + ".mtz"))
+        )
+
+    stat_df = pd.DataFrame({
+        "code" : codes,
+        "with_pdb" : with_pdb,
+        "with_mtz" : with_mtz
+    })
+    stat_df.to_csv(os.path.join(outpath, "fetchpdb.csv"))
+    return stat_df
+
+
+
+
+
+
+
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -5,7 +5,7 @@
 import gemmi
 import numpy as np
 
-from SFC_Torch.io import PDBParser
+from SFC_Torch.io import PDBParser, fetch_pdb
 
 
 def test_setdata(data_pdb):
@@ -80,3 +80,12 @@ def test_fromatomslices(data_pdb, inplace):
         assert b.cell == a.cell
         assert b.spacegroup.hm == a.spacegroup.hm
         assert len(b.atom_pos) == 55
+
+def test_fetchpdb(tmp_path):
+    # Needs network access to rcsb.org. Files go into pytest's per-test
+    # temporary directory, so the result does not depend on the working
+    # directory and nothing is left behind outside the repository.
+    df = fetch_pdb(['4lZt', '1cTS'], outpath=str(tmp_path))
+    # Ids are recorded lower-cased regardless of the input casing
+    assert df['code'].tolist() == ['4lzt', '1cts']
+    assert df['with_pdb'].tolist() == [1, 1]
+    # The test expects the mtz download to fail for 1cts
+    assert df['with_mtz'].tolist() == [1, 0]
+    assert (tmp_path / "models" / "4lzt.pdb").exists()
+    assert (tmp_path / "models" / "1cts.pdb").exists()
+    assert (tmp_path / "reflections" / "4lzt.mtz").exists()
+    assert (tmp_path / "fetchpdb.csv").exists()