Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jakub/strip mmcif files #4

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions app/static/attributions.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Project Contributors</title>
<!-- Bootstrap CSS -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3" crossorigin="anonymous">
</head>
<body>
<div class="container my-4">
<h1 class="text-center mb-4">Project Contributors</h1>

<!-- Contributor 1 -->
<div class="card mb-3">
<div class="card-body">
<h5 class="card-title">Contributor Name</h5>
<p class="card-text">Description of the role and work. For example, development of search algorithms. Mention of "binary sketches in the Hamming space, index, PPP-codes of David Novák and Pavel Zezula".</p>
<a href="link-to-relevant-paper" class="card-link">Relevant Paper Title</a>
</div>
</div>

<!-- Contributor 2 -->
<div class="card mb-3">
<div class="card-body">
<h5 class="card-title">Contributor Name</h5>
<p class="card-text">Brief description of their work within the project.</p>
<!-- Add paper link if applicable -->
</div>
</div>

<!-- Additional contributors here -->

<!-- References Section -->
<h2 class="mt-5 mb-3">References</h2>
<ul class="list-group list-group-flush">
<li class="list-group-item">Similarity search for an extreme application: experience and implementation</li>
<li class="list-group-item">Designing sketches for similarity filtering</li>
<li class="list-group-item">Binary sketches for secondary filtering</li>
<li class="list-group-item">PPP-codes for large-scale similarity searching</li>
</ul>
</div>

<!-- Bootstrap JS Bundle -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0/dist/js/bootstrap.bundle.min.js"></script>
</body>
</html>
3 changes: 3 additions & 0 deletions app/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ <h5 class="modal-title">Select protein</h5>
<p class="lead">
Search for the most similar protein chains to a given query chain. <br>
</p>
See <a href="/static/attributions.html">attributions page</a> for information about project authors.<br>
Indexed {{ chain_count }} chains from {{ protein_count }} proteins downloaded from <a
href="http://pdbe.org/" rel="noreferrer" target="_blank">PDBe</a>.
Last update: {{ updated }}.
Expand Down Expand Up @@ -291,6 +292,8 @@ <h5 class="card-title">Detected chains {% if selected %} of
<input type="hidden" name="uploaded" id="uploaded" value="{{ uploaded }}">

</form>
<hr>

</div>

{% endblock body %}
Expand Down
40 changes: 38 additions & 2 deletions utils/get_stats.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,44 @@
import python_distance
import gemmi

protein = "/mnt/data-ssd/PDBe_raw/xh/2xhc.cif"
protein = "/mnt/data/PDBe_raw/xh/2xhc.cif"
protein = '/mnt/data/PDBe_raw/as/1asj.cif'
protein = '/mnt/data/PDBe_raw/pc/3pcc.cif'

print(python_distance.save_chains(f'/mnt/data-ssd/PDBe_raw/{protein}.cif', '/tmp', 'test'))
protein_out = '/tmp/stripped.bin'


def strip_file(filename):
with open(filename, 'r') as fin:
contents = fin.read()
doc = gemmi.cif.read_string(contents)
block = doc.sole_block()
pdb_id = block.find_pair('_struct.entry_id')
pdb_title = block.find_pair('_struct.title')

lines = []

for line in contents.splitlines(keepends=True):
if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
lines.append(line)
for i, line in enumerate(lines[:-1]):
if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
lines[i] = None
lines = [line for line in lines if line is not None]
for i in range(1, len(lines)):
if lines[i].startswith('#') and lines[i - 1].startswith('#'):
lines[i - 1] = None
lines = [line for line in lines if line is not None]

lines += (f"_struct.entry_id {pdb_id}\n", '#\n' + f"_struct.title {pdb_title}\n", '#\n')
return lines

with open(protein_out, 'w') as fout:
fout.writelines(strip_file(protein))



print(python_distance.save_chains(protein_out, '/tmp', 'test'))


def get_raw_from_gesamt(strid):
Expand Down
10 changes: 10 additions & 0 deletions utils/strip_mmcif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import io
import python_distance

def strip_stream(fin: io.TextIOWrapper, fout: io.TextIOWrapper):
for line in fin:
...

def test_stripping():
python_distance.distance("test", "test")

129 changes: 49 additions & 80 deletions utils/update_binary_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,17 @@ def is_updated(filename: str, mirror_dir: str, raw_dir: str) -> Tuple[str, bool]
gzip_path = Path(mirror_dir) / get_dir(filename) / f'{filename}.gz'
raw_path = Path(raw_dir) / get_dir(filename) / filename
with gzip.open(gzip_path, 'rt') as f_gzip, open(raw_path, 'r') as f_raw:
if f_gzip.read() != f_raw.read():
return filename, True
gzip_contents = f_gzip.read()
raw_contents = f_raw.read()
stripped = strip_cif(gzip_contents)
stripped = ''.join(stripped)
# this if is troublesome
# the first part is present just for legacy bins
# if all old bins are regenarated, it can be dropped
if gzip_contents == raw_contents or stripped == raw_contents:
return filename, False

return filename, False
return filename, True


def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecutor) -> Tuple[
Expand Down Expand Up @@ -79,7 +86,6 @@ def get_whats_updated(mirror_dir: str, raw_dir: str, executor: ProcessPoolExecut

def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariadb.connection') -> None:
cursor = conn.cursor()
print(files)
for file in files:
pdb_id = Path(file).with_suffix('').name.upper()
cursor.execute('DELETE FROM protein WHERE pdbId = %s', (pdb_id,))
Expand All @@ -93,18 +99,50 @@ def remove_chains(files: List[str], raw_dir: str, binary_dir: str, conn: 'mariad
cursor.execute(f'UPDATE proteinChain SET indexedAsDataObject = 0 WHERE intId IN ({ids_format})', int_ids)

for chain_id in chain_ids:
(Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
try:
(Path(binary_dir) / dirpath / f'{chain_id}.bin').unlink()
except FileNotFoundError:
pass # the .bin chain might have never existed if gesamt was unable to read the cif file

(Path(raw_dir) / dirpath / file).unlink()

conn.commit()
cursor.close()


def strip_cif(cif_content: str):
# parse the info we care about
doc = gemmi.cif.read_string(cif_content)
block = doc.sole_block()
pdb_id = block.find_pair('_struct.entry_id')
pdb_title = block.find_pair('_struct.title')

lines = []

# strip everything else
for line in cif_content.splitlines(keepends=True):
if line.startswith(('data_', 'loop_', '_atom_site', 'ATOM ', 'HETATM ', '#')):
lines.append(line)
for i, line in enumerate(lines[:-1]):
if line.startswith('loop_') and not lines[i + 1].startswith('_atom_site.group_PDB'):
lines[i] = None
lines = [line for line in lines if line is not None]
for i in range(1, len(lines)):
if lines[i].startswith('#') and lines[i - 1].startswith('#'):
lines[i - 1] = None
lines = [line for line in lines if line is not None]

# put the info back in
lines += (f"_struct.entry_id {pdb_id}\n", '#\n' + f"_struct.title {pdb_title}\n", '#\n')
return lines


def decompress_file(filename, src_dir: str, dest_dir: str) -> None:
with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as f_in:
with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as f_out:
shutil.copyfileobj(f_in, f_out)
with gzip.open(Path(src_dir) / get_dir(filename) / f'{filename}.gz', 'rt') as fin:
contents = fin.read()
lines = strip_cif(contents)
with open(Path(dest_dir) / get_dir(filename) / filename, 'w') as fout:
fout.writelines(lines)


def create_binaries(filename: str, src_dir: str, dest_dir: str) -> List[Tuple[str, str, int]]:
Expand Down Expand Up @@ -137,7 +175,6 @@ def read_protein_title(filename: str) -> Tuple[str, Optional[str]]:
def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str, conn: 'mariadb.connection',
executor: ProcessPoolExecutor) -> None:
cursor = conn.cursor()
print(files)

# Decompress gzipped CIFs
jobs = [executor.submit(decompress_file, filename, mirror_dir, raw_dir) for filename in files]
Expand Down Expand Up @@ -177,88 +214,18 @@ def add_chains(files: List[str], mirror_dir: str, raw_dir: str, binary_dir: str,
cursor.close()


def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
'''
performs a consistency check between raw directory and database # todo shouldn't it be with binary?
'''
gesamt_ids = set()
num_top_level_folders = len(
[name for name in os.listdir(Path(raw_dir))])

with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
for dirpath, _, fnames in os.walk(Path(raw_dir)):
for filename in fnames:
file = Path(raw_dir) / get_dir(filename) / filename
pdb_id = file.name[:4].upper()
gesamt_ids.add(pdb_id)
pbar.update(1)

cur = conn.cursor()
cur.execute("select gesamtId from proteinChain")
gesamt_ids_db = set()
for gid in cur:
gid = gid[0]
gesamt_ids_db.add(gid.split(':')[0])

diff = gesamt_ids - gesamt_ids_db
print(gesamt_ids - gesamt_ids_db)
print(f"ids in fs {len(gesamt_ids)}")
print(f"ids in db {len(gesamt_ids_db)}")
print(f"got {len(diff)} more ids in the raw_dir than db")
print("Consistency check for raw directories failed")


def consistency_check(raw_dir: str, conn: 'mariadb.connection') -> None:
'''
performs a consistency check between raw directory and database
'''
gesamt_ids = set()
num_top_level_folders = len(
[name for name in os.listdir(Path(raw_dir))])

with tqdm.tqdm(total=num_top_level_folders, desc='Getting ids from filesystem') as pbar:
for dirpath, _, fnames in os.walk(Path(raw_dir)):
for filename in fnames:
file = Path(raw_dir) / get_dir(filename) / filename
pdb_id = file.name[:4].upper()
gesamt_ids.add(pdb_id)
pbar.update(1)

cur = conn.cursor()
cur.execute("select gesamtId from proteinChain")
gesamt_ids_db = set()
for gid in cur:
gid = gid[0]
gesamt_ids_db.add(gid.split(':')[0])

diff = gesamt_ids - gesamt_ids_db
print(gesamt_ids - gesamt_ids_db)
print(f"ids in fs {len(gesamt_ids)}")
print(f"ids in db {len(gesamt_ids_db)}")
print(f"got {len(diff)} more ids in the raw_dir than db")
print("Consistency check for raw directories failed")


def main():
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, default='/etc/protein_search.ini', help='File with configuration of DB')
parser.add_argument('--mirror-directory', type=str, default=True, help='Directory with rsynced files')
parser.add_argument('--binary-directory', type=str, required=True, help='Directory to store binaries')
parser.add_argument('--raw-directory', type=str, required=True, help='Directory with uncompressed files')
parser.add_argument('--workers', type=int, default=1, help='Number of workers ')
parser.add_argument('--consistency-check', type=bool, default=False, help='Should a consistency check with DB be performed')
args = parser.parse_args()

config = configparser.ConfigParser()
config.read(args.config)

if args.consistency_check:
print("performing consistency check")
conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
database=config['db']['database'])
consistency_check(args.raw_directory, conn)
return

executor = ProcessPoolExecutor(args.workers)

print('*** Updating directories ***')
Expand All @@ -273,11 +240,13 @@ def main():
print(f'Removed files: {stats["removed"]}')
print(f'Up-to-date files: {stats["ok"]}')

if len(removed_files) > 10000 or len(modified_files) > 10000:
print('Too many files to remove, aborting...')
exit(1)

conn = mariadb.connect(host=config['db']['host'], user=config['db']['user'], password=config['db']['password'],
database=config['db']['database'])


print('*** Processing new entries ***')
add_chains(new_files, args.mirror_directory, args.raw_directory, args.binary_directory, conn, executor)

Expand Down