Skip to content

Commit

Permalink
Allow compression conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivierBeq committed Aug 25, 2022
1 parent a98ebb3 commit 5cc446b
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 3 deletions.
66 changes: 63 additions & 3 deletions src/papyrus_scripts/cli.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
# -*- coding: utf-8 -*-

import sys
import os
import inspect

import click
import pystow

from .download import download_papyrus, remove_papyrus
from .matchRCSB import get_matches, update_rcsb_data
from .reader import read_papyrus
from .utils.IO import get_num_rows_in_file
from .utils.IO import get_num_rows_in_file, process_data_version, convert_gz_to_xz, convert_xz_to_gz
from .subsim_search import FPSubSim2
from .fingerprint import Fingerprint, get_fp_from_name

Expand Down Expand Up @@ -43,10 +45,14 @@ def main():
'final hidden states and final cell states), '
'all (all descriptors for the selected stereochemistry), or '
'none (do not download any descriptor).'))
@click.option('-r', '--repo', 'repo', type=click.Choice(['zenodo', 'googledrive']),
required=False, default='zenodo', nargs=1,
show_default=True, multiple=False,
help=('Repository to download the data from..'))
@click.option('--force', is_flag=True, required=False, default=False, nargs=1,
show_default=True, help='Force download if disk space is low'
'(default: False for 10% disk space margin).')
def download(output_directory, version, stereo, structs, descs, force):
def download(output_directory, version, stereo, structs, descs, force, repo):
if isinstance(version, tuple):
version = list(version)
if isinstance(descs, tuple):
Expand All @@ -57,6 +63,7 @@ def download(output_directory, version, stereo, structs, descs, force):
stereo=stereo in ['with', 'both'],
structures=structs,
descriptors=descs,
repo=repo,
progress=True,
disk_margin=0.0 if force else 0.1)

Expand Down Expand Up @@ -297,4 +304,57 @@ def fpsubsim2(indir, output, version, is3D, fingerprint, verbose, njobs, fingerp
fingerprints.append(get_fp_from_name(fp_name, **fp_param))
for version_ in version:
fpss.create_from_papyrus(is3d=is3D, version=version_, outfile=output, fingerprint=fingerprints, root_folder=indir,
progress=verbose, njobs=njobs)
progress=verbose, njobs=njobs)


@main.command(help='Transform the compression of Papyrus files from LZMA to Gzip and vice-versa.')
@click.option('-i', '--indir', 'indir', type=str, required=False, default=None, nargs=1,
              metavar='INDIR', show_default=True,
              help='Directory where Papyrus data is stored\n(default: pystow\'s home folder).')
@click.option('-v', '--version', 'version', type=str, required=False, default='latest', multiple=False,
              metavar='XX.X', help='Version of the Papyrus data to be transformed (default: latest).')
@click.option('-f', '--format', 'format', type=click.Choice(['xz', 'gzip']),
              required=False, default=None, nargs=1, show_default=True, multiple=False,
              help=('Compression type to transform the data to. Is inferred if not specified.'))
@click.option('-l', '--level', 'level', type=click.IntRange(0, 9),
              required=False, default=None, nargs=1, show_default=True, multiple=False,
              help=('Compression level of output files.'))
@click.option('-e', '--extreme', 'extreme', is_flag=True, required=False, default=False, nargs=1,
              show_default=True, help='Should extreme compression be toggled on.')
def convert(indir, version, format, level, extreme):
    """Convert the Papyrus files of one data version between xz and gzip compression.

    Every file found under ``<indir>/papyrus/<version>`` whose name ends with the
    source extension is re-compressed next to the original, and the original is
    removed once its conversion has completed.

    :param indir: Directory containing the ``papyrus`` folder
        (if None, the base folder reported by pystow is used)
    :param version: Version of the Papyrus data, resolved through ``process_data_version``
    :param format: Target compression, either 'xz' or 'gzip'
        (if None, the most abundant file type is converted to the other one)
    :param level: Compression level of the output files
        (if None, the default of the conversion function is used)
    :param extreme: Toggle extreme compression (only used when converting to xz)
    :raises ValueError: if the format must be inferred but there are as many
        xz as gzip files (including none at all)
    """
    if indir is None:
        indir = str(pystow.utils.get_base(''))
    version = process_data_version(version, indir)
    data_dir = os.path.join(indir, 'papyrus', version)
    # Walk the folder once and sort the files by their current compression.
    # Matching is case-insensitive so inference and conversion see the same files.
    xz_files, gz_files = [], []
    for root, _, files in os.walk(data_dir):
        for name in files:
            lowered = name.lower()
            if lowered.endswith('xz'):
                xz_files.append(os.path.join(root, name))
            elif lowered.endswith('gz'):
                gz_files.append(os.path.join(root, name))
    if format is None:
        # Infer the target: the most abundant file type is converted to the other one
        if len(xz_files) > len(gz_files):
            format = 'gzip'
        elif len(gz_files) > len(xz_files):
            format = 'xz'
        else:
            raise ValueError('Equal number of LZMA and GZIP files, please indicate the output format.')
    # Transform files of the specified format.
    # The two-character extension is sliced off: str.rstrip would strip a set
    # of characters rather than the suffix.
    if format == 'gzip':
        for path in xz_files:
            convert_xz_to_gz(path,
                             path[:-2] + 'gz',
                             compression_level=level,
                             progress=True)
            os.remove(path)
    elif format == 'xz':
        for path in gz_files:
            convert_gz_to_xz(path,
                             path[:-2] + 'xz',
                             compression_level=level,
                             extreme=extreme,
                             progress=True)
            os.remove(path)
72 changes: 72 additions & 0 deletions src/papyrus_scripts/utils/IO.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
import os
import requests
import shutil
import lzma
import gzip
from typing import List, Optional

import pystow
from tqdm.auto import tqdm


def sha256sum(filename, blocksize=None):
Expand Down Expand Up @@ -214,3 +217,72 @@ def get_papyrus_links():
with open(local_file) as fh:
data = json.load(fh)
return data


def convert_xz_to_gz(input_file: str, output_file: str,
                     compression_level: Optional[int] = 9,
                     progress: bool = False) -> None:
    """Convert a LZMA-compressed xz file to a GZIP-compressed file.

    The input file is left untouched; the data is streamed in chunks of 10 MB.

    :param input_file: Path of the input file
    :param output_file: Path of the output file
    :param compression_level: Compression level of the output file (if None, defaults to 9)
    :param progress: Show conversion progress.
    """
    if compression_level is None:
        compression_level = 9
    # Transform per chunk
    chunksize = 10 * 1048576  # 10 MB
    with lzma.open(input_file, 'rb') as fh, \
            gzip.open(output_file, 'wb', compresslevel=compression_level) as oh:
        pbar = tqdm(desc='Determining size', unit='B', unit_scale=True) if progress else None
        try:
            if pbar is not None:
                # Seeking to the end of the stream yields the uncompressed size
                size = fh.seek(0, 2)
                fh.seek(0, 0)  # Go back to the beginning
                pbar.set_description('Converting')
                pbar.total = size
            while True:
                chunk = fh.read(chunksize)
                if not chunk:
                    break
                written = oh.write(chunk)
                if pbar is not None:
                    pbar.update(written)
        finally:
            # Close the progress bar even if reading or writing fails
            if pbar is not None:
                pbar.close()


def convert_gz_to_xz(input_file: str, output_file: str,
                     compression_level: Optional[int] = lzma.PRESET_DEFAULT,
                     extreme: bool = False,
                     progress: bool = False) -> None:
    """Convert a GZIP-compressed file to a LZMA-compressed xz file.

    The input file is left untouched; the data is streamed in chunks of 10 MB.

    :param input_file: Path of the input file
    :param output_file: Path of the output file
    :param compression_level: Compression level of the output file
        (if None, defaults to lzma.PRESET_DEFAULT)
    :param extreme: Should extreme compression be toggled on top of the compression level
    :param progress: Show conversion progress.
    """
    if compression_level is None:
        compression_level = lzma.PRESET_DEFAULT
    # The extreme flag is OR-ed into the preset on top of the compression level
    preset = (compression_level | lzma.PRESET_EXTREME) if extreme else compression_level
    # Transform per chunk
    chunksize = 10 * 1048576  # 10 MB
    with gzip.open(input_file, 'rb') as fh, \
            lzma.open(output_file, 'wb', preset=preset) as oh:
        pbar = tqdm(desc='Determining size', unit='B', unit_scale=True) if progress else None
        try:
            if pbar is not None:
                # Seeking to the end of the stream yields the uncompressed size
                size = fh.seek(0, 2)
                fh.seek(0, 0)  # Go back to the beginning
                pbar.set_description('Converting')
                pbar.total = size
            while True:
                chunk = fh.read(chunksize)
                if not chunk:
                    break
                written = oh.write(chunk)
                if pbar is not None:
                    pbar.update(written)
        finally:
            # Close the progress bar even if reading or writing fails
            if pbar is not None:
                pbar.close()

0 comments on commit 5cc446b

Please sign in to comment.