Allow compression conversion

OlivierBeq · Aug 25, 2022 · 5cc446b · 5cc446b
1 parent a98ebb3
commit 5cc446b
Show file tree

Hide file tree

Showing 2 changed files with 135 additions and 3 deletions.
diff --git a/src/papyrus_scripts/cli.py b/src/papyrus_scripts/cli.py
@@ -1,14 +1,16 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import os
 import inspect
 
 import click
+import pystow
 
 from .download import download_papyrus, remove_papyrus
 from .matchRCSB import get_matches, update_rcsb_data
 from .reader import read_papyrus
-from .utils.IO import get_num_rows_in_file
+from .utils.IO import get_num_rows_in_file, process_data_version, convert_gz_to_xz, convert_xz_to_gz
 from .subsim_search import FPSubSim2
 from .fingerprint import Fingerprint, get_fp_from_name
 
@@ -43,10 +45,14 @@ def main():
                     'final hidden states and final cell states), '
                     'all (all descriptors for the selected stereochemistry), or '
                     'none (do not download any descriptor).'))
+@click.option('-r', '--repo', 'repo', type=click.Choice(['zenodo', 'googledrive']),
+              required=False, default='zenodo', nargs=1,
+              show_default=True, multiple=False,
+              help=('Repository to download the data from..'))
 @click.option('--force', is_flag=True, required=False, default=False, nargs=1,
               show_default=True, help='Force download if disk space is low'
                                       '(default: False for 10% disk space margin).')
-def download(output_directory, version, stereo, structs, descs, force):
+def download(output_directory, version, stereo, structs, descs, force, repo):
     if isinstance(version, tuple):
         version = list(version)
     if isinstance(descs, tuple):
@@ -57,6 +63,7 @@ def download(output_directory, version, stereo, structs, descs, force):
                      stereo=stereo in ['with', 'both'],
                      structures=structs,
                      descriptors=descs,
+                     repo=repo,
                      progress=True,
                      disk_margin=0.0 if force else 0.1)
 
@@ -297,4 +304,57 @@ def fpsubsim2(indir, output, version, is3D, fingerprint, verbose, njobs, fingerp
             fingerprints.append(get_fp_from_name(fp_name, **fp_param))
         for version_ in version:
             fpss.create_from_papyrus(is3d=is3D, version=version_, outfile=output, fingerprint=fingerprints, root_folder=indir,
-                                     progress=verbose, njobs=njobs)
+                                     progress=verbose, njobs=njobs)
+
+
+@main.command(help='Transform the compression of Papyrus files from LZMA to Gzip and vice-versa.')
+@click.option('-i', '--indir', 'indir', type=str, required=False, default=None, nargs=1,
+              metavar='INDIR', show_default=True,
+              help='Directory where Papyrus data is stored\n(default: pystow\'s home folder).')
+# NOTE(review): the default below is a list although the option is a single string
+# (multiple=False) -- confirm what process_data_version actually receives by default.
+@click.option('-v', '--version', 'version', type=str, required=False, default=['latest'], multiple=False,
+              metavar='XX.X', help='Version of the Papyrus data to be transformed (default: latest).')
+@click.option('-f', '--format', 'format', type=click.Choice(['xz', 'gzip']),
+              required=False, default=None, nargs=1, show_default=True, multiple=False,
+              help=('Compression type to transform the data to. Is inferred if not specified.'))
+@click.option('-l', '--level', 'level', type=click.IntRange(0, 9),
+              required=False, default=None, nargs=1, show_default=True, multiple=False,
+              help=('Compression level of output files.'))
+@click.option('-e', '--extreme', 'extreme', is_flag=True, required=False, default=False, nargs=1,
+              show_default=True, help='Should extreme compression be toggled on.')
+def convert(indir, version, format, level, extreme):
+    """Convert the compressed files of one Papyrus version between xz and gzip.
+
+    Every file below ``<indir>/papyrus/<version>`` whose name ends with ``xz`` or ``gz``
+    (case-insensitive) is considered. Each source file is removed once its converted
+    copy has been written.
+
+    :param indir: Root directory of the Papyrus data (if None, pystow's base folder is used)
+    :param version: Version of the Papyrus data, resolved through process_data_version
+    :param format: Target compression, either 'xz' or 'gzip'; if None, the files are
+        converted away from the most abundant compression type
+    :param level: Compression level forwarded to the conversion functions
+    :param extreme: Forwarded to convert_gz_to_xz, ignored when converting to gzip
+    :raises ValueError: if format is None and it cannot be inferred, i.e. there are
+        as many xz files as gzip files, or none at all
+    """
+    if isinstance(version, tuple):
+        version = list(version)
+    if indir is None:
+        indir = str(pystow.utils.get_base(''))
+    version = process_data_version(version, indir)
+    # List the compressed files once, before any file is created or removed
+    xz_files, gz_files = [], []
+    for root, _, files in os.walk(os.path.join(indir, 'papyrus', version)):
+        for name in files:
+            lowered = name.lower()
+            if lowered.endswith('xz'):
+                xz_files.append(os.path.join(root, name))
+            elif lowered.endswith('gz'):
+                gz_files.append(os.path.join(root, name))
+    if format is None:
+        # Infer the target: convert away from the most abundant file type
+        if len(xz_files) > len(gz_files):
+            format = 'gzip'
+        elif len(gz_files) > len(xz_files):
+            format = 'xz'
+        elif not xz_files:
+            raise ValueError('No LZMA or GZIP files found, nothing to convert.')
+        else:
+            raise ValueError('Equal number of LZMA and GZIP files, please indicate the output format.')
+    # Transform files that are not yet in the target format.
+    # The two-character suffix is sliced off: str.rstrip removes a set of characters, not a suffix.
+    if format == 'gzip':
+        for path in xz_files:
+            convert_xz_to_gz(path,
+                             path[:-2] + 'gz',
+                             compression_level=level,
+                             progress=True)
+            os.remove(path)
+    elif format == 'xz':
+        for path in gz_files:
+            convert_gz_to_xz(path,
+                             path[:-2] + 'xz',
+                             compression_level=level,
+                             extreme=extreme,
+                             progress=True)
+            os.remove(path)
diff --git a/src/papyrus_scripts/utils/IO.py b/src/papyrus_scripts/utils/IO.py
@@ -10,9 +10,12 @@
 import os
 import requests
 import shutil
+import lzma
+import gzip
 from typing import List, Optional
 
 import pystow
+from tqdm.auto import tqdm
 
 
 def sha256sum(filename, blocksize=None):
@@ -214,3 +217,72 @@ def get_papyrus_links():
     with open(local_file) as fh:
         data = json.load(fh)
     return data
+
+
+def convert_xz_to_gz(input_file: str, output_file: str,
+                     compression_level: Optional[int] = 9,
+                     progress: bool = False) -> None:
+    """Convert a LZMA-compressed xz file to a GZIP-compressed file.
+
+    The data is streamed in chunks of 10 MB, so the whole file is never held in memory.
+
+    :param input_file: Path of the input file
+    :param output_file: Path of the output file
+    :param compression_level: Compression level of the output file (if None, defaults to 9)
+    :param progress: Show conversion progress.
+    """
+    if compression_level is None:
+        compression_level = 9
+    # Transform per chunk
+    chunksize = 10 * 1048576  # 10 MB
+    with lzma.open(input_file, 'rb') as fh, gzip.open(output_file, 'wb', compresslevel=compression_level) as oh:
+        pbar = tqdm(desc='Determining size', unit='B', unit_scale=True) if progress else None
+        try:
+            if pbar is not None:
+                # The uncompressed size is only known once the end of the stream is reached;
+                # seeking within a compressed file is emulated and can be slow.
+                size = fh.seek(0, 2)
+                fh.seek(0, 0)  # Go back to the beginning
+                pbar.set_description('Converting')
+                pbar.total = size
+                pbar.refresh()  # Redraw with the newly known total
+            while True:
+                chunk = fh.read(chunksize)
+                if not chunk:
+                    break
+                written = oh.write(chunk)
+                if pbar is not None:
+                    pbar.update(written)
+        finally:
+            # Close the bar even when reading or writing fails
+            if pbar is not None:
+                pbar.close()
+
+def convert_gz_to_xz(input_file: str, output_file: str,
+                     compression_level: Optional[int] = lzma.PRESET_DEFAULT,
+                     extreme: bool = False,
+                     progress: bool = False) -> None:
+    """Convert a GZIP-compressed file to a LZMA-compressed xz file.
+
+    The data is streamed in chunks of 10 MB, so the whole file is never held in memory.
+
+    :param input_file: Path of the input file
+    :param output_file: Path of the output file
+    :param compression_level: Compression level of the output file
+        (if None, defaults to lzma.PRESET_DEFAULT)
+    :param extreme: Should extreme compression be toggled on top of the compression level
+    :param progress: Show conversion progress.
+    """
+    if compression_level is None:
+        compression_level = lzma.PRESET_DEFAULT
+    # The extreme flag is OR-ed onto the numeric level to form the LZMA preset
+    preset = (compression_level | lzma.PRESET_EXTREME) if extreme else compression_level
+    # Transform per chunk
+    chunksize = 10 * 1048576  # 10 MB
+    with gzip.open(input_file, 'rb') as fh, lzma.open(output_file, 'wb', preset=preset) as oh:
+        pbar = tqdm(desc='Determining size', unit='B', unit_scale=True) if progress else None
+        try:
+            if pbar is not None:
+                # The uncompressed size is only known once the end of the stream is reached;
+                # seeking within a compressed file is emulated and can be slow.
+                size = fh.seek(0, 2)
+                fh.seek(0, 0)  # Go back to the beginning
+                pbar.set_description('Converting')
+                pbar.total = size
+                pbar.refresh()  # Redraw with the newly known total
+            while True:
+                chunk = fh.read(chunksize)
+                if not chunk:
+                    break
+                written = oh.write(chunk)
+                if pbar is not None:
+                    pbar.update(written)
+        finally:
+            # Close the bar even when reading or writing fails
+            if pbar is not None:
+                pbar.close()