From 97ac6df7109d280bcfc16b5a39f8920e556a695c Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 12 Jan 2017 14:44:44 -0800 Subject: [PATCH 01/47] Initial restructure --- .coveragerc | 1 + .gitignore | 1 + .travis.yml | 1 + README.md | 18 + README.rst | 426 ----- blobxfer.py | 3033 ----------------------------------- blobxfer/__init__.py | 25 + blobxfer/util.py | 213 +++ blobxfer/version.py | 25 + setup.py | 68 +- test/test_blobxfer.py | 1436 ----------------- test_requirements.txt | 5 + tests/test_blobxfer_util.py | 133 ++ tox.ini | 18 + 14 files changed, 492 insertions(+), 4911 deletions(-) create mode 100644 README.md delete mode 100644 README.rst delete mode 100755 blobxfer.py create mode 100644 blobxfer/__init__.py create mode 100644 blobxfer/util.py create mode 100644 blobxfer/version.py delete mode 100644 test/test_blobxfer.py create mode 100644 test_requirements.txt create mode 100644 tests/test_blobxfer_util.py create mode 100644 tox.ini diff --git a/.coveragerc b/.coveragerc index b710cba..5fc34c3 100644 --- a/.coveragerc +++ b/.coveragerc @@ -6,6 +6,7 @@ omit = exclude_lines = # Have to re-enable the standard pragma pragma: no cover + noqa # Don't complain about missing debug-only code: def __repr__ diff --git a/.gitignore b/.gitignore index ddc86bb..21d27b6 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ htmlcov/ nosetests.xml coverage.xml *,cover +junit-*.xml # Translations *.mo diff --git a/.travis.yml b/.travis.yml index cdf9217..5bc451d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ python: - 3.3 - 3.4 - 3.5 + - 3.6 - pypy # disable pypy3 until 3.3 compliance #- pypy3 diff --git a/README.md b/README.md new file mode 100644 index 0000000..6853cb4 --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +blobxfer +======== + +AzCopy-like OS independent Azure storage blob and file share transfer tool + +Change Log +---------- + +See the [CHANGELOG.md](https://github.com/Azure/blobxfer/blob/master/CHANGELOG.md) file. + +------------------------------------------------------------------------ + +This project has adopted the +[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the +[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +or contact [](mailto:opencode@microsoft.com) with any +additional questions or comments. diff --git a/README.rst b/README.rst deleted file mode 100644 index 882d883..0000000 --- a/README.rst +++ /dev/null @@ -1,426 +0,0 @@ -.. image:: https://travis-ci.org/Azure/blobxfer.svg?branch=master - :target: https://travis-ci.org/Azure/blobxfer -.. image:: https://coveralls.io/repos/github/Azure/blobxfer/badge.svg?branch=master - :target: https://coveralls.io/github/Azure/blobxfer?branch=master -.. image:: https://img.shields.io/pypi/v/blobxfer.svg - :target: https://pypi.python.org/pypi/blobxfer -.. image:: https://img.shields.io/pypi/pyversions/blobxfer.svg - :target: https://pypi.python.org/pypi/blobxfer -.. image:: https://img.shields.io/pypi/l/blobxfer.svg - :target: https://pypi.python.org/pypi/blobxfer -.. image:: https://img.shields.io/docker/pulls/alfpark/blobxfer.svg - :target: https://hub.docker.com/r/alfpark/blobxfer -.. 
image:: https://images.microbadger.com/badges/image/alfpark/blobxfer.svg - :target: https://microbadger.com/images/alfpark/blobxfer - -blobxfer -======== -AzCopy-like OS independent Azure storage blob and file share transfer tool - -Installation ------------- -`blobxfer`_ is on PyPI and can be installed via: - -:: - - pip install blobxfer - -blobxfer is compatible with Python 2.7 and 3.3+. To install for Python 3, some -distributions may use ``pip3`` instead. If you do not want to install blobxfer -as a system-wide binary and modify system-wide python packages, use the -``--user`` flag with ``pip`` or ``pip3``. - -blobxfer is also on `Docker Hub`_, and the Docker image for Linux can be -pulled with the following command: - -:: - - docker pull alfpark/blobxfer - -Please see example usage below on how to use the docker image. - -If you encounter difficulties installing the script, it may be due to the -``cryptography`` dependency. Please ensure that your system is able to install -binary wheels provided by these dependencies (e.g., on Windows) or is able to -compile the dependencies (i.e., ensure you have a C compiler, python, ssl, -and ffi development libraries/headers installed prior to invoking pip). For -instance, to install blobxfer on a fresh Ubuntu 14.04/16.04 installation for -Python 2.7, issue the following commands: - -:: - - apt-get update - apt-get install -y build-essential libssl-dev libffi-dev libpython-dev python-dev python-pip - pip install --upgrade blobxfer - -If you need more fine-grained control on installing dependencies, continue -reading this section. Depending upon the desired mode of authentication with -Azure and options, the script will require the following packages, some of -which will automatically pull required dependent packages. Below is a list of -dependent packages: - -- Base Requirements - - - `azure-common`_ - - `azure-storage`_ - - `requests`_ - -- Encryption Support - - - `cryptography`_ - -- Service Management Certificate Support - - - `azure-servicemanagement-legacy`_ - -You can install these packages using pip, easy_install or through standard -setup.py procedures. These dependencies will be automatically installed if -using a package-based install or setup.py. The required versions of these -dependent packages can be found in ``setup.py``. - -.. _blobxfer: https://pypi.python.org/pypi/blobxfer -.. _Docker Hub: https://hub.docker.com/r/alfpark/blobxfer -.. _azure-common: https://pypi.python.org/pypi/azure-common -.. _azure-storage: https://pypi.python.org/pypi/azure-storage -.. _requests: https://pypi.python.org/pypi/requests -.. _cryptography: https://pypi.python.org/pypi/cryptography -.. _azure-servicemanagement-legacy: https://pypi.python.org/pypi/azure-servicemanagement-legacy - -Introduction ------------- - -The blobxfer.py script allows interacting with storage accounts using any of -the following methods: (1) management certificate, (2) shared account key, -(3) SAS key. The script can, in addition to working with single files, mirror -entire directories into and out of containers or file shares from Azure -Storage, respectively. File and block/page level MD5 integrity checking is -supported along with various transfer optimizations, built-in retries, -user-specified timeouts, and client-side encryption. - -Program parameters and command-line options can be listed via the ``-h`` -switch. Please invoke this first if you are unfamiliar with blobxfer operation -as not all options are explained below. 
At the minimum, three positional -arguments are required: storage account name, container or share name, and -local resource. Additionally, one of the following authentication switches -must be supplied: ``--subscriptionid`` with ``--managementcert``, -``--storageaccountkey``, or ``--saskey``. Do not combine different -authentication schemes together. - -Environment variables ``BLOBXFER_STORAGEACCOUNTKEY``, ``BLOBXFER_SASKEY``, -and ``BLOBXFER_RSAKEYPASSPHRASE`` can take the place of -``--storageaccountkey``, ``--saskey``, and ``--rsakeypassphrase`` respectively -if you do not want to expose credentials on a command line. - -It is generally recommended to use SAS keys wherever appropriate; only HTTPS -transport is used in the script. Please note that when using SAS keys that -only container- or fileshare-level SAS keys will allow for entire directory -uploading or container/fileshare downloading. The container/fileshare must -also have been created beforehand if using a service SAS, as -containers/fileshares cannot be created using service SAS keys. Account-level -SAS keys with a signed resource type of ``c`` or container will allow -containers/fileshares to be created with SAS keys. - -Example Usage -------------- - -The following examples show how to invoke the script with commonly used -options. Note that the authentication parameters are missing from the below -examples. You will need to select a preferred method of authenticating with -Azure and add the authentication switches (or as environment variables) as -noted above. - -The script will attempt to perform a smart transfer, by detecting if the local -resource exists. For example: - -:: - - blobxfer mystorageacct container0 mylocalfile.txt - -Note: if you downloaded the script directly from github, then you should append -``.py`` to the blobxfer command. - -If mylocalfile.txt exists locally, then the script will attempt to upload the -file to container0 on mystorageacct. If the file does not exist, then it will -attempt to download the resource. If the desired behavior is to download the -file from Azure even if the local file exists, one can override the detection -mechanism with ``--download``. ``--upload`` is available to force the transfer -to Azure storage. Note that specifying a particular direction does not force -the actual operation to occur as that depends on other options specified such -as skipping on MD5 matches. Note that you may use the ``--remoteresource`` flag -to rename the local file as the blob name on Azure storage if uploading, -however, ``--remoteresource`` has no effect if uploading a directory of files. -Please refer to the ``--collate`` option as explained below. - -If the local resource is a directory that exists, the script will attempt to -mirror (recursively copy) the entire directory to Azure storage while -maintaining subdirectories as virtual directories in Azure storage. You can -disable the recursive copy (i.e., upload only the files in the directory) -using the ``--no-recursive`` flag. - -To upload a directory with files only matching a Unix-style shell wildcard -pattern, an example commandline would be: - -:: - - blobxfer mystorageacct container0 mylocaldir --upload --include '**/*.txt' - -This would attempt to recursively upload the contents of mylocaldir -to container0 for any file matching the wildcard pattern ``*.txt`` within -all subdirectories. Include patterns can be applied for uploads as well as -downloads. 
Note that you will need to prevent globbing by your shell such -that wildcard expansion does not take place before script interprets the -argument. If ``--include`` is not specified, all files will be uploaded -or downloaded for the specific context. - -To download an entire container from your storage account, an example -commandline would be: - -:: - - blobxfer mystorageacct container0 mylocaldir --remoteresource . - -Assuming mylocaldir directory does not exist, the script will attempt to -download all of the contents in container0 because “.” is set with -``--remoteresource`` flag. To download individual blobs, one would specify the -blob name instead of “.” with the ``--remoteresource`` flag. If mylocaldir -directory exists, the script will attempt to upload the directory instead of -downloading it. If you want to force the download direction even if the -directory exists, indicate that with the ``--download`` flag. When downloading -an entire container, the script will attempt to pre-allocate file space and -recreate the sub-directory structure as needed. - -To collate files into specified virtual directories or local paths, use -the ``--collate`` flag with the appropriate parameter. For example, the -following commandline: - -:: - - blobxfer mystorageacct container0 myvhds --upload --collate vhds --autovhd - -If the directory ``myvhds`` had two vhd files a.vhd and subdir/b.vhd, these -files would be uploaded into ``container0`` under the virtual directory named -``vhds``, and b.vhd would not contain the virtual directory subdir; thus, -flattening the directory structure. The ``--autovhd`` flag would automatically -enable page blob uploads for these files. If you wish to collate all files -into the container directly, you would replace ``--collate vhds`` with -``--collate .`` - -To strip leading components of a path on upload, use ``--strip-components`` -with a number argument which will act similarly to tar's -``--strip-components=NUMBER`` parameter. This parameter is only applied -during an upload. - -To encrypt or decrypt files, the option ``--rsapublickey`` and -``--rsaprivatekey`` is available. This option requires a file location for a -PEM encoded RSA public or private key. An optional parameter, -``--rsakeypassphrase`` is available for passphrase protected RSA private keys. - -To encrypt and upload, only the RSA public key is required although an RSA -private key may be specified. To download and decrypt blobs which are -encrypted, the RSA private key is required. - -:: - - blobxfer mystorageacct container0 myblobs --upload --rsapublickey mypublickey.pem - -The above example commandline would encrypt and upload files contained in -``myblobs`` using an RSA public key named ``mypublickey.pem``. An RSA private -key may be specified instead for uploading (public parts will be used). - -:: - - blobxfer mystorageacct container0 myblobs --remoteresource . --download --rsaprivatekey myprivatekey.pem - -The above example commandline would download and decrypt all blobs in the -container ``container0`` using an RSA private key named ``myprivatekey.pem``. -An RSA private key must be specified for downloading and decryption of -encrypted blobs. - -Currently only the ``FullBlob`` encryption mode is supported for the -parameter ``--encmode``. The ``FullBlob`` encryption mode either uploads or -downloads Azure Storage .NET/Java compatible client-side encrypted block blobs. - -Please read important points in the Encryption Notes below for more -information. 
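
For background on what ``--rsapublickey`` does with the supplied key: in the
``FullBlob`` scheme a random AES-256 content key and an HMAC-SHA256 signing key
are generated per blob and wrapped with RSAES-OAEP before being stored in the
blob's ``encryptiondata`` metadata (see the Encryption Notes below). The
following is a minimal sketch of that key wrapping only, assuming the
``cryptography`` package is installed; the key path is illustrative and this is
not part of the blobxfer command line:

::

    import os
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import hashes, serialization
    from cryptography.hazmat.primitives.asymmetric import padding

    # load a PEM-encoded RSA public key (the path is illustrative)
    with open('mypublickey.pem', 'rb') as keyfile:
        pubkey = serialization.load_pem_public_key(
            keyfile.read(), backend=default_backend())

    # generate per-blob keys: an AES-256 content key and an
    # HMAC-SHA256 signing key
    symkey = os.urandom(32)
    signkey = os.urandom(32)

    # wrap each key with RSAES-OAEP; the wrapped keys are what end up in
    # the blob's encryptiondata metadata alongside the IV and MAC
    oaep = padding.OAEP(
        mgf=padding.MGF1(algorithm=hashes.SHA1()),
        algorithm=hashes.SHA1(),
        label=None)
    wrapped_symkey = pubkey.encrypt(symkey, oaep)
    wrapped_signkey = pubkey.encrypt(signkey, oaep)
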
- -To transfer to an Azure Files share, specify the ``--fileshare`` option and -specify the share name as the second positional argument. - -:: - - blobxfer mystorageacct myshare localfiles --fileshare --upload - -The above example would upload all files in the ``localfiles`` directory to -the share named ``myshare``. Encryption/decryption options are compatible with -Azure Files as the destination or source. Please refer to this `MSDN article`_ -for features not supported by the Azure File Service. - -.. _MSDN article: https://msdn.microsoft.com/en-us/library/azure/dn744326.aspx - -Docker Usage ------------- - -An example execution for uploading the host path ``/example/host/path`` -to a storage container named ``container0`` would be: - -:: - - docker run --rm -t -v /example/host/path:/path/in/container alfpark/blobxfer mystorageacct container0 /path/in/container --upload - -Note that docker volume mount mappings must be crafted with care to ensure -consistency with directory depth between the host and the container. -Optionally, you can utilize the ``--strip-components`` flag to remove leading -path components as desired. - -General Notes -------------- - -- If the pyOpenSSL package is present, urllib3/requests may use this package - (as discussed in the Performance Notes below), which may result in - exceptions being thrown that are not normalized by urllib3. This may - result in exceptions that should be retried, but are not. It is recommended - to upgrade your Python where pyOpenSSL is not required for fully validating - peers and such that blobxfer can operate without pyOpenSSL in a secure - fashion. You can also run blobxfer via Docker or in a virtualenv - environment without pyOpenSSL. -- blobxfer does not take any leases on blobs or containers. It is up to - the user to ensure that blobs are not modified while download/uploads - are being performed. -- No validation is performed regarding container and file naming and length - restrictions. -- blobxfer will attempt to download from blob storage as-is. If the source - filename is incompatible with the destination operating system, then - failure may result. -- When using SAS, the SAS key must be a container- or share-level SAS if - performing recursive directory upload or container/file share download. -- If uploading via service-level SAS keys, the container or file share must - already be created in Azure storage prior to upload. Account-level SAS keys - with the signed resource type of ``c`` or container-level permission will - allow conatiner or file share creation. -- For non-SAS requests, timeouts may not be properly honored due to - limitations of the Azure Python SDK. -- By default, files with matching MD5 checksums will be skipped for both - download (if MD5 information is present on the blob) and upload. Specify - ``--no-skiponmatch`` to disable this functionality. -- When uploading files as page blobs, the content is page boundary - byte-aligned. The MD5 for the blob is computed using the final aligned - data if the source is not page boundary byte-aligned. This enables these - page blobs or files to be skipped during subsequent download or upload by - default (i.e., ``--no-skiponmatch`` parameter is not specified). -- If ``--delete`` is specified, any remote files found that have no - corresponding local file in directory upload mode will be deleted. Deletion - occurs prior to any transfers, analogous to the delete-before rsync option. 
- Please note that this parameter will interact with ``--include`` and any - file not included from the include pattern will be deleted. -- ``--include`` has no effect when specifying a single file to upload or - blob to download. When specifying ``--include`` on container download, - the pattern will be applied to the blob name without the container name. - Globbing of wildcards must be disabled such that the script can read - the include pattern without the shell expanding the wildcards, if specified. -- Empty directories are not created locally when downloading from an Azure - file share which has empty directories. -- Empty directories are not deleted if ``--delete`` is specified and no - files remain in the directory on the Azure file share. - -Performance Notes ------------------ - -- Most likely, you will need to tweak the ``--numworkers`` argument that best - suits your environment. The default is the number of CPUs on the running - machine multiplied by 3 (except when transferring to/from file shares). - Increasing this number (or even using the default) may not provide the - optimal balance between concurrency and your network conditions. - Additionally, this number may not work properly if you are attempting to - run multiple blobxfer sessions in parallel from one machine or IP address. - Futhermore, this number may be defaulted to be set too high if encryption - is enabled and the machine cannot handle processing multiple threads in - parallel. -- Computing file MD5 can be time consuming for large files. If integrity - checking or rsync-like capability is not required, specify - ``--no-computefilemd5`` to disable MD5 computation for files. -- File share performance can be "slow" or become a bottleneck, especially for - file shares containing thousands of files as multiple REST calls must be - performed for each file. Currently, a single file share has a limit of up - to 60 MB/s and 1000 8KB IOPS. Please refer to the - `Azure Storage Scalability and Performance Targets`_ for performance targets - and limits regarding Azure Storage Blobs and Files. If scalable high - performance is required, consider using blob storage or multiple file - shares. -- Using SAS keys may provide the best performance as the script bypasses - the Azure Storage Python SDK and uses requests/urllib3 directly with - Azure Storage endpoints. Transfers to/from Azure Files will always use - the Azure Storage Python SDK even with SAS keys. -- As of requests 2.6.0 and Python versions < 2.7.9 (i.e., interpreter found - on default Ubuntu 14.04 installations), if certain packages are installed, - as those found in ``requests[security]`` then the underlying ``urllib3`` - package will utilize the ``ndg-httpsclient`` package which will use - `pyOpenSSL`_. This will ensure the peers are `fully validated`_. However, - this incurs a rather larger performance penalty. If you understand the - potential security risks for disabling this behavior due to high performance - requirements, you can either remove ``ndg-httpsclient`` or use the script - in a ``virtualenv`` environment without the ``ndg-httpsclient`` package. - Python versions >= 2.7.9 are not affected by this issue. These warnings can - be suppressed using ``--disable-urllib-warnings``, but is not recommended - unless you understand the security implications. - -.. _Azure Storage Scalability and Performance Targets: https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/ -.. 
_pyOpenSSL: https://urllib3.readthedocs.org/en/latest/security.html#pyopenssl -.. _fully validated: https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning - - -Encryption Notes ----------------- - -- All required information regarding the encryption process is stored on - each blob's ``encryptiondata`` and ``encryptiondata_authentication`` - metadata. These metadata entries are used on download to configure the proper - download and parameters for the decryption process as well as to authenticate - the encryption. Encryption metadata set by blobxfer (or the Azure Storage - .NET/Java client library) should not be modified or blobs/files may be - unrecoverable. -- Local files can be encrypted by blobxfer and stored in Azure Files and, - correspondingly, remote files on Azure File shares can be decrypted by - blobxfer as long as the metdata portions remain in-tact. -- Keys for AES256 block cipher are generated on a per-blob/file basis. These - keys are encrypted using RSAES-OAEP. -- MD5 for both the pre-encrypted and encrypted version of the file is stored - in blob/file metadata. Rsync-like synchronization is still supported - transparently with encrypted blobs/files. -- Whole file MD5 checks are skipped if a message authentication code is found - to validate the integrity of the encrypted data. -- Attempting to upload the same file as an encrypted blob with a different RSA - key or under a different encryption mode will not occur if the file content - MD5 is the same. This behavior can be overridden by including the option - ``--no-skiponmatch``. -- If one wishes to apply encryption to a blob/file already uploaded to Azure - Storage that has not changed, the upload will not occur since the underlying - file content MD5 has not changed; this behavior can be overriden by - including the option ``--no-skiponmatch``. -- Encryption is only applied to block blobs (or fileshare files). Encrypted - page blobs appear to be of minimal value stored in Azure Storage via - blobxfer. Thus, if uploading VHDs while enabling encryption in the script, - do not enable the option ``--pageblob``. ``--autovhd`` will continue to work - transparently where vhd files will be uploaded as page blobs in unencrypted - form while other files will be uploaded as encrypted block blobs. Note that - using ``--autovhd`` with encryption will force set the max chunk size to - 4 MiB for non-encrypted vhd files. -- Downloading encrypted blobs/files may not fully preallocate each file due to - padding. Script failure can result during transfer if there is insufficient - disk space. -- Zero-byte (empty) files are not encrypted. - -Change Log ----------- - -See the `CHANGELOG.md`_ file. - -.. _CHANGELOG.md: https://github.com/Azure/blobxfer/blob/master/CHANGELOG.md - ----- - -This project has adopted the -`Microsoft Open Source Code of Conduct `__. -For more information see the -`Code of Conduct FAQ `__ -or contact `opencode@microsoft.com `__ with any -additional questions or comments. diff --git a/blobxfer.py b/blobxfer.py deleted file mode 100755 index 5cadcba..0000000 --- a/blobxfer.py +++ /dev/null @@ -1,3033 +0,0 @@ -#!/usr/bin/env python - -# blobxfer Tool -# -# Copyright (c) Microsoft Corporation -# -# All rights reserved. 
-# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -""" -Data transfer tool for Azure blob and file storage - -See notes in the README.rst file. - -TODO list: -- convert from threading to multiprocessing -- move instruction queue data to class -- migrate connections with sas to azure-storage -""" - -# pylint: disable=R0913,R0914 - -# stdlib imports -from __future__ import print_function -import argparse -import base64 -import errno -import fnmatch -import hashlib -import hmac -import json -import mimetypes -import multiprocessing -import os -import platform -# pylint: disable=F0401 -try: - import queue -except ImportError: # pragma: no cover - import Queue as queue -# pylint: enable=F0401 -import socket -import sys -import threading -import time -import traceback -try: - from urllib.parse import quote as urlquote -except ImportError: # pramga: no cover - from urllib import quote as urlquote -import xml.etree.ElementTree as ET -# non-stdlib imports -import azure.common -try: - import azure.servicemanagement -except ImportError: # pragma: no cover - pass -import azure.storage.blob -import azure.storage.file -try: - import cryptography.hazmat.backends - import cryptography.hazmat.primitives.asymmetric.padding - import cryptography.hazmat.primitives.asymmetric.rsa - import cryptography.hazmat.primitives.ciphers - import cryptography.hazmat.primitives.ciphers.algorithms - import cryptography.hazmat.primitives.ciphers.modes - import cryptography.hazmat.primitives.constant_time - import cryptography.hazmat.primitives.hashes - import cryptography.hazmat.primitives.padding - import cryptography.hazmat.primitives.serialization -except ImportError: # pragma: no cover - pass -import requests - -# remap keywords for Python3 -# pylint: disable=W0622,C0103 -try: - xrange -except NameError: # pragma: no cover - xrange = range -try: - long -except NameError: # pragma: no cover - long = int -# pylint: enable=W0622,C0103 - -# global defines -_SCRIPT_VERSION = '0.12.1' -_PY2 = sys.version_info.major == 2 -_DEFAULT_MAX_STORAGEACCOUNT_WORKERS = multiprocessing.cpu_count() * 3 -_MAX_BLOB_CHUNK_SIZE_BYTES = 4194304 -_EMPTY_MAX_PAGE_SIZE_MD5 = 'tc+p1sj+vWGPkawoQ9UKHA==' -_MAX_LISTBLOBS_RESULTS = 1000 -_PAGEBLOB_BOUNDARY = 512 -_DEFAULT_STORAGE_ENDPOINT = 'core.windows.net' -_DEFAULT_MANAGEMENT_ENDPOINT = 'management.core.windows.net' -_ENVVAR_STORAGEACCOUNTKEY = 'BLOBXFER_STORAGEACCOUNTKEY' -_ENVVAR_SASKEY = 'BLOBXFER_SASKEY' -_ENVVAR_RSAKEYPASSPHRASE = 'BLOBXFER_RSAKEYPASSPHRASE' -# 
encryption defines -_AES256_KEYLENGTH_BYTES = 32 -_AES256_BLOCKSIZE_BYTES = 16 -_HMACSHA256_DIGESTSIZE_BYTES = 32 -_AES256CBC_HMACSHA256_OVERHEAD_BYTES = _AES256_BLOCKSIZE_BYTES + \ - _HMACSHA256_DIGESTSIZE_BYTES -_ENCRYPTION_MODE_FULLBLOB = 'FullBlob' -_ENCRYPTION_MODE_CHUNKEDBLOB = 'ChunkedBlob' -_DEFAULT_ENCRYPTION_MODE = _ENCRYPTION_MODE_FULLBLOB -_ENCRYPTION_PROTOCOL_VERSION = '1.0' -_ENCRYPTION_ALGORITHM = 'AES_CBC_256' -_ENCRYPTION_AUTH_ALGORITHM = 'HMAC-SHA256' -_ENCRYPTION_CHUNKSTRUCTURE = 'IV || EncryptedData || Signature' -_ENCRYPTION_ENCRYPTED_KEY_SCHEME = 'RSA-OAEP' -_ENCRYPTION_METADATA_NAME = 'encryptiondata' -_ENCRYPTION_METADATA_MODE = 'EncryptionMode' -_ENCRYPTION_METADATA_ALGORITHM = 'Algorithm' -_ENCRYPTION_METADATA_MAC = 'MessageAuthenticationCode' -_ENCRYPTION_METADATA_LAYOUT = 'EncryptedDataLayout' -_ENCRYPTION_METADATA_CHUNKOFFSETS = 'ChunkByteOffsets' -_ENCRYPTION_METADATA_CHUNKSTRUCTURE = 'ChunkStructure' -_ENCRYPTION_METADATA_AGENT = 'EncryptionAgent' -_ENCRYPTION_METADATA_PROTOCOL = 'Protocol' -_ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM = 'EncryptionAlgorithm' -_ENCRYPTION_METADATA_INTEGRITY_AUTH = 'EncryptionAuthentication' -_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY = 'WrappedContentKey' -_ENCRYPTION_METADATA_ENCRYPTEDKEY = 'EncryptedKey' -_ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY = 'EncryptedAuthenticationKey' -_ENCRYPTION_METADATA_CONTENT_IV = 'ContentEncryptionIV' -_ENCRYPTION_METADATA_KEYID = 'KeyId' -_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS = 'BlobxferExtensions' -_ENCRYPTION_METADATA_PREENCRYPTED_MD5 = 'PreEncryptedContentMD5' -_ENCRYPTION_METADATA_AUTH_NAME = 'encryptiondata_authentication' -_ENCRYPTION_METADATA_AUTH_METAAUTH = 'EncryptionMetadataAuthentication' -_ENCRYPTION_METADATA_AUTH_ENCODING = 'Encoding' -_ENCRYPTION_METADATA_AUTH_ENCODING_TYPE = 'UTF-8' - - -class EncryptionMetadataJson(object): - """Class for handling encryption metadata json""" - def __init__( - self, args, symkey, signkey, iv, encdata_signature, - preencrypted_md5, rsakeyid=None): - """Ctor for EncryptionMetadataJson - Parameters: - args - program arguments - symkey - symmetric key - signkey - signing key - iv - initialization vector - encdata_signature - encrypted data signature (MAC) - preencrypted_md5 - pre-encrypted md5 hash - rsakeyid - symmetric key id - Returns: - Nothing - Raises: - Nothing - """ - self.encmode = args.encmode - self.rsaprivatekey = args.rsaprivatekey - self.rsapublickey = args.rsapublickey - self.chunksizebytes = args.chunksizebytes - self.symkey = symkey - self.signkey = signkey - if rsakeyid is None: - self.rsakeyid = 'private:key1' - else: - self.rsakeyid = rsakeyid - self.iv = iv - self.hmac = encdata_signature - self.md5 = preencrypted_md5 - - def construct_metadata_json(self): - """Constructs encryptiondata metadata - Paramters: - None - Returns: - dict of encryptiondata and encryptiondata_authentiation json - Raises: - Nothing - """ - encsymkey, _ = rsa_encrypt_key( - self.rsaprivatekey, self.rsapublickey, self.symkey) - encsignkey, _ = rsa_encrypt_key( - self.rsaprivatekey, self.rsapublickey, self.signkey) - encjson = { - _ENCRYPTION_METADATA_MODE: self.encmode, - _ENCRYPTION_METADATA_WRAPPEDCONTENTKEY: { - _ENCRYPTION_METADATA_KEYID: self.rsakeyid, - _ENCRYPTION_METADATA_ENCRYPTEDKEY: encsymkey, - _ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY: encsignkey, - _ENCRYPTION_METADATA_ALGORITHM: - _ENCRYPTION_ENCRYPTED_KEY_SCHEME, - }, - _ENCRYPTION_METADATA_AGENT: { - _ENCRYPTION_METADATA_PROTOCOL: _ENCRYPTION_PROTOCOL_VERSION, - 
_ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM: - _ENCRYPTION_ALGORITHM - }, - _ENCRYPTION_METADATA_INTEGRITY_AUTH: { - _ENCRYPTION_METADATA_ALGORITHM: - _ENCRYPTION_AUTH_ALGORITHM, - }, - 'KeyWrappingMetadata': {}, - } - if self.md5 is not None: - encjson[_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS] = { - _ENCRYPTION_METADATA_PREENCRYPTED_MD5: self.md5 - } - if self.encmode == _ENCRYPTION_MODE_FULLBLOB: - encjson[_ENCRYPTION_METADATA_CONTENT_IV] = base64encode(self.iv) - encjson[_ENCRYPTION_METADATA_INTEGRITY_AUTH][ - _ENCRYPTION_METADATA_MAC] = base64encode(self.hmac) - elif self.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - encjson[_ENCRYPTION_METADATA_LAYOUT] = {} - encjson[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKOFFSETS] = \ - self.chunksizebytes + _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - encjson[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKSTRUCTURE] = \ - _ENCRYPTION_CHUNKSTRUCTURE - else: - raise RuntimeError( - 'Unknown encryption mode: {}'.format(self.encmode)) - bencjson = json.dumps( - encjson, sort_keys=True, ensure_ascii=False).encode( - _ENCRYPTION_METADATA_AUTH_ENCODING_TYPE) - encjson = {_ENCRYPTION_METADATA_NAME: - json.dumps(encjson, sort_keys=True)} - # compute MAC over encjson - hmacsha256 = hmac.new(self.signkey, digestmod=hashlib.sha256) - hmacsha256.update(bencjson) - authjson = { - _ENCRYPTION_METADATA_AUTH_METAAUTH: { - _ENCRYPTION_METADATA_ALGORITHM: _ENCRYPTION_AUTH_ALGORITHM, - _ENCRYPTION_METADATA_AUTH_ENCODING: - _ENCRYPTION_METADATA_AUTH_ENCODING_TYPE, - _ENCRYPTION_METADATA_MAC: base64encode(hmacsha256.digest()), - } - } - encjson[_ENCRYPTION_METADATA_AUTH_NAME] = json.dumps( - authjson, sort_keys=True) - return encjson - - def parse_metadata_json( - self, blobname, rsaprivatekey, rsapublickey, mddict): - """Parses a meta data dictionary containing the encryptiondata - metadata - Parameters: - blobname - name of blob - rsaprivatekey - RSA private key - rsapublickey - RSA public key - mddict - metadata dictionary - Returns: - Nothing - Raises: - RuntimeError if encryptiondata metadata contains invalid or - unknown fields - """ - if _ENCRYPTION_METADATA_NAME not in mddict: - return - # json parse internal dict - meta = json.loads(mddict[_ENCRYPTION_METADATA_NAME]) - # populate preencryption md5 - if (_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS in meta and - _ENCRYPTION_METADATA_PREENCRYPTED_MD5 in meta[ - _ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS]): - self.md5 = meta[_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS][ - _ENCRYPTION_METADATA_PREENCRYPTED_MD5] - else: - self.md5 = None - # if RSA key is not present return - if rsaprivatekey is None and rsapublickey is None: - return - # check for required metadata fields - if (_ENCRYPTION_METADATA_MODE not in meta or - _ENCRYPTION_METADATA_AGENT not in meta): - return - # populate encryption mode - self.encmode = meta[_ENCRYPTION_METADATA_MODE] - # validate known encryption metadata is set to proper values - if self.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - chunkstructure = meta[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKSTRUCTURE] - if chunkstructure != _ENCRYPTION_CHUNKSTRUCTURE: - raise RuntimeError( - '{}: unknown encrypted chunk structure {}'.format( - blobname, chunkstructure)) - protocol = meta[_ENCRYPTION_METADATA_AGENT][ - _ENCRYPTION_METADATA_PROTOCOL] - if protocol != _ENCRYPTION_PROTOCOL_VERSION: - raise RuntimeError('{}: unknown encryption protocol: {}'.format( - blobname, protocol)) - blockcipher = meta[_ENCRYPTION_METADATA_AGENT][ - 
_ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM] - if blockcipher != _ENCRYPTION_ALGORITHM: - raise RuntimeError('{}: unknown block cipher: {}'.format( - blobname, blockcipher)) - if _ENCRYPTION_METADATA_INTEGRITY_AUTH in meta: - intauth = meta[_ENCRYPTION_METADATA_INTEGRITY_AUTH][ - _ENCRYPTION_METADATA_ALGORITHM] - if intauth != _ENCRYPTION_AUTH_ALGORITHM: - raise RuntimeError( - '{}: unknown integrity/auth method: {}'.format( - blobname, intauth)) - symkeyalg = meta[_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - _ENCRYPTION_METADATA_ALGORITHM] - if symkeyalg != _ENCRYPTION_ENCRYPTED_KEY_SCHEME: - raise RuntimeError('{}: unknown key encryption scheme: {}'.format( - blobname, symkeyalg)) - # populate iv and hmac - if self.encmode == _ENCRYPTION_MODE_FULLBLOB: - self.iv = base64.b64decode(meta[_ENCRYPTION_METADATA_CONTENT_IV]) - # don't base64 decode hmac - if _ENCRYPTION_METADATA_INTEGRITY_AUTH in meta: - self.hmac = meta[_ENCRYPTION_METADATA_INTEGRITY_AUTH][ - _ENCRYPTION_METADATA_MAC] - else: - self.hmac = None - # populate chunksize - if self.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - self.chunksizebytes = long( - meta[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKOFFSETS]) - # if RSA key is a public key, stop here as keys cannot be decrypted - if rsaprivatekey is None: - return - # decrypt symmetric key - self.symkey = rsa_decrypt_key( - rsaprivatekey, - meta[_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - _ENCRYPTION_METADATA_ENCRYPTEDKEY], None) - # decrypt signing key, if it exists - if _ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY in meta[ - _ENCRYPTION_METADATA_WRAPPEDCONTENTKEY]: - self.signkey = rsa_decrypt_key( - rsaprivatekey, - meta[_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - _ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY], None) - else: - self.signkey = None - # validate encryptiondata metadata using the signing key - if (self.signkey is not None and - _ENCRYPTION_METADATA_AUTH_NAME in mddict): - authmeta = json.loads(mddict[_ENCRYPTION_METADATA_AUTH_NAME]) - if _ENCRYPTION_METADATA_AUTH_METAAUTH not in authmeta: - raise RuntimeError( - '{}: encryption metadata auth block not found'.format( - blobname)) - if _ENCRYPTION_METADATA_AUTH_ENCODING not in authmeta[ - _ENCRYPTION_METADATA_AUTH_METAAUTH]: - raise RuntimeError( - '{}: encryption metadata auth encoding not found'.format( - blobname)) - intauth = authmeta[_ENCRYPTION_METADATA_AUTH_METAAUTH][ - _ENCRYPTION_METADATA_ALGORITHM] - if intauth != _ENCRYPTION_AUTH_ALGORITHM: - raise RuntimeError( - '{}: unknown integrity/auth method: {}'.format( - blobname, intauth)) - authhmac = base64.b64decode( - authmeta[_ENCRYPTION_METADATA_AUTH_METAAUTH][ - _ENCRYPTION_METADATA_MAC]) - bmeta = mddict[_ENCRYPTION_METADATA_NAME].encode( - authmeta[_ENCRYPTION_METADATA_AUTH_METAAUTH][ - _ENCRYPTION_METADATA_AUTH_ENCODING]) - hmacsha256 = hmac.new(self.signkey, digestmod=hashlib.sha256) - hmacsha256.update(bmeta) - if hmacsha256.digest() != authhmac: - raise RuntimeError( - '{}: encryption metadata authentication failed'.format( - blobname)) - - -class PqTupleSort(tuple): - """Priority Queue tuple sorter: handles priority collisions. 
- 0th item in the tuple is the priority number.""" - def __lt__(self, rhs): - return self[0] < rhs[0] - - def __gt__(self, rhs): - return self[0] > rhs[0] - - def __le__(self, rhs): - return self[0] <= rhs[0] - - def __ge__(self, rhs): - return self[0] >= rhs[0] - - -class SasBlobList(object): - """Sas Blob listing object""" - def __init__(self): - """Ctor for SasBlobList""" - self.blobs = [] - self.next_marker = None - - def __iter__(self): - """Iterator""" - return iter(self.blobs) - - def __len__(self): - """Length""" - return len(self.blobs) - - def __getitem__(self, index): - """Accessor""" - return self.blobs[index] - - def add_blob(self, name, content_length, content_md5, blobtype, mddict): - """Adds a blob to the list - Parameters: - name - blob name - content_length - content length - content_md5 - content md5 - blobtype - blob type - mddict - metadata dictionary - Returns: - Nothing - Raises: - Nothing - """ - obj = type('bloblistobject', (object,), {}) - obj.name = name - obj.metadata = mddict - obj.properties = type('properties', (object,), {}) - obj.properties.content_length = content_length - obj.properties.content_settings = azure.storage.blob.ContentSettings() - if content_md5 is not None and len(content_md5) > 0: - obj.properties.content_settings.content_md5 = content_md5 - obj.properties.blobtype = blobtype - self.blobs.append(obj) - - def set_next_marker(self, marker): - """Set the continuation token - Parameters: - marker - next marker - Returns: - Nothing - Raises: - Nothing - """ - if marker is not None and len(marker) > 0: - self.next_marker = marker - - -class SasBlobService(object): - """BlobService supporting SAS for functions used in the Python SDK. - create_container method does not exist because it is not a supported - operation under SAS""" - def __init__(self, endpoint, saskey, timeout): - """SAS Blob Service ctor - Parameters: - endpoint - storage endpoint - saskey - saskey - timeout - timeout - Returns: - Nothing - Raises: - Nothing - """ - self.endpoint = endpoint - # normalize sas key - if saskey[0] != '?': - self.saskey = '?' 
+ saskey - else: - self.saskey = saskey - self.timeout = timeout - - def _parse_blob_list_xml(self, content): - """Parse blob list in xml format to an attribute-based object - Parameters: - content - http response content in xml - Returns: - attribute-based object - Raises: - No special exception handling - """ - result = SasBlobList() - root = ET.fromstring(content) - blobs = root.find('Blobs') - for blob in blobs.iter('Blob'): - name = blob.find('Name').text - props = blob.find('Properties') - cl = long(props.find('Content-Length').text) - md5 = props.find('Content-MD5').text - bt = props.find('BlobType').text - metadata = blob.find('Metadata') - mddict = {} - for md in metadata: - mddict[md.tag] = md.text - result.add_blob(name, cl, md5, bt, mddict) - try: - result.set_next_marker(root.find('NextMarker').text) - except Exception: - pass - return result - - def list_blobs( - self, container_name, marker=None, - max_results=_MAX_LISTBLOBS_RESULTS, include=None): - """List blobs in container - Parameters: - container_name - container name - marker - marker - max_results - max results - include - `azure.storage.models.Include` include object - Returns: - List of blobs - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - saskey=self.saskey) - reqparams = { - 'restype': 'container', - 'comp': 'list', - 'maxresults': str(max_results)} - if marker is not None: - reqparams['marker'] = marker - if include is not None and include.metadata: - reqparams['include'] = 'metadata' - response = azure_request( - requests.get, url=url, params=reqparams, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError( - 'incorrect status code returned for list_blobs: {}'.format( - response.status_code)) - return self._parse_blob_list_xml(response.content) - - def _get_blob(self, container_name, blob_name, start_range, end_range): - """Get blob - Parameters: - container_name - container name - blob_name - name of blob - start_range - start range of bytes - end_range - end range of bytes - Returns: - `azure.storage.blob.Blob` object - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = { - 'x-ms-range': 'bytes={}-{}'.format(start_range, end_range) - } - response = azure_request( - requests.get, url=url, headers=reqheaders, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200 and response.status_code != 206: - raise IOError( - 'incorrect status code returned for get_blob: {}'.format( - response.status_code)) - return azure.storage.blob.Blob(content=response.content) - - def get_blob_properties(self, container_name, blob_name): - """Get blob properties - Parameters: - container_name - container name - blob_name - name of blob - Returns: - `azure.storage.blob.Blob` object - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - response = azure_request( - requests.head, url=url, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError('incorrect status code returned for ' - 'get_blob_properties: {}'.format( - response.status_code)) - # parse response 
headers into blob object - blob = azure.storage.blob.Blob() - blob.propertes = azure.storage.blob.BlobProperties() - blob.properties.content_length = \ - long(response.headers['content-length']) - blob.properties.content_settings = azure.storage.blob.ContentSettings() - if 'content-md5' in response.headers: - blob.properties.content_settings.content_md5 = \ - response.headers['content-md5'] - # read meta values, all meta values are lowercased - mddict = {} - for res in response.headers: - if res.startswith('x-ms-meta-'): - mddict[res[10:]] = response.headers[res] - blob.metadata = mddict - return blob - - def set_blob_metadata( - self, container_name, blob_name, metadata): - """Set blob metadata. Clearing is not supported. - Parameters: - container_name - container name - blob_name - name of blob - metadata - blob metadata dictionary - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - if metadata is None or len(metadata) == 0: - return - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqparams = {'comp': 'metadata'} - reqheaders = {} - for key in metadata: - reqheaders['x-ms-meta-' + key] = metadata[key] - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError( - 'incorrect status code returned for ' - 'set_blob_metadata: {}'.format(response.status_code)) - - def create_blob( - self, container_name, blob_name, content_length, content_settings): - """Create blob for initializing page blobs - Parameters: - container_name - container name - blob_name - name of blob - content_length - content length aligned to 512-byte boundary - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - response content - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = { - 'x-ms-blob-type': 'PageBlob', - 'x-ms-blob-content-length': str(content_length), - } - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - if content_settings.content_type is not None: - reqheaders['x-ms-blob-content-type'] = \ - content_settings.content_type - response = azure_request( - requests.put, url=url, headers=reqheaders, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for create_blob: {}'.format( - response.status_code)) - return response.content - - def _put_blob( - self, container_name, blob_name, blob, content_settings): - """Put blob for creating/updated block blobs - Parameters: - container_name - container name - blob_name - name of blob - blob - blob content - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - response content - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = {'x-ms-blob-type': 'BlockBlob'} - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - if 
content_settings.content_type is not None: - reqheaders['x-ms-blob-content-type'] = \ - content_settings.content_type - response = azure_request( - requests.put, url=url, headers=reqheaders, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for put_blob: {}'.format( - response.status_code)) - return response.content - - def update_page( - self, container_name, blob_name, page, start_range, end_range, - validate_content=False, content_md5=None): - """Put page for page blob. This API differs from the Python storage - sdk to maintain efficiency for block md5 computation. - Parameters: - container_name - container name - blob_name - name of blob - page - page data - start_range - start range of bytes - end_range - end range of bytes - validate_content - validate content - content_md5 - md5 hash for page data - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = { - 'x-ms-range': 'bytes={}-{}'.format(start_range, end_range), - 'x-ms-page-write': 'update'} - if validate_content and content_md5 is not None: - reqheaders['Content-MD5'] = content_md5 - reqparams = {'comp': 'page'} - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - data=page, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for update_page: {}'.format( - response.status_code)) - - def put_block( - self, container_name, blob_name, block, block_id, - validate_content=False): - """Put block for blob - Parameters: - container_name - container name - blob_name - name of blob - block - block data - block_id - block id - validate_content - validate content - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - # compute block md5 - if validate_content: - reqheaders = {'Content-MD5': compute_md5_for_data_asbase64(block)} - else: - reqheaders = None - reqparams = {'comp': 'block', 'blockid': block_id} - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - data=block, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for put_block: {}'.format( - response.status_code)) - - def put_block_list( - self, container_name, blob_name, block_list, - content_settings): - """Put block list for blob - Parameters: - container_name - container name - blob_name - name of blob - block_list - list of `azure.storage.blob.BlobBlock` - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = {} - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - if content_settings.content_type is not None: - reqheaders['x-ms-blob-content-type'] = \ - content_settings.content_type - reqparams = {'comp': 
'blocklist'} - body = [''] - for block in block_list: - body.append('{}'.format(block.id)) - body.append('') - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - data=''.join(body), timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for put_block_list: {}'.format( - response.status_code)) - - def set_blob_properties( - self, container_name, blob_name, content_settings): - """Sets blob properties (MD5 only) - Parameters: - container_name - container name - blob_name - name of blob - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = {} - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - reqparams = {'comp': 'properties'} - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError('incorrect status code returned for ' - 'set_blob_properties: {}'.format( - response.status_code)) - - def delete_blob( - self, container_name, blob_name): - """Deletes a blob - Parameters: - container_name - container name - blob_name - name of blob - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - response = azure_request( - requests.delete, url=url, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 202: - raise IOError( - 'incorrect status code returned for delete_blob: {}'.format( - response.status_code)) - - def create_container( - self, container_name, fail_on_exist=False): - """Create a container - Parameters: - container_name - container name - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - saskey=self.saskey) - reqparams = {'restype': 'container'} - response = azure_request( - requests.put, url=url, params=reqparams, timeout=self.timeout) - if response.status_code != 201: - if response.status_code == 409: - if fail_on_exist: - response.raise_for_status() - else: - return - raise IOError('incorrect status code returned for ' - 'create_container: {}'.format( - response.status_code)) - - -class StorageChunkWorker(threading.Thread): - """Chunk worker for a storage entity""" - def __init__( - self, exc, s_in_queue, s_out_queue, args, xfertoazure, - blob_service, file_service): - """Storage Chunk worker Thread ctor - Parameters: - exc - exception list - s_in_queue - storage in queue - s_out_queue - storage out queue - args - program arguments - xfertoazure - xfer to azure (direction) - blob_service - blob service - file_service - file service - Returns: - Nothing - Raises: - Nothing - """ - threading.Thread.__init__(self) - self.terminate = False - self._exc = exc - self._in_queue = s_in_queue - self._out_queue = s_out_queue - self.args = args - self.xfertoazure = xfertoazure - self.blob_service = blob_service - self.file_service = 
file_service - - def run(self): - """Thread code - Parameters: - Nothing - Returns: - Nothing - Raises: - Nothing - """ - while not self.terminate: - try: - pri, (localresource, container, remoteresource, blockid, - offset, bytestoxfer, encparam, flock, filedesc) = \ - self._in_queue.get_nowait() - except queue.Empty: - break - # detect termination early and break if necessary - if self.terminate: - break - try: - if self.xfertoazure: - # if iv is not ready for this chunk, re-add back to queue - if (not as_page_blob(self.args, localresource) and - ((self.args.rsaprivatekey is not None or - self.args.rsapublickey is not None) and - self.args.encmode == _ENCRYPTION_MODE_FULLBLOB)): - _iblockid = int(blockid) - if _iblockid not in encparam[2]: - self._in_queue.put( - PqTupleSort(( - pri, - (localresource, container, remoteresource, - blockid, offset, bytestoxfer, encparam, - flock, filedesc)))) - continue - # upload block/page - self.put_storage_data( - localresource, container, remoteresource, blockid, - offset, bytestoxfer, encparam, flock, filedesc) - else: - # download range - self.get_storage_range( - localresource, container, remoteresource, blockid, - offset, bytestoxfer, encparam, flock, filedesc) - # pylint: disable=W0703 - except Exception: - # pylint: enable=W0703 - self._exc.append(traceback.format_exc()) - self._out_queue.put((localresource, encparam)) - if len(self._exc) > 0: - break - - def put_storage_data( - self, localresource, container, remoteresource, blockid, offset, - bytestoxfer, encparam, flock, filedesc): - """Puts data (blob, page or file bits) into Azure storage - Parameters: - localresource - name of local resource - container - blob container - remoteresource - name of remote resource - blockid - block id (ignored for page blobs) - offset - file offset - bytestoxfer - number of bytes to xfer - encparam - encryption metadata: (symkey, signkey, ivmap, pad) - flock - file lock - filedesc - file handle - Returns: - Nothing - Raises: - IOError if file cannot be read - """ - # if bytestoxfer is zero, then we're transferring a zero-byte - # file, use put blob instead of page/block ops - if bytestoxfer == 0: - contentmd5 = compute_md5_for_data_asbase64(b'') - if as_page_blob(self.args, localresource): - azure_request( - self.blob_service[1].create_blob, container_name=container, - blob_name=remoteresource, content_length=bytestoxfer, - content_settings=azure.storage.blob.ContentSettings( - content_type=get_mime_type(localresource), - content_md5=contentmd5)) - elif self.args.fileshare: - fsfile = split_fileshare_path_into_parts(remoteresource) - azure_request( - self.file_service.create_file, share_name=container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=bytestoxfer, - content_settings=azure.storage.file.ContentSettings( - content_type=get_mime_type(localresource), - content_md5=contentmd5)) - else: - azure_request( - self.blob_service[0]._put_blob, container_name=container, - blob_name=remoteresource, blob=None, - content_settings=azure.storage.blob.ContentSettings( - content_type=get_mime_type(localresource), - content_md5=contentmd5)) - return - # read the file at specified offset, must take lock - data = None - with flock: - closefd = False - if not filedesc: - filedesc = open(localresource, 'rb') - closefd = True - filedesc.seek(offset, 0) - data = filedesc.read(bytestoxfer) - if closefd: - filedesc.close() - if not data: - raise IOError('could not read {}: {} -> {}'.format( - localresource, offset, offset + bytestoxfer)) - # issue REST put 
- if as_page_blob(self.args, localresource): - aligned = page_align_content_length(bytestoxfer) - # fill data to boundary - if aligned != bytestoxfer: - data = data.ljust(aligned, b'\0') - # compute page md5 - contentmd5 = compute_md5_for_data_asbase64(data) - # check if this page is empty - if contentmd5 == _EMPTY_MAX_PAGE_SIZE_MD5: - return - elif len(data) != _MAX_BLOB_CHUNK_SIZE_BYTES: - data_chk = b'\0' * len(data) - data_chk_md5 = compute_md5_for_data_asbase64(data_chk) - del data_chk - if data_chk_md5 == contentmd5: - return - del data_chk_md5 - # upload page range - if self.args.saskey: - azure_request( - self.blob_service[1].update_page, container_name=container, - blob_name=remoteresource, page=data, start_range=offset, - end_range=offset + aligned - 1, - validate_content=self.args.computeblockmd5, - content_md5=contentmd5, timeout=self.args.timeout) - else: - azure_request( - self.blob_service[1].update_page, container_name=container, - blob_name=remoteresource, page=data, start_range=offset, - end_range=offset + aligned - 1, - validate_content=self.args.computeblockmd5, - timeout=self.args.timeout) - else: - # encrypt block if required - if (encparam is not None and - (self.args.rsaprivatekey is not None or - self.args.rsapublickey is not None)): - symkey = encparam[0] - signkey = encparam[1] - if self.args.encmode == _ENCRYPTION_MODE_FULLBLOB: - _blkid = int(blockid) - iv = encparam[2][_blkid] - pad = encparam[3] - else: - iv = None - pad = True - data = encrypt_chunk( - symkey, signkey, data, self.args.encmode, iv=iv, pad=pad) - with flock: - if self.args.encmode == _ENCRYPTION_MODE_FULLBLOB: - # compute hmac for chunk - if _blkid == 0: - encparam[2]['hmac'].update(iv + data) - else: - encparam[2]['hmac'].update(data) - # store iv for next chunk - encparam[2][_blkid + 1] = data[ - len(data) - _AES256_BLOCKSIZE_BYTES:] - # compute md5 for encrypted data chunk - encparam[2]['md5'].update(data) - if self.args.fileshare: - bytestoxfer = len(data) - encparam[2]['filesize'] += bytestoxfer - if self.args.fileshare: - fsfile = split_fileshare_path_into_parts(remoteresource) - # subtract 1 from end_range - azure_request( - self.file_service.update_range, share_name=container, - directory_name=fsfile[0], file_name=fsfile[1], - data=data, start_range=offset, - end_range=offset + bytestoxfer - 1, - validate_content=self.args.computeblockmd5, - timeout=self.args.timeout) - else: - azure_request( - self.blob_service[0].put_block, container_name=container, - blob_name=remoteresource, block=data, block_id=blockid, - validate_content=self.args.computeblockmd5, - timeout=self.args.timeout) - del data - - def get_storage_range( - self, localresource, container, remoteresource, blockid, offset, - bytestoxfer, encparam, flock, filedesc): - """Get a segment of a blob/page/file using range offset downloading - Parameters: - localresource - name of local resource - container - blob container - remoteresource - name of remote resource - blockid - block id (integral) - offset - file offset - bytestoxfer - number of bytes to xfer - encparam - decryption metadata: - (symkey, signkey, offset_mod, encmode, ivmap, unpad) - flock - file lock - filedesc - file handle - Returns: - Nothing - Raises: - Nothing - """ - if (encparam[0] is not None and - encparam[3] == _ENCRYPTION_MODE_FULLBLOB): - if offset == 0: - start_range = offset - end_range = offset + bytestoxfer - else: - # retrieve block size data prior for IV - start_range = offset - _AES256_BLOCKSIZE_BYTES - end_range = offset + bytestoxfer - else: - 
start_range = offset - end_range = offset + bytestoxfer - if self.args.fileshare: - fsfile = split_fileshare_path_into_parts(remoteresource) - _blob = azure_request( - self.file_service._get_file, share_name=container, - directory_name=fsfile[0], file_name=fsfile[1], - start_range=start_range, end_range=end_range, - timeout=self.args.timeout) - else: - if as_page_blob(self.args, localresource): - blob_service = self.blob_service[1] - else: - blob_service = self.blob_service[0] - _blob = azure_request( - blob_service._get_blob, timeout=self.args.timeout, - container_name=container, blob_name=remoteresource, - start_range=start_range, end_range=end_range) - blobdata = _blob.content - # decrypt block if required - if encparam[0] is not None: - if encparam[3] == _ENCRYPTION_MODE_FULLBLOB: - if offset == 0: - iv = encparam[4][0] - else: - iv = blobdata[:_AES256_BLOCKSIZE_BYTES] - blobdata = blobdata[_AES256_BLOCKSIZE_BYTES:] - unpad = encparam[5] - # update any buffered data to hmac - hmacdict = encparam[4]['hmac'] - if hmacdict['hmac'] is not None: - # grab file lock to manipulate hmac - with flock: - # include iv in first hmac calculation - if offset == 0: - hmacdict['buffered'][blockid] = iv + blobdata - else: - hmacdict['buffered'][blockid] = blobdata - # try to process hmac data - while True: - curr = hmacdict['curr'] - if curr in hmacdict['buffered']: - hmacdict['hmac'].update( - hmacdict['buffered'][curr]) - hmacdict['buffered'].pop(curr) - hmacdict['curr'] = curr + 1 - else: - break - else: - iv = None - unpad = True - blobdata = decrypt_chunk( - encparam[0], encparam[1], blobdata, encparam[3], iv=iv, - unpad=unpad) - if blobdata is not None: - with flock: - closefd = False - if not filedesc: - filedesc = open(localresource, 'r+b') - closefd = True - filedesc.seek(offset - (encparam[2] or 0), 0) - filedesc.write(blobdata) - if closefd: - filedesc.close() - del blobdata - del _blob - - -def pad_pkcs7(buf): - """Appends PKCS7 padding to an input buffer. - Parameters: - buf - buffer to add padding - Returns: - buffer with PKCS7_PADDING - Raises: - No special exception handling - """ - padder = cryptography.hazmat.primitives.padding.PKCS7( - cryptography.hazmat.primitives.ciphers. - algorithms.AES.block_size).padder() - return padder.update(buf) + padder.finalize() - - -def unpad_pkcs7(buf): - """Removes PKCS7 padding a decrypted object. - Parameters: - buf - buffer to remove padding - Returns: - buffer without PKCS7_PADDING - Raises: - No special exception handling - """ - unpadder = cryptography.hazmat.primitives.padding.PKCS7( - cryptography.hazmat.primitives.ciphers. 
- algorithms.AES.block_size).unpadder() - return unpadder.update(buf) + unpadder.finalize() - - -def generate_aes256_keys(): - """Generate AES256 symmetric key and signing key - Parameters: - None - Returns: - Tuple of symmetric key and signing key - Raises: - Nothing - """ - symkey = os.urandom(_AES256_KEYLENGTH_BYTES) - signkey = os.urandom(_AES256_KEYLENGTH_BYTES) - return symkey, signkey - - -def rsa_encrypt_key(rsaprivatekey, rsapublickey, plainkey, asbase64=True): - """Encrypt a plaintext key using RSA and PKCS1_OAEP padding - Parameters: - rsaprivatekey - rsa private key for encryption - rsapublickey - rsa public key for encryption - plainkey - plaintext key - asbase64 - encode as base64 - Returns: - Tuple of encrypted key and signature (if RSA private key is given) - Raises: - Nothing - """ - if rsapublickey is None: - rsapublickey = rsaprivatekey.public_key() - if rsaprivatekey is None: - signature = None - else: - signer = rsaprivatekey.signer( - cryptography.hazmat.primitives.asymmetric.padding.PSS( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - cryptography.hazmat.primitives.hashes.SHA256()), - salt_length=cryptography.hazmat.primitives.asymmetric. - padding.PSS.MAX_LENGTH), - cryptography.hazmat.primitives.hashes.SHA256()) - signer.update(plainkey) - signature = signer.finalize() - enckey = rsapublickey.encrypt( - plainkey, cryptography.hazmat.primitives.asymmetric.padding.OAEP( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - algorithm=cryptography.hazmat.primitives.hashes.SHA1()), - algorithm=cryptography.hazmat.primitives.hashes.SHA1(), - label=None)) - if asbase64: - return base64encode(enckey), base64encode( - signature) if signature is not None else signature - else: - return enckey, signature - - -def rsa_decrypt_key(rsaprivatekey, enckey, signature, isbase64=True): - """Decrypt an RSA encrypted key and optional signature verification - Parameters: - rsaprivatekey - rsa private key for decryption - enckey - encrypted key - signature - optional signature to verify encrypted data - isbase64 - if keys are base64 encoded - Returns: - Decrypted key - Raises: - RuntimeError if RSA signature validation fails - """ - if isbase64: - enckey = base64.b64decode(enckey) - deckey = rsaprivatekey.decrypt( - enckey, cryptography.hazmat.primitives.asymmetric.padding.OAEP( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - algorithm=cryptography.hazmat.primitives.hashes.SHA1()), - algorithm=cryptography.hazmat.primitives.hashes.SHA1(), - label=None)) - if signature is not None and len(signature) > 0: - rsapublickey = rsaprivatekey.public_key() - if isbase64: - signature = base64.b64decode(signature) - verifier = rsapublickey.verifier( - signature, cryptography.hazmat.primitives.asymmetric.padding.PSS( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - cryptography.hazmat.primitives.hashes.SHA256()), - salt_length=cryptography.hazmat.primitives.asymmetric. 
- padding.PSS.MAX_LENGTH), - cryptography.hazmat.primitives.hashes.SHA256()) - verifier.update(deckey) - verifier.verify() - return deckey - - -def encrypt_chunk(symkey, signkey, data, encmode, iv=None, pad=False): - """Encrypt a chunk of data - Parameters: - symkey - symmetric key - signkey - signing key - data - data to encrypt - encmode - encryption mode - iv - initialization vector - pad - pad data - Returns: - iv and hmac not specified: iv || encrypted data || signature - else: encrypted data - Raises: - No special exception handling - """ - # create iv - if encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - iv = os.urandom(_AES256_BLOCKSIZE_BYTES) - # force padding on since this will be an individual encrypted chunk - pad = True - # encrypt data - cipher = cryptography.hazmat.primitives.ciphers.Cipher( - cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), - cryptography.hazmat.primitives.ciphers.modes.CBC(iv), - backend=cryptography.hazmat.backends.default_backend()).encryptor() - if pad: - encdata = cipher.update(pad_pkcs7(data)) + cipher.finalize() - else: - encdata = cipher.update(data) + cipher.finalize() - # sign encrypted data - if encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - hmacsha256 = hmac.new(signkey, digestmod=hashlib.sha256) - hmacsha256.update(iv + encdata) - return iv + encdata + hmacsha256.digest() - else: - return encdata - - -def decrypt_chunk( - symkey, signkey, encchunk, encmode, iv=None, unpad=False): - """Decrypt a chunk of data - Parameters: - symkey - symmetric key - signkey - signing key - encchunk - data to decrypt - encmode - encryption mode - blockid - block id - iv - initialization vector - unpad - unpad data - Returns: - decrypted data - Raises: - RuntimeError if signature verification fails - """ - # if chunked blob, then preprocess for iv and signature - if encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - # retrieve iv - iv = encchunk[:_AES256_BLOCKSIZE_BYTES] - # retrieve encrypted data - encdata = encchunk[ - _AES256_BLOCKSIZE_BYTES:-_HMACSHA256_DIGESTSIZE_BYTES] - # retrieve signature - sig = encchunk[-_HMACSHA256_DIGESTSIZE_BYTES:] - # validate integrity of data - hmacsha256 = hmac.new(signkey, digestmod=hashlib.sha256) - # compute hmac over iv + encdata - hmacsha256.update(encchunk[:-_HMACSHA256_DIGESTSIZE_BYTES]) - if not cryptography.hazmat.primitives.constant_time.bytes_eq( - hmacsha256.digest(), sig): - raise RuntimeError( - 'Encrypted data integrity check failed for chunk') - else: - encdata = encchunk - # decrypt data - cipher = cryptography.hazmat.primitives.ciphers.Cipher( - cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), - cryptography.hazmat.primitives.ciphers.modes.CBC(iv), - backend=cryptography.hazmat.backends.default_backend()).decryptor() - decrypted = cipher.update(encdata) + cipher.finalize() - if unpad: - return unpad_pkcs7(decrypted) - else: - return decrypted - - -def azure_request(req, timeout=None, *args, **kwargs): - """Wrapper method to issue/retry requests to Azure, works with both - the Azure Python SDK and Requests - Parameters: - req - request to issue - timeout - timeout in seconds - args - positional args to req - kwargs - keyworded args to req - Returns: - result of request - Raises: - Any uncaught exceptions - IOError if timeout - """ - start = time.clock() - lastwait = None - while True: - try: - return req(*args, **kwargs) - except requests.Timeout: - pass - except (requests.ConnectionError, - requests.exceptions.ChunkedEncodingError) as exc: - if (isinstance(exc.args[0], requests.packages.urllib3. 
- exceptions.ProtocolError) and - isinstance(exc.args[0].args[1], socket.error)): - err = exc.args[0].args[1].errno - if (err != errno.ECONNRESET and - err != errno.ECONNREFUSED and - err != errno.ECONNABORTED and - err != errno.ENETRESET and - err != errno.ETIMEDOUT): - raise - except requests.HTTPError as exc: - if (exc.response.status_code < 500 or - exc.response.status_code == 501 or - exc.response.status_code == 505): - raise - except azure.common.AzureHttpError as exc: - if (exc.status_code < 500 or - exc.status_code == 501 or - exc.status_code == 505): - raise - if timeout is not None and time.clock() - start > timeout: - raise IOError( - 'waited {} sec for request {}, exceeded timeout of {}'.format( - time.clock() - start, req.__name__, timeout)) - if lastwait is None or lastwait > 8: - wait = 1 - else: - wait = lastwait << 1 - lastwait = wait - time.sleep(wait) - - -def create_dir_ifnotexists(dirname): - """Create a directory if it doesn't exist - Parameters: - dirname - name of directory to create - Returns: - Nothing - Raises: - Unhandled exceptions - """ - try: - os.makedirs(dirname) - print('created local directory: {}'.format(dirname)) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise # pragma: no cover - - -def get_mime_type(filename): - """Guess the type of a file based on its filename - Parameters: - filename - filename to guess the content-type - Returns: - A string of the form 'type/subtype', - usable for a MIME content-type header - Raises: - Nothing - """ - return (mimetypes.guess_type(filename)[0] or 'application/octet-stream') - - -def encode_blobname(args, blobname): - """Encode blob name: url encode. Due to current Azure Python Storage SDK - limitations, does not apply to non-SAS requests. - Parameters: - args - program arguments - Returns: - urlencoded blob name - Raises: - Nothing - """ - if args.saskey is None or args.fileshare: - return blobname - else: - return urlquote(blobname) - - -def base64encode(obj): - """Encode object to base64 - Parameters: - obj - object to encode - Returns: - base64 encoded string - Raises: - Nothing - """ - if _PY2: - return base64.b64encode(obj) - else: - return str(base64.b64encode(obj), 'ascii') - - -def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): - """Compute MD5 hash for file and encode as Base64 - Parameters: - filename - filename to compute md5 - pagealign - align bytes for page boundary - blocksize - block size in bytes - Returns: - MD5 for file encoded as Base64 - Raises: - Nothing - """ - hasher = hashlib.md5() - with open(filename, 'rb') as filedesc: - while True: - buf = filedesc.read(blocksize) - if not buf: - break - buflen = len(buf) - if pagealign and buflen < blocksize: - aligned = page_align_content_length(buflen) - if aligned != buflen: - buf = buf.ljust(aligned, b'\0') - hasher.update(buf) - return base64encode(hasher.digest()) - - -def compute_md5_for_data_asbase64(data): - """Compute MD5 hash for bits and encode as Base64 - Parameters: - data - data to compute MD5 hash over - Returns: - MD5 for data encoded as Base64 - Raises: - Nothing - """ - hasher = hashlib.md5() - hasher.update(data) - return base64encode(hasher.digest()) - - -def page_align_content_length(length): - """Compute page boundary alignment - Parameters: - length - content length - Returns: - aligned byte boundary - Raises: - Nothing - """ - mod = length % _PAGEBLOB_BOUNDARY - if mod != 0: - return length + (_PAGEBLOB_BOUNDARY - mod) - return length - - -def as_page_blob(args, name): - """Determines 
if the file should be a pageblob depending upon args - Parameters: - args - program args - name - file name - Returns: - True if file should be a pageblob - Raises: - Nothing - """ - if not args.fileshare and ( - args.pageblob or (args.autovhd and name.lower().endswith('.vhd'))): - return True - return False - - -def get_blob_listing(blob_service, args, metadata=True): - """Convenience method for generating a blob listing of a container - Parameters: - blob_service - blob service - args - program arguments - metadata - include metadata - Returns: - dictionary of blob -> list [content length, content md5, enc metadata] - Raises: - Nothing - """ - marker = None - blobdict = {} - if metadata: - incl = azure.storage.blob.Include.METADATA - else: - incl = None - while True: - try: - result = azure_request( - blob_service.list_blobs, timeout=args.timeout, - container_name=args.container, marker=marker, include=incl) - except azure.common.AzureMissingResourceHttpError: - break - for blob in result: - blobdict[blob.name] = [ - blob.properties.content_length, - blob.properties.content_settings.content_md5, None] - if (blob.metadata is not None and - _ENCRYPTION_METADATA_NAME in blob.metadata): - encmeta = EncryptionMetadataJson( - args, None, None, None, None, None) - encmeta.parse_metadata_json( - blob.name, args.rsaprivatekey, args.rsapublickey, - blob.metadata) - blobdict[blob.name][1] = encmeta.md5 - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - blobdict[blob.name][2] = encmeta - marker = result.next_marker - if marker is None or len(marker) < 1: - break - return blobdict - - -def get_fileshare_listing(file_service, args, metadata=True): - """Retrieve all files and directories under a file share - Parameters: - file_service - file service - args - program args - metadata - retrieve metadata - Returns: - dictionary of files -> list [content length, content md5, enc metadata] - Raises: - Nothing - """ - blobdict = {} - dirs = [None] - while len(dirs) > 0: - dir = dirs.pop() - fsfiles = file_service.list_directories_and_files( - share_name=args.container, directory_name=dir, - timeout=args.timeout) - if dir is None: - dir = '' - for fsfile in fsfiles: - fspath = os.path.join(dir, fsfile.name) - if isinstance(fsfile, azure.storage.file.File): - fsprop = get_fileshare_file_properties( - file_service, args, fspath) - blobdict[fspath] = fsprop[1] - else: - dirs.append(fspath) - return blobdict - - -def split_fileshare_path_into_parts(remotefname): - """Split fileshare name into parts - Parameters: - remotefname - remote file name - Returns: - tuple of (directory name, file name) - Raises: - Nothing - """ - parts = remotefname.split(os.path.sep) - dirname = os.path.sep.join(parts[:len(parts) - 1]) - return (dirname, parts[-1]) - - -def get_fileshare_file_properties(file_service, args, remotefname): - """Convenience method for retrieving a file share file's properties and - metadata - Parameters: - file_service - file service - args - program arguments - remotefname - remote file name - Returns: - blobdict entry tuple (file name, blobdict value) - Raises: - Nothing - """ - # split directory and file name - dirname, fname = split_fileshare_path_into_parts(remotefname) - try: - fsfile = file_service.get_file_properties( - args.container, dirname, fname, timeout=args.timeout) - except azure.common.AzureMissingResourceHttpError: - return None - fsmeta = file_service.get_file_metadata( - args.container, dirname, fname, timeout=args.timeout) - entry = [ - 
fsfile.properties.content_length, - fsfile.properties.content_settings.content_md5, None] - if fsmeta is not None and _ENCRYPTION_METADATA_NAME in fsmeta: - encmeta = EncryptionMetadataJson( - args, None, None, None, None, None) - encmeta.parse_metadata_json( - fsfile.name, args.rsaprivatekey, args.rsapublickey, - fsmeta) - entry[1] = encmeta.md5 - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - entry[2] = encmeta - return (fsfile.name, entry) - - -def create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated): - """Create all parent directories of a given file share path - Parameters - file_service - file service - args - program args - fsfile - file share path - dirscreated - directories created set - Returns: - Nothing - Raises: - Nothing - """ - dirs = fsfile[0].split(os.path.sep) - for i in xrange(0, len(dirs)): - dir = os.path.join(*(dirs[0:i + 1])) - if dir not in dirscreated: - file_service.create_directory( - share_name=args.container, - directory_name=dir, fail_on_exist=False, - timeout=args.timeout) - dirscreated.add(dir) - - -def generate_xferspec_download( - blob_service, file_service, args, storage_in_queue, localfile, - remoteresource, addfd, blobprop): - """Generate an xferspec for download - Parameters: - blob_service - blob service - file_service - file service - args - program arguments - storage_in_queue - storage input queue - localfile - name of local resource - remoteresource - name of remote resource - addfd - create and add file handle - blobprop - blob properties list [length, md5, metadatadict] - Returns: - xferspec containing instructions - Raises: - ValueError if get_blob_properties returns an invalid result or - contentlength is invalid - """ - contentlength = blobprop[0] - contentmd5 = blobprop[1] - encmeta = blobprop[2] - remoteresource = encode_blobname(args, remoteresource) - # get the blob metadata if missing - if not args.fileshare and ( - contentlength is None or contentmd5 is None or - (args.rsaprivatekey is not None and encmeta is None)): - result = azure_request( - blob_service.get_blob_properties, timeout=args.timeout, - container_name=args.container, blob_name=remoteresource) - if not result: - raise ValueError( - 'unexpected result for get_blob_properties is None') - contentmd5 = result.properties.content_settings.content_md5 - contentlength = result.properties.content_length - if (args.rsaprivatekey is not None and - _ENCRYPTION_METADATA_NAME in result.metadata): - encmeta = EncryptionMetadataJson( - args, None, None, None, None, None) - encmeta.parse_metadata_json( - remoteresource, args.rsaprivatekey, args.rsapublickey, - result.metadata) - if contentlength < 0: - raise ValueError( - 'contentlength is invalid for {}'.format(remoteresource)) - # overwrite content md5 if encryption metadata exists - if encmeta is not None: - contentmd5 = encmeta.md5 - # check if download is needed - if (args.skiponmatch and contentmd5 is not None and - os.path.exists(localfile)): - print('computing file md5 on: {} length: {}'.format( - localfile, contentlength)) - lmd5 = compute_md5_for_file_asbase64(localfile) - print(' >> {} {} {} '.format( - lmd5, contentmd5, remoteresource), end='') - if lmd5 != contentmd5: - print('MISMATCH: re-download') - else: - print('match: skip') - return None, None, None, None - else: - print('remote blob: {} length: {} bytes, md5: {}'.format( - remoteresource, contentlength, contentmd5)) - tmpfilename = localfile + '.blobtmp' - if encmeta is not None: - chunksize = 
encmeta.chunksizebytes - symkey = encmeta.symkey - signkey = encmeta.signkey - if encmeta.encmode == _ENCRYPTION_MODE_FULLBLOB: - ivmap = { - 0: encmeta.iv, - 'hmac': { - 'hmac': None, - 'buffered': {}, - 'curr': 0, - 'sig': encmeta.hmac, - } - } - if signkey is not None: - ivmap['hmac']['hmac'] = hmac.new( - signkey, digestmod=hashlib.sha256) - offset_mod = 0 - elif encmeta.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - ivmap = None - offset_mod = _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - else: - raise RuntimeError('Unknown encryption mode: {}'.format( - encmeta.encmode)) - else: - chunksize = args.chunksizebytes - offset_mod = 0 - symkey = None - signkey = None - ivmap = None - nchunks = contentlength // chunksize - # compute allocation size, if encrypted this will be an - # underallocation estimate - if contentlength > 0: - if encmeta is not None: - if encmeta.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - allocatesize = contentlength - ((nchunks + 2) * offset_mod) - else: - allocatesize = contentlength - _AES256_BLOCKSIZE_BYTES - else: - allocatesize = contentlength - if allocatesize < 0: - allocatesize = 0 - else: - allocatesize = 0 - currfileoffset = 0 - nstorageops = 0 - flock = threading.Lock() - filedesc = None - # preallocate file - with flock: - filedesc = open(tmpfilename, 'wb') - if allocatesize > 0: - filedesc.seek(allocatesize - 1) - filedesc.write(b'\0') - filedesc.close() - if addfd: - # reopen under r+b mode - filedesc = open(tmpfilename, 'r+b') - else: - filedesc = None - chunktoadd = min(chunksize, contentlength) - for i in xrange(nchunks + 1): - if chunktoadd + currfileoffset > contentlength: - chunktoadd = contentlength - currfileoffset - # on download, chunktoadd must be offset by 1 as the x-ms-range - # header expects it that way. x -> y bytes means first bits of the - # (x+1)th byte to the last bits of the (y+1)th byte. 
for example, - # 0 -> 511 means byte 1 to byte 512 - encparam = [ - symkey, signkey, i * offset_mod, - encmeta.encmode if encmeta is not None else None, ivmap, False] - xferspec = (tmpfilename, args.container, remoteresource, i, - currfileoffset, chunktoadd - 1, encparam, flock, filedesc) - currfileoffset = currfileoffset + chunktoadd - nstorageops = nstorageops + 1 - storage_in_queue.put(PqTupleSort((i, xferspec))) - if currfileoffset >= contentlength: - encparam[5] = True - break - return contentlength, nstorageops, contentmd5, filedesc - - -def generate_xferspec_upload( - args, storage_in_queue, blobskipdict, blockids, localfile, - remoteresource, addfd): - """Generate an xferspec for upload - Parameters: - args - program arguments - storage_in_queue - storage input queue - blobskipdict - blob skip dictionary - blockids - block id dictionary - localfile - name of local resource - remoteresource - name of remote resource - addfd - create and add file handle - Returns: - xferspec containing instructions - Raises: - Nothing - """ - # compute md5 hash - md5digest = None - if args.computefilemd5: - print('computing file md5 on: {}'.format(localfile)) - md5digest = compute_md5_for_file_asbase64( - localfile, as_page_blob(args, localfile)) - # check if upload is needed - if args.skiponmatch and remoteresource in blobskipdict: - print(' >> {} {} {} '.format( - md5digest, blobskipdict[remoteresource][1], - remoteresource), end='') - if md5digest != blobskipdict[remoteresource][1]: - print('MISMATCH: re-upload') - else: - print('match: skip') - return None, 0, None, None - else: - print(' >> md5: {}'.format(md5digest)) - # create blockids entry - if localfile not in blockids: - blockids[localfile] = [] - # partition local file into chunks - filesize = os.path.getsize(localfile) - if as_page_blob(args, localfile) and ( - args.rsaprivatekey is not None or - args.rsapublickey is not None): - chunksizebytes = _MAX_BLOB_CHUNK_SIZE_BYTES - nchunks = filesize // chunksizebytes - if nchunks > 250000: - raise RuntimeError( - '{} chunks for file {} exceeds Azure Storage limits for a ' - 'single page blob'.format(nchunks, localfile)) - else: - chunksizebytes = args.chunksizebytes - nchunks = filesize // chunksizebytes - if nchunks > 50000: - raise RuntimeError( - '{} chunks for file {} exceeds Azure Storage limits for a ' - 'single block blob'.format(nchunks, localfile)) - chunktoadd = min(chunksizebytes, filesize) - currfileoffset = 0 - nstorageops = 0 - flock = threading.Lock() - filedesc = None - if addfd: - with flock: - filedesc = open(localfile, 'rb') - symkey = None - signkey = None - ivmap = None - for i in xrange(nchunks + 1): - if chunktoadd + currfileoffset > filesize: - chunktoadd = filesize - currfileoffset - blockid = '{0:08d}'.format(currfileoffset // chunksizebytes) - # generate the ivmap for the first block - if (not as_page_blob(args, localfile) and - (args.rsaprivatekey is not None or - args.rsapublickey is not None) and currfileoffset == 0): - # generate sym/signing keys - symkey, signkey = generate_aes256_keys() - if args.encmode == _ENCRYPTION_MODE_FULLBLOB: - ivmap = { - i: os.urandom(_AES256_BLOCKSIZE_BYTES), - 'hmac': hmac.new(signkey, digestmod=hashlib.sha256), - } - else: - ivmap = {} - ivmap['md5'] = hashlib.md5() - ivmap['filesize'] = 0 - blockids[localfile].append(blockid) - encparam = [symkey, signkey, ivmap, False] - xferspec = (localfile, args.container, - encode_blobname(args, remoteresource), blockid, - currfileoffset, chunktoadd, encparam, flock, filedesc) - 
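# Illustrative sketch, not part of the original blobxfer.py: the enclosing
# upload loop partitions the local file into fixed-size chunks, derives a
# zero-padded block id from the running offset, and enforces the 50,000
# block limit for a single block blob. A standalone sketch of that
# partitioning; the helper name is hypothetical.
def _partition_blocks(filesize, chunksize, maxblocks=50000):
    # yield (block_id, offset, length) triples covering the whole file
    if filesize // chunksize > maxblocks:
        raise RuntimeError('chunk count exceeds block blob limits')
    offset = 0
    while offset < filesize:
        length = min(chunksize, filesize - offset)
        yield '{0:08d}'.format(offset // chunksize), offset, length
        offset += length

assert list(_partition_blocks(10, 4)) == [
    ('00000000', 0, 4), ('00000001', 4, 4), ('00000002', 8, 2)]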
currfileoffset = currfileoffset + chunktoadd - nstorageops = nstorageops + 1 - storage_in_queue.put(PqTupleSort((i, xferspec))) - if currfileoffset >= filesize: - encparam[3] = True - break - return filesize, nstorageops, md5digest, filedesc - - -def apply_file_collation_and_strip(args, fname): - """Apply collation path or component strip to a remote filename - Parameters: - args - arguments - fname - file name - Returns: - remote filename - Raises: - No special exception handling - """ - remotefname = fname.strip(os.path.sep) - if args.collate is not None: - remotefname = remotefname.split(os.path.sep)[-1] - if args.collate != '.': - remotefname = os.path.sep.join((args.collate, remotefname)) - elif args.stripcomponents > 0: - rtmp = remotefname.split(os.path.sep) - nsc = min((len(rtmp) - 1, args.stripcomponents)) - if nsc > 0: - remotefname = os.path.sep.join(rtmp[nsc:]) - return remotefname - - -def main(): - """Main function - Parameters: - None - Returns: - Nothing - Raises: - ValueError for invalid arguments - """ - # get command-line args - args = parseargs() - - # populate args from env vars - if args.storageaccountkey is None: - args.storageaccountkey = os.getenv(_ENVVAR_STORAGEACCOUNTKEY) - if args.saskey is None: - args.saskey = os.getenv(_ENVVAR_SASKEY) - if args.rsakeypassphrase is None: - args.rsakeypassphrase = os.getenv(_ENVVAR_RSAKEYPASSPHRASE) - - # check some parameters - if (len(args.localresource) < 1 or len(args.storageaccount) < 1 or - len(args.container) < 1): - raise ValueError('invalid positional arguments') - if len(args.endpoint) < 1: - raise ValueError('storage endpoint is invalid') - if args.upload and args.download: - raise ValueError( - 'cannot specify both download and upload transfer direction ' - 'within the same invocation') - if args.subscriptionid is not None and args.managementcert is None: - raise ValueError( - 'cannot specify subscription id without a management cert') - if args.subscriptionid is None and args.managementcert is not None: - raise ValueError( - 'cannot specify a management cert without a subscription id') - if args.storageaccountkey is not None and args.saskey is not None: - raise ValueError('cannot use both a sas key and storage account key') - if args.pageblob and args.fileshare: - raise ValueError( - 'cannot specify both page blob and file share destinations') - if args.autovhd and args.fileshare: - raise ValueError( - 'cannot specify both autovhd and file share destination') - if args.pageblob and args.autovhd: - raise ValueError('cannot specify both pageblob and autovhd parameters') - if args.collate is not None and args.stripcomponents is not None: - raise ValueError( - 'cannot specify collate and non-default component ' - 'strip: {}'.format(args.stripcomponents)) - if args.stripcomponents is None: - args.stripcomponents = 1 - if args.stripcomponents < 0: - raise ValueError('invalid component strip number: {}'.format( - args.stripcomponents)) - if args.rsaprivatekey is not None and args.rsapublickey is not None: - raise ValueError('cannot specify both RSA private and public keys') - if args.rsapublickey is not None and args.rsakeypassphrase is not None: - raise ValueError('cannot specify an RSA public key and passphrase') - if args.timeout is not None and args.timeout <= 0: - args.timeout = None - - # get key if we don't have a handle on one - sms = None - if args.saskey is not None: - if len(args.saskey) < 1: - raise ValueError('invalid sas key specified') - elif args.storageaccountkey is None: - if (args.managementcert is not 
None and - args.subscriptionid is not None): - # check to ensure management cert is valid - if len(args.managementcert) == 0 or \ - args.managementcert.split('.')[-1].lower() != 'pem': - raise ValueError('management cert appears to be invalid') - if args.managementep is None or len(args.managementep) == 0: - raise ValueError('management endpoint is invalid') - # expand management cert path out if contains ~ - args.managementcert = os.path.abspath(args.managementcert) - # get sms reference - sms = azure.servicemanagement.ServiceManagementService( - args.subscriptionid, args.managementcert, args.managementep) - # get keys - service_keys = azure_request( - sms.get_storage_account_keys, timeout=args.timeout, - service_name=args.storageaccount) - args.storageaccountkey = service_keys.storage_service_keys.primary - else: - raise ValueError('could not determine authentication to use') - - # check storage account key validity - if args.storageaccountkey is not None and \ - len(args.storageaccountkey) < 1: - raise ValueError('storage account key is invalid') - - # set valid num workers - if args.numworkers < 1: - args.numworkers = 1 - if (args.fileshare and - args.numworkers == _DEFAULT_MAX_STORAGEACCOUNT_WORKERS): - args.numworkers //= 2 - - # expand any paths - args.localresource = os.path.expanduser(args.localresource) - - # sanitize remote file name - if args.remoteresource: - args.remoteresource = args.remoteresource.strip(os.path.sep) - - # set chunk size - if (args.chunksizebytes is None or args.chunksizebytes < 64 or - args.chunksizebytes > _MAX_BLOB_CHUNK_SIZE_BYTES): - args.chunksizebytes = _MAX_BLOB_CHUNK_SIZE_BYTES - - # set storage ep - endpoint = None - if sms: - storage_acct = azure_request( - sms.get_storage_account_properties, timeout=args.timeout, - service_name=args.storageaccount) - if args.fileshare: - endpoint = storage_acct.storage_service_properties.endpoints[3] - else: - endpoint = storage_acct.storage_service_properties.endpoints[0] - else: - if args.fileshare: - endpoint = 'https://{}.file.{}/'.format( - args.storageaccount, args.endpoint) - else: - endpoint = 'https://{}.blob.{}/'.format( - args.storageaccount, args.endpoint) - - # create master block blob, page blob and file service - blob_service = None - if args.storageaccountkey: - if args.endpoint[0] == '.': - args.endpoint = args.endpoint[1:] - block_blob_service = azure.storage.blob.BlockBlobService( - account_name=args.storageaccount, - account_key=args.storageaccountkey, - endpoint_suffix=args.endpoint) - page_blob_service = azure.storage.blob.PageBlobService( - account_name=args.storageaccount, - account_key=args.storageaccountkey, - endpoint_suffix=args.endpoint) - file_service = azure.storage.file.FileService( - account_name=args.storageaccount, - account_key=args.storageaccountkey, - endpoint_suffix=args.endpoint) - blob_service = (block_blob_service, page_blob_service) - elif args.saskey: - _bs = SasBlobService(endpoint, args.saskey, args.timeout) - blob_service = (_bs, _bs) - # normalize sas key for python sdk - if args.saskey[0] == '?': - args.saskey = args.saskey[1:] - file_service = azure.storage.file.FileService( - account_name=args.storageaccount, - sas_token=args.saskey, - endpoint_suffix=args.endpoint) - # disable container/share creation if SAS is not account-level and - # does not contain a signed resource type with container-level access - if args.createcontainer: - args.createcontainer = False - sasparts = args.saskey.split('&') - for part in sasparts: - tmp = part.split('=') - if tmp[0] == 
'srt': - if 'c' in tmp[1]: - args.createcontainer = True - break - del sasparts - if blob_service is None: - raise ValueError('blob_service is invalid') - if args.fileshare and file_service is None: - raise ValueError('file_service is invalid') - - # check which way we're transfering - xfertoazure = False - if (args.upload or - (not args.download and os.path.exists(args.localresource))): - xfertoazure = True - else: - if args.remoteresource is None: - raise ValueError('cannot download remote file if not specified') - - # import rsa key - if args.rsaprivatekey is not None: - rsakeyfile = args.rsaprivatekey - elif args.rsapublickey is not None: - rsakeyfile = args.rsapublickey - else: - rsakeyfile = None - if rsakeyfile is not None: - # check for conflicting options - if args.pageblob: - raise ValueError( - 'cannot operate in page blob mode with encryption enabled') - # check for supported encryption modes - if (args.encmode != _ENCRYPTION_MODE_FULLBLOB and - args.encmode != _ENCRYPTION_MODE_CHUNKEDBLOB): - raise RuntimeError( - 'Unknown encryption mode: {}'.format(args.encmode)) - # only allow full blob encryption mode for now due to - # possible compatibility issues - if args.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - raise RuntimeError( - '{} encryption mode not allowed'.format(args.encmode)) - with open(rsakeyfile, 'rb') as keyfile: - if args.rsaprivatekey is not None: - args.rsaprivatekey = cryptography.hazmat.primitives.\ - serialization.load_pem_private_key( - keyfile.read(), args.rsakeypassphrase, - backend=cryptography.hazmat.backends.default_backend()) - else: - args.rsapublickey = cryptography.hazmat.primitives.\ - serialization.load_pem_public_key( - keyfile.read(), - backend=cryptography.hazmat.backends.default_backend()) - if args.rsaprivatekey is None and not xfertoazure: - raise ValueError('imported RSA key does not have a private key') - # adjust chunk size for padding for chunked mode - if xfertoazure: - if args.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - args.chunksizebytes -= _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - elif args.encmode == _ENCRYPTION_MODE_FULLBLOB: - nchunks = args.chunksizebytes // \ - _AES256CBC_HMACSHA256_OVERHEAD_BYTES - args.chunksizebytes = (nchunks - 1) * \ - _AES256CBC_HMACSHA256_OVERHEAD_BYTES - del nchunks - # ensure chunk size is greater than overhead - if args.chunksizebytes <= ( - _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1) << 1: - raise ValueError('chunksizebytes {} <= encryption min {}'.format( - args.chunksizebytes, - (_AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1) << 1)) - - # disable urllib3 warnings if specified - if args.disableurllibwarnings: - print('!!! 
WARNING: DISABLING URLLIB3 WARNINGS !!!') - requests.packages.urllib3.disable_warnings( - requests.packages.urllib3.exceptions.InsecurePlatformWarning) - requests.packages.urllib3.disable_warnings( - requests.packages.urllib3.exceptions.SNIMissingWarning) - - # collect package versions - packages = ['az.common=' + azure.common.__version__] - try: - packages.append('az.sml=' + azure.servicemanagement.__version__) - except Exception: - pass - try: - packages.append('az.stor=' + azure.storage.__version__) - except Exception: - pass - try: - packages.append('crypt=' + cryptography.__version__) - except Exception: - pass - packages.append( - 'req=' + requests.__version__) - - # print all parameters - print('=====================================') - print(' azure blobxfer parameters [v{}]'.format(_SCRIPT_VERSION)) - print('=====================================') - print(' platform: {}'.format(platform.platform())) - print(' python interpreter: {} {}'.format( - platform.python_implementation(), platform.python_version())) - print(' package versions: {}'.format(' '.join(packages))) - del packages - print(' subscription id: {}'.format(args.subscriptionid)) - print(' management cert: {}'.format(args.managementcert)) - print(' transfer direction: {}'.format( - 'local->Azure' if xfertoazure else 'Azure->local')) - print(' local resource: {}'.format(args.localresource)) - print(' include pattern: {}'.format(args.include)) - print(' remote resource: {}'.format(args.remoteresource)) - print(' max num of workers: {}'.format(args.numworkers)) - print(' timeout: {}'.format(args.timeout)) - print(' storage account: {}'.format(args.storageaccount)) - print(' use SAS: {}'.format(True if args.saskey else False)) - print(' upload as page blob: {}'.format(args.pageblob)) - print(' auto vhd->page blob: {}'.format(args.autovhd)) - print(' upload to file share: {}'.format(args.fileshare)) - print(' container/share name: {}'.format(args.container)) - print(' container/share URI: {}'.format(endpoint + args.container)) - print(' compute block MD5: {}'.format(args.computeblockmd5)) - print(' compute file MD5: {}'.format(args.computefilemd5)) - print(' skip on MD5 match: {}'.format(args.skiponmatch)) - print(' chunk size (bytes): {}'.format(args.chunksizebytes)) - print(' create container: {}'.format(args.createcontainer)) - print(' keep mismatched MD5: {}'.format(args.keepmismatchedmd5files)) - print(' recursive if dir: {}'.format(args.recursive)) - print('component strip on up: {}'.format(args.stripcomponents)) - print(' remote delete: {}'.format(args.delete)) - print(' collate to: {}'.format(args.collate or 'disabled')) - print(' local overwrite: {}'.format(args.overwrite)) - print(' encryption mode: {}'.format( - (args.encmode or 'disabled' if xfertoazure else 'file dependent') - if args.rsaprivatekey is not None or args.rsapublickey is not None - else 'disabled')) - print(' RSA key file: {}'.format(rsakeyfile or 'disabled')) - print(' RSA key type: {}'.format( - 'private' if args.rsaprivatekey is not None else 'public' - if args.rsapublickey is not None else 'disabled')) - print('=======================================\n') - - # mark start time after init - print('script start time: {}'.format(time.strftime("%Y-%m-%d %H:%M:%S"))) - start = time.time() - - # populate instruction queues - allfilesize = 0 - storage_in_queue = queue.PriorityQueue() - nstorageops = 0 - blockids = {} - completed_blockids = {} - filemap = {} - filesizes = {} - delblobs = None - md5map = {} - filedesc = None - if xfertoazure: - # if 
skiponmatch is enabled, list blobs first and check - if args.skiponmatch and not args.fileshare: - blobskipdict = get_blob_listing(blob_service[0], args) - else: - blobskipdict = {} - if os.path.isdir(args.localresource): - if args.remoteresource is not None: - print('WARNING: ignorning specified remoteresource {} for ' - 'directory upload'.format(args.remoteresource)) - _remotefiles = set() - # mirror directory - if args.recursive: - for root, _, files in os.walk(args.localresource): - for dirfile in files: - fname = os.path.join(root, dirfile) - if args.include is not None and not fnmatch.fnmatch( - fname, args.include): - continue - remotefname = apply_file_collation_and_strip( - args, fname) - _remotefiles.add(remotefname) - # manually pull file properties for file service - if args.fileshare and args.skiponmatch: - fsfile = get_fileshare_file_properties( - file_service, args, remotefname) - if fsfile is not None: - blobskipdict[fsfile[0]] = fsfile[1] - filesize, ops, md5digest, filedesc = \ - generate_xferspec_upload( - args, storage_in_queue, blobskipdict, - blockids, fname, remotefname, False) - if filesize is not None: - completed_blockids[fname] = 0 - md5map[fname] = md5digest - filemap[fname] = encode_blobname(args, remotefname) - filesizes[fname] = filesize - allfilesize = allfilesize + filesize - nstorageops = nstorageops + ops - else: - # copy just directory contents, non-recursively - for lfile in os.listdir(args.localresource): - fname = os.path.join(args.localresource, lfile) - if os.path.isdir(fname) or ( - args.include is not None and not fnmatch.fnmatch( - fname, args.include)): - continue - remotefname = apply_file_collation_and_strip(args, fname) - _remotefiles.add(remotefname) - # manually pull file properties for file service - if args.fileshare and args.skiponmatch: - fsfile = get_fileshare_file_properties( - file_service, args, remotefname) - if fsfile is not None: - blobskipdict[fsfile[0]] = fsfile[1] - filesize, ops, md5digest, filedesc = \ - generate_xferspec_upload( - args, storage_in_queue, blobskipdict, - blockids, fname, remotefname, False) - if filesize is not None: - completed_blockids[fname] = 0 - md5map[fname] = md5digest - filemap[fname] = encode_blobname(args, remotefname) - filesizes[fname] = filesize - allfilesize = allfilesize + filesize - nstorageops = nstorageops + ops - # fill deletion list - if args.delete: - # get blob skip dict if it hasn't been populated - if len(blobskipdict) == 0: - if args.fileshare: - blobskipdict = get_fileshare_listing( - file_service, args) - else: - blobskipdict = get_blob_listing( - blob_service[0], args, metadata=False) - delblobs = [x for x in blobskipdict if x not in _remotefiles] - del _remotefiles - else: - # upload single file - if args.remoteresource is None: - args.remoteresource = args.localresource - else: - if args.stripcomponents > 0: - args.stripcomponents -= 1 - args.remoteresource = apply_file_collation_and_strip( - args, args.remoteresource) - # manually pull file properties for file service - if args.fileshare and args.skiponmatch: - fsfile = get_fileshare_file_properties( - file_service, args, args.remoteresource) - if fsfile is not None: - blobskipdict[fsfile[0]] = fsfile[1] - filesize, nstorageops, md5digest, filedesc = \ - generate_xferspec_upload( - args, storage_in_queue, blobskipdict, blockids, - args.localresource, args.remoteresource, True) - if filesize is not None: - completed_blockids[args.localresource] = 0 - md5map[args.localresource] = md5digest - filemap[args.localresource] = 
encode_blobname( - args, args.remoteresource) - filesizes[args.localresource] = filesize - allfilesize = allfilesize + filesize - del blobskipdict - # create container/file share if needed - if args.createcontainer: - if args.fileshare: - print('creating file share, if needed: {}'.format( - args.container)) - try: - azure_request( - file_service.create_share, share_name=args.container, - fail_on_exist=False, timeout=args.timeout) - except azure.common.AzureConflictHttpError: - pass - else: - print('creating container, if needed: {}'.format( - args.container)) - try: - azure_request( - blob_service[0].create_container, timeout=args.timeout, - container_name=args.container, fail_on_exist=False) - except azure.common.AzureConflictHttpError: - pass - # initialize page blobs or file share files - if len(filemap) > 0: - if args.pageblob or args.autovhd: - print('initializing page blobs') - for key in filemap: - if as_page_blob(args, key): - blob_service[1].create_blob( - container_name=args.container, - blob_name=filemap[key], - content_length=page_align_content_length( - filesizes[key]), content_settings=None) - elif args.fileshare: - print('initializing files on fileshare') - dirscreated = set() - for key in filemap: - fsfile = split_fileshare_path_into_parts(filemap[key]) - if args.rsaprivatekey or args.rsapublickey: - fspad = _AES256_BLOCKSIZE_BYTES - else: - fspad = 0 - # try to create the file first, if preconditon failure - # then try creating the parent directory - try: - file_service.create_file( - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=filesizes[key] + fspad, - content_settings=None, timeout=args.timeout) - except azure.common.AzureMissingResourceHttpError as exc: - create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - file_service.create_file( - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=filesizes[key] + fspad, - content_settings=None, timeout=args.timeout) - del dirscreated - else: - if args.remoteresource == '.': - print('attempting to copy entire {} {} to {}'.format( - 'file share' if args.fileshare else 'container', - args.container, args.localresource)) - if args.fileshare: - blobdict = get_fileshare_listing(file_service, args) - else: - blobdict = get_blob_listing(blob_service[0], args) - else: - if args.fileshare: - fsfile = get_fileshare_file_properties( - file_service, args, args.remoteresource) - if fsfile is None: - raise RuntimeError('file {} not found on share {}'.format( - args.remoteresource, args.container)) - blobdict = {args.remoteresource: fsfile[1]} - else: - blobdict = {args.remoteresource: [None, None, None]} - if len(blobdict) > 0: - print('generating local directory structure and ' - 'pre-allocating space') - # make the localresource directory - created_dirs = set() - create_dir_ifnotexists(args.localresource) - created_dirs.add(args.localresource) - # generate xferspec for all blobs - for blob in blobdict: - # filter results - if args.include is not None and not fnmatch.fnmatch( - blob, args.include): - continue - if args.collate is not None: - localfile = os.path.join( - args.localresource, args.collate, blob) - else: - localfile = os.path.join(args.localresource, blob) - # create any subdirectories if required - localdir = os.path.dirname(localfile) - if localdir not in created_dirs: - create_dir_ifnotexists(localdir) - created_dirs.add(localdir) - # add instructions - filesize, ops, md5digest, filedesc = \ - 
generate_xferspec_download( - blob_service[0], file_service, args, storage_in_queue, - localfile, blob, False, blobdict[blob]) - if filesize is not None: - md5map[localfile] = md5digest - filemap[localfile] = localfile + '.blobtmp' - allfilesize = allfilesize + filesize - nstorageops = nstorageops + ops - if len(blobdict) > 0: - del created_dirs - del blobdict - - # delete any remote blobs if specified - if xfertoazure and delblobs is not None: - if args.fileshare: - print('deleting {} remote files'.format(len(delblobs))) - for blob in delblobs: - fsfile = split_fileshare_path_into_parts(blob) - azure_request( - file_service.delete_file, - share_name=args.container, directory_name=fsfile[0], - file_name=fsfile[1], timeout=args.timeout) - else: - print('deleting {} remote blobs'.format(len(delblobs))) - for blob in delblobs: - azure_request( - blob_service[0].delete_blob, timeout=args.timeout, - container_name=args.container, blob_name=blob) - print('deletion complete.') - - if nstorageops == 0: - print('detected no transfer actions needed to be taken, exiting...') - sys.exit(0) - - if xfertoazure: - # count number of empty files - emptyfiles = 0 - for fsize in filesizes.items(): - if fsize[1] == 0: - emptyfiles += 1 - print('detected {} empty files to upload'.format(emptyfiles)) - if args.fileshare: - print('performing {} put ranges and {} set file properties'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'ranges' - elif args.pageblob: - print('performing {} put pages/blobs and {} set blob ' - 'properties'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'pages' - elif args.autovhd: - print('performing {} mixed page/block operations with {} ' - 'finalizing operations'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'chunks' - else: - print('performing {} put blocks/blobs and {} put block ' - 'lists'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'blocks' - else: - print('performing {} range-gets'.format(nstorageops)) - progress_text = 'range-gets' - - # spawn workers - storage_out_queue = queue.Queue(nstorageops) - maxworkers = min((args.numworkers, nstorageops)) - print('spawning {} worker threads'.format(maxworkers)) - exc_list = [] - threads = [] - for _ in xrange(maxworkers): - thr = StorageChunkWorker( - exc_list, storage_in_queue, storage_out_queue, args, xfertoazure, - blob_service, file_service) - thr.start() - threads.append(thr) - - done_ops = 0 - hmacs = {} - storage_start = time.time() - progress_bar( - args.progressbar, 'xfer', progress_text, nstorageops, - done_ops, storage_start) - while True: - try: - localresource, encparam = storage_out_queue.get() - except KeyboardInterrupt: - print('\n\nKeyboardInterrupt detected, force terminating ' - 'threads (this may take a while)...') - for thr in threads: - thr.terminate = True - for thr in threads: - thr.join() - raise - if len(exc_list) > 0: - for exc in exc_list: - print(exc) - sys.exit(1) - if xfertoazure: - completed_blockids[localresource] = completed_blockids[ - localresource] + 1 - if completed_blockids[localresource] == len( - blockids[localresource]): - if as_page_blob(args, localresource): - if args.computefilemd5: - azure_request( - blob_service[1].set_blob_properties, - timeout=args.timeout, - container_name=args.container, - blob_name=filemap[localresource], - content_settings=azure.storage.blob. 
- ContentSettings(content_md5=md5map[localresource])) - elif args.fileshare: - fsfile = split_fileshare_path_into_parts( - filemap[localresource]) - # set file metadata for encrypted files - if filesizes[localresource] > 0 and ( - args.rsaprivatekey is not None or - args.rsapublickey is not None): - if args.encmode == _ENCRYPTION_MODE_FULLBLOB: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], - encparam[2][0], - encparam[2]['hmac'].digest(), - md5map[localresource] - ).construct_metadata_json() - else: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], None, - None, md5map[localresource] - ).construct_metadata_json() - azure_request( - file_service.set_file_metadata, - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - metadata=encmetadata, - timeout=args.timeout) - # resize file to final encrypted size if required - if (filesizes[localresource] + - _AES256_BLOCKSIZE_BYTES != - encparam[2]['filesize']): - azure_request( - file_service.resize_file, - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=encparam[2]['filesize'], - timeout=args.timeout) - if args.computefilemd5: - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - md5 = base64encode(encparam[2]['md5'].digest()) - else: - md5 = md5map[localresource] - azure_request( - file_service.set_file_properties, - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_settings=azure.storage.file. - ContentSettings(content_md5=md5), - timeout=args.timeout) - else: - # only perform put block list on non-zero byte files - if filesizes[localresource] > 0: - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - md5 = base64encode(encparam[2]['md5'].digest()) - else: - md5 = md5map[localresource] - block_list = [] - for bid in blockids[localresource]: - block_list.append( - azure.storage.blob.BlobBlock(id=bid)) - azure_request( - blob_service[0].put_block_list, - timeout=args.timeout, - container_name=args.container, - blob_name=filemap[localresource], - block_list=block_list, - content_settings=azure.storage.blob. - ContentSettings( - content_type=get_mime_type(localresource), - content_md5=md5)) - # set blob metadata for encrypted blobs - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - if args.encmode == _ENCRYPTION_MODE_FULLBLOB: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], - encparam[2][0], - encparam[2]['hmac'].digest(), - md5map[localresource] - ).construct_metadata_json() - else: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], None, - None, md5map[localresource] - ).construct_metadata_json() - azure_request( - blob_service[0].set_blob_metadata, - timeout=args.timeout, - container_name=args.container, - blob_name=filemap[localresource], - metadata=encmetadata) - else: - if (args.rsaprivatekey is not None and - encparam[3] == _ENCRYPTION_MODE_FULLBLOB and - not as_page_blob(args, localresource) and - encparam[4]['hmac']['hmac'] is not None): - hmacs[localresource] = encparam[4]['hmac'] - done_ops += 1 - progress_bar( - args.progressbar, 'xfer', progress_text, nstorageops, - done_ops, storage_start) - if done_ops == nstorageops: - break - endtime = time.time() - if filedesc is not None: - filedesc.close() - progress_bar( - args.progressbar, 'xfer', progress_text, nstorageops, - done_ops, storage_start) - print('\n\n{} MiB transfered, elapsed {} sec. 
' - 'Throughput = {} Mbit/sec\n'.format( - allfilesize / 1048576.0, endtime - storage_start, - (8.0 * allfilesize / 1048576.0) / (endtime - storage_start))) - - # finalize files/blobs - if not xfertoazure: - print( - 'performing finalization (if applicable): {}: {}, MD5: {}'.format( - _ENCRYPTION_AUTH_ALGORITHM, - args.rsaprivatekey is not None, args.computefilemd5)) - for localfile in filemap: - tmpfilename = filemap[localfile] - finalizefile = True - skipmd5 = False - # check hmac - if (args.rsaprivatekey is not None and - args.encmode == _ENCRYPTION_MODE_FULLBLOB): - if tmpfilename in hmacs: - hmacdict = hmacs[tmpfilename] - # process any remaining hmac data - while len(hmacdict['buffered']) > 0: - curr = hmacdict['curr'] - if curr in hmacdict['buffered']: - hmacdict['hmac'].update(hmacdict['buffered'][curr]) - hmacdict['buffered'].pop(curr) - hmacdict['curr'] = curr + 1 - else: - break - digest = base64encode(hmacdict['hmac'].digest()) - res = 'OK' - if digest != hmacdict['sig']: - res = 'MISMATCH' - finalizefile = False - else: - skipmd5 = True - print('[{}: {}, {}] {} {}'.format( - _ENCRYPTION_AUTH_ALGORITHM, res, localfile, - digest, hmacdict['sig'])) - # compare md5 hash - if args.computefilemd5 and not skipmd5: - lmd5 = compute_md5_for_file_asbase64(tmpfilename) - if md5map[localfile] is None: - print('[MD5: SKIPPED, {}] {} {}'.format( - localfile, lmd5, md5map[localfile])) - else: - if lmd5 != md5map[localfile]: - res = 'MISMATCH' - if not args.keepmismatchedmd5files: - finalizefile = False - else: - res = 'OK' - print('[MD5: {}, {}] {} {}'.format( - res, localfile, lmd5, md5map[localfile])) - if finalizefile: - # check for existing file first - if os.path.exists(localfile): - if args.overwrite: - os.remove(localfile) - else: - raise IOError( - 'cannot overwrite existing file: {}'.format( - localfile)) - # move tmp file to real file - os.rename(tmpfilename, localfile) - else: - os.remove(tmpfilename) - print('finalization complete.') - - # output final log lines - print('\nscript elapsed time: {} sec'.format(time.time() - start)) - print('script end time: {}'.format(time.strftime("%Y-%m-%d %H:%M:%S"))) - - -def progress_bar(display, sprefix, rtext, value, qsize, start): - """Display a progress bar - Parameters: - display - display bar - sprefix - progress prefix - rtext - rate text - value - value input value - qsize - queue size - start - start time - Returns: - Nothing - Raises: - Nothing - """ - if not display: - return - done = float(qsize) / value - diff = time.time() - start - if diff <= 0: - # arbitrarily give a small delta - diff = 1e-6 - rate = float(qsize) / (diff / 60) - sys.stdout.write( - '\r{0} progress: [{1:30s}] {2:.2f}% {3:10.2f} {4}/min '.format( - sprefix, '>' * int(done * 30), done * 100, rate, rtext)) - sys.stdout.flush() - - -def parseargs(): # pragma: no cover - """Sets up command-line arguments and parser - Parameters: - Nothing - Returns: - Parsed command line arguments - Raises: - Nothing - """ - parser = argparse.ArgumentParser( - description='Transfer files/blobs to/from Azure blob or file storage') - parser.set_defaults( - autovhd=False, endpoint=_DEFAULT_STORAGE_ENDPOINT, - chunksizebytes=_MAX_BLOB_CHUNK_SIZE_BYTES, collate=None, - computeblockmd5=False, computefilemd5=True, createcontainer=True, - delete=False, disableurllibwarnings=False, - encmode=_DEFAULT_ENCRYPTION_MODE, fileshare=False, include=None, - managementep=_DEFAULT_MANAGEMENT_ENDPOINT, - numworkers=_DEFAULT_MAX_STORAGEACCOUNT_WORKERS, overwrite=True, - pageblob=False, progressbar=True, 
recursive=True, rsaprivatekey=None, - rsapublickey=None, rsakeypassphrase=None, skiponmatch=True, - stripcomponents=None, timeout=None) - parser.add_argument('storageaccount', help='name of storage account') - parser.add_argument( - 'container', - help='name of blob container or file share') - parser.add_argument( - 'localresource', - help='name of the local file or directory, if mirroring. "."=use ' - 'current directory') - parser.add_argument( - '--autovhd', action='store_true', - help='automatically upload files ending in .vhd as page blobs') - parser.add_argument( - '--collate', nargs='?', - help='collate all files into a specified path') - parser.add_argument( - '--computeblockmd5', dest='computeblockmd5', action='store_true', - help='compute block/page level MD5 during upload') - parser.add_argument( - '--chunksizebytes', type=int, - help='maximum chunk size to transfer in bytes [{}]'.format( - _MAX_BLOB_CHUNK_SIZE_BYTES)) - parser.add_argument( - '--delete', action='store_true', - help='delete extraneous remote blobs that have no corresponding ' - 'local file when uploading directories') - parser.add_argument( - '--disable-urllib-warnings', action='store_true', - dest='disableurllibwarnings', - help='disable urllib warnings (not recommended)') - parser.add_argument( - '--download', action='store_true', - help='force transfer direction to download from Azure') - parser.add_argument( - '--encmode', - help='encryption mode [{}]'.format(_DEFAULT_ENCRYPTION_MODE)) - parser.add_argument( - '--endpoint', - help='storage endpoint [{}]'.format(_DEFAULT_STORAGE_ENDPOINT)) - parser.add_argument( - '--fileshare', action='store_true', - help='transfer to a file share rather than block/page blob') - parser.add_argument( - '--include', type=str, - help='include pattern (Unix shell-style wildcards)') - parser.add_argument( - '--keepmismatchedmd5files', action='store_true', - help='keep files with MD5 mismatches') - parser.add_argument( - '--managementcert', - help='path to management certificate .pem file') - parser.add_argument( - '--managementep', - help='management endpoint [{}]'.format(_DEFAULT_MANAGEMENT_ENDPOINT)) - parser.add_argument( - '--no-computefilemd5', dest='computefilemd5', action='store_false', - help='do not compute file MD5 and either upload as metadata ' - 'or validate on download') - parser.add_argument( - '--no-createcontainer', dest='createcontainer', action='store_false', - help='do not create container if it does not exist') - parser.add_argument( - '--no-overwrite', dest='overwrite', action='store_false', - help='do not overwrite local files on download') - parser.add_argument( - '--no-progressbar', dest='progressbar', action='store_false', - help='disable progress bar') - parser.add_argument( - '--no-recursive', dest='recursive', action='store_false', - help='do not mirror local directory recursively') - parser.add_argument( - '--no-skiponmatch', dest='skiponmatch', action='store_false', - help='do not skip upload/download on MD5 match') - parser.add_argument( - '--numworkers', type=int, - help='max number of workers [{}]'.format( - _DEFAULT_MAX_STORAGEACCOUNT_WORKERS)) - parser.add_argument( - '--pageblob', action='store_true', - help='upload as page blob rather than block blob, blobs will ' - 'be page-aligned in Azure storage') - parser.add_argument( - '--rsaprivatekey', - help='RSA private key file in PEM format. Specifying an RSA private ' - 'key will turn on decryption (or encryption). 
An RSA private key is ' - 'required for downloading and decrypting blobs and may be specified ' - 'for encrypting and uploading blobs.') - parser.add_argument( - '--rsapublickey', - help='RSA public key file in PEM format. Specifying an RSA public ' - 'key will turn on encryption. An RSA public key can only be used ' - 'for encrypting and uploading blobs.') - parser.add_argument( - '--rsakeypassphrase', - help='Optional passphrase for decrypting an RSA private key; can be ' - 'specified as {} environment variable instead'.format( - _ENVVAR_RSAKEYPASSPHRASE)) - parser.add_argument( - '--remoteresource', - help='name of remote resource on Azure storage. "."=container ' - 'copy recursive implied') - parser.add_argument( - '--saskey', - help='SAS key to use, if recursive upload or container download, ' - 'this must be a container SAS; can be specified as ' - '{} environment variable instead'.format(_ENVVAR_SASKEY)) - parser.add_argument( - '--storageaccountkey', - help='storage account shared key; can be specified as ' - '{} environment variable instead'.format(_ENVVAR_STORAGEACCOUNTKEY)) - parser.add_argument( - '--strip-components', dest='stripcomponents', type=int, - help='strip N leading components from path on upload [1]') - parser.add_argument('--subscriptionid', help='subscription id') - parser.add_argument( - '--timeout', type=float, - help='timeout in seconds for any operation to complete') - parser.add_argument( - '--upload', action='store_true', - help='force transfer direction to upload to Azure') - parser.add_argument('--version', action='version', version=_SCRIPT_VERSION) - return parser.parse_args() - - -if __name__ == '__main__': - main() diff --git a/blobxfer/__init__.py b/blobxfer/__init__.py new file mode 100644 index 0000000..157d59f --- /dev/null +++ b/blobxfer/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +from .version import __version__ # noqa diff --git a/blobxfer/util.py b/blobxfer/util.py new file mode 100644 index 0000000..bf3a9a8 --- /dev/null +++ b/blobxfer/util.py @@ -0,0 +1,213 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# compat imports
+from __future__ import absolute_import, division, print_function
+from builtins import (  # noqa
+    bytes, dict, int, list, object, range, str, ascii, chr, hex, input,
+    next, oct, open, pow, round, super, filter, map, zip
+)
+# stdlib imports
+import base64
+import copy
+import hashlib
+import logging
+import logging.handlers
+import mimetypes
+try:
+    from os import scandir as scandir
+except ImportError:  # noqa
+    from scandir import scandir as scandir
+import sys
+# non-stdlib imports
+# local imports
+
+# global defines
+_PY2 = sys.version_info.major == 2
+_PAGEBLOB_BOUNDARY = 512
+
+
+def on_python2():
+    # type: (None) -> bool
+    """Execution on python2
+    :rtype: bool
+    :return: if on Python2
+    """
+    return _PY2
+
+
+def setup_logger(logger):  # noqa
+    # type: (logger) -> None
+    """Set up logger"""
+    logger.setLevel(logging.DEBUG)
+    handler = logging.StreamHandler()
+    formatter = logging.Formatter(
+        '%(asctime)sZ %(levelname)s %(name)s:%(funcName)s:%(lineno)d '
+        '%(message)s')
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+
+def is_none_or_empty(obj):
+    # type: (any) -> bool
+    """Determine if object is None or empty
+    :type any obj: object
+    :rtype: bool
+    :return: if object is None or empty
+    """
+    if obj is None or len(obj) == 0:
+        return True
+    return False
+
+
+def is_not_empty(obj):
+    # type: (any) -> bool
+    """Determine if object is not None and length is > 0
+    :type any obj: object
+    :rtype: bool
+    :return: if object is not None and length is > 0
+    """
+    if obj is not None and len(obj) > 0:
+        return True
+    return False
+
+
+def merge_dict(dict1, dict2):
+    # type: (dict, dict) -> dict
+    """Recursively merge dictionaries: dict2 on to dict1. This differs
+    from dict.update() in that values that are dicts are recursively merged.
+    Note that only dict value types are merged, not lists, etc.
+
+    :param dict dict1: dictionary to merge to
+    :param dict dict2: dictionary to merge with
+    :rtype: dict
+    :return: merged dictionary
+    """
+    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
+        raise ValueError('dict1 or dict2 is not a dictionary')
+    result = copy.deepcopy(dict1)
+    for k, v in dict2.items():
+        if k in result and isinstance(result[k], dict):
+            result[k] = merge_dict(result[k], v)
+        else:
+            result[k] = copy.deepcopy(v)
+    return result
+
+
+def scantree(path):
+    # type: (str) -> os.DirEntry
+    """Recursively scan a directory tree
+    :param str path: path to scan
+    :rtype: DirEntry
+    :return: DirEntry via generator
+    """
+    for entry in scandir(path):
+        if entry.is_dir(follow_symlinks=True):
+            # due to python2 compat, cannot use yield from here
+            for t in scantree(entry.path):
+                yield t
+        else:
+            yield entry
+
+
+def get_mime_type(filename):
+    # type: (str) -> str
+    """Guess the type of a file based on its filename
+    :param str filename: filename to guess the content-type
+    :rtype: str
+    :return: string of form 'class/type' for MIME content-type header
+    """
+    return (mimetypes.guess_type(filename)[0] or 'application/octet-stream')
+
+
+def base64_encode_as_string(obj):  # noqa
+    # type: (any) -> str
+    """Encode object to base64
+    :param any obj: object to encode
+    :rtype: str
+    :return: base64 encoded string
+    """
+    if _PY2:
+        return base64.b64encode(obj)
+    else:
+        return str(base64.b64encode(obj), 'ascii')
+
+
+def base64_decode_string(string):
+    # type: (str) -> str
+    """Base64 decode a string
+    :param str string: string to decode
+    :rtype: str
+    :return: decoded string
+    """
+    return base64.b64decode(string)
+
+
+def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536):
+    # type: (str, bool, int) -> str
+    """Compute MD5 hash for file and encode as Base64
+    :param str filename: file to compute MD5 for
+    :param bool pagealign: page align data
+    :param int blocksize: block size
+    :rtype: str
+    :return: MD5 for file encoded as Base64
+    """
+    hasher = hashlib.md5()
+    with open(filename, 'rb') as filedesc:
+        while True:
+            buf = filedesc.read(blocksize)
+            if not buf:
+                break
+            buflen = len(buf)
+            if pagealign and buflen < blocksize:
+                aligned = page_align_content_length(buflen)
+                if aligned != buflen:
+                    buf = buf.ljust(aligned, b'\0')
+            hasher.update(buf)
+    return base64_encode_as_string(hasher.digest())
+
+
+def compute_md5_for_data_asbase64(data):
+    # type: (obj) -> str
+    """Compute MD5 hash for bits and encode as Base64
+    :param any data: data to compute MD5 for
+    :rtype: str
+    :return: MD5 for data
+    """
+    hasher = hashlib.md5()
+    hasher.update(data)
+    return base64_encode_as_string(hasher.digest())
+
+
+def page_align_content_length(length):
+    # type: (int) -> int
+    """Compute page boundary alignment
+    :param int length: content length
+    :rtype: int
+    :return: aligned byte boundary
+    """
+    mod = length % _PAGEBLOB_BOUNDARY
+    if mod != 0:
+        return length + (_PAGEBLOB_BOUNDARY - mod)
+    return length
diff --git a/blobxfer/version.py b/blobxfer/version.py
new file mode 100644
index 0000000..9e8b65b
--- /dev/null
+++ b/blobxfer/version.py
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation
+#
+# All rights reserved.
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +__version__ = '1.0.0a1' diff --git a/setup.py b/setup.py index 7709e0e..83d5abb 100644 --- a/setup.py +++ b/setup.py @@ -1,41 +1,76 @@ +from codecs import open +import os import re try: from setuptools import setup except ImportError: from distutils.core import setup +import sys -with open('blobxfer.py', 'r') as fd: +if sys.argv[-1] == 'publish': + os.system('rm -rf blobxfer.egg-info/ build dist __pycache__/') + os.system('python setup.py sdist bdist_wheel') + os.unlink('README.rst') + sys.exit() +elif sys.argv[-1] == 'upload': + os.system('twine upload dist/*') + sys.exit() +elif sys.argv[-1] == 'sdist' or sys.argv[-1] == 'bdist_wheel': + import pypandoc + long_description = pypandoc.convert('README.md', 'rst') +else: + long_description = '' + +with open('blobxfer/version.py', 'r', 'utf-8') as fd: version = re.search( - r'^_SCRIPT_VERSION\s*=\s*[\'"]([^\'"]*)[\'"]', + r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) -with open('README.rst') as readme: - long_description = ''.join(readme).strip() +if not version: + raise RuntimeError('Cannot find version') + +packages = [ + 'blobxfer', + 'blobxfer.blob', + 'blobxfer.blob.block', + 'blobxfer_cli', +] + +install_requires = [ + 'azure-common==1.1.4', + 'azure-storage==0.33.0', + 'click==6.6', + 'cryptography>=1.7.1', + 'future==0.16.0', + 'ruamel.yaml==0.13.11', +] + +if sys.version_info < (3, 5): + install_requires.append('pathlib2') + install_requires.append('scandir') setup( name='blobxfer', version=version, author='Microsoft Corporation, Azure Batch and HPC Team', author_email='', - description='Azure storage transfer tool with AzCopy-like features', + description=( + 'Azure storage transfer tool and library with AzCopy-like features'), long_description=long_description, platforms='any', url='https://github.com/Azure/blobxfer', license='MIT', - py_modules=['blobxfer'], + packages=packages, + package_data={'blobxfer': ['LICENSE']}, + package_dir={'blobxfer': 'blobxfer', 'blobxfer_cli': 'cli'}, entry_points={ - 'console_scripts': 'blobxfer=blobxfer:main', + 'console_scripts': 'blobxfer=blobxfer_cli.cli:cli', }, - install_requires=[ - 'azure-common==1.1.4', - 'azure-storage==0.33.0', - 'azure-servicemanagement-legacy==0.20.5', - 'cryptography>=1.6', - 'requests==2.12.3' - ], + zip_safe=False, + install_requires=install_requires, tests_require=['pytest'], classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 3 - Alpha', 
'Environment :: Console', 'Intended Audience :: Developers', 'Intended Audience :: System Administrators', @@ -47,7 +82,8 @@ 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Utilities', ], - keywords='azcopy azure storage blob files transfer copy smb', + keywords='azcopy azure storage blob files transfer copy smb cifs', ) diff --git a/test/test_blobxfer.py b/test/test_blobxfer.py deleted file mode 100644 index 28208af..0000000 --- a/test/test_blobxfer.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -"""Tests for blobxfer""" - -# stdlib imports -import base64 -import copy -import errno -import json -import math -import os -try: - import queue -except ImportError: - import Queue as queue -import socket -import sys -import threading -import uuid -# non-stdlib imports -import azure.common -import azure.storage.blob -import cryptography.exceptions -import cryptography.hazmat.backends -import cryptography.hazmat.primitives.asymmetric.rsa -import cryptography.hazmat.primitives.serialization -from mock import (MagicMock, Mock, patch) -import pytest -import requests -import requests_mock -# module under test -sys.path.append('..') -import blobxfer # noqa - - -# global defines -_RSAKEY = cryptography.hazmat.primitives.asymmetric.rsa.generate_private_key( - public_exponent=65537, key_size=2048, - backend=cryptography.hazmat.backends.default_backend()) - - -def test_encrypt_decrypt_chunk(): - enckey, signkey = blobxfer.generate_aes256_keys() - assert len(enckey) == blobxfer._AES256_KEYLENGTH_BYTES - assert len(signkey) == blobxfer._AES256_KEYLENGTH_BYTES - - # test random binary data, unaligned - iv = os.urandom(16) - plaindata = os.urandom(31) - encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB, - pad=True) - assert encdata != plaindata - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB, - unpad=True) - assert decdata == plaindata - with pytest.raises(RuntimeError): - badsig = base64.b64encode(b'0') - blobxfer.decrypt_chunk( - enckey, badsig, encdata, blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB, - unpad=True) - - encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, pad=True) - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, unpad=True) - assert decdata == plaindata - - # test random binary data aligned on boundary - plaindata = os.urandom(32) - encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, pad=True) - assert encdata != plaindata - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, unpad=True) - assert decdata == plaindata - - # test text data - plaindata = b'attack at dawn!' 
- encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv, pad=True) - assert encdata != plaindata - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv, unpad=True) - assert decdata == plaindata - - -def test_rsa_keys(): - symkey = os.urandom(32) - enckey, sig = blobxfer.rsa_encrypt_key( - _RSAKEY, None, symkey, asbase64=False) - assert enckey is not None - assert sig is not None - plainkey = blobxfer.rsa_decrypt_key(_RSAKEY, enckey, sig, isbase64=False) - assert symkey == plainkey - - with pytest.raises(cryptography.exceptions.InvalidSignature): - badsig = base64.b64encode(b'0') - blobxfer.rsa_decrypt_key(_RSAKEY, enckey, badsig, isbase64=False) - - enckey, sig = blobxfer.rsa_encrypt_key( - _RSAKEY, None, symkey, asbase64=True) - assert enckey is not None - assert sig is not None - plainkey = blobxfer.rsa_decrypt_key(_RSAKEY, enckey, sig, isbase64=True) - assert symkey == plainkey - - with pytest.raises(cryptography.exceptions.InvalidSignature): - badsig = base64.b64encode(b'0') - blobxfer.rsa_decrypt_key(_RSAKEY, enckey, badsig, isbase64=True) - - -def test_compute_md5(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - testdata = str(uuid.uuid4()) - with open(lpath, 'wt') as f: - f.write(testdata) - md5_file = blobxfer.compute_md5_for_file_asbase64(lpath) - md5_data = blobxfer.compute_md5_for_data_asbase64(testdata.encode('utf8')) - assert md5_file == md5_data - - # test non-existent file - with pytest.raises(IOError): - blobxfer.compute_md5_for_file_asbase64(testdata) - - -def test_page_align_content_length(): - assert 0 == blobxfer.page_align_content_length(0) - assert 512 == blobxfer.page_align_content_length(511) - assert 512 == blobxfer.page_align_content_length(512) - assert 1024 == blobxfer.page_align_content_length(513) - - -def _func_successful_requests_call(timeout=None): - response = MagicMock() - response.raise_for_status = lambda: None - return response - - -def _func_raise_requests_exception_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - response.raise_for_status = lambda: None - return response - val.append(0) - ex = requests.Timeout() - raise ex - - -def _func_raise_requests_connection_error_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - response.raise_for_status = lambda: None - return response - val.append(0) - ex = requests.ConnectionError( - requests.packages.urllib3.exceptions.ProtocolError( - 'Connection aborted.', - socket.error(errno.ECONNRESET, 'Connection reset by peer'))) - raise ex - - -def _func_raise_requests_chunked_encoding_error_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - response.raise_for_status = lambda: None - return response - val.append(0) - ex = requests.exceptions.ChunkedEncodingError( - requests.packages.urllib3.exceptions.ProtocolError( - 'Connection aborted.', - socket.error(errno.ECONNRESET, 'Connection reset by peer'))) - raise ex - - -def _func_raise_azurehttperror_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - return response - val.append(0) - ex = azure.common.AzureHttpError('ServerBusy', 503) - raise ex - - -@patch('time.sleep', return_value=None) -def test_azure_request(patched_time_sleep): - azcomerr = azure.common.AzureHttpError('ServerBusy', 503) - - with pytest.raises(IOError): - mock = Mock(side_effect=azcomerr) - mock.__name__ = 'name' - blobxfer.azure_request(mock, timeout=0.001) - - with pytest.raises(Exception): - ex = Exception() - 
ex.message = 'Uncaught' - blobxfer.azure_request(Mock(side_effect=ex)) - - with pytest.raises(Exception): - ex = Exception() - ex.__delattr__('message') - blobxfer.azure_request(Mock(side_effect=ex)) - - blobxfer.azure_request( - _func_raise_requests_connection_error_once, val=[], timeout=1) - - blobxfer.azure_request( - _func_raise_requests_chunked_encoding_error_once, val=[], timeout=1) - - blobxfer.azure_request( - _func_raise_azurehttperror_once, val=[], timeout=1) - - with pytest.raises(requests.HTTPError): - exc = requests.HTTPError() - exc.response = MagicMock() - exc.response.status_code = 404 - mock = Mock(side_effect=exc) - blobxfer.azure_request(mock) - - try: - blobxfer.azure_request( - _func_raise_requests_exception_once, val=[], timeout=1) - except Exception: - pytest.fail('unexpected Exception raised') - - try: - blobxfer.azure_request(_func_successful_requests_call) - except Exception: - pytest.fail('unexpected Exception raised') - - -def test_sasblobservice_listblobs(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - content = b'string-value' + \ - b'string-valueint-value' + \ - b'string-valueblob-name' + \ - b'date-time-value' + \ - b'date-time-valueetag' + \ - b'2147483648' + \ - b'blob-content-type' + \ - b'abc' + \ - b'sequence-number' + \ - b'BlockBlob' + \ - b'locked|unlocked' + \ - b'available | leased | expired | breaking | broken' + \ - b'infinite | fixedid' + \ - b'pending | success | aborted | failed' + \ - b'source url' + \ - b'bytes copied/bytes total' + \ - b'datetime' + \ - b'error string' + \ - b'value' + \ - b'blob-prefixnm' + \ - b'' - - with requests_mock.mock() as m: - m.get('mock://blobepcontainer?saskey', content=content) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - metamock = MagicMock() - metamock.metadata = True - result = sbs.list_blobs('container', 'marker', include=metamock) - assert len(result) == 1 - assert result[0].name == 'blob-name' - assert result[0].properties.content_length == 2147483648 - assert result[0].properties.content_settings.content_md5 == 'abc' - assert result[0].properties.blobtype == 'BlockBlob' - assert result[0].metadata['Name'] == 'value' - assert result.next_marker == 'nm' - - m.get('mock://blobepcontainer?saskey', content=b'', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.list_blobs('container', 'marker') - - -def test_sasblobservice_setblobmetadata(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey') - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - sbs.set_blob_metadata('container', 'blob', None) - sbs.set_blob_metadata('container', 'blob', {'name': 'value'}) - - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - with pytest.raises(IOError): - sbs.set_blob_metadata('container', 'blob', {'name': 'value'}) - - -def test_sasblobservice_getblob(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.get('mock://blobepcontainer/blob?saskey', content=b'data') - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - results = sbs._get_blob('container', 'blob', 0, 1) - assert results.content == b'data' - - m.get('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', 
None) - with pytest.raises(IOError): - sbs._get_blob('container', 'blob', 0, 1) - - -def test_sasblobservice_getblobproperties(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.head('mock://blobepcontainer/blob?saskey', - headers={'x-ms-meta-hello': 'world', 'content-length': '1'}) - sbs = blobxfer.SasBlobService('mock://blobep', '?saskey', None) - results = sbs.get_blob_properties('container', 'blob') - assert results.metadata['hello'] == 'world' - - m.head('mock://blobepcontainer/blob?saskey', text='', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.get_blob_properties('container', 'blob') - - -def test_sasblobservice_putblock(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', '?saskey', None) - try: - sbs.put_block( - 'container', 'blob', 'block', 'blockid', - validate_content=False) - except Exception: - pytest.fail('unexpected Exception raised') - - m.put('mock://blobepcontainer/blob?saskey', text='', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.put_block( - 'container', 'blob', 'block', 'blockid', - validate_content=False) - - -def test_sasblobservice_putblocklist(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - block_list = [ - azure.storage.blob.BlobBlock(id='1'), - azure.storage.blob.BlobBlock(id='2') - ] - cs = azure.storage.blob.ContentSettings(content_md5='md5') - sbs.put_block_list('container', 'blob', block_list, cs) - - m.put('mock://blobepcontainer/blob?saskey', text='', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.put_block_list('container', 'blob', block_list, cs) - - -def test_sasblobservice_setblobproperties(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - cs = azure.storage.blob.ContentSettings(content_md5='md5') - sbs.set_blob_properties('container', 'blob', cs) - - m.put('mock://blobepcontainer/blob?saskey', text='', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.set_blob_properties('container', 'blob', cs) - - -def test_sasblobservice_putblob(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - cs = azure.storage.blob.ContentSettings( - content_type='a', content_md5='md5') - sbs._put_blob('container', 'blob', None, cs) - - m.put('mock://blobepcontainer/blob?saskey', content=b'', - status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs._put_blob('container', 'blob', None, cs) - - -def 
test_sasblobservice_createblob(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', content=b'', - status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - cs = azure.storage.blob.ContentSettings(content_md5='md5') - sbs.create_blob('container', 'blob', 0, cs) - - m.put('mock://blobepcontainer/blob?saskey', content=b'', - status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.create_blob('container', 'blob', 0, cs) - - -def test_sasblobservice_createcontainer(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - sbs.create_container('container', fail_on_exist=False) - - m.put('mock://blobepcontainer?saskey', status_code=409) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(requests.exceptions.HTTPError): - sbs.create_container('container', fail_on_exist=True) - - -def test_storagechunkworker_run(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - args = MagicMock() - args.rsakey = None - args.pageblob = True - args.autovhd = False - args.timeout = None - args.fileshare = False - - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - exc_list = [] - flock = threading.Lock() - sa_in_queue = queue.PriorityQueue() - sa_out_queue = queue.Queue() - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, True, (sbs, sbs), None) - with pytest.raises(IOError): - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4, None, flock, None) - - args.pageblob = False - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, True, (sbs, sbs), None) - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4, None, flock, None) - - m.get('mock://blobepcontainer/blob?saskey', status_code=200) - bcw.get_storage_range( - lpath, 'container', 'blob', 0, 0, 4, - [None, None, None, None, None, False], flock, None) - - # test zero-length putblob - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 0, None, flock, None) - bcw._pageblob = True - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 0, None, flock, None) - - # test empty page - with open(lpath, 'wb') as f: - f.write(b'\0' * 4 * 1024 * 1024) - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4 * 1024 * 1024, - None, flock, None) - with open(lpath, 'wb') as f: - f.write(b'\0' * 4 * 1024) - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4 * 1024, - None, flock, None) - - sa_in_queue.put((0, (lpath, 'container', 'blob', 'blockid', 0, 4, - [None, None, None, None], flock, None))) - with requests_mock.mock() as m: - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, 
sa_out_queue, args, False, (sbs, sbs), None) - m.get('mock://blobepcontainer/blob?saskey', status_code=201) - bcw.run() - assert len(exc_list) > 0 - - -@patch('azure.storage.file.FileService.update_range') -@patch('azure.storage.file.FileService._get_file') -def test_storagechunkworker_files_run( - patched_get_file, patched_update_range, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - args = MagicMock() - args.rsakey = None - args.pageblob = False - args.autovhd = False - args.timeout = None - args.fileshare = True - - exc_list = [] - flock = threading.Lock() - sa_in_queue = queue.PriorityQueue() - sa_out_queue = queue.Queue() - fs = azure.storage.file.FileService(account_name='sa', account_key='key') - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, True, None, fs) - patched_update_range.return_value = MagicMock() - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4, None, flock, None) - - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, False, None, fs) - patched_get_file.return_value = MagicMock() - patched_get_file.return_value.content = b'' - bcw.get_storage_range( - lpath, 'container', 'blob', 0, 0, 4, - [None, None, None, None, None, False], flock, None) - - -@patch('blobxfer.azure_request', return_value=None) -def test_generate_xferspec_download_invalid(patched_azure_request): - args = MagicMock() - args.storageaccount = 'blobep' - args.container = 'container' - args.storageaccountkey = 'saskey' - args.chunksizebytes = 5 - args.timeout = None - args.fileshare = False - sa_in_queue = queue.PriorityQueue() - - with requests_mock.mock() as m: - m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '-1', 'content-md5': 'md5'}) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(ValueError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, 'tmppath', 'blob', True, - [None, None, None]) - - -def test_generate_xferspec_download(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - args.rsakey = None - args.storageaccount = 'blobep' - args.container = 'container' - args.storageaccountkey = 'saskey' - args.chunksizebytes = 5 - args.timeout = None - args.fileshare = False - sa_in_queue = queue.PriorityQueue() - - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '-1', 'content-md5': 'md5'}) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(ValueError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', True, - [None, None, None]) - assert sa_in_queue.qsize() == 0 - m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '6', 'content-md5': 'md5'}) - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', True, - [None, None, None]) - assert sa_in_queue.qsize() == 2 - assert 2 == nsops - assert 6 == cl - assert 2 == nsops - assert 'md5' == md5 - assert fd is not None - fd.close() - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - assert 2 == nsops - assert fd is None - assert sa_in_queue.qsize() == 4 - with open(lpath, 'wt') as f: - f.write('012345') - 
m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '6', 'content-md5': '1qmpM8iq/FHlWsBmK25NSg=='}) - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', True, - [None, None, None]) - assert nsops is None - assert cl is None - assert sa_in_queue.qsize() == 4 - - sa_in_queue = queue.PriorityQueue() - args.rsaprivatekey = _RSAKEY - args.rsapublickey = None - symkey, signkey = blobxfer.generate_aes256_keys() - args.encmode = blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB - metajson = blobxfer.EncryptionMetadataJson( - args, symkey, signkey, iv=b'0', encdata_signature=b'0', - preencrypted_md5=None) - encmeta = metajson.construct_metadata_json() - goodencjson = json.loads(encmeta[blobxfer._ENCRYPTION_METADATA_NAME]) - goodauthjson = json.loads( - encmeta[blobxfer._ENCRYPTION_METADATA_AUTH_NAME]) - metajson2 = blobxfer.EncryptionMetadataJson( - args, None, None, None, None, None) - metajson2.parse_metadata_json( - 'blob', args.rsaprivatekey, args.rsapublickey, encmeta) - assert metajson2.symkey == symkey - assert metajson2.signkey == signkey - assert metajson2.encmode == args.encmode - assert metajson2.chunksizebytes == args.chunksizebytes + \ - blobxfer._AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - encjson = json.loads(encmeta[blobxfer._ENCRYPTION_METADATA_NAME]) - encjson[blobxfer._ENCRYPTION_METADATA_LAYOUT][ - blobxfer._ENCRYPTION_METADATA_CHUNKSTRUCTURE] = 'X' - headers = { - 'content-length': '64', - 'content-md5': 'md5', - 'x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME: - json.dumps(encjson), - 'x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME: - json.dumps(goodauthjson), - } - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - # switch to full blob mode tests - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - metajson = blobxfer.EncryptionMetadataJson( - args, symkey, signkey, iv=b'0', encdata_signature=b'0', - preencrypted_md5=None) - encmeta = metajson.construct_metadata_json() - goodencjson = json.loads(encmeta[blobxfer._ENCRYPTION_METADATA_NAME]) - goodauthjson = json.loads( - encmeta[blobxfer._ENCRYPTION_METADATA_AUTH_NAME]) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(goodauthjson) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_AGENT][ - blobxfer._ENCRYPTION_METADATA_PROTOCOL] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_AGENT][ - blobxfer._ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_INTEGRITY_AUTH][ - blobxfer._ENCRYPTION_METADATA_ALGORITHM] = 'X' - 
headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - blobxfer._ENCRYPTION_METADATA_ALGORITHM] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson.pop(blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH, None) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson[blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH].pop( - blobxfer._ENCRYPTION_METADATA_AUTH_ENCODING, None) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson[blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH][ - blobxfer._ENCRYPTION_METADATA_ALGORITHM] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson[blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH][ - blobxfer._ENCRYPTION_METADATA_MAC] = blobxfer.base64encode(b'X') - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - args.chunksizebytes = 5 - metajson.chunksizebytes = args.chunksizebytes - metajson.md5 = headers['content-md5'] - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - encjson = copy.deepcopy(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(goodauthjson) - hcl = int(headers['content-length']) - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [hcl, headers['content-md5'], metajson]) - assert hcl == cl - calcops = hcl // 
args.chunksizebytes - hclmod = hcl % args.chunksizebytes - if hclmod > 0: - calcops += 1 - assert calcops == nsops - assert headers['content-md5'] == md5 - assert fd is None - assert sa_in_queue.qsize() == nsops - data = sa_in_queue.get() - assert data is not None - - -def test_generate_xferspec_upload(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - args = MagicMock() - args.storageaccount = 'sa' - args.container = 'container' - args.storageaccountkey = 'key' - args.chunksizebytes = 5 - args.skiponmatch = False - args.pageblob = False - args.autovhd = False - sa_in_queue = queue.PriorityQueue() - fs, nsops, md5, fd = blobxfer.generate_xferspec_upload( - args, sa_in_queue, {}, {}, lpath, 'rr', True) - stat = os.stat(lpath) - assert stat.st_size == fs - assert math.ceil(stat.st_size / 5.0) == nsops - assert fd is not None - fd.close() - args.skiponmatch = True - with open(lpath, 'wt') as f: - f.write('012345') - sd = {} - sd['rr'] = [6, '1qmpM8iq/FHlWsBmK25NSg=='] - fs, nsops, md5, fd = blobxfer.generate_xferspec_upload( - args, sa_in_queue, sd, {}, lpath, 'rr', False) - assert fs is None - - -def test_apply_file_collation_and_strip(): - args = MagicMock() - args.collate = 'collatedir' - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'collatedir/file0' - - args.collate = None - args.stripcomponents = 0 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'tmpdir/file0' - args.stripcomponents = 1 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'file0' - args.stripcomponents = 2 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'file0' - args.stripcomponents = 1 - rfname = blobxfer.apply_file_collation_and_strip( - args, '/tmpdir/tmpdir2/file0') - assert rfname == 'tmpdir2/file0' - args.stripcomponents = 2 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/tmpdir2/file0') - assert rfname == 'file0' - - -@patch('azure.storage.file.FileService.create_directory') -def test_create_all_parent_directories_fileshare(patched_cd): - patched_cd.return_value = MagicMock() - fsfile = ['tmp/a/b', None] - file_service = MagicMock() - args = MagicMock() - args.container = 'fshare' - args.timeout = None - dirscreated = set() - blobxfer.create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - assert len(dirscreated) == 3 - assert 'tmp' in dirscreated - assert 'tmp/a' in dirscreated - assert 'tmp/a/b' in dirscreated - fsfile = ['tmp/a/b/c', None] - blobxfer.create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - assert len(dirscreated) == 4 - assert 'tmp/a/b/c' in dirscreated - fsfile = ['x/a/b/c', None] - blobxfer.create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - assert len(dirscreated) == 8 - assert 'x/a/b/c' in dirscreated - - -def _mock_get_storage_account_keys(timeout=None, service_name=None): - ret = MagicMock() - ret.storage_service_keys.primary = 'mmkey' - return ret - - -def _mock_get_storage_account_properties(timeout=None, service_name=None): - ret = MagicMock() - ret.storage_service_properties.endpoints = [None] - return ret - - -def _mock_blobservice_create_container(timeout=None, container_name=None, - fail_on_exist=None): - raise azure.common.AzureConflictHttpError('conflict', 409) - - -@patch('blobxfer.parseargs') 
-@patch('azure.servicemanagement.ServiceManagementService.' - 'get_storage_account_keys') -@patch('azure.servicemanagement.ServiceManagementService.' - 'get_storage_account_properties') -def test_main1( - patched_sms_saprops, patched_sms_sakeys, patched_parseargs, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - args.include = None - args.stripcomponents = 0 - args.delete = False - args.rsaprivatekey = None - args.rsapublickey = None - args.rsakeypassphrase = None - args.numworkers = 0 - args.localresource = '' - args.storageaccount = 'blobep' - args.container = 'container' - args.storageaccountkey = None - os.environ[blobxfer._ENVVAR_STORAGEACCOUNTKEY] = 'saskey' - args.chunksizebytes = 5 - args.pageblob = False - args.autovhd = False - args.fileshare = False - patched_parseargs.return_value = args - with pytest.raises(ValueError): - blobxfer.main() - args.localresource = lpath - args.endpoint = '' - with pytest.raises(ValueError): - blobxfer.main() - args.endpoint = 'blobep' - args.upload = True - args.download = True - with pytest.raises(ValueError): - blobxfer.main() - args.upload = None - args.download = None - with pytest.raises(ValueError): - blobxfer.main() - os.environ.pop(blobxfer._ENVVAR_STORAGEACCOUNTKEY) - args.storageaccountkey = None - args.timeout = -1 - args.saskey = '' - with pytest.raises(ValueError): - blobxfer.main() - args.saskey = None - args.storageaccountkey = None - args.managementcert = 'cert.spam' - args.subscriptionid = '1234' - with pytest.raises(ValueError): - blobxfer.main() - args.managementcert = 'cert.pem' - args.managementep = None - with pytest.raises(ValueError): - blobxfer.main() - args.managementep = 'mep' - args.subscriptionid = None - with pytest.raises(ValueError): - blobxfer.main() - args.subscriptionid = '1234' - args.pageblob = True - args.autovhd = True - with pytest.raises(ValueError): - blobxfer.main() - args.autovhd = False - args.fileshare = True - with pytest.raises(ValueError): - blobxfer.main() - args.pageblob = False - args.autovhd = True - with pytest.raises(ValueError): - blobxfer.main() - args.autovhd = False - args.fileshare = False - with patch('azure.servicemanagement.ServiceManagementService') as mock: - mock.return_value = MagicMock() - mock.return_value.get_storage_account_keys = \ - _mock_get_storage_account_keys - mock.return_value.get_storage_account_properties = \ - _mock_get_storage_account_properties - with pytest.raises(ValueError): - blobxfer.main() - args.managementep = None - args.managementcert = None - args.subscriptionid = None - args.remoteresource = 'blob' - args.chunksizebytes = None - with patch('azure.storage.blob.BlockBlobService') as mock: - mock.return_value = None - with pytest.raises(ValueError): - blobxfer.main() - args.storageaccountkey = None - args.saskey = None - os.environ[blobxfer._ENVVAR_SASKEY] = 'saskey' - args.remoteresource = None - args.download = True - with pytest.raises(ValueError): - blobxfer.main() - - args.download = False - args.upload = True - args.remoteresource = None - args.storageaccountkey = '' - args.saskey = None - with pytest.raises(ValueError): - blobxfer.main() - - args.collate = 'collatetmp' - with pytest.raises(ValueError): - blobxfer.main() - - args.collate = None - args.storageaccountkey = None - args.saskey = '' - with pytest.raises(ValueError): - blobxfer.main() - - args.saskey = None - os.environ.pop(blobxfer._ENVVAR_SASKEY) - with pytest.raises(ValueError): - blobxfer.main() - args.managementcert = '0' - args.managementep = '' - 
args.subscriptionid = '0' - with pytest.raises(ValueError): - blobxfer.main() - args.managementcert = 'test.pem' - with pytest.raises(ValueError): - blobxfer.main() - args.managementep = 'mep.mep' - ssk = MagicMock() - ssk.storage_service_keys = MagicMock() - ssk.storage_service_keys.primary = '' - patched_sms_sakeys.return_value = ssk - ssp = MagicMock() - ssp.storage_service_properties = MagicMock() - ssp.storage_service_properties.endpoints = ['blobep'] - patched_sms_saprops.return_value = ssp - with pytest.raises(ValueError): - blobxfer.main() - ssk.storage_service_keys.primary = 'key1' - args.storageaccountkey = None - args.rsaprivatekey = '' - args.rsapublickey = '' - with pytest.raises(ValueError): - blobxfer.main() - args.rsaprivatekey = '' - args.rsapublickey = None - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - with pytest.raises(IOError): - blobxfer.main() - - args.rsaprivatekey = None - args.storageaccountkey = None - args.managementcert = None - args.managementep = None - args.subscriptionid = None - - args.upload = False - args.download = True - args.remoteresource = None - args.saskey = 'saskey&srt=c' - with pytest.raises(ValueError): - blobxfer.main() - args.upload = True - args.download = False - args.saskey = None - - os.environ[blobxfer._ENVVAR_SASKEY] = 'saskey' - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - with requests_mock.mock() as m: - m.put('https://blobep.blob.blobep/container/blob?saskey' - '&comp=block&blockid=00000000', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&blockid=00000000&comp=block', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=block&blockid=00000000', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=metadata', status_code=200) - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - '' + lpath + '' - '6' - 'md5BlockBlob' - '') - args.progressbar = False - args.skiponmatch = True - blobxfer.main() - - args.progressbar = True - args.download = True - args.upload = False - args.remoteresource = None - with pytest.raises(ValueError): - blobxfer.main() - - args.remoteresource = 'blob' - args.localresource = str(tmpdir) - m.head('https://blobep.blob.blobep/container/blob?saskey', headers={ - 'content-length': '6', 'content-md5': '1qmpM8iq/FHlWsBmK25NSg=='}) - m.get('https://blobep.blob.blobep/container/blob?saskey', - content=b'012345') - blobxfer.main() - - args.pageblob = False - args.autovhd = False - args.skiponmatch = False - pemcontents = _RSAKEY.private_bytes( - encoding=cryptography.hazmat.primitives.serialization. - Encoding.PEM, - format=cryptography.hazmat.primitives.serialization. - PrivateFormat.PKCS8, - encryption_algorithm=cryptography.hazmat.primitives. - serialization.NoEncryption()) - pempath = str(tmpdir.join('rsa.pem')) - with open(pempath, 'wb') as f: - f.write(pemcontents) - args.rsaprivatekey = pempath - blobxfer.main() - os.remove(pempath) - - args.rsaprivatekey = None - args.skiponmatch = True - args.remoteresource = '.' 
- args.keepmismatchedmd5files = False - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - 'blob' - '6' - 'BlockBlob' - '') - m.get('https://blobep.blob.blobep/container/?saskey') - with pytest.raises(SystemExit): - blobxfer.main() - - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - 'blob' - '6md5' - 'BlockBlob' - '') - blobxfer.main() - - tmplpath = str(tmpdir.join('test', 'test2', 'test3')) - args.localresource = tmplpath - blobxfer.main() - - args.localresource = str(tmpdir) - notmp_lpath = '/'.join(lpath.strip('/').split('/')[1:]) - - with requests_mock.mock() as m: - args.delete = True - args.download = False - args.upload = True - args.remoteresource = None - args.skiponmatch = False - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/' + notmp_lpath + - '?saskey&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container/' + notmp_lpath + - '?saskey&comp=blocklist', status_code=201) - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - 'blob' - '6md5' - 'BlockBlob' - '') - m.delete('https://blobep.blob.blobep/container/blob?saskey', - status_code=202) - with pytest.raises(SystemExit): - blobxfer.main() - - args.recursive = False - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/test.tmp.blobtmp?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container/blob?saskey' - '&comp=blocklist', status_code=201) - with pytest.raises(SystemExit): - blobxfer.main() - - args.stripcomponents = None - args.collate = '.' 
- args.pageblob = True - args.upload = True - args.download = False - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey', - status_code=201) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey', - status_code=201) - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=properties', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=properties', status_code=200) - m.put('https://blobep.blob.blobep/container/blob?saskey', - status_code=201) - with pytest.raises(IOError): - blobxfer.main() - - args.stripcomponents = None - m.put('https://blobep.blob.blobep/container/blobsaskey', - status_code=200) - with pytest.raises(IOError): - blobxfer.main() - - args.stripcomponents = None - args.pageblob = False - m.put('https://blobep.blob.blobep/container/' + notmp_lpath + - '?saskey&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/blob?saskey', - status_code=201) - blobxfer.main() - - args.stripcomponents = None - args.autovhd = True - blobxfer.main() - - args.stripcomponents = None - args.include = 'nofiles' - with pytest.raises(SystemExit): - blobxfer.main() - - args.stripcomponents = None - args.include = '*' - blobxfer.main() - - args.include = None - args.stripcomponents = None - args.pageblob = False - args.autovhd = False - pempath = str(tmpdir.join('rsa.pem')) - with open(pempath, 'wb') as f: - f.write(pemcontents) - args.rsaprivatekey = pempath - m.put('https://blobep.blob.blobep/container/rsa.pem?saskey&comp=block' - '&blockid=00000000', status_code=201) - m.put('https://blobep.blob.blobep/container/rsa.pem?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/rsa.pem?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/blob?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp.blobtmp?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=metadata', status_code=200) - blobxfer.main() - - args.stripcomponents = None - args.download = True - args.upload = False - args.rsaprivatekey = pempath - args.remoteresource = 'blob' - args.localresource = str(tmpdir) - m.head('https://blobep.blob.blobep/container/blob?saskey', headers={ - 'content-length': '6', 'content-md5': '1qmpM8iq/FHlWsBmK25NSg=='}) - m.get('https://blobep.blob.blobep/container/blob?saskey', - content=b'012345') - # TODO add encrypted data json - blobxfer.main() - - os.environ.pop(blobxfer._ENVVAR_SASKEY) - - -@patch('blobxfer.parseargs') -def test_main2(patched_parseargs, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - patched_parseargs.return_value = args - args.include = None - args.stripcomponents = 1 - args.delete = False - args.rsaprivatekey = None - args.rsapublickey = None - args.numworkers = 64 - args.storageaccount = 'blobep' - args.container = 'container' - args.chunksizebytes = 5 - args.localresource = lpath - args.endpoint = '.blobep' - args.timeout = 10 - args.managementep = None - args.managementcert = None - args.subscriptionid = None - args.chunksizebytes = None - args.download = False - args.upload = True - args.remoteresource = None - args.collate = None - args.saskey = None - args.storageaccountkey = 'key' - args.fileshare = False - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - - 
session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with patch('azure.storage.blob.BlockBlobService') as mock: - args.createcontainer = True - args.pageblob = False - args.autovhd = False - mock.return_value = MagicMock() - mock.return_value.create_container = _mock_blobservice_create_container - blobxfer.main() - - -@patch('azure.storage.file.FileService.create_share') -@patch('azure.storage.file.FileService.create_file') -@patch('azure.storage.file.FileService.create_directory') -@patch('azure.storage.file.FileService.get_file_properties') -@patch('azure.storage.file.FileService.get_file_metadata') -@patch('azure.storage.file.FileService.list_directories_and_files') -@patch('azure.storage.file.FileService.update_range') -@patch('azure.storage.file.FileService._get_file') -@patch('azure.storage.file.FileService.set_file_properties') -@patch('azure.storage.file.FileService.set_file_metadata') -@patch('azure.storage.file.FileService.resize_file') -@patch('blobxfer.parseargs') -def test_main3( - patched_parseargs, patched_rf, patched_sfm, patched_sfp, - patched_get_file, patched_update_range, patched_ldaf, patched_gfm, - patched_gfp, patched_cd, patched_cf, patched_cs, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - patched_parseargs.return_value = args - args.include = None - args.stripcomponents = 1 - args.delete = False - args.rsaprivatekey = None - args.rsapublickey = None - args.numworkers = 64 - args.storageaccount = 'sa' - args.container = 'myshare' - args.chunksizebytes = 5 - args.localresource = lpath - args.endpoint = 'core.windows.net' - args.timeout = 10 - args.managementep = None - args.managementcert = None - args.subscriptionid = None - args.chunksizebytes = None - args.download = False - args.upload = True - args.remoteresource = None - args.collate = None - args.saskey = None - args.storageaccountkey = 'key' - args.pageblob = False - args.autovhd = False - args.fileshare = True - args.computefilemd5 = True - args.skiponmatch = True - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - - patched_cs.return_value = MagicMock() - patched_cf.return_value = MagicMock() - patched_gfp.return_value = MagicMock() - patched_update_range.return_value = MagicMock() - patched_get_file.return_value = MagicMock() - patched_get_file.return_value.content = b'\0' * 8 - - pemcontents = _RSAKEY.private_bytes( - encoding=cryptography.hazmat.primitives.serialization. - Encoding.PEM, - format=cryptography.hazmat.primitives.serialization. - PrivateFormat.PKCS8, - encryption_algorithm=cryptography.hazmat.primitives. - serialization.NoEncryption()) - pempath = str(tmpdir.join('rsa.pem')) - with open(pempath, 'wb') as f: - f.write(pemcontents) - - args.rsaprivatekey = pempath - args.rsakeypassphrase = None - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - blobxfer.main() - - args.download = True - args.upload = False - args.rsaprivatekey = pempath - args.remoteresource = '.' 
- with pytest.raises(SystemExit): - blobxfer.main() - - patched_ldaf.return_value = [azure.storage.file.File(name='test.tmp')] - patched_gfp.return_value = MagicMock() - patched_gfp.return_value.properties = MagicMock() - patched_gfp.return_value.properties.content_length = 1 - patched_gfp.return_value.properties.content_settings = MagicMock() - patched_gfp.return_value.properties.content_settings.content_md5 = 'md5' - args.rsaprivatekey = pempath - args.localresource = lpath.rstrip(os.path.sep + 'test.tmp') - blobxfer.main() - - os.remove(pempath) diff --git a/test_requirements.txt b/test_requirements.txt new file mode 100644 index 0000000..925320c --- /dev/null +++ b/test_requirements.txt @@ -0,0 +1,5 @@ +flake8>=3.2.1 +mock>=2.0.0 +pypandoc>=1.3.3 +pytest>=3.0.5 +pytest-cov>=2.4.0 diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py new file mode 100644 index 0000000..bc17d06 --- /dev/null +++ b/tests/test_blobxfer_util.py @@ -0,0 +1,133 @@ +# coding=utf-8 +"""Tests for util""" + +# stdlib imports +import sys +import uuid +# non-stdlib imports +import pytest +# module under test +import blobxfer.util + + +def test_on_python2(): + py2 = sys.version_info.major == 2 + assert py2 == blobxfer.util.on_python2() + + +def test_is_none_or_empty(): + a = None + assert blobxfer.util.is_none_or_empty(a) + a = [] + assert blobxfer.util.is_none_or_empty(a) + a = {} + assert blobxfer.util.is_none_or_empty(a) + a = '' + assert blobxfer.util.is_none_or_empty(a) + a = 'asdf' + assert not blobxfer.util.is_none_or_empty(a) + a = ['asdf'] + assert not blobxfer.util.is_none_or_empty(a) + a = {'asdf': 0} + assert not blobxfer.util.is_none_or_empty(a) + a = [None] + assert not blobxfer.util.is_none_or_empty(a) + + +def test_is_not_empty(): + a = None + assert not blobxfer.util.is_not_empty(a) + a = [] + assert not blobxfer.util.is_not_empty(a) + a = {} + assert not blobxfer.util.is_not_empty(a) + a = '' + assert not blobxfer.util.is_not_empty(a) + a = 'asdf' + assert blobxfer.util.is_not_empty(a) + a = ['asdf'] + assert blobxfer.util.is_not_empty(a) + a = {'asdf': 0} + assert blobxfer.util.is_not_empty(a) + a = [None] + assert blobxfer.util.is_not_empty(a) + + +def test_merge_dict(): + with pytest.raises(ValueError): + blobxfer.util.merge_dict(1, 2) + + a = {'a_only': 42, 'a_and_b': 43, + 'a_only_dict': {'a': 44}, 'a_and_b_dict': {'a_o': 45, 'a_a_b': 46}} + b = {'b_only': 45, 'a_and_b': 46, + 'b_only_dict': {'a': 47}, 'a_and_b_dict': {'b_o': 48, 'a_a_b': 49}} + c = blobxfer.util.merge_dict(a, b) + assert c['a_only'] == 42 + assert c['b_only'] == 45 + assert c['a_and_b_dict']['a_o'] == 45 + assert c['a_and_b_dict']['b_o'] == 48 + assert c['a_and_b_dict']['a_a_b'] == 49 + assert c['b_only_dict']['a'] == 47 + assert c['a_and_b'] == 46 + assert a['a_only'] == 42 + assert a['a_and_b'] == 43 + assert b['b_only'] == 45 + assert b['a_and_b'] == 46 + + +def test_scantree(tmpdir): + tmpdir.mkdir('abc') + abcpath = tmpdir.join('abc') + abcpath.join('hello.txt').write('hello') + abcpath.mkdir('def') + defpath = abcpath.join('def') + defpath.join('world.txt').write('world') + found = set() + for de in blobxfer.util.scantree(str(tmpdir.dirpath())): + if de.name != '.lock': + found.add(de.name) + assert 'hello.txt' in found + assert 'world.txt' in found + assert len(found) == 2 + + +def test_get_mime_type(): + a = 'b.txt' + mt = blobxfer.util.get_mime_type(a) + assert mt == 'text/plain' + a = 'c.probably_cant_determine_this' + mt = blobxfer.util.get_mime_type(a) + assert mt == 'application/octet-stream' + + 
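+
+# Hypothetical sketch (assumption, for illustration only): the expectations
+# in test_get_mime_type above are consistent with a thin wrapper over the
+# stdlib mimetypes module that falls back to octet-stream, e.g.:
+#
+#   def get_mime_type(filename):
+#       return (mimetypes.guess_type(filename)[0] or
+#               'application/octet-stream')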
+def test_base64_encode_as_string(): + a = b'abc' + enc = blobxfer.util.base64_encode_as_string(a) + assert type(enc) != bytes + dec = blobxfer.util.base64_decode_string(enc) + assert a == dec + + +def test_compute_md5(tmpdir): + lpath = str(tmpdir.join('test.tmp')) + testdata = str(uuid.uuid4()) + with open(lpath, 'wt') as f: + f.write(testdata) + md5_file = blobxfer.util.compute_md5_for_file_asbase64(lpath) + md5_data = blobxfer.util.compute_md5_for_data_asbase64( + testdata.encode('utf8')) + assert md5_file == md5_data + + md5_file_page = blobxfer.util.compute_md5_for_file_asbase64(lpath, True) + assert md5_file != md5_file_page + + # test non-existent file + with pytest.raises(IOError): + blobxfer.util.compute_md5_for_file_asbase64(testdata) + + +def test_page_align_content_length(): + assert 0 == blobxfer.util.page_align_content_length(0) + assert 512 == blobxfer.util.page_align_content_length(511) + assert 512 == blobxfer.util.page_align_content_length(512) + assert 1024 == blobxfer.util.page_align_content_length(513) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..f2b110d --- /dev/null +++ b/tox.ini @@ -0,0 +1,18 @@ +[tox] +envlist = py35 + +[testenv] +deps = -rtest_requirements.txt +commands = + #flake8 {envsitepackagesdir}/blobxfer_cli/ + #flake8 {envsitepackagesdir}/blobxfer/ + py.test \ + -x -l -s \ + --ignore venv/ \ + --cov-config .coveragerc \ + --cov-report term-missing \ + --cov {envsitepackagesdir}/blobxfer + +[flake8] +max-line-length = 79 +select = F,E,W From 8053cf69ec591df4bd2714e0a4a1f9f4583ec0f5 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 7 Feb 2017 08:18:16 -0800 Subject: [PATCH 02/47] Continue layout restructure - Add base models - Add client create ops - Travis CI to tox --- .travis.yml | 23 +- blobxfer/api.py | 47 ++ blobxfer/blob/__init__.py | 0 blobxfer/blob/append/__init__.py | 0 blobxfer/blob/append/operations.py | 69 ++ blobxfer/blob/block/__init__.py | 0 blobxfer/blob/block/operations.py | 63 ++ blobxfer/blob/operations.py | 64 ++ blobxfer/blob/page/__init__.py | 0 blobxfer/blob/page/operations.py | 58 ++ blobxfer/file/__init__.py | 0 blobxfer/file/operations.py | 85 +++ blobxfer/models.py | 448 ++++++++++++ blobxfer/operations.py | 81 +++ blobxfer/util.py | 2 +- cli/__init__.py | 0 cli/cli.py | 643 ++++++++++++++++++ cli/settings.py | 211 ++++++ setup.py | 6 + tests/test_blobxfer_blob_append_operations.py | 29 + tests/test_blobxfer_blob_block_operations.py | 29 + tests/test_blobxfer_blob_page_operations.py | 29 + tests/test_blobxfer_file_operations.py | 29 + tests/test_blobxfer_models.py | 132 ++++ tests/test_blobxfer_util.py | 7 +- tox.ini | 2 +- 26 files changed, 2032 insertions(+), 25 deletions(-) create mode 100644 blobxfer/api.py create mode 100644 blobxfer/blob/__init__.py create mode 100644 blobxfer/blob/append/__init__.py create mode 100644 blobxfer/blob/append/operations.py create mode 100644 blobxfer/blob/block/__init__.py create mode 100644 blobxfer/blob/block/operations.py create mode 100644 blobxfer/blob/operations.py create mode 100644 blobxfer/blob/page/__init__.py create mode 100644 blobxfer/blob/page/operations.py create mode 100644 blobxfer/file/__init__.py create mode 100644 blobxfer/file/operations.py create mode 100644 blobxfer/models.py create mode 100644 blobxfer/operations.py create mode 100644 cli/__init__.py create mode 100644 cli/cli.py create mode 100644 cli/settings.py create mode 100644 tests/test_blobxfer_blob_append_operations.py create mode 100644 tests/test_blobxfer_blob_block_operations.py 
create mode 100644 tests/test_blobxfer_blob_page_operations.py create mode 100644 tests/test_blobxfer_file_operations.py create mode 100644 tests/test_blobxfer_models.py diff --git a/.travis.yml b/.travis.yml index 5bc451d..b183124 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,28 +6,9 @@ python: - 3.4 - 3.5 - 3.6 - - pypy - # disable pypy3 until 3.3 compliance - #- pypy3 install: - - | - if [ "$TRAVIS_PYTHON_VERSION" = "pypy" ]; then - export PYENV_ROOT="$HOME/.pyenv" - if [ -f "$PYENV_ROOT/bin/pyenv" ]; then - pushd "$PYENV_ROOT" && git pull && popd - else - rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT" - fi - export PYPY_VERSION="5.4.1" - "$PYENV_ROOT/bin/pyenv" install --skip-existing "pypy-$PYPY_VERSION" - virtualenv --python="$PYENV_ROOT/versions/pypy-$PYPY_VERSION/bin/python" "$HOME/virtualenvs/pypy-$PYPY_VERSION" - source "$HOME/virtualenvs/pypy-$PYPY_VERSION/bin/activate" - fi - - travis_retry pip install -e . - - travis_retry pip install coveralls flake8 mock pytest pytest-cov requests_mock + - travis_retry pip install tox-travis coveralls script: - - flake8 blobxfer.py test/test_blobxfer.py - - PYTHONPATH=. py.test -l --full-trace --cov-config .coveragerc --cov-report term-missing --cov blobxfer test/test_blobxfer.py + - tox after_success: - coveralls --rcfile=.coveragerc --verbose - diff --git a/blobxfer/api.py b/blobxfer/api.py new file mode 100644 index 0000000..45f2145 --- /dev/null +++ b/blobxfer/api.py @@ -0,0 +1,47 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
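+
+"""blobxfer API: re-exports blob/file client creation and blob helpers"""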
+ +# compat imports +from __future__ import absolute_import, division, print_function # noqa +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +# non-stdlib imports +# local imports + +from .blob.operations import check_if_single_blob # noqa +from .blob.append.operations import ( # noqa + create_client as create_append_blob_client +) +from .blob.block.operations import ( # noqa + create_client as create_block_blob_client +) +from .blob.page.operations import ( # noqa + create_client as create_page_blob_client +) +from .file.operations import ( # noqa + create_client as create_file_client +) diff --git a/blobxfer/blob/__init__.py b/blobxfer/blob/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/blob/append/__init__.py b/blobxfer/blob/append/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/blob/append/operations.py b/blobxfer/blob/append/operations.py new file mode 100644 index 0000000..88d5b58 --- /dev/null +++ b/blobxfer/blob/append/operations.py @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
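+
+"""Append blob operations: AppendBlobService client creation and listing"""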
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +from azure.storage.blob import AppendBlobService +# local imports + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.models.AzureStorageAccount) -> AppendBlobService + """Create Append blob client + :param blobxfer.models.AzureStorageAccount storage_account: storage account + :rtype: AppendBlobService + :return: append blob service client + """ + if storage_account.is_sas: + client = AppendBlobService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint) + else: + client = AppendBlobService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint) + return client + + +def list_blobs(client, container, prefix): + # type: (azure.storage.blob.AppendBlobService, str, str) -> list + """List append blobs in path + :param AppendBlobService client: append blob client + :param str container: container + :param str prefix: path prefix + """ + + pass diff --git a/blobxfer/blob/block/__init__.py b/blobxfer/blob/block/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/blob/block/operations.py b/blobxfer/blob/block/operations.py new file mode 100644 index 0000000..94fd534 --- /dev/null +++ b/blobxfer/blob/block/operations.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
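+
+"""Block blob operations: BlockBlobService client creation and upload"""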
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +from azure.storage.blob import BlockBlobService +# local imports + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.models.AzureStorageAccount) -> BlockBlobService + """Create block blob client + :param blobxfer.models.AzureStorageAccount storage_account: storage account + :rtype: BlockBlobService + :return: block blob service client + """ + if storage_account.is_sas: + client = BlockBlobService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint) + else: + client = BlockBlobService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint) + return client + + +def upload_block(): + logger.info('upload block') + print('upload') diff --git a/blobxfer/blob/operations.py b/blobxfer/blob/operations.py new file mode 100644 index 0000000..567d9aa --- /dev/null +++ b/blobxfer/blob/operations.py @@ -0,0 +1,64 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
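+
+"""Blob operations shared across blob types: single-blob check and listing"""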
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +# local imports +from ..util import is_none_or_empty + +# create logger +logger = logging.getLogger(__name__) + + +def check_if_single_blob(client, container, prefix): + # type: (azure.storage.blob.BaseBlobService, str, str) -> bool + """List append blobs in path + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str prefix: path prefix + :rtype: bool + :return: if prefix in container is a single blob + """ + blobs = client.list_blobs( + container_name=container, prefix=prefix, num_results=1) + return is_none_or_empty(blobs.next_marker) + + +def list_blobs(client, container, prefix, mode): + # type: (azure.storage.blob.BaseBlobService, str, str, + # blobxfer.models.AzureStorageModes) -> list + """List blobs in path conforming to mode + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str prefix: path prefix + """ + + pass diff --git a/blobxfer/blob/page/__init__.py b/blobxfer/blob/page/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/blob/page/operations.py b/blobxfer/blob/page/operations.py new file mode 100644 index 0000000..f23520b --- /dev/null +++ b/blobxfer/blob/page/operations.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
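+
+"""Page blob operations: PageBlobService client creation"""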
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +from azure.storage.blob import PageBlobService +# local imports + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.models.AzureStorageAccount) -> PageBlobService + """Create block blob client + :param blobxfer.models.AzureStorageAccount storage_account: storage account + :rtype: PageBlobService + :return: block blob service client + """ + if storage_account.is_sas: + client = PageBlobService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint) + else: + client = PageBlobService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint) + return client diff --git a/blobxfer/file/__init__.py b/blobxfer/file/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/file/operations.py b/blobxfer/file/operations.py new file mode 100644 index 0000000..38705d6 --- /dev/null +++ b/blobxfer/file/operations.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
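+
+"""Azure file share operations: FileService client creation and listing"""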
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +from azure.storage.file import FileService +# local imports +from ..util import is_none_or_empty + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.models.AzureStorageAccount) -> FileService + """Create file client + :param blobxfer.models.AzureStorageAccount storage_account: storage account + :rtype: FileService + :return: file service client + """ + if storage_account.is_sas: + client = FileService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint) + else: + client = FileService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint) + return client + + +def check_if_single_file(client, container, prefix): + # type: (azure.storage.blob.BaseBlobService, str, str) -> bool + """List append blobs in path + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str prefix: path prefix + :rtype: bool + :return: if prefix in container is a single blob + """ + blobs = client.list_blobs( + container_name=container, prefix=prefix, num_results=1) + return is_none_or_empty(blobs.next_marker) + + +def list_blobs(client, container, prefix, mode): + # type: (azure.storage.blob.BaseBlobService, str, str, + # blobxfer.models.AzureStorageModes) -> list + """List blobs in path conforming to mode + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str prefix: path prefix + """ + + pass diff --git a/blobxfer/models.py b/blobxfer/models.py new file mode 100644 index 0000000..d511e15 --- /dev/null +++ b/blobxfer/models.py @@ -0,0 +1,448 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
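+
+"""Core models: option tuples, storage credentials/accounts, and
+local/Azure source and destination path abstractions"""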
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import enum +import fnmatch +import logging +import os +try: + import pathlib2 as pathlib +except ImportError: + import pathlib +# non-stdlib imports +# local imports +from .util import scantree + +# create logger +logger = logging.getLogger(__name__) + + +# enums +class AzureStorageModes(enum.Enum): + Auto = 1 + Append = 2 + Block = 3 + File = 4 + Page = 5 + + +# named tuples +GeneralOptions = collections.namedtuple( + 'GeneralOptions', [ + 'progress_bar', + 'timeout_sec', + 'verbose', + ] +) +VectoredIoOptions = collections.namedtuple( + 'VectoredIoOptions', [ + 'stripe_chunk_size_bytes', + 'multi_storage_account_distribution_mode', + ] +) +SkipOnOptions = collections.namedtuple( + 'SkipOnOptions', [ + 'filesize_match', + 'lmt_ge', + 'md5_match', + ] +) +UploadOptions = collections.namedtuple( + 'UploadOptions', [ + 'chunk_size_bytes', + 'delete_extraneous_destination', + 'exclude', + 'include', + 'mode', + 'overwrite', + 'recursive', + 'rsa_private_key', + 'rsa_private_key_passphrase', + 'rsa_public_key', + 'skip_on', + 'store_file_attributes', + 'store_file_md5', + 'strip_components', + 'vectored_io', + 'split_size_bytes', + ] +) +DownloadOptions = collections.namedtuple( + 'DownloadOptions', [ + 'check_file_md5', + 'delete_extraneous_destination', + 'exclude', + 'include', + 'mode', + 'overwrite', + 'recursive', + 'restore_file_attributes', + 'rsa_private_key', + 'rsa_private_key_passphrase', + 'skip_on', + ] +) +SyncCopyOptions = collections.namedtuple( + 'SyncCopyOptions', [ + 'exclude', + 'include', + 'mode', + 'overwrite', + 'skip_on', + ] +) +LocalPath = collections.namedtuple( + 'LocalPath', [ + 'parent_path', 'relative_path' + ] +) +AzureDestinationOptions = collections.namedtuple( + 'AzureDestinationOptions', [ + 'path', + ] +) +AzureSourceOptions = collections.namedtuple( + 'AzureSourceOptions', [ + ] +) + + +class AzureStorageCredentials(object): + """Azure Storage Credentials""" + def __init__(self): + # type: (AzureStorageCredentials) -> None + """Ctor for AzureStorageCredentials""" + self._storage_accounts = {} + + def add_storage_account(self, name, key, endpoint): + # type: (AzureStorageCredentials, str, str, str) -> None + """Add a storage account + :param AzureStorageCredentials self: this + :param str name: name of storage account to store + :param str key: storage key or sas + :param str endpoint: endpoint + """ + if name in self._storage_accounts: + raise ValueError( + '{} already exists in storage accounts'.format(name)) + self._storage_accounts[name] = AzureStorageAccount(name, key, endpoint) + + def get_storage_account(self, name): + # type: (AzureStorageCredentials, str) -> AzureStorageAccount + """Get storage account details + :param AzureStorageCredentials self: this + :param str name: name of storage account to retrieve + :rtype: AzureStorageAccount + :return: storage account details + """ + return self._storage_accounts[name] + + +class AzureStorageAccount(object): + """Azure Storage Account""" + def __init__(self, name, key, endpoint): + # type: (AzureStorageAccount, str, str, str) -> None + """Ctor for AzureStorageAccount + :param str name: name of storage account + :param str key: storage key or sas + :param str endpoint: endpoint + """ + self.name = name + 
self.key = key + self.endpoint = endpoint + self.is_sas = self._key_is_sas(self.key) + + @staticmethod + def _key_is_sas(key): + # type: (str) -> bool + """Determine if key is a sas + :param str key: key to parse + :rtype: bool + :return: if key is a sas + """ + # keys starting with ? are sas keys as ? is not in the base-64 + # character range + if key.startswith('?'): + return True + else: + # & is not in the base-64 character range, so technically + # the presence of this character means the key is a sas. however, + # perform a stronger check for the sig= parameter. + tmp = key.split('&') + if len(tmp) == 1: + return False + elif any(x.startswith('sig=') for x in tmp): + return True + return False + + +class _BaseSourcePaths(object): + """Base Source Paths""" + def __init__(self): + # type: (_BaseSourcePaths) -> None + """Ctor for _BaseSourcePaths + :param _BaseSourcePaths self: this + """ + self._include = None + self._exclude = None + self._paths = [] + + def add_include(self, incl): + # type: (_BaseSourcePaths, str) -> None + """Add an include + :param _BaseSourcePaths self: this + :param str incl: include filter + """ + if self._include is None: + self._include = [incl] + else: + self._include.append(incl) + + def add_includes(self, includes): + # type: (_BaseSourcePaths, list) -> None + """Add a list of includes + :param _BaseSourcePaths self: this + :param list includes: list of includes + """ + if not isinstance(includes, list): + raise ValueError('includes is not of type list') + if self._include is None: + self._include = includes + else: + self._include.extend(includes) + + def add_exclude(self, excl): + # type: (_BaseSourcePaths, str) -> None + """Add an exclude + :param _BaseSourcePaths self: this + :param str excl: exclude filter + """ + if self._exclude is None: + self._exclude = [excl] + else: + self._exclude.append(excl) + + def add_excludes(self, excludes): + # type: (_BaseSourcePaths, list) -> None + """Add a list of excludes + :param _BaseSourcePaths self: this + :param list excludes: list of excludes + """ + if not isinstance(excludes, list): + raise ValueError('excludes is not of type list') + if self._exclude is None: + self._exclude = excludes + else: + self._exclude.extend(excludes) + + def add_path(self, path): + # type: (_BaseSourcePaths, str) -> None + """Add a local path + :param _BaseSourcePaths self: this + :param str path: path to add + """ + self._paths.append(pathlib.Path(path)) + + def add_paths(self, paths): + # type: (_BaseSourcePaths, list) -> None + """Add a list of local paths + :param _BaseSourcePaths self: this + :param list paths: paths to add + """ + for path in paths: + self.add_path(path) + + def _inclusion_check(self, path): + # type: (_BaseSourcePaths, pathlib.Path) -> bool + """Check file for inclusion against filters + :param _BaseSourcePaths self: this + :param pathlib.Path path: path to check + :rtype: bool + :return: if file should be included + """ + _spath = str(path) + inc = True + if self._include is not None: + inc = any([fnmatch.fnmatch(_spath, x) for x in self._include]) + if inc and self._exclude is not None: + inc = not any([fnmatch.fnmatch(_spath, x) for x in self._exclude]) + return inc + + +class LocalSourcePaths(_BaseSourcePaths): + """Local Source Paths""" + def files(self): + # type: (LocalSourcePaths) -> LocalPath + """Generator for files in paths + :param LocalSourcePaths self: this + :rtype: LocalPath + :return: LocalPath + """ + for _path in self._paths: + _ppath = os.path.expandvars(os.path.expanduser(str(_path))) + 
_expath = pathlib.Path(_ppath) + for entry in scantree(_ppath): + _rpath = pathlib.Path(entry.path).relative_to(_ppath) + if not self._inclusion_check(_rpath): + logger.debug( + 'skipping file {} due to filters'.format(_rpath)) + continue + yield LocalPath(parent_path=_expath, relative_path=_rpath) + + +class LocalDestinationPath(object): + """Local Destination Path""" + def __init__(self, path=None): + # type: (LocalDestinationPath, str) -> None + """Ctor for LocalDestinationPath + :param LocalDestinationPath self: this + :param str path: path + """ + self._is_dir = None + if path is not None: + self.path = path + + @property + def path(self): + # type: (LocalDestinationPath) -> pathlib.Path + """Path property + :param LocalDestinationPath self: this + :rtype: pathlib.Path + :return: local destination path + """ + return self._path + + @path.setter + def path(self, value): + # type: (LocalDestinationPath, str) -> None + """Path property setter + :param LocalDestinationPath self: this + :param str value: value to set path to + """ + self._path = pathlib.Path(value) + + @property + def is_dir(self): + # type: (LocalDestinationPath) -> bool + """is_dir property + :param LocalDestinationPath self: this + :rtype: bool + :return: if local destination path is a directory + """ + return self._is_dir + + @is_dir.setter + def is_dir(self, value): + # type: (LocalDestinationPath, bool) -> None + """is_dir property setter + :param LocalDestinationPath self: this + :param bool value: value to set is_dir to + """ + self._is_dir = value + + def ensure_path_exists(self): + # type: (LocalDestinationPath) -> None + """Ensure path exists + :param LocalDestinationPath self: this + """ + if self._is_dir is None: + raise RuntimeError('is_dir not set') + if self._is_dir: + self._path.mkdir(mode=0o750, parents=True, exist_ok=True) + else: + if self._path.exists() and self._path.is_dir(): + raise RuntimeError( + ('destination path {} already exists and is a ' + 'directory').format(self._path)) + else: + # ensure parent path exists and is created + self._path.parent.mkdir( + mode=0o750, parents=True, exist_ok=True) + + +class AzureSourcePaths(_BaseSourcePaths): + def __init__(self, mode): + super.__init__() + self._mode = mode + + def set_clients(self, append, block, file, page): + pass + + def files(self): + if self._mode == AzureStorageModes.Auto: + pass + elif self._mode == AzureStorageModes.Append: + pass + elif self._mode == AzureStorageModes.Block: + pass + elif self._mode == AzureStorageModes.File: + pass + elif self._mode == AzureStorageModes.Page: + pass + else: + raise RuntimeError('unknown Azure Storage Mode: {}'.format( + self._mode)) + + def _append_files(self): + for _path in self._paths: + + pass + + +class AzureDestinationPaths(object): + def __init__(self): + pass + + +class FileDescriptor(object): + def __init__(self, filepath): + if filepath == '-': + self.stdin = True + self.path = None + else: + self.stdin = False + self.path = pathlib.Path(filepath) + self.size = None + self.hmac = None + self.md5 = None + self.bytes_xferred = 0 + + +class ReadFileDescriptor(FileDescriptor): + def __init__(self, filepath): + super().__init__(filepath) + + +class WriteFileDescriptor(FileDescriptor): + def __init__(self, filepath): + super().__init__(filepath) diff --git a/blobxfer/operations.py b/blobxfer/operations.py new file mode 100644 index 0000000..b073367 --- /dev/null +++ b/blobxfer/operations.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +# local imports +from .models import FileDescriptor + + +def file_chunks(fd, chunk_size): + # type: (FileDescriptor, int) -> bytes + """Generator for getting file chunks of a file + :param FileDescriptor fd: file descriptor + :param int chunk_size: the amount of data to read + :rtype: bytes + :return: file data + """ + with fd.path.open('rb') as f: + while True: + data = f.read(chunk_size) + if not data: + break + yield data + + +def read_file_chunk(fd, chunk_num, chunk_size): + # type: (FileDescriptor, int, int) -> bytes + """Read file chunk + :param FileDescriptor fd: file descriptor + :param int chunk_num: chunk number + :param int chunk_size: the amount of data to read + :rtype: bytes + :return: file data + """ + offset = chunk_num * chunk_size + with fd.path.open('rb') as f: + f.seek(offset, 0) + return f.read(chunk_size) + + +def write_file_chunk(fd, chunk_num, chunk_size, data): + # type: (FileDescriptor, int, int, bytes) -> None + """Write file chunk + :param FileDescriptor fd: file descriptor + :param int chunk_num: chunk number + :param int chunk_size: the amount of data to read + :rtype: bytes + :return: file data + """ + offset = chunk_num * chunk_size + with fd.path.open('wb') as f: + f.seek(offset, 0) + f.write(data) diff --git a/blobxfer/util.py b/blobxfer/util.py index bf3a9a8..f498ff6 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -25,7 +25,7 @@ # compat imports from __future__ import absolute_import, division, print_function from builtins import ( # noqa - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, + bytes, dict, int, list, object, range, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip ) # stdlib imports diff --git a/cli/__init__.py b/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cli/cli.py b/cli/cli.py new file mode 100644 index 0000000..cb23a31 --- /dev/null +++ b/cli/cli.py @@ -0,0 +1,643 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import json +import logging +try: + import pathlib2 as pathlib +except ImportError: + import pathlib +# non-stdlib imports +import click +import ruamel.yaml +# blobxfer library imports +import blobxfer.api +import blobxfer.util +# local imports +import settings + +# create logger +logger = logging.getLogger('blobxfer') +blobxfer.util.setup_logger(logger) +# global defines +_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) + + +class CliContext(object): + """CliContext class: holds context for CLI commands""" + def __init__(self): + """Ctor for CliContext""" + self.yaml_config = None + self.config = {} + self.cli_options = {} + self.block_blob_client = None + self.page_blob_client = None + self.append_blob_client = None + self.smb_file_client = None + + def initialize(self): + # type: (CliContext) -> None + """Initialize context + :param CliContext self: this + """ + self._init_config() + + def _read_yaml_file(self, yaml_file): + # type: (CliContext, pathlib.Path) -> None + """Read a yaml file into self.config + :param CliContext self: this + :param pathlib.Path yaml_file: yaml file to load + """ + with yaml_file.open('r') as f: + if self.config is None: + self.config = ruamel.yaml.load( + f, Loader=ruamel.yaml.RoundTripLoader) + else: + self.config = blobxfer.util.merge_dict( + self.config, ruamel.yaml.load( + f, Loader=ruamel.yaml.RoundTripLoader)) + + def _init_config(self): + # type: (CliContext) -> None + """Initializes configuration of the context + :param CliContext self: this + """ + # load yaml config file into memory + if blobxfer.util.is_not_empty(self.yaml_config): + self.yaml_config = pathlib.Path(self.yaml_config) + self._read_yaml_file(self.yaml_config) + # merge cli options with config + settings.merge_settings(self.config, self.cli_options) + if self.config['options']['verbose']: + logger.debug('config: \n' + json.dumps(self.config, indent=4)) + # free mem + del self.yaml_config + del self.cli_options + + +# create a pass decorator for shared context between commands +pass_cli_context = click.make_pass_decorator(CliContext, ensure=True) + + +def _progress_bar_option(f): + def callback(ctx, param, value): + clictx = 
ctx.ensure_object(CliContext) + clictx.cli_options['progress_bar'] = value + return value + return click.option( + '--progress-bar/--no-progress-bar', + expose_value=False, + default=True, + help='Display progress bar', + callback=callback)(f) + + +def _timeout_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['timeout'] = value + return value + return click.option( + '--timeout', + expose_value=False, + type=int, + help='Individual chunk transfer timeout', + callback=callback)(f) + + +def _verbose_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['verbose'] = value + return value + return click.option( + '-v', '--verbose', + expose_value=False, + is_flag=True, + help='Verbose output', + callback=callback)(f) + + +def common_options(f): + f = _verbose_option(f) + f = _timeout_option(f) + f = _progress_bar_option(f) + return f + + +def _local_resource_argument(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.local_resource = value + return value + return click.argument( + 'local-resource', + callback=callback)(f) + + +def _storage_account_argument(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['storage_account'] = value + return value + return click.argument( + 'storage-account', + callback=callback)(f) + + +def _remote_path_argument(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['remote_path'] = value + return value + return click.argument( + 'remote-path', + callback=callback)(f) + + +def upload_download_arguments(f): + f = _remote_path_argument(f) + f = _storage_account_argument(f) + f = _local_resource_argument(f) + return f + + +def _sync_copy_dest_storage_account_argument(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_storage_account'] = value + return value + return click.argument( + 'sync-copy-dest-storage-account', + callback=callback)(f) + + +def _sync_copy_dest_remote_path_argument(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_remote_path'] = value + return value + return click.argument( + 'sync-copy-dest-remote-path', + callback=callback)(f) + + +def sync_copy_arguments(f): + f = _sync_copy_dest_remote_path_argument(f) + f = _sync_copy_dest_storage_account_argument(f) + f = _remote_path_argument(f) + f = _storage_account_argument(f) + return f + + +def _access_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['access_key'] = value + return value + return click.option( + '--access-key', + expose_value=False, + help='Storage account access key', + envvar='BLOBXFER_ACCESS_KEY', + callback=callback)(f) + + +def _chunk_size_bytes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['chunk_size_bytes'] = value + return value + return click.option( + '--chunk-size-bytes', + expose_value=False, + type=int, + default=4194304, + help='Chunk size in bytes [4194304]', + callback=callback)(f) + + +def _delete_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['delete'] = value + return value + return click.option( + '--delete', + expose_value=False, + is_flag=True, + help='Delete extraneous files on target [False]', + 
callback=callback)(f) + + +def _endpoint_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['endpoint'] = value + return value + return click.option( + '--endpoint', + expose_value=False, + default='core.windows.net', + help='Azure Storage endpoint [core.windows.net]', + callback=callback)(f) + + +def _exclude_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['exclude'] = value + return value + return click.option( + '--exclude', + expose_value=False, + default=None, + help='Exclude pattern', + callback=callback)(f) + + +def _file_attributes(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['file_attributes'] = value + return value + return click.option( + '--file-attributes', + expose_value=False, + is_flag=True, + help='Store or restore file attributes [False]', + callback=callback)(f) + + +def _file_md5_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['file_md5'] = value + return value + return click.option( + '--file-md5/--no-file-md5', + expose_value=False, + default=True, + help='Compute file MD5 [True]', + callback=callback)(f) + + +def _include_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['include'] = value + return value + return click.option( + '--include', + expose_value=False, + default=None, + help='Include pattern', + callback=callback)(f) + + +def _mode_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['mode'] = value + return value + return click.option( + '--mode', + expose_value=False, + default='auto', + help='Transfer mode: auto, append, block, file, page [auto]', + callback=callback)(f) + + +def _overwrite_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['overwrite'] = value + return value + return click.option( + '--overwrite/--no-overwrite', + expose_value=False, + default=True, + help='Overwrite destination if exists [True]', + callback=callback)(f) + + +def _recursive_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['recursive'] = value + return value + return click.option( + '--recursive/--no-recursive', + expose_value=False, + default=True, + help='Recursive [True]', + callback=callback)(f) + + +def _rsa_private_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rsa_private_key'] = value + return value + return click.option( + '--rsa-private-key', + expose_value=False, + default=None, + help='RSA private key', + envvar='BLOBXFER_RSA_PRIVATE_KEY', + callback=callback)(f) + + +def _rsa_private_key_passphrase_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rsa_private_key_passphrase'] = value + return value + return click.option( + '--rsa-private-key-passphrase', + expose_value=False, + default=None, + help='RSA private key passphrase', + envvar='BLOBXFER_RSA_PRIVATE_KEY_PASSPHRASE', + callback=callback)(f) + + +def _rsa_public_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rsa_public_key'] = value + return value + return click.option( + '--rsa-public-key', + expose_value=False, + default=None, + help='RSA public key', + 
envvar='BLOBXFER_RSA_PUBLIC_KEY', + callback=callback)(f) + + +def _sas_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sas'] = value + return value + return click.option( + '--sas', + expose_value=False, + help='Shared access signature', + envvar='BLOBXFER_SAS', + callback=callback)(f) + + +def _skip_on_filesize_match_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['skip_on_filesize_match'] = value + return value + return click.option( + '--skip-on-filesize-match', + expose_value=False, + is_flag=True, + help='Skip on equivalent file size [False]', + callback=callback)(f) + + +def _skip_on_lmt_ge_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['skip_on_lmt_ge'] = value + return value + return click.option( + '--skip-on-lmt-ge', + expose_value=False, + is_flag=True, + help='Skip on last modified time greater than or equal to [False]', + callback=callback)(f) + + +def _skip_on_md5_match_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['skip_on_md5_match'] = value + return value + return click.option( + '--skip-on-md5-match', + expose_value=False, + is_flag=True, + help='Skip on MD5 match [False]', + callback=callback)(f) + + +def _strip_components_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['strip_components'] = value + return value + return click.option( + '--strip-components', + expose_value=False, + type=int, + default=1, + help='Strip leading file path components [1]', + callback=callback)(f) + + +def _sync_copy_dest_access_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_access_key'] = value + return value + return click.option( + '--sync-copy-dest-access-key', + expose_value=False, + help='Storage account access key for synccopy destination', + envvar='BLOBXFER_SYNC_COPY_DEST_ACCESS_KEY', + callback=callback)(f) + + +def _sync_copy_dest_sas_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_sas'] = value + return value + return click.option( + '--sync-copy-dest-sas', + expose_value=False, + help='Shared access signature for synccopy destination', + envvar='BLOBXFER_SYNC_COPY_SAS', + callback=callback)(f) + + +def upload_options(f): + f = _strip_components_option(f) + f = _skip_on_md5_match_option(f) + f = _skip_on_lmt_ge_option(f) + f = _skip_on_filesize_match_option(f) + f = _sas_option(f) + f = _rsa_public_key_option(f) + f = _rsa_private_key_passphrase_option(f) + f = _rsa_private_key_option(f) + f = _recursive_option(f) + f = _overwrite_option(f) + f = _mode_option(f) + f = _include_option(f) + f = _file_md5_option(f) + f = _file_attributes(f) + f = _exclude_option(f) + f = _endpoint_option(f) + f = _delete_option(f) + f = _chunk_size_bytes_option(f) + f = _access_key_option(f) + return f + + +def download_options(f): + f = _skip_on_md5_match_option(f) + f = _skip_on_lmt_ge_option(f) + f = _skip_on_filesize_match_option(f) + f = _sas_option(f) + f = _rsa_private_key_passphrase_option(f) + f = _rsa_private_key_option(f) + f = _recursive_option(f) + f = _overwrite_option(f) + f = _mode_option(f) + f = _include_option(f) + f = _file_md5_option(f) + f = _file_attributes(f) + f = _exclude_option(f) + f = _endpoint_option(f) + f = 
_delete_option(f) + f = _access_key_option(f) + return f + + +def sync_copy_options(f): + f = _sync_copy_dest_sas_option(f) + f = _sync_copy_dest_access_key_option(f) + f = _skip_on_md5_match_option(f) + f = _skip_on_lmt_ge_option(f) + f = _skip_on_filesize_match_option(f) + f = _sas_option(f) + f = _overwrite_option(f) + f = _mode_option(f) + f = _include_option(f) + f = _exclude_option(f) + f = _endpoint_option(f) + f = _access_key_option(f) + return f + + +def _config_argument(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.yaml_config = value + return value + return click.argument( + 'config', + callback=callback)(f) + + +def config_arguments(f): + f = _config_argument(f) + return f + + +@click.group(context_settings=_CONTEXT_SETTINGS) +@click.version_option(version=blobxfer.__version__) +@click.pass_context +def cli(ctx): + """Blobxfer-CLI: Azure Storage transfer tool""" + pass + + +@cli.command('download') +@upload_download_arguments +@download_options +@common_options +@pass_cli_context +def download(ctx, local_resource, storage_account, remote_path): + """Download blobs or files from Azure Storage""" + settings.add_cli_options( + ctx.cli_options, settings.TransferAction.Download, local_resource, + storage_account, remote_path) + ctx.initialize() + raise NotImplementedError() + + +@cli.command('synccopy') +@sync_copy_arguments +@sync_copy_options +@common_options +@pass_cli_context +def synccopy( + ctx, local_resource, storage_account, remote_path, + sync_copy_dest_storage_account, sync_copy_dest_remote_path): + """Synchronously copy blobs between Azure Storage accounts""" + settings.add_cli_options( + ctx.cli_options, settings.TransferAction.Synccopy, local_resource, + storage_account, remote_path, sync_copy_dest_storage_account, + sync_copy_dest_remote_path) + ctx.initialize() + raise NotImplementedError() + + +@cli.command('upload') +@upload_download_arguments +@upload_options +@common_options +@pass_cli_context +def upload(ctx, local_resource, storage_account, remote_path): + """Upload files to Azure Storage""" + settings.add_cli_options( + ctx.cli_options, settings.TransferAction.Upload, local_resource, + storage_account, remote_path) + ctx.initialize() + blobxfer.api.upload_block() + + +@cli.group() +@pass_cli_context +def useconfig(ctx): + """Use config file for transfer""" + pass + + +@useconfig.command('upload') +@config_arguments +@common_options +@pass_cli_context +def useconfig_upload(ctx): + """Upload files to Azure File Storage""" + ctx.initialize() + raise NotImplementedError() + + +if __name__ == '__main__': + cli() diff --git a/cli/settings.py b/cli/settings.py new file mode 100644 index 0000000..9d54d42 --- /dev/null +++ b/cli/settings.py @@ -0,0 +1,211 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
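The option and argument decorators defined in cli/cli.py above all follow a single pattern: each click parameter is declared with expose_value=False and a callback that stashes the parsed value on a shared CliContext obtained via ctx.ensure_object, so commands only receive the context object. A minimal standalone sketch of that mechanism follows; the names (DemoContext, --level, demo) are hypothetical and only illustrate the pattern, they are not part of blobxfer:

    # Minimal sketch of the callback-stashing pattern used by cli/cli.py.
    # All names below (DemoContext, _level_option, demo) are illustrative only.
    import click


    class DemoContext(object):
        def __init__(self):
            self.cli_options = {}


    def _level_option(f):
        def callback(ctx, param, value):
            clictx = ctx.ensure_object(DemoContext)
            clictx.cli_options['level'] = value
            return value
        return click.option(
            '--level',
            expose_value=False,   # value is not passed to the command function
            type=int,
            default=1,
            help='Example level [1]',
            callback=callback)(f)  # callback stores the value on the context


    @click.command()
    @_level_option
    @click.make_pass_decorator(DemoContext, ensure=True)
    def demo(ctx):
        # the command only receives the context; options live in cli_options
        click.echo('level={}'.format(ctx.cli_options['level']))


    if __name__ == '__main__':
        demo()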
+# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import enum +# non-stdlib imports +# local imports +from blobxfer.util import is_none_or_empty, is_not_empty, merge_dict + + +# enums +class TransferAction(enum.Enum): + Download = 1, + Upload = 2, + Synccopy = 3, + + +def add_cli_options( + cli_options, action, local_resource=None, storage_account=None, + remote_path=None, sync_copy_dest_storage_account=None, + sync_copy_dest_remote_path=None): + # type: (dict, str, str, str, str, str, str) -> None + """Adds CLI options to the configuration object + :param dict cli_options: CLI options dict + :param TransferAction action: action + :param str local_resource: local resource + :param str storage_account: storage account + :param str remote_path: remote path + :param str sync_copy_dest_storage_account: synccopy dest sa + :param str sync_copy_dest_remote_path: synccopy dest rp + """ + cli_options['_action'] = action.name.lower() + if is_not_empty(storage_account): + # add credentials + try: + key = cli_options['access_key'] + if is_none_or_empty(key): + raise KeyError() + except KeyError: + try: + key = cli_options['sas'] + if is_none_or_empty(key): + raise KeyError() + except KeyError: + raise RuntimeError('access key or sas must be provided') + azstorage = { + 'endpoint': cli_options['endpoint'], + 'accounts': { + storage_account: key + } + } + del key + # construct "argument" from cli options + sa_rp = {storage_account: remote_path} + if action == TransferAction.Upload: + arg = { + 'source': [local_resource], + 'destination': [sa_rp], + 'include': cli_options['include'], + 'exclude': cli_options['exclude'], + 'options': { + 'chunk_size_bytes': cli_options['chunk_size_bytes'], + 'delete_extraneous_destination': cli_options['delete'], + 'mode': cli_options['mode'], + 'overwrite': cli_options['overwrite'], + 'recursive': cli_options['recursive'], + 'rsa_private_key': cli_options['rsa_private_key'], + 'rsa_private_key_passphrase': cli_options[ + 'rsa_private_key_passphrase'], + 'rsa_public_key': cli_options['rsa_public_key'], + 'skip_on': { + 'filesize_match': cli_options[ + 'skip_on_filesize_match'], + 'lmt_ge': cli_options['skip_on_lmt_ge'], + 'md5_match': cli_options['skip_on_md5_match'], + }, + 'store_file_attributes': cli_options['file_attributes'], + 'store_file_md5': cli_options['file_md5'], + 'strip_components': cli_options['strip_components'], + }, + } + elif action == TransferAction.Download: + arg = { + 'source': [sa_rp], + 'destination': local_resource, + 'include': cli_options['include'], + 'exclude': cli_options['exclude'], + 'options': { + 'check_file_md5': cli_options['file_md5'], + 'delete_extraneous_destination': cli_options['delete'], + 'mode': cli_options['mode'], + 'overwrite': cli_options['overwrite'], + 'recursive': cli_options['recursive'], + 'rsa_private_key': 
cli_options['rsa_private_key'], + 'rsa_private_key_passphrase': cli_options[ + 'rsa_private_key_passphrase'], + 'restore_file_attributes': cli_options['file_attributes'], + 'skip_on': { + 'filesize_match': cli_options[ + 'skip_on_filesize_match'], + 'lmt_ge': cli_options['skip_on_lmt_ge'], + 'md5_match': cli_options['skip_on_md5_match'], + }, + }, + } + elif action == TransferAction.Synccopy: + if is_none_or_empty(sync_copy_dest_storage_account): + raise RuntimeError( + 'must specify a destination storage account') + arg = { + 'source': sa_rp, + 'destination': [ + { + sync_copy_dest_storage_account: + sync_copy_dest_remote_path + } + ], + 'include': cli_options['include'], + 'exclude': cli_options['exclude'], + 'options': { + 'mode': cli_options['mode'], + 'overwrite': cli_options['overwrite'], + 'skip_on': { + 'filesize_match': cli_options[ + 'skip_on_filesize_match'], + 'lmt_ge': cli_options['skip_on_lmt_ge'], + 'md5_match': cli_options['skip_on_md5_match'], + }, + }, + } + try: + destkey = cli_options['sync_copy_dest_access_key'] + if is_none_or_empty(destkey): + raise KeyError() + except KeyError: + try: + destkey = cli_options['sync_copy_dest_sas'] + if is_none_or_empty(destkey): + raise KeyError() + except KeyError: + raise RuntimeError( + 'destination access key or sas must be provided') + azstorage['accounts'][ + cli_options['sync_copy_dest_storage_account']] = destkey + del destkey + cli_options[action.name.lower()] = arg + cli_options['azure_storage'] = azstorage + + +def merge_settings(config, cli_options): + # type: (dict, dict) -> None + """Merge CLI options into main config + :param dict config: config dict + :param dict cli_options: cli options + """ + action = cli_options['_action'] + if (action != TransferAction.Upload.name.lower() and + action != TransferAction.Download.name.lower() and + action == TransferAction.Synccopy.name.lower()): + raise ValueError('invalid action: {}'.format(action)) + # create action options + if action not in config: + config[action] = [] + # merge any argument options + if action in cli_options: + config[action].append(cli_options[action]) + # merge credentials + if 'azure_storage' in cli_options: + if 'azure_storage' not in config: + config['azure_storage'] = {} + config['azure_storage'] = merge_dict( + config['azure_storage'], cli_options['azure_storage']) + # merge general options + if 'options' not in config: + config['options'] = {} + try: + config['options']['verbose'] = cli_options['verbose'] + except KeyError: + pass + try: + config['options']['timeout_sec'] = cli_options['timeout'] + except KeyError: + pass diff --git a/setup.py b/setup.py index 83d5abb..6ea9896 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,10 @@ packages = [ 'blobxfer', 'blobxfer.blob', + 'blobxfer.blob.append', 'blobxfer.blob.block', + 'blobxfer.blob.page', + 'blobxfer.file', 'blobxfer_cli', ] @@ -45,6 +48,9 @@ 'ruamel.yaml==0.13.11', ] +if sys.version_info < (3, 4): + install_requires.append('enum34') + if sys.version_info < (3, 5): install_requires.append('pathlib2') install_requires.append('scandir') diff --git a/tests/test_blobxfer_blob_append_operations.py b/tests/test_blobxfer_blob_append_operations.py new file mode 100644 index 0000000..b4ad982 --- /dev/null +++ b/tests/test_blobxfer_blob_append_operations.py @@ -0,0 +1,29 @@ +# coding=utf-8 +"""Tests for models""" + +# stdlib imports +# non-stdlib imports +import azure.storage +import pytest +# local imports +import blobxfer.models as models +# module under test +import blobxfer.blob.append.operations as 
ops + + +def test_create_client(): + sa = models.AzureStorageAccount('name', 'key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.AppendBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSharedKeyAuthentication) + + sa = models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.AppendBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) diff --git a/tests/test_blobxfer_blob_block_operations.py b/tests/test_blobxfer_blob_block_operations.py new file mode 100644 index 0000000..dc83b8b --- /dev/null +++ b/tests/test_blobxfer_blob_block_operations.py @@ -0,0 +1,29 @@ +# coding=utf-8 +"""Tests for models""" + +# stdlib imports +# non-stdlib imports +import azure.storage +import pytest +# local imports +import blobxfer.models as models +# module under test +import blobxfer.blob.block.operations as ops + + +def test_create_client(): + sa = models.AzureStorageAccount('name', 'key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.BlockBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSharedKeyAuthentication) + + sa = models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.BlockBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) diff --git a/tests/test_blobxfer_blob_page_operations.py b/tests/test_blobxfer_blob_page_operations.py new file mode 100644 index 0000000..8ae5989 --- /dev/null +++ b/tests/test_blobxfer_blob_page_operations.py @@ -0,0 +1,29 @@ +# coding=utf-8 +"""Tests for models""" + +# stdlib imports +# non-stdlib imports +import azure.storage +import pytest +# local imports +import blobxfer.models as models +# module under test +import blobxfer.blob.page.operations as ops + + +def test_create_client(): + sa = models.AzureStorageAccount('name', 'key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.PageBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSharedKeyAuthentication) + + sa = models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.PageBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) diff --git a/tests/test_blobxfer_file_operations.py b/tests/test_blobxfer_file_operations.py new file mode 100644 index 0000000..56c4b95 --- /dev/null +++ b/tests/test_blobxfer_file_operations.py @@ -0,0 +1,29 @@ +# coding=utf-8 +"""Tests for models""" + +# stdlib imports +# non-stdlib imports +import azure.storage +import pytest +# local imports +import blobxfer.models as models +# module under test +import blobxfer.file.operations as ops + + +def test_create_client(): + sa = models.AzureStorageAccount('name', 'key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.file.FileService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSharedKeyAuthentication) + + sa = 
models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.file.FileService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py new file mode 100644 index 0000000..2de4bc2 --- /dev/null +++ b/tests/test_blobxfer_models.py @@ -0,0 +1,132 @@ +# coding=utf-8 +"""Tests for models""" + +# stdlib imports +import os +# non-stdlib imports +import pytest +# module under test +import blobxfer.models + + +def test_storage_credentials(): + creds = blobxfer.models.AzureStorageCredentials() + creds.add_storage_account('sa1', 'somekey1', 'endpoint') + + a = creds.get_storage_account('sa1') + assert a.name == 'sa1' + assert a.key == 'somekey1' + assert a.endpoint == 'endpoint' + + with pytest.raises(KeyError): + a = creds.get_storage_account('sa2') + + with pytest.raises(ValueError): + creds.add_storage_account('sa1', 'somekeyxx', 'endpoint') + + creds.add_storage_account('sa2', 'somekey2', 'endpoint2') + a = creds.get_storage_account('sa1') + b = creds.get_storage_account('sa2') + assert a.name == 'sa1' + assert a.key == 'somekey1' + assert a.endpoint == 'endpoint' + assert b.name == 'sa2' + assert b.key == 'somekey2' + assert b.endpoint == 'endpoint2' + + +def test_key_is_sas(): + a = blobxfer.models.AzureStorageAccount('name', 'abcdef', 'endpoint') + assert not a.is_sas + + a = blobxfer.models.AzureStorageAccount('name', 'abcdef&blah', 'endpoint') + assert not a.is_sas + + a = blobxfer.models.AzureStorageAccount('name', '?abcdef', 'endpoint') + assert a.is_sas + + a = blobxfer.models.AzureStorageAccount( + 'name', '?sv=0&sr=1&sig=2', 'endpoint') + assert a.is_sas + + a = blobxfer.models.AzureStorageAccount( + 'name', 'sv=0&sr=1&sig=2', 'endpoint') + assert a.is_sas + + a = blobxfer.models.AzureStorageAccount( + 'name', 'sig=0&sv=0&sr=1&se=2', 'endpoint') + assert a.is_sas + + +def test_localsourcepaths_files(tmpdir): + tmpdir.mkdir('abc') + tmpdir.join('moo.cow').write('z') + abcpath = tmpdir.join('abc') + abcpath.join('hello.txt').write('hello') + abcpath.join('blah.x').write('x') + abcpath.join('blah.y').write('x') + abcpath.join('blah.z').write('x') + abcpath.mkdir('def') + defpath = abcpath.join('def') + defpath.join('world.txt').write('world') + defpath.join('moo.cow').write('y') + + a = blobxfer.models.LocalSourcePaths() + a.add_include('*.txt') + a.add_includes(['moo.cow', '*blah*']) + with pytest.raises(ValueError): + a.add_includes('abc') + a.add_exclude('**/blah.x') + a.add_excludes(['world.txt']) + with pytest.raises(ValueError): + a.add_excludes('abc') + a.add_path(str(tmpdir)) + a_set = set() + for file in a.files(): + sfile = str(file.parent_path / file.relative_path) + a_set.add(sfile) + + assert str(abcpath.join('blah.x')) not in a_set + assert str(defpath.join('world.txt')) in a_set + assert str(defpath.join('moo.cow')) not in a_set + + b = blobxfer.models.LocalSourcePaths() + b.add_includes(['moo.cow', '*blah*']) + b.add_include('*.txt') + b.add_excludes(['world.txt']) + b.add_exclude('**/blah.x') + b.add_paths([str(tmpdir)]) + for file in a.files(): + sfile = str(file.parent_path / file.relative_path) + assert sfile in a_set + + +def test_localdestinationpath(tmpdir): + tmpdir.mkdir('1') + path = tmpdir.join('1') + + a = blobxfer.models.LocalDestinationPath(str(path)) + a.is_dir = True + assert str(a.path) == str(path) + assert a.is_dir + + 
a.ensure_path_exists() + assert os.path.exists(str(a.path)) + + b = blobxfer.models.LocalDestinationPath() + b.is_dir = False + b.path = str(path) + with pytest.raises(RuntimeError): + b.ensure_path_exists() + assert not b.is_dir + + path2 = tmpdir.join('2') + path3 = path2.join('3') + c = blobxfer.models.LocalDestinationPath(str(path3)) + with pytest.raises(RuntimeError): + c.ensure_path_exists() + c.is_dir = False + c.ensure_path_exists() + assert os.path.exists(str(path2)) + assert os.path.isdir(str(path2)) + assert not c.is_dir diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py index bc17d06..37e070c 100644 --- a/tests/test_blobxfer_util.py +++ b/tests/test_blobxfer_util.py @@ -83,7 +83,7 @@ def test_scantree(tmpdir): defpath = abcpath.join('def') defpath.join('world.txt').write('world') found = set() - for de in blobxfer.util.scantree(str(tmpdir.dirpath())): + for de in blobxfer.util.scantree(str(tmpdir)): if de.name != '.lock': found.add(de.name) assert 'hello.txt' in found @@ -103,7 +103,10 @@ def test_get_mime_type(): def test_base64_encode_as_string(): a = b'abc' enc = blobxfer.util.base64_encode_as_string(a) - assert type(enc) != bytes + if blobxfer.util.on_python2(): + assert type(enc) == str + else: + assert type(enc) != bytes dec = blobxfer.util.base64_decode_string(enc) assert a == dec diff --git a/tox.ini b/tox.ini index f2b110d..58a6df6 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py35 +envlist = py27, py35 [testenv] deps = -rtest_requirements.txt From f0a4209e438420171927e742e03c686e1ab43533 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 14 Feb 2017 14:57:29 -0800 Subject: [PATCH 03/47] Continue download model evolution with tests --- blobxfer/__init__.py | 5 + blobxfer/api.py | 1 - blobxfer/blob/operations.py | 2 +- blobxfer/file/operations.py | 37 +++++-- blobxfer/models.py | 177 ++++++++++++++++++++++++------ blobxfer/operations.py | 38 ++++++- blobxfer/util.py | 38 ++++++- cli/cli.py | 14 ++- cli/settings.py | 104 ++++++++++++++++-- tests/test_blobxfer.py | 13 +++ tests/test_blobxfer_models.py | 61 +++++++++- tests/test_blobxfer_operations.py | 83 ++++++++++++++ tests/test_blobxfer_util.py | 43 ++++++++ 13 files changed, 552 insertions(+), 64 deletions(-) create mode 100644 tests/test_blobxfer.py create mode 100644 tests/test_blobxfer_operations.py diff --git a/blobxfer/__init__.py b/blobxfer/__init__.py index 157d59f..e05319c 100644 --- a/blobxfer/__init__.py +++ b/blobxfer/__init__.py @@ -23,3 +23,8 @@ # DEALINGS IN THE SOFTWARE. 
from .version import __version__ # noqa + +# monkeypatch User-Agent string +import azure.storage +azure.storage._constants.USER_AGENT_STRING = 'blobxfer/{} {}'.format( + __version__, azure.storage._constants.USER_AGENT_STRING) diff --git a/blobxfer/api.py b/blobxfer/api.py index 45f2145..65e91ad 100644 --- a/blobxfer/api.py +++ b/blobxfer/api.py @@ -32,7 +32,6 @@ # non-stdlib imports # local imports -from .blob.operations import check_if_single_blob # noqa from .blob.append.operations import ( # noqa create_client as create_append_blob_client ) diff --git a/blobxfer/blob/operations.py b/blobxfer/blob/operations.py index 567d9aa..126c267 100644 --- a/blobxfer/blob/operations.py +++ b/blobxfer/blob/operations.py @@ -40,7 +40,7 @@ def check_if_single_blob(client, container, prefix): # type: (azure.storage.blob.BaseBlobService, str, str) -> bool - """List append blobs in path + """Check if prefix is a single blob or multiple blobs :param azure.storage.blob.BaseBlobService client: blob client :param str container: container :param str prefix: path prefix diff --git a/blobxfer/file/operations.py b/blobxfer/file/operations.py index 38705d6..3501b65 100644 --- a/blobxfer/file/operations.py +++ b/blobxfer/file/operations.py @@ -30,10 +30,14 @@ ) # stdlib imports import logging +try: + import pathlib2 as pathlib +except ImportError: + import pathlib # non-stdlib imports +import azure.common from azure.storage.file import FileService # local imports -from ..util import is_none_or_empty # create logger logger = logging.getLogger(__name__) @@ -59,18 +63,30 @@ def create_client(storage_account): return client -def check_if_single_file(client, container, prefix): - # type: (azure.storage.blob.BaseBlobService, str, str) -> bool - """List append blobs in path - :param azure.storage.blob.BaseBlobService client: blob client - :param str container: container +def parse_file_path(filepath): + # type: (pathlib.Path) -> tuple + if not isinstance(filepath, pathlib.Path): + filepath = pathlib.Path(filepath) + dirname = '/'.join(filepath.parts[:len(filepath.parts) - 1]) + return (dirname, filepath.parts[-1]) + + +def check_if_single_file(client, fileshare, prefix): + # type: (azure.storage.file.FileService, str, str) -> bool + """Check if prefix is a single file or multiple files + :param FileService client: blob client + :param str fileshare: file share name :param str prefix: path prefix :rtype: bool - :return: if prefix in container is a single blob + :return: if prefix in fileshare is a single file """ - blobs = client.list_blobs( - container_name=container, prefix=prefix, num_results=1) - return is_none_or_empty(blobs.next_marker) + dirname, fname = parse_file_path(prefix) + try: + client.get_file_properties( + share_name=fileshare, directory_name=dirname, file_name=fname) + except azure.common.AzureMissingResourceHttpError: + return False + return True def list_blobs(client, container, prefix, mode): @@ -81,5 +97,4 @@ def list_blobs(client, container, prefix, mode): :param str container: container :param str prefix: path prefix """ - pass diff --git a/blobxfer/models.py b/blobxfer/models.py index d511e15..1e7e583 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -41,7 +41,16 @@ import pathlib # non-stdlib imports # local imports -from .util import scantree +from .api import ( + create_append_blob_client, + create_block_blob_client, + create_file_client, + create_page_blob_client, +) +from .util import ( + normalize_azure_path, + scantree, +) # create logger logger = logging.getLogger(__name__) @@ 
-81,15 +90,12 @@ class AzureStorageModes(enum.Enum): 'UploadOptions', [ 'chunk_size_bytes', 'delete_extraneous_destination', - 'exclude', - 'include', 'mode', 'overwrite', 'recursive', 'rsa_private_key', 'rsa_private_key_passphrase', 'rsa_public_key', - 'skip_on', 'store_file_attributes', 'store_file_md5', 'strip_components', @@ -101,15 +107,12 @@ class AzureStorageModes(enum.Enum): 'DownloadOptions', [ 'check_file_md5', 'delete_extraneous_destination', - 'exclude', - 'include', 'mode', 'overwrite', 'recursive', 'restore_file_attributes', 'rsa_private_key', - 'rsa_private_key_passphrase', - 'skip_on', + 'rsa_private_key_passphrase' ] ) SyncCopyOptions = collections.namedtuple( @@ -126,15 +129,6 @@ class AzureStorageModes(enum.Enum): 'parent_path', 'relative_path' ] ) -AzureDestinationOptions = collections.namedtuple( - 'AzureDestinationOptions', [ - 'path', - ] -) -AzureSourceOptions = collections.namedtuple( - 'AzureSourceOptions', [ - ] -) class AzureStorageCredentials(object): @@ -177,10 +171,15 @@ def __init__(self, name, key, endpoint): :param str key: storage key or sas :param str endpoint: endpoint """ + self._append_blob_client = None + self._block_blob_client = None + self._file_client = None + self._page_blob_client = None self.name = name self.key = key self.endpoint = endpoint self.is_sas = self._key_is_sas(self.key) + self._create_clients() @staticmethod def _key_is_sas(key): @@ -205,6 +204,56 @@ def _key_is_sas(key): return True return False + def _create_clients(self): + # type: (AzureStorageAccount) -> None + """Create Azure Storage clients + :param AzureStorageAccount self: this + """ + self._append_blob_client = create_append_blob_client(self) + self._block_blob_client = create_block_blob_client(self) + self._file_client = create_file_client(self) + self._page_blob_client = create_page_blob_client(self) + + @property + def append_blob_client(self): + # type: (AzureStorageAccount) -> azure.storage.blob.AppendBlobService + """Get append blob client + :param AzureStorageAccount self: this + :rtype: azure.storage.blob.AppendBlobService + :return: append blob client + """ + return self._append_blob_client + + @property + def block_blob_client(self): + # type: (AzureStorageAccount) -> azure.storage.blob.BlockBlobService + """Get block blob client + :param AzureStorageAccount self: this + :rtype: azure.storage.blob.BlockBlobService + :return: block blob client + """ + return self._block_blob_client + + @property + def file_client(self): + # type: (AzureStorageAccount) -> azure.storage.file.FileService + """Get file client + :param AzureStorageAccount self: this + :rtype: azure.storage.file.FileService + :return: file client + """ + return self._file_client + + @property + def page_blob_client(self): + # type: (AzureStorageAccount) -> azure.storage.blob.PageBlobService + """Get page blob client + :param AzureStorageAccount self: this + :rtype: azure.storage.blob.PageBlobService + :return: page blob client + """ + return self._page_blob_client + class _BaseSourcePaths(object): """Base Source Paths""" @@ -217,6 +266,16 @@ def __init__(self): self._exclude = None self._paths = [] + @property + def paths(self): + # type: (_BaseSourcePaths) -> List[pathlib.Path] + """Stored paths + :param _BaseSourcePaths self: this + :rtype: list + :return: list of pathlib.Path + """ + return self._paths + def add_include(self, incl): # type: (_BaseSourcePaths, str) -> None """Add an include @@ -271,7 +330,10 @@ def add_path(self, path): :param _BaseSourcePaths self: this :param str path: path to add 
""" - self._paths.append(pathlib.Path(path)) + if isinstance(path, pathlib.Path): + self._paths.append(path) + else: + self._paths.append(pathlib.Path(path)) def add_paths(self, paths): # type: (_BaseSourcePaths, list) -> None @@ -390,28 +452,79 @@ def ensure_path_exists(self): mode=0o750, parents=True, exist_ok=True) -class AzureSourcePaths(_BaseSourcePaths): - def __init__(self, mode): - super.__init__() - self._mode = mode +class DownloadSpecification(object): + """DownloadSpecification""" + def __init__( + self, download_options, skip_on_options, local_destination_path): + # type: (DownloadSpecification, DownloadOptions, SkipOnOptions, + # LocalDestinationPath) -> None + """Ctor for DownloadSpecification + :param DownloadSepcification self: this + :param DownloadOptions download_options: download options + :param SkipOnOptions skip_on_options: skip on options + :param LocalDestinationPath local_destination_path: local dest path + """ + self.options = download_options + self.skip_on = skip_on_options + self.destination = local_destination_path + self.sources = [] + + def add_azure_source_path(self, source): + # type: (DownloadSpecification, AzureSourcePath) -> None + """Add an Azure Source Path + :param DownloadSepcification self: this + :param AzureSourcePath source: Azure source path to add + """ + self.sources.append(source) - def set_clients(self, append, block, file, page): - pass - def files(self): - if self._mode == AzureStorageModes.Auto: +class AzureSourcePath(_BaseSourcePaths): + """AzureSourcePath""" + def __init__(self): + # type: (AzureSourcePath) -> None + """Ctor for AzureSourcePath + :param AzureSourcePath self: this + """ + super(AzureSourcePath, self).__init__() + self._path_map = {} + + def add_path_with_storage_account(self, remote_path, storage_account): + # type: (AzureSourcePath, str, str) -> None + """Add a path with an associated storage account + :param AzureSourcePath self: this + :param str remote_path: remote path + :param str storage_account: storage account to associate with path + """ + if len(self._path_map) >= 1: + raise RuntimeError( + 'cannot add multiple remote paths to AzureSourcePath objects') + rpath = normalize_azure_path(remote_path) + self.add_path(rpath) + self._path_map[rpath] = storage_account + + def lookup_storage_account(self, remote_path): + # type: (AzureSourcePath, str) -> str + """Lookup the storage account associated with the remote path + :param AzureSourcePath self: this + :param str remote_path: remote path + :rtype: str + :return: storage account associated with path + """ + return self._path_map[normalize_azure_path(remote_path)] + + def files(self, mode): + if mode == AzureStorageModes.Auto: pass - elif self._mode == AzureStorageModes.Append: + elif mode == AzureStorageModes.Append: pass - elif self._mode == AzureStorageModes.Block: + elif mode == AzureStorageModes.Block: pass - elif self._mode == AzureStorageModes.File: + elif mode == AzureStorageModes.File: pass - elif self._mode == AzureStorageModes.Page: + elif mode == AzureStorageModes.Page: pass else: - raise RuntimeError('unknown Azure Storage Mode: {}'.format( - self._mode)) + raise RuntimeError('unknown Azure Storage Mode: {}'.format(mode)) def _append_files(self): for _path in self._paths: diff --git a/blobxfer/operations.py b/blobxfer/operations.py index b073367..476e597 100644 --- a/blobxfer/operations.py +++ b/blobxfer/operations.py @@ -32,7 +32,43 @@ import logging # non-stdlib imports # local imports -from .models import FileDescriptor +from .models import ( # 
noqa + AzureStorageCredentials, + AzureStorageModes, + DownloadSpecification, + FileDescriptor, +) +from .blob.operations import check_if_single_blob +from .file.operations import check_if_single_file +from .util import explode_azure_path + + +def ensure_local_destination(creds, spec): + """Ensure a local destination path given a download spec + :param AzureStorageCredentials creds: creds + :param DownloadSpecification spec: download spec + """ + # ensure destination path is writable given the source + if len(spec.sources) < 1: + raise RuntimeError('no sources to download from specified') + # set is_dir for destination + spec.destination.is_dir = True + if len(spec.sources) == 1: + # we need to query the source to see if this is a directory + rpath = str(spec.sources[0].paths[0]) + sa = creds.get_storage_account( + spec.sources[0].lookup_storage_account(rpath)) + cont, dir = explode_azure_path(rpath) + if spec.options.mode == AzureStorageModes.File: + if check_if_single_file(sa.file_client, cont, dir): + spec.destination.is_dir = False + else: + if check_if_single_blob(sa.block_blob_client, cont, dir): + spec.destination.is_dir = False + logging.debug('dest is_dir={} for {} specs'.format( + spec.destination.is_dir, len(spec.sources))) + # ensure destination path + spec.destination.ensure_path_exists() def file_chunks(fd, chunk_size): diff --git a/blobxfer/util.py b/blobxfer/util.py index f498ff6..9b778bd 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -39,12 +39,12 @@ from os import scandir as scandir except ImportError: # noqa from scandir import scandir as scandir -import sys +import re # non-stdlib imports +import future.utils # local imports # global defines -_PY2 = sys.version_info.major == 2 _PAGEBLOB_BOUNDARY = 512 @@ -54,7 +54,7 @@ def on_python2(): :rtype: bool :return: if on Python2 """ - return _PY2 + return future.utils.PY2 def setup_logger(logger): # noqa @@ -148,7 +148,7 @@ def base64_encode_as_string(obj): # noqa :rtype: str :return: base64 encoded string """ - if _PY2: + if on_python2(): return base64.b64encode(obj) else: return str(base64.b64encode(obj), 'ascii') @@ -211,3 +211,33 @@ def page_align_content_length(length): if mod != 0: return length + (_PAGEBLOB_BOUNDARY - mod) return length + + +def normalize_azure_path(path): + # type: (str) -> str + """Normalize remote path (strip slashes and use forward slashes) + :param str path: path to normalize + :rtype: str + :return: normalized path + """ + if is_none_or_empty(path): + raise ValueError('provided path is invalid') + _path = path.strip('/').strip('\\') + return '/'.join(re.split('/|\\\\', _path)) + + +def explode_azure_path(path): + # type: (str) -> Tuple[str, str] + """Explodes an azure path into a container or fileshare and the + remaining virtual path + :param str path: path to explode + :rtype: tuple + :return: container, vpath + """ + rpath = normalize_azure_path(path).split('/') + container = rpath[0] + if len(rpath) > 1: + rpath = '/'.join(rpath[1:]) + else: + rpath = '' + return container, rpath diff --git a/cli/cli.py b/cli/cli.py index cb23a31..1ec99d4 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -41,6 +41,7 @@ import blobxfer.api import blobxfer.util # local imports +import download as dl import settings # create logger @@ -57,10 +58,8 @@ def __init__(self): self.yaml_config = None self.config = {} self.cli_options = {} - self.block_blob_client = None - self.page_blob_client = None - self.append_blob_client = None - self.smb_file_client = None + self.credentials = None + self.general_options = None 
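The two path helpers added to blobxfer/util.py above determine how a remote path is normalized and split into a container (or file share) and the remaining virtual path. An illustrative snippet, with behavior inferred from the helpers and their tests:

    # Illustrative only: expected behavior of the blobxfer/util.py helpers above
    from blobxfer.util import explode_azure_path, normalize_azure_path

    print(normalize_azure_path('\\cont\\r1\\r2\\'))  # cont/r1/r2
    print(normalize_azure_path('/cont\\r1/r2/'))     # cont/r1/r2
    print(explode_azure_path('cont'))                # ('cont', '')
    print(explode_azure_path('/some/remote/path'))   # ('some', 'remote/path')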
def initialize(self): # type: (CliContext) -> None @@ -68,6 +67,9 @@ def initialize(self): :param CliContext self: this """ self._init_config() + self.credentials = settings.create_azure_storage_credentials( + self.config) + self.general_options = settings.create_general_options(self.config) def _read_yaml_file(self, yaml_file): # type: (CliContext, pathlib.Path) -> None @@ -588,7 +590,9 @@ def download(ctx, local_resource, storage_account, remote_path): ctx.cli_options, settings.TransferAction.Download, local_resource, storage_account, remote_path) ctx.initialize() - raise NotImplementedError() + specs = settings.create_download_specifications(ctx.config) + for spec in specs: + dl.download(ctx.general_options, ctx.credentials, spec) @cli.command('synccopy') diff --git a/cli/settings.py b/cli/settings.py index 9d54d42..ad71bf7 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -33,6 +33,7 @@ import enum # non-stdlib imports # local imports +import blobxfer.models from blobxfer.util import is_none_or_empty, is_not_empty, merge_dict @@ -201,11 +202,98 @@ def merge_settings(config, cli_options): # merge general options if 'options' not in config: config['options'] = {} - try: - config['options']['verbose'] = cli_options['verbose'] - except KeyError: - pass - try: - config['options']['timeout_sec'] = cli_options['timeout'] - except KeyError: - pass + config['options']['progress_bar'] = cli_options['progress_bar'] + config['options']['timeout_sec'] = cli_options['timeout'] + config['options']['verbose'] = cli_options['verbose'] + + +def create_azure_storage_credentials(config): + # type: (dict) -> blobxfer.models.AzureStorageCredentials + """Create an AzureStorageCredentials object from configuration + :param dict config: config dict + :rtype: blobxfer.models.AzureStorageCredentials + :return: credentials object + """ + creds = blobxfer.models.AzureStorageCredentials() + endpoint = config['azure_storage']['endpoint'] + for name in config['azure_storage']['accounts']: + key = config['azure_storage']['accounts'][name] + creds.add_storage_account(name, key, endpoint) + return creds + + +def create_general_options(config): + # type: (dict) -> blobxfer.models.GeneralOptions + """Create a GeneralOptions object from configuration + :param dict config: config dict + :rtype: blobxfer.models.GeneralOptions + :return: general options object + """ + return blobxfer.models.GeneralOptions( + progress_bar=config['options']['progress_bar'], + timeout_sec=config['options']['timeout_sec'], + verbose=config['options']['verbose'], + ) + + +def create_download_specifications(config): + # type: (dict) -> List[blobxfer.models.DownloadSpecification] + """Create a list of DownloadSpecification objects from configuration + :param dict config: config dict + :rtype: list + :return: list of DownloadSpecification objects + """ + specs = [] + for conf in config['download']: + # create download options + confmode = conf['options']['mode'].lower() + if confmode == 'auto': + mode = blobxfer.models.AzureStorageModes.Auto + elif confmode == 'append': + mode = blobxfer.models.AzureStorageModes.Append + elif confmode == 'block': + mode = blobxfer.models.AzureStorageModes.Block + elif confmode == 'file': + mode == blobxfer.models.AzureStorageModes.File + elif confmode == 'page': + mode == blobxfer.models.AzureStorageModes.Page + else: + raise ValueError('unknown mode: {}'.format(confmode)) + ds = blobxfer.models.DownloadSpecification( + download_options=blobxfer.models.DownloadOptions( + 
check_file_md5=conf['options']['check_file_md5'], + delete_extraneous_destination=conf[ + 'options']['delete_extraneous_destination'], + mode=mode, + overwrite=conf['options']['overwrite'], + recursive=conf['options']['recursive'], + restore_file_attributes=conf[ + 'options']['restore_file_attributes'], + rsa_private_key=conf['options']['rsa_private_key'], + rsa_private_key_passphrase=conf[ + 'options']['rsa_private_key_passphrase'], + ), + skip_on_options=blobxfer.models.SkipOnOptions( + filesize_match=conf['options']['skip_on']['filesize_match'], + lmt_ge=conf['options']['skip_on']['lmt_ge'], + md5_match=conf['options']['skip_on']['md5_match'], + ), + local_destination_path=blobxfer.models.LocalDestinationPath( + conf['destination'] + ) + ) + # create remote source paths + for src in conf['source']: + if len(src) != 1: + raise RuntimeError( + 'invalid number of source pairs specified per entry') + sa = next(iter(src)) + asp = blobxfer.models.AzureSourcePath() + asp.add_path_with_storage_account(src[sa], sa) + if is_not_empty(conf['include']): + asp.add_includes(conf['include']) + if is_not_empty(conf['exclude']): + asp.add_excludes(conf['exclude']) + ds.add_azure_source_path(asp) + specs.append(ds) + return specs diff --git a/tests/test_blobxfer.py b/tests/test_blobxfer.py new file mode 100644 index 0000000..f64c084 --- /dev/null +++ b/tests/test_blobxfer.py @@ -0,0 +1,13 @@ +# coding=utf-8 +"""Tests for miscellaneous""" + +# stdlib imports +# non-stdlib imports +import azure.storage +# module under test +import blobxfer.version + + +def test_user_agent_monkey_patch(): + verstr = 'blobxfer/{}'.format(blobxfer.version.__version__) + assert azure.storage._constants.USER_AGENT_STRING.startswith(verstr) diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 2de4bc2..df1790d 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -3,7 +3,12 @@ # stdlib imports import os +try: + import pathlib2 as pathlib +except ImportError: + import pathlib # non-stdlib imports +import azure.storage import pytest # module under test import blobxfer.models @@ -17,6 +22,14 @@ def test_storage_credentials(): assert a.name == 'sa1' assert a.key == 'somekey1' assert a.endpoint == 'endpoint' + assert isinstance( + a.append_blob_client, azure.storage.blob.AppendBlobService) + assert isinstance( + a.block_blob_client, azure.storage.blob.BlockBlobService) + assert isinstance( + a.file_client, azure.storage.file.FileService) + assert isinstance( + a.page_blob_client, azure.storage.blob.PageBlobService) with pytest.raises(KeyError): a = creds.get_storage_account('sa2') @@ -86,6 +99,7 @@ def test_localsourcepaths_files(tmpdir): sfile = str(file.parent_path / file.relative_path) a_set.add(sfile) + assert len(a.paths) == 1 assert str(abcpath.join('blah.x')) not in a_set assert str(defpath.join('world.txt')) in a_set assert str(defpath.join('moo.cow')) not in a_set @@ -95,7 +109,7 @@ def test_localsourcepaths_files(tmpdir): b.add_include('*.txt') b.add_excludes(['world.txt']) b.add_exclude('**/blah.x') - b.add_paths([str(tmpdir)]) + b.add_paths([pathlib.Path(str(tmpdir))]) for file in a.files(): sfile = str(file.parent_path / file.relative_path) assert sfile in a_set @@ -130,3 +144,48 @@ def test_localdestinationpath(tmpdir): assert os.path.exists(str(path2)) assert os.path.isdir(str(path2)) assert not c.is_dir + + +def test_azuresourcepath(): + p = '/cont/remote/path' + asp = blobxfer.models.AzureSourcePath() + asp.add_path_with_storage_account(p, 'sa') + + with 
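For reference, a hedged sketch of the configuration shape that create_download_specifications in cli/settings.py above expects after merge_settings has run; the account name, key, and paths are placeholders, and the nested keys mirror what the function reads:

    # Hypothetical input for settings.create_download_specifications;
    # 'mysa', the key, and the paths below are placeholders.
    config = {
        'options': {'progress_bar': True, 'timeout_sec': None, 'verbose': False},
        'azure_storage': {
            'endpoint': 'core.windows.net',
            'accounts': {'mysa': 'account-key-or-sas'},
        },
        'download': [{
            'source': [{'mysa': 'container/remote/path'}],
            'destination': '/tmp/download',
            'include': None,
            'exclude': None,
            'options': {
                'check_file_md5': True,
                'delete_extraneous_destination': False,
                'mode': 'auto',
                'overwrite': True,
                'recursive': True,
                'restore_file_attributes': False,
                'rsa_private_key': None,
                'rsa_private_key_passphrase': None,
                'skip_on': {
                    'filesize_match': False,
                    'lmt_ge': False,
                    'md5_match': True,
                },
            },
        }],
    }

Each entry under 'download' yields one DownloadSpecification, with one AzureSourcePath added per source pair.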
pytest.raises(RuntimeError): + asp.add_path_with_storage_account('x', 'x') + + assert 'sa' == asp.lookup_storage_account(p) + + +def test_downloadspecification(): + ds = blobxfer.models.DownloadSpecification( + download_options=blobxfer.models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=blobxfer.models.AzureStorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + rsa_private_key_passphrase=None, + ), + skip_on_options=blobxfer.models.SkipOnOptions( + filesize_match=True, + lmt_ge=False, + md5_match=True, + ), + local_destination_path=blobxfer.models.LocalDestinationPath('dest'), + ) + + asp = blobxfer.models.AzureSourcePath() + p = 'some/remote/path' + asp.add_path_with_storage_account(p, 'sa') + + ds.add_azure_source_path(asp) + + assert ds.options.check_file_md5 + assert not ds.skip_on.lmt_ge + assert ds.destination.path == pathlib.Path('dest') + assert len(ds.sources) == 1 + assert p in ds.sources[0]._path_map + assert ds.sources[0]._path_map[p] == 'sa' diff --git a/tests/test_blobxfer_operations.py b/tests/test_blobxfer_operations.py new file mode 100644 index 0000000..4ddc104 --- /dev/null +++ b/tests/test_blobxfer_operations.py @@ -0,0 +1,83 @@ +# coding=utf-8 +"""Tests for operations""" + +# stdlib imports +from mock import ( + MagicMock, + patch, +) +# non-stdlib imports +import pytest +# local imports +import blobxfer.models +# module under test +import blobxfer.operations as ops + + +@patch('blobxfer.operations.check_if_single_file') +@patch('blobxfer.operations.check_if_single_blob') +def test_ensure_local_destination(patched_blob, patched_file, tmpdir): + downdir = tmpdir.join('down') + + # non-file tests + ds = blobxfer.models.DownloadSpecification( + download_options=blobxfer.models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=blobxfer.models.AzureStorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + rsa_private_key_passphrase=None, + ), + skip_on_options=MagicMock(), + local_destination_path=blobxfer.models.LocalDestinationPath( + str(downdir) + ), + ) + + with pytest.raises(RuntimeError): + ops.ensure_local_destination(MagicMock(), ds) + + asp = blobxfer.models.AzureSourcePath() + p = 'cont/remote/path' + asp.add_path_with_storage_account(p, 'sa') + + ds.add_azure_source_path(asp) + + patched_blob.return_value = False + ops.ensure_local_destination(MagicMock(), ds) + assert ds.destination.is_dir + + patched_blob.return_value = True + with pytest.raises(RuntimeError): + ops.ensure_local_destination(MagicMock(), ds) + + # file tests + ds = blobxfer.models.DownloadSpecification( + download_options=blobxfer.models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=blobxfer.models.AzureStorageModes.File, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + rsa_private_key_passphrase=None, + ), + skip_on_options=MagicMock(), + local_destination_path=blobxfer.models.LocalDestinationPath( + str(downdir) + ), + ) + + ds.add_azure_source_path(asp) + + patched_file.return_value = False + ops.ensure_local_destination(MagicMock(), ds) + assert ds.destination.is_dir + + patched_file.return_value = True + with pytest.raises(RuntimeError): + ops.ensure_local_destination(MagicMock(), ds) diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py index 37e070c..9b6084e 100644 --- 
a/tests/test_blobxfer_util.py +++ b/tests/test_blobxfer_util.py @@ -131,6 +131,49 @@ def test_compute_md5(tmpdir): def test_page_align_content_length(): assert 0 == blobxfer.util.page_align_content_length(0) + assert 512 == blobxfer.util.page_align_content_length(1) assert 512 == blobxfer.util.page_align_content_length(511) assert 512 == blobxfer.util.page_align_content_length(512) assert 1024 == blobxfer.util.page_align_content_length(513) + assert 1024 == blobxfer.util.page_align_content_length(1023) + assert 1024 == blobxfer.util.page_align_content_length(1024) + assert 1536 == blobxfer.util.page_align_content_length(1025) + + +def test_normalize_azure_path(): + a = '\\cont\\r1\\r2\\r3\\' + b = blobxfer.util.normalize_azure_path(a) + assert b == 'cont/r1/r2/r3' + + a = '/cont/r1/r2/r3/' + b = blobxfer.util.normalize_azure_path(a) + assert b == 'cont/r1/r2/r3' + + a = '/cont\\r1/r2\\r3/' + b = blobxfer.util.normalize_azure_path(a) + assert b == 'cont/r1/r2/r3' + + with pytest.raises(ValueError): + blobxfer.util.normalize_azure_path('') + + +def test_explode_azure_path(): + p = 'cont' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'cont' + assert rpath == '' + + p = 'cont/' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'cont' + assert rpath == '' + + p = 'cont/a/' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'cont' + assert rpath == 'a' + + p = '/some/remote/path' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'some' + assert rpath == 'remote/path' From 616c7666bc8920d014320c4c029d0000dc286b52 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 16 Feb 2017 15:00:39 -0800 Subject: [PATCH 04/47] Add some crypto models --- blobxfer/blob/operations.py | 29 ++- blobxfer/crypto/__init__.py | 0 blobxfer/crypto/models.py | 286 +++++++++++++++++++++++ blobxfer/crypto/operations.py | 130 +++++++++++ blobxfer/models.py | 50 ++-- blobxfer/operations.py | 27 ++- cli/settings.py | 33 +-- setup.py | 1 + tests/test_blobxfer_crypto_models.py | 202 ++++++++++++++++ tests/test_blobxfer_crypto_operations.py | 44 ++++ tests/test_blobxfer_models.py | 1 - tests/test_blobxfer_operations.py | 6 +- 12 files changed, 757 insertions(+), 52 deletions(-) create mode 100644 blobxfer/crypto/__init__.py create mode 100644 blobxfer/crypto/models.py create mode 100644 blobxfer/crypto/operations.py create mode 100644 tests/test_blobxfer_crypto_models.py create mode 100644 tests/test_blobxfer_crypto_operations.py diff --git a/blobxfer/blob/operations.py b/blobxfer/blob/operations.py index 126c267..405e384 100644 --- a/blobxfer/blob/operations.py +++ b/blobxfer/blob/operations.py @@ -31,8 +31,9 @@ # stdlib imports import logging # non-stdlib imports +import azure.common +import azure.storage.blob.models # local imports -from ..util import is_none_or_empty # create logger logger = logging.getLogger(__name__) @@ -47,18 +48,28 @@ def check_if_single_blob(client, container, prefix): :rtype: bool :return: if prefix in container is a single blob """ - blobs = client.list_blobs( - container_name=container, prefix=prefix, num_results=1) - return is_none_or_empty(blobs.next_marker) + try: + client.get_blob_properties( + container_name=container, blob_name=prefix) + except azure.common.AzureMissingResourceHttpError: + return False + return True -def list_blobs(client, container, prefix, mode): - # type: (azure.storage.blob.BaseBlobService, str, str, - # blobxfer.models.AzureStorageModes) -> list +def list_blobs(client, container, prefix): + # 
type: (azure.storage.blob.BaseBlobService, str, + # str) -> azure.storage.blob.models.Blob """List blobs in path conforming to mode :param azure.storage.blob.BaseBlobService client: blob client :param str container: container :param str prefix: path prefix + :rtype: azure.storage.blob.models.Blob + :return: generator of blobs """ - - pass + blobs = client.list_blobs( + container_name=container, + prefix=prefix, + include=azure.storage.blob.models.Include.METADATA, + ) + for blob in blobs: + yield blob diff --git a/blobxfer/crypto/__init__.py b/blobxfer/crypto/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/crypto/models.py b/blobxfer/crypto/models.py new file mode 100644 index 0000000..4300b58 --- /dev/null +++ b/blobxfer/crypto/models.py @@ -0,0 +1,286 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
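The reworked helpers in blobxfer/blob/operations.py above are thin wrappers over the storage SDK: check_if_single_blob now probes with get_blob_properties instead of listing, and list_blobs yields Blob objects with metadata included. A hedged usage sketch, where the account name, key, container, and prefix are placeholders:

    # Hypothetical usage of blobxfer.blob.operations with an AzureStorageAccount;
    # 'mysa', 'key', 'mycontainer', and the prefixes are placeholders.
    import blobxfer.blob.operations as blob_ops
    import blobxfer.models as models

    sa = models.AzureStorageAccount('mysa', 'key', 'core.windows.net')
    if blob_ops.check_if_single_blob(
            sa.block_blob_client, 'mycontainer', 'path/blob.bin'):
        print('prefix resolves to a single blob')
    for blob in blob_ops.list_blobs(sa.block_blob_client, 'mycontainer', 'path/'):
        # each yielded object is an azure.storage.blob.models.Blob with metadata
        print(blob.name)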
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import base64 +import collections +import hashlib +import hmac +import json +import logging +# non-stdlib imports +# local imports +import blobxfer.crypto.operations +import blobxfer.util + + +# encryption constants +_AES256_KEYLENGTH_BYTES = 32 +_AES256_BLOCKSIZE_BYTES = 16 +_HMACSHA256_DIGESTSIZE_BYTES = 32 +_AES256CBC_HMACSHA256_OVERHEAD_BYTES = ( + _AES256_BLOCKSIZE_BYTES + _HMACSHA256_DIGESTSIZE_BYTES +) + +# named tuples +EncryptionBlobxferExtensions = collections.namedtuple( + 'EncryptionBlobxferExtensions', [ + 'pre_encrypted_content_md5', + ] +) +EncryptionAgent = collections.namedtuple( + 'EncryptionAgent', [ + 'encryption_algorithm', + 'protocol', + ] +) +EncryptionAuthentication = collections.namedtuple( + 'EncryptionAuthentication', [ + 'algorithm', + 'message_authentication_code', + ] +) +EncryptionWrappedContentKey = collections.namedtuple( + 'EncryptionWrappedContentKey', [ + 'algorithm', + 'encrypted_authentication_key', + 'encrypted_key', + 'key_id', + ] +) +EncryptionMetadataAuthentication = collections.namedtuple( + 'EncryptionMetadataAuthentication', [ + 'algorithm', + 'encoding', + 'message_authentication_code', + ] +) + + +class EncryptionMetadata(object): + """EncryptionMetadata""" + + # constants + _ENCRYPTION_MODE = 'FullBlob' + _ENCRYPTION_PROTOCOL_VERSION = '1.0' + _ENCRYPTION_ALGORITHM = 'AES_CBC_256' + _ENCRYPTED_KEY_SCHEME = 'RSA-OAEP' + _AUTH_ALGORITHM = 'HMAC-SHA256' + _AUTH_ENCODING_TYPE = 'UTF-8' + + _METADATA_KEY_NAME = 'encryptiondata' + _METADATA_KEY_AUTH_NAME = 'encryptiondata_authentication' + + _JSON_KEY_ENCRYPTION_MODE = 'EncryptionMode' + _JSON_KEY_ALGORITHM = 'Algorithm' + _JSON_KEY_MAC = 'MessageAuthenticationCode' + _JSON_KEY_ENCRYPTION_AGENT = 'EncryptionAgent' + _JSON_KEY_PROTOCOL = 'Protocol' + _JSON_KEY_ENCRYPTION_ALGORITHM = 'EncryptionAlgorithm' + _JSON_KEY_INTEGRITY_AUTH = 'EncryptionAuthentication' + _JSON_KEY_WRAPPEDCONTENTKEY = 'WrappedContentKey' + _JSON_KEY_ENCRYPTED_KEY = 'EncryptedKey' + _JSON_KEY_ENCRYPTED_AUTHKEY = 'EncryptedAuthenticationKey' + _JSON_KEY_CONTENT_IV = 'ContentEncryptionIV' + _JSON_KEY_KEYID = 'KeyId' + _JSON_KEY_BLOBXFER_EXTENSIONS = 'BlobxferExtensions' + _JSON_KEY_PREENCRYPTED_MD5 = 'PreEncryptedContentMD5' + + _JSON_KEY_AUTH_METAAUTH = 'EncryptionMetadataAuthentication' + _JSON_KEY_AUTH_ENCODING = 'Encoding' + + def __init__(self): + # type: (EncryptionMetadata) -> None + """Ctor for EncryptionMetadata + :param EncryptionMetadata self: this + """ + self.blobxfer_extensions = None + self.content_encryption_iv = None + self.encryption_agent = None + self.encryption_authentication = None + self.encryption_mode = None + self.key_wrapping_metadata = {} + self.wrapped_content_key = None + self.encryption_metadata_authentication = None + self._symkey = None + self._signkey = None + + @staticmethod + def encryption_metadata_exists(md): + # type: (dict) -> bool + """Check if encryption metadata exists in json metadata + :param dict md: metadata dictionary + :rtype: bool + :return: if encryption metadata exists + """ + try: + if blobxfer.util.is_not_empty( + md[EncryptionMetadata._METADATA_KEY_NAME]): + return True + except (KeyError, TypeError): + pass + return False + + def convert_from_json(self, md, blobname, rsaprivatekey): + # type: 
(EncryptionMetadata, dict, str, + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey) + # -> None + """Read metadata json into objects + :param EncryptionMetadata self: this + :param dict md: metadata dictionary + :param str blobname: blob name + :param rsaprivatekey: RSA private key + :type rsaprivatekey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + """ + # populate from encryption data + ed = json.loads(md[EncryptionMetadata._METADATA_KEY_NAME]) + try: + self.blobxfer_extensions = EncryptionBlobxferExtensions( + pre_encrypted_content_md5=ed[ + EncryptionMetadata._JSON_KEY_BLOBXFER_EXTENSIONS][ + EncryptionMetadata._JSON_KEY_PREENCRYPTED_MD5], + ) + except KeyError: + pass + self.content_encryption_iv = ed[ + EncryptionMetadata._JSON_KEY_CONTENT_IV] + self.encryption_agent = EncryptionAgent( + encryption_algorithm=ed[ + EncryptionMetadata._JSON_KEY_ENCRYPTION_AGENT][ + EncryptionMetadata._JSON_KEY_ENCRYPTION_ALGORITHM], + protocol=ed[ + EncryptionMetadata._JSON_KEY_ENCRYPTION_AGENT][ + EncryptionMetadata._JSON_KEY_PROTOCOL], + ) + if (self.encryption_agent.encryption_algorithm != + EncryptionMetadata._ENCRYPTION_ALGORITHM): + raise RuntimeError('{}: unknown block cipher: {}'.format( + blobname, self.encryption_agent.encryption_algorithm)) + if (self.encryption_agent.protocol != + EncryptionMetadata._ENCRYPTION_PROTOCOL_VERSION): + raise RuntimeError('{}: unknown encryption protocol: {}'.format( + blobname, self.encryption_agent.protocol)) + self.encryption_authentication = EncryptionAuthentication( + algorithm=ed[ + EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH][ + EncryptionMetadata._JSON_KEY_ALGORITHM], + message_authentication_code=ed[ + EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH][ + EncryptionMetadata._JSON_KEY_MAC], + ) + if (self.encryption_authentication.algorithm != + EncryptionMetadata._AUTH_ALGORITHM): + raise RuntimeError( + '{}: unknown integrity/auth method: {}'.format( + blobname, self.encryption_authentication.algorithm)) + self.encryption_mode = ed[ + EncryptionMetadata._JSON_KEY_ENCRYPTION_MODE] + if self.encryption_mode != EncryptionMetadata._ENCRYPTION_MODE: + raise RuntimeError( + '{}: unknown encryption mode: {}'.format( + blobname, self.encryption_mode)) + try: + _eak = ed[EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_ENCRYPTED_AUTHKEY] + except KeyError: + _eak = None + self.wrapped_content_key = EncryptionWrappedContentKey( + algorithm=ed[ + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_ALGORITHM], + encrypted_authentication_key=_eak, + encrypted_key=ed[ + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_ENCRYPTED_KEY], + key_id=ed[ + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_KEYID], + ) + if (self.wrapped_content_key.algorithm != + EncryptionMetadata._ENCRYPTED_KEY_SCHEME): + raise RuntimeError('{}: unknown key encryption scheme: {}'.format( + blobname, self.wrapped_content_key.algorithm)) + # if RSA key is a public key, stop here as keys cannot be decrypted + if rsaprivatekey is None: + return + # decrypt symmetric key + self._symkey = blobxfer.crypto.operations.\ + rsa_decrypt_base64_encoded_key( + rsaprivatekey, self.wrapped_content_key.encrypted_key) + # decrypt signing key, if it exists + if blobxfer.util.is_not_empty( + self.wrapped_content_key.encrypted_authentication_key): + self._signkey = blobxfer.crypto.operations.\ + rsa_decrypt_base64_encoded_key( + rsaprivatekey, + 
self.wrapped_content_key.encrypted_authentication_key) + else: + self._signkey = None + # populate from encryption data authentication + try: + eda = json.loads(md[EncryptionMetadata._METADATA_KEY_AUTH_NAME]) + except KeyError: + pass + else: + self.encryption_metadata_authentication = \ + EncryptionMetadataAuthentication( + algorithm=eda[ + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH][ + EncryptionMetadata._JSON_KEY_ALGORITHM], + encoding=eda[ + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH][ + EncryptionMetadata._JSON_KEY_AUTH_ENCODING], + message_authentication_code=eda[ + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH][ + EncryptionMetadata._JSON_KEY_MAC], + ) + if (self.encryption_metadata_authentication.algorithm != + EncryptionMetadata._AUTH_ALGORITHM): + raise RuntimeError( + '{}: unknown integrity/auth method: {}'.format( + blobname, + self.encryption_metadata_authentication.algorithm)) + # verify hmac + authhmac = base64.b64decode( + self.encryption_metadata_authentication. + message_authentication_code) + bmeta = md[EncryptionMetadata._METADATA_KEY_NAME].encode( + self.encryption_metadata_authentication.encoding) + hmacsha256 = hmac.new(self._signkey, digestmod=hashlib.sha256) + hmacsha256.update(bmeta) + if hmacsha256.digest() != authhmac: + raise RuntimeError( + '{}: encryption metadata authentication failed'.format( + blobname)) + + def convert_to_json_with_mac(self): + pass diff --git a/blobxfer/crypto/operations.py b/blobxfer/crypto/operations.py new file mode 100644 index 0000000..9a0f099 --- /dev/null +++ b/blobxfer/crypto/operations.py @@ -0,0 +1,130 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
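Note: convert_from_json above is driven entirely by the two metadata keys
"encryptiondata" and "encryptiondata_authentication"; it raises RuntimeError
on an unknown cipher, protocol, mode or key wrapping scheme and on a failed
HMAC, and if the RSA private key argument is None it only populates the
metadata fields without unwrapping the symmetric or signing keys. A short
usage sketch, mirroring the unit tests added later in this patch; blob and
rsa_private_key are assumed placeholders:

    import blobxfer.crypto.models as crypto_models

    md = blob.metadata  # blob metadata dict, e.g. from list_blobs
    if crypto_models.EncryptionMetadata.encryption_metadata_exists(md):
        em = crypto_models.EncryptionMetadata()
        em.convert_from_json(md, blob.name, rsa_private_key)
    else:
        em = None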
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import base64 +import logging +# non-stdlib imports +import cryptography.hazmat.backends +import cryptography.hazmat.primitives.asymmetric.padding +import cryptography.hazmat.primitives.asymmetric.rsa +import cryptography.hazmat.primitives.ciphers +import cryptography.hazmat.primitives.ciphers.algorithms +import cryptography.hazmat.primitives.ciphers.modes +import cryptography.hazmat.primitives.constant_time +import cryptography.hazmat.primitives.hashes +import cryptography.hazmat.primitives.padding +import cryptography.hazmat.primitives.serialization +# local imports +import blobxfer.util + + +def load_rsa_private_key_file(rsakeyfile, passphrase): + # type: (str, str) -> + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + """Load an RSA Private key PEM file with passphrase if specified + :param str rsakeyfile: RSA private key PEM file to load + :param str passphrase: optional passphrase + :rtype: cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + :return: RSAPrivateKey + """ + with open(rsakeyfile, 'rb') as keyfile: + return cryptography.hazmat.primitives.serialization.\ + load_pem_private_key( + keyfile.read(), + passphrase, + backend=cryptography.hazmat.backends.default_backend() + ) + + +def load_rsa_public_key_file(rsakeyfile): + # type: (str, str) -> + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey + """Load an RSA Public key PEM file + :param str rsakeyfile: RSA public key PEM file to load + :rtype: cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey + :return: RSAPublicKey + """ + with open(rsakeyfile, 'rb') as keyfile: + return cryptography.hazmat.primitives.serialization.\ + load_pem_public_key( + keyfile.read(), + backend=cryptography.hazmat.backends.default_backend() + ) + + +def rsa_decrypt_base64_encoded_key(rsaprivatekey, enckey): + # type: (cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey, + # str) -> bytes + """Decrypt an RSA encrypted key encoded as base64 + :param rsaprivatekey: RSA private key + :type rsaprivatekey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + :param str enckey: base64-encoded key + :rtype: bytes + :return: decrypted key + """ + return rsaprivatekey.decrypt( + base64.b64decode(enckey), + cryptography.hazmat.primitives.asymmetric.padding.OAEP( + mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( + algorithm=cryptography.hazmat.primitives.hashes.SHA1() + ), + algorithm=cryptography.hazmat.primitives.hashes.SHA1(), + label=None, + ) + ) + + +def rsa_encrypt_key_base64_encoded(rsaprivatekey, rsapublickey, plainkey): + # type: (cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey, + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey, + # bytes) -> str + """Encrypt a plaintext key using RSA and PKCS1_OAEP padding + :param rsaprivatekey: RSA private key + :type rsaprivatekey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + :param rsapublickey: RSA public key + :type rsapublickey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey + :param bytes plainkey: plain key + :rtype: str + :return: encrypted key + """ + if rsapublickey is None: + rsapublickey = rsaprivatekey.public_key() + enckey = rsapublickey.encrypt( + plainkey, 
cryptography.hazmat.primitives.asymmetric.padding.OAEP( + mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( + algorithm=cryptography.hazmat.primitives.hashes.SHA1()), + algorithm=cryptography.hazmat.primitives.hashes.SHA1(), + label=None)) + return blobxfer.util.base64_encode_as_string(enckey) diff --git a/blobxfer/models.py b/blobxfer/models.py index 1e7e583..921eef8 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -47,10 +47,9 @@ create_file_client, create_page_blob_client, ) -from .util import ( - normalize_azure_path, - scantree, -) +import blobxfer.blob.operations +import blobxfer.crypto +import blobxfer.util # create logger logger = logging.getLogger(__name__) @@ -94,7 +93,6 @@ class AzureStorageModes(enum.Enum): 'overwrite', 'recursive', 'rsa_private_key', - 'rsa_private_key_passphrase', 'rsa_public_key', 'store_file_attributes', 'store_file_md5', @@ -112,7 +110,6 @@ class AzureStorageModes(enum.Enum): 'recursive', 'restore_file_attributes', 'rsa_private_key', - 'rsa_private_key_passphrase' ] ) SyncCopyOptions = collections.namedtuple( @@ -179,6 +176,9 @@ def __init__(self, name, key, endpoint): self.key = key self.endpoint = endpoint self.is_sas = self._key_is_sas(self.key) + # normalize sas keys + if self.is_sas and self.key.startswith('?'): + self.key = self.key[1:] self._create_clients() @staticmethod @@ -373,7 +373,7 @@ def files(self): for _path in self._paths: _ppath = os.path.expandvars(os.path.expanduser(str(_path))) _expath = pathlib.Path(_ppath) - for entry in scantree(_ppath): + for entry in blobxfer.util.scantree(_ppath): _rpath = pathlib.Path(entry.path).relative_to(_ppath) if not self._inclusion_check(_rpath): logger.debug( @@ -498,7 +498,7 @@ def add_path_with_storage_account(self, remote_path, storage_account): if len(self._path_map) >= 1: raise RuntimeError( 'cannot add multiple remote paths to AzureSourcePath objects') - rpath = normalize_azure_path(remote_path) + rpath = blobxfer.util.normalize_azure_path(remote_path) self.add_path(rpath) self._path_map[rpath] = storage_account @@ -510,11 +510,12 @@ def lookup_storage_account(self, remote_path): :rtype: str :return: storage account associated with path """ - return self._path_map[normalize_azure_path(remote_path)] + return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] - def files(self, mode): + def files(self, creds, mode): if mode == AzureStorageModes.Auto: - pass + for blob in self._auto_blobs(creds): + yield blob elif mode == AzureStorageModes.Append: pass elif mode == AzureStorageModes.Block: @@ -526,11 +527,34 @@ def files(self, mode): else: raise RuntimeError('unknown Azure Storage Mode: {}'.format(mode)) - def _append_files(self): + def _append_blobs(self): for _path in self._paths: - pass + def _auto_blobs(self, creds): + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for blob in blobxfer.blob.operations.list_blobs( + sa.block_blob_client, cont, dir): + if blobxfer.crypto.models.EncryptionMetadata.\ + encryption_metadata_exists(blob.metadata): + ed = blobxfer.crypto.models.EncryptionMetadata() + ed.convert_from_json(blob.metadata) + else: + ed = None + yield (_path, blob.name, ed) + + +class AzureStorageEntity(object): + def __init__(self): + self._name = None + self._size = None + self._md5 = None + self._enc = None + self._vio = None + class AzureDestinationPaths(object): def __init__(self): diff --git a/blobxfer/operations.py 
b/blobxfer/operations.py index 476e597..df1f46f 100644 --- a/blobxfer/operations.py +++ b/blobxfer/operations.py @@ -38,9 +38,9 @@ DownloadSpecification, FileDescriptor, ) -from .blob.operations import check_if_single_blob -from .file.operations import check_if_single_file -from .util import explode_azure_path +import blobxfer.blob.operations +import blobxfer.file.operations +import blobxfer.util def ensure_local_destination(creds, spec): @@ -56,15 +56,18 @@ def ensure_local_destination(creds, spec): if len(spec.sources) == 1: # we need to query the source to see if this is a directory rpath = str(spec.sources[0].paths[0]) - sa = creds.get_storage_account( - spec.sources[0].lookup_storage_account(rpath)) - cont, dir = explode_azure_path(rpath) - if spec.options.mode == AzureStorageModes.File: - if check_if_single_file(sa.file_client, cont, dir): - spec.destination.is_dir = False - else: - if check_if_single_blob(sa.block_blob_client, cont, dir): - spec.destination.is_dir = False + cont, dir = blobxfer.util.explode_azure_path(rpath) + if not blobxfer.util.is_none_or_empty(dir): + sa = creds.get_storage_account( + spec.sources[0].lookup_storage_account(rpath)) + if spec.options.mode == AzureStorageModes.File: + if blobxfer.file.operations.check_if_single_file( + sa.file_client, cont, dir): + spec.destination.is_dir = False + else: + if blobxfer.blob.operations.check_if_single_blob( + sa.block_blob_client, cont, dir): + spec.destination.is_dir = False logging.debug('dest is_dir={} for {} specs'.format( spec.destination.is_dir, len(spec.sources))) # ensure destination path diff --git a/cli/settings.py b/cli/settings.py index ad71bf7..db5c643 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -33,8 +33,9 @@ import enum # non-stdlib imports # local imports +import blobxfer.crypto.operations import blobxfer.models -from blobxfer.util import is_none_or_empty, is_not_empty, merge_dict +import blobxfer.util # enums @@ -59,16 +60,16 @@ def add_cli_options( :param str sync_copy_dest_remote_path: synccopy dest rp """ cli_options['_action'] = action.name.lower() - if is_not_empty(storage_account): + if blobxfer.util.is_not_empty(storage_account): # add credentials try: key = cli_options['access_key'] - if is_none_or_empty(key): + if blobxfer.util.is_none_or_empty(key): raise KeyError() except KeyError: try: key = cli_options['sas'] - if is_none_or_empty(key): + if blobxfer.util.is_none_or_empty(key): raise KeyError() except KeyError: raise RuntimeError('access key or sas must be provided') @@ -133,7 +134,7 @@ def add_cli_options( }, } elif action == TransferAction.Synccopy: - if is_none_or_empty(sync_copy_dest_storage_account): + if blobxfer.util.is_none_or_empty(sync_copy_dest_storage_account): raise RuntimeError( 'must specify a destination storage account') arg = { @@ -159,12 +160,12 @@ def add_cli_options( } try: destkey = cli_options['sync_copy_dest_access_key'] - if is_none_or_empty(destkey): + if blobxfer.util.is_none_or_empty(destkey): raise KeyError() except KeyError: try: destkey = cli_options['sync_copy_dest_sas'] - if is_none_or_empty(destkey): + if blobxfer.util.is_none_or_empty(destkey): raise KeyError() except KeyError: raise RuntimeError( @@ -197,7 +198,7 @@ def merge_settings(config, cli_options): if 'azure_storage' in cli_options: if 'azure_storage' not in config: config['azure_storage'] = {} - config['azure_storage'] = merge_dict( + config['azure_storage'] = blobxfer.util.merge_dict( config['azure_storage'], cli_options['azure_storage']) # merge general options if 'options' not in 
config: @@ -259,6 +260,14 @@ def create_download_specifications(config): mode == blobxfer.models.AzureStorageModes.Page else: raise ValueError('unknown mode: {}'.format(confmode)) + # load RSA private key PEM file if specified + rpk = conf['options']['rsa_private_key'] + if blobxfer.util.is_not_empty(rpk): + rpkp = conf['options']['rsa_private_key_passphrase'] + rpk = blobxfer.crypto.operations.load_rsa_private_key_file( + rpk, rpkp) + else: + rpk = None ds = blobxfer.models.DownloadSpecification( download_options=blobxfer.models.DownloadOptions( check_file_md5=conf['options']['check_file_md5'], @@ -269,9 +278,7 @@ def create_download_specifications(config): recursive=conf['options']['recursive'], restore_file_attributes=conf[ 'options']['restore_file_attributes'], - rsa_private_key=conf['options']['rsa_private_key'], - rsa_private_key_passphrase=conf[ - 'options']['rsa_private_key_passphrase'], + rsa_private_key=rpk, ), skip_on_options=blobxfer.models.SkipOnOptions( filesize_match=conf['options']['skip_on']['filesize_match'], @@ -290,9 +297,9 @@ def create_download_specifications(config): sa = next(iter(src)) asp = blobxfer.models.AzureSourcePath() asp.add_path_with_storage_account(src[sa], sa) - if is_not_empty(conf['include']): + if blobxfer.util.is_not_empty(conf['include']): asp.add_includes(conf['include']) - if is_not_empty(conf['exclude']): + if blobxfer.util.is_not_empty(conf['exclude']): asp.add_excludes(conf['exclude']) ds.add_azure_source_path(asp) specs.append(ds) diff --git a/setup.py b/setup.py index 6ea9896..889f709 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ 'blobxfer.blob.append', 'blobxfer.blob.block', 'blobxfer.blob.page', + 'blobxfer.crypto', 'blobxfer.file', 'blobxfer_cli', ] diff --git a/tests/test_blobxfer_crypto_models.py b/tests/test_blobxfer_crypto_models.py new file mode 100644 index 0000000..33045c3 --- /dev/null +++ b/tests/test_blobxfer_crypto_models.py @@ -0,0 +1,202 @@ +# coding=utf-8 +"""Tests for crypto operations""" + +# stdlib imports +import copy +import json +# non-stdlib imports +import pytest +# local imports +# module under test +import blobxfer.crypto.models as models +import blobxfer.crypto.operations as ops + + +_SAMPLE_RSA_KEY = """ +-----BEGIN RSA PRIVATE KEY----- +MIICXQIBAAKBgQDwlQ0W6O2ixhZM+LYl/ZtUi4lpjFu6+Kt/fyim/LQojaa389yD +e3lqWnAitj13n8uLpv1XuysG2fL+G0AvzT9JJj8gageJRC/8uffhOlxvH/vzfFqU +wQEgwhuv9LXdFcl+mON4TiHqbKsUmggNNPNzSN/P0aohMG8pG8ihyO3uOQIDAQAB +AoGBAIkaKA96RpKQmHzc79DOqgqQSorf9hajR/ismpovQOwrbWs/iddUMmktiOH/ +QSA+7Fx1mcK5Y1fQNO4i0X1sVjdasoPvmU7iGVgHQ9TX6F5LGQtDqAKXAH6GpjkF +V7I7nEBs2vtetpzzq8up2nY7fuwPwse44jdLGZjh1pc0HcFRAkEA/F5XdWq5ZYVo +hMyxxhdb+6J8NKZTsWn92tW0s/pGlkgDwrryglpLqNf9MR+Mm906UUVh6ZmsKoxD +kZzA+4S3bwJBAPQLSryk8CUE0uFviYYANq3asn9sDDTGcvEceSGGwbaZOTDVQNQg +7BhLL5vA8Be/xvkXfEaWa1XipmaBI+4WINcCQGQLEiid0jkIldJvQtoAUJqEYzCL +7wmZtuSVazkdsfXJPpRnf9Nk8DFSzjA3DYqMPJ4THyl3neSQDgkfVvFeP0kCQQDu +0OIJKwsJ3ueSznhw1mKrzTkh8pUbTBwNEQUEpv+H9fd+byGqtLD1sRXcwHjzdKt8 +9Nubo/VTraGS68tCYQsvAkAYxzwSeX7Gj9/mMBFx1Y5v9sSCqLZQeF7q1ltzkwlK +n3by7Z7RvxXXPjv1YoFQPV0WlA6zo4sm0HwFzA0sbOql +-----END RSA PRIVATE KEY----- +""" + +_SAMPLE_ED = \ + { + "BlobxferExtensions": { + "PreEncryptedContentMD5": "tc+p1sj+vWGPkawoQ9UKHA==" + }, + "ContentEncryptionIV": "KjA4Y14+J1p7EJcYWhnKNQ==", + "EncryptionAgent": { + "EncryptionAlgorithm": "AES_CBC_256", + "Protocol": "1.0" + }, + "EncryptionAuthentication": { + "Algorithm": "HMAC-SHA256", + "MessageAuthenticationCode": + "9oKt5Ett7t1AWahxNq3qcGd5NbZMxLtzSN8Lwqy3PgU=" + }, + "EncryptionMode": 
"FullBlob", + "KeyWrappingMetadata": {}, + "WrappedContentKey": { + "Algorithm": "RSA-OAEP", + "EncryptedAuthenticationKey": + "1kO63RxIqIyUp1EW+v2o5VwyhAlrrJiLc+seXnNcVRm0YLHzJYqOrBCz2+" + "c2do2dJKhzTOXyPsJSwkvQVJ0NuYVUTxf6bzDNip2Ge1jTHnsd5IsljMKy" + "rSAvHaKs9NxdvDu5Ex6lhKEChnuMtJBq52zCML5+LUd98WkBxdB2az4=", + "EncryptedKey": + "yOuWT2txNNzOITtDcjV1Uf3/V+TRn5AKjvOtHt+PRuBgMhq6fOFV8kcJhO" + "zPxh8bHqydIFM2OQ+ktiETQ5Ibg7OA24hhr+n8Y6nJNpw3cGtP6L/23n8a" + "a7RMKhmactl3sToFM3xvaXRO0DYuDZeQtPR/DDKPgi2gK641y1THAoc=", + "KeyId": "private:key1" + } + } + +_SAMPLE_EDA = \ + { + "EncryptionMetadataAuthentication": { + "Algorithm": "HMAC-SHA256", + "Encoding": "UTF-8", + "MessageAuthenticationCode": + "BhJjehtHxgSRIBaITDB6o6ZUt6mdehN0PDkhHtwXTP8=" + } + } + + +def test_encryption_metadata_exists(): + md = None + assert not models.EncryptionMetadata.encryption_metadata_exists(md) + + md = {} + assert not models.EncryptionMetadata.encryption_metadata_exists(md) + + md = {'encryptiondata': {}} + assert not models.EncryptionMetadata.encryption_metadata_exists(md) + + md = {'encryptiondata': {'key': 'value'}} + assert models.EncryptionMetadata.encryption_metadata_exists(md) + + +def test_convert_from_json(tmpdir): + keyfile = tmpdir.join('keyfile') + keyfile.write(_SAMPLE_RSA_KEY) + rsaprivatekey = ops.load_rsa_private_key_file(str(keyfile), None) + + # test various missing metadata fields + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionAgent']['EncryptionAlgorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionAgent']['Protocol'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionAuthentication']['Algorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionMode'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ced = copy.deepcopy(_SAMPLE_ED) + ced['WrappedContentKey'].pop('EncryptedAuthenticationKey') + ced['WrappedContentKey']['Algorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ceda = copy.deepcopy(_SAMPLE_EDA) + ceda['EncryptionMetadataAuthentication']['Algorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + _SAMPLE_ED, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(ceda) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + 
em.convert_from_json(md, 'blob', rsaprivatekey) + + # test failed hmac + ced = copy.deepcopy(_SAMPLE_ED) + ced.pop('BlobxferExtensions') + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + # test correct path + md = { + 'encryptiondata': json.dumps( + _SAMPLE_ED, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + em.convert_from_json(md, 'blob', rsaprivatekey) + assert em.wrapped_content_key is not None + assert em._symkey is not None + assert em._signkey is not None + + em = models.EncryptionMetadata() + em.convert_from_json(md, 'blob', None) + assert em.wrapped_content_key is not None + assert em._symkey is None + assert em._signkey is None + + ced = copy.deepcopy(_SAMPLE_ED) + ced['WrappedContentKey'].pop('EncryptedAuthenticationKey') + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False) + } + em = models.EncryptionMetadata() + em.convert_from_json(md, 'blob', rsaprivatekey) + assert em.wrapped_content_key is not None + assert em._symkey is not None + assert em._signkey is None diff --git a/tests/test_blobxfer_crypto_operations.py b/tests/test_blobxfer_crypto_operations.py new file mode 100644 index 0000000..1760701 --- /dev/null +++ b/tests/test_blobxfer_crypto_operations.py @@ -0,0 +1,44 @@ +# coding=utf-8 +"""Tests for crypto operations""" + +# stdlib imports +from mock import patch +import os +# non-stdlib imports +import cryptography.hazmat.primitives.asymmetric.rsa +# local imports +# module under test +import blobxfer.crypto.operations as ops + + +_RSAKEY = cryptography.hazmat.primitives.asymmetric.rsa.generate_private_key( + public_exponent=65537, key_size=2048, + backend=cryptography.hazmat.backends.default_backend()) + + +@patch('cryptography.hazmat.primitives.serialization.load_pem_private_key') +def test_load_rsa_private_key_file(patched_load, tmpdir): + keyfile = tmpdir.join('keyfile') + keyfile.write('a') + patched_load.return_value = _RSAKEY + + rv = ops.load_rsa_private_key_file(str(keyfile), None) + assert rv == _RSAKEY + + +@patch('cryptography.hazmat.primitives.serialization.load_pem_public_key') +def test_load_rsa_public_key_file(patched_load, tmpdir): + keyfile = tmpdir.join('keyfile') + keyfile.write('b') + patched_load.return_value = 'rv' + + rv = ops.load_rsa_public_key_file(str(keyfile)) + assert rv == 'rv' + + +def test_rsa_encrypt_decrypt_keys(): + symkey = os.urandom(32) + enckey = ops.rsa_encrypt_key_base64_encoded(_RSAKEY, None, symkey) + assert enckey is not None + plainkey = ops.rsa_decrypt_base64_encoded_key(_RSAKEY, enckey) + assert symkey == plainkey diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index df1790d..2042a15 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -167,7 +167,6 @@ def test_downloadspecification(): recursive=True, restore_file_attributes=False, rsa_private_key=None, - rsa_private_key_passphrase=None, ), skip_on_options=blobxfer.models.SkipOnOptions( filesize_match=True, diff --git a/tests/test_blobxfer_operations.py b/tests/test_blobxfer_operations.py index 4ddc104..f24703c 100644 --- a/tests/test_blobxfer_operations.py +++ b/tests/test_blobxfer_operations.py @@ -14,8 +14,8 @@ import blobxfer.operations as ops 
-@patch('blobxfer.operations.check_if_single_file') -@patch('blobxfer.operations.check_if_single_blob') +@patch('blobxfer.file.operations.check_if_single_file') +@patch('blobxfer.blob.operations.check_if_single_blob') def test_ensure_local_destination(patched_blob, patched_file, tmpdir): downdir = tmpdir.join('down') @@ -29,7 +29,6 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): recursive=True, restore_file_attributes=False, rsa_private_key=None, - rsa_private_key_passphrase=None, ), skip_on_options=MagicMock(), local_destination_path=blobxfer.models.LocalDestinationPath( @@ -64,7 +63,6 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): recursive=True, restore_file_attributes=False, rsa_private_key=None, - rsa_private_key_passphrase=None, ), skip_on_options=MagicMock(), local_destination_path=blobxfer.models.LocalDestinationPath( From d862d74aa2ef69523266f384bc3a0d787162726b Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 21 Feb 2017 15:06:43 -0800 Subject: [PATCH 05/47] Begin list parsing for download and MD5 offload - Add AzureStorageEntity model - Start MD5 offload api - Start download api --- blobxfer/api.py | 4 + blobxfer/blob/operations.py | 33 +++- blobxfer/download.py | 222 +++++++++++++++++++++++++ blobxfer/file/operations.py | 87 +++++++--- blobxfer/md5.py | 116 +++++++++++++ blobxfer/models.py | 213 ++++++++++++++++++++---- blobxfer/operations.py | 22 +-- blobxfer/util.py | 2 +- cli/cli.py | 3 +- setup.py | 1 + tests/test_blobxfer_blob_operations.py | 75 +++++++++ tests/test_blobxfer_file_operations.py | 97 ++++++++++- tests/test_blobxfer_models.py | 35 ++++ tests/test_blobxfer_operations.py | 4 +- 14 files changed, 842 insertions(+), 72 deletions(-) create mode 100644 blobxfer/download.py create mode 100644 blobxfer/md5.py create mode 100644 tests/test_blobxfer_blob_operations.py diff --git a/blobxfer/api.py b/blobxfer/api.py index 65e91ad..550f265 100644 --- a/blobxfer/api.py +++ b/blobxfer/api.py @@ -44,3 +44,7 @@ from .file.operations import ( # noqa create_client as create_file_client ) + +from .download import ( # noqa + download +) diff --git a/blobxfer/blob/operations.py b/blobxfer/blob/operations.py index 405e384..ef2c976 100644 --- a/blobxfer/blob/operations.py +++ b/blobxfer/blob/operations.py @@ -34,42 +34,63 @@ import azure.common import azure.storage.blob.models # local imports +import blobxfer.models # create logger logger = logging.getLogger(__name__) -def check_if_single_blob(client, container, prefix): - # type: (azure.storage.blob.BaseBlobService, str, str) -> bool +def check_if_single_blob(client, container, prefix, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, str, int) -> bool """Check if prefix is a single blob or multiple blobs :param azure.storage.blob.BaseBlobService client: blob client :param str container: container :param str prefix: path prefix + :param int timeout: timeout :rtype: bool :return: if prefix in container is a single blob """ try: client.get_blob_properties( - container_name=container, blob_name=prefix) + container_name=container, blob_name=prefix, timeout=timeout) except azure.common.AzureMissingResourceHttpError: return False return True -def list_blobs(client, container, prefix): - # type: (azure.storage.blob.BaseBlobService, str, - # str) -> azure.storage.blob.models.Blob +def list_blobs(client, container, prefix, mode, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, str, int, + # blobxfer.models.AzureStorageModes) -> + # 
azure.storage.blob.models.Blob """List blobs in path conforming to mode :param azure.storage.blob.BaseBlobService client: blob client :param str container: container :param str prefix: path prefix + :param blobxfer.models.AzureStorageModes mode: storage mode + :param int timeout: timeout :rtype: azure.storage.blob.models.Blob :return: generator of blobs """ + if mode == blobxfer.models.AzureStorageModes.File: + raise RuntimeError('cannot list Azure Files from blob client') blobs = client.list_blobs( container_name=container, prefix=prefix, include=azure.storage.blob.models.Include.METADATA, + timeout=timeout, ) for blob in blobs: + if (mode == blobxfer.models.AzureStorageModes.Append and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.AppendBlob): + continue + elif (mode == blobxfer.models.AzureStorageModes.Block and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.BlockBlob): + continue + elif (mode == blobxfer.models.AzureStorageModes.Page and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.PageBlob): + continue + # auto or match, yield the blob yield blob diff --git a/blobxfer/download.py b/blobxfer/download.py new file mode 100644 index 0000000..11264c9 --- /dev/null +++ b/blobxfer/download.py @@ -0,0 +1,222 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
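Note: list_blobs now filters on the requested storage mode: Append, Block and
Page yield only blobs whose service-side blob_type matches, Auto yields every
blob, and File raises RuntimeError because file shares cannot be listed
through a blob client. A small illustrative call, mirroring the unit tests
added below; block_blob_client is an assumed placeholder:

    import blobxfer.models
    import blobxfer.blob.operations as blob_ops

    # Block mode: page and append blobs under the prefix are skipped
    for blob in blob_ops.list_blobs(
            block_blob_client, 'container', 'path/prefix',
            blobxfer.models.AzureStorageModes.Block, timeout=None):
        print(blob.name, blob.properties.blob_type)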
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import datetime +import dateutil.tz +import enum +import logging +try: + import pathlib2 as pathlib +except ImportError: + import pathlib +import threading +# non-stdlib imports +import dateutil +# local imports +import blobxfer.md5 +import blobxfer.operations +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + +# global defines +_MD5_MAP = {} +_MD5_META_LOCK = threading.Lock() +_ALL_REMOTE_FILES_PROCESSED = False + + +class DownloadAction(enum.Enum): + Skip = 1 + CheckMd5 = 2 + Download = 3 + + +def _check_download_conditions(lpath, rfile, spec): + # type: (pathlib.Path, blobxfer.models.AzureStorageEntity, + # blobxfer.models.DownloadSpecification) -> DownloadAction + """Check for download conditions + :param pathlib.Path lpath: local path + :param blobxfer.models.AzureStorageEntity rfile: remote file + :param blobxfer.models.DownloadSpecification spec: download spec + :rtype: DownloadAction + :return: download action + """ + if not lpath.exists(): + return DownloadAction.Download + if not spec.options.overwrite: + logger.info( + 'not overwriting local file: {} (remote: {}/{})'.format( + lpath, rfile.container, rfile.name)) + return DownloadAction.Skip + # check skip on options, MD5 match takes priority + if spec.skip_on.md5_match: + return DownloadAction.CheckMd5 + # if neither of the remaining skip on actions are activated, download + if not spec.skip_on.filesize_match and not spec.skip_on.lmt_ge: + return DownloadAction.Download + # check skip on file size match + dl_fs = None + if spec.skip_on.filesize_match: + lsize = lpath.stat().st_size + if rfile.mode == blobxfer.models.AzureStorageModes.Page: + lsize = blobxfer.util.page_align_content_length(lsize) + if rfile.size == lsize: + dl_fs = False + else: + dl_fs = True + # check skip on lmt ge + dl_lmt = None + if spec.skip_on.lmt_ge: + mtime = datetime.datetime.fromtimestamp( + lpath.stat().st_mtime, tz=dateutil.tz.tzlocal()) + if mtime >= rfile.lmt: + dl_lmt = False + else: + dl_lmt = True + # download if either skip on mismatch is True + if dl_fs or dl_lmt: + return DownloadAction.Download + else: + return DownloadAction.Skip + + +def pre_md5_skip_on_check(lpath, rfile): + # type: (pathlib.Path, blobxfer.models.AzureStorageEntity) -> None + """Perform pre MD5 skip on check + :param pathlib.Path lpath: local path + :param blobxfer.models.AzureStorageEntity rfile: remote file + """ + global _MD5_META_LOCK, _MD5_MAP + # if encryption metadata is present, check for pre-encryption + # md5 in blobxfer extensions + md5 = None + if rfile.encryption_metadata is not None: + md5 = rfile.encryption_metadata.blobxfer_extensions.\ + pre_encrypted_content_md5 + if md5 is None: + md5 = rfile.md5 + slpath = str(lpath) + with _MD5_META_LOCK: + _MD5_MAP[slpath] = rfile + print('pre', lpath, len(_MD5_MAP)) + blobxfer.md5.add_file_for_md5_check( + slpath, md5, rfile.mode) + + +def post_md5_skip_on_check(filename, md5_match): + # type: (str, bool) -> None + """Perform post MD5 skip on check + :param str filename: local filename + :param bool md5_match: if MD5 matches + """ + global _MD5_META_LOCK, _MD5_MAP + if not md5_match: + lpath = pathlib.Path(filename) + # TODO enqueue file for download + with _MD5_META_LOCK: + _MD5_MAP.pop(filename) + + +def 
check_md5_downloads_thread(): + def check_for_downloads_from_md5(): + # type: (None) -> str + """Check queue for a file to download + :rtype: str + :return: local file path + """ + global _MD5_META_LOCK, _MD5_MAP, _ALL_REMOTE_FILES_PROCESSED + cv = blobxfer.md5.get_done_cv() + while True: + with _MD5_META_LOCK: + if len(_MD5_MAP) == 0 and _ALL_REMOTE_FILES_PROCESSED: + break + cv.acquire() + while True: + result = blobxfer.md5.check_md5_file_for_download() + if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(1) + else: + break + cv.release() + if result is not None: + post_md5_skip_on_check(result[0], result[1]) + + thr = threading.Thread(target=check_for_downloads_from_md5) + thr.start() + return thr + + +def download(general_options, creds, spec): + # type: (blobxfer.models.GeneralOptions, + # blobxfer.models.AzureStorageCredentials, + # blobxfer.models.DownloadSpecification) -> None + """Download action + :param blobxfer.models.GeneralOptions general_options: general opts + :param blobxfer.models.AzureStorageCredentials creds: creds + :param blobxfer.models.DownloadSpecification spec: download spec + """ + # ensure destination path + blobxfer.operations.ensure_local_destination(creds, spec) + logger.info('downloading to local path: {}'.format(spec.destination.path)) + # initialize MD5 processes + blobxfer.md5.initialize_md5_processes() + md5_thread = check_md5_downloads_thread() + # iterate through source paths to download + for src in spec.sources: + for rfile in src.files(creds, spec.options, general_options): + # form local path for remote file + lpath = pathlib.Path(spec.destination.path, rfile.name) + # check on download conditions + action = _check_download_conditions(lpath, rfile, spec) + if action == DownloadAction.Skip: + continue + elif action == DownloadAction.CheckMd5: + pre_md5_skip_on_check(lpath, rfile) + elif action == DownloadAction.Download: + # add to download queue + ### TODO + pass + # cond checks? 
+ print(rfile.container, rfile.name, rfile.lmt, rfile.size, + rfile.md5, rfile.mode, rfile.encryption_metadata) + + global _MD5_META_LOCK, _ALL_REMOTE_FILES_PROCESSED + with _MD5_META_LOCK: + _ALL_REMOTE_FILES_PROCESSED = True + md5_thread.join() + blobxfer.md5.finalize_md5_processes() + + import time + time.sleep(5) + diff --git a/blobxfer/file/operations.py b/blobxfer/file/operations.py index 3501b65..eae7640 100644 --- a/blobxfer/file/operations.py +++ b/blobxfer/file/operations.py @@ -36,7 +36,7 @@ import pathlib # non-stdlib imports import azure.common -from azure.storage.file import FileService +import azure.storage.file # local imports # create logger @@ -51,12 +51,12 @@ def create_client(storage_account): :return: file service client """ if storage_account.is_sas: - client = FileService( + client = azure.storage.file.FileService( account_name=storage_account.name, sas_token=storage_account.key, endpoint_suffix=storage_account.endpoint) else: - client = FileService( + client = azure.storage.file.FileService( account_name=storage_account.name, account_key=storage_account.key, endpoint_suffix=storage_account.endpoint) @@ -64,37 +64,84 @@ def create_client(storage_account): def parse_file_path(filepath): - # type: (pathlib.Path) -> tuple + # type: (pathlib.Path) -> Tuple[str, str] + """Parse file path from file path + :param str filepath: file path + :rtype: tuple + :return: (dirname, rest of path) + """ if not isinstance(filepath, pathlib.Path): filepath = pathlib.Path(filepath) dirname = '/'.join(filepath.parts[:len(filepath.parts) - 1]) - return (dirname, filepath.parts[-1]) + if len(dirname) == 0: + dirname = None + if len(filepath.parts) > 0: + fname = filepath.parts[-1] + else: + fname = None + return (dirname, fname) -def check_if_single_file(client, fileshare, prefix): - # type: (azure.storage.file.FileService, str, str) -> bool +def check_if_single_file(client, fileshare, prefix, timeout=None): + # type: (azure.storage.file.FileService, str, str, int) -> + # Tuple[bool, azure.storage.file.models.File] """Check if prefix is a single file or multiple files :param FileService client: blob client :param str fileshare: file share name :param str prefix: path prefix - :rtype: bool - :return: if prefix in fileshare is a single file + :param int timeout: timeout + :rtype: tuple + :return: (if prefix in fileshare is a single file, file) """ dirname, fname = parse_file_path(prefix) + file = None try: - client.get_file_properties( - share_name=fileshare, directory_name=dirname, file_name=fname) + file = client.get_file_properties( + share_name=fileshare, + directory_name=dirname, + file_name=fname, + timeout=timeout, + ) except azure.common.AzureMissingResourceHttpError: - return False - return True + return (False, file) + return (True, file) -def list_blobs(client, container, prefix, mode): - # type: (azure.storage.blob.BaseBlobService, str, str, - # blobxfer.models.AzureStorageModes) -> list - """List blobs in path conforming to mode - :param azure.storage.blob.BaseBlobService client: blob client - :param str container: container +def list_files(client, fileshare, prefix, timeout=None): + # type: (azure.storage.file.FileService, str, str, int) -> + # azure.storage.file.models.File + """List files in path + :param azure.storage.file.FileService client: file client + :param str fileshare: file share :param str prefix: path prefix + :param int timeout: timeout + :rtype: azure.storage.file.models.File + :return: generator of files """ - pass + # if single file, then yield file and return 
+ _check = check_if_single_file(client, fileshare, prefix, timeout) + if _check[0]: + yield _check[1] + return + # else recursively list from prefix path + dirs = [prefix] + while len(dirs) > 0: + dir = dirs.pop() + files = client.list_directories_and_files( + share_name=fileshare, + directory_name=dir, + timeout=timeout, + ) + for file in files: + fspath = str(pathlib.Path( + dir if dir is not None else '' / file.name)) + if isinstance(file, azure.storage.file.File): + fsprop = client.get_file_properties( + share_name=fileshare, + directory_name=dir, + file_name=file.name, + timeout=timeout, + ) + yield fsprop + else: + dirs.append(fspath) diff --git a/blobxfer/md5.py b/blobxfer/md5.py new file mode 100644 index 0000000..fafd3f8 --- /dev/null +++ b/blobxfer/md5.py @@ -0,0 +1,116 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
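Note: the md5 module beginning here is the offload half of the skip-on-MD5
flow wired up in download.py above: candidate files go onto a multiprocessing
task queue, worker processes compute the local MD5 (page-aligned for page
blobs), and (filename, match) results come back on a done queue guarded by a
condition variable. A condensed sketch of that handshake, assembled from the
download.py and md5.py code in this patch; remote_md5 and the local path are
assumed placeholders, and the polling thread from download.py is omitted:

    import blobxfer.md5
    import blobxfer.models

    blobxfer.md5.initialize_md5_processes()
    blobxfer.md5.add_file_for_md5_check(
        '/tmp/file.bin', remote_md5, blobxfer.models.AzureStorageModes.Block)
    result = blobxfer.md5.check_md5_file_for_download()
    if result is not None:
        filename, md5_match = result  # True if local MD5 equals remote_md5
    blobxfer.md5.finalize_md5_processes()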
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +import multiprocessing +try: + import queue +except ImportError: + import Queue as queue +# non-stdlib imports +# local imports +import blobxfer.download +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + +# global defines +_TASK_QUEUE = multiprocessing.Queue() +_DONE_QUEUE = multiprocessing.Queue() +_DONE_CV = multiprocessing.Condition() +_MD5_PROCS = [] + + +def _worker_md5_file_process(): + global _TASK_QUEUE, _DONE_QUEUE + while True: + filename, remote_md5, pagealign = _TASK_QUEUE.get() + md5 = blobxfer.util.compute_md5_for_file_asbase64(filename, pagealign) + logger.debug('MD5: {} {} {}'.format(md5, remote_md5, filename)) + _DONE_CV.acquire() + _DONE_QUEUE.put((filename, md5 == remote_md5)) + _DONE_CV.notify() + _DONE_CV.release() + + +def get_done_cv(): + global _DONE_CV + return _DONE_CV + + +def check_md5_file_for_download(): + # type: (None) -> str + """Check queue for a file to download + :rtype: str + :return: local file path + """ + global _DONE_QUEUE + try: + return _DONE_QUEUE.get_nowait() + except queue.Empty: + return None + + +def add_file_for_md5_check(filename, remote_md5, mode): + # type: (str, str, blobxfer.models.AzureStorageModes) -> bool + """Check an MD5 for a file for download + :param str filename: file to compute MD5 for + :param str remote_md5: remote MD5 to compare against + :param blobxfer.models.AzureStorageModes mode: mode + :rtype: bool + :return: MD5 match comparison + """ + global _TASK_QUEUE + if mode == blobxfer.models.AzureStorageModes.Page: + pagealign = True + else: + pagealign = False + _TASK_QUEUE.put((filename, remote_md5, pagealign)) + + +def initialize_md5_processes(num_workers=None): + global _MD5_PROCS + if num_workers is None or num_workers < 1: + num_workers = multiprocessing.cpu_count() // 2 + if num_workers < 1: + num_workers = 1 + for _ in range(num_workers): + proc = multiprocessing.Process(target=_worker_md5_file_process) + proc.start() + _MD5_PROCS.append(proc) + + +def finalize_md5_processes(): + global _MD5_PROCS + for proc in _MD5_PROCS: + proc.terminate() + proc.join() diff --git a/blobxfer/models.py b/blobxfer/models.py index 921eef8..918eaaa 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -47,8 +47,10 @@ create_file_client, create_page_blob_client, ) +from azure.storage.blob.models import _BlobTypes as BlobTypes import blobxfer.blob.operations -import blobxfer.crypto +import blobxfer.file.operations +import blobxfer.crypto.models import blobxfer.util # create logger @@ -57,11 +59,11 @@ # enums class AzureStorageModes(enum.Enum): - Auto = 1 - Append = 2 - Block = 3 - File = 4 - Page = 5 + Auto = 10 + Append = 20 + Block = 30 + File = 40 + Page = 50 # named tuples @@ -512,49 +514,204 @@ def lookup_storage_account(self, remote_path): """ return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] - def files(self, creds, mode): - if mode == AzureStorageModes.Auto: - for blob in self._auto_blobs(creds): - yield blob - elif mode == AzureStorageModes.Append: - pass - elif mode == AzureStorageModes.Block: - pass - elif mode == AzureStorageModes.File: - pass - elif mode == AzureStorageModes.Page: - pass + def files(self, creds, options, general_options): + # type: (AzureSourcePath, AzureStorageCredentials, 
DownloadOptions, + # GeneralOptions) -> AzureStorageEntity + """Generator of Azure remote files or blobs + :param AzureSourcePath self: this + :param AzureStorageCredentials creds: storage creds + :param DownloadOptions options: download options + :param GeneralOptions general_options: general options + :rtype: AzureStorageEntity + :return: Azure storage entity object + """ + if options.mode == AzureStorageModes.File: + for file in self._populate_from_list_files( + creds, options, general_options): + yield file else: - raise RuntimeError('unknown Azure Storage Mode: {}'.format(mode)) + for blob in self._populate_from_list_blobs( + creds, options, general_options): + yield blob - def _append_blobs(self): + def _populate_from_list_files(self, creds, options, general_options): + # type: (AzureSourcePath, AzureStorageCredentials, DownloadOptions, + # GeneralOptions) -> AzureStorageEntity + """Internal generator for Azure remote files + :param AzureSourcePath self: this + :param AzureStorageCredentials creds: storage creds + :param DownloadOptions options: download options + :param GeneralOptions general_options: general options + :rtype: AzureStorageEntity + :return: Azure storage entity object + """ for _path in self._paths: - pass - - def _auto_blobs(self, creds): + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for file in blobxfer.file.operations.list_files( + sa.file_client, cont, dir, general_options.timeout_sec): + if blobxfer.crypto.models.EncryptionMetadata.\ + encryption_metadata_exists(file.metadata): + ed = blobxfer.crypto.models.EncryptionMetadata() + ed.convert_from_json( + file.metadata, file.name, options.rsa_private_key) + else: + ed = None + ase = AzureStorageEntity(cont, ed) + ase.populate_from_file(file) + yield ase + + def _populate_from_list_blobs(self, creds, options, general_options): + # type: (AzureSourcePath, AzureStorageCredentials, DownloadOptions, + # GeneralOptions) -> AzureStorageEntity + """Internal generator for Azure remote blobs + :param AzureSourcePath self: this + :param AzureStorageCredentials creds: storage creds + :param DownloadOptions options: download options + :param GeneralOptions general_options: general options + :rtype: AzureStorageEntity + :return: Azure storage entity object + """ for _path in self._paths: rpath = str(_path) cont, dir = blobxfer.util.explode_azure_path(rpath) sa = creds.get_storage_account(self.lookup_storage_account(rpath)) for blob in blobxfer.blob.operations.list_blobs( - sa.block_blob_client, cont, dir): + sa.block_blob_client, cont, dir, options.mode, + general_options.timeout_sec): if blobxfer.crypto.models.EncryptionMetadata.\ encryption_metadata_exists(blob.metadata): ed = blobxfer.crypto.models.EncryptionMetadata() - ed.convert_from_json(blob.metadata) + ed.convert_from_json( + blob.metadata, blob.name, options.rsa_private_key) else: ed = None - yield (_path, blob.name, ed) + ase = AzureStorageEntity(cont, ed) + ase.populate_from_blob(blob) + yield ase class AzureStorageEntity(object): - def __init__(self): + """Azure Storage Entity""" + def __init__(self, container, ed=None): + # type: (AzureStorageEntity, str + # blobxfer.crypto.models.EncryptionMetadata) -> None + """Ctor for AzureStorageEntity + :param AzureStorageEntity self: this + :param str container: container name + :param blobxfer.crypto.models.EncryptionMetadata ed: + encryption metadata + """ + self._container = container self._name = None + self._mode = None 
+ self._lmt = None self._size = None self._md5 = None - self._enc = None + self._encryption = ed self._vio = None + def populate_from_blob(self, blob): + # type: (AzureStorageEntity, azure.storage.blob.models.Blob) -> None + """Populate properties from Blob + :param AzureStorageEntity self: this + :param azure.storage.blob.models.Blob blob: blob to populate from + """ + self._name = blob.name + self._lmt = blob.properties.last_modified + self._size = blob.properties.content_length + self._md5 = blob.properties.content_settings.content_md5 + if blob.properties.blob_type == BlobTypes.AppendBlob: + self._mode = AzureStorageModes.Append + elif blob.properties.blob_type == BlobTypes.BlockBlob: + self._mode = AzureStorageModes.Block + elif blob.properties.blob_type == BlobTypes.PageBlob: + self._mode = AzureStorageModes.Page + + def populate_from_file(self, file): + # type: (AzureStorageEntity, azure.storage.file.models.File) -> None + """Populate properties from File + :param AzureStorageEntity self: this + :param azure.storage.file.models.File file: file to populate from + """ + self._name = file.name + self._lmt = file.properties.last_modified + self._size = file.properties.content_length + self._md5 = file.properties.content_settings.content_md5 + self._mode = AzureStorageModes.File + + @property + def container(self): + # type: (AzureStorageEntity) -> str + """Container name + :param AzureStorageEntity self: this + :rtype: str + :return: name of container or file share + """ + return self._container + + @property + def name(self): + # type: (AzureStorageEntity) -> str + """Entity name + :param AzureStorageEntity self: this + :rtype: str + :return: name of entity + """ + return self._name + + @property + def lmt(self): + # type: (AzureStorageEntity) -> datetime.datetime + """Entity last modified time + :param AzureStorageEntity self: this + :rtype: datetime.datetime + :return: LMT of entity + """ + return self._lmt + + @property + def size(self): + # type: (AzureStorageEntity) -> int + """Entity size + :param AzureStorageEntity self: this + :rtype: int + :return: size of entity + """ + return self._size + + @property + def md5(self): + # type: (AzureStorageEntity) -> str + """Base64-encoded MD5 + :param AzureStorageEntity self: this + :rtype: str + :return: md5 of entity + """ + return self._md5 + + @property + def mode(self): + # type: (AzureStorageEntity) -> AzureStorageModes + """Entity mode (type) + :param AzureStorageEntity self: this + :rtype: AzureStorageModes + :return: type of entity + """ + return self._mode + + @property + def encryption_metadata(self): + # type: (AzureStorageEntity) -> + # blobxfer.crypto.models.EncryptionMetadata + """Entity mode (type) + :param AzureStorageEntity self: this + :rtype: blobxfer.crypto.models.EncryptionMetadata + :return: encryption metadata of entity + """ + return self._encryption + class AzureDestinationPaths(object): def __init__(self): diff --git a/blobxfer/operations.py b/blobxfer/operations.py index df1f46f..82e4024 100644 --- a/blobxfer/operations.py +++ b/blobxfer/operations.py @@ -32,21 +32,21 @@ import logging # non-stdlib imports # local imports -from .models import ( # noqa - AzureStorageCredentials, - AzureStorageModes, - DownloadSpecification, - FileDescriptor, -) +import blobxfer.models import blobxfer.blob.operations import blobxfer.file.operations import blobxfer.util +# create logger +logger = logging.getLogger(__name__) + def ensure_local_destination(creds, spec): + # type: (blobxfer.models.AzureStorageCredentials, + # 
blobxfer.models.DownloadSpecification) -> None """Ensure a local destination path given a download spec - :param AzureStorageCredentials creds: creds - :param DownloadSpecification spec: download spec + :param blobxfer.models.AzureStorageCredentials creds: creds + :param blobxfer.models.DownloadSpecification spec: download spec """ # ensure destination path is writable given the source if len(spec.sources) < 1: @@ -60,15 +60,15 @@ def ensure_local_destination(creds, spec): if not blobxfer.util.is_none_or_empty(dir): sa = creds.get_storage_account( spec.sources[0].lookup_storage_account(rpath)) - if spec.options.mode == AzureStorageModes.File: + if spec.options.mode == blobxfer.models.AzureStorageModes.File: if blobxfer.file.operations.check_if_single_file( - sa.file_client, cont, dir): + sa.file_client, cont, dir)[0]: spec.destination.is_dir = False else: if blobxfer.blob.operations.check_if_single_blob( sa.block_blob_client, cont, dir): spec.destination.is_dir = False - logging.debug('dest is_dir={} for {} specs'.format( + logger.debug('dest is_dir={} for {} specs'.format( spec.destination.is_dir, len(spec.sources))) # ensure destination path spec.destination.ensure_path_exists() diff --git a/blobxfer/util.py b/blobxfer/util.py index 9b778bd..dd116bd 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -235,7 +235,7 @@ def explode_azure_path(path): :return: container, vpath """ rpath = normalize_azure_path(path).split('/') - container = rpath[0] + container = str(rpath[0]) if len(rpath) > 1: rpath = '/'.join(rpath[1:]) else: diff --git a/cli/cli.py b/cli/cli.py index 1ec99d4..508013a 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -41,7 +41,6 @@ import blobxfer.api import blobxfer.util # local imports -import download as dl import settings # create logger @@ -592,7 +591,7 @@ def download(ctx, local_resource, storage_account, remote_path): ctx.initialize() specs = settings.create_download_specifications(ctx.config) for spec in specs: - dl.download(ctx.general_options, ctx.credentials, spec) + blobxfer.api.download(ctx.general_options, ctx.credentials, spec) @cli.command('synccopy') diff --git a/setup.py b/setup.py index 889f709..475cb90 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ 'click==6.6', 'cryptography>=1.7.1', 'future==0.16.0', + 'python-dateutil==2.6.0', 'ruamel.yaml==0.13.11', ] diff --git a/tests/test_blobxfer_blob_operations.py b/tests/test_blobxfer_blob_operations.py new file mode 100644 index 0000000..1b7d300 --- /dev/null +++ b/tests/test_blobxfer_blob_operations.py @@ -0,0 +1,75 @@ +# coding=utf-8 +"""Tests for general blob operations""" + +# stdlib imports +import mock +# non-stdlib imports +import azure.common +import azure.storage.blob +import pytest +# local imports +import blobxfer.models as models +# module under test +import blobxfer.blob.operations as ops + + +def test_check_if_single_blob(): + client = mock.MagicMock() + client.get_blob_properties = mock.MagicMock() + client.get_blob_properties.return_value = mock.MagicMock() + + result = ops.check_if_single_blob(client, 'a', 'b/c') + assert result + + client = mock.MagicMock() + client.get_blob_properties = mock.MagicMock() + client.get_blob_properties.side_effect = \ + azure.common.AzureMissingResourceHttpError('msg', 404) + + result = ops.check_if_single_blob(client, 'a', 'b/c') + assert not result + + +def test_list_blobs(): + with pytest.raises(RuntimeError): + for blob in ops.list_blobs( + None, 'cont', 'prefix', models.AzureStorageModes.File): + pass + + client = mock.MagicMock() + client.list_blobs 
= mock.MagicMock() + _blob = azure.storage.blob.models.Blob(name='name') + _blob.properties = azure.storage.blob.models.BlobProperties() + client.list_blobs.return_value = [_blob] + + i = 0 + for blob in ops.list_blobs( + client, 'cont', 'prefix', models.AzureStorageModes.Auto): + i += 1 + assert blob.name == 'name' + assert i == 1 + + _blob.properties.blob_type = \ + azure.storage.blob.models._BlobTypes.AppendBlob + i = 0 + for blob in ops.list_blobs( + client, 'dir', 'prefix', models.AzureStorageModes.Block): + i += 1 + assert blob.name == 'name' + assert i == 0 + + i = 0 + for blob in ops.list_blobs( + client, 'dir', 'prefix', models.AzureStorageModes.Page): + i += 1 + assert blob.name == 'name' + assert i == 0 + + _blob.properties.blob_type = \ + azure.storage.blob.models._BlobTypes.BlockBlob + i = 0 + for blob in ops.list_blobs( + client, 'dir', 'prefix', models.AzureStorageModes.Append): + i += 1 + assert blob.name == 'name' + assert i == 0 diff --git a/tests/test_blobxfer_file_operations.py b/tests/test_blobxfer_file_operations.py index 56c4b95..fd39912 100644 --- a/tests/test_blobxfer_file_operations.py +++ b/tests/test_blobxfer_file_operations.py @@ -1,12 +1,14 @@ # coding=utf-8 -"""Tests for models""" +"""Tests for file operations""" # stdlib imports +import mock # non-stdlib imports +import azure.common import azure.storage -import pytest # local imports import blobxfer.models as models +import blobxfer.util as util # module under test import blobxfer.file.operations as ops @@ -27,3 +29,94 @@ def test_create_client(): assert isinstance( client.authentication, azure.storage._auth._StorageSASAuthentication) + + +def test_parse_file_path(): + rpath = '/a/b/c' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir == 'b' + assert fname == 'c' + + rpath = 'a/b/c/d' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir == 'b/c' + assert fname == 'd' + + rpath = 'a/b' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir is None + assert fname == 'b' + + rpath = 'a' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir is None + assert fname is None + + +def test_check_if_single_file(): + client = mock.MagicMock() + client.get_file_properties = mock.MagicMock() + client.get_file_properties.return_value = mock.MagicMock() + + result = ops.check_if_single_file(client, 'a', 'b/c') + assert result[0] + + client = mock.MagicMock() + client.get_file_properties = mock.MagicMock() + client.get_file_properties.side_effect = \ + azure.common.AzureMissingResourceHttpError('msg', 404) + + result = ops.check_if_single_file(client, 'a', 'b/c') + assert not result[0] + + +def test_list_files_single_file(): + client = mock.MagicMock() + client.get_file_properties = mock.MagicMock() + client.get_file_properties.return_value = 'fp' + + i = 0 + for file in ops.list_files(client, 'a', 'b/c'): + i += 1 + assert file == 'fp' + assert i == 1 + + +@mock.patch( + 'blobxfer.file.operations.check_if_single_file', + return_value=(False, None) +) +def test_list_files_directory(patched_cisf): + client = mock.MagicMock() + client.list_directories_and_files = mock.MagicMock() + _file = azure.storage.file.models.File(name='name') + 
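# Illustrative sketch of how the path helpers exercised by test_parse_file_path
# above compose: blobxfer.util.explode_azure_path splits off the share/container
# and blobxfer.file.operations.parse_file_path splits the remainder into
# (directory, filename); the values below mirror the test's assertions.
import blobxfer.file.operations as file_ops
import blobxfer.util as util

share, rest = util.explode_azure_path(util.normalize_azure_path('a/b/c/d'))
directory, filename = file_ops.parse_file_path(rest)
assert (share, directory, filename) == ('a', 'b/c', 'd')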
client.list_directories_and_files.return_value = [_file] + client.get_file_properties = mock.MagicMock() + client.get_file_properties.return_value = _file + + i = 0 + for file in ops.list_files(client, 'dir', ''): + i += 1 + assert file.name == 'name' + assert i == 1 + + client = mock.MagicMock() + client.list_directories_and_files = mock.MagicMock() + _file = azure.storage.file.models.File(name='name') + client.list_directories_and_files.side_effect = [['dir'], [file]] + client.get_file_properties = mock.MagicMock() + client.get_file_properties.return_value = _file + + i = 0 + for file in ops.list_files(client, 'dir', ''): + i += 1 + assert file.name == 'name' + assert i == 1 diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 2042a15..c9a8d81 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -2,6 +2,7 @@ """Tests for models""" # stdlib imports +import mock import os try: import pathlib2 as pathlib @@ -9,6 +10,7 @@ import pathlib # non-stdlib imports import azure.storage +import azure.storage.blob import pytest # module under test import blobxfer.models @@ -188,3 +190,36 @@ def test_downloadspecification(): assert len(ds.sources) == 1 assert p in ds.sources[0]._path_map assert ds.sources[0]._path_map[p] == 'sa' + + +def test_azurestorageentity(): + ase = blobxfer.models.AzureStorageEntity('cont') + assert ase.container == 'cont' + assert ase.encryption_metadata is None + + blob = mock.MagicMock() + blob.name = 'name' + blob.properties = mock.MagicMock() + blob.properties.last_modified = 'lmt' + blob.properties.content_length = 123 + blob.properties.content_settings = mock.MagicMock() + blob.properties.content_settings.content_md5 = 'abc' + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.BlockBlob + ase.populate_from_blob(blob) + + assert ase.name == 'name' + assert ase.lmt == 'lmt' + assert ase.size == 123 + assert ase.md5 == 'abc' + assert ase.mode == blobxfer.models.AzureStorageModes.Block + + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.AppendBlob + ase.populate_from_blob(blob) + assert ase.mode == blobxfer.models.AzureStorageModes.Append + + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.PageBlob + ase.populate_from_blob(blob) + assert ase.mode == blobxfer.models.AzureStorageModes.Page + + ase.populate_from_file(blob) + assert ase.mode == blobxfer.models.AzureStorageModes.File diff --git a/tests/test_blobxfer_operations.py b/tests/test_blobxfer_operations.py index f24703c..78aef22 100644 --- a/tests/test_blobxfer_operations.py +++ b/tests/test_blobxfer_operations.py @@ -72,10 +72,10 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): ds.add_azure_source_path(asp) - patched_file.return_value = False + patched_file.return_value = (False, None) ops.ensure_local_destination(MagicMock(), ds) assert ds.destination.is_dir - patched_file.return_value = True + patched_file.return_value = (True, MagicMock()) with pytest.raises(RuntimeError): ops.ensure_local_destination(MagicMock(), ds) From 290a1ebdf903806f187dc212663f8fc1f7bd8ca8 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 21 Feb 2017 21:20:16 -0800 Subject: [PATCH 06/47] Refactor MD5/Downloader to classes - Eliminate global state --- blobxfer/api.py | 2 +- blobxfer/download.py | 331 ++++++++++++++++++++++--------------------- blobxfer/md5.py | 155 +++++++++++--------- cli/cli.py | 4 +- 4 files changed, 265 insertions(+), 227 deletions(-) diff --git a/blobxfer/api.py b/blobxfer/api.py index 
550f265..69444ae 100644 --- a/blobxfer/api.py +++ b/blobxfer/api.py @@ -46,5 +46,5 @@ ) from .download import ( # noqa - download + Downloader ) diff --git a/blobxfer/download.py b/blobxfer/download.py index 11264c9..b5a87d4 100644 --- a/blobxfer/download.py +++ b/blobxfer/download.py @@ -49,11 +49,6 @@ # create logger logger = logging.getLogger(__name__) -# global defines -_MD5_MAP = {} -_MD5_META_LOCK = threading.Lock() -_ALL_REMOTE_FILES_PROCESSED = False - class DownloadAction(enum.Enum): Skip = 1 @@ -61,162 +56,176 @@ class DownloadAction(enum.Enum): Download = 3 -def _check_download_conditions(lpath, rfile, spec): - # type: (pathlib.Path, blobxfer.models.AzureStorageEntity, - # blobxfer.models.DownloadSpecification) -> DownloadAction - """Check for download conditions - :param pathlib.Path lpath: local path - :param blobxfer.models.AzureStorageEntity rfile: remote file - :param blobxfer.models.DownloadSpecification spec: download spec - :rtype: DownloadAction - :return: download action - """ - if not lpath.exists(): - return DownloadAction.Download - if not spec.options.overwrite: - logger.info( - 'not overwriting local file: {} (remote: {}/{})'.format( - lpath, rfile.container, rfile.name)) - return DownloadAction.Skip - # check skip on options, MD5 match takes priority - if spec.skip_on.md5_match: - return DownloadAction.CheckMd5 - # if neither of the remaining skip on actions are activated, download - if not spec.skip_on.filesize_match and not spec.skip_on.lmt_ge: - return DownloadAction.Download - # check skip on file size match - dl_fs = None - if spec.skip_on.filesize_match: - lsize = lpath.stat().st_size - if rfile.mode == blobxfer.models.AzureStorageModes.Page: - lsize = blobxfer.util.page_align_content_length(lsize) - if rfile.size == lsize: - dl_fs = False - else: - dl_fs = True - # check skip on lmt ge - dl_lmt = None - if spec.skip_on.lmt_ge: - mtime = datetime.datetime.fromtimestamp( - lpath.stat().st_mtime, tz=dateutil.tz.tzlocal()) - if mtime >= rfile.lmt: - dl_lmt = False +class Downloader(object): + """Downloader""" + def __init__(self, general_options, creds, spec): + # type: (Downloader, blobxfer.models.GeneralOptions, + # blobxfer.models.AzureStorageCredentials, + # blobxfer.models.DownloadSpecification) -> None + """Ctor for Downloader + :param Downloader self: this + :param blobxfer.models.GeneralOptions general_options: general opts + :param blobxfer.models.AzureStorageCredentials creds: creds + :param blobxfer.models.DownloadSpecification spec: download spec + """ + self._md5_meta_lock = threading.Lock() + self._all_remote_files_processed = False + self._md5_map = {} + self._md5_offload = None + self._md5_check_thread = None + self._general_options = general_options + self._creds = creds + self._spec = spec + + def _check_download_conditions(self, lpath, rfile, spec): + # type: (Downloader, pathlib.Path, blobxfer.models.AzureStorageEntity, + # blobxfer.models.DownloadSpecification) -> DownloadAction + """Check for download conditions + :param Downloader self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.AzureStorageEntity rfile: remote file + :param blobxfer.models.DownloadSpecification spec: download spec + :rtype: DownloadAction + :return: download action + """ + if not lpath.exists(): + return DownloadAction.Download + if not spec.options.overwrite: + logger.info( + 'not overwriting local file: {} (remote: {}/{})'.format( + lpath, rfile.container, rfile.name)) + return DownloadAction.Skip + # check skip on options, MD5 match takes 
priority + if spec.skip_on.md5_match: + return DownloadAction.CheckMd5 + # if neither of the remaining skip on actions are activated, download + if not spec.skip_on.filesize_match and not spec.skip_on.lmt_ge: + return DownloadAction.Download + # check skip on file size match + dl_fs = None + if spec.skip_on.filesize_match: + lsize = lpath.stat().st_size + if rfile.mode == blobxfer.models.AzureStorageModes.Page: + lsize = blobxfer.util.page_align_content_length(lsize) + if rfile.size == lsize: + dl_fs = False + else: + dl_fs = True + # check skip on lmt ge + dl_lmt = None + if spec.skip_on.lmt_ge: + mtime = datetime.datetime.fromtimestamp( + lpath.stat().st_mtime, tz=dateutil.tz.tzlocal()) + if mtime >= rfile.lmt: + dl_lmt = False + else: + dl_lmt = True + # download if either skip on mismatch is True + if dl_fs or dl_lmt: + return DownloadAction.Download else: - dl_lmt = True - # download if either skip on mismatch is True - if dl_fs or dl_lmt: - return DownloadAction.Download - else: - return DownloadAction.Skip - - -def pre_md5_skip_on_check(lpath, rfile): - # type: (pathlib.Path, blobxfer.models.AzureStorageEntity) -> None - """Perform pre MD5 skip on check - :param pathlib.Path lpath: local path - :param blobxfer.models.AzureStorageEntity rfile: remote file - """ - global _MD5_META_LOCK, _MD5_MAP - # if encryption metadata is present, check for pre-encryption - # md5 in blobxfer extensions - md5 = None - if rfile.encryption_metadata is not None: - md5 = rfile.encryption_metadata.blobxfer_extensions.\ - pre_encrypted_content_md5 - if md5 is None: - md5 = rfile.md5 - slpath = str(lpath) - with _MD5_META_LOCK: - _MD5_MAP[slpath] = rfile - print('pre', lpath, len(_MD5_MAP)) - blobxfer.md5.add_file_for_md5_check( - slpath, md5, rfile.mode) - - -def post_md5_skip_on_check(filename, md5_match): - # type: (str, bool) -> None - """Perform post MD5 skip on check - :param str filename: local filename - :param bool md5_match: if MD5 matches - """ - global _MD5_META_LOCK, _MD5_MAP - if not md5_match: - lpath = pathlib.Path(filename) - # TODO enqueue file for download - with _MD5_META_LOCK: - _MD5_MAP.pop(filename) - - -def check_md5_downloads_thread(): - def check_for_downloads_from_md5(): - # type: (None) -> str - """Check queue for a file to download - :rtype: str - :return: local file path + return DownloadAction.Skip + + def _pre_md5_skip_on_check(self, lpath, rfile): + # type: (Downloader, pathlib.Path, + # blobxfer.models.AzureStorageEntity) -> None + """Perform pre MD5 skip on check + :param Downloader self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.AzureStorageEntity rfile: remote file + """ + # if encryption metadata is present, check for pre-encryption + # md5 in blobxfer extensions + md5 = None + if rfile.encryption_metadata is not None: + md5 = rfile.encryption_metadata.blobxfer_extensions.\ + pre_encrypted_content_md5 + if md5 is None: + md5 = rfile.md5 + slpath = str(lpath) + with self._md5_meta_lock: + self._md5_map[slpath] = rfile + self._md5_offload.add_localfile_for_md5_check(slpath, md5, rfile.mode) + + def _post_md5_skip_on_check(self, filename, md5_match): + # type: (Downloader, str, bool) -> None + """Perform post MD5 skip on check + :param Downloader self: this + :param str filename: local filename + :param bool md5_match: if MD5 matches + """ + if not md5_match: + lpath = pathlib.Path(filename) + # TODO enqueue file for download + with self._md5_meta_lock: + self._md5_map.pop(filename) + + def _initialize_check_md5_downloads_thread(self): + # 
type: (Downloader) -> None + """Initialize the md5 done queue check thread + :param Downloader self: this """ - global _MD5_META_LOCK, _MD5_MAP, _ALL_REMOTE_FILES_PROCESSED - cv = blobxfer.md5.get_done_cv() - while True: - with _MD5_META_LOCK: - if len(_MD5_MAP) == 0 and _ALL_REMOTE_FILES_PROCESSED: - break - cv.acquire() + def _check_for_downloads_from_md5(self): + # type: (Downloader) -> None + """Check queue for a file to download + :param Downloader self: this + """ + cv = self._md5_offload.done_cv while True: - result = blobxfer.md5.check_md5_file_for_download() - if result is None: - # use cv timeout due to possible non-wake while running - cv.wait(1) - else: - break - cv.release() - if result is not None: - post_md5_skip_on_check(result[0], result[1]) - - thr = threading.Thread(target=check_for_downloads_from_md5) - thr.start() - return thr - - -def download(general_options, creds, spec): - # type: (blobxfer.models.GeneralOptions, - # blobxfer.models.AzureStorageCredentials, - # blobxfer.models.DownloadSpecification) -> None - """Download action - :param blobxfer.models.GeneralOptions general_options: general opts - :param blobxfer.models.AzureStorageCredentials creds: creds - :param blobxfer.models.DownloadSpecification spec: download spec - """ - # ensure destination path - blobxfer.operations.ensure_local_destination(creds, spec) - logger.info('downloading to local path: {}'.format(spec.destination.path)) - # initialize MD5 processes - blobxfer.md5.initialize_md5_processes() - md5_thread = check_md5_downloads_thread() - # iterate through source paths to download - for src in spec.sources: - for rfile in src.files(creds, spec.options, general_options): - # form local path for remote file - lpath = pathlib.Path(spec.destination.path, rfile.name) - # check on download conditions - action = _check_download_conditions(lpath, rfile, spec) - if action == DownloadAction.Skip: - continue - elif action == DownloadAction.CheckMd5: - pre_md5_skip_on_check(lpath, rfile) - elif action == DownloadAction.Download: - # add to download queue - ### TODO - pass - # cond checks? 
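# Hypothetical standalone restatement of the precedence encoded by
# _check_download_conditions above (names are illustrative, not from this patch):
# a missing local file always downloads, overwrite=False always skips, an enabled
# MD5 skip-on defers the decision to the offload workers, and otherwise any
# enabled size/mtime check that finds a mismatch forces a download.
def decide(local_exists, overwrite, skip_md5, size_ok, mtime_ok):
    # size_ok/mtime_ok are True/False when the corresponding skip-on check is
    # enabled, or None when that check is disabled
    if not local_exists:
        return 'Download'
    if not overwrite:
        return 'Skip'
    if skip_md5:
        return 'CheckMd5'
    if size_ok is None and mtime_ok is None:
        return 'Download'
    return 'Download' if (size_ok is False or mtime_ok is False) else 'Skip'

assert decide(True, True, False, size_ok=True, mtime_ok=None) == 'Skip'
assert decide(True, True, False, size_ok=False, mtime_ok=True) == 'Download'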
- print(rfile.container, rfile.name, rfile.lmt, rfile.size, - rfile.md5, rfile.mode, rfile.encryption_metadata) - - global _MD5_META_LOCK, _ALL_REMOTE_FILES_PROCESSED - with _MD5_META_LOCK: - _ALL_REMOTE_FILES_PROCESSED = True - md5_thread.join() - blobxfer.md5.finalize_md5_processes() - - import time - time.sleep(5) - + with self._md5_meta_lock: + if (len(self._md5_map) == 0 and + self._all_remote_files_processed): + break + cv.acquire() + while True: + result = self._md5_offload.get_localfile_md5_done() + if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(1) + else: + break + cv.release() + if result is not None: + self._post_md5_skip_on_check(result[0], result[1]) + + self._md5_check_thread = threading.Thread( + target=_check_for_downloads_from_md5, + args=(self,) + ) + self._md5_check_thread.start() + + def start(self): + # type: (None) -> None + """Start Downloader""" + # ensure destination path + blobxfer.operations.ensure_local_destination(self._creds, self._spec) + logger.info('downloading blobs/files to local path: {}'.format( + self._spec.destination.path)) + # initialize MD5 processes + self._md5_offload = blobxfer.md5.LocalFileMd5Offload() + self._initialize_check_md5_downloads_thread() + # iterate through source paths to download + for src in self._spec.sources: + for rfile in src.files( + self._creds, self._spec.options, self._general_options): + # form local path for remote file + lpath = pathlib.Path(self._spec.destination.path, rfile.name) + # check on download conditions + action = self._check_download_conditions( + lpath, rfile, self._spec) + if action == DownloadAction.Skip: + continue + elif action == DownloadAction.CheckMd5: + self._pre_md5_skip_on_check(lpath, rfile) + elif action == DownloadAction.Download: + # TODO add to download queue + pass + # cond checks? 
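# Minimal standalone sketch of the notify/timed-wait pattern used by the MD5
# checker thread above (illustrative names): the producer queues a result and
# notifies the condition variable; the consumer polls with a one second wait so
# a notify that fires between its get() and its wait() cannot leave it asleep
# indefinitely.
import multiprocessing
try:
    import queue
except ImportError:  # noqa
    import Queue as queue

done_queue = multiprocessing.Queue()
done_cv = multiprocessing.Condition()

def produce(item):
    done_cv.acquire()
    done_queue.put(item)
    done_cv.notify()
    done_cv.release()

def consume_one():
    done_cv.acquire()
    while True:
        try:
            result = done_queue.get_nowait()
            break
        except queue.Empty:
            done_cv.wait(1)
    done_cv.release()
    return result

produce(('/tmp/somefile', True))
print(consume_one())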
+ print(rfile.container, rfile.name, rfile.lmt, rfile.size, + rfile.md5, rfile.mode, rfile.encryption_metadata) + + # clean up processes and threads + with self._md5_meta_lock: + self._all_remote_files_processed = True + self._md5_check_thread.join() + self._md5_offload.finalize_md5_processes() diff --git a/blobxfer/md5.py b/blobxfer/md5.py index fafd3f8..7d84439 100644 --- a/blobxfer/md5.py +++ b/blobxfer/md5.py @@ -43,74 +43,101 @@ # create logger logger = logging.getLogger(__name__) -# global defines -_TASK_QUEUE = multiprocessing.Queue() -_DONE_QUEUE = multiprocessing.Queue() -_DONE_CV = multiprocessing.Condition() -_MD5_PROCS = [] +class LocalFileMd5Offload(object): + """LocalFileMd5Offload""" + def __init__(self, num_workers=None): + # type: (LocalFileMd5Offload, int) -> None + """Ctor for Local File Md5 Offload + :param LocalFileMd5Offload self: this + :param int num_workers: number of worker processes + """ + self._task_queue = multiprocessing.Queue() + self._done_queue = multiprocessing.Queue() + self._done_cv = multiprocessing.Condition() + self._term_signal = multiprocessing.Value('i', 0) + self._md5_procs = [] + self._initialize_md5_processes(num_workers) -def _worker_md5_file_process(): - global _TASK_QUEUE, _DONE_QUEUE - while True: - filename, remote_md5, pagealign = _TASK_QUEUE.get() - md5 = blobxfer.util.compute_md5_for_file_asbase64(filename, pagealign) - logger.debug('MD5: {} {} {}'.format(md5, remote_md5, filename)) - _DONE_CV.acquire() - _DONE_QUEUE.put((filename, md5 == remote_md5)) - _DONE_CV.notify() - _DONE_CV.release() + @property + def done_cv(self): + # type: (LocalFileMd5Offload) -> multiprocessing.Condition + """Get Download Done condition variable + :param LocalFileMd5Offload self: this + :rtype: multiprocessing.Condition + :return: cv for download done + """ + return self._done_cv + def _initialize_md5_processes(self, num_workers=None): + # type: (LocalFileMd5Offload, int) -> None + """Initialize MD5 checking processes for files for download + :param LocalFileMd5Offload self: this + :param int num_workers: number of worker processes + """ + if num_workers is None or num_workers < 1: + num_workers = multiprocessing.cpu_count() // 2 + if num_workers < 1: + num_workers = 1 + for _ in range(num_workers): + proc = multiprocessing.Process( + target=self._worker_compute_md5_localfile_process) + proc.start() + self._md5_procs.append(proc) -def get_done_cv(): - global _DONE_CV - return _DONE_CV + def finalize_md5_processes(self): + # type: (LocalFileMd5Offload) -> None + """Finalize MD5 checking processes for files for download + :param LocalFileMd5Offload self: this + """ + self._term_signal.value = 1 + for proc in self._md5_procs: + proc.join() + def _worker_compute_md5_localfile_process(self): + # type: (LocalFileMd5Offload) -> None + """Compute MD5 for local file + :param LocalFileMd5Offload self: this + """ + while self._term_signal.value == 0: + try: + filename, remote_md5, pagealign = self._task_queue.get(True, 1) + except queue.Empty: + continue + md5 = blobxfer.util.compute_md5_for_file_asbase64( + filename, pagealign) + logger.debug('MD5: {} {} {}'.format( + md5, remote_md5, filename)) + self._done_cv.acquire() + self._done_queue.put((filename, md5 == remote_md5)) + self.done_cv.notify() + self.done_cv.release() -def check_md5_file_for_download(): - # type: (None) -> str - """Check queue for a file to download - :rtype: str - :return: local file path - """ - global _DONE_QUEUE - try: - return _DONE_QUEUE.get_nowait() - except queue.Empty: - return None + 
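# Usage sketch for LocalFileMd5Offload exercising the methods the Downloader
# calls above ('somefile.bin' is an illustrative path): enqueue a local file with
# the remote MD5 to compare against, poll the done queue for the
# (filename, match) result, then shut the worker processes down. Note that
# compute_md5_for_file_asbase64 still lives in blobxfer.util at this point in
# the series.
import time

import blobxfer.md5 as md5
import blobxfer.models as models
import blobxfer.util as util

offload = md5.LocalFileMd5Offload(num_workers=1)
try:
    remote_md5 = util.compute_md5_for_file_asbase64('somefile.bin')
    offload.add_localfile_for_md5_check(
        'somefile.bin', remote_md5, models.AzureStorageModes.Block)
    result = None
    for _ in range(30):
        result = offload.get_localfile_md5_done()
        if result is not None:
            break
        time.sleep(0.3)
    if result is not None:
        filename, matched = result  # matched is True when the local MD5 agrees
finally:
    offload.finalize_md5_processes()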
def get_localfile_md5_done(self): + # type: (LocalFileMd5Offload) -> Tuple[str, bool] + """Get from done queue of local files with MD5 completed + :param LocalFileMd5Offload self: this + :rtype: tuple or None + :return: (local file path, md5 match) + """ + try: + return self._done_queue.get_nowait() + except queue.Empty: + return None - -def add_file_for_md5_check(filename, remote_md5, mode): - # type: (str, str, blobxfer.models.AzureStorageModes) -> bool - """Check an MD5 for a file for download - :param str filename: file to compute MD5 for - :param str remote_md5: remote MD5 to compare against - :param blobxfer.models.AzureStorageModes mode: mode - :rtype: bool - :return: MD5 match comparison - """ - global _TASK_QUEUE - if mode == blobxfer.models.AzureStorageModes.Page: - pagealign = True - else: - pagealign = False - _TASK_QUEUE.put((filename, remote_md5, pagealign)) - - -def initialize_md5_processes(num_workers=None): - global _MD5_PROCS - if num_workers is None or num_workers < 1: - num_workers = multiprocessing.cpu_count() // 2 - if num_workers < 1: - num_workers = 1 - for _ in range(num_workers): - proc = multiprocessing.Process(target=_worker_md5_file_process) - proc.start() - _MD5_PROCS.append(proc) - - -def finalize_md5_processes(): - global _MD5_PROCS - for proc in _MD5_PROCS: - proc.terminate() - proc.join() + def add_localfile_for_md5_check(self, filename, remote_md5, mode): + # type: (LocalFileMd5Offload, str, str, + # blobxfer.models.AzureStorageModes) -> bool + """Check an MD5 for a file for download + :param LocalFileMd5Offload self: this + :param str filename: file to compute MD5 for + :param str remote_md5: remote MD5 to compare against + :param blobxfer.models.AzureStorageModes mode: mode + :rtype: bool + :return: MD5 match comparison + """ + if mode == blobxfer.models.AzureStorageModes.Page: + pagealign = True + else: + pagealign = False + self._task_queue.put((filename, remote_md5, pagealign)) diff --git a/cli/cli.py b/cli/cli.py index 508013a..2a6d8d9 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -591,7 +591,9 @@ def download(ctx, local_resource, storage_account, remote_path): ctx.initialize() specs = settings.create_download_specifications(ctx.config) for spec in specs: - blobxfer.api.download(ctx.general_options, ctx.credentials, spec) + blobxfer.api.Downloader( + ctx.general_options, ctx.credentials, spec + ).start() @cli.command('synccopy') From e1d5f9f8025ce192590cf5b4faee45ecb9cc5da8 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Wed, 22 Feb 2017 10:08:54 -0800 Subject: [PATCH 07/47] Add tests for MD5/download - Add more model tests - Add noqa tags for ImportError except blocks --- blobxfer/download.py | 23 ++- blobxfer/file/operations.py | 2 +- blobxfer/md5.py | 4 +- blobxfer/models.py | 2 +- cli/cli.py | 2 +- setup.py | 2 +- tests/test_blobxfer_download.py | 253 ++++++++++++++++++++++++++++++ tests/test_blobxfer_md5.py | 99 ++++++++++++ tests/test_blobxfer_models.py | 133 +++++++++++++--- tests/test_blobxfer_operations.py | 25 ++- 10 files changed, 487 insertions(+), 58 deletions(-) create mode 100644 tests/test_blobxfer_download.py create mode 100644 tests/test_blobxfer_md5.py diff --git a/blobxfer/download.py b/blobxfer/download.py index b5a87d4..0b7d837 100644 --- a/blobxfer/download.py +++ b/blobxfer/download.py @@ -36,7 +36,7 @@ import logging try: import pathlib2 as pathlib -except ImportError: +except ImportError: # noqa import pathlib import threading # non-stdlib imports @@ -77,33 +77,33 @@ def __init__(self, general_options, creds, spec): 
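# Sketch of driving the refactored Downloader through blobxfer.api, as cli.py
# now does above; general_options, credentials and the download specification
# are assumed to have been built elsewhere (e.g. by cli/settings.py).
import blobxfer.api

def run_download(general_options, credentials, spec):
    # one Downloader per download specification; start() returns once the MD5
    # checker thread and offload processes have been cleaned up
    blobxfer.api.Downloader(general_options, credentials, spec).start()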
self._creds = creds self._spec = spec - def _check_download_conditions(self, lpath, rfile, spec): - # type: (Downloader, pathlib.Path, blobxfer.models.AzureStorageEntity, - # blobxfer.models.DownloadSpecification) -> DownloadAction + def _check_download_conditions(self, lpath, rfile): + # type: (Downloader, pathlib.Path, + # blobxfer.models.AzureStorageEntity) -> DownloadAction """Check for download conditions :param Downloader self: this :param pathlib.Path lpath: local path :param blobxfer.models.AzureStorageEntity rfile: remote file - :param blobxfer.models.DownloadSpecification spec: download spec :rtype: DownloadAction :return: download action """ if not lpath.exists(): return DownloadAction.Download - if not spec.options.overwrite: + if not self._spec.options.overwrite: logger.info( 'not overwriting local file: {} (remote: {}/{})'.format( lpath, rfile.container, rfile.name)) return DownloadAction.Skip # check skip on options, MD5 match takes priority - if spec.skip_on.md5_match: + if self._spec.skip_on.md5_match: return DownloadAction.CheckMd5 # if neither of the remaining skip on actions are activated, download - if not spec.skip_on.filesize_match and not spec.skip_on.lmt_ge: + if (not self._spec.skip_on.filesize_match and + not self._spec.skip_on.lmt_ge): return DownloadAction.Download # check skip on file size match dl_fs = None - if spec.skip_on.filesize_match: + if self._spec.skip_on.filesize_match: lsize = lpath.stat().st_size if rfile.mode == blobxfer.models.AzureStorageModes.Page: lsize = blobxfer.util.page_align_content_length(lsize) @@ -113,7 +113,7 @@ def _check_download_conditions(self, lpath, rfile, spec): dl_fs = True # check skip on lmt ge dl_lmt = None - if spec.skip_on.lmt_ge: + if self._spec.skip_on.lmt_ge: mtime = datetime.datetime.fromtimestamp( lpath.stat().st_mtime, tz=dateutil.tz.tzlocal()) if mtime >= rfile.lmt: @@ -211,8 +211,7 @@ def start(self): # form local path for remote file lpath = pathlib.Path(self._spec.destination.path, rfile.name) # check on download conditions - action = self._check_download_conditions( - lpath, rfile, self._spec) + action = self._check_download_conditions(lpath, rfile) if action == DownloadAction.Skip: continue elif action == DownloadAction.CheckMd5: diff --git a/blobxfer/file/operations.py b/blobxfer/file/operations.py index eae7640..221f412 100644 --- a/blobxfer/file/operations.py +++ b/blobxfer/file/operations.py @@ -32,7 +32,7 @@ import logging try: import pathlib2 as pathlib -except ImportError: +except ImportError: # noqa import pathlib # non-stdlib imports import azure.common diff --git a/blobxfer/md5.py b/blobxfer/md5.py index 7d84439..dbbe6fd 100644 --- a/blobxfer/md5.py +++ b/blobxfer/md5.py @@ -33,7 +33,7 @@ import multiprocessing try: import queue -except ImportError: +except ImportError: # noqa import Queue as queue # non-stdlib imports # local imports @@ -75,7 +75,7 @@ def _initialize_md5_processes(self, num_workers=None): :param LocalFileMd5Offload self: this :param int num_workers: number of worker processes """ - if num_workers is None or num_workers < 1: + if num_workers is None: num_workers = multiprocessing.cpu_count() // 2 if num_workers < 1: num_workers = 1 diff --git a/blobxfer/models.py b/blobxfer/models.py index 918eaaa..04e4c12 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -37,7 +37,7 @@ import os try: import pathlib2 as pathlib -except ImportError: +except ImportError: # noqa import pathlib # non-stdlib imports # local imports diff --git a/cli/cli.py b/cli/cli.py index 2a6d8d9..91600b5 
100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -32,7 +32,7 @@ import logging try: import pathlib2 as pathlib -except ImportError: +except ImportError: # noqa import pathlib # non-stdlib imports import click diff --git a/setup.py b/setup.py index 475cb90..11ba002 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import re try: from setuptools import setup -except ImportError: +except ImportError: # noqa from distutils.core import setup import sys diff --git a/tests/test_blobxfer_download.py b/tests/test_blobxfer_download.py new file mode 100644 index 0000000..99c9711 --- /dev/null +++ b/tests/test_blobxfer_download.py @@ -0,0 +1,253 @@ +# coding=utf-8 +"""Tests for download""" + +# stdlib imports +import datetime +import dateutil.tz +import mock +import multiprocessing +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +import azure.storage.blob +import pytest +# local imports +import blobxfer.models as models +import blobxfer.util as util +# module under test +import blobxfer.download as dl + + +def test_check_download_conditions(tmpdir): + ap = tmpdir.join('a') + ap.write('abc') + ep = pathlib.Path(str(ap)) + nep = pathlib.Path(str(tmpdir.join('nep'))) + + ds = models.DownloadSpecification( + download_options=models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=models.AzureStorageModes.Auto, + overwrite=False, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=models.SkipOnOptions( + filesize_match=True, + lmt_ge=True, + md5_match=True, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + result = d._check_download_conditions(nep, mock.MagicMock()) + assert result == dl.DownloadAction.Download + result = d._check_download_conditions(ep, mock.MagicMock()) + assert result == dl.DownloadAction.Skip + + ds = models.DownloadSpecification( + download_options=models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=models.AzureStorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=models.SkipOnOptions( + filesize_match=True, + lmt_ge=True, + md5_match=True, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + result = d._check_download_conditions(ep, mock.MagicMock()) + assert result == dl.DownloadAction.CheckMd5 + + ds = models.DownloadSpecification( + download_options=models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=models.AzureStorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=models.SkipOnOptions( + filesize_match=False, + lmt_ge=False, + md5_match=False, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + result = d._check_download_conditions(ep, mock.MagicMock()) + assert result == dl.DownloadAction.Download + + ds = models.DownloadSpecification( + download_options=models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=models.AzureStorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=models.SkipOnOptions( + filesize_match=True, + lmt_ge=False, + md5_match=False, 
+ ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + rfile = models.AzureStorageEntity('cont') + rfile._size = util.page_align_content_length(ep.stat().st_size) + rfile._mode = models.AzureStorageModes.Page + result = d._check_download_conditions(ep, rfile) + assert result == dl.DownloadAction.Skip + + rfile._size = ep.stat().st_size + rfile._mode = models.AzureStorageModes.Page + result = d._check_download_conditions(ep, rfile) + assert result == dl.DownloadAction.Download + + ds = models.DownloadSpecification( + download_options=models.DownloadOptions( + check_file_md5=True, + delete_extraneous_destination=False, + mode=models.AzureStorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=models.SkipOnOptions( + filesize_match=False, + lmt_ge=True, + md5_match=False, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + rfile = models.AzureStorageEntity('cont') + rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) + \ + datetime.timedelta(days=1) + result = d._check_download_conditions(ep, rfile) + assert result == dl.DownloadAction.Download + + rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) - \ + datetime.timedelta(days=1) + result = d._check_download_conditions(ep, rfile) + assert result == dl.DownloadAction.Skip + + +def test_pre_md5_skip_on_check(): + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_offload = mock.MagicMock() + + rfile = models.AzureStorageEntity('cont') + rfile._encryption = mock.MagicMock() + rfile._encryption.blobxfer_extensions = mock.MagicMock() + rfile._encryption.blobxfer_extensions.pre_encrypted_content_md5 = \ + 'abc' + + lpath = 'lpath' + d._pre_md5_skip_on_check(lpath, rfile) + assert lpath in d._md5_map + + lpath = 'lpath2' + rfile._encryption = None + rfile._md5 = 'abc' + d._pre_md5_skip_on_check(lpath, rfile) + assert lpath in d._md5_map + + +def test_post_md5_skip_on_check(): + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_offload = mock.MagicMock() + + lpath = 'lpath' + rfile = models.AzureStorageEntity('cont') + rfile._md5 = 'abc' + d._pre_md5_skip_on_check(lpath, rfile) + assert lpath in d._md5_map + + d._post_md5_skip_on_check(lpath, True) + assert lpath not in d._md5_map + + # TODO test mismatch + + +def test_initialize_check_md5_downloads_thread(): + lpath = 'lpath' + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_map[lpath] = mock.MagicMock() + d._md5_offload = mock.MagicMock() + d._md5_offload.done_cv = multiprocessing.Condition() + d._md5_offload.get_localfile_md5_done = mock.MagicMock() + d._md5_offload.get_localfile_md5_done.side_effect = [None, (lpath, True)] + d._post_md5_skip_on_check = mock.MagicMock() + + d._initialize_check_md5_downloads_thread() + d._all_remote_files_processed = True + d._md5_map.clear() + d._md5_offload.done_cv.acquire() + d._md5_offload.done_cv.notify() + d._md5_offload.done_cv.release() + d._md5_check_thread.join() + + assert d._post_md5_skip_on_check.call_count == 1 + + +@mock.patch('blobxfer.md5.LocalFileMd5Offload') +@mock.patch('blobxfer.blob.operations.list_blobs') +@mock.patch('blobxfer.operations.ensure_local_destination', return_value=True) +def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), 
mock.MagicMock()) + d._initialize_check_md5_downloads_thread = mock.MagicMock() + d._md5_check_thread = mock.MagicMock() + d._spec.sources = [] + d._spec.options = mock.MagicMock() + d._spec.options.mode = models.AzureStorageModes.Auto + d._spec.options.overwrite = True + d._spec.skip_on = mock.MagicMock() + d._spec.skip_on.md5_match = False + d._spec.skip_on.lmt_ge = False + d._spec.skip_on.filesize_match = False + d._spec.destination = mock.MagicMock() + d._spec.destination.path = pathlib.Path(str(tmpdir)) + + p = '/cont/remote/path' + asp = models.AzureSourcePath() + asp.add_path_with_storage_account(p, 'sa') + d._spec.sources.append(asp) + + b = azure.storage.blob.models.Blob(name='name') + patched_lb.side_effect = [[b]] + + d._check_download_conditions = mock.MagicMock() + d._check_download_conditions.return_value = dl.DownloadAction.Skip + d.start() + # TODO assert + + patched_lb.side_effect = [[b]] + d._all_remote_files_processed = False + d._check_download_conditions.return_value = dl.DownloadAction.CheckMd5 + d._pre_md5_skip_on_check = mock.MagicMock() + d.start() + assert d._pre_md5_skip_on_check.call_count == 1 + + patched_lb.side_effect = [[b]] + d._all_remote_files_processed = False + d._check_download_conditions.return_value = dl.DownloadAction.Download + d.start() + # TODO assert diff --git a/tests/test_blobxfer_md5.py b/tests/test_blobxfer_md5.py new file mode 100644 index 0000000..7faa1ce --- /dev/null +++ b/tests/test_blobxfer_md5.py @@ -0,0 +1,99 @@ +# coding=utf-8 +"""Tests for md5""" + +# stdlib imports +import time +# non-stdlib imports +# local imports +import blobxfer.models as models +import blobxfer.util as util +# module under test +import blobxfer.md5 as md5 + + +def test_done_cv(): + a = None + try: + a = md5.LocalFileMd5Offload() + assert a.done_cv == a._done_cv + finally: + if a: + a.finalize_md5_processes() + + +def test_finalize_md5_processes(): + a = None + try: + a = md5.LocalFileMd5Offload(num_workers=0) + finally: + if a: + a.finalize_md5_processes() + + for proc in a._md5_procs: + assert not proc.is_alive() + + +def test_from_add_to_done_non_pagealigned(tmpdir): + file = tmpdir.join('a') + file.write('abc') + + remote_md5 = util.compute_md5_for_file_asbase64(str(file)) + + a = None + try: + a = md5.LocalFileMd5Offload(num_workers=1) + result = a.get_localfile_md5_done() + assert result is None + + a.add_localfile_for_md5_check( + str(file), remote_md5, models.AzureStorageModes.Block) + i = 33 + checked = False + while i > 0: + result = a.get_localfile_md5_done() + if result is None: + time.sleep(0.3) + i -= 1 + continue + assert len(result) == 2 + assert result[0] == str(file) + assert result[1] + checked = True + break + assert checked + finally: + if a: + a.finalize_md5_processes() + + +def test_from_add_to_done_pagealigned(tmpdir): + file = tmpdir.join('a') + file.write('abc') + + remote_md5 = util.compute_md5_for_file_asbase64(str(file), True) + + a = None + try: + a = md5.LocalFileMd5Offload(num_workers=1) + result = a.get_localfile_md5_done() + assert result is None + + a.add_localfile_for_md5_check( + str(file), remote_md5, models.AzureStorageModes.Page) + i = 33 + checked = False + while i > 0: + result = a.get_localfile_md5_done() + if result is None: + time.sleep(0.3) + i -= 1 + continue + assert len(result) == 2 + assert result[0] == str(file) + assert result[1] + checked = True + break + assert checked + finally: + if a: + a.finalize_md5_processes() diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 
c9a8d81..0612a9e 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -6,18 +6,19 @@ import os try: import pathlib2 as pathlib -except ImportError: +except ImportError: # noqa import pathlib # non-stdlib imports import azure.storage import azure.storage.blob +import azure.storage.file import pytest # module under test -import blobxfer.models +import blobxfer.models as models def test_storage_credentials(): - creds = blobxfer.models.AzureStorageCredentials() + creds = models.AzureStorageCredentials() creds.add_storage_account('sa1', 'somekey1', 'endpoint') a = creds.get_storage_account('sa1') @@ -51,24 +52,24 @@ def test_storage_credentials(): def test_key_is_sas(): - a = blobxfer.models.AzureStorageAccount('name', 'abcdef', 'endpoint') + a = models.AzureStorageAccount('name', 'abcdef', 'endpoint') assert not a.is_sas - a = blobxfer.models.AzureStorageAccount('name', 'abcdef&blah', 'endpoint') + a = models.AzureStorageAccount('name', 'abcdef&blah', 'endpoint') assert not a.is_sas - a = blobxfer.models.AzureStorageAccount('name', '?abcdef', 'endpoint') + a = models.AzureStorageAccount('name', '?abcdef', 'endpoint') assert a.is_sas - a = blobxfer.models.AzureStorageAccount( + a = models.AzureStorageAccount( 'name', '?sv=0&sr=1&sig=2', 'endpoint') assert a.is_sas - a = blobxfer.models.AzureStorageAccount( + a = models.AzureStorageAccount( 'name', 'sv=0&sr=1&sig=2', 'endpoint') assert a.is_sas - a = blobxfer.models.AzureStorageAccount( + a = models.AzureStorageAccount( 'name', 'sig=0&sv=0&sr=1&se=2', 'endpoint') assert a.is_sas @@ -86,7 +87,7 @@ def test_localsourcepaths_files(tmpdir): defpath.join('world.txt').write('world') defpath.join('moo.cow').write('y') - a = blobxfer.models.LocalSourcePaths() + a = models.LocalSourcePaths() a.add_include('*.txt') a.add_includes(['moo.cow', '*blah*']) with pytest.raises(ValueError): @@ -106,7 +107,7 @@ def test_localsourcepaths_files(tmpdir): assert str(defpath.join('world.txt')) in a_set assert str(defpath.join('moo.cow')) not in a_set - b = blobxfer.models.LocalSourcePaths() + b = models.LocalSourcePaths() b.add_includes(['moo.cow', '*blah*']) b.add_include('*.txt') b.add_excludes(['world.txt']) @@ -121,7 +122,7 @@ def test_localdestinationpath(tmpdir): tmpdir.mkdir('1') path = tmpdir.join('1') - a = blobxfer.models.LocalDestinationPath(str(path)) + a = models.LocalDestinationPath(str(path)) a.is_dir = True assert str(a.path) == str(path) assert a.is_dir @@ -129,7 +130,7 @@ def test_localdestinationpath(tmpdir): a.ensure_path_exists() assert os.path.exists(str(a.path)) - b = blobxfer.models.LocalDestinationPath() + b = models.LocalDestinationPath() b.is_dir = False b.path = str(path) with pytest.raises(RuntimeError): @@ -138,7 +139,7 @@ def test_localdestinationpath(tmpdir): path2 = tmpdir.join('2') path3 = path2.join('3') - c = blobxfer.models.LocalDestinationPath(str(path3)) + c = models.LocalDestinationPath(str(path3)) with pytest.raises(RuntimeError): c.ensure_path_exists() c.is_dir = False @@ -150,7 +151,7 @@ def test_localdestinationpath(tmpdir): def test_azuresourcepath(): p = '/cont/remote/path' - asp = blobxfer.models.AzureSourcePath() + asp = models.AzureSourcePath() asp.add_path_with_storage_account(p, 'sa') with pytest.raises(RuntimeError): @@ -159,26 +160,106 @@ def test_azuresourcepath(): assert 'sa' == asp.lookup_storage_account(p) +@mock.patch('blobxfer.crypto.models.EncryptionMetadata') +@mock.patch('blobxfer.file.operations.list_files') +def test_azuresourcepath_files(patched_lf, patched_em): + p = 
'/cont/remote/path' + asp = models.AzureSourcePath() + asp.add_path_with_storage_account(p, 'sa') + + options = mock.MagicMock() + options.mode = models.AzureStorageModes.File + creds = mock.MagicMock() + creds.get_storage_account = mock.MagicMock() + sa = mock.MagicMock() + sa.file_client = mock.MagicMock() + creds.get_storage_account.return_value = sa + f = azure.storage.file.models.File(name='name') + patched_lf.side_effect = [[f]] + patched_em.encryption_metadata_exists = mock.MagicMock() + patched_em.encryption_metadata_exists.return_value = False + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is None + assert i == 1 + + fe = azure.storage.file.models.File(name='name') + fe.metadata = {'encryptiondata': {'a': 'b'}} + patched_lf.side_effect = [[fe]] + patched_em.encryption_metadata_exists.return_value = True + patched_em.convert_from_json = mock.MagicMock() + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is not None + assert i == 1 + + +@mock.patch('blobxfer.crypto.models.EncryptionMetadata') +@mock.patch('blobxfer.blob.operations.list_blobs') +def test_azuresourcepath_blobs(patched_lb, patched_em): + p = '/cont/remote/path' + asp = models.AzureSourcePath() + asp.add_path_with_storage_account(p, 'sa') + + options = mock.MagicMock() + options.mode = models.AzureStorageModes.Auto + creds = mock.MagicMock() + creds.get_storage_account = mock.MagicMock() + sa = mock.MagicMock() + sa.block_blob_client = mock.MagicMock() + creds.get_storage_account.return_value = sa + b = azure.storage.blob.models.Blob(name='name') + patched_lb.side_effect = [[b]] + patched_em.encryption_metadata_exists = mock.MagicMock() + patched_em.encryption_metadata_exists.return_value = False + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is None + assert i == 1 + + be = azure.storage.blob.models.Blob(name='name') + be.metadata = {'encryptiondata': {'a': 'b'}} + patched_lb.side_effect = [[be]] + patched_em.encryption_metadata_exists.return_value = True + patched_em.convert_from_json = mock.MagicMock() + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is not None + assert i == 1 + + def test_downloadspecification(): - ds = blobxfer.models.DownloadSpecification( - download_options=blobxfer.models.DownloadOptions( + ds = models.DownloadSpecification( + download_options=models.DownloadOptions( check_file_md5=True, delete_extraneous_destination=False, - mode=blobxfer.models.AzureStorageModes.Auto, + mode=models.AzureStorageModes.Auto, overwrite=True, recursive=True, restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=blobxfer.models.SkipOnOptions( + skip_on_options=models.SkipOnOptions( filesize_match=True, lmt_ge=False, md5_match=True, ), - local_destination_path=blobxfer.models.LocalDestinationPath('dest'), + local_destination_path=models.LocalDestinationPath('dest'), ) - asp = blobxfer.models.AzureSourcePath() + asp = models.AzureSourcePath() p = 'some/remote/path' asp.add_path_with_storage_account(p, 'sa') @@ -193,7 +274,7 @@ def test_downloadspecification(): def test_azurestorageentity(): - ase = blobxfer.models.AzureStorageEntity('cont') + ase = models.AzureStorageEntity('cont') assert ase.container == 'cont' assert 
ase.encryption_metadata is None @@ -211,15 +292,15 @@ def test_azurestorageentity(): assert ase.lmt == 'lmt' assert ase.size == 123 assert ase.md5 == 'abc' - assert ase.mode == blobxfer.models.AzureStorageModes.Block + assert ase.mode == models.AzureStorageModes.Block blob.properties.blob_type = azure.storage.blob.models._BlobTypes.AppendBlob ase.populate_from_blob(blob) - assert ase.mode == blobxfer.models.AzureStorageModes.Append + assert ase.mode == models.AzureStorageModes.Append blob.properties.blob_type = azure.storage.blob.models._BlobTypes.PageBlob ase.populate_from_blob(blob) - assert ase.mode == blobxfer.models.AzureStorageModes.Page + assert ase.mode == models.AzureStorageModes.Page ase.populate_from_file(blob) - assert ase.mode == blobxfer.models.AzureStorageModes.File + assert ase.mode == models.AzureStorageModes.File diff --git a/tests/test_blobxfer_operations.py b/tests/test_blobxfer_operations.py index 78aef22..9926bab 100644 --- a/tests/test_blobxfer_operations.py +++ b/tests/test_blobxfer_operations.py @@ -2,10 +2,7 @@ """Tests for operations""" # stdlib imports -from mock import ( - MagicMock, - patch, -) +import mock # non-stdlib imports import pytest # local imports @@ -14,8 +11,8 @@ import blobxfer.operations as ops -@patch('blobxfer.file.operations.check_if_single_file') -@patch('blobxfer.blob.operations.check_if_single_blob') +@mock.patch('blobxfer.file.operations.check_if_single_file') +@mock.patch('blobxfer.blob.operations.check_if_single_blob') def test_ensure_local_destination(patched_blob, patched_file, tmpdir): downdir = tmpdir.join('down') @@ -30,14 +27,14 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=MagicMock(), + skip_on_options=mock.MagicMock(), local_destination_path=blobxfer.models.LocalDestinationPath( str(downdir) ), ) with pytest.raises(RuntimeError): - ops.ensure_local_destination(MagicMock(), ds) + ops.ensure_local_destination(mock.MagicMock(), ds) asp = blobxfer.models.AzureSourcePath() p = 'cont/remote/path' @@ -46,12 +43,12 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): ds.add_azure_source_path(asp) patched_blob.return_value = False - ops.ensure_local_destination(MagicMock(), ds) + ops.ensure_local_destination(mock.MagicMock(), ds) assert ds.destination.is_dir patched_blob.return_value = True with pytest.raises(RuntimeError): - ops.ensure_local_destination(MagicMock(), ds) + ops.ensure_local_destination(mock.MagicMock(), ds) # file tests ds = blobxfer.models.DownloadSpecification( @@ -64,7 +61,7 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=MagicMock(), + skip_on_options=mock.MagicMock(), local_destination_path=blobxfer.models.LocalDestinationPath( str(downdir) ), @@ -73,9 +70,9 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): ds.add_azure_source_path(asp) patched_file.return_value = (False, None) - ops.ensure_local_destination(MagicMock(), ds) + ops.ensure_local_destination(mock.MagicMock(), ds) assert ds.destination.is_dir - patched_file.return_value = (True, MagicMock()) + patched_file.return_value = (True, mock.MagicMock()) with pytest.raises(RuntimeError): - ops.ensure_local_destination(MagicMock(), ds) + ops.ensure_local_destination(mock.MagicMock(), ds) From 146fc24d35ca680a9b1c8010279204bbec777ef0 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 23 Feb 2017 14:57:01 -0800 Subject: 
[PATCH 08/47] Pre-download logic - Add concurrency options - Centralize MD5 logic --- blobxfer/blob/operations.py | 7 + blobxfer/crypto/models.py | 33 +++++ blobxfer/download.py | 103 +++++++++++--- blobxfer/md5.py | 51 ++++++- blobxfer/models.py | 178 ++++++++++++++++++++----- blobxfer/util.py | 54 +++----- cli/cli.py | 45 +++++++ cli/settings.py | 8 ++ tests/test_blobxfer_blob_operations.py | 7 +- tests/test_blobxfer_crypto_models.py | 6 + tests/test_blobxfer_download.py | 40 +++++- tests/test_blobxfer_md5.py | 24 +++- tests/test_blobxfer_models.py | 79 +++++++++++ tests/test_blobxfer_util.py | 33 ++--- 14 files changed, 548 insertions(+), 120 deletions(-) diff --git a/blobxfer/blob/operations.py b/blobxfer/blob/operations.py index ef2c976..411ad52 100644 --- a/blobxfer/blob/operations.py +++ b/blobxfer/blob/operations.py @@ -35,6 +35,7 @@ import azure.storage.blob.models # local imports import blobxfer.models +import blobxfer.util # create logger logger = logging.getLogger(__name__) @@ -50,6 +51,8 @@ def check_if_single_blob(client, container, prefix, timeout=None): :rtype: bool :return: if prefix in container is a single blob """ + if blobxfer.util.blob_is_snapshot(prefix): + return True try: client.get_blob_properties( container_name=container, blob_name=prefix, timeout=timeout) @@ -94,3 +97,7 @@ def list_blobs(client, container, prefix, mode, timeout=None): continue # auto or match, yield the blob yield blob + + +def get_blob_range(client, container, blob_name, snapshot): + pass diff --git a/blobxfer/crypto/models.py b/blobxfer/crypto/models.py index 4300b58..c4bb5b4 100644 --- a/blobxfer/crypto/models.py +++ b/blobxfer/crypto/models.py @@ -133,6 +133,26 @@ def __init__(self): self._symkey = None self._signkey = None + @property + def symmetric_key(self): + # type: (EncryptionMetadata) -> bytes + """Get symmetric key + :param EncryptionMetadata self: this + :rtype: bytes + :return: symmetric key + """ + return self._symkey + + @property + def signing_key(self): + # type: (EncryptionMetadata) -> bytes + """Get singing key + :param EncryptionMetadata self: this + :rtype: bytes + :return: signing key + """ + return self._signkey + @staticmethod def encryption_metadata_exists(md): # type: (dict) -> bool @@ -283,4 +303,17 @@ def convert_from_json(self, md, blobname, rsaprivatekey): blobname)) def convert_to_json_with_mac(self): + # TODO pass + + def initialize_hmac(self): + # type: (EncryptionMetadata) -> hmac.HMAC + """Initialize an hmac from a signing key if it exists + :param EncryptionMetadata self: this + :rtype: hmac.HMAC or None + :return: hmac + """ + if self._signkey is not None: + return hmac.new(self._signkey, digestmod=hashlib.sha256) + else: + return None diff --git a/blobxfer/download.py b/blobxfer/download.py index 0b7d837..dbbecac 100644 --- a/blobxfer/download.py +++ b/blobxfer/download.py @@ -38,6 +38,10 @@ import pathlib2 as pathlib except ImportError: # noqa import pathlib +try: + import queue +except ImportError: # noqa + import Queue as queue import threading # non-stdlib imports import dateutil @@ -73,6 +77,9 @@ def __init__(self, general_options, creds, spec): self._md5_map = {} self._md5_offload = None self._md5_check_thread = None + self._download_queue = queue.Queue() + self._download_threads = [] + self._download_terminate = False self._general_options = general_options self._creds = creds self._spec = spec @@ -154,11 +161,11 @@ def _post_md5_skip_on_check(self, filename, md5_match): :param str filename: local filename :param bool md5_match: if MD5 matches """ 
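# Sketch of how the new EncryptionMetadata.initialize_hmac helper could be used
# to verify downloaded content; encryption_metadata is assumed to be a
# blobxfer.crypto.models.EncryptionMetadata whose signing key has already been
# unwrapped, and chunks/expected_digest are illustrative inputs.
import hmac

def verify_signature(encryption_metadata, chunks, expected_digest):
    hmac_obj = encryption_metadata.initialize_hmac()
    if hmac_obj is None:
        return True  # no signing key present, nothing to verify
    for chunk in chunks:
        hmac_obj.update(chunk)
    return hmac.compare_digest(hmac_obj.digest(), expected_digest)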
+ with self._md5_meta_lock: + rfile = self._md5_map.pop(filename) if not md5_match: lpath = pathlib.Path(filename) - # TODO enqueue file for download - with self._md5_meta_lock: - self._md5_map.pop(filename) + self._add_to_download_queue(lpath, rfile) def _initialize_check_md5_downloads_thread(self): # type: (Downloader) -> None @@ -173,11 +180,12 @@ def _check_for_downloads_from_md5(self): cv = self._md5_offload.done_cv while True: with self._md5_meta_lock: - if (len(self._md5_map) == 0 and - self._all_remote_files_processed): + if (self._download_terminate or + (len(self._md5_map) == 0 and + self._all_remote_files_processed)): break cv.acquire() - while True: + while not self._download_terminate: result = self._md5_offload.get_localfile_md5_done() if result is None: # use cv timeout due to possible non-wake while running @@ -194,16 +202,69 @@ def _check_for_downloads_from_md5(self): ) self._md5_check_thread.start() - def start(self): - # type: (None) -> None - """Start Downloader""" + def _add_to_download_queue(self, lpath, rfile): + # type: (Downloader, pathlib.Path, + # blobxfer.models.AzureStorageEntity) -> None + """Add remote file to download queue + :param Downloader self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.AzureStorageEntity rfile: remote file + """ + # prepare remote file for download + rfile.prepare_for_download(lpath, self._spec.options) + # add remote file to queue + self._download_queue.put(rfile) + + def _initialize_download_threads(self): + # type: (Downloader) -> None + """Initialize download threads + :param Downloader self: this + """ + for _ in range(self._general_options.concurrency.transfer_threads): + thr = threading.Thread(target=self._worker_thread_download) + self._download_threads.append(thr) + thr.start() + + def _terminate_download_threads(self): + # type: (Downloader) -> None + """Terminate download threads + :param Downloader self: this + """ + self._download_terminate = True + for thr in self._download_threads: + thr.join() + + def _worker_thread_download(self): + # type: (Downloader) -> None + """Worker thread download + :param Downloader self: this + """ + while True: + if self._download_terminate: + break + try: + rfile = self._download_queue.get(False, 1) + except queue.Empty: + continue + # TODO + # get next offset with respect to chunk size + + print('<<', rfile.container, rfile.name, rfile.lmt, rfile.size, + rfile.md5, rfile.mode, rfile.encryption_metadata) + + def _run(self): + # type: (Downloader) -> None + """Execute Downloader""" # ensure destination path blobxfer.operations.ensure_local_destination(self._creds, self._spec) logger.info('downloading blobs/files to local path: {}'.format( self._spec.destination.path)) # initialize MD5 processes - self._md5_offload = blobxfer.md5.LocalFileMd5Offload() + self._md5_offload = blobxfer.md5.LocalFileMd5Offload( + num_workers=self._general_options.concurrency.md5_processes) self._initialize_check_md5_downloads_thread() + # initialize download threads + self._initialize_download_threads() # iterate through source paths to download for src in self._spec.sources: for rfile in src.files( @@ -217,14 +278,24 @@ def start(self): elif action == DownloadAction.CheckMd5: self._pre_md5_skip_on_check(lpath, rfile) elif action == DownloadAction.Download: - # TODO add to download queue - pass - # cond checks? 
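# Standalone sketch of the queue-fed worker-thread pattern that
# _worker_thread_download above follows (illustrative names; the real workers
# consume AzureStorageEntity objects): a shared queue feeds the threads, and a
# terminate flag plus a short get() timeout lets them exit promptly on shutdown.
import threading
try:
    import queue
except ImportError:  # noqa
    import Queue as queue

work_queue = queue.Queue()
terminate = False

def worker():
    while not terminate:
        try:
            item = work_queue.get(True, 1)
        except queue.Empty:
            continue
        print('downloading {}'.format(item))
        work_queue.task_done()

threads = []
for _ in range(4):
    thr = threading.Thread(target=worker)
    thr.start()
    threads.append(thr)

for name in ('blob-a', 'blob-b'):
    work_queue.put(name)

work_queue.join()  # wait for queued items to be processed
terminate = True   # then let workers exit on their next loop iteration
for thr in threads:
    thr.join()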
- print(rfile.container, rfile.name, rfile.lmt, rfile.size, - rfile.md5, rfile.mode, rfile.encryption_metadata) - + self._add_to_download_queue(lpath, rfile) # clean up processes and threads with self._md5_meta_lock: self._all_remote_files_processed = True self._md5_check_thread.join() + # TODO wait for download threads + self._md5_offload.finalize_md5_processes() + + def start(self): + # type: (Downloader) -> None + """Start the Downloader""" + try: + self._run() + except KeyboardInterrupt: + logger.error( + 'KeyboardInterrupt detected, force terminating ' + 'processes and threads (this may take a while)...') + self._terminate_download_threads() + self._md5_offload.finalize_md5_processes() + raise diff --git a/blobxfer/md5.py b/blobxfer/md5.py index dbbe6fd..86dbd30 100644 --- a/blobxfer/md5.py +++ b/blobxfer/md5.py @@ -30,6 +30,7 @@ ) # stdlib imports import logging +import hashlib import multiprocessing try: import queue @@ -44,6 +45,51 @@ logger = logging.getLogger(__name__) +def new_md5_hasher(): + # type: (None) -> md5.MD5 + """Create a new MD5 hasher + :rtype: md5.MD5 + :return: new MD5 hasher + """ + return hashlib.md5() + + +def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): + # type: (str, bool, int) -> str + """Compute MD5 hash for file and encode as Base64 + :param str filename: file to compute MD5 for + :param bool pagealign: page align data + :param int blocksize: block size + :rtype: str + :return: MD5 for file encoded as Base64 + """ + hasher = new_md5_hasher() + with open(filename, 'rb') as filedesc: + while True: + buf = filedesc.read(blocksize) + if not buf: + break + buflen = len(buf) + if pagealign and buflen < blocksize: + aligned = blobxfer.util.page_align_content_length(buflen) + if aligned != buflen: + buf = buf.ljust(aligned, b'\0') + hasher.update(buf) + return blobxfer.util.base64_encode_as_string(hasher.digest()) + + +def compute_md5_for_data_asbase64(data): + # type: (obj) -> str + """Compute MD5 hash for bits and encode as Base64 + :param any data: data to compute MD5 for + :rtype: str + :return: MD5 for data + """ + hasher = new_md5_hasher() + hasher.update(data) + return blobxfer.util.base64_encode_as_string(hasher.digest()) + + class LocalFileMd5Offload(object): """LocalFileMd5Offload""" def __init__(self, num_workers=None): @@ -76,7 +122,7 @@ def _initialize_md5_processes(self, num_workers=None): :param int num_workers: number of worker processes """ if num_workers is None: - num_workers = multiprocessing.cpu_count() // 2 + num_workers = multiprocessing.cpu_count() // 2 - 1 if num_workers < 1: num_workers = 1 for _ in range(num_workers): @@ -104,8 +150,7 @@ def _worker_compute_md5_localfile_process(self): filename, remote_md5, pagealign = self._task_queue.get(True, 1) except queue.Empty: continue - md5 = blobxfer.util.compute_md5_for_file_asbase64( - filename, pagealign) + md5 = compute_md5_for_file_asbase64(filename, pagealign) logger.debug('MD5: {} {} {}'.format( md5, remote_md5, filename)) self._done_cv.acquire() diff --git a/blobxfer/models.py b/blobxfer/models.py index 04e4c12..34d05ce 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -51,6 +51,7 @@ import blobxfer.blob.operations import blobxfer.file.operations import blobxfer.crypto.models +import blobxfer.md5 import blobxfer.util # create logger @@ -67,13 +68,6 @@ class AzureStorageModes(enum.Enum): # named tuples -GeneralOptions = collections.namedtuple( - 'GeneralOptions', [ - 'progress_bar', - 'timeout_sec', - 'verbose', - ] -) VectoredIoOptions = 
collections.namedtuple( 'VectoredIoOptions', [ 'stripe_chunk_size_bytes', @@ -130,6 +124,46 @@ class AzureStorageModes(enum.Enum): ) +class ConcurrencyOptions(object): + """Concurrency Options""" + def __init__(self, crypto_processes, md5_processes, transfer_threads): + """Ctor for Concurrency Options + :param ConcurrencyOptions self: this + :param int crypto_processes: number of crypto procs + :param int md5_processes: number of md5 procs + :param int transfer_threads: number of transfer threads + """ + self.crypto_processes = crypto_processes + self.md5_processes = md5_processes + self.transfer_threads = transfer_threads + if self.crypto_processes is None or self.crypto_processes < 1: + self.crypto_processes = 1 + if self.md5_processes is None or self.md5_processes < 1: + self.md5_processes = 1 + if self.transfer_threads is None or self.transfer_threads < 1: + self.transfer_threads = 1 + + +class GeneralOptions(object): + """General Options""" + def __init__( + self, concurrency, progress_bar=True, timeout_sec=None, + verbose=False): + """Ctor for General Options + :param GeneralOptions self: this + :param ConcurrencyOptions concurrency: concurrency options + :param bool progress_bar: progress bar + :param int timeout_sec: timeout in seconds + :param bool verbose: verbose output + """ + if concurrency is None: + raise ValueError('concurrency option is unspecified') + self.concurrency = concurrency + self.progress_bar = progress_bar + self.timeout_sec = timeout_sec + self.verbose = verbose + + class AzureStorageCredentials(object): """Azure Storage Credentials""" def __init__(self): @@ -608,38 +642,11 @@ def __init__(self, container, ed=None): self._mode = None self._lmt = None self._size = None + self._snapshot = None self._md5 = None self._encryption = ed self._vio = None - - def populate_from_blob(self, blob): - # type: (AzureStorageEntity, azure.storage.blob.models.Blob) -> None - """Populate properties from Blob - :param AzureStorageEntity self: this - :param azure.storage.blob.models.Blob blob: blob to populate from - """ - self._name = blob.name - self._lmt = blob.properties.last_modified - self._size = blob.properties.content_length - self._md5 = blob.properties.content_settings.content_md5 - if blob.properties.blob_type == BlobTypes.AppendBlob: - self._mode = AzureStorageModes.Append - elif blob.properties.blob_type == BlobTypes.BlockBlob: - self._mode = AzureStorageModes.Block - elif blob.properties.blob_type == BlobTypes.PageBlob: - self._mode = AzureStorageModes.Page - - def populate_from_file(self, file): - # type: (AzureStorageEntity, azure.storage.file.models.File) -> None - """Populate properties from File - :param AzureStorageEntity self: this - :param azure.storage.file.models.File file: file to populate from - """ - self._name = file.name - self._lmt = file.properties.last_modified - self._size = file.properties.content_length - self._md5 = file.properties.content_settings.content_md5 - self._mode = AzureStorageModes.File + self.download = None @property def container(self): @@ -712,6 +719,105 @@ def encryption_metadata(self): """ return self._encryption + def populate_from_blob(self, blob): + # type: (AzureStorageEntity, azure.storage.blob.models.Blob) -> None + """Populate properties from Blob + :param AzureStorageEntity self: this + :param azure.storage.blob.models.Blob blob: blob to populate from + """ + self._name = blob.name + self._snapshot = blob.snapshot + self._lmt = blob.properties.last_modified + self._size = blob.properties.content_length + self._md5 = 
blob.properties.content_settings.content_md5 + if blob.properties.blob_type == BlobTypes.AppendBlob: + self._mode = AzureStorageModes.Append + elif blob.properties.blob_type == BlobTypes.BlockBlob: + self._mode = AzureStorageModes.Block + elif blob.properties.blob_type == BlobTypes.PageBlob: + self._mode = AzureStorageModes.Page + + def populate_from_file(self, file): + # type: (AzureStorageEntity, azure.storage.file.models.File) -> None + """Populate properties from File + :param AzureStorageEntity self: this + :param azure.storage.file.models.File file: file to populate from + """ + self._name = file.name + self._lmt = file.properties.last_modified + self._size = file.properties.content_length + self._md5 = file.properties.content_settings.content_md5 + self._mode = AzureStorageModes.File + + def prepare_for_download(self, lpath, options): + # type: (AzureStorageEntity, pathlib.Path, DownloadOptions) -> None + """Prepare entity for download + :param AzureStorageEntity self: this + :param pathlib.Path lpath: local path + :param DownloadOptions options: download options + """ + if self._encryption is not None: + hmac = self._encryption.initialize_hmac() + else: + hmac = None + if hmac is None and options.check_file_md5: + md5 = blobxfer.md5.new_md5_hasher() + else: + md5 = None + self.download = DownloadDescriptor(lpath, hmac, md5) + self.download.allocate_disk_space( + self._size, self._encryption is not None) + + +class DownloadDescriptor(object): + """DownloadDescriptor""" + def __init__(self, lpath, hmac, md5): + # type: (DownloadDescriptior, pathlib.Path, hmac.HMAC, md5.MD5) -> None + """Ctor for Download Descriptor + :param DownloadDescriptor self: this + :param pathlib.Path lpath: local path + :param hmac.HMAC hmac: hmac + :param md5.MD5 md5: md5 + """ + self.final_path = lpath + # create path holding the temporary file to download to + _tmp = list(lpath.parts[:-1]) + _tmp.append(lpath.name + '.bxtmp') + self.local_path = pathlib.Path(*_tmp) + self.hmac = hmac + self.md5 = md5 + self.current_position = 0 + + def allocate_disk_space(self, size, encryption): + # type: (DownloadDescriptor, int, bool) -> None + """Perform file allocation (possibly sparse), if encrypted this may + be an underallocation + :param DownloadDescriptor self: this + :param int size: size + :param bool encryption: encryption enabled + """ + # compute size + if size > 0: + if encryption: + allocatesize = size - \ + blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES + else: + allocatesize = size + if allocatesize < 0: + allocatesize = 0 + else: + allocatesize = 0 + # create parent path + self.local_path.parent.mkdir(mode=0o750, parents=True, exist_ok=True) + # allocate file + with self.local_path.open('wb') as fd: + if allocatesize > 0: + try: + os.posix_fallocate(fd.fileno(), 0, allocatesize) + except AttributeError: + fd.seek(allocatesize - 1) + fd.write(b'\0') + class AzureDestinationPaths(object): def __init__(self): diff --git a/blobxfer/util.py b/blobxfer/util.py index dd116bd..9029fb1 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -31,7 +31,7 @@ # stdlib imports import base64 import copy -import hashlib +import dateutil import logging import logging.handlers import mimetypes @@ -164,42 +164,6 @@ def base64_decode_string(string): return base64.b64decode(string) -def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): - # type: (str, bool, int) -> str - """Compute MD5 hash for file and encode as Base64 - :param str filename: file to compute MD5 for - :param bool pagealign: page 
align data - :param int blocksize: block size - :rtype: str - :return: MD5 for file encoded as Base64 - """ - hasher = hashlib.md5() - with open(filename, 'rb') as filedesc: - while True: - buf = filedesc.read(blocksize) - if not buf: - break - buflen = len(buf) - if pagealign and buflen < blocksize: - aligned = page_align_content_length(buflen) - if aligned != buflen: - buf = buf.ljust(aligned, b'\0') - hasher.update(buf) - return base64_encode_as_string(hasher.digest()) - - -def compute_md5_for_data_asbase64(data): - # type: (obj) -> str - """Compute MD5 hash for bits and encode as Base64 - :param any data: data to compute MD5 for - :rtype: str - :return: MD5 for data - """ - hasher = hashlib.md5() - hasher.update(data) - return base64_encode_as_string(hasher.digest()) - - def page_align_content_length(length): # type: (int) -> int """Compute page boundary alignment @@ -241,3 +205,19 @@ def explode_azure_path(path): else: rpath = '' return container, rpath + + +def blob_is_snapshot(url): + # type: (str) -> bool + """Checks if the blob is a snapshot blob + :param url str: blob url + :rtype: bool + :return: if blob is a snapshot blob + """ + if '?snapshot=' in url: + try: + dateutil.parser.parse(url.split('?snapshot=')[-1]) + return True + except (ValueError, OverflowError): + pass + return False diff --git a/cli/cli.py b/cli/cli.py index 91600b5..64be863 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -107,6 +107,34 @@ def _init_config(self): pass_cli_context = click.make_pass_decorator(CliContext, ensure=True) +def _crypto_processes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['crypto_processes'] = value + return value + return click.option( + '--crypto-processes', + expose_value=False, + type=int, + default=0, + help='Concurrent crypto processes', + callback=callback)(f) + + +def _md5_processes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['md5_processes'] = value + return value + return click.option( + '--md5-processes', + expose_value=False, + type=int, + default=0, + help='Concurrent MD5 processes', + callback=callback)(f) + + def _progress_bar_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -133,6 +161,20 @@ def callback(ctx, param, value): callback=callback)(f) +def _transfer_threads_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['transfer_threads'] = value + return value + return click.option( + '--transfer-threads', + expose_value=False, + type=int, + default=0, + help='Concurrent transfer threads', + callback=callback)(f) + + def _verbose_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -148,8 +190,11 @@ def callback(ctx, param, value): def common_options(f): f = _verbose_option(f) + f = _transfer_threads_option(f) f = _timeout_option(f) f = _progress_bar_option(f) + f = _md5_processes_option(f) + f = _crypto_processes_option(f) return f diff --git a/cli/settings.py b/cli/settings.py index db5c643..f567bea 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -203,8 +203,11 @@ def merge_settings(config, cli_options): # merge general options if 'options' not in config: config['options'] = {} + config['options']['crypto_processes'] = cli_options['crypto_processes'] + config['options']['md5_processes'] = cli_options['md5_processes'] config['options']['progress_bar'] = cli_options['progress_bar'] config['options']['timeout_sec'] = 
cli_options['timeout'] + config['options']['transfer_threads'] = cli_options['transfer_threads'] config['options']['verbose'] = cli_options['verbose'] @@ -231,6 +234,11 @@ def create_general_options(config): :return: general options object """ return blobxfer.models.GeneralOptions( + concurrency=blobxfer.models.ConcurrencyOptions( + crypto_processes=config['options']['crypto_processes'], + md5_processes=config['options']['md5_processes'], + transfer_threads=config['options']['transfer_threads'], + ), progress_bar=config['options']['progress_bar'], timeout_sec=config['options']['timeout_sec'], verbose=config['options']['verbose'], diff --git a/tests/test_blobxfer_blob_operations.py b/tests/test_blobxfer_blob_operations.py index 1b7d300..5c078ac 100644 --- a/tests/test_blobxfer_blob_operations.py +++ b/tests/test_blobxfer_blob_operations.py @@ -15,12 +15,15 @@ def test_check_if_single_blob(): client = mock.MagicMock() - client.get_blob_properties = mock.MagicMock() - client.get_blob_properties.return_value = mock.MagicMock() + client.get_blob_properties.return_value = True result = ops.check_if_single_blob(client, 'a', 'b/c') assert result + result = ops.check_if_single_blob( + client, 'a', 'a?snapshot=2017-02-23T22:21:14.8121864Z') + assert result + client = mock.MagicMock() client.get_blob_properties = mock.MagicMock() client.get_blob_properties.side_effect = \ diff --git a/tests/test_blobxfer_crypto_models.py b/tests/test_blobxfer_crypto_models.py index 33045c3..8d58419 100644 --- a/tests/test_blobxfer_crypto_models.py +++ b/tests/test_blobxfer_crypto_models.py @@ -179,9 +179,13 @@ def test_convert_from_json(tmpdir): } em = models.EncryptionMetadata() em.convert_from_json(md, 'blob', rsaprivatekey) + hmac = em.initialize_hmac() assert em.wrapped_content_key is not None + assert em._symkey == em.symmetric_key + assert em._signkey == em.signing_key assert em._symkey is not None assert em._signkey is not None + assert hmac is not None em = models.EncryptionMetadata() em.convert_from_json(md, 'blob', None) @@ -197,6 +201,8 @@ def test_convert_from_json(tmpdir): } em = models.EncryptionMetadata() em.convert_from_json(md, 'blob', rsaprivatekey) + hmac = em.initialize_hmac() assert em.wrapped_content_key is not None assert em._symkey is not None assert em._signkey is None + assert hmac is None diff --git a/tests/test_blobxfer_download.py b/tests/test_blobxfer_download.py index 99c9711..56e8999 100644 --- a/tests/test_blobxfer_download.py +++ b/tests/test_blobxfer_download.py @@ -184,7 +184,10 @@ def test_post_md5_skip_on_check(): d._post_md5_skip_on_check(lpath, True) assert lpath not in d._md5_map - # TODO test mismatch + d._add_to_download_queue = mock.MagicMock() + d._pre_md5_skip_on_check(lpath, rfile) + d._post_md5_skip_on_check(lpath, False) + assert d._add_to_download_queue.call_count == 1 def test_initialize_check_md5_downloads_thread(): @@ -208,12 +211,28 @@ def test_initialize_check_md5_downloads_thread(): assert d._post_md5_skip_on_check.call_count == 1 +def test_initialize_and_terminate_download_threads(): + opts = mock.MagicMock() + opts.concurrency.transfer_threads = 2 + d = dl.Downloader(opts, mock.MagicMock(), mock.MagicMock()) + d._worker_thread_download = mock.MagicMock() + + d._initialize_download_threads() + assert len(d._download_threads) == 2 + + d._terminate_download_threads() + assert d._download_terminate + for thr in d._download_threads: + assert not thr.is_alive() + + @mock.patch('blobxfer.md5.LocalFileMd5Offload') @mock.patch('blobxfer.blob.operations.list_blobs') 
@mock.patch('blobxfer.operations.ensure_local_destination', return_value=True) def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._initialize_check_md5_downloads_thread = mock.MagicMock() + d._initialize_download_threads = mock.MagicMock() d._md5_check_thread = mock.MagicMock() d._spec.sources = [] d._spec.options = mock.MagicMock() @@ -232,17 +251,19 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d._spec.sources.append(asp) b = azure.storage.blob.models.Blob(name='name') + b.properties.content_length = 0 patched_lb.side_effect = [[b]] + d._pre_md5_skip_on_check = mock.MagicMock() + d._check_download_conditions = mock.MagicMock() d._check_download_conditions.return_value = dl.DownloadAction.Skip d.start() - # TODO assert + assert d._pre_md5_skip_on_check.call_count == 0 patched_lb.side_effect = [[b]] d._all_remote_files_processed = False d._check_download_conditions.return_value = dl.DownloadAction.CheckMd5 - d._pre_md5_skip_on_check = mock.MagicMock() d.start() assert d._pre_md5_skip_on_check.call_count == 1 @@ -250,4 +271,15 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d._all_remote_files_processed = False d._check_download_conditions.return_value = dl.DownloadAction.Download d.start() - # TODO assert + assert d._download_queue.qsize() == 1 + + +def test_start_keyboard_interrupt(): + d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._run = mock.MagicMock(side_effect=KeyboardInterrupt) + d._terminate_download_threads = mock.MagicMock() + d._md5_offload = mock.MagicMock() + + with pytest.raises(KeyboardInterrupt): + d.start() + assert d._terminate_download_threads.call_count == 1 diff --git a/tests/test_blobxfer_md5.py b/tests/test_blobxfer_md5.py index 7faa1ce..05a66d2 100644 --- a/tests/test_blobxfer_md5.py +++ b/tests/test_blobxfer_md5.py @@ -3,14 +3,32 @@ # stdlib imports import time +import uuid # non-stdlib imports +import pytest # local imports import blobxfer.models as models -import blobxfer.util as util # module under test import blobxfer.md5 as md5 +def test_compute_md5(tmpdir): + lpath = str(tmpdir.join('test.tmp')) + testdata = str(uuid.uuid4()) + with open(lpath, 'wt') as f: + f.write(testdata) + md5_file = md5.compute_md5_for_file_asbase64(lpath) + md5_data = md5.compute_md5_for_data_asbase64(testdata.encode('utf8')) + assert md5_file == md5_data + + md5_file_page = md5.compute_md5_for_file_asbase64(lpath, True) + assert md5_file != md5_file_page + + # test non-existent file + with pytest.raises(IOError): + md5.compute_md5_for_file_asbase64(testdata) + + def test_done_cv(): a = None try: @@ -37,7 +55,7 @@ def test_from_add_to_done_non_pagealigned(tmpdir): file = tmpdir.join('a') file.write('abc') - remote_md5 = util.compute_md5_for_file_asbase64(str(file)) + remote_md5 = md5.compute_md5_for_file_asbase64(str(file)) a = None try: @@ -70,7 +88,7 @@ def test_from_add_to_done_pagealigned(tmpdir): file = tmpdir.join('a') file.write('abc') - remote_md5 = util.compute_md5_for_file_asbase64(str(file), True) + remote_md5 = md5.compute_md5_for_file_asbase64(str(file), True) a = None try: diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 0612a9e..7b81332 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -17,6 +17,41 @@ import blobxfer.models as models +def test_concurrency_options(): + a = models.ConcurrencyOptions( + crypto_processes=-1, + md5_processes=0, + 
transfer_threads=-2, + ) + + assert a.crypto_processes == 1 + assert a.md5_processes == 1 + assert a.transfer_threads == 1 + + +def test_general_options(): + a = models.GeneralOptions( + concurrency=models.ConcurrencyOptions( + crypto_processes=1, + md5_processes=2, + transfer_threads=3, + ), + progress_bar=False, + timeout_sec=1, + verbose=True, + ) + + assert a.concurrency.crypto_processes == 1 + assert a.concurrency.md5_processes == 2 + assert a.concurrency.transfer_threads == 3 + assert not a.progress_bar + assert a.timeout_sec == 1 + assert a.verbose + + with pytest.raises(ValueError): + a = models.GeneralOptions(None) + + def test_storage_credentials(): creds = models.AzureStorageCredentials() creds.add_storage_account('sa1', 'somekey1', 'endpoint') @@ -304,3 +339,47 @@ def test_azurestorageentity(): ase.populate_from_file(blob) assert ase.mode == models.AzureStorageModes.File + + +def test_azurestorageentity_prepare_for_download(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + + ase = models.AzureStorageEntity('cont') + ase._size = 0 + ase.prepare_for_download(lp, opts) + + assert ase.download.hmac is None + assert ase.download.md5 is not None + assert ase.download.final_path == lp + assert ase.download.current_position == 0 + + ase._encryption = mock.MagicMock() + ase.prepare_for_download(lp, opts) + + assert ase.download.hmac is not None + assert ase.download.md5 is None + + +def test_downloaddescriptor(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + d = models.DownloadDescriptor(lp, None, None) + assert d.current_position == 0 + assert d.final_path == lp + assert str(d.local_path) == str(lp) + '.bxtmp' + + d.allocate_disk_space(1024, True) + assert d.local_path.stat().st_size == 1024 - 16 + + d.local_path.unlink() + d.allocate_disk_space(1, True) + assert d.local_path.stat().st_size == 0 + + d.local_path.unlink() + d.allocate_disk_space(1024, False) + assert d.local_path.stat().st_size == 1024 + + # pre-existing file check + d.allocate_disk_space(0, False) + assert d.local_path.stat().st_size == 0 diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py index 9b6084e..4dd0ebc 100644 --- a/tests/test_blobxfer_util.py +++ b/tests/test_blobxfer_util.py @@ -3,7 +3,6 @@ # stdlib imports import sys -import uuid # non-stdlib imports import pytest # module under test @@ -111,24 +110,6 @@ def test_base64_encode_as_string(): assert a == dec -def test_compute_md5(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - testdata = str(uuid.uuid4()) - with open(lpath, 'wt') as f: - f.write(testdata) - md5_file = blobxfer.util.compute_md5_for_file_asbase64(lpath) - md5_data = blobxfer.util.compute_md5_for_data_asbase64( - testdata.encode('utf8')) - assert md5_file == md5_data - - md5_file_page = blobxfer.util.compute_md5_for_file_asbase64(lpath, True) - assert md5_file != md5_file_page - - # test non-existent file - with pytest.raises(IOError): - blobxfer.util.compute_md5_for_file_asbase64(testdata) - - def test_page_align_content_length(): assert 0 == blobxfer.util.page_align_content_length(0) assert 512 == blobxfer.util.page_align_content_length(1) @@ -177,3 +158,17 @@ def test_explode_azure_path(): cont, rpath = blobxfer.util.explode_azure_path(p) assert cont == 'some' assert rpath == 'remote/path' + + +def test_blob_is_snapshot(): + a = '/cont/a?snapshot=2017-02-23T22:21:14.8121864Z' + assert blobxfer.util.blob_is_snapshot(a) + + a = '/cont/a?snapshot=abc' + assert not blobxfer.util.blob_is_snapshot(a) + + a = 
'/cont/a?snapshot=' + assert not blobxfer.util.blob_is_snapshot(a) + + a = '/cont/a?snapshot=2017-02-23T22:21:14.8121864Z?snapshot=' + assert not blobxfer.util.blob_is_snapshot(a) From e82890ada974bbba7b2dd98a06b1f3761bbfa3b3 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 23 Feb 2017 23:47:16 -0800 Subject: [PATCH 09/47] Download offsets - Bind ASE to DownloadDescriptor instead - Add chunk size option to download and synccopy --- blobxfer/crypto/operations.py | 26 +++++ blobxfer/download.py | 34 +++++- blobxfer/models.py | 134 ++++++++++++++++------- cli/cli.py | 4 +- cli/settings.py | 3 + tests/test_blobxfer_crypto_operations.py | 7 ++ tests/test_blobxfer_download.py | 6 + tests/test_blobxfer_models.py | 134 +++++++++++++++++++---- tests/test_blobxfer_operations.py | 2 + 9 files changed, 279 insertions(+), 71 deletions(-) diff --git a/blobxfer/crypto/operations.py b/blobxfer/crypto/operations.py index 9a0f099..1931407 100644 --- a/blobxfer/crypto/operations.py +++ b/blobxfer/crypto/operations.py @@ -128,3 +128,29 @@ def rsa_encrypt_key_base64_encoded(rsaprivatekey, rsapublickey, plainkey): algorithm=cryptography.hazmat.primitives.hashes.SHA1(), label=None)) return blobxfer.util.base64_encode_as_string(enckey) + + +def pad_pkcs7(buf): + # type: (bytes) -> bytes + """Appends PKCS7 padding to an input buffer + :param bytes buf: buffer to add padding + :rtype: bytes + :return: buffer with PKCS7_PADDING + """ + padder = cryptography.hazmat.primitives.padding.PKCS7( + cryptography.hazmat.primitives.ciphers. + algorithms.AES.block_size).padder() + return padder.update(buf) + padder.finalize() + + +def unpad_pkcs7(buf): + # type: (bytes) -> bytes + """Removes PKCS7 padding a decrypted object + :param bytes buf: buffer to remove padding + :rtype: bytes + :return: buffer without PKCS7_PADDING + """ + unpadder = cryptography.hazmat.primitives.padding.PKCS7( + cryptography.hazmat.primitives.ciphers. + algorithms.AES.block_size).unpadder() + return unpadder.update(buf) + unpadder.finalize() diff --git a/blobxfer/download.py b/blobxfer/download.py index dbbecac..49eea54 100644 --- a/blobxfer/download.py +++ b/blobxfer/download.py @@ -47,6 +47,7 @@ import dateutil # local imports import blobxfer.md5 +import blobxfer.models import blobxfer.operations import blobxfer.util @@ -211,9 +212,10 @@ def _add_to_download_queue(self, lpath, rfile): :param blobxfer.models.AzureStorageEntity rfile: remote file """ # prepare remote file for download - rfile.prepare_for_download(lpath, self._spec.options) + dd = blobxfer.models.DownloadDescriptor( + lpath, rfile, self._spec.options) # add remote file to queue - self._download_queue.put(rfile) + self._download_queue.put(dd) def _initialize_download_threads(self): # type: (Downloader) -> None @@ -243,12 +245,34 @@ def _worker_thread_download(self): if self._download_terminate: break try: - rfile = self._download_queue.get(False, 1) + dd = self._download_queue.get(False, 1) except queue.Empty: continue - # TODO - # get next offset with respect to chunk size + # get download offsets + # issue get range + + # if encryption: + # 1. compute rolling hmac if present + # - roll through any subsequent unchecked parts + # 2. decrypt chunk + + # compute rolling md5 if present + # - roll through any subsequent unchecked parts + + # write data to disk + + # if no integrity check could be performed due to current + # integrity offset mismatch, add to unchecked set + + # check if last chunk to write + # 1. complete integrity checks + # 2. set file uid/gid + # 3. 
set file modes + + # pickle dd to resume file + + rfile = dd._ase print('<<', rfile.container, rfile.name, rfile.lmt, rfile.size, rfile.md5, rfile.mode, rfile.encryption_metadata) diff --git a/blobxfer/models.py b/blobxfer/models.py index 34d05ce..6d0f753 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -100,6 +100,7 @@ class AzureStorageModes(enum.Enum): DownloadOptions = collections.namedtuple( 'DownloadOptions', [ 'check_file_md5', + 'chunk_size_bytes', 'delete_extraneous_destination', 'mode', 'overwrite', @@ -110,16 +111,24 @@ class AzureStorageModes(enum.Enum): ) SyncCopyOptions = collections.namedtuple( 'SyncCopyOptions', [ - 'exclude', - 'include', + 'chunk_size_bytes', 'mode', 'overwrite', - 'skip_on', ] ) LocalPath = collections.namedtuple( 'LocalPath', [ - 'parent_path', 'relative_path' + 'parent_path', + 'relative_path', + ] +) +DownloadOffsets = collections.namedtuple( + 'DownloadOffsets', [ + 'fd_start', + 'num_bytes', + 'range_end', + 'range_start', + 'unpad', ] ) @@ -749,58 +758,60 @@ def populate_from_file(self, file): self._md5 = file.properties.content_settings.content_md5 self._mode = AzureStorageModes.File - def prepare_for_download(self, lpath, options): - # type: (AzureStorageEntity, pathlib.Path, DownloadOptions) -> None - """Prepare entity for download - :param AzureStorageEntity self: this - :param pathlib.Path lpath: local path - :param DownloadOptions options: download options - """ - if self._encryption is not None: - hmac = self._encryption.initialize_hmac() - else: - hmac = None - if hmac is None and options.check_file_md5: - md5 = blobxfer.md5.new_md5_hasher() - else: - md5 = None - self.download = DownloadDescriptor(lpath, hmac, md5) - self.download.allocate_disk_space( - self._size, self._encryption is not None) - class DownloadDescriptor(object): - """DownloadDescriptor""" - def __init__(self, lpath, hmac, md5): - # type: (DownloadDescriptior, pathlib.Path, hmac.HMAC, md5.MD5) -> None - """Ctor for Download Descriptor + """Download Descriptor""" + + _AES_BLOCKSIZE = blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES + + def __init__(self, lpath, ase, options): + # type: (DownloadDescriptior, pathlib.Path, AzureStorageEntity, + # DownloadOptions) -> None + """Ctor for DownloadDescriptor :param DownloadDescriptor self: this :param pathlib.Path lpath: local path - :param hmac.HMAC hmac: hmac - :param md5.MD5 md5: md5 + :param AzureStorageEntity ase: Azure Storage Entity + :param DownloadOptions options: download options """ self.final_path = lpath # create path holding the temporary file to download to _tmp = list(lpath.parts[:-1]) _tmp.append(lpath.name + '.bxtmp') self.local_path = pathlib.Path(*_tmp) - self.hmac = hmac - self.md5 = md5 - self.current_position = 0 - - def allocate_disk_space(self, size, encryption): - # type: (DownloadDescriptor, int, bool) -> None - """Perform file allocation (possibly sparse), if encrypted this may - be an underallocation + self._ase = ase + self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) + self.hmac = None + self.md5 = None + self.offset = 0 + self.integrity_counter = 0 + self.unchecked_chunks = set() + self._initialize_integrity_checkers(options) + self._allocate_disk_space() + + def _initialize_integrity_checkers(self, options): + # type: (DownloadDescriptor, DownloadOptions) -> None + """Initialize file integrity checkers + :param DownloadDescriptor self: this + :param DownloadOptions options: download options + """ + if self._ase.encryption_metadata is not None: + self.hmac = 
self._ase.encryption_metadata.initialize_hmac() + if self.hmac is None and options.check_file_md5: + self.md5 = blobxfer.md5.new_md5_hasher() + + def _allocate_disk_space(self): + # type: (DownloadDescriptor, int) -> None + """Perform file allocation (possibly sparse) :param DownloadDescriptor self: this :param int size: size - :param bool encryption: encryption enabled """ + size = self._ase.size # compute size if size > 0: - if encryption: - allocatesize = size - \ - blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES + if self._ase.encryption_metadata is not None: + # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs + allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ + self._AES_BLOCKSIZE else: allocatesize = size if allocatesize < 0: @@ -818,6 +829,47 @@ def allocate_disk_space(self, size, encryption): fd.seek(allocatesize - 1) fd.write(b'\0') + def next_offsets(self): + # type: (DownloadDescriptor) -> DownloadOffsets + """Retrieve the next offsets + :param DownloadDescriptor self: this + :rtype: DownloadOffsets + :return: download offsets + """ + if self.offset >= self._ase.size: + return None + if self.offset + self._chunk_size > self._ase.size: + chunk = self._ase.size - self.offset + else: + chunk = self._chunk_size + # on download, num_bytes must be offset by -1 as the x-ms-range + # header expects it that way. x -> y bytes means first bits of the + # (x+1)th byte to the last bits of the (y+1)th byte. for example, + # 0 -> 511 means byte 1 to byte 512 + num_bytes = chunk - 1 + fd_start = self.offset + range_start = self.offset + if self._ase.encryption_metadata is not None: + # ensure start is AES block size aligned + range_start = range_start - (range_start % self._AES_BLOCKSIZE) - \ + self._AES_BLOCKSIZE + if range_start <= 0: + range_start = 0 + range_end = self.offset + num_bytes + self.offset += chunk + if (self._ase.encryption_metadata is not None and + self.offset >= self._ase.size): + unpad = True + else: + unpad = False + return DownloadOffsets( + fd_start=fd_start, + num_bytes=num_bytes, + range_start=range_start, + range_end=range_end, + unpad=unpad, + ) + class AzureDestinationPaths(object): def __init__(self): diff --git a/cli/cli.py b/cli/cli.py index 64be863..0c085c7 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -286,7 +286,7 @@ def callback(ctx, param, value): expose_value=False, type=int, default=4194304, - help='Chunk size in bytes [4194304]', + help='Block or chunk size in bytes [4194304]', callback=callback)(f) @@ -580,6 +580,7 @@ def download_options(f): f = _exclude_option(f) f = _endpoint_option(f) f = _delete_option(f) + f = _chunk_size_bytes_option(f) f = _access_key_option(f) return f @@ -596,6 +597,7 @@ def sync_copy_options(f): f = _include_option(f) f = _exclude_option(f) f = _endpoint_option(f) + f = _chunk_size_bytes_option(f) f = _access_key_option(f) return f diff --git a/cli/settings.py b/cli/settings.py index f567bea..448d0a8 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -117,6 +117,7 @@ def add_cli_options( 'exclude': cli_options['exclude'], 'options': { 'check_file_md5': cli_options['file_md5'], + 'chunk_size_bytes': cli_options['chunk_size_bytes'], 'delete_extraneous_destination': cli_options['delete'], 'mode': cli_options['mode'], 'overwrite': cli_options['overwrite'], @@ -148,6 +149,7 @@ def add_cli_options( 'include': cli_options['include'], 'exclude': cli_options['exclude'], 'options': { + 'chunk_size_bytes': cli_options['chunk_size_bytes'], 'mode': cli_options['mode'], 'overwrite': cli_options['overwrite'], 'skip_on': { @@ 
-279,6 +281,7 @@ def create_download_specifications(config): ds = blobxfer.models.DownloadSpecification( download_options=blobxfer.models.DownloadOptions( check_file_md5=conf['options']['check_file_md5'], + chunk_size_bytes=conf['options']['chunk_size_bytes'], delete_extraneous_destination=conf[ 'options']['delete_extraneous_destination'], mode=mode, diff --git a/tests/test_blobxfer_crypto_operations.py b/tests/test_blobxfer_crypto_operations.py index 1760701..a37be4f 100644 --- a/tests/test_blobxfer_crypto_operations.py +++ b/tests/test_blobxfer_crypto_operations.py @@ -42,3 +42,10 @@ def test_rsa_encrypt_decrypt_keys(): assert enckey is not None plainkey = ops.rsa_decrypt_base64_encoded_key(_RSAKEY, enckey) assert symkey == plainkey + + +def test_pkcs7_padding(): + buf = os.urandom(32) + pbuf = ops.pad_pkcs7(buf) + buf2 = ops.unpad_pkcs7(pbuf) + assert buf == buf2 diff --git a/tests/test_blobxfer_download.py b/tests/test_blobxfer_download.py index 56e8999..a80c629 100644 --- a/tests/test_blobxfer_download.py +++ b/tests/test_blobxfer_download.py @@ -29,6 +29,7 @@ def test_check_download_conditions(tmpdir): ds = models.DownloadSpecification( download_options=models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=models.AzureStorageModes.Auto, overwrite=False, @@ -52,6 +53,7 @@ def test_check_download_conditions(tmpdir): ds = models.DownloadSpecification( download_options=models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=models.AzureStorageModes.Auto, overwrite=True, @@ -73,6 +75,7 @@ def test_check_download_conditions(tmpdir): ds = models.DownloadSpecification( download_options=models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=models.AzureStorageModes.Auto, overwrite=True, @@ -94,6 +97,7 @@ def test_check_download_conditions(tmpdir): ds = models.DownloadSpecification( download_options=models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=models.AzureStorageModes.Auto, overwrite=True, @@ -123,6 +127,7 @@ def test_check_download_conditions(tmpdir): ds = models.DownloadSpecification( download_options=models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=models.AzureStorageModes.Auto, overwrite=True, @@ -236,6 +241,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d._md5_check_thread = mock.MagicMock() d._spec.sources = [] d._spec.options = mock.MagicMock() + d._spec.options.chunk_size_bytes = 1 d._spec.options.mode = models.AzureStorageModes.Auto d._spec.options.overwrite = True d._spec.skip_on = mock.MagicMock() diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 7b81332..3227250 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -279,6 +279,7 @@ def test_downloadspecification(): ds = models.DownloadSpecification( download_options=models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=models.AzureStorageModes.Auto, overwrite=True, @@ -341,45 +342,130 @@ def test_azurestorageentity(): assert ase.mode == models.AzureStorageModes.File -def test_azurestorageentity_prepare_for_download(tmpdir): +def test_downloaddescriptor(tmpdir): lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() opts.check_file_md5 = True - + opts.chunk_size_bytes = 1 
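[Editorial note] The inclusive-range arithmetic in next_offsets can be checked by hand. For an encrypted 288-byte (256 + 32) entity with a 256-byte chunk size, the second chunk starts at file offset 256, but the requested range start is pulled back one extra AES block, presumably so the preceding ciphertext block is available during decryption. The values below match the assertions in test_downloaddescriptor_next_offsets further down:

    AES_BLOCKSIZE = 16
    size, chunk_size = 256 + 32, 256

    # second chunk: file offset 256, 32 bytes remain
    offset = 256
    chunk = min(chunk_size, size - offset)    # 32
    num_bytes = chunk - 1                     # 31 (x-ms-range is inclusive)
    fd_start = offset                         # 256
    range_start = (offset - (offset % AES_BLOCKSIZE)
                   - AES_BLOCKSIZE)           # 240 == 256 - 16
    range_end = offset + num_bytes            # 287 == 256 + 31
    unpad = True                              # final chunk of an encrypted blob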
ase = models.AzureStorageEntity('cont') - ase._size = 0 - ase.prepare_for_download(lp, opts) - - assert ase.download.hmac is None - assert ase.download.md5 is not None - assert ase.download.final_path == lp - assert ase.download.current_position == 0 - + ase._size = 1024 ase._encryption = mock.MagicMock() - ase.prepare_for_download(lp, opts) + d = models.DownloadDescriptor(lp, ase, opts) - assert ase.download.hmac is not None - assert ase.download.md5 is None - - -def test_downloaddescriptor(tmpdir): - lp = pathlib.Path(str(tmpdir.join('a'))) - d = models.DownloadDescriptor(lp, None, None) - assert d.current_position == 0 + assert d.offset == 0 assert d.final_path == lp assert str(d.local_path) == str(lp) + '.bxtmp' - - d.allocate_disk_space(1024, True) assert d.local_path.stat().st_size == 1024 - 16 d.local_path.unlink() - d.allocate_disk_space(1, True) + ase._size = 1 + d._allocate_disk_space() assert d.local_path.stat().st_size == 0 d.local_path.unlink() - d.allocate_disk_space(1024, False) + ase._encryption = None + ase._size = 1024 + d._allocate_disk_space() assert d.local_path.stat().st_size == 1024 # pre-existing file check - d.allocate_disk_space(0, False) + ase._size = 0 + d._allocate_disk_space() assert d.local_path.stat().st_size == 0 + + +def test_downloaddescriptor_next_offsets(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = models.AzureStorageEntity('cont') + ase._size = 128 + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + assert offsets.fd_start == 0 + assert offsets.num_bytes == 127 + assert offsets.range_start == 0 + assert offsets.range_end == 127 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._size = 0 + d = models.DownloadDescriptor(lp, ase, opts) + assert d.next_offsets() is None + + ase._size = 1 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert offsets.fd_start == 0 + assert offsets.num_bytes == 0 + assert offsets.range_start == 0 + assert offsets.range_end == 0 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._size = 256 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert offsets.fd_start == 0 + assert offsets.num_bytes == 255 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._size = 256 + 16 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert offsets.fd_start == 0 + assert offsets.num_bytes == 255 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + offsets = d.next_offsets() + assert offsets.fd_start == 256 + assert offsets.num_bytes == 15 + assert offsets.range_start == 256 + assert offsets.range_end == 256 + 15 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._encryption = mock.MagicMock() + ase._size = 128 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert offsets.fd_start == 0 + assert offsets.num_bytes == 127 + assert offsets.range_start == 0 + assert offsets.range_end == 127 + assert offsets.unpad + assert d.next_offsets() is None + + ase._size = 256 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert offsets.fd_start == 0 + assert offsets.num_bytes == 255 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert offsets.unpad + assert d.next_offsets() is 
None + + ase._size = 256 + 32 # 16 bytes over + padding + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert offsets.fd_start == 0 + assert offsets.num_bytes == 255 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + offsets = d.next_offsets() + assert offsets.fd_start == 256 + assert offsets.num_bytes == 31 + assert offsets.range_start == 256 - 16 + assert offsets.range_end == 256 + 31 + assert offsets.unpad + assert d.next_offsets() is None diff --git a/tests/test_blobxfer_operations.py b/tests/test_blobxfer_operations.py index 9926bab..9b648f6 100644 --- a/tests/test_blobxfer_operations.py +++ b/tests/test_blobxfer_operations.py @@ -20,6 +20,7 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): ds = blobxfer.models.DownloadSpecification( download_options=blobxfer.models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=blobxfer.models.AzureStorageModes.Auto, overwrite=True, @@ -54,6 +55,7 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): ds = blobxfer.models.DownloadSpecification( download_options=blobxfer.models.DownloadOptions( check_file_md5=True, + chunk_size_bytes=4194304, delete_extraneous_destination=False, mode=blobxfer.models.AzureStorageModes.File, overwrite=True, From 31ef912cd675fc5dcd81953d4aa84b990b60c55f Mon Sep 17 00:00:00 2001 From: Fred Park Date: Fri, 24 Feb 2017 19:32:56 -0800 Subject: [PATCH 10/47] More progress on download - Add custom retry handler - Add snapshot support - Add range gets --- blobxfer/blob/append/operations.py | 20 ++-- blobxfer/blob/block/operations.py | 11 ++- blobxfer/blob/operations.py | 28 +++++- blobxfer/blob/page/operations.py | 9 +- blobxfer/download.py | 128 ++++++++++++++++++++----- blobxfer/file/operations.py | 25 +++++ blobxfer/md5.py | 10 +- blobxfer/models.py | 118 ++++++++++++++++++++--- blobxfer/retry.py | 85 ++++++++++++++++ blobxfer/util.py | 14 +++ tests/test_blobxfer_blob_operations.py | 30 +++++- tests/test_blobxfer_download.py | 29 ++++-- tests/test_blobxfer_file_operations.py | 14 +++ tests/test_blobxfer_md5.py | 7 +- tests/test_blobxfer_models.py | 59 ++++++++---- tests/test_blobxfer_retry.py | 39 ++++++++ tests/test_blobxfer_util.py | 9 ++ 17 files changed, 533 insertions(+), 102 deletions(-) create mode 100644 blobxfer/retry.py create mode 100644 tests/test_blobxfer_retry.py diff --git a/blobxfer/blob/append/operations.py b/blobxfer/blob/append/operations.py index 88d5b58..cbe4008 100644 --- a/blobxfer/blob/append/operations.py +++ b/blobxfer/blob/append/operations.py @@ -31,8 +31,9 @@ # stdlib imports import logging # non-stdlib imports -from azure.storage.blob import AppendBlobService +import azure.storage.blob # local imports +import blobxfer.retry # create logger logger = logging.getLogger(__name__) @@ -46,24 +47,15 @@ def create_client(storage_account): :return: append blob service client """ if storage_account.is_sas: - client = AppendBlobService( + client = azure.storage.blob.AppendBlobService( account_name=storage_account.name, sas_token=storage_account.key, endpoint_suffix=storage_account.endpoint) else: - client = AppendBlobService( + client = azure.storage.blob.AppendBlobService( account_name=storage_account.name, account_key=storage_account.key, endpoint_suffix=storage_account.endpoint) + # set retry policy + client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client - - -def list_blobs(client, container, 
prefix): - # type: (azure.storage.blob.AppendBlobService, str, str) -> list - """List append blobs in path - :param AppendBlobService client: append blob client - :param str container: container - :param str prefix: path prefix - """ - - pass diff --git a/blobxfer/blob/block/operations.py b/blobxfer/blob/block/operations.py index 94fd534..c07fda7 100644 --- a/blobxfer/blob/block/operations.py +++ b/blobxfer/blob/block/operations.py @@ -31,8 +31,9 @@ # stdlib imports import logging # non-stdlib imports -from azure.storage.blob import BlockBlobService +import azure.storage.blob # local imports +import blobxfer.retry # create logger logger = logging.getLogger(__name__) @@ -42,19 +43,21 @@ def create_client(storage_account): # type: (blobxfer.models.AzureStorageAccount) -> BlockBlobService """Create block blob client :param blobxfer.models.AzureStorageAccount storage_account: storage account - :rtype: BlockBlobService + :rtype: azure.storage.blob.BlockBlobService :return: block blob service client """ if storage_account.is_sas: - client = BlockBlobService( + client = azure.storage.blob.BlockBlobService( account_name=storage_account.name, sas_token=storage_account.key, endpoint_suffix=storage_account.endpoint) else: - client = BlockBlobService( + client = azure.storage.blob.BlockBlobService( account_name=storage_account.name, account_key=storage_account.key, endpoint_suffix=storage_account.endpoint) + # set retry policy + client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client diff --git a/blobxfer/blob/operations.py b/blobxfer/blob/operations.py index 411ad52..4a8f0eb 100644 --- a/blobxfer/blob/operations.py +++ b/blobxfer/blob/operations.py @@ -76,6 +76,13 @@ def list_blobs(client, container, prefix, mode, timeout=None): """ if mode == blobxfer.models.AzureStorageModes.File: raise RuntimeError('cannot list Azure Files from blob client') + if blobxfer.util.blob_is_snapshot(prefix): + snapshot = blobxfer.util.parse_blob_snapshot_parameter(prefix) + blob = client.get_blob_properties( + container_name=container, blob_name=prefix, snapshot=snapshot, + timeout=timeout) + yield blob + return blobs = client.list_blobs( container_name=container, prefix=prefix, @@ -99,5 +106,22 @@ def list_blobs(client, container, prefix, mode, timeout=None): yield blob -def get_blob_range(client, container, blob_name, snapshot): - pass +def get_blob_range(ase, offsets, timeout=None): + # type: (blobxfer.models.AzureStorageEntity, + # blobxfer.models.DownloadOffsets, int) -> bytes + """Retrieve blob range + :param blobxfer.models.AzureStorageEntity ase: AzureStorageEntity + :param blobxfer.models.DownloadOffsets offsets: downlaod offsets + :param int timeout: timeout + :rtype: bytes + :return: content for blob range + """ + return ase.client._get_blob( + container_name=ase.container, + blob_name=ase.name, + snapshot=ase.snapshot, + start_range=offsets.range_start, + end_range=offsets.range_end, + validate_content=False, # HTTPS takes care of integrity during xfer + timeout=timeout, + ).content diff --git a/blobxfer/blob/page/operations.py b/blobxfer/blob/page/operations.py index f23520b..359e207 100644 --- a/blobxfer/blob/page/operations.py +++ b/blobxfer/blob/page/operations.py @@ -31,8 +31,9 @@ # stdlib imports import logging # non-stdlib imports -from azure.storage.blob import PageBlobService +import azure.storage.blob # local imports +import blobxfer.retry # create logger logger = logging.getLogger(__name__) @@ -46,13 +47,15 @@ def create_client(storage_account): :return: block blob service 
client """ if storage_account.is_sas: - client = PageBlobService( + client = azure.storage.blob.PageBlobService( account_name=storage_account.name, sas_token=storage_account.key, endpoint_suffix=storage_account.endpoint) else: - client = PageBlobService( + client = azure.storage.blob.PageBlobService( account_name=storage_account.name, account_key=storage_account.key, endpoint_suffix=storage_account.endpoint) + # set retry policy + client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client diff --git a/blobxfer/download.py b/blobxfer/download.py index 49eea54..970bf18 100644 --- a/blobxfer/download.py +++ b/blobxfer/download.py @@ -49,6 +49,8 @@ import blobxfer.md5 import blobxfer.models import blobxfer.operations +import blobxfer.blob.operations +import blobxfer.file.operations import blobxfer.util # create logger @@ -74,12 +76,16 @@ def __init__(self, general_options, creds, spec): :param blobxfer.models.DownloadSpecification spec: download spec """ self._md5_meta_lock = threading.Lock() + self._download_lock = threading.Lock() self._all_remote_files_processed = False self._md5_map = {} self._md5_offload = None self._md5_check_thread = None self._download_queue = queue.Queue() + self._download_set = set() self._download_threads = [] + self._download_count = 0 + self._download_total_bytes = 0 self._download_terminate = False self._general_options = general_options self._creds = creds @@ -164,8 +170,11 @@ def _post_md5_skip_on_check(self, filename, md5_match): """ with self._md5_meta_lock: rfile = self._md5_map.pop(filename) - if not md5_match: - lpath = pathlib.Path(filename) + lpath = pathlib.Path(filename) + if md5_match: + with self._download_lock: + self._download_set.remove(lpath) + else: self._add_to_download_queue(lpath, rfile) def _initialize_check_md5_downloads_thread(self): @@ -185,12 +194,18 @@ def _check_for_downloads_from_md5(self): (len(self._md5_map) == 0 and self._all_remote_files_processed)): break + result = None cv.acquire() while not self._download_terminate: result = self._md5_offload.get_localfile_md5_done() if result is None: # use cv timeout due to possible non-wake while running cv.wait(1) + # check for terminating conditions + with self._md5_meta_lock: + if (len(self._md5_map) == 0 and + self._all_remote_files_processed): + break else: break cv.release() @@ -214,7 +229,7 @@ def _add_to_download_queue(self, lpath, rfile): # prepare remote file for download dd = blobxfer.models.DownloadDescriptor( lpath, rfile, self._spec.options) - # add remote file to queue + # add download descriptor to queue self._download_queue.put(dd) def _initialize_download_threads(self): @@ -222,17 +237,20 @@ def _initialize_download_threads(self): """Initialize download threads :param Downloader self: this """ + logger.debug('spawning {} transfer threads'.format( + self._general_options.concurrency.transfer_threads)) for _ in range(self._general_options.concurrency.transfer_threads): thr = threading.Thread(target=self._worker_thread_download) self._download_threads.append(thr) thr.start() - def _terminate_download_threads(self): - # type: (Downloader) -> None + def _wait_for_download_threads(self, terminate): + # type: (Downloader, bool) -> None """Terminate download threads :param Downloader self: this + :param bool terminate: terminate threads """ - self._download_terminate = True + self._download_terminate = terminate for thr in self._download_threads: thr.join() @@ -244,37 +262,68 @@ def _worker_thread_download(self): while True: if self._download_terminate: break 
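[Editorial note] Patch 10 replaces the storage SDK's default retry policy with a custom ExponentialRetryWithMaxWait on every client. Its implementation is not shown in this hunk; the sketch below is only a generic illustration of exponential backoff capped at a maximum wait, not the blobxfer retry code:

    import random

    def backoff_wait(attempt, base=1.0, max_wait=8.0):
        # generic capped exponential backoff; illustrative only
        wait = min(base * (2 ** attempt), max_wait)
        # small jitter so concurrent clients do not retry in lockstep
        return wait + random.uniform(0, wait / 10)

    # approximate waits: 1, 2, 4, 8, 8 seconds (plus jitter)
    print([round(backoff_wait(n), 2) for n in range(5)])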
+ with self._download_lock: + if (self._all_remote_files_processed and + len(self._download_set) == 0): + break try: dd = self._download_queue.get(False, 1) except queue.Empty: continue # get download offsets - + offsets = dd.next_offsets() + # check if all operations completed + if offsets is None and dd.outstanding_operations == 0: + # TODO + # 1. complete integrity checks + # 2. set file uid/gid + # 3. set file modes + # 4. move file to final path + with self._download_lock: + self._download_set.remove(dd.final_path) + self._download_count += 1 + logger.info('download complete: {}/{} to {}'.format( + dd.entity.container, dd.entity.name, dd.final_path)) + continue + # re-enqueue for other threads to download + self._download_queue.put(dd) + if offsets is None: + continue # issue get range - - # if encryption: - # 1. compute rolling hmac if present - # - roll through any subsequent unchecked parts - # 2. decrypt chunk - - # compute rolling md5 if present - # - roll through any subsequent unchecked parts + if dd.entity.mode == blobxfer.models.AzureStorageModes.File: + chunk = blobxfer.file.operations.get_file_range( + dd.entity, offsets, self._general_options.timeout_sec) + else: + chunk = blobxfer.blob.operations.get_blob_range( + dd.entity, offsets, self._general_options.timeout_sec) + # accounting + with self._download_lock: + self._download_total_bytes += offsets.num_bytes + # decrypt if necessary + if dd.entity.is_encrypted: + # TODO via crypto pool + # 1. compute rolling hmac if present + # - roll through any subsequent unchecked parts + # 2. decrypt chunk + pass + # compute rolling md5 via md5 pool + if dd.must_compute_md5: + # TODO + # - roll through any subsequent unchecked parts + pass # write data to disk # if no integrity check could be performed due to current # integrity offset mismatch, add to unchecked set - # check if last chunk to write - # 1. complete integrity checks - # 2. set file uid/gid - # 3. 
set file modes + dd.dec_outstanding_operations() # pickle dd to resume file - rfile = dd._ase - print('<<', rfile.container, rfile.name, rfile.lmt, rfile.size, - rfile.md5, rfile.mode, rfile.encryption_metadata) +# rfile = dd._ase +# print('<<', rfile.container, rfile.name, rfile.lmt, rfile.size, +# rfile.md5, rfile.mode, rfile.encryption_metadata) def _run(self): # type: (Downloader) -> None @@ -290,26 +339,52 @@ def _run(self): # initialize download threads self._initialize_download_threads() # iterate through source paths to download + nfiles = 0 + empty_files = 0 + skipped_files = 0 + total_size = 0 + skipped_size = 0 for src in self._spec.sources: for rfile in src.files( self._creds, self._spec.options, self._general_options): + nfiles += 1 + total_size += rfile.size + if rfile.size == 0: + empty_files += 1 # form local path for remote file lpath = pathlib.Path(self._spec.destination.path, rfile.name) # check on download conditions action = self._check_download_conditions(lpath, rfile) if action == DownloadAction.Skip: + skipped_files += 1 + skipped_size += rfile.size continue - elif action == DownloadAction.CheckMd5: + # add potential download to set + with self._download_lock: + self._download_set.add(lpath) + # either MD5 check or download now + if action == DownloadAction.CheckMd5: self._pre_md5_skip_on_check(lpath, rfile) elif action == DownloadAction.Download: self._add_to_download_queue(lpath, rfile) + download_files = nfiles - skipped_files + download_size = total_size - skipped_size # clean up processes and threads with self._md5_meta_lock: self._all_remote_files_processed = True + logger.debug( + ('{0} remote files processed, waiting for download completion ' + 'of {1:.4f} MiB').format(nfiles, download_size / 1048576)) self._md5_check_thread.join() - # TODO wait for download threads - + self._wait_for_download_threads(terminate=False) self._md5_offload.finalize_md5_processes() + if (self._download_count != download_files or + self._download_total_bytes != download_size): + raise RuntimeError( + 'download mismatch: [count={}/{} bytes={}/{}]'.format( + self._download_count, download_files, + self._download_total_bytes, download_size)) + logger.info('all files downloaded') def start(self): # type: (Downloader) -> None @@ -320,6 +395,7 @@ def start(self): logger.error( 'KeyboardInterrupt detected, force terminating ' 'processes and threads (this may take a while)...') - self._terminate_download_threads() + self._wait_for_download_threads(terminate=True) self._md5_offload.finalize_md5_processes() + # TODO close resume file in finally? 
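[Editorial note] The cleanup at the end of _run proceeds in a fixed order: signal the end of remote-file enumeration, drain the MD5-check thread, join the transfer threads, then stop the MD5 offload processes. A condensed sketch of that ordering, using hypothetical attribute names (the real Downloader attributes are underscore-prefixed):

    def shutdown(dl, terminate=False):
        # condensed illustration of the _run cleanup ordering; not the
        # actual method
        # 1. tell the MD5-check thread no more remote files are coming
        with dl.md5_meta_lock:
            dl.all_remote_files_processed = True
        # 2. wait for outstanding MD5 comparisons to drain
        dl.md5_check_thread.join()
        # 3. wait for (or force-stop) the transfer threads
        dl.download_terminate = terminate
        for thr in dl.download_threads:
            thr.join()
        # 4. finally tear down the MD5 offload worker processes
        dl.md5_offload.finalize_md5_processes()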
            raise
diff --git a/blobxfer/file/operations.py b/blobxfer/file/operations.py
index 221f412..09f7d68 100644
--- a/blobxfer/file/operations.py
+++ b/blobxfer/file/operations.py
@@ -38,6 +38,7 @@
 import azure.common
 import azure.storage.file
 # local imports
+import blobxfer.retry

 # create logger
 logger = logging.getLogger(__name__)
@@ -60,6 +61,8 @@ def create_client(storage_account):
             account_name=storage_account.name,
             account_key=storage_account.key,
             endpoint_suffix=storage_account.endpoint)
+    # set retry policy
+    client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry
     return client


@@ -145,3 +148,25 @@ def list_files(client, fileshare, prefix, timeout=None):
                 yield fsprop
         else:
             dirs.append(fspath)
+
+
+def get_file_range(ase, offsets, timeout=None):
+    # type: (blobxfer.models.AzureStorageEntity,
+    #        blobxfer.models.DownloadOffsets, int) -> bytes
+    """Retrieve file range
+    :param blobxfer.models.AzureStorageEntity ase: AzureStorageEntity
+    :param blobxfer.models.DownloadOffsets offsets: download offsets
+    :param int timeout: timeout
+    :rtype: bytes
+    :return: content for file range
+    """
+    dir, fpath = parse_file_path(ase.name)
+    return ase.client._get_file(
+        share_name=ase.container,
+        directory_name=dir,
+        file_name=fpath,
+        start_range=offsets.range_start,
+        end_range=offsets.range_end,
+        validate_content=False,  # HTTPS takes care of integrity during xfer
+        timeout=timeout,
+    ).content
diff --git a/blobxfer/md5.py b/blobxfer/md5.py
index 86dbd30..741e360 100644
--- a/blobxfer/md5.py
+++ b/blobxfer/md5.py
@@ -92,7 +92,7 @@ def compute_md5_for_data_asbase64(data):

 class LocalFileMd5Offload(object):
     """LocalFileMd5Offload"""
-    def __init__(self, num_workers=None):
+    def __init__(self, num_workers):
         # type: (LocalFileMd5Offload, int) -> None
         """Ctor for Local File Md5 Offload
         :param LocalFileMd5Offload self: this
@@ -115,16 +115,14 @@ def done_cv(self):
         """
         return self._done_cv

-    def _initialize_md5_processes(self, num_workers=None):
+    def _initialize_md5_processes(self, num_workers):
         # type: (LocalFileMd5Offload, int) -> None
         """Initialize MD5 checking processes for files for download
         :param LocalFileMd5Offload self: this
         :param int num_workers: number of worker processes
         """
-        if num_workers is None:
-            num_workers = multiprocessing.cpu_count() // 2 - 1
-        if num_workers < 1:
-            num_workers = 1
+        if num_workers is None or num_workers < 1:
+            raise ValueError('invalid num_workers: {}'.format(num_workers))
         for _ in range(num_workers):
             proc = multiprocessing.Process(
                 target=self._worker_compute_md5_localfile_process)
diff --git a/blobxfer/models.py b/blobxfer/models.py
index 6d0f753..74809eb 100644
--- a/blobxfer/models.py
+++ b/blobxfer/models.py
@@ -34,11 +34,13 @@
 import enum
 import fnmatch
 import logging
+import math
 import os
 try:
     import pathlib2 as pathlib
 except ImportError:  # noqa
     import pathlib
+import multiprocessing
 # non-stdlib imports
 # local imports
 from .api import (
@@ -146,11 +148,15 @@ def __init__(self, crypto_processes, md5_processes, transfer_threads):
         self.md5_processes = md5_processes
         self.transfer_threads = transfer_threads
         if self.crypto_processes is None or self.crypto_processes < 1:
+            self.crypto_processes = multiprocessing.cpu_count() // 2 - 1
+        if self.crypto_processes < 1:
             self.crypto_processes = 1
         if self.md5_processes is None or self.md5_processes < 1:
+            self.md5_processes = multiprocessing.cpu_count() // 2
+        if self.md5_processes < 1:
             self.md5_processes = 1
         if self.transfer_threads is None or self.transfer_threads < 1:
-            self.transfer_threads = 1
+
self.transfer_threads = multiprocessing.cpu_count() * 2 class GeneralOptions(object): @@ -602,7 +608,7 @@ def _populate_from_list_files(self, creds, options, general_options): else: ed = None ase = AzureStorageEntity(cont, ed) - ase.populate_from_file(file) + ase.populate_from_file(sa, file) yield ase def _populate_from_list_blobs(self, creds, options, general_options): @@ -631,7 +637,7 @@ def _populate_from_list_blobs(self, creds, options, general_options): else: ed = None ase = AzureStorageEntity(cont, ed) - ase.populate_from_blob(blob) + ase.populate_from_blob(sa, blob) yield ase @@ -646,6 +652,7 @@ def __init__(self, container, ed=None): :param blobxfer.crypto.models.EncryptionMetadata ed: encryption metadata """ + self._client = None self._container = container self._name = None self._mode = None @@ -657,6 +664,16 @@ def __init__(self, container, ed=None): self._vio = None self.download = None + @property + def client(self): + # type: (AzureStorageEntity) -> object + """Associated storage client + :param AzureStorageEntity self: this + :rtype: object + :return: associated storage client + """ + return self._client + @property def container(self): # type: (AzureStorageEntity) -> str @@ -697,6 +714,16 @@ def size(self): """ return self._size + @property + def snapshot(self): + # type: (AzureStorageEntity) -> str + """Entity snapshot + :param AzureStorageEntity self: this + :rtype: str + :return: snapshot of entity + """ + return self._snapshot + @property def md5(self): # type: (AzureStorageEntity) -> str @@ -717,21 +744,33 @@ def mode(self): """ return self._mode + @property + def is_encrypted(self): + # type: (AzureStorageEntity) -> bool + """If data is encrypted + :param AzureStorageEntity self: this + :rtype: bool + :return: if encryption metadata is present + """ + return self._encryption is not None + @property def encryption_metadata(self): # type: (AzureStorageEntity) -> # blobxfer.crypto.models.EncryptionMetadata - """Entity mode (type) + """Entity metadata (type) :param AzureStorageEntity self: this :rtype: blobxfer.crypto.models.EncryptionMetadata :return: encryption metadata of entity """ return self._encryption - def populate_from_blob(self, blob): - # type: (AzureStorageEntity, azure.storage.blob.models.Blob) -> None + def populate_from_blob(self, sa, blob): + # type: (AzureStorageEntity, AzureStorageAccount, + # azure.storage.blob.models.Blob) -> None """Populate properties from Blob :param AzureStorageEntity self: this + :param AzureStorageAccount sa: storage account :param azure.storage.blob.models.Blob blob: blob to populate from """ self._name = blob.name @@ -741,22 +780,29 @@ def populate_from_blob(self, blob): self._md5 = blob.properties.content_settings.content_md5 if blob.properties.blob_type == BlobTypes.AppendBlob: self._mode = AzureStorageModes.Append + self._client = sa.append_blob_client elif blob.properties.blob_type == BlobTypes.BlockBlob: self._mode = AzureStorageModes.Block + self._client = sa.block_blob_client elif blob.properties.blob_type == BlobTypes.PageBlob: self._mode = AzureStorageModes.Page + self._client = sa.page_blob_client - def populate_from_file(self, file): - # type: (AzureStorageEntity, azure.storage.file.models.File) -> None + def populate_from_file(self, sa, file): + # type: (AzureStorageEntity, AzureStorageAccount, + # azure.storage.file.models.File) -> None """Populate properties from File :param AzureStorageEntity self: this + :param AzureStorageAccount sa: storage account :param azure.storage.file.models.File file: file to populate 
from """ self._name = file.name + self._snapshot = None self._lmt = file.properties.last_modified self._size = file.properties.content_length self._md5 = file.properties.content_settings.content_md5 self._mode = AzureStorageModes.File + self._client = sa.file_client class DownloadDescriptor(object): @@ -778,23 +824,53 @@ def __init__(self, lpath, ase, options): _tmp = list(lpath.parts[:-1]) _tmp.append(lpath.name + '.bxtmp') self.local_path = pathlib.Path(*_tmp) + self._meta_lock = multiprocessing.Lock() self._ase = ase + # calculate the total number of ops required for transfer self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) + try: + self._total_chunks = int( + math.ceil(self._ase.size / self._chunk_size)) + except ZeroDivisionError: + self._total_chunks = 0 self.hmac = None self.md5 = None self.offset = 0 self.integrity_counter = 0 self.unchecked_chunks = set() + self._outstanding_ops = self._total_chunks + self._completed_ops = 0 + # initialize checkers and allocate space self._initialize_integrity_checkers(options) self._allocate_disk_space() + @property + def entity(self): + # type: (DownloadDescriptor) -> AzureStorageEntity + """Get linked AzureStorageEntity + :param DownloadDescriptor self: this + :rtype: AzureStorageEntity + :return: AzureStorageEntity + """ + return self._ase + + @property + def must_compute_md5(self): + # type: (DownloadDescriptor) -> bool + """Check if MD5 must be computed + :param DownloadDescriptor self: this + :rtype: bool + :return: if MD5 must be computed + """ + return self.md5 is not None + def _initialize_integrity_checkers(self, options): # type: (DownloadDescriptor, DownloadOptions) -> None """Initialize file integrity checkers :param DownloadDescriptor self: this :param DownloadOptions options: download options """ - if self._ase.encryption_metadata is not None: + if self._ase.is_encrypted: self.hmac = self._ase.encryption_metadata.initialize_hmac() if self.hmac is None and options.check_file_md5: self.md5 = blobxfer.md5.new_md5_hasher() @@ -808,7 +884,7 @@ def _allocate_disk_space(self): size = self._ase.size # compute size if size > 0: - if self._ase.encryption_metadata is not None: + if self._ase.is_encrypted: # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ self._AES_BLOCKSIZE @@ -849,7 +925,7 @@ def next_offsets(self): num_bytes = chunk - 1 fd_start = self.offset range_start = self.offset - if self._ase.encryption_metadata is not None: + if self._ase.is_encrypted: # ensure start is AES block size aligned range_start = range_start - (range_start % self._AES_BLOCKSIZE) - \ self._AES_BLOCKSIZE @@ -857,19 +933,33 @@ def next_offsets(self): range_start = 0 range_end = self.offset + num_bytes self.offset += chunk - if (self._ase.encryption_metadata is not None and - self.offset >= self._ase.size): + if self._ase.is_encrypted and self.offset >= self._ase.size: unpad = True else: unpad = False return DownloadOffsets( fd_start=fd_start, - num_bytes=num_bytes, + num_bytes=chunk, range_start=range_start, range_end=range_end, unpad=unpad, ) + @property + def outstanding_operations(self): + with self._meta_lock: + return self._outstanding_ops + + @property + def completed_operations(self): + with self._meta_lock: + return self._completed_ops + + def dec_outstanding_operations(self): + with self._meta_lock: + self._outstanding_ops -= 1 + self._completed_ops += 1 + class AzureDestinationPaths(object): def __init__(self): diff --git a/blobxfer/retry.py b/blobxfer/retry.py new file 
mode 100644
index 0000000..ce42bd4
--- /dev/null
+++ b/blobxfer/retry.py
@@ -0,0 +1,85 @@
+# Copyright (c) Microsoft Corporation
+#
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+# compat imports
+from __future__ import (
+    absolute_import, division, print_function, unicode_literals
+)
+from builtins import (  # noqa
+    bytes, dict, int, list, object, range, ascii, chr, hex, input,
+    next, oct, open, pow, round, super, filter, map, zip)
+# stdlib imports
+# non-stdlib imports
+import azure.storage.retry
+# local imports
+
+
+class ExponentialRetryWithMaxWait(azure.storage.retry._Retry):
+    """Exponential Retry with Max Wait (infinite retries)"""
+    def __init__(self, initial_backoff=1, max_backoff=8, reset_at_max=True):
+        # type: (ExponentialRetryWithMaxWait, int, int, bool) -> None
+        """Ctor for ExponentialRetryWithMaxWait
+        :param ExponentialRetryWithMaxWait self: this
+        :param int initial_backoff: initial backoff
+        :param int max_backoff: max backoff
+        :param bool reset_at_max: reset after reaching max wait
+        """
+        if max_backoff < initial_backoff:
+            raise ValueError(
+                'max backoff {} less than initial backoff {}'.format(
+                    max_backoff, initial_backoff))
+        self.initial_backoff = initial_backoff
+        self.max_backoff = max_backoff
+        self.reset_at_max = reset_at_max
+        super(ExponentialRetryWithMaxWait, self).__init__(
+            max_backoff if self.reset_at_max else 2147483647, False)
+
+    def retry(self, context):
+        # type: (ExponentialRetryWithMaxWait,
+        #        azure.storage.models.RetryContext) -> int
+        """Retry handler
+        :param ExponentialRetryWithMaxWait self: this
+        :param azure.storage.models.RetryContext context: retry context
+        :rtype: int or None
+        :return: int
+        """
+        return self._retry(context, self._backoff)
+
+    def _backoff(self, context):
+        # type: (ExponentialRetryWithMaxWait,
+        #        azure.storage.models.RetryContext) -> int
+        """Backoff calculator
+        :param ExponentialRetryWithMaxWait self: this
+        :param azure.storage.models.RetryContext context: retry context
+        :rtype: int
+        :return: backoff amount
+        """
+        if context.count == 1:
+            backoff = self.initial_backoff
+        else:
+            backoff = self.initial_backoff << (context.count - 1)
+        if backoff > self.max_backoff and self.reset_at_max:
+            backoff = self.initial_backoff
+            context.count = 1
+        return backoff
diff --git a/blobxfer/util.py b/blobxfer/util.py
index 9029fb1..c8885f7 100644
--- a/blobxfer/util.py
+++ b/blobxfer/util.py
@@ -221,3 +221,17 @@ def blob_is_snapshot(url):
     except (ValueError,
OverflowError): pass return False + + +def parse_blob_snapshot_parameter(url): + # type: (str) -> str + """Retrieves the blob snapshot parameter from a url + :param url str: blob url + :rtype: str + :return: snapshot parameter + """ + if blob_is_snapshot(url): + tmp = url.split('?snapshot=') + if len(tmp) > 1: + return tmp[-1] + return None diff --git a/tests/test_blobxfer_blob_operations.py b/tests/test_blobxfer_blob_operations.py index 5c078ac..dd635f2 100644 --- a/tests/test_blobxfer_blob_operations.py +++ b/tests/test_blobxfer_blob_operations.py @@ -39,10 +39,9 @@ def test_list_blobs(): None, 'cont', 'prefix', models.AzureStorageModes.File): pass - client = mock.MagicMock() - client.list_blobs = mock.MagicMock() _blob = azure.storage.blob.models.Blob(name='name') _blob.properties = azure.storage.blob.models.BlobProperties() + client = mock.MagicMock() client.list_blobs.return_value = [_blob] i = 0 @@ -76,3 +75,30 @@ def test_list_blobs(): i += 1 assert blob.name == 'name' assert i == 0 + + _blob.snapshot = '2017-02-23T22:21:14.8121864Z' + client.get_blob_properties.return_value = _blob + i = 0 + for blob in ops.list_blobs( + client, 'cont', + 'a?snapshot=2017-02-23T22:21:14.8121864Z', + models.AzureStorageModes.Auto): + i += 1 + assert blob.name == 'name' + assert blob.snapshot == _blob.snapshot + assert i == 1 + + +def test_get_blob_range(): + ase = mock.MagicMock() + ret = mock.MagicMock() + ret.content = b'\0' + ase.client._get_blob.return_value = ret + ase.container = 'cont' + ase.name = 'name' + ase.snapshot = None + offsets = mock.MagicMock() + offsets.start_range = 0 + offsets.end_range = 1 + + assert ops.get_blob_range(ase, offsets) == ret.content diff --git a/tests/test_blobxfer_download.py b/tests/test_blobxfer_download.py index a80c629..6e12bcc 100644 --- a/tests/test_blobxfer_download.py +++ b/tests/test_blobxfer_download.py @@ -184,6 +184,7 @@ def test_post_md5_skip_on_check(): rfile = models.AzureStorageEntity('cont') rfile._md5 = 'abc' d._pre_md5_skip_on_check(lpath, rfile) + d._download_set.add(pathlib.Path(lpath)) assert lpath in d._md5_map d._post_md5_skip_on_check(lpath, True) @@ -191,6 +192,7 @@ def test_post_md5_skip_on_check(): d._add_to_download_queue = mock.MagicMock() d._pre_md5_skip_on_check(lpath, rfile) + d._download_set.add(pathlib.Path(lpath)) d._post_md5_skip_on_check(lpath, False) assert d._add_to_download_queue.call_count == 1 @@ -199,21 +201,25 @@ def test_initialize_check_md5_downloads_thread(): lpath = 'lpath' d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._md5_map[lpath] = mock.MagicMock() + d._download_set.add(pathlib.Path(lpath)) d._md5_offload = mock.MagicMock() d._md5_offload.done_cv = multiprocessing.Condition() d._md5_offload.get_localfile_md5_done = mock.MagicMock() - d._md5_offload.get_localfile_md5_done.side_effect = [None, (lpath, True)] - d._post_md5_skip_on_check = mock.MagicMock() + d._md5_offload.get_localfile_md5_done.side_effect = [None, (lpath, False)] + d._add_to_download_queue = mock.MagicMock() d._initialize_check_md5_downloads_thread() + while len(d._md5_map) > 0: + d._md5_offload.done_cv.acquire() + d._md5_offload.done_cv.notify() + d._md5_offload.done_cv.release() d._all_remote_files_processed = True - d._md5_map.clear() d._md5_offload.done_cv.acquire() d._md5_offload.done_cv.notify() d._md5_offload.done_cv.release() d._md5_check_thread.join() - assert d._post_md5_skip_on_check.call_count == 1 + assert d._add_to_download_queue.call_count == 1 def test_initialize_and_terminate_download_threads(): @@ 
-225,7 +231,7 @@ def test_initialize_and_terminate_download_threads(): d._initialize_download_threads() assert len(d._download_threads) == 2 - d._terminate_download_threads() + d._wait_for_download_threads(terminate=True) assert d._download_terminate for thr in d._download_threads: assert not thr.is_alive() @@ -257,7 +263,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d._spec.sources.append(asp) b = azure.storage.blob.models.Blob(name='name') - b.properties.content_length = 0 + b.properties.content_length = 1 patched_lb.side_effect = [[b]] d._pre_md5_skip_on_check = mock.MagicMock() @@ -270,22 +276,25 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): patched_lb.side_effect = [[b]] d._all_remote_files_processed = False d._check_download_conditions.return_value = dl.DownloadAction.CheckMd5 - d.start() + with pytest.raises(RuntimeError): + d.start() assert d._pre_md5_skip_on_check.call_count == 1 + b.properties.content_length = 0 patched_lb.side_effect = [[b]] d._all_remote_files_processed = False d._check_download_conditions.return_value = dl.DownloadAction.Download - d.start() + with pytest.raises(RuntimeError): + d.start() assert d._download_queue.qsize() == 1 def test_start_keyboard_interrupt(): d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._run = mock.MagicMock(side_effect=KeyboardInterrupt) - d._terminate_download_threads = mock.MagicMock() + d._wait_for_download_threads = mock.MagicMock() d._md5_offload = mock.MagicMock() with pytest.raises(KeyboardInterrupt): d.start() - assert d._terminate_download_threads.call_count == 1 + assert d._wait_for_download_threads.call_count == 1 diff --git a/tests/test_blobxfer_file_operations.py b/tests/test_blobxfer_file_operations.py index fd39912..b221534 100644 --- a/tests/test_blobxfer_file_operations.py +++ b/tests/test_blobxfer_file_operations.py @@ -120,3 +120,17 @@ def test_list_files_directory(patched_cisf): i += 1 assert file.name == 'name' assert i == 1 + + +def test_get_file_range(): + ase = mock.MagicMock() + ret = mock.MagicMock() + ret.content = b'\0' + ase.client._get_file.return_value = ret + ase.container = 'cont' + ase.name = 'name' + offsets = mock.MagicMock() + offsets.start_range = 0 + offsets.end_range = 1 + + assert ops.get_file_range(ase, offsets) == ret.content diff --git a/tests/test_blobxfer_md5.py b/tests/test_blobxfer_md5.py index 05a66d2..7a37072 100644 --- a/tests/test_blobxfer_md5.py +++ b/tests/test_blobxfer_md5.py @@ -32,7 +32,7 @@ def test_compute_md5(tmpdir): def test_done_cv(): a = None try: - a = md5.LocalFileMd5Offload() + a = md5.LocalFileMd5Offload(num_workers=1) assert a.done_cv == a._done_cv finally: if a: @@ -40,9 +40,12 @@ def test_done_cv(): def test_finalize_md5_processes(): + with pytest.raises(ValueError): + md5.LocalFileMd5Offload(num_workers=0) + a = None try: - a = md5.LocalFileMd5Offload(num_workers=0) + a = md5.LocalFileMd5Offload(num_workers=1) finally: if a: a.finalize_md5_processes() diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 3227250..f0e636d 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -17,7 +17,8 @@ import blobxfer.models as models -def test_concurrency_options(): +@mock.patch('multiprocessing.cpu_count', return_value=1) +def test_concurrency_options(patched_cc): a = models.ConcurrencyOptions( crypto_processes=-1, md5_processes=0, @@ -26,7 +27,7 @@ def test_concurrency_options(): assert a.crypto_processes == 1 assert a.md5_processes == 1 - assert 
a.transfer_threads == 1 + assert a.transfer_threads == 2 def test_general_options(): @@ -316,30 +317,37 @@ def test_azurestorageentity(): blob = mock.MagicMock() blob.name = 'name' + blob.snapshot = None blob.properties = mock.MagicMock() blob.properties.last_modified = 'lmt' blob.properties.content_length = 123 blob.properties.content_settings = mock.MagicMock() blob.properties.content_settings.content_md5 = 'abc' blob.properties.blob_type = azure.storage.blob.models._BlobTypes.BlockBlob - ase.populate_from_blob(blob) + ase.populate_from_blob(mock.MagicMock(), blob) + assert ase.client is not None assert ase.name == 'name' assert ase.lmt == 'lmt' assert ase.size == 123 assert ase.md5 == 'abc' + assert ase.snapshot is None assert ase.mode == models.AzureStorageModes.Block blob.properties.blob_type = azure.storage.blob.models._BlobTypes.AppendBlob - ase.populate_from_blob(blob) + ase.populate_from_blob(mock.MagicMock(), blob) assert ase.mode == models.AzureStorageModes.Append blob.properties.blob_type = azure.storage.blob.models._BlobTypes.PageBlob - ase.populate_from_blob(blob) + blob.snapshot = 'abc' + ase.populate_from_blob(mock.MagicMock(), blob) assert ase.mode == models.AzureStorageModes.Page + assert ase.snapshot is not None - ase.populate_from_file(blob) + blob.snapshot = None + ase.populate_from_file(mock.MagicMock(), blob) assert ase.mode == models.AzureStorageModes.File + assert ase.snapshot is None def test_downloaddescriptor(tmpdir): @@ -347,12 +355,15 @@ def test_downloaddescriptor(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True - opts.chunk_size_bytes = 1 + opts.chunk_size_bytes = 16 ase = models.AzureStorageEntity('cont') ase._size = 1024 ase._encryption = mock.MagicMock() d = models.DownloadDescriptor(lp, ase, opts) + assert d.entity == ase + assert not d.must_compute_md5 + assert d._total_chunks == 64 assert d.offset == 0 assert d.final_path == lp assert str(d.local_path) == str(lp) + '.bxtmp' @@ -360,18 +371,20 @@ def test_downloaddescriptor(tmpdir): d.local_path.unlink() ase._size = 1 - d._allocate_disk_space() + d = models.DownloadDescriptor(lp, ase, opts) + assert d._total_chunks == 1 assert d.local_path.stat().st_size == 0 d.local_path.unlink() ase._encryption = None ase._size = 1024 - d._allocate_disk_space() + d = models.DownloadDescriptor(lp, ase, opts) assert d.local_path.stat().st_size == 1024 # pre-existing file check ase._size = 0 - d._allocate_disk_space() + d = models.DownloadDescriptor(lp, ase, opts) + assert d._total_chunks == 0 assert d.local_path.stat().st_size == 0 @@ -386,8 +399,9 @@ def test_downloaddescriptor_next_offsets(tmpdir): d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() + assert d._total_chunks == 1 assert offsets.fd_start == 0 - assert offsets.num_bytes == 127 + assert offsets.num_bytes == 128 assert offsets.range_start == 0 assert offsets.range_end == 127 assert not offsets.unpad @@ -395,13 +409,15 @@ def test_downloaddescriptor_next_offsets(tmpdir): ase._size = 0 d = models.DownloadDescriptor(lp, ase, opts) + assert d._total_chunks == 0 assert d.next_offsets() is None ase._size = 1 d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() + assert d._total_chunks == 1 assert offsets.fd_start == 0 - assert offsets.num_bytes == 0 + assert offsets.num_bytes == 1 assert offsets.range_start == 0 assert offsets.range_end == 0 assert not offsets.unpad @@ -410,8 +426,9 @@ def test_downloaddescriptor_next_offsets(tmpdir): ase._size = 256 d = models.DownloadDescriptor(lp, ase, opts) offsets = 
d.next_offsets() + assert d._total_chunks == 1 assert offsets.fd_start == 0 - assert offsets.num_bytes == 255 + assert offsets.num_bytes == 256 assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad @@ -420,14 +437,15 @@ def test_downloaddescriptor_next_offsets(tmpdir): ase._size = 256 + 16 d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() + assert d._total_chunks == 2 assert offsets.fd_start == 0 - assert offsets.num_bytes == 255 + assert offsets.num_bytes == 256 assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad offsets = d.next_offsets() assert offsets.fd_start == 256 - assert offsets.num_bytes == 15 + assert offsets.num_bytes == 16 assert offsets.range_start == 256 assert offsets.range_end == 256 + 15 assert not offsets.unpad @@ -437,8 +455,9 @@ def test_downloaddescriptor_next_offsets(tmpdir): ase._size = 128 d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() + assert d._total_chunks == 1 assert offsets.fd_start == 0 - assert offsets.num_bytes == 127 + assert offsets.num_bytes == 128 assert offsets.range_start == 0 assert offsets.range_end == 127 assert offsets.unpad @@ -447,8 +466,9 @@ def test_downloaddescriptor_next_offsets(tmpdir): ase._size = 256 d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() + assert d._total_chunks == 1 assert offsets.fd_start == 0 - assert offsets.num_bytes == 255 + assert offsets.num_bytes == 256 assert offsets.range_start == 0 assert offsets.range_end == 255 assert offsets.unpad @@ -457,14 +477,15 @@ def test_downloaddescriptor_next_offsets(tmpdir): ase._size = 256 + 32 # 16 bytes over + padding d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() + assert d._total_chunks == 2 assert offsets.fd_start == 0 - assert offsets.num_bytes == 255 + assert offsets.num_bytes == 256 assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad offsets = d.next_offsets() assert offsets.fd_start == 256 - assert offsets.num_bytes == 31 + assert offsets.num_bytes == 32 assert offsets.range_start == 256 - 16 assert offsets.range_end == 256 + 31 assert offsets.unpad diff --git a/tests/test_blobxfer_retry.py b/tests/test_blobxfer_retry.py new file mode 100644 index 0000000..b66c41e --- /dev/null +++ b/tests/test_blobxfer_retry.py @@ -0,0 +1,39 @@ +# coding=utf-8 +"""Tests for retry""" + +# stdlib imports +import mock +# non-stdlib imports +import pytest +# module under test +import blobxfer.retry as retry + + +def test_exponentialretrywithmaxwait(): + with pytest.raises(ValueError): + er = retry.ExponentialRetryWithMaxWait( + initial_backoff=1, max_backoff=0) + + er = retry.ExponentialRetryWithMaxWait() + context = mock.MagicMock() + context.count = 0 + context.response.status = 500 + bo = er.retry(context) + assert context.count == 1 + assert bo == 1 + + bo = er.retry(context) + assert context.count == 2 + assert bo == 2 + + bo = er.retry(context) + assert context.count == 3 + assert bo == 4 + + bo = er.retry(context) + assert context.count == 4 + assert bo == 8 + + bo = er.retry(context) + assert context.count == 1 + assert bo == 1 diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py index 4dd0ebc..0f94c0e 100644 --- a/tests/test_blobxfer_util.py +++ b/tests/test_blobxfer_util.py @@ -172,3 +172,12 @@ def test_blob_is_snapshot(): a = '/cont/a?snapshot=2017-02-23T22:21:14.8121864Z?snapshot=' assert not blobxfer.util.blob_is_snapshot(a) + + +def 
test_parse_blob_snapshot_parameter(): + param = '2017-02-23T22:21:14.8121864Z' + a = '/cont/a?snapshot=' + param + assert blobxfer.util.parse_blob_snapshot_parameter(a) == param + + a = '/cont/a?snapshot=' + assert blobxfer.util.parse_blob_snapshot_parameter(a) is None From d70d404b466823ec007777cb91b47cb8f0ca7794 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Sun, 26 Feb 2017 02:02:56 -0800 Subject: [PATCH 11/47] Current download parity - Main download logic at parity with current blobxfer - Refactor multiprocess offload into base class - Add multiprocess crypto offload --- blobxfer/crypto/models.py | 11 +- blobxfer/crypto/operations.py | 122 ++++++++- blobxfer/download.py | 231 ++++++++++------ blobxfer/md5.py | 85 +----- blobxfer/models.py | 246 ++++++++++++++---- blobxfer/offload.py | 127 +++++++++ blobxfer/util.py | 10 + cli/settings.py | 4 +- tests/test_blobxfer_blob_append_operations.py | 1 - tests/test_blobxfer_crypto_operations.py | 40 ++- tests/test_blobxfer_download.py | 27 +- tests/test_blobxfer_md5.py | 18 +- tests/test_blobxfer_models.py | 20 +- 13 files changed, 695 insertions(+), 247 deletions(-) create mode 100644 blobxfer/offload.py diff --git a/blobxfer/crypto/models.py b/blobxfer/crypto/models.py index c4bb5b4..e08f6a7 100644 --- a/blobxfer/crypto/models.py +++ b/blobxfer/crypto/models.py @@ -35,20 +35,13 @@ import hashlib import hmac import json -import logging # non-stdlib imports # local imports import blobxfer.crypto.operations import blobxfer.util - # encryption constants -_AES256_KEYLENGTH_BYTES = 32 _AES256_BLOCKSIZE_BYTES = 16 -_HMACSHA256_DIGESTSIZE_BYTES = 32 -_AES256CBC_HMACSHA256_OVERHEAD_BYTES = ( - _AES256_BLOCKSIZE_BYTES + _HMACSHA256_DIGESTSIZE_BYTES -) # named tuples EncryptionBlobxferExtensions = collections.namedtuple( @@ -191,8 +184,8 @@ def convert_from_json(self, md, blobname, rsaprivatekey): ) except KeyError: pass - self.content_encryption_iv = ed[ - EncryptionMetadata._JSON_KEY_CONTENT_IV] + self.content_encryption_iv = base64.b64decode( + ed[EncryptionMetadata._JSON_KEY_CONTENT_IV]) self.encryption_agent = EncryptionAgent( encryption_algorithm=ed[ EncryptionMetadata._JSON_KEY_ENCRYPTION_AGENT][ diff --git a/blobxfer/crypto/operations.py b/blobxfer/crypto/operations.py index 1931407..deeb287 100644 --- a/blobxfer/crypto/operations.py +++ b/blobxfer/crypto/operations.py @@ -31,7 +31,13 @@ next, oct, open, pow, round, super, filter, map, zip) # stdlib imports import base64 +import enum import logging +import os +try: + import queue +except ImportError: # noqa + import Queue as queue # non-stdlib imports import cryptography.hazmat.backends import cryptography.hazmat.primitives.asymmetric.padding @@ -44,7 +50,13 @@ import cryptography.hazmat.primitives.padding import cryptography.hazmat.primitives.serialization # local imports -import blobxfer.util +import blobxfer.offload + +# create logger +logger = logging.getLogger(__name__) + +# encryption constants +_AES256_KEYLENGTH_BYTES = 32 def load_rsa_private_key_file(rsakeyfile, passphrase): @@ -130,7 +142,7 @@ def rsa_encrypt_key_base64_encoded(rsaprivatekey, rsapublickey, plainkey): return blobxfer.util.base64_encode_as_string(enckey) -def pad_pkcs7(buf): +def pkcs7_pad(buf): # type: (bytes) -> bytes """Appends PKCS7 padding to an input buffer :param bytes buf: buffer to add padding @@ -143,7 +155,7 @@ def pad_pkcs7(buf): return padder.update(buf) + padder.finalize() -def unpad_pkcs7(buf): +def pkcs7_unpad(buf): # type: (bytes) -> bytes """Removes PKCS7 padding a decrypted object :param bytes buf: 
buffer to remove padding @@ -154,3 +166,107 @@ def unpad_pkcs7(buf): cryptography.hazmat.primitives.ciphers. algorithms.AES.block_size).unpadder() return unpadder.update(buf) + unpadder.finalize() + + +def aes256_generate_random_key(): + # type: (None) -> bytes + """Generate random AES256 key + :rtype: bytes + :return: random key + """ + return os.urandom(_AES256_KEYLENGTH_BYTES) + + +def aes_cbc_decrypt_data(symkey, iv, encdata, unpad): + # type: (bytes, bytes, bytes, bool) -> bytes + """Decrypt data using AES CBC + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param bytes encdata: data to decrypt + :param bool unpad: unpad data + :rtype: bytes + :return: decrypted data + """ + cipher = cryptography.hazmat.primitives.ciphers.Cipher( + cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), + cryptography.hazmat.primitives.ciphers.modes.CBC(iv), + backend=cryptography.hazmat.backends.default_backend()).decryptor() + decrypted = cipher.update(encdata) + cipher.finalize() + if unpad: + return pkcs7_unpad(decrypted) + else: + return decrypted + + +def aes_cbc_encrypt_data(symkey, iv, data, pad): + # type: (bytes, bytes, bytes, bool) -> bytes + """Encrypt data using AES CBC + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param bytes data: data to encrypt + :param bool pad: pad data + :rtype: bytes + :return: encrypted data + """ + cipher = cryptography.hazmat.primitives.ciphers.Cipher( + cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), + cryptography.hazmat.primitives.ciphers.modes.CBC(iv), + backend=cryptography.hazmat.backends.default_backend()).encryptor() + if pad: + return cipher.update(pkcs7_pad(data)) + cipher.finalize() + else: + return cipher.update(data) + cipher.finalize() + + +class CryptoAction(enum.Enum): + Encrypt = 1 + Decrypt = 2 + + +class CryptoOffload(blobxfer.offload._MultiprocessOffload): + def __init__(self, num_workers): + # type: (CryptoOffload, int) -> None + """Ctor for Crypto Offload + :param CryptoOffload self: this + :param int num_workers: number of worker processes + """ + super(CryptoOffload, self).__init__(num_workers, 'Crypto') + + def _worker_process(self): + # type: (CryptoOffload) -> None + """Crypto worker + :param CryptoOffload self: this + """ + while not self.terminated: + try: + inst = self._task_queue.get(True, 1) + except queue.Empty: + continue + if inst[0] == CryptoAction.Encrypt: + # TODO on upload + raise NotImplementedError() + elif inst[0] == CryptoAction.Decrypt: + final_path, offsets, symkey, iv, encdata = \ + inst[1], inst[2], inst[3], inst[4], inst[5] + data = aes_cbc_decrypt_data(symkey, iv, encdata, offsets.unpad) + self._done_cv.acquire() + self._done_queue.put((final_path, offsets, data)) + self._done_cv.notify() + self._done_cv.release() + + def add_decrypt_chunk( + self, final_path, offsets, symkey, iv, encdata): + # type: (CryptoOffload, str, blobxfer.models.DownloadOffsets, bytes, + # bytes, bytes) -> None + """Add a chunk to decrypt + :param CryptoOffload self: this + :param str final_path: final path + :param blobxfer.models.DownloadOffsets offsets: offsets + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param bytes encdata: encrypted data + """ + self._task_queue.put( + (CryptoAction.Decrypt, final_path, offsets, symkey, iv, + encdata) + ) diff --git a/blobxfer/download.py b/blobxfer/download.py index 970bf18..65878bb 100644 --- a/blobxfer/download.py +++ b/blobxfer/download.py @@ -43,9 +43,12 @@ except 
ImportError: # noqa import Queue as queue import threading +import time # non-stdlib imports import dateutil # local imports +import blobxfer.crypto.models +import blobxfer.crypto.operations import blobxfer.md5 import blobxfer.models import blobxfer.operations @@ -75,18 +78,20 @@ def __init__(self, general_options, creds, spec): :param blobxfer.models.AzureStorageCredentials creds: creds :param blobxfer.models.DownloadSpecification spec: download spec """ - self._md5_meta_lock = threading.Lock() - self._download_lock = threading.Lock() + self._time_start = None self._all_remote_files_processed = False + self._crypto_offload = None + self._md5_meta_lock = threading.Lock() self._md5_map = {} self._md5_offload = None - self._md5_check_thread = None + self._download_lock = threading.Lock() self._download_queue = queue.Queue() self._download_set = set() self._download_threads = [] self._download_count = 0 self._download_total_bytes = 0 self._download_terminate = False + self._dd_map = {} self._general_options = general_options self._creds = creds self._spec = spec @@ -177,46 +182,69 @@ def _post_md5_skip_on_check(self, filename, md5_match): else: self._add_to_download_queue(lpath, rfile) - def _initialize_check_md5_downloads_thread(self): + def _check_for_downloads_from_md5(self): # type: (Downloader) -> None - """Initialize the md5 done queue check thread + """Check queue for a file to download :param Downloader self: this """ - def _check_for_downloads_from_md5(self): - # type: (Downloader) -> None - """Check queue for a file to download - :param Downloader self: this - """ - cv = self._md5_offload.done_cv - while True: - with self._md5_meta_lock: - if (self._download_terminate or - (len(self._md5_map) == 0 and - self._all_remote_files_processed)): - break - result = None - cv.acquire() - while not self._download_terminate: - result = self._md5_offload.get_localfile_md5_done() - if result is None: - # use cv timeout due to possible non-wake while running - cv.wait(1) - # check for terminating conditions - with self._md5_meta_lock: - if (len(self._md5_map) == 0 and - self._all_remote_files_processed): - break - else: - break - cv.release() - if result is not None: - self._post_md5_skip_on_check(result[0], result[1]) + cv = self._md5_offload.done_cv + while True: + with self._md5_meta_lock: + if (self._download_terminate or + (self._all_remote_files_processed and + len(self._md5_map) == 0 and + len(self._download_set) == 0)): + break + result = None + cv.acquire() + while not self._download_terminate: + result = self._md5_offload.pop_done_queue() + if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(1) + # check for terminating conditions + with self._md5_meta_lock: + if (self._all_remote_files_processed and + len(self._md5_map) == 0 and + len(self._download_set) == 0): + break + else: + break + cv.release() + if result is not None: + self._post_md5_skip_on_check(result[0], result[1]) - self._md5_check_thread = threading.Thread( - target=_check_for_downloads_from_md5, - args=(self,) - ) - self._md5_check_thread.start() + def _check_for_crypto_done(self): + # type: (Downloader) -> None + """Check queue for crypto done + :param Downloader self: this + """ + cv = self._crypto_offload.done_cv + while True: + with self._download_lock: + if (self._download_terminate or + (self._all_remote_files_processed and + len(self._download_set) == 0)): + break + result = None + cv.acquire() + while not self._download_terminate: + result = self._crypto_offload.pop_done_queue() 
+ if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(1) + # check for terminating conditions + with self._download_lock: + if (self._all_remote_files_processed and + len(self._download_set) == 0): + break + else: + break + cv.release() + if result is not None: + with self._download_lock: + dd = self._dd_map[result[0]] + self._complete_chunk_download(result[1], result[2], dd) def _add_to_download_queue(self, lpath, rfile): # type: (Downloader, pathlib.Path, @@ -229,6 +257,9 @@ def _add_to_download_queue(self, lpath, rfile): # prepare remote file for download dd = blobxfer.models.DownloadDescriptor( lpath, rfile, self._spec.options) + if dd.entity.is_encrypted: + with self._download_lock: + self._dd_map[str(dd.final_path)] = dd # add download descriptor to queue self._download_queue.put(dd) @@ -250,7 +281,8 @@ def _wait_for_download_threads(self, terminate): :param Downloader self: this :param bool terminate: terminate threads """ - self._download_terminate = terminate + if terminate: + self._download_terminate = terminate for thr in self._download_threads: thr.join() @@ -273,17 +305,15 @@ def _worker_thread_download(self): # get download offsets offsets = dd.next_offsets() # check if all operations completed - if offsets is None and dd.outstanding_operations == 0: - # TODO - # 1. complete integrity checks - # 2. set file uid/gid - # 3. set file modes - # 4. move file to final path + if offsets is None and dd.all_operations_completed: + # finalize file + dd.finalize_file() + # accounting with self._download_lock: + if dd.entity.is_encrypted: + self._dd_map.pop(str(dd.final_path)) self._download_set.remove(dd.final_path) self._download_count += 1 - logger.info('download complete: {}/{} to {}'.format( - dd.entity.container, dd.entity.name, dd.final_path)) continue # re-enqueue for other threads to download self._download_queue.put(dd) @@ -291,39 +321,60 @@ def _worker_thread_download(self): continue # issue get range if dd.entity.mode == blobxfer.models.AzureStorageModes.File: - chunk = blobxfer.file.operations.get_file_range( + data = blobxfer.file.operations.get_file_range( dd.entity, offsets, self._general_options.timeout_sec) else: - chunk = blobxfer.blob.operations.get_blob_range( + data = blobxfer.blob.operations.get_blob_range( dd.entity, offsets, self._general_options.timeout_sec) # accounting with self._download_lock: self._download_total_bytes += offsets.num_bytes # decrypt if necessary if dd.entity.is_encrypted: - # TODO via crypto pool - # 1. compute rolling hmac if present - # - roll through any subsequent unchecked parts - # 2. 
decrypt chunk - pass - # compute rolling md5 via md5 pool - if dd.must_compute_md5: - # TODO - # - roll through any subsequent unchecked parts - pass - - # write data to disk - - # if no integrity check could be performed due to current - # integrity offset mismatch, add to unchecked set - - dd.dec_outstanding_operations() - - # pickle dd to resume file + # slice data to proper bounds + encdata = data[blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES:] + intdata = encdata + # get iv for chunk and compute hmac + if offsets.chunk_num == 0: + iv = dd.entity.encryption_metadata.content_encryption_iv + # integrity check for first chunk must include iv + intdata = iv + data + else: + iv = data[:blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES] + # integrity check data + dd.perform_chunked_integrity_check(offsets, intdata) + # decrypt data + if self._crypto_offload is not None: + self._crypto_offload.add_decrypt_chunk( + str(dd.final_path), offsets, + dd.entity.encryption_metadata.symmetric_key, + iv, encdata) + # data will be completed once retrieved from crypto queue + continue + else: + data = blobxfer.crypto.operations.aes_cbc_decrypt_data( + dd.entity.encryption_metadata.symmetric_key, + iv, encdata, offsets.unpad) + elif dd.must_compute_md5: + # rolling compute md5 + dd.perform_chunked_integrity_check(offsets, data) + # complete chunk download + self._complete_chunk_download(offsets, data, dd) -# rfile = dd._ase -# print('<<', rfile.container, rfile.name, rfile.lmt, rfile.size, -# rfile.md5, rfile.mode, rfile.encryption_metadata) + def _complete_chunk_download(self, offsets, data, dd): + # type: (Downloader, blobxfer.models.DownloadOffsets, bytes, + # blobxfer.models.DownloadDescriptor) -> None + """Complete chunk download + :param Downloader self: this + :param blobxfer.models.DownloadOffsets offsets: offsets + :param bytes data: data + :param blobxfer.models.DownloadDescriptor dd: download descriptor + """ + # write data to disk + dd.write_data(offsets, data) + # decrement outstanding operations + dd.dec_outstanding_operations() + # TODO pickle dd to resume file def _run(self): # type: (Downloader) -> None @@ -335,7 +386,14 @@ def _run(self): # initialize MD5 processes self._md5_offload = blobxfer.md5.LocalFileMd5Offload( num_workers=self._general_options.concurrency.md5_processes) - self._initialize_check_md5_downloads_thread() + self._md5_offload.initialize_check_thread( + self._check_for_downloads_from_md5) + # initialize crypto processes + if self._general_options.concurrency.crypto_processes > 0: + self._crypto_offload = blobxfer.crypto.operations.CryptoOffload( + num_workers=self._general_options.concurrency.crypto_processes) + self._crypto_offload.initialize_check_thread( + self._check_for_crypto_done) # initialize download threads self._initialize_download_threads() # iterate through source paths to download @@ -344,6 +402,7 @@ def _run(self): skipped_files = 0 total_size = 0 skipped_size = 0 + self._time_start = time.clock() for src in self._spec.sources: for rfile in src.files( self._creds, self._spec.options, self._general_options): @@ -369,33 +428,41 @@ def _run(self): self._add_to_download_queue(lpath, rfile) download_files = nfiles - skipped_files download_size = total_size - skipped_size + download_size_mib = download_size / 1048576 # clean up processes and threads with self._md5_meta_lock: self._all_remote_files_processed = True logger.debug( ('{0} remote files processed, waiting for download completion ' - 'of {1:.4f} MiB').format(nfiles, download_size / 1048576)) - 
self._md5_check_thread.join() + 'of {1:.4f} MiB').format(nfiles, download_size_mib)) self._wait_for_download_threads(terminate=False) - self._md5_offload.finalize_md5_processes() + end = time.clock() + runtime = end - self._time_start if (self._download_count != download_files or self._download_total_bytes != download_size): raise RuntimeError( 'download mismatch: [count={}/{} bytes={}/{}]'.format( self._download_count, download_files, self._download_total_bytes, download_size)) - logger.info('all files downloaded') + logger.info('all files downloaded: {0:.3f} sec {1:.4f} Mbps'.format( + runtime, download_size_mib * 8 / runtime)) def start(self): # type: (Downloader) -> None """Start the Downloader""" try: self._run() - except KeyboardInterrupt: - logger.error( - 'KeyboardInterrupt detected, force terminating ' - 'processes and threads (this may take a while)...') + except (KeyboardInterrupt, Exception) as ex: + if isinstance(ex, KeyboardInterrupt): + logger.error( + 'KeyboardInterrupt detected, force terminating ' + 'processes and threads (this may take a while)...') self._wait_for_download_threads(terminate=True) - self._md5_offload.finalize_md5_processes() + # TODO delete all temp files # TODO close resume file in finally? raise + finally: + if self._md5_offload is not None: + self._md5_offload.finalize_processes() + if self._crypto_offload is not None: + self._crypto_offload.finalize_processes() diff --git a/blobxfer/md5.py b/blobxfer/md5.py index 741e360..84e85cc 100644 --- a/blobxfer/md5.py +++ b/blobxfer/md5.py @@ -30,8 +30,6 @@ ) # stdlib imports import logging -import hashlib -import multiprocessing try: import queue except ImportError: # noqa @@ -39,21 +37,14 @@ # non-stdlib imports # local imports import blobxfer.download +import blobxfer.models +import blobxfer.offload import blobxfer.util # create logger logger = logging.getLogger(__name__) -def new_md5_hasher(): - # type: (None) -> md5.MD5 - """Create a new MD5 hasher - :rtype: md5.MD5 - :return: new MD5 hasher - """ - return hashlib.md5() - - def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): # type: (str, bool, int) -> str """Compute MD5 hash for file and encode as Base64 @@ -63,7 +54,7 @@ def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): :rtype: str :return: MD5 for file encoded as Base64 """ - hasher = new_md5_hasher() + hasher = blobxfer.util.new_md5_hasher() with open(filename, 'rb') as filedesc: while True: buf = filedesc.read(blocksize) @@ -85,12 +76,12 @@ def compute_md5_for_data_asbase64(data): :rtype: str :return: MD5 for data """ - hasher = new_md5_hasher() + hasher = blobxfer.util.new_md5_hasher() hasher.update(data) return blobxfer.util.base64_encode_as_string(hasher.digest()) -class LocalFileMd5Offload(object): +class LocalFileMd5Offload(blobxfer.offload._MultiprocessOffload): """LocalFileMd5Offload""" def __init__(self, num_workers): # type: (LocalFileMd5Offload, int) -> None @@ -98,52 +89,14 @@ def __init__(self, num_workers): :param LocalFileMd5Offload self: this :param int num_workers: number of worker processes """ - self._task_queue = multiprocessing.Queue() - self._done_queue = multiprocessing.Queue() - self._done_cv = multiprocessing.Condition() - self._term_signal = multiprocessing.Value('i', 0) - self._md5_procs = [] - self._initialize_md5_processes(num_workers) - - @property - def done_cv(self): - # type: (LocalFileMd5Offload) -> multiprocessing.Condition - """Get Download Done condition variable - :param LocalFileMd5Offload self: this - 
:rtype: multiprocessing.Condition - :return: cv for download done - """ - return self._done_cv - - def _initialize_md5_processes(self, num_workers): - # type: (LocalFileMd5Offload, int) -> None - """Initialize MD5 checking processes for files for download - :param LocalFileMd5Offload self: this - :param int num_workers: number of worker processes - """ - if num_workers is None or num_workers < 1: - raise ValueError('invalid num_workers: {}'.format(num_workers)) - for _ in range(num_workers): - proc = multiprocessing.Process( - target=self._worker_compute_md5_localfile_process) - proc.start() - self._md5_procs.append(proc) + super(LocalFileMd5Offload, self).__init__(num_workers, 'MD5') - def finalize_md5_processes(self): - # type: (LocalFileMd5Offload) -> None - """Finalize MD5 checking processes for files for download - :param LocalFileMd5Offload self: this - """ - self._term_signal.value = 1 - for proc in self._md5_procs: - proc.join() - - def _worker_compute_md5_localfile_process(self): + def _worker_process(self): # type: (LocalFileMd5Offload) -> None """Compute MD5 for local file :param LocalFileMd5Offload self: this """ - while self._term_signal.value == 0: + while not self.terminated: try: filename, remote_md5, pagealign = self._task_queue.get(True, 1) except queue.Empty: @@ -153,31 +106,17 @@ def _worker_compute_md5_localfile_process(self): md5, remote_md5, filename)) self._done_cv.acquire() self._done_queue.put((filename, md5 == remote_md5)) - self.done_cv.notify() - self.done_cv.release() - - def get_localfile_md5_done(self): - # type: (LocalFileMd5Offload) -> Tuple[str, bool] - """Get from done queue of local files with MD5 completed - :param LocalFileMd5Offload self: this - :rtype: tuple or None - :return: (local file path, md5 match) - """ - try: - return self._done_queue.get_nowait() - except queue.Empty: - return None + self._done_cv.notify() + self._done_cv.release() def add_localfile_for_md5_check(self, filename, remote_md5, mode): # type: (LocalFileMd5Offload, str, str, - # blobxfer.models.AzureStorageModes) -> bool - """Check an MD5 for a file for download + # blobxfer.models.AzureStorageModes) -> None + """Add a local file to MD5 check queue :param LocalFileMd5Offload self: this :param str filename: file to compute MD5 for :param str remote_md5: remote MD5 to compare against :param blobxfer.models.AzureStorageModes mode: mode - :rtype: bool - :return: MD5 match comparison """ if mode == blobxfer.models.AzureStorageModes.Page: pagealign = True diff --git a/blobxfer/models.py b/blobxfer/models.py index 74809eb..72b1291 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -41,6 +41,8 @@ except ImportError: # noqa import pathlib import multiprocessing +import tempfile +import threading # non-stdlib imports # local imports from .api import ( @@ -53,7 +55,6 @@ import blobxfer.blob.operations import blobxfer.file.operations import blobxfer.crypto.models -import blobxfer.md5 import blobxfer.util # create logger @@ -126,6 +127,7 @@ class AzureStorageModes(enum.Enum): ) DownloadOffsets = collections.namedtuple( 'DownloadOffsets', [ + 'chunk_num', 'fd_start', 'num_bytes', 'range_end', @@ -133,6 +135,14 @@ class AzureStorageModes(enum.Enum): 'unpad', ] ) +UncheckedChunk = collections.namedtuple( + 'UncheckedChunk', [ + 'data_len', + 'fd_start', + 'file_path', + 'temp', + ] +) class ConcurrencyOptions(object): @@ -147,16 +157,16 @@ def __init__(self, crypto_processes, md5_processes, transfer_threads): self.crypto_processes = crypto_processes self.md5_processes = md5_processes 
self.transfer_threads = transfer_threads + # allow crypto processes to be zero (which will inline crypto + # routines with main process) if self.crypto_processes is None or self.crypto_processes < 1: - self.crypto_processes = multiprocessing.cpu_count() // 2 - 1 - if self.crypto_processes < 1: - self.crypto_processes = 1 + self.crypto_processes = 0 if self.md5_processes is None or self.md5_processes < 1: self.md5_processes = multiprocessing.cpu_count() // 2 if self.md5_processes < 1: self.md5_processes = 1 if self.transfer_threads is None or self.transfer_threads < 1: - self.transfer_threads = multiprocessing.cpu_count() * 2 + self.transfer_threads = multiprocessing.cpu_count() * 3 class GeneralOptions(object): @@ -824,7 +834,8 @@ def __init__(self, lpath, ase, options): _tmp = list(lpath.parts[:-1]) _tmp.append(lpath.name + '.bxtmp') self.local_path = pathlib.Path(*_tmp) - self._meta_lock = multiprocessing.Lock() + self._meta_lock = threading.Lock() + self._hasher_lock = threading.Lock() self._ase = ase # calculate the total number of ops required for transfer self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) @@ -835,9 +846,10 @@ def __init__(self, lpath, ase, options): self._total_chunks = 0 self.hmac = None self.md5 = None - self.offset = 0 - self.integrity_counter = 0 - self.unchecked_chunks = set() + self._offset = 0 + self._chunk_num = 0 + self._next_integrity_chunk = 0 + self._unchecked_chunks = {} self._outstanding_ops = self._total_chunks self._completed_ops = 0 # initialize checkers and allocate space @@ -871,9 +883,15 @@ def _initialize_integrity_checkers(self, options): :param DownloadOptions options: download options """ if self._ase.is_encrypted: + # ensure symmetric key exists + if blobxfer.util.is_none_or_empty( + self._ase.encryption_metadata.symmetric_key): + raise RuntimeError( + 'symmetric key is invalid: provide RSA private key ' + 'or metadata corrupt') self.hmac = self._ase.encryption_metadata.initialize_hmac() if self.hmac is None and options.check_file_md5: - self.md5 = blobxfer.md5.new_md5_hasher() + self.md5 = blobxfer.util.new_md5_hasher() def _allocate_disk_space(self): # type: (DownloadDescriptor, int) -> None @@ -912,48 +930,182 @@ def next_offsets(self): :rtype: DownloadOffsets :return: download offsets """ - if self.offset >= self._ase.size: - return None - if self.offset + self._chunk_size > self._ase.size: - chunk = self._ase.size - self.offset - else: - chunk = self._chunk_size - # on download, num_bytes must be offset by -1 as the x-ms-range - # header expects it that way. x -> y bytes means first bits of the - # (x+1)th byte to the last bits of the (y+1)th byte. for example, - # 0 -> 511 means byte 1 to byte 512 - num_bytes = chunk - 1 - fd_start = self.offset - range_start = self.offset - if self._ase.is_encrypted: - # ensure start is AES block size aligned - range_start = range_start - (range_start % self._AES_BLOCKSIZE) - \ - self._AES_BLOCKSIZE - if range_start <= 0: - range_start = 0 - range_end = self.offset + num_bytes - self.offset += chunk - if self._ase.is_encrypted and self.offset >= self._ase.size: - unpad = True + with self._meta_lock: + if self._offset >= self._ase.size: + return None + if self._offset + self._chunk_size > self._ase.size: + chunk = self._ase.size - self._offset + else: + chunk = self._chunk_size + # on download, num_bytes must be offset by -1 as the x-ms-range + # header expects it that way. x -> y bytes means first bits of the + # (x+1)th byte to the last bits of the (y+1)th byte. 
for example, + # 0 -> 511 means byte 1 to byte 512 + num_bytes = chunk - 1 + chunk_num = self._chunk_num + fd_start = self._offset + range_start = self._offset + if self._ase.is_encrypted: + # ensure start is AES block size aligned + range_start = range_start - \ + (range_start % self._AES_BLOCKSIZE) - \ + self._AES_BLOCKSIZE + if range_start <= 0: + range_start = 0 + range_end = self._offset + num_bytes + self._offset += chunk + self._chunk_num += 1 + if self._ase.is_encrypted and self._offset >= self._ase.size: + unpad = True + else: + unpad = False + return DownloadOffsets( + chunk_num=chunk_num, + fd_start=fd_start, + num_bytes=chunk, + range_start=range_start, + range_end=range_end, + unpad=unpad, + ) + + def _postpone_integrity_check(self, offsets, data): + # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + """Postpone integrity check for chunk + :param DownloadDescriptor self: this + :param DownloadOffsets offsets: download offsets + :param bytes data: data + """ + if self.must_compute_md5: + with self.local_path.open('r+b') as fd: + fd.seek(offsets.fd_start, 0) + fd.write(data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=offsets.fd_start, + file_path=self.local_path, + temp=False, + ) else: - unpad = False - return DownloadOffsets( - fd_start=fd_start, - num_bytes=chunk, - range_start=range_start, - range_end=range_end, - unpad=unpad, - ) - - @property - def outstanding_operations(self): + fname = None + with tempfile.NamedTemporaryFile(mode='wb', delete=False) as fd: + fname = fd.name + fd.write(data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=0, + file_path=pathlib.Path(fname), + temp=True, + ) with self._meta_lock: - return self._outstanding_ops + self._unchecked_chunks[offsets.chunk_num] = unchecked + + def perform_chunked_integrity_check(self, offsets, data): + # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + """Hash data against stored MD5 hasher safely + :param DownloadDescriptor self: this + :param DownloadOffsets offsets: download offsets + :param bytes data: data + """ + self_check = False + hasher = self.hmac or self.md5 + # iterate from next chunk to be checked + while True: + ucc = None + with self._meta_lock: + chunk_num = self._next_integrity_chunk + # check if the next chunk is ready + if chunk_num in self._unchecked_chunks: + ucc = self._unchecked_chunks.pop(chunk_num) + elif chunk_num != offsets.chunk_num: + break + # prepare data for hashing + if ucc is None: + chunk = data + self_check = True + else: + with ucc.file_path.open('rb') as fd: + fd.seek(ucc.fd_start, 0) + chunk = fd.read(ucc.data_len) + if ucc.temp: + ucc.file_path.unlink() + # hash data and set next integrity chunk + with self._hasher_lock: + hasher.update(chunk) + with self._meta_lock: + self._next_integrity_chunk += 1 + # store data that hasn't been checked + if not self_check: + self._postpone_integrity_check(offsets, data) + + def write_data(self, offsets, data): + # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + """Postpone integrity check for chunk + :param DownloadDescriptor self: this + :param DownloadOffsets offsets: download offsets + :param bytes data: data + """ + with self.local_path.open('r+b') as fd: + fd.seek(offsets.fd_start, 0) + fd.write(data) + + def finalize_file(self): + # type: (DownloadDescriptor) -> Tuple[bool, str] + """Finalize file download + :param DownloadDescriptor self: this + :rtype: tuple + :return (if integrity check passed or not, message) + """ + # check final file integrity + check = 
False + msg = None + if self.hmac is not None: + mac = self._ase.encryption_metadata.encryption_authentication.\ + message_authentication_code + digest = blobxfer.util.base64_encode_as_string(self.hmac.digest()) + if digest == mac: + check = True + msg = '{}: {}, {} {} {}'.format( + self._ase.encryption_metadata.encryption_authentication. + algorithm, + 'OK' if check else 'MISMATCH', + self._ase.name, + digest, + mac, + ) + elif self.md5 is not None: + digest = blobxfer.util.base64_encode_as_string(self.md5.digest()) + if digest == self._ase.md5: + check = True + msg = 'MD5: {}, {} {} {}'.format( + 'OK' if check else 'MISMATCH', + self._ase.name, + digest, + self._ase.md5, + ) + else: + check = True + msg = 'MD5: SKIPPED, {} None {}'.format( + self._ase.name, + self._ase.md5 + ) + # cleanup if download failed + if not check: + logger.error(msg) + # delete temp download file + self.local_path.unlink() + return + logger.debug(msg) + + # TODO set file uid/gid and mode + + # move temp download file to final path + self.local_path.rename(self.final_path) @property - def completed_operations(self): + def all_operations_completed(self): with self._meta_lock: - return self._completed_ops + return (self._outstanding_ops == 0 and + len(self._unchecked_chunks) == 0) def dec_outstanding_operations(self): with self._meta_lock: diff --git a/blobxfer/offload.py b/blobxfer/offload.py new file mode 100644 index 0000000..80f84fe --- /dev/null +++ b/blobxfer/offload.py @@ -0,0 +1,127 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
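The finalize_file logic above reduces to comparing base64-encoded digests: the HMAC-SHA256 (or MD5) computed over the downloaded bytes against the value stored in the entity metadata. A minimal standalone sketch of that comparison, assuming a raw signing key, the downloaded bytes, and the stored base64 MAC string (the names here are illustrative, not the module's API):

    import base64
    import hashlib
    import hmac

    def hmac_sha256_matches(sign_key, data, stored_mac_b64):
        # compute HMAC-SHA256 over the downloaded bytes, base64-encode the
        # digest, and compare it to the stored message authentication code
        digest = base64.b64encode(
            hmac.new(sign_key, data, digestmod=hashlib.sha256).digest()
        ).decode('ascii')
        return digest == stored_mac_b64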
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import logging +import multiprocessing +import threading +try: + import queue +except ImportError: # noqa + import Queue as queue + +# create logger +logger = logging.getLogger(__name__) + + +class _MultiprocessOffload(object): + def __init__(self, num_workers, description=None): + # type: (_MultiprocessOffload, int, str) -> None + """Ctor for Crypto Offload + :param _MultiprocessOffload self: this + :param int num_workers: number of worker processes + :param str description: description + """ + self._task_queue = multiprocessing.Queue() + self._done_queue = multiprocessing.Queue() + self._done_cv = multiprocessing.Condition() + self._term_signal = multiprocessing.Value('i', 0) + self._procs = [] + self._check_thread = None + self._initialize_processes(num_workers, description) + + @property + def done_cv(self): + # type: (_MultiprocessOffload) -> multiprocessing.Condition + """Get Done condition variable + :param _MultiprocessOffload self: this + :rtype: multiprocessing.Condition + :return: cv for download done + """ + return self._done_cv + + @property + def terminated(self): + # type: (_MultiprocessOffload) -> bool + """Check if terminated + :param _MultiprocessOffload self: this + :rtype: bool + :return: if terminated + """ + return self._term_signal.value == 1 + + def _initialize_processes(self, num_workers, description): + # type: (_MultiprocessOffload, int, str) -> None + """Initialize processes + :param _MultiprocessOffload self: this + :param int num_workers: number of worker processes + :param str description: description + """ + if num_workers is None or num_workers < 1: + raise ValueError('invalid num_workers: {}'.format(num_workers)) + logger.debug('initializing {}{} processes'.format( + num_workers, ' ' + description if not None else '')) + for _ in range(num_workers): + proc = multiprocessing.Process(target=self._worker_process) + proc.start() + self._procs.append(proc) + + def finalize_processes(self): + # type: (_MultiprocessOffload) -> None + """Finalize processes + :param _MultiprocessOffload self: this + """ + self._term_signal.value = 1 + if self._check_thread is not None: + self._check_thread.join() + for proc in self._procs: + proc.join() + + def pop_done_queue(self): + # type: (_MultiprocessOffload) -> object + """Get item from done queue + :param _MultiprocessOffload self: this + :rtype: object or None + :return: object from done queue, if exists + """ + try: + return self._done_queue.get_nowait() + except queue.Empty: + return None + + def initialize_check_thread(self, check_func): + # type: (_MultiprocessOffload, object) -> None + """Initialize the crypto done queue check thread + :param Downloader self: this + :param object check_func: check function + """ + self._check_thread = threading.Thread(target=check_func) + self._check_thread.start() diff --git a/blobxfer/util.py b/blobxfer/util.py index c8885f7..eec47a9 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -32,6 +32,7 @@ import base64 import copy import dateutil +import hashlib import logging import logging.handlers import mimetypes @@ -164,6 +165,15 @@ def base64_decode_string(string): return base64.b64decode(string) +def new_md5_hasher(): + # type: (None) -> md5.MD5 + """Create a new MD5 hasher + :rtype: md5.MD5 + 
:return: new MD5 hasher + """ + return hashlib.md5() + + def page_align_content_length(length): # type: (int) -> int """Compute page boundary alignment diff --git a/cli/settings.py b/cli/settings.py index 448d0a8..8e5db75 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -265,9 +265,9 @@ def create_download_specifications(config): elif confmode == 'block': mode = blobxfer.models.AzureStorageModes.Block elif confmode == 'file': - mode == blobxfer.models.AzureStorageModes.File + mode = blobxfer.models.AzureStorageModes.File elif confmode == 'page': - mode == blobxfer.models.AzureStorageModes.Page + mode = blobxfer.models.AzureStorageModes.Page else: raise ValueError('unknown mode: {}'.format(confmode)) # load RSA private key PEM file if specified diff --git a/tests/test_blobxfer_blob_append_operations.py b/tests/test_blobxfer_blob_append_operations.py index b4ad982..e207057 100644 --- a/tests/test_blobxfer_blob_append_operations.py +++ b/tests/test_blobxfer_blob_append_operations.py @@ -4,7 +4,6 @@ # stdlib imports # non-stdlib imports import azure.storage -import pytest # local imports import blobxfer.models as models # module under test diff --git a/tests/test_blobxfer_crypto_operations.py b/tests/test_blobxfer_crypto_operations.py index a37be4f..88990e5 100644 --- a/tests/test_blobxfer_crypto_operations.py +++ b/tests/test_blobxfer_crypto_operations.py @@ -46,6 +46,42 @@ def test_rsa_encrypt_decrypt_keys(): def test_pkcs7_padding(): buf = os.urandom(32) - pbuf = ops.pad_pkcs7(buf) - buf2 = ops.unpad_pkcs7(pbuf) + pbuf = ops.pkcs7_pad(buf) + buf2 = ops.pkcs7_unpad(pbuf) assert buf == buf2 + + +def test_aes_cbc_encryption(): + enckey = ops.aes256_generate_random_key() + assert len(enckey) == ops._AES256_KEYLENGTH_BYTES + + # test random binary data, unaligned + iv = os.urandom(16) + plaindata = os.urandom(31) + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, True) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, True) + assert decdata == plaindata + + # test random binary data aligned on boundary + plaindata = os.urandom(32) + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, True) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, True) + assert decdata == plaindata + + # test "text" data + plaintext = 'attack at dawn!' 
+ plaindata = plaintext.encode('utf8') + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, True) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, True) + assert decdata == plaindata + assert plaindata.decode('utf8') == plaintext + + # test unpadded + plaindata = os.urandom(32) + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, False) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, False) + assert decdata == plaindata diff --git a/tests/test_blobxfer_download.py b/tests/test_blobxfer_download.py index 6e12bcc..aef5e79 100644 --- a/tests/test_blobxfer_download.py +++ b/tests/test_blobxfer_download.py @@ -197,27 +197,18 @@ def test_post_md5_skip_on_check(): assert d._add_to_download_queue.call_count == 1 -def test_initialize_check_md5_downloads_thread(): +def test_check_for_downloads_from_md5(): lpath = 'lpath' d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._md5_map[lpath] = mock.MagicMock() d._download_set.add(pathlib.Path(lpath)) d._md5_offload = mock.MagicMock() d._md5_offload.done_cv = multiprocessing.Condition() - d._md5_offload.get_localfile_md5_done = mock.MagicMock() - d._md5_offload.get_localfile_md5_done.side_effect = [None, (lpath, False)] + d._md5_offload.pop_done_queue.side_effect = [None, (lpath, False)] d._add_to_download_queue = mock.MagicMock() - d._initialize_check_md5_downloads_thread() - while len(d._md5_map) > 0: - d._md5_offload.done_cv.acquire() - d._md5_offload.done_cv.notify() - d._md5_offload.done_cv.release() - d._all_remote_files_processed = True - d._md5_offload.done_cv.acquire() - d._md5_offload.done_cv.notify() - d._md5_offload.done_cv.release() - d._md5_check_thread.join() + with pytest.raises(StopIteration): + d._check_for_downloads_from_md5() assert d._add_to_download_queue.call_count == 1 @@ -237,14 +228,15 @@ def test_initialize_and_terminate_download_threads(): assert not thr.is_alive() +@mock.patch('time.clock') @mock.patch('blobxfer.md5.LocalFileMd5Offload') @mock.patch('blobxfer.blob.operations.list_blobs') @mock.patch('blobxfer.operations.ensure_local_destination', return_value=True) -def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): +def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - d._initialize_check_md5_downloads_thread = mock.MagicMock() d._initialize_download_threads = mock.MagicMock() - d._md5_check_thread = mock.MagicMock() + patched_lfmo._check_thread = mock.MagicMock() + d._general_options.concurrency.crypto_processes = 0 d._spec.sources = [] d._spec.options = mock.MagicMock() d._spec.options.chunk_size_bytes = 1 @@ -270,12 +262,14 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d._check_download_conditions = mock.MagicMock() d._check_download_conditions.return_value = dl.DownloadAction.Skip + patched_tc.side_effect = [1, 2] d.start() assert d._pre_md5_skip_on_check.call_count == 0 patched_lb.side_effect = [[b]] d._all_remote_files_processed = False d._check_download_conditions.return_value = dl.DownloadAction.CheckMd5 + patched_tc.side_effect = [1, 2] with pytest.raises(RuntimeError): d.start() assert d._pre_md5_skip_on_check.call_count == 1 @@ -284,6 +278,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): patched_lb.side_effect = [[b]] d._all_remote_files_processed = False d._check_download_conditions.return_value = dl.DownloadAction.Download + patched_tc.side_effect = [1, 2] 
with pytest.raises(RuntimeError): d.start() assert d._download_queue.qsize() == 1 diff --git a/tests/test_blobxfer_md5.py b/tests/test_blobxfer_md5.py index 7a37072..c38e758 100644 --- a/tests/test_blobxfer_md5.py +++ b/tests/test_blobxfer_md5.py @@ -36,7 +36,7 @@ def test_done_cv(): assert a.done_cv == a._done_cv finally: if a: - a.finalize_md5_processes() + a.finalize_processes() def test_finalize_md5_processes(): @@ -48,9 +48,9 @@ def test_finalize_md5_processes(): a = md5.LocalFileMd5Offload(num_workers=1) finally: if a: - a.finalize_md5_processes() + a.finalize_processes() - for proc in a._md5_procs: + for proc in a._procs: assert not proc.is_alive() @@ -63,7 +63,7 @@ def test_from_add_to_done_non_pagealigned(tmpdir): a = None try: a = md5.LocalFileMd5Offload(num_workers=1) - result = a.get_localfile_md5_done() + result = a.pop_done_queue() assert result is None a.add_localfile_for_md5_check( @@ -71,7 +71,7 @@ def test_from_add_to_done_non_pagealigned(tmpdir): i = 33 checked = False while i > 0: - result = a.get_localfile_md5_done() + result = a.pop_done_queue() if result is None: time.sleep(0.3) i -= 1 @@ -84,7 +84,7 @@ def test_from_add_to_done_non_pagealigned(tmpdir): assert checked finally: if a: - a.finalize_md5_processes() + a.finalize_processes() def test_from_add_to_done_pagealigned(tmpdir): @@ -96,7 +96,7 @@ def test_from_add_to_done_pagealigned(tmpdir): a = None try: a = md5.LocalFileMd5Offload(num_workers=1) - result = a.get_localfile_md5_done() + result = a.pop_done_queue() assert result is None a.add_localfile_for_md5_check( @@ -104,7 +104,7 @@ def test_from_add_to_done_pagealigned(tmpdir): i = 33 checked = False while i > 0: - result = a.get_localfile_md5_done() + result = a.pop_done_queue() if result is None: time.sleep(0.3) i -= 1 @@ -117,4 +117,4 @@ def test_from_add_to_done_pagealigned(tmpdir): assert checked finally: if a: - a.finalize_md5_processes() + a.finalize_processes() diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index f0e636d..897e28d 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -25,9 +25,9 @@ def test_concurrency_options(patched_cc): transfer_threads=-2, ) - assert a.crypto_processes == 1 + assert a.crypto_processes == 0 assert a.md5_processes == 1 - assert a.transfer_threads == 2 + assert a.transfer_threads == 3 def test_general_options(): @@ -359,12 +359,16 @@ def test_downloaddescriptor(tmpdir): ase = models.AzureStorageEntity('cont') ase._size = 1024 ase._encryption = mock.MagicMock() + with pytest.raises(RuntimeError): + d = models.DownloadDescriptor(lp, ase, opts) + + ase._encryption.symmetric_key = b'123' d = models.DownloadDescriptor(lp, ase, opts) assert d.entity == ase assert not d.must_compute_md5 assert d._total_chunks == 64 - assert d.offset == 0 + assert d._offset == 0 assert d.final_path == lp assert str(d.local_path) == str(lp) + '.bxtmp' assert d.local_path.stat().st_size == 1024 - 16 @@ -400,6 +404,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): offsets = d.next_offsets() assert d._total_chunks == 1 + assert offsets.chunk_num == 0 assert offsets.fd_start == 0 assert offsets.num_bytes == 128 assert offsets.range_start == 0 @@ -416,6 +421,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 + assert offsets.chunk_num == 0 assert offsets.fd_start == 0 assert offsets.num_bytes == 1 assert offsets.range_start == 0 @@ -427,6 +433,7 @@ def 
test_downloaddescriptor_next_offsets(tmpdir): d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 + assert offsets.chunk_num == 0 assert offsets.fd_start == 0 assert offsets.num_bytes == 256 assert offsets.range_start == 0 @@ -438,12 +445,14 @@ def test_downloaddescriptor_next_offsets(tmpdir): d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 2 + assert offsets.chunk_num == 0 assert offsets.fd_start == 0 assert offsets.num_bytes == 256 assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad offsets = d.next_offsets() + assert offsets.chunk_num == 1 assert offsets.fd_start == 256 assert offsets.num_bytes == 16 assert offsets.range_start == 256 @@ -452,10 +461,12 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert d.next_offsets() is None ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' ase._size = 128 d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 + assert offsets.chunk_num == 0 assert offsets.fd_start == 0 assert offsets.num_bytes == 128 assert offsets.range_start == 0 @@ -467,6 +478,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 + assert offsets.chunk_num == 0 assert offsets.fd_start == 0 assert offsets.num_bytes == 256 assert offsets.range_start == 0 @@ -478,12 +490,14 @@ def test_downloaddescriptor_next_offsets(tmpdir): d = models.DownloadDescriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 2 + assert offsets.chunk_num == 0 assert offsets.fd_start == 0 assert offsets.num_bytes == 256 assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad offsets = d.next_offsets() + assert offsets.chunk_num == 1 assert offsets.fd_start == 256 assert offsets.num_bytes == 32 assert offsets.range_start == 256 - 16 From 85895c1e3cbbea3c88244234930a265a4a986fa1 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Sun, 26 Feb 2017 20:15:43 -0800 Subject: [PATCH 12/47] Refactor download to module - Add more UT coverage --- blobxfer/api.py | 2 +- blobxfer/blob/operations.py | 4 +- blobxfer/crypto/operations.py | 9 +- blobxfer/download/__init__.py | 0 blobxfer/download/models.py | 377 ++++++++++++++++++ .../{download.py => download/operations.py} | 13 +- blobxfer/file/operations.py | 4 +- blobxfer/md5.py | 3 +- blobxfer/models.py | 319 --------------- blobxfer/offload.py | 18 +- setup.py | 1 + tests/test_blobxfer_crypto_operations.py | 43 +- tests/test_blobxfer_download_models.py | 377 ++++++++++++++++++ ...y => test_blobxfer_download_operations.py} | 55 +-- tests/test_blobxfer_models.py | 156 -------- tests/test_blobxfer_offload.py | 36 ++ 16 files changed, 890 insertions(+), 527 deletions(-) create mode 100644 blobxfer/download/__init__.py create mode 100644 blobxfer/download/models.py rename blobxfer/{download.py => download/operations.py} (97%) create mode 100644 tests/test_blobxfer_download_models.py rename tests/{test_blobxfer_download.py => test_blobxfer_download_operations.py} (84%) create mode 100644 tests/test_blobxfer_offload.py diff --git a/blobxfer/api.py b/blobxfer/api.py index 69444ae..57fcf09 100644 --- a/blobxfer/api.py +++ b/blobxfer/api.py @@ -45,6 +45,6 @@ create_client as create_file_client ) -from .download import ( # noqa +from .download.operations import ( # noqa Downloader ) diff --git a/blobxfer/blob/operations.py 
b/blobxfer/blob/operations.py index 4a8f0eb..27c19c7 100644 --- a/blobxfer/blob/operations.py +++ b/blobxfer/blob/operations.py @@ -108,10 +108,10 @@ def list_blobs(client, container, prefix, mode, timeout=None): def get_blob_range(ase, offsets, timeout=None): # type: (blobxfer.models.AzureStorageEntity, - # blobxfer.models.DownloadOffsets, int) -> bytes + # blobxfer.download.models.DownloadOffsets, int) -> bytes """Retrieve blob range :param blobxfer.models.AzureStorageEntity ase: AzureStorageEntity - :param blobxfer.models.DownloadOffsets offsets: downlaod offsets + :param blobxfer.download.models.DownloadOffsets offsets: download offsets :param int timeout: timeout :rtype: bytes :return: content for blob range diff --git a/blobxfer/crypto/operations.py b/blobxfer/crypto/operations.py index deeb287..2534148 100644 --- a/blobxfer/crypto/operations.py +++ b/blobxfer/crypto/operations.py @@ -230,7 +230,8 @@ def __init__(self, num_workers): :param CryptoOffload self: this :param int num_workers: number of worker processes """ - super(CryptoOffload, self).__init__(num_workers, 'Crypto') + super(CryptoOffload, self).__init__( + self._worker_process, num_workers, 'Crypto') def _worker_process(self): # type: (CryptoOffload) -> None @@ -256,12 +257,12 @@ def _worker_process(self): def add_decrypt_chunk( self, final_path, offsets, symkey, iv, encdata): - # type: (CryptoOffload, str, blobxfer.models.DownloadOffsets, bytes, - # bytes, bytes) -> None + # type: (CryptoOffload, str, blobxfer.download.models.DownloadOffsets, + # bytes, bytes, bytes) -> None """Add a chunk to decrypt :param CryptoOffload self: this :param str final_path: final path - :param blobxfer.models.DownloadOffsets offsets: offsets + :param blobxfer.download.models.DownloadOffsets offsets: offsets :param bytes symkey: symmetric key :param bytes iv: initialization vector :param bytes encdata: encrypted data diff --git a/blobxfer/download/__init__.py b/blobxfer/download/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/download/models.py b/blobxfer/download/models.py new file mode 100644 index 0000000..39b641d --- /dev/null +++ b/blobxfer/download/models.py @@ -0,0 +1,377 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
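The CryptoOffload change above follows from the offload base class now taking the worker target explicitly: each worker process drains a task queue and pushes results to a done queue until a shared termination flag is set. A minimal standalone sketch of that pattern, with hypothetical names rather than the module's actual API:

    import multiprocessing
    try:
        import queue
    except ImportError:  # Python 2
        import Queue as queue

    def run_worker(task_queue, done_queue, term_signal):
        # drain tasks until the parent flips the termination flag
        while term_signal.value == 0:
            try:
                item = task_queue.get(timeout=1)
            except queue.Empty:
                continue
            # a real worker would decrypt or hash here; echo back for the sketch
            done_queue.put(item)

    if __name__ == '__main__':
        tasks = multiprocessing.Queue()
        done = multiprocessing.Queue()
        term = multiprocessing.Value('i', 0)
        proc = multiprocessing.Process(
            target=run_worker, args=(tasks, done, term))
        proc.start()
        tasks.put(b'chunk')
        print(done.get())
        term.value = 1
        proc.join()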
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import logging +import math +import os +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +import tempfile +import threading +# non-stdlib imports +# local imports +import blobxfer.blob.operations +import blobxfer.file.operations +import blobxfer.crypto.models +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + +# named tuples +DownloadOffsets = collections.namedtuple( + 'DownloadOffsets', [ + 'chunk_num', + 'fd_start', + 'num_bytes', + 'range_end', + 'range_start', + 'unpad', + ] +) +UncheckedChunk = collections.namedtuple( + 'UncheckedChunk', [ + 'data_len', + 'fd_start', + 'file_path', + 'temp', + ] +) + + +class DownloadDescriptor(object): + """Download Descriptor""" + + _AES_BLOCKSIZE = blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES + + def __init__(self, lpath, ase, options): + # type: (DownloadDescriptior, pathlib.Path, AzureStorageEntity, + # DownloadOptions) -> None + """Ctor for DownloadDescriptor + :param DownloadDescriptor self: this + :param pathlib.Path lpath: local path + :param AzureStorageEntity ase: Azure Storage Entity + :param DownloadOptions options: download options + """ + self.final_path = lpath + # create path holding the temporary file to download to + _tmp = list(lpath.parts[:-1]) + _tmp.append(lpath.name + '.bxtmp') + self.local_path = pathlib.Path(*_tmp) + self._meta_lock = threading.Lock() + self._hasher_lock = threading.Lock() + self._ase = ase + # calculate the total number of ops required for transfer + self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) + try: + self._total_chunks = int( + math.ceil(self._ase.size / self._chunk_size)) + except ZeroDivisionError: + self._total_chunks = 0 + self.hmac = None + self.md5 = None + self._offset = 0 + self._chunk_num = 0 + self._next_integrity_chunk = 0 + self._unchecked_chunks = {} + self._outstanding_ops = self._total_chunks + self._completed_ops = 0 + # initialize checkers and allocate space + self._initialize_integrity_checkers(options) + self._allocate_disk_space() + + @property + def entity(self): + # type: (DownloadDescriptor) -> AzureStorageEntity + """Get linked AzureStorageEntity + :param DownloadDescriptor self: this + :rtype: AzureStorageEntity + :return: AzureStorageEntity + """ + return self._ase + + @property + def must_compute_md5(self): + # type: (DownloadDescriptor) -> bool + """Check if MD5 must be computed + :param DownloadDescriptor self: this + :rtype: bool + :return: if MD5 must be computed + """ + return self.md5 is not None + + @property + def all_operations_completed(self): + # type: (DownloadDescriptor) -> bool + """All operations are completed + :param DownloadDescriptor self: this + :rtype: bool + :return: if all operations completed + """ + with self._meta_lock: + return (self._outstanding_ops == 0 and + len(self._unchecked_chunks) == 0) + + def dec_outstanding_operations(self): + # type: (DownloadDescriptor) -> None + """Decrement outstanding operations (and increment completed ops) + :param DownloadDescriptor self: this + """ + with self._meta_lock: + self._outstanding_ops -= 1 + self._completed_ops += 1 + + def _initialize_integrity_checkers(self, options): + # type: (DownloadDescriptor, DownloadOptions) -> None + 
"""Initialize file integrity checkers + :param DownloadDescriptor self: this + :param DownloadOptions options: download options + """ + if self._ase.is_encrypted: + # ensure symmetric key exists + if blobxfer.util.is_none_or_empty( + self._ase.encryption_metadata.symmetric_key): + raise RuntimeError( + 'symmetric key is invalid: provide RSA private key ' + 'or metadata corrupt') + self.hmac = self._ase.encryption_metadata.initialize_hmac() + if self.hmac is None and options.check_file_md5: + self.md5 = blobxfer.util.new_md5_hasher() + + def _allocate_disk_space(self): + # type: (DownloadDescriptor, int) -> None + """Perform file allocation (possibly sparse) + :param DownloadDescriptor self: this + :param int size: size + """ + size = self._ase.size + # compute size + if size > 0: + if self._ase.is_encrypted: + # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs + allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ + self._AES_BLOCKSIZE + else: + allocatesize = size + if allocatesize < 0: + allocatesize = 0 + else: + allocatesize = 0 + # create parent path + self.local_path.parent.mkdir(mode=0o750, parents=True, exist_ok=True) + # allocate file + with self.local_path.open('wb') as fd: + if allocatesize > 0: + try: + os.posix_fallocate(fd.fileno(), 0, allocatesize) + except AttributeError: + fd.seek(allocatesize - 1) + fd.write(b'\0') + + def next_offsets(self): + # type: (DownloadDescriptor) -> DownloadOffsets + """Retrieve the next offsets + :param DownloadDescriptor self: this + :rtype: DownloadOffsets + :return: download offsets + """ + with self._meta_lock: + if self._offset >= self._ase.size: + return None + if self._offset + self._chunk_size > self._ase.size: + chunk = self._ase.size - self._offset + else: + chunk = self._chunk_size + # on download, num_bytes must be offset by -1 as the x-ms-range + # header expects it that way. x -> y bytes means first bits of the + # (x+1)th byte to the last bits of the (y+1)th byte. 
for example, + # 0 -> 511 means byte 1 to byte 512 + num_bytes = chunk - 1 + chunk_num = self._chunk_num + fd_start = self._offset + range_start = self._offset + if self._ase.is_encrypted: + # ensure start is AES block size aligned + range_start = range_start - \ + (range_start % self._AES_BLOCKSIZE) - \ + self._AES_BLOCKSIZE + if range_start <= 0: + range_start = 0 + range_end = self._offset + num_bytes + self._offset += chunk + self._chunk_num += 1 + if self._ase.is_encrypted and self._offset >= self._ase.size: + unpad = True + else: + unpad = False + return DownloadOffsets( + chunk_num=chunk_num, + fd_start=fd_start, + num_bytes=chunk, + range_start=range_start, + range_end=range_end, + unpad=unpad, + ) + + def _postpone_integrity_check(self, offsets, data): + # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + """Postpone integrity check for chunk + :param DownloadDescriptor self: this + :param DownloadOffsets offsets: download offsets + :param bytes data: data + """ + if self.must_compute_md5: + with self.local_path.open('r+b') as fd: + fd.seek(offsets.fd_start, 0) + fd.write(data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=offsets.fd_start, + file_path=self.local_path, + temp=False, + ) + else: + fname = None + with tempfile.NamedTemporaryFile(mode='wb', delete=False) as fd: + fname = fd.name + fd.write(data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=0, + file_path=pathlib.Path(fname), + temp=True, + ) + with self._meta_lock: + self._unchecked_chunks[offsets.chunk_num] = unchecked + + def perform_chunked_integrity_check(self, offsets, data): + # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + """Hash data against stored MD5 hasher safely + :param DownloadDescriptor self: this + :param DownloadOffsets offsets: download offsets + :param bytes data: data + """ + self_check = False + hasher = self.hmac or self.md5 + # iterate from next chunk to be checked + while True: + ucc = None + with self._meta_lock: + chunk_num = self._next_integrity_chunk + # check if the next chunk is ready + if chunk_num in self._unchecked_chunks: + ucc = self._unchecked_chunks.pop(chunk_num) + elif chunk_num != offsets.chunk_num: + break + # prepare data for hashing + if ucc is None: + chunk = data + self_check = True + else: + with ucc.file_path.open('rb') as fd: + fd.seek(ucc.fd_start, 0) + chunk = fd.read(ucc.data_len) + if ucc.temp: + ucc.file_path.unlink() + # hash data and set next integrity chunk + with self._hasher_lock: + hasher.update(chunk) + with self._meta_lock: + self._next_integrity_chunk += 1 + # store data that hasn't been checked + if not self_check: + self._postpone_integrity_check(offsets, data) + + def write_data(self, offsets, data): + # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + """Postpone integrity check for chunk + :param DownloadDescriptor self: this + :param DownloadOffsets offsets: download offsets + :param bytes data: data + """ + with self.local_path.open('r+b') as fd: + fd.seek(offsets.fd_start, 0) + fd.write(data) + + def finalize_file(self): + # type: (DownloadDescriptor) -> None + """Finalize file download + :param DownloadDescriptor self: this + """ + # check final file integrity + check = False + msg = None + if self.hmac is not None: + mac = self._ase.encryption_metadata.encryption_authentication.\ + message_authentication_code + digest = blobxfer.util.base64_encode_as_string(self.hmac.digest()) + if digest == mac: + check = True + msg = '{}: {}, {} {} {}'.format( + 
self._ase.encryption_metadata.encryption_authentication. + algorithm, + 'OK' if check else 'MISMATCH', + self._ase.name, + digest, + mac, + ) + elif self.md5 is not None: + digest = blobxfer.util.base64_encode_as_string(self.md5.digest()) + if digest == self._ase.md5: + check = True + msg = 'MD5: {}, {} {} {}'.format( + 'OK' if check else 'MISMATCH', + self._ase.name, + digest, + self._ase.md5, + ) + else: + check = True + msg = 'MD5: SKIPPED, {} None {}'.format( + self._ase.name, + self._ase.md5 + ) + # cleanup if download failed + if not check: + logger.error(msg) + # delete temp download file + self.local_path.unlink() + return + logger.debug(msg) + + # TODO set file uid/gid and mode + + # move temp download file to final path + self.local_path.rename(self.final_path) diff --git a/blobxfer/download.py b/blobxfer/download/operations.py similarity index 97% rename from blobxfer/download.py rename to blobxfer/download/operations.py index 65878bb..ac36ebe 100644 --- a/blobxfer/download.py +++ b/blobxfer/download/operations.py @@ -49,8 +49,8 @@ # local imports import blobxfer.crypto.models import blobxfer.crypto.operations +import blobxfer.download.models import blobxfer.md5 -import blobxfer.models import blobxfer.operations import blobxfer.blob.operations import blobxfer.file.operations @@ -255,7 +255,7 @@ def _add_to_download_queue(self, lpath, rfile): :param blobxfer.models.AzureStorageEntity rfile: remote file """ # prepare remote file for download - dd = blobxfer.models.DownloadDescriptor( + dd = blobxfer.download.models.DownloadDescriptor( lpath, rfile, self._spec.options) if dd.entity.is_encrypted: with self._download_lock: @@ -362,13 +362,14 @@ def _worker_thread_download(self): self._complete_chunk_download(offsets, data, dd) def _complete_chunk_download(self, offsets, data, dd): - # type: (Downloader, blobxfer.models.DownloadOffsets, bytes, - # blobxfer.models.DownloadDescriptor) -> None + # type: (Downloader, blobxfer.download.models.DownloadOffsets, bytes, + # blobxfer.models.download.DownloadDescriptor) -> None """Complete chunk download :param Downloader self: this - :param blobxfer.models.DownloadOffsets offsets: offsets + :param blobxfer.download.models.DownloadOffsets offsets: offsets :param bytes data: data - :param blobxfer.models.DownloadDescriptor dd: download descriptor + :param blobxfer.models.download.DownloadDescriptor dd: + download descriptor """ # write data to disk dd.write_data(offsets, data) diff --git a/blobxfer/file/operations.py b/blobxfer/file/operations.py index 09f7d68..ec654dd 100644 --- a/blobxfer/file/operations.py +++ b/blobxfer/file/operations.py @@ -152,10 +152,10 @@ def list_files(client, fileshare, prefix, timeout=None): def get_file_range(ase, offsets, timeout=None): # type: (blobxfer.models.AzureStorageEntity, - # blobxfer.models.DownloadOffsets, int) -> bytes + # blobxfer.download.models.DownloadOffsets, int) -> bytes """Retrieve file range :param blobxfer.models.AzureStorageEntity ase: AzureStorageEntity - :param blobxfer.models.DownloadOffsets offsets: downlaod offsets + :param blobxfer.download.models.DownloadOffsets offsets: download offsets :param int timeout: timeout :rtype: bytes :return: content for file range diff --git a/blobxfer/md5.py b/blobxfer/md5.py index 84e85cc..1c403cc 100644 --- a/blobxfer/md5.py +++ b/blobxfer/md5.py @@ -89,7 +89,8 @@ def __init__(self, num_workers): :param LocalFileMd5Offload self: this :param int num_workers: number of worker processes """ - super(LocalFileMd5Offload, self).__init__(num_workers, 'MD5') + 
super(LocalFileMd5Offload, self).__init__( + self._worker_process, num_workers, 'MD5') def _worker_process(self): # type: (LocalFileMd5Offload) -> None diff --git a/blobxfer/models.py b/blobxfer/models.py index 72b1291..8a91885 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -34,15 +34,12 @@ import enum import fnmatch import logging -import math import os try: import pathlib2 as pathlib except ImportError: # noqa import pathlib import multiprocessing -import tempfile -import threading # non-stdlib imports # local imports from .api import ( @@ -125,24 +122,6 @@ class AzureStorageModes(enum.Enum): 'relative_path', ] ) -DownloadOffsets = collections.namedtuple( - 'DownloadOffsets', [ - 'chunk_num', - 'fd_start', - 'num_bytes', - 'range_end', - 'range_start', - 'unpad', - ] -) -UncheckedChunk = collections.namedtuple( - 'UncheckedChunk', [ - 'data_len', - 'fd_start', - 'file_path', - 'temp', - ] -) class ConcurrencyOptions(object): @@ -815,304 +794,6 @@ def populate_from_file(self, sa, file): self._client = sa.file_client -class DownloadDescriptor(object): - """Download Descriptor""" - - _AES_BLOCKSIZE = blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES - - def __init__(self, lpath, ase, options): - # type: (DownloadDescriptior, pathlib.Path, AzureStorageEntity, - # DownloadOptions) -> None - """Ctor for DownloadDescriptor - :param DownloadDescriptor self: this - :param pathlib.Path lpath: local path - :param AzureStorageEntity ase: Azure Storage Entity - :param DownloadOptions options: download options - """ - self.final_path = lpath - # create path holding the temporary file to download to - _tmp = list(lpath.parts[:-1]) - _tmp.append(lpath.name + '.bxtmp') - self.local_path = pathlib.Path(*_tmp) - self._meta_lock = threading.Lock() - self._hasher_lock = threading.Lock() - self._ase = ase - # calculate the total number of ops required for transfer - self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) - try: - self._total_chunks = int( - math.ceil(self._ase.size / self._chunk_size)) - except ZeroDivisionError: - self._total_chunks = 0 - self.hmac = None - self.md5 = None - self._offset = 0 - self._chunk_num = 0 - self._next_integrity_chunk = 0 - self._unchecked_chunks = {} - self._outstanding_ops = self._total_chunks - self._completed_ops = 0 - # initialize checkers and allocate space - self._initialize_integrity_checkers(options) - self._allocate_disk_space() - - @property - def entity(self): - # type: (DownloadDescriptor) -> AzureStorageEntity - """Get linked AzureStorageEntity - :param DownloadDescriptor self: this - :rtype: AzureStorageEntity - :return: AzureStorageEntity - """ - return self._ase - - @property - def must_compute_md5(self): - # type: (DownloadDescriptor) -> bool - """Check if MD5 must be computed - :param DownloadDescriptor self: this - :rtype: bool - :return: if MD5 must be computed - """ - return self.md5 is not None - - def _initialize_integrity_checkers(self, options): - # type: (DownloadDescriptor, DownloadOptions) -> None - """Initialize file integrity checkers - :param DownloadDescriptor self: this - :param DownloadOptions options: download options - """ - if self._ase.is_encrypted: - # ensure symmetric key exists - if blobxfer.util.is_none_or_empty( - self._ase.encryption_metadata.symmetric_key): - raise RuntimeError( - 'symmetric key is invalid: provide RSA private key ' - 'or metadata corrupt') - self.hmac = self._ase.encryption_metadata.initialize_hmac() - if self.hmac is None and options.check_file_md5: - self.md5 = 
blobxfer.util.new_md5_hasher() - - def _allocate_disk_space(self): - # type: (DownloadDescriptor, int) -> None - """Perform file allocation (possibly sparse) - :param DownloadDescriptor self: this - :param int size: size - """ - size = self._ase.size - # compute size - if size > 0: - if self._ase.is_encrypted: - # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs - allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ - self._AES_BLOCKSIZE - else: - allocatesize = size - if allocatesize < 0: - allocatesize = 0 - else: - allocatesize = 0 - # create parent path - self.local_path.parent.mkdir(mode=0o750, parents=True, exist_ok=True) - # allocate file - with self.local_path.open('wb') as fd: - if allocatesize > 0: - try: - os.posix_fallocate(fd.fileno(), 0, allocatesize) - except AttributeError: - fd.seek(allocatesize - 1) - fd.write(b'\0') - - def next_offsets(self): - # type: (DownloadDescriptor) -> DownloadOffsets - """Retrieve the next offsets - :param DownloadDescriptor self: this - :rtype: DownloadOffsets - :return: download offsets - """ - with self._meta_lock: - if self._offset >= self._ase.size: - return None - if self._offset + self._chunk_size > self._ase.size: - chunk = self._ase.size - self._offset - else: - chunk = self._chunk_size - # on download, num_bytes must be offset by -1 as the x-ms-range - # header expects it that way. x -> y bytes means first bits of the - # (x+1)th byte to the last bits of the (y+1)th byte. for example, - # 0 -> 511 means byte 1 to byte 512 - num_bytes = chunk - 1 - chunk_num = self._chunk_num - fd_start = self._offset - range_start = self._offset - if self._ase.is_encrypted: - # ensure start is AES block size aligned - range_start = range_start - \ - (range_start % self._AES_BLOCKSIZE) - \ - self._AES_BLOCKSIZE - if range_start <= 0: - range_start = 0 - range_end = self._offset + num_bytes - self._offset += chunk - self._chunk_num += 1 - if self._ase.is_encrypted and self._offset >= self._ase.size: - unpad = True - else: - unpad = False - return DownloadOffsets( - chunk_num=chunk_num, - fd_start=fd_start, - num_bytes=chunk, - range_start=range_start, - range_end=range_end, - unpad=unpad, - ) - - def _postpone_integrity_check(self, offsets, data): - # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None - """Postpone integrity check for chunk - :param DownloadDescriptor self: this - :param DownloadOffsets offsets: download offsets - :param bytes data: data - """ - if self.must_compute_md5: - with self.local_path.open('r+b') as fd: - fd.seek(offsets.fd_start, 0) - fd.write(data) - unchecked = UncheckedChunk( - data_len=len(data), - fd_start=offsets.fd_start, - file_path=self.local_path, - temp=False, - ) - else: - fname = None - with tempfile.NamedTemporaryFile(mode='wb', delete=False) as fd: - fname = fd.name - fd.write(data) - unchecked = UncheckedChunk( - data_len=len(data), - fd_start=0, - file_path=pathlib.Path(fname), - temp=True, - ) - with self._meta_lock: - self._unchecked_chunks[offsets.chunk_num] = unchecked - - def perform_chunked_integrity_check(self, offsets, data): - # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None - """Hash data against stored MD5 hasher safely - :param DownloadDescriptor self: this - :param DownloadOffsets offsets: download offsets - :param bytes data: data - """ - self_check = False - hasher = self.hmac or self.md5 - # iterate from next chunk to be checked - while True: - ucc = None - with self._meta_lock: - chunk_num = self._next_integrity_chunk - # check if the next chunk is ready - if 
chunk_num in self._unchecked_chunks: - ucc = self._unchecked_chunks.pop(chunk_num) - elif chunk_num != offsets.chunk_num: - break - # prepare data for hashing - if ucc is None: - chunk = data - self_check = True - else: - with ucc.file_path.open('rb') as fd: - fd.seek(ucc.fd_start, 0) - chunk = fd.read(ucc.data_len) - if ucc.temp: - ucc.file_path.unlink() - # hash data and set next integrity chunk - with self._hasher_lock: - hasher.update(chunk) - with self._meta_lock: - self._next_integrity_chunk += 1 - # store data that hasn't been checked - if not self_check: - self._postpone_integrity_check(offsets, data) - - def write_data(self, offsets, data): - # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None - """Postpone integrity check for chunk - :param DownloadDescriptor self: this - :param DownloadOffsets offsets: download offsets - :param bytes data: data - """ - with self.local_path.open('r+b') as fd: - fd.seek(offsets.fd_start, 0) - fd.write(data) - - def finalize_file(self): - # type: (DownloadDescriptor) -> Tuple[bool, str] - """Finalize file download - :param DownloadDescriptor self: this - :rtype: tuple - :return (if integrity check passed or not, message) - """ - # check final file integrity - check = False - msg = None - if self.hmac is not None: - mac = self._ase.encryption_metadata.encryption_authentication.\ - message_authentication_code - digest = blobxfer.util.base64_encode_as_string(self.hmac.digest()) - if digest == mac: - check = True - msg = '{}: {}, {} {} {}'.format( - self._ase.encryption_metadata.encryption_authentication. - algorithm, - 'OK' if check else 'MISMATCH', - self._ase.name, - digest, - mac, - ) - elif self.md5 is not None: - digest = blobxfer.util.base64_encode_as_string(self.md5.digest()) - if digest == self._ase.md5: - check = True - msg = 'MD5: {}, {} {} {}'.format( - 'OK' if check else 'MISMATCH', - self._ase.name, - digest, - self._ase.md5, - ) - else: - check = True - msg = 'MD5: SKIPPED, {} None {}'.format( - self._ase.name, - self._ase.md5 - ) - # cleanup if download failed - if not check: - logger.error(msg) - # delete temp download file - self.local_path.unlink() - return - logger.debug(msg) - - # TODO set file uid/gid and mode - - # move temp download file to final path - self.local_path.rename(self.final_path) - - @property - def all_operations_completed(self): - with self._meta_lock: - return (self._outstanding_ops == 0 and - len(self._unchecked_chunks) == 0) - - def dec_outstanding_operations(self): - with self._meta_lock: - self._outstanding_ops -= 1 - self._completed_ops += 1 - - class AzureDestinationPaths(object): def __init__(self): pass diff --git a/blobxfer/offload.py b/blobxfer/offload.py index 80f84fe..ca2cc85 100644 --- a/blobxfer/offload.py +++ b/blobxfer/offload.py @@ -43,10 +43,11 @@ class _MultiprocessOffload(object): - def __init__(self, num_workers, description=None): - # type: (_MultiprocessOffload, int, str) -> None + def __init__(self, target, num_workers, description=None): + # type: (_MultiprocessOffload, function, int, str) -> None """Ctor for Crypto Offload :param _MultiprocessOffload self: this + :param function target: target function for process :param int num_workers: number of worker processes :param str description: description """ @@ -56,7 +57,7 @@ def __init__(self, num_workers, description=None): self._term_signal = multiprocessing.Value('i', 0) self._procs = [] self._check_thread = None - self._initialize_processes(num_workers, description) + self._initialize_processes(target, num_workers, description) 
@property def done_cv(self): @@ -78,10 +79,11 @@ def terminated(self): """ return self._term_signal.value == 1 - def _initialize_processes(self, num_workers, description): - # type: (_MultiprocessOffload, int, str) -> None + def _initialize_processes(self, target, num_workers, description): + # type: (_MultiprocessOffload, function, int, str) -> None """Initialize processes :param _MultiprocessOffload self: this + :param function target: target function for process :param int num_workers: number of worker processes :param str description: description """ @@ -90,7 +92,7 @@ def _initialize_processes(self, num_workers, description): logger.debug('initializing {}{} processes'.format( num_workers, ' ' + description if not None else '')) for _ in range(num_workers): - proc = multiprocessing.Process(target=self._worker_process) + proc = multiprocessing.Process(target=target) proc.start() self._procs.append(proc) @@ -118,10 +120,10 @@ def pop_done_queue(self): return None def initialize_check_thread(self, check_func): - # type: (_MultiprocessOffload, object) -> None + # type: (_MultiprocessOffload, function) -> None """Initialize the crypto done queue check thread :param Downloader self: this - :param object check_func: check function + :param function check_func: check function """ self._check_thread = threading.Thread(target=check_func) self._check_thread.start() diff --git a/setup.py b/setup.py index 11ba002..729dcc9 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ 'blobxfer.blob.block', 'blobxfer.blob.page', 'blobxfer.crypto', + 'blobxfer.download', 'blobxfer.file', 'blobxfer_cli', ] diff --git a/tests/test_blobxfer_crypto_operations.py b/tests/test_blobxfer_crypto_operations.py index 88990e5..84d633a 100644 --- a/tests/test_blobxfer_crypto_operations.py +++ b/tests/test_blobxfer_crypto_operations.py @@ -2,11 +2,13 @@ """Tests for crypto operations""" # stdlib imports -from mock import patch +import mock import os +import time # non-stdlib imports import cryptography.hazmat.primitives.asymmetric.rsa # local imports +import blobxfer.download.models # module under test import blobxfer.crypto.operations as ops @@ -16,7 +18,8 @@ backend=cryptography.hazmat.backends.default_backend()) -@patch('cryptography.hazmat.primitives.serialization.load_pem_private_key') +@mock.patch( + 'cryptography.hazmat.primitives.serialization.load_pem_private_key') def test_load_rsa_private_key_file(patched_load, tmpdir): keyfile = tmpdir.join('keyfile') keyfile.write('a') @@ -26,7 +29,7 @@ def test_load_rsa_private_key_file(patched_load, tmpdir): assert rv == _RSAKEY -@patch('cryptography.hazmat.primitives.serialization.load_pem_public_key') +@mock.patch('cryptography.hazmat.primitives.serialization.load_pem_public_key') def test_load_rsa_public_key_file(patched_load, tmpdir): keyfile = tmpdir.join('keyfile') keyfile.write('b') @@ -85,3 +88,37 @@ def test_aes_cbc_encryption(): assert encdata != plaindata decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, False) assert decdata == plaindata + + +def test_cryptooffload_decrypt(): + a = None + try: + a = ops.CryptoOffload(1) + offsets = blobxfer.download.models.DownloadOffsets( + chunk_num=0, + fd_start=1, + num_bytes=2, + range_end=3, + range_start=4, + unpad=False, + ) + a.add_decrypt_chunk( + 'fp', offsets, ops.aes256_generate_random_key(), os.urandom(16), + os.urandom(16)) + i = 33 + checked = False + while i > 0: + result = a.pop_done_queue() + if result is None: + time.sleep(0.3) + i -= 1 + continue + assert len(result) == 3 + assert result[0] == 'fp' + 
assert result[1] == offsets + checked = True + break + assert checked + finally: + if a is not None: + a.finalize_processes() diff --git a/tests/test_blobxfer_download_models.py b/tests/test_blobxfer_download_models.py new file mode 100644 index 0000000..530e4ac --- /dev/null +++ b/tests/test_blobxfer_download_models.py @@ -0,0 +1,377 @@ +# coding=utf-8 +"""Tests for download models""" + +# stdlib imports +import hashlib +import hmac +import mock +import os +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +import pytest +# local imports +import blobxfer.models +import blobxfer.util as util +# module under test +import blobxfer.download.models as models + + +def test_downloaddescriptor(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 1024 + ase._encryption = mock.MagicMock() + with pytest.raises(RuntimeError): + d = models.DownloadDescriptor(lp, ase, opts) + + ase._encryption.symmetric_key = b'123' + d = models.DownloadDescriptor(lp, ase, opts) + + assert d.entity == ase + assert not d.must_compute_md5 + assert d._total_chunks == 64 + assert d._offset == 0 + assert d.final_path == lp + assert str(d.local_path) == str(lp) + '.bxtmp' + assert d.local_path.stat().st_size == 1024 - 16 + + d.local_path.unlink() + ase._size = 1 + d = models.DownloadDescriptor(lp, ase, opts) + assert d._total_chunks == 1 + assert d.local_path.stat().st_size == 0 + + d.local_path.unlink() + ase._encryption = None + ase._size = 1024 + d = models.DownloadDescriptor(lp, ase, opts) + assert d.local_path.stat().st_size == 1024 + + # pre-existing file check + ase._size = 0 + d = models.DownloadDescriptor(lp, ase, opts) + assert d._total_chunks == 0 + assert d.local_path.stat().st_size == 0 + + +def test_downloaddescriptor_next_offsets(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 128 + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 128 + assert offsets.range_start == 0 + assert offsets.range_end == 127 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._size = 0 + d = models.DownloadDescriptor(lp, ase, opts) + assert d._total_chunks == 0 + assert d.next_offsets() is None + + ase._size = 1 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 1 + assert offsets.range_start == 0 + assert offsets.range_end == 0 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._size = 256 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._size = 256 + 16 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert d._total_chunks == 2 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert 
offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + offsets = d.next_offsets() + assert offsets.chunk_num == 1 + assert offsets.fd_start == 256 + assert offsets.num_bytes == 16 + assert offsets.range_start == 256 + assert offsets.range_end == 256 + 15 + assert not offsets.unpad + assert d.next_offsets() is None + + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + ase._size = 128 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 128 + assert offsets.range_start == 0 + assert offsets.range_end == 127 + assert offsets.unpad + assert d.next_offsets() is None + + ase._size = 256 + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert offsets.unpad + assert d.next_offsets() is None + + ase._size = 256 + 32 # 16 bytes over + padding + d = models.DownloadDescriptor(lp, ase, opts) + offsets = d.next_offsets() + assert d._total_chunks == 2 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + offsets = d.next_offsets() + assert offsets.chunk_num == 1 + assert offsets.fd_start == 256 + assert offsets.num_bytes == 32 + assert offsets.range_start == 256 - 16 + assert offsets.range_end == 256 + 31 + assert offsets.unpad + assert d.next_offsets() is None + + +def test_postpone_integrity_check(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 32 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + d._postpone_integrity_check(offsets, b'0' * ase._size) + + assert offsets.chunk_num in d._unchecked_chunks + ucc = d._unchecked_chunks[offsets.chunk_num] + assert ucc.data_len == ase._size + assert ucc.fd_start == offsets.fd_start + assert ucc.file_path == d.local_path + assert not ucc.temp + + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 32 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + d._postpone_integrity_check(offsets, b'0' * ase._size) + + assert offsets.chunk_num in d._unchecked_chunks + ucc = d._unchecked_chunks[offsets.chunk_num] + assert ucc.data_len == ase._size + assert ucc.fd_start == offsets.fd_start + assert ucc.file_path != d.local_path + assert ucc.temp + + +def test_perform_chunked_integrity_check(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + data = b'0' * opts.chunk_size_bytes + d._postpone_integrity_check(offsets, data) + d.perform_chunked_integrity_check(offsets, data) + + assert d._next_integrity_chunk == 1 + + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + ase._encryption = 
mock.MagicMock() + ase._encryption.symmetric_key = b'123' + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + data = b'0' * opts.chunk_size_bytes + offsets1 = d.next_offsets() + d._postpone_integrity_check(offsets1, data) + ucc = d._unchecked_chunks[offsets1.chunk_num] + d.perform_chunked_integrity_check(offsets, data) + + assert d._next_integrity_chunk == 2 + assert not ucc.file_path.exists() + assert not ucc.file_path.exists() + + +def test_write_data(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + data = b'0' * ase._size + d.write_data(offsets, data) + + assert d.local_path.exists() + assert d.local_path.stat().st_size == len(data) + + +def test_finalize_file(tmpdir): + # hmac check success + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + signkey = os.urandom(32) + ase._encryption.initialize_hmac = mock.MagicMock() + ase._encryption.initialize_hmac.return_value = hmac.new( + signkey, digestmod=hashlib.sha256) + + data = b'0' * (ase._size - 16) + _hmac = hmac.new(signkey, digestmod=hashlib.sha256) + _hmac.update(data) + ase._encryption.encryption_authentication.\ + message_authentication_code = util.base64_encode_as_string( + _hmac.digest()) + + d = models.DownloadDescriptor(lp, ase, opts) + d.hmac.update(data) + d.finalize_file() + + assert not d.local_path.exists() + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) + + # md5 check success + lp = pathlib.Path(str(tmpdir.join('b'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + md5 = util.new_md5_hasher() + md5.update(data) + ase._md5 = util.base64_encode_as_string(md5.digest()) + + d = models.DownloadDescriptor(lp, ase, opts) + d.md5.update(data) + d.finalize_file() + + assert not d.local_path.exists() + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) + + # no check + lp = pathlib.Path(str(tmpdir.join('c'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + + d = models.DownloadDescriptor(lp, ase, opts) + d.finalize_file() + + assert not d.local_path.exists() + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) + + # md5 mismatch + lp = pathlib.Path(str(tmpdir.join('d'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + ase._md5 = 'oops' + + d = models.DownloadDescriptor(lp, ase, opts) + d.md5.update(data) + d.finalize_file() + + assert not d.local_path.exists() + assert not d.final_path.exists() + + +def test_operations(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 32 + + d = models.DownloadDescriptor(lp, ase, opts) + 
d._outstanding_ops = 1 + d._unchecked_chunks = {0: None} + assert not d.all_operations_completed + + d.dec_outstanding_operations() + assert d._completed_ops == 1 + assert not d.all_operations_completed + + d._unchecked_chunks.pop(0) + assert d.all_operations_completed diff --git a/tests/test_blobxfer_download.py b/tests/test_blobxfer_download_operations.py similarity index 84% rename from tests/test_blobxfer_download.py rename to tests/test_blobxfer_download_operations.py index aef5e79..0aebc75 100644 --- a/tests/test_blobxfer_download.py +++ b/tests/test_blobxfer_download_operations.py @@ -1,5 +1,5 @@ # coding=utf-8 -"""Tests for download""" +"""Tests for download operations""" # stdlib imports import datetime @@ -17,7 +17,7 @@ import blobxfer.models as models import blobxfer.util as util # module under test -import blobxfer.download as dl +import blobxfer.download.operations as ops def test_check_download_conditions(tmpdir): @@ -44,11 +44,11 @@ def test_check_download_conditions(tmpdir): ), local_destination_path=models.LocalDestinationPath('dest'), ) - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) result = d._check_download_conditions(nep, mock.MagicMock()) - assert result == dl.DownloadAction.Download + assert result == ops.DownloadAction.Download result = d._check_download_conditions(ep, mock.MagicMock()) - assert result == dl.DownloadAction.Skip + assert result == ops.DownloadAction.Skip ds = models.DownloadSpecification( download_options=models.DownloadOptions( @@ -68,9 +68,9 @@ def test_check_download_conditions(tmpdir): ), local_destination_path=models.LocalDestinationPath('dest'), ) - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) result = d._check_download_conditions(ep, mock.MagicMock()) - assert result == dl.DownloadAction.CheckMd5 + assert result == ops.DownloadAction.CheckMd5 ds = models.DownloadSpecification( download_options=models.DownloadOptions( @@ -90,9 +90,9 @@ def test_check_download_conditions(tmpdir): ), local_destination_path=models.LocalDestinationPath('dest'), ) - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) result = d._check_download_conditions(ep, mock.MagicMock()) - assert result == dl.DownloadAction.Download + assert result == ops.DownloadAction.Download ds = models.DownloadSpecification( download_options=models.DownloadOptions( @@ -112,17 +112,17 @@ def test_check_download_conditions(tmpdir): ), local_destination_path=models.LocalDestinationPath('dest'), ) - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) rfile = models.AzureStorageEntity('cont') rfile._size = util.page_align_content_length(ep.stat().st_size) rfile._mode = models.AzureStorageModes.Page result = d._check_download_conditions(ep, rfile) - assert result == dl.DownloadAction.Skip + assert result == ops.DownloadAction.Skip rfile._size = ep.stat().st_size rfile._mode = models.AzureStorageModes.Page result = d._check_download_conditions(ep, rfile) - assert result == dl.DownloadAction.Download + assert result == ops.DownloadAction.Download ds = models.DownloadSpecification( download_options=models.DownloadOptions( @@ -142,21 +142,21 @@ def test_check_download_conditions(tmpdir): ), local_destination_path=models.LocalDestinationPath('dest'), ) - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + d = 
ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) rfile = models.AzureStorageEntity('cont') rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) + \ datetime.timedelta(days=1) result = d._check_download_conditions(ep, rfile) - assert result == dl.DownloadAction.Download + assert result == ops.DownloadAction.Download rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) - \ datetime.timedelta(days=1) result = d._check_download_conditions(ep, rfile) - assert result == dl.DownloadAction.Skip + assert result == ops.DownloadAction.Skip def test_pre_md5_skip_on_check(): - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._md5_offload = mock.MagicMock() rfile = models.AzureStorageEntity('cont') @@ -177,7 +177,7 @@ def test_pre_md5_skip_on_check(): def test_post_md5_skip_on_check(): - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._md5_offload = mock.MagicMock() lpath = 'lpath' @@ -199,7 +199,7 @@ def test_post_md5_skip_on_check(): def test_check_for_downloads_from_md5(): lpath = 'lpath' - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._md5_map[lpath] = mock.MagicMock() d._download_set.add(pathlib.Path(lpath)) d._md5_offload = mock.MagicMock() @@ -209,14 +209,19 @@ def test_check_for_downloads_from_md5(): with pytest.raises(StopIteration): d._check_for_downloads_from_md5() - assert d._add_to_download_queue.call_count == 1 + d._add_to_download_queue = mock.MagicMock() + d._all_remote_files_processed = False + d._download_terminate = True + d._check_for_downloads_from_md5() + assert d._add_to_download_queue.call_count == 0 + def test_initialize_and_terminate_download_threads(): opts = mock.MagicMock() opts.concurrency.transfer_threads = 2 - d = dl.Downloader(opts, mock.MagicMock(), mock.MagicMock()) + d = ops.Downloader(opts, mock.MagicMock(), mock.MagicMock()) d._worker_thread_download = mock.MagicMock() d._initialize_download_threads() @@ -233,7 +238,7 @@ def test_initialize_and_terminate_download_threads(): @mock.patch('blobxfer.blob.operations.list_blobs') @mock.patch('blobxfer.operations.ensure_local_destination', return_value=True) def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._initialize_download_threads = mock.MagicMock() patched_lfmo._check_thread = mock.MagicMock() d._general_options.concurrency.crypto_processes = 0 @@ -261,14 +266,14 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d._pre_md5_skip_on_check = mock.MagicMock() d._check_download_conditions = mock.MagicMock() - d._check_download_conditions.return_value = dl.DownloadAction.Skip + d._check_download_conditions.return_value = ops.DownloadAction.Skip patched_tc.side_effect = [1, 2] d.start() assert d._pre_md5_skip_on_check.call_count == 0 patched_lb.side_effect = [[b]] d._all_remote_files_processed = False - d._check_download_conditions.return_value = dl.DownloadAction.CheckMd5 + d._check_download_conditions.return_value = ops.DownloadAction.CheckMd5 patched_tc.side_effect = [1, 2] with pytest.raises(RuntimeError): d.start() @@ -277,7 +282,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): 
b.properties.content_length = 0 patched_lb.side_effect = [[b]] d._all_remote_files_processed = False - d._check_download_conditions.return_value = dl.DownloadAction.Download + d._check_download_conditions.return_value = ops.DownloadAction.Download patched_tc.side_effect = [1, 2] with pytest.raises(RuntimeError): d.start() @@ -285,7 +290,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): def test_start_keyboard_interrupt(): - d = dl.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._run = mock.MagicMock(side_effect=KeyboardInterrupt) d._wait_for_download_threads = mock.MagicMock() d._md5_offload = mock.MagicMock() diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 897e28d..2ad07cf 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -348,159 +348,3 @@ def test_azurestorageentity(): ase.populate_from_file(mock.MagicMock(), blob) assert ase.mode == models.AzureStorageModes.File assert ase.snapshot is None - - -def test_downloaddescriptor(tmpdir): - lp = pathlib.Path(str(tmpdir.join('a'))) - - opts = mock.MagicMock() - opts.check_file_md5 = True - opts.chunk_size_bytes = 16 - ase = models.AzureStorageEntity('cont') - ase._size = 1024 - ase._encryption = mock.MagicMock() - with pytest.raises(RuntimeError): - d = models.DownloadDescriptor(lp, ase, opts) - - ase._encryption.symmetric_key = b'123' - d = models.DownloadDescriptor(lp, ase, opts) - - assert d.entity == ase - assert not d.must_compute_md5 - assert d._total_chunks == 64 - assert d._offset == 0 - assert d.final_path == lp - assert str(d.local_path) == str(lp) + '.bxtmp' - assert d.local_path.stat().st_size == 1024 - 16 - - d.local_path.unlink() - ase._size = 1 - d = models.DownloadDescriptor(lp, ase, opts) - assert d._total_chunks == 1 - assert d.local_path.stat().st_size == 0 - - d.local_path.unlink() - ase._encryption = None - ase._size = 1024 - d = models.DownloadDescriptor(lp, ase, opts) - assert d.local_path.stat().st_size == 1024 - - # pre-existing file check - ase._size = 0 - d = models.DownloadDescriptor(lp, ase, opts) - assert d._total_chunks == 0 - assert d.local_path.stat().st_size == 0 - - -def test_downloaddescriptor_next_offsets(tmpdir): - lp = pathlib.Path(str(tmpdir.join('a'))) - - opts = mock.MagicMock() - opts.check_file_md5 = True - opts.chunk_size_bytes = 256 - ase = models.AzureStorageEntity('cont') - ase._size = 128 - d = models.DownloadDescriptor(lp, ase, opts) - - offsets = d.next_offsets() - assert d._total_chunks == 1 - assert offsets.chunk_num == 0 - assert offsets.fd_start == 0 - assert offsets.num_bytes == 128 - assert offsets.range_start == 0 - assert offsets.range_end == 127 - assert not offsets.unpad - assert d.next_offsets() is None - - ase._size = 0 - d = models.DownloadDescriptor(lp, ase, opts) - assert d._total_chunks == 0 - assert d.next_offsets() is None - - ase._size = 1 - d = models.DownloadDescriptor(lp, ase, opts) - offsets = d.next_offsets() - assert d._total_chunks == 1 - assert offsets.chunk_num == 0 - assert offsets.fd_start == 0 - assert offsets.num_bytes == 1 - assert offsets.range_start == 0 - assert offsets.range_end == 0 - assert not offsets.unpad - assert d.next_offsets() is None - - ase._size = 256 - d = models.DownloadDescriptor(lp, ase, opts) - offsets = d.next_offsets() - assert d._total_chunks == 1 - assert offsets.chunk_num == 0 - assert offsets.fd_start == 0 - assert offsets.num_bytes == 256 - assert 
offsets.range_start == 0 - assert offsets.range_end == 255 - assert not offsets.unpad - assert d.next_offsets() is None - - ase._size = 256 + 16 - d = models.DownloadDescriptor(lp, ase, opts) - offsets = d.next_offsets() - assert d._total_chunks == 2 - assert offsets.chunk_num == 0 - assert offsets.fd_start == 0 - assert offsets.num_bytes == 256 - assert offsets.range_start == 0 - assert offsets.range_end == 255 - assert not offsets.unpad - offsets = d.next_offsets() - assert offsets.chunk_num == 1 - assert offsets.fd_start == 256 - assert offsets.num_bytes == 16 - assert offsets.range_start == 256 - assert offsets.range_end == 256 + 15 - assert not offsets.unpad - assert d.next_offsets() is None - - ase._encryption = mock.MagicMock() - ase._encryption.symmetric_key = b'123' - ase._size = 128 - d = models.DownloadDescriptor(lp, ase, opts) - offsets = d.next_offsets() - assert d._total_chunks == 1 - assert offsets.chunk_num == 0 - assert offsets.fd_start == 0 - assert offsets.num_bytes == 128 - assert offsets.range_start == 0 - assert offsets.range_end == 127 - assert offsets.unpad - assert d.next_offsets() is None - - ase._size = 256 - d = models.DownloadDescriptor(lp, ase, opts) - offsets = d.next_offsets() - assert d._total_chunks == 1 - assert offsets.chunk_num == 0 - assert offsets.fd_start == 0 - assert offsets.num_bytes == 256 - assert offsets.range_start == 0 - assert offsets.range_end == 255 - assert offsets.unpad - assert d.next_offsets() is None - - ase._size = 256 + 32 # 16 bytes over + padding - d = models.DownloadDescriptor(lp, ase, opts) - offsets = d.next_offsets() - assert d._total_chunks == 2 - assert offsets.chunk_num == 0 - assert offsets.fd_start == 0 - assert offsets.num_bytes == 256 - assert offsets.range_start == 0 - assert offsets.range_end == 255 - assert not offsets.unpad - offsets = d.next_offsets() - assert offsets.chunk_num == 1 - assert offsets.fd_start == 256 - assert offsets.num_bytes == 32 - assert offsets.range_start == 256 - 16 - assert offsets.range_end == 256 + 31 - assert offsets.unpad - assert d.next_offsets() is None diff --git a/tests/test_blobxfer_offload.py b/tests/test_blobxfer_offload.py new file mode 100644 index 0000000..71cc97d --- /dev/null +++ b/tests/test_blobxfer_offload.py @@ -0,0 +1,36 @@ +# coding=utf-8 +"""Tests for offload""" + +# stdlib imports +import mock +# non-stdlib imports +import pytest +# local imports +# module under test +import blobxfer.offload as offload + + +def test_multiprocess_offload(): + with pytest.raises(ValueError): + a = offload._MultiprocessOffload(None, None) + + target = mock.MagicMock() + a = offload._MultiprocessOffload(target, 1, 'test') + assert len(a._procs) == 1 + assert not a.terminated + assert a._done_cv == a.done_cv + assert a._check_thread is None + assert a.pop_done_queue() is None + + item = (0, 'abc') + a._done_queue.put(item) + + check_func = mock.MagicMock() + a.initialize_check_thread(check_func) + + a.finalize_processes() + assert a.terminated + for proc in a._procs: + assert not proc.is_alive() + + assert a.pop_done_queue() == item From 8204d335aca804fa7e98819b8e4e8a66ab2ecf76 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Mon, 27 Feb 2017 13:32:56 -0800 Subject: [PATCH 13/47] More coverage for download operations - Move some class instance vars to properties for mocking - Simplify termination conditions to properties --- blobxfer/download/operations.py | 68 +++--- tests/test_blobxfer_download_operations.py | 254 ++++++++++++++++++++- 2 files changed, 285 insertions(+), 37 deletions(-) 
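Editorial note on the patch below: this commit turns the downloader's inline
termination checks into read-only properties (termination_check and
termination_check_md5) precisely so the test suite can replace them with
mock.PropertyMock and drive the worker loops for a fixed number of
iterations. A minimal, self-contained sketch of that testing pattern,
assuming only a toy Worker class (the real tests patch
blobxfer.download.operations.Downloader.termination_check, as shown further
down), not the actual blobxfer code:

    # Hedged illustration of the PropertyMock pattern this commit enables;
    # the Worker class is a stand-in. The repo's tests import the standalone
    # ``mock`` package; unittest.mock behaves the same for this purpose.
    from unittest import mock


    class Worker(object):
        @property
        def termination_check(self):
            # the real property inspects locks, queues and counters
            return False

        def run(self):
            iterations = 0
            while not self.termination_check:
                iterations += 1
            return iterations


    with mock.patch.object(
            Worker, 'termination_check',
            new_callable=mock.PropertyMock) as patched_tc:
        # property reads return False, False, then True: exactly two passes
        patched_tc.side_effect = [False, False, True]
        assert Worker().run() == 2

This is also why, in the diff that follows, the previous inline
"while True: ... break" constructs collapse into
"while not self.termination_check:" style loops.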
diff --git a/blobxfer/download/operations.py b/blobxfer/download/operations.py index ac36ebe..9679ca6 100644 --- a/blobxfer/download/operations.py +++ b/blobxfer/download/operations.py @@ -96,6 +96,34 @@ def __init__(self, general_options, creds, spec): self._creds = creds self._spec = spec + @property + def termination_check(self): + # type: (Downloader) -> bool + """Check if terminated + :param Downloader self: this + :rtype: bool + :return: if terminated + """ + with self._download_lock: + return (self._download_terminate or + (self._all_remote_files_processed and + len(self._download_set) == 0)) + + @property + def termination_check_md5(self): + # type: (Downloader) -> bool + """Check if terminated from MD5 context + :param Downloader self: this + :rtype: bool + :return: if terminated from MD5 context + """ + with self._md5_meta_lock: + with self._download_lock: + return (self._download_terminate or + (self._all_remote_files_processed and + len(self._md5_map) == 0 and + len(self._download_set) == 0)) + def _check_download_conditions(self, lpath, rfile): # type: (Downloader, pathlib.Path, # blobxfer.models.AzureStorageEntity) -> DownloadAction @@ -188,26 +216,17 @@ def _check_for_downloads_from_md5(self): :param Downloader self: this """ cv = self._md5_offload.done_cv - while True: - with self._md5_meta_lock: - if (self._download_terminate or - (self._all_remote_files_processed and - len(self._md5_map) == 0 and - len(self._download_set) == 0)): - break + while not self.termination_check_md5: result = None cv.acquire() - while not self._download_terminate: + while True: result = self._md5_offload.pop_done_queue() if result is None: # use cv timeout due to possible non-wake while running cv.wait(1) # check for terminating conditions - with self._md5_meta_lock: - if (self._all_remote_files_processed and - len(self._md5_map) == 0 and - len(self._download_set) == 0): - break + if self.termination_check_md5: + break else: break cv.release() @@ -220,24 +239,17 @@ def _check_for_crypto_done(self): :param Downloader self: this """ cv = self._crypto_offload.done_cv - while True: - with self._download_lock: - if (self._download_terminate or - (self._all_remote_files_processed and - len(self._download_set) == 0)): - break + while not self.termination_check: result = None cv.acquire() - while not self._download_terminate: + while True: result = self._crypto_offload.pop_done_queue() if result is None: # use cv timeout due to possible non-wake while running cv.wait(1) # check for terminating conditions - with self._download_lock: - if (self._all_remote_files_processed and - len(self._download_set) == 0): - break + if self.termination_check: + break else: break cv.release() @@ -291,13 +303,7 @@ def _worker_thread_download(self): """Worker thread download :param Downloader self: this """ - while True: - if self._download_terminate: - break - with self._download_lock: - if (self._all_remote_files_processed and - len(self._download_set) == 0): - break + while not self.termination_check: try: dd = self._download_queue.get(False, 1) except queue.Empty: diff --git a/tests/test_blobxfer_download_operations.py b/tests/test_blobxfer_download_operations.py index 0aebc75..eedbeb8 100644 --- a/tests/test_blobxfer_download_operations.py +++ b/tests/test_blobxfer_download_operations.py @@ -10,10 +10,15 @@ import pathlib2 as pathlib except ImportError: # noqa import pathlib +try: + import queue +except ImportError: # noqa + import Queue as queue # non-stdlib imports import azure.storage.blob import pytest # local 
imports +import blobxfer.download.models import blobxfer.models as models import blobxfer.util as util # module under test @@ -206,16 +211,93 @@ def test_check_for_downloads_from_md5(): d._md5_offload.done_cv = multiprocessing.Condition() d._md5_offload.pop_done_queue.side_effect = [None, (lpath, False)] d._add_to_download_queue = mock.MagicMock() + d._all_remote_files_processed = False + d._download_terminate = True + d._check_for_downloads_from_md5() + assert d._add_to_download_queue.call_count == 0 - with pytest.raises(StopIteration): + with mock.patch( + 'blobxfer.download.operations.Downloader.' + 'termination_check_md5', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_map[lpath] = mock.MagicMock() + d._download_set.add(pathlib.Path(lpath)) + d._md5_offload = mock.MagicMock() + d._md5_offload.done_cv = multiprocessing.Condition() + d._md5_offload.pop_done_queue.side_effect = [None, (lpath, False)] + d._add_to_download_queue = mock.MagicMock() + patched_tc.side_effect = [False, False, True] d._check_for_downloads_from_md5() - assert d._add_to_download_queue.call_count == 1 + assert d._add_to_download_queue.call_count == 1 + + with mock.patch( + 'blobxfer.download.operations.Downloader.' + 'termination_check_md5', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_map[lpath] = mock.MagicMock() + d._download_set.add(pathlib.Path(lpath)) + d._md5_offload = mock.MagicMock() + d._md5_offload.done_cv = multiprocessing.Condition() + d._md5_offload.pop_done_queue.side_effect = [None] + d._add_to_download_queue = mock.MagicMock() + patched_tc.side_effect = [False, True, True] + d._check_for_downloads_from_md5() + assert d._add_to_download_queue.call_count == 0 - d._add_to_download_queue = mock.MagicMock() + +def test_check_for_crypto_done(): + lpath = 'lpath' + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._download_set.add(pathlib.Path(lpath)) + d._dd_map[lpath] = mock.MagicMock() + d._crypto_offload = mock.MagicMock() + d._crypto_offload.done_cv = multiprocessing.Condition() + d._crypto_offload.pop_done_queue.side_effect = [ + None, + (lpath, mock.MagicMock(), mock.MagicMock()), + ] + d._complete_chunk_download = mock.MagicMock() d._all_remote_files_processed = False d._download_terminate = True - d._check_for_downloads_from_md5() - assert d._add_to_download_queue.call_count == 0 + d._check_for_crypto_done() + assert d._complete_chunk_download.call_count == 0 + + with mock.patch( + 'blobxfer.download.operations.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._download_set.add(pathlib.Path(lpath)) + d._dd_map[lpath] = mock.MagicMock() + d._crypto_offload = mock.MagicMock() + d._crypto_offload.done_cv = multiprocessing.Condition() + d._crypto_offload.pop_done_queue.side_effect = [ + None, + (lpath, mock.MagicMock(), mock.MagicMock()), + ] + patched_tc.side_effect = [False, False, True] + d._complete_chunk_download = mock.MagicMock() + d._check_for_crypto_done() + assert d._complete_chunk_download.call_count == 1 + + +def test_add_to_download_queue(tmpdir): + path = tmpdir.join('a') + lpath = pathlib.Path(str(path)) + ase = models.AzureStorageEntity('cont') + ase._size = 1 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + d = 
ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._spec.options.chunk_size_bytes = 1 + + d._add_to_download_queue(lpath, ase) + assert d._download_queue.qsize() == 1 + assert path in d._dd_map def test_initialize_and_terminate_download_threads(): @@ -233,6 +315,166 @@ def test_initialize_and_terminate_download_threads(): assert not thr.is_alive() +def test_complete_chunk_download(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 16 + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + offsets = dd.next_offsets() + data = b'0' * ase._size + + d._complete_chunk_download(offsets, data, dd) + + assert dd.local_path.exists() + assert dd.local_path.stat().st_size == len(data) + assert dd._completed_ops == 1 + + +@mock.patch('blobxfer.crypto.operations.aes_cbc_decrypt_data') +@mock.patch('blobxfer.file.operations.get_file_range') +@mock.patch('blobxfer.blob.operations.get_blob_range') +def test_worker_thread_download( + patched_gbr, patched_gfr, patched_acdd, tmpdir): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._complete_chunk_download = mock.MagicMock() + d._download_terminate = True + d._worker_thread_download() + assert d._complete_chunk_download.call_count == 0 + + d._download_terminate = False + d._all_remote_files_processed = True + d._worker_thread_download() + assert d._complete_chunk_download.call_count == 0 + + with mock.patch( + 'blobxfer.download.operations.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + with mock.patch( + 'blobxfer.download.models.DownloadDescriptor.' 
+ 'all_operations_completed', + new_callable=mock.PropertyMock) as patched_aoc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._complete_chunk_download = mock.MagicMock() + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 16 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + lp = pathlib.Path(str(tmpdir.join('a'))) + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd.next_offsets = mock.MagicMock(side_effect=[None, None]) + dd.finalize_file = mock.MagicMock() + patched_aoc.side_effect = [False, True] + patched_tc.side_effect = [False, False, False, True] + d._dd_map[str(lp)] = mock.MagicMock() + d._download_set.add(lp) + d._download_queue = mock.MagicMock() + d._download_queue.get.side_effect = [queue.Empty, dd, dd] + d._worker_thread_download() + assert d._complete_chunk_download.call_count == 0 + assert str(lp) not in d._dd_map + assert dd.finalize_file.call_count == 1 + assert d._download_count == 1 + + with mock.patch( + 'blobxfer.download.operations.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._mode = blobxfer.models.AzureStorageModes.File + ase._size = 16 + patched_gfr.return_value = b'0' * ase._size + lp = pathlib.Path(str(tmpdir.join('b'))) + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd.finalize_file = mock.MagicMock() + dd.perform_chunked_integrity_check = mock.MagicMock() + d._dd_map[str(lp)] = mock.MagicMock() + d._download_set.add(lp) + d._download_queue = mock.MagicMock() + d._download_queue.get.side_effect = [dd] + d._complete_chunk_download = mock.MagicMock() + patched_tc.side_effect = [False, True] + d._worker_thread_download() + assert d._complete_chunk_download.call_count == 1 + assert dd.perform_chunked_integrity_check.call_count == 1 + + with mock.patch( + 'blobxfer.download.operations.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._mode = blobxfer.models.AzureStorageModes.Auto + ase._size = 32 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + ase._encryption.content_encryption_iv = b'0' * 16 + patched_gfr.return_value = b'0' * ase._size + lp = pathlib.Path(str(tmpdir.join('c'))) + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd.finalize_file = mock.MagicMock() + dd.perform_chunked_integrity_check = mock.MagicMock() + d._crypto_offload = mock.MagicMock() + d._crypto_offload.add_decrypt_chunk = mock.MagicMock() + d._dd_map[str(lp)] = mock.MagicMock() + d._download_set.add(lp) + d._download_queue = mock.MagicMock() + d._download_queue.get.side_effect = [dd] + d._complete_chunk_download = mock.MagicMock() + patched_tc.side_effect = [False, True] + d._worker_thread_download() + assert d._complete_chunk_download.call_count == 0 + assert d._crypto_offload.add_decrypt_chunk.call_count == 1 + assert dd.perform_chunked_integrity_check.call_count == 1 + + with mock.patch( + 
'blobxfer.download.operations.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.crypto_processes = 0 + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._mode = blobxfer.models.AzureStorageModes.Auto + ase._size = 32 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + ase._encryption.content_encryption_iv = b'0' * 16 + patched_gfr.return_value = b'0' * ase._size + lp = pathlib.Path(str(tmpdir.join('d'))) + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd.next_offsets() + dd.perform_chunked_integrity_check = mock.MagicMock() + patched_acdd.return_value = b'0' * 16 + d._dd_map[str(lp)] = mock.MagicMock() + d._download_set.add(lp) + d._download_queue = mock.MagicMock() + d._download_queue.get.side_effect = [dd] + d._complete_chunk_download = mock.MagicMock() + patched_tc.side_effect = [False, True] + d._worker_thread_download() + assert d._complete_chunk_download.call_count == 1 + assert patched_acdd.call_count == 1 + assert dd.perform_chunked_integrity_check.call_count == 1 + + @mock.patch('time.clock') @mock.patch('blobxfer.md5.LocalFileMd5Offload') @mock.patch('blobxfer.blob.operations.list_blobs') @@ -241,7 +483,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._initialize_download_threads = mock.MagicMock() patched_lfmo._check_thread = mock.MagicMock() - d._general_options.concurrency.crypto_processes = 0 + d._general_options.concurrency.crypto_processes = 1 d._spec.sources = [] d._spec.options = mock.MagicMock() d._spec.options.chunk_size_bytes = 1 From fa72fc92fa998eaecb9eff9f1baa2b4525bfecb4 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 28 Feb 2017 09:55:28 -0800 Subject: [PATCH 14/47] Fix file download issues - Update dependencies to latest --- blobxfer/download/operations.py | 24 +++++++++++++++------- blobxfer/file/operations.py | 15 ++++++++------ cli/cli.py | 4 ++-- setup.py | 8 ++++---- tests/test_blobxfer_download_operations.py | 1 + tests/test_blobxfer_file_operations.py | 22 +++++++++++--------- 6 files changed, 45 insertions(+), 29 deletions(-) diff --git a/blobxfer/download/operations.py b/blobxfer/download/operations.py index 9679ca6..4c9a0a9 100644 --- a/blobxfer/download/operations.py +++ b/blobxfer/download/operations.py @@ -43,7 +43,6 @@ except ImportError: # noqa import Queue as queue import threading -import time # non-stdlib imports import dateutil # local imports @@ -78,7 +77,6 @@ def __init__(self, general_options, creds, spec): :param blobxfer.models.AzureStorageCredentials creds: creds :param blobxfer.models.DownloadSpecification spec: download spec """ - self._time_start = None self._all_remote_files_processed = False self._crypto_offload = None self._md5_meta_lock = threading.Lock() @@ -87,6 +85,7 @@ def __init__(self, general_options, creds, spec): self._download_lock = threading.Lock() self._download_queue = queue.Queue() self._download_set = set() + self._download_start = None self._download_threads = [] self._download_count = 0 self._download_total_bytes = 0 @@ -274,6 +273,11 @@ def _add_to_download_queue(self, lpath, rfile): self._dd_map[str(dd.final_path)] = dd # add download descriptor to queue self._download_queue.put(dd) + if self._download_start is None: + with 
self._download_lock: + if self._download_start is None: + self._download_start = datetime.datetime.now( + tz=dateutil.tz.tzlocal()) def _initialize_download_threads(self): # type: (Downloader) -> None @@ -386,6 +390,8 @@ def _complete_chunk_download(self, offsets, data, dd): def _run(self): # type: (Downloader) -> None """Execute Downloader""" + start_time = datetime.datetime.now(tz=dateutil.tz.tzlocal()) + logger.info('script start time: {0}'.format(start_time)) # ensure destination path blobxfer.operations.ensure_local_destination(self._creds, self._spec) logger.info('downloading blobs/files to local path: {}'.format( @@ -409,7 +415,6 @@ def _run(self): skipped_files = 0 total_size = 0 skipped_size = 0 - self._time_start = time.clock() for src in self._spec.sources: for rfile in src.files( self._creds, self._spec.options, self._general_options): @@ -443,16 +448,21 @@ def _run(self): ('{0} remote files processed, waiting for download completion ' 'of {1:.4f} MiB').format(nfiles, download_size_mib)) self._wait_for_download_threads(terminate=False) - end = time.clock() - runtime = end - self._time_start + end_time = datetime.datetime.now(tz=dateutil.tz.tzlocal()) if (self._download_count != download_files or self._download_total_bytes != download_size): raise RuntimeError( 'download mismatch: [count={}/{} bytes={}/{}]'.format( self._download_count, download_files, self._download_total_bytes, download_size)) - logger.info('all files downloaded: {0:.3f} sec {1:.4f} Mbps'.format( - runtime, download_size_mib * 8 / runtime)) + if self._download_start is not None: + dltime = (end_time - self._download_start).total_seconds() + logger.info( + ('elapsed download + verify time and throughput: {0:.3f} sec, ' + '{1:.4f} Mbps').format( + dltime, download_size_mib * 8 / dltime)) + logger.info('script end time: {0} (elapsed: {1:.3f} sec)'.format( + end_time, (end_time - start_time).total_seconds())) def start(self): # type: (Downloader) -> None diff --git a/blobxfer/file/operations.py b/blobxfer/file/operations.py index ec654dd..eff3d01 100644 --- a/blobxfer/file/operations.py +++ b/blobxfer/file/operations.py @@ -39,6 +39,7 @@ import azure.storage.file # local imports import blobxfer.retry +import blobxfer.util # create logger logger = logging.getLogger(__name__) @@ -96,8 +97,10 @@ def check_if_single_file(client, fileshare, prefix, timeout=None): :rtype: tuple :return: (if prefix in fileshare is a single file, file) """ - dirname, fname = parse_file_path(prefix) file = None + if blobxfer.util.is_none_or_empty(prefix): + return (False, file) + dirname, fname = parse_file_path(prefix) try: file = client.get_file_properties( share_name=fileshare, @@ -136,13 +139,13 @@ def list_files(client, fileshare, prefix, timeout=None): timeout=timeout, ) for file in files: - fspath = str(pathlib.Path( - dir if dir is not None else '' / file.name)) - if isinstance(file, azure.storage.file.File): + fspath = str( + pathlib.Path(dir if dir is not None else '') / file.name) + if type(file) == azure.storage.file.models.File: fsprop = client.get_file_properties( share_name=fileshare, - directory_name=dir, - file_name=file.name, + directory_name=None, + file_name=fspath, timeout=timeout, ) yield fsprop diff --git a/cli/cli.py b/cli/cli.py index 0c085c7..d273845 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -337,7 +337,7 @@ def callback(ctx, param, value): return click.option( '--file-attributes', expose_value=False, - is_flag=True, + is_flag=False, help='Store or restore file attributes [False]', callback=callback)(f) @@ 
-350,7 +350,7 @@ def callback(ctx, param, value): return click.option( '--file-md5/--no-file-md5', expose_value=False, - default=True, + default=False, help='Compute file MD5 [True]', callback=callback)(f) diff --git a/setup.py b/setup.py index 729dcc9..f6336db 100644 --- a/setup.py +++ b/setup.py @@ -43,12 +43,12 @@ install_requires = [ 'azure-common==1.1.4', - 'azure-storage==0.33.0', - 'click==6.6', - 'cryptography>=1.7.1', + 'azure-storage==0.34.0', + 'click==6.7', + 'cryptography>=1.7.2', 'future==0.16.0', 'python-dateutil==2.6.0', - 'ruamel.yaml==0.13.11', + 'ruamel.yaml==0.13.14', ] if sys.version_info < (3, 4): diff --git a/tests/test_blobxfer_download_operations.py b/tests/test_blobxfer_download_operations.py index eedbeb8..49d550e 100644 --- a/tests/test_blobxfer_download_operations.py +++ b/tests/test_blobxfer_download_operations.py @@ -481,6 +481,7 @@ def test_worker_thread_download( @mock.patch('blobxfer.operations.ensure_local_destination', return_value=True) def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._download_start = datetime.datetime.now(tz=dateutil.tz.tzlocal()) d._initialize_download_threads = mock.MagicMock() patched_lfmo._check_thread = mock.MagicMock() d._general_options.concurrency.crypto_processes = 1 diff --git a/tests/test_blobxfer_file_operations.py b/tests/test_blobxfer_file_operations.py index b221534..e354bda 100644 --- a/tests/test_blobxfer_file_operations.py +++ b/tests/test_blobxfer_file_operations.py @@ -69,6 +69,9 @@ def test_check_if_single_file(): result = ops.check_if_single_file(client, 'a', 'b/c') assert result[0] + result = ops.check_if_single_file(client, 'a', '') + assert not result[0] + client = mock.MagicMock() client.get_file_properties = mock.MagicMock() client.get_file_properties.side_effect = \ @@ -95,11 +98,9 @@ def test_list_files_single_file(): return_value=(False, None) ) def test_list_files_directory(patched_cisf): - client = mock.MagicMock() - client.list_directories_and_files = mock.MagicMock() _file = azure.storage.file.models.File(name='name') + client = mock.MagicMock() client.list_directories_and_files.return_value = [_file] - client.get_file_properties = mock.MagicMock() client.get_file_properties.return_value = _file i = 0 @@ -108,17 +109,18 @@ def test_list_files_directory(patched_cisf): assert file.name == 'name' assert i == 1 + print('test') + _dir = azure.storage.file.models.Directory(name='dirname') + _file = azure.storage.file.models.File(name='dirname/name') client = mock.MagicMock() - client.list_directories_and_files = mock.MagicMock() - _file = azure.storage.file.models.File(name='name') - client.list_directories_and_files.side_effect = [['dir'], [file]] - client.get_file_properties = mock.MagicMock() - client.get_file_properties.return_value = _file + client.list_directories_and_files.side_effect = [[_dir, _file]] + client.get_file_properties.side_effect = [_file] i = 0 - for file in ops.list_files(client, 'dir', ''): + for file in ops.list_files(client, '', ''): i += 1 - assert file.name == 'name' + assert file.name == _file.name + assert type(file) == azure.storage.file.models.File assert i == 1 From bc36c5d08b7bf7663beaf90b3c21208c38e5296d Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 28 Feb 2017 13:27:20 -0800 Subject: [PATCH 15/47] Add cleanup actions --- blobxfer/download/models.py | 20 +++++++++ blobxfer/download/operations.py | 25 ++++++++++- blobxfer/models.py | 9 +++- cli/cli.py | 14 +++++++ 
cli/settings.py | 2 + tests/test_blobxfer_download_models.py | 31 ++++++++++++++ tests/test_blobxfer_download_operations.py | 48 ++++++++++++++++++++++ tests/test_blobxfer_models.py | 22 ++++++++++ 8 files changed, 167 insertions(+), 4 deletions(-) diff --git a/blobxfer/download/models.py b/blobxfer/download/models.py index 39b641d..7fb4fe8 100644 --- a/blobxfer/download/models.py +++ b/blobxfer/download/models.py @@ -200,6 +200,26 @@ def _allocate_disk_space(self): fd.seek(allocatesize - 1) fd.write(b'\0') + def cleanup_all_temporary_files(self): + # type: (DownloadDescriptor) -> None + """Cleanup all temporary files in case of an exception or interrupt. + This function is not thread-safe. + :param DownloadDescriptor self: this + """ + # delete local file + try: + self.local_path.unlink() + except OSError: + pass + # iterate unchecked chunks and delete + for key in self._unchecked_chunks: + ucc = self._unchecked_chunks[key] + if ucc.temp: + try: + ucc.file_path.unlink() + except OSError: + pass + def next_offsets(self): # type: (DownloadDescriptor) -> DownloadOffsets """Retrieve the next offsets diff --git a/blobxfer/download/operations.py b/blobxfer/download/operations.py index 4c9a0a9..b947bcf 100644 --- a/blobxfer/download/operations.py +++ b/blobxfer/download/operations.py @@ -387,6 +387,26 @@ def _complete_chunk_download(self, offsets, data, dd): dd.dec_outstanding_operations() # TODO pickle dd to resume file + def _cleanup_temporary_files(self): + # type: (Downloader) -> None + """Cleanup temporary files in case of an exception or interrupt. + This function is not thread-safe. + :param Downloader self: this + """ + # do not clean up if resume file exists + if self._general_options.resume_file is not None: + logger.debug( + 'not cleaning up temporary files since resume file has ' + 'been specified') + return + # iterate through dd map and cleanup files + for key in self._dd_map: + dd = self._dd_map[key] + try: + dd.cleanup_all_temporary_files() + except Exception as e: + logger.exception(e) + def _run(self): # type: (Downloader) -> None """Execute Downloader""" @@ -475,10 +495,11 @@ def start(self): 'KeyboardInterrupt detected, force terminating ' 'processes and threads (this may take a while)...') self._wait_for_download_threads(terminate=True) - # TODO delete all temp files - # TODO close resume file in finally? 
+ self._cleanup_temporary_files() raise finally: + # TODO close resume file + # shutdown processes if self._md5_offload is not None: self._md5_offload.finalize_processes() if self._crypto_offload is not None: diff --git a/blobxfer/models.py b/blobxfer/models.py index 8a91885..3648722 100644 --- a/blobxfer/models.py +++ b/blobxfer/models.py @@ -151,12 +151,13 @@ def __init__(self, crypto_processes, md5_processes, transfer_threads): class GeneralOptions(object): """General Options""" def __init__( - self, concurrency, progress_bar=True, timeout_sec=None, - verbose=False): + self, concurrency, progress_bar=True, resume_file=None, + timeout_sec=None, verbose=False): """Ctor for General Options :param GeneralOptions self: this :param ConcurrencyOptions concurrency: concurrency options :param bool progress_bar: progress bar + :param str resume_file: resume file :param int timeout_sec: timeout in seconds :param bool verbose: verbose output """ @@ -164,6 +165,10 @@ def __init__( raise ValueError('concurrency option is unspecified') self.concurrency = concurrency self.progress_bar = progress_bar + if blobxfer.util.is_not_empty(resume_file): + self.resume_file = pathlib.Path(resume_file) + else: + self.resume_file = None self.timeout_sec = timeout_sec self.verbose = verbose diff --git a/cli/cli.py b/cli/cli.py index d273845..03fb231 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -148,6 +148,19 @@ def callback(ctx, param, value): callback=callback)(f) +def _resume_file_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['resume_file'] = value + return value + return click.option( + '--resume-file', + expose_value=False, + default=None, + help='Save or use resume file specified', + callback=callback)(f) + + def _timeout_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -192,6 +205,7 @@ def common_options(f): f = _verbose_option(f) f = _transfer_threads_option(f) f = _timeout_option(f) + f = _resume_file_option(f) f = _progress_bar_option(f) f = _md5_processes_option(f) f = _crypto_processes_option(f) diff --git a/cli/settings.py b/cli/settings.py index 8e5db75..4da2500 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -208,6 +208,7 @@ def merge_settings(config, cli_options): config['options']['crypto_processes'] = cli_options['crypto_processes'] config['options']['md5_processes'] = cli_options['md5_processes'] config['options']['progress_bar'] = cli_options['progress_bar'] + config['options']['resume_file'] = cli_options['resume_file'] config['options']['timeout_sec'] = cli_options['timeout'] config['options']['transfer_threads'] = cli_options['transfer_threads'] config['options']['verbose'] = cli_options['verbose'] @@ -242,6 +243,7 @@ def create_general_options(config): transfer_threads=config['options']['transfer_threads'], ), progress_bar=config['options']['progress_bar'], + resume_file=config['options']['resume_file'], timeout_sec=config['options']['timeout_sec'], verbose=config['options']['verbose'], ) diff --git a/tests/test_blobxfer_download_models.py b/tests/test_blobxfer_download_models.py index 530e4ac..e91607e 100644 --- a/tests/test_blobxfer_download_models.py +++ b/tests/test_blobxfer_download_models.py @@ -251,6 +251,37 @@ def test_perform_chunked_integrity_check(tmpdir): assert not ucc.file_path.exists() +def test_cleanup_all_temporary_files(tmpdir): + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size 
= 16 + lp = pathlib.Path(str(tmpdir.join('a'))) + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + data = b'0' * opts.chunk_size_bytes + d._postpone_integrity_check(offsets, data) + assert len(d._unchecked_chunks) == 1 + d.cleanup_all_temporary_files() + assert not d.local_path.exists() + assert not d._unchecked_chunks[0].file_path.exists() + + lp = pathlib.Path(str(tmpdir.join('b'))) + d = models.DownloadDescriptor(lp, ase, opts) + + offsets = d.next_offsets() + data = b'0' * opts.chunk_size_bytes + d._postpone_integrity_check(offsets, data) + assert len(d._unchecked_chunks) == 1 + d.local_path.unlink() + d._unchecked_chunks[0].file_path.unlink() + d.cleanup_all_temporary_files() + assert not d.local_path.exists() + assert not d._unchecked_chunks[0].file_path.exists() + + def test_write_data(tmpdir): lp = pathlib.Path(str(tmpdir.join('a'))) diff --git a/tests/test_blobxfer_download_operations.py b/tests/test_blobxfer_download_operations.py index 49d550e..3645291 100644 --- a/tests/test_blobxfer_download_operations.py +++ b/tests/test_blobxfer_download_operations.py @@ -475,12 +475,58 @@ def test_worker_thread_download( assert dd.perform_chunked_integrity_check.call_count == 1 +def test_cleanup_temporary_files(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 16 + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd.cleanup_all_temporary_files = mock.MagicMock() + dd.cleanup_all_temporary_files.side_effect = Exception + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = pathlib.Path('abc') + d._dd_map[0] = dd + d._cleanup_temporary_files() + assert dd.local_path.exists() + + lp = pathlib.Path(str(tmpdir.join('b'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 16 + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = None + d._dd_map[0] = dd + d._cleanup_temporary_files() + assert not dd.local_path.exists() + + lp = pathlib.Path(str(tmpdir.join('c'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = blobxfer.models.AzureStorageEntity('cont') + ase._size = 16 + dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd.cleanup_all_temporary_files = mock.MagicMock() + dd.cleanup_all_temporary_files.side_effect = Exception + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = None + d._dd_map[0] = dd + d._cleanup_temporary_files() + assert dd.local_path.exists() + + @mock.patch('time.clock') @mock.patch('blobxfer.md5.LocalFileMd5Offload') @mock.patch('blobxfer.blob.operations.list_blobs') @mock.patch('blobxfer.operations.ensure_local_destination', return_value=True) def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._cleanup_temporary_files = mock.MagicMock() d._download_start = datetime.datetime.now(tz=dateutil.tz.tzlocal()) d._initialize_download_threads = mock.MagicMock() patched_lfmo._check_thread = mock.MagicMock() @@ -536,8 +582,10 @@ def test_start_keyboard_interrupt(): d = ops.Downloader(mock.MagicMock(), 
mock.MagicMock(), mock.MagicMock()) d._run = mock.MagicMock(side_effect=KeyboardInterrupt) d._wait_for_download_threads = mock.MagicMock() + d._cleanup_temporary_files = mock.MagicMock() d._md5_offload = mock.MagicMock() with pytest.raises(KeyboardInterrupt): d.start() assert d._wait_for_download_threads.call_count == 1 + assert d._cleanup_temporary_files.call_count == 1 diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py index 2ad07cf..f200aaf 100644 --- a/tests/test_blobxfer_models.py +++ b/tests/test_blobxfer_models.py @@ -38,6 +38,7 @@ def test_general_options(): transfer_threads=3, ), progress_bar=False, + resume_file='abc', timeout_sec=1, verbose=True, ) @@ -46,6 +47,27 @@ def test_general_options(): assert a.concurrency.md5_processes == 2 assert a.concurrency.transfer_threads == 3 assert not a.progress_bar + assert a.resume_file == pathlib.Path('abc') + assert a.timeout_sec == 1 + assert a.verbose + + a = models.GeneralOptions( + concurrency=models.ConcurrencyOptions( + crypto_processes=1, + md5_processes=2, + transfer_threads=3, + ), + progress_bar=False, + resume_file=None, + timeout_sec=1, + verbose=True, + ) + + assert a.concurrency.crypto_processes == 1 + assert a.concurrency.md5_processes == 2 + assert a.concurrency.transfer_threads == 3 + assert not a.progress_bar + assert a.resume_file is None assert a.timeout_sec == 1 assert a.verbose From bb81f29802deedb3b30f97485b6c39a8850775d2 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 4 Apr 2017 11:37:27 -0700 Subject: [PATCH 16/47] Reorganize sources into sensible hierarchy --- blobxfer/api.py | 10 +- blobxfer/blob/block/__init__.py | 0 blobxfer/blob/page/__init__.py | 0 blobxfer/crypto/__init__.py | 0 blobxfer/download/__init__.py | 0 blobxfer/file/__init__.py | 0 blobxfer/models.py | 828 ------------------ blobxfer/models/__init__.py | 145 +++ blobxfer/models/azure.py | 460 ++++++++++ .../{crypto/models.py => models/crypto.py} | 68 +- .../models.py => models/download.py} | 189 +++- blobxfer/{ => models}/md5.py | 54 +- blobxfer/{ => models}/offload.py | 0 blobxfer/models/options.py | 147 ++++ blobxfer/models/upload.py | 75 ++ blobxfer/operations.py | 120 --- blobxfer/{blob => operations}/__init__.py | 0 .../append => operations/azure}/__init__.py | 0 .../azure/blob/__init__.py} | 22 +- .../azure/blob/append.py} | 5 +- .../azure/blob/block.py} | 5 +- .../azure/blob/page.py} | 5 +- .../azure/file.py} | 13 +- .../operations.py => operations/crypto.py} | 62 +- .../operations.py => operations/download.py} | 102 ++- blobxfer/operations/md5.py | 74 ++ blobxfer/util.py | 2 +- cli/settings.py | 53 +- setup.py | 10 +- 29 files changed, 1252 insertions(+), 1197 deletions(-) delete mode 100644 blobxfer/blob/block/__init__.py delete mode 100644 blobxfer/blob/page/__init__.py delete mode 100644 blobxfer/crypto/__init__.py delete mode 100644 blobxfer/download/__init__.py delete mode 100644 blobxfer/file/__init__.py delete mode 100644 blobxfer/models.py create mode 100644 blobxfer/models/__init__.py create mode 100644 blobxfer/models/azure.py rename blobxfer/{crypto/models.py => models/crypto.py} (84%) rename blobxfer/{download/models.py => models/download.py} (69%) rename blobxfer/{ => models}/md5.py (64%) rename blobxfer/{ => models}/offload.py (100%) create mode 100644 blobxfer/models/options.py create mode 100644 blobxfer/models/upload.py delete mode 100644 blobxfer/operations.py rename blobxfer/{blob => operations}/__init__.py (100%) rename blobxfer/{blob/append => operations/azure}/__init__.py (100%) rename 
blobxfer/{blob/operations.py => operations/azure/blob/__init__.py} (86%) rename blobxfer/{blob/append/operations.py => operations/azure/blob/append.py} (93%) rename blobxfer/{blob/block/operations.py => operations/azure/blob/block.py} (93%) rename blobxfer/{blob/page/operations.py => operations/azure/blob/page.py} (93%) rename blobxfer/{file/operations.py => operations/azure/file.py} (93%) rename blobxfer/{crypto/operations.py => operations/crypto.py} (79%) rename blobxfer/{download/operations.py => operations/download.py} (83%) create mode 100644 blobxfer/operations/md5.py diff --git a/blobxfer/api.py b/blobxfer/api.py index 57fcf09..f8c3378 100644 --- a/blobxfer/api.py +++ b/blobxfer/api.py @@ -32,19 +32,19 @@ # non-stdlib imports # local imports -from .blob.append.operations import ( # noqa +from .operations.azure.blob.append import ( # noqa create_client as create_append_blob_client ) -from .blob.block.operations import ( # noqa +from .operations.azure.blob.block import ( # noqa create_client as create_block_blob_client ) -from .blob.page.operations import ( # noqa +from .operations.azure.blob.page import ( # noqa create_client as create_page_blob_client ) -from .file.operations import ( # noqa +from .operations.azure.file import ( # noqa create_client as create_file_client ) -from .download.operations import ( # noqa +from .operations.download import ( # noqa Downloader ) diff --git a/blobxfer/blob/block/__init__.py b/blobxfer/blob/block/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/blobxfer/blob/page/__init__.py b/blobxfer/blob/page/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/blobxfer/crypto/__init__.py b/blobxfer/crypto/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/blobxfer/download/__init__.py b/blobxfer/download/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/blobxfer/file/__init__.py b/blobxfer/file/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/blobxfer/models.py b/blobxfer/models.py deleted file mode 100644 index 3648722..0000000 --- a/blobxfer/models.py +++ /dev/null @@ -1,828 +0,0 @@ -# Copyright (c) Microsoft Corporation -# -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
- -# compat imports -from __future__ import ( - absolute_import, division, print_function, unicode_literals -) -from builtins import ( # noqa - bytes, dict, int, list, object, range, ascii, chr, hex, input, - next, oct, open, pow, round, super, filter, map, zip) -# stdlib imports -import collections -import enum -import fnmatch -import logging -import os -try: - import pathlib2 as pathlib -except ImportError: # noqa - import pathlib -import multiprocessing -# non-stdlib imports -# local imports -from .api import ( - create_append_blob_client, - create_block_blob_client, - create_file_client, - create_page_blob_client, -) -from azure.storage.blob.models import _BlobTypes as BlobTypes -import blobxfer.blob.operations -import blobxfer.file.operations -import blobxfer.crypto.models -import blobxfer.util - -# create logger -logger = logging.getLogger(__name__) - - -# enums -class AzureStorageModes(enum.Enum): - Auto = 10 - Append = 20 - Block = 30 - File = 40 - Page = 50 - - -# named tuples -VectoredIoOptions = collections.namedtuple( - 'VectoredIoOptions', [ - 'stripe_chunk_size_bytes', - 'multi_storage_account_distribution_mode', - ] -) -SkipOnOptions = collections.namedtuple( - 'SkipOnOptions', [ - 'filesize_match', - 'lmt_ge', - 'md5_match', - ] -) -UploadOptions = collections.namedtuple( - 'UploadOptions', [ - 'chunk_size_bytes', - 'delete_extraneous_destination', - 'mode', - 'overwrite', - 'recursive', - 'rsa_private_key', - 'rsa_public_key', - 'store_file_attributes', - 'store_file_md5', - 'strip_components', - 'vectored_io', - 'split_size_bytes', - ] -) -DownloadOptions = collections.namedtuple( - 'DownloadOptions', [ - 'check_file_md5', - 'chunk_size_bytes', - 'delete_extraneous_destination', - 'mode', - 'overwrite', - 'recursive', - 'restore_file_attributes', - 'rsa_private_key', - ] -) -SyncCopyOptions = collections.namedtuple( - 'SyncCopyOptions', [ - 'chunk_size_bytes', - 'mode', - 'overwrite', - ] -) -LocalPath = collections.namedtuple( - 'LocalPath', [ - 'parent_path', - 'relative_path', - ] -) - - -class ConcurrencyOptions(object): - """Concurrency Options""" - def __init__(self, crypto_processes, md5_processes, transfer_threads): - """Ctor for Concurrency Options - :param ConcurrencyOptions self: this - :param int crypto_processes: number of crypto procs - :param int md5_processes: number of md5 procs - :param int transfer_threads: number of transfer threads - """ - self.crypto_processes = crypto_processes - self.md5_processes = md5_processes - self.transfer_threads = transfer_threads - # allow crypto processes to be zero (which will inline crypto - # routines with main process) - if self.crypto_processes is None or self.crypto_processes < 1: - self.crypto_processes = 0 - if self.md5_processes is None or self.md5_processes < 1: - self.md5_processes = multiprocessing.cpu_count() // 2 - if self.md5_processes < 1: - self.md5_processes = 1 - if self.transfer_threads is None or self.transfer_threads < 1: - self.transfer_threads = multiprocessing.cpu_count() * 3 - - -class GeneralOptions(object): - """General Options""" - def __init__( - self, concurrency, progress_bar=True, resume_file=None, - timeout_sec=None, verbose=False): - """Ctor for General Options - :param GeneralOptions self: this - :param ConcurrencyOptions concurrency: concurrency options - :param bool progress_bar: progress bar - :param str resume_file: resume file - :param int timeout_sec: timeout in seconds - :param bool verbose: verbose output - """ - if concurrency is None: - raise ValueError('concurrency option is 
unspecified') - self.concurrency = concurrency - self.progress_bar = progress_bar - if blobxfer.util.is_not_empty(resume_file): - self.resume_file = pathlib.Path(resume_file) - else: - self.resume_file = None - self.timeout_sec = timeout_sec - self.verbose = verbose - - -class AzureStorageCredentials(object): - """Azure Storage Credentials""" - def __init__(self): - # type: (AzureStorageCredentials) -> None - """Ctor for AzureStorageCredentials""" - self._storage_accounts = {} - - def add_storage_account(self, name, key, endpoint): - # type: (AzureStorageCredentials, str, str, str) -> None - """Add a storage account - :param AzureStorageCredentials self: this - :param str name: name of storage account to store - :param str key: storage key or sas - :param str endpoint: endpoint - """ - if name in self._storage_accounts: - raise ValueError( - '{} already exists in storage accounts'.format(name)) - self._storage_accounts[name] = AzureStorageAccount(name, key, endpoint) - - def get_storage_account(self, name): - # type: (AzureStorageCredentials, str) -> AzureStorageAccount - """Get storage account details - :param AzureStorageCredentials self: this - :param str name: name of storage account to retrieve - :rtype: AzureStorageAccount - :return: storage account details - """ - return self._storage_accounts[name] - - -class AzureStorageAccount(object): - """Azure Storage Account""" - def __init__(self, name, key, endpoint): - # type: (AzureStorageAccount, str, str, str) -> None - """Ctor for AzureStorageAccount - :param str name: name of storage account - :param str key: storage key or sas - :param str endpoint: endpoint - """ - self._append_blob_client = None - self._block_blob_client = None - self._file_client = None - self._page_blob_client = None - self.name = name - self.key = key - self.endpoint = endpoint - self.is_sas = self._key_is_sas(self.key) - # normalize sas keys - if self.is_sas and self.key.startswith('?'): - self.key = self.key[1:] - self._create_clients() - - @staticmethod - def _key_is_sas(key): - # type: (str) -> bool - """Determine if key is a sas - :param str key: key to parse - :rtype: bool - :return: if key is a sas - """ - # keys starting with ? are sas keys as ? is not in the base-64 - # character range - if key.startswith('?'): - return True - else: - # & is not in the base-64 character range, so technically - # the presence of this character means the key is a sas. however, - # perform a stronger check for the sig= parameter. 
- tmp = key.split('&') - if len(tmp) == 1: - return False - elif any(x.startswith('sig=') for x in tmp): - return True - return False - - def _create_clients(self): - # type: (AzureStorageAccount) -> None - """Create Azure Storage clients - :param AzureStorageAccount self: this - """ - self._append_blob_client = create_append_blob_client(self) - self._block_blob_client = create_block_blob_client(self) - self._file_client = create_file_client(self) - self._page_blob_client = create_page_blob_client(self) - - @property - def append_blob_client(self): - # type: (AzureStorageAccount) -> azure.storage.blob.AppendBlobService - """Get append blob client - :param AzureStorageAccount self: this - :rtype: azure.storage.blob.AppendBlobService - :return: append blob client - """ - return self._append_blob_client - - @property - def block_blob_client(self): - # type: (AzureStorageAccount) -> azure.storage.blob.BlockBlobService - """Get block blob client - :param AzureStorageAccount self: this - :rtype: azure.storage.blob.BlockBlobService - :return: block blob client - """ - return self._block_blob_client - - @property - def file_client(self): - # type: (AzureStorageAccount) -> azure.storage.file.FileService - """Get file client - :param AzureStorageAccount self: this - :rtype: azure.storage.file.FileService - :return: file client - """ - return self._file_client - - @property - def page_blob_client(self): - # type: (AzureStorageAccount) -> azure.storage.blob.PageBlobService - """Get page blob client - :param AzureStorageAccount self: this - :rtype: azure.storage.blob.PageBlobService - :return: page blob client - """ - return self._page_blob_client - - -class _BaseSourcePaths(object): - """Base Source Paths""" - def __init__(self): - # type: (_BaseSourcePaths) -> None - """Ctor for _BaseSourcePaths - :param _BaseSourcePaths self: this - """ - self._include = None - self._exclude = None - self._paths = [] - - @property - def paths(self): - # type: (_BaseSourcePaths) -> List[pathlib.Path] - """Stored paths - :param _BaseSourcePaths self: this - :rtype: list - :return: list of pathlib.Path - """ - return self._paths - - def add_include(self, incl): - # type: (_BaseSourcePaths, str) -> None - """Add an include - :param _BaseSourcePaths self: this - :param str incl: include filter - """ - if self._include is None: - self._include = [incl] - else: - self._include.append(incl) - - def add_includes(self, includes): - # type: (_BaseSourcePaths, list) -> None - """Add a list of includes - :param _BaseSourcePaths self: this - :param list includes: list of includes - """ - if not isinstance(includes, list): - raise ValueError('includes is not of type list') - if self._include is None: - self._include = includes - else: - self._include.extend(includes) - - def add_exclude(self, excl): - # type: (_BaseSourcePaths, str) -> None - """Add an exclude - :param _BaseSourcePaths self: this - :param str excl: exclude filter - """ - if self._exclude is None: - self._exclude = [excl] - else: - self._exclude.append(excl) - - def add_excludes(self, excludes): - # type: (_BaseSourcePaths, list) -> None - """Add a list of excludes - :param _BaseSourcePaths self: this - :param list excludes: list of excludes - """ - if not isinstance(excludes, list): - raise ValueError('excludes is not of type list') - if self._exclude is None: - self._exclude = excludes - else: - self._exclude.extend(excludes) - - def add_path(self, path): - # type: (_BaseSourcePaths, str) -> None - """Add a local path - :param _BaseSourcePaths self: this - :param 
str path: path to add - """ - if isinstance(path, pathlib.Path): - self._paths.append(path) - else: - self._paths.append(pathlib.Path(path)) - - def add_paths(self, paths): - # type: (_BaseSourcePaths, list) -> None - """Add a list of local paths - :param _BaseSourcePaths self: this - :param list paths: paths to add - """ - for path in paths: - self.add_path(path) - - def _inclusion_check(self, path): - # type: (_BaseSourcePaths, pathlib.Path) -> bool - """Check file for inclusion against filters - :param _BaseSourcePaths self: this - :param pathlib.Path path: path to check - :rtype: bool - :return: if file should be included - """ - _spath = str(path) - inc = True - if self._include is not None: - inc = any([fnmatch.fnmatch(_spath, x) for x in self._include]) - if inc and self._exclude is not None: - inc = not any([fnmatch.fnmatch(_spath, x) for x in self._exclude]) - return inc - - -class LocalSourcePaths(_BaseSourcePaths): - """Local Source Paths""" - def files(self): - # type: (LocalSourcePaths) -> LocalPath - """Generator for files in paths - :param LocalSourcePaths self: this - :rtype: LocalPath - :return: LocalPath - """ - for _path in self._paths: - _ppath = os.path.expandvars(os.path.expanduser(str(_path))) - _expath = pathlib.Path(_ppath) - for entry in blobxfer.util.scantree(_ppath): - _rpath = pathlib.Path(entry.path).relative_to(_ppath) - if not self._inclusion_check(_rpath): - logger.debug( - 'skipping file {} due to filters'.format(_rpath)) - continue - yield LocalPath(parent_path=_expath, relative_path=_rpath) - - -class LocalDestinationPath(object): - """Local Destination Path""" - def __init__(self, path=None): - # type: (LocalDestinationPath, str) -> None - """Ctor for LocalDestinationPath - :param LocalDestinationPath self: this - :param str path: path - """ - self._is_dir = None - if path is not None: - self.path = path - - @property - def path(self): - # type: (LocalDestinationPath) -> pathlib.Path - """Path property - :param LocalDestinationPath self: this - :rtype: pathlib.Path - :return: local destination path - """ - return self._path - - @path.setter - def path(self, value): - # type: (LocalDestinationPath, str) -> None - """Path property setter - :param LocalDestinationPath self: this - :param str value: value to set path to - """ - self._path = pathlib.Path(value) - - @property - def is_dir(self): - # type: (LocalDestinationPath) -> bool - """is_dir property - :param LocalDestinationPath self: this - :rtype: bool - :return: if local destination path is a directory - """ - return self._is_dir - - @is_dir.setter - def is_dir(self, value): - # type: (LocalDestinationPath, bool) -> None - """is_dir property setter - :param LocalDestinationPath self: this - :param bool value: value to set is_dir to - """ - self._is_dir = value - - def ensure_path_exists(self): - # type: (LocalDestinationPath) -> None - """Ensure path exists - :param LocalDestinationPath self: this - """ - if self._is_dir is None: - raise RuntimeError('is_dir not set') - if self._is_dir: - self._path.mkdir(mode=0o750, parents=True, exist_ok=True) - else: - if self._path.exists() and self._path.is_dir(): - raise RuntimeError( - ('destination path {} already exists and is a ' - 'directory').format(self._path)) - else: - # ensure parent path exists and is created - self._path.parent.mkdir( - mode=0o750, parents=True, exist_ok=True) - - -class DownloadSpecification(object): - """DownloadSpecification""" - def __init__( - self, download_options, skip_on_options, local_destination_path): - # type: 
(DownloadSpecification, DownloadOptions, SkipOnOptions, - # LocalDestinationPath) -> None - """Ctor for DownloadSpecification - :param DownloadSepcification self: this - :param DownloadOptions download_options: download options - :param SkipOnOptions skip_on_options: skip on options - :param LocalDestinationPath local_destination_path: local dest path - """ - self.options = download_options - self.skip_on = skip_on_options - self.destination = local_destination_path - self.sources = [] - - def add_azure_source_path(self, source): - # type: (DownloadSpecification, AzureSourcePath) -> None - """Add an Azure Source Path - :param DownloadSepcification self: this - :param AzureSourcePath source: Azure source path to add - """ - self.sources.append(source) - - -class AzureSourcePath(_BaseSourcePaths): - """AzureSourcePath""" - def __init__(self): - # type: (AzureSourcePath) -> None - """Ctor for AzureSourcePath - :param AzureSourcePath self: this - """ - super(AzureSourcePath, self).__init__() - self._path_map = {} - - def add_path_with_storage_account(self, remote_path, storage_account): - # type: (AzureSourcePath, str, str) -> None - """Add a path with an associated storage account - :param AzureSourcePath self: this - :param str remote_path: remote path - :param str storage_account: storage account to associate with path - """ - if len(self._path_map) >= 1: - raise RuntimeError( - 'cannot add multiple remote paths to AzureSourcePath objects') - rpath = blobxfer.util.normalize_azure_path(remote_path) - self.add_path(rpath) - self._path_map[rpath] = storage_account - - def lookup_storage_account(self, remote_path): - # type: (AzureSourcePath, str) -> str - """Lookup the storage account associated with the remote path - :param AzureSourcePath self: this - :param str remote_path: remote path - :rtype: str - :return: storage account associated with path - """ - return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] - - def files(self, creds, options, general_options): - # type: (AzureSourcePath, AzureStorageCredentials, DownloadOptions, - # GeneralOptions) -> AzureStorageEntity - """Generator of Azure remote files or blobs - :param AzureSourcePath self: this - :param AzureStorageCredentials creds: storage creds - :param DownloadOptions options: download options - :param GeneralOptions general_options: general options - :rtype: AzureStorageEntity - :return: Azure storage entity object - """ - if options.mode == AzureStorageModes.File: - for file in self._populate_from_list_files( - creds, options, general_options): - yield file - else: - for blob in self._populate_from_list_blobs( - creds, options, general_options): - yield blob - - def _populate_from_list_files(self, creds, options, general_options): - # type: (AzureSourcePath, AzureStorageCredentials, DownloadOptions, - # GeneralOptions) -> AzureStorageEntity - """Internal generator for Azure remote files - :param AzureSourcePath self: this - :param AzureStorageCredentials creds: storage creds - :param DownloadOptions options: download options - :param GeneralOptions general_options: general options - :rtype: AzureStorageEntity - :return: Azure storage entity object - """ - for _path in self._paths: - rpath = str(_path) - cont, dir = blobxfer.util.explode_azure_path(rpath) - sa = creds.get_storage_account(self.lookup_storage_account(rpath)) - for file in blobxfer.file.operations.list_files( - sa.file_client, cont, dir, general_options.timeout_sec): - if blobxfer.crypto.models.EncryptionMetadata.\ - 
encryption_metadata_exists(file.metadata): - ed = blobxfer.crypto.models.EncryptionMetadata() - ed.convert_from_json( - file.metadata, file.name, options.rsa_private_key) - else: - ed = None - ase = AzureStorageEntity(cont, ed) - ase.populate_from_file(sa, file) - yield ase - - def _populate_from_list_blobs(self, creds, options, general_options): - # type: (AzureSourcePath, AzureStorageCredentials, DownloadOptions, - # GeneralOptions) -> AzureStorageEntity - """Internal generator for Azure remote blobs - :param AzureSourcePath self: this - :param AzureStorageCredentials creds: storage creds - :param DownloadOptions options: download options - :param GeneralOptions general_options: general options - :rtype: AzureStorageEntity - :return: Azure storage entity object - """ - for _path in self._paths: - rpath = str(_path) - cont, dir = blobxfer.util.explode_azure_path(rpath) - sa = creds.get_storage_account(self.lookup_storage_account(rpath)) - for blob in blobxfer.blob.operations.list_blobs( - sa.block_blob_client, cont, dir, options.mode, - general_options.timeout_sec): - if blobxfer.crypto.models.EncryptionMetadata.\ - encryption_metadata_exists(blob.metadata): - ed = blobxfer.crypto.models.EncryptionMetadata() - ed.convert_from_json( - blob.metadata, blob.name, options.rsa_private_key) - else: - ed = None - ase = AzureStorageEntity(cont, ed) - ase.populate_from_blob(sa, blob) - yield ase - - -class AzureStorageEntity(object): - """Azure Storage Entity""" - def __init__(self, container, ed=None): - # type: (AzureStorageEntity, str - # blobxfer.crypto.models.EncryptionMetadata) -> None - """Ctor for AzureStorageEntity - :param AzureStorageEntity self: this - :param str container: container name - :param blobxfer.crypto.models.EncryptionMetadata ed: - encryption metadata - """ - self._client = None - self._container = container - self._name = None - self._mode = None - self._lmt = None - self._size = None - self._snapshot = None - self._md5 = None - self._encryption = ed - self._vio = None - self.download = None - - @property - def client(self): - # type: (AzureStorageEntity) -> object - """Associated storage client - :param AzureStorageEntity self: this - :rtype: object - :return: associated storage client - """ - return self._client - - @property - def container(self): - # type: (AzureStorageEntity) -> str - """Container name - :param AzureStorageEntity self: this - :rtype: str - :return: name of container or file share - """ - return self._container - - @property - def name(self): - # type: (AzureStorageEntity) -> str - """Entity name - :param AzureStorageEntity self: this - :rtype: str - :return: name of entity - """ - return self._name - - @property - def lmt(self): - # type: (AzureStorageEntity) -> datetime.datetime - """Entity last modified time - :param AzureStorageEntity self: this - :rtype: datetime.datetime - :return: LMT of entity - """ - return self._lmt - - @property - def size(self): - # type: (AzureStorageEntity) -> int - """Entity size - :param AzureStorageEntity self: this - :rtype: int - :return: size of entity - """ - return self._size - - @property - def snapshot(self): - # type: (AzureStorageEntity) -> str - """Entity snapshot - :param AzureStorageEntity self: this - :rtype: str - :return: snapshot of entity - """ - return self._snapshot - - @property - def md5(self): - # type: (AzureStorageEntity) -> str - """Base64-encoded MD5 - :param AzureStorageEntity self: this - :rtype: str - :return: md5 of entity - """ - return self._md5 - - @property - def mode(self): - # type: 
(AzureStorageEntity) -> AzureStorageModes - """Entity mode (type) - :param AzureStorageEntity self: this - :rtype: AzureStorageModes - :return: type of entity - """ - return self._mode - - @property - def is_encrypted(self): - # type: (AzureStorageEntity) -> bool - """If data is encrypted - :param AzureStorageEntity self: this - :rtype: bool - :return: if encryption metadata is present - """ - return self._encryption is not None - - @property - def encryption_metadata(self): - # type: (AzureStorageEntity) -> - # blobxfer.crypto.models.EncryptionMetadata - """Entity metadata (type) - :param AzureStorageEntity self: this - :rtype: blobxfer.crypto.models.EncryptionMetadata - :return: encryption metadata of entity - """ - return self._encryption - - def populate_from_blob(self, sa, blob): - # type: (AzureStorageEntity, AzureStorageAccount, - # azure.storage.blob.models.Blob) -> None - """Populate properties from Blob - :param AzureStorageEntity self: this - :param AzureStorageAccount sa: storage account - :param azure.storage.blob.models.Blob blob: blob to populate from - """ - self._name = blob.name - self._snapshot = blob.snapshot - self._lmt = blob.properties.last_modified - self._size = blob.properties.content_length - self._md5 = blob.properties.content_settings.content_md5 - if blob.properties.blob_type == BlobTypes.AppendBlob: - self._mode = AzureStorageModes.Append - self._client = sa.append_blob_client - elif blob.properties.blob_type == BlobTypes.BlockBlob: - self._mode = AzureStorageModes.Block - self._client = sa.block_blob_client - elif blob.properties.blob_type == BlobTypes.PageBlob: - self._mode = AzureStorageModes.Page - self._client = sa.page_blob_client - - def populate_from_file(self, sa, file): - # type: (AzureStorageEntity, AzureStorageAccount, - # azure.storage.file.models.File) -> None - """Populate properties from File - :param AzureStorageEntity self: this - :param AzureStorageAccount sa: storage account - :param azure.storage.file.models.File file: file to populate from - """ - self._name = file.name - self._snapshot = None - self._lmt = file.properties.last_modified - self._size = file.properties.content_length - self._md5 = file.properties.content_settings.content_md5 - self._mode = AzureStorageModes.File - self._client = sa.file_client - - -class AzureDestinationPaths(object): - def __init__(self): - pass - - -class FileDescriptor(object): - def __init__(self, filepath): - if filepath == '-': - self.stdin = True - self.path = None - else: - self.stdin = False - self.path = pathlib.Path(filepath) - self.size = None - self.hmac = None - self.md5 = None - self.bytes_xferred = 0 - - -class ReadFileDescriptor(FileDescriptor): - def __init__(self, filepath): - super().__init__(filepath) - - -class WriteFileDescriptor(FileDescriptor): - def __init__(self, filepath): - super().__init__(filepath) diff --git a/blobxfer/models/__init__.py b/blobxfer/models/__init__.py new file mode 100644 index 0000000..82ac224 --- /dev/null +++ b/blobxfer/models/__init__.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import fnmatch +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# local imports + + +class _BaseSourcePaths(object): + """Base Source Paths""" + def __init__(self): + # type: (_BaseSourcePaths) -> None + """Ctor for _BaseSourcePaths + :param _BaseSourcePaths self: this + """ + self._include = None + self._exclude = None + self._paths = [] + + @property + def paths(self): + # type: (_BaseSourcePaths) -> List[pathlib.Path] + """Stored paths + :param _BaseSourcePaths self: this + :rtype: list + :return: list of pathlib.Path + """ + return self._paths + + def add_include(self, incl): + # type: (_BaseSourcePaths, str) -> None + """Add an include + :param _BaseSourcePaths self: this + :param str incl: include filter + """ + if self._include is None: + self._include = [incl] + else: + self._include.append(incl) + + def add_includes(self, includes): + # type: (_BaseSourcePaths, list) -> None + """Add a list of includes + :param _BaseSourcePaths self: this + :param list includes: list of includes + """ + if not isinstance(includes, list): + raise ValueError('includes is not of type list') + if self._include is None: + self._include = includes + else: + self._include.extend(includes) + + def add_exclude(self, excl): + # type: (_BaseSourcePaths, str) -> None + """Add an exclude + :param _BaseSourcePaths self: this + :param str excl: exclude filter + """ + if self._exclude is None: + self._exclude = [excl] + else: + self._exclude.append(excl) + + def add_excludes(self, excludes): + # type: (_BaseSourcePaths, list) -> None + """Add a list of excludes + :param _BaseSourcePaths self: this + :param list excludes: list of excludes + """ + if not isinstance(excludes, list): + raise ValueError('excludes is not of type list') + if self._exclude is None: + self._exclude = excludes + else: + self._exclude.extend(excludes) + + def add_path(self, path): + # type: (_BaseSourcePaths, str) -> None + """Add a local path + :param _BaseSourcePaths self: this + :param str path: path to add + """ + if isinstance(path, pathlib.Path): + self._paths.append(path) + else: + self._paths.append(pathlib.Path(path)) + + def 
add_paths(self, paths): + # type: (_BaseSourcePaths, list) -> None + """Add a list of local paths + :param _BaseSourcePaths self: this + :param list paths: paths to add + """ + for path in paths: + self.add_path(path) + + def _inclusion_check(self, path): + # type: (_BaseSourcePaths, pathlib.Path) -> bool + """Check file for inclusion against filters + :param _BaseSourcePaths self: this + :param pathlib.Path path: path to check + :rtype: bool + :return: if file should be included + """ + _spath = str(path) + inc = True + if self._include is not None: + inc = any([fnmatch.fnmatch(_spath, x) for x in self._include]) + if inc and self._exclude is not None: + inc = not any([fnmatch.fnmatch(_spath, x) for x in self._exclude]) + return inc diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py new file mode 100644 index 0000000..57d1f38 --- /dev/null +++ b/blobxfer/models/azure.py @@ -0,0 +1,460 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
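The blobxfer.models.azure module added below consolidates the storage credential and account models. A rough usage sketch follows (the account name, SAS token, and endpoint are placeholders, and constructing a StorageAccount eagerly creates the azure-storage service clients, so that SDK must be importable):

    import blobxfer.models.azure as azmodels

    creds = azmodels.StorageCredentials()
    # a key starting with '?' or carrying a 'sig=' parameter is detected as a SAS
    creds.add_storage_account('mystorageacct', '?sv=...&sig=...', 'core.windows.net')
    sa = creds.get_storage_account('mystorageacct')
    assert sa.is_sas                     # True for the token above; leading '?' is stripped
    blob_client = sa.block_blob_client   # BlockBlobService created during construction
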
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import enum +# non-stdlib imports +from azure.storage.blob.models import _BlobTypes as BlobTypes +# local imports +import blobxfer.models +import blobxfer.operations.azure.blob +import blobxfer.operations.azure.blob.append +import blobxfer.operations.azure.blob.block +import blobxfer.operations.azure.blob.page +import blobxfer.operations.azure.file + + +# enums +class StorageModes(enum.Enum): + Auto = 10 + Append = 20 + Block = 30 + File = 40 + Page = 50 + + +class StorageCredentials(object): + """Azure Storage Credentials""" + def __init__(self): + # type: (StorageCredentials) -> None + """Ctor for StorageCredentials""" + self._storage_accounts = {} + + def add_storage_account(self, name, key, endpoint): + # type: (StorageCredentials, str, str, str) -> None + """Add a storage account + :param StorageCredentials self: this + :param str name: name of storage account to store + :param str key: storage key or sas + :param str endpoint: endpoint + """ + if name in self._storage_accounts: + raise ValueError( + '{} already exists in storage accounts'.format(name)) + self._storage_accounts[name] = StorageAccount(name, key, endpoint) + + def get_storage_account(self, name): + # type: (StorageCredentials, str) -> StorageAccount + """Get storage account details + :param StorageCredentials self: this + :param str name: name of storage account to retrieve + :rtype: StorageAccount + :return: storage account details + """ + return self._storage_accounts[name] + + +class StorageAccount(object): + """Azure Storage Account""" + def __init__(self, name, key, endpoint): + # type: (StorageAccount, str, str, str) -> None + """Ctor for StorageAccount + :param str name: name of storage account + :param str key: storage key or sas + :param str endpoint: endpoint + """ + self._append_blob_client = None + self._block_blob_client = None + self._file_client = None + self._page_blob_client = None + self.name = name + self.key = key + self.endpoint = endpoint + self.is_sas = self._key_is_sas(self.key) + # normalize sas keys + if self.is_sas and self.key.startswith('?'): + self.key = self.key[1:] + self._create_clients() + + @staticmethod + def _key_is_sas(key): + # type: (str) -> bool + """Determine if key is a sas + :param str key: key to parse + :rtype: bool + :return: if key is a sas + """ + # keys starting with ? are sas keys as ? is not in the base-64 + # character range + if key.startswith('?'): + return True + else: + # & is not in the base-64 character range, so technically + # the presence of this character means the key is a sas. however, + # perform a stronger check for the sig= parameter. 
+ tmp = key.split('&') + if len(tmp) == 1: + return False + elif any(x.startswith('sig=') for x in tmp): + return True + return False + + def _create_clients(self): + # type: (StorageAccount) -> None + """Create Azure Storage clients + :param StorageAccount self: this + """ + self._append_blob_client = \ + blobxfer.operations.azure.blob.append.create_client(self) + self._block_blob_client = \ + blobxfer.operations.azure.blob.block.create_client(self) + self._file_client = blobxfer.operations.azure.file.create_client(self) + self._page_blob_client = \ + blobxfer.operations.azure.blob.page.create_client(self) + + @property + def append_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.AppendBlobService + """Get append blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.AppendBlobService + :return: append blob client + """ + return self._append_blob_client + + @property + def block_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.BlockBlobService + """Get block blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.BlockBlobService + :return: block blob client + """ + return self._block_blob_client + + @property + def file_client(self): + # type: (StorageAccount) -> azure.storage.file.FileService + """Get file client + :param StorageAccount self: this + :rtype: azure.storage.file.FileService + :return: file client + """ + return self._file_client + + @property + def page_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.PageBlobService + """Get page blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.PageBlobService + :return: page blob client + """ + return self._page_blob_client + + +class StorageEntity(object): + """Azure Storage Entity""" + def __init__(self, container, ed=None): + # type: (StorageEntity, str + # blobxfer.models.crypto.EncryptionMetadata) -> None + """Ctor for StorageEntity + :param StorageEntity self: this + :param str container: container name + :param blobxfer.models.crypto.EncryptionMetadata ed: + encryption metadata + """ + self._client = None + self._container = container + self._name = None + self._mode = None + self._lmt = None + self._size = None + self._snapshot = None + self._md5 = None + self._encryption = ed + self._vio = None + self.download = None + + @property + def client(self): + # type: (StorageEntity) -> object + """Associated storage client + :param StorageEntity self: this + :rtype: object + :return: associated storage client + """ + return self._client + + @property + def container(self): + # type: (StorageEntity) -> str + """Container name + :param StorageEntity self: this + :rtype: str + :return: name of container or file share + """ + return self._container + + @property + def name(self): + # type: (StorageEntity) -> str + """Entity name + :param StorageEntity self: this + :rtype: str + :return: name of entity + """ + return self._name + + @property + def lmt(self): + # type: (StorageEntity) -> datetime.datetime + """Entity last modified time + :param StorageEntity self: this + :rtype: datetime.datetime + :return: LMT of entity + """ + return self._lmt + + @property + def size(self): + # type: (StorageEntity) -> int + """Entity size + :param StorageEntity self: this + :rtype: int + :return: size of entity + """ + return self._size + + @property + def snapshot(self): + # type: (StorageEntity) -> str + """Entity snapshot + :param StorageEntity self: this + :rtype: str + :return: snapshot of entity + """ + return self._snapshot 
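        # Entities are normally produced by the SourcePath generators later in this
        # module rather than built by hand: populate_from_blob() maps the service
        # blob type to a StorageModes value and selects the matching client from the
        # owning StorageAccount. A minimal sketch, assuming 'sa' is a StorageAccount
        # and 'blob' came from a blob listing:
        #
        #   ase = StorageEntity('mycontainer')
        #   ase.populate_from_blob(sa, blob)   # e.g. AppendBlob -> StorageModes.Append
        #   print(ase.name, ase.size, ase.md5, ase.mode)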
+ + @property + def md5(self): + # type: (StorageEntity) -> str + """Base64-encoded MD5 + :param StorageEntity self: this + :rtype: str + :return: md5 of entity + """ + return self._md5 + + @property + def mode(self): + # type: (StorageEntity) -> blobxfer.models.azure.StorageModes + """Entity mode (type) + :param StorageEntity self: this + :rtype: blobxfer.models.azure.StorageModes + :return: type of entity + """ + return self._mode + + @property + def is_encrypted(self): + # type: (StorageEntity) -> bool + """If data is encrypted + :param StorageEntity self: this + :rtype: bool + :return: if encryption metadata is present + """ + return self._encryption is not None + + @property + def encryption_metadata(self): + # type: (StorageEntity) -> + # blobxfer.models.crypto.EncryptionMetadata + """Entity metadata (type) + :param StorageEntity self: this + :rtype: blobxfer.models.crypto.EncryptionMetadata + :return: encryption metadata of entity + """ + return self._encryption + + def populate_from_blob(self, sa, blob): + # type: (StorageEntity, blobxfer.models.azure.StorageAccount, + # azure.storage.blob.models.Blob) -> None + """Populate properties from Blob + :param StorageEntity self: this + :param blobxfer.models.azure.StorageAccount sa: storage account + :param azure.storage.blob.models.Blob blob: blob to populate from + """ + self._name = blob.name + self._snapshot = blob.snapshot + self._lmt = blob.properties.last_modified + self._size = blob.properties.content_length + self._md5 = blob.properties.content_settings.content_md5 + if blob.properties.blob_type == BlobTypes.AppendBlob: + self._mode = StorageModes.Append + self._client = sa.append_blob_client + elif blob.properties.blob_type == BlobTypes.BlockBlob: + self._mode = StorageModes.Block + self._client = sa.block_blob_client + elif blob.properties.blob_type == BlobTypes.PageBlob: + self._mode = StorageModes.Page + self._client = sa.page_blob_client + + def populate_from_file(self, sa, file): + # type: (StorageEntity, blobxfer.models.azure.StorageAccount, + # azure.storage.file.models.File) -> None + """Populate properties from File + :param StorageEntity self: this + :param blobxfer.models.azure.StorageAccount sa: storage account + :param azure.storage.file.models.File file: file to populate from + """ + self._name = file.name + self._snapshot = None + self._lmt = file.properties.last_modified + self._size = file.properties.content_length + self._md5 = file.properties.content_settings.content_md5 + self._mode = StorageModes.File + self._client = sa.file_client + + +class SourcePath(blobxfer.models._BaseSourcePaths): + """Azure Source Path""" + def __init__(self): + # type: (SourcePath) -> None + """Ctor for SourcePath + :param SourcePath self: this + """ + super(SourcePath, self).__init__() + self._path_map = {} + + def add_path_with_storage_account(self, remote_path, storage_account): + # type: (SourcePath, str, str) -> None + """Add a path with an associated storage account + :param SourcePath self: this + :param str remote_path: remote path + :param str storage_account: storage account to associate with path + """ + if len(self._path_map) >= 1: + raise RuntimeError( + 'cannot add multiple remote paths to SourcePath objects') + rpath = blobxfer.util.normalize_azure_path(remote_path) + self.add_path(rpath) + self._path_map[rpath] = storage_account + + def lookup_storage_account(self, remote_path): + # type: (SourcePath, str) -> str + """Lookup the storage account associated with the remote path + :param SourcePath self: this + :param 
str remote_path: remote path + :rtype: str + :return: storage account associated with path + """ + return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] + + def files(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Generator of Azure remote files or blobs + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + if options.mode == blobxfer.models.azure.StorageModes.File: + for file in self._populate_from_list_files( + creds, options, general_options): + yield file + else: + for blob in self._populate_from_list_blobs( + creds, options, general_options): + yield blob + + def _populate_from_list_files(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Internal generator for Azure remote files + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for file in blobxfer.operations.azure.file.list_files( + sa.file_client, cont, dir, general_options.timeout_sec): + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(file.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json( + file.metadata, file.name, options.rsa_private_key) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + ase.populate_from_file(sa, file) + yield ase + + def _populate_from_list_blobs(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Internal generator for Azure remote blobs + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for blob in blobxfer.operations.azure.blob.list_blobs( + sa.block_blob_client, cont, dir, options.mode, + general_options.timeout_sec): + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(blob.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json( + blob.metadata, blob.name, options.rsa_private_key) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + ase.populate_from_blob(sa, blob) + yield ase diff --git a/blobxfer/crypto/models.py b/blobxfer/models/crypto.py similarity index 84% rename from blobxfer/crypto/models.py rename to blobxfer/models/crypto.py index e08f6a7..904da80 100644 --- 
a/blobxfer/crypto/models.py +++ b/blobxfer/models/crypto.py @@ -32,12 +32,18 @@ # stdlib imports import base64 import collections +import enum import hashlib import hmac import json +try: + import queue +except ImportError: # noqa + import Queue as queue # non-stdlib imports # local imports -import blobxfer.crypto.operations +import blobxfer.models.offload +import blobxfer.operations.crypto import blobxfer.util # encryption constants @@ -246,13 +252,13 @@ def convert_from_json(self, md, blobname, rsaprivatekey): if rsaprivatekey is None: return # decrypt symmetric key - self._symkey = blobxfer.crypto.operations.\ + self._symkey = blobxfer.operations.crypto.\ rsa_decrypt_base64_encoded_key( rsaprivatekey, self.wrapped_content_key.encrypted_key) # decrypt signing key, if it exists if blobxfer.util.is_not_empty( self.wrapped_content_key.encrypted_authentication_key): - self._signkey = blobxfer.crypto.operations.\ + self._signkey = blobxfer.operations.crypto.\ rsa_decrypt_base64_encoded_key( rsaprivatekey, self.wrapped_content_key.encrypted_authentication_key) @@ -310,3 +316,59 @@ def initialize_hmac(self): return hmac.new(self._signkey, digestmod=hashlib.sha256) else: return None + + +class CryptoAction(enum.Enum): + Encrypt = 1 + Decrypt = 2 + + +class CryptoOffload(blobxfer.models.offload._MultiprocessOffload): + def __init__(self, num_workers): + # type: (CryptoOffload, int) -> None + """Ctor for Crypto Offload + :param CryptoOffload self: this + :param int num_workers: number of worker processes + """ + super(CryptoOffload, self).__init__( + self._worker_process, num_workers, 'Crypto') + + def _worker_process(self): + # type: (CryptoOffload) -> None + """Crypto worker + :param CryptoOffload self: this + """ + while not self.terminated: + try: + inst = self._task_queue.get(True, 1) + except queue.Empty: + continue + if inst[0] == CryptoAction.Encrypt: + # TODO on upload + raise NotImplementedError() + elif inst[0] == CryptoAction.Decrypt: + final_path, offsets, symkey, iv, encdata = \ + inst[1], inst[2], inst[3], inst[4], inst[5] + data = blobxfer.operations.crypto.aes_cbc_decrypt_data( + symkey, iv, encdata, offsets.unpad) + self._done_cv.acquire() + self._done_queue.put((final_path, offsets, data)) + self._done_cv.notify() + self._done_cv.release() + + def add_decrypt_chunk( + self, final_path, offsets, symkey, iv, encdata): + # type: (CryptoOffload, str, blobxfer.models.download.Offsets, + # bytes, bytes, bytes) -> None + """Add a chunk to decrypt + :param CryptoOffload self: this + :param str final_path: final path + :param blobxfer.models.download.Offsets offsets: offsets + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param bytes encdata: encrypted data + """ + self._task_queue.put( + (CryptoAction.Decrypt, final_path, offsets, symkey, iv, + encdata) + ) diff --git a/blobxfer/download/models.py b/blobxfer/models/download.py similarity index 69% rename from blobxfer/download/models.py rename to blobxfer/models/download.py index 7fb4fe8..d4d8f06 100644 --- a/blobxfer/download/models.py +++ b/blobxfer/models/download.py @@ -42,17 +42,16 @@ import threading # non-stdlib imports # local imports -import blobxfer.blob.operations -import blobxfer.file.operations -import blobxfer.crypto.models +import blobxfer.models.options +import blobxfer.models.crypto import blobxfer.util # create logger logger = logging.getLogger(__name__) # named tuples -DownloadOffsets = collections.namedtuple( - 'DownloadOffsets', [ +Offsets = collections.namedtuple( + 'Offsets', [ 
'chunk_num', 'fd_start', 'num_bytes', @@ -71,19 +70,117 @@ ) -class DownloadDescriptor(object): +class LocalDestinationPath(object): + """Local Destination Path""" + def __init__(self, path=None): + # type: (LocalDestinationPath, str) -> None + """Ctor for LocalDestinationPath + :param LocalDestinationPath self: this + :param str path: path + """ + self._is_dir = None + if path is not None: + self.path = path + + @property + def path(self): + # type: (LocalDestinationPath) -> pathlib.Path + """Path property + :param LocalDestinationPath self: this + :rtype: pathlib.Path + :return: local destination path + """ + return self._path + + @path.setter + def path(self, value): + # type: (LocalDestinationPath, str) -> None + """Path property setter + :param LocalDestinationPath self: this + :param str value: value to set path to + """ + self._path = pathlib.Path(value) + + @property + def is_dir(self): + # type: (LocalDestinationPath) -> bool + """is_dir property + :param LocalDestinationPath self: this + :rtype: bool + :return: if local destination path is a directory + """ + return self._is_dir + + @is_dir.setter + def is_dir(self, value): + # type: (LocalDestinationPath, bool) -> None + """is_dir property setter + :param LocalDestinationPath self: this + :param bool value: value to set is_dir to + """ + self._is_dir = value + + def ensure_path_exists(self): + # type: (LocalDestinationPath) -> None + """Ensure path exists + :param LocalDestinationPath self: this + """ + if self._is_dir is None: + raise RuntimeError('is_dir not set') + if self._is_dir: + self._path.mkdir(mode=0o750, parents=True, exist_ok=True) + else: + if self._path.exists() and self._path.is_dir(): + raise RuntimeError( + ('destination path {} already exists and is a ' + 'directory').format(self._path)) + else: + # ensure parent path exists and is created + self._path.parent.mkdir( + mode=0o750, parents=True, exist_ok=True) + + +class Specification(object): + """Download Specification""" + def __init__( + self, download_options, skip_on_options, local_destination_path): + # type: (Specification, blobxfer.models.options.Download, + # blobxfer.models.options.SkipOn, LocalDestinationPath) -> None + """Ctor for Specification + :param DownloadSepcification self: this + :param blobxfer.models.options.Download download_options: + download options + :param blobxfer.models.options.SkipOn skip_on_options: skip on options + :param LocalDestinationPath local_destination_path: local dest path + """ + self.options = download_options + self.skip_on = skip_on_options + self.destination = local_destination_path + self.sources = [] + + def add_azure_source_path(self, source): + # type: (Specification, AzureSourcePath) -> None + """Add an Azure Source Path + :param DownloadSepcification self: this + :param AzureSourcePath source: Azure source path to add + """ + self.sources.append(source) + + +class Descriptor(object): """Download Descriptor""" - _AES_BLOCKSIZE = blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES + _AES_BLOCKSIZE = blobxfer.models.crypto._AES256_BLOCKSIZE_BYTES def __init__(self, lpath, ase, options): - # type: (DownloadDescriptior, pathlib.Path, AzureStorageEntity, - # DownloadOptions) -> None - """Ctor for DownloadDescriptor - :param DownloadDescriptor self: this + # type: (DownloadDescriptior, pathlib.Path, + # blobxfer.models.azure.StorageEntity, + # blobxfer.models.options.Download) -> None + """Ctor for Descriptor + :param Descriptor self: this :param pathlib.Path lpath: local path - :param AzureStorageEntity ase: Azure 
Storage Entity - :param DownloadOptions options: download options + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :param blobxfer.models.options.Download options: download options """ self.final_path = lpath # create path holding the temporary file to download to @@ -114,19 +211,19 @@ def __init__(self, lpath, ase, options): @property def entity(self): - # type: (DownloadDescriptor) -> AzureStorageEntity - """Get linked AzureStorageEntity - :param DownloadDescriptor self: this - :rtype: AzureStorageEntity - :return: AzureStorageEntity + # type: (Descriptor) -> blobxfer.models.azure.StorageEntity + """Get linked blobxfer.models.azure.StorageEntity + :param Descriptor self: this + :rtype: blobxfer.models.azure.StorageEntity + :return: blobxfer.models.azure.StorageEntity """ return self._ase @property def must_compute_md5(self): - # type: (DownloadDescriptor) -> bool + # type: (Descriptor) -> bool """Check if MD5 must be computed - :param DownloadDescriptor self: this + :param Descriptor self: this :rtype: bool :return: if MD5 must be computed """ @@ -134,9 +231,9 @@ def must_compute_md5(self): @property def all_operations_completed(self): - # type: (DownloadDescriptor) -> bool + # type: (Descriptor) -> bool """All operations are completed - :param DownloadDescriptor self: this + :param Descriptor self: this :rtype: bool :return: if all operations completed """ @@ -145,19 +242,19 @@ def all_operations_completed(self): len(self._unchecked_chunks) == 0) def dec_outstanding_operations(self): - # type: (DownloadDescriptor) -> None + # type: (Descriptor) -> None """Decrement outstanding operations (and increment completed ops) - :param DownloadDescriptor self: this + :param Descriptor self: this """ with self._meta_lock: self._outstanding_ops -= 1 self._completed_ops += 1 def _initialize_integrity_checkers(self, options): - # type: (DownloadDescriptor, DownloadOptions) -> None + # type: (Descriptor, blobxfer.models.options.Download) -> None """Initialize file integrity checkers - :param DownloadDescriptor self: this - :param DownloadOptions options: download options + :param Descriptor self: this + :param blobxfer.models.options.Download options: download options """ if self._ase.is_encrypted: # ensure symmetric key exists @@ -171,9 +268,9 @@ def _initialize_integrity_checkers(self, options): self.md5 = blobxfer.util.new_md5_hasher() def _allocate_disk_space(self): - # type: (DownloadDescriptor, int) -> None + # type: (Descriptor, int) -> None """Perform file allocation (possibly sparse) - :param DownloadDescriptor self: this + :param Descriptor self: this :param int size: size """ size = self._ase.size @@ -201,10 +298,10 @@ def _allocate_disk_space(self): fd.write(b'\0') def cleanup_all_temporary_files(self): - # type: (DownloadDescriptor) -> None + # type: (Descriptor) -> None """Cleanup all temporary files in case of an exception or interrupt. This function is not thread-safe. 
- :param DownloadDescriptor self: this + :param Descriptor self: this """ # delete local file try: @@ -221,10 +318,10 @@ def cleanup_all_temporary_files(self): pass def next_offsets(self): - # type: (DownloadDescriptor) -> DownloadOffsets + # type: (Descriptor) -> Offsets """Retrieve the next offsets - :param DownloadDescriptor self: this - :rtype: DownloadOffsets + :param Descriptor self: this + :rtype: Offsets :return: download offsets """ with self._meta_lock: @@ -256,7 +353,7 @@ def next_offsets(self): unpad = True else: unpad = False - return DownloadOffsets( + return Offsets( chunk_num=chunk_num, fd_start=fd_start, num_bytes=chunk, @@ -266,10 +363,10 @@ def next_offsets(self): ) def _postpone_integrity_check(self, offsets, data): - # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + # type: (Descriptor, Offsets, bytes) -> None """Postpone integrity check for chunk - :param DownloadDescriptor self: this - :param DownloadOffsets offsets: download offsets + :param Descriptor self: this + :param Offsets offsets: download offsets :param bytes data: data """ if self.must_compute_md5: @@ -297,10 +394,10 @@ def _postpone_integrity_check(self, offsets, data): self._unchecked_chunks[offsets.chunk_num] = unchecked def perform_chunked_integrity_check(self, offsets, data): - # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + # type: (Descriptor, Offsets, bytes) -> None """Hash data against stored MD5 hasher safely - :param DownloadDescriptor self: this - :param DownloadOffsets offsets: download offsets + :param Descriptor self: this + :param Offsets offsets: download offsets :param bytes data: data """ self_check = False @@ -335,10 +432,10 @@ def perform_chunked_integrity_check(self, offsets, data): self._postpone_integrity_check(offsets, data) def write_data(self, offsets, data): - # type: (DownloadDescriptor, DownloadOffsets, bytes) -> None + # type: (Descriptor, Offsets, bytes) -> None """Postpone integrity check for chunk - :param DownloadDescriptor self: this - :param DownloadOffsets offsets: download offsets + :param Descriptor self: this + :param Offsets offsets: download offsets :param bytes data: data """ with self.local_path.open('r+b') as fd: @@ -346,9 +443,9 @@ def write_data(self, offsets, data): fd.write(data) def finalize_file(self): - # type: (DownloadDescriptor) -> None + # type: (Descriptor) -> None """Finalize file download - :param DownloadDescriptor self: this + :param Descriptor self: this """ # check final file integrity check = False diff --git a/blobxfer/md5.py b/blobxfer/models/md5.py similarity index 64% rename from blobxfer/md5.py rename to blobxfer/models/md5.py index 1c403cc..f8c1d3a 100644 --- a/blobxfer/md5.py +++ b/blobxfer/models/md5.py @@ -36,52 +36,15 @@ import Queue as queue # non-stdlib imports # local imports -import blobxfer.download -import blobxfer.models -import blobxfer.offload -import blobxfer.util +import blobxfer.models.azure +import blobxfer.models.offload +import blobxfer.operations.md5 # create logger logger = logging.getLogger(__name__) -def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): - # type: (str, bool, int) -> str - """Compute MD5 hash for file and encode as Base64 - :param str filename: file to compute MD5 for - :param bool pagealign: page align data - :param int blocksize: block size - :rtype: str - :return: MD5 for file encoded as Base64 - """ - hasher = blobxfer.util.new_md5_hasher() - with open(filename, 'rb') as filedesc: - while True: - buf = filedesc.read(blocksize) - if not buf: - 
break - buflen = len(buf) - if pagealign and buflen < blocksize: - aligned = blobxfer.util.page_align_content_length(buflen) - if aligned != buflen: - buf = buf.ljust(aligned, b'\0') - hasher.update(buf) - return blobxfer.util.base64_encode_as_string(hasher.digest()) - - -def compute_md5_for_data_asbase64(data): - # type: (obj) -> str - """Compute MD5 hash for bits and encode as Base64 - :param any data: data to compute MD5 for - :rtype: str - :return: MD5 for data - """ - hasher = blobxfer.util.new_md5_hasher() - hasher.update(data) - return blobxfer.util.base64_encode_as_string(hasher.digest()) - - -class LocalFileMd5Offload(blobxfer.offload._MultiprocessOffload): +class LocalFileMd5Offload(blobxfer.models.offload._MultiprocessOffload): """LocalFileMd5Offload""" def __init__(self, num_workers): # type: (LocalFileMd5Offload, int) -> None @@ -102,7 +65,8 @@ def _worker_process(self): filename, remote_md5, pagealign = self._task_queue.get(True, 1) except queue.Empty: continue - md5 = compute_md5_for_file_asbase64(filename, pagealign) + md5 = blobxfer.operations.md5.compute_md5_for_file_asbase64( + filename, pagealign) logger.debug('MD5: {} {} {}'.format( md5, remote_md5, filename)) self._done_cv.acquire() @@ -112,14 +76,14 @@ def _worker_process(self): def add_localfile_for_md5_check(self, filename, remote_md5, mode): # type: (LocalFileMd5Offload, str, str, - # blobxfer.models.AzureStorageModes) -> None + # blobxfer.models.azure.StorageModes) -> None """Add a local file to MD5 check queue :param LocalFileMd5Offload self: this :param str filename: file to compute MD5 for :param str remote_md5: remote MD5 to compare against - :param blobxfer.models.AzureStorageModes mode: mode + :param blobxfer.models.azure.StorageModes mode: mode """ - if mode == blobxfer.models.AzureStorageModes.Page: + if mode == blobxfer.models.azure.StorageModes.Page: pagealign = True else: pagealign = False diff --git a/blobxfer/offload.py b/blobxfer/models/offload.py similarity index 100% rename from blobxfer/offload.py rename to blobxfer/models/offload.py diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py new file mode 100644 index 0000000..f7c9f6f --- /dev/null +++ b/blobxfer/models/options.py @@ -0,0 +1,147 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
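The blobxfer.models.options module added below keeps the transfer tunables as named tuples plus the Concurrency and General classes. Concurrency normalizes its arguments, so passing None for everything yields machine-dependent defaults; a minimal sketch:

    from blobxfer.models.options import Concurrency, General

    conc = Concurrency(crypto_processes=None, md5_processes=None, transfer_threads=None)
    # crypto_processes  -> 0 (crypto routines run inline with the main process)
    # md5_processes     -> cpu_count() // 2, but never below 1
    # transfer_threads  -> cpu_count() * 3, capped at 24
    opts = General(concurrency=conc, progress_bar=False, timeout_sec=30)
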
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import logging +import multiprocessing +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# local imports +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + +# named tuples +VectoredIo = collections.namedtuple( + 'VectoredIoOptions', [ + 'stripe_chunk_size_bytes', + 'multi_storage_account_distribution_mode', + ] +) +SkipOn = collections.namedtuple( + 'SkipOn', [ + 'filesize_match', + 'lmt_ge', + 'md5_match', + ] +) +Upload = collections.namedtuple( + 'Upload', [ + 'chunk_size_bytes', + 'delete_extraneous_destination', + 'mode', + 'overwrite', + 'recursive', + 'rsa_private_key', + 'rsa_public_key', + 'store_file_attributes', + 'store_file_md5', + 'strip_components', + 'vectored_io', + 'split_size_bytes', + ] +) +Download = collections.namedtuple( + 'Download', [ + 'check_file_md5', + 'chunk_size_bytes', + 'delete_extraneous_destination', + 'mode', + 'overwrite', + 'recursive', + 'restore_file_attributes', + 'rsa_private_key', + ] +) +SyncCopy = collections.namedtuple( + 'SyncCopy', [ + 'chunk_size_bytes', + 'mode', + 'overwrite', + ] +) + + +class Concurrency(object): + """Concurrency Options""" + def __init__(self, crypto_processes, md5_processes, transfer_threads): + """Ctor for Concurrency Options + :param Concurrency self: this + :param int crypto_processes: number of crypto procs + :param int md5_processes: number of md5 procs + :param int transfer_threads: number of transfer threads + """ + self.crypto_processes = crypto_processes + self.md5_processes = md5_processes + self.transfer_threads = transfer_threads + # allow crypto processes to be zero (which will inline crypto + # routines with main process) + if self.crypto_processes is None or self.crypto_processes < 1: + self.crypto_processes = 0 + if self.md5_processes is None or self.md5_processes < 1: + self.md5_processes = multiprocessing.cpu_count() // 2 + if self.md5_processes < 1: + self.md5_processes = 1 + if self.transfer_threads is None or self.transfer_threads < 1: + self.transfer_threads = multiprocessing.cpu_count() * 3 + # cap maximum number of threads from cpu count to 24 + if self.transfer_threads > 24: + self.transfer_threads = 24 + + +class General(object): + """General Options""" + def __init__( + self, concurrency, progress_bar=True, resume_file=None, + timeout_sec=None, verbose=False): + """Ctor for General Options + :param General self: this + :param Concurrency concurrency: concurrency options + :param bool progress_bar: progress bar + :param str resume_file: resume file + :param int timeout_sec: timeout in seconds + :param bool verbose: verbose output + """ + if concurrency is None: + raise ValueError('concurrency option is unspecified') + self.concurrency = concurrency + self.progress_bar = progress_bar + if blobxfer.util.is_not_empty(resume_file): + self.resume_file = pathlib.Path(resume_file) + else: + self.resume_file = None + self.timeout_sec = timeout_sec + self.verbose = verbose diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py new file mode 100644 index 0000000..607b001 --- /dev/null +++ b/blobxfer/models/upload.py @@ -0,0 +1,75 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import logging +import os +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# local imports +import blobxfer.models +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +LocalPath = collections.namedtuple( + 'LocalPath', [ + 'parent_path', + 'relative_path', + ] +) + + +class LocalSourcePaths(blobxfer.models._BaseSourcePaths): + """Local Source Paths""" + def files(self): + # type: (LocalSourcePaths) -> LocalPath + """Generator for files in paths + :param LocalSourcePaths self: this + :rtype: LocalPath + :return: LocalPath + """ + for _path in self._paths: + _ppath = os.path.expandvars(os.path.expanduser(str(_path))) + _expath = pathlib.Path(_ppath) + for entry in blobxfer.util.scantree(_ppath): + _rpath = pathlib.Path(entry.path).relative_to(_ppath) + if not self._inclusion_check(_rpath): + logger.debug( + 'skipping file {} due to filters'.format(_rpath)) + continue + yield LocalPath(parent_path=_expath, relative_path=_rpath) diff --git a/blobxfer/operations.py b/blobxfer/operations.py deleted file mode 100644 index 82e4024..0000000 --- a/blobxfer/operations.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) Microsoft Corporation -# -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -# compat imports -from __future__ import absolute_import, division, print_function -from builtins import ( # noqa - bytes, dict, int, list, object, range, ascii, chr, hex, input, - next, oct, open, pow, round, super, filter, map, zip -) -# stdlib imports -import logging -# non-stdlib imports -# local imports -import blobxfer.models -import blobxfer.blob.operations -import blobxfer.file.operations -import blobxfer.util - -# create logger -logger = logging.getLogger(__name__) - - -def ensure_local_destination(creds, spec): - # type: (blobxfer.models.AzureStorageCredentials, - # blobxfer.models.DownloadSpecification) -> None - """Ensure a local destination path given a download spec - :param blobxfer.models.AzureStorageCredentials creds: creds - :param blobxfer.models.DownloadSpecification spec: download spec - """ - # ensure destination path is writable given the source - if len(spec.sources) < 1: - raise RuntimeError('no sources to download from specified') - # set is_dir for destination - spec.destination.is_dir = True - if len(spec.sources) == 1: - # we need to query the source to see if this is a directory - rpath = str(spec.sources[0].paths[0]) - cont, dir = blobxfer.util.explode_azure_path(rpath) - if not blobxfer.util.is_none_or_empty(dir): - sa = creds.get_storage_account( - spec.sources[0].lookup_storage_account(rpath)) - if spec.options.mode == blobxfer.models.AzureStorageModes.File: - if blobxfer.file.operations.check_if_single_file( - sa.file_client, cont, dir)[0]: - spec.destination.is_dir = False - else: - if blobxfer.blob.operations.check_if_single_blob( - sa.block_blob_client, cont, dir): - spec.destination.is_dir = False - logger.debug('dest is_dir={} for {} specs'.format( - spec.destination.is_dir, len(spec.sources))) - # ensure destination path - spec.destination.ensure_path_exists() - - -def file_chunks(fd, chunk_size): - # type: (FileDescriptor, int) -> bytes - """Generator for getting file chunks of a file - :param FileDescriptor fd: file descriptor - :param int chunk_size: the amount of data to read - :rtype: bytes - :return: file data - """ - with fd.path.open('rb') as f: - while True: - data = f.read(chunk_size) - if not data: - break - yield data - - -def read_file_chunk(fd, chunk_num, chunk_size): - # type: (FileDescriptor, int, int) -> bytes - """Read file chunk - :param FileDescriptor fd: file descriptor - :param int chunk_num: chunk number - :param int chunk_size: the amount of data to read - :rtype: bytes - :return: file data - """ - offset = chunk_num * chunk_size - with fd.path.open('rb') as f: - f.seek(offset, 0) - return f.read(chunk_size) - - -def write_file_chunk(fd, chunk_num, chunk_size, data): - # type: (FileDescriptor, int, int, bytes) -> None - """Write file chunk - :param FileDescriptor fd: file descriptor - :param int chunk_num: chunk number - :param int chunk_size: the amount of data to read - :rtype: bytes - :return: file data - """ - offset = chunk_num * chunk_size - with fd.path.open('wb') as f: - f.seek(offset, 0) - f.write(data) diff --git a/blobxfer/blob/__init__.py b/blobxfer/operations/__init__.py similarity index 100% rename from blobxfer/blob/__init__.py rename to blobxfer/operations/__init__.py diff --git a/blobxfer/blob/append/__init__.py 
b/blobxfer/operations/azure/__init__.py similarity index 100% rename from blobxfer/blob/append/__init__.py rename to blobxfer/operations/azure/__init__.py diff --git a/blobxfer/blob/operations.py b/blobxfer/operations/azure/blob/__init__.py similarity index 86% rename from blobxfer/blob/operations.py rename to blobxfer/operations/azure/blob/__init__.py index 27c19c7..e0cf878 100644 --- a/blobxfer/blob/operations.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -34,7 +34,7 @@ import azure.common import azure.storage.blob.models # local imports -import blobxfer.models +import blobxfer.models.azure import blobxfer.util # create logger @@ -63,18 +63,18 @@ def check_if_single_blob(client, container, prefix, timeout=None): def list_blobs(client, container, prefix, mode, timeout=None): # type: (azure.storage.blob.BaseBlobService, str, str, int, - # blobxfer.models.AzureStorageModes) -> + # blobxfer.models.azure.StorageModes) -> # azure.storage.blob.models.Blob """List blobs in path conforming to mode :param azure.storage.blob.BaseBlobService client: blob client :param str container: container :param str prefix: path prefix - :param blobxfer.models.AzureStorageModes mode: storage mode + :param blobxfer.models.azure.StorageModes mode: storage mode :param int timeout: timeout :rtype: azure.storage.blob.models.Blob :return: generator of blobs """ - if mode == blobxfer.models.AzureStorageModes.File: + if mode == blobxfer.models.azure.StorageModes.File: raise RuntimeError('cannot list Azure Files from blob client') if blobxfer.util.blob_is_snapshot(prefix): snapshot = blobxfer.util.parse_blob_snapshot_parameter(prefix) @@ -90,15 +90,15 @@ def list_blobs(client, container, prefix, mode, timeout=None): timeout=timeout, ) for blob in blobs: - if (mode == blobxfer.models.AzureStorageModes.Append and + if (mode == blobxfer.models.azure.StorageModes.Append and blob.properties.blob_type != azure.storage.blob.models._BlobTypes.AppendBlob): continue - elif (mode == blobxfer.models.AzureStorageModes.Block and + elif (mode == blobxfer.models.azure.StorageModes.Block and blob.properties.blob_type != azure.storage.blob.models._BlobTypes.BlockBlob): continue - elif (mode == blobxfer.models.AzureStorageModes.Page and + elif (mode == blobxfer.models.azure.StorageModes.Page and blob.properties.blob_type != azure.storage.blob.models._BlobTypes.PageBlob): continue @@ -107,11 +107,11 @@ def list_blobs(client, container, prefix, mode, timeout=None): def get_blob_range(ase, offsets, timeout=None): - # type: (blobxfer.models.AzureStorageEntity, - # blobxfer.download.models.DownloadOffsets, int) -> bytes + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.download.Offsets, int) -> bytes """Retrieve blob range - :param blobxfer.models.AzureStorageEntity ase: AzureStorageEntity - :param blobxfer.download.models.DownloadOffsets offsets: download offsets + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param blobxfer.models.download.Offsets offsets: download offsets :param int timeout: timeout :rtype: bytes :return: content for blob range diff --git a/blobxfer/blob/append/operations.py b/blobxfer/operations/azure/blob/append.py similarity index 93% rename from blobxfer/blob/append/operations.py rename to blobxfer/operations/azure/blob/append.py index cbe4008..296e8c7 100644 --- a/blobxfer/blob/append/operations.py +++ b/blobxfer/operations/azure/blob/append.py @@ -40,9 +40,10 @@ def create_client(storage_account): - # type: (blobxfer.models.AzureStorageAccount) -> AppendBlobService + 
# type: (blobxfer.models.azure.StorageAccount) -> AppendBlobService """Create Append blob client - :param blobxfer.models.AzureStorageAccount storage_account: storage account + :param blobxfer.models.azure.StorageAccount storage_account: + storage account :rtype: AppendBlobService :return: append blob service client """ diff --git a/blobxfer/blob/block/operations.py b/blobxfer/operations/azure/blob/block.py similarity index 93% rename from blobxfer/blob/block/operations.py rename to blobxfer/operations/azure/blob/block.py index c07fda7..c68ac32 100644 --- a/blobxfer/blob/block/operations.py +++ b/blobxfer/operations/azure/blob/block.py @@ -40,9 +40,10 @@ def create_client(storage_account): - # type: (blobxfer.models.AzureStorageAccount) -> BlockBlobService + # type: (blobxfer.models.azure.StorageAccount) -> BlockBlobService """Create block blob client - :param blobxfer.models.AzureStorageAccount storage_account: storage account + :param blobxfer.models.azure.StorageAccount storage_account: + storage account :rtype: azure.storage.blob.BlockBlobService :return: block blob service client """ diff --git a/blobxfer/blob/page/operations.py b/blobxfer/operations/azure/blob/page.py similarity index 93% rename from blobxfer/blob/page/operations.py rename to blobxfer/operations/azure/blob/page.py index 359e207..8a64622 100644 --- a/blobxfer/blob/page/operations.py +++ b/blobxfer/operations/azure/blob/page.py @@ -40,9 +40,10 @@ def create_client(storage_account): - # type: (blobxfer.models.AzureStorageAccount) -> PageBlobService + # type: (blobxfer.models.azure.StorageAccount) -> PageBlobService """Create block blob client - :param blobxfer.models.AzureStorageAccount storage_account: storage account + :param blobxfer.models.azure.StorageAccount storage_account: + storage account :rtype: PageBlobService :return: block blob service client """ diff --git a/blobxfer/file/operations.py b/blobxfer/operations/azure/file.py similarity index 93% rename from blobxfer/file/operations.py rename to blobxfer/operations/azure/file.py index eff3d01..e531fc2 100644 --- a/blobxfer/file/operations.py +++ b/blobxfer/operations/azure/file.py @@ -46,9 +46,10 @@ def create_client(storage_account): - # type: (blobxfer.models.AzureStorageAccount) -> FileService + # type: (blobxfer.models.azure.StorageAccount) -> FileService """Create file client - :param blobxfer.models.AzureStorageAccount storage_account: storage account + :param blobxfer.models.azure.StorageAccount storage_account: + storage account :rtype: FileService :return: file service client """ @@ -154,11 +155,11 @@ def list_files(client, fileshare, prefix, timeout=None): def get_file_range(ase, offsets, timeout=None): - # type: (blobxfer.models.AzureStorageEntity, - # blobxfer.download.models.DownloadOffsets, int) -> bytes + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.download.Offsets, int) -> bytes """Retrieve file range - :param blobxfer.models.AzureStorageEntity ase: AzureStorageEntity - :param blobxfer.download.models.DownloadOffsets offsets: download offsets + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param blobxfer.models.download.Offsets offsets: download offsets :param int timeout: timeout :rtype: bytes :return: content for file range diff --git a/blobxfer/crypto/operations.py b/blobxfer/operations/crypto.py similarity index 79% rename from blobxfer/crypto/operations.py rename to blobxfer/operations/crypto.py index 2534148..44202c3 100644 --- a/blobxfer/crypto/operations.py +++ 
b/blobxfer/operations/crypto.py @@ -31,13 +31,8 @@ next, oct, open, pow, round, super, filter, map, zip) # stdlib imports import base64 -import enum import logging import os -try: - import queue -except ImportError: # noqa - import Queue as queue # non-stdlib imports import cryptography.hazmat.backends import cryptography.hazmat.primitives.asymmetric.padding @@ -50,7 +45,7 @@ import cryptography.hazmat.primitives.padding import cryptography.hazmat.primitives.serialization # local imports -import blobxfer.offload +import blobxfer.util # create logger logger = logging.getLogger(__name__) @@ -216,58 +211,3 @@ def aes_cbc_encrypt_data(symkey, iv, data, pad): return cipher.update(pkcs7_pad(data)) + cipher.finalize() else: return cipher.update(data) + cipher.finalize() - - -class CryptoAction(enum.Enum): - Encrypt = 1 - Decrypt = 2 - - -class CryptoOffload(blobxfer.offload._MultiprocessOffload): - def __init__(self, num_workers): - # type: (CryptoOffload, int) -> None - """Ctor for Crypto Offload - :param CryptoOffload self: this - :param int num_workers: number of worker processes - """ - super(CryptoOffload, self).__init__( - self._worker_process, num_workers, 'Crypto') - - def _worker_process(self): - # type: (CryptoOffload) -> None - """Crypto worker - :param CryptoOffload self: this - """ - while not self.terminated: - try: - inst = self._task_queue.get(True, 1) - except queue.Empty: - continue - if inst[0] == CryptoAction.Encrypt: - # TODO on upload - raise NotImplementedError() - elif inst[0] == CryptoAction.Decrypt: - final_path, offsets, symkey, iv, encdata = \ - inst[1], inst[2], inst[3], inst[4], inst[5] - data = aes_cbc_decrypt_data(symkey, iv, encdata, offsets.unpad) - self._done_cv.acquire() - self._done_queue.put((final_path, offsets, data)) - self._done_cv.notify() - self._done_cv.release() - - def add_decrypt_chunk( - self, final_path, offsets, symkey, iv, encdata): - # type: (CryptoOffload, str, blobxfer.download.models.DownloadOffsets, - # bytes, bytes, bytes) -> None - """Add a chunk to decrypt - :param CryptoOffload self: this - :param str final_path: final path - :param blobxfer.download.models.DownloadOffsets offsets: offsets - :param bytes symkey: symmetric key - :param bytes iv: initialization vector - :param bytes encdata: encrypted data - """ - self._task_queue.put( - (CryptoAction.Decrypt, final_path, offsets, symkey, iv, - encdata) - ) diff --git a/blobxfer/download/operations.py b/blobxfer/operations/download.py similarity index 83% rename from blobxfer/download/operations.py rename to blobxfer/operations/download.py index b947bcf..7a843e7 100644 --- a/blobxfer/download/operations.py +++ b/blobxfer/operations/download.py @@ -46,13 +46,11 @@ # non-stdlib imports import dateutil # local imports -import blobxfer.crypto.models -import blobxfer.crypto.operations -import blobxfer.download.models -import blobxfer.md5 -import blobxfer.operations -import blobxfer.blob.operations -import blobxfer.file.operations +import blobxfer.models.crypto +import blobxfer.models.md5 +import blobxfer.operations.azure.blob +import blobxfer.operations.azure.file +import blobxfer.operations.crypto import blobxfer.util # create logger @@ -68,14 +66,14 @@ class DownloadAction(enum.Enum): class Downloader(object): """Downloader""" def __init__(self, general_options, creds, spec): - # type: (Downloader, blobxfer.models.GeneralOptions, - # blobxfer.models.AzureStorageCredentials, - # blobxfer.models.DownloadSpecification) -> None + # type: (Downloader, blobxfer.models.options.General, + # 
blobxfer.models.azure.StorageCredentials, + # blobxfer.models.download.Specification) -> None """Ctor for Downloader :param Downloader self: this - :param blobxfer.models.GeneralOptions general_options: general opts - :param blobxfer.models.AzureStorageCredentials creds: creds - :param blobxfer.models.DownloadSpecification spec: download spec + :param blobxfer.models.options.General general_options: general opts + :param blobxfer.models.azure.StorageCredentials creds: creds + :param blobxfer.models.download.Specification spec: download spec """ self._all_remote_files_processed = False self._crypto_offload = None @@ -123,13 +121,47 @@ def termination_check_md5(self): len(self._md5_map) == 0 and len(self._download_set) == 0)) + @staticmethod + def ensure_local_destination(creds, spec): + # type: (blobxfer.models.azure.StorageCredentials, + # blobxfer.models.download.Specification) -> None + """Ensure a local destination path given a download spec + :param blobxfer.models.azure.StorageCredentials creds: creds + :param blobxfer.models.download.Specification spec: download spec + """ + # ensure destination path is writable given the source + if len(spec.sources) < 1: + raise RuntimeError('no sources to download from specified') + # set is_dir for destination + spec.destination.is_dir = True + if len(spec.sources) == 1: + # we need to query the source to see if this is a directory + rpath = str(spec.sources[0].paths[0]) + cont, dir = blobxfer.util.explode_azure_path(rpath) + if not blobxfer.util.is_none_or_empty(dir): + sa = creds.get_storage_account( + spec.sources[0].lookup_storage_account(rpath)) + if (spec.options.mode == + blobxfer.models.azure.StorageModes.File): + if blobxfer.operations.azure.file.check_if_single_file( + sa.file_client, cont, dir)[0]: + spec.destination.is_dir = False + else: + if blobxfer.operations.azure.blob.check_if_single_blob( + sa.block_blob_client, cont, dir): + spec.destination.is_dir = False + logger.debug('dest is_dir={} for {} specs'.format( + spec.destination.is_dir, len(spec.sources))) + # ensure destination path + spec.destination.ensure_path_exists() + def _check_download_conditions(self, lpath, rfile): # type: (Downloader, pathlib.Path, - # blobxfer.models.AzureStorageEntity) -> DownloadAction + # blobxfer.models.azure.StorageEntity) -> DownloadAction """Check for download conditions :param Downloader self: this :param pathlib.Path lpath: local path - :param blobxfer.models.AzureStorageEntity rfile: remote file + :param blobxfer.models.azure.StorageEntity rfile: remote file :rtype: DownloadAction :return: download action """ @@ -151,7 +183,7 @@ def _check_download_conditions(self, lpath, rfile): dl_fs = None if self._spec.skip_on.filesize_match: lsize = lpath.stat().st_size - if rfile.mode == blobxfer.models.AzureStorageModes.Page: + if rfile.mode == blobxfer.models.azure.StorageModes.Page: lsize = blobxfer.util.page_align_content_length(lsize) if rfile.size == lsize: dl_fs = False @@ -174,11 +206,11 @@ def _check_download_conditions(self, lpath, rfile): def _pre_md5_skip_on_check(self, lpath, rfile): # type: (Downloader, pathlib.Path, - # blobxfer.models.AzureStorageEntity) -> None + # blobxfer.models.azure.StorageEntity) -> None """Perform pre MD5 skip on check :param Downloader self: this :param pathlib.Path lpath: local path - :param blobxfer.models.AzureStorageEntity rfile: remote file + :param blobxfer.models.azure.StorageEntity rfile: remote file """ # if encryption metadata is present, check for pre-encryption # md5 in blobxfer extensions @@ 
-259,14 +291,14 @@ def _check_for_crypto_done(self): def _add_to_download_queue(self, lpath, rfile): # type: (Downloader, pathlib.Path, - # blobxfer.models.AzureStorageEntity) -> None + # blobxfer.models.azure.StorageEntity) -> None """Add remote file to download queue :param Downloader self: this :param pathlib.Path lpath: local path - :param blobxfer.models.AzureStorageEntity rfile: remote file + :param blobxfer.models.azure.StorageEntity rfile: remote file """ # prepare remote file for download - dd = blobxfer.download.models.DownloadDescriptor( + dd = blobxfer.models.download.Descriptor( lpath, rfile, self._spec.options) if dd.entity.is_encrypted: with self._download_lock: @@ -330,11 +362,11 @@ def _worker_thread_download(self): if offsets is None: continue # issue get range - if dd.entity.mode == blobxfer.models.AzureStorageModes.File: - data = blobxfer.file.operations.get_file_range( + if dd.entity.mode == blobxfer.models.azure.StorageModes.File: + data = blobxfer.operations.azure.file.get_file_range( dd.entity, offsets, self._general_options.timeout_sec) else: - data = blobxfer.blob.operations.get_blob_range( + data = blobxfer.operations.azure.blob.get_blob_range( dd.entity, offsets, self._general_options.timeout_sec) # accounting with self._download_lock: @@ -342,7 +374,7 @@ def _worker_thread_download(self): # decrypt if necessary if dd.entity.is_encrypted: # slice data to proper bounds - encdata = data[blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES:] + encdata = data[blobxfer.models.crypto._AES256_BLOCKSIZE_BYTES:] intdata = encdata # get iv for chunk and compute hmac if offsets.chunk_num == 0: @@ -350,7 +382,7 @@ def _worker_thread_download(self): # integrity check for first chunk must include iv intdata = iv + data else: - iv = data[:blobxfer.crypto.models._AES256_BLOCKSIZE_BYTES] + iv = data[:blobxfer.models.crypto._AES256_BLOCKSIZE_BYTES] # integrity check data dd.perform_chunked_integrity_check(offsets, intdata) # decrypt data @@ -362,7 +394,7 @@ def _worker_thread_download(self): # data will be completed once retrieved from crypto queue continue else: - data = blobxfer.crypto.operations.aes_cbc_decrypt_data( + data = blobxfer.operations.crypto.aes_cbc_decrypt_data( dd.entity.encryption_metadata.symmetric_key, iv, encdata, offsets.unpad) elif dd.must_compute_md5: @@ -372,14 +404,13 @@ def _worker_thread_download(self): self._complete_chunk_download(offsets, data, dd) def _complete_chunk_download(self, offsets, data, dd): - # type: (Downloader, blobxfer.download.models.DownloadOffsets, bytes, - # blobxfer.models.download.DownloadDescriptor) -> None + # type: (Downloader, blobxfer.models.download.Offsets, bytes, + # blobxfer.models.download.Descriptor) -> None """Complete chunk download :param Downloader self: this - :param blobxfer.download.models.DownloadOffsets offsets: offsets + :param blobxfer.models.download.Offsets offsets: offsets :param bytes data: data - :param blobxfer.models.download.DownloadDescriptor dd: - download descriptor + :param blobxfer.models.download.Descriptor dd: download descriptor """ # write data to disk dd.write_data(offsets, data) @@ -413,17 +444,18 @@ def _run(self): start_time = datetime.datetime.now(tz=dateutil.tz.tzlocal()) logger.info('script start time: {0}'.format(start_time)) # ensure destination path - blobxfer.operations.ensure_local_destination(self._creds, self._spec) + blobxfer.operations.download.Downloader.ensure_local_destination( + self._creds, self._spec) logger.info('downloading blobs/files to local path: {}'.format( 
self._spec.destination.path)) # initialize MD5 processes - self._md5_offload = blobxfer.md5.LocalFileMd5Offload( + self._md5_offload = blobxfer.models.md5.LocalFileMd5Offload( num_workers=self._general_options.concurrency.md5_processes) self._md5_offload.initialize_check_thread( self._check_for_downloads_from_md5) # initialize crypto processes if self._general_options.concurrency.crypto_processes > 0: - self._crypto_offload = blobxfer.crypto.operations.CryptoOffload( + self._crypto_offload = blobxfer.models.crypto.CryptoOffload( num_workers=self._general_options.concurrency.crypto_processes) self._crypto_offload.initialize_check_thread( self._check_for_crypto_done) diff --git a/blobxfer/operations/md5.py b/blobxfer/operations/md5.py new file mode 100644 index 0000000..4a50d25 --- /dev/null +++ b/blobxfer/operations/md5.py @@ -0,0 +1,74 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +# local imports +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): + # type: (str, bool, int) -> str + """Compute MD5 hash for file and encode as Base64 + :param str filename: file to compute MD5 for + :param bool pagealign: page align data + :param int blocksize: block size + :rtype: str + :return: MD5 for file encoded as Base64 + """ + hasher = blobxfer.util.new_md5_hasher() + with open(filename, 'rb') as filedesc: + while True: + buf = filedesc.read(blocksize) + if not buf: + break + buflen = len(buf) + if pagealign and buflen < blocksize: + aligned = blobxfer.util.page_align_content_length(buflen) + if aligned != buflen: + buf = buf.ljust(aligned, b'\0') + hasher.update(buf) + return blobxfer.util.base64_encode_as_string(hasher.digest()) + + +def compute_md5_for_data_asbase64(data): + # type: (obj) -> str + """Compute MD5 hash for bits and encode as Base64 + :param any data: data to compute MD5 for + :rtype: str + :return: MD5 for data + """ + hasher = blobxfer.util.new_md5_hasher() + hasher.update(data) + return blobxfer.util.base64_encode_as_string(hasher.digest()) diff --git a/blobxfer/util.py b/blobxfer/util.py index eec47a9..ec85fe5 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -64,7 +64,7 @@ def setup_logger(logger): # noqa logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() formatter = logging.Formatter( - '%(asctime)sZ %(levelname)s %(name)s:%(funcName)s:%(lineno)d ' + '%(asctime)s %(levelname)s %(name)s:%(funcName)s:%(lineno)d ' '%(message)s') handler.setFormatter(formatter) logger.addHandler(handler) diff --git a/cli/settings.py b/cli/settings.py index 4da2500..b62de58 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -33,8 +33,10 @@ import enum # non-stdlib imports # local imports -import blobxfer.crypto.operations -import blobxfer.models +import blobxfer.models.azure +import blobxfer.models.download +import blobxfer.models.options +import blobxfer.operations.crypto import blobxfer.util @@ -215,13 +217,13 @@ def merge_settings(config, cli_options): def create_azure_storage_credentials(config): - # type: (dict) -> blobxfer.models.AzureStorageCredentials - """Create an AzureStorageCredentials object from configuration + # type: (dict) -> blobxfer.models.azure.StorageCredentials + """Create an Azure StorageCredentials object from configuration :param dict config: config dict - :rtype: blobxfer.models.AzureStorageCredentials + :rtype: blobxfer.models.azure.StorageCredentials :return: credentials object """ - creds = blobxfer.models.AzureStorageCredentials() + creds = blobxfer.models.azure.StorageCredentials() endpoint = config['azure_storage']['endpoint'] for name in config['azure_storage']['accounts']: key = config['azure_storage']['accounts'][name] @@ -230,14 +232,14 @@ def create_azure_storage_credentials(config): def create_general_options(config): - # type: (dict) -> blobxfer.models.GeneralOptions - """Create a GeneralOptions object from configuration + # type: (dict) -> blobxfer.models.options.General + """Create a General Options object from configuration :param dict config: config dict - :rtype: blobxfer.models.GeneralOptions + :rtype: 
blobxfer.models.options.General :return: general options object """ - return blobxfer.models.GeneralOptions( - concurrency=blobxfer.models.ConcurrencyOptions( + return blobxfer.models.options.General( + concurrency=blobxfer.models.options.Concurrency( crypto_processes=config['options']['crypto_processes'], md5_processes=config['options']['md5_processes'], transfer_threads=config['options']['transfer_threads'], @@ -250,38 +252,38 @@ def create_general_options(config): def create_download_specifications(config): - # type: (dict) -> List[blobxfer.models.DownloadSpecification] - """Create a list of DownloadSpecification objects from configuration + # type: (dict) -> List[blobxfer.models.download.Specification] + """Create a list of Download Specification objects from configuration :param dict config: config dict :rtype: list - :return: list of DownloadSpecification objects + :return: list of Download Specification objects """ specs = [] for conf in config['download']: # create download options confmode = conf['options']['mode'].lower() if confmode == 'auto': - mode = blobxfer.models.AzureStorageModes.Auto + mode = blobxfer.models.azure.StorageModes.Auto elif confmode == 'append': - mode = blobxfer.models.AzureStorageModes.Append + mode = blobxfer.models.azure.StorageModes.Append elif confmode == 'block': - mode = blobxfer.models.AzureStorageModes.Block + mode = blobxfer.models.azure.StorageModes.Block elif confmode == 'file': - mode = blobxfer.models.AzureStorageModes.File + mode = blobxfer.models.azure.StorageModes.File elif confmode == 'page': - mode = blobxfer.models.AzureStorageModes.Page + mode = blobxfer.models.azure.StorageModes.Page else: raise ValueError('unknown mode: {}'.format(confmode)) # load RSA private key PEM file if specified rpk = conf['options']['rsa_private_key'] if blobxfer.util.is_not_empty(rpk): rpkp = conf['options']['rsa_private_key_passphrase'] - rpk = blobxfer.crypto.operations.load_rsa_private_key_file( + rpk = blobxfer.operations.crypto.load_rsa_private_key_file( rpk, rpkp) else: rpk = None - ds = blobxfer.models.DownloadSpecification( - download_options=blobxfer.models.DownloadOptions( + ds = blobxfer.models.download.Specification( + download_options=blobxfer.models.options.Download( check_file_md5=conf['options']['check_file_md5'], chunk_size_bytes=conf['options']['chunk_size_bytes'], delete_extraneous_destination=conf[ @@ -293,12 +295,13 @@ def create_download_specifications(config): 'options']['restore_file_attributes'], rsa_private_key=rpk, ), - skip_on_options=blobxfer.models.SkipOnOptions( + skip_on_options=blobxfer.models.options.SkipOn( filesize_match=conf['options']['skip_on']['filesize_match'], lmt_ge=conf['options']['skip_on']['lmt_ge'], md5_match=conf['options']['skip_on']['md5_match'], ), - local_destination_path=blobxfer.models.LocalDestinationPath( + local_destination_path=blobxfer.models.download. 
+ LocalDestinationPath( conf['destination'] ) ) @@ -308,7 +311,7 @@ def create_download_specifications(config): raise RuntimeError( 'invalid number of source pairs specified per entry') sa = next(iter(src)) - asp = blobxfer.models.AzureSourcePath() + asp = blobxfer.models.azure.SourcePath() asp.add_path_with_storage_account(src[sa], sa) if blobxfer.util.is_not_empty(conf['include']): asp.add_includes(conf['include']) diff --git a/setup.py b/setup.py index f6336db..7c61abf 100644 --- a/setup.py +++ b/setup.py @@ -45,18 +45,18 @@ 'azure-common==1.1.4', 'azure-storage==0.34.0', 'click==6.7', - 'cryptography>=1.7.2', + 'cryptography>=1.8.1', 'future==0.16.0', 'python-dateutil==2.6.0', - 'ruamel.yaml==0.13.14', + 'ruamel.yaml==0.14.5', ] if sys.version_info < (3, 4): - install_requires.append('enum34') + install_requires.append('enum34==1.1.6') if sys.version_info < (3, 5): - install_requires.append('pathlib2') - install_requires.append('scandir') + install_requires.append('pathlib2==2.2.1') + install_requires.append('scandir==1.5') setup( name='blobxfer', From 2bcab1e605e43a51b0485a3713b97aac731f2a0e Mon Sep 17 00:00:00 2001 From: Fred Park Date: Wed, 5 Apr 2017 14:14:41 -0700 Subject: [PATCH 17/47] Fix tests to align with refactor - Move some classes out of models to operations --- blobxfer/models/azure.py | 259 +----------- blobxfer/models/crypto.py | 63 +-- blobxfer/models/download.py | 2 +- blobxfer/models/md5.py | 90 ----- blobxfer/operations/azure/__init__.py | 284 +++++++++++++ blobxfer/operations/azure/blob/append.py | 4 +- blobxfer/operations/azure/blob/block.py | 4 +- blobxfer/operations/azure/blob/page.py | 4 +- blobxfer/operations/azure/file.py | 4 +- blobxfer/operations/crypto.py | 62 +++ blobxfer/operations/download.py | 18 +- blobxfer/operations/md5.py | 52 +++ cli/settings.py | 9 +- setup.py | 11 +- test_requirements.txt | 4 +- tests/test_blobxfer_models.py | 372 ------------------ tests/test_blobxfer_models_azure.py | 51 +++ ...dels.py => test_blobxfer_models_crypto.py} | 6 +- ...ls.py => test_blobxfer_models_download.py} | 147 +++++-- ...oad.py => test_blobxfer_models_offload.py} | 2 +- tests/test_blobxfer_models_options.py | 82 ++++ tests/test_blobxfer_models_upload.py | 56 +++ tests/test_blobxfer_operations.py | 80 ---- tests/test_blobxfer_operations_azure.py | 161 ++++++++ ...=> test_blobxfer_operations_azure_blob.py} | 16 +- ..._blobxfer_operations_azure_blob_append.py} | 10 +- ...t_blobxfer_operations_azure_blob_block.py} | 11 +- ...st_blobxfer_operations_azure_blob_page.py} | 9 +- ...=> test_blobxfer_operations_azure_file.py} | 10 +- ....py => test_blobxfer_operations_crypto.py} | 6 +- ...y => test_blobxfer_operations_download.py} | 204 +++++++--- ...md5.py => test_blobxfer_operations_md5.py} | 30 +- 32 files changed, 1077 insertions(+), 1046 deletions(-) delete mode 100644 blobxfer/models/md5.py delete mode 100644 tests/test_blobxfer_models.py create mode 100644 tests/test_blobxfer_models_azure.py rename tests/{test_blobxfer_crypto_models.py => test_blobxfer_models_crypto.py} (98%) rename tests/{test_blobxfer_download_models.py => test_blobxfer_models_download.py} (74%) rename tests/{test_blobxfer_offload.py => test_blobxfer_models_offload.py} (94%) create mode 100644 tests/test_blobxfer_models_options.py create mode 100644 tests/test_blobxfer_models_upload.py delete mode 100644 tests/test_blobxfer_operations.py create mode 100644 tests/test_blobxfer_operations_azure.py rename tests/{test_blobxfer_blob_operations.py => test_blobxfer_operations_azure_blob.py} (83%) 
rename tests/{test_blobxfer_blob_append_operations.py => test_blobxfer_operations_azure_blob_append.py} (71%) rename tests/{test_blobxfer_blob_block_operations.py => test_blobxfer_operations_azure_blob_block.py} (71%) rename tests/{test_blobxfer_blob_page_operations.py => test_blobxfer_operations_azure_blob_page.py} (74%) rename tests/{test_blobxfer_file_operations.py => test_blobxfer_operations_azure_file.py} (93%) rename tests/{test_blobxfer_crypto_operations.py => test_blobxfer_operations_crypto.py} (96%) rename tests/{test_blobxfer_download_operations.py => test_blobxfer_operations_download.py} (78%) rename tests/{test_blobxfer_md5.py => test_blobxfer_operations_md5.py} (72%) diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index 57d1f38..9f28ca3 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -34,12 +34,6 @@ # non-stdlib imports from azure.storage.blob.models import _BlobTypes as BlobTypes # local imports -import blobxfer.models -import blobxfer.operations.azure.blob -import blobxfer.operations.azure.blob.append -import blobxfer.operations.azure.blob.block -import blobxfer.operations.azure.blob.page -import blobxfer.operations.azure.file # enums @@ -51,136 +45,6 @@ class StorageModes(enum.Enum): Page = 50 -class StorageCredentials(object): - """Azure Storage Credentials""" - def __init__(self): - # type: (StorageCredentials) -> None - """Ctor for StorageCredentials""" - self._storage_accounts = {} - - def add_storage_account(self, name, key, endpoint): - # type: (StorageCredentials, str, str, str) -> None - """Add a storage account - :param StorageCredentials self: this - :param str name: name of storage account to store - :param str key: storage key or sas - :param str endpoint: endpoint - """ - if name in self._storage_accounts: - raise ValueError( - '{} already exists in storage accounts'.format(name)) - self._storage_accounts[name] = StorageAccount(name, key, endpoint) - - def get_storage_account(self, name): - # type: (StorageCredentials, str) -> StorageAccount - """Get storage account details - :param StorageCredentials self: this - :param str name: name of storage account to retrieve - :rtype: StorageAccount - :return: storage account details - """ - return self._storage_accounts[name] - - -class StorageAccount(object): - """Azure Storage Account""" - def __init__(self, name, key, endpoint): - # type: (StorageAccount, str, str, str) -> None - """Ctor for StorageAccount - :param str name: name of storage account - :param str key: storage key or sas - :param str endpoint: endpoint - """ - self._append_blob_client = None - self._block_blob_client = None - self._file_client = None - self._page_blob_client = None - self.name = name - self.key = key - self.endpoint = endpoint - self.is_sas = self._key_is_sas(self.key) - # normalize sas keys - if self.is_sas and self.key.startswith('?'): - self.key = self.key[1:] - self._create_clients() - - @staticmethod - def _key_is_sas(key): - # type: (str) -> bool - """Determine if key is a sas - :param str key: key to parse - :rtype: bool - :return: if key is a sas - """ - # keys starting with ? are sas keys as ? is not in the base-64 - # character range - if key.startswith('?'): - return True - else: - # & is not in the base-64 character range, so technically - # the presence of this character means the key is a sas. however, - # perform a stronger check for the sig= parameter. 
- tmp = key.split('&') - if len(tmp) == 1: - return False - elif any(x.startswith('sig=') for x in tmp): - return True - return False - - def _create_clients(self): - # type: (StorageAccount) -> None - """Create Azure Storage clients - :param StorageAccount self: this - """ - self._append_blob_client = \ - blobxfer.operations.azure.blob.append.create_client(self) - self._block_blob_client = \ - blobxfer.operations.azure.blob.block.create_client(self) - self._file_client = blobxfer.operations.azure.file.create_client(self) - self._page_blob_client = \ - blobxfer.operations.azure.blob.page.create_client(self) - - @property - def append_blob_client(self): - # type: (StorageAccount) -> azure.storage.blob.AppendBlobService - """Get append blob client - :param StorageAccount self: this - :rtype: azure.storage.blob.AppendBlobService - :return: append blob client - """ - return self._append_blob_client - - @property - def block_blob_client(self): - # type: (StorageAccount) -> azure.storage.blob.BlockBlobService - """Get block blob client - :param StorageAccount self: this - :rtype: azure.storage.blob.BlockBlobService - :return: block blob client - """ - return self._block_blob_client - - @property - def file_client(self): - # type: (StorageAccount) -> azure.storage.file.FileService - """Get file client - :param StorageAccount self: this - :rtype: azure.storage.file.FileService - :return: file client - """ - return self._file_client - - @property - def page_blob_client(self): - # type: (StorageAccount) -> azure.storage.blob.PageBlobService - """Get page blob client - :param StorageAccount self: this - :rtype: azure.storage.blob.PageBlobService - :return: page blob client - """ - return self._page_blob_client - - class StorageEntity(object): """Azure Storage Entity""" def __init__(self, container, ed=None): @@ -306,11 +170,11 @@ def encryption_metadata(self): return self._encryption def populate_from_blob(self, sa, blob): - # type: (StorageEntity, blobxfer.models.azure.StorageAccount, + # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, # azure.storage.blob.models.Blob) -> None """Populate properties from Blob :param StorageEntity self: this - :param blobxfer.models.azure.StorageAccount sa: storage account + :param blobxfer.operations.azure.StorageAccount sa: storage account :param azure.storage.blob.models.Blob blob: blob to populate from """ self._name = blob.name @@ -329,11 +193,11 @@ def populate_from_blob(self, sa, blob): self._client = sa.page_blob_client def populate_from_file(self, sa, file): - # type: (StorageEntity, blobxfer.models.azure.StorageAccount, + # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, # azure.storage.file.models.File) -> None """Populate properties from File :param StorageEntity self: this - :param blobxfer.models.azure.StorageAccount sa: storage account + :param blobxfer.operations.azure.StorageAccount sa: storage account :param azure.storage.file.models.File file: file to populate from """ self._name = file.name @@ -343,118 +207,3 @@ def populate_from_file(self, sa, file): self._md5 = file.properties.content_settings.content_md5 self._mode = StorageModes.File self._client = sa.file_client - - -class SourcePath(blobxfer.models._BaseSourcePaths): - """Azure Source Path""" - def __init__(self): - # type: (SourcePath) -> None - """Ctor for SourcePath - :param SourcePath self: this - """ - super(SourcePath, self).__init__() - self._path_map = {} - - def add_path_with_storage_account(self, remote_path, storage_account): - # type: (SourcePath, 
str, str) -> None - """Add a path with an associated storage account - :param SourcePath self: this - :param str remote_path: remote path - :param str storage_account: storage account to associate with path - """ - if len(self._path_map) >= 1: - raise RuntimeError( - 'cannot add multiple remote paths to SourcePath objects') - rpath = blobxfer.util.normalize_azure_path(remote_path) - self.add_path(rpath) - self._path_map[rpath] = storage_account - - def lookup_storage_account(self, remote_path): - # type: (SourcePath, str) -> str - """Lookup the storage account associated with the remote path - :param SourcePath self: this - :param str remote_path: remote path - :rtype: str - :return: storage account associated with path - """ - return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] - - def files(self, creds, options, general_options): - # type: (SourcePath, StorageCredentials, - # blobxfer.models.options.Download, - # blobxfer.models.options.General) -> StorageEntity - """Generator of Azure remote files or blobs - :param SourcePath self: this - :param StorageCredentials creds: storage creds - :param blobxfer.models.options.Download options: download options - :param blobxfer.models.options.General general_options: general options - :rtype: StorageEntity - :return: Azure storage entity object - """ - if options.mode == blobxfer.models.azure.StorageModes.File: - for file in self._populate_from_list_files( - creds, options, general_options): - yield file - else: - for blob in self._populate_from_list_blobs( - creds, options, general_options): - yield blob - - def _populate_from_list_files(self, creds, options, general_options): - # type: (SourcePath, StorageCredentials, - # blobxfer.models.options.Download, - # blobxfer.models.options.General) -> StorageEntity - """Internal generator for Azure remote files - :param SourcePath self: this - :param StorageCredentials creds: storage creds - :param blobxfer.models.options.Download options: download options - :param blobxfer.models.options.General general_options: general options - :rtype: StorageEntity - :return: Azure storage entity object - """ - for _path in self._paths: - rpath = str(_path) - cont, dir = blobxfer.util.explode_azure_path(rpath) - sa = creds.get_storage_account(self.lookup_storage_account(rpath)) - for file in blobxfer.operations.azure.file.list_files( - sa.file_client, cont, dir, general_options.timeout_sec): - if blobxfer.models.crypto.EncryptionMetadata.\ - encryption_metadata_exists(file.metadata): - ed = blobxfer.models.crypto.EncryptionMetadata() - ed.convert_from_json( - file.metadata, file.name, options.rsa_private_key) - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_file(sa, file) - yield ase - - def _populate_from_list_blobs(self, creds, options, general_options): - # type: (SourcePath, StorageCredentials, - # blobxfer.models.options.Download, - # blobxfer.models.options.General) -> StorageEntity - """Internal generator for Azure remote blobs - :param SourcePath self: this - :param StorageCredentials creds: storage creds - :param blobxfer.models.options.Download options: download options - :param blobxfer.models.options.General general_options: general options - :rtype: StorageEntity - :return: Azure storage entity object - """ - for _path in self._paths: - rpath = str(_path) - cont, dir = blobxfer.util.explode_azure_path(rpath) - sa = creds.get_storage_account(self.lookup_storage_account(rpath)) - for blob in blobxfer.operations.azure.blob.list_blobs( - 
sa.block_blob_client, cont, dir, options.mode, - general_options.timeout_sec): - if blobxfer.models.crypto.EncryptionMetadata.\ - encryption_metadata_exists(blob.metadata): - ed = blobxfer.models.crypto.EncryptionMetadata() - ed.convert_from_json( - blob.metadata, blob.name, options.rsa_private_key) - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_blob(sa, blob) - yield ase diff --git a/blobxfer/models/crypto.py b/blobxfer/models/crypto.py index 904da80..91a2f4a 100644 --- a/blobxfer/models/crypto.py +++ b/blobxfer/models/crypto.py @@ -32,14 +32,9 @@ # stdlib imports import base64 import collections -import enum import hashlib import hmac import json -try: - import queue -except ImportError: # noqa - import Queue as queue # non-stdlib imports # local imports import blobxfer.models.offload @@ -47,7 +42,7 @@ import blobxfer.util # encryption constants -_AES256_BLOCKSIZE_BYTES = 16 +AES256_BLOCKSIZE_BYTES = 16 # named tuples EncryptionBlobxferExtensions = collections.namedtuple( @@ -316,59 +311,3 @@ def initialize_hmac(self): return hmac.new(self._signkey, digestmod=hashlib.sha256) else: return None - - -class CryptoAction(enum.Enum): - Encrypt = 1 - Decrypt = 2 - - -class CryptoOffload(blobxfer.models.offload._MultiprocessOffload): - def __init__(self, num_workers): - # type: (CryptoOffload, int) -> None - """Ctor for Crypto Offload - :param CryptoOffload self: this - :param int num_workers: number of worker processes - """ - super(CryptoOffload, self).__init__( - self._worker_process, num_workers, 'Crypto') - - def _worker_process(self): - # type: (CryptoOffload) -> None - """Crypto worker - :param CryptoOffload self: this - """ - while not self.terminated: - try: - inst = self._task_queue.get(True, 1) - except queue.Empty: - continue - if inst[0] == CryptoAction.Encrypt: - # TODO on upload - raise NotImplementedError() - elif inst[0] == CryptoAction.Decrypt: - final_path, offsets, symkey, iv, encdata = \ - inst[1], inst[2], inst[3], inst[4], inst[5] - data = blobxfer.operations.crypto.aes_cbc_decrypt_data( - symkey, iv, encdata, offsets.unpad) - self._done_cv.acquire() - self._done_queue.put((final_path, offsets, data)) - self._done_cv.notify() - self._done_cv.release() - - def add_decrypt_chunk( - self, final_path, offsets, symkey, iv, encdata): - # type: (CryptoOffload, str, blobxfer.models.download.Offsets, - # bytes, bytes, bytes) -> None - """Add a chunk to decrypt - :param CryptoOffload self: this - :param str final_path: final path - :param blobxfer.models.download.Offsets offsets: offsets - :param bytes symkey: symmetric key - :param bytes iv: initialization vector - :param bytes encdata: encrypted data - """ - self._task_queue.put( - (CryptoAction.Decrypt, final_path, offsets, symkey, iv, - encdata) - ) diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index d4d8f06..150e07e 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -170,7 +170,7 @@ def add_azure_source_path(self, source): class Descriptor(object): """Download Descriptor""" - _AES_BLOCKSIZE = blobxfer.models.crypto._AES256_BLOCKSIZE_BYTES + _AES_BLOCKSIZE = blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES def __init__(self, lpath, ase, options): # type: (DownloadDescriptior, pathlib.Path, diff --git a/blobxfer/models/md5.py b/blobxfer/models/md5.py deleted file mode 100644 index f8c1d3a..0000000 --- a/blobxfer/models/md5.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) Microsoft Corporation -# -# All rights reserved. 
-# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -# compat imports -from __future__ import absolute_import, division, print_function -from builtins import ( # noqa - bytes, dict, int, list, object, range, ascii, chr, hex, input, - next, oct, open, pow, round, super, filter, map, zip -) -# stdlib imports -import logging -try: - import queue -except ImportError: # noqa - import Queue as queue -# non-stdlib imports -# local imports -import blobxfer.models.azure -import blobxfer.models.offload -import blobxfer.operations.md5 - -# create logger -logger = logging.getLogger(__name__) - - -class LocalFileMd5Offload(blobxfer.models.offload._MultiprocessOffload): - """LocalFileMd5Offload""" - def __init__(self, num_workers): - # type: (LocalFileMd5Offload, int) -> None - """Ctor for Local File Md5 Offload - :param LocalFileMd5Offload self: this - :param int num_workers: number of worker processes - """ - super(LocalFileMd5Offload, self).__init__( - self._worker_process, num_workers, 'MD5') - - def _worker_process(self): - # type: (LocalFileMd5Offload) -> None - """Compute MD5 for local file - :param LocalFileMd5Offload self: this - """ - while not self.terminated: - try: - filename, remote_md5, pagealign = self._task_queue.get(True, 1) - except queue.Empty: - continue - md5 = blobxfer.operations.md5.compute_md5_for_file_asbase64( - filename, pagealign) - logger.debug('MD5: {} {} {}'.format( - md5, remote_md5, filename)) - self._done_cv.acquire() - self._done_queue.put((filename, md5 == remote_md5)) - self._done_cv.notify() - self._done_cv.release() - - def add_localfile_for_md5_check(self, filename, remote_md5, mode): - # type: (LocalFileMd5Offload, str, str, - # blobxfer.models.azure.StorageModes) -> None - """Add a local file to MD5 check queue - :param LocalFileMd5Offload self: this - :param str filename: file to compute MD5 for - :param str remote_md5: remote MD5 to compare against - :param blobxfer.models.azure.StorageModes mode: mode - """ - if mode == blobxfer.models.azure.StorageModes.Page: - pagealign = True - else: - pagealign = False - self._task_queue.put((filename, remote_md5, pagealign)) diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index e69de29..20ddb50 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -0,0 +1,284 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +# non-stdlib imports +# local imports +import blobxfer.models +import blobxfer.operations.azure.blob.append +import blobxfer.operations.azure.blob.block +import blobxfer.operations.azure.blob.page +import blobxfer.operations.azure.file + + +class StorageCredentials(object): + """Azure Storage Credentials""" + def __init__(self): + # type: (StorageCredentials) -> None + """Ctor for StorageCredentials""" + self._storage_accounts = {} + + def add_storage_account(self, name, key, endpoint): + # type: (StorageCredentials, str, str, str) -> None + """Add a storage account + :param StorageCredentials self: this + :param str name: name of storage account to store + :param str key: storage key or sas + :param str endpoint: endpoint + """ + if name in self._storage_accounts: + raise ValueError( + '{} already exists in storage accounts'.format(name)) + self._storage_accounts[name] = StorageAccount(name, key, endpoint) + + def get_storage_account(self, name): + # type: (StorageCredentials, str) -> StorageAccount + """Get storage account details + :param StorageCredentials self: this + :param str name: name of storage account to retrieve + :rtype: StorageAccount + :return: storage account details + """ + return self._storage_accounts[name] + + +class StorageAccount(object): + """Azure Storage Account""" + def __init__(self, name, key, endpoint): + # type: (StorageAccount, str, str, str) -> None + """Ctor for StorageAccount + :param str name: name of storage account + :param str key: storage key or sas + :param str endpoint: endpoint + """ + self._append_blob_client = None + self._block_blob_client = None + self._file_client = None + self._page_blob_client = None + self.name = name + self.key = key + self.endpoint = endpoint + self.is_sas = self._key_is_sas(self.key) + # normalize sas keys + if self.is_sas and self.key.startswith('?'): + self.key = self.key[1:] + self._create_clients() + + @staticmethod + def _key_is_sas(key): + # type: (str) -> bool + """Determine if key is a sas + :param str key: key to parse + :rtype: bool + :return: if key is a sas + """ + # keys starting with ? are sas keys as ? 
is not in the base-64 + # character range + if key.startswith('?'): + return True + else: + # & is not in the base-64 character range, so technically + # the presence of this character means the key is a sas. however, + # perform a stronger check for the sig= parameter. + tmp = key.split('&') + if len(tmp) == 1: + return False + elif any(x.startswith('sig=') for x in tmp): + return True + return False + + def _create_clients(self): + # type: (StorageAccount) -> None + """Create Azure Storage clients + :param StorageAccount self: this + """ + self._append_blob_client = \ + blobxfer.operations.azure.blob.append.create_client(self) + self._block_blob_client = \ + blobxfer.operations.azure.blob.block.create_client(self) + self._file_client = blobxfer.operations.azure.file.create_client(self) + self._page_blob_client = \ + blobxfer.operations.azure.blob.page.create_client(self) + + @property + def append_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.AppendBlobService + """Get append blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.AppendBlobService + :return: append blob client + """ + return self._append_blob_client + + @property + def block_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.BlockBlobService + """Get block blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.BlockBlobService + :return: block blob client + """ + return self._block_blob_client + + @property + def file_client(self): + # type: (StorageAccount) -> azure.storage.file.FileService + """Get file client + :param StorageAccount self: this + :rtype: azure.storage.file.FileService + :return: file client + """ + return self._file_client + + @property + def page_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.PageBlobService + """Get page blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.PageBlobService + :return: page blob client + """ + return self._page_blob_client + + +class SourcePath(blobxfer.models._BaseSourcePaths): + """Azure Source Path""" + def __init__(self): + # type: (SourcePath) -> None + """Ctor for SourcePath + :param SourcePath self: this + """ + super(SourcePath, self).__init__() + self._path_map = {} + + def add_path_with_storage_account(self, remote_path, storage_account): + # type: (SourcePath, str, str) -> None + """Add a path with an associated storage account + :param SourcePath self: this + :param str remote_path: remote path + :param str storage_account: storage account to associate with path + """ + if len(self._path_map) >= 1: + raise RuntimeError( + 'cannot add multiple remote paths to SourcePath objects') + rpath = blobxfer.util.normalize_azure_path(remote_path) + self.add_path(rpath) + self._path_map[rpath] = storage_account + + def lookup_storage_account(self, remote_path): + # type: (SourcePath, str) -> str + """Lookup the storage account associated with the remote path + :param SourcePath self: this + :param str remote_path: remote path + :rtype: str + :return: storage account associated with path + """ + return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] + + def files(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Generator of Azure remote files or blobs + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + 
:param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + if options.mode == blobxfer.models.azure.StorageModes.File: + for file in self._populate_from_list_files( + creds, options, general_options): + yield file + else: + for blob in self._populate_from_list_blobs( + creds, options, general_options): + yield blob + + def _populate_from_list_files(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Internal generator for Azure remote files + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for file in blobxfer.operations.azure.file.list_files( + sa.file_client, cont, dir, general_options.timeout_sec): + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(file.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json( + file.metadata, file.name, options.rsa_private_key) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + ase.populate_from_file(sa, file) + yield ase + + def _populate_from_list_blobs(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Internal generator for Azure remote blobs + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for blob in blobxfer.operations.azure.blob.list_blobs( + sa.block_blob_client, cont, dir, options.mode, + general_options.timeout_sec): + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(blob.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json( + blob.metadata, blob.name, options.rsa_private_key) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + ase.populate_from_blob(sa, blob) + yield ase diff --git a/blobxfer/operations/azure/blob/append.py b/blobxfer/operations/azure/blob/append.py index 296e8c7..910ab5d 100644 --- a/blobxfer/operations/azure/blob/append.py +++ b/blobxfer/operations/azure/blob/append.py @@ -40,9 +40,9 @@ def create_client(storage_account): - # type: (blobxfer.models.azure.StorageAccount) -> AppendBlobService + # type: (blobxfer.operations.azure.StorageAccount) -> AppendBlobService """Create Append blob client - :param blobxfer.models.azure.StorageAccount storage_account: + :param blobxfer.operations.azure.StorageAccount storage_account: storage account :rtype: AppendBlobService :return: append blob service client diff --git a/blobxfer/operations/azure/blob/block.py b/blobxfer/operations/azure/blob/block.py index c68ac32..b6fd673 
100644 --- a/blobxfer/operations/azure/blob/block.py +++ b/blobxfer/operations/azure/blob/block.py @@ -40,9 +40,9 @@ def create_client(storage_account): - # type: (blobxfer.models.azure.StorageAccount) -> BlockBlobService + # type: (blobxfer.operations.azure.StorageAccount) -> BlockBlobService """Create block blob client - :param blobxfer.models.azure.StorageAccount storage_account: + :param blobxfer.operations.azure.StorageAccount storage_account: storage account :rtype: azure.storage.blob.BlockBlobService :return: block blob service client diff --git a/blobxfer/operations/azure/blob/page.py b/blobxfer/operations/azure/blob/page.py index 8a64622..6aedc8f 100644 --- a/blobxfer/operations/azure/blob/page.py +++ b/blobxfer/operations/azure/blob/page.py @@ -40,9 +40,9 @@ def create_client(storage_account): - # type: (blobxfer.models.azure.StorageAccount) -> PageBlobService + # type: (blobxfer.operations.azure.StorageAccount) -> PageBlobService """Create block blob client - :param blobxfer.models.azure.StorageAccount storage_account: + :param blobxfer.operations.azure.StorageAccount storage_account: storage account :rtype: PageBlobService :return: block blob service client diff --git a/blobxfer/operations/azure/file.py b/blobxfer/operations/azure/file.py index e531fc2..1b17f94 100644 --- a/blobxfer/operations/azure/file.py +++ b/blobxfer/operations/azure/file.py @@ -46,9 +46,9 @@ def create_client(storage_account): - # type: (blobxfer.models.azure.StorageAccount) -> FileService + # type: (blobxfer.operations.azure.StorageAccount) -> FileService """Create file client - :param blobxfer.models.azure.StorageAccount storage_account: + :param blobxfer.operations.azure.StorageAccount storage_account: storage account :rtype: FileService :return: file service client diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index 44202c3..98945d3 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -31,8 +31,13 @@ next, oct, open, pow, round, super, filter, map, zip) # stdlib imports import base64 +import enum import logging import os +try: + import queue +except ImportError: # noqa + import Queue as queue # non-stdlib imports import cryptography.hazmat.backends import cryptography.hazmat.primitives.asymmetric.padding @@ -54,6 +59,12 @@ _AES256_KEYLENGTH_BYTES = 32 +# enums +class CryptoAction(enum.Enum): + Encrypt = 1 + Decrypt = 2 + + def load_rsa_private_key_file(rsakeyfile, passphrase): # type: (str, str) -> # cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey @@ -211,3 +222,54 @@ def aes_cbc_encrypt_data(symkey, iv, data, pad): return cipher.update(pkcs7_pad(data)) + cipher.finalize() else: return cipher.update(data) + cipher.finalize() + + +class CryptoOffload(blobxfer.models.offload._MultiprocessOffload): + def __init__(self, num_workers): + # type: (CryptoOffload, int) -> None + """Ctor for Crypto Offload + :param CryptoOffload self: this + :param int num_workers: number of worker processes + """ + super(CryptoOffload, self).__init__( + self._worker_process, num_workers, 'Crypto') + + def _worker_process(self): + # type: (CryptoOffload) -> None + """Crypto worker + :param CryptoOffload self: this + """ + while not self.terminated: + try: + inst = self._task_queue.get(True, 1) + except queue.Empty: + continue + if inst[0] == CryptoAction.Encrypt: + # TODO on upload + raise NotImplementedError() + elif inst[0] == CryptoAction.Decrypt: + final_path, offsets, symkey, iv, encdata = \ + inst[1], inst[2], inst[3], inst[4], inst[5] + data = 
blobxfer.operations.crypto.aes_cbc_decrypt_data( + symkey, iv, encdata, offsets.unpad) + self._done_cv.acquire() + self._done_queue.put((final_path, offsets, data)) + self._done_cv.notify() + self._done_cv.release() + + def add_decrypt_chunk( + self, final_path, offsets, symkey, iv, encdata): + # type: (CryptoOffload, str, blobxfer.models.download.Offsets, + # bytes, bytes, bytes) -> None + """Add a chunk to decrypt + :param CryptoOffload self: this + :param str final_path: final path + :param blobxfer.models.download.Offsets offsets: offsets + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param bytes encdata: encrypted data + """ + self._task_queue.put( + (CryptoAction.Decrypt, final_path, offsets, symkey, iv, + encdata) + ) diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index 7a843e7..61875fe 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -47,10 +47,10 @@ import dateutil # local imports import blobxfer.models.crypto -import blobxfer.models.md5 import blobxfer.operations.azure.blob import blobxfer.operations.azure.file import blobxfer.operations.crypto +import blobxfer.operations.md5 import blobxfer.util # create logger @@ -67,12 +67,12 @@ class Downloader(object): """Downloader""" def __init__(self, general_options, creds, spec): # type: (Downloader, blobxfer.models.options.General, - # blobxfer.models.azure.StorageCredentials, + # blobxfer.operations.azure.StorageCredentials, # blobxfer.models.download.Specification) -> None """Ctor for Downloader :param Downloader self: this :param blobxfer.models.options.General general_options: general opts - :param blobxfer.models.azure.StorageCredentials creds: creds + :param blobxfer.operations.azure.StorageCredentials creds: creds :param blobxfer.models.download.Specification spec: download spec """ self._all_remote_files_processed = False @@ -123,10 +123,10 @@ def termination_check_md5(self): @staticmethod def ensure_local_destination(creds, spec): - # type: (blobxfer.models.azure.StorageCredentials, + # type: (blobxfer.operations.azure.StorageCredentials, # blobxfer.models.download.Specification) -> None """Ensure a local destination path given a download spec - :param blobxfer.models.azure.StorageCredentials creds: creds + :param blobxfer.operations.azure.StorageCredentials creds: creds :param blobxfer.models.download.Specification spec: download spec """ # ensure destination path is writable given the source @@ -374,7 +374,7 @@ def _worker_thread_download(self): # decrypt if necessary if dd.entity.is_encrypted: # slice data to proper bounds - encdata = data[blobxfer.models.crypto._AES256_BLOCKSIZE_BYTES:] + encdata = data[blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] intdata = encdata # get iv for chunk and compute hmac if offsets.chunk_num == 0: @@ -382,7 +382,7 @@ def _worker_thread_download(self): # integrity check for first chunk must include iv intdata = iv + data else: - iv = data[:blobxfer.models.crypto._AES256_BLOCKSIZE_BYTES] + iv = data[:blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES] # integrity check data dd.perform_chunked_integrity_check(offsets, intdata) # decrypt data @@ -449,13 +449,13 @@ def _run(self): logger.info('downloading blobs/files to local path: {}'.format( self._spec.destination.path)) # initialize MD5 processes - self._md5_offload = blobxfer.models.md5.LocalFileMd5Offload( + self._md5_offload = blobxfer.operations.md5.LocalFileMd5Offload( num_workers=self._general_options.concurrency.md5_processes) 
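# Editor's note: a minimal, self-contained sketch of the offload pattern that
# LocalFileMd5Offload and CryptoOffload above build on -- worker processes pull
# work from a task queue, push results to a done queue, and notify a condition
# variable so a checker thread in the main process can collect results. The
# _MultiprocessOffload base class is not shown in this patch, so the names
# below (hash_worker, the None sentinel, the hexdigest comparison) are
# illustrative assumptions, not the actual blobxfer API.
import hashlib
import multiprocessing


def hash_worker(task_queue, done_queue, done_cv):
    # consume (filename, remote_md5) tuples until a None sentinel arrives;
    # blobxfer compares base64-encoded digests, this sketch simplifies to hex
    while True:
        item = task_queue.get()
        if item is None:
            break
        filename, remote_md5 = item
        hasher = hashlib.md5()
        with open(filename, 'rb') as f:
            for chunk in iter(lambda: f.read(4194304), b''):
                hasher.update(chunk)
        with done_cv:
            done_queue.put((filename, hasher.hexdigest() == remote_md5))
            done_cv.notify()


if __name__ == '__main__':
    task_queue = multiprocessing.Queue()
    done_queue = multiprocessing.Queue()
    done_cv = multiprocessing.Condition()
    proc = multiprocessing.Process(
        target=hash_worker, args=(task_queue, done_queue, done_cv))
    proc.start()
    task_queue.put((__file__, 'not-a-real-md5'))
    with done_cv:
        done_cv.wait(timeout=5)
    print(done_queue.get())      # (filename, False)
    task_queue.put(None)         # sentinel to stop the worker
    proc.join()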
self._md5_offload.initialize_check_thread( self._check_for_downloads_from_md5) # initialize crypto processes if self._general_options.concurrency.crypto_processes > 0: - self._crypto_offload = blobxfer.models.crypto.CryptoOffload( + self._crypto_offload = blobxfer.operations.crypto.CryptoOffload( num_workers=self._general_options.concurrency.crypto_processes) self._crypto_offload.initialize_check_thread( self._check_for_crypto_done) diff --git a/blobxfer/operations/md5.py b/blobxfer/operations/md5.py index 4a50d25..d260c9e 100644 --- a/blobxfer/operations/md5.py +++ b/blobxfer/operations/md5.py @@ -30,8 +30,14 @@ ) # stdlib imports import logging +try: + import queue +except ImportError: # noqa + import Queue as queue # non-stdlib imports # local imports +import blobxfer.models.azure +import blobxfer.models.offload import blobxfer.util # create logger @@ -72,3 +78,49 @@ def compute_md5_for_data_asbase64(data): hasher = blobxfer.util.new_md5_hasher() hasher.update(data) return blobxfer.util.base64_encode_as_string(hasher.digest()) + + +class LocalFileMd5Offload(blobxfer.models.offload._MultiprocessOffload): + """LocalFileMd5Offload""" + def __init__(self, num_workers): + # type: (LocalFileMd5Offload, int) -> None + """Ctor for Local File Md5 Offload + :param LocalFileMd5Offload self: this + :param int num_workers: number of worker processes + """ + super(LocalFileMd5Offload, self).__init__( + self._worker_process, num_workers, 'MD5') + + def _worker_process(self): + # type: (LocalFileMd5Offload) -> None + """Compute MD5 for local file + :param LocalFileMd5Offload self: this + """ + while not self.terminated: + try: + filename, remote_md5, pagealign = self._task_queue.get(True, 1) + except queue.Empty: + continue + md5 = blobxfer.operations.md5.compute_md5_for_file_asbase64( + filename, pagealign) + logger.debug('MD5: {} {} {}'.format( + md5, remote_md5, filename)) + self._done_cv.acquire() + self._done_queue.put((filename, md5 == remote_md5)) + self._done_cv.notify() + self._done_cv.release() + + def add_localfile_for_md5_check(self, filename, remote_md5, mode): + # type: (LocalFileMd5Offload, str, str, + # blobxfer.models.azure.StorageModes) -> None + """Add a local file to MD5 check queue + :param LocalFileMd5Offload self: this + :param str filename: file to compute MD5 for + :param str remote_md5: remote MD5 to compare against + :param blobxfer.models.azure.StorageModes mode: mode + """ + if mode == blobxfer.models.azure.StorageModes.Page: + pagealign = True + else: + pagealign = False + self._task_queue.put((filename, remote_md5, pagealign)) diff --git a/cli/settings.py b/cli/settings.py index b62de58..4faadd9 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -36,6 +36,7 @@ import blobxfer.models.azure import blobxfer.models.download import blobxfer.models.options +import blobxfer.operations.azure import blobxfer.operations.crypto import blobxfer.util @@ -217,13 +218,13 @@ def merge_settings(config, cli_options): def create_azure_storage_credentials(config): - # type: (dict) -> blobxfer.models.azure.StorageCredentials + # type: (dict) -> blobxfer.operations.azure.StorageCredentials """Create an Azure StorageCredentials object from configuration :param dict config: config dict - :rtype: blobxfer.models.azure.StorageCredentials + :rtype: blobxfer.operations.azure.StorageCredentials :return: credentials object """ - creds = blobxfer.models.azure.StorageCredentials() + creds = blobxfer.operations.azure.StorageCredentials() endpoint = config['azure_storage']['endpoint'] for name in 
config['azure_storage']['accounts']: key = config['azure_storage']['accounts'][name] @@ -311,7 +312,7 @@ def create_download_specifications(config): raise RuntimeError( 'invalid number of source pairs specified per entry') sa = next(iter(src)) - asp = blobxfer.models.azure.SourcePath() + asp = blobxfer.operations.azure.SourcePath() asp.add_path_with_storage_account(src[sa], sa) if blobxfer.util.is_not_empty(conf['include']): asp.add_includes(conf['include']) diff --git a/setup.py b/setup.py index 7c61abf..74b57cf 100644 --- a/setup.py +++ b/setup.py @@ -31,13 +31,10 @@ packages = [ 'blobxfer', - 'blobxfer.blob', - 'blobxfer.blob.append', - 'blobxfer.blob.block', - 'blobxfer.blob.page', - 'blobxfer.crypto', - 'blobxfer.download', - 'blobxfer.file', + 'blobxfer.models', + 'blobxfer.operations', + 'blobxfer.operations.azure', + 'blobxfer.operations.azure.blob', 'blobxfer_cli', ] diff --git a/test_requirements.txt b/test_requirements.txt index 925320c..f2315c3 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,5 +1,5 @@ -flake8>=3.2.1 +flake8>=3.3.0 mock>=2.0.0 pypandoc>=1.3.3 -pytest>=3.0.5 +pytest>=3.0.7 pytest-cov>=2.4.0 diff --git a/tests/test_blobxfer_models.py b/tests/test_blobxfer_models.py deleted file mode 100644 index f200aaf..0000000 --- a/tests/test_blobxfer_models.py +++ /dev/null @@ -1,372 +0,0 @@ -# coding=utf-8 -"""Tests for models""" - -# stdlib imports -import mock -import os -try: - import pathlib2 as pathlib -except ImportError: # noqa - import pathlib -# non-stdlib imports -import azure.storage -import azure.storage.blob -import azure.storage.file -import pytest -# module under test -import blobxfer.models as models - - -@mock.patch('multiprocessing.cpu_count', return_value=1) -def test_concurrency_options(patched_cc): - a = models.ConcurrencyOptions( - crypto_processes=-1, - md5_processes=0, - transfer_threads=-2, - ) - - assert a.crypto_processes == 0 - assert a.md5_processes == 1 - assert a.transfer_threads == 3 - - -def test_general_options(): - a = models.GeneralOptions( - concurrency=models.ConcurrencyOptions( - crypto_processes=1, - md5_processes=2, - transfer_threads=3, - ), - progress_bar=False, - resume_file='abc', - timeout_sec=1, - verbose=True, - ) - - assert a.concurrency.crypto_processes == 1 - assert a.concurrency.md5_processes == 2 - assert a.concurrency.transfer_threads == 3 - assert not a.progress_bar - assert a.resume_file == pathlib.Path('abc') - assert a.timeout_sec == 1 - assert a.verbose - - a = models.GeneralOptions( - concurrency=models.ConcurrencyOptions( - crypto_processes=1, - md5_processes=2, - transfer_threads=3, - ), - progress_bar=False, - resume_file=None, - timeout_sec=1, - verbose=True, - ) - - assert a.concurrency.crypto_processes == 1 - assert a.concurrency.md5_processes == 2 - assert a.concurrency.transfer_threads == 3 - assert not a.progress_bar - assert a.resume_file is None - assert a.timeout_sec == 1 - assert a.verbose - - with pytest.raises(ValueError): - a = models.GeneralOptions(None) - - -def test_storage_credentials(): - creds = models.AzureStorageCredentials() - creds.add_storage_account('sa1', 'somekey1', 'endpoint') - - a = creds.get_storage_account('sa1') - assert a.name == 'sa1' - assert a.key == 'somekey1' - assert a.endpoint == 'endpoint' - assert isinstance( - a.append_blob_client, azure.storage.blob.AppendBlobService) - assert isinstance( - a.block_blob_client, azure.storage.blob.BlockBlobService) - assert isinstance( - a.file_client, azure.storage.file.FileService) - assert isinstance( - 
a.page_blob_client, azure.storage.blob.PageBlobService) - - with pytest.raises(KeyError): - a = creds.get_storage_account('sa2') - - with pytest.raises(ValueError): - creds.add_storage_account('sa1', 'somekeyxx', 'endpoint') - - creds.add_storage_account('sa2', 'somekey2', 'endpoint2') - a = creds.get_storage_account('sa1') - b = creds.get_storage_account('sa2') - assert a.name == 'sa1' - assert a.key == 'somekey1' - assert a.endpoint == 'endpoint' - assert b.name == 'sa2' - assert b.key == 'somekey2' - assert b.endpoint == 'endpoint2' - - -def test_key_is_sas(): - a = models.AzureStorageAccount('name', 'abcdef', 'endpoint') - assert not a.is_sas - - a = models.AzureStorageAccount('name', 'abcdef&blah', 'endpoint') - assert not a.is_sas - - a = models.AzureStorageAccount('name', '?abcdef', 'endpoint') - assert a.is_sas - - a = models.AzureStorageAccount( - 'name', '?sv=0&sr=1&sig=2', 'endpoint') - assert a.is_sas - - a = models.AzureStorageAccount( - 'name', 'sv=0&sr=1&sig=2', 'endpoint') - assert a.is_sas - - a = models.AzureStorageAccount( - 'name', 'sig=0&sv=0&sr=1&se=2', 'endpoint') - assert a.is_sas - - -def test_localsourcepaths_files(tmpdir): - tmpdir.mkdir('abc') - tmpdir.join('moo.cow').write('z') - abcpath = tmpdir.join('abc') - abcpath.join('hello.txt').write('hello') - abcpath.join('blah.x').write('x') - abcpath.join('blah.y').write('x') - abcpath.join('blah.z').write('x') - abcpath.mkdir('def') - defpath = abcpath.join('def') - defpath.join('world.txt').write('world') - defpath.join('moo.cow').write('y') - - a = models.LocalSourcePaths() - a.add_include('*.txt') - a.add_includes(['moo.cow', '*blah*']) - with pytest.raises(ValueError): - a.add_includes('abc') - a.add_exclude('**/blah.x') - a.add_excludes(['world.txt']) - with pytest.raises(ValueError): - a.add_excludes('abc') - a.add_path(str(tmpdir)) - a_set = set() - for file in a.files(): - sfile = str(file.parent_path / file.relative_path) - a_set.add(sfile) - - assert len(a.paths) == 1 - assert str(abcpath.join('blah.x')) not in a_set - assert str(defpath.join('world.txt')) in a_set - assert str(defpath.join('moo.cow')) not in a_set - - b = models.LocalSourcePaths() - b.add_includes(['moo.cow', '*blah*']) - b.add_include('*.txt') - b.add_excludes(['world.txt']) - b.add_exclude('**/blah.x') - b.add_paths([pathlib.Path(str(tmpdir))]) - for file in a.files(): - sfile = str(file.parent_path / file.relative_path) - assert sfile in a_set - - -def test_localdestinationpath(tmpdir): - tmpdir.mkdir('1') - path = tmpdir.join('1') - - a = models.LocalDestinationPath(str(path)) - a.is_dir = True - assert str(a.path) == str(path) - assert a.is_dir - - a.ensure_path_exists() - assert os.path.exists(str(a.path)) - - b = models.LocalDestinationPath() - b.is_dir = False - b.path = str(path) - with pytest.raises(RuntimeError): - b.ensure_path_exists() - assert not b.is_dir - - path2 = tmpdir.join('2') - path3 = path2.join('3') - c = models.LocalDestinationPath(str(path3)) - with pytest.raises(RuntimeError): - c.ensure_path_exists() - c.is_dir = False - c.ensure_path_exists() - assert os.path.exists(str(path2)) - assert os.path.isdir(str(path2)) - assert not c.is_dir - - -def test_azuresourcepath(): - p = '/cont/remote/path' - asp = models.AzureSourcePath() - asp.add_path_with_storage_account(p, 'sa') - - with pytest.raises(RuntimeError): - asp.add_path_with_storage_account('x', 'x') - - assert 'sa' == asp.lookup_storage_account(p) - - -@mock.patch('blobxfer.crypto.models.EncryptionMetadata') 
-@mock.patch('blobxfer.file.operations.list_files') -def test_azuresourcepath_files(patched_lf, patched_em): - p = '/cont/remote/path' - asp = models.AzureSourcePath() - asp.add_path_with_storage_account(p, 'sa') - - options = mock.MagicMock() - options.mode = models.AzureStorageModes.File - creds = mock.MagicMock() - creds.get_storage_account = mock.MagicMock() - sa = mock.MagicMock() - sa.file_client = mock.MagicMock() - creds.get_storage_account.return_value = sa - f = azure.storage.file.models.File(name='name') - patched_lf.side_effect = [[f]] - patched_em.encryption_metadata_exists = mock.MagicMock() - patched_em.encryption_metadata_exists.return_value = False - - i = 0 - for file in asp.files(creds, options, mock.MagicMock()): - i += 1 - assert file.name == 'name' - assert file.encryption_metadata is None - assert i == 1 - - fe = azure.storage.file.models.File(name='name') - fe.metadata = {'encryptiondata': {'a': 'b'}} - patched_lf.side_effect = [[fe]] - patched_em.encryption_metadata_exists.return_value = True - patched_em.convert_from_json = mock.MagicMock() - - i = 0 - for file in asp.files(creds, options, mock.MagicMock()): - i += 1 - assert file.name == 'name' - assert file.encryption_metadata is not None - assert i == 1 - - -@mock.patch('blobxfer.crypto.models.EncryptionMetadata') -@mock.patch('blobxfer.blob.operations.list_blobs') -def test_azuresourcepath_blobs(patched_lb, patched_em): - p = '/cont/remote/path' - asp = models.AzureSourcePath() - asp.add_path_with_storage_account(p, 'sa') - - options = mock.MagicMock() - options.mode = models.AzureStorageModes.Auto - creds = mock.MagicMock() - creds.get_storage_account = mock.MagicMock() - sa = mock.MagicMock() - sa.block_blob_client = mock.MagicMock() - creds.get_storage_account.return_value = sa - b = azure.storage.blob.models.Blob(name='name') - patched_lb.side_effect = [[b]] - patched_em.encryption_metadata_exists = mock.MagicMock() - patched_em.encryption_metadata_exists.return_value = False - - i = 0 - for file in asp.files(creds, options, mock.MagicMock()): - i += 1 - assert file.name == 'name' - assert file.encryption_metadata is None - assert i == 1 - - be = azure.storage.blob.models.Blob(name='name') - be.metadata = {'encryptiondata': {'a': 'b'}} - patched_lb.side_effect = [[be]] - patched_em.encryption_metadata_exists.return_value = True - patched_em.convert_from_json = mock.MagicMock() - - i = 0 - for file in asp.files(creds, options, mock.MagicMock()): - i += 1 - assert file.name == 'name' - assert file.encryption_metadata is not None - assert i == 1 - - -def test_downloadspecification(): - ds = models.DownloadSpecification( - download_options=models.DownloadOptions( - check_file_md5=True, - chunk_size_bytes=4194304, - delete_extraneous_destination=False, - mode=models.AzureStorageModes.Auto, - overwrite=True, - recursive=True, - restore_file_attributes=False, - rsa_private_key=None, - ), - skip_on_options=models.SkipOnOptions( - filesize_match=True, - lmt_ge=False, - md5_match=True, - ), - local_destination_path=models.LocalDestinationPath('dest'), - ) - - asp = models.AzureSourcePath() - p = 'some/remote/path' - asp.add_path_with_storage_account(p, 'sa') - - ds.add_azure_source_path(asp) - - assert ds.options.check_file_md5 - assert not ds.skip_on.lmt_ge - assert ds.destination.path == pathlib.Path('dest') - assert len(ds.sources) == 1 - assert p in ds.sources[0]._path_map - assert ds.sources[0]._path_map[p] == 'sa' - - -def test_azurestorageentity(): - ase = models.AzureStorageEntity('cont') - assert 
ase.container == 'cont' - assert ase.encryption_metadata is None - - blob = mock.MagicMock() - blob.name = 'name' - blob.snapshot = None - blob.properties = mock.MagicMock() - blob.properties.last_modified = 'lmt' - blob.properties.content_length = 123 - blob.properties.content_settings = mock.MagicMock() - blob.properties.content_settings.content_md5 = 'abc' - blob.properties.blob_type = azure.storage.blob.models._BlobTypes.BlockBlob - ase.populate_from_blob(mock.MagicMock(), blob) - - assert ase.client is not None - assert ase.name == 'name' - assert ase.lmt == 'lmt' - assert ase.size == 123 - assert ase.md5 == 'abc' - assert ase.snapshot is None - assert ase.mode == models.AzureStorageModes.Block - - blob.properties.blob_type = azure.storage.blob.models._BlobTypes.AppendBlob - ase.populate_from_blob(mock.MagicMock(), blob) - assert ase.mode == models.AzureStorageModes.Append - - blob.properties.blob_type = azure.storage.blob.models._BlobTypes.PageBlob - blob.snapshot = 'abc' - ase.populate_from_blob(mock.MagicMock(), blob) - assert ase.mode == models.AzureStorageModes.Page - assert ase.snapshot is not None - - blob.snapshot = None - ase.populate_from_file(mock.MagicMock(), blob) - assert ase.mode == models.AzureStorageModes.File - assert ase.snapshot is None diff --git a/tests/test_blobxfer_models_azure.py b/tests/test_blobxfer_models_azure.py new file mode 100644 index 0000000..37a40a5 --- /dev/null +++ b/tests/test_blobxfer_models_azure.py @@ -0,0 +1,51 @@ +# coding=utf-8 +"""Tests for models azure""" + +# stdlib imports +import mock +# non-stdlib imports +import azure.storage +import azure.storage.blob +import azure.storage.file +# module under test +import blobxfer.models.azure as azmodels + + +def test_azurestorageentity(): + ase = azmodels.StorageEntity('cont') + assert ase.container == 'cont' + assert ase.encryption_metadata is None + + blob = mock.MagicMock() + blob.name = 'name' + blob.snapshot = None + blob.properties = mock.MagicMock() + blob.properties.last_modified = 'lmt' + blob.properties.content_length = 123 + blob.properties.content_settings = mock.MagicMock() + blob.properties.content_settings.content_md5 = 'abc' + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.BlockBlob + ase.populate_from_blob(mock.MagicMock(), blob) + + assert ase.client is not None + assert ase.name == 'name' + assert ase.lmt == 'lmt' + assert ase.size == 123 + assert ase.md5 == 'abc' + assert ase.snapshot is None + assert ase.mode == azmodels.StorageModes.Block + + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.AppendBlob + ase.populate_from_blob(mock.MagicMock(), blob) + assert ase.mode == azmodels.StorageModes.Append + + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.PageBlob + blob.snapshot = 'abc' + ase.populate_from_blob(mock.MagicMock(), blob) + assert ase.mode == azmodels.StorageModes.Page + assert ase.snapshot is not None + + blob.snapshot = None + ase.populate_from_file(mock.MagicMock(), blob) + assert ase.mode == azmodels.StorageModes.File + assert ase.snapshot is None diff --git a/tests/test_blobxfer_crypto_models.py b/tests/test_blobxfer_models_crypto.py similarity index 98% rename from tests/test_blobxfer_crypto_models.py rename to tests/test_blobxfer_models_crypto.py index 8d58419..8503a71 100644 --- a/tests/test_blobxfer_crypto_models.py +++ b/tests/test_blobxfer_models_crypto.py @@ -1,5 +1,5 @@ # coding=utf-8 -"""Tests for crypto operations""" +"""Tests for crypto models""" # stdlib imports import copy @@ -8,8 +8,8 @@ import 
pytest # local imports # module under test -import blobxfer.crypto.models as models -import blobxfer.crypto.operations as ops +import blobxfer.models.crypto as models +import blobxfer.operations.crypto as ops _SAMPLE_RSA_KEY = """ diff --git a/tests/test_blobxfer_download_models.py b/tests/test_blobxfer_models_download.py similarity index 74% rename from tests/test_blobxfer_download_models.py rename to tests/test_blobxfer_models_download.py index e91607e..69133e2 100644 --- a/tests/test_blobxfer_download_models.py +++ b/tests/test_blobxfer_models_download.py @@ -13,10 +13,77 @@ # non-stdlib imports import pytest # local imports -import blobxfer.models +import blobxfer.models.azure as azmodels +import blobxfer.models.options as options +import blobxfer.operations.azure as azops import blobxfer.util as util # module under test -import blobxfer.download.models as models +import blobxfer.models.download as models + + +def test_localdestinationpath(tmpdir): + tmpdir.mkdir('1') + path = tmpdir.join('1') + + a = models.LocalDestinationPath(str(path)) + a.is_dir = True + assert str(a.path) == str(path) + assert a.is_dir + + a.ensure_path_exists() + assert os.path.exists(str(a.path)) + + b = models.LocalDestinationPath() + b.is_dir = False + b.path = str(path) + with pytest.raises(RuntimeError): + b.ensure_path_exists() + assert not b.is_dir + + path2 = tmpdir.join('2') + path3 = path2.join('3') + c = models.LocalDestinationPath(str(path3)) + with pytest.raises(RuntimeError): + c.ensure_path_exists() + c.is_dir = False + c.ensure_path_exists() + assert os.path.exists(str(path2)) + assert os.path.isdir(str(path2)) + assert not c.is_dir + + +def test_downloadspecification(): + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=options.SkipOn( + filesize_match=True, + lmt_ge=False, + md5_match=True, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + + asp = azops.SourcePath() + p = 'some/remote/path' + asp.add_path_with_storage_account(p, 'sa') + + ds.add_azure_source_path(asp) + + assert ds.options.check_file_md5 + assert not ds.skip_on.lmt_ge + assert ds.destination.path == pathlib.Path('dest') + assert len(ds.sources) == 1 + assert p in ds.sources[0]._path_map + assert ds.sources[0]._path_map[p] == 'sa' def test_downloaddescriptor(tmpdir): @@ -25,14 +92,14 @@ def test_downloaddescriptor(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 1024 ase._encryption = mock.MagicMock() with pytest.raises(RuntimeError): - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) ase._encryption.symmetric_key = b'123' - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) assert d.entity == ase assert not d.must_compute_md5 @@ -44,19 +111,19 @@ def test_downloaddescriptor(tmpdir): d.local_path.unlink() ase._size = 1 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) assert d._total_chunks == 1 assert d.local_path.stat().st_size == 0 d.local_path.unlink() ase._encryption = None ase._size = 1024 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) assert d.local_path.stat().st_size 
== 1024 # pre-existing file check ase._size = 0 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) assert d._total_chunks == 0 assert d.local_path.stat().st_size == 0 @@ -67,9 +134,9 @@ def test_downloaddescriptor_next_offsets(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 256 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 128 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 @@ -82,12 +149,12 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert d.next_offsets() is None ase._size = 0 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) assert d._total_chunks == 0 assert d.next_offsets() is None ase._size = 1 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 assert offsets.chunk_num == 0 @@ -99,7 +166,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert d.next_offsets() is None ase._size = 256 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 assert offsets.chunk_num == 0 @@ -111,7 +178,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert d.next_offsets() is None ase._size = 256 + 16 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 2 assert offsets.chunk_num == 0 @@ -132,7 +199,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'123' ase._size = 128 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 assert offsets.chunk_num == 0 @@ -144,7 +211,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert d.next_offsets() is None ase._size = 256 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 1 assert offsets.chunk_num == 0 @@ -156,7 +223,7 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert d.next_offsets() is None ase._size = 256 + 32 # 16 bytes over + padding - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() assert d._total_chunks == 2 assert offsets.chunk_num == 0 @@ -181,9 +248,9 @@ def test_postpone_integrity_check(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 32 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() d._postpone_integrity_check(offsets, b'0' * ase._size) @@ -198,9 +265,9 @@ def test_postpone_integrity_check(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 32 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() d._postpone_integrity_check(offsets, b'0' * ase._size) @@ -219,9 +286,9 @@ def test_perform_chunked_integrity_check(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 - ase = 
blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() data = b'0' * opts.chunk_size_bytes @@ -233,11 +300,11 @@ def test_perform_chunked_integrity_check(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'123' - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() data = b'0' * opts.chunk_size_bytes @@ -255,10 +322,10 @@ def test_cleanup_all_temporary_files(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 16 lp = pathlib.Path(str(tmpdir.join('a'))) - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() data = b'0' * opts.chunk_size_bytes @@ -269,7 +336,7 @@ def test_cleanup_all_temporary_files(tmpdir): assert not d._unchecked_chunks[0].file_path.exists() lp = pathlib.Path(str(tmpdir.join('b'))) - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() data = b'0' * opts.chunk_size_bytes @@ -288,9 +355,9 @@ def test_write_data(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) offsets = d.next_offsets() data = b'0' * ase._size @@ -306,7 +373,7 @@ def test_finalize_file(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'123' @@ -322,7 +389,7 @@ def test_finalize_file(tmpdir): message_authentication_code = util.base64_encode_as_string( _hmac.digest()) - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) d.hmac.update(data) d.finalize_file() @@ -335,7 +402,7 @@ def test_finalize_file(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 data = b'0' * ase._size @@ -343,7 +410,7 @@ def test_finalize_file(tmpdir): md5.update(data) ase._md5 = util.base64_encode_as_string(md5.digest()) - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) d.md5.update(data) d.finalize_file() @@ -356,12 +423,12 @@ def test_finalize_file(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 data = b'0' * ase._size - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) d.finalize_file() assert not d.local_path.exists() @@ -373,13 +440,13 @@ def test_finalize_file(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 data = 
b'0' * ase._size ase._md5 = 'oops' - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) d.md5.update(data) d.finalize_file() @@ -392,10 +459,10 @@ def test_operations(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.DownloadDescriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts) d._outstanding_ops = 1 d._unchecked_chunks = {0: None} assert not d.all_operations_completed diff --git a/tests/test_blobxfer_offload.py b/tests/test_blobxfer_models_offload.py similarity index 94% rename from tests/test_blobxfer_offload.py rename to tests/test_blobxfer_models_offload.py index 71cc97d..ca5a2bb 100644 --- a/tests/test_blobxfer_offload.py +++ b/tests/test_blobxfer_models_offload.py @@ -7,7 +7,7 @@ import pytest # local imports # module under test -import blobxfer.offload as offload +import blobxfer.models.offload as offload def test_multiprocess_offload(): diff --git a/tests/test_blobxfer_models_options.py b/tests/test_blobxfer_models_options.py new file mode 100644 index 0000000..1e7cb8b --- /dev/null +++ b/tests/test_blobxfer_models_options.py @@ -0,0 +1,82 @@ +# coding=utf-8 +"""Tests for models options""" + +# stdlib imports +import mock +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +import pytest +# module under test +import blobxfer.models.options as options + + +@mock.patch('multiprocessing.cpu_count', return_value=1) +def test_concurrency_options(patched_cc): + a = options.Concurrency( + crypto_processes=-1, + md5_processes=0, + transfer_threads=-2, + ) + + assert a.crypto_processes == 0 + assert a.md5_processes == 1 + assert a.transfer_threads == 3 + + +@mock.patch('multiprocessing.cpu_count', return_value=10) +def test_concurrency_options_max_transfer_threads(patched_cc): + a = options.Concurrency( + crypto_processes=1, + md5_processes=1, + transfer_threads=None, + ) + + assert a.transfer_threads == 24 + + +def test_general_options(): + a = options.General( + concurrency=options.Concurrency( + crypto_processes=1, + md5_processes=2, + transfer_threads=3, + ), + progress_bar=False, + resume_file='abc', + timeout_sec=1, + verbose=True, + ) + + assert a.concurrency.crypto_processes == 1 + assert a.concurrency.md5_processes == 2 + assert a.concurrency.transfer_threads == 3 + assert not a.progress_bar + assert a.resume_file == pathlib.Path('abc') + assert a.timeout_sec == 1 + assert a.verbose + + a = options.General( + concurrency=options.Concurrency( + crypto_processes=1, + md5_processes=2, + transfer_threads=3, + ), + progress_bar=False, + resume_file=None, + timeout_sec=1, + verbose=True, + ) + + assert a.concurrency.crypto_processes == 1 + assert a.concurrency.md5_processes == 2 + assert a.concurrency.transfer_threads == 3 + assert not a.progress_bar + assert a.resume_file is None + assert a.timeout_sec == 1 + assert a.verbose + + with pytest.raises(ValueError): + a = options.General(None) diff --git a/tests/test_blobxfer_models_upload.py b/tests/test_blobxfer_models_upload.py new file mode 100644 index 0000000..21d9494 --- /dev/null +++ b/tests/test_blobxfer_models_upload.py @@ -0,0 +1,56 @@ +# coding=utf-8 +"""Tests for models upload""" + +# stdlib imports +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +import pytest +# module under test +import blobxfer.models.upload as upload + + +def 
test_localsourcepaths_files(tmpdir): + tmpdir.mkdir('abc') + tmpdir.join('moo.cow').write('z') + abcpath = tmpdir.join('abc') + abcpath.join('hello.txt').write('hello') + abcpath.join('blah.x').write('x') + abcpath.join('blah.y').write('x') + abcpath.join('blah.z').write('x') + abcpath.mkdir('def') + defpath = abcpath.join('def') + defpath.join('world.txt').write('world') + defpath.join('moo.cow').write('y') + + a = upload.LocalSourcePaths() + a.add_include('*.txt') + a.add_includes(['moo.cow', '*blah*']) + with pytest.raises(ValueError): + a.add_includes('abc') + a.add_exclude('**/blah.x') + a.add_excludes(['world.txt']) + with pytest.raises(ValueError): + a.add_excludes('abc') + a.add_path(str(tmpdir)) + a_set = set() + for file in a.files(): + sfile = str(file.parent_path / file.relative_path) + a_set.add(sfile) + + assert len(a.paths) == 1 + assert str(abcpath.join('blah.x')) not in a_set + assert str(defpath.join('world.txt')) in a_set + assert str(defpath.join('moo.cow')) not in a_set + + b = upload.LocalSourcePaths() + b.add_includes(['moo.cow', '*blah*']) + b.add_include('*.txt') + b.add_excludes(['world.txt']) + b.add_exclude('**/blah.x') + b.add_paths([pathlib.Path(str(tmpdir))]) + for file in a.files(): + sfile = str(file.parent_path / file.relative_path) + assert sfile in a_set diff --git a/tests/test_blobxfer_operations.py b/tests/test_blobxfer_operations.py deleted file mode 100644 index 9b648f6..0000000 --- a/tests/test_blobxfer_operations.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding=utf-8 -"""Tests for operations""" - -# stdlib imports -import mock -# non-stdlib imports -import pytest -# local imports -import blobxfer.models -# module under test -import blobxfer.operations as ops - - -@mock.patch('blobxfer.file.operations.check_if_single_file') -@mock.patch('blobxfer.blob.operations.check_if_single_blob') -def test_ensure_local_destination(patched_blob, patched_file, tmpdir): - downdir = tmpdir.join('down') - - # non-file tests - ds = blobxfer.models.DownloadSpecification( - download_options=blobxfer.models.DownloadOptions( - check_file_md5=True, - chunk_size_bytes=4194304, - delete_extraneous_destination=False, - mode=blobxfer.models.AzureStorageModes.Auto, - overwrite=True, - recursive=True, - restore_file_attributes=False, - rsa_private_key=None, - ), - skip_on_options=mock.MagicMock(), - local_destination_path=blobxfer.models.LocalDestinationPath( - str(downdir) - ), - ) - - with pytest.raises(RuntimeError): - ops.ensure_local_destination(mock.MagicMock(), ds) - - asp = blobxfer.models.AzureSourcePath() - p = 'cont/remote/path' - asp.add_path_with_storage_account(p, 'sa') - - ds.add_azure_source_path(asp) - - patched_blob.return_value = False - ops.ensure_local_destination(mock.MagicMock(), ds) - assert ds.destination.is_dir - - patched_blob.return_value = True - with pytest.raises(RuntimeError): - ops.ensure_local_destination(mock.MagicMock(), ds) - - # file tests - ds = blobxfer.models.DownloadSpecification( - download_options=blobxfer.models.DownloadOptions( - check_file_md5=True, - chunk_size_bytes=4194304, - delete_extraneous_destination=False, - mode=blobxfer.models.AzureStorageModes.File, - overwrite=True, - recursive=True, - restore_file_attributes=False, - rsa_private_key=None, - ), - skip_on_options=mock.MagicMock(), - local_destination_path=blobxfer.models.LocalDestinationPath( - str(downdir) - ), - ) - - ds.add_azure_source_path(asp) - - patched_file.return_value = (False, None) - ops.ensure_local_destination(mock.MagicMock(), ds) - assert 
ds.destination.is_dir - - patched_file.return_value = (True, mock.MagicMock()) - with pytest.raises(RuntimeError): - ops.ensure_local_destination(mock.MagicMock(), ds) diff --git a/tests/test_blobxfer_operations_azure.py b/tests/test_blobxfer_operations_azure.py new file mode 100644 index 0000000..c90340a --- /dev/null +++ b/tests/test_blobxfer_operations_azure.py @@ -0,0 +1,161 @@ +# coding=utf-8 +"""Tests for operations azure""" + +# stdlib imports +import mock +# non-stdlib imports +import azure.storage +import azure.storage.blob +import azure.storage.file +import pytest +# module under test +import blobxfer.models.azure as azmodels +import blobxfer.operations.azure as azops + + +def test_storage_credentials(): + creds = azops.StorageCredentials() + creds.add_storage_account('sa1', 'somekey1', 'endpoint') + + a = creds.get_storage_account('sa1') + assert a.name == 'sa1' + assert a.key == 'somekey1' + assert a.endpoint == 'endpoint' + assert isinstance( + a.append_blob_client, azure.storage.blob.AppendBlobService) + assert isinstance( + a.block_blob_client, azure.storage.blob.BlockBlobService) + assert isinstance( + a.file_client, azure.storage.file.FileService) + assert isinstance( + a.page_blob_client, azure.storage.blob.PageBlobService) + + with pytest.raises(KeyError): + a = creds.get_storage_account('sa2') + + with pytest.raises(ValueError): + creds.add_storage_account('sa1', 'somekeyxx', 'endpoint') + + creds.add_storage_account('sa2', 'somekey2', 'endpoint2') + a = creds.get_storage_account('sa1') + b = creds.get_storage_account('sa2') + assert a.name == 'sa1' + assert a.key == 'somekey1' + assert a.endpoint == 'endpoint' + assert b.name == 'sa2' + assert b.key == 'somekey2' + assert b.endpoint == 'endpoint2' + + +def test_key_is_sas(): + a = azops.StorageAccount('name', 'abcdef', 'endpoint') + assert not a.is_sas + + a = azops.StorageAccount('name', 'abcdef&blah', 'endpoint') + assert not a.is_sas + + a = azops.StorageAccount('name', '?abcdef', 'endpoint') + assert a.is_sas + + a = azops.StorageAccount( + 'name', '?sv=0&sr=1&sig=2', 'endpoint') + assert a.is_sas + + a = azops.StorageAccount( + 'name', 'sv=0&sr=1&sig=2', 'endpoint') + assert a.is_sas + + a = azops.StorageAccount( + 'name', 'sig=0&sv=0&sr=1&se=2', 'endpoint') + assert a.is_sas + + +def test_azuresourcepath(): + p = '/cont/remote/path' + asp = azops.SourcePath() + asp.add_path_with_storage_account(p, 'sa') + + with pytest.raises(RuntimeError): + asp.add_path_with_storage_account('x', 'x') + + assert 'sa' == asp.lookup_storage_account(p) + + +@mock.patch('blobxfer.models.crypto.EncryptionMetadata') +@mock.patch('blobxfer.operations.azure.file.list_files') +def test_azuresourcepath_files(patched_lf, patched_em): + p = '/cont/remote/path' + asp = azops.SourcePath() + asp.add_path_with_storage_account(p, 'sa') + + options = mock.MagicMock() + options.mode = azmodels.StorageModes.File + creds = mock.MagicMock() + creds.get_storage_account = mock.MagicMock() + sa = mock.MagicMock() + sa.file_client = mock.MagicMock() + creds.get_storage_account.return_value = sa + f = azure.storage.file.models.File(name='name') + patched_lf.side_effect = [[f]] + patched_em.encryption_metadata_exists = mock.MagicMock() + patched_em.encryption_metadata_exists.return_value = False + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is None + assert i == 1 + + fe = azure.storage.file.models.File(name='name') + fe.metadata = {'encryptiondata': {'a': 'b'}} 
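# Editor's note: the EncryptionMetadata helpers mocked in the tests below are
# not defined in this patch. As the tests suggest, detection hinges on an
# 'encryptiondata' key in the listed entity's metadata; the function here is a
# hypothetical stand-in for encryption_metadata_exists, not the real
# blobxfer.models.crypto implementation.
def encryption_metadata_exists(metadata):
    # metadata is the dict attached to a listed blob or file; treat any
    # non-empty 'encryptiondata' value as client-side encrypted content
    try:
        return bool(metadata and metadata.get('encryptiondata'))
    except AttributeError:  # metadata is not dict-like
        return False


# mirrors the branch taken in SourcePath._populate_from_list_files above:
# only entities that pass this check get an EncryptionMetadata object attached
assert encryption_metadata_exists({'encryptiondata': {'a': 'b'}})
assert not encryption_metadata_exists({})
assert not encryption_metadata_exists(None)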
+ patched_lf.side_effect = [[fe]] + patched_em.encryption_metadata_exists.return_value = True + patched_em.convert_from_json = mock.MagicMock() + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is not None + assert i == 1 + + +@mock.patch('blobxfer.models.crypto.EncryptionMetadata') +@mock.patch('blobxfer.operations.azure.blob.list_blobs') +def test_azuresourcepath_blobs(patched_lb, patched_em): + p = '/cont/remote/path' + asp = azops.SourcePath() + asp.add_path_with_storage_account(p, 'sa') + + options = mock.MagicMock() + options.mode = azmodels.StorageModes.Auto + creds = mock.MagicMock() + creds.get_storage_account = mock.MagicMock() + sa = mock.MagicMock() + sa.block_blob_client = mock.MagicMock() + creds.get_storage_account.return_value = sa + b = azure.storage.blob.models.Blob(name='name') + patched_lb.side_effect = [[b]] + patched_em.encryption_metadata_exists = mock.MagicMock() + patched_em.encryption_metadata_exists.return_value = False + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is None + assert i == 1 + + be = azure.storage.blob.models.Blob(name='name') + be.metadata = {'encryptiondata': {'a': 'b'}} + patched_lb.side_effect = [[be]] + patched_em.encryption_metadata_exists.return_value = True + patched_em.convert_from_json = mock.MagicMock() + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is not None + assert i == 1 diff --git a/tests/test_blobxfer_blob_operations.py b/tests/test_blobxfer_operations_azure_blob.py similarity index 83% rename from tests/test_blobxfer_blob_operations.py rename to tests/test_blobxfer_operations_azure_blob.py index dd635f2..d6ad180 100644 --- a/tests/test_blobxfer_blob_operations.py +++ b/tests/test_blobxfer_operations_azure_blob.py @@ -8,9 +8,9 @@ import azure.storage.blob import pytest # local imports -import blobxfer.models as models +import blobxfer.models.azure as azmodels # module under test -import blobxfer.blob.operations as ops +import blobxfer.operations.azure.blob as ops def test_check_if_single_blob(): @@ -36,7 +36,7 @@ def test_check_if_single_blob(): def test_list_blobs(): with pytest.raises(RuntimeError): for blob in ops.list_blobs( - None, 'cont', 'prefix', models.AzureStorageModes.File): + None, 'cont', 'prefix', azmodels.StorageModes.File): pass _blob = azure.storage.blob.models.Blob(name='name') @@ -46,7 +46,7 @@ def test_list_blobs(): i = 0 for blob in ops.list_blobs( - client, 'cont', 'prefix', models.AzureStorageModes.Auto): + client, 'cont', 'prefix', azmodels.StorageModes.Auto): i += 1 assert blob.name == 'name' assert i == 1 @@ -55,14 +55,14 @@ def test_list_blobs(): azure.storage.blob.models._BlobTypes.AppendBlob i = 0 for blob in ops.list_blobs( - client, 'dir', 'prefix', models.AzureStorageModes.Block): + client, 'dir', 'prefix', azmodels.StorageModes.Block): i += 1 assert blob.name == 'name' assert i == 0 i = 0 for blob in ops.list_blobs( - client, 'dir', 'prefix', models.AzureStorageModes.Page): + client, 'dir', 'prefix', azmodels.StorageModes.Page): i += 1 assert blob.name == 'name' assert i == 0 @@ -71,7 +71,7 @@ def test_list_blobs(): azure.storage.blob.models._BlobTypes.BlockBlob i = 0 for blob in ops.list_blobs( - client, 'dir', 'prefix', models.AzureStorageModes.Append): + client, 'dir', 'prefix', azmodels.StorageModes.Append): i += 1 
assert blob.name == 'name' assert i == 0 @@ -82,7 +82,7 @@ def test_list_blobs(): for blob in ops.list_blobs( client, 'cont', 'a?snapshot=2017-02-23T22:21:14.8121864Z', - models.AzureStorageModes.Auto): + azmodels.StorageModes.Auto): i += 1 assert blob.name == 'name' assert blob.snapshot == _blob.snapshot diff --git a/tests/test_blobxfer_blob_append_operations.py b/tests/test_blobxfer_operations_azure_blob_append.py similarity index 71% rename from tests/test_blobxfer_blob_append_operations.py rename to tests/test_blobxfer_operations_azure_blob_append.py index e207057..5553b7d 100644 --- a/tests/test_blobxfer_blob_append_operations.py +++ b/tests/test_blobxfer_operations_azure_blob_append.py @@ -1,17 +1,17 @@ # coding=utf-8 -"""Tests for models""" +"""Tests for operations: blob append""" # stdlib imports # non-stdlib imports import azure.storage # local imports -import blobxfer.models as models # module under test -import blobxfer.blob.append.operations as ops +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.blob.append as ops def test_create_client(): - sa = models.AzureStorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.AppendBlobService) @@ -19,7 +19,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.AppendBlobService) diff --git a/tests/test_blobxfer_blob_block_operations.py b/tests/test_blobxfer_operations_azure_blob_block.py similarity index 71% rename from tests/test_blobxfer_blob_block_operations.py rename to tests/test_blobxfer_operations_azure_blob_block.py index dc83b8b..4aece2d 100644 --- a/tests/test_blobxfer_blob_block_operations.py +++ b/tests/test_blobxfer_operations_azure_blob_block.py @@ -1,18 +1,17 @@ # coding=utf-8 -"""Tests for models""" +"""Tests for operations: block blob""" # stdlib imports # non-stdlib imports import azure.storage -import pytest # local imports -import blobxfer.models as models # module under test -import blobxfer.blob.block.operations as ops +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.blob.block as ops def test_create_client(): - sa = models.AzureStorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.BlockBlobService) @@ -20,7 +19,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.BlockBlobService) diff --git a/tests/test_blobxfer_blob_page_operations.py b/tests/test_blobxfer_operations_azure_blob_page.py similarity index 74% rename from tests/test_blobxfer_blob_page_operations.py rename to tests/test_blobxfer_operations_azure_blob_page.py index 8ae5989..f70e83d 100644 --- a/tests/test_blobxfer_blob_page_operations.py +++ b/tests/test_blobxfer_operations_azure_blob_page.py @@ -4,15 +4,14 @@ # stdlib imports # non-stdlib imports 
import azure.storage -import pytest # local imports -import blobxfer.models as models # module under test -import blobxfer.blob.page.operations as ops +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.blob.page as ops def test_create_client(): - sa = models.AzureStorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.PageBlobService) @@ -20,7 +19,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.PageBlobService) diff --git a/tests/test_blobxfer_file_operations.py b/tests/test_blobxfer_operations_azure_file.py similarity index 93% rename from tests/test_blobxfer_file_operations.py rename to tests/test_blobxfer_operations_azure_file.py index e354bda..c6bf764 100644 --- a/tests/test_blobxfer_file_operations.py +++ b/tests/test_blobxfer_operations_azure_file.py @@ -7,14 +7,14 @@ import azure.common import azure.storage # local imports -import blobxfer.models as models import blobxfer.util as util # module under test -import blobxfer.file.operations as ops +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.file as ops def test_create_client(): - sa = models.AzureStorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.file.FileService) @@ -22,7 +22,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = models.AzureStorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.file.FileService) @@ -94,7 +94,7 @@ def test_list_files_single_file(): @mock.patch( - 'blobxfer.file.operations.check_if_single_file', + 'blobxfer.operations.azure.file.check_if_single_file', return_value=(False, None) ) def test_list_files_directory(patched_cisf): diff --git a/tests/test_blobxfer_crypto_operations.py b/tests/test_blobxfer_operations_crypto.py similarity index 96% rename from tests/test_blobxfer_crypto_operations.py rename to tests/test_blobxfer_operations_crypto.py index 84d633a..3ed2262 100644 --- a/tests/test_blobxfer_crypto_operations.py +++ b/tests/test_blobxfer_operations_crypto.py @@ -8,9 +8,9 @@ # non-stdlib imports import cryptography.hazmat.primitives.asymmetric.rsa # local imports -import blobxfer.download.models +import blobxfer.models.download # module under test -import blobxfer.crypto.operations as ops +import blobxfer.operations.crypto as ops _RSAKEY = cryptography.hazmat.primitives.asymmetric.rsa.generate_private_key( @@ -94,7 +94,7 @@ def test_cryptooffload_decrypt(): a = None try: a = ops.CryptoOffload(1) - offsets = blobxfer.download.models.DownloadOffsets( + offsets = blobxfer.models.download.Offsets( chunk_num=0, fd_start=1, num_bytes=2, diff --git a/tests/test_blobxfer_download_operations.py b/tests/test_blobxfer_operations_download.py similarity index 78% rename from tests/test_blobxfer_download_operations.py rename to tests/test_blobxfer_operations_download.py 
index 3645291..4e05182 100644 --- a/tests/test_blobxfer_download_operations.py +++ b/tests/test_blobxfer_operations_download.py @@ -18,11 +18,82 @@ import azure.storage.blob import pytest # local imports -import blobxfer.download.models -import blobxfer.models as models +import blobxfer.models.azure as azmodels +import blobxfer.models.download as models +import blobxfer.models.options as options +import blobxfer.operations.azure as azops import blobxfer.util as util # module under test -import blobxfer.download.operations as ops +import blobxfer.operations.download as ops + + +@mock.patch('blobxfer.operations.azure.file.check_if_single_file') +@mock.patch('blobxfer.operations.azure.blob.check_if_single_blob') +def test_ensure_local_destination(patched_blob, patched_file, tmpdir): + downdir = tmpdir.join('down') + + # non-file tests + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + local_destination_path=models.LocalDestinationPath( + str(downdir) + ), + ) + + with pytest.raises(RuntimeError): + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + + asp = azops.SourcePath() + p = 'cont/remote/path' + asp.add_path_with_storage_account(p, 'sa') + + ds.add_azure_source_path(asp) + + patched_blob.return_value = False + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + assert ds.destination.is_dir + + patched_blob.return_value = True + with pytest.raises(RuntimeError): + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + + # file tests + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.File, + overwrite=True, + recursive=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + local_destination_path=models.LocalDestinationPath( + str(downdir) + ), + ) + + ds.add_azure_source_path(asp) + + patched_file.return_value = (False, None) + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + assert ds.destination.is_dir + + patched_file.return_value = (True, mock.MagicMock()) + with pytest.raises(RuntimeError): + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) def test_check_download_conditions(tmpdir): @@ -31,18 +102,18 @@ def test_check_download_conditions(tmpdir): ep = pathlib.Path(str(ap)) nep = pathlib.Path(str(tmpdir.join('nep'))) - ds = models.DownloadSpecification( - download_options=models.DownloadOptions( + ds = models.Specification( + download_options=options.Download( check_file_md5=True, chunk_size_bytes=4194304, delete_extraneous_destination=False, - mode=models.AzureStorageModes.Auto, + mode=azmodels.StorageModes.Auto, overwrite=False, recursive=True, restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=models.SkipOnOptions( + skip_on_options=options.SkipOn( filesize_match=True, lmt_ge=True, md5_match=True, @@ -55,18 +126,18 @@ def test_check_download_conditions(tmpdir): result = d._check_download_conditions(ep, mock.MagicMock()) assert result == ops.DownloadAction.Skip - ds = models.DownloadSpecification( - download_options=models.DownloadOptions( + ds = models.Specification( + download_options=options.Download( check_file_md5=True, 
chunk_size_bytes=4194304, delete_extraneous_destination=False, - mode=models.AzureStorageModes.Auto, + mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=models.SkipOnOptions( + skip_on_options=options.SkipOn( filesize_match=True, lmt_ge=True, md5_match=True, @@ -77,18 +148,18 @@ def test_check_download_conditions(tmpdir): result = d._check_download_conditions(ep, mock.MagicMock()) assert result == ops.DownloadAction.CheckMd5 - ds = models.DownloadSpecification( - download_options=models.DownloadOptions( + ds = models.Specification( + download_options=options.Download( check_file_md5=True, chunk_size_bytes=4194304, delete_extraneous_destination=False, - mode=models.AzureStorageModes.Auto, + mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=models.SkipOnOptions( + skip_on_options=options.SkipOn( filesize_match=False, lmt_ge=False, md5_match=False, @@ -99,18 +170,18 @@ def test_check_download_conditions(tmpdir): result = d._check_download_conditions(ep, mock.MagicMock()) assert result == ops.DownloadAction.Download - ds = models.DownloadSpecification( - download_options=models.DownloadOptions( + ds = models.Specification( + download_options=options.Download( check_file_md5=True, chunk_size_bytes=4194304, delete_extraneous_destination=False, - mode=models.AzureStorageModes.Auto, + mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=models.SkipOnOptions( + skip_on_options=options.SkipOn( filesize_match=True, lmt_ge=False, md5_match=False, @@ -118,29 +189,29 @@ def test_check_download_conditions(tmpdir): local_destination_path=models.LocalDestinationPath('dest'), ) d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) - rfile = models.AzureStorageEntity('cont') + rfile = azmodels.StorageEntity('cont') rfile._size = util.page_align_content_length(ep.stat().st_size) - rfile._mode = models.AzureStorageModes.Page + rfile._mode = azmodels.StorageModes.Page result = d._check_download_conditions(ep, rfile) assert result == ops.DownloadAction.Skip rfile._size = ep.stat().st_size - rfile._mode = models.AzureStorageModes.Page + rfile._mode = azmodels.StorageModes.Page result = d._check_download_conditions(ep, rfile) assert result == ops.DownloadAction.Download - ds = models.DownloadSpecification( - download_options=models.DownloadOptions( + ds = models.Specification( + download_options=options.Download( check_file_md5=True, chunk_size_bytes=4194304, delete_extraneous_destination=False, - mode=models.AzureStorageModes.Auto, + mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, restore_file_attributes=False, rsa_private_key=None, ), - skip_on_options=models.SkipOnOptions( + skip_on_options=options.SkipOn( filesize_match=False, lmt_ge=True, md5_match=False, @@ -148,7 +219,7 @@ def test_check_download_conditions(tmpdir): local_destination_path=models.LocalDestinationPath('dest'), ) d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) - rfile = models.AzureStorageEntity('cont') + rfile = azmodels.StorageEntity('cont') rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) + \ datetime.timedelta(days=1) result = d._check_download_conditions(ep, rfile) @@ -164,7 +235,7 @@ def test_pre_md5_skip_on_check(): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._md5_offload = mock.MagicMock() - rfile = 
models.AzureStorageEntity('cont') + rfile = azmodels.StorageEntity('cont') rfile._encryption = mock.MagicMock() rfile._encryption.blobxfer_extensions = mock.MagicMock() rfile._encryption.blobxfer_extensions.pre_encrypted_content_md5 = \ @@ -186,7 +257,7 @@ def test_post_md5_skip_on_check(): d._md5_offload = mock.MagicMock() lpath = 'lpath' - rfile = models.AzureStorageEntity('cont') + rfile = azmodels.StorageEntity('cont') rfile._md5 = 'abc' d._pre_md5_skip_on_check(lpath, rfile) d._download_set.add(pathlib.Path(lpath)) @@ -217,7 +288,7 @@ def test_check_for_downloads_from_md5(): assert d._add_to_download_queue.call_count == 0 with mock.patch( - 'blobxfer.download.operations.Downloader.' + 'blobxfer.operations.download.Downloader.' 'termination_check_md5', new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( @@ -233,7 +304,7 @@ def test_check_for_downloads_from_md5(): assert d._add_to_download_queue.call_count == 1 with mock.patch( - 'blobxfer.download.operations.Downloader.' + 'blobxfer.operations.download.Downloader.' 'termination_check_md5', new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( @@ -267,7 +338,7 @@ def test_check_for_crypto_done(): assert d._complete_chunk_download.call_count == 0 with mock.patch( - 'blobxfer.download.operations.Downloader.termination_check', + 'blobxfer.operations.download.Downloader.termination_check', new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) @@ -288,7 +359,7 @@ def test_check_for_crypto_done(): def test_add_to_download_queue(tmpdir): path = tmpdir.join('a') lpath = pathlib.Path(str(path)) - ase = models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 1 ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' @@ -320,9 +391,9 @@ def test_complete_chunk_download(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 16 - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) offsets = dd.next_offsets() @@ -335,9 +406,9 @@ def test_complete_chunk_download(tmpdir): assert dd._completed_ops == 1 -@mock.patch('blobxfer.crypto.operations.aes_cbc_decrypt_data') -@mock.patch('blobxfer.file.operations.get_file_range') -@mock.patch('blobxfer.blob.operations.get_blob_range') +@mock.patch('blobxfer.operations.crypto.aes_cbc_decrypt_data') +@mock.patch('blobxfer.operations.azure.file.get_file_range') +@mock.patch('blobxfer.operations.azure.blob.get_blob_range') def test_worker_thread_download( patched_gbr, patched_gfr, patched_acdd, tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) @@ -352,10 +423,10 @@ def test_worker_thread_download( assert d._complete_chunk_download.call_count == 0 with mock.patch( - 'blobxfer.download.operations.Downloader.termination_check', + 'blobxfer.operations.download.Downloader.termination_check', new_callable=mock.PropertyMock) as patched_tc: with mock.patch( - 'blobxfer.download.models.DownloadDescriptor.' + 'blobxfer.models.download.Descriptor.' 
'all_operations_completed', new_callable=mock.PropertyMock) as patched_aoc: d = ops.Downloader( @@ -364,12 +435,12 @@ def test_worker_thread_download( opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 16 ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' lp = pathlib.Path(str(tmpdir.join('a'))) - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) dd.next_offsets = mock.MagicMock(side_effect=[None, None]) dd.finalize_file = mock.MagicMock() patched_aoc.side_effect = [False, True] @@ -385,19 +456,19 @@ def test_worker_thread_download( assert d._download_count == 1 with mock.patch( - 'blobxfer.download.operations.Downloader.termination_check', + 'blobxfer.operations.download.Downloader.termination_check', new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') - ase._mode = blobxfer.models.AzureStorageModes.File + ase = azmodels.StorageEntity('cont') + ase._mode = azmodels.StorageModes.File ase._size = 16 patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('b'))) - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) dd.finalize_file = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() d._dd_map[str(lp)] = mock.MagicMock() @@ -411,22 +482,22 @@ def test_worker_thread_download( assert dd.perform_chunked_integrity_check.call_count == 1 with mock.patch( - 'blobxfer.download.operations.Downloader.termination_check', + 'blobxfer.operations.download.Downloader.termination_check', new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') - ase._mode = blobxfer.models.AzureStorageModes.Auto + ase = azmodels.StorageEntity('cont') + ase._mode = azmodels.StorageModes.Auto ase._size = 32 ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' ase._encryption.content_encryption_iv = b'0' * 16 patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('c'))) - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) dd.finalize_file = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() d._crypto_offload = mock.MagicMock() @@ -443,7 +514,7 @@ def test_worker_thread_download( assert dd.perform_chunked_integrity_check.call_count == 1 with mock.patch( - 'blobxfer.download.operations.Downloader.termination_check', + 'blobxfer.operations.download.Downloader.termination_check', new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) @@ -451,15 +522,15 @@ def test_worker_thread_download( opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') - ase._mode = blobxfer.models.AzureStorageModes.Auto + ase = azmodels.StorageEntity('cont') + ase._mode = azmodels.StorageModes.Auto ase._size = 32 ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' ase._encryption.content_encryption_iv = 
b'0' * 16 patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('d'))) - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) dd.next_offsets() dd.perform_chunked_integrity_check = mock.MagicMock() patched_acdd.return_value = b'0' * 16 @@ -480,9 +551,9 @@ def test_cleanup_temporary_files(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 16 - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) dd.cleanup_all_temporary_files = mock.MagicMock() dd.cleanup_all_temporary_files.side_effect = Exception d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) @@ -495,9 +566,9 @@ def test_cleanup_temporary_files(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 16 - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._general_options.resume_file = None d._dd_map[0] = dd @@ -508,9 +579,9 @@ def test_cleanup_temporary_files(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 - ase = blobxfer.models.AzureStorageEntity('cont') + ase = azmodels.StorageEntity('cont') ase._size = 16 - dd = blobxfer.download.models.DownloadDescriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts) dd.cleanup_all_temporary_files = mock.MagicMock() dd.cleanup_all_temporary_files.side_effect = Exception d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) @@ -521,9 +592,12 @@ def test_cleanup_temporary_files(tmpdir): @mock.patch('time.clock') -@mock.patch('blobxfer.md5.LocalFileMd5Offload') -@mock.patch('blobxfer.blob.operations.list_blobs') -@mock.patch('blobxfer.operations.ensure_local_destination', return_value=True) +@mock.patch('blobxfer.operations.md5.LocalFileMd5Offload') +@mock.patch('blobxfer.operations.azure.blob.list_blobs') +@mock.patch( + 'blobxfer.operations.download.Downloader.ensure_local_destination', + return_value=True +) def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._cleanup_temporary_files = mock.MagicMock() @@ -534,7 +608,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d._spec.sources = [] d._spec.options = mock.MagicMock() d._spec.options.chunk_size_bytes = 1 - d._spec.options.mode = models.AzureStorageModes.Auto + d._spec.options.mode = azmodels.StorageModes.Auto d._spec.options.overwrite = True d._spec.skip_on = mock.MagicMock() d._spec.skip_on.md5_match = False @@ -544,7 +618,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d._spec.destination.path = pathlib.Path(str(tmpdir)) p = '/cont/remote/path' - asp = models.AzureSourcePath() + asp = azops.SourcePath() asp.add_path_with_storage_account(p, 'sa') d._spec.sources.append(asp) diff --git a/tests/test_blobxfer_md5.py b/tests/test_blobxfer_operations_md5.py similarity index 72% rename from tests/test_blobxfer_md5.py rename to tests/test_blobxfer_operations_md5.py index c38e758..5bd7b20 100644 --- a/tests/test_blobxfer_md5.py +++ b/tests/test_blobxfer_operations_md5.py @@ -7,9 +7,9 @@ # 
non-stdlib imports import pytest # local imports -import blobxfer.models as models +import blobxfer.models.azure as azmodels # module under test -import blobxfer.md5 as md5 +import blobxfer.operations.md5 as ops def test_compute_md5(tmpdir): @@ -17,22 +17,22 @@ def test_compute_md5(tmpdir): testdata = str(uuid.uuid4()) with open(lpath, 'wt') as f: f.write(testdata) - md5_file = md5.compute_md5_for_file_asbase64(lpath) - md5_data = md5.compute_md5_for_data_asbase64(testdata.encode('utf8')) + md5_file = ops.compute_md5_for_file_asbase64(lpath) + md5_data = ops.compute_md5_for_data_asbase64(testdata.encode('utf8')) assert md5_file == md5_data - md5_file_page = md5.compute_md5_for_file_asbase64(lpath, True) + md5_file_page = ops.compute_md5_for_file_asbase64(lpath, True) assert md5_file != md5_file_page # test non-existent file with pytest.raises(IOError): - md5.compute_md5_for_file_asbase64(testdata) + ops.compute_md5_for_file_asbase64(testdata) def test_done_cv(): a = None try: - a = md5.LocalFileMd5Offload(num_workers=1) + a = ops.LocalFileMd5Offload(num_workers=1) assert a.done_cv == a._done_cv finally: if a: @@ -41,11 +41,11 @@ def test_done_cv(): def test_finalize_md5_processes(): with pytest.raises(ValueError): - md5.LocalFileMd5Offload(num_workers=0) + ops.LocalFileMd5Offload(num_workers=0) a = None try: - a = md5.LocalFileMd5Offload(num_workers=1) + a = ops.LocalFileMd5Offload(num_workers=1) finally: if a: a.finalize_processes() @@ -58,16 +58,16 @@ def test_from_add_to_done_non_pagealigned(tmpdir): file = tmpdir.join('a') file.write('abc') - remote_md5 = md5.compute_md5_for_file_asbase64(str(file)) + remote_md5 = ops.compute_md5_for_file_asbase64(str(file)) a = None try: - a = md5.LocalFileMd5Offload(num_workers=1) + a = ops.LocalFileMd5Offload(num_workers=1) result = a.pop_done_queue() assert result is None a.add_localfile_for_md5_check( - str(file), remote_md5, models.AzureStorageModes.Block) + str(file), remote_md5, azmodels.StorageModes.Block) i = 33 checked = False while i > 0: @@ -91,16 +91,16 @@ def test_from_add_to_done_pagealigned(tmpdir): file = tmpdir.join('a') file.write('abc') - remote_md5 = md5.compute_md5_for_file_asbase64(str(file), True) + remote_md5 = ops.compute_md5_for_file_asbase64(str(file), True) a = None try: - a = md5.LocalFileMd5Offload(num_workers=1) + a = ops.LocalFileMd5Offload(num_workers=1) result = a.pop_done_queue() assert result is None a.add_localfile_for_md5_check( - str(file), remote_md5, models.AzureStorageModes.Page) + str(file), remote_md5, azmodels.StorageModes.Page) i = 33 checked = False while i > 0: From a09bbfb65538c0fcf55e4b82277eb4babd481fe6 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Fri, 7 Apr 2017 14:38:57 -0700 Subject: [PATCH 18/47] Add file log, param preamble and progress bar --- blobxfer/models/options.py | 6 +- blobxfer/operations/download.py | 104 ++++++++++----- blobxfer/operations/progress.py | 141 +++++++++++++++++++++ blobxfer/util.py | 64 ++++++++-- cli/cli.py | 21 ++- cli/settings.py | 2 + tests/test_blobxfer_models_options.py | 3 + tests/test_blobxfer_operations_download.py | 2 +- tests/test_blobxfer_operations_progress.py | 38 ++++++ 9 files changed, 331 insertions(+), 50 deletions(-) create mode 100644 blobxfer/operations/progress.py create mode 100644 tests/test_blobxfer_operations_progress.py diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index f7c9f6f..c5f6da6 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -125,12 +125,13 @@ def __init__(self, crypto_processes, 
md5_processes, transfer_threads): class General(object): """General Options""" def __init__( - self, concurrency, progress_bar=True, resume_file=None, - timeout_sec=None, verbose=False): + self, concurrency, log_file=None, progress_bar=True, + resume_file=None, timeout_sec=None, verbose=False): """Ctor for General Options :param General self: this :param Concurrency concurrency: concurrency options :param bool progress_bar: progress bar + :param str log_file: log file :param str resume_file: resume file :param int timeout_sec: timeout in seconds :param bool verbose: verbose output @@ -138,6 +139,7 @@ def __init__( if concurrency is None: raise ValueError('concurrency option is unspecified') self.concurrency = concurrency + self.log_file = log_file self.progress_bar = progress_bar if blobxfer.util.is_not_empty(resume_file): self.resume_file = pathlib.Path(resume_file) diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index 61875fe..b9c592b 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -30,8 +30,6 @@ bytes, dict, int, list, object, range, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # stdlib imports -import datetime -import dateutil.tz import enum import logging try: @@ -44,13 +42,13 @@ import Queue as queue import threading # non-stdlib imports -import dateutil # local imports import blobxfer.models.crypto import blobxfer.operations.azure.blob import blobxfer.operations.azure.file import blobxfer.operations.crypto import blobxfer.operations.md5 +import blobxfer.operations.progress import blobxfer.util # create logger @@ -85,9 +83,12 @@ def __init__(self, general_options, creds, spec): self._download_set = set() self._download_start = None self._download_threads = [] - self._download_count = 0 - self._download_total_bytes = 0 + self._download_total = None + self._download_sofar = 0 + self._download_bytes_total = None + self._download_bytes_sofar = 0 self._download_terminate = False + self._start_time = None self._dd_map = {} self._general_options = general_options self._creds = creds @@ -155,6 +156,21 @@ def ensure_local_destination(creds, spec): # ensure destination path spec.destination.ensure_path_exists() + def _update_progress_bar(self): + # type: (Downloader) -> None + """Update progress bar + :param Downloader self: this + """ + blobxfer.operations.progress.update_progress_bar( + self._general_options, + 'download', + self._start_time, + self._download_total, + self._download_sofar, + self._download_bytes_total, + self._download_bytes_sofar, + ) + def _check_download_conditions(self, lpath, rfile): # type: (Downloader, pathlib.Path, # blobxfer.models.azure.StorageEntity) -> DownloadAction @@ -192,8 +208,8 @@ def _check_download_conditions(self, lpath, rfile): # check skip on lmt ge dl_lmt = None if self._spec.skip_on.lmt_ge: - mtime = datetime.datetime.fromtimestamp( - lpath.stat().st_mtime, tz=dateutil.tz.tzlocal()) + mtime = blobxfer.util.datetime_from_timestamp( + lpath.stat().st_mtime) if mtime >= rfile.lmt: dl_lmt = False else: @@ -308,8 +324,7 @@ def _add_to_download_queue(self, lpath, rfile): if self._download_start is None: with self._download_lock: if self._download_start is None: - self._download_start = datetime.datetime.now( - tz=dateutil.tz.tzlocal()) + self._download_start = blobxfer.util.datetime_now() def _initialize_download_threads(self): # type: (Downloader) -> None @@ -344,6 +359,8 @@ def _worker_thread_download(self): dd = self._download_queue.get(False, 1) except 
queue.Empty: continue + # update progress bar + self._update_progress_bar() # get download offsets offsets = dd.next_offsets() # check if all operations completed @@ -355,7 +372,7 @@ def _worker_thread_download(self): if dd.entity.is_encrypted: self._dd_map.pop(str(dd.final_path)) self._download_set.remove(dd.final_path) - self._download_count += 1 + self._download_sofar += 1 continue # re-enqueue for other threads to download self._download_queue.put(dd) @@ -370,7 +387,7 @@ def _worker_thread_download(self): dd.entity, offsets, self._general_options.timeout_sec) # accounting with self._download_lock: - self._download_total_bytes += offsets.num_bytes + self._download_bytes_sofar += offsets.num_bytes # decrypt if necessary if dd.entity.is_encrypted: # slice data to proper bounds @@ -440,14 +457,16 @@ def _cleanup_temporary_files(self): def _run(self): # type: (Downloader) -> None - """Execute Downloader""" - start_time = datetime.datetime.now(tz=dateutil.tz.tzlocal()) - logger.info('script start time: {0}'.format(start_time)) + """Execute Downloader + :param Downloader self: this + """ # ensure destination path blobxfer.operations.download.Downloader.ensure_local_destination( self._creds, self._spec) logger.info('downloading blobs/files to local path: {}'.format( self._spec.destination.path)) + # TODO catalog all local files if delete extraneous enabled + # initialize MD5 processes self._md5_offload = blobxfer.operations.md5.LocalFileMd5Offload( num_workers=self._general_options.concurrency.md5_processes) @@ -461,19 +480,22 @@ def _run(self): self._check_for_crypto_done) # initialize download threads self._initialize_download_threads() - # iterate through source paths to download + # initialize local counters nfiles = 0 - empty_files = 0 - skipped_files = 0 total_size = 0 + skipped_files = 0 skipped_size = 0 + # mark start + self._start_time = blobxfer.util.datetime_now() + logger.info('download start time: {0}'.format(self._start_time)) + # display progress bar if specified + self._update_progress_bar() + # iterate through source paths to download for src in self._spec.sources: for rfile in src.files( self._creds, self._spec.options, self._general_options): nfiles += 1 total_size += rfile.size - if rfile.size == 0: - empty_files += 1 # form local path for remote file lpath = pathlib.Path(self._spec.destination.path, rfile.name) # check on download conditions @@ -490,44 +512,60 @@ def _run(self): self._pre_md5_skip_on_check(lpath, rfile) elif action == DownloadAction.Download: self._add_to_download_queue(lpath, rfile) - download_files = nfiles - skipped_files - download_size = total_size - skipped_size - download_size_mib = download_size / 1048576 - # clean up processes and threads + self._download_total = nfiles - skipped_files + self._download_bytes_total = total_size - skipped_size + download_size_mib = self._download_bytes_total / blobxfer.util.MEGABYTE + # set remote files processed with self._md5_meta_lock: self._all_remote_files_processed = True logger.debug( ('{0} remote files processed, waiting for download completion ' 'of {1:.4f} MiB').format(nfiles, download_size_mib)) + del nfiles + del total_size + del skipped_files + del skipped_size + # TODO delete all remaining local files not accounted for if + # delete extraneous enabled + + # wait for downloads to complete self._wait_for_download_threads(terminate=False) - end_time = datetime.datetime.now(tz=dateutil.tz.tzlocal()) - if (self._download_count != download_files or - self._download_total_bytes != download_size): + # update 
progress bar + self._update_progress_bar() + end_time = blobxfer.util.datetime_now() + if (self._download_sofar != self._download_total or + self._download_bytes_sofar != self._download_bytes_total): raise RuntimeError( 'download mismatch: [count={}/{} bytes={}/{}]'.format( - self._download_count, download_files, - self._download_total_bytes, download_size)) + self._download_sofar, self._download_total, + self._download_bytes_sofar, self._download_bytes_total)) if self._download_start is not None: dltime = (end_time - self._download_start).total_seconds() logger.info( ('elapsed download + verify time and throughput: {0:.3f} sec, ' '{1:.4f} Mbps').format( dltime, download_size_mib * 8 / dltime)) - logger.info('script end time: {0} (elapsed: {1:.3f} sec)'.format( - end_time, (end_time - start_time).total_seconds())) + logger.info('download end time: {0} (elapsed: {1:.3f} sec)'.format( + end_time, (end_time - self._start_time).total_seconds())) def start(self): # type: (Downloader) -> None - """Start the Downloader""" + """Start the Downloader + :param Downloader self: this + """ try: + blobxfer.operations.progress.output_download_parameters( + self._general_options, self._spec) self._run() except (KeyboardInterrupt, Exception) as ex: if isinstance(ex, KeyboardInterrupt): logger.error( 'KeyboardInterrupt detected, force terminating ' 'processes and threads (this may take a while)...') - self._wait_for_download_threads(terminate=True) - self._cleanup_temporary_files() + try: + self._wait_for_download_threads(terminate=True) + finally: + self._cleanup_temporary_files() raise finally: # TODO close resume file diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py new file mode 100644 index 0000000..25539a3 --- /dev/null +++ b/blobxfer/operations/progress.py @@ -0,0 +1,141 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import logging +import os +import platform +import sys +# non-stdlib imports +import azure.storage +# local imports +import blobxfer.util +import blobxfer.version + +# create logger +logger = logging.getLogger(__name__) + + +def update_progress_bar( + go, optext, start, total_files, files_sofar, total_bytes, + bytes_sofar): + # type: (blobxfer.options.General, str, datetime.datetime, int, int, int, + # int) -> None + """Update the progress bar + :param blobxfer.options.General go: general options + :param str optext: operation prefix text + :param datetime.datetime start: start time + :param int total_files: total number of files + :param int files_sofar: files transfered so far + :param int total_bytes: total number of bytes + :param int bytes_sofar: bytes transferred so far + """ + if not go.progress_bar or blobxfer.util.is_none_or_empty(go.log_file): + return + diff = (blobxfer.util.datetime_now() - start).total_seconds() + if diff <= 0: + # arbitrarily give a small delta + diff = 1e-9 + if total_bytes is None: + done = 0 + else: + done = float(bytes_sofar) / total_bytes + rate = bytes_sofar / blobxfer.util.MEGABYTE / diff + if optext == 'synccopy': + rtext = 'sync-copied' + else: + rtext = optext + 'ed' + if total_files is None: + fprog = 'n/a' + else: + fprog = '{}/{}'.format(files_sofar, total_files) + sys.stdout.write( + ('\r{0} progress: [{1:30s}] {2:.2f}% {3:12.3f} MiB/sec, ' + '{4} {5}').format( + optext, '>' * int(done * 30), done * 100, rate, fprog, rtext) + ) + if files_sofar == total_files: + sys.stdout.write(os.linesep) + sys.stdout.flush() + + +def output_download_parameters(general_options, spec): + # type: (Downloader) -> None + """Output configuration block + :param Downloader downloader: this + """ + log = [] + log.append('===========================') + log.append(' azure blobxfer parameters') + log.append('===========================') + log.append(' blobxfer version: {}'.format( + blobxfer.version.__version__)) + log.append(' platform: {}'.format(platform.platform())) + log.append(' python: {} {} az.stor={}'.format( + platform.python_implementation(), platform.python_version(), + azure.storage._constants.__version__)) + log.append(' transfer direction: {}'.format('local->Azure')) + log.append(' workers: xfer={} md5={} crypto={}'.format( + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes, + general_options.concurrency.crypto_processes)) + log.append(' timeout: {}'.format( + general_options.timeout_sec)) + log.append(' resume file: {}'.format( + general_options.resume_file)) + log.append(' skip on: fs_match={} lmt_ge={} md5={}'.format( + spec.skip_on.filesize_match, + spec.skip_on.lmt_ge, + spec.skip_on.md5_match)) + log.append(' mode: {}'.format( + spec.options.mode)) + log.append(' compute file md5: {}'.format( + spec.options.check_file_md5)) + log.append(' chunk size (bytes): {}'.format( + spec.options.chunk_size_bytes)) + log.append(' delete extraneous: {}'.format( + spec.options.delete_extraneous_destination)) + log.append(' overwrite: {}'.format( + spec.options.overwrite)) + log.append(' recursive: {}'.format( + spec.options.recursive)) + log.append(' file attributes: {}'.format( + spec.options.restore_file_attributes)) + log.append(' rsa private 
key: {}'.format( + 'Loaded' if spec.options.rsa_private_key else 'None')) + log.append(' local destination: {}'.format( + spec.destination.path)) + log.append('===========================') + log = os.linesep.join(log) + if blobxfer.util.is_not_empty(general_options.log_file): + print(log) + else: + logger.info('{}{}'.format(os.linesep, log)) diff --git a/blobxfer/util.py b/blobxfer/util.py index ec85fe5..82c20a7 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -31,7 +31,7 @@ # stdlib imports import base64 import copy -import dateutil +import datetime import hashlib import logging import logging.handlers @@ -42,10 +42,14 @@ from scandir import scandir as scandir import re # non-stdlib imports +import dateutil +import dateutil.tz import future.utils # local imports # global defines +MEGABYTE = 1048576 +_REGISTERED_LOGGER_HANDLERS = [] _PAGEBLOB_BOUNDARY = 512 @@ -58,16 +62,34 @@ def on_python2(): return future.utils.PY2 -def setup_logger(logger): # noqa - # type: (logger) -> None +def setup_logger(logger, logfile): # noqa + # type: (logger, str) -> None """Set up logger""" + global _REGISTERED_LOGGER_HANDLERS logger.setLevel(logging.DEBUG) - handler = logging.StreamHandler() + if is_none_or_empty(logfile): + handler = logging.StreamHandler() + else: + handler = logging.FileHandler(logfile, encoding='utf-8') + logging.getLogger().addHandler(handler) + formatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + formatter.default_msec_format = '%s.%03d' + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.propagate = False + _REGISTERED_LOGGER_HANDLERS.append(handler) + + +def set_verbose_logger_handlers(): # noqa + # type: (None) -> None + """Set logger handler formatters to more detail""" + global _REGISTERED_LOGGER_HANDLERS formatter = logging.Formatter( '%(asctime)s %(levelname)s %(name)s:%(funcName)s:%(lineno)d ' '%(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) + formatter.default_msec_format = '%s.%03d' + for handler in _REGISTERED_LOGGER_HANDLERS: + handler.setFormatter(formatter) def is_none_or_empty(obj): @@ -77,9 +99,7 @@ def is_none_or_empty(obj): :rtype: bool :return: if object is None or empty """ - if obj is None or len(obj) == 0: - return True - return False + return obj is None or len(obj) == 0 def is_not_empty(obj): @@ -89,9 +109,7 @@ def is_not_empty(obj): :rtype: bool :return: if object is not None and length is > 0 """ - if obj is not None and len(obj) > 0: - return True - return False + return obj is not None and len(obj) > 0 def merge_dict(dict1, dict2): @@ -116,6 +134,28 @@ def merge_dict(dict1, dict2): return result +def datetime_now(): + # type: (None) -> datetime.datetime + """Return a timezone-aware datetime instance with local offset + :rtype: datetime.datetime + :return: datetime now with local tz + """ + return datetime.datetime.now(tz=dateutil.tz.tzlocal()) + + +def datetime_from_timestamp(ts, tz=None): + # type: (int, dateutil.tz) -> datetime.datetime + """Convert a timestamp into datetime with offset + :param int ts: timestamp + :param dateutil.tz tz: time zone or local tz if not specified + :rtype: datetime.datetime + :return: converted timestamp to datetime + """ + if tz is None: + tz = dateutil.tz.tzlocal() + return datetime.datetime.fromtimestamp(ts, tz=tz) + + def scantree(path): # type: (str) -> os.DirEntry """Recursively scan a directory tree diff --git a/cli/cli.py b/cli/cli.py index 03fb231..744ecdc 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -45,7 +45,6 @@ # create logger logger = 
logging.getLogger('blobxfer') -blobxfer.util.setup_logger(logger) # global defines _CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) @@ -96,7 +95,11 @@ def _init_config(self): self._read_yaml_file(self.yaml_config) # merge cli options with config settings.merge_settings(self.config, self.cli_options) + # set log file if specified + blobxfer.util.setup_logger(logger, self.config['options']['log_file']) + # output config if self.config['options']['verbose']: + blobxfer.util.set_verbose_logger_handlers() logger.debug('config: \n' + json.dumps(self.config, indent=4)) # free mem del self.yaml_config @@ -121,6 +124,19 @@ def callback(ctx, param, value): callback=callback)(f) +def _log_file_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['log_file'] = value + return value + return click.option( + '--log-file', + expose_value=False, + default=None, + help='Log to file specified', + callback=callback)(f) + + def _md5_processes_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -144,7 +160,7 @@ def callback(ctx, param, value): '--progress-bar/--no-progress-bar', expose_value=False, default=True, - help='Display progress bar', + help='Display progress bar instead of console logs', callback=callback)(f) @@ -208,6 +224,7 @@ def common_options(f): f = _resume_file_option(f) f = _progress_bar_option(f) f = _md5_processes_option(f) + f = _log_file_option(f) f = _crypto_processes_option(f) return f diff --git a/cli/settings.py b/cli/settings.py index 4faadd9..d464056 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -209,6 +209,7 @@ def merge_settings(config, cli_options): if 'options' not in config: config['options'] = {} config['options']['crypto_processes'] = cli_options['crypto_processes'] + config['options']['log_file'] = cli_options['log_file'] config['options']['md5_processes'] = cli_options['md5_processes'] config['options']['progress_bar'] = cli_options['progress_bar'] config['options']['resume_file'] = cli_options['resume_file'] @@ -245,6 +246,7 @@ def create_general_options(config): md5_processes=config['options']['md5_processes'], transfer_threads=config['options']['transfer_threads'], ), + log_file=config['options']['log_file'], progress_bar=config['options']['progress_bar'], resume_file=config['options']['resume_file'], timeout_sec=config['options']['timeout_sec'], diff --git a/tests/test_blobxfer_models_options.py b/tests/test_blobxfer_models_options.py index 1e7cb8b..e73f3e3 100644 --- a/tests/test_blobxfer_models_options.py +++ b/tests/test_blobxfer_models_options.py @@ -44,6 +44,7 @@ def test_general_options(): md5_processes=2, transfer_threads=3, ), + log_file='abc.log', progress_bar=False, resume_file='abc', timeout_sec=1, @@ -53,6 +54,7 @@ def test_general_options(): assert a.concurrency.crypto_processes == 1 assert a.concurrency.md5_processes == 2 assert a.concurrency.transfer_threads == 3 + assert a.log_file == 'abc.log' assert not a.progress_bar assert a.resume_file == pathlib.Path('abc') assert a.timeout_sec == 1 @@ -73,6 +75,7 @@ def test_general_options(): assert a.concurrency.crypto_processes == 1 assert a.concurrency.md5_processes == 2 assert a.concurrency.transfer_threads == 3 + assert a.log_file is None assert not a.progress_bar assert a.resume_file is None assert a.timeout_sec == 1 diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py index 4e05182..ee0166f 100644 --- a/tests/test_blobxfer_operations_download.py +++ 
b/tests/test_blobxfer_operations_download.py @@ -453,7 +453,7 @@ def test_worker_thread_download( assert d._complete_chunk_download.call_count == 0 assert str(lp) not in d._dd_map assert dd.finalize_file.call_count == 1 - assert d._download_count == 1 + assert d._download_sofar == 1 with mock.patch( 'blobxfer.operations.download.Downloader.termination_check', diff --git a/tests/test_blobxfer_operations_progress.py b/tests/test_blobxfer_operations_progress.py new file mode 100644 index 0000000..7cb0776 --- /dev/null +++ b/tests/test_blobxfer_operations_progress.py @@ -0,0 +1,38 @@ +# coding=utf-8 +"""Tests for progress operations""" + +# stdlib imports +import mock +# non-stdlib imports +# local imports +import blobxfer.util as util +# module under test +import blobxfer.operations.progress as ops + + +def test_output_download_parameters(): + go = mock.MagicMock() + spec = mock.MagicMock() + go.log_file = 'abc' + + ops.output_download_parameters(go, spec) + + assert util.is_not_empty(go.log_file) + + +def test_update_progress_bar(): + go = mock.MagicMock() + go.progress_bar = True + go.log_file = 'abc' + + start = util.datetime_now() + + ops.update_progress_bar( + go, 'download', start, None, 1, None, 1) + + with mock.patch('blobxfer.util.datetime_now') as patched_dt: + patched_dt.return_value = start + ops.update_progress_bar( + go, 'synccopy', start, 1, 1, 1, 1) + + assert util.is_not_empty(go.log_file) From fbdd1b1f122e4863e6aff96abd1cb6f0248f4fbf Mon Sep 17 00:00:00 2001 From: Fred Park Date: Mon, 10 Apr 2017 14:40:03 -0700 Subject: [PATCH 19/47] More download features - Add delete support on download - Add recursive support on download - Add common requests session for connection pooling support with matched transfer thread count --- blobxfer/models/options.py | 8 +- blobxfer/operations/azure/__init__.py | 35 ++++++-- blobxfer/operations/azure/blob/__init__.py | 11 ++- blobxfer/operations/azure/blob/append.py | 6 +- blobxfer/operations/azure/blob/block.py | 6 +- blobxfer/operations/azure/blob/page.py | 6 +- blobxfer/operations/azure/file.py | 14 ++-- blobxfer/operations/crypto.py | 2 +- blobxfer/operations/download.py | 81 +++++++++++++------ blobxfer/operations/md5.py | 3 +- blobxfer/operations/progress.py | 15 ++-- cli/cli.py | 6 +- cli/settings.py | 14 +++- setup.py | 1 + tests/test_blobxfer_models_options.py | 6 +- tests/test_blobxfer_operations_azure.py | 14 ++-- tests/test_blobxfer_operations_azure_blob.py | 32 +++++--- ...t_blobxfer_operations_azure_blob_append.py | 4 +- ...st_blobxfer_operations_azure_blob_block.py | 4 +- ...est_blobxfer_operations_azure_blob_page.py | 4 +- tests/test_blobxfer_operations_azure_file.py | 10 +-- tests/test_blobxfer_operations_download.py | 37 +++++++++ 22 files changed, 222 insertions(+), 97 deletions(-) diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index c5f6da6..08ba42a 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -116,10 +116,10 @@ def __init__(self, crypto_processes, md5_processes, transfer_threads): if self.md5_processes < 1: self.md5_processes = 1 if self.transfer_threads is None or self.transfer_threads < 1: - self.transfer_threads = multiprocessing.cpu_count() * 3 - # cap maximum number of threads from cpu count to 24 - if self.transfer_threads > 24: - self.transfer_threads = 24 + self.transfer_threads = multiprocessing.cpu_count() * 4 + # cap maximum number of threads from cpu count to 96 + if self.transfer_threads > 96: + self.transfer_threads = 96 class General(object): diff --git 
a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index 20ddb50..cc33834 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -31,6 +31,7 @@ next, oct, open, pow, round, super, filter, map, zip) # stdlib imports # non-stdlib imports +import requests # local imports import blobxfer.models import blobxfer.operations.azure.blob.append @@ -41,10 +42,14 @@ class StorageCredentials(object): """Azure Storage Credentials""" - def __init__(self): - # type: (StorageCredentials) -> None - """Ctor for StorageCredentials""" + def __init__(self, general_options): + # type: (StorageCredentials, blobxfer.models.options.General) -> None + """Ctor for StorageCredentials + :param StorageCredentials self: this + :param blobxfer.models.options.General: general options + """ self._storage_accounts = {} + self._general_options = general_options def add_storage_account(self, name, key, endpoint): # type: (StorageCredentials, str, str, str) -> None @@ -57,7 +62,10 @@ def add_storage_account(self, name, key, endpoint): if name in self._storage_accounts: raise ValueError( '{} already exists in storage accounts'.format(name)) - self._storage_accounts[name] = StorageAccount(name, key, endpoint) + self._storage_accounts[name] = StorageAccount( + name, key, endpoint, + self._general_options.concurrency.transfer_threads + ) def get_storage_account(self, name): # type: (StorageCredentials, str) -> StorageAccount @@ -72,12 +80,13 @@ def get_storage_account(self, name): class StorageAccount(object): """Azure Storage Account""" - def __init__(self, name, key, endpoint): - # type: (StorageAccount, str, str, str) -> None + def __init__(self, name, key, endpoint, transfer_threads): + # type: (StorageAccount, str, str, str, int) -> None """Ctor for StorageAccount :param str name: name of storage account :param str key: storage key or sas :param str endpoint: endpoint + :param int transfer_threads: number of transfer threads """ self._append_blob_client = None self._block_blob_client = None @@ -90,6 +99,15 @@ def __init__(self, name, key, endpoint): # normalize sas keys if self.is_sas and self.key.startswith('?'): self.key = self.key[1:] + # create requests session for connection pooling + self.session = requests.Session() + self.session.mount( + 'https://', + requests.adapters.HTTPAdapter( + pool_connections=transfer_threads, + pool_maxsize=transfer_threads << 1, + ) + ) self._create_clients() @staticmethod @@ -241,7 +259,8 @@ def _populate_from_list_files(self, creds, options, general_options): cont, dir = blobxfer.util.explode_azure_path(rpath) sa = creds.get_storage_account(self.lookup_storage_account(rpath)) for file in blobxfer.operations.azure.file.list_files( - sa.file_client, cont, dir, general_options.timeout_sec): + sa.file_client, cont, dir, options.recursive, + general_options.timeout_sec): if blobxfer.models.crypto.EncryptionMetadata.\ encryption_metadata_exists(file.metadata): ed = blobxfer.models.crypto.EncryptionMetadata() @@ -271,7 +290,7 @@ def _populate_from_list_blobs(self, creds, options, general_options): sa = creds.get_storage_account(self.lookup_storage_account(rpath)) for blob in blobxfer.operations.azure.blob.list_blobs( sa.block_blob_client, cont, dir, options.mode, - general_options.timeout_sec): + options.recursive, general_options.timeout_sec): if blobxfer.models.crypto.EncryptionMetadata.\ encryption_metadata_exists(blob.metadata): ed = blobxfer.models.crypto.EncryptionMetadata() diff --git 
a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py index e0cf878..0d49ed0 100644 --- a/blobxfer/operations/azure/blob/__init__.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -61,15 +61,16 @@ def check_if_single_blob(client, container, prefix, timeout=None): return True -def list_blobs(client, container, prefix, mode, timeout=None): - # type: (azure.storage.blob.BaseBlobService, str, str, int, - # blobxfer.models.azure.StorageModes) -> +def list_blobs(client, container, prefix, mode, recursive, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, str, + # blobxfer.models.azure.StorageModes, bool, int) -> # azure.storage.blob.models.Blob """List blobs in path conforming to mode :param azure.storage.blob.BaseBlobService client: blob client :param str container: container :param str prefix: path prefix :param blobxfer.models.azure.StorageModes mode: storage mode + :param bool recursive: recursive :param int timeout: timeout :rtype: azure.storage.blob.models.Blob :return: generator of blobs @@ -85,7 +86,7 @@ def list_blobs(client, container, prefix, mode, timeout=None): return blobs = client.list_blobs( container_name=container, - prefix=prefix, + prefix=prefix if blobxfer.util.is_not_empty(prefix) else None, include=azure.storage.blob.models.Include.METADATA, timeout=timeout, ) @@ -102,6 +103,8 @@ def list_blobs(client, container, prefix, mode, timeout=None): blob.properties.blob_type != azure.storage.blob.models._BlobTypes.PageBlob): continue + if not recursive and '/' in blob.name: + continue # auto or match, yield the blob yield blob diff --git a/blobxfer/operations/azure/blob/append.py b/blobxfer/operations/azure/blob/append.py index 910ab5d..087e33b 100644 --- a/blobxfer/operations/azure/blob/append.py +++ b/blobxfer/operations/azure/blob/append.py @@ -51,12 +51,14 @@ def create_client(storage_account): client = azure.storage.blob.AppendBlobService( account_name=storage_account.name, sas_token=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) else: client = azure.storage.blob.AppendBlobService( account_name=storage_account.name, account_key=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) # set retry policy client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client diff --git a/blobxfer/operations/azure/blob/block.py b/blobxfer/operations/azure/blob/block.py index b6fd673..00f7eb3 100644 --- a/blobxfer/operations/azure/blob/block.py +++ b/blobxfer/operations/azure/blob/block.py @@ -51,12 +51,14 @@ def create_client(storage_account): client = azure.storage.blob.BlockBlobService( account_name=storage_account.name, sas_token=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) else: client = azure.storage.blob.BlockBlobService( account_name=storage_account.name, account_key=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) # set retry policy client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client diff --git a/blobxfer/operations/azure/blob/page.py b/blobxfer/operations/azure/blob/page.py index 6aedc8f..05d36b6 100644 --- a/blobxfer/operations/azure/blob/page.py +++ 
b/blobxfer/operations/azure/blob/page.py @@ -51,12 +51,14 @@ def create_client(storage_account): client = azure.storage.blob.PageBlobService( account_name=storage_account.name, sas_token=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) else: client = azure.storage.blob.PageBlobService( account_name=storage_account.name, account_key=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) # set retry policy client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client diff --git a/blobxfer/operations/azure/file.py b/blobxfer/operations/azure/file.py index 1b17f94..43e89ca 100644 --- a/blobxfer/operations/azure/file.py +++ b/blobxfer/operations/azure/file.py @@ -57,12 +57,14 @@ def create_client(storage_account): client = azure.storage.file.FileService( account_name=storage_account.name, sas_token=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) else: client = azure.storage.file.FileService( account_name=storage_account.name, account_key=storage_account.key, - endpoint_suffix=storage_account.endpoint) + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) # set retry policy client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client @@ -114,13 +116,14 @@ def check_if_single_file(client, fileshare, prefix, timeout=None): return (True, file) -def list_files(client, fileshare, prefix, timeout=None): - # type: (azure.storage.file.FileService, str, str, int) -> +def list_files(client, fileshare, prefix, recursive, timeout=None): + # type: (azure.storage.file.FileService, str, str, bool, int) -> # azure.storage.file.models.File """List files in path :param azure.storage.file.FileService client: file client :param str fileshare: file share :param str prefix: path prefix + :param bool recursive: recursive :param int timeout: timeout :rtype: azure.storage.file.models.File :return: generator of files @@ -151,7 +154,8 @@ def list_files(client, fileshare, prefix, timeout=None): ) yield fsprop else: - dirs.append(fspath) + if recursive: + dirs.append(fspath) def get_file_range(ase, offsets, timeout=None): diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index 98945d3..58f65d8 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -241,7 +241,7 @@ def _worker_process(self): """ while not self.terminated: try: - inst = self._task_queue.get(True, 1) + inst = self._task_queue.get(True, 0.25) except queue.Empty: continue if inst[0] == CryptoAction.Encrypt: diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index b9c592b..78baa34 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -81,7 +81,7 @@ def __init__(self, general_options, creds, spec): self._download_lock = threading.Lock() self._download_queue = queue.Queue() self._download_set = set() - self._download_start = None + self._download_start_time = None self._download_threads = [] self._download_total = None self._download_sofar = 0 @@ -89,6 +89,7 @@ def __init__(self, general_options, creds, spec): self._download_bytes_sofar = 0 self._download_terminate = False self._start_time = None + self._delete_after = set() self._dd_map = {} self._general_options = general_options 
self._creds = creds @@ -164,7 +165,7 @@ def _update_progress_bar(self): blobxfer.operations.progress.update_progress_bar( self._general_options, 'download', - self._start_time, + self._download_start_time, self._download_total, self._download_sofar, self._download_bytes_total, @@ -321,10 +322,10 @@ def _add_to_download_queue(self, lpath, rfile): self._dd_map[str(dd.final_path)] = dd # add download descriptor to queue self._download_queue.put(dd) - if self._download_start is None: + if self._download_start_time is None: with self._download_lock: - if self._download_start is None: - self._download_start = blobxfer.util.datetime_now() + if self._download_start_time is None: + self._download_start_time = blobxfer.util.datetime_now() def _initialize_download_threads(self): # type: (Downloader) -> None @@ -356,7 +357,7 @@ def _worker_thread_download(self): """ while not self.termination_check: try: - dd = self._download_queue.get(False, 1) + dd = self._download_queue.get(False, 0.25) except queue.Empty: continue # update progress bar @@ -455,23 +456,52 @@ def _cleanup_temporary_files(self): except Exception as e: logger.exception(e) + def _catalog_local_files_for_deletion(self): + # type: (Downloader) -> None + """Catalog all local files if delete extraneous enabled + :param Downloader self: this + """ + if not (self._spec.options.delete_extraneous_destination and + self._spec.destination.is_dir): + return + dst = str(self._spec.destination.path) + for file in blobxfer.util.scantree(dst): + self._delete_after.add(pathlib.Path(file.path)) + + def _delete_extraneous_files(self): + # type: (Downloader) -> None + """Delete extraneous files cataloged + :param Downloader self: this + """ + logger.info('attempting to delete {} extraneous files'.format( + len(self._delete_after))) + for file in self._delete_after: + try: + file.unlink() + except OSError: + pass + def _run(self): # type: (Downloader) -> None """Execute Downloader :param Downloader self: this """ + # mark start + self._start_time = blobxfer.util.datetime_now() + logger.info('blobxfer start time: {0}'.format(self._start_time)) # ensure destination path blobxfer.operations.download.Downloader.ensure_local_destination( self._creds, self._spec) logger.info('downloading blobs/files to local path: {}'.format( self._spec.destination.path)) - # TODO catalog all local files if delete extraneous enabled - + self._catalog_local_files_for_deletion() # initialize MD5 processes - self._md5_offload = blobxfer.operations.md5.LocalFileMd5Offload( - num_workers=self._general_options.concurrency.md5_processes) - self._md5_offload.initialize_check_thread( - self._check_for_downloads_from_md5) + if (self._spec.options.check_file_md5 and + self._general_options.concurrency.md5_processes > 0): + self._md5_offload = blobxfer.operations.md5.LocalFileMd5Offload( + num_workers=self._general_options.concurrency.md5_processes) + self._md5_offload.initialize_check_thread( + self._check_for_downloads_from_md5) # initialize crypto processes if self._general_options.concurrency.crypto_processes > 0: self._crypto_offload = blobxfer.operations.crypto.CryptoOffload( @@ -485,11 +515,6 @@ def _run(self): total_size = 0 skipped_files = 0 skipped_size = 0 - # mark start - self._start_time = blobxfer.util.datetime_now() - logger.info('download start time: {0}'.format(self._start_time)) - # display progress bar if specified - self._update_progress_bar() # iterate through source paths to download for src in self._spec.sources: for rfile in src.files( @@ -498,6 +523,11 @@ def 
_run(self): total_size += rfile.size # form local path for remote file lpath = pathlib.Path(self._spec.destination.path, rfile.name) + # remove from delete after set + try: + self._delete_after.remove(lpath) + except KeyError: + pass # check on download conditions action = self._check_download_conditions(lpath, rfile) if action == DownloadAction.Skip: @@ -525,27 +555,30 @@ def _run(self): del total_size del skipped_files del skipped_size - # TODO delete all remaining local files not accounted for if - # delete extraneous enabled - # wait for downloads to complete self._wait_for_download_threads(terminate=False) + end_time = blobxfer.util.datetime_now() # update progress bar self._update_progress_bar() - end_time = blobxfer.util.datetime_now() + # check for mismatches if (self._download_sofar != self._download_total or self._download_bytes_sofar != self._download_bytes_total): raise RuntimeError( 'download mismatch: [count={}/{} bytes={}/{}]'.format( self._download_sofar, self._download_total, self._download_bytes_sofar, self._download_bytes_total)) - if self._download_start is not None: - dltime = (end_time - self._download_start).total_seconds() + # delete all remaining local files not accounted for if + # delete extraneous enabled + self._delete_extraneous_files() + # output throughput + if self._download_start_time is not None: + dltime = (end_time - self._download_start_time).total_seconds() logger.info( ('elapsed download + verify time and throughput: {0:.3f} sec, ' '{1:.4f} Mbps').format( dltime, download_size_mib * 8 / dltime)) - logger.info('download end time: {0} (elapsed: {1:.3f} sec)'.format( + end_time = blobxfer.util.datetime_now() + logger.info('blobxfer end time: {0} (elapsed: {1:.3f} sec)'.format( end_time, (end_time - self._start_time).total_seconds())) def start(self): diff --git a/blobxfer/operations/md5.py b/blobxfer/operations/md5.py index d260c9e..dbd05fb 100644 --- a/blobxfer/operations/md5.py +++ b/blobxfer/operations/md5.py @@ -98,7 +98,8 @@ def _worker_process(self): """ while not self.terminated: try: - filename, remote_md5, pagealign = self._task_queue.get(True, 1) + filename, remote_md5, pagealign = self._task_queue.get( + True, 0.25) except queue.Empty: continue md5 = blobxfer.operations.md5.compute_md5_for_file_asbase64( diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index 25539a3..b5ec8e9 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -36,6 +36,7 @@ import sys # non-stdlib imports import azure.storage +import requests # local imports import blobxfer.util import blobxfer.version @@ -58,7 +59,8 @@ def update_progress_bar( :param int total_bytes: total number of bytes :param int bytes_sofar: bytes transferred so far """ - if not go.progress_bar or blobxfer.util.is_none_or_empty(go.log_file): + if (not go.progress_bar or blobxfer.util.is_none_or_empty(go.log_file) or + start is None): return diff = (blobxfer.util.datetime_now() - start).total_seconds() if diff <= 0: @@ -99,13 +101,16 @@ def output_download_parameters(general_options, spec): log.append(' blobxfer version: {}'.format( blobxfer.version.__version__)) log.append(' platform: {}'.format(platform.platform())) - log.append(' python: {} {} az.stor={}'.format( - platform.python_implementation(), platform.python_version(), - azure.storage._constants.__version__)) + log.append(' python: {} {} az.stor={} req={}'.format( + platform.python_implementation(), + platform.python_version(), + azure.storage._constants.__version__, + 
requests.__version__)) log.append(' transfer direction: {}'.format('local->Azure')) log.append(' workers: xfer={} md5={} crypto={}'.format( general_options.concurrency.transfer_threads, - general_options.concurrency.md5_processes, + general_options.concurrency.md5_processes + if spec.options.check_file_md5 else 0, general_options.concurrency.crypto_processes)) log.append(' timeout: {}'.format( general_options.timeout_sec)) diff --git a/cli/cli.py b/cli/cli.py index 744ecdc..d27efc8 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -65,9 +65,9 @@ def initialize(self): :param CliContext self: this """ self._init_config() - self.credentials = settings.create_azure_storage_credentials( - self.config) self.general_options = settings.create_general_options(self.config) + self.credentials = settings.create_azure_storage_credentials( + self.config, self.general_options) def _read_yaml_file(self, yaml_file): # type: (CliContext, pathlib.Path) -> None @@ -382,7 +382,7 @@ def callback(ctx, param, value): '--file-md5/--no-file-md5', expose_value=False, default=False, - help='Compute file MD5 [True]', + help='Compute file MD5 [False]', callback=callback)(f) diff --git a/cli/settings.py b/cli/settings.py index d464056..088a4f4 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -218,14 +218,16 @@ def merge_settings(config, cli_options): config['options']['verbose'] = cli_options['verbose'] -def create_azure_storage_credentials(config): - # type: (dict) -> blobxfer.operations.azure.StorageCredentials +def create_azure_storage_credentials(config, general_options): + # type: (dict, blobxfer.models.options.General) -> + # blobxfer.operations.azure.StorageCredentials """Create an Azure StorageCredentials object from configuration :param dict config: config dict + :param blobxfer.models.options.General: general options :rtype: blobxfer.operations.azure.StorageCredentials :return: credentials object """ - creds = blobxfer.operations.azure.StorageCredentials() + creds = blobxfer.operations.azure.StorageCredentials(general_options) endpoint = config['azure_storage']['endpoint'] for name in config['azure_storage']['accounts']: key = config['azure_storage']['accounts'][name] @@ -285,6 +287,12 @@ def create_download_specifications(config): rpk, rpkp) else: rpk = None + # ensure compatible options + if (not conf['options']['check_file_md5'] and + conf['options']['skip_on']['md5_match']): + raise ValueError( + 'Cannot specify skip on MD5 match without file MD5 enabled') + # create specification ds = blobxfer.models.download.Specification( download_options=blobxfer.models.options.Download( check_file_md5=conf['options']['check_file_md5'], diff --git a/setup.py b/setup.py index 74b57cf..5a2d6c6 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'cryptography>=1.8.1', 'future==0.16.0', 'python-dateutil==2.6.0', + 'requests==2.13.0', 'ruamel.yaml==0.14.5', ] diff --git a/tests/test_blobxfer_models_options.py b/tests/test_blobxfer_models_options.py index e73f3e3..4716f27 100644 --- a/tests/test_blobxfer_models_options.py +++ b/tests/test_blobxfer_models_options.py @@ -23,10 +23,10 @@ def test_concurrency_options(patched_cc): assert a.crypto_processes == 0 assert a.md5_processes == 1 - assert a.transfer_threads == 3 + assert a.transfer_threads == 4 -@mock.patch('multiprocessing.cpu_count', return_value=10) +@mock.patch('multiprocessing.cpu_count', return_value=64) def test_concurrency_options_max_transfer_threads(patched_cc): a = options.Concurrency( crypto_processes=1, @@ -34,7 +34,7 @@ def 
test_concurrency_options_max_transfer_threads(patched_cc): transfer_threads=None, ) - assert a.transfer_threads == 24 + assert a.transfer_threads == 96 def test_general_options(): diff --git a/tests/test_blobxfer_operations_azure.py b/tests/test_blobxfer_operations_azure.py index c90340a..bfe976d 100644 --- a/tests/test_blobxfer_operations_azure.py +++ b/tests/test_blobxfer_operations_azure.py @@ -14,7 +14,7 @@ def test_storage_credentials(): - creds = azops.StorageCredentials() + creds = azops.StorageCredentials(mock.MagicMock()) creds.add_storage_account('sa1', 'somekey1', 'endpoint') a = creds.get_storage_account('sa1') @@ -48,25 +48,25 @@ def test_storage_credentials(): def test_key_is_sas(): - a = azops.StorageAccount('name', 'abcdef', 'endpoint') + a = azops.StorageAccount('name', 'abcdef', 'endpoint', 10) assert not a.is_sas - a = azops.StorageAccount('name', 'abcdef&blah', 'endpoint') + a = azops.StorageAccount('name', 'abcdef&blah', 'endpoint', 10) assert not a.is_sas - a = azops.StorageAccount('name', '?abcdef', 'endpoint') + a = azops.StorageAccount('name', '?abcdef', 'endpoint', 10) assert a.is_sas a = azops.StorageAccount( - 'name', '?sv=0&sr=1&sig=2', 'endpoint') + 'name', '?sv=0&sr=1&sig=2', 'endpoint', 10) assert a.is_sas a = azops.StorageAccount( - 'name', 'sv=0&sr=1&sig=2', 'endpoint') + 'name', 'sv=0&sr=1&sig=2', 'endpoint', 10) assert a.is_sas a = azops.StorageAccount( - 'name', 'sig=0&sv=0&sr=1&se=2', 'endpoint') + 'name', 'sig=0&sv=0&sr=1&se=2', 'endpoint', 10) assert a.is_sas diff --git a/tests/test_blobxfer_operations_azure_blob.py b/tests/test_blobxfer_operations_azure_blob.py index d6ad180..3880d1a 100644 --- a/tests/test_blobxfer_operations_azure_blob.py +++ b/tests/test_blobxfer_operations_azure_blob.py @@ -36,44 +36,51 @@ def test_check_if_single_blob(): def test_list_blobs(): with pytest.raises(RuntimeError): for blob in ops.list_blobs( - None, 'cont', 'prefix', azmodels.StorageModes.File): + None, 'cont', 'prefix', azmodels.StorageModes.File, True): pass - _blob = azure.storage.blob.models.Blob(name='name') + _blob = azure.storage.blob.models.Blob(name='dir/name') _blob.properties = azure.storage.blob.models.BlobProperties() client = mock.MagicMock() client.list_blobs.return_value = [_blob] i = 0 for blob in ops.list_blobs( - client, 'cont', 'prefix', azmodels.StorageModes.Auto): + client, 'cont', 'prefix', azmodels.StorageModes.Auto, False): i += 1 - assert blob.name == 'name' + assert blob.name == _blob.name + assert i == 0 + + i = 0 + for blob in ops.list_blobs( + client, 'cont', 'prefix', azmodels.StorageModes.Auto, True): + i += 1 + assert blob.name == _blob.name assert i == 1 _blob.properties.blob_type = \ azure.storage.blob.models._BlobTypes.AppendBlob i = 0 for blob in ops.list_blobs( - client, 'dir', 'prefix', azmodels.StorageModes.Block): + client, 'dir', 'prefix', azmodels.StorageModes.Block, True): i += 1 - assert blob.name == 'name' + assert blob.name == _blob.name assert i == 0 i = 0 for blob in ops.list_blobs( - client, 'dir', 'prefix', azmodels.StorageModes.Page): + client, 'dir', 'prefix', azmodels.StorageModes.Page, True): i += 1 - assert blob.name == 'name' + assert blob.name == _blob.name assert i == 0 _blob.properties.blob_type = \ azure.storage.blob.models._BlobTypes.BlockBlob i = 0 for blob in ops.list_blobs( - client, 'dir', 'prefix', azmodels.StorageModes.Append): + client, 'dir', 'prefix', azmodels.StorageModes.Append, True): i += 1 - assert blob.name == 'name' + assert blob.name == _blob.name assert i == 0 _blob.snapshot = 
'2017-02-23T22:21:14.8121864Z' @@ -82,9 +89,10 @@ def test_list_blobs(): for blob in ops.list_blobs( client, 'cont', 'a?snapshot=2017-02-23T22:21:14.8121864Z', - azmodels.StorageModes.Auto): + azmodels.StorageModes.Auto, + True): i += 1 - assert blob.name == 'name' + assert blob.name == _blob.name assert blob.snapshot == _blob.snapshot assert i == 1 diff --git a/tests/test_blobxfer_operations_azure_blob_append.py b/tests/test_blobxfer_operations_azure_blob_append.py index 5553b7d..f6e8c23 100644 --- a/tests/test_blobxfer_operations_azure_blob_append.py +++ b/tests/test_blobxfer_operations_azure_blob_append.py @@ -11,7 +11,7 @@ def test_create_client(): - sa = azops.StorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.AppendBlobService) @@ -19,7 +19,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.AppendBlobService) diff --git a/tests/test_blobxfer_operations_azure_blob_block.py b/tests/test_blobxfer_operations_azure_blob_block.py index 4aece2d..2af2f6f 100644 --- a/tests/test_blobxfer_operations_azure_blob_block.py +++ b/tests/test_blobxfer_operations_azure_blob_block.py @@ -11,7 +11,7 @@ def test_create_client(): - sa = azops.StorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.BlockBlobService) @@ -19,7 +19,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.BlockBlobService) diff --git a/tests/test_blobxfer_operations_azure_blob_page.py b/tests/test_blobxfer_operations_azure_blob_page.py index f70e83d..f1b4d8c 100644 --- a/tests/test_blobxfer_operations_azure_blob_page.py +++ b/tests/test_blobxfer_operations_azure_blob_page.py @@ -11,7 +11,7 @@ def test_create_client(): - sa = azops.StorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.PageBlobService) @@ -19,7 +19,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.blob.PageBlobService) diff --git a/tests/test_blobxfer_operations_azure_file.py b/tests/test_blobxfer_operations_azure_file.py index c6bf764..cb6b04e 100644 --- a/tests/test_blobxfer_operations_azure_file.py +++ b/tests/test_blobxfer_operations_azure_file.py @@ -14,7 +14,7 @@ def test_create_client(): - sa = azops.StorageAccount('name', 'key', 'endpoint') + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert 
isinstance(client, azure.storage.file.FileService) @@ -22,7 +22,7 @@ def test_create_client(): client.authentication, azure.storage._auth._StorageSharedKeyAuthentication) - sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint') + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) client = ops.create_client(sa) assert client is not None assert isinstance(client, azure.storage.file.FileService) @@ -87,7 +87,7 @@ def test_list_files_single_file(): client.get_file_properties.return_value = 'fp' i = 0 - for file in ops.list_files(client, 'a', 'b/c'): + for file in ops.list_files(client, 'a', 'b/c', True): i += 1 assert file == 'fp' assert i == 1 @@ -104,7 +104,7 @@ def test_list_files_directory(patched_cisf): client.get_file_properties.return_value = _file i = 0 - for file in ops.list_files(client, 'dir', ''): + for file in ops.list_files(client, 'dir', '', True): i += 1 assert file.name == 'name' assert i == 1 @@ -117,7 +117,7 @@ def test_list_files_directory(patched_cisf): client.get_file_properties.side_effect = [_file] i = 0 - for file in ops.list_files(client, '', ''): + for file in ops.list_files(client, '', '', True): i += 1 assert file.name == _file.name assert type(file) == azure.storage.file.models.File diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py index ee0166f..ea2be05 100644 --- a/tests/test_blobxfer_operations_download.py +++ b/tests/test_blobxfer_operations_download.py @@ -591,6 +591,41 @@ def test_cleanup_temporary_files(tmpdir): assert dd.local_path.exists() +def test_catalog_local_files_for_deletion(tmpdir): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._spec.options.delete_extraneous_destination = False + + d._catalog_local_files_for_deletion() + assert len(d._delete_after) == 0 + + a = tmpdir.join('a') + a.write('abc') + d._spec.destination.path = tmpdir + d._spec.options.delete_extraneous_destination = True + d._spec.destination.is_dir = True + + d._catalog_local_files_for_deletion() + assert len(d._delete_after) == 1 + assert pathlib.Path(str(a)) in d._delete_after + + +def test_delete_extraneous_files(tmpdir): + a = tmpdir.join('a') + a.write('abc') + fp = pathlib.Path(str(a)) + + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._spec.options.delete_extraneous_destination = True + d._spec.destination.is_dir = True + d._delete_after.add(fp) + + d._delete_extraneous_files() + assert not fp.exists() + + # following should not throw exception + d._delete_extraneous_files() + + @mock.patch('time.clock') @mock.patch('blobxfer.operations.md5.LocalFileMd5Offload') @mock.patch('blobxfer.operations.azure.blob.list_blobs') @@ -605,6 +640,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d._initialize_download_threads = mock.MagicMock() patched_lfmo._check_thread = mock.MagicMock() d._general_options.concurrency.crypto_processes = 1 + d._general_options.concurrency.md5_processes = 1 d._spec.sources = [] d._spec.options = mock.MagicMock() d._spec.options.chunk_size_bytes = 1 @@ -616,6 +652,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): d._spec.skip_on.filesize_match = False d._spec.destination = mock.MagicMock() d._spec.destination.path = pathlib.Path(str(tmpdir)) + d._download_start_time = util.datetime_now() p = '/cont/remote/path' asp = azops.SourcePath() From 9ce5571e7fe25c8f9e8139f02ccd8b5b0606ea5b Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 18 Apr 2017 12:01:18 -0700 
Subject: [PATCH 20/47] Add resume support for downloads --- blobxfer/models/download.py | 362 ++++++++++++++++++++++++-------- blobxfer/models/resume.py | 171 +++++++++++++++ blobxfer/operations/crypto.py | 27 ++- blobxfer/operations/download.py | 98 +++++---- blobxfer/operations/resume.py | 149 +++++++++++++ 5 files changed, 662 insertions(+), 145 deletions(-) create mode 100644 blobxfer/models/resume.py create mode 100644 blobxfer/operations/resume.py diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index 150e07e..70ba7a7 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -42,8 +42,9 @@ import threading # non-stdlib imports # local imports -import blobxfer.models.options +import blobxfer.models.azure import blobxfer.models.crypto +import blobxfer.models.options import blobxfer.util # create logger @@ -172,42 +173,44 @@ class Descriptor(object): _AES_BLOCKSIZE = blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES - def __init__(self, lpath, ase, options): + def __init__(self, lpath, ase, options, resume_mgr): # type: (DownloadDescriptior, pathlib.Path, # blobxfer.models.azure.StorageEntity, - # blobxfer.models.options.Download) -> None + # blobxfer.models.options.Download, + # blobxfer.operations.resume.DownloadResumeManager) -> None """Ctor for Descriptor :param Descriptor self: this :param pathlib.Path lpath: local path :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity :param blobxfer.models.options.Download options: download options + :param blobxfer.operations.resume.DownloadResumeManager resume_mgr: + download resume manager """ + self._offset = 0 + self._chunk_num = 0 + self._next_integrity_chunk = 0 + self._unchecked_chunks = {} + self._allocated = False + self._finalized = False + self._meta_lock = threading.Lock() + self._hasher_lock = threading.Lock() + self._resume_mgr = resume_mgr + self._ase = ase + # set paths self.final_path = lpath # create path holding the temporary file to download to _tmp = list(lpath.parts[:-1]) _tmp.append(lpath.name + '.bxtmp') self.local_path = pathlib.Path(*_tmp) - self._meta_lock = threading.Lock() - self._hasher_lock = threading.Lock() - self._ase = ase + del _tmp # calculate the total number of ops required for transfer self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) - try: - self._total_chunks = int( - math.ceil(self._ase.size / self._chunk_size)) - except ZeroDivisionError: - self._total_chunks = 0 + self._total_chunks = self._compute_total_chunks(self._chunk_size) + self._outstanding_ops = self._total_chunks + # initialize integrity checkers self.hmac = None self.md5 = None - self._offset = 0 - self._chunk_num = 0 - self._next_integrity_chunk = 0 - self._unchecked_chunks = {} - self._outstanding_ops = self._total_chunks - self._completed_ops = 0 - # initialize checkers and allocate space self._initialize_integrity_checkers(options) - self._allocate_disk_space() @property def entity(self): @@ -241,14 +244,28 @@ def all_operations_completed(self): return (self._outstanding_ops == 0 and len(self._unchecked_chunks) == 0) - def dec_outstanding_operations(self): - # type: (Descriptor) -> None - """Decrement outstanding operations (and increment completed ops) + @property + def is_resumable(self): + # type: (Descriptor) -> bool + """Download is resume capable :param Descriptor self: this + :rtype: bool + :return: if resumable """ - with self._meta_lock: - self._outstanding_ops -= 1 - self._completed_ops += 1 + return self._resume_mgr is not None and self.hmac is None + 
+ def _compute_total_chunks(self, chunk_size): + # type: (Descriptor, int) -> int + """Compute total number of chunks for entity + :param Descriptor self: this + :param int chunk_size: chunk size + :rtype: int + :return: num chunks + """ + try: + return int(math.ceil(self._ase.size / chunk_size)) + except ZeroDivisionError: + return 0 def _initialize_integrity_checkers(self, options): # type: (Descriptor, blobxfer.models.options.Download) -> None @@ -273,29 +290,145 @@ def _allocate_disk_space(self): :param Descriptor self: this :param int size: size """ - size = self._ase.size - # compute size - if size > 0: - if self._ase.is_encrypted: - # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs - allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ - self._AES_BLOCKSIZE + with self._meta_lock: + if self._allocated: + return + size = self._ase.size + # compute size + if size > 0: + if self._ase.is_encrypted: + # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs + allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ + self._AES_BLOCKSIZE + else: + allocatesize = size + if allocatesize < 0: + allocatesize = 0 else: - allocatesize = size - if allocatesize < 0: allocatesize = 0 - else: - allocatesize = 0 - # create parent path - self.local_path.parent.mkdir(mode=0o750, parents=True, exist_ok=True) - # allocate file - with self.local_path.open('wb') as fd: - if allocatesize > 0: - try: - os.posix_fallocate(fd.fileno(), 0, allocatesize) - except AttributeError: - fd.seek(allocatesize - 1) - fd.write(b'\0') + # check if path already exists and is of sufficient size + if (not self.local_path.exists() or + self.local_path.stat().st_size != allocatesize): + # create parent path + self.local_path.parent.mkdir( + mode=0o750, parents=True, exist_ok=True) + # allocate file + with self.local_path.open('wb') as fd: + if allocatesize > 0: + try: + os.posix_fallocate(fd.fileno(), 0, allocatesize) + except AttributeError: + fd.seek(allocatesize - 1) + fd.write(b'\0') + self._allocated = True + + def _resume(self): + # type: (Descriptor) -> int + """Resume a download, if possible + :param Descriptor self: this + :rtype: int or None + :return: verified download offset + """ + if self._resume_mgr is None or self._offset != 0: + return None + # check if path exists in resume db + rr = self._resume_mgr.get_record(str(self.final_path)) + if rr is None: + logger.debug('no resume record for {}'.format(self.final_path)) + return None + # ensure lengths are the same + if rr.length != self._ase.size: + logger.warning('resume length mismatch {} -> {}'.format( + rr.length, self._ase.size)) + return None + # calculate current chunk and offset + if rr.next_integrity_chunk == 0: + logger.debug('nothing to resume for {}'.format(self.final_path)) + return None + curr_chunk = rr.next_integrity_chunk + curr_offset = curr_chunk * rr.chunk_size + # set offsets if completed and the final path exists + if rr.completed and self.final_path.exists(): + logger.debug('{} download already completed'.format( + self.final_path)) + with self._meta_lock: + self._offset = self._ase.size + self._chunk_num = curr_chunk + self._chunk_size = rr.chunk_size + self._total_chunks = self._compute_total_chunks(rr.chunk_size) + self._next_integrity_chunk = rr.next_integrity_chunk + self._outstanding_ops = 0 + self._finalized = True + return self._ase.size + # encrypted files are not resumable due to hmac requirement + if self._ase.is_encrypted: + logger.debug('cannot resume encrypted entity {}/{}'.format( + self._ase.container, 
self._ase.name)) + return None + # check if intermediate (blobtmp) exists + if not self.local_path.exists(): + logger.warning('temporary download file {} does not exist'.format( + rr.temp_path)) + return None + if self.hmac is not None: + raise RuntimeError( + 'unexpected hmac object for entity {}/{}'.format( + self._ase.container, self._ase.name)) + # re-hash from 0 to offset if needed + if self.md5 is not None and curr_chunk > 0: + pagealign = ( + self._ase.mode == blobxfer.models.azure.StorageModes.Page + ) + _fd_offset = 0 + _end_offset = min( + (curr_chunk * rr.chunk_size, rr.length) + ) + logger.debug( + 'integrity checking existing file {} to offset {}'.format( + self.final_path, _end_offset)) + with self._hasher_lock: + with self.local_path.open('rb') as filedesc: + while _fd_offset < _end_offset: + _blocksize = blobxfer.util.MEGABYTE << 2 + if (_fd_offset + _blocksize) > _end_offset: + _blocksize = _end_offset - _fd_offset + buf = filedesc.read(_blocksize) + buflen = len(buf) + if pagealign and buflen < _blocksize: + aligned = blobxfer.\ + util.page_align_content_length(buflen) + if aligned != buflen: + buf = buf.ljust(aligned, b'\0') + self.md5.update(buf) + _fd_offset += _blocksize + del _fd_offset + del _end_offset + # compare hashes + hexdigest = self.md5.hexdigest() + if rr.md5hexdigest != hexdigest: + logger.warning( + 'MD5 mismatch resume={} computed={} for {}'.format( + rr.md5hexdigest, hexdigest, self.local_path)) + # reset hasher + self.md5 = blobxfer.util.new_md5_hasher() + return None + # set values from resume + with self._meta_lock: + self._offset = curr_offset + self._chunk_num = curr_chunk + self._chunk_size = rr.chunk_size + self._total_chunks = self._compute_total_chunks(rr.chunk_size) + self._next_integrity_chunk = rr.next_integrity_chunk + self._outstanding_ops = \ + self._total_chunks - self._next_integrity_chunk + logger.debug( + ('resuming file {} from byte={} chunk={} chunk_size={} ' + 'total_chunks={} next_integrity_chunk={} ' + 'outstanding_ops={}').format( + self.final_path, self._offset, self._chunk_num, + self._chunk_size, self._total_chunks, + self._next_integrity_chunk, self._outstanding_ops)) + return curr_offset def cleanup_all_temporary_files(self): # type: (Descriptor) -> None @@ -324,9 +457,12 @@ def next_offsets(self): :rtype: Offsets :return: download offsets """ + resume_bytes = self._resume() + if resume_bytes is None and not self._allocated: + self._allocate_disk_space() with self._meta_lock: if self._offset >= self._ase.size: - return None + return None, resume_bytes if self._offset + self._chunk_size > self._ase.size: chunk = self._ase.size - self._offset else: @@ -360,47 +496,62 @@ def next_offsets(self): range_start=range_start, range_end=range_end, unpad=unpad, - ) + ), resume_bytes + + def hmac_iv(self, iv): + # type: (Descriptor, bytes) -> None + """Send IV through hasher + :param Descriptor self: this + :param bytes iv: iv + """ + with self._hasher_lock: + self.hmac.update(iv) - def _postpone_integrity_check(self, offsets, data): + def write_unchecked_data(self, offsets, data): # type: (Descriptor, Offsets, bytes) -> None - """Postpone integrity check for chunk + """Write unchecked data to disk :param Descriptor self: this :param Offsets offsets: download offsets :param bytes data: data """ - if self.must_compute_md5: - with self.local_path.open('r+b') as fd: - fd.seek(offsets.fd_start, 0) - fd.write(data) - unchecked = UncheckedChunk( - data_len=len(data), - fd_start=offsets.fd_start, - file_path=self.local_path, - temp=False, - ) 
- else: - fname = None - with tempfile.NamedTemporaryFile(mode='wb', delete=False) as fd: - fname = fd.name - fd.write(data) - unchecked = UncheckedChunk( - data_len=len(data), - fd_start=0, - file_path=pathlib.Path(fname), - temp=True, - ) + with self.local_path.open('r+b') as fd: + fd.seek(offsets.fd_start, 0) + fd.write(data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=offsets.fd_start, + file_path=self.local_path, + temp=False, + ) with self._meta_lock: self._unchecked_chunks[offsets.chunk_num] = unchecked - def perform_chunked_integrity_check(self, offsets, data): + def write_unchecked_hmac_data(self, offsets, data): # type: (Descriptor, Offsets, bytes) -> None - """Hash data against stored MD5 hasher safely + """Write unchecked encrypted data to disk :param Descriptor self: this :param Offsets offsets: download offsets - :param bytes data: data + :param bytes data: hmac/encrypted data + """ + fname = None + with tempfile.NamedTemporaryFile(mode='wb', delete=False) as fd: + fname = fd.name + fd.write(data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=0, + file_path=pathlib.Path(fname), + temp=True, + ) + with self._meta_lock: + self._unchecked_chunks[offsets.chunk_num] = unchecked + return str(unchecked.file_path) + + def perform_chunked_integrity_check(self): + # type: (Descriptor) -> None + """Hash data against stored hasher safely + :param Descriptor self: this """ - self_check = False hasher = self.hmac or self.md5 # iterate from next chunk to be checked while True: @@ -410,26 +561,45 @@ def perform_chunked_integrity_check(self, offsets, data): # check if the next chunk is ready if chunk_num in self._unchecked_chunks: ucc = self._unchecked_chunks.pop(chunk_num) - elif chunk_num != offsets.chunk_num: + else: break - # prepare data for hashing - if ucc is None: - chunk = data - self_check = True - else: + # hash data and set next integrity chunk + md5hexdigest = None + if hasher is not None: with ucc.file_path.open('rb') as fd: - fd.seek(ucc.fd_start, 0) + if not ucc.temp: + fd.seek(ucc.fd_start, 0) chunk = fd.read(ucc.data_len) if ucc.temp: ucc.file_path.unlink() - # hash data and set next integrity chunk - with self._hasher_lock: - hasher.update(chunk) + with self._hasher_lock: + hasher.update(chunk) + if hasher == self.md5: + md5hexdigest = hasher.hexdigest() with self._meta_lock: + # update integrity counter and resume db self._next_integrity_chunk += 1 - # store data that hasn't been checked - if not self_check: - self._postpone_integrity_check(offsets, data) + if self.is_resumable: + self._resume_mgr.add_or_update_record( + self.final_path, self.local_path, self._ase.size, + self._chunk_size, self._next_integrity_chunk, False, + md5hexdigest, + ) + # decrement outstanding op counter + self._outstanding_ops -= 1 + + def _update_resume_for_completed(self): + # type: (Descriptor) -> None + """Update resume for completion + :param Descriptor self: this + """ + if not self.is_resumable: + return + with self._meta_lock: + self._resume_mgr.add_or_update_record( + self.final_path, self.local_path, self._ase.size, + self._chunk_size, self._next_integrity_chunk, True, None, + ) def write_data(self, offsets, data): # type: (Descriptor, Offsets, bytes) -> None @@ -438,15 +608,19 @@ def write_data(self, offsets, data): :param Offsets offsets: download offsets :param bytes data: data """ - with self.local_path.open('r+b') as fd: - fd.seek(offsets.fd_start, 0) - fd.write(data) + if len(data) > 0: + with self.local_path.open('r+b') as fd: + 
fd.seek(offsets.fd_start, 0) + fd.write(data) def finalize_file(self): # type: (Descriptor) -> None """Finalize file download :param Descriptor self: this """ + with self._meta_lock: + if self._finalized: + return # check final file integrity check = False msg = None @@ -491,4 +665,8 @@ def finalize_file(self): # TODO set file uid/gid and mode # move temp download file to final path - self.local_path.rename(self.final_path) + self.local_path.replace(self.final_path) + # update resume file + self._update_resume_for_completed() + with self._meta_lock: + self._finalized = True diff --git a/blobxfer/models/resume.py b/blobxfer/models/resume.py new file mode 100644 index 0000000..37a5acc --- /dev/null +++ b/blobxfer/models/resume.py @@ -0,0 +1,171 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +# non-stdlib imports +# local imports + + +class Download(object): + """Download resume object""" + def __init__( + self, final_path, temp_path, length, chunk_size, + next_integrity_chunk, completed, md5): + # type: (Download, str, str, int, int, int, str) -> None + """Ctor for Download + :param Download self: this + :param str final_path: final path + :param str temp_path: temporary path + :param int length: total bytes + :param int chunk_size: chunk size in bytes + :param int next_integrity_chunk: next integrity chunk + :param str md5: md5 hex digest + """ + self._final_path = final_path + self._temp_path = temp_path + self._length = length + self._chunk_size = chunk_size + self._next_integrity_chunk = next_integrity_chunk + self._completed = completed + self._md5hexdigest = md5 if md5 is not None else None + + @property + def final_path(self): + # type: (Download) -> str + """Final path + :param Download self: this + :rtype: str + :return: final path + """ + return self._final_path + + @property + def temp_path(self): + # type: (Download) -> str + """Temp path + :param Download self: this + :rtype: str + :return: temp path + """ + return self._temp_path + + @property + def length(self): + # type: (Download) -> int + """Content length + :param Download self: this + :rtype: int + :return: number of bytes + """ + return self._length + + @property + def chunk_size(self): + # type: (Download) -> int + """Chunk size + :param Download self: this + :rtype: int + :return: chunk size in bytes + """ + return self._chunk_size + + @property + def next_integrity_chunk(self): + # type: (Download) -> int + """Get Next integrity chunk + :param Download self: this + :rtype: int + :return: next integrity chunk + """ + return self._next_integrity_chunk + + @next_integrity_chunk.setter + def next_integrity_chunk(self, value): + # type: (Download) -> None + """Set Next integrity chunk + :param Download self: this + :param int value: next chunk num + """ + self._next_integrity_chunk = value + + @property + def completed(self): + # type: (Download) -> bool + """Get Completed + :param Download self: this + :rtype: bool + :return: if completed + """ + return self._completed + + @completed.setter + def completed(self, value): + # type: (Download) -> None + """Set Completed + :param Download self: this + :param bool value: completion value + """ + self._completed = value + + @property + def md5hexdigest(self): + # type: (Download) -> str + """Get md5 hex digest + :param Download self: this + :rtype: str + :return: md5 hex digest + """ + return self._md5hexdigest + + @md5hexdigest.setter + def md5hexdigest(self, value): + # type: (Download) -> None + """Set md5 hex digest value if value is not None + :param Download self: this + :param str value: md5 hex digest + """ + if value is None: + return + self._md5hexdigest = value + + def __repr__(self): + # type: (Download) -> str + """Return representation + :param Download self: this + :rtype: str + :return: representation string + """ + return ('Download').format( + self.final_path, self.temp_path, self.length, + self.chunk_size, self.next_integrity_chunk, + self.completed, self.md5hexdigest, + ) diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index 
58f65d8..f494d81 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -248,28 +248,37 @@ def _worker_process(self): # TODO on upload raise NotImplementedError() elif inst[0] == CryptoAction.Decrypt: - final_path, offsets, symkey, iv, encdata = \ - inst[1], inst[2], inst[3], inst[4], inst[5] + final_path, local_path, offsets, symkey, iv, hmac_datafile = \ + inst[1], inst[2], inst[3], inst[4], inst[5], inst[6] + # read encrypted data from disk + with open(hmac_datafile, 'rb') as fd: + encdata = fd.read() data = blobxfer.operations.crypto.aes_cbc_decrypt_data( symkey, iv, encdata, offsets.unpad) + # write decrypted data to disk + if len(data) > 0: + with open(local_path, 'r+b') as fd: + fd.seek(offsets.fd_start, 0) + fd.write(data) self._done_cv.acquire() - self._done_queue.put((final_path, offsets, data)) + self._done_queue.put(final_path) self._done_cv.notify() self._done_cv.release() def add_decrypt_chunk( - self, final_path, offsets, symkey, iv, encdata): - # type: (CryptoOffload, str, blobxfer.models.download.Offsets, - # bytes, bytes, bytes) -> None + self, final_path, local_path, offsets, symkey, iv, hmac_datafile): + # type: (CryptoOffload, str, str, blobxfer.models.download.Offsets, + # bytes, bytes, str) -> None """Add a chunk to decrypt :param CryptoOffload self: this :param str final_path: final path + :param str local_path: temp local path :param blobxfer.models.download.Offsets offsets: offsets :param bytes symkey: symmetric key :param bytes iv: initialization vector - :param bytes encdata: encrypted data + :param str hmac_datafile: encrypted data file """ self._task_queue.put( - (CryptoAction.Decrypt, final_path, offsets, symkey, iv, - encdata) + (CryptoAction.Decrypt, final_path, local_path, offsets, symkey, + iv, hmac_datafile) ) diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index 78baa34..bb63f3f 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -49,6 +49,7 @@ import blobxfer.operations.crypto import blobxfer.operations.md5 import blobxfer.operations.progress +import blobxfer.operations.resume import blobxfer.util # create logger @@ -94,6 +95,7 @@ def __init__(self, general_options, creds, spec): self._general_options = general_options self._creds = creds self._spec = spec + self._resume = None @property def termination_check(self): @@ -255,6 +257,8 @@ def _post_md5_skip_on_check(self, filename, md5_match): if md5_match: with self._download_lock: self._download_set.remove(lpath) + self._download_total -= 1 + self._download_bytes_total -= lpath.stat().st_size else: self._add_to_download_queue(lpath, rfile) @@ -302,9 +306,14 @@ def _check_for_crypto_done(self): break cv.release() if result is not None: - with self._download_lock: - dd = self._dd_map[result[0]] - self._complete_chunk_download(result[1], result[2], dd) + try: + with self._download_lock: + dd = self._dd_map[result] + dd.perform_chunked_integrity_check() + except KeyError: + # this can happen if all of the last integrity + # chunks are processed at once + pass def _add_to_download_queue(self, lpath, rfile): # type: (Downloader, pathlib.Path, @@ -316,7 +325,7 @@ def _add_to_download_queue(self, lpath, rfile): """ # prepare remote file for download dd = blobxfer.models.download.Descriptor( - lpath, rfile, self._spec.options) + lpath, rfile, self._spec.options, self._resume) if dd.entity.is_encrypted: with self._download_lock: self._dd_map[str(dd.final_path)] = dd @@ -363,7 +372,12 @@ def _worker_thread_download(self): # 
update progress bar self._update_progress_bar() # get download offsets - offsets = dd.next_offsets() + offsets, resume_bytes = dd.next_offsets() + # add resume bytes to counter + if resume_bytes is not None: + with self._download_lock: + self._download_bytes_sofar += resume_bytes + del resume_bytes # check if all operations completed if offsets is None and dd.all_operations_completed: # finalize file @@ -391,50 +405,43 @@ def _worker_thread_download(self): self._download_bytes_sofar += offsets.num_bytes # decrypt if necessary if dd.entity.is_encrypted: - # slice data to proper bounds - encdata = data[blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] - intdata = encdata - # get iv for chunk and compute hmac + # slice data to proper bounds and get iv for chunk if offsets.chunk_num == 0: + # set iv iv = dd.entity.encryption_metadata.content_encryption_iv - # integrity check for first chunk must include iv - intdata = iv + data + # set data to decrypt + encdata = data + # send iv through hmac + dd.hmac_iv(iv) else: + # set iv iv = data[:blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES] - # integrity check data - dd.perform_chunked_integrity_check(offsets, intdata) + # set data to decrypt + encdata = data[ + blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] + # write encdata to disk for hmac later + _hmac_datafile = dd.write_unchecked_hmac_data( + offsets, encdata) # decrypt data if self._crypto_offload is not None: self._crypto_offload.add_decrypt_chunk( - str(dd.final_path), offsets, + str(dd.final_path), str(dd.local_path), offsets, dd.entity.encryption_metadata.symmetric_key, - iv, encdata) - # data will be completed once retrieved from crypto queue + iv, _hmac_datafile) + # data will be integrity checked and written once + # retrieved from crypto queue continue else: data = blobxfer.operations.crypto.aes_cbc_decrypt_data( dd.entity.encryption_metadata.symmetric_key, iv, encdata, offsets.unpad) - elif dd.must_compute_md5: - # rolling compute md5 - dd.perform_chunked_integrity_check(offsets, data) - # complete chunk download - self._complete_chunk_download(offsets, data, dd) - - def _complete_chunk_download(self, offsets, data, dd): - # type: (Downloader, blobxfer.models.download.Offsets, bytes, - # blobxfer.models.download.Descriptor) -> None - """Complete chunk download - :param Downloader self: this - :param blobxfer.models.download.Offsets offsets: offsets - :param bytes data: data - :param blobxfer.models.download.Descriptor dd: download descriptor - """ - # write data to disk - dd.write_data(offsets, data) - # decrement outstanding operations - dd.dec_outstanding_operations() - # TODO pickle dd to resume file + dd.write_data(offsets, data) + else: + # write data to disk + dd.write_unchecked_data(offsets, data) + # integrity check data and write to disk (this is called + # regardless of md5/hmac enablement for resume purposes) + dd.perform_chunked_integrity_check() def _cleanup_temporary_files(self): # type: (Downloader) -> None @@ -442,12 +449,6 @@ def _cleanup_temporary_files(self): This function is not thread-safe. 
:param Downloader self: this """ - # do not clean up if resume file exists - if self._general_options.resume_file is not None: - logger.debug( - 'not cleaning up temporary files since resume file has ' - 'been specified') - return # iterate through dd map and cleanup files for key in self._dd_map: dd = self._dd_map[key] @@ -495,6 +496,10 @@ def _run(self): logger.info('downloading blobs/files to local path: {}'.format( self._spec.destination.path)) self._catalog_local_files_for_deletion() + # initialize resume db if specified + if self._general_options.resume_file is not None: + self._resume = blobxfer.operations.resume.DownloadResumeManager( + self._general_options.resume_file) # initialize MD5 processes if (self._spec.options.check_file_md5 and self._general_options.concurrency.md5_processes > 0): @@ -570,6 +575,9 @@ def _run(self): # delete all remaining local files not accounted for if # delete extraneous enabled self._delete_extraneous_files() + # delete resume file if we've gotten this far + if self._resume is not None: + self._resume.delete() # output throughput if self._download_start_time is not None: dltime = (end_time - self._download_start_time).total_seconds() @@ -592,7 +600,7 @@ def start(self): self._run() except (KeyboardInterrupt, Exception) as ex: if isinstance(ex, KeyboardInterrupt): - logger.error( + logger.info( 'KeyboardInterrupt detected, force terminating ' 'processes and threads (this may take a while)...') try: @@ -601,9 +609,11 @@ def start(self): self._cleanup_temporary_files() raise finally: - # TODO close resume file # shutdown processes if self._md5_offload is not None: self._md5_offload.finalize_processes() if self._crypto_offload is not None: self._crypto_offload.finalize_processes() + # close resume file + if self._resume is not None: + self._resume.close() diff --git a/blobxfer/operations/resume.py b/blobxfer/operations/resume.py new file mode 100644 index 0000000..0f76562 --- /dev/null +++ b/blobxfer/operations/resume.py @@ -0,0 +1,149 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import contextlib +import logging +import pickle +import shelve +import threading +# non-stdlib imports +# local imports +import blobxfer.models.resume +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +class DownloadResumeManager(): + """Download Resume Manager""" + def __init__(self, resume_file): + # type: (DownloadResumeManager, str) -> None + """Ctor for DownloadResumeManager + :param DownloadResumeManager self: this + :param pathlib.Path resume_file: resume file + """ + self._lock = threading.Lock() + self._resume_file = resume_file + self._data = shelve.open( + str(resume_file), protocol=pickle.HIGHEST_PROTOCOL) + + def close(self): + # type: (DownloadResumeManager) -> None + """Close the internal data store + :param DownloadResumeManager self: this + """ + if self._data is not None: + self._data.close() + self._data = None + + def delete(self): + # type: (DownloadResumeManager) -> None + """Delete the resume file db + :param DownloadResumeManager self: this + """ + self.close() + try: + self._resume_file.unlink() + except OSError as e: + logger.warning('could not unlink resume db: {}'.format(e)) + + @contextlib.contextmanager + def datalock(self, acquire=True): + # type: (DownloadResumeManager) -> None + """Delete the resume file db + :param DownloadResumeManager self: this + :param bool acquire: acquire lock + """ + if acquire: + self._lock.acquire() + try: + yield + finally: + if acquire: + self._lock.release() + + def get_record(self, final_path, lock=True): + # type: (DownloadResumeManager, str, + # bool) -> blobxfer.models.resume.Download + """Get a resume record + :param DownloadResumeManager self: this + :param str final_path: final path + :param bool lock: acquire lock + :rtype: blobxfer.models.resume.Download + :return: Download record + """ + with self.datalock(lock): + try: + return self._data[final_path] + except KeyError: + return None + + def add_or_update_record( + self, final_path, temp_path, length, chunk_size, + next_integrity_chunk, completed, md5): + # type: (DownloadResumeManager, pathlib.Path, pathlib.Path, int, int, + # int, bool, str) -> None + """Get a resume record + :param DownloadResumeManager self: this + :param pathlib.Path final_path: final path + :param pathlib.Path temp_path: temp local path + :param int length: content length + :param int chunk_size: chunk size in bytes + :param int next_integrity_chunk: next integrity chunk + :param bool completed: if completed + :param str md5: md5 hex digest + """ + sfp = str(final_path) + with self.datalock(): + dl = self.get_record(sfp, lock=False) + if dl is None: + dl = blobxfer.models.resume.Download( + final_path=sfp, + temp_path=str(temp_path), + length=length, + chunk_size=chunk_size, + next_integrity_chunk=next_integrity_chunk, + completed=completed, + md5=md5, + ) + else: + if (dl.completed or + next_integrity_chunk < dl.next_integrity_chunk): + return + if completed: + dl.completed = completed + else: + dl.next_integrity_chunk = next_integrity_chunk + dl.md5hexdigest = md5 + self._data[sfp] = dl + self._data.sync() From 9701cfc76fd4599aa342edc6972c5a1c4ae937d7 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 18 Apr 2017 19:58:12 -0700 Subject: [PATCH 21/47] Fix tests with resume changes - 
Add replace_file in util --- blobxfer/models/download.py | 2 +- blobxfer/util.py | 15 ++ tests/test_blobxfer_models_download.py | 161 +++++++++++-------- tests/test_blobxfer_operations_azure_file.py | 1 - tests/test_blobxfer_operations_crypto.py | 26 ++- tests/test_blobxfer_operations_download.py | 82 ++++------ 6 files changed, 164 insertions(+), 123 deletions(-) diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index 70ba7a7..7780378 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -665,7 +665,7 @@ def finalize_file(self): # TODO set file uid/gid and mode # move temp download file to final path - self.local_path.replace(self.final_path) + blobxfer.util.replace_file(self.local_path, self.final_path) # update resume file self._update_resume_for_completed() with self._meta_lock: diff --git a/blobxfer/util.py b/blobxfer/util.py index 82c20a7..7d48ceb 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -41,6 +41,7 @@ except ImportError: # noqa from scandir import scandir as scandir import re +import sys # non-stdlib imports import dateutil import dateutil.tz @@ -172,6 +173,20 @@ def scantree(path): yield entry +def replace_file(src, dst): + # type: (pathlib.Path, pathlib.Path) -> None + """Replace a file, using atomic replace if available + :param pathlib.Path src: source path + :param pathlib.Path dst: destination path + """ + if sys.version_info < (3, 3): + if dst.exists(): + dst.unlink() + src.rename(dst) + else: + src.replace(dst) + + def get_mime_type(filename): # type: (str) -> str """Guess the type of a file based on its filename diff --git a/tests/test_blobxfer_models_download.py b/tests/test_blobxfer_models_download.py index 69133e2..548ebf8 100644 --- a/tests/test_blobxfer_models_download.py +++ b/tests/test_blobxfer_models_download.py @@ -96,10 +96,11 @@ def test_downloaddescriptor(tmpdir): ase._size = 1024 ase._encryption = mock.MagicMock() with pytest.raises(RuntimeError): - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) ase._encryption.symmetric_key = b'123' - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() assert d.entity == ase assert not d.must_compute_md5 @@ -107,24 +108,31 @@ def test_downloaddescriptor(tmpdir): assert d._offset == 0 assert d.final_path == lp assert str(d.local_path) == str(lp) + '.bxtmp' + assert d._allocated assert d.local_path.stat().st_size == 1024 - 16 d.local_path.unlink() ase._size = 1 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() assert d._total_chunks == 1 + assert d._allocated assert d.local_path.stat().st_size == 0 d.local_path.unlink() ase._encryption = None ase._size = 1024 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + assert d._allocated assert d.local_path.stat().st_size == 1024 # pre-existing file check ase._size = 0 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() assert d._total_chunks == 0 + assert d._allocated assert d.local_path.stat().st_size == 0 @@ -136,9 +144,10 @@ def test_downloaddescriptor_next_offsets(tmpdir): opts.chunk_size_bytes = 256 ase = azmodels.StorageEntity('cont') ase._size = 128 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert 
d._total_chunks == 1 assert offsets.chunk_num == 0 assert offsets.fd_start == 0 @@ -146,16 +155,17 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert offsets.range_start == 0 assert offsets.range_end == 127 assert not offsets.unpad - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) ase._size = 0 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) assert d._total_chunks == 0 - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) ase._size = 1 - d = models.Descriptor(lp, ase, opts) - offsets = d.next_offsets() + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert d._total_chunks == 1 assert offsets.chunk_num == 0 assert offsets.fd_start == 0 @@ -163,11 +173,12 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert offsets.range_start == 0 assert offsets.range_end == 0 assert not offsets.unpad - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) ase._size = 256 - d = models.Descriptor(lp, ase, opts) - offsets = d.next_offsets() + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert d._total_chunks == 1 assert offsets.chunk_num == 0 assert offsets.fd_start == 0 @@ -175,11 +186,12 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) ase._size = 256 + 16 - d = models.Descriptor(lp, ase, opts) - offsets = d.next_offsets() + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert d._total_chunks == 2 assert offsets.chunk_num == 0 assert offsets.fd_start == 0 @@ -187,20 +199,22 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad - offsets = d.next_offsets() + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert offsets.chunk_num == 1 assert offsets.fd_start == 256 assert offsets.num_bytes == 16 assert offsets.range_start == 256 assert offsets.range_end == 256 + 15 assert not offsets.unpad - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'123' ase._size = 128 - d = models.Descriptor(lp, ase, opts) - offsets = d.next_offsets() + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert d._total_chunks == 1 assert offsets.chunk_num == 0 assert offsets.fd_start == 0 @@ -208,11 +222,12 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert offsets.range_start == 0 assert offsets.range_end == 127 assert offsets.unpad - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) ase._size = 256 - d = models.Descriptor(lp, ase, opts) - offsets = d.next_offsets() + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert d._total_chunks == 1 assert offsets.chunk_num == 0 assert offsets.fd_start == 0 @@ -220,11 +235,12 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert offsets.range_start == 0 assert offsets.range_end == 255 assert offsets.unpad - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) ase._size = 256 + 32 # 16 bytes over + padding - 
d = models.Descriptor(lp, ase, opts) - offsets = d.next_offsets() + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert d._total_chunks == 2 assert offsets.chunk_num == 0 assert offsets.fd_start == 0 @@ -232,17 +248,18 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert offsets.range_start == 0 assert offsets.range_end == 255 assert not offsets.unpad - offsets = d.next_offsets() + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None assert offsets.chunk_num == 1 assert offsets.fd_start == 256 assert offsets.num_bytes == 32 assert offsets.range_start == 256 - 16 assert offsets.range_end == 256 + 31 assert offsets.unpad - assert d.next_offsets() is None + assert d.next_offsets() == (None, None) -def test_postpone_integrity_check(tmpdir): +def test_write_unchecked_data(tmpdir): lp = pathlib.Path(str(tmpdir.join('a'))) opts = mock.MagicMock() @@ -250,10 +267,10 @@ def test_postpone_integrity_check(tmpdir): opts.chunk_size_bytes = 32 ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() - d._postpone_integrity_check(offsets, b'0' * ase._size) + offsets, _ = d.next_offsets() + d.write_unchecked_data(offsets, b'0' * ase._size) assert offsets.chunk_num in d._unchecked_chunks ucc = d._unchecked_chunks[offsets.chunk_num] @@ -262,15 +279,19 @@ def test_postpone_integrity_check(tmpdir): assert ucc.file_path == d.local_path assert not ucc.temp + +def test_write_unchecked_hmac_data(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 32 ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() - d._postpone_integrity_check(offsets, b'0' * ase._size) + offsets, _ = d.next_offsets() + d.write_unchecked_hmac_data(offsets, b'0' * ase._size) assert offsets.chunk_num in d._unchecked_chunks ucc = d._unchecked_chunks[offsets.chunk_num] @@ -288,14 +309,16 @@ def test_perform_chunked_integrity_check(tmpdir): opts.chunk_size_bytes = 16 ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() + offsets, _ = d.next_offsets() data = b'0' * opts.chunk_size_bytes - d._postpone_integrity_check(offsets, data) - d.perform_chunked_integrity_check(offsets, data) + d.write_unchecked_data(offsets, data) + d.perform_chunked_integrity_check() assert d._next_integrity_chunk == 1 + assert 0 not in d._unchecked_chunks + assert len(d._unchecked_chunks) == 0 opts = mock.MagicMock() opts.check_file_md5 = False @@ -304,18 +327,23 @@ def test_perform_chunked_integrity_check(tmpdir): ase._size = 32 ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'123' - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() data = b'0' * opts.chunk_size_bytes - offsets1 = d.next_offsets() - d._postpone_integrity_check(offsets1, data) - ucc = d._unchecked_chunks[offsets1.chunk_num] - d.perform_chunked_integrity_check(offsets, data) + offsets, _ = d.next_offsets() + d.write_unchecked_hmac_data(offsets, data) + ucc = d._unchecked_chunks[offsets.chunk_num] + offsets1, _ = d.next_offsets() + d.write_unchecked_hmac_data(offsets1, data) + ucc1 = d._unchecked_chunks[offsets1.chunk_num] + 
d.perform_chunked_integrity_check() - assert d._next_integrity_chunk == 2 - assert not ucc.file_path.exists() assert not ucc.file_path.exists() + assert not ucc1.file_path.exists() + assert d._next_integrity_chunk == 2 + assert 0 not in d._unchecked_chunks + assert 1 not in d._unchecked_chunks + assert len(d._unchecked_chunks) == 0 def test_cleanup_all_temporary_files(tmpdir): @@ -325,22 +353,22 @@ def test_cleanup_all_temporary_files(tmpdir): ase = azmodels.StorageEntity('cont') ase._size = 16 lp = pathlib.Path(str(tmpdir.join('a'))) - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() + offsets, _ = d.next_offsets() data = b'0' * opts.chunk_size_bytes - d._postpone_integrity_check(offsets, data) + d.write_unchecked_data(offsets, data) assert len(d._unchecked_chunks) == 1 d.cleanup_all_temporary_files() assert not d.local_path.exists() assert not d._unchecked_chunks[0].file_path.exists() lp = pathlib.Path(str(tmpdir.join('b'))) - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() + offsets, _ = d.next_offsets() data = b'0' * opts.chunk_size_bytes - d._postpone_integrity_check(offsets, data) + d.write_unchecked_hmac_data(offsets, data) assert len(d._unchecked_chunks) == 1 d.local_path.unlink() d._unchecked_chunks[0].file_path.unlink() @@ -357,9 +385,9 @@ def test_write_data(tmpdir): opts.chunk_size_bytes = 16 ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) - offsets = d.next_offsets() + offsets, _ = d.next_offsets() data = b'0' * ase._size d.write_data(offsets, data) @@ -389,7 +417,8 @@ def test_finalize_file(tmpdir): message_authentication_code = util.base64_encode_as_string( _hmac.digest()) - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() d.hmac.update(data) d.finalize_file() @@ -410,7 +439,8 @@ def test_finalize_file(tmpdir): md5.update(data) ase._md5 = util.base64_encode_as_string(md5.digest()) - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() d.md5.update(data) d.finalize_file() @@ -428,7 +458,8 @@ def test_finalize_file(tmpdir): data = b'0' * ase._size - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() d.finalize_file() assert not d.local_path.exists() @@ -446,7 +477,8 @@ def test_finalize_file(tmpdir): data = b'0' * ase._size ase._md5 = 'oops' - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() d.md5.update(data) d.finalize_file() @@ -462,14 +494,11 @@ def test_operations(tmpdir): ase = azmodels.StorageEntity('cont') ase._size = 32 - d = models.Descriptor(lp, ase, opts) + d = models.Descriptor(lp, ase, opts, None) d._outstanding_ops = 1 d._unchecked_chunks = {0: None} assert not d.all_operations_completed - d.dec_outstanding_operations() - assert d._completed_ops == 1 - assert not d.all_operations_completed - + d._outstanding_ops -= 1 d._unchecked_chunks.pop(0) assert d.all_operations_completed diff --git a/tests/test_blobxfer_operations_azure_file.py b/tests/test_blobxfer_operations_azure_file.py index cb6b04e..34b3c32 100644 --- a/tests/test_blobxfer_operations_azure_file.py +++ b/tests/test_blobxfer_operations_azure_file.py @@ -109,7 +109,6 @@ def test_list_files_directory(patched_cisf): assert file.name == 'name' assert i == 1 - 
print('test') _dir = azure.storage.file.models.Directory(name='dirname') _file = azure.storage.file.models.File(name='dirname/name') client = mock.MagicMock() diff --git a/tests/test_blobxfer_operations_crypto.py b/tests/test_blobxfer_operations_crypto.py index 3ed2262..4dbe116 100644 --- a/tests/test_blobxfer_operations_crypto.py +++ b/tests/test_blobxfer_operations_crypto.py @@ -90,21 +90,32 @@ def test_aes_cbc_encryption(): assert decdata == plaindata -def test_cryptooffload_decrypt(): +def test_cryptooffload_decrypt(tmpdir): + symkey = ops.aes256_generate_random_key() + iv = os.urandom(16) + plainlen = 16 + plaindata = os.urandom(plainlen) + encdata = ops.aes_cbc_encrypt_data(symkey, iv, plaindata, False) + + afile = tmpdir.join('a') + afile.write(encdata, mode='wb') + hmacfile = str(afile) + bfile = tmpdir.join('b') + bfile.ensure(file=True) + a = None try: a = ops.CryptoOffload(1) offsets = blobxfer.models.download.Offsets( chunk_num=0, - fd_start=1, + fd_start=0, # this matters! num_bytes=2, range_end=3, range_start=4, unpad=False, ) a.add_decrypt_chunk( - 'fp', offsets, ops.aes256_generate_random_key(), os.urandom(16), - os.urandom(16)) + 'fp', str(bfile), offsets, symkey, iv, hmacfile) i = 33 checked = False while i > 0: @@ -113,12 +124,13 @@ def test_cryptooffload_decrypt(): time.sleep(0.3) i -= 1 continue - assert len(result) == 3 - assert result[0] == 'fp' - assert result[1] == offsets + assert result == 'fp' checked = True break assert checked + assert bfile.stat().size == plainlen + decdata = bfile.read(mode='rb') + assert decdata == plaindata finally: if a is not None: a.finalize_processes() diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py index ea2be05..81ae200 100644 --- a/tests/test_blobxfer_operations_download.py +++ b/tests/test_blobxfer_operations_download.py @@ -252,11 +252,14 @@ def test_pre_md5_skip_on_check(): assert lpath in d._md5_map -def test_post_md5_skip_on_check(): +def test_post_md5_skip_on_check(tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._download_total = 0 + d._download_bytes_total = 0 d._md5_offload = mock.MagicMock() - lpath = 'lpath' + lp = tmpdir.join('lpath').ensure(file=True) + lpath = str(lp) rfile = azmodels.StorageEntity('cont') rfile._md5 = 'abc' d._pre_md5_skip_on_check(lpath, rfile) @@ -324,18 +327,18 @@ def test_check_for_crypto_done(): lpath = 'lpath' d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._download_set.add(pathlib.Path(lpath)) - d._dd_map[lpath] = mock.MagicMock() + dd = mock.MagicMock() + d._dd_map[lpath] = dd d._crypto_offload = mock.MagicMock() d._crypto_offload.done_cv = multiprocessing.Condition() d._crypto_offload.pop_done_queue.side_effect = [ None, - (lpath, mock.MagicMock(), mock.MagicMock()), + lpath, ] - d._complete_chunk_download = mock.MagicMock() d._all_remote_files_processed = False d._download_terminate = True d._check_for_crypto_done() - assert d._complete_chunk_download.call_count == 0 + assert dd.perform_chunked_integrity_check.call_count == 0 with mock.patch( 'blobxfer.operations.download.Downloader.termination_check', @@ -343,17 +346,18 @@ def test_check_for_crypto_done(): d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._download_set.add(pathlib.Path(lpath)) - d._dd_map[lpath] = mock.MagicMock() + dd = mock.MagicMock() + d._dd_map[lpath] = dd d._crypto_offload = mock.MagicMock() d._crypto_offload.done_cv = multiprocessing.Condition() 
d._crypto_offload.pop_done_queue.side_effect = [ None, - (lpath, mock.MagicMock(), mock.MagicMock()), + lpath, ] patched_tc.side_effect = [False, False, True] d._complete_chunk_download = mock.MagicMock() d._check_for_crypto_done() - assert d._complete_chunk_download.call_count == 1 + assert dd.perform_chunked_integrity_check.call_count == 1 def test_add_to_download_queue(tmpdir): @@ -386,26 +390,6 @@ def test_initialize_and_terminate_download_threads(): assert not thr.is_alive() -def test_complete_chunk_download(tmpdir): - lp = pathlib.Path(str(tmpdir.join('a'))) - opts = mock.MagicMock() - opts.check_file_md5 = False - opts.chunk_size_bytes = 16 - ase = azmodels.StorageEntity('cont') - ase._size = 16 - dd = models.Descriptor(lp, ase, opts) - - d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - offsets = dd.next_offsets() - data = b'0' * ase._size - - d._complete_chunk_download(offsets, data, dd) - - assert dd.local_path.exists() - assert dd.local_path.stat().st_size == len(data) - assert dd._completed_ops == 1 - - @mock.patch('blobxfer.operations.crypto.aes_cbc_decrypt_data') @mock.patch('blobxfer.operations.azure.file.get_file_range') @mock.patch('blobxfer.operations.azure.blob.get_blob_range') @@ -431,7 +415,6 @@ def test_worker_thread_download( new_callable=mock.PropertyMock) as patched_aoc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - d._complete_chunk_download = mock.MagicMock() opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 @@ -440,17 +423,18 @@ def test_worker_thread_download( ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' lp = pathlib.Path(str(tmpdir.join('a'))) - dd = models.Descriptor(lp, ase, opts) - dd.next_offsets = mock.MagicMock(side_effect=[None, None]) + dd = models.Descriptor(lp, ase, opts, None) + dd.next_offsets = mock.MagicMock( + side_effect=[(None, None), (None, None)]) dd.finalize_file = mock.MagicMock() + dd.perform_chunked_integrity_check = mock.MagicMock() patched_aoc.side_effect = [False, True] patched_tc.side_effect = [False, False, False, True] - d._dd_map[str(lp)] = mock.MagicMock() + d._dd_map[str(lp)] = dd d._download_set.add(lp) d._download_queue = mock.MagicMock() d._download_queue.get.side_effect = [queue.Empty, dd, dd] d._worker_thread_download() - assert d._complete_chunk_download.call_count == 0 assert str(lp) not in d._dd_map assert dd.finalize_file.call_count == 1 assert d._download_sofar == 1 @@ -468,17 +452,15 @@ def test_worker_thread_download( ase._size = 16 patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('b'))) - dd = models.Descriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts, None) dd.finalize_file = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() d._dd_map[str(lp)] = mock.MagicMock() d._download_set.add(lp) d._download_queue = mock.MagicMock() d._download_queue.get.side_effect = [dd] - d._complete_chunk_download = mock.MagicMock() patched_tc.side_effect = [False, True] d._worker_thread_download() - assert d._complete_chunk_download.call_count == 1 assert dd.perform_chunked_integrity_check.call_count == 1 with mock.patch( @@ -497,21 +479,20 @@ def test_worker_thread_download( ase._encryption.content_encryption_iv = b'0' * 16 patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('c'))) - dd = models.Descriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts, None) dd.finalize_file = mock.MagicMock() + dd.write_unchecked_hmac_data = 
mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() d._crypto_offload = mock.MagicMock() d._crypto_offload.add_decrypt_chunk = mock.MagicMock() - d._dd_map[str(lp)] = mock.MagicMock() + d._dd_map[str(lp)] = dd d._download_set.add(lp) d._download_queue = mock.MagicMock() d._download_queue.get.side_effect = [dd] - d._complete_chunk_download = mock.MagicMock() patched_tc.side_effect = [False, True] d._worker_thread_download() - assert d._complete_chunk_download.call_count == 0 assert d._crypto_offload.add_decrypt_chunk.call_count == 1 - assert dd.perform_chunked_integrity_check.call_count == 1 + assert dd.write_unchecked_hmac_data.call_count == 1 with mock.patch( 'blobxfer.operations.download.Downloader.termination_check', @@ -530,19 +511,19 @@ def test_worker_thread_download( ase._encryption.content_encryption_iv = b'0' * 16 patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('d'))) - dd = models.Descriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts, None) dd.next_offsets() + dd.write_unchecked_hmac_data = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() patched_acdd.return_value = b'0' * 16 d._dd_map[str(lp)] = mock.MagicMock() d._download_set.add(lp) d._download_queue = mock.MagicMock() d._download_queue.get.side_effect = [dd] - d._complete_chunk_download = mock.MagicMock() patched_tc.side_effect = [False, True] d._worker_thread_download() - assert d._complete_chunk_download.call_count == 1 assert patched_acdd.call_count == 1 + assert dd.write_unchecked_hmac_data.call_count == 1 assert dd.perform_chunked_integrity_check.call_count == 1 @@ -553,7 +534,8 @@ def test_cleanup_temporary_files(tmpdir): opts.chunk_size_bytes = 16 ase = azmodels.StorageEntity('cont') ase._size = 16 - dd = models.Descriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts, None) + dd._allocate_disk_space() dd.cleanup_all_temporary_files = mock.MagicMock() dd.cleanup_all_temporary_files.side_effect = Exception d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) @@ -568,7 +550,8 @@ def test_cleanup_temporary_files(tmpdir): opts.chunk_size_bytes = 16 ase = azmodels.StorageEntity('cont') ase._size = 16 - dd = models.Descriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts, None) + dd._allocate_disk_space() d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._general_options.resume_file = None d._dd_map[0] = dd @@ -581,7 +564,8 @@ def test_cleanup_temporary_files(tmpdir): opts.chunk_size_bytes = 16 ase = azmodels.StorageEntity('cont') ase._size = 16 - dd = models.Descriptor(lp, ase, opts) + dd = models.Descriptor(lp, ase, opts, None) + dd._allocate_disk_space() dd.cleanup_all_temporary_files = mock.MagicMock() dd.cleanup_all_temporary_files.side_effect = Exception d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) @@ -641,6 +625,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): patched_lfmo._check_thread = mock.MagicMock() d._general_options.concurrency.crypto_processes = 1 d._general_options.concurrency.md5_processes = 1 + d._general_options.resume_file = None d._spec.sources = [] d._spec.options = mock.MagicMock() d._spec.options.chunk_size_bytes = 1 @@ -691,6 +676,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): def test_start_keyboard_interrupt(): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = None d._run = 
mock.MagicMock(side_effect=KeyboardInterrupt) d._wait_for_download_threads = mock.MagicMock() d._cleanup_temporary_files = mock.MagicMock() From 158a67fefc6ba0c404641b10bbb3303b13979d56 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Wed, 19 Apr 2017 14:36:10 -0700 Subject: [PATCH 22/47] Improve test coverage - Add exception handling for download worker threads - Add resume tests - Update test_requirements to conditionally install mock - Use unittest.mock if present --- blobxfer/models/azure.py | 1 - blobxfer/models/download.py | 38 ++- blobxfer/operations/download.py | 158 +++++++------ setup.py | 4 +- test_requirements.txt | 2 +- tests/test_blobxfer_models_azure.py | 5 +- tests/test_blobxfer_models_download.py | 230 ++++++++++++++++++- tests/test_blobxfer_models_offload.py | 5 +- tests/test_blobxfer_models_options.py | 5 +- tests/test_blobxfer_models_resume.py | 32 +++ tests/test_blobxfer_operations_azure.py | 5 +- tests/test_blobxfer_operations_azure_blob.py | 5 +- tests/test_blobxfer_operations_azure_file.py | 5 +- tests/test_blobxfer_operations_crypto.py | 5 +- tests/test_blobxfer_operations_download.py | 73 +++++- tests/test_blobxfer_operations_progress.py | 5 +- tests/test_blobxfer_operations_resume.py | 65 ++++++ tests/test_blobxfer_retry.py | 5 +- tests/test_blobxfer_util.py | 40 ++++ 19 files changed, 569 insertions(+), 119 deletions(-) create mode 100644 tests/test_blobxfer_models_resume.py create mode 100644 tests/test_blobxfer_operations_resume.py diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index 9f28ca3..afc971d 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -66,7 +66,6 @@ def __init__(self, container, ed=None): self._md5 = None self._encryption = ed self._vio = None - self.download = None @property def client(self): diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index 7780378..77f2757 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -329,7 +329,7 @@ def _resume(self): :rtype: int or None :return: verified download offset """ - if self._resume_mgr is None or self._offset != 0: + if self._resume_mgr is None or self._offset > 0 or self._finalized: return None # check if path exists in resume db rr = self._resume_mgr.get_record(str(self.final_path)) @@ -346,12 +346,11 @@ def _resume(self): logger.debug('nothing to resume for {}'.format(self.final_path)) return None curr_chunk = rr.next_integrity_chunk - curr_offset = curr_chunk * rr.chunk_size # set offsets if completed and the final path exists if rr.completed and self.final_path.exists(): - logger.debug('{} download already completed'.format( - self.final_path)) with self._meta_lock: + logger.debug('{} download already completed'.format( + self.final_path)) self._offset = self._ase.size self._chunk_num = curr_chunk self._chunk_size = rr.chunk_size @@ -375,34 +374,22 @@ def _resume(self): 'unexpected hmac object for entity {}/{}'.format( self._ase.container, self._ase.name)) # re-hash from 0 to offset if needed + _fd_offset = 0 + _end_offset = min((curr_chunk * rr.chunk_size, rr.length)) if self.md5 is not None and curr_chunk > 0: - pagealign = ( - self._ase.mode == blobxfer.models.azure.StorageModes.Page - ) - _fd_offset = 0 - _end_offset = min( - (curr_chunk * rr.chunk_size, rr.length) - ) + _blocksize = blobxfer.util.MEGABYTE << 2 logger.debug( 'integrity checking existing file {} to offset {}'.format( self.final_path, _end_offset)) with self._hasher_lock: with self.local_path.open('rb') as filedesc: while _fd_offset < 
_end_offset: - _blocksize = blobxfer.util.MEGABYTE << 2 if (_fd_offset + _blocksize) > _end_offset: _blocksize = _end_offset - _fd_offset - buf = filedesc.read(_blocksize) - buflen = len(buf) - if pagealign and buflen < _blocksize: - aligned = blobxfer.\ - util.page_align_content_length(buflen) - if aligned != buflen: - buf = buf.ljust(aligned, b'\0') - self.md5.update(buf) + _buf = filedesc.read(_blocksize) + self.md5.update(_buf) _fd_offset += _blocksize - del _fd_offset - del _end_offset + del _blocksize # compare hashes hexdigest = self.md5.hexdigest() if rr.md5hexdigest != hexdigest: @@ -414,13 +401,14 @@ def _resume(self): return None # set values from resume with self._meta_lock: - self._offset = curr_offset + self._offset = _end_offset self._chunk_num = curr_chunk self._chunk_size = rr.chunk_size self._total_chunks = self._compute_total_chunks(rr.chunk_size) self._next_integrity_chunk = rr.next_integrity_chunk - self._outstanding_ops = \ + self._outstanding_ops = ( self._total_chunks - self._next_integrity_chunk + ) logger.debug( ('resuming file {} from byte={} chunk={} chunk_size={} ' 'total_chunks={} next_integrity_chunk={} ' @@ -428,7 +416,7 @@ def _resume(self): self.final_path, self._offset, self._chunk_num, self._chunk_size, self._total_chunks, self._next_integrity_chunk, self._outstanding_ops)) - return curr_offset + return _end_offset def cleanup_all_temporary_files(self): # type: (Descriptor) -> None diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index bb63f3f..f1112e1 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -96,6 +96,7 @@ def __init__(self, general_options, creds, spec): self._creds = creds self._spec = spec self._resume = None + self._exceptions = [] @property def termination_check(self): @@ -107,6 +108,7 @@ def termination_check(self): """ with self._download_lock: return (self._download_terminate or + len(self._exceptions) > 0 or (self._all_remote_files_processed and len(self._download_set) == 0)) @@ -369,79 +371,92 @@ def _worker_thread_download(self): dd = self._download_queue.get(False, 0.25) except queue.Empty: continue - # update progress bar - self._update_progress_bar() - # get download offsets - offsets, resume_bytes = dd.next_offsets() - # add resume bytes to counter - if resume_bytes is not None: - with self._download_lock: - self._download_bytes_sofar += resume_bytes - del resume_bytes - # check if all operations completed - if offsets is None and dd.all_operations_completed: - # finalize file - dd.finalize_file() - # accounting + try: + self._process_download_descriptor(dd) + except Exception as e: with self._download_lock: - if dd.entity.is_encrypted: - self._dd_map.pop(str(dd.final_path)) - self._download_set.remove(dd.final_path) - self._download_sofar += 1 - continue - # re-enqueue for other threads to download - self._download_queue.put(dd) - if offsets is None: - continue - # issue get range - if dd.entity.mode == blobxfer.models.azure.StorageModes.File: - data = blobxfer.operations.azure.file.get_file_range( - dd.entity, offsets, self._general_options.timeout_sec) - else: - data = blobxfer.operations.azure.blob.get_blob_range( - dd.entity, offsets, self._general_options.timeout_sec) + self._exceptions.append(e) + + def _process_download_descriptor(self, dd): + # type: (Downloader, blobxfer.models.download.Descriptor) -> None + """Process download descriptor + :param Downloader self: this + :param blobxfer.models.download.Descriptor: download descriptor + """ + # update 
progress bar + self._update_progress_bar() + # get download offsets + offsets, resume_bytes = dd.next_offsets() + # add resume bytes to counter + if resume_bytes is not None: + with self._download_lock: + self._download_bytes_sofar += resume_bytes + logger.debug('adding {} sofar {} from {}'.format( + resume_bytes, self._download_bytes_sofar, dd._ase.name)) + del resume_bytes + # check if all operations completed + if offsets is None and dd.all_operations_completed: + # finalize file + dd.finalize_file() # accounting with self._download_lock: - self._download_bytes_sofar += offsets.num_bytes - # decrypt if necessary - if dd.entity.is_encrypted: - # slice data to proper bounds and get iv for chunk - if offsets.chunk_num == 0: - # set iv - iv = dd.entity.encryption_metadata.content_encryption_iv - # set data to decrypt - encdata = data - # send iv through hmac - dd.hmac_iv(iv) - else: - # set iv - iv = data[:blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES] - # set data to decrypt - encdata = data[ - blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] - # write encdata to disk for hmac later - _hmac_datafile = dd.write_unchecked_hmac_data( - offsets, encdata) - # decrypt data - if self._crypto_offload is not None: - self._crypto_offload.add_decrypt_chunk( - str(dd.final_path), str(dd.local_path), offsets, - dd.entity.encryption_metadata.symmetric_key, - iv, _hmac_datafile) - # data will be integrity checked and written once - # retrieved from crypto queue - continue - else: - data = blobxfer.operations.crypto.aes_cbc_decrypt_data( - dd.entity.encryption_metadata.symmetric_key, - iv, encdata, offsets.unpad) - dd.write_data(offsets, data) + if dd.entity.is_encrypted: + self._dd_map.pop(str(dd.final_path)) + self._download_set.remove(dd.final_path) + self._download_sofar += 1 + return + # re-enqueue for other threads to download + self._download_queue.put(dd) + if offsets is None: + return + # issue get range + if dd.entity.mode == blobxfer.models.azure.StorageModes.File: + data = blobxfer.operations.azure.file.get_file_range( + dd.entity, offsets, self._general_options.timeout_sec) + else: + data = blobxfer.operations.azure.blob.get_blob_range( + dd.entity, offsets, self._general_options.timeout_sec) + # accounting + with self._download_lock: + self._download_bytes_sofar += offsets.num_bytes + # decrypt if necessary + if dd.entity.is_encrypted: + # slice data to proper bounds and get iv for chunk + if offsets.chunk_num == 0: + # set iv + iv = dd.entity.encryption_metadata.content_encryption_iv + # set data to decrypt + encdata = data + # send iv through hmac + dd.hmac_iv(iv) else: - # write data to disk - dd.write_unchecked_data(offsets, data) - # integrity check data and write to disk (this is called - # regardless of md5/hmac enablement for resume purposes) - dd.perform_chunked_integrity_check() + # set iv + iv = data[:blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES] + # set data to decrypt + encdata = data[blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] + # write encdata to disk for hmac later + _hmac_datafile = dd.write_unchecked_hmac_data( + offsets, encdata) + # decrypt data + if self._crypto_offload is not None: + self._crypto_offload.add_decrypt_chunk( + str(dd.final_path), str(dd.local_path), offsets, + dd.entity.encryption_metadata.symmetric_key, + iv, _hmac_datafile) + # data will be integrity checked and written once + # retrieved from crypto queue + return + else: + data = blobxfer.operations.crypto.aes_cbc_decrypt_data( + dd.entity.encryption_metadata.symmetric_key, + iv, encdata, 
offsets.unpad) + dd.write_data(offsets, data) + else: + # write data to disk + dd.write_unchecked_data(offsets, data) + # integrity check data and write to disk (this is called + # regardless of md5/hmac enablement for resume purposes) + dd.perform_chunked_integrity_check() def _cleanup_temporary_files(self): # type: (Downloader) -> None @@ -565,6 +580,11 @@ def _run(self): end_time = blobxfer.util.datetime_now() # update progress bar self._update_progress_bar() + # check for exceptions + if len(self._exceptions) > 0: + logger.error('exceptions encountered while downloading') + # raise the first one + raise self._exceptions[0] # check for mismatches if (self._download_sofar != self._download_total or self._download_bytes_sofar != self._download_bytes_total): diff --git a/setup.py b/setup.py index 5a2d6c6..2725ade 100644 --- a/setup.py +++ b/setup.py @@ -39,14 +39,14 @@ ] install_requires = [ - 'azure-common==1.1.4', + 'azure-common==1.1.5', 'azure-storage==0.34.0', 'click==6.7', 'cryptography>=1.8.1', 'future==0.16.0', 'python-dateutil==2.6.0', 'requests==2.13.0', - 'ruamel.yaml==0.14.5', + 'ruamel.yaml==0.14.8', ] if sys.version_info < (3, 4): diff --git a/test_requirements.txt b/test_requirements.txt index f2315c3..bc58365 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,5 +1,5 @@ flake8>=3.3.0 -mock>=2.0.0 +mock>=2.0.0; python_version < '3.3' pypandoc>=1.3.3 pytest>=3.0.7 pytest-cov>=2.4.0 diff --git a/tests/test_blobxfer_models_azure.py b/tests/test_blobxfer_models_azure.py index 37a40a5..6ddc95b 100644 --- a/tests/test_blobxfer_models_azure.py +++ b/tests/test_blobxfer_models_azure.py @@ -2,7 +2,10 @@ """Tests for models azure""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock # non-stdlib imports import azure.storage import azure.storage.blob diff --git a/tests/test_blobxfer_models_download.py b/tests/test_blobxfer_models_download.py index 548ebf8..6c62ce1 100644 --- a/tests/test_blobxfer_models_download.py +++ b/tests/test_blobxfer_models_download.py @@ -4,18 +4,23 @@ # stdlib imports import hashlib import hmac -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock import os try: import pathlib2 as pathlib except ImportError: # noqa import pathlib +import unittest # non-stdlib imports import pytest # local imports import blobxfer.models.azure as azmodels import blobxfer.models.options as options import blobxfer.operations.azure as azops +import blobxfer.operations.resume as rops import blobxfer.util as util # module under test import blobxfer.models.download as models @@ -100,6 +105,7 @@ def test_downloaddescriptor(tmpdir): ase._encryption.symmetric_key = b'123' d = models.Descriptor(lp, ase, opts, None) + assert not d._allocated d._allocate_disk_space() assert d.entity == ase @@ -111,6 +117,9 @@ def test_downloaddescriptor(tmpdir): assert d._allocated assert d.local_path.stat().st_size == 1024 - 16 + d._allocate_disk_space() + assert d._allocated + d.local_path.unlink() ase._size = 1 d = models.Descriptor(lp, ase, opts, None) @@ -136,6 +145,146 @@ def test_downloaddescriptor(tmpdir): assert d.local_path.stat().st_size == 0 +@unittest.skipIf(util.on_python2(), 'fallocate does not exist') +def test_downloaddescriptor_allocate_disk_space_via_seek(tmpdir): + fp = pathlib.Path(str(tmpdir.join('fp'))) + lp = pathlib.Path(str(tmpdir.join('fp.bxtmp'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = azmodels.StorageEntity('cont') 
+ ase._size = 128 + ase._name = 'blob' + d = models.Descriptor(fp, ase, opts, None) + + with mock.patch('os.posix_fallocate') as patched_fallocate: + patched_fallocate.side_effect = [AttributeError()] + d._allocate_disk_space() + assert d._allocated + assert not fp.exists() + assert lp.stat().st_size == ase._size + + +def test_downloaddescriptor_resume(tmpdir): + resumefile = pathlib.Path(str(tmpdir.join('resume'))) + fp = pathlib.Path(str(tmpdir.join('fp'))) + lp = pathlib.Path(str(tmpdir.join('fp.bxtmp'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + + # test no record + rmgr = rops.DownloadResumeManager(resumefile) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # test length mismatch + rmgr.add_or_update_record(str(fp), str(lp), 127, 0, 0, False, None) + rb = d._resume() + assert rb is None + + # test nothing to resume + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + rmgr.add_or_update_record(str(fp), str(lp), ase._size, 0, 0, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # test completion + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, True, None) + d = models.Descriptor(fp, ase, opts, rmgr) + fp.touch() + rb = d._resume() + assert rb == ase._size + + # test encrypted no resume + fp.unlink() + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # test if intermediate file not exists + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + + rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # ensure hmac not populated + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + lp.touch() + + rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + d.hmac = True + with pytest.raises(RuntimeError): + d._resume() + + # md5 hash check + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + data = os.urandom(32) + with lp.open('wb') as f: + f.write(data) + md5 = util.new_md5_hasher() + md5.update(data) + + rmgr.add_or_update_record( + str(fp), str(lp), ase._size, 32, 1, False, md5.hexdigest()) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb == 32 + + # md5 hash mismatch + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + rmgr.add_or_update_record( + str(fp), str(lp), ase._size, 32, 1, False, 'abc') + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # md5 hash check as page file + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + ase._mode = azmodels.StorageModes.Page + + rmgr.add_or_update_record( + str(fp), str(lp), ase._size, 32, 1, False, md5.hexdigest()) + d = models.Descriptor(fp, ase, opts, rmgr) + rb 
= d._resume() + assert rb == 32 + + def test_downloaddescriptor_next_offsets(tmpdir): lp = pathlib.Path(str(tmpdir.join('a'))) @@ -259,6 +408,24 @@ def test_downloaddescriptor_next_offsets(tmpdir): assert d.next_offsets() == (None, None) +def test_hmac_iv(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + ase._size = 128 + d = models.Descriptor(lp, ase, opts, None) + + iv = b'abc' + d.hmac_iv(iv) + assert d.hmac.update.call_count == 1 + + def test_write_unchecked_data(tmpdir): lp = pathlib.Path(str(tmpdir.join('a'))) @@ -345,6 +512,48 @@ def test_perform_chunked_integrity_check(tmpdir): assert 1 not in d._unchecked_chunks assert len(d._unchecked_chunks) == 0 + # check integrity with resume + resumefile = pathlib.Path(str(tmpdir.join('resume'))) + fp = pathlib.Path(str(tmpdir.join('fp'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + ase._name = 'blob' + rmgr = rops.DownloadResumeManager(resumefile) + d = models.Descriptor(fp, ase, opts, rmgr) + + data = b'0' * opts.chunk_size_bytes + md5 = util.new_md5_hasher() + md5.update(data) + offsets, _ = d.next_offsets() + d.write_unchecked_hmac_data(offsets, data) + d.perform_chunked_integrity_check() + assert d._next_integrity_chunk == 1 + assert len(d._unchecked_chunks) == 0 + dr = rmgr.get_record(str(fp)) + assert dr.next_integrity_chunk == 1 + assert dr.md5hexdigest == md5.hexdigest() + + +def test_update_resume_for_completed(tmpdir): + resumefile = pathlib.Path(str(tmpdir.join('resume'))) + fp = pathlib.Path(str(tmpdir.join('fp'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + ase._name = 'blob' + rmgr = rops.DownloadResumeManager(resumefile) + d = models.Descriptor(fp, ase, opts, rmgr) + offsets, _ = d.next_offsets() + d._update_resume_for_completed() + dr = rmgr.get_record(str(fp)) + assert dr.completed + def test_cleanup_all_temporary_files(tmpdir): opts = mock.MagicMock() @@ -396,6 +605,25 @@ def test_write_data(tmpdir): def test_finalize_file(tmpdir): + # already finalized + lp = pathlib.Path(str(tmpdir.join('af'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + d._finalized = True + d.finalize_file() + + assert d.local_path.exists() + assert not d.final_path.exists() + d.local_path.unlink() + # hmac check success lp = pathlib.Path(str(tmpdir.join('a'))) opts = mock.MagicMock() diff --git a/tests/test_blobxfer_models_offload.py b/tests/test_blobxfer_models_offload.py index ca5a2bb..24351e3 100644 --- a/tests/test_blobxfer_models_offload.py +++ b/tests/test_blobxfer_models_offload.py @@ -2,7 +2,10 @@ """Tests for offload""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock # non-stdlib imports import pytest # local imports diff --git a/tests/test_blobxfer_models_options.py b/tests/test_blobxfer_models_options.py index 4716f27..1ee72bb 100644 --- a/tests/test_blobxfer_models_options.py +++ b/tests/test_blobxfer_models_options.py @@ -2,7 +2,10 @@ """Tests for models options""" # 
stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock try: import pathlib2 as pathlib except ImportError: # noqa diff --git a/tests/test_blobxfer_models_resume.py b/tests/test_blobxfer_models_resume.py new file mode 100644 index 0000000..55a6009 --- /dev/null +++ b/tests/test_blobxfer_models_resume.py @@ -0,0 +1,32 @@ +# coding=utf-8 +"""Tests for models resume""" + +# stdlib imports +# non-stdlib imports +# module under test +import blobxfer.models.resume as rmodels + + +def test_download(): + d = rmodels.Download('fp', 'tp', 1, 2, 0, False, '') + assert d.final_path == 'fp' + assert d.temp_path == 'tp' + assert d.length == 1 + assert d.chunk_size == 2 + assert d.next_integrity_chunk == 0 + assert not d.completed + assert d.md5hexdigest == '' + + d.md5hexdigest = None + assert d.md5hexdigest == '' + + d.md5hexdigest = 'abc' + assert d.md5hexdigest == 'abc' + + d.next_integrity_chunk = 1 + assert d.next_integrity_chunk == 1 + + d.completed = True + assert d.completed + + assert len(str(d)) > 0 diff --git a/tests/test_blobxfer_operations_azure.py b/tests/test_blobxfer_operations_azure.py index bfe976d..346fab6 100644 --- a/tests/test_blobxfer_operations_azure.py +++ b/tests/test_blobxfer_operations_azure.py @@ -2,7 +2,10 @@ """Tests for operations azure""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock # non-stdlib imports import azure.storage import azure.storage.blob diff --git a/tests/test_blobxfer_operations_azure_blob.py b/tests/test_blobxfer_operations_azure_blob.py index 3880d1a..0ed626a 100644 --- a/tests/test_blobxfer_operations_azure_blob.py +++ b/tests/test_blobxfer_operations_azure_blob.py @@ -2,7 +2,10 @@ """Tests for general blob operations""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock # non-stdlib imports import azure.common import azure.storage.blob diff --git a/tests/test_blobxfer_operations_azure_file.py b/tests/test_blobxfer_operations_azure_file.py index 34b3c32..2a45428 100644 --- a/tests/test_blobxfer_operations_azure_file.py +++ b/tests/test_blobxfer_operations_azure_file.py @@ -2,7 +2,10 @@ """Tests for file operations""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock # non-stdlib imports import azure.common import azure.storage diff --git a/tests/test_blobxfer_operations_crypto.py b/tests/test_blobxfer_operations_crypto.py index 4dbe116..d3fdc62 100644 --- a/tests/test_blobxfer_operations_crypto.py +++ b/tests/test_blobxfer_operations_crypto.py @@ -2,7 +2,10 @@ """Tests for crypto operations""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock import os import time # non-stdlib imports diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py index 81ae200..749e835 100644 --- a/tests/test_blobxfer_operations_download.py +++ b/tests/test_blobxfer_operations_download.py @@ -3,8 +3,10 @@ # stdlib imports import datetime -import dateutil.tz -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock import multiprocessing try: import pathlib2 as pathlib @@ -16,6 +18,7 @@ import Queue as queue # non-stdlib imports import azure.storage.blob +import dateutil.tz import pytest # local imports import blobxfer.models.azure as azmodels @@ -340,6 +343,7 @@ def test_check_for_crypto_done(): 
d._check_for_crypto_done() assert dd.perform_chunked_integrity_check.call_count == 0 + # check successful integrity check call with mock.patch( 'blobxfer.operations.download.Downloader.termination_check', new_callable=mock.PropertyMock) as patched_tc: @@ -359,6 +363,25 @@ def test_check_for_crypto_done(): d._check_for_crypto_done() assert dd.perform_chunked_integrity_check.call_count == 1 + # check KeyError on result + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._download_set.add(pathlib.Path(lpath)) + dd = mock.MagicMock() + d._crypto_offload = mock.MagicMock() + d._crypto_offload.done_cv = multiprocessing.Condition() + d._crypto_offload.pop_done_queue.side_effect = [ + None, + lpath, + ] + patched_tc.side_effect = [False, False, True] + d._complete_chunk_download = mock.MagicMock() + d._check_for_crypto_done() + assert dd.perform_chunked_integrity_check.call_count == 0 + def test_add_to_download_queue(tmpdir): path = tmpdir.join('a') @@ -406,6 +429,27 @@ def test_worker_thread_download( d._worker_thread_download() assert d._complete_chunk_download.call_count == 0 + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + patched_tc.side_effect = [False, False, True] + ase = azmodels.StorageEntity('cont') + ase._size = 16 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + lp = pathlib.Path(str(tmpdir.join('exc'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + dd = models.Descriptor(lp, ase, opts, None) + d._download_queue = mock.MagicMock() + d._download_queue.get.side_effect = [queue.Empty, dd] + d._process_download_descriptor = mock.MagicMock() + d._process_download_descriptor.side_effect = RuntimeError('oops') + d._worker_thread_download() + assert len(d._exceptions) == 1 + assert d._process_download_descriptor.call_count == 1 + with mock.patch( 'blobxfer.operations.download.Downloader.termination_check', new_callable=mock.PropertyMock) as patched_tc: @@ -425,7 +469,7 @@ def test_worker_thread_download( lp = pathlib.Path(str(tmpdir.join('a'))) dd = models.Descriptor(lp, ase, opts, None) dd.next_offsets = mock.MagicMock( - side_effect=[(None, None), (None, None)]) + side_effect=[(None, 1), (None, 2)]) dd.finalize_file = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() patched_aoc.side_effect = [False, True] @@ -438,6 +482,7 @@ def test_worker_thread_download( assert str(lp) not in d._dd_map assert dd.finalize_file.call_count == 1 assert d._download_sofar == 1 + assert d._download_bytes_sofar == 3 with mock.patch( 'blobxfer.operations.download.Downloader.termination_check', @@ -610,14 +655,13 @@ def test_delete_extraneous_files(tmpdir): d._delete_extraneous_files() -@mock.patch('time.clock') @mock.patch('blobxfer.operations.md5.LocalFileMd5Offload') @mock.patch('blobxfer.operations.azure.blob.list_blobs') @mock.patch( 'blobxfer.operations.download.Downloader.ensure_local_destination', return_value=True ) -def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): +def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._cleanup_temporary_files = mock.MagicMock() d._download_start = datetime.datetime.now(tz=dateutil.tz.tzlocal()) @@ -625,7 +669,7 @@ def 
test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): patched_lfmo._check_thread = mock.MagicMock() d._general_options.concurrency.crypto_processes = 1 d._general_options.concurrency.md5_processes = 1 - d._general_options.resume_file = None + d._general_options.resume_file = pathlib.Path(str(tmpdir.join('rf'))) d._spec.sources = [] d._spec.options = mock.MagicMock() d._spec.options.chunk_size_bytes = 1 @@ -647,19 +691,15 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): b = azure.storage.blob.models.Blob(name='name') b.properties.content_length = 1 patched_lb.side_effect = [[b]] - d._pre_md5_skip_on_check = mock.MagicMock() - d._check_download_conditions = mock.MagicMock() d._check_download_conditions.return_value = ops.DownloadAction.Skip - patched_tc.side_effect = [1, 2] d.start() assert d._pre_md5_skip_on_check.call_count == 0 patched_lb.side_effect = [[b]] d._all_remote_files_processed = False d._check_download_conditions.return_value = ops.DownloadAction.CheckMd5 - patched_tc.side_effect = [1, 2] with pytest.raises(RuntimeError): d.start() assert d._pre_md5_skip_on_check.call_count == 1 @@ -668,11 +708,22 @@ def test_start(patched_eld, patched_lb, patched_lfmo, patched_tc, tmpdir): patched_lb.side_effect = [[b]] d._all_remote_files_processed = False d._check_download_conditions.return_value = ops.DownloadAction.Download - patched_tc.side_effect = [1, 2] with pytest.raises(RuntimeError): d.start() assert d._download_queue.qsize() == 1 + # test exception count + b = azure.storage.blob.models.Blob(name='name') + b.properties.content_length = 1 + patched_lb.side_effect = [[b]] + d._pre_md5_skip_on_check = mock.MagicMock() + d._check_download_conditions = mock.MagicMock() + d._check_download_conditions.return_value = ops.DownloadAction.Skip + d._exceptions = [RuntimeError('oops')] + with pytest.raises(RuntimeError): + d.start() + assert d._pre_md5_skip_on_check.call_count == 0 + def test_start_keyboard_interrupt(): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) diff --git a/tests/test_blobxfer_operations_progress.py b/tests/test_blobxfer_operations_progress.py index 7cb0776..75f9f79 100644 --- a/tests/test_blobxfer_operations_progress.py +++ b/tests/test_blobxfer_operations_progress.py @@ -2,7 +2,10 @@ """Tests for progress operations""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock # non-stdlib imports # local imports import blobxfer.util as util diff --git a/tests/test_blobxfer_operations_resume.py b/tests/test_blobxfer_operations_resume.py new file mode 100644 index 0000000..52f11b8 --- /dev/null +++ b/tests/test_blobxfer_operations_resume.py @@ -0,0 +1,65 @@ +# coding=utf-8 +"""Tests for operations resume""" + +# stdlib imports +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# module under test +import blobxfer.operations.resume as ops + + +def test_download_resume_manager(tmpdir): + tmpdb = pathlib.Path(str(tmpdir.join('tmp.db'))) + + drm = ops.DownloadResumeManager(tmpdb) + assert drm._data is not None + drm.close() + assert drm._data is None + assert tmpdb.exists() + drm.delete() + assert drm._data is None + assert not tmpdb.exists() + + final_path = 'fp' + drm = ops.DownloadResumeManager(tmpdb) + drm.add_or_update_record(final_path, 'tp', 1, 2, 0, False, None) + d = drm.get_record(final_path) + + assert d.final_path == final_path + + drm.add_or_update_record(final_path, 'tp', 1, 2, 1, False, 
'abc') + d = drm.get_record(final_path) + + assert d.final_path == final_path + assert not d.completed + assert d.next_integrity_chunk == 1 + assert d.md5hexdigest == 'abc' + + drm.add_or_update_record(final_path, 'tp', 1, 2, 1, True, None) + d = drm.get_record(final_path) + + assert d.final_path == final_path + assert d.completed + assert d.next_integrity_chunk == 1 + assert d.md5hexdigest == 'abc' + + # idempotent check after completed + drm.add_or_update_record(final_path, 'tp', 1, 2, 1, True, None) + d = drm.get_record(final_path) + + assert d.final_path == final_path + assert d.completed + assert d.next_integrity_chunk == 1 + assert d.md5hexdigest == 'abc' + + drm.close() + assert drm._data is None + assert tmpdb.exists() + + tmpdb.unlink() + drm.delete() + assert drm._data is None + assert not tmpdb.exists() diff --git a/tests/test_blobxfer_retry.py b/tests/test_blobxfer_retry.py index b66c41e..9d84b90 100644 --- a/tests/test_blobxfer_retry.py +++ b/tests/test_blobxfer_retry.py @@ -2,7 +2,10 @@ """Tests for retry""" # stdlib imports -import mock +try: + import unittest.mock as mock +except ImportError: # noqa + import mock # non-stdlib imports import pytest # module under test diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py index 0f94c0e..64294d3 100644 --- a/tests/test_blobxfer_util.py +++ b/tests/test_blobxfer_util.py @@ -2,6 +2,14 @@ """Tests for util""" # stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib import sys # non-stdlib imports import pytest @@ -90,6 +98,38 @@ def test_scantree(tmpdir): assert len(found) == 2 +def test_replace_file(tmpdir): + src = pathlib.Path(str(tmpdir.join('src'))) + dst = pathlib.Path(str(tmpdir.join('dst'))) + src.touch() + dst.touch() + + replace_avail = sys.version_info >= (3, 3) + + with mock.patch( + 'sys.version_info', + new_callable=mock.PropertyMock(return_value=(3, 2, 0))): + blobxfer.util.replace_file(src, dst) + assert not src.exists() + assert dst.exists() + + dst.unlink() + src.touch() + dst.touch() + + with mock.patch( + 'sys.version_info', + new_callable=mock.PropertyMock(return_value=(3, 3, 0))): + if replace_avail: + blobxfer.util.replace_file(src, dst) + assert not src.exists() + assert dst.exists() + else: + src = mock.MagicMock() + blobxfer.util.replace_file(src, dst) + assert src.replace.call_count == 1 + + def test_get_mime_type(): a = 'b.txt' mt = blobxfer.util.get_mime_type(a) From 27126cbc7beb0de7550ab96d8b06af9eabda8c40 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 20 Apr 2017 09:03:53 -0700 Subject: [PATCH 23/47] Fix snapshot support - Add rename option for single file download - Disable MD5 checking if source doesn't have MD5 stored --- blobxfer/models/download.py | 7 +-- blobxfer/models/options.py | 2 + blobxfer/models/upload.py | 32 +++++++++-- blobxfer/operations/azure/blob/__init__.py | 5 +- blobxfer/operations/download.py | 20 ++++--- blobxfer/operations/progress.py | 2 + blobxfer/util.py | 4 +- tests/test_blobxfer_models_download.py | 14 +++-- tests/test_blobxfer_models_upload.py | 4 +- tests/test_blobxfer_operations_download.py | 63 +++++++++++++++++++--- tests/test_blobxfer_util.py | 5 +- 11 files changed, 126 insertions(+), 32 deletions(-) diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index 77f2757..dd26515 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -148,7 +148,7 @@ def __init__( # type: (Specification, 
blobxfer.models.options.Download, # blobxfer.models.options.SkipOn, LocalDestinationPath) -> None """Ctor for Specification - :param DownloadSepcification self: this + :param DownloadSpecification self: this :param blobxfer.models.options.Download download_options: download options :param blobxfer.models.options.SkipOn skip_on_options: skip on options @@ -162,7 +162,7 @@ def __init__( def add_azure_source_path(self, source): # type: (Specification, AzureSourcePath) -> None """Add an Azure Source Path - :param DownloadSepcification self: this + :param DownloadSpecification self: this :param AzureSourcePath source: Azure source path to add """ self.sources.append(source) @@ -281,7 +281,8 @@ def _initialize_integrity_checkers(self, options): 'symmetric key is invalid: provide RSA private key ' 'or metadata corrupt') self.hmac = self._ase.encryption_metadata.initialize_hmac() - if self.hmac is None and options.check_file_md5: + if (self.hmac is None and options.check_file_md5 and + blobxfer.util.is_not_empty(self._ase.md5)): self.md5 = blobxfer.util.new_md5_hasher() def _allocate_disk_space(self): diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index 08ba42a..10b53b1 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -65,6 +65,7 @@ 'mode', 'overwrite', 'recursive', + 'rename', 'rsa_private_key', 'rsa_public_key', 'store_file_attributes', @@ -82,6 +83,7 @@ 'mode', 'overwrite', 'recursive', + 'rename', 'restore_file_attributes', 'rsa_private_key', ] diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index 607b001..d8c898e 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -54,12 +54,12 @@ ) -class LocalSourcePaths(blobxfer.models._BaseSourcePaths): - """Local Source Paths""" +class LocalSourcePath(blobxfer.models._BaseSourcePaths): + """Local Source Path""" def files(self): # type: (LocalSourcePaths) -> LocalPath """Generator for files in paths - :param LocalSourcePaths self: this + :param LocalSourcePath self: this :rtype: LocalPath :return: LocalPath """ @@ -73,3 +73,29 @@ def files(self): 'skipping file {} due to filters'.format(_rpath)) continue yield LocalPath(parent_path=_expath, relative_path=_rpath) + + +class Specification(object): + """Upload Specification""" + def __init__( + self, upload_options, skip_on_options, remote_destination_path): + # type: (Specification, blobxfer.models.options.Upload, + # blobxfer.models.options.SkipOn, RemoteDestinationPath) -> None + """Ctor for Specification + :param UploadSpecification self: this + :param blobxfer.models.options.Upload upload_options: upload options + :param blobxfer.models.options.SkipOn skip_on_options: skip on options + :param RemoteDestinationPath remote_destination_path: remote dest path + """ + self.options = upload_options + self.skip_on = skip_on_options + self.destination = remote_destination_path + self.sources = [] + + def add_local_source_path(self, source): + # type: (Specification, LocalSourcePath) -> None + """Add a Local Source Path + :param UploadSpecification self: this + :param LocalSourcePath source: Local source path to add + """ + self.sources.append(source) diff --git a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py index 0d49ed0..c94c9c9 100644 --- a/blobxfer/operations/azure/blob/__init__.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -78,9 +78,10 @@ def list_blobs(client, container, prefix, mode, recursive, timeout=None): if mode == blobxfer.models.azure.StorageModes.File: raise 
RuntimeError('cannot list Azure Files from blob client') if blobxfer.util.blob_is_snapshot(prefix): - snapshot = blobxfer.util.parse_blob_snapshot_parameter(prefix) + base_blob, snapshot = blobxfer.util.parse_blob_snapshot_parameter( + prefix) blob = client.get_blob_properties( - container_name=container, blob_name=prefix, snapshot=snapshot, + container_name=container, blob_name=base_blob, snapshot=snapshot, timeout=timeout) yield blob return diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index f1112e1..f0e08ec 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -149,12 +149,14 @@ def ensure_local_destination(creds, spec): spec.sources[0].lookup_storage_account(rpath)) if (spec.options.mode == blobxfer.models.azure.StorageModes.File): - if blobxfer.operations.azure.file.check_if_single_file( - sa.file_client, cont, dir)[0]: + if (blobxfer.operations.azure.file.check_if_single_file( + sa.file_client, cont, dir)[0] and + spec.options.rename): spec.destination.is_dir = False else: - if blobxfer.operations.azure.blob.check_if_single_blob( - sa.block_blob_client, cont, dir): + if (blobxfer.operations.azure.blob.check_if_single_blob( + sa.block_blob_client, cont, dir) and + spec.options.rename): spec.destination.is_dir = False logger.debug('dest is_dir={} for {} specs'.format( spec.destination.is_dir, len(spec.sources))) @@ -194,7 +196,8 @@ def _check_download_conditions(self, lpath, rfile): lpath, rfile.container, rfile.name)) return DownloadAction.Skip # check skip on options, MD5 match takes priority - if self._spec.skip_on.md5_match: + if (self._spec.skip_on.md5_match and + blobxfer.util.is_not_empty(rfile.md5)): return DownloadAction.CheckMd5 # if neither of the remaining skip on actions are activated, download if (not self._spec.skip_on.filesize_match and @@ -542,7 +545,12 @@ def _run(self): nfiles += 1 total_size += rfile.size # form local path for remote file - lpath = pathlib.Path(self._spec.destination.path, rfile.name) + if (not self._spec.destination.is_dir and + self._spec.options.rename): + lpath = pathlib.Path(self._spec.destination.path) + else: + lpath = pathlib.Path( + self._spec.destination.path, rfile.name) # remove from delete after set try: self._delete_after.remove(lpath) diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index b5ec8e9..99d7445 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -132,6 +132,8 @@ def output_download_parameters(general_options, spec): spec.options.overwrite)) log.append(' recursive: {}'.format( spec.options.recursive)) + log.append(' rename single: {}'.format( + spec.options.rename)) log.append(' file attributes: {}'.format( spec.options.restore_file_attributes)) log.append(' rsa private key: {}'.format( diff --git a/blobxfer/util.py b/blobxfer/util.py index 7d48ceb..9b4e644 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -297,6 +297,6 @@ def parse_blob_snapshot_parameter(url): """ if blob_is_snapshot(url): tmp = url.split('?snapshot=') - if len(tmp) > 1: - return tmp[-1] + if len(tmp) == 2: + return tmp[0], tmp[1] return None diff --git a/tests/test_blobxfer_models_download.py b/tests/test_blobxfer_models_download.py index 6c62ce1..918a7f0 100644 --- a/tests/test_blobxfer_models_download.py +++ b/tests/test_blobxfer_models_download.py @@ -66,6 +66,7 @@ def test_downloadspecification(): mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, + rename=False, restore_file_attributes=False, 
rsa_private_key=None, ), @@ -266,6 +267,7 @@ def test_downloaddescriptor_resume(tmpdir): rmgr = rops.DownloadResumeManager(resumefile) rmgr.add_or_update_record( str(fp), str(lp), ase._size, 32, 1, False, 'abc') + ase._md5 = 'abc' d = models.Descriptor(fp, ase, opts, rmgr) rb = d._resume() assert rb is None @@ -519,17 +521,21 @@ def test_perform_chunked_integrity_check(tmpdir): opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 + + data = b'0' * opts.chunk_size_bytes + md5 = util.new_md5_hasher() + md5.update(data) + ase = azmodels.StorageEntity('cont') ase._size = 32 ase._name = 'blob' + ase._md5 = md5.hexdigest() + rmgr = rops.DownloadResumeManager(resumefile) d = models.Descriptor(fp, ase, opts, rmgr) - data = b'0' * opts.chunk_size_bytes - md5 = util.new_md5_hasher() - md5.update(data) offsets, _ = d.next_offsets() - d.write_unchecked_hmac_data(offsets, data) + d.write_unchecked_data(offsets, data) d.perform_chunked_integrity_check() assert d._next_integrity_chunk == 1 assert len(d._unchecked_chunks) == 0 diff --git a/tests/test_blobxfer_models_upload.py b/tests/test_blobxfer_models_upload.py index 21d9494..e6447d7 100644 --- a/tests/test_blobxfer_models_upload.py +++ b/tests/test_blobxfer_models_upload.py @@ -25,7 +25,7 @@ def test_localsourcepaths_files(tmpdir): defpath.join('world.txt').write('world') defpath.join('moo.cow').write('y') - a = upload.LocalSourcePaths() + a = upload.LocalSourcePath() a.add_include('*.txt') a.add_includes(['moo.cow', '*blah*']) with pytest.raises(ValueError): @@ -45,7 +45,7 @@ def test_localsourcepaths_files(tmpdir): assert str(defpath.join('world.txt')) in a_set assert str(defpath.join('moo.cow')) not in a_set - b = upload.LocalSourcePaths() + b = upload.LocalSourcePath() b.add_includes(['moo.cow', '*blah*']) b.add_include('*.txt') b.add_excludes(['world.txt']) diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py index 749e835..08702b1 100644 --- a/tests/test_blobxfer_operations_download.py +++ b/tests/test_blobxfer_operations_download.py @@ -34,8 +34,9 @@ @mock.patch('blobxfer.operations.azure.blob.check_if_single_blob') def test_ensure_local_destination(patched_blob, patched_file, tmpdir): downdir = tmpdir.join('down') + downdir.mkdir() - # non-file tests + # no spec sources ds = models.Specification( download_options=options.Download( check_file_md5=True, @@ -44,6 +45,7 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, + rename=False, restore_file_attributes=False, rsa_private_key=None, ), @@ -52,25 +54,42 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): str(downdir) ), ) - with pytest.raises(RuntimeError): ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + # blob directory asp = azops.SourcePath() p = 'cont/remote/path' asp.add_path_with_storage_account(p, 'sa') - ds.add_azure_source_path(asp) - patched_blob.return_value = False ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) assert ds.destination.is_dir + # blob single file + rename + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + local_destination_path=models.LocalDestinationPath( + str(downdir) + 
), + ) + ds.add_azure_source_path(asp) patched_blob.return_value = True with pytest.raises(RuntimeError): ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) - # file tests + # file directory ds = models.Specification( download_options=options.Download( check_file_md5=True, @@ -79,6 +98,7 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): mode=azmodels.StorageModes.File, overwrite=True, recursive=True, + rename=False, restore_file_attributes=False, rsa_private_key=None, ), @@ -87,13 +107,30 @@ def test_ensure_local_destination(patched_blob, patched_file, tmpdir): str(downdir) ), ) - ds.add_azure_source_path(asp) - patched_file.return_value = (False, None) ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) assert ds.destination.is_dir + # file single + rename + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.File, + overwrite=True, + recursive=True, + rename=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + local_destination_path=models.LocalDestinationPath( + str(downdir) + ), + ) + ds.add_azure_source_path(asp) patched_file.return_value = (True, mock.MagicMock()) with pytest.raises(RuntimeError): ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) @@ -113,6 +150,7 @@ def test_check_download_conditions(tmpdir): mode=azmodels.StorageModes.Auto, overwrite=False, recursive=True, + rename=False, restore_file_attributes=False, rsa_private_key=None, ), @@ -137,6 +175,7 @@ def test_check_download_conditions(tmpdir): mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, + rename=False, restore_file_attributes=False, rsa_private_key=None, ), @@ -148,7 +187,9 @@ def test_check_download_conditions(tmpdir): local_destination_path=models.LocalDestinationPath('dest'), ) d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) - result = d._check_download_conditions(ep, mock.MagicMock()) + rfile = mock.MagicMock() + rfile.md5 = 'abc' + result = d._check_download_conditions(ep, rfile) assert result == ops.DownloadAction.CheckMd5 ds = models.Specification( @@ -159,6 +200,7 @@ def test_check_download_conditions(tmpdir): mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, + rename=False, restore_file_attributes=False, rsa_private_key=None, ), @@ -181,6 +223,7 @@ def test_check_download_conditions(tmpdir): mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, + rename=False, restore_file_attributes=False, rsa_private_key=None, ), @@ -211,6 +254,7 @@ def test_check_download_conditions(tmpdir): mode=azmodels.StorageModes.Auto, overwrite=True, recursive=True, + rename=False, restore_file_attributes=False, rsa_private_key=None, ), @@ -675,6 +719,7 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d._spec.options.chunk_size_bytes = 1 d._spec.options.mode = azmodels.StorageModes.Auto d._spec.options.overwrite = True + d._spec.options.rename = False d._spec.skip_on = mock.MagicMock() d._spec.skip_on.md5_match = False d._spec.skip_on.lmt_ge = False @@ -716,6 +761,8 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): b = azure.storage.blob.models.Blob(name='name') b.properties.content_length = 1 patched_lb.side_effect = [[b]] + d._spec.destination.is_dir = False + d._spec.options.rename = True d._pre_md5_skip_on_check = mock.MagicMock() d._check_download_conditions = mock.MagicMock() 
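        # Illustrative sketch (not part of the patch itself): the rename
        # behavior being set up in this test changes how the Downloader forms
        # the local path for a single remote entity. Per the
        # blobxfer/operations/download.py hunk earlier in this patch, the
        # logic is roughly:
        #
        #   if not spec.destination.is_dir and spec.options.rename:
        #       lpath = pathlib.Path(spec.destination.path)
        #   else:
        #       lpath = pathlib.Path(spec.destination.path, rfile.name)
        #
        # so a single blob or file can be downloaded directly to the named
        # destination instead of destination/<remote name>.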
d._check_download_conditions.return_value = ops.DownloadAction.Skip diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py index 64294d3..e294a0e 100644 --- a/tests/test_blobxfer_util.py +++ b/tests/test_blobxfer_util.py @@ -215,9 +215,10 @@ def test_blob_is_snapshot(): def test_parse_blob_snapshot_parameter(): + base = '/cont/a' param = '2017-02-23T22:21:14.8121864Z' - a = '/cont/a?snapshot=' + param - assert blobxfer.util.parse_blob_snapshot_parameter(a) == param + a = base + '?snapshot=' + param + assert blobxfer.util.parse_blob_snapshot_parameter(a) == (base, param) a = '/cont/a?snapshot=' assert blobxfer.util.parse_blob_snapshot_parameter(a) is None From 6c33bc1d174da8e9aab426b3ba3256f851383c9f Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 16 May 2017 15:05:06 -0700 Subject: [PATCH 24/47] Monkeypatch python sdk timeout value --- blobxfer/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/blobxfer/__init__.py b/blobxfer/__init__.py index e05319c..8babc97 100644 --- a/blobxfer/__init__.py +++ b/blobxfer/__init__.py @@ -28,3 +28,6 @@ import azure.storage azure.storage._constants.USER_AGENT_STRING = 'blobxfer/{} {}'.format( __version__, azure.storage._constants.USER_AGENT_STRING) + +# monkeypatch SOCKET_TIMEOUT value in Azure Storage SDK +azure.storage._constants.SOCKET_TIMEOUT = (5, 300) From 85183bae71abcd294efc9fc6b3fe341a2cde89ce Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 16 May 2017 15:05:58 -0700 Subject: [PATCH 25/47] Begin upload work --- blobxfer/api.py | 3 + blobxfer/models/azure.py | 61 +- blobxfer/models/crypto.py | 23 + blobxfer/models/download.py | 24 +- blobxfer/models/options.py | 11 +- blobxfer/models/upload.py | 350 ++++++++++- blobxfer/operations/azure/__init__.py | 49 ++ blobxfer/operations/azure/blob/__init__.py | 29 + blobxfer/operations/azure/file.py | 118 +++- blobxfer/operations/crypto.py | 36 +- blobxfer/operations/download.py | 8 +- blobxfer/operations/md5.py | 3 + blobxfer/operations/progress.py | 80 ++- blobxfer/operations/upload.py | 681 +++++++++++++++++++++ cli/cli.py | 53 +- cli/settings.py | 114 +++- 16 files changed, 1560 insertions(+), 83 deletions(-) create mode 100644 blobxfer/operations/upload.py diff --git a/blobxfer/api.py b/blobxfer/api.py index f8c3378..9034b2e 100644 --- a/blobxfer/api.py +++ b/blobxfer/api.py @@ -48,3 +48,6 @@ from .operations.download import ( # noqa Downloader ) +from .operations.upload import ( # noqa + Uploader +) diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index afc971d..fbc319f 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -97,6 +97,16 @@ def name(self): """ return self._name + @property + def path(self): + # type: (StorageEntity) -> str + """Entity path + :param StorageEntity self: this + :rtype: str + :return: remote path of entity + """ + return '{}/{}'.format(self._container, self._name) + @property def lmt(self): # type: (StorageEntity) -> datetime.datetime @@ -117,6 +127,15 @@ def size(self): """ return self._size + @size.setter + def size(self, value): + # type: (StorageEntity, int) -> None + """Set entity size + :param StorageEntity self: this + :param int value: value + """ + self._size = value + @property def snapshot(self): # type: (StorageEntity) -> str @@ -161,13 +180,23 @@ def is_encrypted(self): def encryption_metadata(self): # type: (StorageEntity) -> # blobxfer.models.crypto.EncryptionMetadata - """Entity metadata (type) + """Get encryption metadata :param StorageEntity self: this :rtype: 
blobxfer.models.crypto.EncryptionMetadata :return: encryption metadata of entity """ return self._encryption + @encryption_metadata.setter + def encryption_metadata(self, value): + # type: (StorageEntity, + # blobxfer.models.crypto.EncryptionMetadata) -> None + """Set encryption metadata + :param StorageEntity self: this + :param blobxfer.models.crypto.EncryptionMetadata value: value + """ + self._encryption = value + def populate_from_blob(self, sa, blob): # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, # azure.storage.blob.models.Blob) -> None @@ -206,3 +235,33 @@ def populate_from_file(self, sa, file): self._md5 = file.properties.content_settings.content_md5 self._mode = StorageModes.File self._client = sa.file_client + + def populate_from_local(self, sa, container, name, mode): + # type: (StorageEntity, blobxfer.operations.azure.StorageAccount + # str, str, blobxfer.models.azure.StorageModes) -> None + """Populate properties from local + :param StorageEntity self: this + :param blobxfer.operations.azure.StorageAccount sa: storage account + :param str container: container + :param str name: name + :param blobxfer.models.azure.StorageModes mode: storage mode + """ + self._container = container + self._name = name + self._mode = mode + if mode == StorageModes.Append: + self._client = sa.append_blob_client + elif mode == StorageModes.Block: + self._client = sa.block_blob_client + elif mode == StorageModes.File: + self._client = sa.file_client + elif mode == StorageModes.Page: + self._client = sa.page_blob_client + elif mode == StorageModes.Auto: + name = self.name.lower() + if name.endswith('.vhd') or name.endswith('.vhdx'): + self._client = sa.page_blob_client + self._mode = StorageModes.Page + else: + self._client = sa.block_blob_client + self._mode = StorageModes.Block diff --git a/blobxfer/models/crypto.py b/blobxfer/models/crypto.py index 91a2f4a..56c0c79 100644 --- a/blobxfer/models/crypto.py +++ b/blobxfer/models/crypto.py @@ -35,6 +35,7 @@ import hashlib import hmac import json +import os # non-stdlib imports # local imports import blobxfer.models.offload @@ -126,6 +127,7 @@ def __init__(self): self.encryption_metadata_authentication = None self._symkey = None self._signkey = None + self._rsa_public_key = None @property def symmetric_key(self): @@ -163,6 +165,27 @@ def encryption_metadata_exists(md): pass return False + def create_new_metadata(self, rsa_public_key): + # type: (EncryptionMetadata, + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey) + # -> None + """Create new metadata entries for encryption (upload) + :param EncryptionMetadata self: this + :param cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey: + rsa public key + """ + self._rsa_public_key = rsa_public_key + self._symkey = os.urandom( + blobxfer.operations.crypto._AES256_KEYLENGTH_BYTES) + self._signkey = os.urandom( + blobxfer.operations.crypto._AES256_KEYLENGTH_BYTES) + self.content_encryption_iv = os.urandom(AES256_BLOCKSIZE_BYTES) + self.encryption_agent = EncryptionAgent( + encryption_algorithm=EncryptionMetadata._ENCRYPTION_ALGORITHM, + protocol=EncryptionMetadata._ENCRYPTION_PROTOCOL_VERSION, + ) + self.encryption_mode = EncryptionMetadata._ENCRYPTION_MODE + def convert_from_json(self, md, blobname, rsaprivatekey): # type: (EncryptionMetadata, dict, str, # cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey) diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index dd26515..e1c9266 100644 --- a/blobxfer/models/download.py +++ 
b/blobxfer/models/download.py @@ -158,12 +158,17 @@ def __init__( self.skip_on = skip_on_options self.destination = local_destination_path self.sources = [] + # validate compatible options + if not self.options.check_file_md5 and self.skip_on.md5_match: + raise ValueError( + 'Cannot specify skip on MD5 match without file MD5 enabled') def add_azure_source_path(self, source): - # type: (Specification, AzureSourcePath) -> None + # type: (Specification, blobxfer.operations.azure.SourcePath) -> None """Add an Azure Source Path :param DownloadSpecification self: this - :param AzureSourcePath source: Azure source path to add + :param blobxfer.operations.Azure.SourcePath source: + Azure source path to add """ self.sources.append(source) @@ -174,7 +179,7 @@ class Descriptor(object): _AES_BLOCKSIZE = blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES def __init__(self, lpath, ase, options, resume_mgr): - # type: (DownloadDescriptior, pathlib.Path, + # type: (Descriptior, pathlib.Path, # blobxfer.models.azure.StorageEntity, # blobxfer.models.options.Download, # blobxfer.operations.resume.DownloadResumeManager) -> None @@ -362,8 +367,8 @@ def _resume(self): return self._ase.size # encrypted files are not resumable due to hmac requirement if self._ase.is_encrypted: - logger.debug('cannot resume encrypted entity {}/{}'.format( - self._ase.container, self._ase.name)) + logger.debug('cannot resume encrypted entity {}'.format( + self._ase.path)) return None # check if intermediate (blobtmp) exists if not self.local_path.exists(): @@ -372,8 +377,7 @@ def _resume(self): return None if self.hmac is not None: raise RuntimeError( - 'unexpected hmac object for entity {}/{}'.format( - self._ase.container, self._ase.name)) + 'unexpected hmac object for entity {}'.format(self._ase.path)) # re-hash from 0 to offset if needed _fd_offset = 0 _end_offset = min((curr_chunk * rr.chunk_size, rr.length)) @@ -623,7 +627,7 @@ def finalize_file(self): self._ase.encryption_metadata.encryption_authentication. 
algorithm, 'OK' if check else 'MISMATCH', - self._ase.name, + self._ase.path, digest, mac, ) @@ -633,14 +637,14 @@ def finalize_file(self): check = True msg = 'MD5: {}, {} {} {}'.format( 'OK' if check else 'MISMATCH', - self._ase.name, + self._ase.path, digest, self._ase.md5, ) else: check = True msg = 'MD5: SKIPPED, {} None {}'.format( - self._ase.name, + self._ase.path, self._ase.md5 ) # cleanup if download failed diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index 10b53b1..9a3af9f 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -58,6 +58,12 @@ 'md5_match', ] ) +FileProperties = collections.namedtuple( + 'FileProperties', [ + 'attributes', + 'md5', + ] +) Upload = collections.namedtuple( 'Upload', [ 'chunk_size_bytes', @@ -66,13 +72,10 @@ 'overwrite', 'recursive', 'rename', - 'rsa_private_key', 'rsa_public_key', - 'store_file_attributes', - 'store_file_md5', + 'store_file_properties', 'strip_components', 'vectored_io', - 'split_size_bytes', ] ) Download = collections.namedtuple( diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index d8c898e..cbe1816 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -31,31 +31,85 @@ next, oct, open, pow, round, super, filter, map, zip) # stdlib imports import collections +import enum import logging +import math import os try: import pathlib2 as pathlib except ImportError: # noqa import pathlib +import threading # non-stdlib imports # local imports import blobxfer.models +import blobxfer.models.crypto import blobxfer.util # create logger logger = logging.getLogger(__name__) +# global defines +_MAX_BLOCK_CHUNKSIZE_BYTES = 268435456 +_MAX_NONBLOCK_CHUNKSIZE_BYTES = 4194304 -LocalPath = collections.namedtuple( - 'LocalPath', [ - 'parent_path', - 'relative_path', +# named tuples +Offsets = collections.namedtuple( + 'Offsets', [ + 'chunk_num', + 'block_id', + 'fd_start', + 'num_bytes', + 'range_end', + 'range_start', + 'pad', ] ) +class VectoredIoDistributionMode(enum.Enum): + Disabled = 'disabled' + Stripe = 'stripe' + Replica = 'replica' + + def __str__(self): + return self.value + + +class LocalPath(object): + def __init__(self, parent_path, relative_path): + self.parent_path = parent_path + self.relative_path = relative_path + # populate properties + self._stat = self.absolute_path.stat() + + @property + def absolute_path(self): + return self.parent_path / self.relative_path + + @property + def size(self): + return self._stat.st_size + + @property + def mode(self): + return str(oct(self._stat.st_mode)) + + @property + def uid(self): + return self._stat.st_uid + + @property + def gid(self): + return self._stat.st_gid + + class LocalSourcePath(blobxfer.models._BaseSourcePaths): """Local Source Path""" + + def can_rename(self): + return len(self._paths) == 1 and self._paths[0].is_file() + def files(self): # type: (LocalSourcePaths) -> LocalPath """Generator for files in paths @@ -65,7 +119,16 @@ def files(self): """ for _path in self._paths: _ppath = os.path.expandvars(os.path.expanduser(str(_path))) - _expath = pathlib.Path(_ppath) + _expath = pathlib.Path(_ppath).resolve() + # check if path is a single file + tmp = pathlib.Path(_ppath) + if tmp.is_file(): + yield LocalPath( + parent_path=tmp.parent, + relative_path=pathlib.Path(tmp.name) + ) + continue + del tmp for entry in blobxfer.util.scantree(_ppath): _rpath = pathlib.Path(entry.path).relative_to(_ppath) if not self._inclusion_check(_rpath): @@ -78,24 +141,281 @@ def files(self): class Specification(object): 
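# Illustrative usage sketch for the LocalSourcePath/LocalPath classes above
# (the path and filter pattern are hypothetical, not taken from the patch):
#
#   lsp = LocalSourcePath()
#   lsp.add_path('/data/photos')       # a directory or a single file
#   lsp.add_include('*.jpg')
#   if lsp.can_rename():               # True only for a single-file source
#       print('single file source, rename is permitted')
#   for lp in lsp.files():             # yields LocalPath objects
#       print(lp.parent_path, lp.relative_path, lp.size, lp.mode)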
"""Upload Specification""" def __init__( - self, upload_options, skip_on_options, remote_destination_path): + self, upload_options, skip_on_options, local_source_path): # type: (Specification, blobxfer.models.options.Upload, - # blobxfer.models.options.SkipOn, RemoteDestinationPath) -> None + # blobxfer.models.options.SkipOn, LocalSourcePath) -> None """Ctor for Specification :param UploadSpecification self: this :param blobxfer.models.options.Upload upload_options: upload options :param blobxfer.models.options.SkipOn skip_on_options: skip on options - :param RemoteDestinationPath remote_destination_path: remote dest path + :param LocalSourcePath local_source_path: local source path """ self.options = upload_options self.skip_on = skip_on_options - self.destination = remote_destination_path - self.sources = [] + self.destinations = [] + self.sources = local_source_path + # validate options + if self.options.rename: + # ensure only one internal path is present + if len(self.sources.paths) > 1: + raise ValueError( + 'cannot add more than one internal source path if rename ' + 'is specified') + # check if internal source path is directory and rename is enabled + if self.sources.paths[0].is_dir(): + raise ValueError( + 'cannot rename a directory of files to upload') + if (self.options.rsa_public_key and + self.options.vectored_io. + multi_storage_account_distribution_mode == + VectoredIoDistributionMode.Stripe): + raise ValueError( + 'cannot enable encryption and multi-storage account ' + 'vectored IO in stripe mode') + if self.options.chunk_size_bytes <= 0: + raise ValueError('chunk size must be positive') + if self.options.chunk_size_bytes > _MAX_BLOCK_CHUNKSIZE_BYTES: + raise ValueError( + ('chunk size value of {} exceeds maximum allowable ' + 'of {}').format( + self.options.chunk_size_bytes, + _MAX_BLOCK_CHUNKSIZE_BYTES)) - def add_local_source_path(self, source): - # type: (Specification, LocalSourcePath) -> None - """Add a Local Source Path + def add_azure_destination_path(self, dest): + # type: (Specification, + # blobxfer.operations.azure.DestinationPath) -> None + """Add a remote Azure Destination path :param UploadSpecification self: this - :param LocalSourcePath source: Local source path to add + :param blobxfer.operations.azure.DestinationPath dest: + Remote destination path + """ + self.destinations.append(dest) + + +class Descriptor(object): + """Upload Descriptor""" + + _AES_BLOCKSIZE = blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES + + def __init__(self, lpath, ase, uid, options, resume_mgr): + # type: (Descriptior, LocalPath, + # blobxfer.models.azure.StorageEntity, str, + # blobxfer.models.options.Upload, + # blobxfer.operations.resume.UploadResumeManager) -> None + """Ctor for Descriptor + :param Descriptor self: this + :param LocalPath lpath: local path + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :param str uid: unique id + :param blobxfer.models.options.Upload options: download options + :param blobxfer.operations.resume.UploadResumeManager resume_mgr: + upload resume manager + """ + self.local_path = lpath + self.unique_id = uid + self._offset = 0 + self._chunk_num = 0 + self._next_integrity_chunk = 0 + self._finalized = False + self._meta_lock = threading.Lock() + self._hasher_lock = threading.Lock() + self._resume_mgr = resume_mgr + self._ase = ase + self.current_iv = None + self._initialize_encryption(options) + # calculate the total number of ops required for transfer + self._compute_remote_size() + self._adjust_chunk_size(options) + 
self._total_chunks = self._compute_total_chunks(self._chunk_size) + self._outstanding_ops = self._total_chunks + # initialize integrity checkers + self.hmac = None + self.md5 = None + self._initialize_integrity_checkers(options) + + @property + def entity(self): + # type: (Descriptor) -> blobxfer.models.azure.StorageEntity + """Get linked blobxfer.models.azure.StorageEntity + :param Descriptor self: this + :rtype: blobxfer.models.azure.StorageEntity + :return: blobxfer.models.azure.StorageEntity + """ + return self._ase + + @property + def must_compute_md5(self): + # type: (Descriptor) -> bool + """Check if MD5 must be computed + :param Descriptor self: this + :rtype: bool + :return: if MD5 must be computed + """ + return self.md5 is not None + + @property + def all_operations_completed(self): + # type: (Descriptor) -> bool + """All operations are completed + :param Descriptor self: this + :rtype: bool + :return: if all operations completed + """ + with self._meta_lock: + return (self._outstanding_ops == 0 and + len(self._unchecked_chunks) == 0) + + @property + def is_resumable(self): + # type: (Descriptor) -> bool + """Download is resume capable + :param Descriptor self: this + :rtype: bool + :return: if resumable + """ + return self._resume_mgr is not None and self.hmac is None + + def hmac_iv(self, iv): + # type: (Descriptor, bytes) -> None + """Send IV through hasher + :param Descriptor self: this + :param bytes iv: iv + """ + with self._hasher_lock: + self.hmac.update(iv) + + def _initialize_encryption(self, options): + # type: (Descriptor, blobxfer.models.options.Upload) -> None + """Download is resume capable + :param Descriptor self: this + :param blobxfer.models.options.Upload options: upload options + """ + if options.rsa_public_key is not None: + em = blobxfer.models.crypto.EncryptionMetadata() + em.create_new_metadata(options.rsa_public_key) + self.current_iv = em.content_encryption_iv + self._ase.encryption_metadata = em + + def _compute_remote_size(self): + # type: (Descriptor, int) -> None + """Compute total remote file size + :param Descriptor self: this + :rtype: int + :return: remote file size + """ + size = self.local_path.size + if size > 0: + if self._ase.is_encrypted: + # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs + allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ + self._AES_BLOCKSIZE + else: + allocatesize = size + if allocatesize < 0: + allocatesize = 0 + else: + allocatesize = 0 + self._ase.size = allocatesize + logger.debug('remote size for {} is {} bytes'.format( + self._ase.path, self._ase.size)) + + def _adjust_chunk_size(self, options): + # type: (Descriptor, blobxfer.models.options.Upload) -> None + """Adjust chunk size for entity mode + :param Descriptor self: this + :param blobxfer.models.options.Upload options: upload options + """ + self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) + # ensure chunk sizes are compatible with mode + if self._ase.mode == blobxfer.models.azure.StorageModes.Append: + if self._chunk_size > _MAX_NONBLOCK_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_CHUNKSIZE_BYTES + logger.debug( + 'adjusting chunk size to {} for append blobs'.format( + self._chunk_size)) + elif self._ase.mode == blobxfer.models.azure.StorageModes.Block: + if self._chunk_size > _MAX_BLOCK_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_BLOCK_CHUNKSIZE_BYTES + logger.debug( + 'adjusting chunk size to {} for block blobs'.format( + self._chunk_size)) + elif self._ase.mode == blobxfer.models.azure.StorageModes.File: + if 
self._chunk_size > _MAX_NONBLOCK_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_CHUNKSIZE_BYTES + logger.debug( + 'adjusting chunk size to {} for files'.format( + self._chunk_size)) + elif self._ase.mode == blobxfer.models.azure.StorageModes.Page: + if self._chunk_size > _MAX_NONBLOCK_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_CHUNKSIZE_BYTES + logger.debug( + 'adjusting chunk size to {} for page blobs'.format( + self._chunk_size)) + + def _compute_total_chunks(self, chunk_size): + # type: (Descriptor, int) -> int + """Compute total number of chunks for entity + :param Descriptor self: this + :param int chunk_size: chunk size + :rtype: int + :return: num chunks + """ + try: + return int(math.ceil(self._ase.size / chunk_size)) + except ZeroDivisionError: + return 0 + + def _initialize_integrity_checkers(self, options): + # type: (Descriptor, blobxfer.models.options.Upload) -> None + """Initialize file integrity checkers + :param Descriptor self: this + :param blobxfer.models.options.Upload options: upload options + """ + if self._ase.is_encrypted: + # ensure symmetric key exists + if blobxfer.util.is_none_or_empty( + self._ase.encryption_metadata.symmetric_key): + raise RuntimeError( + 'symmetric key is invalid: provide RSA private key ' + 'or metadata corrupt') + self.hmac = self._ase.encryption_metadata.initialize_hmac() + if self.hmac is None and options.store_file_properties.md5: + self.md5 = blobxfer.util.new_md5_hasher() + + def next_offsets(self): + # type: (Descriptor) -> Offsets + """Retrieve the next offsets + :param Descriptor self: this + :rtype: Offsets + :return: upload offsets """ - self.sources.append(source) + # TODO RESUME +# resume_bytes = self._resume() + resume_bytes = None + with self._meta_lock: +# if self._offset >= self._ase.size: +# return None, resume_bytes + if self._offset + self._chunk_size > self._ase.size: + chunk = self._ase.size - self._offset + else: + chunk = self._chunk_size + num_bytes = chunk + chunk_num = self._chunk_num + fd_start = self._offset + range_start = self._offset + range_end = self._offset + num_bytes - 1 + self._offset += chunk + self._chunk_num += 1 + if self._ase.is_encrypted and self._offset >= self._ase.size: + pad = True + else: + pad = False + return Offsets( + chunk_num=chunk_num, + block_id='{0:08d}'.format(chunk_num), + fd_start=fd_start, + num_bytes=chunk, + range_start=range_start, + range_end=range_end, + pad=pad, + ), resume_bytes diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index cc33834..8581c9a 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -301,3 +301,52 @@ def _populate_from_list_blobs(self, creds, options, general_options): ase = blobxfer.models.azure.StorageEntity(cont, ed) ase.populate_from_blob(sa, blob) yield ase + + +class DestinationPath(blobxfer.models._BaseSourcePaths): + """Azure Destination Path""" + def __init__(self): + # type: (SourcePath) -> None + """Ctor for SourcePath + :param SourcePath self: this + """ + super(DestinationPath, self).__init__() + self._path_map = {} + + def add_path_with_storage_account(self, remote_path, storage_account): + # type: (SourcePath, str, str) -> None + """Add a path with an associated storage account + :param SourcePath self: this + :param str remote_path: remote path + :param str storage_account: storage account to associate with path + """ + if len(self._path_map) >= 1: + raise RuntimeError( + 'cannot add multiple remote paths to SourcePath objects') + rpath = 
blobxfer.util.normalize_azure_path(remote_path) + self.add_path(rpath) + self._path_map[rpath] = storage_account + + def lookup_storage_account(self, remote_path): + # type: (SourcePath, str) -> str + """Lookup the storage account associated with the remote path + :param SourcePath self: this + :param str remote_path: remote path + :rtype: str + :return: storage account associated with path + """ + return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] + + # TODO IS THIS NEEDED? + def generate_entities_for_mode(self, creds, options): + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + + if options.rsa_public_key is not None: + ed = blobxfer.models.crypto.EncryptionMetadata() + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + ase.populate_from_blob(sa, blob) diff --git a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py index c94c9c9..dbe8263 100644 --- a/blobxfer/operations/azure/blob/__init__.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -61,6 +61,35 @@ def check_if_single_blob(client, container, prefix, timeout=None): return True +def get_blob_properties(client, container, prefix, mode, timeout=None): + if mode == blobxfer.models.azure.StorageModes.File: + raise RuntimeError('cannot list Azure Files from blob client') + try: + blob = client.get_blob_properties( + container_name=container, blob_name=prefix, timeout=timeout) + except azure.common.AzureMissingResourceHttpError: + return None + if (mode == blobxfer.models.azure.StorageModes.Append and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.AppendBlob): + raise RuntimeError( + 'existing blob type {} mismatch with mode {}'.format( + blob.properties.blob_type, mode)) + elif (mode == blobxfer.models.azure.StorageModes.Block and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.BlockBlob): + raise RuntimeError( + 'existing blob type {} mismatch with mode {}'.format( + blob.properties.blob_type, mode)) + elif (mode == blobxfer.models.azure.StorageModes.Page and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.PageBlob): + raise RuntimeError( + 'existing blob type {} mismatch with mode {}'.format( + blob.properties.blob_type, mode)) + return blob + + def list_blobs(client, container, prefix, mode, recursive, timeout=None): # type: (azure.storage.blob.BaseBlobService, str, str, # blobxfer.models.azure.StorageModes, bool, int) -> diff --git a/blobxfer/operations/azure/file.py b/blobxfer/operations/azure/file.py index 43e89ca..7a4e076 100644 --- a/blobxfer/operations/azure/file.py +++ b/blobxfer/operations/azure/file.py @@ -89,31 +89,47 @@ def parse_file_path(filepath): return (dirname, fname) -def check_if_single_file(client, fileshare, prefix, timeout=None): +def get_file_properties(client, fileshare, prefix, timeout=None): # type: (azure.storage.file.FileService, str, str, int) -> - # Tuple[bool, azure.storage.file.models.File] - """Check if prefix is a single file or multiple files + # azure.storage.file.models.File + """Get file properties :param FileService client: blob client :param str fileshare: file share name :param str prefix: path prefix :param int timeout: timeout - :rtype: tuple - :return: (if prefix in fileshare is a single file, file) + :rtype: azure.storage.file.models.File + :return: file properties """ - file = None - if 
blobxfer.util.is_none_or_empty(prefix): - return (False, file) dirname, fname = parse_file_path(prefix) try: - file = client.get_file_properties( + return client.get_file_properties( share_name=fileshare, directory_name=dirname, file_name=fname, timeout=timeout, ) except azure.common.AzureMissingResourceHttpError: + return None + + +def check_if_single_file(client, fileshare, prefix, timeout=None): + # type: (azure.storage.file.FileService, str, str, int) -> + # Tuple[bool, azure.storage.file.models.File] + """Check if prefix is a single file or multiple files + :param FileService client: blob client + :param str fileshare: file share name + :param str prefix: path prefix + :param int timeout: timeout + :rtype: tuple + :return: (if prefix in fileshare is a single file, file) + """ + if blobxfer.util.is_none_or_empty(prefix): + return (False, None) + file = get_file_properties(client, fileshare, prefix, timeout) + if file is None: return (False, file) - return (True, file) + else: + return (True, file) def list_files(client, fileshare, prefix, recursive, timeout=None): @@ -178,3 +194,85 @@ def get_file_range(ase, offsets, timeout=None): validate_content=False, # HTTPS takes care of integrity during xfer timeout=timeout, ).content + + +def create_share(ase, containers_created, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Create file share + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict containers_created: containers already created map + :param int timeout: timeout + """ + key = ase.client.account_name + ':file=' + ase.container + if key not in containers_created: + ase.client.create_share( + share_name=ase.container, + fail_on_exist=False, + timeout=timeout) + containers_created.add(key) + logger.info('created file share {} on storage account {}'.format( + ase.container, ase.client.account_name)) + + +def create_all_parent_directories(ase, dirs_created, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Create all parent directories for a file + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict dirs_created: directories already created map + :param int timeout: timeout + """ + dirs = pathlib.Path(ase.name).parts + if len(dirs) <= 1: + return + dk = ase.client.account_name + ':' + ase.container + for i in range(0, len(dirs)): + dir = str(pathlib.Path(*(dirs[0:i + 1]))) + if dk not in dirs_created or dir not in dirs_created[dk]: + ase.client.create_directory( + share_name=ase.container, + directory_name=dir, + fail_on_exist=False, + timeout=timeout) + if dk not in dirs_created: + dirs_created[dk] = set() + dirs_created[dk].add(dir) + + +def create_file(ase, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, int) -> None + """Create file remotely + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + ase.client.create_file( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + content_length=ase.size, + content_settings=None, + timeout=timeout) + + +def put_file_range(ase, local_file, offsets, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, pathlib.path, + # blobxfer.models.upload.Offsets, int) -> None + """Puts a range of bytes into the remote file + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param pathlib.Path local_file: local file + :param blobxfer.models.upload.Offsets offsets: 
upload offsets + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + with local_file.open('rb') as fd: + fd.seek(offsets.range_start, 0) + data = fd.read(offsets.num_bytes) + ase.client.update_range( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + data=data, + start_range=offsets.range_start, + end_range=offsets.range_end, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index f494d81..3097b96 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -38,6 +38,7 @@ import queue except ImportError: # noqa import Queue as queue +import tempfile # non-stdlib imports import cryptography.hazmat.backends import cryptography.hazmat.primitives.asymmetric.padding @@ -78,7 +79,7 @@ def load_rsa_private_key_file(rsakeyfile, passphrase): return cryptography.hazmat.primitives.serialization.\ load_pem_private_key( keyfile.read(), - passphrase, + passphrase.encode('utf8') if passphrase is not None else None, backend=cryptography.hazmat.backends.default_backend() ) @@ -245,8 +246,18 @@ def _worker_process(self): except queue.Empty: continue if inst[0] == CryptoAction.Encrypt: - # TODO on upload - raise NotImplementedError() + local_file, offsets, symkey, iv = \ + inst[1], inst[2], inst[3], inst[4] + with open(local_file, 'rb') as fd: + data = fd.read() + encdata = blobxfer.operations.crypto.aes_cbc_encrypt_data( + symkey, iv, data, offsets.pad) + with tempfile.NamedTemporaryFile( + mode='wb', delete=False) as fd: + fpath = fd.name + fd.write(encdata) + self._done_cv.acquire() + self._done_queue.put(fpath) elif inst[0] == CryptoAction.Decrypt: final_path, local_path, offsets, symkey, iv, hmac_datafile = \ inst[1], inst[2], inst[3], inst[4], inst[5], inst[6] @@ -260,8 +271,9 @@ def _worker_process(self): with open(local_path, 'r+b') as fd: fd.seek(offsets.fd_start, 0) fd.write(data) - self._done_cv.acquire() - self._done_queue.put(final_path) + self._done_cv.acquire() + self._done_queue.put(final_path) + # notify and release condition var self._done_cv.notify() self._done_cv.release() @@ -282,3 +294,17 @@ def add_decrypt_chunk( (CryptoAction.Decrypt, final_path, local_path, offsets, symkey, iv, hmac_datafile) ) + + def add_encrypt_chunk(self, local_file, offsets, symkey, iv): + # type: (CryptoOffload, pathlib.Path, blobxfer.models.upload.Offsets, + # bytes, bytes) -> None + """Add a chunk to encrypt + :param CryptoOffload self: this + :param pathlib.Path local_file: local file + :param blobxfer.models.upload.Offsets offsets: offsets + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + """ + self._task_queue.put( + (CryptoAction.Encrypt, str(local_file), offsets, symkey, iv) + ) diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index f0e08ec..fa379fc 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -192,8 +192,8 @@ def _check_download_conditions(self, lpath, rfile): return DownloadAction.Download if not self._spec.options.overwrite: logger.info( - 'not overwriting local file: {} (remote: {}/{})'.format( - lpath, rfile.container, rfile.name)) + 'not overwriting local file: {} (remote: {})'.format( + lpath, rfile.path)) return DownloadAction.Skip # check skip on options, MD5 match takes priority if (self._spec.skip_on.md5_match and @@ -355,7 +355,7 @@ def _initialize_download_threads(self): def _wait_for_download_threads(self, terminate): # 
type: (Downloader, bool) -> None - """Terminate download threads + """Wait for download threads :param Downloader self: this :param bool terminate: terminate threads """ @@ -623,7 +623,7 @@ def start(self): :param Downloader self: this """ try: - blobxfer.operations.progress.output_download_parameters( + blobxfer.operations.progress.output_parameters( self._general_options, self._spec) self._run() except (KeyboardInterrupt, Exception) as ex: diff --git a/blobxfer/operations/md5.py b/blobxfer/operations/md5.py index dbd05fb..e04daec 100644 --- a/blobxfer/operations/md5.py +++ b/blobxfer/operations/md5.py @@ -120,6 +120,9 @@ def add_localfile_for_md5_check(self, filename, remote_md5, mode): :param str remote_md5: remote MD5 to compare against :param blobxfer.models.azure.StorageModes mode: mode """ + if blobxfer.util.is_none_or_empty(remote_md5): + raise ValueError('comparison MD5 is empty for file {}'.format( + filename)) if mode == blobxfer.models.azure.StorageModes.Page: pagealign = True else: diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index 99d7445..b0f3bf4 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -48,10 +48,10 @@ def update_progress_bar( go, optext, start, total_files, files_sofar, total_bytes, bytes_sofar): - # type: (blobxfer.options.General, str, datetime.datetime, int, int, int, - # int) -> None + # type: (blobxfer.models.options.General, str, datetime.datetime, int, + # int, int, int) -> None """Update the progress bar - :param blobxfer.options.General go: general options + :param blobxfer.models.options.General go: general options :param str optext: operation prefix text :param datetime.datetime start: start time :param int total_files: total number of files @@ -89,10 +89,11 @@ def update_progress_bar( sys.stdout.flush() -def output_download_parameters(general_options, spec): - # type: (Downloader) -> None - """Output configuration block - :param Downloader downloader: this +def output_parameters(general_options, spec): + # type: (blobxfer.models.options.General, object) -> None + """Output parameters + :param blobxfer.models.options.General general_options: general options + :param object spec: upload or download spec """ log = [] log.append('===========================') @@ -106,25 +107,37 @@ def output_download_parameters(general_options, spec): platform.python_version(), azure.storage._constants.__version__, requests.__version__)) - log.append(' transfer direction: {}'.format('local->Azure')) - log.append(' workers: xfer={} md5={} crypto={}'.format( - general_options.concurrency.transfer_threads, - general_options.concurrency.md5_processes - if spec.options.check_file_md5 else 0, - general_options.concurrency.crypto_processes)) - log.append(' timeout: {}'.format( - general_options.timeout_sec)) + # specific preamble + if isinstance(spec, blobxfer.models.download.Specification): + log.append(' transfer direction: {}'.format('Azure -> local')) + log.append(' workers: xfer={} md5={} crypto={}'.format( + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.options.check_file_md5 else 0, + general_options.concurrency.crypto_processes)) + elif isinstance(spec, blobxfer.models.upload.Specification): + log.append(' transfer direction: {}'.format('local -> Azure')) + log.append(' workers: xfer={} md5={} crypto={}'.format( + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.skip_on.md5_match or 
spec.options.store_file_properties.md5 + else 0, + general_options.concurrency.crypto_processes)) + + # TODO handle synccopy spec + + # common block log.append(' resume file: {}'.format( general_options.resume_file)) + log.append(' timeout: {}'.format( + general_options.timeout_sec)) + log.append(' mode: {}'.format( + spec.options.mode)) log.append(' skip on: fs_match={} lmt_ge={} md5={}'.format( spec.skip_on.filesize_match, spec.skip_on.lmt_ge, spec.skip_on.md5_match)) - log.append(' mode: {}'.format( - spec.options.mode)) - log.append(' compute file md5: {}'.format( - spec.options.check_file_md5)) - log.append(' chunk size (bytes): {}'.format( + log.append(' chunk size: {} bytes'.format( spec.options.chunk_size_bytes)) log.append(' delete extraneous: {}'.format( spec.options.delete_extraneous_destination)) @@ -132,14 +145,29 @@ def output_download_parameters(general_options, spec): spec.options.overwrite)) log.append(' recursive: {}'.format( spec.options.recursive)) + + # TODO only output rename single if not synccopy log.append(' rename single: {}'.format( spec.options.rename)) - log.append(' file attributes: {}'.format( - spec.options.restore_file_attributes)) - log.append(' rsa private key: {}'.format( - 'Loaded' if spec.options.rsa_private_key else 'None')) - log.append(' local destination: {}'.format( - spec.destination.path)) + + # specific epilog + if isinstance(spec, blobxfer.models.download.Specification): + log.append(' compute file md5: {}'.format( + spec.options.check_file_md5)) + log.append(' file attributes: {}'.format( + spec.options.restore_file_attributes)) + log.append(' rsa private key: {}'.format( + 'Loaded' if spec.options.rsa_private_key else 'None')) + log.append(' local destination: {}'.format( + spec.destination.path)) + elif isinstance(spec, blobxfer.models.upload.Specification): + log.append(' store properties: attr={} md5={}'.format( + spec.options.store_file_properties.attributes, + spec.options.store_file_properties.md5)) + log.append(' rsa public key: {}'.format( + 'Loaded' if spec.options.rsa_public_key else 'None')) + log.append(' local source paths: {}'.format( + ' '.join([str(src) for src in spec.sources.paths]))) log.append('===========================') log = os.linesep.join(log) if blobxfer.util.is_not_empty(general_options.log_file): diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py new file mode 100644 index 0000000..5f83b7b --- /dev/null +++ b/blobxfer/operations/upload.py @@ -0,0 +1,681 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import enum +import logging +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +try: + import queue +except ImportError: # noqa + import Queue as queue +import threading +# non-stdlib imports +# local imports +import blobxfer.models.crypto +import blobxfer.operations.azure.blob +import blobxfer.operations.azure.file +import blobxfer.operations.crypto +import blobxfer.operations.md5 +import blobxfer.operations.progress +import blobxfer.operations.resume +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +class UploadAction(enum.Enum): + Skip = 1 + CheckMd5 = 2 + Upload = 3 + + +class Uploader(object): + """Uploader""" + def __init__(self, general_options, creds, spec): + # type: (Uploader, blobxfer.models.options.General, + # blobxfer.operations.azure.StorageCredentials, + # blobxfer.models.upload.Specification) -> None + """Ctor for Uploader + :param Uploader self: this + :param blobxfer.models.options.General general_options: general opts + :param blobxfer.operations.azure.StorageCredentials creds: creds + :param blobxfer.models.uplaod.Specification spec: upload spec + """ + self._all_remote_files_processed = False + self._crypto_offload = None + self._md5_meta_lock = threading.Lock() + self._md5_map = {} + self._md5_offload = None + self._upload_lock = threading.Lock() + self._upload_queue = queue.Queue() + self._upload_set = set() + self._upload_start_time = None + self._upload_threads = [] + self._upload_total = None + self._upload_sofar = 0 + self._upload_bytes_total = None + self._upload_bytes_sofar = 0 + self._upload_terminate = False + self._start_time = None + self._delete_after = set() + self._ud_map = {} + self._containers_created = set() + self._fileshare_dir_lock = threading.Lock() + self._dirs_created = {} + self._general_options = general_options + self._creds = creds + self._spec = spec + self._resume = None + self._exceptions = [] + + @property + def termination_check(self): + # type: (Uploader) -> bool + """Check if terminated + :param Uploader self: this + :rtype: bool + :return: if terminated + """ + with self._upload_lock: + return (self._upload_terminate or + len(self._exceptions) > 0 or + (self._all_remote_files_processed and + len(self._upload_set) == 0)) + + @property + def termination_check_md5(self): + # type: (Uploader) -> bool + """Check if terminated from MD5 context + :param Uploader self: this + :rtype: bool + :return: if terminated from MD5 context + """ + with self._md5_meta_lock: + with self._upload_lock: + return (self._upload_terminate or + (self._all_remote_files_processed and + len(self._md5_map) == 0 and + len(self._upload_set) == 0)) + + def _update_progress_bar(self): + # type: (Uploader) -> None + """Update progress bar + :param Uploader self: this + """ + blobxfer.operations.progress.update_progress_bar( + self._general_options, + 'upload', + self._upload_start_time, + self._upload_total, + self._upload_sofar, + self._upload_bytes_total, + 
self._upload_bytes_sofar, + ) + + def _pre_md5_skip_on_check(self, src, rfile): + # type: (Uploader, blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity) -> None + """Perform pre MD5 skip on check + :param Uploader self: this + :param blobxfer.models.upload.LocalPath src: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + """ + # if encryption metadata is present, check for pre-encryption + # md5 in blobxfer extensions + md5 = None + if rfile.encryption_metadata is not None: + md5 = rfile.encryption_metadata.blobxfer_extensions.\ + pre_encrypted_content_md5 + if md5 is None: + md5 = rfile.md5 + slpath = str(src.absolute_path) + with self._md5_meta_lock: + self._md5_map[slpath] = (src, rfile) + self._md5_offload.add_localfile_for_md5_check(slpath, md5, rfile.mode) + + def _post_md5_skip_on_check(self, filename, md5_match): + # type: (Uploader, str, bool) -> None + """Perform post MD5 skip on check + :param Uploader self: this + :param str filename: local filename + :param bool md5_match: if MD5 matches + """ + uid = self._create_unique_id(src, rfile) + with self._md5_meta_lock: + src, rfile = self._md5_map.pop(filename) + if md5_match: + with self._upload_lock: + self._upload_set.remove(uid) + self._upload_total -= 1 + self._upload_bytes_total -= src.size + else: + self._add_to_upload_queue(src, rfile, uid) + + def _check_for_uploads_from_md5(self): + # type: (Uploader) -> None + """Check queue for a file to upload + :param Uploader self: this + """ + cv = self._md5_offload.done_cv + while not self.termination_check_md5: + result = None + cv.acquire() + while True: + result = self._md5_offload.pop_done_queue() + if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(1) + # check for terminating conditions + if self.termination_check_md5: + break + else: + break + cv.release() + if result is not None: + self._post_md5_skip_on_check(result[0], result[1]) + + def _check_for_crypto_done(self): + # type: (Uploader) -> None + """Check queue for crypto done + :param Uploader self: this + """ + cv = self._crypto_offload.done_cv + while not self.termination_check: + result = None + cv.acquire() + while True: + result = self._crypto_offload.pop_done_queue() + if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(1) + # check for terminating conditions + if self.termination_check: + break + else: + break + cv.release() + if result is not None: + try: + with self._upload_lock: + dd = self._ud_map[result] + dd.perform_chunked_integrity_check() + except KeyError: + # this can happen if all of the last integrity + # chunks are processed at once + pass + + def _add_to_upload_queue(self, src, rfile, uid): + # type: (Uploader, blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity, str) -> None + """Add remote file to download queue + :param Uploader self: this + :param blobxfer.models.upload.LocalPath src: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + :param str uid: unique id + """ + # prepare local file for upload + ud = blobxfer.models.upload.Descriptor( + src, rfile, uid, self._spec.options, self._resume) + if ud.entity.is_encrypted: + with self._upload_lock: + self._ud_map[uid] = ud + # add download descriptor to queue + self._upload_queue.put(ud) + if self._upload_start_time is None: + with self._upload_lock: + if self._upload_start_time is None: + self._upload_start_time = blobxfer.util.datetime_now() + + def 
_initialize_upload_threads(self): + # type: (Uploader) -> None + """Initialize upload threads + :param Uploader self: this + """ + logger.debug('spawning {} transfer threads'.format( + self._general_options.concurrency.transfer_threads)) + for _ in range(self._general_options.concurrency.transfer_threads): + thr = threading.Thread(target=self._worker_thread_upload) + self._upload_threads.append(thr) + thr.start() + + def _wait_for_upload_threads(self, terminate): + # type: (Uploader, bool) -> None + """Wait for upload threads + :param Uploader self: this + :param bool terminate: terminate threads + """ + if terminate: + self._upload_terminate = terminate + for thr in self._upload_threads: + thr.join() + + def _worker_thread_upload(self): + # type: (Uploader) -> None + """Worker thread upload + :param Uploader self: this + """ + while not self.termination_check: + try: + ud = self._upload_queue.get(False, 0.25) + except queue.Empty: + continue + try: + self._process_upload_descriptor(ud) + except Exception as e: + with self._upload_lock: + self._exceptions.append(e) + + def _put_data(self, ud, offsets): + if ud.entity.mode == blobxfer.models.azure.StorageModes.File: + if offsets.chunk_num == 0: + # create container if necessary + blobxfer.operations.azure.file.create_share( + ud.entity, self._containers_created, + self._general_options.timeout_sec) + # create parent directories + with self._fileshare_dir_lock: + blobxfer.operations.azure.file.\ + create_all_parent_directories( + ud.entity, self._dirs_created, + self._general_options.timeout_sec) + # create remote file + blobxfer.operations.azure.file.create_file( + ud.entity, self._general_options.timeout_sec) + # upload chunk + blobxfer.operations.azure.file.put_file_range( + ud.entity, ud.local_path.absolute_path, offsets, + self._general_options.timeout_sec) + + else: + # TODO all upload types + data = blobxfer.operations.azure.blob.get_blob_range( + dd.entity, offsets, self._general_options.timeout_sec) + + def _process_upload_descriptor(self, ud): + # type: (Uploader, blobxfer.models.upload.Descriptor) -> None + """Process upload descriptor + :param Uploader self: this + :param blobxfer.models.upload.Descriptor: upload descriptor + """ + # update progress bar + self._update_progress_bar() + # get download offsets + offsets, resume_bytes = ud.next_offsets() + # add resume bytes to counter + if resume_bytes is not None: + with self._upload_lock: + self._upload_bytes_sofar += resume_bytes + logger.debug('adding {} sofar {} from {}'.format( + resume_bytes, self._upload_bytes_sofar, ud._ase.name)) + del resume_bytes + print(offsets) + # check if all operations completed + if offsets is None and ud.all_operations_completed: + # finalize file + ud.finalize_file() + # accounting + with self._upload_lock: + if ud.entity.is_encrypted: + self._ud_map.pop(ud.unique_id) + self._upload_set.remove(ud.unique_id) + self._upload_sofar += 1 + return + # re-enqueue for other threads to download + self._upload_queue.put(ud) + if offsets is None: + return + # encrypt if necessary + if ud.entity.is_encrypted: + # send iv through hmac + ud.hmac_iv(ud.current_iv) + # encrypt data + if self._crypto_offload is not None: + self._crypto_offload.add_encrypt_chunk( + str(ud.local_path.absolute_path), offsets, + ud.entity.encryption_metadata.symmetric_key, + ud.current_iv) + # encrypted data will be retrieved from a temp file once + # retrieved from crypto queue + return + else: + # TODO pickup here, read data from file + + encdata = 
blobxfer.operations.crypto.aes_cbc_decrypt_data( + ud.entity.encryption_metadata.symmetric_key, + ud.current_iv, data, offsets.pad) + # send encrypted data through hmac + + # TODO send data as optional param if encrypted + # issue put range + self._put_data(ud, offsets) + # accounting + with self._upload_lock: + self._upload_bytes_sofar += offsets.num_bytes + + def _cleanup_temporary_files(self): + # type: (Uploader) -> None + """Cleanup temporary files in case of an exception or interrupt. + This function is not thread-safe. + :param Uploader self: this + """ + # iterate through dd map and cleanup files + for key in self._ud_map: + dd = self._ud_map[key] + try: + dd.cleanup_all_temporary_files() + except Exception as e: + logger.exception(e) + + def _delete_extraneous_files(self): + # type: (Uploader) -> None + """Delete extraneous files cataloged + :param Uploader self: this + """ + logger.info('attempting to delete {} extraneous files'.format( + len(self._delete_after))) + for file in self._delete_after: + try: + file.unlink() + except OSError: + pass + + def _check_upload_conditions(self, lpath, rfile): + # type: (Uploader, pathlib.Path, + # blobxfer.models.azure.StorageEntity) -> UploadAction + """Check for upload conditions + :param Uploader self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + :rtype: UploadAction + :return: upload action + """ + # check if file still exists + if not lpath.exists(): + return UploadAction.Skip + # if remote file doesn't exist, upload + if rfile is None: + return UploadAction.Upload + # check overwrite option + if not self._spec.options.overwrite: + logger.info( + 'not overwriting remote file: {} (local: {})'.format( + rfile.path, lpath)) + return UploadAction.Skip + # check skip on options, MD5 match takes priority + if (self._spec.skip_on.md5_match and + blobxfer.util.is_not_empty(rfile.md5)): + return UploadAction.CheckMd5 + # if neither of the remaining skip on actions are activated, upload + if (not self._spec.skip_on.filesize_match and + not self._spec.skip_on.lmt_ge): + return UploadAction.Upload + # check skip on file size match + ul_fs = None + if self._spec.skip_on.filesize_match: + lsize = lpath.stat().st_size + if rfile.mode == blobxfer.models.azure.StorageModes.Page: + lsize = blobxfer.util.page_align_content_length(lsize) + if rfile.size == lsize: + ul_fs = False + else: + ul_fs = True + # check skip on lmt ge + ul_lmt = None + if self._spec.skip_on.lmt_ge: + mtime = blobxfer.util.datetime_from_timestamp( + lpath.stat().st_mtime) + if rfile.lmt >= mtime: + ul_lmt = False + else: + ul_lmt = True + # upload if either skip on mismatch is True + if ul_fs or ul_lmt: + return UploadAction.Upload + else: + return UploadAction.Skip + + def _generate_entity_for_source(self, local_path): + # type: (Uploader, blobxfer.models.upload.LocalSourcePath) -> ??? + """Generate entities for source path + :param Uploader self: this + :param blobxfer.models.upload.LocalSourcePath local_path: local path + """ + # construct stripped destination path + spath = local_path.relative_path + if self._spec.options.strip_components > 0: + _rparts = local_path.relative_path.parts + _strip = min( + (len(_rparts) - 1, self._spec.options.strip_components) + ) + if _strip > 0: + spath = pathlib.Path(*_rparts[_strip:]) + # for each destination: + # 1. prepend non-container path + # 2. bind client from mode + # 3. 
perform get blob or file properties + for dst in self._spec.destinations: + for dpath in dst.paths: + sdpath = str(dpath) + cont, dir = blobxfer.util.explode_azure_path(sdpath) + # apply rename + if self._spec.options.rename: + name = dir + else: + name = str(spath / dir) + if blobxfer.util.is_none_or_empty(name): + raise ValueError( + 'must specify a container for destination: {}'.format( + dpath)) + # apply strip components + print(cont, name) + sa = self._creds.get_storage_account( + dst.lookup_storage_account(sdpath)) + if (self._spec.options.mode == + blobxfer.models.azure.StorageModes.File): + fp = blobxfer.operations.azure.file.get_file_properties( + sa.file_client, cont, name, + timeout=self._general_options.timeout_sec) + else: + fp = blobxfer.operations.azure.blob.get_blob_properties( + sa.block_blob_client, cont, name, + self._spec.options.mode, + timeout=self._general_options.timeout_sec) + if fp is not None: + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(fp.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json(fp.metadata, fp.name, None) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + if (self._spec.options.mode == + blobxfer.models.azure.StorageModes.File): + ase.populate_from_file(sa, fp) + else: + ase.populate_from_blob(sa, fp) + else: + ase = None + lpath = local_path.parent_path / local_path.relative_path + action = self._check_upload_conditions(lpath, ase) + if ase is None: + if self._spec.options.rsa_public_key: + ed = blobxfer.models.crypto.EncryptionMetadata() + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + ase.populate_from_local( + sa, cont, name, self._spec.options.mode) + yield action, ase + + def _create_unique_id(self, src, ase): + return ';'.join( + (str(src.absolute_path), ase._client.account_name, ase.path) + ) + + def _run(self): + # type: (Uploader) -> None + """Execute Uploader + :param Uploader self: this + """ + # mark start + self._start_time = blobxfer.util.datetime_now() + logger.info('blobxfer start time: {0}'.format(self._start_time)) + # initialize resume db if specified +# if self._general_options.resume_file is not None: +# self._resume = blobxfer.operations.resume.DownloadResumeManager( +# self._general_options.resume_file) + # initialize MD5 processes + if ((self._spec.options.store_file_properties.md5 or + self._spec.skip_on.md5_match) and + self._general_options.concurrency.md5_processes > 0): + self._md5_offload = blobxfer.operations.md5.LocalFileMd5Offload( + num_workers=self._general_options.concurrency.md5_processes) + self._md5_offload.initialize_check_thread( + self._check_for_uploads_from_md5) + # initialize crypto processes + if self._general_options.concurrency.crypto_processes > 0: + self._crypto_offload = blobxfer.operations.crypto.CryptoOffload( + num_workers=self._general_options.concurrency.crypto_processes) + self._crypto_offload.initialize_check_thread( + self._check_for_crypto_done) + # initialize upload threads + self._initialize_upload_threads() + # initialize local counters + nfiles = 0 + total_size = 0 + skipped_files = 0 + skipped_size = 0 + if not self._spec.sources.can_rename() and self._spec.options.rename: + raise RuntimeError( + 'cannot rename to specified destination with multiple sources') + # iterate through source paths to upload + for sfile in self._spec.sources.files(): + # create associated storage entity (destination) for file + for action, ase in self._generate_entity_for_source(sfile): + 
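The action yielded for each source/destination pair follows the precedence implemented in _check_upload_conditions; a minimal illustrative sketch of that decision (SkipOn flags passed as plain booleans, the remote entity reduced to a namedtuple, and page-blob size alignment omitted) is:

import collections

Remote = collections.namedtuple('Remote', ['md5', 'size', 'lmt'])

def decide_action(local_size, local_mtime, remote, overwrite,
                  skip_md5, skip_size, skip_lmt):
    if remote is None:
        return 'Upload'
    if not overwrite:
        return 'Skip'
    if skip_md5 and remote.md5:
        return 'CheckMd5'  # defer the decision to the MD5 offload workers
    if not skip_size and not skip_lmt:
        return 'Upload'
    size_differs = skip_size and remote.size != local_size
    local_newer = skip_lmt and local_mtime > remote.lmt
    return 'Upload' if (size_differs or local_newer) else 'Skip'

# an unchanged file with a stored remote MD5 is routed to the MD5 check queue:
assert decide_action(1024, 0, Remote('abc', 1024, 1), True,
                     skip_md5=True, skip_size=True, skip_lmt=False) == 'CheckMd5'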
print(sfile.parent_path, sfile.relative_path, sfile.absolute_path, action, ase.container, ase.name) + print(sfile.size, sfile.mode, sfile.uid, sfile.gid) + print(self._create_unique_id(sfile, ase)) + if action == UploadAction.Skip: + skipped_files += 1 + skipped_size += ase.size if ase.size is not None else 0 + continue + # add to potential upload set + uid = self._create_unique_id(sfile, ase) + with self._upload_lock: + self._upload_set.add(uid) + if action == UploadAction.CheckMd5: + self._pre_md5_skip_on_check(sfile, ase) + elif action == UploadAction.Upload: + self._add_to_upload_queue(sfile, ase, uid) + + nfiles += 1 + total_size += sfile.size + + self._upload_total = nfiles - skipped_files + self._upload_bytes_total = total_size - skipped_size + upload_size_mib = self._upload_bytes_total / blobxfer.util.MEGABYTE + # set remote files processed + with self._md5_meta_lock: + self._all_remote_files_processed = True + logger.debug( + ('{0} remote files processed, waiting for upload completion ' + 'of {1:.4f} MiB').format(nfiles, upload_size_mib)) + del nfiles + del total_size + del skipped_files + del skipped_size + # wait for downloads to complete + self._wait_for_upload_threads(terminate=False) + end_time = blobxfer.util.datetime_now() + # update progress bar + self._update_progress_bar() + # check for exceptions + if len(self._exceptions) > 0: + logger.error('exceptions encountered while downloading') + # raise the first one + raise self._exceptions[0] + # check for mismatches + if (self._upload_sofar != self._upload_total or + self._upload_bytes_sofar != self._upload_bytes_total): + raise RuntimeError( + 'download mismatch: [count={}/{} bytes={}/{}]'.format( + self._upload_sofar, self._upload_total, + self._upload_bytes_sofar, self._upload_bytes_total)) + # delete all remaining local files not accounted for if + # delete extraneous enabled + self._delete_extraneous_files() + # delete resume file if we've gotten this far + if self._resume is not None: + self._resume.delete() + # output throughput + if self._upload_start_time is not None: + dltime = (end_time - self._upload_start_time).total_seconds() + logger.info( + ('elapsed download + verify time and throughput: {0:.3f} sec, ' + '{1:.4f} Mbps').format( + dltime, download_size_mib * 8 / dltime)) + end_time = blobxfer.util.datetime_now() + logger.info('blobxfer end time: {0} (elapsed: {1:.3f} sec)'.format( + end_time, (end_time - self._start_time).total_seconds())) + + def start(self): + # type: (Uploader) -> None + """Start the Uploader + :param Uploader self: this + """ + try: + blobxfer.operations.progress.output_parameters( + self._general_options, self._spec) + self._run() + except (KeyboardInterrupt, Exception) as ex: + if isinstance(ex, KeyboardInterrupt): + logger.info( + 'KeyboardInterrupt detected, force terminating ' + 'processes and threads (this may take a while)...') + try: + self._wait_for_upload_threads(terminate=True) + finally: + self._cleanup_temporary_files() + raise + finally: + # shutdown processes + if self._md5_offload is not None: + self._md5_offload.finalize_processes() + if self._crypto_offload is not None: + self._crypto_offload.finalize_processes() + # close resume file + if self._resume is not None: + self._resume.close() diff --git a/cli/cli.py b/cli/cli.py index d27efc8..fdb9c0f 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -368,6 +368,7 @@ def callback(ctx, param, value): return click.option( '--file-attributes', expose_value=False, + default=False, is_flag=False, help='Store or restore file attributes 
[False]', callback=callback)(f) @@ -412,6 +413,20 @@ def callback(ctx, param, value): callback=callback)(f) +def _multi_storage_account_distribution_mode(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options[ + 'multi_storage_account_distribution_mode'] = value.lower() + return value + return click.option( + '--multi-storage-account-distribution-mode', + expose_value=False, + default='disabled', + help='Multiple storage account distribution mode [stripe]', + callback=callback)(f) + + def _overwrite_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -438,6 +453,20 @@ def callback(ctx, param, value): callback=callback)(f) +def _rename_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rename'] = value + return value + return click.option( + '--rename', + expose_value=False, + is_flag=True, + default=False, + help='Rename a single file upload or download [False]', + callback=callback)(f) + + def _rsa_private_key_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -572,7 +601,22 @@ def callback(ctx, param, value): callback=callback)(f) +def _stripe_chunk_size_bytes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['stripe_chunk_size_bytes'] = value + return value + return click.option( + '--stripe-chunk-size-bytes', + expose_value=False, + type=int, + default=1073741824, + help='Stripe width in bytes [1073741824]', + callback=callback)(f) + + def upload_options(f): + f = _stripe_chunk_size_bytes_option(f) f = _strip_components_option(f) f = _skip_on_md5_match_option(f) f = _skip_on_lmt_ge_option(f) @@ -581,8 +625,10 @@ def upload_options(f): f = _rsa_public_key_option(f) f = _rsa_private_key_passphrase_option(f) f = _rsa_private_key_option(f) + f = _rename_option(f) f = _recursive_option(f) f = _overwrite_option(f) + f = _multi_storage_account_distribution_mode(f) f = _mode_option(f) f = _include_option(f) f = _file_md5_option(f) @@ -602,6 +648,7 @@ def download_options(f): f = _sas_option(f) f = _rsa_private_key_passphrase_option(f) f = _rsa_private_key_option(f) + f = _rename_option(f) f = _recursive_option(f) f = _overwrite_option(f) f = _mode_option(f) @@ -702,7 +749,11 @@ def upload(ctx, local_resource, storage_account, remote_path): ctx.cli_options, settings.TransferAction.Upload, local_resource, storage_account, remote_path) ctx.initialize() - blobxfer.api.upload_block() + specs = settings.create_upload_specifications(ctx.config) + for spec in specs: + blobxfer.api.Uploader( + ctx.general_options, ctx.credentials, spec + ).start() @cli.group() diff --git a/cli/settings.py b/cli/settings.py index 088a4f4..6dc4f72 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -36,6 +36,7 @@ import blobxfer.models.azure import blobxfer.models.download import blobxfer.models.options +import blobxfer.models.upload import blobxfer.operations.azure import blobxfer.operations.crypto import blobxfer.util @@ -97,6 +98,7 @@ def add_cli_options( 'mode': cli_options['mode'], 'overwrite': cli_options['overwrite'], 'recursive': cli_options['recursive'], + 'rename': cli_options['rename'], 'rsa_private_key': cli_options['rsa_private_key'], 'rsa_private_key_passphrase': cli_options[ 'rsa_private_key_passphrase'], @@ -107,9 +109,17 @@ def add_cli_options( 'lmt_ge': cli_options['skip_on_lmt_ge'], 'md5_match': cli_options['skip_on_md5_match'], }, - 'store_file_attributes': 
cli_options['file_attributes'], - 'store_file_md5': cli_options['file_md5'], + 'store_file_properties': { + 'attributes': cli_options['file_attributes'], + 'md5': cli_options['file_md5'], + }, 'strip_components': cli_options['strip_components'], + 'vectored_io': { + 'stripe_chunk_size_bytes': cli_options[ + 'stripe_chunk_size_bytes'], + 'multi_storage_account_distribution_mode': cli_options[ + 'multi_storage_account_distribution_mode'], + }, }, } elif action == TransferAction.Download: @@ -125,6 +135,7 @@ def add_cli_options( 'mode': cli_options['mode'], 'overwrite': cli_options['overwrite'], 'recursive': cli_options['recursive'], + 'rename': cli_options['rename'], 'rsa_private_key': cli_options['rsa_private_key'], 'rsa_private_key_passphrase': cli_options[ 'rsa_private_key_passphrase'], @@ -287,11 +298,6 @@ def create_download_specifications(config): rpk, rpkp) else: rpk = None - # ensure compatible options - if (not conf['options']['check_file_md5'] and - conf['options']['skip_on']['md5_match']): - raise ValueError( - 'Cannot specify skip on MD5 match without file MD5 enabled') # create specification ds = blobxfer.models.download.Specification( download_options=blobxfer.models.options.Download( @@ -302,6 +308,7 @@ def create_download_specifications(config): mode=mode, overwrite=conf['options']['overwrite'], recursive=conf['options']['recursive'], + rename=conf['options']['rename'], restore_file_attributes=conf[ 'options']['restore_file_attributes'], rsa_private_key=rpk, @@ -329,5 +336,98 @@ def create_download_specifications(config): if blobxfer.util.is_not_empty(conf['exclude']): asp.add_excludes(conf['exclude']) ds.add_azure_source_path(asp) + # append spec to list specs.append(ds) return specs + + +def create_upload_specifications(config): + # type: (dict) -> List[blobxfer.models.upload.Specification] + """Create a list of Upload Specification objects from configuration + :param dict config: config dict + :rtype: list + :return: list of Upload Specification objects + """ + specs = [] + for conf in config['upload']: + # create upload options + confmode = conf['options']['mode'].lower() + if confmode == 'auto': + mode = blobxfer.models.azure.StorageModes.Auto + elif confmode == 'append': + mode = blobxfer.models.azure.StorageModes.Append + elif confmode == 'block': + mode = blobxfer.models.azure.StorageModes.Block + elif confmode == 'file': + mode = blobxfer.models.azure.StorageModes.File + elif confmode == 'page': + mode = blobxfer.models.azure.StorageModes.Page + else: + raise ValueError('unknown mode: {}'.format(confmode)) + # load RSA public key PEM if specified + rpk = conf['options']['rsa_public_key'] + if blobxfer.util.is_not_empty(rpk): + rpk = blobxfer.operations.crypto.load_rsa_public_key_file(rpk) + if rpk is None: + # load RSA private key PEM file if specified + rpk = conf['options']['rsa_private_key'] + if blobxfer.util.is_not_empty(rpk): + rpkp = conf['options']['rsa_private_key_passphrase'] + rpk = blobxfer.operations.crypto.load_rsa_private_key_file( + rpk, rpkp) + rpk = rpk.public_key() + else: + rpk = None + # create local source paths + lsp = blobxfer.models.upload.LocalSourcePath() + lsp.add_paths(conf['source']) + if blobxfer.util.is_not_empty(conf['include']): + lsp.add_includes(conf['include']) + if blobxfer.util.is_not_empty(conf['exclude']): + lsp.add_excludes(conf['exclude']) + # create specification + us = blobxfer.models.upload.Specification( + upload_options=blobxfer.models.options.Upload( + chunk_size_bytes=conf['options']['chunk_size_bytes'], + 
delete_extraneous_destination=conf[ + 'options']['delete_extraneous_destination'], + mode=mode, + overwrite=conf['options']['overwrite'], + recursive=conf['options']['recursive'], + rename=conf['options']['rename'], + rsa_public_key=rpk, + store_file_properties=blobxfer.models.options.FileProperties( + attributes=conf[ + 'options']['store_file_properties']['attributes'], + md5=conf['options']['store_file_properties']['md5'], + ), + strip_components=conf['options']['strip_components'], + vectored_io=blobxfer.models.options.VectoredIo( + stripe_chunk_size_bytes=conf[ + 'options']['vectored_io']['stripe_chunk_size_bytes'], + multi_storage_account_distribution_mode=blobxfer. + models.upload.VectoredIoDistributionMode( + conf['options']['vectored_io'][ + 'multi_storage_account_distribution_mode'].lower( + )), + ), + ), + skip_on_options=blobxfer.models.options.SkipOn( + filesize_match=conf['options']['skip_on']['filesize_match'], + lmt_ge=conf['options']['skip_on']['lmt_ge'], + md5_match=conf['options']['skip_on']['md5_match'], + ), + local_source_path=lsp, + ) + # create remote destination paths + for dst in conf['destination']: + if len(dst) != 1: + raise RuntimeError( + 'invalid number of destination pairs specified per entry') + sa = next(iter(dst)) + adp = blobxfer.operations.azure.DestinationPath() + adp.add_path_with_storage_account(dst[sa], sa) + us.add_azure_destination_path(adp) + # append spec to list + specs.append(us) + return specs From 52c504ad6c53b2e13133be320963d49fe1d678c7 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 18 May 2017 08:18:14 -0700 Subject: [PATCH 26/47] Fix filter check on download - Retry interval shortened - Allow multiple include/exclude filters to be specified on the CLI --- blobxfer/models/__init__.py | 8 ++++---- blobxfer/models/upload.py | 11 +++++------ blobxfer/operations/azure/__init__.py | 18 ++++-------------- blobxfer/retry.py | 4 ++-- cli/cli.py | 2 ++ 5 files changed, 17 insertions(+), 26 deletions(-) diff --git a/blobxfer/models/__init__.py b/blobxfer/models/__init__.py index 82ac224..b9e9fbc 100644 --- a/blobxfer/models/__init__.py +++ b/blobxfer/models/__init__.py @@ -67,7 +67,7 @@ def add_include(self, incl): :param str incl: include filter """ if self._include is None: - self._include = [incl] + self._include = list(incl) else: self._include.append(incl) @@ -78,7 +78,7 @@ def add_includes(self, includes): :param list includes: list of includes """ if not isinstance(includes, list): - raise ValueError('includes is not of type list') + includes = list(includes) if self._include is None: self._include = includes else: @@ -91,7 +91,7 @@ def add_exclude(self, excl): :param str excl: exclude filter """ if self._exclude is None: - self._exclude = [excl] + self._exclude = list(excl) else: self._exclude.append(excl) @@ -102,7 +102,7 @@ def add_excludes(self, excludes): :param list excludes: list of excludes """ if not isinstance(excludes, list): - raise ValueError('excludes is not of type list') + excludes = list(excludes) if self._exclude is None: self._exclude = excludes else: diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index cbe1816..8d8a09d 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -123,17 +123,16 @@ def files(self): # check if path is a single file tmp = pathlib.Path(_ppath) if tmp.is_file(): - yield LocalPath( - parent_path=tmp.parent, - relative_path=pathlib.Path(tmp.name) - ) + if self._inclusion_check(tmp.name): + yield LocalPath( + parent_path=tmp.parent, + 
relative_path=pathlib.Path(tmp.name) + ) continue del tmp for entry in blobxfer.util.scantree(_ppath): _rpath = pathlib.Path(entry.path).relative_to(_ppath) if not self._inclusion_check(_rpath): - logger.debug( - 'skipping file {} due to filters'.format(_rpath)) continue yield LocalPath(parent_path=_expath, relative_path=_rpath) diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index 8581c9a..8664ac8 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -261,6 +261,8 @@ def _populate_from_list_files(self, creds, options, general_options): for file in blobxfer.operations.azure.file.list_files( sa.file_client, cont, dir, options.recursive, general_options.timeout_sec): + if not self._inclusion_check(file.name): + continue if blobxfer.models.crypto.EncryptionMetadata.\ encryption_metadata_exists(file.metadata): ed = blobxfer.models.crypto.EncryptionMetadata() @@ -291,6 +293,8 @@ def _populate_from_list_blobs(self, creds, options, general_options): for blob in blobxfer.operations.azure.blob.list_blobs( sa.block_blob_client, cont, dir, options.mode, options.recursive, general_options.timeout_sec): + if not self._inclusion_check(blob.name): + continue if blobxfer.models.crypto.EncryptionMetadata.\ encryption_metadata_exists(blob.metadata): ed = blobxfer.models.crypto.EncryptionMetadata() @@ -336,17 +340,3 @@ def lookup_storage_account(self, remote_path): :return: storage account associated with path """ return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] - - # TODO IS THIS NEEDED? - def generate_entities_for_mode(self, creds, options): - for _path in self._paths: - rpath = str(_path) - cont, dir = blobxfer.util.explode_azure_path(rpath) - sa = creds.get_storage_account(self.lookup_storage_account(rpath)) - - if options.rsa_public_key is not None: - ed = blobxfer.models.crypto.EncryptionMetadata() - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_blob(sa, blob) diff --git a/blobxfer/retry.py b/blobxfer/retry.py index ce42bd4..892b25c 100644 --- a/blobxfer/retry.py +++ b/blobxfer/retry.py @@ -37,7 +37,7 @@ class ExponentialRetryWithMaxWait(azure.storage.retry._Retry): """Exponential Retry with Max Wait (infinite retries)""" - def __init__(self, initial_backoff=1, max_backoff=8, reset_at_max=True): + def __init__(self, initial_backoff=0.1, max_backoff=2, reset_at_max=True): # type: (ExponentialRetryWithMaxWait, int, int, bool) -> None """Ctor for ExponentialRetryWithMaxWait :param ExponentialRetryWithMaxWait self: this @@ -78,7 +78,7 @@ def _backoff(self, context): if context.count == 1: backoff = self.initial_backoff else: - backoff = self.initial_backoff << (context.count - 1) + backoff = self.initial_backoff * (context.count - 1) if backoff > self.max_backoff and self.reset_at_max: backoff = self.initial_backoff context.count = 1 diff --git a/cli/cli.py b/cli/cli.py index fdb9c0f..f7a5f53 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -356,6 +356,7 @@ def callback(ctx, param, value): '--exclude', expose_value=False, default=None, + multiple=True, help='Exclude pattern', callback=callback)(f) @@ -396,6 +397,7 @@ def callback(ctx, param, value): '--include', expose_value=False, default=None, + multiple=True, help='Include pattern', callback=callback)(f) From 28b9bc9df57efc0ebb72d5d23795060b00e6c89b Mon Sep 17 00:00:00 2001 From: Fred Park Date: Fri, 19 May 2017 15:01:49 -0700 Subject: [PATCH 27/47] Continue upload work - Add one-shot parameter - Begin 
accommodating vectorized output --- blobxfer/models/azure.py | 1 + blobxfer/models/options.py | 3 +- blobxfer/models/upload.py | 83 +++++++++++----- blobxfer/operations/upload.py | 182 +++++++++++++++++++++++++--------- cli/cli.py | 59 +++++++---- cli/settings.py | 10 +- 6 files changed, 239 insertions(+), 99 deletions(-) diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index fbc319f..b2b34b8 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -66,6 +66,7 @@ def __init__(self, container, ed=None): self._md5 = None self._encryption = ed self._vio = None + self.replica_targets = None @property def client(self): diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index 9a3af9f..14e4e09 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -48,7 +48,7 @@ VectoredIo = collections.namedtuple( 'VectoredIoOptions', [ 'stripe_chunk_size_bytes', - 'multi_storage_account_distribution_mode', + 'distribution_mode', ] ) SkipOn = collections.namedtuple( @@ -69,6 +69,7 @@ 'chunk_size_bytes', 'delete_extraneous_destination', 'mode', + 'one_shot_bytes', 'overwrite', 'recursive', 'rename', diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index 8d8a09d..34584dd 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -49,8 +49,9 @@ # create logger logger = logging.getLogger(__name__) # global defines -_MAX_BLOCK_CHUNKSIZE_BYTES = 268435456 -_MAX_NONBLOCK_CHUNKSIZE_BYTES = 4194304 +_MAX_BLOCK_BLOB_ONESHOT_BYTES = 268435456 +_MAX_BLOCK_BLOB_CHUNKSIZE_BYTES = 268435456 +_MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304 # named tuples @@ -65,6 +66,13 @@ 'pad', ] ) +LocalPathView = collections.namedtuple( + 'LocalPathView', [ + 'fd_end', + 'fd_start', + 'slice_num', + ] +) class VectoredIoDistributionMode(enum.Enum): @@ -77,11 +85,19 @@ def __str__(self): class LocalPath(object): - def __init__(self, parent_path, relative_path): + def __init__(self, parent_path, relative_path, view=None): self.parent_path = parent_path self.relative_path = relative_path # populate properties self._stat = self.absolute_path.stat() + if view is None: + self.view = LocalPathView( + fd_start=0, + fd_end=self.size, + slice_num=0, + ) + else: + self.view = view @property def absolute_path(self): @@ -91,6 +107,10 @@ def absolute_path(self): def size(self): return self._stat.st_size + @property + def lmt(self): + return self._stat.st_mtime + @property def mode(self): return str(oct(self._stat.st_mode)) @@ -164,21 +184,22 @@ def __init__( if self.sources.paths[0].is_dir(): raise ValueError( 'cannot rename a directory of files to upload') - if (self.options.rsa_public_key and - self.options.vectored_io. 
- multi_storage_account_distribution_mode == - VectoredIoDistributionMode.Stripe): - raise ValueError( - 'cannot enable encryption and multi-storage account ' - 'vectored IO in stripe mode') if self.options.chunk_size_bytes <= 0: raise ValueError('chunk size must be positive') - if self.options.chunk_size_bytes > _MAX_BLOCK_CHUNKSIZE_BYTES: + if self.options.chunk_size_bytes > _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: raise ValueError( ('chunk size value of {} exceeds maximum allowable ' 'of {}').format( self.options.chunk_size_bytes, - _MAX_BLOCK_CHUNKSIZE_BYTES)) + _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES)) + if self.options.one_shot_bytes < 0: + raise ValueError('one shot bytes value must be at least 0') + if self.options.one_shot_bytes > _MAX_BLOCK_BLOB_ONESHOT_BYTES: + raise ValueError( + ('one shot bytes value of {} exceeds maximum allowable ' + 'of {}').format( + self.options.chunk_size_bytes, + _MAX_BLOCK_BLOB_ONESHOT_BYTES)) def add_azure_destination_path(self, dest): # type: (Specification, @@ -267,13 +288,24 @@ def all_operations_completed(self): @property def is_resumable(self): # type: (Descriptor) -> bool - """Download is resume capable + """Upload is resume capable :param Descriptor self: this :rtype: bool :return: if resumable """ return self._resume_mgr is not None and self.hmac is None + @property + def one_shot(self): + # type: (Descriptor) -> bool + """Upload is a one-shot block upload + :param Descriptor self: this + :rtype: bool + :return: is one-shot capable + """ + return (self._ase.mode == blobxfer.models.azure.StorageModes.Block and + self._total_chunks == 1) + def hmac_iv(self, iv): # type: (Descriptor, bytes) -> None """Send IV through hasher @@ -327,26 +359,29 @@ def _adjust_chunk_size(self, options): self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) # ensure chunk sizes are compatible with mode if self._ase.mode == blobxfer.models.azure.StorageModes.Append: - if self._chunk_size > _MAX_NONBLOCK_CHUNKSIZE_BYTES: - self._chunk_size = _MAX_NONBLOCK_CHUNKSIZE_BYTES + if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES logger.debug( 'adjusting chunk size to {} for append blobs'.format( self._chunk_size)) elif self._ase.mode == blobxfer.models.azure.StorageModes.Block: - if self._chunk_size > _MAX_BLOCK_CHUNKSIZE_BYTES: - self._chunk_size = _MAX_BLOCK_CHUNKSIZE_BYTES - logger.debug( - 'adjusting chunk size to {} for block blobs'.format( - self._chunk_size)) + if self._ase.size <= options.one_shot_bytes: + self._chunk_size = options.one_shot_bytes + else: + if self._chunk_size > _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES + logger.debug( + 'adjusting chunk size to {} for block blobs'.format( + self._chunk_size)) elif self._ase.mode == blobxfer.models.azure.StorageModes.File: - if self._chunk_size > _MAX_NONBLOCK_CHUNKSIZE_BYTES: - self._chunk_size = _MAX_NONBLOCK_CHUNKSIZE_BYTES + if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES logger.debug( 'adjusting chunk size to {} for files'.format( self._chunk_size)) elif self._ase.mode == blobxfer.models.azure.StorageModes.Page: - if self._chunk_size > _MAX_NONBLOCK_CHUNKSIZE_BYTES: - self._chunk_size = _MAX_NONBLOCK_CHUNKSIZE_BYTES + if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES logger.debug( 'adjusting chunk size to {} for page blobs'.format( self._chunk_size)) diff --git 
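The chunk size selection above can be condensed as follows; this is a simplified sketch that uses string mode names instead of StorageModes, with the byte limits taken from the constants defined in blobxfer/models/upload.py:

_MAX_BLOCK_BLOB_CHUNKSIZE_BYTES = 268435456    # 256 MiB
_MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304   # 4 MiB

def adjusted_chunk_size(requested, file_size, mode, one_shot_bytes):
    chunk = min(requested, file_size)
    if mode == 'block':
        if file_size <= one_shot_bytes:
            # small block blobs are uploaded with a single put blob call
            return one_shot_bytes
        return min(chunk, _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES)
    # append blobs, page blobs and file shares cap at 4 MiB per operation
    return min(chunk, _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES)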
a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 5f83b7b..67d29e2 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -32,6 +32,7 @@ # stdlib imports import enum import logging +import math try: import pathlib2 as pathlib except ImportError: # noqa @@ -318,9 +319,9 @@ def _put_data(self, ud, offsets): blobxfer.operations.azure.file.put_file_range( ud.entity, ud.local_path.absolute_path, offsets, self._general_options.timeout_sec) - else: # TODO all upload types + # TODO handle one-shot uploads for block blobs data = blobxfer.operations.azure.blob.get_blob_range( dd.entity, offsets, self._general_options.timeout_sec) @@ -353,7 +354,7 @@ def _process_upload_descriptor(self, ud): self._upload_set.remove(ud.unique_id) self._upload_sofar += 1 return - # re-enqueue for other threads to download + # re-enqueue for other threads to upload self._upload_queue.put(ud) if offsets is None: return @@ -412,17 +413,18 @@ def _delete_extraneous_files(self): except OSError: pass - def _check_upload_conditions(self, lpath, rfile): - # type: (Uploader, pathlib.Path, + def _check_upload_conditions(self, local_path, rfile): + # type: (Uploader, blobxfer.models.upload.LocalPath, # blobxfer.models.azure.StorageEntity) -> UploadAction """Check for upload conditions :param Uploader self: this - :param pathlib.Path lpath: local path + :param blobxfer.models.LocalPath local_path: local path :param blobxfer.models.azure.StorageEntity rfile: remote file :rtype: UploadAction :return: upload action """ - # check if file still exists + lpath = local_path.absolute_path + # check if local file still exists if not lpath.exists(): return UploadAction.Skip # if remote file doesn't exist, upload @@ -445,7 +447,7 @@ def _check_upload_conditions(self, lpath, rfile): # check skip on file size match ul_fs = None if self._spec.skip_on.filesize_match: - lsize = lpath.stat().st_size + lsize = local_path.size if rfile.mode == blobxfer.models.azure.StorageModes.Page: lsize = blobxfer.util.page_align_content_length(lsize) if rfile.size == lsize: @@ -455,8 +457,7 @@ def _check_upload_conditions(self, lpath, rfile): # check skip on lmt ge ul_lmt = None if self._spec.skip_on.lmt_ge: - mtime = blobxfer.util.datetime_from_timestamp( - lpath.stat().st_mtime) + mtime = blobxfer.util.datetime_from_timestamp(local_path.lmt) if rfile.lmt >= mtime: ul_lmt = False else: @@ -467,7 +468,33 @@ def _check_upload_conditions(self, lpath, rfile): else: return UploadAction.Skip - def _generate_entity_for_source(self, local_path): + def _check_for_existing_remote(self, sa, cont, name): + if self._spec.options.mode == blobxfer.models.azure.StorageModes.File: + fp = blobxfer.operations.azure.file.get_file_properties( + sa.file_client, cont, name, + timeout=self._general_options.timeout_sec) + else: + fp = blobxfer.operations.azure.blob.get_blob_properties( + sa.block_blob_client, cont, name, self._spec.options.mode, + timeout=self._general_options.timeout_sec) + if fp is not None: + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(fp.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json(fp.metadata, fp.name, None) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + if (self._spec.options.mode == + blobxfer.models.azure.StorageModes.File): + ase.populate_from_file(sa, fp) + else: + ase.populate_from_blob(sa, fp) + else: + ase = None + return ase + + def _generate_destination_for_source(self, local_path): # type: (Uploader, 
blobxfer.models.upload.LocalSourcePath) -> ??? """Generate entities for source path :param Uploader self: this @@ -500,36 +527,16 @@ def _generate_entity_for_source(self, local_path): 'must specify a container for destination: {}'.format( dpath)) # apply strip components - print(cont, name) sa = self._creds.get_storage_account( dst.lookup_storage_account(sdpath)) - if (self._spec.options.mode == - blobxfer.models.azure.StorageModes.File): - fp = blobxfer.operations.azure.file.get_file_properties( - sa.file_client, cont, name, - timeout=self._general_options.timeout_sec) - else: - fp = blobxfer.operations.azure.blob.get_blob_properties( - sa.block_blob_client, cont, name, - self._spec.options.mode, - timeout=self._general_options.timeout_sec) - if fp is not None: - if blobxfer.models.crypto.EncryptionMetadata.\ - encryption_metadata_exists(fp.metadata): - ed = blobxfer.models.crypto.EncryptionMetadata() - ed.convert_from_json(fp.metadata, fp.name, None) - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) - if (self._spec.options.mode == - blobxfer.models.azure.StorageModes.File): - ase.populate_from_file(sa, fp) - else: - ase.populate_from_blob(sa, fp) - else: + # do not check for existing remote right now if striped + # vectored io mode + if (self._spec.options.vectored_io.distribution_mode == + blobxfer.models.upload. + VectoredIoDistributionMode.Stripe): ase = None - lpath = local_path.parent_path / local_path.relative_path - action = self._check_upload_conditions(lpath, ase) + else: + ase = self._check_for_existing_remote(sa, cont, name) if ase is None: if self._spec.options.rsa_public_key: ed = blobxfer.models.crypto.EncryptionMetadata() @@ -538,13 +545,87 @@ def _generate_entity_for_source(self, local_path): ase = blobxfer.models.azure.StorageEntity(cont, ed) ase.populate_from_local( sa, cont, name, self._spec.options.mode) - yield action, ase + yield sa, ase def _create_unique_id(self, src, ase): return ';'.join( (str(src.absolute_path), ase._client.account_name, ase.path) ) + def append_slice_suffix_to_name(self, name, slice): + return '{}.bxslice-{}'.format(name, slice) + + def _vectorize_and_bind(self, local_path, dest): + # type: (Uploader, blobxfer.models.upload.LocalPath, + # List[blobxfer.models.azure.StorageEntity]) -> None + """Vectorize local path to destinations and bind + :param Uploader self: this + :param blobxfer.models.LocalPath local_path: local path + :param list rfile: remote file + """ + if (self._spec.options.vectored_io.distribution_mode == + blobxfer.models.upload.VectoredIoDistributionMode.Stripe): + num_dest = len(dest) + # compute total number of slices + slices = int(math.ceil( + local_path.size / + self._spec.options.vectored_io.stripe_chunk_size_bytes)) + logger.debug( + '{} slices for vectored out of {} to {} destinations'.format( + slices, local_path.absolute_path, num_dest)) + # create new local path to ase mappings + curr = 0 + slice = 0 + for i in range(0, slices): + start = curr + end = ( + curr + + self._spec.options.vectored_io.stripe_chunk_size_bytes + ) + if end > local_path.size: + end = local_path.size + sa, ase = dest[i % num_dest] + name = self.append_slice_suffix_to_name(ase.name, slice) + ase = self._check_for_existing_remote(sa, ase.container, name) + lp_slice = blobxfer.models.upload.LocalPath( + parent_path=local_path.parent_path, + relative_path=local_path.relative_path, + view=blobxfer.models.upload.LocalPathView( + fd_start=start, + fd_end=end, + slice_num=slice, + ) + ) + action = 
self._check_upload_conditions(lp_slice, ase) + yield action, lp_slice, ase + start += curr + slice += 1 + elif (self._spec.options.vectored_io.distribution_mode == + blobxfer.models.upload.VectoredIoDistributionMode.Replica): + action_map = {} + for _, ase in dest: + action = self._check_upload_conditions(local_path, ase) + if action not in action_map: + action_map[action] = [] + action_map[action].append(ase) + for action in action_map: + dst = action_map[action] + if len(dst) == 1: + yield action, local_path, dst[0] + else: + if (action == UploadAction.CheckMd5 or + action == UploadAction.Skip): + for ase in dst: + yield action, local_path, ase + else: + primary_ase = dst[0] + primary_ase.replica_targets.extend(dst[1:]) + yield action, local_path, primary_ase + else: + for _, ase in dest: + action = self._check_upload_conditions(local_path, ase) + yield action, local_path, ase + def _run(self): # type: (Uploader) -> None """Execute Uploader @@ -582,27 +663,32 @@ def _run(self): raise RuntimeError( 'cannot rename to specified destination with multiple sources') # iterate through source paths to upload - for sfile in self._spec.sources.files(): - # create associated storage entity (destination) for file - for action, ase in self._generate_entity_for_source(sfile): - print(sfile.parent_path, sfile.relative_path, sfile.absolute_path, action, ase.container, ase.name) - print(sfile.size, sfile.mode, sfile.uid, sfile.gid) - print(self._create_unique_id(sfile, ase)) + for src in self._spec.sources.files(): + # create a destination array for the source + dest = [ + (sa, ase) for sa, ase in + self._generate_destination_for_source(src) + ] + for action, lp, ase in self._vectorize_and_bind(src, dest): + print(lp.parent_path, lp.relative_path, lp.absolute_path, action, ase.container, ase.name) + print(lp.size, lp.mode, lp.uid, lp.gid) + print(self._create_unique_id(lp, ase)) + print('replicas', len(ase.replica_targets) if ase.replica_targets is not None else 'none') if action == UploadAction.Skip: skipped_files += 1 skipped_size += ase.size if ase.size is not None else 0 continue # add to potential upload set - uid = self._create_unique_id(sfile, ase) + uid = self._create_unique_id(lp, ase) with self._upload_lock: self._upload_set.add(uid) if action == UploadAction.CheckMd5: - self._pre_md5_skip_on_check(sfile, ase) + self._pre_md5_skip_on_check(lp, ase) elif action == UploadAction.Upload: - self._add_to_upload_queue(sfile, ase, uid) + self._add_to_upload_queue(lp, ase, uid) nfiles += 1 - total_size += sfile.size + total_size += lp.size self._upload_total = nfiles - skipped_files self._upload_bytes_total = total_size - skipped_size diff --git a/cli/cli.py b/cli/cli.py index f7a5f53..bc11ef6 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -334,6 +334,20 @@ def callback(ctx, param, value): callback=callback)(f) +def _distribution_mode(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['distribution_mode'] = value.lower() + return value + return click.option( + '--distribution-mode', + expose_value=False, + default='disabled', + help='Vectored IO distribution mode: disabled, replica, ' + 'stripe [disabled]', + callback=callback)(f) + + def _endpoint_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -415,17 +429,19 @@ def callback(ctx, param, value): callback=callback)(f) -def _multi_storage_account_distribution_mode(f): +def _one_shot_bytes_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) 
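For the stripe distribution mode introduced above, the slice layout depends only on the file size, the stripe width, and the number of destinations; a minimal sketch of the intended round-robin assignment (the '.bxslice-N' suffix mirrors append_slice_suffix_to_name) is:

import math

def stripe_slices(file_size, stripe_chunk_size, destinations):
    """Yield (destination, slice suffix, fd_start, fd_end) for each slice."""
    num_slices = int(math.ceil(file_size / float(stripe_chunk_size)))
    start = 0
    for i in range(num_slices):
        end = min(start + stripe_chunk_size, file_size)
        yield (destinations[i % len(destinations)],
               '.bxslice-{}'.format(i), start, end)
        start = end

# a 2.5 GiB file with the default 1 GiB stripe width over two storage accounts
# yields three slices: account 0 receives bytes [0, 1 GiB) and [2 GiB, 2.5 GiB),
# account 1 receives bytes [1 GiB, 2 GiB)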
- clictx.cli_options[ - 'multi_storage_account_distribution_mode'] = value.lower() + clictx.cli_options['one_shot_bytes'] = value return value return click.option( - '--multi-storage-account-distribution-mode', + '--one-shot-bytes', expose_value=False, - default='disabled', - help='Multiple storage account distribution mode [stripe]', + type=int, + default=0, + help='File sizes less than or equal to the specified byte threshold ' + 'will be uploaded as one-shot for block blobs; the valid range that ' + 'can be specified is 0 to 256MiB [0]', callback=callback)(f) @@ -577,6 +593,20 @@ def callback(ctx, param, value): callback=callback)(f) +def _stripe_chunk_size_bytes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['stripe_chunk_size_bytes'] = value + return value + return click.option( + '--stripe-chunk-size-bytes', + expose_value=False, + type=int, + default=1073741824, + help='Vectored IO stripe width in bytes [1073741824]', + callback=callback)(f) + + def _sync_copy_dest_access_key_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) @@ -603,20 +633,6 @@ def callback(ctx, param, value): callback=callback)(f) -def _stripe_chunk_size_bytes_option(f): - def callback(ctx, param, value): - clictx = ctx.ensure_object(CliContext) - clictx.cli_options['stripe_chunk_size_bytes'] = value - return value - return click.option( - '--stripe-chunk-size-bytes', - expose_value=False, - type=int, - default=1073741824, - help='Stripe width in bytes [1073741824]', - callback=callback)(f) - - def upload_options(f): f = _stripe_chunk_size_bytes_option(f) f = _strip_components_option(f) @@ -630,13 +646,14 @@ def upload_options(f): f = _rename_option(f) f = _recursive_option(f) f = _overwrite_option(f) - f = _multi_storage_account_distribution_mode(f) + f = _one_shot_bytes_option(f) f = _mode_option(f) f = _include_option(f) f = _file_md5_option(f) f = _file_attributes(f) f = _exclude_option(f) f = _endpoint_option(f) + f = _distribution_mode(f) f = _delete_option(f) f = _chunk_size_bytes_option(f) f = _access_key_option(f) diff --git a/cli/settings.py b/cli/settings.py index 6dc4f72..5834c12 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -96,6 +96,7 @@ def add_cli_options( 'chunk_size_bytes': cli_options['chunk_size_bytes'], 'delete_extraneous_destination': cli_options['delete'], 'mode': cli_options['mode'], + 'one_shot_bytes': cli_options['one_shot_bytes'], 'overwrite': cli_options['overwrite'], 'recursive': cli_options['recursive'], 'rename': cli_options['rename'], @@ -117,8 +118,7 @@ def add_cli_options( 'vectored_io': { 'stripe_chunk_size_bytes': cli_options[ 'stripe_chunk_size_bytes'], - 'multi_storage_account_distribution_mode': cli_options[ - 'multi_storage_account_distribution_mode'], + 'distribution_mode': cli_options['distribution_mode'], }, }, } @@ -392,6 +392,7 @@ def create_upload_specifications(config): delete_extraneous_destination=conf[ 'options']['delete_extraneous_destination'], mode=mode, + one_shot_bytes=conf['options']['one_shot_bytes'], overwrite=conf['options']['overwrite'], recursive=conf['options']['recursive'], rename=conf['options']['rename'], @@ -405,11 +406,10 @@ def create_upload_specifications(config): vectored_io=blobxfer.models.options.VectoredIo( stripe_chunk_size_bytes=conf[ 'options']['vectored_io']['stripe_chunk_size_bytes'], - multi_storage_account_distribution_mode=blobxfer. + distribution_mode=blobxfer. 
models.upload.VectoredIoDistributionMode( conf['options']['vectored_io'][ - 'multi_storage_account_distribution_mode'].lower( - )), + 'distribution_mode'].lower()), ), ), skip_on_options=blobxfer.models.options.SkipOn( From 813080ad5532fbe38356836b44c94cf3444f789b Mon Sep 17 00:00:00 2001 From: Fred Park Date: Sun, 21 May 2017 20:06:20 -0700 Subject: [PATCH 28/47] Basic file and page upload support --- blobxfer/models/upload.py | 80 ++++++-- blobxfer/operations/azure/blob/__init__.py | 18 ++ blobxfer/operations/azure/blob/append.py | 15 ++ blobxfer/operations/azure/blob/block.py | 19 +- blobxfer/operations/azure/blob/page.py | 53 ++++++ blobxfer/operations/azure/file.py | 34 +++- blobxfer/operations/md5.py | 22 +++ blobxfer/operations/upload.py | 212 ++++++++++++++++----- 8 files changed, 380 insertions(+), 73 deletions(-) diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index 34584dd..a0ee766 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -248,6 +248,8 @@ def __init__(self, lpath, ase, uid, options, resume_mgr): self._adjust_chunk_size(options) self._total_chunks = self._compute_total_chunks(self._chunk_size) self._outstanding_ops = self._total_chunks + if blobxfer.util.is_not_empty(self._ase.replica_targets): + self._outstanding_ops *= len(self._ase.replica_targets) # initialize integrity checkers self.hmac = None self.md5 = None @@ -282,8 +284,7 @@ def all_operations_completed(self): :return: if all operations completed """ with self._meta_lock: - return (self._outstanding_ops == 0 and - len(self._unchecked_chunks) == 0) + return self._outstanding_ops == 0 @property def is_resumable(self): @@ -296,24 +297,51 @@ def is_resumable(self): return self._resume_mgr is not None and self.hmac is None @property - def one_shot(self): + def requires_put_block_list(self): # type: (Descriptor) -> bool - """Upload is a one-shot block upload + """Requires a put block list operation to finalize :param Descriptor self: this :rtype: bool - :return: is one-shot capable + :return: if finalize requires a put block list """ return (self._ase.mode == blobxfer.models.azure.StorageModes.Block and - self._total_chunks == 1) + self._total_chunks > 1) - def hmac_iv(self, iv): + @property + def requires_set_blob_properties_md5(self): + # type: (Descriptor) -> bool + """Requires a set file properties for md5 to finalize + :param Descriptor self: this + :rtype: bool + :return: if finalize requires a put file properties + """ + return (not self.entity.is_encrypted and self.must_compute_md5 and + self.entity.mode == blobxfer.models.azure.StorageModes.Page) + + @property + def requires_set_file_properties_md5(self): + # type: (Descriptor) -> bool + """Requires a set file properties for md5 to finalize + :param Descriptor self: this + :rtype: bool + :return: if finalize requires a put file properties + """ + return (not self.entity.is_encrypted and self.must_compute_md5 and + self.entity.mode == blobxfer.models.azure.StorageModes.File) + + def complete_offset_upload(self): + with self._meta_lock: + self._outstanding_ops -= 1 + # TODO save resume state + + def hmac_data(self, data): # type: (Descriptor, bytes) -> None - """Send IV through hasher + """Send data through hmac hasher :param Descriptor self: this - :param bytes iv: iv + :param bytes data: data """ with self._hasher_lock: - self.hmac.update(iv) + self.hmac.update(data) def _initialize_encryption(self, options): # type: (Descriptor, blobxfer.models.options.Upload) -> None @@ -321,7 +349,10 @@ def 
_initialize_encryption(self, options): :param Descriptor self: this :param blobxfer.models.options.Upload options: upload options """ - if options.rsa_public_key is not None: + # TODO support append blobs? + if (options.rsa_public_key is not None and + (self._ase.mode == blobxfer.models.azure.StorageModes.Block or + self._ase.mode == blobxfer.models.azure.StorageModes.File)): em = blobxfer.models.crypto.EncryptionMetadata() em.create_new_metadata(options.rsa_public_key) self.current_iv = em.content_encryption_iv @@ -413,7 +444,8 @@ def _initialize_integrity_checkers(self, options): 'symmetric key is invalid: provide RSA private key ' 'or metadata corrupt') self.hmac = self._ase.encryption_metadata.initialize_hmac() - if self.hmac is None and options.store_file_properties.md5: + # both hmac and md5 can be enabled + if options.store_file_properties.md5: self.md5 = blobxfer.util.new_md5_hasher() def next_offsets(self): @@ -424,13 +456,13 @@ def next_offsets(self): :return: upload offsets """ # TODO RESUME -# resume_bytes = self._resume() resume_bytes = None +# resume_bytes = self._resume() with self._meta_lock: -# if self._offset >= self._ase.size: -# return None, resume_bytes - if self._offset + self._chunk_size > self._ase.size: - chunk = self._ase.size - self._offset + if self._offset >= self.local_path.view.fd_end: + return None, resume_bytes + if self._offset + self._chunk_size > self.local_path.view.fd_end: + chunk = self.local_path.view.fd_end - self._offset else: chunk = self._chunk_size num_bytes = chunk @@ -440,7 +472,8 @@ def next_offsets(self): range_end = self._offset + num_bytes - 1 self._offset += chunk self._chunk_num += 1 - if self._ase.is_encrypted and self._offset >= self._ase.size: + if (self._ase.is_encrypted and + self._offset >= self.local_path.view.fd_end): pad = True else: pad = False @@ -453,3 +486,14 @@ def next_offsets(self): range_end=range_end, pad=pad, ), resume_bytes + + def read_data(self, offsets): + # compute start from view + start = self.local_path.view.fd_start + offsets.range_start + with self.local_path.absolute_path.open('rb') as fd: + fd.seek(start, 0) + data = fd.read(offsets.num_bytes) + if self.must_compute_md5: + with self._hasher_lock: + self.md5.update(data) + return data diff --git a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py index dbe8263..40b40c4 100644 --- a/blobxfer/operations/azure/blob/__init__.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -158,3 +158,21 @@ def get_blob_range(ase, offsets, timeout=None): validate_content=False, # HTTPS takes care of integrity during xfer timeout=timeout, ).content + + +def create_container(ase, containers_created, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Create blob container + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict containers_created: containers already created map + :param int timeout: timeout + """ + key = ase.client.account_name + ':blob=' + ase.container + if key not in containers_created: + ase.client.create_container( + container_name=ase.container, + fail_on_exist=False, + timeout=timeout) + containers_created.add(key) + logger.info('created blob container {} on storage account {}'.format( + ase.container, ase.client.account_name)) diff --git a/blobxfer/operations/azure/blob/append.py b/blobxfer/operations/azure/blob/append.py index 087e33b..e28fcdb 100644 --- a/blobxfer/operations/azure/blob/append.py +++ b/blobxfer/operations/azure/blob/append.py @@ -62,3 
+62,18 @@ def create_client(storage_account): # set retry policy client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client + + +def create_blob(ase, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, int) -> None + """Create append blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int timeout: timeout + """ + ase.client.create_blob( + container_name=ase.container, + blob_name=ase.name, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name) + ), + timeout=timeout) diff --git a/blobxfer/operations/azure/blob/block.py b/blobxfer/operations/azure/blob/block.py index 00f7eb3..9318a76 100644 --- a/blobxfer/operations/azure/blob/block.py +++ b/blobxfer/operations/azure/blob/block.py @@ -64,6 +64,19 @@ def create_client(storage_account): return client -def upload_block(): - logger.info('upload block') - print('upload') +def create_blob(ase, data, md5, encmeta, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, int) -> None + """Create one shot block blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int timeout: timeout + """ + if encmeta is not None: + raise NotImplementedError() + ase.client._put_blob( + container_name=ase.container, + blob_name=ase.name, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name), + content_md5=md5, + ), + timeout=timeout) diff --git a/blobxfer/operations/azure/blob/page.py b/blobxfer/operations/azure/blob/page.py index 05d36b6..5601557 100644 --- a/blobxfer/operations/azure/blob/page.py +++ b/blobxfer/operations/azure/blob/page.py @@ -62,3 +62,56 @@ def create_client(storage_account): # set retry policy client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry return client + + +def create_blob(ase, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, int) -> None + """Create page blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int timeout: timeout + """ + ase.client.create_blob( + container_name=ase.container, + blob_name=ase.name, + content_length=ase.size, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name) + ), + timeout=timeout) + + +def put_page(ase, page_start, page_end, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # int, int, bytes, int) -> None + """Puts a page into remote blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int page_start: page range start + :param int page_end: page range end + :param bytes data: data + :param int timeout: timeout + """ + ase.client.update_page( + container_name=ase.container, + blob_name=ase.name, + page=data, + start_range=page_start, + end_range=page_end, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) + + +def set_blob_md5(ase, md5, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, str, int) -> None + """Set blob properties MD5 + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param str md5: md5 as base64 + :param int timeout: timeout + """ + ase.client.set_blob_properties( + container_name=ase.container, + blob_name=ase.name, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name), + content_md5=md5, + ), + timeout=timeout) diff --git a/blobxfer/operations/azure/file.py 
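Putting the new page blob helpers together, an upload amounts to creating the blob at the page-aligned length and writing 512-byte-aligned ranges of at most 4 MiB each; a rough sketch against the same azure-storage calls the helpers wrap (buffering the whole file in memory only for brevity) is:

import azure.storage.blob

import blobxfer.util

_PAGE_CHUNK = 4194304  # max bytes per update_page call

def upload_page_blob(account, key, container, name, data, timeout=None):
    # minimal sketch: `data` is an in-memory bytes buffer; the real code
    # streams chunks from disk through the upload queue instead
    client = azure.storage.blob.PageBlobService(
        account_name=account, account_key=key)
    aligned = blobxfer.util.page_align_content_length(len(data))
    client.create_blob(
        container_name=container, blob_name=name,
        content_length=aligned, timeout=timeout)
    if len(data) < aligned:
        data = data + b'\0' * (aligned - len(data))  # pad to 512-byte boundary
    for start in range(0, aligned, _PAGE_CHUNK):
        end = min(start + _PAGE_CHUNK, aligned)
        client.update_page(
            container_name=container, blob_name=name, page=data[start:end],
            start_range=start, end_range=end - 1, timeout=timeout)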
b/blobxfer/operations/azure/file.py index 7a4e076..4970970 100644 --- a/blobxfer/operations/azure/file.py +++ b/blobxfer/operations/azure/file.py @@ -250,23 +250,22 @@ def create_file(ase, timeout=None): directory_name=dir, file_name=fpath, content_length=ase.size, - content_settings=None, + content_settings=azure.storage.file.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(fpath) + ), timeout=timeout) -def put_file_range(ase, local_file, offsets, timeout=None): - # type: (blobxfer.models.azure.StorageEntity, pathlib.path, - # blobxfer.models.upload.Offsets, int) -> None +def put_file_range(ase, offsets, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes, int) -> None """Puts a range of bytes into the remote file :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity - :param pathlib.Path local_file: local file :param blobxfer.models.upload.Offsets offsets: upload offsets + :param bytes data: data :param int timeout: timeout """ dir, fpath = parse_file_path(ase.name) - with local_file.open('rb') as fd: - fd.seek(offsets.range_start, 0) - data = fd.read(offsets.num_bytes) ase.client.update_range( share_name=ase.container, directory_name=dir, @@ -276,3 +275,22 @@ def put_file_range(ase, local_file, offsets, timeout=None): end_range=offsets.range_end, validate_content=False, # integrity is enforced with HTTPS timeout=timeout) + + +def set_file_md5(ase, md5, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, str, int) -> None + """Set file properties MD5 + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param str md5: md5 as base64 + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + ase.client.set_file_properties( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + content_settings=azure.storage.file.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(fpath), + content_md5=md5, + ), + timeout=timeout) diff --git a/blobxfer/operations/md5.py b/blobxfer/operations/md5.py index e04daec..7239a2d 100644 --- a/blobxfer/operations/md5.py +++ b/blobxfer/operations/md5.py @@ -42,6 +42,9 @@ # create logger logger = logging.getLogger(__name__) +# global defines +_EMPTY_MAX_PAGE_SIZE_MD5 = 'tc+p1sj+vWGPkawoQ9UKHA==' +_MAX_PAGE_SIZE_BYTES = 4194304 def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): @@ -80,6 +83,25 @@ def compute_md5_for_data_asbase64(data): return blobxfer.util.base64_encode_as_string(hasher.digest()) +def check_data_is_empty(data): + # type: (bytes) -> bool + """Check if data is empty via MD5 + :param bytes data: data to check + :rtype: bool + :return: if data is empty + """ + contentmd5 = compute_md5_for_data_asbase64(data) + datalen = len(data) + if datalen == _MAX_PAGE_SIZE_BYTES: + if contentmd5 == _EMPTY_MAX_PAGE_SIZE_MD5: + return True + else: + data_chk = b'\0' * datalen + if compute_md5_for_data_asbase64(data_chk) == contentmd5: + return True + return False + + class LocalFileMd5Offload(blobxfer.models.offload._MultiprocessOffload): """LocalFileMd5Offload""" def __init__(self, num_workers): diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 67d29e2..985a154 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -90,6 +90,10 @@ def __init__(self, general_options, creds, spec): self._upload_bytes_total = None self._upload_bytes_sofar = 0 self._upload_terminate = False + self._transfer_lock = threading.Lock() + 
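The check_data_is_empty helper added to md5.py above lets the page blob path skip ranges that are entirely zero: a full 4194304-byte page is compared against a precomputed base64 MD5 constant, and shorter trailing pages are compared against an equal-length zero buffer. A minimal stdlib sketch of that check follows; the constant is the one defined in the patch, everything else is illustrative:

import base64
import hashlib

_MAX_PAGE_SIZE_BYTES = 4194304
_EMPTY_MAX_PAGE_SIZE_MD5 = 'tc+p1sj+vWGPkawoQ9UKHA=='

def md5_base64(data):
    # same encoding as compute_md5_for_data_asbase64
    return base64.b64encode(hashlib.md5(data).digest()).decode('ascii')

def data_is_empty(data):
    # full-size pages hit the precomputed constant; short final pages
    # fall back to hashing an equal-length run of zero bytes
    if len(data) == _MAX_PAGE_SIZE_BYTES:
        return md5_base64(data) == _EMPTY_MAX_PAGE_SIZE_MD5
    return md5_base64(data) == md5_base64(b'\0' * len(data))

print(data_is_empty(b'\0' * 512))           # True, range can be skipped
print(data_is_empty(b'\1' + b'\0' * 511))   # False, range must be uploaded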
self._transfer_queue = queue.Queue() + self._transfer_set = set() + self._transfer_threads = [] self._start_time = None self._delete_after = set() self._ud_map = {} @@ -111,10 +115,12 @@ def termination_check(self): :return: if terminated """ with self._upload_lock: - return (self._upload_terminate or - len(self._exceptions) > 0 or - (self._all_remote_files_processed and - len(self._upload_set) == 0)) + with self._transfer_lock: + return (self._upload_terminate or + len(self._exceptions) > 0 or + (self._all_remote_files_processed and + len(self._upload_set) == 0 and + len(self._transfer_set) == 0)) @property def termination_check_md5(self): @@ -272,6 +278,18 @@ def _initialize_upload_threads(self): self._upload_threads.append(thr) thr.start() + def _initialize_transfer_threads(self): + # type: (Uploader) -> None + """Initialize transfer threads + :param Uploader self: this + """ + logger.debug('spawning {} transfer threads'.format( + self._general_options.concurrency.transfer_threads)) + for _ in range(self._general_options.concurrency.transfer_threads): + thr = threading.Thread(target=self._worker_thread_transfer) + self._transfer_threads.append(thr) + thr.start() + def _wait_for_upload_threads(self, terminate): # type: (Uploader, bool) -> None """Wait for upload threads @@ -283,47 +301,100 @@ def _wait_for_upload_threads(self, terminate): for thr in self._upload_threads: thr.join() - def _worker_thread_upload(self): + def _worker_thread_transfer(self): # type: (Uploader) -> None - """Worker thread upload + """Worker thread transfer :param Uploader self: this """ while not self.termination_check: try: - ud = self._upload_queue.get(False, 0.25) + ud, ase, offsets, data = self._transfer_queue.get( + block=False, timeout=0.03) except queue.Empty: continue try: - self._process_upload_descriptor(ud) + self._process_transfer(ud, ase, offsets, data) except Exception as e: with self._upload_lock: self._exceptions.append(e) - def _put_data(self, ud, offsets): - if ud.entity.mode == blobxfer.models.azure.StorageModes.File: + def _process_transfer(self, ud, ase, offsets, data): + # issue put range + self._put_data(ase, offsets, data) + # accounting + with self._transfer_lock: + self._transfer_set.remove( + self._create_unique_transfer_id(ud.local_path, ase, offsets)) + self._upload_bytes_sofar += offsets.num_bytes + ud.complete_offset_upload() + + def _put_data(self, ase, offsets, data): + print('UL', offsets) + if ase.mode == blobxfer.models.azure.StorageModes.File: if offsets.chunk_num == 0: # create container if necessary blobxfer.operations.azure.file.create_share( - ud.entity, self._containers_created, - self._general_options.timeout_sec) + ase, self._containers_created, + timeout=self._general_options.timeout_sec) # create parent directories with self._fileshare_dir_lock: blobxfer.operations.azure.file.\ create_all_parent_directories( - ud.entity, self._dirs_created, - self._general_options.timeout_sec) + ase, self._dirs_created, + timeout=self._general_options.timeout_sec) # create remote file blobxfer.operations.azure.file.create_file( - ud.entity, self._general_options.timeout_sec) - # upload chunk + ase, timeout=self._general_options.timeout_sec) + # upload range blobxfer.operations.azure.file.put_file_range( - ud.entity, ud.local_path.absolute_path, offsets, - self._general_options.timeout_sec) - else: - # TODO all upload types - # TODO handle one-shot uploads for block blobs - data = blobxfer.operations.azure.blob.get_blob_range( - dd.entity, offsets, self._general_options.timeout_sec) + 
ase, offsets, data, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.Append: + raise NotImplementedError() + elif ase.mode == blobxfer.models.azure.StorageModes.Block: + # TODO handle one-shot uploads for block blobs (get md5 as well) + raise NotImplementedError() + elif ase.mode == blobxfer.models.azure.StorageModes.Page: + if offsets.chunk_num == 0: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # create remote blob + blobxfer.operations.azure.blob.page.create_blob( + ase, timeout=self._general_options.timeout_sec) + # align page + aligned = blobxfer.util.page_align_content_length( + offsets.num_bytes) + if aligned != offsets.num_bytes: + data = data.ljust(aligned, b'\0') + if blobxfer.operations.md5.check_data_is_empty(data): + return + # upload page + blobxfer.operations.azure.blob.page.put_page( + ase, offsets.range_start, offsets.range_start + aligned - 1, + data, timeout=self._general_options.timeout_sec) + + def _worker_thread_upload(self): + # type: (Uploader) -> None + """Worker thread upload + :param Uploader self: this + """ + import time + while not self.termination_check: + try: + if (len(self._transfer_set) > + self._general_options.concurrency.transfer_threads): + time.sleep(0.03) + continue + else: + ud = self._upload_queue.get(False, 0.03) + except queue.Empty: + continue + try: + self._process_upload_descriptor(ud) + except Exception as e: + with self._upload_lock: + self._exceptions.append(e) def _process_upload_descriptor(self, ud): # type: (Uploader, blobxfer.models.upload.Descriptor) -> None @@ -342,11 +413,10 @@ def _process_upload_descriptor(self, ud): logger.debug('adding {} sofar {} from {}'.format( resume_bytes, self._upload_bytes_sofar, ud._ase.name)) del resume_bytes - print(offsets) # check if all operations completed if offsets is None and ud.all_operations_completed: # finalize file - ud.finalize_file() + self._finalize_file(ud) # accounting with self._upload_lock: if ud.entity.is_encrypted: @@ -354,14 +424,17 @@ def _process_upload_descriptor(self, ud): self._upload_set.remove(ud.unique_id) self._upload_sofar += 1 return - # re-enqueue for other threads to upload - self._upload_queue.put(ud) + # if nothing to upload, re-enqueue for finalization if offsets is None: + self._upload_queue.put(ud) return + + # TODO encryption + # encrypt if necessary if ud.entity.is_encrypted: # send iv through hmac - ud.hmac_iv(ud.current_iv) + ud.hmac_data(ud.current_iv) # encrypt data if self._crypto_offload is not None: self._crypto_offload.add_encrypt_chunk( @@ -372,19 +445,62 @@ def _process_upload_descriptor(self, ud): # retrieved from crypto queue return else: - # TODO pickup here, read data from file - - encdata = blobxfer.operations.crypto.aes_cbc_decrypt_data( + # read data from file and encrypt + data = ud.read_data(offsets) + encdata = blobxfer.operations.crypto.aes_cbc_encrypt_data( ud.entity.encryption_metadata.symmetric_key, ud.current_iv, data, offsets.pad) # send encrypted data through hmac - - # TODO send data as optional param if encrypted - # issue put range - self._put_data(ud, offsets) - # accounting - with self._upload_lock: - self._upload_bytes_sofar += offsets.num_bytes + ud.hmac_data(encdata) + data = encdata + # TODO save last 16 encrypted bytes for next IV + else: + data = ud.read_data(offsets) + # re-enqueue for other threads to upload + self._upload_queue.put(ud) + # add data to 
transfer queue + with self._transfer_lock: + self._transfer_set.add( + self._create_unique_transfer_id( + ud.local_path, ud.entity, offsets)) + self._transfer_queue.put((ud, ud.entity, offsets, data)) + # iterate replicas + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + with self._transfer_lock: + self._transfer_set.add( + self._create_unique_transfer_id( + ud.local_path, ase, offsets)) + self._transfer_queue.put((ud, ase, offsets, data)) + + def _finalize_file(self, ud): + # create encryption metadata for file/blob + if ud.entity.is_encrypted: + # TODO + pass + # put block list for non one-shot block blobs + if ud.requires_put_block_list: + # TODO + pass + # set md5 blob property if not encrypted + if ud.requires_set_blob_properties_md5: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.blob.page.set_blob_md5( + ud.entity, digest, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.page.set_blob_md5( + ase, digest, timeout=self._general_options.timeout_sec) + # set md5 file property if not encrypted + if ud.requires_set_file_properties_md5: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.file.set_file_md5( + ud.entity, digest, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.file.set_file_md5( + ase, digest, timeout=self._general_options.timeout_sec) + # TODO set file metadata if encrypted def _cleanup_temporary_files(self): # type: (Uploader) -> None @@ -552,6 +668,12 @@ def _create_unique_id(self, src, ase): (str(src.absolute_path), ase._client.account_name, ase.path) ) + def _create_unique_transfer_id(self, local_path, ase, offsets): + return ';'.join( + (str(local_path.absolute_path), ase._client.account_name, ase.path, + str(local_path.view.fd_start), str(offsets.range_start)) + ) + def append_slice_suffix_to_name(self, name, slice): return '{}.bxslice-{}'.format(name, slice) @@ -654,6 +776,7 @@ def _run(self): self._check_for_crypto_done) # initialize upload threads self._initialize_upload_threads() + self._initialize_transfer_threads() # initialize local counters nfiles = 0 total_size = 0 @@ -710,14 +833,14 @@ def _run(self): self._update_progress_bar() # check for exceptions if len(self._exceptions) > 0: - logger.error('exceptions encountered while downloading') + logger.error('exceptions encountered while uploading') # raise the first one raise self._exceptions[0] # check for mismatches if (self._upload_sofar != self._upload_total or self._upload_bytes_sofar != self._upload_bytes_total): raise RuntimeError( - 'download mismatch: [count={}/{} bytes={}/{}]'.format( + 'upload mismatch: [count={}/{} bytes={}/{}]'.format( self._upload_sofar, self._upload_total, self._upload_bytes_sofar, self._upload_bytes_total)) # delete all remaining local files not accounted for if @@ -728,11 +851,12 @@ def _run(self): self._resume.delete() # output throughput if self._upload_start_time is not None: - dltime = (end_time - self._upload_start_time).total_seconds() + ultime = (end_time - self._upload_start_time).total_seconds() + mibps = upload_size_mib / ultime logger.info( - ('elapsed download + verify time and throughput: {0:.3f} sec, ' - '{1:.4f} Mbps').format( - dltime, download_size_mib * 8 / dltime)) + ('elapsed 
upload + verify time and throughput: {0:.3f} sec, ' + '{1:.4f} Mbps ({2:.3f} MiB/s)').format( + ultime, mibps * 8, mibps)) end_time = blobxfer.util.datetime_now() logger.info('blobxfer end time: {0} (elapsed: {1:.3f} sec)'.format( end_time, (end_time - self._start_time).total_seconds())) From ccd8db28d38229ff4481d854f9812a11308e4ed1 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Mon, 22 May 2017 20:26:52 -0700 Subject: [PATCH 29/47] Block uploading - Fix striped uploads - Add --delete support to upload - Block size auto-select support - Metadata support including file attr and vectored io - Disable container create on insufficient sas token privilege --- blobxfer/models/azure.py | 34 +- blobxfer/models/crypto.py | 18 +- blobxfer/models/metadata.py | 132 ++++++++ blobxfer/models/upload.py | 193 +++++++++-- blobxfer/operations/azure/__init__.py | 27 +- blobxfer/operations/azure/blob/__init__.py | 38 +++ blobxfer/operations/azure/blob/block.py | 69 +++- blobxfer/operations/azure/blob/page.py | 14 + blobxfer/operations/azure/file.py | 64 ++++ blobxfer/operations/upload.py | 356 +++++++++++++++------ blobxfer/util.py | 11 + cli/cli.py | 6 +- 12 files changed, 795 insertions(+), 167 deletions(-) create mode 100644 blobxfer/models/metadata.py diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index b2b34b8..d44af48 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -56,6 +56,7 @@ def __init__(self, container, ed=None): :param blobxfer.models.crypto.EncryptionMetadata ed: encryption metadata """ + self._create_containers = None self._client = None self._container = container self._name = None @@ -68,6 +69,16 @@ def __init__(self, container, ed=None): self._vio = None self.replica_targets = None + @property + def create_containers(self): + # type: (StorageEntity) -> bool + """Create containers + :param StorageEntity self: this + :rtype: bool + :return: create containers + """ + return self._create_containers + @property def client(self): # type: (StorageEntity) -> object @@ -198,15 +209,17 @@ def encryption_metadata(self, value): """ self._encryption = value - def populate_from_blob(self, sa, blob): + def populate_from_blob(self, sa, blob, path): # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, - # azure.storage.blob.models.Blob) -> None + # azure.storage.blob.models.Blob, str) -> None """Populate properties from Blob :param StorageEntity self: this :param blobxfer.operations.azure.StorageAccount sa: storage account :param azure.storage.blob.models.Blob blob: blob to populate from + :param str path: full path to blob """ - self._name = blob.name + self._create_containers = sa.create_containers + self._name = path self._snapshot = blob.snapshot self._lmt = blob.properties.last_modified self._size = blob.properties.content_length @@ -221,15 +234,17 @@ def populate_from_blob(self, sa, blob): self._mode = StorageModes.Page self._client = sa.page_blob_client - def populate_from_file(self, sa, file): + def populate_from_file(self, sa, file, path): # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, - # azure.storage.file.models.File) -> None + # azure.storage.file.models.File, str) -> None """Populate properties from File :param StorageEntity self: this :param blobxfer.operations.azure.StorageAccount sa: storage account :param azure.storage.file.models.File file: file to populate from + :param str path: full path to file """ - self._name = file.name + self._create_containers = sa.create_containers + self._name = path self._snapshot = None 
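The corrected summary line above reports elapsed upload plus verify time together with the bit rate and MiB/s. A tiny worked example of that arithmetic with hypothetical numbers (note that multiplying MiB/s by 8 strictly yields Mibit/s, which the log labels Mbps):

# hypothetical transfer: 2048 MiB uploaded and verified in 87.5 seconds
upload_size_mib = 2048.0
ultime = 87.5
mibps = upload_size_mib / ultime    # ~23.406 MiB/s
print(('elapsed upload + verify time and throughput: {0:.3f} sec, '
       '{1:.4f} Mbps ({2:.3f} MiB/s)').format(ultime, mibps * 8, mibps))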
self._lmt = file.properties.last_modified self._size = file.properties.content_length @@ -237,18 +252,19 @@ def populate_from_file(self, sa, file): self._mode = StorageModes.File self._client = sa.file_client - def populate_from_local(self, sa, container, name, mode): + def populate_from_local(self, sa, container, path, mode): # type: (StorageEntity, blobxfer.operations.azure.StorageAccount # str, str, blobxfer.models.azure.StorageModes) -> None """Populate properties from local :param StorageEntity self: this :param blobxfer.operations.azure.StorageAccount sa: storage account :param str container: container - :param str name: name + :param str path: full path to file :param blobxfer.models.azure.StorageModes mode: storage mode """ + self._create_containers = sa.create_containers self._container = container - self._name = name + self._name = path self._mode = mode if mode == StorageModes.Append: self._client = sa.append_blob_client diff --git a/blobxfer/models/crypto.py b/blobxfer/models/crypto.py index 56c0c79..6a50c9c 100644 --- a/blobxfer/models/crypto.py +++ b/blobxfer/models/crypto.py @@ -186,14 +186,14 @@ def create_new_metadata(self, rsa_public_key): ) self.encryption_mode = EncryptionMetadata._ENCRYPTION_MODE - def convert_from_json(self, md, blobname, rsaprivatekey): + def convert_from_json(self, md, entityname, rsaprivatekey): # type: (EncryptionMetadata, dict, str, # cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey) # -> None """Read metadata json into objects :param EncryptionMetadata self: this :param dict md: metadata dictionary - :param str blobname: blob name + :param str entityname: entity name :param rsaprivatekey: RSA private key :type rsaprivatekey: cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey @@ -221,11 +221,11 @@ def convert_from_json(self, md, blobname, rsaprivatekey): if (self.encryption_agent.encryption_algorithm != EncryptionMetadata._ENCRYPTION_ALGORITHM): raise RuntimeError('{}: unknown block cipher: {}'.format( - blobname, self.encryption_agent.encryption_algorithm)) + entityname, self.encryption_agent.encryption_algorithm)) if (self.encryption_agent.protocol != EncryptionMetadata._ENCRYPTION_PROTOCOL_VERSION): raise RuntimeError('{}: unknown encryption protocol: {}'.format( - blobname, self.encryption_agent.protocol)) + entityname, self.encryption_agent.protocol)) self.encryption_authentication = EncryptionAuthentication( algorithm=ed[ EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH][ @@ -238,13 +238,13 @@ def convert_from_json(self, md, blobname, rsaprivatekey): EncryptionMetadata._AUTH_ALGORITHM): raise RuntimeError( '{}: unknown integrity/auth method: {}'.format( - blobname, self.encryption_authentication.algorithm)) + entityname, self.encryption_authentication.algorithm)) self.encryption_mode = ed[ EncryptionMetadata._JSON_KEY_ENCRYPTION_MODE] if self.encryption_mode != EncryptionMetadata._ENCRYPTION_MODE: raise RuntimeError( '{}: unknown encryption mode: {}'.format( - blobname, self.encryption_mode)) + entityname, self.encryption_mode)) try: _eak = ed[EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ EncryptionMetadata._JSON_KEY_ENCRYPTED_AUTHKEY] @@ -265,7 +265,7 @@ def convert_from_json(self, md, blobname, rsaprivatekey): if (self.wrapped_content_key.algorithm != EncryptionMetadata._ENCRYPTED_KEY_SCHEME): raise RuntimeError('{}: unknown key encryption scheme: {}'.format( - blobname, self.wrapped_content_key.algorithm)) + entityname, self.wrapped_content_key.algorithm)) # if RSA key is a public key, stop here as keys cannot be 
decrypted if rsaprivatekey is None: return @@ -304,7 +304,7 @@ def convert_from_json(self, md, blobname, rsaprivatekey): EncryptionMetadata._AUTH_ALGORITHM): raise RuntimeError( '{}: unknown integrity/auth method: {}'.format( - blobname, + entityname, self.encryption_metadata_authentication.algorithm)) # verify hmac authhmac = base64.b64decode( @@ -317,7 +317,7 @@ def convert_from_json(self, md, blobname, rsaprivatekey): if hmacsha256.digest() != authhmac: raise RuntimeError( '{}: encryption metadata authentication failed'.format( - blobname)) + entityname)) def convert_to_json_with_mac(self): # TODO diff --git a/blobxfer/models/metadata.py b/blobxfer/models/metadata.py new file mode 100644 index 0000000..5ebeea3 --- /dev/null +++ b/blobxfer/models/metadata.py @@ -0,0 +1,132 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import logging +# non-stdlib imports +# local imports +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) +# global defines +JSON_KEY_BLOBXFER_METADATA = 'BlobxferMetadata' +# file attributes +_JSON_KEY_FILE_ATTRIBUTES = 'FileAttributes' +_JSON_KEY_FILE_ATTRIBUTES_POSIX = 'POSIX' +_JSON_KEY_FILE_ATTRIBUTES_WINDOWS = 'Windows' +_JSON_KEY_FILE_ATTRIBUTES_MODE = 'mode' +_JSON_KEY_FILE_ATTRIBUTES_UID = 'uid' +_JSON_KEY_FILE_ATTRIBUTES_GID = 'gid' +# vectored io +_JSON_KEY_VECTORED_IO = 'VectoredIO' +_JSON_KEY_VECTORED_IO_MODE = 'Mode' +_JSON_KEY_VECTORED_IO_STRIPE = 'Stripe' +_JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SIZE = 'TotalSize' +_JSON_KEY_VECTORED_IO_STRIPE_OFFSET_START = 'OffsetStart' +_JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SLICES = 'TotalSlices' +_JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID = 'SliceId' +_JSON_KEY_VECTORED_IO_STRIPE_NEXT = 'Next' + + +def generate_fileattr_metadata(local_path, metadata): + # type: (blobxfer.models.upload.LocalPath, dict) -> dict + """Generate file attribute metadata dict + :param blobxfer.models.upload.LocalPath local_path: local path + :param dict metadata: existing metadata dict + :rtype: dict + :return: merged metadata dictionary + """ + if blobxfer.util.on_windows(): + logger.warning( + 'file attributes store/restore on Windows is not supported yet') + return None + else: + md = { + _JSON_KEY_FILE_ATTRIBUTES: { + _JSON_KEY_FILE_ATTRIBUTES_POSIX: { + _JSON_KEY_FILE_ATTRIBUTES_MODE: local_path.mode, + _JSON_KEY_FILE_ATTRIBUTES_UID: local_path.uid, + _JSON_KEY_FILE_ATTRIBUTES_GID: local_path.gid, + } + } + } + return blobxfer.util.merge_dict(metadata, md) + + +def restore_fileattr(path, metadata): + # type: (pathlib.Path, dict) -> None + """Restore file attributes from metadata + :param pathlib.Path path: path to modify + :param dict metadata: existing metadata dict + """ + if blobxfer.util.on_windows(): + logger.warning( + 'file attributes store/restore on Windows is not supported yet') + raise NotImplementedError() + + +def create_vectored_io_next_entry(ase): + # type: (blobxfer.models.upload.LocalPath) -> str + """Create Vectored IO next entry id + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :rtype: str + :return: vectored io next entry + """ + return ';'.join( + (ase.client.primary_endpoint, ase.container, ase.name) + ) + + +def generate_vectored_io_stripe_metadata(local_path, metadata): + # type: (blobxfer.models.upload.LocalPath, dict) -> dict + """Generate vectored io stripe metadata dict + :param blobxfer.models.upload.LocalPath local_path: local path + :param dict metadata: existing metadata dict + :rtype: dict + :return: merged metadata dictionary + """ + md = { + _JSON_KEY_VECTORED_IO: { + _JSON_KEY_VECTORED_IO_MODE: _JSON_KEY_VECTORED_IO_STRIPE, + _JSON_KEY_VECTORED_IO_STRIPE: { + _JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SIZE: local_path.total_size, + _JSON_KEY_VECTORED_IO_STRIPE_OFFSET_START: + local_path.view.fd_start, + _JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SLICES: + local_path.view.total_slices, + _JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID: + local_path.view.slice_num, + _JSON_KEY_VECTORED_IO_STRIPE_NEXT: local_path.view.next, + } + } + } + return blobxfer.util.merge_dict(metadata, md) diff --git a/blobxfer/models/upload.py 
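The new blobxfer/models/metadata.py above stores all blobxfer-generated metadata as a single JSON string under the BlobxferMetadata key. A sketch of roughly what the merged dictionary looks like before serialization, using the key names defined in the module; the mode/uid/gid, stripe values, and endpoint string below are placeholders:

import json

genmeta = {
    'FileAttributes': {
        'POSIX': {
            'mode': '0o100644',   # placeholder values; the real ones come
            'uid': 1000,          # from the LocalPath stat
            'gid': 1000,
        }
    },
    'VectoredIO': {
        'Mode': 'Stripe',
        'Stripe': {
            'TotalSize': 10485760,
            'OffsetStart': 0,
            'TotalSlices': 3,
            'SliceId': 0,
            # next entry chains to the following slice: endpoint;container;name
            'Next': 'account.blob.core.windows.net;container;file.bin.bxslice-1',
        }
    },
}
metadata = {
    'BlobxferMetadata': json.dumps(genmeta, ensure_ascii=False, sort_keys=True)
}
print(metadata)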
b/blobxfer/models/upload.py index a0ee766..07852cb 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -32,6 +32,7 @@ # stdlib imports import collections import enum +import json import logging import math import os @@ -43,7 +44,9 @@ # non-stdlib imports # local imports import blobxfer.models +import blobxfer.models.azure import blobxfer.models.crypto +import blobxfer.models.metadata import blobxfer.util # create logger @@ -52,14 +55,14 @@ _MAX_BLOCK_BLOB_ONESHOT_BYTES = 268435456 _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES = 268435456 _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304 +_MAX_NUM_CHUNKS = 50000 +_DEFAULT_AUTO_CHUNKSIZE_BYTES = 16777216 # named tuples Offsets = collections.namedtuple( 'Offsets', [ 'chunk_num', - 'block_id', - 'fd_start', 'num_bytes', 'range_end', 'range_start', @@ -70,7 +73,10 @@ 'LocalPathView', [ 'fd_end', 'fd_start', + 'mode', + 'next', 'slice_num', + 'total_slices', ] ) @@ -93,11 +99,15 @@ def __init__(self, parent_path, relative_path, view=None): if view is None: self.view = LocalPathView( fd_start=0, - fd_end=self.size, + fd_end=self._stat.st_size, slice_num=0, + mode=VectoredIoDistributionMode.Disabled, + total_slices=1, + next=None, ) else: self.view = view + self._size = self.view.fd_end - self.view.fd_start @property def absolute_path(self): @@ -105,6 +115,10 @@ def absolute_path(self): @property def size(self): + return self._size + + @property + def total_size(self): return self._stat.st_size @property @@ -184,8 +198,8 @@ def __init__( if self.sources.paths[0].is_dir(): raise ValueError( 'cannot rename a directory of files to upload') - if self.options.chunk_size_bytes <= 0: - raise ValueError('chunk size must be positive') + if self.options.chunk_size_bytes < 0: + raise ValueError('chunk size cannot be negative') if self.options.chunk_size_bytes > _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: raise ValueError( ('chunk size value of {} exceeds maximum allowable ' @@ -241,6 +255,7 @@ def __init__(self, lpath, ase, uid, options, resume_mgr): self._hasher_lock = threading.Lock() self._resume_mgr = resume_mgr self._ase = ase + self._store_file_attr = options.store_file_properties.attributes self.current_iv = None self._initialize_encryption(options) # calculate the total number of ops required for transfer @@ -286,6 +301,18 @@ def all_operations_completed(self): with self._meta_lock: return self._outstanding_ops == 0 + @property + def last_block_num(self): + # type: (Descriptor) -> bool + """Last used block number for block id, should only be called for + finalize operation + :param Descriptor self: this + :rtype: int + :return: block number + """ + with self._meta_lock: + return self._chunk_num - 1 + @property def is_resumable(self): # type: (Descriptor) -> bool @@ -296,6 +323,37 @@ def is_resumable(self): """ return self._resume_mgr is not None and self.hmac is None + @property + def remote_is_file(self): + # type: (Descriptor) -> bool + """Remote destination is an Azure File + :param Descriptor self: this + :rtype: bool + :return: remote is an Azure File + """ + return self.entity.mode == blobxfer.models.azure.StorageModes.File + + @property + def remote_is_page_blob(self): + # type: (Descriptor) -> bool + """Remote destination is an Azure Page Blob + :param Descriptor self: this + :rtype: bool + :return: remote is an Azure Page Blob + """ + return self.entity.mode == blobxfer.models.azure.StorageModes.Page + + @property + def is_one_shot_block_blob(self): + # type: (Descriptor) -> bool + """Is one shot block blob + :param Descriptor self: this + :rtype: 
bool + :return: if upload is a one-shot block blob + """ + return (self._ase.mode == blobxfer.models.azure.StorageModes.Block and + self._total_chunks == 1) + @property def requires_put_block_list(self): # type: (Descriptor) -> bool @@ -308,15 +366,14 @@ def requires_put_block_list(self): self._total_chunks > 1) @property - def requires_set_blob_properties_md5(self): + def requires_non_encrypted_md5_put(self): # type: (Descriptor) -> bool """Requires a set file properties for md5 to finalize :param Descriptor self: this :rtype: bool :return: if finalize requires a put file properties """ - return (not self.entity.is_encrypted and self.must_compute_md5 and - self.entity.mode == blobxfer.models.azure.StorageModes.Page) + return not self.entity.is_encrypted and self.must_compute_md5 @property def requires_set_file_properties_md5(self): @@ -327,7 +384,7 @@ def requires_set_file_properties_md5(self): :return: if finalize requires a put file properties """ return (not self.entity.is_encrypted and self.must_compute_md5 and - self.entity.mode == blobxfer.models.azure.StorageModes.File) + self.remote_is_file) def complete_offset_upload(self): with self._meta_lock: @@ -350,7 +407,7 @@ def _initialize_encryption(self, options): :param blobxfer.models.options.Upload options: upload options """ # TODO support append blobs? - if (options.rsa_public_key is not None and + if (options.rsa_public_key is not None and self._ase.size > 0 and (self._ase.mode == blobxfer.models.azure.StorageModes.Block or self._ase.mode == blobxfer.models.azure.StorageModes.File)): em = blobxfer.models.crypto.EncryptionMetadata() @@ -387,35 +444,57 @@ def _adjust_chunk_size(self, options): :param Descriptor self: this :param blobxfer.models.options.Upload options: upload options """ - self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) + chunk_size = options.chunk_size_bytes + # auto-select chunk size + if chunk_size == 0: + if self._ase.mode != blobxfer.models.azure.StorageModes.Block: + chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES + else: + if self._ase.size == 0: + chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES + else: + chunk_size = _DEFAULT_AUTO_CHUNKSIZE_BYTES + while chunk_size < _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: + chunks = int(math.ceil(self._ase.size / chunk_size)) + if chunks <= _MAX_NUM_CHUNKS: + break + chunk_size = chunk_size << 1 + logger.debug( + 'auto-selected chunk size of {} for {}'.format( + chunk_size, self.local_path.absolute_path)) + self._chunk_size = min((chunk_size, self._ase.size)) # ensure chunk sizes are compatible with mode if self._ase.mode == blobxfer.models.azure.StorageModes.Append: if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES logger.debug( - 'adjusting chunk size to {} for append blobs'.format( - self._chunk_size)) + ('adjusting chunk size to {} for append blob ' + 'from {}').format( + self._chunk_size, self.local_path.absolute_path)) elif self._ase.mode == blobxfer.models.azure.StorageModes.Block: if self._ase.size <= options.one_shot_bytes: - self._chunk_size = options.one_shot_bytes + self._chunk_size = min( + (self._ase.size, options.one_shot_bytes) + ) else: if self._chunk_size > _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: self._chunk_size = _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES logger.debug( - 'adjusting chunk size to {} for block blobs'.format( - self._chunk_size)) + ('adjusting chunk size to {} for block blob ' + 'from {}').format( + self._chunk_size, self.local_path.absolute_path)) elif self._ase.mode == 
blobxfer.models.azure.StorageModes.File: if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES logger.debug( - 'adjusting chunk size to {} for files'.format( - self._chunk_size)) + 'adjusting chunk size to {} for file from {}'.format( + self._chunk_size, self.local_path.absolute_path)) elif self._ase.mode == blobxfer.models.azure.StorageModes.Page: if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES logger.debug( - 'adjusting chunk size to {} for page blobs'.format( - self._chunk_size)) + 'adjusting chunk size to {} for page blob from {}'.format( + self._chunk_size, self.local_path.absolute_path)) def _compute_total_chunks(self, chunk_size): # type: (Descriptor, int) -> int @@ -426,9 +505,30 @@ def _compute_total_chunks(self, chunk_size): :return: num chunks """ try: - return int(math.ceil(self._ase.size / chunk_size)) + chunks = int(math.ceil(self._ase.size / chunk_size)) except ZeroDivisionError: - return 0 + chunks = 1 + if chunks > 50000: + max_vector = False + if self._ase.mode == blobxfer.models.azure.StorageModes.Block: + if self._chunk_size == _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: + max_vector = True + elif self._chunk_size == _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + max_vector = True + if max_vector: + raise RuntimeError( + ('number of chunks {} exceeds maximum permissible ' + 'limit and chunk size is set at the maximum value ' + 'for {}. Please try using stripe mode ' + 'vectorization to overcome this limitation').format( + chunks, self.local_path.absolute_path)) + else: + raise RuntimeError( + ('number of chunks {} exceeds maximum permissible ' + 'limit for {}, please adjust chunk size higher or ' + 'set to -1 for automatic chunk size selection').format( + chunks, self.local_path.absolute_path)) + return chunks def _initialize_integrity_checkers(self, options): # type: (Descriptor, blobxfer.models.options.Upload) -> None @@ -459,35 +559,32 @@ def next_offsets(self): resume_bytes = None # resume_bytes = self._resume() with self._meta_lock: - if self._offset >= self.local_path.view.fd_end: + if self._chunk_num >= self._total_chunks: return None, resume_bytes - if self._offset + self._chunk_size > self.local_path.view.fd_end: - chunk = self.local_path.view.fd_end - self._offset + if self._offset + self._chunk_size > self._ase.size: + num_bytes = self._ase.size - self._offset else: - chunk = self._chunk_size - num_bytes = chunk + num_bytes = self._chunk_size chunk_num = self._chunk_num - fd_start = self._offset range_start = self._offset range_end = self._offset + num_bytes - 1 - self._offset += chunk + self._offset += num_bytes self._chunk_num += 1 - if (self._ase.is_encrypted and - self._offset >= self.local_path.view.fd_end): + if self._ase.is_encrypted and self._offset >= self._ase.size: pad = True else: pad = False return Offsets( chunk_num=chunk_num, - block_id='{0:08d}'.format(chunk_num), - fd_start=fd_start, - num_bytes=chunk, + num_bytes=num_bytes, range_start=range_start, range_end=range_end, pad=pad, ), resume_bytes def read_data(self, offsets): + if offsets.num_bytes == 0: + return None # compute start from view start = self.local_path.view.fd_start + offsets.range_start with self.local_path.absolute_path.open('rb') as fd: @@ -497,3 +594,31 @@ def read_data(self, offsets): with self._hasher_lock: self.md5.update(data) return data + + def generate_metadata(self): + genmeta = {} + encmeta = {} + # generate encryption metadata + if self._ase.is_encrypted: + raise 
NotImplementedError() + # generate file attribute metadata + if self._store_file_attr: + merged = blobxfer.models.metadata.generate_fileattr_metadata( + self.local_path, genmeta) + if merged is not None: + genmeta = merged + # generate vectored io metadata + if self.local_path.view.mode == VectoredIoDistributionMode.Stripe: + merged = blobxfer.models.metadata.\ + generate_vectored_io_stripe_metadata(self.local_path, genmeta) + if merged is not None: + genmeta = merged + metadata = {} + if len(genmeta) > 0: + metadata[blobxfer.models.metadata.JSON_KEY_BLOBXFER_METADATA] = \ + json.dumps(genmeta, ensure_ascii=False, sort_keys=True) + if len(encmeta) > 0: + raise NotImplementedError() + if len(metadata) == 0: + return None + return metadata diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index 8664ac8..a237532 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -95,7 +95,8 @@ def __init__(self, name, key, endpoint, transfer_threads): self.name = name self.key = key self.endpoint = endpoint - self.is_sas = self._key_is_sas(self.key) + self.is_sas = StorageAccount._key_is_sas(self.key) + self.create_containers = self._container_creation_allowed() # normalize sas keys if self.is_sas and self.key.startswith('?'): self.key = self.key[1:] @@ -133,6 +134,26 @@ def _key_is_sas(key): return True return False + def _container_creation_allowed(self): + # # type: (StorageAccount) -> bool + """Check if container creation is allowed + :param StorageAccount self: this + :rtype: bool + :return: if container creation is allowed + """ + if self.is_sas: + # search for account sas "c" resource + sasparts = self.key.split('&') + for part in sasparts: + tmp = part.split('=') + if tmp[0] == 'srt': + if 'c' in tmp[1]: + return True + else: + # storage account key always allows container creation + return True + return False + def _create_clients(self): # type: (StorageAccount) -> None """Create Azure Storage clients @@ -271,7 +292,7 @@ def _populate_from_list_files(self, creds, options, general_options): else: ed = None ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_file(sa, file) + ase.populate_from_file(sa, file, dir) yield ase def _populate_from_list_blobs(self, creds, options, general_options): @@ -303,7 +324,7 @@ def _populate_from_list_blobs(self, creds, options, general_options): else: ed = None ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_blob(sa, blob) + ase.populate_from_blob(sa, blob, dir) yield ase diff --git a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py index 40b40c4..49b3678 100644 --- a/blobxfer/operations/azure/blob/__init__.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -139,6 +139,41 @@ def list_blobs(client, container, prefix, mode, recursive, timeout=None): yield blob +def list_all_blobs(client, container, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, int) -> + # azure.storage.blob.models.Blob + """List all blobs in a container + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param int timeout: timeout + :rtype: azure.storage.blob.models.Blob + :return: generator of blobs + """ + blobs = client.list_blobs( + container_name=container, + prefix=None, + timeout=timeout, + ) + for blob in blobs: + yield blob + + +def delete_blob(client, container, name, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, str, int) -> None + 
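_container_creation_allowed above gates container and share creation on the credential type: a storage account key always allows it, while an account SAS must include the container resource type ('c') in its srt parameter. A standalone sketch of that parse; the SAS strings below are fabricated fragments, not real tokens:

def container_creation_allowed(key, is_sas):
    # storage account keys always permit container/share creation
    if not is_sas:
        return True
    # account SAS: look for the container resource type in the "srt" field
    for part in key.split('&'):
        kv = part.split('=')
        if kv[0] == 'srt' and len(kv) > 1 and 'c' in kv[1]:
            return True
    return False

print(container_creation_allowed('sv=2017-04-17&ss=bf&srt=sco&sp=rwdlac&sig=xyz', True))  # True
print(container_creation_allowed('sv=2017-04-17&ss=bf&srt=o&sp=rwl&sig=xyz', True))       # False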
"""Delete blob, including all associated snapshots + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str name: blob name + :param int timeout: timeout + """ + client.delete_blob( + container_name=container, + blob_name=name, + delete_snapshots=azure.storage.blob.models.DeleteSnapshot.Include, + timeout=timeout, + ) + + def get_blob_range(ase, offsets, timeout=None): # type: (blobxfer.models.azure.StorageEntity, # blobxfer.models.download.Offsets, int) -> bytes @@ -167,6 +202,9 @@ def create_container(ase, containers_created, timeout=None): :param dict containers_created: containers already created map :param int timeout: timeout """ + # check if auth allows create container + if not ase.create_containers: + return key = ase.client.account_name + ':blob=' + ase.container if key not in containers_created: ase.client.create_container( diff --git a/blobxfer/operations/azure/blob/block.py b/blobxfer/operations/azure/blob/block.py index 9318a76..b8e5bc5 100644 --- a/blobxfer/operations/azure/blob/block.py +++ b/blobxfer/operations/azure/blob/block.py @@ -64,19 +64,80 @@ def create_client(storage_account): return client -def create_blob(ase, data, md5, encmeta, timeout=None): - # type: (blobxfer.models.azure.StorageEntity, int) -> None +def create_blob(ase, data, md5, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, bytes, str, dict, + # int) -> None """Create one shot block blob :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param bytes data: blob data + :param str md5: md5 as base64 + :param dict metadata: metadata kv pairs :param int timeout: timeout """ - if encmeta is not None: - raise NotImplementedError() ase.client._put_blob( container_name=ase.container, blob_name=ase.name, + blob=data, content_settings=azure.storage.blob.models.ContentSettings( content_type=blobxfer.util.get_mime_type(ase.name), content_md5=md5, ), + metadata=metadata, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) + + +def _format_block_id(chunk_num): + # type: (int) -> str + """Create a block id given a block (chunk) number + :param int chunk_num: chunk number + :rtype: str + :return: block id + """ + return '{0:08d}'.format(chunk_num) + + +def put_block(ase, offsets, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes, int) -> None + """Puts a block into remote blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param blobxfer.models.upload.Offsets offsets: upload offsets + :param bytes data: data + :param int timeout: timeout + """ + ase.client.put_block( + container_name=ase.container, + blob_name=ase.name, + block=data, + block_id=_format_block_id(offsets.chunk_num), + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) + + +def put_block_list(ase, last_block_num, md5, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, bytes, str, dict, + # int) -> None + """Create block blob from blocks + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int last_block_num: last block number (chunk_num) + :param str md5: md5 as base64 + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + # construct block list + block_list = [ + azure.storage.blob.BlobBlock(id=_format_block_id(x)) + for x in range(0, last_block_num + 1) + ] + ase.client.put_block_list( + container_name=ase.container, + blob_name=ase.name, + 
block_list=block_list, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name), + content_md5=md5, + ), + metadata=metadata, + validate_content=False, # integrity is enforced with HTTPS timeout=timeout) diff --git a/blobxfer/operations/azure/blob/page.py b/blobxfer/operations/azure/blob/page.py index 5601557..859b4bb 100644 --- a/blobxfer/operations/azure/blob/page.py +++ b/blobxfer/operations/azure/blob/page.py @@ -115,3 +115,17 @@ def set_blob_md5(ase, md5, timeout=None): content_md5=md5, ), timeout=timeout) + + +def set_blob_metadata(ase, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Set blob metadata + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + ase.client.set_blob_metadata( + container_name=ase.container, + blob_name=ase.name, + metadata=metadata, + timeout=timeout) diff --git a/blobxfer/operations/azure/file.py b/blobxfer/operations/azure/file.py index 4970970..32f1b13 100644 --- a/blobxfer/operations/azure/file.py +++ b/blobxfer/operations/azure/file.py @@ -174,6 +174,49 @@ def list_files(client, fileshare, prefix, recursive, timeout=None): dirs.append(fspath) +def list_all_files(client, fileshare, timeout=None): + # type: (azure.storage.file.FileService, str, int) -> str + """List all files in share + :param azure.storage.file.FileService client: file client + :param str fileshare: file share + :param int timeout: timeout + :rtype: str + :return: file name + """ + dirs = [None] + while len(dirs) > 0: + dir = dirs.pop() + files = client.list_directories_and_files( + share_name=fileshare, + directory_name=dir, + timeout=timeout, + ) + for file in files: + fspath = str( + pathlib.Path(dir if dir is not None else '') / file.name) + if type(file) == azure.storage.file.models.File: + yield fspath + else: + dirs.append(fspath) + + +def delete_file(client, fileshare, name, timeout=None): + # type: (azure.storage.file.FileService, str, str, int) -> None + """Delete file from share + :param azure.storage.file.FileService client: file client + :param str fileshare: file share + :param str name: file name + :param int timeout: timeout + """ + dir, fpath = parse_file_path(name) + client.delete_file( + share_name=fileshare, + directory_name=dir, + file_name=fpath, + timeout=timeout, + ) + + def get_file_range(ase, offsets, timeout=None): # type: (blobxfer.models.azure.StorageEntity, # blobxfer.models.download.Offsets, int) -> bytes @@ -203,6 +246,9 @@ def create_share(ase, containers_created, timeout=None): :param dict containers_created: containers already created map :param int timeout: timeout """ + # check if auth allows create container + if not ase.create_containers: + return key = ase.client.account_name + ':file=' + ase.container if key not in containers_created: ase.client.create_share( @@ -224,6 +270,8 @@ def create_all_parent_directories(ase, dirs_created, timeout=None): dirs = pathlib.Path(ase.name).parts if len(dirs) <= 1: return + # remove last part (which is the file) + dirs = dirs[:-1] dk = ase.client.account_name + ':' + ase.container for i in range(0, len(dirs)): dir = str(pathlib.Path(*(dirs[0:i + 1]))) @@ -294,3 +342,19 @@ def set_file_md5(ase, md5, timeout=None): content_md5=md5, ), timeout=timeout) + + +def set_file_metadata(ase, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Set file metadata + :param 
blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + ase.client.set_file_metadata( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + metadata=metadata, + timeout=timeout) diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 985a154..3b38ea5 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -75,7 +75,7 @@ def __init__(self, general_options, creds, spec): :param blobxfer.operations.azure.StorageCredentials creds: creds :param blobxfer.models.uplaod.Specification spec: upload spec """ - self._all_remote_files_processed = False + self._all_local_files_processed = False self._crypto_offload = None self._md5_meta_lock = threading.Lock() self._md5_map = {} @@ -95,7 +95,7 @@ def __init__(self, general_options, creds, spec): self._transfer_set = set() self._transfer_threads = [] self._start_time = None - self._delete_after = set() + self._delete_exclude = set() self._ud_map = {} self._containers_created = set() self._fileshare_dir_lock = threading.Lock() @@ -118,7 +118,7 @@ def termination_check(self): with self._transfer_lock: return (self._upload_terminate or len(self._exceptions) > 0 or - (self._all_remote_files_processed and + (self._all_local_files_processed and len(self._upload_set) == 0 and len(self._transfer_set) == 0)) @@ -133,7 +133,7 @@ def termination_check_md5(self): with self._md5_meta_lock: with self._upload_lock: return (self._upload_terminate or - (self._all_remote_files_processed and + (self._all_local_files_processed and len(self._md5_map) == 0 and len(self._upload_set) == 0)) @@ -301,6 +301,17 @@ def _wait_for_upload_threads(self, terminate): for thr in self._upload_threads: thr.join() + def _wait_for_transfer_threads(self, terminate): + # type: (Uploader, bool) -> None + """Wait for transfer threads + :param Uploader self: this + :param bool terminate: terminate threads + """ + if terminate: + self._upload_terminate = terminate + for thr in self._transfer_threads: + thr.join() + def _worker_thread_transfer(self): # type: (Uploader) -> None """Worker thread transfer @@ -320,7 +331,7 @@ def _worker_thread_transfer(self): def _process_transfer(self, ud, ase, offsets, data): # issue put range - self._put_data(ase, offsets, data) + self._put_data(ud, ase, offsets, data) # accounting with self._transfer_lock: self._transfer_set.remove( @@ -328,9 +339,32 @@ def _process_transfer(self, ud, ase, offsets, data): self._upload_bytes_sofar += offsets.num_bytes ud.complete_offset_upload() - def _put_data(self, ase, offsets, data): + def _put_data(self, ud, ase, offsets, data): print('UL', offsets) - if ase.mode == blobxfer.models.azure.StorageModes.File: + if ase.mode == blobxfer.models.azure.StorageModes.Append: + raise NotImplementedError() + elif ase.mode == blobxfer.models.azure.StorageModes.Block: + if offsets.chunk_num == 0: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # handle one-shot uploads + if ud.is_one_shot_block_blob: + metadata = ud.generate_metadata() + if ud.must_compute_md5: + digest = blobxfer.util.base64_encode_as_string( + ud.md5.digest()) + else: + digest = None + blobxfer.operations.azure.blob.block.create_blob( + ase, data, digest, metadata, + timeout=self._general_options.timeout_sec) + return + # upload block + 
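The uploader now splits work between upload threads, which read the local view, optionally encrypt, and stage chunks, and transfer threads, which issue the actual put calls and do the byte accounting. A minimal stdlib sketch of that producer/consumer hand-off with a shared in-flight set, using trivial stand-ins for the real descriptor and put operations:

import queue
import threading

transfer_queue = queue.Queue()
transfer_lock = threading.Lock()
transfer_set = set()
producer_done = threading.Event()

def stage_chunks(chunks):
    # stand-in for _worker_thread_upload: record each chunk as in-flight,
    # then hand it to the transfer queue
    for chunk_id, data in chunks:
        with transfer_lock:
            transfer_set.add(chunk_id)
        transfer_queue.put((chunk_id, data))
    producer_done.set()

def transfer_worker():
    # stand-in for _worker_thread_transfer: issue the put, then account
    while True:
        with transfer_lock:
            finished = producer_done.is_set() and len(transfer_set) == 0
        if finished:
            return
        try:
            chunk_id, data = transfer_queue.get(block=False, timeout=0.03)
        except queue.Empty:
            continue
        # a real worker would call put_block / put_page / update_range here
        with transfer_lock:
            transfer_set.remove(chunk_id)

workers = [threading.Thread(target=transfer_worker) for _ in range(2)]
for thr in workers:
    thr.start()
stage_chunks([(i, b'\0' * 4) for i in range(8)])
for thr in workers:
    thr.join()
print('chunks remaining in flight:', len(transfer_set))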
blobxfer.operations.azure.blob.block.put_block( + ase, offsets, data, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.File: if offsets.chunk_num == 0: # create container if necessary blobxfer.operations.azure.file.create_share( @@ -346,13 +380,10 @@ def _put_data(self, ase, offsets, data): blobxfer.operations.azure.file.create_file( ase, timeout=self._general_options.timeout_sec) # upload range - blobxfer.operations.azure.file.put_file_range( - ase, offsets, data, timeout=self._general_options.timeout_sec) - elif ase.mode == blobxfer.models.azure.StorageModes.Append: - raise NotImplementedError() - elif ase.mode == blobxfer.models.azure.StorageModes.Block: - # TODO handle one-shot uploads for block blobs (get md5 as well) - raise NotImplementedError() + if data is not None: + blobxfer.operations.azure.file.put_file_range( + ase, offsets, data, + timeout=self._general_options.timeout_sec) elif ase.mode == blobxfer.models.azure.StorageModes.Page: if offsets.chunk_num == 0: # create container if necessary @@ -362,6 +393,8 @@ def _put_data(self, ase, offsets, data): # create remote blob blobxfer.operations.azure.blob.page.create_blob( ase, timeout=self._general_options.timeout_sec) + if data is None: + return # align page aligned = blobxfer.util.page_align_content_length( offsets.num_bytes) @@ -382,9 +415,9 @@ def _worker_thread_upload(self): import time while not self.termination_check: try: - if (len(self._transfer_set) > - self._general_options.concurrency.transfer_threads): - time.sleep(0.03) + if (len(self._transfer_set) >= + self._general_options.concurrency.transfer_threads * 2): + time.sleep(0.5) continue else: ud = self._upload_queue.get(False, 0.03) @@ -408,7 +441,7 @@ def _process_upload_descriptor(self, ud): offsets, resume_bytes = ud.next_offsets() # add resume bytes to counter if resume_bytes is not None: - with self._upload_lock: + with self._transfer_lock: self._upload_bytes_sofar += resume_bytes logger.debug('adding {} sofar {} from {}'.format( resume_bytes, self._upload_bytes_sofar, ud._ase.name)) @@ -474,33 +507,67 @@ def _process_upload_descriptor(self, ud): self._transfer_queue.put((ud, ase, offsets, data)) def _finalize_file(self, ud): - # create encryption metadata for file/blob - if ud.entity.is_encrypted: - # TODO - pass + metadata = ud.generate_metadata() # put block list for non one-shot block blobs if ud.requires_put_block_list: - # TODO - pass - # set md5 blob property if not encrypted - if ud.requires_set_blob_properties_md5: - digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) - blobxfer.operations.azure.blob.page.set_blob_md5( - ud.entity, digest, timeout=self._general_options.timeout_sec) - if blobxfer.util.is_not_empty(ud.entity.replica_targets): - for ase in ud.entity.replica_targets: - blobxfer.operations.azure.blob.page.set_blob_md5( - ase, digest, timeout=self._general_options.timeout_sec) - # set md5 file property if not encrypted - if ud.requires_set_file_properties_md5: - digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) - blobxfer.operations.azure.file.set_file_md5( - ud.entity, digest, timeout=self._general_options.timeout_sec) + if ud.must_compute_md5: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + else: + digest = None + blobxfer.operations.azure.blob.block.put_block_list( + ud.entity, ud.last_block_num, digest, metadata, + timeout=self._general_options.timeout_sec) if blobxfer.util.is_not_empty(ud.entity.replica_targets): for ase in 
ud.entity.replica_targets: - blobxfer.operations.azure.file.set_file_md5( - ase, digest, timeout=self._general_options.timeout_sec) - # TODO set file metadata if encrypted + blobxfer.operations.azure.blob.block.put_block_list( + ase, ud.last_block_num, digest, metadata, + timeout=self._general_options.timeout_sec) + # page blob finalization + if ud.remote_is_page_blob: + # set md5 page blob property if required + if ud.requires_non_encrypted_md5_put: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.blob.page.set_blob_md5( + ud.entity, digest, + timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.page.set_blob_md5( + ase, digest, + timeout=self._general_options.timeout_sec) + # set metadata if needed + if blobxfer.util.is_not_empty(metadata): + blobxfer.operations.azure.blob.page.set_blob_metadata( + ud.entity, metadata, + timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.page.set_blob_metadata( + ase, metadata, + timeout=self._general_options.timeout_sec) + # azure file finalization + if ud.remote_is_file: + # set md5 file property if required + if ud.requires_non_encrypted_md5_put: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.file.set_file_md5( + ud.entity, digest, + timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.file.set_file_md5( + ase, digest, + timeout=self._general_options.timeout_sec) + # set file metadata if needed + if blobxfer.util.is_not_empty(metadata): + blobxfer.operations.azure.file.set_file_metadata( + ud.entity, metadata, + timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.file.set_file_metadata( + ase, metadata, + timeout=self._general_options.timeout_sec) def _cleanup_temporary_files(self): # type: (Uploader) -> None @@ -516,18 +583,68 @@ def _cleanup_temporary_files(self): except Exception as e: logger.exception(e) + def _get_destination_paths(self): + # type: (Uploader) -> + # Tuple[blobxfer.operations.azure.StorageAccount, str, str] + """Get destination paths + :param Uploader self: this + :rtype: tuple + :return: (storage account, container, name) + """ + for dst in self._spec.destinations: + for dpath in dst.paths: + sdpath = str(dpath) + cont, dir = blobxfer.util.explode_azure_path(sdpath) + sa = self._creds.get_storage_account( + dst.lookup_storage_account(sdpath)) + yield sa, cont, dir, dpath + def _delete_extraneous_files(self): # type: (Uploader) -> None - """Delete extraneous files cataloged + """Delete extraneous files on the remote :param Uploader self: this """ - logger.info('attempting to delete {} extraneous files'.format( - len(self._delete_after))) - for file in self._delete_after: - try: - file.unlink() - except OSError: - pass + if not self._spec.options.delete_extraneous_destination: + return + # list blobs for all destinations + checked = set() + deleted = 0 + print(self._delete_exclude) + for sa, container, _, _ in self._get_destination_paths(): + key = ';'.join((sa.name, sa.endpoint, container)) + if key in checked: + continue + logger.debug( + 'attempting to delete 
extraneous blobs/files from: {}'.format( + key)) + if (self._spec.options.mode == + blobxfer.models.azure.StorageModes.File): + files = blobxfer.operations.azure.file.list_all_files( + sa.file_client, container, + timeout=self._general_options.timeout_sec) + for file in files: + id = self._create_deletion_id( + sa.file_client, container, file) + print(id) + if id not in self._delete_exclude: + blobxfer.operations.azure.file.delete_file( + sa.file_client, container, file, + timeout=self._general_options.timeout_sec) + deleted += 1 + else: + blobs = blobxfer.operations.azure.blob.list_all_blobs( + sa.block_blob_client, container, + timeout=self._general_options.timeout_sec) + for blob in blobs: + id = self._create_deletion_id( + sa.block_blob_client, container, blob.name) + if id not in self._delete_exclude: + blobxfer.operations.azure.blob.delete_blob( + sa.block_blob_client, container, blob.name, + timeout=self._general_options.timeout_sec) + deleted += 1 + checked.add(key) + logger.info('deleted {} extraneous blobs/files'.format(deleted)) def _check_upload_conditions(self, local_path, rfile): # type: (Uploader, blobxfer.models.upload.LocalPath, @@ -603,9 +720,9 @@ def _check_for_existing_remote(self, sa, cont, name): ase = blobxfer.models.azure.StorageEntity(cont, ed) if (self._spec.options.mode == blobxfer.models.azure.StorageModes.File): - ase.populate_from_file(sa, fp) + ase.populate_from_file(sa, fp, name) else: - ase.populate_from_blob(sa, fp) + ase.populate_from_blob(sa, fp, name) else: ase = None return ase @@ -618,6 +735,7 @@ def _generate_destination_for_source(self, local_path): """ # construct stripped destination path spath = local_path.relative_path + # apply strip components if self._spec.options.strip_components > 0: _rparts = local_path.relative_path.parts _strip = min( @@ -625,53 +743,47 @@ def _generate_destination_for_source(self, local_path): ) if _strip > 0: spath = pathlib.Path(*_rparts[_strip:]) - # for each destination: - # 1. prepend non-container path - # 2. bind client from mode - # 3. perform get blob or file properties - for dst in self._spec.destinations: - for dpath in dst.paths: - sdpath = str(dpath) - cont, dir = blobxfer.util.explode_azure_path(sdpath) - # apply rename - if self._spec.options.rename: - name = dir - else: - name = str(spath / dir) - if blobxfer.util.is_none_or_empty(name): - raise ValueError( - 'must specify a container for destination: {}'.format( - dpath)) - # apply strip components - sa = self._creds.get_storage_account( - dst.lookup_storage_account(sdpath)) - # do not check for existing remote right now if striped - # vectored io mode - if (self._spec.options.vectored_io.distribution_mode == - blobxfer.models.upload. - VectoredIoDistributionMode.Stripe): - ase = None + # create a storage entity for each destination + for sa, cont, name, dpath in self._get_destination_paths(): + # apply rename + if not self._spec.options.rename: + name = str(spath / name) + if blobxfer.util.is_none_or_empty(name): + raise ValueError( + ('invalid destination, must specify a container or ' + 'fileshare and remote file name: {}').format(dpath)) + # do not check for existing remote right now if striped + # vectored io mode + if (self._spec.options.vectored_io.distribution_mode == + blobxfer.models.upload. 
+ VectoredIoDistributionMode.Stripe): + ase = None + else: + ase = self._check_for_existing_remote(sa, cont, name) + if ase is None: + if self._spec.options.rsa_public_key: + ed = blobxfer.models.crypto.EncryptionMetadata() else: - ase = self._check_for_existing_remote(sa, cont, name) - if ase is None: - if self._spec.options.rsa_public_key: - ed = blobxfer.models.crypto.EncryptionMetadata() - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_local( - sa, cont, name, self._spec.options.mode) - yield sa, ase + ed = None + ase = blobxfer.models.azure.StorageEntity(cont, ed) + ase.populate_from_local( + sa, cont, name, self._spec.options.mode) + yield sa, ase def _create_unique_id(self, src, ase): return ';'.join( - (str(src.absolute_path), ase._client.account_name, ase.path) + (str(src.absolute_path), ase._client.primary_endpoint, ase.path) ) def _create_unique_transfer_id(self, local_path, ase, offsets): return ';'.join( - (str(local_path.absolute_path), ase._client.account_name, ase.path, - str(local_path.view.fd_start), str(offsets.range_start)) + (str(local_path.absolute_path), ase._client.primary_endpoint, + ase.path, str(local_path.view.fd_start), str(offsets.range_start)) + ) + + def _create_deletion_id(self, client, container, name): + return ';'.join( + (client.primary_endpoint, container, name) ) def append_slice_suffix_to_name(self, name, slice): @@ -687,41 +799,68 @@ def _vectorize_and_bind(self, local_path, dest): """ if (self._spec.options.vectored_io.distribution_mode == blobxfer.models.upload.VectoredIoDistributionMode.Stripe): - num_dest = len(dest) # compute total number of slices slices = int(math.ceil( - local_path.size / + local_path.total_size / self._spec.options.vectored_io.stripe_chunk_size_bytes)) + # check if vectorization is possible + if slices == 1: + sa, ase = dest[0] + action = self._check_upload_conditions(local_path, ase) + yield action, local_path, ase + return + num_dest = len(dest) logger.debug( '{} slices for vectored out of {} to {} destinations'.format( slices, local_path.absolute_path, num_dest)) + # pre-populate slice map for next pointers + slice_map = {} + for i in range(0, slices): + sa, ase = dest[i % num_dest] + name = self.append_slice_suffix_to_name(ase.name, i) + sase = self._check_for_existing_remote(sa, ase.container, name) + if sase is None: + if self._spec.options.rsa_public_key: + ed = blobxfer.models.crypto.EncryptionMetadata() + else: + ed = None + sase = blobxfer.models.azure.StorageEntity( + ase.container, ed) + sase.populate_from_local( + sa, ase.container, name, self._spec.options.mode) + slice_map[i] = sase # create new local path to ase mappings curr = 0 - slice = 0 for i in range(0, slices): start = curr end = ( curr + self._spec.options.vectored_io.stripe_chunk_size_bytes ) - if end > local_path.size: - end = local_path.size - sa, ase = dest[i % num_dest] - name = self.append_slice_suffix_to_name(ase.name, slice) - ase = self._check_for_existing_remote(sa, ase.container, name) + if end > local_path.total_size: + end = local_path.total_size + ase = slice_map[i] + if i < slices - 1: + next_entry = blobxfer.models.metadata.\ + create_vectored_io_next_entry(slice_map[i+1]) + else: + next_entry = None lp_slice = blobxfer.models.upload.LocalPath( parent_path=local_path.parent_path, relative_path=local_path.relative_path, view=blobxfer.models.upload.LocalPathView( fd_start=start, fd_end=end, - slice_num=slice, + slice_num=i, + mode=self._spec.options.vectored_io.distribution_mode, + 
total_slices=slices, + next=next_entry, ) ) + print(lp_slice.view) action = self._check_upload_conditions(lp_slice, ase) yield action, lp_slice, ase - start += curr - slice += 1 + curr = end elif (self._spec.options.vectored_io.distribution_mode == blobxfer.models.upload.VectoredIoDistributionMode.Replica): action_map = {} @@ -794,9 +933,14 @@ def _run(self): ] for action, lp, ase in self._vectorize_and_bind(src, dest): print(lp.parent_path, lp.relative_path, lp.absolute_path, action, ase.container, ase.name) - print(lp.size, lp.mode, lp.uid, lp.gid) + print(lp.total_size, lp.size, lp.mode, lp.uid, lp.gid) print(self._create_unique_id(lp, ase)) print('replicas', len(ase.replica_targets) if ase.replica_targets is not None else 'none') + if self._spec.options.delete_extraneous_destination: + self._delete_exclude.add( + self._create_deletion_id( + ase._client, ase.container, ase.name) + ) if action == UploadAction.Skip: skipped_files += 1 skipped_size += ase.size if ase.size is not None else 0 @@ -818,16 +962,17 @@ def _run(self): upload_size_mib = self._upload_bytes_total / blobxfer.util.MEGABYTE # set remote files processed with self._md5_meta_lock: - self._all_remote_files_processed = True + self._all_local_files_processed = True logger.debug( - ('{0} remote files processed, waiting for upload completion ' + ('{0} local files processed, waiting for upload completion ' 'of {1:.4f} MiB').format(nfiles, upload_size_mib)) del nfiles del total_size del skipped_files del skipped_size - # wait for downloads to complete + # wait for uploads to complete self._wait_for_upload_threads(terminate=False) + self._wait_for_transfer_threads(terminate=False) end_time = blobxfer.util.datetime_now() # update progress bar self._update_progress_bar() @@ -876,6 +1021,7 @@ def start(self): 'KeyboardInterrupt detected, force terminating ' 'processes and threads (this may take a while)...') try: + self._wait_for_transfer_threads(terminate=True) self._wait_for_upload_threads(terminate=True) finally: self._cleanup_temporary_files() diff --git a/blobxfer/util.py b/blobxfer/util.py index 9b4e644..cce84f0 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -40,6 +40,7 @@ from os import scandir as scandir except ImportError: # noqa from scandir import scandir as scandir +import platform import re import sys # non-stdlib imports @@ -50,6 +51,7 @@ # global defines MEGABYTE = 1048576 +_ON_WINDOWS = platform.system() == 'Windows' _REGISTERED_LOGGER_HANDLERS = [] _PAGEBLOB_BOUNDARY = 512 @@ -63,6 +65,15 @@ def on_python2(): return future.utils.PY2 +def on_windows(): + # type: (None) -> bool + """Execution on Windows + :rtype: bool + :return: if on Windows + """ + return _ON_WINDOWS + + def setup_logger(logger, logfile): # noqa # type: (logger, str) -> None """Set up logger""" diff --git a/cli/cli.py b/cli/cli.py index bc11ef6..c3b9b2d 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -317,7 +317,8 @@ def callback(ctx, param, value): expose_value=False, type=int, default=4194304, - help='Block or chunk size in bytes [4194304]', + help='Block or chunk size in bytes; set to 0 for auto-select ' + 'on upload [0]', callback=callback)(f) @@ -381,10 +382,9 @@ def callback(ctx, param, value): clictx.cli_options['file_attributes'] = value return value return click.option( - '--file-attributes', + '--file-attributes/--no-file-attributes', expose_value=False, default=False, - is_flag=False, help='Store or restore file attributes [False]', callback=callback)(f) From 69edbc9fde68a44d7c1513cd1b38799217779b62 Mon Sep 17 00:00:00 2001 From: 
Fred Park Date: Thu, 25 May 2017 09:01:57 -0700 Subject: [PATCH 30/47] Client-side encryption upload support - Split transfer threads into disk/transfer - Various fixes - Update dependencies to latest --- blobxfer/models/crypto.py | 74 ++++- blobxfer/models/options.py | 24 +- blobxfer/models/upload.py | 48 ++- blobxfer/operations/azure/blob/__init__.py | 19 +- blobxfer/operations/azure/blob/page.py | 2 +- blobxfer/operations/azure/file.py | 18 +- blobxfer/operations/crypto.py | 8 +- blobxfer/operations/progress.py | 28 +- blobxfer/operations/upload.py | 347 +++++++++++---------- cli/cli.py | 17 +- cli/settings.py | 2 + setup.py | 8 +- 12 files changed, 388 insertions(+), 207 deletions(-) diff --git a/blobxfer/models/crypto.py b/blobxfer/models/crypto.py index 6a50c9c..c6670f2 100644 --- a/blobxfer/models/crypto.py +++ b/blobxfer/models/crypto.py @@ -106,6 +106,7 @@ class EncryptionMetadata(object): _JSON_KEY_ENCRYPTED_AUTHKEY = 'EncryptedAuthenticationKey' _JSON_KEY_CONTENT_IV = 'ContentEncryptionIV' _JSON_KEY_KEYID = 'KeyId' + _JSON_KEY_KEY_WRAPPING_METADATA = 'KeyWrappingMetadata' _JSON_KEY_BLOBXFER_EXTENSIONS = 'BlobxferExtensions' _JSON_KEY_PREENCRYPTED_MD5 = 'PreEncryptedContentMD5' @@ -319,9 +320,76 @@ def convert_from_json(self, md, entityname, rsaprivatekey): '{}: encryption metadata authentication failed'.format( entityname)) - def convert_to_json_with_mac(self): - # TODO - pass + def convert_to_json_with_mac(self, md5digest, hmacdigest): + # type: (EncryptionMetadata, str, str) -> dict + """Constructs metadata for encryption + :param EncryptionMetadata self: this + :param str md5digest: md5 digest + :param str hmacdigest: hmac-sha256 digest (data) + :rtype: dict + :return: encryption metadata + """ + enc_content_key = blobxfer.operations.crypto.\ + rsa_encrypt_key_base64_encoded( + None, self._rsa_public_key, self.symmetric_key) + enc_sign_key = blobxfer.operations.crypto.\ + rsa_encrypt_key_base64_encoded( + None, self._rsa_public_key, self.signing_key) + + encjson = { + EncryptionMetadata._JSON_KEY_ENCRYPTION_MODE: + EncryptionMetadata._ENCRYPTION_MODE, + EncryptionMetadata._JSON_KEY_CONTENT_IV: + blobxfer.util.base64_encode_as_string(self.content_encryption_iv), + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY: { + EncryptionMetadata._JSON_KEY_KEYID: 'private:pem', + EncryptionMetadata._JSON_KEY_ENCRYPTED_KEY: enc_content_key, + EncryptionMetadata._JSON_KEY_ENCRYPTED_AUTHKEY: enc_sign_key, + EncryptionMetadata._JSON_KEY_ALGORITHM: + EncryptionMetadata._ENCRYPTED_KEY_SCHEME, + }, + EncryptionMetadata._JSON_KEY_ENCRYPTION_AGENT: { + EncryptionMetadata._JSON_KEY_PROTOCOL: + EncryptionMetadata._ENCRYPTION_PROTOCOL_VERSION, + EncryptionMetadata._JSON_KEY_ENCRYPTION_ALGORITHM: + EncryptionMetadata._ENCRYPTION_ALGORITHM, + }, + EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH: { + EncryptionMetadata._JSON_KEY_ALGORITHM: + EncryptionMetadata._AUTH_ALGORITHM, + }, + EncryptionMetadata._JSON_KEY_KEY_WRAPPING_METADATA: {}, + } + if md5digest is not None: + encjson[EncryptionMetadata._JSON_KEY_BLOBXFER_EXTENSIONS] = { + EncryptionMetadata._JSON_KEY_PREENCRYPTED_MD5: md5digest + } + if hmacdigest is not None: + encjson[EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH][ + EncryptionMetadata._JSON_KEY_MAC] = hmacdigest + bencjson = json.dumps( + encjson, sort_keys=True, ensure_ascii=False).encode( + EncryptionMetadata._AUTH_ENCODING_TYPE) + encjson = { + EncryptionMetadata._METADATA_KEY_NAME: + json.dumps(encjson, sort_keys=True) + } + # compute MAC over encjson + hmacsha256 = hmac.new(self._signkey, 
digestmod=hashlib.sha256) + hmacsha256.update(bencjson) + authjson = { + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH: { + EncryptionMetadata._JSON_KEY_ALGORITHM: + EncryptionMetadata._AUTH_ALGORITHM, + EncryptionMetadata._JSON_KEY_AUTH_ENCODING: + EncryptionMetadata._AUTH_ENCODING_TYPE, + EncryptionMetadata._JSON_KEY_MAC: + blobxfer.util.base64_encode_as_string(hmacsha256.digest()), + } + } + encjson[EncryptionMetadata._METADATA_KEY_AUTH_NAME] = json.dumps( + authjson, sort_keys=True) + return encjson def initialize_hmac(self): # type: (EncryptionMetadata) -> hmac.HMAC diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index 14e4e09..cdc32df 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -103,15 +103,19 @@ class Concurrency(object): """Concurrency Options""" - def __init__(self, crypto_processes, md5_processes, transfer_threads): + def __init__( + self, crypto_processes, md5_processes, disk_threads, + transfer_threads): """Ctor for Concurrency Options :param Concurrency self: this :param int crypto_processes: number of crypto procs :param int md5_processes: number of md5 procs + :param int disk_threads: number of disk threads :param int transfer_threads: number of transfer threads """ self.crypto_processes = crypto_processes self.md5_processes = md5_processes + self.disk_threads = disk_threads self.transfer_threads = transfer_threads # allow crypto processes to be zero (which will inline crypto # routines with main process) @@ -121,11 +125,21 @@ def __init__(self, crypto_processes, md5_processes, transfer_threads): self.md5_processes = multiprocessing.cpu_count() // 2 if self.md5_processes < 1: self.md5_processes = 1 - if self.transfer_threads is None or self.transfer_threads < 1: - self.transfer_threads = multiprocessing.cpu_count() * 4 - # cap maximum number of threads from cpu count to 96 - if self.transfer_threads > 96: + auto_disk = False + if self.disk_threads is None or self.disk_threads < 1: + self.disk_threads = multiprocessing.cpu_count() * 4 + # cap maximum number of disk threads from cpu count to 96 + if self.disk_threads > 96: self.transfer_threads = 96 + auto_disk = True + if self.transfer_threads is None or self.transfer_threads < 1: + if auto_disk: + self.transfer_threads = self.disk_threads << 1 + else: + self.transfer_threads = multiprocessing.cpu_count() * 2 + # cap maximum number of threads from cpu count to 64 + if self.transfer_threads > 64: + self.transfer_threads = 64 class General(object): diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index 07852cb..da3a1fe 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -387,6 +387,10 @@ def requires_set_file_properties_md5(self): self.remote_is_file) def complete_offset_upload(self): + # type: (Descriptor) -> None + """Complete the upload for the offset + :param Descriptor self: this + """ with self._meta_lock: self._outstanding_ops -= 1 # TODO save resume state @@ -407,7 +411,7 @@ def _initialize_encryption(self, options): :param blobxfer.models.options.Upload options: upload options """ # TODO support append blobs? 
- if (options.rsa_public_key is not None and self._ase.size > 0 and + if (options.rsa_public_key is not None and self.local_path.size > 0 and (self._ase.mode == blobxfer.models.azure.StorageModes.Block or self._ase.mode == blobxfer.models.azure.StorageModes.File)): em = blobxfer.models.crypto.EncryptionMetadata() @@ -426,7 +430,7 @@ def _compute_remote_size(self): if size > 0: if self._ase.is_encrypted: # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs - allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ + allocatesize = (size // self._AES_BLOCKSIZE + 1) * \ self._AES_BLOCKSIZE else: allocatesize = size @@ -541,8 +545,9 @@ def _initialize_integrity_checkers(self, options): if blobxfer.util.is_none_or_empty( self._ase.encryption_metadata.symmetric_key): raise RuntimeError( - 'symmetric key is invalid: provide RSA private key ' - 'or metadata corrupt') + ('symmetric key is invalid: provide RSA private key ' + 'or metadata corrupt for {}').format( + self.local_path.absolute_path)) self.hmac = self._ase.encryption_metadata.initialize_hmac() # both hmac and md5 can be enabled if options.store_file_properties.md5: @@ -583,10 +588,19 @@ def next_offsets(self): ), resume_bytes def read_data(self, offsets): + # type: (Descriptor, Offsets) -> bytes + """Read data from file + :param Descriptor self: this + :param Offsets offsets: offsets + :rtype: bytes + :return: file data + """ if offsets.num_bytes == 0: return None # compute start from view start = self.local_path.view.fd_start + offsets.range_start + # encrypted offsets will read past the end of the file due + # to padding, but will be accounted for after encryption+padding with self.local_path.absolute_path.open('rb') as fd: fd.seek(start, 0) data = fd.read(offsets.num_bytes) @@ -596,11 +610,28 @@ def read_data(self, offsets): return data def generate_metadata(self): + # type: (Descriptor) -> dict + """Generate metadata for descriptor + :param Descriptor self: this + :rtype: dict or None + :return: kv metadata dict + """ genmeta = {} encmeta = {} # generate encryption metadata if self._ase.is_encrypted: - raise NotImplementedError() + if self.must_compute_md5: + md5digest = blobxfer.util.base64_encode_as_string( + self.md5.digest()) + else: + md5digest = None + if self.hmac is not None: + hmacdigest = blobxfer.util.base64_encode_as_string( + self.hmac.digest()) + else: + hmacdigest = None + encmeta = self._ase.encryption_metadata.convert_to_json_with_mac( + md5digest, hmacdigest) # generate file attribute metadata if self._store_file_attr: merged = blobxfer.models.metadata.generate_fileattr_metadata( @@ -613,12 +644,13 @@ def generate_metadata(self): generate_vectored_io_stripe_metadata(self.local_path, genmeta) if merged is not None: genmeta = merged - metadata = {} + if len(encmeta) > 0: + metadata = encmeta + else: + metadata = {} if len(genmeta) > 0: metadata[blobxfer.models.metadata.JSON_KEY_BLOBXFER_METADATA] = \ json.dumps(genmeta, ensure_ascii=False, sort_keys=True) - if len(encmeta) > 0: - raise NotImplementedError() if len(metadata) == 0: return None return metadata diff --git a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py index 49b3678..63fd4a1 100644 --- a/blobxfer/operations/azure/blob/__init__.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -207,10 +207,15 @@ def create_container(ase, containers_created, timeout=None): return key = ase.client.account_name + ':blob=' + ase.container if key not in containers_created: - ase.client.create_container( - 
container_name=ase.container, - fail_on_exist=False, - timeout=timeout) - containers_created.add(key) - logger.info('created blob container {} on storage account {}'.format( - ase.container, ase.client.account_name)) + try: + ase.client.create_container( + container_name=ase.container, + fail_on_exist=True, + timeout=timeout) + except azure.common.AzureConflictHttpError: + pass + else: + containers_created.add(key) + logger.info( + 'created blob container {} on storage account {}'.format( + ase.container, ase.client.account_name)) diff --git a/blobxfer/operations/azure/blob/page.py b/blobxfer/operations/azure/blob/page.py index 859b4bb..4223a30 100644 --- a/blobxfer/operations/azure/blob/page.py +++ b/blobxfer/operations/azure/blob/page.py @@ -73,7 +73,7 @@ def create_blob(ase, timeout=None): ase.client.create_blob( container_name=ase.container, blob_name=ase.name, - content_length=ase.size, + content_length=blobxfer.util.page_align_content_length(ase.size), content_settings=azure.storage.blob.models.ContentSettings( content_type=blobxfer.util.get_mime_type(ase.name) ), diff --git a/blobxfer/operations/azure/file.py b/blobxfer/operations/azure/file.py index 32f1b13..3a905bf 100644 --- a/blobxfer/operations/azure/file.py +++ b/blobxfer/operations/azure/file.py @@ -251,13 +251,17 @@ def create_share(ase, containers_created, timeout=None): return key = ase.client.account_name + ':file=' + ase.container if key not in containers_created: - ase.client.create_share( - share_name=ase.container, - fail_on_exist=False, - timeout=timeout) - containers_created.add(key) - logger.info('created file share {} on storage account {}'.format( - ase.container, ase.client.account_name)) + try: + ase.client.create_share( + share_name=ase.container, + fail_on_exist=True, + timeout=timeout) + except azure.common.AzureConflictHttpError: + pass + else: + containers_created.add(key) + logger.info('created file share {} on storage account {}'.format( + ase.container, ase.client.account_name)) def create_all_parent_directories(ase, dirs_created, timeout=None): diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index 3097b96..6f1e011 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -242,10 +242,11 @@ def _worker_process(self): """ while not self.terminated: try: - inst = self._task_queue.get(True, 0.25) + inst = self._task_queue.get(True, 0.1) except queue.Empty: continue - if inst[0] == CryptoAction.Encrypt: + # UNUSED due to AES256-CBC FullBlob mode + if inst[0] == CryptoAction.Encrypt: # noqa local_file, offsets, symkey, iv = \ inst[1], inst[2], inst[3], inst[4] with open(local_file, 'rb') as fd: @@ -295,7 +296,8 @@ def add_decrypt_chunk( iv, hmac_datafile) ) - def add_encrypt_chunk(self, local_file, offsets, symkey, iv): + # UNUSED due to AES256-CBC FullBlob mode + def add_encrypt_chunk(self, local_file, offsets, symkey, iv): # noqa # type: (CryptoOffload, pathlib.Path, blobxfer.models.upload.Offsets, # bytes, bytes) -> None """Add a chunk to encrypt diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index b0f3bf4..b2b6c26 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -110,19 +110,23 @@ def output_parameters(general_options, spec): # specific preamble if isinstance(spec, blobxfer.models.download.Specification): log.append(' transfer direction: {}'.format('Azure -> local')) - log.append(' workers: xfer={} md5={} crypto={}'.format( - general_options.concurrency.transfer_threads, - 
general_options.concurrency.md5_processes - if spec.options.check_file_md5 else 0, - general_options.concurrency.crypto_processes)) + log.append( + ' workers: disk={} xfer={} md5={} crypto={}'.format( + general_options.concurrency.disk_threads, + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.options.check_file_md5 else 0, + general_options.concurrency.crypto_processes)) elif isinstance(spec, blobxfer.models.upload.Specification): log.append(' transfer direction: {}'.format('local -> Azure')) - log.append(' workers: xfer={} md5={} crypto={}'.format( - general_options.concurrency.transfer_threads, - general_options.concurrency.md5_processes - if spec.skip_on.md5_match or spec.options.store_file_properties.md5 - else 0, - general_options.concurrency.crypto_processes)) + log.append( + ' workers: disk={} xfer={} md5={} crypto={}'.format( + general_options.concurrency.disk_threads, + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.skip_on.md5_match or + spec.options.store_file_properties.md5 else 0, + 0)) # TODO handle synccopy spec @@ -161,6 +165,8 @@ def output_parameters(general_options, spec): log.append(' local destination: {}'.format( spec.destination.path)) elif isinstance(spec, blobxfer.models.upload.Specification): + log.append(' one shot bytes: {}'.format( + spec.options.one_shot_bytes)) log.append(' store properties: attr={} md5={}'.format( spec.options.store_file_properties.attributes, spec.options.store_file_properties.md5)) diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 3b38ea5..7f9230a 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -42,6 +42,7 @@ except ImportError: # noqa import Queue as queue import threading +import time # non-stdlib imports # local imports import blobxfer.models.crypto @@ -84,7 +85,7 @@ def __init__(self, general_options, creds, spec): self._upload_queue = queue.Queue() self._upload_set = set() self._upload_start_time = None - self._upload_threads = [] + self._disk_threads = [] self._upload_total = None self._upload_sofar = 0 self._upload_bytes_total = None @@ -137,6 +138,59 @@ def termination_check_md5(self): len(self._md5_map) == 0 and len(self._upload_set) == 0)) + @staticmethod + def create_unique_id(src, ase): + # type: (blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity) -> str + """Create a unique id given a LocalPath and StorageEntity + :param blobxfer.models.upload.LocalPath src: local path + :param blobxfer.models.azure.StorageEntity ase: azure storage entity + :rtype: str + :return: unique id for pair + """ + return ';'.join( + (str(src.absolute_path), ase._client.primary_endpoint, ase.path) + ) + + @staticmethod + def create_unique_transfer_id(local_path, ase, offsets): + # type: (blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity) -> str + """Create a unique transfer id given a offsets + :param blobxfer.models.upload.LocalPath local_path: local path + :param blobxfer.models.azure.StorageEntity ase: azure storage entity + :param blobxfer.models.upload.Offsets offsets: upload offsets + :rtype: str + :return: unique id for transfer + """ + return ';'.join( + (str(local_path.absolute_path), ase._client.primary_endpoint, + ase.path, str(local_path.view.fd_start), str(offsets.range_start)) + ) + + @staticmethod + def create_deletion_id(client, container, name): + # type: (azure.storage.StorageClient, str, str) -> str + """Create a 
unique deletion id + :param azure.storage.StorageClient client: storage client + :param str container: container name + :param str name: entity name + :rtype: str + :return: unique id for deletion + """ + return ';'.join((client.primary_endpoint, container, name)) + + @staticmethod + def append_slice_suffix_to_name(name, slice): + # type: (str, int) -> str + """Append a vectored io (stripe) slice suffix to a given name + :param str name: entity name + :param int slice: slice num + :rtype: str + :return: name with appended suffix + """ + return '{}.bxslice-{}'.format(name, slice) + def _update_progress_bar(self): # type: (Uploader) -> None """Update progress bar @@ -180,9 +234,9 @@ def _post_md5_skip_on_check(self, filename, md5_match): :param str filename: local filename :param bool md5_match: if MD5 matches """ - uid = self._create_unique_id(src, rfile) with self._md5_meta_lock: src, rfile = self._md5_map.pop(filename) + uid = blobxfer.operations.upload.Uploader.create_unique_id(src, rfile) if md5_match: with self._upload_lock: self._upload_set.remove(uid) @@ -214,36 +268,6 @@ def _check_for_uploads_from_md5(self): if result is not None: self._post_md5_skip_on_check(result[0], result[1]) - def _check_for_crypto_done(self): - # type: (Uploader) -> None - """Check queue for crypto done - :param Uploader self: this - """ - cv = self._crypto_offload.done_cv - while not self.termination_check: - result = None - cv.acquire() - while True: - result = self._crypto_offload.pop_done_queue() - if result is None: - # use cv timeout due to possible non-wake while running - cv.wait(1) - # check for terminating conditions - if self.termination_check: - break - else: - break - cv.release() - if result is not None: - try: - with self._upload_lock: - dd = self._ud_map[result] - dd.perform_chunked_integrity_check() - except KeyError: - # this can happen if all of the last integrity - # chunks are processed at once - pass - def _add_to_upload_queue(self, src, rfile, uid): # type: (Uploader, blobxfer.models.upload.LocalPath, # blobxfer.models.azure.StorageEntity, str) -> None @@ -266,16 +290,16 @@ def _add_to_upload_queue(self, src, rfile, uid): if self._upload_start_time is None: self._upload_start_time = blobxfer.util.datetime_now() - def _initialize_upload_threads(self): + def _initialize_disk_threads(self): # type: (Uploader) -> None - """Initialize upload threads + """Initialize disk threads :param Uploader self: this """ - logger.debug('spawning {} transfer threads'.format( + logger.debug('spawning {} disk threads'.format( self._general_options.concurrency.transfer_threads)) for _ in range(self._general_options.concurrency.transfer_threads): thr = threading.Thread(target=self._worker_thread_upload) - self._upload_threads.append(thr) + self._disk_threads.append(thr) thr.start() def _initialize_transfer_threads(self): @@ -290,15 +314,15 @@ def _initialize_transfer_threads(self): self._transfer_threads.append(thr) thr.start() - def _wait_for_upload_threads(self, terminate): + def _wait_for_disk_threads(self, terminate): # type: (Uploader, bool) -> None - """Wait for upload threads + """Wait for disk threads :param Uploader self: this :param bool terminate: terminate threads """ if terminate: self._upload_terminate = terminate - for thr in self._upload_threads: + for thr in self._disk_threads: thr.join() def _wait_for_transfer_threads(self, terminate): @@ -320,7 +344,7 @@ def _worker_thread_transfer(self): while not self.termination_check: try: ud, ase, offsets, data = self._transfer_queue.get( - 
block=False, timeout=0.03) + block=False, timeout=0.1) except queue.Empty: continue try: @@ -330,17 +354,40 @@ def _worker_thread_transfer(self): self._exceptions.append(e) def _process_transfer(self, ud, ase, offsets, data): + # type: (Uploader, blobxfer.models.upload.Descriptor, + # blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes) -> None + """Process transfer instructions + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param blobxfer.models.azure.StorageEntity ase: Storage entity + :param blobxfer.models.upload.Offsets offsets: offsets + :param bytes data: data to upload + """ # issue put range self._put_data(ud, ase, offsets, data) # accounting with self._transfer_lock: self._transfer_set.remove( - self._create_unique_transfer_id(ud.local_path, ase, offsets)) + blobxfer.operations.upload.Uploader.create_unique_transfer_id( + ud.local_path, ase, offsets)) self._upload_bytes_sofar += offsets.num_bytes + if offsets.chunk_num == 0: + self._upload_bytes_total += ase.size ud.complete_offset_upload() def _put_data(self, ud, ase, offsets, data): - print('UL', offsets) + # type: (Uploader, blobxfer.models.upload.Descriptor, + # blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes) -> None + """Put data in Azure + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param blobxfer.models.azure.StorageEntity ase: Storage entity + :param blobxfer.models.upload.Offsets offsets: offsets + :param bytes data: data to upload + """ + print('UL', offsets, ase.path, len(data) if data is not None else None) if ase.mode == blobxfer.models.azure.StorageModes.Append: raise NotImplementedError() elif ase.mode == blobxfer.models.azure.StorageModes.Block: @@ -352,7 +399,7 @@ def _put_data(self, ud, ase, offsets, data): # handle one-shot uploads if ud.is_one_shot_block_blob: metadata = ud.generate_metadata() - if ud.must_compute_md5: + if not ud.entity.is_encrypted and ud.must_compute_md5: digest = blobxfer.util.base64_encode_as_string( ud.md5.digest()) else: @@ -385,6 +432,7 @@ def _put_data(self, ud, ase, offsets, data): ase, offsets, data, timeout=self._general_options.timeout_sec) elif ase.mode == blobxfer.models.azure.StorageModes.Page: + # compute aligned size if offsets.chunk_num == 0: # create container if necessary blobxfer.operations.azure.blob.create_container( @@ -412,15 +460,15 @@ def _worker_thread_upload(self): """Worker thread upload :param Uploader self: this """ - import time while not self.termination_check: try: - if (len(self._transfer_set) >= - self._general_options.concurrency.transfer_threads * 2): - time.sleep(0.5) + if (len(self._transfer_set) > + self._general_options.concurrency. 
+ transfer_threads * 4): + time.sleep(0.2) continue else: - ud = self._upload_queue.get(False, 0.03) + ud = self._upload_queue.get(False, 0.1) except queue.Empty: continue try: @@ -461,23 +509,13 @@ def _process_upload_descriptor(self, ud): if offsets is None: self._upload_queue.put(ud) return - - # TODO encryption - # encrypt if necessary - if ud.entity.is_encrypted: - # send iv through hmac - ud.hmac_data(ud.current_iv) + if ud.entity.is_encrypted and ud.entity.size > 0: + # send iv through hmac if first chunk + if offsets.chunk_num == 0: + ud.hmac_data(ud.current_iv) # encrypt data - if self._crypto_offload is not None: - self._crypto_offload.add_encrypt_chunk( - str(ud.local_path.absolute_path), offsets, - ud.entity.encryption_metadata.symmetric_key, - ud.current_iv) - # encrypted data will be retrieved from a temp file once - # retrieved from crypto queue - return - else: + if self._crypto_offload is None: # read data from file and encrypt data = ud.read_data(offsets) encdata = blobxfer.operations.crypto.aes_cbc_encrypt_data( @@ -486,7 +524,19 @@ def _process_upload_descriptor(self, ud): # send encrypted data through hmac ud.hmac_data(encdata) data = encdata - # TODO save last 16 encrypted bytes for next IV + # save last 16 encrypted bytes for next IV + ud.current_iv = \ + encdata[-blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] + else: # noqa + # crypto offload is not supported with AES256-CBC FullBlob + raise NotImplementedError() + # self._crypto_offload.add_encrypt_chunk( + # str(ud.local_path.absolute_path), offsets, + # ud.entity.encryption_metadata.symmetric_key, + # ud.current_iv) + # encrypted data will be retrieved from a temp file once + # retrieved from crypto queue + # return_early = True else: data = ud.read_data(offsets) # re-enqueue for other threads to upload @@ -494,7 +544,7 @@ def _process_upload_descriptor(self, ud): # add data to transfer queue with self._transfer_lock: self._transfer_set.add( - self._create_unique_transfer_id( + blobxfer.operations.upload.Uploader.create_unique_transfer_id( ud.local_path, ud.entity, offsets)) self._transfer_queue.put((ud, ud.entity, offsets, data)) # iterate replicas @@ -502,15 +552,21 @@ def _process_upload_descriptor(self, ud): for ase in ud.entity.replica_targets: with self._transfer_lock: self._transfer_set.add( - self._create_unique_transfer_id( - ud.local_path, ase, offsets)) + blobxfer.operations.upload.Uploader. + create_unique_transfer_id(ud.local_path, ase, offsets) + ) self._transfer_queue.put((ud, ase, offsets, data)) def _finalize_file(self, ud): + # type: (Uploader, blobxfer.models.upload.Descriptor) -> None + """Finalize file upload + :param Uploader self: this + :param blobxfer.models.upload.Descriptor: upload descriptor + """ metadata = ud.generate_metadata() # put block list for non one-shot block blobs if ud.requires_put_block_list: - if ud.must_compute_md5: + if not ud.entity.is_encrypted and ud.must_compute_md5: digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) else: digest = None @@ -569,20 +625,6 @@ def _finalize_file(self, ud): ase, metadata, timeout=self._general_options.timeout_sec) - def _cleanup_temporary_files(self): - # type: (Uploader) -> None - """Cleanup temporary files in case of an exception or interrupt. - This function is not thread-safe. 
- :param Uploader self: this - """ - # iterate through dd map and cleanup files - for key in self._ud_map: - dd = self._ud_map[key] - try: - dd.cleanup_all_temporary_files() - except Exception as e: - logger.exception(e) - def _get_destination_paths(self): # type: (Uploader) -> # Tuple[blobxfer.operations.azure.StorageAccount, str, str] @@ -609,7 +651,6 @@ def _delete_extraneous_files(self): # list blobs for all destinations checked = set() deleted = 0 - print(self._delete_exclude) for sa, container, _, _ in self._get_destination_paths(): key = ';'.join((sa.name, sa.endpoint, container)) if key in checked: @@ -623,9 +664,8 @@ def _delete_extraneous_files(self): sa.file_client, container, timeout=self._general_options.timeout_sec) for file in files: - id = self._create_deletion_id( - sa.file_client, container, file) - print(id) + id = blobxfer.operations.upload.Uploader.\ + create_deletion_id(sa.file_client, container, file) if id not in self._delete_exclude: blobxfer.operations.azure.file.delete_file( sa.file_client, container, file, @@ -636,8 +676,9 @@ def _delete_extraneous_files(self): sa.block_blob_client, container, timeout=self._general_options.timeout_sec) for blob in blobs: - id = self._create_deletion_id( - sa.block_blob_client, container, blob.name) + id = blobxfer.operations.upload.Uploader.\ + create_deletion_id( + sa.block_blob_client, container, blob.name) if id not in self._delete_exclude: blobxfer.operations.azure.blob.delete_blob( sa.block_blob_client, container, blob.name, @@ -702,6 +743,14 @@ def _check_upload_conditions(self, local_path, rfile): return UploadAction.Skip def _check_for_existing_remote(self, sa, cont, name): + # type: (Uploader, blobxfer.operations.azure.StorageAccount, + # str, str) -> bobxfer.models.azure.StorageEntity + """Check for an existing remote file + :param Uploader self: this + :param blobxfer.operations.azure.StorageAccount sa: storage account + :param str cont: container + :param str name: entity name + """ if self._spec.options.mode == blobxfer.models.azure.StorageModes.File: fp = blobxfer.operations.azure.file.get_file_properties( sa.file_client, cont, name, @@ -728,10 +777,14 @@ def _check_for_existing_remote(self, sa, cont, name): return ase def _generate_destination_for_source(self, local_path): - # type: (Uploader, blobxfer.models.upload.LocalSourcePath) -> ??? 
+ # type: (Uploader, blobxfer.models.upload.LocalSourcePath) -> + # Tuple[blobxfer.operations.azure.StorageAccount, + # blobxfer.models.azure.StorageEntity) """Generate entities for source path :param Uploader self: this :param blobxfer.models.upload.LocalSourcePath local_path: local path + :rtype: tuple + :return: storage account, storage entity """ # construct stripped destination path spath = local_path.relative_path @@ -761,41 +814,24 @@ def _generate_destination_for_source(self, local_path): else: ase = self._check_for_existing_remote(sa, cont, name) if ase is None: - if self._spec.options.rsa_public_key: - ed = blobxfer.models.crypto.EncryptionMetadata() - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) + # encryption metadata will be populated later, if required + ase = blobxfer.models.azure.StorageEntity(cont, ed=None) ase.populate_from_local( sa, cont, name, self._spec.options.mode) yield sa, ase - def _create_unique_id(self, src, ase): - return ';'.join( - (str(src.absolute_path), ase._client.primary_endpoint, ase.path) - ) - - def _create_unique_transfer_id(self, local_path, ase, offsets): - return ';'.join( - (str(local_path.absolute_path), ase._client.primary_endpoint, - ase.path, str(local_path.view.fd_start), str(offsets.range_start)) - ) - - def _create_deletion_id(self, client, container, name): - return ';'.join( - (client.primary_endpoint, container, name) - ) - - def append_slice_suffix_to_name(self, name, slice): - return '{}.bxslice-{}'.format(name, slice) - def _vectorize_and_bind(self, local_path, dest): # type: (Uploader, blobxfer.models.upload.LocalPath, - # List[blobxfer.models.azure.StorageEntity]) -> None - """Vectorize local path to destinations and bind + # List[blobxfer.models.azure.StorageEntity]) -> + # Tuple[blobxfer.operations.upload.UploadAction, + # blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity] + """Vectorize local path to destinations, if necessary, and bind :param Uploader self: this :param blobxfer.models.LocalPath local_path: local path - :param list rfile: remote file + :param list dest: list of destination tuples (sa, ase) + :rtype: tuple + :return: action, LocalPath, ase """ if (self._spec.options.vectored_io.distribution_mode == blobxfer.models.upload.VectoredIoDistributionMode.Stripe): @@ -817,15 +853,13 @@ def _vectorize_and_bind(self, local_path, dest): slice_map = {} for i in range(0, slices): sa, ase = dest[i % num_dest] - name = self.append_slice_suffix_to_name(ase.name, i) + name = blobxfer.operations.upload.Uploader.\ + append_slice_suffix_to_name(ase.name, i) sase = self._check_for_existing_remote(sa, ase.container, name) if sase is None: - if self._spec.options.rsa_public_key: - ed = blobxfer.models.crypto.EncryptionMetadata() - else: - ed = None + # encryption metadata will be populated later, if required sase = blobxfer.models.azure.StorageEntity( - ase.container, ed) + ase.container, ed=None) sase.populate_from_local( sa, ase.container, name, self._spec.options.mode) slice_map[i] = sase @@ -857,7 +891,6 @@ def _vectorize_and_bind(self, local_path, dest): next=next_entry, ) ) - print(lp_slice.view) action = self._check_upload_conditions(lp_slice, ase) yield action, lp_slice, ase curr = end @@ -909,18 +942,21 @@ def _run(self): self._check_for_uploads_from_md5) # initialize crypto processes if self._general_options.concurrency.crypto_processes > 0: - self._crypto_offload = blobxfer.operations.crypto.CryptoOffload( - num_workers=self._general_options.concurrency.crypto_processes) - 
self._crypto_offload.initialize_check_thread( - self._check_for_crypto_done) - # initialize upload threads - self._initialize_upload_threads() + logger.warning( + 'crypto offload for upload is not possible due to ' + 'sequential nature of {} and FullBlob encryption mode'.format( + blobxfer.models.crypto.EncryptionMetadata. + _ENCRYPTION_ALGORITHM) + ) + # initialize worker threads + self._initialize_disk_threads() self._initialize_transfer_threads() # initialize local counters - nfiles = 0 - total_size = 0 skipped_files = 0 skipped_size = 0 + approx_total_bytes = 0 + self._upload_total = 0 + self._upload_bytes_total = 0 if not self._spec.sources.can_rename() and self._spec.options.rename: raise RuntimeError( 'cannot rename to specified destination with multiple sources') @@ -932,46 +968,43 @@ def _run(self): self._generate_destination_for_source(src) ] for action, lp, ase in self._vectorize_and_bind(src, dest): - print(lp.parent_path, lp.relative_path, lp.absolute_path, action, ase.container, ase.name) - print(lp.total_size, lp.size, lp.mode, lp.uid, lp.gid) - print(self._create_unique_id(lp, ase)) - print('replicas', len(ase.replica_targets) if ase.replica_targets is not None else 'none') if self._spec.options.delete_extraneous_destination: self._delete_exclude.add( - self._create_deletion_id( + blobxfer.operations.upload.Uploader.create_deletion_id( ase._client, ase.container, ase.name) ) if action == UploadAction.Skip: skipped_files += 1 skipped_size += ase.size if ase.size is not None else 0 continue + approx_total_bytes += lp.size # add to potential upload set - uid = self._create_unique_id(lp, ase) + uid = blobxfer.operations.upload.Uploader.create_unique_id( + lp, ase) with self._upload_lock: self._upload_set.add(uid) + self._upload_total += 1 if action == UploadAction.CheckMd5: self._pre_md5_skip_on_check(lp, ase) elif action == UploadAction.Upload: self._add_to_upload_queue(lp, ase, uid) - - nfiles += 1 - total_size += lp.size - - self._upload_total = nfiles - skipped_files - self._upload_bytes_total = total_size - skipped_size - upload_size_mib = self._upload_bytes_total / blobxfer.util.MEGABYTE # set remote files processed with self._md5_meta_lock: self._all_local_files_processed = True - logger.debug( - ('{0} local files processed, waiting for upload completion ' - 'of {1:.4f} MiB').format(nfiles, upload_size_mib)) - del nfiles - del total_size + with self._upload_lock: + self._upload_total -= skipped_files + self._upload_bytes_total -= skipped_size + upload_size_mib = approx_total_bytes / blobxfer.util.MEGABYTE + logger.debug( + ('{0} local/remote files processed, waiting for upload ' + 'completion of approx. 
{1:.4f} MiB').format( + self._upload_total, upload_size_mib)) del skipped_files del skipped_size + del upload_size_mib + del approx_total_bytes # wait for uploads to complete - self._wait_for_upload_threads(terminate=False) + self._wait_for_disk_threads(terminate=False) self._wait_for_transfer_threads(terminate=False) end_time = blobxfer.util.datetime_now() # update progress bar @@ -997,11 +1030,12 @@ def _run(self): # output throughput if self._upload_start_time is not None: ultime = (end_time - self._upload_start_time).total_seconds() - mibps = upload_size_mib / ultime + mibup = self._upload_bytes_total / blobxfer.util.MEGABYTE + mibps = mibup / ultime logger.info( - ('elapsed upload + verify time and throughput: {0:.3f} sec, ' - '{1:.4f} Mbps ({2:.3f} MiB/s)').format( - ultime, mibps * 8, mibps)) + ('elapsed upload + verify time and throughput of {0:.4f} ' + 'GiB: {1:.3f} sec, {2:.4f} Mbps ({3:.3f} MiB/s)').format( + mibup / 1024, ultime, mibps * 8, mibps)) end_time = blobxfer.util.datetime_now() logger.info('blobxfer end time: {0} (elapsed: {1:.3f} sec)'.format( end_time, (end_time - self._start_time).total_seconds())) @@ -1022,10 +1056,9 @@ def start(self): 'processes and threads (this may take a while)...') try: self._wait_for_transfer_threads(terminate=True) - self._wait_for_upload_threads(terminate=True) + self._wait_for_disk_threads(terminate=True) finally: - self._cleanup_temporary_files() - raise + raise finally: # shutdown processes if self._md5_offload is not None: diff --git a/cli/cli.py b/cli/cli.py index c3b9b2d..8ce121b 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -120,7 +120,21 @@ def callback(ctx, param, value): expose_value=False, type=int, default=0, - help='Concurrent crypto processes', + help='Concurrent crypto processes (download only)', + callback=callback)(f) + + +def _disk_threads_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['disk_threads'] = value + return value + return click.option( + '--disk-threads', + expose_value=False, + type=int, + default=0, + help='Concurrent disk threads', callback=callback)(f) @@ -225,6 +239,7 @@ def common_options(f): f = _progress_bar_option(f) f = _md5_processes_option(f) f = _log_file_option(f) + f = _disk_threads_option(f) f = _crypto_processes_option(f) return f diff --git a/cli/settings.py b/cli/settings.py index 5834c12..d198359 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -225,6 +225,7 @@ def merge_settings(config, cli_options): config['options']['progress_bar'] = cli_options['progress_bar'] config['options']['resume_file'] = cli_options['resume_file'] config['options']['timeout_sec'] = cli_options['timeout'] + config['options']['disk_threads'] = cli_options['disk_threads'] config['options']['transfer_threads'] = cli_options['transfer_threads'] config['options']['verbose'] = cli_options['verbose'] @@ -256,6 +257,7 @@ def create_general_options(config): return blobxfer.models.options.General( concurrency=blobxfer.models.options.Concurrency( crypto_processes=config['options']['crypto_processes'], + disk_threads=config['options']['disk_threads'], md5_processes=config['options']['md5_processes'], transfer_threads=config['options']['transfer_threads'], ), diff --git a/setup.py b/setup.py index 2725ade..a9031a7 100644 --- a/setup.py +++ b/setup.py @@ -39,14 +39,14 @@ ] install_requires = [ - 'azure-common==1.1.5', - 'azure-storage==0.34.0', + 'azure-common==1.1.6', + 'azure-storage==0.34.2', 'click==6.7', 'cryptography>=1.8.1', 'future==0.16.0', 
'python-dateutil==2.6.0', - 'requests==2.13.0', - 'ruamel.yaml==0.14.8', + 'requests==2.14.2', + 'ruamel.yaml==0.14.12', ] if sys.version_info < (3, 4): From a949367261151d74acb6c85238f393ed29ed80cc Mon Sep 17 00:00:00 2001 From: Fred Park Date: Fri, 26 May 2017 08:03:02 -0700 Subject: [PATCH 31/47] Split transfer/disk threads on download - Fix some naming/path issues - Fix race condition on container/entity init - Upload progress bar --- blobxfer/models/azure.py | 11 +- blobxfer/models/download.py | 26 +++- blobxfer/operations/azure/__init__.py | 3 + blobxfer/operations/crypto.py | 2 +- blobxfer/operations/download.py | 193 ++++++++++++++++++++------ blobxfer/operations/progress.py | 2 +- blobxfer/operations/upload.py | 120 +++++++++------- 7 files changed, 252 insertions(+), 105 deletions(-) diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index d44af48..1e3325e 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -31,6 +31,10 @@ next, oct, open, pow, round, super, filter, map, zip) # stdlib imports import enum +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib # non-stdlib imports from azure.storage.blob.models import _BlobTypes as BlobTypes # local imports @@ -219,7 +223,7 @@ def populate_from_blob(self, sa, blob, path): :param str path: full path to blob """ self._create_containers = sa.create_containers - self._name = path + self._name = str(pathlib.Path(path) / blob.name) self._snapshot = blob.snapshot self._lmt = blob.properties.last_modified self._size = blob.properties.content_length @@ -244,7 +248,10 @@ def populate_from_file(self, sa, file, path): :param str path: full path to file """ self._create_containers = sa.create_containers - self._name = path + if path is not None: + self._name = str(pathlib.Path(path) / file.name) + else: + self._name = file.name self._snapshot = None self._lmt = file.properties.last_modified self._size = file.properties.content_length diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index e1c9266..3c5f686 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -517,7 +517,10 @@ def write_unchecked_data(self, offsets, data): temp=False, ) with self._meta_lock: - self._unchecked_chunks[offsets.chunk_num] = unchecked + self._unchecked_chunks[offsets.chunk_num] = { + 'ucc': unchecked, + 'decrypted': True, + } def write_unchecked_hmac_data(self, offsets, data): # type: (Descriptor, Offsets, bytes) -> None @@ -537,9 +540,21 @@ def write_unchecked_hmac_data(self, offsets, data): temp=True, ) with self._meta_lock: - self._unchecked_chunks[offsets.chunk_num] = unchecked + self._unchecked_chunks[offsets.chunk_num] = { + 'ucc': unchecked, + 'decrypted': False, + } return str(unchecked.file_path) + def mark_unchecked_chunk_decrypted(self, chunk_num): + # type: (Descriptor, int) -> None + """Mark an unchecked chunk as decrypted + :param Descriptor self: this + :param int chunk_num: unchecked chunk number + """ + with self._meta_lock: + self._unchecked_chunks[chunk_num]['decrypted'] = True + def perform_chunked_integrity_check(self): # type: (Descriptor) -> None """Hash data against stored hasher safely @@ -552,8 +567,9 @@ def perform_chunked_integrity_check(self): with self._meta_lock: chunk_num = self._next_integrity_chunk # check if the next chunk is ready - if chunk_num in self._unchecked_chunks: - ucc = self._unchecked_chunks.pop(chunk_num) + if (chunk_num in self._unchecked_chunks and + self._unchecked_chunks[chunk_num]['decrypted']): + ucc = 
self._unchecked_chunks.pop(chunk_num)['ucc'] else: break # hash data and set next integrity chunk @@ -653,7 +669,7 @@ def finalize_file(self): # delete temp download file self.local_path.unlink() return - logger.debug(msg) + logger.info(msg) # TODO set file uid/gid and mode diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index a237532..9b19423 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -292,6 +292,9 @@ def _populate_from_list_files(self, creds, options, general_options): else: ed = None ase = blobxfer.models.azure.StorageEntity(cont, ed) + if dir is not None: + dir, _ = blobxfer.operations.azure.file.parse_file_path( + dir) ase.populate_from_file(sa, file, dir) yield ase diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index 6f1e011..19c56a4 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -273,7 +273,7 @@ def _worker_process(self): fd.seek(offsets.fd_start, 0) fd.write(data) self._done_cv.acquire() - self._done_queue.put(final_path) + self._done_queue.put((final_path, offsets)) # notify and release condition var self._done_cv.notify() self._done_cv.release() diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index fa379fc..47c237e 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -41,6 +41,7 @@ except ImportError: # noqa import Queue as queue import threading +import time # non-stdlib imports # local imports import blobxfer.models.crypto @@ -79,11 +80,15 @@ def __init__(self, general_options, creds, spec): self._md5_meta_lock = threading.Lock() self._md5_map = {} self._md5_offload = None - self._download_lock = threading.Lock() - self._download_queue = queue.Queue() - self._download_set = set() + self._transfer_lock = threading.Lock() + self._transfer_queue = queue.Queue() + self._transfer_set = set() + self._transfer_threads = [] + self._disk_operation_lock = threading.Lock() + self._disk_queue = queue.Queue() + self._disk_set = set() + self._disk_threads = [] self._download_start_time = None - self._download_threads = [] self._download_total = None self._download_sofar = 0 self._download_bytes_total = None @@ -106,11 +111,13 @@ def termination_check(self): :rtype: bool :return: if terminated """ - with self._download_lock: - return (self._download_terminate or - len(self._exceptions) > 0 or - (self._all_remote_files_processed and - len(self._download_set) == 0)) + with self._transfer_lock: + with self._disk_operation_lock: + return (self._download_terminate or + len(self._exceptions) > 0 or + (self._all_remote_files_processed and + len(self._transfer_set) == 0 and + len(self._disk_set) == 0)) @property def termination_check_md5(self): @@ -121,11 +128,11 @@ def termination_check_md5(self): :return: if terminated from MD5 context """ with self._md5_meta_lock: - with self._download_lock: + with self._transfer_lock: return (self._download_terminate or (self._all_remote_files_processed and len(self._md5_map) == 0 and - len(self._download_set) == 0)) + len(self._transfer_set) == 0)) @staticmethod def ensure_local_destination(creds, spec): @@ -163,6 +170,20 @@ def ensure_local_destination(creds, spec): # ensure destination path spec.destination.ensure_path_exists() + @staticmethod + def create_unique_disk_operation_id(dd, offsets): + # type: (blobxfer.models.download.Descriptor, + # blobxfer.models.download.Offsets) -> None + """Create a unique disk operation id + :param 
blobxfer.models.download.Descriptor dd: download descriptor + :param blobxfer.models.download.Offsets offsets: download offsets + """ + # TODO add local view offset or slice num with stripe support + return ';'.join( + (str(dd.local_path), dd.entity._client.primary_endpoint, + dd.entity.path, str(offsets.range_start)) + ) + def _update_progress_bar(self): # type: (Downloader) -> None """Update progress bar @@ -260,8 +281,8 @@ def _post_md5_skip_on_check(self, filename, md5_match): rfile = self._md5_map.pop(filename) lpath = pathlib.Path(filename) if md5_match: - with self._download_lock: - self._download_set.remove(lpath) + with self._transfer_lock: + self._transfer_set.remove(lpath) self._download_total -= 1 self._download_bytes_total -= lpath.stat().st_size else: @@ -303,7 +324,7 @@ def _check_for_crypto_done(self): result = self._crypto_offload.pop_done_queue() if result is None: # use cv timeout due to possible non-wake while running - cv.wait(1) + cv.wait(0.1) # check for terminating conditions if self.termination_check: break @@ -312,9 +333,10 @@ def _check_for_crypto_done(self): cv.release() if result is not None: try: - with self._download_lock: - dd = self._dd_map[result] - dd.perform_chunked_integrity_check() + final_path, offsets = result + with self._transfer_lock: + dd = self._dd_map[final_path] + self._finalize_chunk(dd, offsets) except KeyError: # this can happen if all of the last integrity # chunks are processed at once @@ -332,28 +354,51 @@ def _add_to_download_queue(self, lpath, rfile): dd = blobxfer.models.download.Descriptor( lpath, rfile, self._spec.options, self._resume) if dd.entity.is_encrypted: - with self._download_lock: + with self._transfer_lock: self._dd_map[str(dd.final_path)] = dd # add download descriptor to queue - self._download_queue.put(dd) + self._transfer_queue.put(dd) if self._download_start_time is None: - with self._download_lock: + with self._transfer_lock: if self._download_start_time is None: self._download_start_time = blobxfer.util.datetime_now() - def _initialize_download_threads(self): + def _initialize_disk_threads(self): # type: (Downloader) -> None """Initialize download threads :param Downloader self: this """ + logger.debug('spawning {} disk threads'.format( + self._general_options.concurrency.disk_threads)) + for _ in range(self._general_options.concurrency.disk_threads): + thr = threading.Thread(target=self._worker_thread_disk) + self._disk_threads.append(thr) + thr.start() + + def _initialize_transfer_threads(self): + # type: (Downloader) -> None + """Initialize transfer threads + :param Downloader self: this + """ logger.debug('spawning {} transfer threads'.format( self._general_options.concurrency.transfer_threads)) for _ in range(self._general_options.concurrency.transfer_threads): - thr = threading.Thread(target=self._worker_thread_download) - self._download_threads.append(thr) + thr = threading.Thread(target=self._worker_thread_transfer) + self._transfer_threads.append(thr) thr.start() - def _wait_for_download_threads(self, terminate): + def _wait_for_disk_threads(self, terminate): + # type: (Downloader, bool) -> None + """Wait for disk threads + :param Downloader self: this + :param bool terminate: terminate threads + """ + if terminate: + self._download_terminate = terminate + for thr in self._disk_threads: + thr.join() + + def _wait_for_transfer_threads(self, terminate): # type: (Downloader, bool) -> None """Wait for download threads :param Downloader self: this @@ -361,30 +406,53 @@ def _wait_for_download_threads(self, 
terminate): """ if terminate: self._download_terminate = terminate - for thr in self._download_threads: + for thr in self._transfer_threads: thr.join() - def _worker_thread_download(self): + def _worker_thread_transfer(self): # type: (Downloader) -> None """Worker thread download :param Downloader self: this """ while not self.termination_check: try: - dd = self._download_queue.get(False, 0.25) + if (len(self._disk_set) > + self._general_options.concurrency. + disk_threads * 4): + time.sleep(0.2) + continue + else: + dd = self._transfer_queue.get(block=False, timeout=0.1) except queue.Empty: continue try: self._process_download_descriptor(dd) except Exception as e: - with self._download_lock: + with self._transfer_lock: + self._exceptions.append(e) + + def _worker_thread_disk(self): + # type: (Downloader) -> None + """Worker thread for disk + :param Downloader self: this + """ + while not self.termination_check: + try: + dd, offsets, data = self._disk_queue.get( + block=False, timeout=0.1) + except queue.Empty: + continue + try: + self._process_data(dd, offsets, data) + except Exception as e: + with self._transfer_lock: self._exceptions.append(e) def _process_download_descriptor(self, dd): # type: (Downloader, blobxfer.models.download.Descriptor) -> None """Process download descriptor :param Downloader self: this - :param blobxfer.models.download.Descriptor: download descriptor + :param blobxfer.models.download.Descriptor dd: download descriptor """ # update progress bar self._update_progress_bar() @@ -392,7 +460,7 @@ def _process_download_descriptor(self, dd): offsets, resume_bytes = dd.next_offsets() # add resume bytes to counter if resume_bytes is not None: - with self._download_lock: + with self._disk_operation_lock: self._download_bytes_sofar += resume_bytes logger.debug('adding {} sofar {} from {}'.format( resume_bytes, self._download_bytes_sofar, dd._ase.name)) @@ -402,14 +470,14 @@ def _process_download_descriptor(self, dd): # finalize file dd.finalize_file() # accounting - with self._download_lock: + with self._transfer_lock: if dd.entity.is_encrypted: self._dd_map.pop(str(dd.final_path)) - self._download_set.remove(dd.final_path) + self._transfer_set.remove(dd.final_path) self._download_sofar += 1 return # re-enqueue for other threads to download - self._download_queue.put(dd) + self._transfer_queue.put(dd) if offsets is None: return # issue get range @@ -419,9 +487,22 @@ def _process_download_descriptor(self, dd): else: data = blobxfer.operations.azure.blob.get_blob_range( dd.entity, offsets, self._general_options.timeout_sec) - # accounting - with self._download_lock: - self._download_bytes_sofar += offsets.num_bytes + # enqueue data for processing + with self._disk_operation_lock: + self._disk_set.add( + blobxfer.operations.download.Downloader. 
+ create_unique_disk_operation_id(dd, offsets)) + self._disk_queue.put((dd, offsets, data)) + + def _process_data(self, dd, offsets, data): + # type: (Downloader, blobxfer.models.download.Descriptor, + # blobxfer.models.download.Offsets, bytes) -> None + """Process downloaded data for disk + :param Downloader self: this + :param blobxfer.models.download.Descriptor dd: download descriptor + :param blobxfer.models.download.Offsets offsets: offsets + :param bytes data: data to process + """ # decrypt if necessary if dd.entity.is_encrypted: # slice data to proper bounds and get iv for chunk @@ -457,9 +538,28 @@ def _process_download_descriptor(self, dd): else: # write data to disk dd.write_unchecked_data(offsets, data) + # finalize chunk + self._finalize_chunk(dd, offsets) + + def _finalize_chunk(self, dd, offsets): + # type: (Downloader, blobxfer.models.download.Descriptor, + # blobxfer.models.download.Offsets) -> None + """Finalize written chunk + :param Downloader self: this + :param blobxfer.models.download.Descriptor dd: download descriptor + :param blobxfer.models.download.Offsets offsets: offsets + """ + if dd.entity.is_encrypted: + dd.mark_unchecked_chunk_decrypted(offsets.chunk_num) # integrity check data and write to disk (this is called # regardless of md5/hmac enablement for resume purposes) dd.perform_chunked_integrity_check() + # remove from disk set and add bytes to counter + with self._disk_operation_lock: + self._disk_set.remove( + blobxfer.operations.download.Downloader. + create_unique_disk_operation_id(dd, offsets)) + self._download_bytes_sofar += offsets.num_bytes def _cleanup_temporary_files(self): # type: (Downloader) -> None @@ -532,7 +632,8 @@ def _run(self): self._crypto_offload.initialize_check_thread( self._check_for_crypto_done) # initialize download threads - self._initialize_download_threads() + self._initialize_transfer_threads() + self._initialize_disk_threads() # initialize local counters nfiles = 0 total_size = 0 @@ -563,8 +664,8 @@ def _run(self): skipped_size += rfile.size continue # add potential download to set - with self._download_lock: - self._download_set.add(lpath) + with self._transfer_lock: + self._transfer_set.add(lpath) # either MD5 check or download now if action == DownloadAction.CheckMd5: self._pre_md5_skip_on_check(lpath, rfile) @@ -584,7 +685,8 @@ def _run(self): del skipped_files del skipped_size # wait for downloads to complete - self._wait_for_download_threads(terminate=False) + self._wait_for_transfer_threads(terminate=False) + self._wait_for_disk_threads(terminate=False) end_time = blobxfer.util.datetime_now() # update progress bar self._update_progress_bar() @@ -609,10 +711,12 @@ def _run(self): # output throughput if self._download_start_time is not None: dltime = (end_time - self._download_start_time).total_seconds() + dlmibspeed = download_size_mib / dltime logger.info( - ('elapsed download + verify time and throughput: {0:.3f} sec, ' - '{1:.4f} Mbps').format( - dltime, download_size_mib * 8 / dltime)) + ('elapsed download + verify time and throughput of {0:.4f} ' + 'GiB: {1:.3f} sec, {2:.4f} Mbps ({3:.3f} MiB/sec)').format( + download_size_mib / 1024, dltime, dlmibspeed * 8, + dlmibspeed)) end_time = blobxfer.util.datetime_now() logger.info('blobxfer end time: {0} (elapsed: {1:.3f} sec)'.format( end_time, (end_time - self._start_time).total_seconds())) @@ -632,7 +736,8 @@ def start(self): 'KeyboardInterrupt detected, force terminating ' 'processes and threads (this may take a while)...') try: - 
self._wait_for_download_threads(terminate=True) + self._wait_for_transfer_threads(terminate=True) + self._wait_for_disk_threads(terminate=True) finally: self._cleanup_temporary_files() raise diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index b2b6c26..a0689ed 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -66,7 +66,7 @@ def update_progress_bar( if diff <= 0: # arbitrarily give a small delta diff = 1e-9 - if total_bytes is None: + if total_bytes is None or total_bytes == 0 or bytes_sofar > total_bytes: done = 0 else: done = float(bytes_sofar) / total_bytes diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 7f9230a..d33aad6 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -76,7 +76,7 @@ def __init__(self, general_options, creds, spec): :param blobxfer.operations.azure.StorageCredentials creds: creds :param blobxfer.models.uplaod.Specification spec: upload spec """ - self._all_local_files_processed = False + self._all_files_processed = False self._crypto_offload = None self._md5_meta_lock = threading.Lock() self._md5_map = {} @@ -119,7 +119,7 @@ def termination_check(self): with self._transfer_lock: return (self._upload_terminate or len(self._exceptions) > 0 or - (self._all_local_files_processed and + (self._all_files_processed and len(self._upload_set) == 0 and len(self._transfer_set) == 0)) @@ -134,7 +134,7 @@ def termination_check_md5(self): with self._md5_meta_lock: with self._upload_lock: return (self._upload_terminate or - (self._all_local_files_processed and + (self._all_files_processed and len(self._md5_map) == 0 and len(self._upload_set) == 0)) @@ -196,6 +196,8 @@ def _update_progress_bar(self): """Update progress bar :param Uploader self: this """ + if not self._all_files_processed: + return blobxfer.operations.progress.update_progress_bar( self._general_options, 'upload', @@ -297,7 +299,7 @@ def _initialize_disk_threads(self): """ logger.debug('spawning {} disk threads'.format( self._general_options.concurrency.transfer_threads)) - for _ in range(self._general_options.concurrency.transfer_threads): + for _ in range(self._general_options.concurrency.disk_threads): thr = threading.Thread(target=self._worker_thread_upload) self._disk_threads.append(thr) thr.start() @@ -368,13 +370,15 @@ def _process_transfer(self, ud, ase, offsets, data): self._put_data(ud, ase, offsets, data) # accounting with self._transfer_lock: + if offsets.chunk_num == 0: + self._upload_bytes_total += ase.size + self._upload_bytes_sofar += offsets.num_bytes self._transfer_set.remove( blobxfer.operations.upload.Uploader.create_unique_transfer_id( ud.local_path, ase, offsets)) - self._upload_bytes_sofar += offsets.num_bytes - if offsets.chunk_num == 0: - self._upload_bytes_total += ase.size ud.complete_offset_upload() + # update progress bar + self._update_progress_bar() def _put_data(self, ud, ase, offsets, data): # type: (Uploader, blobxfer.models.upload.Descriptor, @@ -391,61 +395,34 @@ def _put_data(self, ud, ase, offsets, data): if ase.mode == blobxfer.models.azure.StorageModes.Append: raise NotImplementedError() elif ase.mode == blobxfer.models.azure.StorageModes.Block: - if offsets.chunk_num == 0: - # create container if necessary - blobxfer.operations.azure.blob.create_container( - ase, self._containers_created, + # handle one-shot uploads + if ud.is_one_shot_block_blob: + metadata = ud.generate_metadata() + if not ud.entity.is_encrypted and ud.must_compute_md5: + digest = 
blobxfer.util.base64_encode_as_string( + ud.md5.digest()) + else: + digest = None + blobxfer.operations.azure.blob.block.create_blob( + ase, data, digest, metadata, timeout=self._general_options.timeout_sec) - # handle one-shot uploads - if ud.is_one_shot_block_blob: - metadata = ud.generate_metadata() - if not ud.entity.is_encrypted and ud.must_compute_md5: - digest = blobxfer.util.base64_encode_as_string( - ud.md5.digest()) - else: - digest = None - blobxfer.operations.azure.blob.block.create_blob( - ase, data, digest, metadata, - timeout=self._general_options.timeout_sec) - return + return # upload block blobxfer.operations.azure.blob.block.put_block( ase, offsets, data, timeout=self._general_options.timeout_sec) elif ase.mode == blobxfer.models.azure.StorageModes.File: - if offsets.chunk_num == 0: - # create container if necessary - blobxfer.operations.azure.file.create_share( - ase, self._containers_created, - timeout=self._general_options.timeout_sec) - # create parent directories - with self._fileshare_dir_lock: - blobxfer.operations.azure.file.\ - create_all_parent_directories( - ase, self._dirs_created, - timeout=self._general_options.timeout_sec) - # create remote file - blobxfer.operations.azure.file.create_file( - ase, timeout=self._general_options.timeout_sec) # upload range if data is not None: blobxfer.operations.azure.file.put_file_range( ase, offsets, data, timeout=self._general_options.timeout_sec) elif ase.mode == blobxfer.models.azure.StorageModes.Page: - # compute aligned size - if offsets.chunk_num == 0: - # create container if necessary - blobxfer.operations.azure.blob.create_container( - ase, self._containers_created, - timeout=self._general_options.timeout_sec) - # create remote blob - blobxfer.operations.azure.blob.page.create_blob( - ase, timeout=self._general_options.timeout_sec) if data is None: return - # align page + # compute aligned size aligned = blobxfer.util.page_align_content_length( offsets.num_bytes) + # align page if aligned != offsets.num_bytes: data = data.ljust(aligned, b'\0') if blobxfer.operations.md5.check_data_is_empty(data): @@ -477,14 +454,48 @@ def _worker_thread_upload(self): with self._upload_lock: self._exceptions.append(e) + def _prepare_upload(self, ase, offsets): + # type: (Uploader, blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets) -> None + """Prepare upload + :param Uploader self: this + :param blobxfer.models.azure.StorageEntity ase: Storage entity + :param blobxfer.models.upload.Offsets offsets: offsets + """ + if ase.mode == blobxfer.models.azure.StorageModes.Block: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.File: + # create share directory structure + with self._fileshare_dir_lock: + # create container if necessary + blobxfer.operations.azure.file.create_share( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # create parent directories + blobxfer.operations.azure.file.create_all_parent_directories( + ase, self._dirs_created, + timeout=self._general_options.timeout_sec) + # create remote file + blobxfer.operations.azure.file.create_file( + ase, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.Page: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + 
timeout=self._general_options.timeout_sec) + # create remote blob + blobxfer.operations.azure.blob.page.create_blob( + ase, timeout=self._general_options.timeout_sec) + def _process_upload_descriptor(self, ud): # type: (Uploader, blobxfer.models.upload.Descriptor) -> None """Process upload descriptor :param Uploader self: this :param blobxfer.models.upload.Descriptor: upload descriptor """ - # update progress bar - self._update_progress_bar() # get download offsets offsets, resume_bytes = ud.next_offsets() # add resume bytes to counter @@ -509,6 +520,9 @@ def _process_upload_descriptor(self, ud): if offsets is None: self._upload_queue.put(ud) return + # prepare upload + if offsets.chunk_num == 0: + self._prepare_upload(ud.entity, offsets) # encrypt if necessary if ud.entity.is_encrypted and ud.entity.size > 0: # send iv through hmac if first chunk @@ -769,9 +783,11 @@ def _check_for_existing_remote(self, sa, cont, name): ase = blobxfer.models.azure.StorageEntity(cont, ed) if (self._spec.options.mode == blobxfer.models.azure.StorageModes.File): - ase.populate_from_file(sa, fp, name) + dir, _ = blobxfer.operations.azure.file.parse_file_path(name) + ase.populate_from_file(sa, fp, dir) else: - ase.populate_from_blob(sa, fp, name) + # blob.name contains full path, no need to specify dir + ase.populate_from_blob(sa, fp, '') else: ase = None return ase @@ -990,7 +1006,7 @@ def _run(self): self._add_to_upload_queue(lp, ase, uid) # set remote files processed with self._md5_meta_lock: - self._all_local_files_processed = True + self._all_files_processed = True with self._upload_lock: self._upload_total -= skipped_files self._upload_bytes_total -= skipped_size From 3c9e2041732ea2fcea3180af4953c2945982a1a1 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Fri, 26 May 2017 09:50:31 -0700 Subject: [PATCH 32/47] File attribute restore support - Revert blob naming changes in populate_from - Expand path for pem file reads --- blobxfer/models/azure.py | 28 ++++++-- blobxfer/models/download.py | 19 ++++- blobxfer/models/metadata.py | 47 +++++++++++++ blobxfer/operations/azure/__init__.py | 2 +- blobxfer/operations/crypto.py | 6 +- blobxfer/operations/progress.py | 99 ++++++++++++++------------- blobxfer/operations/upload.py | 3 +- setup.py | 4 +- 8 files changed, 147 insertions(+), 61 deletions(-) diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index 1e3325e..1e10e03 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -38,6 +38,7 @@ # non-stdlib imports from azure.storage.blob.models import _BlobTypes as BlobTypes # local imports +import blobxfer.models.metadata # enums @@ -51,7 +52,7 @@ class StorageModes(enum.Enum): class StorageEntity(object): """Azure Storage Entity""" - def __init__(self, container, ed=None): + def __init__(self, container, ed=None, fileattr=None): # type: (StorageEntity, str # blobxfer.models.crypto.EncryptionMetadata) -> None """Ctor for StorageEntity @@ -71,6 +72,7 @@ def __init__(self, container, ed=None): self._md5 = None self._encryption = ed self._vio = None + self._fileattr = None self.replica_targets = None @property @@ -213,17 +215,30 @@ def encryption_metadata(self, value): """ self._encryption = value - def populate_from_blob(self, sa, blob, path): + @property + def file_attributes(self): + # type: (StorageEntity) -> object + """Return file attributes collection + :param StorageEntity self: this + :rtype: blobxfer.models.metadata.PosixFileAttr or + blobxfer.models.metadata.WindowsFileAttr or None + :return: file attributes + """ + return 
self._fileattr + + def populate_from_blob(self, sa, blob): # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, - # azure.storage.blob.models.Blob, str) -> None + # azure.storage.blob.models.Blob) -> None """Populate properties from Blob :param StorageEntity self: this :param blobxfer.operations.azure.StorageAccount sa: storage account :param azure.storage.blob.models.Blob blob: blob to populate from - :param str path: full path to blob """ + # set file attributes from metadata + self._fileattr = blobxfer.models.metadata.fileattr_from_metadata( + blob.metadata) self._create_containers = sa.create_containers - self._name = str(pathlib.Path(path) / blob.name) + self._name = blob.name self._snapshot = blob.snapshot self._lmt = blob.properties.last_modified self._size = blob.properties.content_length @@ -247,6 +262,9 @@ def populate_from_file(self, sa, file, path): :param azure.storage.file.models.File file: file to populate from :param str path: full path to file """ + # set file attributes from metadata + self._fileattr = blobxfer.models.metadata.fileattr_from_metadata( + file.metadata) self._create_containers = sa.create_containers if path is not None: self._name = str(pathlib.Path(path) / file.name) diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index 3c5f686..ef92068 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -162,6 +162,9 @@ def __init__( if not self.options.check_file_md5 and self.skip_on.md5_match: raise ValueError( 'Cannot specify skip on MD5 match without file MD5 enabled') + if (self.options.restore_file_attributes and + not blobxfer.util.on_windows() and os.getuid() != 0): + logger.warning('Cannot set file uid/gid without root privileges') def add_azure_source_path(self, source): # type: (Specification, blobxfer.operations.azure.SourcePath) -> None @@ -670,9 +673,19 @@ def finalize_file(self): self.local_path.unlink() return logger.info(msg) - - # TODO set file uid/gid and mode - + # set file uid/gid and mode + if self._ase.file_attributes is not None: + if blobxfer.util.on_windows(): + # TODO not implemented yet + pass + else: + self.local_path.chmod(int(self._ase.file_attributes.mode, 8)) + if os.getuid() == 0: + os.chown( + str(self.local_path), + self._ase.file_attributes.uid, + self._ase.file_attributes.gid + ) # move temp download file to final path blobxfer.util.replace_file(self.local_path, self.final_path) # update resume file diff --git a/blobxfer/models/metadata.py b/blobxfer/models/metadata.py index 5ebeea3..139ed63 100644 --- a/blobxfer/models/metadata.py +++ b/blobxfer/models/metadata.py @@ -30,6 +30,8 @@ bytes, dict, int, list, object, range, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # stdlib imports +import collections +import json import logging # non-stdlib imports # local imports @@ -55,6 +57,18 @@ _JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SLICES = 'TotalSlices' _JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID = 'SliceId' _JSON_KEY_VECTORED_IO_STRIPE_NEXT = 'Next' +# named tuples +PosixFileAttr = collections.namedtuple( + 'PosixFileAttr', [ + 'mode', + 'uid', + 'gid', + ] +) +WindowsFileAttr = collections.namedtuple( + 'WindowsFileAttr', [ + ] +) def generate_fileattr_metadata(local_path, metadata): @@ -82,6 +96,39 @@ def generate_fileattr_metadata(local_path, metadata): return blobxfer.util.merge_dict(metadata, md) +def fileattr_from_metadata(md): + # type: (dict) -> bool + """Convert fileattr metadata in json metadata + :param dict md: metadata dictionary + :rtype: 
PosixFileAttr or WindowsFileAttr or None + :return: fileattr metadata + """ + try: + mdattr = json.loads( + md[JSON_KEY_BLOBXFER_METADATA])[_JSON_KEY_FILE_ATTRIBUTES] + except (KeyError, TypeError): + return None + else: + if blobxfer.util.on_windows(): + logger.warning( + 'file attributes store/restore on Windows is not supported ' + 'yet') + fileattr = None + else: + try: + fileattr = PosixFileAttr( + mode=mdattr[_JSON_KEY_FILE_ATTRIBUTES_POSIX][ + _JSON_KEY_FILE_ATTRIBUTES_MODE], + uid=mdattr[_JSON_KEY_FILE_ATTRIBUTES_POSIX][ + _JSON_KEY_FILE_ATTRIBUTES_UID], + gid=mdattr[_JSON_KEY_FILE_ATTRIBUTES_POSIX][ + _JSON_KEY_FILE_ATTRIBUTES_GID], + ) + except KeyError: + fileattr = None + return fileattr + + def restore_fileattr(path, metadata): # type: (pathlib.Path, dict) -> None """Restore file attributes from metadata diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index 9b19423..61dfe53 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -327,7 +327,7 @@ def _populate_from_list_blobs(self, creds, options, general_options): else: ed = None ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_blob(sa, blob, dir) + ase.populate_from_blob(sa, blob) yield ase diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index 19c56a4..ba6982c 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -75,7 +75,8 @@ def load_rsa_private_key_file(rsakeyfile, passphrase): :rtype: cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey :return: RSAPrivateKey """ - with open(rsakeyfile, 'rb') as keyfile: + keypath = os.path.expandvars(os.path.expanduser(rsakeyfile)) + with open(keypath, 'rb') as keyfile: return cryptography.hazmat.primitives.serialization.\ load_pem_private_key( keyfile.read(), @@ -92,7 +93,8 @@ def load_rsa_public_key_file(rsakeyfile): :rtype: cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey :return: RSAPublicKey """ - with open(rsakeyfile, 'rb') as keyfile: + keypath = os.path.expandvars(os.path.expanduser(rsakeyfile)) + with open(keypath, 'rb') as keyfile: return cryptography.hazmat.primitives.serialization.\ load_pem_public_key( keyfile.read(), diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index a0689ed..b9d93a7 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -36,6 +36,7 @@ import sys # non-stdlib imports import azure.storage +import cryptography import requests # local imports import blobxfer.util @@ -95,86 +96,92 @@ def output_parameters(general_options, spec): :param blobxfer.models.options.General general_options: general options :param object spec: upload or download spec """ + sep = '============================================' log = [] - log.append('===========================') - log.append(' azure blobxfer parameters') - log.append('===========================') - log.append(' blobxfer version: {}'.format( + log.append(sep) + log.append(' Azure blobxfer parameters') + log.append(sep) + log.append(' blobxfer version: {}'.format( blobxfer.version.__version__)) - log.append(' platform: {}'.format(platform.platform())) - log.append(' python: {} {} az.stor={} req={}'.format( - platform.python_implementation(), - platform.python_version(), - azure.storage._constants.__version__, - requests.__version__)) + log.append(' platform: {}'.format(platform.platform())) + log.append( + ' components: {}={} az.stor={} crypt={} req={}'.format( + 
platform.python_implementation(), + platform.python_version(), + azure.storage._constants.__version__, + cryptography.__version__, + requests.__version__,)) # specific preamble if isinstance(spec, blobxfer.models.download.Specification): - log.append(' transfer direction: {}'.format('Azure -> local')) + log.append(' transfer direction: {}'.format('Azure -> local')) log.append( - ' workers: disk={} xfer={} md5={} crypto={}'.format( - general_options.concurrency.disk_threads, - general_options.concurrency.transfer_threads, - general_options.concurrency.md5_processes - if spec.options.check_file_md5 else 0, - general_options.concurrency.crypto_processes)) + (' workers: disk={} xfer={} md5={} ' + 'crypto={}').format( + general_options.concurrency.disk_threads, + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.options.check_file_md5 else 0, + general_options.concurrency.crypto_processes)) elif isinstance(spec, blobxfer.models.upload.Specification): - log.append(' transfer direction: {}'.format('local -> Azure')) + log.append(' transfer direction: {}'.format('local -> Azure')) log.append( - ' workers: disk={} xfer={} md5={} crypto={}'.format( - general_options.concurrency.disk_threads, - general_options.concurrency.transfer_threads, - general_options.concurrency.md5_processes - if spec.skip_on.md5_match or - spec.options.store_file_properties.md5 else 0, - 0)) + (' workers: disk={} xfer={} md5={} ' + 'crypto={}').format( + general_options.concurrency.disk_threads, + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.skip_on.md5_match or + spec.options.store_file_properties.md5 else 0, + 0)) # TODO handle synccopy spec # common block - log.append(' resume file: {}'.format( + log.append(' resume file: {}'.format( general_options.resume_file)) - log.append(' timeout: {}'.format( + log.append(' timeout: {}'.format( general_options.timeout_sec)) - log.append(' mode: {}'.format( + log.append(' mode: {}'.format( spec.options.mode)) - log.append(' skip on: fs_match={} lmt_ge={} md5={}'.format( - spec.skip_on.filesize_match, - spec.skip_on.lmt_ge, - spec.skip_on.md5_match)) - log.append(' chunk size: {} bytes'.format( + log.append( + ' skip on: fs_match={} lmt_ge={} md5={}'.format( + spec.skip_on.filesize_match, + spec.skip_on.lmt_ge, + spec.skip_on.md5_match)) + log.append(' chunk size: {} bytes'.format( spec.options.chunk_size_bytes)) - log.append(' delete extraneous: {}'.format( + log.append(' delete extraneous: {}'.format( spec.options.delete_extraneous_destination)) - log.append(' overwrite: {}'.format( + log.append(' overwrite: {}'.format( spec.options.overwrite)) - log.append(' recursive: {}'.format( + log.append(' recursive: {}'.format( spec.options.recursive)) # TODO only output rename single if not synccopy - log.append(' rename single: {}'.format( + log.append(' rename single: {}'.format( spec.options.rename)) # specific epilog if isinstance(spec, blobxfer.models.download.Specification): - log.append(' compute file md5: {}'.format( + log.append(' compute file md5: {}'.format( spec.options.check_file_md5)) - log.append(' file attributes: {}'.format( + log.append(' restore file attributes: {}'.format( spec.options.restore_file_attributes)) - log.append(' rsa private key: {}'.format( + log.append(' rsa private key: {}'.format( 'Loaded' if spec.options.rsa_private_key else 'None')) - log.append(' local destination: {}'.format( + log.append(' local destination: {}'.format( spec.destination.path)) 
elif isinstance(spec, blobxfer.models.upload.Specification): - log.append(' one shot bytes: {}'.format( + log.append(' one shot bytes: {}'.format( spec.options.one_shot_bytes)) - log.append(' store properties: attr={} md5={}'.format( + log.append(' store properties: attr={} md5={}'.format( spec.options.store_file_properties.attributes, spec.options.store_file_properties.md5)) - log.append(' rsa public key: {}'.format( + log.append(' rsa public key: {}'.format( 'Loaded' if spec.options.rsa_public_key else 'None')) - log.append(' local source paths: {}'.format( + log.append(' local source paths: {}'.format( ' '.join([str(src) for src in spec.sources.paths]))) - log.append('===========================') + log.append(sep) log = os.linesep.join(log) if blobxfer.util.is_not_empty(general_options.log_file): print(log) diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index d33aad6..7fc094f 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -786,8 +786,7 @@ def _check_for_existing_remote(self, sa, cont, name): dir, _ = blobxfer.operations.azure.file.parse_file_path(name) ase.populate_from_file(sa, fp, dir) else: - # blob.name contains full path, no need to specify dir - ase.populate_from_blob(sa, fp, '') + ase.populate_from_blob(sa, fp) else: ase = None return ase diff --git a/setup.py b/setup.py index a9031a7..c61be64 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) -if not version: +if not version or len(version) == 0: raise RuntimeError('Cannot find version') packages = [ @@ -42,7 +42,7 @@ 'azure-common==1.1.6', 'azure-storage==0.34.2', 'click==6.7', - 'cryptography>=1.8.1', + 'cryptography>=1.8.2', 'future==0.16.0', 'python-dateutil==2.6.0', 'requests==2.14.2', From f9ba1bf47d3617b07634115c3c716f246bfc2d49 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Sun, 28 May 2017 17:03:30 -0700 Subject: [PATCH 33/47] Add download support for vectored stripes --- blobxfer/models/azure.py | 20 ++- blobxfer/models/download.py | 190 +++++++++++++++++--------- blobxfer/models/metadata.py | 94 ++++++++++++- blobxfer/models/resume.py | 27 ++-- blobxfer/operations/azure/__init__.py | 127 ++++++++++++++--- blobxfer/operations/crypto.py | 18 +-- blobxfer/operations/download.py | 66 +++++++-- blobxfer/operations/resume.py | 44 ++++-- 8 files changed, 435 insertions(+), 151 deletions(-) diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index 1e10e03..abfa10e 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -226,7 +226,17 @@ def file_attributes(self): """ return self._fileattr - def populate_from_blob(self, sa, blob): + @property + def vectored_io(self): + # type: (StorageEntity) -> object + """Return vectored io metadata, currently stripe only + :param StorageEntity self: this + :rtype: blobxfer.models.metadata.VectoredStripe or None + :return: vectored io metadata + """ + return self._vio + + def populate_from_blob(self, sa, blob, vio=None): # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, # azure.storage.blob.models.Blob) -> None """Populate properties from Blob @@ -234,9 +244,10 @@ def populate_from_blob(self, sa, blob): :param blobxfer.operations.azure.StorageAccount sa: storage account :param azure.storage.blob.models.Blob blob: blob to populate from """ - # set file attributes from metadata + # set props from metadata self._fileattr = blobxfer.models.metadata.fileattr_from_metadata( blob.metadata) + self._vio = vio 
self._create_containers = sa.create_containers self._name = blob.name self._snapshot = blob.snapshot @@ -253,7 +264,7 @@ def populate_from_blob(self, sa, blob): self._mode = StorageModes.Page self._client = sa.page_blob_client - def populate_from_file(self, sa, file, path): + def populate_from_file(self, sa, file, path, vio=None): # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, # azure.storage.file.models.File, str) -> None """Populate properties from File @@ -262,9 +273,10 @@ def populate_from_file(self, sa, file, path): :param azure.storage.file.models.File file: file to populate from :param str path: full path to file """ - # set file attributes from metadata + # set props from metadata self._fileattr = blobxfer.models.metadata.fileattr_from_metadata( file.metadata) + self._vio = vio self._create_containers = sa.create_containers if path is not None: self._name = str(pathlib.Path(path) / file.name) diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index ef92068..e2adcbc 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -69,6 +69,12 @@ 'temp', ] ) +LocalPathView = collections.namedtuple( + 'LocalPathView', [ + 'fd_end', + 'fd_start', + ] +) class LocalDestinationPath(object): @@ -206,11 +212,7 @@ def __init__(self, lpath, ase, options, resume_mgr): self._ase = ase # set paths self.final_path = lpath - # create path holding the temporary file to download to - _tmp = list(lpath.parts[:-1]) - _tmp.append(lpath.name + '.bxtmp') - self.local_path = pathlib.Path(*_tmp) - del _tmp + self._view = None # calculate the total number of ops required for transfer self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) self._total_chunks = self._compute_total_chunks(self._chunk_size) @@ -218,6 +220,7 @@ def __init__(self, lpath, ase, options, resume_mgr): # initialize integrity checkers self.hmac = None self.md5 = None + self._integrity_failed = False self._initialize_integrity_checkers(options) @property @@ -293,36 +296,74 @@ def _initialize_integrity_checkers(self, options): blobxfer.util.is_not_empty(self._ase.md5)): self.md5 = blobxfer.util.new_md5_hasher() + def _compute_allocated_size(self, size): + # type: (Descriptor, int) -> int + """Compute allocated size on disk + :param Descriptor self: this + :param int size: size (content length) + :rtype: int + :return: required size on disk + """ + # compute size + if size > 0: + if self._ase.is_encrypted: + # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs + allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ + self._AES_BLOCKSIZE + else: + allocatesize = size + if allocatesize < 0: + allocatesize = 0 + else: + allocatesize = 0 + return allocatesize + + def _set_final_path_view(self): + # type: (Descriptor) -> int + """Set final path view and return required space on disk + :param Descriptor self: this + :rtype: int + :return: required size on disk + """ + slicesize = self._compute_allocated_size(self._ase.size) + if self._ase.vectored_io is None: + self._view = LocalPathView( + fd_start=0, + fd_end=slicesize, + ) + return self._ase.size + else: + name = self.final_path.name + name = blobxfer.models.metadata.\ + remove_vectored_io_slice_suffix_from_name( + name, self._ase.vectored_io.slice_id) + _tmp = list(self.final_path.parts[:-1]) + _tmp.append(name) + self.final_path = pathlib.Path(*_tmp) + self._view = LocalPathView( + fd_start=self._ase.vectored_io.offset_start, + fd_end=self._ase.vectored_io.offset_start + slicesize, + ) + return 
self._ase.vectored_io.total_size + def _allocate_disk_space(self): - # type: (Descriptor, int) -> None + # type: (Descriptor) -> None """Perform file allocation (possibly sparse) :param Descriptor self: this - :param int size: size """ with self._meta_lock: - if self._allocated: + if self._allocated or self._offset != 0: return - size = self._ase.size - # compute size - if size > 0: - if self._ase.is_encrypted: - # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs - allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ - self._AES_BLOCKSIZE - else: - allocatesize = size - if allocatesize < 0: - allocatesize = 0 - else: - allocatesize = 0 + # set local path view + allocatesize = self._set_final_path_view() # check if path already exists and is of sufficient size - if (not self.local_path.exists() or - self.local_path.stat().st_size != allocatesize): + if (not self.final_path.exists() or + self.final_path.stat().st_size != allocatesize): # create parent path - self.local_path.parent.mkdir( + self.final_path.parent.mkdir( mode=0o750, parents=True, exist_ok=True) # allocate file - with self.local_path.open('wb') as fd: + with self.final_path.open('wb') as fd: if allocatesize > 0: try: os.posix_fallocate(fd.fileno(), 0, allocatesize) @@ -341,7 +382,7 @@ def _resume(self): if self._resume_mgr is None or self._offset > 0 or self._finalized: return None # check if path exists in resume db - rr = self._resume_mgr.get_record(str(self.final_path)) + rr = self._resume_mgr.get_record(self._ase) if rr is None: logger.debug('no resume record for {}'.format(self.final_path)) return None @@ -373,10 +414,11 @@ def _resume(self): logger.debug('cannot resume encrypted entity {}'.format( self._ase.path)) return None - # check if intermediate (blobtmp) exists - if not self.local_path.exists(): - logger.warning('temporary download file {} does not exist'.format( - rr.temp_path)) + self._allocate_disk_space() + # check if final path exists + if not self.final_path.exists(): + logger.warning('download path {} does not exist'.format( + self.final_path)) return None if self.hmac is not None: raise RuntimeError( @@ -387,10 +429,14 @@ def _resume(self): if self.md5 is not None and curr_chunk > 0: _blocksize = blobxfer.util.MEGABYTE << 2 logger.debug( - 'integrity checking existing file {} to offset {}'.format( - self.final_path, _end_offset)) + 'integrity checking existing file {} offset {} -> {}'.format( + self.final_path, + self._view.fd_start, + self._view.fd_start + _end_offset) + ) with self._hasher_lock: - with self.local_path.open('rb') as filedesc: + with self.final_path.open('rb') as filedesc: + filedesc.seek(self._view.fd_start, 0) while _fd_offset < _end_offset: if (_fd_offset + _blocksize) > _end_offset: _blocksize = _end_offset - _fd_offset @@ -403,7 +449,7 @@ def _resume(self): if rr.md5hexdigest != hexdigest: logger.warning( 'MD5 mismatch resume={} computed={} for {}'.format( - rr.md5hexdigest, hexdigest, self.local_path)) + rr.md5hexdigest, hexdigest, self.final_path)) # reset hasher self.md5 = blobxfer.util.new_md5_hasher() return None @@ -434,7 +480,7 @@ def cleanup_all_temporary_files(self): """ # delete local file try: - self.local_path.unlink() + self.final_path.unlink() except OSError: pass # iterate unchecked chunks and delete @@ -510,13 +556,11 @@ def write_unchecked_data(self, offsets, data): :param Offsets offsets: download offsets :param bytes data: data """ - with self.local_path.open('r+b') as fd: - fd.seek(offsets.fd_start, 0) - fd.write(data) + self.write_data(offsets, data) 
unchecked = UncheckedChunk( data_len=len(data), - fd_start=offsets.fd_start, - file_path=self.local_path, + fd_start=self._view.fd_start + offsets.fd_start, + file_path=self.final_path, temp=False, ) with self._meta_lock: @@ -593,9 +637,8 @@ def perform_chunked_integrity_check(self): self._next_integrity_chunk += 1 if self.is_resumable: self._resume_mgr.add_or_update_record( - self.final_path, self.local_path, self._ase.size, - self._chunk_size, self._next_integrity_chunk, False, - md5hexdigest, + self.final_path, self._ase, self._chunk_size, + self._next_integrity_chunk, False, md5hexdigest, ) # decrement outstanding op counter self._outstanding_ops -= 1 @@ -609,8 +652,8 @@ def _update_resume_for_completed(self): return with self._meta_lock: self._resume_mgr.add_or_update_record( - self.final_path, self.local_path, self._ase.size, - self._chunk_size, self._next_integrity_chunk, True, None, + self.final_path, self._ase, self._chunk_size, + self._next_integrity_chunk, True, None, ) def write_data(self, offsets, data): @@ -621,13 +664,14 @@ def write_data(self, offsets, data): :param bytes data: data """ if len(data) > 0: - with self.local_path.open('r+b') as fd: - fd.seek(offsets.fd_start, 0) + with self.final_path.open('r+b') as fd: + # offset some internal view + fd.seek(self._view.fd_start + offsets.fd_start, 0) fd.write(data) - def finalize_file(self): + def finalize_integrity(self): # type: (Descriptor) -> None - """Finalize file download + """Finalize integrity check for download :param Descriptor self: this """ with self._meta_lock: @@ -668,26 +712,40 @@ def finalize_file(self): ) # cleanup if download failed if not check: + self._integrity_failed = True logger.error(msg) - # delete temp download file - self.local_path.unlink() - return logger.info(msg) + + def _restore_file_attributes(self): + # type: (Descriptor) -> None + """Restore file attributes for file + :param Descriptor self: this + """ + if self._ase.file_attributes is None: + return # set file uid/gid and mode - if self._ase.file_attributes is not None: - if blobxfer.util.on_windows(): - # TODO not implemented yet - pass - else: - self.local_path.chmod(int(self._ase.file_attributes.mode, 8)) - if os.getuid() == 0: - os.chown( - str(self.local_path), - self._ase.file_attributes.uid, - self._ase.file_attributes.gid - ) - # move temp download file to final path - blobxfer.util.replace_file(self.local_path, self.final_path) + if blobxfer.util.on_windows(): + # TODO not implemented yet + pass + else: + self.final_path.chmod(int(self._ase.file_attributes.mode, 8)) + if os.getuid() == 0: + os.chown( + str(self.final_path), + self._ase.file_attributes.uid, + self._ase.file_attributes.gid + ) + + def finalize_file(self): + # type: (Descriptor) -> None + """Finalize file for download + :param Descriptor self: this + """ + # delete bad file if integrity failed + if self._integrity_failed: + self.final_path.unlink() + else: + self._restore_file_attributes() # update resume file self._update_resume_for_completed() with self._meta_lock: diff --git a/blobxfer/models/metadata.py b/blobxfer/models/metadata.py index 139ed63..f891696 100644 --- a/blobxfer/models/metadata.py +++ b/blobxfer/models/metadata.py @@ -40,7 +40,7 @@ # create logger logger = logging.getLogger(__name__) # global defines -JSON_KEY_BLOBXFER_METADATA = 'BlobxferMetadata' +JSON_KEY_BLOBXFER_METADATA = 'blobxfer_metadata' # file attributes _JSON_KEY_FILE_ATTRIBUTES = 'FileAttributes' _JSON_KEY_FILE_ATTRIBUTES_POSIX = 'POSIX' @@ -60,15 +60,32 @@ # named tuples 
PosixFileAttr = collections.namedtuple( 'PosixFileAttr', [ + 'gid', 'mode', 'uid', - 'gid', ] ) WindowsFileAttr = collections.namedtuple( 'WindowsFileAttr', [ ] ) +VectoredStripe = collections.namedtuple( + 'VectoredStripe', [ + 'next', + 'offset_start', + 'slice_id', + 'total_size', + 'total_slices', + ] +) +VectoredNextEntry = collections.namedtuple( + 'VectoredNextEntry', [ + 'storage_account_name', + 'endpoint', + 'container', + 'name', + ] +) def generate_fileattr_metadata(local_path, metadata): @@ -97,7 +114,7 @@ def generate_fileattr_metadata(local_path, metadata): def fileattr_from_metadata(md): - # type: (dict) -> bool + # type: (dict) -> collections.namedtuple """Convert fileattr metadata in json metadata :param dict md: metadata dictionary :rtype: PosixFileAttr or WindowsFileAttr or None @@ -153,6 +170,38 @@ def create_vectored_io_next_entry(ase): ) +def explode_vectored_io_next_entry(entry): + # type: (str, int) -> str + """Explode next vectored io entry + :param str entry: next entry + :rtype: VectoredNextEntry + :return: vectored next entry + """ + tmp = entry.split(';') + _sa = tmp[0].split('.') + return VectoredNextEntry( + storage_account_name=_sa[0], + endpoint='.'.join(_sa[2:]), + container=tmp[1], + name=tmp[2], + ) + + +def remove_vectored_io_slice_suffix_from_name(name, slice): + # type: (str, int) -> str + """Remove vectored io (stripe) slice suffix from a given name + :param str name: entity name + :param int slice: slice num + :rtype: str + :return: name without suffix + """ + suffix = '.bxslice-{}'.format(slice) + if name.endswith(suffix): + return name[:-len(suffix)] + else: + return name + + def generate_vectored_io_stripe_metadata(local_path, metadata): # type: (blobxfer.models.upload.LocalPath, dict) -> dict """Generate vectored io stripe metadata dict @@ -172,8 +221,45 @@ def generate_vectored_io_stripe_metadata(local_path, metadata): local_path.view.total_slices, _JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID: local_path.view.slice_num, - _JSON_KEY_VECTORED_IO_STRIPE_NEXT: local_path.view.next, + _JSON_KEY_VECTORED_IO_STRIPE_NEXT: + explode_vectored_io_next_entry(local_path.view.next), } } } return blobxfer.util.merge_dict(metadata, md) + + +def vectored_io_from_metadata(md): + # type: (dict) -> collections.namedtuple + """Convert vectored io metadata in json metadata + :param dict md: metadata dictionary + :rtype: VectoredStripe or None + :return: vectored io metadata + """ + try: + mdattr = json.loads( + md[JSON_KEY_BLOBXFER_METADATA])[_JSON_KEY_VECTORED_IO] + except (KeyError, TypeError): + pass + else: + if mdattr[_JSON_KEY_VECTORED_IO_MODE] == _JSON_KEY_VECTORED_IO_STRIPE: + mdstripe = mdattr[_JSON_KEY_VECTORED_IO_STRIPE] + try: + nextptr = explode_vectored_io_next_entry( + mdstripe[_JSON_KEY_VECTORED_IO_STRIPE_NEXT]) + except (KeyError, AttributeError): + nextptr = None + vio = VectoredStripe( + total_size=mdstripe[_JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SIZE], + offset_start=mdstripe[ + _JSON_KEY_VECTORED_IO_STRIPE_OFFSET_START], + total_slices=mdstripe[ + _JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SLICES], + slice_id=mdstripe[_JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID], + next=nextptr, + ) + return vio + else: + raise RuntimeError('Cannot handle Vectored IO mode: {}'.format( + mdattr[_JSON_KEY_VECTORED_IO_MODE])) + return None diff --git a/blobxfer/models/resume.py b/blobxfer/models/resume.py index 37a5acc..aa8b9da 100644 --- a/blobxfer/models/resume.py +++ b/blobxfer/models/resume.py @@ -37,20 +37,19 @@ class Download(object): """Download resume object""" def __init__( - 
self, final_path, temp_path, length, chunk_size, - next_integrity_chunk, completed, md5): - # type: (Download, str, str, int, int, int, str) -> None + self, final_path, length, chunk_size, next_integrity_chunk, + completed, md5): + # type: (Download, str, int, int, int, bool, str) -> None """Ctor for Download :param Download self: this :param str final_path: final path - :param str temp_path: temporary path :param int length: total bytes :param int chunk_size: chunk size in bytes :param int next_integrity_chunk: next integrity chunk + :param bool completed: completed :param str md5: md5 hex digest """ self._final_path = final_path - self._temp_path = temp_path self._length = length self._chunk_size = chunk_size self._next_integrity_chunk = next_integrity_chunk @@ -67,16 +66,6 @@ def final_path(self): """ return self._final_path - @property - def temp_path(self): - # type: (Download) -> str - """Temp path - :param Download self: this - :rtype: str - :return: temp path - """ - return self._temp_path - @property def length(self): # type: (Download) -> int @@ -163,9 +152,9 @@ def __repr__(self): :rtype: str :return: representation string """ - return ('Download').format( - self.final_path, self.temp_path, self.length, - self.chunk_size, self.next_integrity_chunk, - self.completed, self.md5hexdigest, + self.final_path, self.length, self.chunk_size, + self.next_integrity_chunk, self.completed, + self.md5hexdigest, ) diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index 61dfe53..177d41d 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -34,6 +34,7 @@ import requests # local imports import blobxfer.models +import blobxfer.models.metadata import blobxfer.operations.azure.blob.append import blobxfer.operations.azure.blob.block import blobxfer.operations.azure.blob.page @@ -263,6 +264,100 @@ def files(self, creds, options, general_options): creds, options, general_options): yield blob + def _convert_to_storage_entity_with_encryption_metadata( + self, options, sa, entity, vio, is_file, container, dir): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, StorageAccount, object, + # blobxfer.models.metadata.VectoredStripe, bool, str, + # str) -> StorageEntity + """Convert entity into StorageEntity with encryption metadata if avail + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param StorageAccount sa: storage account + :param object entity: Storage File or Blob object + :param blobxfer.models.metadata.VectoredStripe vio: Vectored stripe + :param bool is_file: is a file object + :param str container: container + :param str dir: Azure File directory structure + :rtype: StorageEntity + :return: Azure storage entity object + """ + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(entity.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json( + entity.metadata, file.name, options.rsa_private_key) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(container, ed) + if is_file: + ase.populate_from_file(sa, entity, dir, vio) + else: + ase.populate_from_blob(sa, entity, vio) + return ase + + def _handle_vectored_io_stripe( + self, creds, options, general_options, sa, entity, is_file, + container, dir=None): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General, 
StorageAccount, object, + # bool, str, str) -> StorageEntity + """Handle Vectored IO stripe entries + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :param StorageAccount sa: storage account + :param object entity: Storage File or Blob object + :param bool is_file: is a file object + :param str container: container + :param str dir: Azure File directory structure + :rtype: StorageEntity + :return: Azure storage entity object + """ + vio = blobxfer.models.metadata.vectored_io_from_metadata( + entity.metadata) + if not isinstance(vio, blobxfer.models.metadata.VectoredStripe): + ase = self._convert_to_storage_entity_with_encryption_metadata( + options, sa, entity, None, is_file, container, dir) + yield ase + return + # if this slice is not the first, ignore. the reason for this is + # 1. Ensures direct get on a slice does nothing unless the + # zero-th blob is retrieved/accessed (eliminates partial data + # download), which will reconstruct all of the stripes via next + # pointers + # 2. Data is not retrieved multiple times for the same slice without + # having to maintain a fetched map + if vio.slice_id != 0: + yield None + return + # yield this entity + ase = self._convert_to_storage_entity_with_encryption_metadata( + options, sa, entity, vio, is_file, container, dir) + yield ase + # iterate all slices + while vio.next is not None: + # follow next pointer + sa = creds.get_storage_account(vio.next.storage_account_name) + if is_file: + entity = blobxfer.operations.azure.file.get_file_properties( + sa.file_client, vio.next.container, vio.next.name, + timeout=general_options.timeout_sec) + _, dir = blobxfer.util.explode_azure_path(vio.next.name) + else: + entity = blobxfer.operations.azure.blob.get_blob_properties( + sa.block_blob_client, vio.next.container, vio.next.name, + ase.mode, timeout=general_options.timeout_sec) + vio = blobxfer.models.metadata.vectored_io_from_metadata( + entity.metadata) + # yield next + ase = self._convert_to_storage_entity_with_encryption_metadata( + options, sa, entity, vio, is_file, container, dir) + yield ase + def _populate_from_list_files(self, creds, options, general_options): # type: (SourcePath, StorageCredentials, # blobxfer.models.options.Download, @@ -284,19 +379,15 @@ def _populate_from_list_files(self, creds, options, general_options): general_options.timeout_sec): if not self._inclusion_check(file.name): continue - if blobxfer.models.crypto.EncryptionMetadata.\ - encryption_metadata_exists(file.metadata): - ed = blobxfer.models.crypto.EncryptionMetadata() - ed.convert_from_json( - file.metadata, file.name, options.rsa_private_key) - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) if dir is not None: dir, _ = blobxfer.operations.azure.file.parse_file_path( dir) - ase.populate_from_file(sa, file, dir) - yield ase + for ase in self._handle_vectored_io_stripe( + creds, options, general_options, sa, file, True, cont, + dir): + if ase is None: + continue + yield ase def _populate_from_list_blobs(self, creds, options, general_options): # type: (SourcePath, StorageCredentials, @@ -319,16 +410,12 @@ def _populate_from_list_blobs(self, creds, options, general_options): options.recursive, general_options.timeout_sec): if not self._inclusion_check(blob.name): continue - if blobxfer.models.crypto.EncryptionMetadata.\ - encryption_metadata_exists(blob.metadata): - ed = 
blobxfer.models.crypto.EncryptionMetadata() - ed.convert_from_json( - blob.metadata, blob.name, options.rsa_private_key) - else: - ed = None - ase = blobxfer.models.azure.StorageEntity(cont, ed) - ase.populate_from_blob(sa, blob) - yield ase + for ase in self._handle_vectored_io_stripe( + creds, options, general_options, sa, blob, False, + cont): + if ase is None: + continue + yield ase class DestinationPath(blobxfer.models._BaseSourcePaths): diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py index ba6982c..76cf001 100644 --- a/blobxfer/operations/crypto.py +++ b/blobxfer/operations/crypto.py @@ -262,7 +262,8 @@ def _worker_process(self): self._done_cv.acquire() self._done_queue.put(fpath) elif inst[0] == CryptoAction.Decrypt: - final_path, local_path, offsets, symkey, iv, hmac_datafile = \ + final_path, internal_fdstart, offsets, symkey, iv, \ + hmac_datafile = \ inst[1], inst[2], inst[3], inst[4], inst[5], inst[6] # read encrypted data from disk with open(hmac_datafile, 'rb') as fd: @@ -271,8 +272,8 @@ def _worker_process(self): symkey, iv, encdata, offsets.unpad) # write decrypted data to disk if len(data) > 0: - with open(local_path, 'r+b') as fd: - fd.seek(offsets.fd_start, 0) + with open(final_path, 'r+b') as fd: + fd.seek(internal_fdstart + offsets.fd_start, 0) fd.write(data) self._done_cv.acquire() self._done_queue.put((final_path, offsets)) @@ -281,21 +282,22 @@ def _worker_process(self): self._done_cv.release() def add_decrypt_chunk( - self, final_path, local_path, offsets, symkey, iv, hmac_datafile): - # type: (CryptoOffload, str, str, blobxfer.models.download.Offsets, + self, final_path, internal_fdstart, offsets, symkey, iv, + hmac_datafile): + # type: (CryptoOffload, str, int, blobxfer.models.download.Offsets, # bytes, bytes, str) -> None """Add a chunk to decrypt :param CryptoOffload self: this :param str final_path: final path - :param str local_path: temp local path + :param int internal_fdstart: internal fd offset start :param blobxfer.models.download.Offsets offsets: offsets :param bytes symkey: symmetric key :param bytes iv: initialization vector :param str hmac_datafile: encrypted data file """ self._task_queue.put( - (CryptoAction.Decrypt, final_path, local_path, offsets, symkey, - iv, hmac_datafile) + (CryptoAction.Decrypt, final_path, internal_fdstart, offsets, + symkey, iv, hmac_datafile) ) # UNUSED due to AES256-CBC FullBlob mode diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index 47c237e..eb01eac 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -97,6 +97,7 @@ def __init__(self, general_options, creds, spec): self._start_time = None self._delete_after = set() self._dd_map = {} + self._vio_map = {} self._general_options = general_options self._creds = creds self._spec = spec @@ -170,17 +171,30 @@ def ensure_local_destination(creds, spec): # ensure destination path spec.destination.ensure_path_exists() + @staticmethod + def create_unique_transfer_operation_id(ase): + # type: (blobxfer.models.azure.StorageEntity) -> str + """Create a unique transfer operation id + :param blobxfer.models.azure.StorageEntity ase: storage entity + :rtype: str + :return: unique transfer id + """ + return ';'.join( + (ase._client.primary_endpoint, ase.path, str(ase.vectored_io)) + ) + @staticmethod def create_unique_disk_operation_id(dd, offsets): # type: (blobxfer.models.download.Descriptor, - # blobxfer.models.download.Offsets) -> None + # blobxfer.models.download.Offsets) -> str """Create a 
unique disk operation id :param blobxfer.models.download.Descriptor dd: download descriptor :param blobxfer.models.download.Offsets offsets: download offsets + :rtype: str + :return: unique disk id """ - # TODO add local view offset or slice num with stripe support return ';'.join( - (str(dd.local_path), dd.entity._client.primary_endpoint, + (str(dd.final_path), dd.entity._client.primary_endpoint, dd.entity.path, str(offsets.range_start)) ) @@ -282,7 +296,9 @@ def _post_md5_skip_on_check(self, filename, md5_match): lpath = pathlib.Path(filename) if md5_match: with self._transfer_lock: - self._transfer_set.remove(lpath) + self._transfer_set.remove( + blobxfer.operations.download.Downloader. + create_unique_transfer_operation_id(rfile)) self._download_total -= 1 self._download_bytes_total -= lpath.stat().st_size else: @@ -467,14 +483,37 @@ def _process_download_descriptor(self, dd): del resume_bytes # check if all operations completed if offsets is None and dd.all_operations_completed: - # finalize file - dd.finalize_file() + finalize = True + # finalize integrity + dd.finalize_integrity() # accounting with self._transfer_lock: + sfpath = str(dd.final_path) if dd.entity.is_encrypted: - self._dd_map.pop(str(dd.final_path)) - self._transfer_set.remove(dd.final_path) + self._dd_map.pop(sfpath) + self._transfer_set.remove( + blobxfer.operations.download.Downloader. + create_unique_transfer_operation_id(dd.entity)) self._download_sofar += 1 + if dd.entity.vectored_io is not None: + if sfpath not in self._vio_map: + self._vio_map[sfpath] = 1 + else: + self._vio_map[sfpath] += 1 + if (self._vio_map[sfpath] == + dd.entity.vectored_io.total_slices): + self._vio_map.pop(sfpath) + else: + finalize = False + del sfpath + # finalize file + if finalize: + dd.finalize_file() + # remove from delete after set + try: + self._delete_after.remove(dd.final_path) + except KeyError: + pass return # re-enqueue for other threads to download self._transfer_queue.put(dd) @@ -524,7 +563,7 @@ def _process_data(self, dd, offsets, data): # decrypt data if self._crypto_offload is not None: self._crypto_offload.add_decrypt_chunk( - str(dd.final_path), str(dd.local_path), offsets, + str(dd.final_path), dd._view.fd_start, offsets, dd.entity.encryption_metadata.symmetric_key, iv, _hmac_datafile) # data will be integrity checked and written once @@ -652,11 +691,6 @@ def _run(self): else: lpath = pathlib.Path( self._spec.destination.path, rfile.name) - # remove from delete after set - try: - self._delete_after.remove(lpath) - except KeyError: - pass # check on download conditions action = self._check_download_conditions(lpath, rfile) if action == DownloadAction.Skip: @@ -665,7 +699,9 @@ def _run(self): continue # add potential download to set with self._transfer_lock: - self._transfer_set.add(lpath) + self._transfer_set.add( + blobxfer.operations.download.Downloader. 
+ create_unique_transfer_operation_id(rfile)) # either MD5 check or download now if action == DownloadAction.CheckMd5: self._pre_md5_skip_on_check(lpath, rfile) diff --git a/blobxfer/operations/resume.py b/blobxfer/operations/resume.py index 0f76562..97e37e4 100644 --- a/blobxfer/operations/resume.py +++ b/blobxfer/operations/resume.py @@ -92,45 +92,59 @@ def datalock(self, acquire=True): if acquire: self._lock.release() - def get_record(self, final_path, lock=True): + @staticmethod + def generate_record_key(ase): + # type: (blobxfer.models.azure.StorageEntity) -> str + """Generate a record key + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :rtype: str + :return: record key + """ + return '{}:{}'.format(ase._client.primary_endpoint, ase.path) + + def get_record(self, ase, key=None, lock=True): # type: (DownloadResumeManager, str, # bool) -> blobxfer.models.resume.Download """Get a resume record :param DownloadResumeManager self: this - :param str final_path: final path + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :param str key: record key :param bool lock: acquire lock :rtype: blobxfer.models.resume.Download :return: Download record """ + if key is None: + key = blobxfer.operations.resume.DownloadResumeManager.\ + generate_record_key(ase) with self.datalock(lock): try: - return self._data[final_path] + return self._data[key] except KeyError: return None def add_or_update_record( - self, final_path, temp_path, length, chunk_size, - next_integrity_chunk, completed, md5): - # type: (DownloadResumeManager, pathlib.Path, pathlib.Path, int, int, - # int, bool, str) -> None + self, final_path, ase, chunk_size, next_integrity_chunk, + completed, md5): + # type: (DownloadResumeManager, pathlib.Path, + # blobxfer.models.azure.StorageEntity, int, int, bool, + # str) -> None """Get a resume record :param DownloadResumeManager self: this :param pathlib.Path final_path: final path - :param pathlib.Path temp_path: temp local path - :param int length: content length + :param blobxfer.models.azure.StorageEntity ase: Storage Entity :param int chunk_size: chunk size in bytes :param int next_integrity_chunk: next integrity chunk :param bool completed: if completed :param str md5: md5 hex digest """ - sfp = str(final_path) + key = blobxfer.operations.resume.DownloadResumeManager.\ + generate_record_key(ase) with self.datalock(): - dl = self.get_record(sfp, lock=False) + dl = self.get_record(ase, key=key, lock=False) if dl is None: dl = blobxfer.models.resume.Download( - final_path=sfp, - temp_path=str(temp_path), - length=length, + final_path=str(final_path), + length=ase._size, chunk_size=chunk_size, next_integrity_chunk=next_integrity_chunk, completed=completed, @@ -145,5 +159,5 @@ def add_or_update_record( else: dl.next_integrity_chunk = next_integrity_chunk dl.md5hexdigest = md5 - self._data[sfp] = dl + self._data[key] = dl self._data.sync() From e8ab3780649ad79e6cd44464097ee3dd34056a61 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Sun, 28 May 2017 19:39:40 -0700 Subject: [PATCH 34/47] Docstring updates --- blobxfer/models/crypto.py | 3 ++- blobxfer/models/upload.py | 57 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/blobxfer/models/crypto.py b/blobxfer/models/crypto.py index c6670f2..b7b0004 100644 --- a/blobxfer/models/crypto.py +++ b/blobxfer/models/crypto.py @@ -329,13 +329,14 @@ def convert_to_json_with_mac(self, md5digest, hmacdigest): :rtype: dict :return: encryption metadata """ + # encrypt keys 
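# Illustrative sketch (not part of the patch above): rsa_encrypt_key_base64_encoded()
# is expected to wrap the AES symmetric key (and the HMAC signing key) with the RSA
# public key and base64-encode the result for the encryption metadata JSON built
# below. A minimal sketch of that idea with the `cryptography` package; the
# OAEP/SHA-1 padding choice and key sizes are assumptions for illustration only.
import base64
import os

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding, rsa

rsa_private_key = rsa.generate_private_key(
    public_exponent=65537, key_size=2048, backend=default_backend())
rsa_public_key = rsa_private_key.public_key()
symmetric_key = os.urandom(32)  # AES-256 content encryption key

# wrap the symmetric key with the RSA public key, then base64-encode it
wrapped = rsa_public_key.encrypt(
    symmetric_key,
    padding.OAEP(
        mgf=padding.MGF1(algorithm=hashes.SHA1()),
        algorithm=hashes.SHA1(),
        label=None))
enc_content_key = base64.b64encode(wrapped).decode('ascii')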
enc_content_key = blobxfer.operations.crypto.\ rsa_encrypt_key_base64_encoded( None, self._rsa_public_key, self.symmetric_key) enc_sign_key = blobxfer.operations.crypto.\ rsa_encrypt_key_base64_encoded( None, self._rsa_public_key, self.signing_key) - + # generate json encjson = { EncryptionMetadata._JSON_KEY_ENCRYPTION_MODE: EncryptionMetadata._ENCRYPTION_MODE, diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index da3a1fe..bc251f9 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -91,7 +91,16 @@ def __str__(self): class LocalPath(object): + """Local Path""" + def __init__(self, parent_path, relative_path, view=None): + # type: (LocalPath, pathlib.Path, pathlib.Path, LocalPathView) -> None + """Ctor for LocalPath + :param LocalPath self: this + :param pathlib.Path parent_path: parent path + :param pathlib.Path relative_path: relative path + :param LocalPathView view: local path view + """ self.parent_path = parent_path self.relative_path = relative_path # populate properties @@ -111,30 +120,72 @@ def __init__(self, parent_path, relative_path, view=None): @property def absolute_path(self): + # type: (LocalPath) -> pathlib.Path + """Absolute path + :param LocalPath self: this + :rtype: pathlib.Path + :return: absolute path + """ return self.parent_path / self.relative_path @property def size(self): + # type: (LocalPath) -> int + """Size of view + :param LocalPath self: this + :rtype: int + :return: size of view portion of the file + """ return self._size @property def total_size(self): + # type: (LocalPath) -> int + """Total Size of file + :param LocalPath self: this + :rtype: int + :return: total size of file (non-view) + """ return self._stat.st_size @property def lmt(self): + # type: (LocalPath) -> int + """mtime of file + :param LocalPath self: this + :rtype: int + :return: mtime of file + """ return self._stat.st_mtime @property def mode(self): + # type: (LocalPath) -> str + """Octal file mode + :param LocalPath self: this + :rtype: str + :return: octal file mode + """ return str(oct(self._stat.st_mode)) @property def uid(self): + # type: (LocalPath) -> int + """Uid of file + :param LocalPath self: this + :rtype: int + :return: uid of file + """ return self._stat.st_uid @property def gid(self): + # type: (LocalPath) -> int + """Gid of file + :param LocalPath self: this + :rtype: int + :return: gid of file + """ return self._stat.st_gid @@ -142,6 +193,12 @@ class LocalSourcePath(blobxfer.models._BaseSourcePaths): """Local Source Path""" def can_rename(self): + # type: (LocalSourcePaths) -> bool + """Check if ource can be renamed + :param LocalSourcePath self: this + :rtype: bool + :return: if rename possible + """ return len(self._paths) == 1 and self._paths[0].is_file() def files(self): From 53d0beb8d9d14228adc88c2b3a644f6a5a3ada21 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Mon, 29 May 2017 18:00:29 -0700 Subject: [PATCH 35/47] Support stdin --- blobxfer/__init__.py | 11 +++ blobxfer/models/upload.py | 119 +++++++++++++++++++++++++------- blobxfer/operations/progress.py | 22 ++++-- blobxfer/operations/upload.py | 40 ++++++++--- 4 files changed, 150 insertions(+), 42 deletions(-) diff --git a/blobxfer/__init__.py b/blobxfer/__init__.py index 8babc97..29ee1b0 100644 --- a/blobxfer/__init__.py +++ b/blobxfer/__init__.py @@ -22,6 +22,7 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
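# Illustrative sketch (not part of the patch below): the stdin-upload path added
# in this patch reads the process' binary stdin handle in chunk-size pieces until
# EOF, using the module-level STDIN set up here. A minimal, standalone sketch of
# that read loop; the 4 MiB chunk size mirrors _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES
# and the consumer is a placeholder assumption.
import sys

STDIN = sys.stdin.buffer if sys.version_info >= (3, 0) else sys.stdin

def read_stdin_chunks(chunk_size=4194304):
    # generator yielding binary chunks read from stdin until EOF
    while True:
        data = STDIN.read(chunk_size)
        if not data:
            break
        yield data

# usage sketch: for chunk in read_stdin_chunks(): hand chunk to an upload worker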
+import sys from .version import __version__ # noqa # monkeypatch User-Agent string @@ -31,3 +32,13 @@ # monkeypatch SOCKET_TIMEOUT value in Azure Storage SDK azure.storage._constants.SOCKET_TIMEOUT = (5, 300) + +# set stdin source +if sys.version_info >= (3, 0): + STDIN = sys.stdin.buffer +else: + # set stdin to binary mode on Windows + if sys.platform == 'win32': + import os, msvcrt # noqa + msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) + STDIN = sys.stdin diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index bc251f9..8f4c579 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # global defines _MAX_BLOCK_BLOB_ONESHOT_BYTES = 268435456 -_MAX_BLOCK_BLOB_CHUNKSIZE_BYTES = 268435456 +_MAX_BLOCK_BLOB_CHUNKSIZE_BYTES = 104857600 _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304 _MAX_NUM_CHUNKS = 50000 _DEFAULT_AUTO_CHUNKSIZE_BYTES = 16777216 @@ -93,18 +93,30 @@ def __str__(self): class LocalPath(object): """Local Path""" - def __init__(self, parent_path, relative_path, view=None): - # type: (LocalPath, pathlib.Path, pathlib.Path, LocalPathView) -> None + def __init__(self, parent_path, relative_path, use_stdin=False, view=None): + # type: (LocalPath, pathlib.Path, pathlib.Path, bool, + # LocalPathView) -> None """Ctor for LocalPath :param LocalPath self: this :param pathlib.Path parent_path: parent path :param pathlib.Path relative_path: relative path + :param bool use_stdin: use stdin :param LocalPathView view: local path view """ self.parent_path = parent_path self.relative_path = relative_path + self.use_stdin = use_stdin # populate properties - self._stat = self.absolute_path.stat() + if self.use_stdin: + # create dummy stat object + self._stat = type('stat', (object,), {}) + self._stat.st_size = 0 + self._stat.st_mtime = 0 + self._stat.st_mode = 0 + self._stat.st_uid = 0 + self._stat.st_gid = 0 + else: + self._stat = self.absolute_path.stat() if view is None: self.view = LocalPathView( fd_start=0, @@ -194,13 +206,25 @@ class LocalSourcePath(blobxfer.models._BaseSourcePaths): def can_rename(self): # type: (LocalSourcePaths) -> bool - """Check if ource can be renamed + """Check if source can be renamed :param LocalSourcePath self: this :rtype: bool :return: if rename possible """ return len(self._paths) == 1 and self._paths[0].is_file() + @staticmethod + def is_stdin(path): + # type: (str) -> bool + """Check if path is stdin + :param str path: path to check + :rtype: bool + :return: if path is stdin + """ + if path == '-' or path == '/dev/stdin': + return True + return False + def files(self): # type: (LocalSourcePaths) -> LocalPath """Generator for files in paths @@ -210,6 +234,15 @@ def files(self): """ for _path in self._paths: _ppath = os.path.expandvars(os.path.expanduser(str(_path))) + # check of path is stdin + if blobxfer.models.upload.LocalSourcePath.is_stdin(_ppath): + yield LocalPath( + parent_path=pathlib.Path(), + relative_path=pathlib.Path('stdin'), + use_stdin=True, + ) + continue + # resolve path _expath = pathlib.Path(_ppath).resolve() # check if path is a single file tmp = pathlib.Path(_ppath) @@ -217,7 +250,8 @@ def files(self): if self._inclusion_check(tmp.name): yield LocalPath( parent_path=tmp.parent, - relative_path=pathlib.Path(tmp.name) + relative_path=pathlib.Path(tmp.name), + use_stdin=False, ) continue del tmp @@ -225,7 +259,11 @@ def files(self): _rpath = pathlib.Path(entry.path).relative_to(_ppath) if not self._inclusion_check(_rpath): continue - yield 
LocalPath(parent_path=_expath, relative_path=_rpath) + yield LocalPath( + parent_path=_expath, + relative_path=_rpath, + use_stdin=False, + ) class Specification(object): @@ -523,7 +561,12 @@ def _adjust_chunk_size(self, options): logger.debug( 'auto-selected chunk size of {} for {}'.format( chunk_size, self.local_path.absolute_path)) - self._chunk_size = min((chunk_size, self._ase.size)) + if self.local_path.use_stdin: + self._chunk_size = max( + (chunk_size, _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES) + ) + else: + self._chunk_size = min((chunk_size, self._ase.size)) # ensure chunk sizes are compatible with mode if self._ase.mode == blobxfer.models.azure.StorageModes.Append: if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: @@ -533,7 +576,8 @@ def _adjust_chunk_size(self, options): 'from {}').format( self._chunk_size, self.local_path.absolute_path)) elif self._ase.mode == blobxfer.models.azure.StorageModes.Block: - if self._ase.size <= options.one_shot_bytes: + if (not self.local_path.use_stdin and + self._ase.size <= options.one_shot_bytes): self._chunk_size = min( (self._ase.size, options.one_shot_bytes) ) @@ -569,6 +613,8 @@ def _compute_total_chunks(self, chunk_size): chunks = int(math.ceil(self._ase.size / chunk_size)) except ZeroDivisionError: chunks = 1 + if self.local_path.use_stdin and chunks == 0: + chunks = 1 if chunks > 50000: max_vector = False if self._ase.mode == blobxfer.models.azure.StorageModes.Block: @@ -645,26 +691,49 @@ def next_offsets(self): ), resume_bytes def read_data(self, offsets): - # type: (Descriptor, Offsets) -> bytes + # type: (Descriptor, Offsets) -> Tuple[bytes, Offsets] """Read data from file :param Descriptor self: this :param Offsets offsets: offsets - :rtype: bytes - :return: file data - """ - if offsets.num_bytes == 0: - return None - # compute start from view - start = self.local_path.view.fd_start + offsets.range_start - # encrypted offsets will read past the end of the file due - # to padding, but will be accounted for after encryption+padding - with self.local_path.absolute_path.open('rb') as fd: - fd.seek(start, 0) - data = fd.read(offsets.num_bytes) - if self.must_compute_md5: + :rtype: tuple + :return: (file data bytes, new Offsets if stdin) + """ + newoffset = None + if not self.local_path.use_stdin: + if offsets.num_bytes == 0: + return None, None + # compute start from view + start = self.local_path.view.fd_start + offsets.range_start + # encrypted offsets will read past the end of the file due + # to padding, but will be accounted for after encryption+padding + with self.local_path.absolute_path.open('rb') as fd: + fd.seek(start, 0) + data = fd.read(offsets.num_bytes) + else: + data = blobxfer.STDIN.read(self._chunk_size) + if not data: + with self._meta_lock: + self._total_chunks -= 1 + self._chunk_num -= 1 + self._outstanding_ops -= 1 + else: + num_bytes = len(data) + with self._meta_lock: + newoffset = Offsets( + chunk_num=self._chunk_num - 1, + num_bytes=num_bytes, + range_start=self._offset, + range_end=self._offset + num_bytes - 1, + pad=False, + ) + self._total_chunks += 1 + self._outstanding_ops += 1 + self._offset += num_bytes + self._ase.size += num_bytes + if self.must_compute_md5 and data: with self._hasher_lock: self.md5.update(data) - return data + return data, newoffset def generate_metadata(self): # type: (Descriptor) -> dict @@ -690,7 +759,7 @@ def generate_metadata(self): encmeta = self._ase.encryption_metadata.convert_to_json_with_mac( md5digest, hmacdigest) # generate file attribute metadata - if 
self._store_file_attr: + if self._store_file_attr and not self.local_path.use_stdin: merged = blobxfer.models.metadata.generate_fileattr_metadata( self.local_path, genmeta) if merged is not None: diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index b9d93a7..654653f 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -48,9 +48,9 @@ def update_progress_bar( go, optext, start, total_files, files_sofar, total_bytes, - bytes_sofar): + bytes_sofar, stdin_upload=False): # type: (blobxfer.models.options.General, str, datetime.datetime, int, - # int, int, int) -> None + # int, int, int, bool) -> None """Update the progress bar :param blobxfer.models.options.General go: general options :param str optext: operation prefix text @@ -59,6 +59,7 @@ def update_progress_bar( :param int files_sofar: files transfered so far :param int total_bytes: total number of bytes :param int bytes_sofar: bytes transferred so far + :param bool stdin_upload: stdin upload """ if (not go.progress_bar or blobxfer.util.is_none_or_empty(go.log_file) or start is None): @@ -80,11 +81,18 @@ def update_progress_bar( fprog = 'n/a' else: fprog = '{}/{}'.format(files_sofar, total_files) - sys.stdout.write( - ('\r{0} progress: [{1:30s}] {2:.2f}% {3:12.3f} MiB/sec, ' - '{4} {5}').format( - optext, '>' * int(done * 30), done * 100, rate, fprog, rtext) - ) + if stdin_upload: + sys.stdout.write( + ('\r{0} progress: [{1:30s}] n/a % {2:12.3f} MiB/sec, ' + '{3} {4}').format( + optext, '>' * int(total_bytes % 30), rate, fprog, rtext) + ) + else: + sys.stdout.write( + ('\r{0} progress: [{1:30s}] {2:.2f}% {3:12.3f} MiB/sec, ' + '{4} {5}').format( + optext, '>' * int(done * 30), done * 100, rate, fprog, rtext) + ) if files_sofar == total_files: sys.stdout.write(os.linesep) sys.stdout.flush() diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 7fc094f..adfcc77 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -191,10 +191,11 @@ def append_slice_suffix_to_name(name, slice): """ return '{}.bxslice-{}'.format(name, slice) - def _update_progress_bar(self): - # type: (Uploader) -> None + def _update_progress_bar(self, stdin=False): + # type: (Uploader, bool) -> None """Update progress bar :param Uploader self: this + :param bool stdin: stdin upload """ if not self._all_files_processed: return @@ -206,6 +207,7 @@ def _update_progress_bar(self): self._upload_sofar, self._upload_bytes_total, self._upload_bytes_sofar, + stdin_upload=stdin, ) def _pre_md5_skip_on_check(self, src, rfile): @@ -370,7 +372,9 @@ def _process_transfer(self, ud, ase, offsets, data): self._put_data(ud, ase, offsets, data) # accounting with self._transfer_lock: - if offsets.chunk_num == 0: + if ud.local_path.use_stdin: + self._upload_bytes_total += offsets.num_bytes + elif offsets.chunk_num == 0: self._upload_bytes_total += ase.size self._upload_bytes_sofar += offsets.num_bytes self._transfer_set.remove( @@ -378,7 +382,7 @@ def _process_transfer(self, ud, ase, offsets, data): ud.local_path, ase, offsets)) ud.complete_offset_upload() # update progress bar - self._update_progress_bar() + self._update_progress_bar(stdin=ud.local_path.use_stdin) def _put_data(self, ud, ase, offsets, data): # type: (Uploader, blobxfer.models.upload.Descriptor, @@ -462,7 +466,15 @@ def _prepare_upload(self, ase, offsets): :param blobxfer.models.azure.StorageEntity ase: Storage entity :param blobxfer.models.upload.Offsets offsets: offsets """ - if ase.mode == 
blobxfer.models.azure.StorageModes.Block: + if ase.mode == blobxfer.models.azure.StorageModes.Append: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # create remote blob + blobxfer.operations.azure.blob.append.create_blob( + ase, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.Block: # create container if necessary blobxfer.operations.azure.blob.create_container( ase, self._containers_created, @@ -496,7 +508,7 @@ def _process_upload_descriptor(self, ud): :param Uploader self: this :param blobxfer.models.upload.Descriptor: upload descriptor """ - # get download offsets + # get upload offsets offsets, resume_bytes = ud.next_offsets() # add resume bytes to counter if resume_bytes is not None: @@ -531,7 +543,7 @@ def _process_upload_descriptor(self, ud): # encrypt data if self._crypto_offload is None: # read data from file and encrypt - data = ud.read_data(offsets) + data, _ = ud.read_data(offsets) encdata = blobxfer.operations.crypto.aes_cbc_encrypt_data( ud.entity.encryption_metadata.symmetric_key, ud.current_iv, data, offsets.pad) @@ -552,9 +564,15 @@ def _process_upload_descriptor(self, ud): # retrieved from crypto queue # return_early = True else: - data = ud.read_data(offsets) + data, newoffset = ud.read_data(offsets) + # set new offset if stdin + if newoffset is not None: + offsets = newoffset # re-enqueue for other threads to upload self._upload_queue.put(ud) + # no data can be returned on stdin uploads + if not data: + return # add data to transfer queue with self._transfer_lock: self._transfer_set.add( @@ -713,7 +731,7 @@ def _check_upload_conditions(self, local_path, rfile): """ lpath = local_path.absolute_path # check if local file still exists - if not lpath.exists(): + if not local_path.use_stdin and not lpath.exists(): return UploadAction.Skip # if remote file doesn't exist, upload if rfile is None: @@ -849,7 +867,8 @@ def _vectorize_and_bind(self, local_path, dest): :return: action, LocalPath, ase """ if (self._spec.options.vectored_io.distribution_mode == - blobxfer.models.upload.VectoredIoDistributionMode.Stripe): + blobxfer.models.upload.VectoredIoDistributionMode.Stripe and + not local_path.use_stdin): # compute total number of slices slices = int(math.ceil( local_path.total_size / @@ -897,6 +916,7 @@ def _vectorize_and_bind(self, local_path, dest): lp_slice = blobxfer.models.upload.LocalPath( parent_path=local_path.parent_path, relative_path=local_path.relative_path, + use_stdin=False, view=blobxfer.models.upload.LocalPathView( fd_start=start, fd_end=end, From 5ae05958ed13c94b47e73f067f739889261ced73 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 30 May 2017 08:55:36 -0700 Subject: [PATCH 36/47] Append blob support --- blobxfer/__init__.py | 5 +- blobxfer/models/azure.py | 32 ++++ blobxfer/models/upload.py | 17 +- blobxfer/operations/azure/blob/__init__.py | 31 ++++ blobxfer/operations/azure/blob/append.py | 15 ++ blobxfer/operations/azure/blob/page.py | 31 ---- blobxfer/operations/progress.py | 2 + blobxfer/operations/upload.py | 190 +++++++++++++-------- cli/cli.py | 3 +- 9 files changed, 216 insertions(+), 110 deletions(-) diff --git a/blobxfer/__init__.py b/blobxfer/__init__.py index 29ee1b0..0a8432f 100644 --- a/blobxfer/__init__.py +++ b/blobxfer/__init__.py @@ -38,7 +38,8 @@ STDIN = sys.stdin.buffer else: # set stdin to binary mode on Windows - if sys.platform == 'win32': - import os, msvcrt # 
noqa + if sys.platform == 'win32': # noqa + import msvcrt + import os msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) STDIN = sys.stdin diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py index abfa10e..b9eb088 100644 --- a/blobxfer/models/azure.py +++ b/blobxfer/models/azure.py @@ -71,6 +71,8 @@ def __init__(self, container, ed=None, fileattr=None): self._snapshot = None self._md5 = None self._encryption = ed + self._from_local = False + self._append_create = True self._vio = None self._fileattr = None self.replica_targets = None @@ -184,6 +186,35 @@ def mode(self): """ return self._mode + @property + def from_local(self): + # type: (StorageEntity) -> bool + """If entity was created from a local file (no remote exists) + :param StorageEntity self: this + :rtype: bool + :return: if entity is from local (no remote exists) + """ + return self._from_local + + @property + def append_create(self): + # type: (StorageEntity) -> bool + """If append blob should be created + :param StorageEntity self: this + :rtype: bool + :return: if append blob should be created + """ + return self._append_create + + @append_create.setter + def append_create(self, value): + # type: (StorageEntity, bool) -> None + """Set append create option + :param StorageEntity self: this + :param bool value: value to set + """ + self._append_create = value + @property def is_encrypted(self): # type: (StorageEntity) -> bool @@ -303,6 +334,7 @@ def populate_from_local(self, sa, container, path, mode): self._container = container self._name = path self._mode = mode + self._from_local = True if mode == StorageModes.Append: self._client = sa.append_blob_client elif mode == StorageModes.Block: diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index 8f4c579..585ea51 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -438,6 +438,16 @@ def remote_is_page_blob(self): """ return self.entity.mode == blobxfer.models.azure.StorageModes.Page + @property + def remote_is_append_blob(self): + # type: (Descriptor) -> bool + """Remote destination is an Azure Append Blob + :param Descriptor self: this + :rtype: bool + :return: remote is an Azure Append Blob + """ + return self.entity.mode == blobxfer.models.azure.StorageModes.Append + @property def is_one_shot_block_blob(self): # type: (Descriptor) -> bool @@ -468,7 +478,8 @@ def requires_non_encrypted_md5_put(self): :rtype: bool :return: if finalize requires a put file properties """ - return not self.entity.is_encrypted and self.must_compute_md5 + return (not self.entity.is_encrypted and self.must_compute_md5 and + not self.remote_is_append_blob) @property def requires_set_file_properties_md5(self): @@ -505,7 +516,6 @@ def _initialize_encryption(self, options): :param Descriptor self: this :param blobxfer.models.options.Upload options: upload options """ - # TODO support append blobs? 
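# Illustrative sketch (not part of the patch above): with append blob support,
# the upload path creates the append blob once (only when append_create is True)
# and then appends each chunk sequentially with append_block. A minimal sketch
# against the era-appropriate azure-storage SDK; the account name/key, container,
# and blob names below are placeholder assumptions.
from azure.storage.blob import AppendBlobService

svc = AppendBlobService(account_name='myaccount', account_key='<key>')
svc.create_container('mycontainer', fail_on_exist=False)
# creation is skipped when --no-overwrite appends to an existing blob
svc.create_blob('mycontainer', 'upload.log')
svc.append_block('mycontainer', 'upload.log', b'first chunk of data')
svc.append_block('mycontainer', 'upload.log', b'second chunk of data')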
if (options.rsa_public_key is not None and self.local_path.size > 0 and (self._ase.mode == blobxfer.models.azure.StorageModes.Block or self._ase.mode == blobxfer.models.azure.StorageModes.File)): @@ -653,7 +663,8 @@ def _initialize_integrity_checkers(self, options): self.local_path.absolute_path)) self.hmac = self._ase.encryption_metadata.initialize_hmac() # both hmac and md5 can be enabled - if options.store_file_properties.md5: + if (options.store_file_properties.md5 and + not self.remote_is_append_blob): self.md5 = blobxfer.util.new_md5_hasher() def next_offsets(self): diff --git a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py index 63fd4a1..e256319 100644 --- a/blobxfer/operations/azure/blob/__init__.py +++ b/blobxfer/operations/azure/blob/__init__.py @@ -219,3 +219,34 @@ def create_container(ase, containers_created, timeout=None): logger.info( 'created blob container {} on storage account {}'.format( ase.container, ase.client.account_name)) + + +def set_blob_md5(ase, md5, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, str, int) -> None + """Set blob properties MD5 + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param str md5: md5 as base64 + :param int timeout: timeout + """ + ase.client.set_blob_properties( + container_name=ase.container, + blob_name=ase.name, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name), + content_md5=md5, + ), + timeout=timeout) + + +def set_blob_metadata(ase, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Set blob metadata + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + ase.client.set_blob_metadata( + container_name=ase.container, + blob_name=ase.name, + metadata=metadata, + timeout=timeout) diff --git a/blobxfer/operations/azure/blob/append.py b/blobxfer/operations/azure/blob/append.py index e28fcdb..abc276a 100644 --- a/blobxfer/operations/azure/blob/append.py +++ b/blobxfer/operations/azure/blob/append.py @@ -77,3 +77,18 @@ def create_blob(ase, timeout=None): content_type=blobxfer.util.get_mime_type(ase.name) ), timeout=timeout) + + +def append_block(ase, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, bytes, int) -> None + """Appends a block into remote blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param bytes data: data + :param int timeout: timeout + """ + ase.client.append_block( + container_name=ase.container, + blob_name=ase.name, + block=data, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) diff --git a/blobxfer/operations/azure/blob/page.py b/blobxfer/operations/azure/blob/page.py index 4223a30..aa92b14 100644 --- a/blobxfer/operations/azure/blob/page.py +++ b/blobxfer/operations/azure/blob/page.py @@ -98,34 +98,3 @@ def put_page(ase, page_start, page_end, data, timeout=None): end_range=page_end, validate_content=False, # integrity is enforced with HTTPS timeout=timeout) - - -def set_blob_md5(ase, md5, timeout=None): - # type: (blobxfer.models.azure.StorageEntity, str, int) -> None - """Set blob properties MD5 - :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity - :param str md5: md5 as base64 - :param int timeout: timeout - """ - ase.client.set_blob_properties( - container_name=ase.container, - blob_name=ase.name, - 
content_settings=azure.storage.blob.models.ContentSettings( - content_type=blobxfer.util.get_mime_type(ase.name), - content_md5=md5, - ), - timeout=timeout) - - -def set_blob_metadata(ase, metadata, timeout=None): - # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None - """Set blob metadata - :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity - :param dict metadata: metadata kv pairs - :param int timeout: timeout - """ - ase.client.set_blob_metadata( - container_name=ase.container, - blob_name=ase.name, - metadata=metadata, - timeout=timeout) diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index 654653f..07a9281 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -145,6 +145,8 @@ def output_parameters(general_options, spec): # TODO handle synccopy spec # common block + log.append(' log file: {}'.format( + general_options.log_file)) log.append(' resume file: {}'.format( general_options.resume_file)) log.append(' timeout: {}'.format( diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index adfcc77..b756590 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -381,6 +381,9 @@ def _process_transfer(self, ud, ase, offsets, data): blobxfer.operations.upload.Uploader.create_unique_transfer_id( ud.local_path, ase, offsets)) ud.complete_offset_upload() + # add descriptor back to upload queue only for append blobs + if ud.entity.mode == blobxfer.models.azure.StorageModes.Append: + self._upload_queue.put(ud) # update progress bar self._update_progress_bar(stdin=ud.local_path.use_stdin) @@ -397,7 +400,10 @@ def _put_data(self, ud, ase, offsets, data): """ print('UL', offsets, ase.path, len(data) if data is not None else None) if ase.mode == blobxfer.models.azure.StorageModes.Append: - raise NotImplementedError() + # append block + if data is not None: + blobxfer.operations.azure.blob.append.append_block( + ase, data, timeout=self._general_options.timeout_sec) elif ase.mode == blobxfer.models.azure.StorageModes.Block: # handle one-shot uploads if ud.is_one_shot_block_blob: @@ -467,13 +473,14 @@ def _prepare_upload(self, ase, offsets): :param blobxfer.models.upload.Offsets offsets: offsets """ if ase.mode == blobxfer.models.azure.StorageModes.Append: - # create container if necessary - blobxfer.operations.azure.blob.create_container( - ase, self._containers_created, - timeout=self._general_options.timeout_sec) - # create remote blob - blobxfer.operations.azure.blob.append.create_blob( - ase, timeout=self._general_options.timeout_sec) + if ase.append_create: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # create remote blob + blobxfer.operations.azure.blob.append.create_blob( + ase, timeout=self._general_options.timeout_sec) elif ase.mode == blobxfer.models.azure.StorageModes.Block: # create container if necessary blobxfer.operations.azure.blob.create_container( @@ -520,7 +527,7 @@ def _process_upload_descriptor(self, ud): # check if all operations completed if offsets is None and ud.all_operations_completed: # finalize file - self._finalize_file(ud) + self._finalize_upload(ud) # accounting with self._upload_lock: if ud.entity.is_encrypted: @@ -568,8 +575,9 @@ def _process_upload_descriptor(self, ud): # set new offset if stdin if newoffset is not None: offsets = newoffset - # re-enqueue for other threads to upload - self._upload_queue.put(ud) 
+ # re-enqueue for other threads to upload if not append + if ud.entity.mode != blobxfer.models.azure.StorageModes.Append: + self._upload_queue.put(ud) # no data can be returned on stdin uploads if not data: return @@ -589,73 +597,106 @@ def _process_upload_descriptor(self, ud): ) self._transfer_queue.put((ud, ase, offsets, data)) - def _finalize_file(self, ud): + def _finalize_block_blob(self, ud, metadata): + """Finalize Block blob + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + if not ud.entity.is_encrypted and ud.must_compute_md5: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + else: + digest = None + blobxfer.operations.azure.blob.block.put_block_list( + ud.entity, ud.last_block_num, digest, metadata, + timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.block.put_block_list( + ase, ud.last_block_num, digest, metadata, + timeout=self._general_options.timeout_sec) + + def _set_blob_md5(self, ud): + """Set blob MD5 + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + """ + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.blob.set_blob_md5( + ud.entity, digest, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.set_blob_md5( + ase, digest, timeout=self._general_options.timeout_sec) + + def _set_blob_metadata(self, ud, metadata): + """Set blob metadata + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + blobxfer.operations.azure.blob.set_blob_metadata( + ud.entity, metadata, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.set_blob_metadata( + ase, metadata, timeout=self._general_options.timeout_sec) + + def _finalize_nonblock_blob(self, ud, metadata): + """Finalize Non-Block blob + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + # set md5 page blob property if required + if ud.requires_non_encrypted_md5_put: + self._set_blob_md5(ud) + # set metadata if needed + if blobxfer.util.is_not_empty(metadata): + self._set_blob_metadata(ud, metadata) + + def _finalize_azure_file(self, ud, metadata): + # type: (Uploader, blobxfer.models.upload.Descriptor, dict) -> None + """Finalize Azure File + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + # set md5 file property if required + if ud.requires_non_encrypted_md5_put: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.file.set_file_md5( + ud.entity, digest, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.file.set_file_md5( + ase, digest, timeout=self._general_options.timeout_sec) + # set file metadata if needed + if blobxfer.util.is_not_empty(metadata): + blobxfer.operations.azure.file.set_file_metadata( + ud.entity, metadata, 
timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.file.set_file_metadata( + ase, metadata, + timeout=self._general_options.timeout_sec) + + def _finalize_upload(self, ud): # type: (Uploader, blobxfer.models.upload.Descriptor) -> None """Finalize file upload :param Uploader self: this - :param blobxfer.models.upload.Descriptor: upload descriptor + :param blobxfer.models.upload.Descriptor ud: upload descriptor """ metadata = ud.generate_metadata() - # put block list for non one-shot block blobs if ud.requires_put_block_list: - if not ud.entity.is_encrypted and ud.must_compute_md5: - digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) - else: - digest = None - blobxfer.operations.azure.blob.block.put_block_list( - ud.entity, ud.last_block_num, digest, metadata, - timeout=self._general_options.timeout_sec) - if blobxfer.util.is_not_empty(ud.entity.replica_targets): - for ase in ud.entity.replica_targets: - blobxfer.operations.azure.blob.block.put_block_list( - ase, ud.last_block_num, digest, metadata, - timeout=self._general_options.timeout_sec) - # page blob finalization - if ud.remote_is_page_blob: - # set md5 page blob property if required - if ud.requires_non_encrypted_md5_put: - digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) - blobxfer.operations.azure.blob.page.set_blob_md5( - ud.entity, digest, - timeout=self._general_options.timeout_sec) - if blobxfer.util.is_not_empty(ud.entity.replica_targets): - for ase in ud.entity.replica_targets: - blobxfer.operations.azure.blob.page.set_blob_md5( - ase, digest, - timeout=self._general_options.timeout_sec) - # set metadata if needed - if blobxfer.util.is_not_empty(metadata): - blobxfer.operations.azure.blob.page.set_blob_metadata( - ud.entity, metadata, - timeout=self._general_options.timeout_sec) - if blobxfer.util.is_not_empty(ud.entity.replica_targets): - for ase in ud.entity.replica_targets: - blobxfer.operations.azure.blob.page.set_blob_metadata( - ase, metadata, - timeout=self._general_options.timeout_sec) - # azure file finalization - if ud.remote_is_file: - # set md5 file property if required - if ud.requires_non_encrypted_md5_put: - digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) - blobxfer.operations.azure.file.set_file_md5( - ud.entity, digest, - timeout=self._general_options.timeout_sec) - if blobxfer.util.is_not_empty(ud.entity.replica_targets): - for ase in ud.entity.replica_targets: - blobxfer.operations.azure.file.set_file_md5( - ase, digest, - timeout=self._general_options.timeout_sec) - # set file metadata if needed - if blobxfer.util.is_not_empty(metadata): - blobxfer.operations.azure.file.set_file_metadata( - ud.entity, metadata, - timeout=self._general_options.timeout_sec) - if blobxfer.util.is_not_empty(ud.entity.replica_targets): - for ase in ud.entity.replica_targets: - blobxfer.operations.azure.file.set_file_metadata( - ase, metadata, - timeout=self._general_options.timeout_sec) + # put block list for non one-shot block blobs + self._finalize_block_blob(ud, metadata) + elif ud.remote_is_page_blob or ud.remote_is_append_blob: + # append and page blob finalization + self._finalize_nonblock_blob(ud, metadata) + elif ud.remote_is_file: + # azure file finalization + self._finalize_azure_file(ud, metadata) def _get_destination_paths(self): # type: (Uploader) -> @@ -734,10 +775,13 @@ def _check_upload_conditions(self, local_path, rfile): if not local_path.use_stdin 
and not lpath.exists(): return UploadAction.Skip # if remote file doesn't exist, upload - if rfile is None: + if rfile is None or rfile.from_local: return UploadAction.Upload # check overwrite option if not self._spec.options.overwrite: + if rfile.mode == blobxfer.models.azure.StorageModes.Append: + rfile.append_create = False + return UploadAction.Upload logger.info( 'not overwriting remote file: {} (local: {})'.format( rfile.path, lpath)) diff --git a/cli/cli.py b/cli/cli.py index 8ce121b..a5e7ab1 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -469,7 +469,8 @@ def callback(ctx, param, value): '--overwrite/--no-overwrite', expose_value=False, default=True, - help='Overwrite destination if exists [True]', + help='Overwrite destination if exists. For append blobs, ' + '--no-overwrite will append to any existing blob. [True]', callback=callback)(f) From 351eee537c0ef23c8d867a630bd68efc2ab7d72a Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 30 May 2017 12:59:39 -0700 Subject: [PATCH 37/47] useconfig upload/download support - Fix various yaml/dict config issues - Allow md5 checks through vectored io stripes --- blobxfer/models/download.py | 116 ++++++++++++++++++++--------- blobxfer/models/metadata.py | 3 +- blobxfer/operations/download.py | 74 ++++++++++++------ blobxfer/operations/md5.py | 51 +++++++++---- blobxfer/operations/upload.py | 21 +++--- cli/cli.py | 57 +++++++++++--- cli/settings.py | 128 ++++++++++++++++++-------------- 7 files changed, 297 insertions(+), 153 deletions(-) diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index e2adcbc..a197a25 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -49,7 +49,8 @@ # create logger logger = logging.getLogger(__name__) - +# global defines +_AUTO_SELECT_CHUNKSIZE_BYTES = 16777216 # named tuples Offsets = collections.namedtuple( 'Offsets', [ @@ -167,10 +168,12 @@ def __init__( # validate compatible options if not self.options.check_file_md5 and self.skip_on.md5_match: raise ValueError( - 'Cannot specify skip on MD5 match without file MD5 enabled') + 'cannot specify skip on MD5 match without file MD5 enabled') if (self.options.restore_file_attributes and not blobxfer.util.on_windows() and os.getuid() != 0): - logger.warning('Cannot set file uid/gid without root privileges') + logger.warning('cannot set file uid/gid without root privileges') + if self.options.chunk_size_bytes < 0: + raise ValueError('chunk size cannot be negative') def add_azure_source_path(self, source): # type: (Specification, blobxfer.operations.azure.SourcePath) -> None @@ -212,9 +215,14 @@ def __init__(self, lpath, ase, options, resume_mgr): self._ase = ase # set paths self.final_path = lpath - self._view = None + self.view = None + # auto-select chunk size + if options.chunk_size_bytes == 0: + chunk_size_bytes = _AUTO_SELECT_CHUNKSIZE_BYTES + else: + chunk_size_bytes = options.chunk_size_bytes + self._chunk_size = min((chunk_size_bytes, self._ase.size)) # calculate the total number of ops required for transfer - self._chunk_size = min((options.chunk_size_bytes, self._ase.size)) self._total_chunks = self._compute_total_chunks(self._chunk_size) self._outstanding_ops = self._total_chunks # initialize integrity checkers @@ -296,20 +304,23 @@ def _initialize_integrity_checkers(self, options): blobxfer.util.is_not_empty(self._ase.md5)): self.md5 = blobxfer.util.new_md5_hasher() - def _compute_allocated_size(self, size): - # type: (Descriptor, int) -> int + @staticmethod + def compute_allocated_size(size, is_encrypted): + # 
type: (int, bool) -> int """Compute allocated size on disk - :param Descriptor self: this :param int size: size (content length) + :param bool is_ecrypted: if entity is encrypted :rtype: int :return: required size on disk """ # compute size if size > 0: - if self._ase.is_encrypted: + if is_encrypted: # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs - allocatesize = (size // self._AES_BLOCKSIZE - 1) * \ - self._AES_BLOCKSIZE + allocatesize = ( + size // + blobxfer.models.download.Descriptor._AES_BLOCKSIZE - 1 + ) * blobxfer.models.download.Descriptor._AES_BLOCKSIZE else: allocatesize = size if allocatesize < 0: @@ -318,6 +329,49 @@ def _compute_allocated_size(self, size): allocatesize = 0 return allocatesize + @staticmethod + def generate_view(ase): + # type: (blobxfer.models.azure.StorageEntity) -> + # Tuple[LocalPathView, int] + """Generate local path view and total size required + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :rtype: tuple + :return: (local path view, allocation size) + """ + slicesize = blobxfer.models.download.Descriptor.compute_allocated_size( + ase.size, ase.is_encrypted) + if ase.vectored_io is None: + view = LocalPathView( + fd_start=0, + fd_end=slicesize, + ) + total_size = ase.size + else: + view = LocalPathView( + fd_start=ase.vectored_io.offset_start, + fd_end=ase.vectored_io.offset_start + slicesize, + ) + total_size = ase.vectored_io.total_size + return view, total_size + + @staticmethod + def convert_vectored_io_slice_to_final_path_name(local_path, ase): + # type: (pathlib.Path, + # blobxfer.models.azure.StorageEntity) -> pathlib.Path + """Convert vectored io slice to final path name + :param pathlib.Path local_path: local path + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :rtype: pathlib.Path + :return: converted final path + """ + name = local_path.name + name = blobxfer.models.metadata.\ + remove_vectored_io_slice_suffix_from_name( + name, ase.vectored_io.slice_id) + _tmp = list(local_path.parts[:-1]) + _tmp.append(name) + return pathlib.Path(*_tmp) + def _set_final_path_view(self): # type: (Descriptor) -> int """Set final path view and return required space on disk @@ -325,26 +379,16 @@ def _set_final_path_view(self): :rtype: int :return: required size on disk """ - slicesize = self._compute_allocated_size(self._ase.size) - if self._ase.vectored_io is None: - self._view = LocalPathView( - fd_start=0, - fd_end=slicesize, - ) - return self._ase.size - else: - name = self.final_path.name - name = blobxfer.models.metadata.\ - remove_vectored_io_slice_suffix_from_name( - name, self._ase.vectored_io.slice_id) - _tmp = list(self.final_path.parts[:-1]) - _tmp.append(name) - self.final_path = pathlib.Path(*_tmp) - self._view = LocalPathView( - fd_start=self._ase.vectored_io.offset_start, - fd_end=self._ase.vectored_io.offset_start + slicesize, - ) - return self._ase.vectored_io.total_size + # set final path if vectored io stripe + if self._ase.vectored_io is not None: + self.final_path = blobxfer.models.download.Descriptor.\ + convert_vectored_io_slice_to_final_path_name( + self.final_path, self._ase) + # generate view + view, total_size = blobxfer.models.download.Descriptor.generate_view( + self._ase) + self.view = view + return total_size def _allocate_disk_space(self): # type: (Descriptor) -> None @@ -431,12 +475,12 @@ def _resume(self): logger.debug( 'integrity checking existing file {} offset {} -> {}'.format( self.final_path, - self._view.fd_start, - self._view.fd_start + _end_offset) + self.view.fd_start, 
+ self.view.fd_start + _end_offset) ) with self._hasher_lock: with self.final_path.open('rb') as filedesc: - filedesc.seek(self._view.fd_start, 0) + filedesc.seek(self.view.fd_start, 0) while _fd_offset < _end_offset: if (_fd_offset + _blocksize) > _end_offset: _blocksize = _end_offset - _fd_offset @@ -559,7 +603,7 @@ def write_unchecked_data(self, offsets, data): self.write_data(offsets, data) unchecked = UncheckedChunk( data_len=len(data), - fd_start=self._view.fd_start + offsets.fd_start, + fd_start=self.view.fd_start + offsets.fd_start, file_path=self.final_path, temp=False, ) @@ -666,7 +710,7 @@ def write_data(self, offsets, data): if len(data) > 0: with self.final_path.open('r+b') as fd: # offset some internal view - fd.seek(self._view.fd_start + offsets.fd_start, 0) + fd.seek(self.view.fd_start + offsets.fd_start, 0) fd.write(data) def finalize_integrity(self): diff --git a/blobxfer/models/metadata.py b/blobxfer/models/metadata.py index f891696..7d5ea0d 100644 --- a/blobxfer/models/metadata.py +++ b/blobxfer/models/metadata.py @@ -221,8 +221,7 @@ def generate_vectored_io_stripe_metadata(local_path, metadata): local_path.view.total_slices, _JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID: local_path.view.slice_num, - _JSON_KEY_VECTORED_IO_STRIPE_NEXT: - explode_vectored_io_next_entry(local_path.view.next), + _JSON_KEY_VECTORED_IO_STRIPE_NEXT: local_path.view.next, } } } diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index eb01eac..3df37bc 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -89,9 +89,9 @@ def __init__(self, general_options, creds, spec): self._disk_set = set() self._disk_threads = [] self._download_start_time = None - self._download_total = None + self._download_total = 0 self._download_sofar = 0 - self._download_bytes_total = None + self._download_bytes_total = 0 self._download_bytes_sofar = 0 self._download_terminate = False self._start_time = None @@ -224,7 +224,13 @@ def _check_download_conditions(self, lpath, rfile): :return: download action """ if not lpath.exists(): - return DownloadAction.Download + if rfile.vectored_io is not None: + fpath = blobxfer.models.download.Descriptor.\ + convert_vectored_io_slice_to_final_path_name(lpath, rfile) + if not fpath.exists(): + return DownloadAction.Download + else: + return DownloadAction.Download if not self._spec.options.overwrite: logger.info( 'not overwriting local file: {} (remote: {})'.format( @@ -279,28 +285,44 @@ def _pre_md5_skip_on_check(self, lpath, rfile): pre_encrypted_content_md5 if md5 is None: md5 = rfile.md5 - slpath = str(lpath) + key = blobxfer.operations.download.Downloader.\ + create_unique_transfer_operation_id(rfile) with self._md5_meta_lock: - self._md5_map[slpath] = rfile - self._md5_offload.add_localfile_for_md5_check(slpath, md5, rfile.mode) + self._md5_map[key] = rfile + slpath = str(lpath) + # temporarily create a download descriptor view for vectored io + if rfile.vectored_io is not None: + view, _ = blobxfer.models.download.Descriptor.generate_view(rfile) + fpath = str( + blobxfer.models.download.Descriptor. 
+ convert_vectored_io_slice_to_final_path_name(lpath, rfile) + ) + else: + fpath = slpath + self._md5_offload.add_localfile_for_md5_check( + key, slpath, fpath, md5, rfile.mode, view) - def _post_md5_skip_on_check(self, filename, md5_match): - # type: (Downloader, str, bool) -> None + def _post_md5_skip_on_check(self, key, filename, size, md5_match): + # type: (Downloader, str, str, int, bool) -> None """Perform post MD5 skip on check :param Downloader self: this + :param str key: md5 map key :param str filename: local filename + :param int size: size of checked data :param bool md5_match: if MD5 matches """ with self._md5_meta_lock: - rfile = self._md5_map.pop(filename) + rfile = self._md5_map.pop(key) lpath = pathlib.Path(filename) if md5_match: + if size is None: + size = lpath.stat().st_size with self._transfer_lock: self._transfer_set.remove( blobxfer.operations.download.Downloader. create_unique_transfer_operation_id(rfile)) self._download_total -= 1 - self._download_bytes_total -= lpath.stat().st_size + self._download_bytes_total -= size else: self._add_to_download_queue(lpath, rfile) @@ -325,7 +347,8 @@ def _check_for_downloads_from_md5(self): break cv.release() if result is not None: - self._post_md5_skip_on_check(result[0], result[1]) + self._post_md5_skip_on_check( + result[0], result[1], result[2], result[3]) def _check_for_crypto_done(self): # type: (Downloader) -> None @@ -563,7 +586,7 @@ def _process_data(self, dd, offsets, data): # decrypt data if self._crypto_offload is not None: self._crypto_offload.add_decrypt_chunk( - str(dd.final_path), dd._view.fd_start, offsets, + str(dd.final_path), dd.view.fd_start, offsets, dd.entity.encryption_metadata.symmetric_key, iv, _hmac_datafile) # data will be integrity checked and written once @@ -674,16 +697,12 @@ def _run(self): self._initialize_transfer_threads() self._initialize_disk_threads() # initialize local counters - nfiles = 0 - total_size = 0 skipped_files = 0 skipped_size = 0 # iterate through source paths to download for src in self._spec.sources: for rfile in src.files( self._creds, self._spec.options, self._general_options): - nfiles += 1 - total_size += rfile.size # form local path for remote file if (not self._spec.destination.is_dir and self._spec.options.rename): @@ -702,22 +721,26 @@ def _run(self): self._transfer_set.add( blobxfer.operations.download.Downloader. create_unique_transfer_operation_id(rfile)) + self._download_total += 1 + self._download_bytes_total += rfile.size # either MD5 check or download now if action == DownloadAction.CheckMd5: self._pre_md5_skip_on_check(lpath, rfile) elif action == DownloadAction.Download: self._add_to_download_queue(lpath, rfile) - self._download_total = nfiles - skipped_files - self._download_bytes_total = total_size - skipped_size - download_size_mib = self._download_bytes_total / blobxfer.util.MEGABYTE # set remote files processed with self._md5_meta_lock: self._all_remote_files_processed = True - logger.debug( - ('{0} remote files processed, waiting for download completion ' - 'of {1:.4f} MiB').format(nfiles, download_size_mib)) - del nfiles - del total_size + with self._transfer_lock: + self._download_total -= skipped_files + self._download_bytes_total -= skipped_size + download_size_mib = ( + self._download_bytes_total / blobxfer.util.MEGABYTE + ) + logger.debug( + ('{0} remote files processed, waiting for download ' + 'completion of approx. 
{1:.4f} MiB').format( + self._download_total, download_size_mib)) del skipped_files del skipped_size # wait for downloads to complete @@ -747,6 +770,9 @@ def _run(self): # output throughput if self._download_start_time is not None: dltime = (end_time - self._download_start_time).total_seconds() + download_size_mib = ( + self._download_bytes_total / blobxfer.util.MEGABYTE + ) dlmibspeed = download_size_mib / dltime logger.info( ('elapsed download + verify time and throughput of {0:.4f} ' diff --git a/blobxfer/operations/md5.py b/blobxfer/operations/md5.py index 7239a2d..f14431f 100644 --- a/blobxfer/operations/md5.py +++ b/blobxfer/operations/md5.py @@ -47,7 +47,8 @@ _MAX_PAGE_SIZE_BYTES = 4194304 -def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): +def compute_md5_for_file_asbase64( + filename, pagealign=False, start=None, end=None, blocksize=65536): # type: (str, bool, int) -> str """Compute MD5 hash for file and encode as Base64 :param str filename: file to compute MD5 for @@ -58,7 +59,16 @@ def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): """ hasher = blobxfer.util.new_md5_hasher() with open(filename, 'rb') as filedesc: + if start is not None: + filedesc.seek(start) + curr = start + else: + curr = 0 while True: + if end is not None and curr + blocksize > end: + blocksize = end - curr + if blocksize == 0: + break buf = filedesc.read(blocksize) if not buf: break @@ -68,6 +78,7 @@ def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): if aligned != buflen: buf = buf.ljust(aligned, b'\0') hasher.update(buf) + curr += blocksize return blobxfer.util.base64_encode_as_string(hasher.digest()) @@ -120,33 +131,47 @@ def _worker_process(self): """ while not self.terminated: try: - filename, remote_md5, pagealign = self._task_queue.get( - True, 0.25) + key, lpath, fpath, remote_md5, pagealign, lpview = \ + self._task_queue.get(True, 0.1) except queue.Empty: continue + if lpview is None: + start = None + end = None + size = None + else: + start = lpview.fd_start + end = lpview.fd_end + size = end - start md5 = blobxfer.operations.md5.compute_md5_for_file_asbase64( - filename, pagealign) - logger.debug('MD5: {} {} {}'.format( - md5, remote_md5, filename)) + fpath, pagealign, start, end) + logger.debug('pre-transfer MD5 check: {} {} {}'.format( + md5, remote_md5, fpath)) self._done_cv.acquire() - self._done_queue.put((filename, md5 == remote_md5)) + self._done_queue.put((key, lpath, size, md5 == remote_md5)) self._done_cv.notify() self._done_cv.release() - def add_localfile_for_md5_check(self, filename, remote_md5, mode): - # type: (LocalFileMd5Offload, str, str, - # blobxfer.models.azure.StorageModes) -> None + def add_localfile_for_md5_check( + self, key, lpath, fpath, remote_md5, mode, lpview): + # type: (LocalFileMd5Offload, str, str, str, str, + # blobxfer.models.azure.StorageModes, object) -> None """Add a local file to MD5 check queue :param LocalFileMd5Offload self: this - :param str filename: file to compute MD5 for + :param str key: md5 map key + :param str lpath: "local" path for descriptor + :param str fpath: "final" path for/where file :param str remote_md5: remote MD5 to compare against :param blobxfer.models.azure.StorageModes mode: mode + :param object lpview: local path view """ if blobxfer.util.is_none_or_empty(remote_md5): raise ValueError('comparison MD5 is empty for file {}'.format( - filename)) + lpath)) if mode == blobxfer.models.azure.StorageModes.Page: pagealign = True else: pagealign = False - 
self._task_queue.put((filename, remote_md5, pagealign)) + self._task_queue.put( + (key, lpath, fpath, remote_md5, pagealign, lpview) + ) diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index b756590..6bf9761 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -226,26 +226,26 @@ def _pre_md5_skip_on_check(self, src, rfile): pre_encrypted_content_md5 if md5 is None: md5 = rfile.md5 - slpath = str(src.absolute_path) + key = blobxfer.operations.upload.Uploader.create_unique_id(src, rfile) with self._md5_meta_lock: - self._md5_map[slpath] = (src, rfile) - self._md5_offload.add_localfile_for_md5_check(slpath, md5, rfile.mode) + self._md5_map[key] = (src, rfile) + self._md5_offload.add_localfile_for_md5_check( + key, None, str(src.absolute_path), md5, rfile.mode, src.view) - def _post_md5_skip_on_check(self, filename, md5_match): + def _post_md5_skip_on_check(self, key, md5_match): # type: (Uploader, str, bool) -> None """Perform post MD5 skip on check :param Uploader self: this - :param str filename: local filename + :param str key: md5 map key :param bool md5_match: if MD5 matches """ with self._md5_meta_lock: - src, rfile = self._md5_map.pop(filename) + src, rfile = self._md5_map.pop(key) uid = blobxfer.operations.upload.Uploader.create_unique_id(src, rfile) if md5_match: with self._upload_lock: self._upload_set.remove(uid) self._upload_total -= 1 - self._upload_bytes_total -= src.size else: self._add_to_upload_queue(src, rfile, uid) @@ -270,7 +270,7 @@ def _check_for_uploads_from_md5(self): break cv.release() if result is not None: - self._post_md5_skip_on_check(result[0], result[1]) + self._post_md5_skip_on_check(result[0], result[3]) def _add_to_upload_queue(self, src, rfile, uid): # type: (Uploader, blobxfer.models.upload.LocalPath, @@ -398,7 +398,6 @@ def _put_data(self, ud, ase, offsets, data): :param blobxfer.models.upload.Offsets offsets: offsets :param bytes data: data to upload """ - print('UL', offsets, ase.path, len(data) if data is not None else None) if ase.mode == blobxfer.models.azure.StorageModes.Append: # append block if data is not None: @@ -875,9 +874,9 @@ def _generate_destination_for_source(self, local_path): spath = pathlib.Path(*_rparts[_strip:]) # create a storage entity for each destination for sa, cont, name, dpath in self._get_destination_paths(): - # apply rename + # if not renaming, form name from with spath if not self._spec.options.rename: - name = str(spath / name) + name = str(name / spath) if blobxfer.util.is_none_or_empty(name): raise ValueError( ('invalid destination, must specify a container or ' diff --git a/cli/cli.py b/cli/cli.py index a5e7ab1..7375891 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -93,12 +93,14 @@ def _init_config(self): if blobxfer.util.is_not_empty(self.yaml_config): self.yaml_config = pathlib.Path(self.yaml_config) self._read_yaml_file(self.yaml_config) - # merge cli options with config - settings.merge_settings(self.config, self.cli_options) + else: + # merge cli options with config + settings.merge_settings(self.config, self.cli_options) # set log file if specified - blobxfer.util.setup_logger(logger, self.config['options']['log_file']) + blobxfer.util.setup_logger( + logger, self.config['options'].get('log_file', None)) # output config - if self.config['options']['verbose']: + if self.config['options'].get('verbose', False): blobxfer.util.set_verbose_logger_handlers() logger.debug('config: \n' + json.dumps(self.config, indent=4)) # free mem @@ -174,7 +176,7 @@ def 
callback(ctx, param, value): '--progress-bar/--no-progress-bar', expose_value=False, default=True, - help='Display progress bar instead of console logs', + help='Display progress bar instead of console logs [True]', callback=callback)(f) @@ -331,7 +333,7 @@ def callback(ctx, param, value): '--chunk-size-bytes', expose_value=False, type=int, - default=4194304, + default=0, help='Block or chunk size in bytes; set to 0 for auto-select ' 'on upload [0]', callback=callback)(f) @@ -794,19 +796,54 @@ def upload(ctx, local_resource, storage_account, remote_path): @cli.group() @pass_cli_context def useconfig(ctx): - """Use config file for transfer""" + """Use yaml configuration file for transfer""" pass -@useconfig.command('upload') +@useconfig.command('download') +@config_arguments +@common_options +@pass_cli_context +def useconfig_download(ctx, config): + """Download blobs or files from Azure Storage via yaml configuration""" + settings.add_cli_options( + ctx.cli_options, settings.TransferAction.Download, None, None, None) + ctx.initialize() + specs = settings.create_download_specifications(ctx.config) + for spec in specs: + blobxfer.api.Downloader( + ctx.general_options, ctx.credentials, spec + ).start() + + +@useconfig.command('synccopy') @config_arguments @common_options @pass_cli_context -def useconfig_upload(ctx): - """Upload files to Azure File Storage""" +def useconfig_synccopy(ctx, config): + """Synchronously copy blobs between Azure Storage accounts via yaml + configuration""" + settings.add_cli_options( + ctx.cli_options, settings.TransferAction.Synccopy, None, None, None) ctx.initialize() raise NotImplementedError() +@useconfig.command('upload') +@config_arguments +@common_options +@pass_cli_context +def useconfig_upload(ctx, config): + """Upload files to Azure Storage via yaml configuration""" + settings.add_cli_options( + ctx.cli_options, settings.TransferAction.Upload, None, None, None) + ctx.initialize() + specs = settings.create_upload_specifications(ctx.config) + for spec in specs: + blobxfer.api.Uploader( + ctx.general_options, ctx.credentials, spec + ).start() + + if __name__ == '__main__': cli() diff --git a/cli/settings.py b/cli/settings.py index d198359..5911719 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -219,15 +219,22 @@ def merge_settings(config, cli_options): # merge general options if 'options' not in config: config['options'] = {} - config['options']['crypto_processes'] = cli_options['crypto_processes'] config['options']['log_file'] = cli_options['log_file'] - config['options']['md5_processes'] = cli_options['md5_processes'] config['options']['progress_bar'] = cli_options['progress_bar'] config['options']['resume_file'] = cli_options['resume_file'] config['options']['timeout_sec'] = cli_options['timeout'] - config['options']['disk_threads'] = cli_options['disk_threads'] - config['options']['transfer_threads'] = cli_options['transfer_threads'] config['options']['verbose'] = cli_options['verbose'] + # merge concurrency options + if 'concurrency' not in config['options']: + config['options']['concurrency'] = {} + config['options']['concurrency']['crypto_processes'] = \ + cli_options['crypto_processes'] + config['options']['concurrency']['disk_threads'] = \ + cli_options['disk_threads'] + config['options']['concurrency']['md5_processes'] = \ + cli_options['md5_processes'] + config['options']['concurrency']['transfer_threads'] = \ + cli_options['transfer_threads'] def create_azure_storage_credentials(config, general_options): @@ -254,18 +261,19 @@ def 
create_general_options(config): :rtype: blobxfer.models.options.General :return: general options object """ + conc = config['options'].get('concurrency', {}) return blobxfer.models.options.General( concurrency=blobxfer.models.options.Concurrency( - crypto_processes=config['options']['crypto_processes'], - disk_threads=config['options']['disk_threads'], - md5_processes=config['options']['md5_processes'], - transfer_threads=config['options']['transfer_threads'], + crypto_processes=conc.get('crypto_processes', 0), + disk_threads=conc.get('disk_threads', 0), + md5_processes=conc.get('md5_processes', 0), + transfer_threads=conc.get('transfer_threads', 0), ), - log_file=config['options']['log_file'], - progress_bar=config['options']['progress_bar'], - resume_file=config['options']['resume_file'], - timeout_sec=config['options']['timeout_sec'], - verbose=config['options']['verbose'], + log_file=config['options'].get('log_file', None), + progress_bar=config['options'].get('progress_bar', True), + resume_file=config['options'].get('resume_file', None), + timeout_sec=config['options'].get('timeout_sec', None), + verbose=config['options'].get('verbose', False), ) @@ -279,7 +287,7 @@ def create_download_specifications(config): specs = [] for conf in config['download']: # create download options - confmode = conf['options']['mode'].lower() + confmode = conf['options'].get('mode', 'auto').lower() if confmode == 'auto': mode = blobxfer.models.azure.StorageModes.Auto elif confmode == 'append': @@ -293,32 +301,33 @@ def create_download_specifications(config): else: raise ValueError('unknown mode: {}'.format(confmode)) # load RSA private key PEM file if specified - rpk = conf['options']['rsa_private_key'] + rpk = conf['options'].get('rsa_private_key', None) if blobxfer.util.is_not_empty(rpk): - rpkp = conf['options']['rsa_private_key_passphrase'] + rpkp = conf['options'].get('rsa_private_key_passphrase', None) rpk = blobxfer.operations.crypto.load_rsa_private_key_file( rpk, rpkp) else: rpk = None # create specification + sod = conf['options'].get('skip_on', {}) ds = blobxfer.models.download.Specification( download_options=blobxfer.models.options.Download( - check_file_md5=conf['options']['check_file_md5'], - chunk_size_bytes=conf['options']['chunk_size_bytes'], - delete_extraneous_destination=conf[ - 'options']['delete_extraneous_destination'], + check_file_md5=conf['options'].get('check_file_md5', False), + chunk_size_bytes=conf['options'].get('chunk_size_bytes', 0), + delete_extraneous_destination=conf['options'].get( + 'delete_extraneous_destination', False), mode=mode, - overwrite=conf['options']['overwrite'], - recursive=conf['options']['recursive'], - rename=conf['options']['rename'], + overwrite=conf['options'].get('overwrite', True), + recursive=conf['options'].get('recursive', True), + rename=conf['options'].get('rename', False), restore_file_attributes=conf[ - 'options']['restore_file_attributes'], + 'options'].get('restore_file_attributes', False), rsa_private_key=rpk, ), skip_on_options=blobxfer.models.options.SkipOn( - filesize_match=conf['options']['skip_on']['filesize_match'], - lmt_ge=conf['options']['skip_on']['lmt_ge'], - md5_match=conf['options']['skip_on']['md5_match'], + filesize_match=sod.get('filesize_match', False), + lmt_ge=sod.get('lmt_ge', False), + md5_match=sod.get('md5_match', False), ), local_destination_path=blobxfer.models.download. 
LocalDestinationPath( @@ -333,10 +342,12 @@ def create_download_specifications(config): sa = next(iter(src)) asp = blobxfer.operations.azure.SourcePath() asp.add_path_with_storage_account(src[sa], sa) - if blobxfer.util.is_not_empty(conf['include']): - asp.add_includes(conf['include']) - if blobxfer.util.is_not_empty(conf['exclude']): - asp.add_excludes(conf['exclude']) + incl = conf.get('include', None) + if blobxfer.util.is_not_empty(incl): + asp.add_includes(incl) + excl = conf.get('exclude', None) + if blobxfer.util.is_not_empty(excl): + asp.add_excludes(excl) ds.add_azure_source_path(asp) # append spec to list specs.append(ds) @@ -353,7 +364,7 @@ def create_upload_specifications(config): specs = [] for conf in config['upload']: # create upload options - confmode = conf['options']['mode'].lower() + confmode = conf['options'].get('mode', 'auto').lower() if confmode == 'auto': mode = blobxfer.models.azure.StorageModes.Auto elif confmode == 'append': @@ -367,14 +378,14 @@ def create_upload_specifications(config): else: raise ValueError('unknown mode: {}'.format(confmode)) # load RSA public key PEM if specified - rpk = conf['options']['rsa_public_key'] + rpk = conf['options'].get('rsa_public_key', None) if blobxfer.util.is_not_empty(rpk): rpk = blobxfer.operations.crypto.load_rsa_public_key_file(rpk) if rpk is None: # load RSA private key PEM file if specified - rpk = conf['options']['rsa_private_key'] + rpk = conf['options'].get('rsa_private_key', None) if blobxfer.util.is_not_empty(rpk): - rpkp = conf['options']['rsa_private_key_passphrase'] + rpkp = conf['options'].get('rsa_private_key_passphrase', None) rpk = blobxfer.operations.crypto.load_rsa_private_key_file( rpk, rpkp) rpk = rpk.public_key() @@ -383,41 +394,44 @@ def create_upload_specifications(config): # create local source paths lsp = blobxfer.models.upload.LocalSourcePath() lsp.add_paths(conf['source']) - if blobxfer.util.is_not_empty(conf['include']): - lsp.add_includes(conf['include']) - if blobxfer.util.is_not_empty(conf['exclude']): - lsp.add_excludes(conf['exclude']) + incl = conf.get('include', None) + if blobxfer.util.is_not_empty(incl): + lsp.add_includes(incl) + excl = conf.get('exclude', None) + if blobxfer.util.is_not_empty(excl): + lsp.add_excludes(excl) # create specification + sfp = conf['options'].get('store_file_properties', {}) + vio = conf['options'].get('vectored_io', {}) + sod = conf['options'].get('skip_on', {}) us = blobxfer.models.upload.Specification( upload_options=blobxfer.models.options.Upload( - chunk_size_bytes=conf['options']['chunk_size_bytes'], - delete_extraneous_destination=conf[ - 'options']['delete_extraneous_destination'], + chunk_size_bytes=conf['options'].get('chunk_size_bytes', 0), + delete_extraneous_destination=conf['options'].get( + 'delete_extraneous_destination', False), mode=mode, - one_shot_bytes=conf['options']['one_shot_bytes'], - overwrite=conf['options']['overwrite'], - recursive=conf['options']['recursive'], - rename=conf['options']['rename'], + one_shot_bytes=conf['options'].get('one_shot_bytes', 0), + overwrite=conf['options'].get('overwrite', True), + recursive=conf['options'].get('recursive', True), + rename=conf['options'].get('rename', False), rsa_public_key=rpk, store_file_properties=blobxfer.models.options.FileProperties( - attributes=conf[ - 'options']['store_file_properties']['attributes'], - md5=conf['options']['store_file_properties']['md5'], + attributes=sfp.get('attributes', False), + md5=sfp.get('md5', False), ), - 
strip_components=conf['options']['strip_components'], + strip_components=conf['options'].get('strip_components', 1), vectored_io=blobxfer.models.options.VectoredIo( - stripe_chunk_size_bytes=conf[ - 'options']['vectored_io']['stripe_chunk_size_bytes'], + stripe_chunk_size_bytes=vio.get( + 'stripe_chunk_size_bytes', 1073741824), distribution_mode=blobxfer. models.upload.VectoredIoDistributionMode( - conf['options']['vectored_io'][ - 'distribution_mode'].lower()), + vio.get('distribution_mode', 'disabled').lower()), ), ), skip_on_options=blobxfer.models.options.SkipOn( - filesize_match=conf['options']['skip_on']['filesize_match'], - lmt_ge=conf['options']['skip_on']['lmt_ge'], - md5_match=conf['options']['skip_on']['md5_match'], + filesize_match=sod.get('filesize_match', False), + lmt_ge=sod.get('lmt_ge', False), + md5_match=sod.get('md5_match', False), ), local_source_path=lsp, ) From 4e0ee1b21f2da8950410ff41bd8924d86b14bf6b Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 30 May 2017 15:53:18 -0700 Subject: [PATCH 38/47] Fix packaging issues --- cli/cli.py | 4 ++-- setup.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cli/cli.py b/cli/cli.py index 7375891..bf397b0 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -41,7 +41,7 @@ import blobxfer.api import blobxfer.util # local imports -import settings +from . import settings # create logger logger = logging.getLogger('blobxfer') @@ -736,7 +736,7 @@ def config_arguments(f): @click.version_option(version=blobxfer.__version__) @click.pass_context def cli(ctx): - """Blobxfer-CLI: Azure Storage transfer tool""" + """Blobxfer: Azure Storage transfer tool""" pass diff --git a/setup.py b/setup.py index c61be64..212597c 100644 --- a/setup.py +++ b/setup.py @@ -39,10 +39,9 @@ ] install_requires = [ - 'azure-common==1.1.6', 'azure-storage==0.34.2', 'click==6.7', - 'cryptography>=1.8.2', + 'cryptography>=1.9', 'future==0.16.0', 'python-dateutil==2.6.0', 'requests==2.14.2', @@ -61,8 +60,7 @@ version=version, author='Microsoft Corporation, Azure Batch and HPC Team', author_email='', - description=( - 'Azure storage transfer tool and library with AzCopy-like features'), + description='Azure storage transfer tool and library', long_description=long_description, platforms='any', url='https://github.com/Azure/blobxfer', From a04a5724bc6ff5b810a1901881121a943a536d5f Mon Sep 17 00:00:00 2001 From: Fred Park Date: Tue, 30 May 2017 19:58:11 -0700 Subject: [PATCH 39/47] Upload resume support --- blobxfer/models/resume.py | 139 ++++++++++++++++++++++++++++++++++ blobxfer/models/upload.py | 133 ++++++++++++++++++++++++++++++-- blobxfer/operations/resume.py | 100 +++++++++++++++++++----- blobxfer/operations/upload.py | 9 ++- setup.py | 1 + 5 files changed, 352 insertions(+), 30 deletions(-) diff --git a/blobxfer/models/resume.py b/blobxfer/models/resume.py index aa8b9da..a0108cb 100644 --- a/blobxfer/models/resume.py +++ b/blobxfer/models/resume.py @@ -158,3 +158,142 @@ def __repr__(self): self.next_integrity_chunk, self.completed, self.md5hexdigest, ) + + +class Upload(object): + """Upload resume object""" + def __init__( + self, local_path, length, chunk_size, total_chunks, + completed_chunks, completed, md5): + # type: (Upload, str, int, int, int, int, bool, str) -> None + """Ctor for Upload + :param Upload self: this + :param str local_path: local path + :param int length: total bytes + :param int chunk_size: chunk size in bytes + :param int total_chunks: total chunks + :param int completed_chunks: completed chunks + :param bool 
completed: completed + :param str md5: md5 hex digest + """ + self._local_path = local_path + self._length = length + self._chunk_size = chunk_size + self._total_chunks = total_chunks + self._completed_chunks = completed_chunks + self._completed = completed + self._md5hexdigest = md5 if md5 is not None else None + + @property + def local_path(self): + # type: (Upload) -> str + """Local path + :param Upload self: this + :rtype: str + :return: local path + """ + return self._local_path + + @property + def length(self): + # type: (Upload) -> int + """Content length + :param Upload self: this + :rtype: int + :return: number of bytes + """ + return self._length + + @property + def chunk_size(self): + # type: (Upload) -> int + """Chunk size + :param Upload self: this + :rtype: int + :return: chunk size in bytes + """ + return self._chunk_size + + @property + def total_chunks(self): + # type: (Upload) -> int + """Get total number of chunks + :param Upload self: this + :rtype: int + :return: total chunks + """ + return self._total_chunks + + @property + def completed_chunks(self): + # type: (Upload) -> int + """Get Completed chunks + :param Upload self: this + :rtype: int + :return: completed chunks + """ + return self._completed_chunks + + @completed_chunks.setter + def completed_chunks(self, value): + # type: (Upload, int) -> None + """Set Completed chunks + :param Upload self: this + :param int value: completed chunks + """ + self._completed_chunks = value + + @property + def completed(self): + # type: (Upload) -> bool + """Get Completed + :param Upload self: this + :rtype: bool + :return: if completed + """ + return self._completed + + @completed.setter + def completed(self, value): + # type: (Upload, bool) -> None + """Set Completed + :param Upload self: this + :param bool value: completion value + """ + self._completed = value + + @property + def md5hexdigest(self): + # type: (Upload) -> str + """Get md5 hex digest + :param Upload self: this + :rtype: str + :return: md5 hex digest + """ + return self._md5hexdigest + + @md5hexdigest.setter + def md5hexdigest(self, value): + # type: (Upload, str) -> None + """Set md5 hex digest value if value is not None + :param Upload self: this + :param str value: md5 hex digest + """ + if value is None: + return + self._md5hexdigest = value + + def __repr__(self): + # type: (Upload) -> str + """Return representation + :param Upload self: this + :rtype: str + :return: representation string + """ + return ('Upload<local_path={} length={} chunk_size={} total_chunks={} completed_chunks={} completed={} md5={}>').format( + self.local_path, self.length, self.chunk_size, + self.total_chunks, self.completed_chunks, self.completed, + self.md5hexdigest, + ) diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index 585ea51..d411bb0 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -42,6 +42,7 @@ import pathlib import threading # non-stdlib imports +import bitstring # local imports import blobxfer.models import blobxfer.models.azure @@ -57,6 +58,7 @@ _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304 _MAX_NUM_CHUNKS = 50000 _DEFAULT_AUTO_CHUNKSIZE_BYTES = 16777216 +_MAX_MD5_CACHE_RESUME_ENTRIES = 25 # named tuples @@ -360,6 +362,10 @@ def __init__(self, lpath, ase, uid, options, resume_mgr): self._outstanding_ops = self._total_chunks if blobxfer.util.is_not_empty(self._ase.replica_targets): self._outstanding_ops *= len(self._ase.replica_targets) + if self._resume_mgr: + self._completed_chunks = bitstring.BitArray( + length=self._total_chunks) + self._md5_cache = {} # initialize integrity checkers self.hmac = None self.md5 = None @@ -416,7 
+422,8 @@ def is_resumable(self): :rtype: bool :return: if resumable """ - return self._resume_mgr is not None and self.hmac is None + return (self._resume_mgr is not None and self.hmac is None and + not self.remote_is_append_blob) @property def remote_is_file(self): @@ -492,14 +499,40 @@ def requires_set_file_properties_md5(self): return (not self.entity.is_encrypted and self.must_compute_md5 and self.remote_is_file) - def complete_offset_upload(self): - # type: (Descriptor) -> None + def complete_offset_upload(self, chunk_num): + # type: (Descriptor, int) -> None """Complete the upload for the offset :param Descriptor self: this + :param int chunk_num: chunk num completed """ with self._meta_lock: self._outstanding_ops -= 1 - # TODO save resume state + # save resume state + if self.is_resumable: + self._completed_chunks.set(True, chunk_num) + completed = self._outstanding_ops == 0 + if not completed and self.must_compute_md5: + last_consecutive = ( + self._completed_chunks.find('0b0')[0] - 1 + ) + md5digest = self._md5_cache[last_consecutive] + else: + md5digest = None + if completed: + last_consecutive = None + self._md5_cache.clear() + self._resume_mgr.add_or_update_record( + self.local_path.absolute_path, self._ase, self._chunk_size, + self._total_chunks, self._completed_chunks.int, completed, + md5digest, + ) + # prune md5 cache + if len(self._md5_cache) > _MAX_MD5_CACHE_RESUME_ENTRIES: + mkeys = sorted(list(self._md5_cache.keys())) + for key in mkeys: + if key >= last_consecutive: + break + self._md5_cache.pop(key) def hmac_data(self, data): # type: (Descriptor, bytes) -> None @@ -667,6 +700,92 @@ def _initialize_integrity_checkers(self, options): not self.remote_is_append_blob): self.md5 = blobxfer.util.new_md5_hasher() + def _resume(self): + if self._resume_mgr is None or self._offset > 0: + return None + # check if path exists in resume db + rr = self._resume_mgr.get_record(self._ase) + if rr is None: + logger.debug('no resume record for {}'.format(self._ase.path)) + return None + # ensure lengths are the same + if rr.length != self._ase.size: + logger.warning('resume length mismatch {} -> {}'.format( + rr.length, self._ase.size)) + return None + # set offsets if completed + if rr.completed: + with self._meta_lock: + logger.debug('{} upload already completed'.format( + self._ase.path)) + self._offset = rr.total_chunks * rr.chunk_size + self._chunk_num = rr.total_chunks + self._chunk_size = rr.chunk_size + self._total_chunks = rr.total_chunks + self._completed_chunks.int = rr.completed_chunks + self._outstanding_ops = 0 + return self._ase.size + # encrypted files are not resumable due to hmac requirement + if self._ase.is_encrypted: + logger.debug('cannot resume encrypted entity {}'.format( + self._ase.path)) + return None + # check if path exists + if not pathlib.Path(rr.local_path).exists(): + logger.warning('resume from local path {} does not exist'.format( + rr.local_path)) + return None + # re-hash from 0 to offset if needed + _cc = bitstring.BitArray(length=rr.total_chunks) + _cc.int = rr.completed_chunks + curr_chunk = _cc.find('0b0')[0] + del _cc + _fd_offset = 0 + _end_offset = min((curr_chunk * rr.chunk_size, rr.length)) + if self.md5 is not None and curr_chunk > 0: + _blocksize = blobxfer.util.MEGABYTE << 2 + logger.debug( + 'integrity checking existing file {} offset {} -> {}'.format( + self._ase.path, + self.local_path.view.fd_start, + self.local_path.view.fd_start + _end_offset) + ) + with self._hasher_lock: + with self.local_path.absolute_path.open('rb') as 
filedesc: + filedesc.seek(self.local_path.view.fd_start, 0) + while _fd_offset < _end_offset: + if (_fd_offset + _blocksize) > _end_offset: + _blocksize = _end_offset - _fd_offset + _buf = filedesc.read(_blocksize) + self.md5.update(_buf) + _fd_offset += _blocksize + del _blocksize + # compare hashes + hexdigest = self.md5.hexdigest() + if rr.md5hexdigest != hexdigest: + logger.warning( + 'MD5 mismatch resume={} computed={} for {}'.format( + rr.md5hexdigest, hexdigest, self._ase.path)) + # reset hasher + self.md5 = blobxfer.util.new_md5_hasher() + return None + # set values from resume + with self._meta_lock: + self._offset = _end_offset + self._chunk_num = curr_chunk + self._chunk_size = rr.chunk_size + self._total_chunks = rr.total_chunks + self._completed_chunks = bitstring.BitArray(length=rr.total_chunks) + self._completed_chunks.set(True, range(0, curr_chunk + 1)) + self._outstanding_ops = rr.total_chunks - curr_chunk + logger.debug( + ('resuming file {} from byte={} chunk={} chunk_size={} ' + 'total_chunks={} outstanding_ops={}').format( + self._ase.path, self._offset, self._chunk_num, + self._chunk_size, self._total_chunks, + self._outstanding_ops)) + return _end_offset + def next_offsets(self): # type: (Descriptor) -> Offsets """Retrieve the next offsets @@ -674,9 +793,7 @@ def next_offsets(self): :rtype: Offsets :return: upload offsets """ - # TODO RESUME - resume_bytes = None -# resume_bytes = self._resume() + resume_bytes = self._resume() with self._meta_lock: if self._chunk_num >= self._total_chunks: return None, resume_bytes @@ -744,6 +861,8 @@ def read_data(self, offsets): if self.must_compute_md5 and data: with self._hasher_lock: self.md5.update(data) + if self.is_resumable: + self._md5_cache[self._chunk_num - 1] = self.md5.hexdigest() return data, newoffset def generate_metadata(self): diff --git a/blobxfer/operations/resume.py b/blobxfer/operations/resume.py index 97e37e4..0458bec 100644 --- a/blobxfer/operations/resume.py +++ b/blobxfer/operations/resume.py @@ -44,12 +44,11 @@ logger = logging.getLogger(__name__) -class DownloadResumeManager(): - """Download Resume Manager""" +class _BaseResumeManager(): def __init__(self, resume_file): - # type: (DownloadResumeManager, str) -> None - """Ctor for DownloadResumeManager - :param DownloadResumeManager self: this + # type: (_BaseResumeManager, str) -> None + """Ctor for _BaseResumeManager + :param _BaseResumeManager self: this :param pathlib.Path resume_file: resume file """ self._lock = threading.Lock() @@ -58,18 +57,18 @@ def __init__(self, resume_file): str(resume_file), protocol=pickle.HIGHEST_PROTOCOL) def close(self): - # type: (DownloadResumeManager) -> None + # type: (_BaseResumeManager) -> None """Close the internal data store - :param DownloadResumeManager self: this + :param _BaseResumeManager self: this """ if self._data is not None: self._data.close() self._data = None def delete(self): - # type: (DownloadResumeManager) -> None + # type: (_BaseResumeManager) -> None """Delete the resume file db - :param DownloadResumeManager self: this + :param _BaseResumeManager self: this """ self.close() try: @@ -79,9 +78,9 @@ def delete(self): @contextlib.contextmanager def datalock(self, acquire=True): - # type: (DownloadResumeManager) -> None + # type: (_BaseResumeManager) -> None """Delete the resume file db - :param DownloadResumeManager self: this + :param _BaseResumeManager self: this :param bool acquire: acquire lock """ if acquire: @@ -103,18 +102,18 @@ def generate_record_key(ase): return 
'{}:{}'.format(ase._client.primary_endpoint, ase.path) def get_record(self, ase, key=None, lock=True): - # type: (DownloadResumeManager, str, - # bool) -> blobxfer.models.resume.Download + # type: (_BaseResumeManager, str, + # bool) -> object """Get a resume record - :param DownloadResumeManager self: this + :param _BaseResumeManager self: this :param blobxfer.models.azure.StorageEntity ase: Storage Entity :param str key: record key :param bool lock: acquire lock - :rtype: blobxfer.models.resume.Download - :return: Download record + :rtype: blobxfer.models.resume._Base + :return: _Base record """ if key is None: - key = blobxfer.operations.resume.DownloadResumeManager.\ + key = blobxfer.operations.resume._BaseResumeManager.\ generate_record_key(ase) with self.datalock(lock): try: @@ -122,13 +121,24 @@ def get_record(self, ase, key=None, lock=True): except KeyError: return None + +class DownloadResumeManager(_BaseResumeManager): + """Download Resume Manager""" + def __init__(self, resume_file): + # type: (DownloadResumeManager, str) -> None + """Ctor for DownloadResumeManager + :param DownloadResumeManager self: this + :param pathlib.Path resume_file: resume file + """ + super(DownloadResumeManager, self).__init__(resume_file) + def add_or_update_record( self, final_path, ase, chunk_size, next_integrity_chunk, completed, md5): # type: (DownloadResumeManager, pathlib.Path, # blobxfer.models.azure.StorageEntity, int, int, bool, # str) -> None - """Get a resume record + """Add or update a resume record :param DownloadResumeManager self: this :param pathlib.Path final_path: final path :param blobxfer.models.azure.StorageEntity ase: Storage Entity @@ -137,7 +147,7 @@ def add_or_update_record( :param bool completed: if completed :param str md5: md5 hex digest """ - key = blobxfer.operations.resume.DownloadResumeManager.\ + key = blobxfer.operations.resume._BaseResumeManager.\ generate_record_key(ase) with self.datalock(): dl = self.get_record(ase, key=key, lock=False) @@ -161,3 +171,55 @@ def add_or_update_record( dl.md5hexdigest = md5 self._data[key] = dl self._data.sync() + + +class UploadResumeManager(_BaseResumeManager): + """Upload Resume Manager""" + def __init__(self, resume_file): + # type: (UploadResumeManager, str) -> None + """Ctor for UploadResumeManager + :param UploadResumeManager self: this + :param pathlib.Path resume_file: resume file + """ + super(UploadResumeManager, self).__init__(resume_file) + + def add_or_update_record( + self, local_path, ase, chunk_size, total_chunks, completed_chunks, + completed, md5): + # type: (UploadResumeManager, pathlib.Path, + # blobxfer.models.azure.StorageEntity, int, int, int, bool, + # str) -> None + """Add or update a resume record + :param UploadResumeManager self: this + :param pathlib.Path local_path: local path + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :param int chunk_size: chunk size in bytes + :param int total_chunks: total chunks + :param int completed_chunks: completed chunks bitarray + :param bool completed: if completed + :param str md5: md5 hex digest + """ + key = blobxfer.operations.resume._BaseResumeManager.\ + generate_record_key(ase) + with self.datalock(): + ul = self.get_record(ase, key=key, lock=False) + if ul is None: + ul = blobxfer.models.resume.Upload( + local_path=str(local_path), + length=ase._size, + chunk_size=chunk_size, + total_chunks=total_chunks, + completed_chunks=completed_chunks, + completed=completed, + md5=md5, + ) + else: + if ul.completed or completed_chunks == 
ul.completed_chunks: + return + if completed: + ul.completed = completed + else: + ul.completed_chunks = completed_chunks + ul.md5hexdigest = md5 + self._data[key] = ul + self._data.sync() diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 6bf9761..a946d7c 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -380,7 +380,7 @@ def _process_transfer(self, ud, ase, offsets, data): self._transfer_set.remove( blobxfer.operations.upload.Uploader.create_unique_transfer_id( ud.local_path, ase, offsets)) - ud.complete_offset_upload() + ud.complete_offset_upload(offsets.chunk_num) # add descriptor back to upload queue only for append blobs if ud.entity.mode == blobxfer.models.azure.StorageModes.Append: self._upload_queue.put(ud) @@ -519,6 +519,7 @@ def _process_upload_descriptor(self, ud): # add resume bytes to counter if resume_bytes is not None: with self._transfer_lock: + self._upload_bytes_total += ud.entity.size self._upload_bytes_sofar += resume_bytes logger.debug('adding {} sofar {} from {}'.format( resume_bytes, self._upload_bytes_sofar, ud._ase.name)) @@ -1007,9 +1008,9 @@ def _run(self): self._start_time = blobxfer.util.datetime_now() logger.info('blobxfer start time: {0}'.format(self._start_time)) # initialize resume db if specified -# if self._general_options.resume_file is not None: -# self._resume = blobxfer.operations.resume.DownloadResumeManager( -# self._general_options.resume_file) + if self._general_options.resume_file is not None: + self._resume = blobxfer.operations.resume.UploadResumeManager( + self._general_options.resume_file) # initialize MD5 processes if ((self._spec.options.store_file_properties.md5 or self._spec.skip_on.md5_match) and diff --git a/setup.py b/setup.py index 212597c..1f36502 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ install_requires = [ 'azure-storage==0.34.2', + 'bitstring==3.1.5', 'click==6.7', 'cryptography>=1.9', 'future==0.16.0', From e308ed85953b21870a66fc52388dabd49bad72d6 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Wed, 31 May 2017 21:01:37 -0700 Subject: [PATCH 40/47] Fix existing tests - Fix various issues uncovered from UTs --- blobxfer/models/__init__.py | 26 +- blobxfer/models/download.py | 4 +- blobxfer/models/options.py | 18 +- blobxfer/operations/azure/__init__.py | 2 +- blobxfer/operations/download.py | 6 +- blobxfer/operations/progress.py | 4 +- blobxfer/operations/resume.py | 16 +- blobxfer/operations/upload.py | 5 +- blobxfer/retry.py | 32 ++- blobxfer/util.py | 2 +- cli/cli.py | 6 +- test_requirements.txt | 6 +- tests/test_blobxfer_models_azure.py | 2 +- tests/test_blobxfer_models_download.py | 120 ++++----- tests/test_blobxfer_models_options.py | 30 ++- tests/test_blobxfer_models_resume.py | 3 +- tests/test_blobxfer_models_upload.py | 13 +- tests/test_blobxfer_operations_azure.py | 4 +- tests/test_blobxfer_operations_crypto.py | 4 +- tests/test_blobxfer_operations_download.py | 269 +++++++++++++++------ tests/test_blobxfer_operations_md5.py | 24 +- tests/test_blobxfer_operations_progress.py | 4 +- tests/test_blobxfer_operations_resume.py | 25 +- tests/test_blobxfer_retry.py | 20 +- tox.ini | 4 +- 25 files changed, 403 insertions(+), 246 deletions(-) diff --git a/blobxfer/models/__init__.py b/blobxfer/models/__init__.py index b9e9fbc..1d2e850 100644 --- a/blobxfer/models/__init__.py +++ b/blobxfer/models/__init__.py @@ -60,17 +60,6 @@ def paths(self): """ return self._paths - def add_include(self, incl): - # type: (_BaseSourcePaths, str) -> None - """Add an 
include - :param _BaseSourcePaths self: this - :param str incl: include filter - """ - if self._include is None: - self._include = list(incl) - else: - self._include.append(incl) - def add_includes(self, includes): # type: (_BaseSourcePaths, list) -> None """Add a list of includes @@ -78,23 +67,12 @@ def add_includes(self, includes): :param list includes: list of includes """ if not isinstance(includes, list): - includes = list(includes) + includes = [includes] if self._include is None: self._include = includes else: self._include.extend(includes) - def add_exclude(self, excl): - # type: (_BaseSourcePaths, str) -> None - """Add an exclude - :param _BaseSourcePaths self: this - :param str excl: exclude filter - """ - if self._exclude is None: - self._exclude = list(excl) - else: - self._exclude.append(excl) - def add_excludes(self, excludes): # type: (_BaseSourcePaths, list) -> None """Add a list of excludes @@ -102,7 +80,7 @@ def add_excludes(self, excludes): :param list excludes: list of excludes """ if not isinstance(excludes, list): - excludes = list(excludes) + excludes = [excludes] if self._exclude is None: self._exclude = excludes else: diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py index a197a25..cc363ef 100644 --- a/blobxfer/models/download.py +++ b/blobxfer/models/download.py @@ -345,7 +345,7 @@ def generate_view(ase): fd_start=0, fd_end=slicesize, ) - total_size = ase.size + total_size = slicesize else: view = LocalPathView( fd_start=ase.vectored_io.offset_start, @@ -529,7 +529,7 @@ def cleanup_all_temporary_files(self): pass # iterate unchecked chunks and delete for key in self._unchecked_chunks: - ucc = self._unchecked_chunks[key] + ucc = self._unchecked_chunks[key]['ucc'] if ucc.temp: try: ucc.file_path.unlink() diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index cdc32df..c516d01 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -122,24 +122,24 @@ def __init__( if self.crypto_processes is None or self.crypto_processes < 1: self.crypto_processes = 0 if self.md5_processes is None or self.md5_processes < 1: - self.md5_processes = multiprocessing.cpu_count() // 2 + self.md5_processes = multiprocessing.cpu_count() >> 1 if self.md5_processes < 1: self.md5_processes = 1 auto_disk = False if self.disk_threads is None or self.disk_threads < 1: - self.disk_threads = multiprocessing.cpu_count() * 4 - # cap maximum number of disk threads from cpu count to 96 - if self.disk_threads > 96: - self.transfer_threads = 96 + self.disk_threads = multiprocessing.cpu_count() << 1 + # cap maximum number of disk threads from cpu count to 64 + if self.disk_threads > 64: + self.disk_threads = 64 auto_disk = True if self.transfer_threads is None or self.transfer_threads < 1: if auto_disk: self.transfer_threads = self.disk_threads << 1 else: - self.transfer_threads = multiprocessing.cpu_count() * 2 - # cap maximum number of threads from cpu count to 64 - if self.transfer_threads > 64: - self.transfer_threads = 64 + self.transfer_threads = multiprocessing.cpu_count() << 2 + # cap maximum number of threads from cpu count to 96 + if self.transfer_threads > 96: + self.transfer_threads = 96 class General(object): diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py index 177d41d..67d531f 100644 --- a/blobxfer/operations/azure/__init__.py +++ b/blobxfer/operations/azure/__init__.py @@ -287,7 +287,7 @@ def _convert_to_storage_entity_with_encryption_metadata( encryption_metadata_exists(entity.metadata): 
ed = blobxfer.models.crypto.EncryptionMetadata() ed.convert_from_json( - entity.metadata, file.name, options.rsa_private_key) + entity.metadata, entity.name, options.rsa_private_key) else: ed = None ase = blobxfer.models.azure.StorageEntity(container, ed) diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index 3df37bc..a369d0f 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -298,6 +298,7 @@ def _pre_md5_skip_on_check(self, lpath, rfile): convert_vectored_io_slice_to_final_path_name(lpath, rfile) ) else: + view = None fpath = slpath self._md5_offload.add_localfile_for_md5_check( key, slpath, fpath, md5, rfile.mode, view) @@ -453,11 +454,10 @@ def _worker_thread_transfer(self): """Worker thread download :param Downloader self: this """ + max_set_len = self._general_options.concurrency.disk_threads << 2 while not self.termination_check: try: - if (len(self._disk_set) > - self._general_options.concurrency. - disk_threads * 4): + if len(self._disk_set) > max_set_len: time.sleep(0.2) continue else: diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py index 07a9281..0bf132e 100644 --- a/blobxfer/operations/progress.py +++ b/blobxfer/operations/progress.py @@ -39,6 +39,8 @@ import cryptography import requests # local imports +import blobxfer.models.download +import blobxfer.models.upload import blobxfer.util import blobxfer.version @@ -158,7 +160,7 @@ def output_parameters(general_options, spec): spec.skip_on.filesize_match, spec.skip_on.lmt_ge, spec.skip_on.md5_match)) - log.append(' chunk size: {} bytes'.format( + log.append(' chunk size bytes: {}'.format( spec.options.chunk_size_bytes)) log.append(' delete extraneous: {}'.format( spec.options.delete_extraneous_destination)) diff --git a/blobxfer/operations/resume.py b/blobxfer/operations/resume.py index 0458bec..88172e4 100644 --- a/blobxfer/operations/resume.py +++ b/blobxfer/operations/resume.py @@ -44,7 +44,8 @@ logger = logging.getLogger(__name__) -class _BaseResumeManager(): +class _BaseResumeManager(object): + """Base Resume Manager""" def __init__(self, resume_file): # type: (_BaseResumeManager, str) -> None """Ctor for _BaseResumeManager @@ -99,18 +100,21 @@ def generate_record_key(ase): :rtype: str :return: record key """ - return '{}:{}'.format(ase._client.primary_endpoint, ase.path) + key = '{}:{}'.format(ase._client.primary_endpoint, ase.path) + if blobxfer.util.on_python2(): + return key.encode('utf8') + else: + return key def get_record(self, ase, key=None, lock=True): - # type: (_BaseResumeManager, str, - # bool) -> object + # type: (_BaseResumeManager, str, bool) -> object """Get a resume record :param _BaseResumeManager self: this :param blobxfer.models.azure.StorageEntity ase: Storage Entity :param str key: record key :param bool lock: acquire lock - :rtype: blobxfer.models.resume._Base - :return: _Base record + :rtype: object + :return: resume record object """ if key is None: key = blobxfer.operations.resume._BaseResumeManager.\ diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index a946d7c..02447c9 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -446,11 +446,10 @@ def _worker_thread_upload(self): """Worker thread upload :param Uploader self: this """ + max_set_len = self._general_options.concurrency.transfer_threads << 2 while not self.termination_check: try: - if (len(self._transfer_set) > - self._general_options.concurrency. 
- transfer_threads * 4): + if len(self._transfer_set) > max_set_len: time.sleep(0.2) continue else: diff --git a/blobxfer/retry.py b/blobxfer/retry.py index 892b25c..daee22a 100644 --- a/blobxfer/retry.py +++ b/blobxfer/retry.py @@ -37,23 +37,34 @@ class ExponentialRetryWithMaxWait(azure.storage.retry._Retry): """Exponential Retry with Max Wait (infinite retries)""" - def __init__(self, initial_backoff=0.1, max_backoff=2, reset_at_max=True): - # type: (ExponentialRetryWithMaxWait, int, int, bool) -> None + def __init__( + self, initial_backoff=0.1, max_backoff=1, max_retries=None, + reset_at_max=True): + # type: (ExponentialRetryWithMaxWait, int, int, int, bool) -> None """Ctor for ExponentialRetryWithMaxWait :param ExponentialRetryWithMaxWait self: this :param int initial_backoff: initial backoff :param int max_backoff: max backoff + :param int max_retries: max retries :param bool reset_at_max: reset after reaching max wait """ + if max_backoff <= 0: + raise ValueError( + 'max backoff is non-positive: {}'.format(max_backoff)) + if max_retries is not None and max_retries < 0: + raise ValueError( + 'max retries is invalid: {}'.format(max_retries)) if max_backoff < initial_backoff: raise ValueError( 'max backoff {} less than initial backoff {}'.format( max_backoff, initial_backoff)) + self._backoff_count = 0 + self._last_backoff = initial_backoff self.initial_backoff = initial_backoff self.max_backoff = max_backoff self.reset_at_max = reset_at_max super(ExponentialRetryWithMaxWait, self).__init__( - max_backoff if self.reset_at_max else 2147483647, False) + max_retries if max_retries is not None else 2147483647, False) def retry(self, context): # type: (ExponentialRetryWithMaxWait, @@ -75,11 +86,12 @@ def _backoff(self, context): :rtype: int :return: backoff amount """ - if context.count == 1: - backoff = self.initial_backoff + self._backoff_count += 1 + if self._backoff_count == 1: + self._last_backoff = self.initial_backoff else: - backoff = self.initial_backoff * (context.count - 1) - if backoff > self.max_backoff and self.reset_at_max: - backoff = self.initial_backoff - context.count = 1 - return backoff + self._last_backoff *= 2 + if self._last_backoff > self.max_backoff and self.reset_at_max: + self._backoff_count = 1 + self._last_backoff = self.initial_backoff + return self._last_backoff diff --git a/blobxfer/util.py b/blobxfer/util.py index cce84f0..166b98f 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -65,7 +65,7 @@ def on_python2(): return future.utils.PY2 -def on_windows(): +def on_windows(): # noqa # type: (None) -> bool """Execution on Windows :rtype: bool diff --git a/cli/cli.py b/cli/cli.py index bf397b0..ec4f3f9 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -41,7 +41,11 @@ import blobxfer.api import blobxfer.util # local imports -from . import settings +try: + from . 
import settings +except (SystemError, ImportError): # noqa + # for local testing + import settings # create logger logger = logging.getLogger('blobxfer') diff --git a/test_requirements.txt b/test_requirements.txt index bc58365..c576b44 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,5 +1,5 @@ flake8>=3.3.0 mock>=2.0.0; python_version < '3.3' -pypandoc>=1.3.3 -pytest>=3.0.7 -pytest-cov>=2.4.0 +pypandoc>=1.4 +pytest>=3.1.1 +pytest-cov>=2.5.1 diff --git a/tests/test_blobxfer_models_azure.py b/tests/test_blobxfer_models_azure.py index 6ddc95b..f075092 100644 --- a/tests/test_blobxfer_models_azure.py +++ b/tests/test_blobxfer_models_azure.py @@ -49,6 +49,6 @@ def test_azurestorageentity(): assert ase.snapshot is not None blob.snapshot = None - ase.populate_from_file(mock.MagicMock(), blob) + ase.populate_from_file(mock.MagicMock(), blob, 'path') assert ase.mode == azmodels.StorageModes.File assert ase.snapshot is None diff --git a/tests/test_blobxfer_models_download.py b/tests/test_blobxfer_models_download.py index 918a7f0..c1b568e 100644 --- a/tests/test_blobxfer_models_download.py +++ b/tests/test_blobxfer_models_download.py @@ -110,32 +110,33 @@ def test_downloaddescriptor(tmpdir): d._allocate_disk_space() assert d.entity == ase + assert d.entity.is_encrypted assert not d.must_compute_md5 + assert d.hmac is not None assert d._total_chunks == 64 assert d._offset == 0 assert d.final_path == lp - assert str(d.local_path) == str(lp) + '.bxtmp' assert d._allocated - assert d.local_path.stat().st_size == 1024 - 16 + assert d.final_path.stat().st_size == ase._size - 16 d._allocate_disk_space() assert d._allocated - d.local_path.unlink() - ase._size = 1 + d.final_path.unlink() + ase._size = 32 d = models.Descriptor(lp, ase, opts, None) d._allocate_disk_space() - assert d._total_chunks == 1 + assert d._total_chunks == 2 assert d._allocated - assert d.local_path.stat().st_size == 0 + assert d.final_path.stat().st_size == ase._size - 16 - d.local_path.unlink() + d.final_path.unlink() ase._encryption = None ase._size = 1024 d = models.Descriptor(lp, ase, opts, None) d._allocate_disk_space() assert d._allocated - assert d.local_path.stat().st_size == 1024 + assert d.final_path.stat().st_size == ase._size # pre-existing file check ase._size = 0 @@ -143,13 +144,12 @@ def test_downloaddescriptor(tmpdir): d._allocate_disk_space() assert d._total_chunks == 0 assert d._allocated - assert d.local_path.stat().st_size == 0 + assert d.final_path.stat().st_size == ase._size @unittest.skipIf(util.on_python2(), 'fallocate does not exist') def test_downloaddescriptor_allocate_disk_space_via_seek(tmpdir): fp = pathlib.Path(str(tmpdir.join('fp'))) - lp = pathlib.Path(str(tmpdir.join('fp.bxtmp'))) opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 256 @@ -162,14 +162,13 @@ def test_downloaddescriptor_allocate_disk_space_via_seek(tmpdir): patched_fallocate.side_effect = [AttributeError()] d._allocate_disk_space() assert d._allocated - assert not fp.exists() - assert lp.stat().st_size == ase._size + assert fp.exists() + assert fp.stat().st_size == ase._size def test_downloaddescriptor_resume(tmpdir): resumefile = pathlib.Path(str(tmpdir.join('resume'))) fp = pathlib.Path(str(tmpdir.join('fp'))) - lp = pathlib.Path(str(tmpdir.join('fp.bxtmp'))) opts = mock.MagicMock() opts.check_file_md5 = True @@ -177,6 +176,7 @@ def test_downloaddescriptor_resume(tmpdir): ase = azmodels.StorageEntity('cont') ase._size = 128 ase._name = 'blob' + ase._client = mock.MagicMock() # test no record 
rmgr = rops.DownloadResumeManager(resumefile) @@ -185,7 +185,7 @@ def test_downloaddescriptor_resume(tmpdir): assert rb is None # test length mismatch - rmgr.add_or_update_record(str(fp), str(lp), 127, 0, 0, False, None) + rmgr.add_or_update_record(str(fp), ase, 0, 0, False, None) rb = d._resume() assert rb is None @@ -193,7 +193,7 @@ def test_downloaddescriptor_resume(tmpdir): rmgr.delete() rmgr = rops.DownloadResumeManager(resumefile) - rmgr.add_or_update_record(str(fp), str(lp), ase._size, 0, 0, False, None) + rmgr.add_or_update_record(str(fp), ase, 0, 0, False, None) d = models.Descriptor(fp, ase, opts, rmgr) rb = d._resume() assert rb is None @@ -202,7 +202,7 @@ def test_downloaddescriptor_resume(tmpdir): rmgr.delete() rmgr = rops.DownloadResumeManager(resumefile) - rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, True, None) + rmgr.add_or_update_record(str(fp), ase, 32, 1, True, None) d = models.Descriptor(fp, ase, opts, rmgr) fp.touch() rb = d._resume() @@ -215,22 +215,23 @@ def test_downloaddescriptor_resume(tmpdir): ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'123' - rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, False, None) + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, None) d = models.Descriptor(fp, ase, opts, rmgr) rb = d._resume() assert rb is None - # test if intermediate file not exists + # test up to chunk rmgr.delete() rmgr = rops.DownloadResumeManager(resumefile) ase = azmodels.StorageEntity('cont') ase._size = 128 ase._name = 'blob' + ase._client = mock.MagicMock() - rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, False, None) + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, None) d = models.Descriptor(fp, ase, opts, rmgr) rb = d._resume() - assert rb is None + assert rb == 32 # ensure hmac not populated rmgr.delete() @@ -238,9 +239,10 @@ def test_downloaddescriptor_resume(tmpdir): ase = azmodels.StorageEntity('cont') ase._size = 128 ase._name = 'blob' - lp.touch() + ase._client = mock.MagicMock() + fp.touch() - rmgr.add_or_update_record(str(fp), str(lp), ase._size, 32, 1, False, None) + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, None) d = models.Descriptor(fp, ase, opts, rmgr) d.hmac = True with pytest.raises(RuntimeError): @@ -251,13 +253,12 @@ def test_downloaddescriptor_resume(tmpdir): rmgr = rops.DownloadResumeManager(resumefile) data = os.urandom(32) - with lp.open('wb') as f: + with fp.open('wb') as f: f.write(data) md5 = util.new_md5_hasher() md5.update(data) - rmgr.add_or_update_record( - str(fp), str(lp), ase._size, 32, 1, False, md5.hexdigest()) + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, md5.hexdigest()) d = models.Descriptor(fp, ase, opts, rmgr) rb = d._resume() assert rb == 32 @@ -265,8 +266,7 @@ def test_downloaddescriptor_resume(tmpdir): # md5 hash mismatch rmgr.delete() rmgr = rops.DownloadResumeManager(resumefile) - rmgr.add_or_update_record( - str(fp), str(lp), ase._size, 32, 1, False, 'abc') + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, 'abc') ase._md5 = 'abc' d = models.Descriptor(fp, ase, opts, rmgr) rb = d._resume() @@ -278,10 +278,10 @@ def test_downloaddescriptor_resume(tmpdir): ase = azmodels.StorageEntity('cont') ase._size = 128 ase._name = 'blob' + ase._client = mock.MagicMock() ase._mode = azmodels.StorageModes.Page - rmgr.add_or_update_record( - str(fp), str(lp), ase._size, 32, 1, False, md5.hexdigest()) + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, md5.hexdigest()) d = models.Descriptor(fp, ase, opts, rmgr) rb = 
d._resume() assert rb == 32 @@ -443,10 +443,11 @@ def test_write_unchecked_data(tmpdir): assert offsets.chunk_num in d._unchecked_chunks ucc = d._unchecked_chunks[offsets.chunk_num] - assert ucc.data_len == ase._size - assert ucc.fd_start == offsets.fd_start - assert ucc.file_path == d.local_path - assert not ucc.temp + assert ucc['ucc'].data_len == ase._size + assert ucc['ucc'].fd_start == offsets.fd_start + assert ucc['ucc'].file_path == d.final_path + assert not ucc['ucc'].temp + assert ucc['decrypted'] def test_write_unchecked_hmac_data(tmpdir): @@ -464,10 +465,11 @@ def test_write_unchecked_hmac_data(tmpdir): assert offsets.chunk_num in d._unchecked_chunks ucc = d._unchecked_chunks[offsets.chunk_num] - assert ucc.data_len == ase._size - assert ucc.fd_start == offsets.fd_start - assert ucc.file_path != d.local_path - assert ucc.temp + assert ucc['ucc'].data_len == ase._size + assert ucc['ucc'].fd_start == offsets.fd_start + assert ucc['ucc'].file_path != d.final_path + assert ucc['ucc'].temp + assert not ucc['decrypted'] def test_perform_chunked_integrity_check(tmpdir): @@ -505,10 +507,12 @@ def test_perform_chunked_integrity_check(tmpdir): offsets1, _ = d.next_offsets() d.write_unchecked_hmac_data(offsets1, data) ucc1 = d._unchecked_chunks[offsets1.chunk_num] + ucc['decrypted'] = True + ucc1['decrypted'] = True d.perform_chunked_integrity_check() - assert not ucc.file_path.exists() - assert not ucc1.file_path.exists() + assert ucc['ucc'].file_path != d.final_path + assert ucc1['ucc'].file_path != d.final_path assert d._next_integrity_chunk == 2 assert 0 not in d._unchecked_chunks assert 1 not in d._unchecked_chunks @@ -529,6 +533,7 @@ def test_perform_chunked_integrity_check(tmpdir): ase = azmodels.StorageEntity('cont') ase._size = 32 ase._name = 'blob' + ase._client = mock.MagicMock() ase._md5 = md5.hexdigest() rmgr = rops.DownloadResumeManager(resumefile) @@ -539,7 +544,7 @@ def test_perform_chunked_integrity_check(tmpdir): d.perform_chunked_integrity_check() assert d._next_integrity_chunk == 1 assert len(d._unchecked_chunks) == 0 - dr = rmgr.get_record(str(fp)) + dr = rmgr.get_record(ase) assert dr.next_integrity_chunk == 1 assert dr.md5hexdigest == md5.hexdigest() @@ -553,11 +558,12 @@ def test_update_resume_for_completed(tmpdir): ase = azmodels.StorageEntity('cont') ase._size = 32 ase._name = 'blob' + ase._client = mock.MagicMock() rmgr = rops.DownloadResumeManager(resumefile) d = models.Descriptor(fp, ase, opts, rmgr) offsets, _ = d.next_offsets() d._update_resume_for_completed() - dr = rmgr.get_record(str(fp)) + dr = rmgr.get_record(ase) assert dr.completed @@ -575,8 +581,8 @@ def test_cleanup_all_temporary_files(tmpdir): d.write_unchecked_data(offsets, data) assert len(d._unchecked_chunks) == 1 d.cleanup_all_temporary_files() - assert not d.local_path.exists() - assert not d._unchecked_chunks[0].file_path.exists() + assert not d.final_path.exists() + assert not d._unchecked_chunks[0]['ucc'].file_path.exists() lp = pathlib.Path(str(tmpdir.join('b'))) d = models.Descriptor(lp, ase, opts, None) @@ -585,11 +591,10 @@ def test_cleanup_all_temporary_files(tmpdir): data = b'0' * opts.chunk_size_bytes d.write_unchecked_hmac_data(offsets, data) assert len(d._unchecked_chunks) == 1 - d.local_path.unlink() - d._unchecked_chunks[0].file_path.unlink() + d._unchecked_chunks[0]['ucc'].file_path.unlink() d.cleanup_all_temporary_files() - assert not d.local_path.exists() - assert not d._unchecked_chunks[0].file_path.exists() + assert not d.final_path.exists() + assert not 
d._unchecked_chunks[0]['ucc'].file_path.exists() def test_write_data(tmpdir): @@ -606,11 +611,11 @@ def test_write_data(tmpdir): data = b'0' * ase._size d.write_data(offsets, data) - assert d.local_path.exists() - assert d.local_path.stat().st_size == len(data) + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) -def test_finalize_file(tmpdir): +def test_finalize_integrity_and_file(tmpdir): # already finalized lp = pathlib.Path(str(tmpdir.join('af'))) opts = mock.MagicMock() @@ -624,11 +629,12 @@ def test_finalize_file(tmpdir): d = models.Descriptor(lp, ase, opts, None) d._allocate_disk_space() d._finalized = True + d.finalize_integrity() d.finalize_file() - assert d.local_path.exists() - assert not d.final_path.exists() - d.local_path.unlink() + assert d.final_path.exists() + assert d.final_path.stat().st_size == ase._size + d.final_path.unlink() # hmac check success lp = pathlib.Path(str(tmpdir.join('a'))) @@ -654,9 +660,9 @@ def test_finalize_file(tmpdir): d = models.Descriptor(lp, ase, opts, None) d._allocate_disk_space() d.hmac.update(data) + d.finalize_integrity() d.finalize_file() - assert not d.local_path.exists() assert d.final_path.exists() assert d.final_path.stat().st_size == len(data) @@ -676,9 +682,9 @@ def test_finalize_file(tmpdir): d = models.Descriptor(lp, ase, opts, None) d._allocate_disk_space() d.md5.update(data) + d.finalize_integrity() d.finalize_file() - assert not d.local_path.exists() assert d.final_path.exists() assert d.final_path.stat().st_size == len(data) @@ -694,9 +700,9 @@ def test_finalize_file(tmpdir): d = models.Descriptor(lp, ase, opts, None) d._allocate_disk_space() + d.finalize_integrity() d.finalize_file() - assert not d.local_path.exists() assert d.final_path.exists() assert d.final_path.stat().st_size == len(data) @@ -714,9 +720,9 @@ def test_finalize_file(tmpdir): d = models.Descriptor(lp, ase, opts, None) d._allocate_disk_space() d.md5.update(data) + d.finalize_integrity() d.finalize_file() - assert not d.local_path.exists() assert not d.final_path.exists() diff --git a/tests/test_blobxfer_models_options.py b/tests/test_blobxfer_models_options.py index 1ee72bb..31edde7 100644 --- a/tests/test_blobxfer_models_options.py +++ b/tests/test_blobxfer_models_options.py @@ -21,22 +21,38 @@ def test_concurrency_options(patched_cc): a = options.Concurrency( crypto_processes=-1, md5_processes=0, + disk_threads=-1, transfer_threads=-2, ) assert a.crypto_processes == 0 assert a.md5_processes == 1 + assert a.disk_threads == 2 + assert a.transfer_threads == 4 + + a = options.Concurrency( + crypto_processes=-1, + md5_processes=0, + disk_threads=1, + transfer_threads=-1, + ) + + assert a.crypto_processes == 0 + assert a.md5_processes == 1 + assert a.disk_threads == 1 assert a.transfer_threads == 4 @mock.patch('multiprocessing.cpu_count', return_value=64) -def test_concurrency_options_max_transfer_threads(patched_cc): +def test_concurrency_options_max_disk_and_transfer_threads(patched_cc): a = options.Concurrency( crypto_processes=1, md5_processes=1, + disk_threads=None, transfer_threads=None, ) + assert a.disk_threads == 64 assert a.transfer_threads == 96 @@ -45,7 +61,8 @@ def test_general_options(): concurrency=options.Concurrency( crypto_processes=1, md5_processes=2, - transfer_threads=3, + disk_threads=3, + transfer_threads=4, ), log_file='abc.log', progress_bar=False, @@ -56,7 +73,8 @@ def test_general_options(): assert a.concurrency.crypto_processes == 1 assert a.concurrency.md5_processes == 2 - assert 
a.concurrency.transfer_threads == 3 + assert a.concurrency.disk_threads == 3 + assert a.concurrency.transfer_threads == 4 assert a.log_file == 'abc.log' assert not a.progress_bar assert a.resume_file == pathlib.Path('abc') @@ -67,7 +85,8 @@ def test_general_options(): concurrency=options.Concurrency( crypto_processes=1, md5_processes=2, - transfer_threads=3, + disk_threads=3, + transfer_threads=4, ), progress_bar=False, resume_file=None, @@ -77,7 +96,8 @@ def test_general_options(): assert a.concurrency.crypto_processes == 1 assert a.concurrency.md5_processes == 2 - assert a.concurrency.transfer_threads == 3 + assert a.concurrency.disk_threads == 3 + assert a.concurrency.transfer_threads == 4 assert a.log_file is None assert not a.progress_bar assert a.resume_file is None diff --git a/tests/test_blobxfer_models_resume.py b/tests/test_blobxfer_models_resume.py index 55a6009..7fb12a3 100644 --- a/tests/test_blobxfer_models_resume.py +++ b/tests/test_blobxfer_models_resume.py @@ -8,9 +8,8 @@ def test_download(): - d = rmodels.Download('fp', 'tp', 1, 2, 0, False, '') + d = rmodels.Download('fp', 1, 2, 0, False, '') assert d.final_path == 'fp' - assert d.temp_path == 'tp' assert d.length == 1 assert d.chunk_size == 2 assert d.next_integrity_chunk == 0 diff --git a/tests/test_blobxfer_models_upload.py b/tests/test_blobxfer_models_upload.py index e6447d7..7d9e057 100644 --- a/tests/test_blobxfer_models_upload.py +++ b/tests/test_blobxfer_models_upload.py @@ -7,7 +7,6 @@ except ImportError: # noqa import pathlib # non-stdlib imports -import pytest # module under test import blobxfer.models.upload as upload @@ -26,14 +25,10 @@ def test_localsourcepaths_files(tmpdir): defpath.join('moo.cow').write('y') a = upload.LocalSourcePath() - a.add_include('*.txt') + a.add_includes('*.txt') a.add_includes(['moo.cow', '*blah*']) - with pytest.raises(ValueError): - a.add_includes('abc') - a.add_exclude('**/blah.x') + a.add_excludes('**/blah.x') a.add_excludes(['world.txt']) - with pytest.raises(ValueError): - a.add_excludes('abc') a.add_path(str(tmpdir)) a_set = set() for file in a.files(): @@ -47,9 +42,9 @@ def test_localsourcepaths_files(tmpdir): b = upload.LocalSourcePath() b.add_includes(['moo.cow', '*blah*']) - b.add_include('*.txt') + b.add_includes('*.txt') b.add_excludes(['world.txt']) - b.add_exclude('**/blah.x') + b.add_excludes('**/blah.x') b.add_paths([pathlib.Path(str(tmpdir))]) for file in a.files(): sfile = str(file.parent_path / file.relative_path) diff --git a/tests/test_blobxfer_operations_azure.py b/tests/test_blobxfer_operations_azure.py index 346fab6..0322aa4 100644 --- a/tests/test_blobxfer_operations_azure.py +++ b/tests/test_blobxfer_operations_azure.py @@ -106,7 +106,7 @@ def test_azuresourcepath_files(patched_lf, patched_em): i = 0 for file in asp.files(creds, options, mock.MagicMock()): i += 1 - assert file.name == 'name' + assert file.name == 'remote/name' assert file.encryption_metadata is None assert i == 1 @@ -119,7 +119,7 @@ def test_azuresourcepath_files(patched_lf, patched_em): i = 0 for file in asp.files(creds, options, mock.MagicMock()): i += 1 - assert file.name == 'name' + assert file.name == 'remote/name' assert file.encryption_metadata is not None assert i == 1 diff --git a/tests/test_blobxfer_operations_crypto.py b/tests/test_blobxfer_operations_crypto.py index d3fdc62..f3dfc61 100644 --- a/tests/test_blobxfer_operations_crypto.py +++ b/tests/test_blobxfer_operations_crypto.py @@ -118,7 +118,7 @@ def test_cryptooffload_decrypt(tmpdir): unpad=False, ) a.add_decrypt_chunk( 
- 'fp', str(bfile), offsets, symkey, iv, hmacfile) + str(bfile), 0, offsets, symkey, iv, hmacfile) i = 33 checked = False while i > 0: @@ -127,7 +127,7 @@ def test_cryptooffload_decrypt(tmpdir): time.sleep(0.3) i -= 1 continue - assert result == 'fp' + assert result == (str(bfile), offsets) checked = True break assert checked diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py index 08702b1..fcc2865 100644 --- a/tests/test_blobxfer_operations_download.py +++ b/tests/test_blobxfer_operations_download.py @@ -285,18 +285,26 @@ def test_pre_md5_skip_on_check(): rfile = azmodels.StorageEntity('cont') rfile._encryption = mock.MagicMock() rfile._encryption.blobxfer_extensions = mock.MagicMock() - rfile._encryption.blobxfer_extensions.pre_encrypted_content_md5 = \ - 'abc' + rfile._encryption.blobxfer_extensions.pre_encrypted_content_md5 = 'abc' + rfile._client = mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None lpath = 'lpath' + key = ops.Downloader.create_unique_transfer_operation_id(rfile) d._pre_md5_skip_on_check(lpath, rfile) - assert lpath in d._md5_map + assert key in d._md5_map + rfile._name = 'name2' lpath = 'lpath2' rfile._encryption = None rfile._md5 = 'abc' + key = ops.Downloader.create_unique_transfer_operation_id(rfile) d._pre_md5_skip_on_check(lpath, rfile) - assert lpath in d._md5_map + assert key in d._md5_map + + assert len(d._md5_map) == 2 def test_post_md5_skip_on_check(tmpdir): @@ -309,28 +317,45 @@ def test_post_md5_skip_on_check(tmpdir): lpath = str(lp) rfile = azmodels.StorageEntity('cont') rfile._md5 = 'abc' + rfile._client = mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None + rfile._size = 256 d._pre_md5_skip_on_check(lpath, rfile) - d._download_set.add(pathlib.Path(lpath)) - assert lpath in d._md5_map + key = ops.Downloader.create_unique_transfer_operation_id(rfile) + d._transfer_set.add(key) + assert key in d._md5_map - d._post_md5_skip_on_check(lpath, True) - assert lpath not in d._md5_map + d._post_md5_skip_on_check(key, lpath, rfile._size, True) + assert key not in d._md5_map d._add_to_download_queue = mock.MagicMock() d._pre_md5_skip_on_check(lpath, rfile) - d._download_set.add(pathlib.Path(lpath)) - d._post_md5_skip_on_check(lpath, False) + d._transfer_set.add(key) + d._post_md5_skip_on_check(key, lpath, rfile._size, False) assert d._add_to_download_queue.call_count == 1 def test_check_for_downloads_from_md5(): lpath = 'lpath' + rfile = azmodels.StorageEntity('cont') + rfile._md5 = 'abc' + rfile._client = mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None + rfile._size = 256 + key = ops.Downloader.create_unique_transfer_operation_id(rfile) d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - d._md5_map[lpath] = mock.MagicMock() - d._download_set.add(pathlib.Path(lpath)) + d._md5_map[key] = rfile + d._transfer_set.add(key) d._md5_offload = mock.MagicMock() d._md5_offload.done_cv = multiprocessing.Condition() - d._md5_offload.pop_done_queue.side_effect = [None, (lpath, False)] + d._md5_offload.pop_done_queue.side_effect = [ + None, + (key, lpath, rfile._size, False), + ] d._add_to_download_queue = mock.MagicMock() d._all_remote_files_processed = False d._download_terminate = True @@ -343,11 +368,14 @@ def test_check_for_downloads_from_md5(): new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), 
mock.MagicMock()) - d._md5_map[lpath] = mock.MagicMock() - d._download_set.add(pathlib.Path(lpath)) + d._md5_map[key] = rfile + d._transfer_set.add(key) d._md5_offload = mock.MagicMock() d._md5_offload.done_cv = multiprocessing.Condition() - d._md5_offload.pop_done_queue.side_effect = [None, (lpath, False)] + d._md5_offload.pop_done_queue.side_effect = [ + None, + (key, lpath, rfile._size, False), + ] d._add_to_download_queue = mock.MagicMock() patched_tc.side_effect = [False, False, True] d._check_for_downloads_from_md5() @@ -359,8 +387,8 @@ def test_check_for_downloads_from_md5(): new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - d._md5_map[lpath] = mock.MagicMock() - d._download_set.add(pathlib.Path(lpath)) + d._md5_map[key] = rfile + d._transfer_set.add(key) d._md5_offload = mock.MagicMock() d._md5_offload.done_cv = multiprocessing.Condition() d._md5_offload.pop_done_queue.side_effect = [None] @@ -372,15 +400,25 @@ def test_check_for_downloads_from_md5(): def test_check_for_crypto_done(): lpath = 'lpath' + rfile = azmodels.StorageEntity('cont') + rfile._md5 = 'abc' + rfile._client = mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None + rfile._size = 256 + key = ops.Downloader.create_unique_transfer_operation_id(rfile) d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - d._download_set.add(pathlib.Path(lpath)) + d._transfer_set.add(key) dd = mock.MagicMock() d._dd_map[lpath] = dd + offsets = mock.MagicMock() + offsets.range_start = 0 d._crypto_offload = mock.MagicMock() d._crypto_offload.done_cv = multiprocessing.Condition() d._crypto_offload.pop_done_queue.side_effect = [ None, - lpath, + (lpath, offsets) ] d._all_remote_files_processed = False d._download_terminate = True @@ -393,14 +431,16 @@ def test_check_for_crypto_done(): new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - d._download_set.add(pathlib.Path(lpath)) + d._transfer_set.add(key) dd = mock.MagicMock() + dd.entity = rfile + dd.final_path = lpath d._dd_map[lpath] = dd d._crypto_offload = mock.MagicMock() d._crypto_offload.done_cv = multiprocessing.Condition() d._crypto_offload.pop_done_queue.side_effect = [ None, - lpath, + (lpath, offsets), ] patched_tc.side_effect = [False, False, True] d._complete_chunk_download = mock.MagicMock() @@ -413,13 +453,15 @@ def test_check_for_crypto_done(): new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) - d._download_set.add(pathlib.Path(lpath)) + d._transfer_set.add(key) dd = mock.MagicMock() + dd.entity = rfile + dd.final_path = lpath d._crypto_offload = mock.MagicMock() d._crypto_offload.done_cv = multiprocessing.Condition() d._crypto_offload.pop_done_queue.side_effect = [ None, - lpath, + (lpath, offsets), ] patched_tc.side_effect = [False, False, True] d._complete_chunk_download = mock.MagicMock() @@ -438,39 +480,41 @@ def test_add_to_download_queue(tmpdir): d._spec.options.chunk_size_bytes = 1 d._add_to_download_queue(lpath, ase) - assert d._download_queue.qsize() == 1 + assert d._transfer_queue.qsize() == 1 assert path in d._dd_map -def test_initialize_and_terminate_download_threads(): +def test_initialize_and_terminate_transfer_threads(): opts = mock.MagicMock() opts.concurrency.transfer_threads = 2 d = ops.Downloader(opts, mock.MagicMock(), mock.MagicMock()) - d._worker_thread_download = 
mock.MagicMock() + d._worker_thread_transfer = mock.MagicMock() - d._initialize_download_threads() - assert len(d._download_threads) == 2 + d._initialize_transfer_threads() + assert len(d._transfer_threads) == 2 - d._wait_for_download_threads(terminate=True) + d._wait_for_transfer_threads(terminate=True) assert d._download_terminate - for thr in d._download_threads: + for thr in d._transfer_threads: assert not thr.is_alive() @mock.patch('blobxfer.operations.crypto.aes_cbc_decrypt_data') @mock.patch('blobxfer.operations.azure.file.get_file_range') @mock.patch('blobxfer.operations.azure.blob.get_blob_range') -def test_worker_thread_download( +def test_worker_thread_transfer( patched_gbr, patched_gfr, patched_acdd, tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._complete_chunk_download = mock.MagicMock() d._download_terminate = True - d._worker_thread_download() + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 + d._worker_thread_transfer() assert d._complete_chunk_download.call_count == 0 d._download_terminate = False d._all_remote_files_processed = True - d._worker_thread_download() + d._worker_thread_transfer() assert d._complete_chunk_download.call_count == 0 with mock.patch( @@ -486,11 +530,11 @@ def test_worker_thread_download( opts.check_file_md5 = False opts.chunk_size_bytes = 16 dd = models.Descriptor(lp, ase, opts, None) - d._download_queue = mock.MagicMock() - d._download_queue.get.side_effect = [queue.Empty, dd] + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [queue.Empty, dd] d._process_download_descriptor = mock.MagicMock() d._process_download_descriptor.side_effect = RuntimeError('oops') - d._worker_thread_download() + d._worker_thread_transfer() assert len(d._exceptions) == 1 assert d._process_download_descriptor.call_count == 1 @@ -503,26 +547,35 @@ def test_worker_thread_download( new_callable=mock.PropertyMock) as patched_aoc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 ase = azmodels.StorageEntity('cont') ase._size = 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' lp = pathlib.Path(str(tmpdir.join('a'))) dd = models.Descriptor(lp, ase, opts, None) dd.next_offsets = mock.MagicMock( side_effect=[(None, 1), (None, 2)]) + dd.finalize_integrity = mock.MagicMock() dd.finalize_file = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() + dd.all_operations_completed.side_effect = [False, True] patched_aoc.side_effect = [False, True] patched_tc.side_effect = [False, False, False, True] d._dd_map[str(lp)] = dd - d._download_set.add(lp) - d._download_queue = mock.MagicMock() - d._download_queue.get.side_effect = [queue.Empty, dd, dd] - d._worker_thread_download() + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [queue.Empty, dd, dd] + d._worker_thread_transfer() assert str(lp) not in d._dd_map assert dd.finalize_file.call_count == 1 assert d._download_sofar == 1 @@ -533,23 +586,33 @@ def test_worker_thread_download( new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( 
mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 opts = mock.MagicMock() opts.check_file_md5 = True opts.chunk_size_bytes = 16 ase = azmodels.StorageEntity('cont') ase._mode = azmodels.StorageModes.File ase._size = 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('b'))) dd = models.Descriptor(lp, ase, opts, None) dd.finalize_file = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() d._dd_map[str(lp)] = mock.MagicMock() - d._download_set.add(lp) - d._download_queue = mock.MagicMock() - d._download_queue.get.side_effect = [dd] + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [dd] patched_tc.side_effect = [False, True] - d._worker_thread_download() + d._worker_thread_transfer() + assert len(d._disk_set) == 1 + a, b, c = d._disk_queue.get() + d._process_data(a, b, c) assert dd.perform_chunked_integrity_check.call_count == 1 with mock.patch( @@ -557,6 +620,8 @@ def test_worker_thread_download( new_callable=mock.PropertyMock) as patched_tc: d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 @@ -566,6 +631,11 @@ def test_worker_thread_download( ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' ase._encryption.content_encryption_iv = b'0' * 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) patched_gfr.return_value = b'0' * ase._size lp = pathlib.Path(str(tmpdir.join('c'))) dd = models.Descriptor(lp, ase, opts, None) @@ -575,11 +645,14 @@ def test_worker_thread_download( d._crypto_offload = mock.MagicMock() d._crypto_offload.add_decrypt_chunk = mock.MagicMock() d._dd_map[str(lp)] = dd - d._download_set.add(lp) - d._download_queue = mock.MagicMock() - d._download_queue.get.side_effect = [dd] + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [dd] patched_tc.side_effect = [False, True] - d._worker_thread_download() + d._worker_thread_transfer() + assert len(d._disk_set) == 1 + a, b, c = d._disk_queue.get() + d._process_data(a, b, c) assert d._crypto_offload.add_decrypt_chunk.call_count == 1 assert dd.write_unchecked_hmac_data.call_count == 1 @@ -589,6 +662,8 @@ def test_worker_thread_download( d = ops.Downloader( mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._general_options.concurrency.crypto_processes = 0 + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 opts = mock.MagicMock() opts.check_file_md5 = False opts.chunk_size_bytes = 16 @@ -598,19 +673,28 @@ def test_worker_thread_download( ase._encryption = mock.MagicMock() ase._encryption.symmetric_key = b'abc' ase._encryption.content_encryption_iv = b'0' * 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) patched_gfr.return_value = b'0' * ase._size lp = 
pathlib.Path(str(tmpdir.join('d'))) dd = models.Descriptor(lp, ase, opts, None) dd.next_offsets() dd.write_unchecked_hmac_data = mock.MagicMock() dd.perform_chunked_integrity_check = mock.MagicMock() + dd.mark_unchecked_chunk_decrypted = mock.MagicMock() patched_acdd.return_value = b'0' * 16 d._dd_map[str(lp)] = mock.MagicMock() - d._download_set.add(lp) - d._download_queue = mock.MagicMock() - d._download_queue.get.side_effect = [dd] + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [dd, dd] patched_tc.side_effect = [False, True] - d._worker_thread_download() + d._worker_thread_transfer() + assert len(d._disk_set) == 1 + a, b, c = d._disk_queue.get() + d._process_data(a, b, c) assert patched_acdd.call_count == 1 assert dd.write_unchecked_hmac_data.call_count == 1 assert dd.perform_chunked_integrity_check.call_count == 1 @@ -631,7 +715,7 @@ def test_cleanup_temporary_files(tmpdir): d._general_options.resume_file = pathlib.Path('abc') d._dd_map[0] = dd d._cleanup_temporary_files() - assert dd.local_path.exists() + assert dd.final_path.exists() lp = pathlib.Path(str(tmpdir.join('b'))) opts = mock.MagicMock() @@ -645,7 +729,7 @@ def test_cleanup_temporary_files(tmpdir): d._general_options.resume_file = None d._dd_map[0] = dd d._cleanup_temporary_files() - assert not dd.local_path.exists() + assert not dd.final_path.exists() lp = pathlib.Path(str(tmpdir.join('c'))) opts = mock.MagicMock() @@ -661,7 +745,7 @@ def test_cleanup_temporary_files(tmpdir): d._general_options.resume_file = None d._dd_map[0] = dd d._cleanup_temporary_files() - assert dd.local_path.exists() + assert dd.final_path.exists() def test_catalog_local_files_for_deletion(tmpdir): @@ -699,21 +783,16 @@ def test_delete_extraneous_files(tmpdir): d._delete_extraneous_files() -@mock.patch('blobxfer.operations.md5.LocalFileMd5Offload') -@mock.patch('blobxfer.operations.azure.blob.list_blobs') -@mock.patch( - 'blobxfer.operations.download.Downloader.ensure_local_destination', - return_value=True -) -def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): +def _create_downloader_for_start(td): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._cleanup_temporary_files = mock.MagicMock() d._download_start = datetime.datetime.now(tz=dateutil.tz.tzlocal()) - d._initialize_download_threads = mock.MagicMock() - patched_lfmo._check_thread = mock.MagicMock() - d._general_options.concurrency.crypto_processes = 1 + d._initialize_transfer_threads = mock.MagicMock() + d._general_options.concurrency.crypto_processes = 0 d._general_options.concurrency.md5_processes = 1 - d._general_options.resume_file = pathlib.Path(str(tmpdir.join('rf'))) + d._general_options.concurrency.disk_threads = 1 + d._general_options.concurrency.transfer_threads = 1 + d._general_options.resume_file = pathlib.Path(str(td.join('rf'))) d._spec.sources = [] d._spec.options = mock.MagicMock() d._spec.options.chunk_size_bytes = 1 @@ -725,50 +804,84 @@ def test_start(patched_eld, patched_lb, patched_lfmo, tmpdir): d._spec.skip_on.lmt_ge = False d._spec.skip_on.filesize_match = False d._spec.destination = mock.MagicMock() - d._spec.destination.path = pathlib.Path(str(tmpdir)) + d._spec.destination.path = pathlib.Path(str(td)) d._download_start_time = util.datetime_now() + d._pre_md5_skip_on_check = mock.MagicMock() + d._check_download_conditions = mock.MagicMock() + d._all_remote_files_processed = False p = '/cont/remote/path' asp = azops.SourcePath() asp.add_path_with_storage_account(p, 'sa') 
d._spec.sources.append(asp) - b = azure.storage.blob.models.Blob(name='name') + return d + + +@mock.patch('blobxfer.operations.md5.LocalFileMd5Offload') +@mock.patch('blobxfer.operations.azure.blob.list_blobs') +@mock.patch( + 'blobxfer.operations.download.Downloader.ensure_local_destination', + return_value=True +) +@mock.patch( + 'blobxfer.operations.download.Downloader.' + 'create_unique_transfer_operation_id', + return_value='id' +) +@mock.patch( + 'blobxfer.operations.download.Downloader._wait_for_transfer_threads', + return_value=None +) +@mock.patch( + 'blobxfer.operations.download.Downloader._wait_for_disk_threads', + return_value=None +) +def test_start( + patched_wdt, patched_wtt, patched_cutoi, patched_eld, patched_lb, + patched_lfmo, tmpdir): + patched_lfmo._check_thread = mock.MagicMock() + + b = azure.storage.blob.models.Blob(name='remote/path/name') b.properties.content_length = 1 patched_lb.side_effect = [[b]] - d._pre_md5_skip_on_check = mock.MagicMock() - d._check_download_conditions = mock.MagicMock() + d = _create_downloader_for_start(tmpdir) d._check_download_conditions.return_value = ops.DownloadAction.Skip + d._download_sofar = -1 + d._download_bytes_sofar = -1 d.start() assert d._pre_md5_skip_on_check.call_count == 0 patched_lb.side_effect = [[b]] - d._all_remote_files_processed = False + d = _create_downloader_for_start(tmpdir) d._check_download_conditions.return_value = ops.DownloadAction.CheckMd5 + d._download_sofar = -1 with pytest.raises(RuntimeError): d.start() + d._download_terminate = True assert d._pre_md5_skip_on_check.call_count == 1 b.properties.content_length = 0 patched_lb.side_effect = [[b]] - d._all_remote_files_processed = False + d = _create_downloader_for_start(tmpdir) d._check_download_conditions.return_value = ops.DownloadAction.Download with pytest.raises(RuntimeError): d.start() - assert d._download_queue.qsize() == 1 + d._download_terminate = True + assert d._transfer_queue.qsize() == 1 # test exception count b = azure.storage.blob.models.Blob(name='name') b.properties.content_length = 1 patched_lb.side_effect = [[b]] + d = _create_downloader_for_start(tmpdir) d._spec.destination.is_dir = False d._spec.options.rename = True - d._pre_md5_skip_on_check = mock.MagicMock() - d._check_download_conditions = mock.MagicMock() d._check_download_conditions.return_value = ops.DownloadAction.Skip d._exceptions = [RuntimeError('oops')] with pytest.raises(RuntimeError): d.start() + d._download_terminate = True assert d._pre_md5_skip_on_check.call_count == 0 @@ -776,11 +889,11 @@ def test_start_keyboard_interrupt(): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) d._general_options.resume_file = None d._run = mock.MagicMock(side_effect=KeyboardInterrupt) - d._wait_for_download_threads = mock.MagicMock() + d._wait_for_transfer_threads = mock.MagicMock() d._cleanup_temporary_files = mock.MagicMock() d._md5_offload = mock.MagicMock() with pytest.raises(KeyboardInterrupt): d.start() - assert d._wait_for_download_threads.call_count == 1 + assert d._wait_for_transfer_threads.call_count == 1 assert d._cleanup_temporary_files.call_count == 1 diff --git a/tests/test_blobxfer_operations_md5.py b/tests/test_blobxfer_operations_md5.py index 5bd7b20..02be647 100644 --- a/tests/test_blobxfer_operations_md5.py +++ b/tests/test_blobxfer_operations_md5.py @@ -57,6 +57,8 @@ def test_finalize_md5_processes(): def test_from_add_to_done_non_pagealigned(tmpdir): file = tmpdir.join('a') file.write('abc') + fpath = str(file) + key = 'key' remote_md5 = 
ops.compute_md5_for_file_asbase64(str(file)) @@ -67,7 +69,7 @@ def test_from_add_to_done_non_pagealigned(tmpdir): assert result is None a.add_localfile_for_md5_check( - str(file), remote_md5, azmodels.StorageModes.Block) + key, fpath, fpath, remote_md5, azmodels.StorageModes.Block, None) i = 33 checked = False while i > 0: @@ -76,9 +78,11 @@ def test_from_add_to_done_non_pagealigned(tmpdir): time.sleep(0.3) i -= 1 continue - assert len(result) == 2 - assert result[0] == str(file) - assert result[1] + assert len(result) == 4 + assert result[0] == key + assert result[1] == str(file) + assert result[2] is None + assert result[3] checked = True break assert checked @@ -90,6 +94,8 @@ def test_from_add_to_done_non_pagealigned(tmpdir): def test_from_add_to_done_pagealigned(tmpdir): file = tmpdir.join('a') file.write('abc') + fpath = str(file) + key = 'key' remote_md5 = ops.compute_md5_for_file_asbase64(str(file), True) @@ -100,7 +106,7 @@ def test_from_add_to_done_pagealigned(tmpdir): assert result is None a.add_localfile_for_md5_check( - str(file), remote_md5, azmodels.StorageModes.Page) + key, fpath, fpath, remote_md5, azmodels.StorageModes.Page, None) i = 33 checked = False while i > 0: @@ -109,9 +115,11 @@ def test_from_add_to_done_pagealigned(tmpdir): time.sleep(0.3) i -= 1 continue - assert len(result) == 2 - assert result[0] == str(file) - assert result[1] + assert len(result) == 4 + assert result[0] == key + assert result[1] == str(file) + assert result[2] is None + assert result[3] checked = True break assert checked diff --git a/tests/test_blobxfer_operations_progress.py b/tests/test_blobxfer_operations_progress.py index 75f9f79..721501e 100644 --- a/tests/test_blobxfer_operations_progress.py +++ b/tests/test_blobxfer_operations_progress.py @@ -13,12 +13,12 @@ import blobxfer.operations.progress as ops -def test_output_download_parameters(): +def test_output_parameters(): go = mock.MagicMock() spec = mock.MagicMock() go.log_file = 'abc' - ops.output_download_parameters(go, spec) + ops.output_parameters(go, spec) assert util.is_not_empty(go.log_file) diff --git a/tests/test_blobxfer_operations_resume.py b/tests/test_blobxfer_operations_resume.py index 52f11b8..9894d3b 100644 --- a/tests/test_blobxfer_operations_resume.py +++ b/tests/test_blobxfer_operations_resume.py @@ -2,6 +2,10 @@ """Tests for operations resume""" # stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock try: import pathlib2 as pathlib except ImportError: # noqa @@ -23,23 +27,28 @@ def test_download_resume_manager(tmpdir): assert drm._data is None assert not tmpdb.exists() + ase = mock.MagicMock() + ase._name = 'name' + ase._client.primary_endpoint = 'ep' + ase._size = 16 + final_path = 'fp' drm = ops.DownloadResumeManager(tmpdb) - drm.add_or_update_record(final_path, 'tp', 1, 2, 0, False, None) - d = drm.get_record(final_path) + drm.add_or_update_record(final_path, ase, 2, 0, False, None) + d = drm.get_record(ase) assert d.final_path == final_path - drm.add_or_update_record(final_path, 'tp', 1, 2, 1, False, 'abc') - d = drm.get_record(final_path) + drm.add_or_update_record(final_path, ase, 2, 1, False, 'abc') + d = drm.get_record(ase) assert d.final_path == final_path assert not d.completed assert d.next_integrity_chunk == 1 assert d.md5hexdigest == 'abc' - drm.add_or_update_record(final_path, 'tp', 1, 2, 1, True, None) - d = drm.get_record(final_path) + drm.add_or_update_record(final_path, ase, 2, 1, True, None) + d = drm.get_record(ase) assert d.final_path == final_path assert 
d.completed @@ -47,8 +56,8 @@ def test_download_resume_manager(tmpdir): assert d.md5hexdigest == 'abc' # idempotent check after completed - drm.add_or_update_record(final_path, 'tp', 1, 2, 1, True, None) - d = drm.get_record(final_path) + drm.add_or_update_record(final_path, ase, 2, 1, True, None) + d = drm.get_record(ase) assert d.final_path == final_path assert d.completed diff --git a/tests/test_blobxfer_retry.py b/tests/test_blobxfer_retry.py index 9d84b90..d44fa21 100644 --- a/tests/test_blobxfer_retry.py +++ b/tests/test_blobxfer_retry.py @@ -17,26 +17,34 @@ def test_exponentialretrywithmaxwait(): er = retry.ExponentialRetryWithMaxWait( initial_backoff=1, max_backoff=0) + with pytest.raises(ValueError): + er = retry.ExponentialRetryWithMaxWait( + initial_backoff=1, max_backoff=1, max_retries=-1) + + with pytest.raises(ValueError): + er = retry.ExponentialRetryWithMaxWait( + initial_backoff=2, max_backoff=1) + er = retry.ExponentialRetryWithMaxWait() context = mock.MagicMock() context.count = 0 context.response.status = 500 bo = er.retry(context) assert context.count == 1 - assert bo == 1 + assert bo == 0.1 bo = er.retry(context) assert context.count == 2 - assert bo == 2 + assert bo == 0.2 bo = er.retry(context) assert context.count == 3 - assert bo == 4 + assert bo == 0.4 bo = er.retry(context) assert context.count == 4 - assert bo == 8 + assert bo == 0.8 bo = er.retry(context) - assert context.count == 1 - assert bo == 1 + assert context.count == 5 + assert bo == 0.1 diff --git a/tox.ini b/tox.ini index 58a6df6..d05615f 100644 --- a/tox.ini +++ b/tox.ini @@ -4,8 +4,8 @@ envlist = py27, py35 [testenv] deps = -rtest_requirements.txt commands = - #flake8 {envsitepackagesdir}/blobxfer_cli/ - #flake8 {envsitepackagesdir}/blobxfer/ + flake8 {envsitepackagesdir}/blobxfer_cli/ + flake8 {envsitepackagesdir}/blobxfer/ py.test \ -x -l -s \ --ignore venv/ \ From 2999d0bc297faa68e2c64cb60abfb460a4ffa759 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 1 Jun 2017 08:21:11 -0700 Subject: [PATCH 41/47] Fix vectored replica mode - Fix MD5 check condition --- blobxfer/models/metadata.py | 20 +++++++++++- blobxfer/models/upload.py | 5 ++- blobxfer/operations/download.py | 14 +++----- blobxfer/operations/upload.py | 38 +++++++++++++--------- tests/test_blobxfer_operations_download.py | 3 ++ 5 files changed, 53 insertions(+), 27 deletions(-) diff --git a/blobxfer/models/metadata.py b/blobxfer/models/metadata.py index 7d5ea0d..ead4b79 100644 --- a/blobxfer/models/metadata.py +++ b/blobxfer/models/metadata.py @@ -88,6 +88,24 @@ ) +def get_md5_from_metadata(ase): + # type: (blobxfer.models.azure.StorageEntity) -> str + """Get MD5 from properties or metadata + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :rtype: str or None + :return: md5 + """ + # if encryption metadata is present, check for pre-encryption + # md5 in blobxfer extensions + md5 = None + if ase.is_encrypted: + md5 = ase.encryption_metadata.blobxfer_extensions.\ + pre_encrypted_content_md5 + if blobxfer.util.is_none_or_empty(md5): + md5 = ase.md5 + return md5 + + def generate_fileattr_metadata(local_path, metadata): # type: (blobxfer.models.upload.LocalPath, dict) -> dict """Generate file attribute metadata dict @@ -159,7 +177,7 @@ def restore_fileattr(path, metadata): def create_vectored_io_next_entry(ase): - # type: (blobxfer.models.upload.LocalPath) -> str + # type: (blobxfer.models.azure.StorageEntity) -> str """Create Vectored IO next entry id :param blobxfer.models.azure.StorageEntity ase: Azure Storage 
Entity :rtype: str diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py index d411bb0..d5151b4 100644 --- a/blobxfer/models/upload.py +++ b/blobxfer/models/upload.py @@ -361,7 +361,7 @@ def __init__(self, lpath, ase, uid, options, resume_mgr): self._total_chunks = self._compute_total_chunks(self._chunk_size) self._outstanding_ops = self._total_chunks if blobxfer.util.is_not_empty(self._ase.replica_targets): - self._outstanding_ops *= len(self._ase.replica_targets) + self._outstanding_ops *= len(self._ase.replica_targets) + 1 if self._resume_mgr: self._completed_chunks = bitstring.BitArray( length=self._total_chunks) @@ -577,6 +577,9 @@ def _compute_remote_size(self): else: allocatesize = 0 self._ase.size = allocatesize + if blobxfer.util.is_not_empty(self._ase.replica_targets): + for rt in self._ase.replica_targets: + rt.size = allocatesize logger.debug('remote size for {} is {} bytes'.format( self._ase.path, self._ase.size)) diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index a369d0f..3cbef61 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -45,6 +45,7 @@ # non-stdlib imports # local imports import blobxfer.models.crypto +import blobxfer.models.metadata import blobxfer.operations.azure.blob import blobxfer.operations.azure.file import blobxfer.operations.crypto @@ -237,8 +238,8 @@ def _check_download_conditions(self, lpath, rfile): lpath, rfile.path)) return DownloadAction.Skip # check skip on options, MD5 match takes priority - if (self._spec.skip_on.md5_match and - blobxfer.util.is_not_empty(rfile.md5)): + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) + if self._spec.skip_on.md5_match and blobxfer.util.is_not_empty(md5): return DownloadAction.CheckMd5 # if neither of the remaining skip on actions are activated, download if (not self._spec.skip_on.filesize_match and @@ -277,14 +278,7 @@ def _pre_md5_skip_on_check(self, lpath, rfile): :param pathlib.Path lpath: local path :param blobxfer.models.azure.StorageEntity rfile: remote file """ - # if encryption metadata is present, check for pre-encryption - # md5 in blobxfer extensions - md5 = None - if rfile.encryption_metadata is not None: - md5 = rfile.encryption_metadata.blobxfer_extensions.\ - pre_encrypted_content_md5 - if md5 is None: - md5 = rfile.md5 + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) key = blobxfer.operations.download.Downloader.\ create_unique_transfer_operation_id(rfile) with self._md5_meta_lock: diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 02447c9..232e8ba 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -46,7 +46,11 @@ # non-stdlib imports # local imports import blobxfer.models.crypto +import blobxfer.models.metadata import blobxfer.operations.azure.blob +import blobxfer.operations.azure.blob.append +import blobxfer.operations.azure.blob.block +import blobxfer.operations.azure.blob.page import blobxfer.operations.azure.file import blobxfer.operations.crypto import blobxfer.operations.md5 @@ -86,9 +90,9 @@ def __init__(self, general_options, creds, spec): self._upload_set = set() self._upload_start_time = None self._disk_threads = [] - self._upload_total = None + self._upload_total = 0 self._upload_sofar = 0 - self._upload_bytes_total = None + self._upload_bytes_total = 0 self._upload_bytes_sofar = 0 self._upload_terminate = False self._transfer_lock = threading.Lock() @@ -218,14 +222,7 @@ def _pre_md5_skip_on_check(self, src, rfile): :param 
blobxfer.models.upload.LocalPath src: local path :param blobxfer.models.azure.StorageEntity rfile: remote file """ - # if encryption metadata is present, check for pre-encryption - # md5 in blobxfer extensions - md5 = None - if rfile.encryption_metadata is not None: - md5 = rfile.encryption_metadata.blobxfer_extensions.\ - pre_encrypted_content_md5 - if md5 is None: - md5 = rfile.md5 + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) key = blobxfer.operations.upload.Uploader.create_unique_id(src, rfile) with self._md5_meta_lock: self._md5_map[key] = (src, rfile) @@ -786,8 +783,8 @@ def _check_upload_conditions(self, local_path, rfile): rfile.path, lpath)) return UploadAction.Skip # check skip on options, MD5 match takes priority - if (self._spec.skip_on.md5_match and - blobxfer.util.is_not_empty(rfile.md5)): + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) + if self._spec.skip_on.md5_match and blobxfer.util.is_not_empty(md5): return UploadAction.CheckMd5 # if neither of the remaining skip on actions are activated, upload if (not self._spec.skip_on.filesize_match and @@ -991,7 +988,17 @@ def _vectorize_and_bind(self, local_path, dest): yield action, local_path, ase else: primary_ase = dst[0] + if primary_ase.replica_targets is None: + primary_ase.replica_targets = [] primary_ase.replica_targets.extend(dst[1:]) + # add replica targets to deletion exclusion set + if self._spec.options.delete_extraneous_destination: + for rt in primary_ase.replica_targets: + self._delete_exclude.add( + blobxfer.operations.upload.Uploader. + create_deletion_id( + rt._client, rt.container, rt.name) + ) yield action, local_path, primary_ase else: for _, ase in dest: @@ -1019,7 +1026,8 @@ def _run(self): self._md5_offload.initialize_check_thread( self._check_for_uploads_from_md5) # initialize crypto processes - if self._general_options.concurrency.crypto_processes > 0: + if (self._spec.options.rsa_public_key is not None and + self._general_options.concurrency.crypto_processes > 0): logger.warning( 'crypto offload for upload is not possible due to ' 'sequential nature of {} and FullBlob encryption mode'.format( @@ -1033,8 +1041,6 @@ def _run(self): skipped_files = 0 skipped_size = 0 approx_total_bytes = 0 - self._upload_total = 0 - self._upload_bytes_total = 0 if not self._spec.sources.can_rename() and self._spec.options.rename: raise RuntimeError( 'cannot rename to specified destination with multiple sources') @@ -1056,6 +1062,8 @@ def _run(self): skipped_size += ase.size if ase.size is not None else 0 continue approx_total_bytes += lp.size + if blobxfer.util.is_not_empty(ase.replica_targets): + approx_total_bytes += lp.size * len(ase.replica_targets) # add to potential upload set uid = blobxfer.operations.upload.Uploader.create_unique_id( lp, ase) diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py index fcc2865..90b83fe 100644 --- a/tests/test_blobxfer_operations_download.py +++ b/tests/test_blobxfer_operations_download.py @@ -189,6 +189,7 @@ def test_check_download_conditions(tmpdir): d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) rfile = mock.MagicMock() rfile.md5 = 'abc' + rfile._encryption = None result = d._check_download_conditions(ep, rfile) assert result == ops.DownloadAction.CheckMd5 @@ -238,6 +239,7 @@ def test_check_download_conditions(tmpdir): rfile = azmodels.StorageEntity('cont') rfile._size = util.page_align_content_length(ep.stat().st_size) rfile._mode = azmodels.StorageModes.Page + rfile._encryption = None 
result = d._check_download_conditions(ep, rfile) assert result == ops.DownloadAction.Skip @@ -269,6 +271,7 @@ def test_check_download_conditions(tmpdir): rfile = azmodels.StorageEntity('cont') rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) + \ datetime.timedelta(days=1) + rfile._encryption = None result = d._check_download_conditions(ep, rfile) assert result == ops.DownloadAction.Download From 6267551cc7e96b06d0c8e59559d9621601832faa Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 1 Jun 2017 13:02:31 -0700 Subject: [PATCH 42/47] Add base documentation - Update README - Rev version to 1.0.0a2 --- CHANGELOG.md | 93 ++++++++++++++++----------- README.md | 57 ++++++++++++++-- blobxfer/version.py | 2 +- cli/cli.py | 4 +- docs/01-installation.md | 72 +++++++++++++++++++++ docs/10-cli-usage.md | 30 +++++++++ docs/20-yaml-configuration.md | 3 + docs/30-vectored-io.md | 3 + docs/40-client-side-encryption.md | 29 +++++++++ docs/80-blobxfer-python-library.md | 3 + docs/98-performance-considerations.md | 68 ++++++++++++++++++++ docs/99-current-limitations.md | 24 +++++++ docs/index.md | 16 +++++ 13 files changed, 356 insertions(+), 48 deletions(-) create mode 100644 docs/01-installation.md create mode 100644 docs/10-cli-usage.md create mode 100644 docs/20-yaml-configuration.md create mode 100644 docs/30-vectored-io.md create mode 100644 docs/40-client-side-encryption.md create mode 100644 docs/80-blobxfer-python-library.md create mode 100644 docs/98-performance-considerations.md create mode 100644 docs/99-current-limitations.md create mode 100644 docs/index.md diff --git a/CHANGELOG.md b/CHANGELOG.md index d1e4233..3abb9b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,158 +2,172 @@ ## [Unreleased] +## [1.0.0a2] - 2017-06-XX +### Changed +- From scratch rewrite providing a consistent CLI experience and a vast +array of new and advanced features. Please see the +[1.0.0 Milestone](https://github.com/Azure/blobxfer/milestone/1) for a +catalog of changes. 
+- All dependencies updated to latest + +### Removed +- Azure Service Management certificate support + +### Security +- Update cryptography requirement to 1.9 + ## [0.12.1] - 2016-12-09 -#### Changed +### Changed - Update all dependencies to latest versions -#### Fixed +### Fixed - Allow page blobs up to 1TB -#### Security +### Security - Update cryptography requirement to 1.6 ## [0.12.0] - 2016-10-17 -#### Added +### Added - Support for Account-level SAS keys - Update README regarding non-normalized exceptions being thrown (#5) ## [0.11.5] - 2016-10-03 -#### Changed +### Changed - Update all dependencies to latest versions -#### Fixed +### Fixed - Fix incorrect fileshare path splitting (#3) -#### Security +### Security - Update cryptography requirement to 1.5.2 ## [0.11.4] - 2016-09-12 -#### Added +### Added - Created [Docker image](https://hub.docker.com/r/alfpark/blobxfer) -#### Changed +### Changed - Update all dependencies to latest versions -#### Fixed +### Fixed - Fix `--delete` and blob listing with azure-storage (#1) -#### Security +### Security - Update cryptography requirement to 1.5 ## [0.11.2] - 2016-07-28 -#### Added +### Added - Allow rsakeypassphrase to be passed as an environment variable ## 0.11.1 - 2016-07-05 -#### Added +### Added - Allow storage account or sas key credentials to be passed as environment variables ## 0.11.0 - 2016-06-09 -#### Added +### Added - Azure Files support, please refer to the General Notes section for limitations -#### Changed +### Changed - `--blobep` option has been renamed to `--endpoint` ## 0.10.1 - 2016-06-06 -#### Changed +### Changed - Update all dependencies to latest versions - Add flag for block/page level md5 computation which is now disabled by default -#### Fixed +### Fixed - Update against breaking changes from azure-storage 0.32.0 -#### Removed +### Removed - Remove RC designation from encryption/decryption functionality -#### Security +### Security - Update cryptography requirement to 1.4 ## 0.10.0 - 2016-03-22 -#### Added +### Added - Added ``--disable-urllib-warnings`` option to suppress urllib3 warnings (use with care) -#### Changed +### Changed - Update script for compatibility with azure-storage 0.30.0 which is now a required dependency - Promote encryption to RC status - `--blobep` now refers to endpoint suffix rather than blob endpoint (e.g., core.windows.net rather than blob.core.windows.net) -#### Security +### Security - Update cryptography requirement to 1.3 ## 0.9.9.11 - 2016-02-22 -#### Changed +### Changed - Pin azure dependencies due to breaking changes -#### Fixed +### Fixed - Minor bug fixes -#### Security +### Security - Update cryptography requirement to 1.2.2 ## 0.9.9.10 - 2016-01-31 -#### Fixed +### Fixed - Fix regression in blob name encoding with Python3 ## 0.9.9.9 - 2016-01-29 -#### Added +### Added - Emit warning when attempting to use remoteresource with a directory upload -#### Changed +### Changed - Update setup.py dependencies to latest available versions -#### Fixed +### Fixed - Fix regression in single file upload and remoteresource renaming - Replace socket exception handling with requests ConnectionError handling - Properly handle blob names containing `?` if using SAS ## 0.9.9.8 - 2016-01-06 -#### Fixed +### Fixed - Disable unnecessary thread daemonization - Gracefully handle KeyboardInterrupts - Explicitly add azure-common to setup.py install reqs ## 0.9.9.7 - 2016-01-05 -#### Added +### Added - Add python environment and package info to parameter dump to aid issue/bug reports -#### Changed 
+### Changed - Reduce number of default concurrent workers to 3x CPU count - Change azure\_request backoff mechanism -#### Fixed +### Fixed - Make base requirements non-optional in import process - Update azure\_request exception handling to support new Azure Storage Python SDK errors ## 0.9.9.6 - 2016-01-04 -#### Added +### Added - Encryption support - No file overwrite on download option - Auto-detection of file mimetype - Remote delete option - Include pattern option -#### Changed +### Changed - Replace keeprootdir with strip-components option - Reduce the number of default concurrent workers to 4x CPU count -#### Fixed +### Fixed - Fix shared key upload with non-existent container - Fix zero-byte blob download issue ## 0.9.9.5 - 2015-09-27 -#### Added +### Added - File collation support -#### Fixed +### Fixed - Fix page alignment bug - Reduce memory usage @@ -183,7 +197,8 @@ `--no-skiponmatch`. - 0.8.2: performance regression fixes -[Unreleased]: https://github.com/Azure/blobxfer/compare/0.12.1...HEAD +[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a2...HEAD +[1.0.0a2]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a2 [0.12.1]: https://github.com/Azure/blobxfer/compare/0.12.0...0.12.1 [0.12.0]: https://github.com/Azure/blobxfer/compare/0.11.5...0.12.0 [0.11.5]: https://github.com/Azure/blobxfer/compare/0.11.4...0.11.5 diff --git a/README.md b/README.md index 6853cb4..a2cf2b9 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,57 @@ -blobxfer -======== +[![Build Status](https://travis-ci.org/Azure/blobxfer.svg?branch=master)](https://travis-ci.org/Azure/blobxfer) +[![Coverage Status](https://coveralls.io/repos/github/Azure/blobxfer/badge.svg?branch=master)](https://coveralls.io/github/Azure/blobxfer?branch=master) +[![PyPI](https://img.shields.io/pypi/v/blobxfer.svg)](https://pypi.python.org/pypi/blobxfer) +[![PyPI](https://img.shields.io/pypi/pyversions/blobxfer.svg)](https://pypi.python.org/pypi/blobxfer) +[![Docker Pulls](https://img.shields.io/docker/pulls/alfpark/blobxfer.svg)](https://hub.docker.com/r/alfpark/blobxfer) +[![Image Layers](https://images.microbadger.com/badges/image/alfpark/blobxfer:latest.svg)](http://microbadger.com/images/alfpark/blobxfer) -AzCopy-like OS independent Azure storage blob and file share transfer tool +# blobxfer +`blobxfer` is an advanced data movement tool and library for Azure Storage +Blob and Files. With `blobxfer` you can copy your files into or out of Azure +Storage with the CLI or integrate the `blobxfer` data movement library into +your own Python scripts. 
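The CLI exposes each transfer action as a subcommand: `upload`, `download`, and `synccopy` (the latter is not yet implemented in this release). Assuming the `blobxfer` console entry point from a `pip` install is on your path, the built-in help enumerates every option:

```shell
# top-level commands and global options
blobxfer --help

# per-action options
blobxfer upload --help
blobxfer download --help
```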
-Change Log ----------- +## Major Features +* Command-line interface (CLI) providing data movement capability to and +from Azure Blob and File Storage +* High-performance design with asynchronous transfers and disk I/O +* YAML configuration driven execution support +* Resume support +* Vectored IO + * `stripe` mode allows striping a single file across multiple blobs to + break through single blob or fileshare throughput limits including + multi-storage account destinations + * `replica` mode allows replication of a file across multiple locations + including multi-storage account destinations +* Client-side encryption support +* Advanced skip options for rsync-like operations +* Store/restore POSIX filemode and uid/gid +* `stdin` piping support +* Append blob support +* Configurable one-shot block upload support +* Block (chunk) size selection support +* Rsync-like `--delete-after` support +* Support for reading from blob snapshots +* Automatic block blob size adjustment for uploading +* Automatic uploading of VHD and VHDX files as page blobs +* Wildcard filtering with include and exclude support +* No clobber support in either direction +* File logging support -See the [CHANGELOG.md](https://github.com/Azure/blobxfer/blob/master/CHANGELOG.md) file. +## Installation +`blobxfer` is on [PyPI](https://pypi.python.org/pypi/blobxfer) and on +[Docker Hub](https://hub.docker.com/r/alfpark/blobxfer/). Please refer to +the [installation guide](https://github.com/Azure/blobxfer/blob/master/docs/01-installation.md) +on how to install `blobxfer`. + +## Documentation +Please refer to the [blobxfer Documentation](https://github.com/Azure/blobxfer/blob/master/docs) +for more details and usage information. + +## Change Log +For recent changes, please refer to the +[CHANGELOG.md](https://github.com/Azure/blobxfer/blob/master/CHANGELOG.md) +file. ------------------------------------------------------------------------ diff --git a/blobxfer/version.py b/blobxfer/version.py index 9e8b65b..6280e0b 100644 --- a/blobxfer/version.py +++ b/blobxfer/version.py @@ -22,4 +22,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -__version__ = '1.0.0a1' +__version__ = '1.0.0a2' diff --git a/cli/cli.py b/cli/cli.py index ec4f3f9..9b8c788 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -771,12 +771,12 @@ def synccopy( ctx, local_resource, storage_account, remote_path, sync_copy_dest_storage_account, sync_copy_dest_remote_path): """Synchronously copy blobs between Azure Storage accounts""" + raise NotImplementedError() settings.add_cli_options( ctx.cli_options, settings.TransferAction.Synccopy, local_resource, storage_account, remote_path, sync_copy_dest_storage_account, sync_copy_dest_remote_path) ctx.initialize() - raise NotImplementedError() @cli.command('upload') @@ -827,10 +827,10 @@ def useconfig_download(ctx, config): def useconfig_synccopy(ctx, config): """Synchronously copy blobs between Azure Storage accounts via yaml configuration""" + raise NotImplementedError() settings.add_cli_options( ctx.cli_options, settings.TransferAction.Synccopy, None, None, None) ctx.initialize() - raise NotImplementedError() @useconfig.command('upload') diff --git a/docs/01-installation.md b/docs/01-installation.md new file mode 100644 index 0000000..cffe6c6 --- /dev/null +++ b/docs/01-installation.md @@ -0,0 +1,72 @@ +# blobxfer Installation +`blobxfer` is a pure Python package, however, some dependencies require a C +compiler and supporting libraries if there is no binary wheel. 
Please follow +the pre-requisites section first prior to invoking installation via `pip`. +Alternatively, you can use the +[blobxfer Docker image](https://hub.docker.com/r/alfpark/blobxfer/). + +## Pre-requisites +`blobxfer` depends on `cryptography` and `ruamel.yaml` which require a +C compiler if your platform does not have a pre-made binary wheel. Please +follow the instructions below for your platform. + +### Ubuntu +```shell +apt-get update +# for Python3 (recommended) +apt-get install -y build-essential libssl-dev libffi-dev python3-dev python3-pip +# for Python2 +apt-get install -y build-essential libssl-dev libffi-dev python-dev python-pip +``` + +### CentOS/RHEL +```shell +# for Python2 +yum install -y gcc openssl-dev libffi-devel python-devel +curl -fSsL https://bootstrap.pypa.io/get-pip.py | python +``` + +### SLES/OpenSUSE +```shell +zypper ref +# for Python2 +zypper -n in gcc libopenssl-devel libffi48-devel python-devel +curl -fSsL https://bootstrap.pypa.io/get-pip.py | python +``` + +## Installation via `pip` +[blobxfer](https://pypi.python.org/pypi/blobxfer) is on PyPI and can be +installed via: + +```shell +# for Python2 +pip install blobxfer +# for Python3 +pip3 instlal blobxfer +``` + +`blobxfer` is compatible with Python 2.7 and 3.3+. To install for Python 3 +(which is recommended), some distributions may use `pip3` instead of `pip`. +Installing into your user area via `--user` or via a virtual environment +is recommended to avoid installation issues with system-wide Python +packages. + +## Installation via Docker +[blobxfer](https://hub.docker.com/r/alfpark/blobxfer/) is also on Docker +Hub and can be retrieved via: + +```shell +docker pull alfpark/blobxfer +``` + +## Troubleshooting +#### `azure.storage` dependency not found +If you get an error that `azure.storage` cannot be found or loaded this means +that there was an issue installing this package with other `azure` packages +that share the same base namespace. You can correct this by issuing: +```shell +# for Python2 +pip install azure-storage +# for Python3 +pip3 install azure-storage +``` diff --git a/docs/10-cli-usage.md b/docs/10-cli-usage.md new file mode 100644 index 0000000..9c9f111 --- /dev/null +++ b/docs/10-cli-usage.md @@ -0,0 +1,30 @@ +# blobxfer Command-Line Usage + +## TODO + + +### General Notes +* `blobxfer` does not take any leases on blobs or containers. It is up to the +user to ensure that blobs are not modified while download/uploads are being +performed. +* No validation is performed regarding container and file naming and length +restrictions. +* `blobxfer` will attempt to download from blob storage as-is. If the source +filename is incompatible with the destination operating system, then failure +may result. +* When using SAS, the SAS key must be a container- or share-level SAS if +performing recursive directory upload or container/file share download. +* If uploading via service-level SAS keys, the container or file share must +already be created in Azure storage prior to upload. Account-level SAS keys +with the signed resource type of `c` (i.e., container-level permission) is +required for to allow conatiner or file share creation. +* When uploading files as page blobs, the content is page boundary +byte-aligned. The MD5 for the blob is computed using the final aligned data +if the source is not page boundary byte-aligned. This enables these page +blobs or files to be skipped during subsequent download or upload with the +appropriate `skip_on` option, respectively. 
+* Globbing of wildcards must be disabled by your shell (or properly quoted) +during invoking `blobxfer` such that include and exclude patterns can be +read verbatim without the shell expanding the wildcards. +* The `--delete` operates similarly to `--delete-after` in rsync. Please +note that this option interacts with `--include` and `--exclude` filters. diff --git a/docs/20-yaml-configuration.md b/docs/20-yaml-configuration.md new file mode 100644 index 0000000..000e01f --- /dev/null +++ b/docs/20-yaml-configuration.md @@ -0,0 +1,3 @@ +# blobxfer YAML Configuration + +## TODO diff --git a/docs/30-vectored-io.md b/docs/30-vectored-io.md new file mode 100644 index 0000000..0eb67fd --- /dev/null +++ b/docs/30-vectored-io.md @@ -0,0 +1,3 @@ +# blobxfer Vectored IO + +## TODO diff --git a/docs/40-client-side-encryption.md b/docs/40-client-side-encryption.md new file mode 100644 index 0000000..e16d87a --- /dev/null +++ b/docs/40-client-side-encryption.md @@ -0,0 +1,29 @@ +# blobxfer Client-side Encryption Notes +Please read the following carefully regarding client-side encryption support +in `blobxfer`. Additionally, current limitations for client-side encryption +can be found [here](99-current-limitations.md). + +* Encryption is performed using AES256-CBC. MACs are generated using +HMAC-SHA256. +* All required information regarding the encryption process is stored on +each blob's `encryptiondata` and `encryptiondata_authentication` metadata +fields. These metadata entries are used on download to configure the proper +download parameters for the decryption process as well as to authenticate +the `encryptiondata` metadata and the encrypted entity. Encryption metadata +set by `blobxfer` (or any Azure Storage SDK) should not be modified or +the blob/file may be unrecoverable. +* Keys for the AES256 block cipher are generated on a per-blob/file basis. +These keys are encrypted using RSAES-OAEP and encoded in the metadata. +* MD5 for both the pre-encrypted and encrypted version of the file is stored +in the entity metadata, if enabled. `skip_on` options will still work +transparently with encrypted blobs/files. +* MAC integrity checks are preferred over MD5 to validate encrypted data. +* Attempting to upload the same file that exists in Azure Storage, but the +file in Azure Storage is not encrypted will not occur if any `skip_on` match +condition succeeds. This behavior can be overridden by deleting the target +file in Azure Storage or disabling the `skip_on` behavior. +* Attempting to upload the same file as an encrypted blob with a different +RSA key will not occur if the file content MD5 is the same. This behavior +can be overridden by deleting the target file in Azure Storage or disabling +the `skip_on` `md5_match` behavior. +* Zero-byte files are not encrypted. diff --git a/docs/80-blobxfer-python-library.md b/docs/80-blobxfer-python-library.md new file mode 100644 index 0000000..e0d74a2 --- /dev/null +++ b/docs/80-blobxfer-python-library.md @@ -0,0 +1,3 @@ +# blobxfer Python Library + +## TODO diff --git a/docs/98-performance-considerations.md b/docs/98-performance-considerations.md new file mode 100644 index 0000000..6ade0a2 --- /dev/null +++ b/docs/98-performance-considerations.md @@ -0,0 +1,68 @@ +# blobxfer Performance Considerations +Please read the following carefully regarding considerations that should +be applied with regard to performance and `blobxfer`. 
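The client-side encryption scheme described above can be made concrete with a short sketch using the `cryptography` package: AES-256-CBC for the content, HMAC-SHA256 over the encrypted bytes, and the per-blob keys wrapped with RSAES-OAEP for placement in entity metadata. Function and variable names, as well as the OAEP hash parameters, are illustrative only; this is not blobxfer's actual implementation:

```python
import os

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import hashes, hmac, padding
from cryptography.hazmat.primitives.asymmetric import padding as rsa_padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes


def encrypt_chunk(data, rsa_public_key):
    """Illustrative AES-256-CBC + HMAC-SHA256 client-side encryption."""
    sym_key = os.urandom(32)   # per-blob AES-256 content encryption key
    sign_key = os.urandom(32)  # per-blob HMAC-SHA256 signing key
    iv = os.urandom(16)
    # PKCS7-pad the plaintext and encrypt with AES-256-CBC
    padder = padding.PKCS7(algorithms.AES.block_size).padder()
    padded = padder.update(data) + padder.finalize()
    encryptor = Cipher(
        algorithms.AES(sym_key), modes.CBC(iv),
        backend=default_backend()).encryptor()
    encdata = encryptor.update(padded) + encryptor.finalize()
    # authenticate the encrypted bytes (MAC is preferred over MD5)
    signer = hmac.HMAC(sign_key, hashes.SHA256(), backend=default_backend())
    signer.update(encdata)
    mac = signer.finalize()
    # wrap the content keys with RSAES-OAEP for storage in metadata
    wrapped_keys = rsa_public_key.encrypt(
        sym_key + sign_key,
        rsa_padding.OAEP(
            mgf=rsa_padding.MGF1(hashes.SHA1()),
            algorithm=hashes.SHA1(),
            label=None))
    return encdata, iv, mac, wrapped_keys
```

Decryption reverses the process: unwrap the keys with the RSA private key, verify the HMAC over the encrypted data first, then decrypt and unpad.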
Additionally, +please review the +[Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/) +for an overview of general performance targets that apply to Azure Blobs +and File shares. + +## Concurrency +* `blobxfer` offers four concurrency knobs. Each one should be tuned for +maximum performance according to your system and network characteristics. + 1. MD5 processes: computing MD5 for potential omission from transfer due + to `skip_on` `md5_match` being specified are offloaded to the specified + number of processors. + 2. Crypto processes: decrypting encrypted blobs and files can be offloaded + to the specified number of processors. Due to the inherent + non-parallelizable encryption algorithm used, this is ignored for + encryption (uploads). + 3. Disk threads: concurrency in reading (uploads) and writing (downloads) to + disk is controlled by the number of disk threads. + 4. Transfer threads: concurrency in the number of threads from/to Azure + Storage is controlled by the number of transfer threads. +* The thread concurrency options (disk and transfer) can be set to a +non-positive number to be automatically set as a multiple of the number of +cores available on the machine. + +## Azure File Share Performance +File share performance can be "slow" or become a bottleneck, especially for +file shares containing thousands of files as multiple REST calls must be +performed for each file. Currently, a single file share has a limit of up +to 60 MB/s and 1000 8KB IOPS. Please refer to the +[Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/) +for performance targets and limits regarding Azure Storage File shares. +If scalable high performance is required, consider using Blob storage +instead. + +## MD5 Hashing +MD5 hashing will impose some performance penalties to check if the file +should be uploaded or downloaded. For instance, if uploading and the local +file is determined to be different than it's remote counterpart, then the +time spent performing the MD5 comparison is lost. + +## Client-side Encryption +Client-side encryption will naturally impose a performance penalty on +`blobxfer` both for uploads (encrypting) and downloads (decrypting) depending +upon the processor speed and number of cores available. Additionally, for +uploads, encryption is not parallelizable and is in-lined with the main +process. + +## pyOpenSSL +As of requests 2.6.0 and Python versions < 2.7.9 (i.e., interpreter found on +default Ubuntu 14.04 installations, 16.04 is not affected), if certain +packages are installed, as those found in `requests[security]` then the +underlying urllib3 package will utilize the `ndg-httpsclient` package which +will use `pyOpenSSL`. This will ensure the peers are fully validated. However, +this incurs a rather larger performance penalty. If you understand the +potential security risks for disabling this behavior due to high performance +requirements, you can either remove `ndg-httpsclient` or use `blobxfer` in a +virtualenv environment without the `ndg-httpsclient` package. Python +versions >= 2.7.9 are not affected by this issue. + +Additionally, `urllib3` (which `requests` uses) may use `pyOpenSSL` which +may result in exceptions being thrown that are not normalized by `urllib3`. +This may result in exceptions that should be retried, but are not. 
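The four concurrency knobs map onto the `Concurrency` options object exercised by the updated tests in this patch series (module path assumed to be `blobxfer.models.options`). A minimal construction sketch with purely illustrative values:

```python
import blobxfer.models.options as options

# values are illustrative; per the notes above, non-positive disk and
# transfer thread counts are set automatically from the core count
concurrency = options.Concurrency(
    crypto_processes=0,   # decryption offload (ignored for uploads)
    md5_processes=2,      # skip_on md5_match comparison offload
    disk_threads=16,      # local disk read/write threads
    transfer_threads=32,  # Azure Storage transfer threads
)
```

In the tests, this object is passed to `options.General(concurrency=...)` together with the progress bar, resume file, and log file settings.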
It is +recommended to upgrade your Python where `pyOpenSSL` is not required for +fully validating peers and such that `blobxfer` can operate without +`pyOpenSSL` in a secure fashion. You can also run `blobxfer` via Docker +or in a virtualenv environment without `pyOpenSSL`. diff --git a/docs/99-current-limitations.md b/docs/99-current-limitations.md new file mode 100644 index 0000000..9b8a3f7 --- /dev/null +++ b/docs/99-current-limitations.md @@ -0,0 +1,24 @@ +# blobxfer Current Limitations +Please read this section carefully for any current known limitations to +`blobxfer`. + +### Client-side Encryption +* Client-side encryption is currently only available for block blobs and +Azure Files. +* `stdin` sources cannot be encrypted. +* Azure KeyVault key references are currently not supported. + +### Platform-specific Issues +* File attribute store/restore is not supported on Windows. + +### Resume Support +* Encrypted uploads/downloads cannot currently be resumed as the Python +SHA256 object cannot be pickled. +* Append blobs currently cannot be resumed for upload. + +### Other Limitations +* MD5 is not computed for append blobs. +* Empty directories are not created locally when downloading from an Azure +File share which has empty directories. +* Empty directories are not deleted if `--delete` is specified and no files +remain in the directory on the Azure File share. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..b05fcc4 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,16 @@ +# blobxfer Documentation +`blobxfer` is a transfer tool and library to move data between local file +systems and Azure Storage. `blobxfer` command-line interface is powered by +an advanced, high performance data movement library in Python with the same +name. The `blobxfer` data movement library is built on the +[Azure Storage Python SDK](https://github.com/Azure/azure-storage-python). +Please refer to the following documents detailing the usage of `blobxfer`. + +1. [Installation](01-installation.md) +2. [Command-Line Usage](10-cli-usage.md) +3. [YAML Configuration](20-yaml-configuration.md) +4. [Vectored IO](30-vectored-io.md) +5. [Client-side Encryption](40-client-side-encryption.md) +6. [blobxfer Data Movement Library](80-blobxfer-python-library.md) +7. [Performance Considerations](98-performance-considerations.md) +8. [Current Limitations](99-current-limitations.md) From 0dca02ce83624743abf3a848a5a00209581a5374 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 1 Jun 2017 13:55:54 -0700 Subject: [PATCH 43/47] Doc updates - Remove useconfig command and instead use --config option --- README.md | 19 +- cli/cli.py | 245 ++++++++++---------------- cli/settings.py | 19 +- docs/01-installation.md | 4 +- docs/10-cli-usage.md | 144 ++++++++++++++- docs/98-performance-considerations.md | 22 +-- docs/{index.md => README.md} | 0 7 files changed, 268 insertions(+), 185 deletions(-) rename docs/{index.md => README.md} (100%) diff --git a/README.md b/README.md index a2cf2b9..ef6c462 100644 --- a/README.md +++ b/README.md @@ -14,27 +14,28 @@ your own Python scripts. 
## Major Features * Command-line interface (CLI) providing data movement capability to and from Azure Blob and File Storage +* Standalone library for integration with scripts or other Python packages * High-performance design with asynchronous transfers and disk I/O * YAML configuration driven execution support * Resume support * Vectored IO - * `stripe` mode allows striping a single file across multiple blobs to - break through single blob or fileshare throughput limits including - multi-storage account destinations - * `replica` mode allows replication of a file across multiple locations - including multi-storage account destinations + * `stripe` mode allows striping a single file across multiple blobs (even + to multiple storage accounts) to break through single blob or fileshare + throughput limits + * `replica` mode allows replication of a file across multiple destinations + including to multiple storage accounts * Client-side encryption support * Advanced skip options for rsync-like operations * Store/restore POSIX filemode and uid/gid -* `stdin` piping support +* Read/pipe from `stdin` support * Append blob support * Configurable one-shot block upload support * Block (chunk) size selection support -* Rsync-like `--delete-after` support +* Rsync-like delete support * Support for reading from blob snapshots * Automatic block blob size adjustment for uploading -* Automatic uploading of VHD and VHDX files as page blobs -* Wildcard filtering with include and exclude support +* Automatic uploading of VHD/VHDX files as page blobs +* Include and exclude filtering support * No clobber support in either direction * File logging support diff --git a/cli/cli.py b/cli/cli.py index 9b8c788..7f58c04 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -57,7 +57,6 @@ class CliContext(object): """CliContext class: holds context for CLI commands""" def __init__(self): """Ctor for CliContext""" - self.yaml_config = None self.config = {} self.cli_options = {} self.credentials = None @@ -85,8 +84,8 @@ def _read_yaml_file(self, yaml_file): f, Loader=ruamel.yaml.RoundTripLoader) else: self.config = blobxfer.util.merge_dict( - self.config, ruamel.yaml.load( - f, Loader=ruamel.yaml.RoundTripLoader)) + ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader), + self.config) def _init_config(self): # type: (CliContext) -> None @@ -94,9 +93,9 @@ def _init_config(self): :param CliContext self: this """ # load yaml config file into memory - if blobxfer.util.is_not_empty(self.yaml_config): - self.yaml_config = pathlib.Path(self.yaml_config) - self._read_yaml_file(self.yaml_config) + if blobxfer.util.is_not_empty(self.cli_options['yaml_config']): + yaml_config = pathlib.Path(self.cli_options['yaml_config']) + self._read_yaml_file(yaml_config) else: # merge cli options with config settings.merge_settings(self.config, self.cli_options) @@ -108,7 +107,6 @@ def _init_config(self): blobxfer.util.set_verbose_logger_handlers() logger.debug('config: \n' + json.dumps(self.config, indent=4)) # free mem - del self.yaml_config del self.cli_options @@ -116,6 +114,19 @@ def _init_config(self): pass_cli_context = click.make_pass_decorator(CliContext, ensure=True) +def _config_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['yaml_config'] = value + return value + return click.option( + '--config', + expose_value=False, + help='YAML configuration file', + envvar='BLOBXFER_CONFIG_FILE', + callback=callback)(f) + + def _crypto_processes_option(f): def callback(ctx, param, value): 
clictx = ctx.ensure_object(CliContext) @@ -237,81 +248,61 @@ def callback(ctx, param, value): callback=callback)(f) -def common_options(f): - f = _verbose_option(f) - f = _transfer_threads_option(f) - f = _timeout_option(f) - f = _resume_file_option(f) - f = _progress_bar_option(f) - f = _md5_processes_option(f) - f = _log_file_option(f) - f = _disk_threads_option(f) - f = _crypto_processes_option(f) - return f - - -def _local_resource_argument(f): +def _local_resource_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) - clictx.local_resource = value + clictx.cli_options['local_resource'] = value return value - return click.argument( - 'local-resource', + return click.option( + '--local-resource', + expose_value=False, + help='Local resource', callback=callback)(f) -def _storage_account_argument(f): +def _storage_account_name_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) clictx.cli_options['storage_account'] = value return value - return click.argument( - 'storage-account', + return click.option( + '--storage-account', + expose_value=False, + help='Storage account name', + envvar='BLOBXFER_STORAGE_ACCOUNT_NAME', callback=callback)(f) -def _remote_path_argument(f): +def _remote_path_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) clictx.cli_options['remote_path'] = value return value - return click.argument( - 'remote-path', + return click.option( + '--remote-path', + expose_value=False, + help='Remote path on Azure Storage', callback=callback)(f) -def upload_download_arguments(f): - f = _remote_path_argument(f) - f = _storage_account_argument(f) - f = _local_resource_argument(f) +def common_options(f): + f = _verbose_option(f) + f = _transfer_threads_option(f) + f = _timeout_option(f) + f = _resume_file_option(f) + f = _progress_bar_option(f) + f = _md5_processes_option(f) + f = _log_file_option(f) + f = _disk_threads_option(f) + f = _crypto_processes_option(f) + f = _config_option(f) return f -def _sync_copy_dest_storage_account_argument(f): - def callback(ctx, param, value): - clictx = ctx.ensure_object(CliContext) - clictx.cli_options['sync_copy_dest_storage_account'] = value - return value - return click.argument( - 'sync-copy-dest-storage-account', - callback=callback)(f) - - -def _sync_copy_dest_remote_path_argument(f): - def callback(ctx, param, value): - clictx = ctx.ensure_object(CliContext) - clictx.cli_options['sync_copy_dest_remote_path'] = value - return value - return click.argument( - 'sync-copy-dest-remote-path', - callback=callback)(f) - - -def sync_copy_arguments(f): - f = _sync_copy_dest_remote_path_argument(f) - f = _sync_copy_dest_storage_account_argument(f) - f = _remote_path_argument(f) - f = _storage_account_argument(f) +def upload_download_options(f): + f = _remote_path_option(f) + f = _storage_account_name_option(f) + f = _local_resource_option(f) return f @@ -321,10 +312,10 @@ def callback(ctx, param, value): clictx.cli_options['access_key'] = value return value return click.option( - '--access-key', + '--storage-account-key', expose_value=False, help='Storage account access key', - envvar='BLOBXFER_ACCESS_KEY', + envvar='BLOBXFER_STORAGE_ACCOUNT_KEY', callback=callback)(f) @@ -516,7 +507,7 @@ def callback(ctx, param, value): '--rsa-private-key', expose_value=False, default=None, - help='RSA private key', + help='RSA private key PEM file', envvar='BLOBXFER_RSA_PRIVATE_KEY', callback=callback)(f) @@ -544,7 +535,7 @@ def callback(ctx, param, value): 
'--rsa-public-key', expose_value=False, default=None, - help='RSA public key', + help='RSA public key PEM file', envvar='BLOBXFER_RSA_PUBLIC_KEY', callback=callback)(f) @@ -635,10 +626,35 @@ def callback(ctx, param, value): clictx.cli_options['sync_copy_dest_access_key'] = value return value return click.option( - '--sync-copy-dest-access-key', + '--sync-copy-dest-storage-account-key', expose_value=False, help='Storage account access key for synccopy destination', - envvar='BLOBXFER_SYNC_COPY_DEST_ACCESS_KEY', + envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_KEY', + callback=callback)(f) + + +def _sync_copy_dest_storage_account_name_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_storage_account'] = value + return value + return click.option( + '--sync-copy-dest-storage-account', + expose_value=False, + help='Storage account name for synccopy destination', + envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_NAME', + callback=callback)(f) + + +def _sync_copy_dest_remote_path_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_remote_path'] = value + return value + return click.option( + '--sync-copy-dest-remote-path', + expose_value=False, + help='Remote path on Azure Storage for synccopy destination', callback=callback)(f) @@ -651,7 +667,7 @@ def callback(ctx, param, value): '--sync-copy-dest-sas', expose_value=False, help='Shared access signature for synccopy destination', - envvar='BLOBXFER_SYNC_COPY_SAS', + envvar='BLOBXFER_SYNC_COPY_DEST_SAS', callback=callback)(f) @@ -705,12 +721,16 @@ def download_options(f): def sync_copy_options(f): + f = _sync_copy_dest_storage_account_name_option(f) f = _sync_copy_dest_sas_option(f) + f = _sync_copy_dest_remote_path_option(f) f = _sync_copy_dest_access_key_option(f) + f = _storage_account_name_option(f) f = _skip_on_md5_match_option(f) f = _skip_on_lmt_ge_option(f) f = _skip_on_filesize_match_option(f) f = _sas_option(f) + f = _remote_path_option(f) f = _overwrite_option(f) f = _mode_option(f) f = _include_option(f) @@ -721,21 +741,6 @@ def sync_copy_options(f): return f -def _config_argument(f): - def callback(ctx, param, value): - clictx = ctx.ensure_object(CliContext) - clictx.yaml_config = value - return value - return click.argument( - 'config', - callback=callback)(f) - - -def config_arguments(f): - f = _config_argument(f) - return f - - @click.group(context_settings=_CONTEXT_SETTINGS) @click.version_option(version=blobxfer.__version__) @click.pass_context @@ -745,15 +750,13 @@ def cli(ctx): @cli.command('download') -@upload_download_arguments +@upload_download_options @download_options @common_options @pass_cli_context -def download(ctx, local_resource, storage_account, remote_path): +def download(ctx): """Download blobs or files from Azure Storage""" - settings.add_cli_options( - ctx.cli_options, settings.TransferAction.Download, local_resource, - storage_account, remote_path) + settings.add_cli_options(ctx.cli_options, settings.TransferAction.Download) ctx.initialize() specs = settings.create_download_specifications(ctx.config) for spec in specs: @@ -763,84 +766,24 @@ def download(ctx, local_resource, storage_account, remote_path): @cli.command('synccopy') -@sync_copy_arguments @sync_copy_options @common_options @pass_cli_context -def synccopy( - ctx, local_resource, storage_account, remote_path, - sync_copy_dest_storage_account, sync_copy_dest_remote_path): +def synccopy(ctx): 
"""Synchronously copy blobs between Azure Storage accounts""" raise NotImplementedError() - settings.add_cli_options( - ctx.cli_options, settings.TransferAction.Synccopy, local_resource, - storage_account, remote_path, sync_copy_dest_storage_account, - sync_copy_dest_remote_path) + settings.add_cli_options(ctx.cli_options, settings.TransferAction.Synccopy) ctx.initialize() @cli.command('upload') -@upload_download_arguments +@upload_download_options @upload_options @common_options @pass_cli_context -def upload(ctx, local_resource, storage_account, remote_path): +def upload(ctx): """Upload files to Azure Storage""" - settings.add_cli_options( - ctx.cli_options, settings.TransferAction.Upload, local_resource, - storage_account, remote_path) - ctx.initialize() - specs = settings.create_upload_specifications(ctx.config) - for spec in specs: - blobxfer.api.Uploader( - ctx.general_options, ctx.credentials, spec - ).start() - - -@cli.group() -@pass_cli_context -def useconfig(ctx): - """Use yaml configuration file for transfer""" - pass - - -@useconfig.command('download') -@config_arguments -@common_options -@pass_cli_context -def useconfig_download(ctx, config): - """Download blobs or files from Azure Storage via yaml configuration""" - settings.add_cli_options( - ctx.cli_options, settings.TransferAction.Download, None, None, None) - ctx.initialize() - specs = settings.create_download_specifications(ctx.config) - for spec in specs: - blobxfer.api.Downloader( - ctx.general_options, ctx.credentials, spec - ).start() - - -@useconfig.command('synccopy') -@config_arguments -@common_options -@pass_cli_context -def useconfig_synccopy(ctx, config): - """Synchronously copy blobs between Azure Storage accounts via yaml - configuration""" - raise NotImplementedError() - settings.add_cli_options( - ctx.cli_options, settings.TransferAction.Synccopy, None, None, None) - ctx.initialize() - - -@useconfig.command('upload') -@config_arguments -@common_options -@pass_cli_context -def useconfig_upload(ctx, config): - """Upload files to Azure Storage via yaml configuration""" - settings.add_cli_options( - ctx.cli_options, settings.TransferAction.Upload, None, None, None) + settings.add_cli_options(ctx.cli_options, settings.TransferAction.Upload) ctx.initialize() specs = settings.create_upload_specifications(ctx.config) for spec in specs: diff --git a/cli/settings.py b/cli/settings.py index 5911719..378325c 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -49,21 +49,16 @@ class TransferAction(enum.Enum): Synccopy = 3, -def add_cli_options( - cli_options, action, local_resource=None, storage_account=None, - remote_path=None, sync_copy_dest_storage_account=None, - sync_copy_dest_remote_path=None): - # type: (dict, str, str, str, str, str, str) -> None +def add_cli_options(cli_options, action): + # type: (dict, str) -> None """Adds CLI options to the configuration object :param dict cli_options: CLI options dict :param TransferAction action: action - :param str local_resource: local resource - :param str storage_account: storage account - :param str remote_path: remote path - :param str sync_copy_dest_storage_account: synccopy dest sa - :param str sync_copy_dest_remote_path: synccopy dest rp """ cli_options['_action'] = action.name.lower() + local_resource = cli_options['local_resource'] + storage_account = cli_options['storage_account'] + remote_path = cli_options['remote_path'] if blobxfer.util.is_not_empty(storage_account): # add credentials try: @@ -149,6 +144,10 @@ def add_cli_options( }, } elif action == 
TransferAction.Synccopy: + sync_copy_dest_storage_account = \ + cli_options['sync_copy_dest_storage_account'] + sync_copy_dest_remote_path = \ + cli_options['sync_copy_dest_remote_path'] if blobxfer.util.is_none_or_empty(sync_copy_dest_storage_account): raise RuntimeError( 'must specify a destination storage account') diff --git a/docs/01-installation.md b/docs/01-installation.md index cffe6c6..7b85c52 100644 --- a/docs/01-installation.md +++ b/docs/01-installation.md @@ -66,7 +66,7 @@ that there was an issue installing this package with other `azure` packages that share the same base namespace. You can correct this by issuing: ```shell # for Python2 -pip install azure-storage +pip install --upgrade --force-reinstall azure-storage # for Python3 -pip3 install azure-storage +pip3 install --upgrade --force-reinstall azure-storage ``` diff --git a/docs/10-cli-usage.md b/docs/10-cli-usage.md index 9c9f111..7c06508 100644 --- a/docs/10-cli-usage.md +++ b/docs/10-cli-usage.md @@ -1,9 +1,149 @@ # blobxfer Command-Line Usage +`blobxfer` operates using a command followed by options. Each +command will be detailed along with all options available. -## TODO +## Commands +### `download` +Downloads a remote Azure path, which may contain many resources, to the +local machine. This command requires at the minimum, the following options: +* `--storage-account-name` +* `--remote-path` +* `--local-resource` +Additionally, an authentication option for the storage account is required. +Please see the Authentication sub-section below under Options. +### `upload` +Uploads a local path to a remote Azure path. The local path may contain +many resources on the local machine. This command requires at the minimum, +the following options: +* `--local-resource` +* `--storage-account-name` +* `--remote-path` -### General Notes +Additionally, an authentication option for the storage account is required. +Please see the Authentication sub-section below under Options. + +### `synccopy` +TODO: not yet implemented. + +## Options +### General +* `--config` specifies the YAML configuration file to use. This can be +optionally provided through an environment variable `BLOBXFER_CONFIG_FILE`. +* `--file-md5` or `--no-file-md5` controls if the file MD5 should be computed. +* `--local-resource` is the local resource path. +* `--log-file` specifies the log file to write to. +* `--mode` is the operating mode. The default is `auto` but may be set to +`append`, `block`, `file`, or `page`. If specified with the `upload` +command, then all files will be uploaded as the specified `mode` type. +If specified with `download`, then only remote entities with that `mode` +type are downloaded. Note that `file` should be specified if interacting +with Azure File shares. +* `--overwrite` or `--no-overwrite` controls clobber semantics at the +destination. +* `--progress-bar` or `--no-progress-bar` controls if a progress bar is +output to the console. +* `--recursive` or `--no-recursive` controls if the source path should be +recursively uploaded or downloaded. +* `--remote-path` is the remote Azure path. This path must contain the +Blob container or File share at the begining, e.g., `mycontainer/vdir` +* `--resume-file` specifies the resume file to write to. +* `--timeout` is the integral timeout value in seconds to use. +* `-h` or `--help` can be passed at every command level to receive context +sensitive help. 
+* `-v` will output verbose messages including the configuration used + +### Authentication +`blobxfer` supports both Storage Account access keys and Shared Access +Signature (SAS) tokens. One type must be supplied with all commands in +order to successfully authenticate against Azure Storage. These options are: +* `--storage-account-key` is the storage account access key. This can be +optionally provided through an environment variable +`BLOBXFER_STORAGE_ACCOUNT_KEY` instead. +* `--sas` is a shared access signature (sas) token. This can can be +optionally provided through an environment variable `BLOBXFER_SAS` instead. + +### Concurrency +Please see the [performance considerations](98-performance-considerations.md) +document for more information regarding concurrency options. +* `--crypto-processes` is the number of decryption offload processes to spawn. +`0` will in-line the decryption routine with the main thread. +* `--disk-threads` is the number of threads to create for disk I/O. +* `--md5-processes` is the number of MD5 offload processes to spawn for +comparing files with `skip_on` `md5_match`. +* `--transfer-threads` is the number of threads to create for transferring +to/from Azure Storage. + +### Connection +* `--endpoint` is the Azure Storage endpoint to connect to; the default is +Azure Public regions, or `core.windows.net`. +* `--storage-account-name` is the storage account to connect to. + +### Encryption +* `--rsa-private-key` is the RSA private key in PEM format to use. This can +be provided for uploads but must be specified to decrypt encrypted remote +entities. This can be optionally provided through an environment variable +`BLOBXFER_RSA_PRIVATE_KEY`. +* `--rsa-private-key-passphrase` is the RSA private key passphrase. This can +be optionally provided through an environment variable +`BLOBXFER_RSA_PRIVATE_KEY_PASSPHRASE`. +* `--rsa-public-key` is the RSA public key in PEM format to use. This +can only be provided for uploads. This can be optionally provided through an +environment variable `BLOBXFER_RSA_PUBLIC_KEY`. + +### Filtering +* `--exclude` is an exclude pattern to use; this can be specified multiple +times. Exclude patterns are applied after include patterns. If both an exclude +and an include pattern match a target, the target is excluded. +* `--include` is an include pattern to use; this can be specified multiple +times + +### Skip On +* `--skip-on-filesize-match` will skip the transfer action if the filesizes +match between source and destination. This should not be specified for +encrypted files. +* `--skip-on-lmt-ge` will skip the transfer action: + * On upload if the last modified time of the remote file is greater than + or equal to the local file. + * On download if the last modified time of the local file is greater than + or equal to the remote file. +* `--skip-on-md5-match` will skip the transfer action if the MD5 hash match +between source and destination. This can be transparently used through +encrypted files that have been uploaded with `blobxfer`. + +### Vectored IO +Please see the [Vectored IO](30-vectored-io.md) document for more information +regarding Vectored IO operations in `blobxfer`. 
+* `--distribution-mode` is the Vectored IO distribution mode + * `disabled` which is default (no Vectored IO) + * `replica` which will replicate source files to target destinations on + upload + * `stripe`which will stripe source files to target destinations on upload +* `--stripe-chunk-size-bytes` is the stripe chunk width for stripe-based +Vectored IO operations + +### Other +* `--delete` deletes extraneous files at the remote destination path on +uploads and at the local resource on downloads. This actions occur after the +transfer has taken place. +* `--one-shot-bytes` controls the number of bytes to "one shot" a block +Blob upload. The maximum value that can be specified is 256MiB. This may +be useful when using account-level SAS keys and enforcing non-overwrite +behavior. +* `--rename` renames a single file upload or download to the target +destination or source path, respectively. +* `--strip-components N` will strip the leading `N` components from the +file path. The default is `1`. + +## Examples +### `download` Examples +TODO. +blobxfer download + +### `upload` Examples +TODO. + +## General Notes * `blobxfer` does not take any leases on blobs or containers. It is up to the user to ensure that blobs are not modified while download/uploads are being performed. diff --git a/docs/98-performance-considerations.md b/docs/98-performance-considerations.md index 6ade0a2..89ff0c6 100644 --- a/docs/98-performance-considerations.md +++ b/docs/98-performance-considerations.md @@ -9,17 +9,17 @@ and File shares. ## Concurrency * `blobxfer` offers four concurrency knobs. Each one should be tuned for maximum performance according to your system and network characteristics. - 1. MD5 processes: computing MD5 for potential omission from transfer due - to `skip_on` `md5_match` being specified are offloaded to the specified - number of processors. - 2. Crypto processes: decrypting encrypted blobs and files can be offloaded - to the specified number of processors. Due to the inherent - non-parallelizable encryption algorithm used, this is ignored for - encryption (uploads). - 3. Disk threads: concurrency in reading (uploads) and writing (downloads) to - disk is controlled by the number of disk threads. - 4. Transfer threads: concurrency in the number of threads from/to Azure - Storage is controlled by the number of transfer threads. + * Disk threads: concurrency in reading (uploads) and writing (downloads) to + disk is controlled by the number of disk threads. + * Transfer threads: concurrency in the number of threads from/to Azure + Storage is controlled by the number of transfer threads. + * MD5 processes: computing MD5 for potential omission from transfer due + to `skip_on` `md5_match` being specified are offloaded to the specified + number of processors. + * Crypto processes: decrypting encrypted blobs and files can be offloaded + to the specified number of processors. Due to the inherent + non-parallelizable encryption algorithm used, this is ignored for + encryption (uploads). * The thread concurrency options (disk and transfer) can be set to a non-positive number to be automatically set as a multiple of the number of cores available on the machine. 
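
For reference, the four knobs described above correspond directly to the `--md5-processes`, `--crypto-processes`, `--disk-threads` and `--transfer-threads` CLI options (or their equivalents under the `concurrency` section of a YAML configuration). The following is only a rough tuning sketch; the storage account, SAS token, paths and the chosen values are placeholders and should be adjusted to your own system and network characteristics:

```shell
# sketch: explicitly set all four concurrency knobs for a large download
# (account name, SAS token and paths are placeholders)
blobxfer download \
    --storage-account-name mystorageaccount \
    --sas "mysastoken" \
    --remote-path mycontainer \
    --local-resource /my/path \
    --md5-processes 2 \
    --crypto-processes 2 \
    --disk-threads 16 \
    --transfer-threads 32
```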
diff --git a/docs/index.md b/docs/README.md similarity index 100% rename from docs/index.md rename to docs/README.md From a327445e45ceede3471e4c3967b8320cd81cbabb Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 1 Jun 2017 14:55:47 -0700 Subject: [PATCH 44/47] Update Dockerfile to Alpine 3.6 and libressl --- docker/Dockerfile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 42e8b2d..a713e15 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,12 +1,11 @@ # Dockerfile for Azure/blobxfer -FROM gliderlabs/alpine:3.4 +FROM alpine:3.6 MAINTAINER Fred Park -RUN apk add --update --no-cache musl build-base python3 python3-dev openssl-dev libffi-dev ca-certificates \ - && pip3 install --no-cache-dir --upgrade pip \ +RUN apk add --update --no-cache musl build-base python3 python3-dev libressl-dev libffi-dev ca-certificates \ && pip3 install --no-cache-dir --upgrade blobxfer \ - && apk del --purge build-base python3-dev openssl-dev libffi-dev \ + && apk del --purge build-base python3-dev libressl-dev libffi-dev \ && rm /var/cache/apk/* ENTRYPOINT ["blobxfer"] From bb9f68a3bb04a1d74632bba6d2a84105b8b312d8 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Thu, 1 Jun 2017 20:38:15 -0700 Subject: [PATCH 45/47] More documentation - Rename some options - Catch KeyErrors and rethrow for required options to clarify --- CHANGELOG.md | 6 +- README.md | 12 +- cli/cli.py | 8 +- cli/settings.py | 45 ++++++-- docs/01-installation.md | 29 +++-- docs/10-cli-usage.md | 102 +++++++++++++++-- docs/20-yaml-configuration.md | 210 +++++++++++++++++++++++++++++++++- 7 files changed, 370 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3abb9b2..ba7b442 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,12 +2,16 @@ ## [Unreleased] -## [1.0.0a2] - 2017-06-XX +## [1.0.0a2] - 2017-06-02 ### Changed - From scratch rewrite providing a consistent CLI experience and a vast array of new and advanced features. Please see the [1.0.0 Milestone](https://github.com/Azure/blobxfer/milestone/1) for a catalog of changes. +- **Breaking Changes:** there have been a significant number of breaking +changes with the rewrite from the command-line invocation of `blobxfer` +itself to the options and environment variable names. Please review the +usage documentation carefully when upgrading from 0.12.1. 
- All dependencies updated to latest ### Removed diff --git a/README.md b/README.md index ef6c462..fd2c904 100644 --- a/README.md +++ b/README.md @@ -18,24 +18,24 @@ from Azure Blob and File Storage * High-performance design with asynchronous transfers and disk I/O * YAML configuration driven execution support * Resume support -* Vectored IO +* Vectored IO support * `stripe` mode allows striping a single file across multiple blobs (even to multiple storage accounts) to break through single blob or fileshare throughput limits * `replica` mode allows replication of a file across multiple destinations including to multiple storage accounts * Client-side encryption support +* Support all blob types for both upload and download * Advanced skip options for rsync-like operations * Store/restore POSIX filemode and uid/gid -* Read/pipe from `stdin` support -* Append blob support -* Configurable one-shot block upload support -* Block (chunk) size selection support -* Rsync-like delete support +* Support for reading/pipe from `stdin` * Support for reading from blob snapshots +* Configurable one-shot block upload support +* Configurable chunk size for both upload and download * Automatic block blob size adjustment for uploading * Automatic uploading of VHD/VHDX files as page blobs * Include and exclude filtering support +* Rsync-like delete support * No clobber support in either direction * File logging support diff --git a/cli/cli.py b/cli/cli.py index 7f58c04..4b1e211 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -256,7 +256,7 @@ def callback(ctx, param, value): return click.option( '--local-resource', expose_value=False, - help='Local resource', + help='Local resource; use - for stdin', callback=callback)(f) @@ -266,7 +266,7 @@ def callback(ctx, param, value): clictx.cli_options['storage_account'] = value return value return click.option( - '--storage-account', + '--storage-account-name', expose_value=False, help='Storage account name', envvar='BLOBXFER_STORAGE_ACCOUNT_NAME', @@ -602,7 +602,7 @@ def callback(ctx, param, value): expose_value=False, type=int, default=1, - help='Strip leading file path components [1]', + help='Strip leading file path components on upload [1]', callback=callback)(f) @@ -639,7 +639,7 @@ def callback(ctx, param, value): clictx.cli_options['sync_copy_dest_storage_account'] = value return value return click.option( - '--sync-copy-dest-storage-account', + '--sync-copy-dest-storage-account-name', expose_value=False, help='Storage account name for synccopy destination', envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_NAME', diff --git a/cli/settings.py b/cli/settings.py index 378325c..1507c82 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -56,9 +56,24 @@ def add_cli_options(cli_options, action): :param TransferAction action: action """ cli_options['_action'] = action.name.lower() - local_resource = cli_options['local_resource'] - storage_account = cli_options['storage_account'] - remote_path = cli_options['remote_path'] + try: + local_resource = cli_options['local_resource'] + if blobxfer.util.is_none_or_empty(local_resource): + raise KeyError() + except KeyError: + raise ValueError('--local-resource must be specified') + try: + storage_account = cli_options['storage_account'] + if blobxfer.util.is_none_or_empty(storage_account): + raise KeyError() + except KeyError: + raise ValueError('--storage-account-name must be specified') + try: + remote_path = cli_options['remote_path'] + if blobxfer.util.is_none_or_empty(remote_path): + raise KeyError() + except KeyError: + 
raise ValueError('--remote-path must be specified') if blobxfer.util.is_not_empty(storage_account): # add credentials try: @@ -144,13 +159,23 @@ def add_cli_options(cli_options, action): }, } elif action == TransferAction.Synccopy: - sync_copy_dest_storage_account = \ - cli_options['sync_copy_dest_storage_account'] - sync_copy_dest_remote_path = \ - cli_options['sync_copy_dest_remote_path'] - if blobxfer.util.is_none_or_empty(sync_copy_dest_storage_account): - raise RuntimeError( - 'must specify a destination storage account') + try: + sync_copy_dest_storage_account = \ + cli_options['sync_copy_dest_storage_account'] + if blobxfer.util.is_none_or_empty( + sync_copy_dest_storage_account): + raise KeyError() + except KeyError: + raise ValueError( + '--sync-copy-dest-storage-account-name must be specified') + try: + sync_copy_dest_remote_path = \ + cli_options['sync_copy_dest_remote_path'] + if blobxfer.util.is_none_or_empty(sync_copy_dest_remote_path): + raise KeyError() + except KeyError: + raise ValueError( + '--sync-copy-dest-remote-path must be specified') arg = { 'source': sa_rp, 'destination': [ diff --git a/docs/01-installation.md b/docs/01-installation.md index 7b85c52..2609f07 100644 --- a/docs/01-installation.md +++ b/docs/01-installation.md @@ -1,8 +1,8 @@ # blobxfer Installation `blobxfer` is a pure Python package, however, some dependencies require a C -compiler and supporting libraries if there is no binary wheel. Please follow -the pre-requisites section first prior to invoking installation via `pip`. -Alternatively, you can use the +compiler and supporting libraries if there is no binary wheel for that +dependency and your platform. Please follow the pre-requisites section first +prior to invoking installation via `pip`. Alternatively, you can use the [blobxfer Docker image](https://hub.docker.com/r/alfpark/blobxfer/). ## Pre-requisites @@ -34,15 +34,26 @@ zypper -n in gcc libopenssl-devel libffi48-devel python-devel curl -fSsL https://bootstrap.pypa.io/get-pip.py | python ``` +### Mac OS X +Python 2.7 should come pre-installed. However, if you want to install +`blobxfer` for Python 3.5+ (recommended), please follow the steps outlined on +[this guide](http://docs.python-guide.org/en/latest/starting/install/osx/) +to ensure that you have the latest version of Python, a compiler and pip. + +### Windows +Please install at least Python 3.5 or higher to avoid requiring a +compiler. If you must use Python 2.7, you can download the necessary +development headers and compiler [from Microsoft](http://aka.ms/vcpython27). + ## Installation via `pip` [blobxfer](https://pypi.python.org/pypi/blobxfer) is on PyPI and can be installed via: ```shell +# for Python3 (recommended) +pip3 install blobxfer # for Python2 pip install blobxfer -# for Python3 -pip3 instlal blobxfer ``` `blobxfer` is compatible with Python 2.7 and 3.3+. To install for Python 3 @@ -61,12 +72,12 @@ docker pull alfpark/blobxfer ## Troubleshooting #### `azure.storage` dependency not found -If you get an error that `azure.storage` cannot be found or loaded this means -that there was an issue installing this package with other `azure` packages +If you get an error that `azure.storage` cannot be found or loaded, then +most likely there was a conflict with this package with other `azure` packages that share the same base namespace. 
You can correct this by issuing: ```shell -# for Python2 -pip install --upgrade --force-reinstall azure-storage # for Python3 pip3 install --upgrade --force-reinstall azure-storage +# for Python2 +pip install --upgrade --force-reinstall azure-storage ``` diff --git a/docs/10-cli-usage.md b/docs/10-cli-usage.md index 7c06508..8931464 100644 --- a/docs/10-cli-usage.md +++ b/docs/10-cli-usage.md @@ -2,13 +2,20 @@ `blobxfer` operates using a command followed by options. Each command will be detailed along with all options available. -## Commands +### Quick Navigation +1. [Commands](#commands) +2. [Options](#options) +3. [Example Invocations](#examples) +4. [General Notes](#general-notes) + +## Commands ### `download` Downloads a remote Azure path, which may contain many resources, to the local machine. This command requires at the minimum, the following options: * `--storage-account-name` * `--remote-path` * `--local-resource` + Additionally, an authentication option for the storage account is required. Please see the Authentication sub-section below under Options. @@ -23,15 +30,27 @@ the following options: Additionally, an authentication option for the storage account is required. Please see the Authentication sub-section below under Options. +If piping from `stdin`, `--local-resource` should be set to `-` as per +convention. + ### `synccopy` TODO: not yet implemented. -## Options +## Options ### General * `--config` specifies the YAML configuration file to use. This can be optionally provided through an environment variable `BLOBXFER_CONFIG_FILE`. +* `--chunk-size-bytes` is the chunk size in bytes. For downloads, this +is the maximum length of data to transfer per request. For uploads, this +corresponds to one of block size for append and block blobs, page size for +page blobs, or file chunk for files. Only block blobs can have a block size +of up to 100MiB, all others have a maximum of 4MiB. +* `--file-attributes` or `--no-file-attributes` controls if POSIX file +attributes (mode and ownership) should be stored or restored. Note that to +restore uid/gid, `blobxfer` must be run as root or under sudo. * `--file-md5` or `--no-file-md5` controls if the file MD5 should be computed. -* `--local-resource` is the local resource path. +* `--local-resource` is the local resource path. Set to `-` if piping from +`stdin`. * `--log-file` specifies the log file to write to. * `--mode` is the operating mode. The default is `auto` but may be set to `append`, `block`, `file`, or `page`. If specified with the `upload` @@ -117,8 +136,11 @@ regarding Vectored IO operations in `blobxfer`. * `--distribution-mode` is the Vectored IO distribution mode * `disabled` which is default (no Vectored IO) * `replica` which will replicate source files to target destinations on - upload - * `stripe`which will stripe source files to target destinations on upload + upload. Note that replicating across multiple destinations will require + a YAML configuration file. + * `stripe` which will stripe source files to target destinations on upload. + Note that striping across multiple destinations will require a YAML + configuration file. * `--stripe-chunk-size-bytes` is the stripe chunk width for stripe-based Vectored IO operations @@ -135,15 +157,73 @@ destination or source path, respectively. * `--strip-components N` will strip the leading `N` components from the file path. The default is `1`. -## Examples +## Example Invocations ### `download` Examples -TODO. 
-blobxfer download +#### Download an Entire Encrypted Blob Container to Current Working Directory +```shell +blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-public-key ~/mypubkey.pem +``` + +#### Download an Entire File Share to Designated Path and Skip On Filesize Matches +```shell +blobxfer download --mode file --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-resource /my/path --skip-on-filesize-match +``` + +#### Download only Page Blobs in Blob Container Virtual Directory Non-recursively and Cleanup Local Path to Match Remote Path +```shell +blobxfer download --mode page --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource /my/pageblobs --no-recursive --delete +``` + +#### Resume Incomplete Downloads Matching an Include Pattern and Log to File and Restore POSIX File Attributes +```shell +blobxfer download --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes +``` + +#### Download a Blob Snapshot +```shell +blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-resource . +``` + +#### Download using a YAML Configuration File +```shell +blobxfer download --config myconfig.yaml +``` ### `upload` Examples -TODO. - -## General Notes +#### Upload Current Working Directory as Encrypted Block Blobs Non-recursively +```shell +blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-private-key ~/myprivatekey.pem --no-recursive +``` + +#### Upload Specific Path Recursively to a File Share, Store File MD5 and POSIX File Attributes to a File Share and Exclude Some Files +```shell +blobxfer upload --mode file --storage-account-name mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-resource . --file-md5 --file-attributes --exclude '*.bak' +``` + +#### Upload Single File with Resume and Striped Vectored IO into 512MiB Chunks +```shell +blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912 +``` + +#### Upload Specific Path but Skip On Any MD5 Matches, Store File MD5 and Cleanup Remote Path to Match Local Path +```shell +blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /my/path --file-md5 --skip-on-md5-match --delete +``` + +#### Upload From Piped `stdin` +```shell +curl -fSsL https://some.uri | blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource - +``` + +#### Upload using a YAML Configuration File +```shell +blobxfer upload --config myconfig.yaml +``` + +### `synccopy` Examples +TODO: not implemented yet. + +## General Notes * `blobxfer` does not take any leases on blobs or containers. It is up to the user to ensure that blobs are not modified while download/uploads are being performed. 
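
As noted in the Options section above, any option backed by an environment variable can be supplied that way instead of on the command line, which helps keep credentials out of shell history. A small sketch, assuming a placeholder configuration file, account name and key:

```shell
# these two invocations are equivalent per the --config option documentation
blobxfer upload --config myconfig.yaml
BLOBXFER_CONFIG_FILE=myconfig.yaml blobxfer upload

# credentials may likewise be passed through their documented environment variables
# (mystorageaccount and the key value are placeholders)
export BLOBXFER_STORAGE_ACCOUNT_KEY="myaccesskey"
blobxfer upload --storage-account-name mystorageaccount --remote-path mycontainer --local-resource /my/path
```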
diff --git a/docs/20-yaml-configuration.md b/docs/20-yaml-configuration.md index 000e01f..78437b0 100644 --- a/docs/20-yaml-configuration.md +++ b/docs/20-yaml-configuration.md @@ -1,3 +1,211 @@ # blobxfer YAML Configuration +`blobxfer` accepts YAML configuration files to drive the transfer. YAML +configuration files are specified with the `--config` option to any +`blobxfer` command. -## TODO +## Schema +The `blobxfer` YAML schema consists of 5 distinct "sections". The following +sub-sections will describe each. You may combine all 5 sections into the +same YAML file if desired as `blobxfer` will only read the required sections +to execute the specified command. + +#### Configuration Sections +1. [`azure_storage`](#azure-storage) +2. [`options`](#options) +3. [`download`](#download) +4. [`upload`](#upload) +5. [`synccopy`](#synccopy) + +### `azure_storage` +The `azure_storage` section specifies Azure Storage credentials that will +be referenced for any transfer while processing the YAML file. This section +is required. + +```yaml +azure_storage: + endpoint: core.windows.net + accounts: + mystorageaccount0: ABCDEF... + mystorageaccount1: ?se... +``` + +* `endpoint` specifies for which endpoint to connect to with Azure Storage. +Generally this can be omitted if using Public Azure regions. +* `accounts` is a dictionary of storage account names and either a +storage account key or a shared access signature token. + +### `options` +The `options` section specifies general options that may be applied across +all other sections in the YAML configuration. + +```yaml +options: + log_file: /path/to/blobxfer.log + resume_file: /path/to/resumefile.db + progress_bar: true + verbose: true + timeout_sec: null + concurrency: + md5_processes: 2 + crypto_processes: 2 + disk_threads: 16 + transfer_threads: 32 +``` + +* `log_file` is the location of the log file to write to +* `resume_file` is the location of the resume database to create +* `progress_bar` controls display of a progress bar output to the console +* `verbose` controls if verbose logging is enabled +* `timeout_sec` is the timeout to apply to requests/responses +* `concurrency` is a dictionary of concurrency limits + * `md5_processes` is the number of MD5 offload processes to create for + MD5 comparison checking + * `crypto_processes` is the number of decryption offload processes to create + * `disk_threads` is the number of threads for disk I/O + * `transfer_threads` is the number of threads for network transfers + +### `download` +The `download` section specifies download sources and destination. Note +that `download` refers to a list of objects, thus you may specify as many +of these sub-configuration blocks on the `download` property as you need. +When the `download` command with the YAML config is specified, the list +is iterated and all specified sources are downloaded. + +```yaml +download: + - source: + - mystorageaccount0: mycontainer + - mystorageaccount1: someothercontainer/vpath + destination: /path/to/store/downloads + include: + - "*.txt" + - "*.bxslice-*" + exclude: + - "*.bak" + options: + check_file_md5: true + chunk_size_bytes: 16777216 + delete_extraneous_destination: false + mode: auto + overwrite: true + recursive: true + rename: false + restore_file_attributes: true + rsa_private_key: myprivatekey.pem + rsa_private_key_passphrase: myoptionalpassword + skip_on: + filesize_match: false + lmt_ge: false + md5_match: true + - source: + # next if needed... 
+``` + +* `source` is a list of storage account to remote path mappings +* `destination` is the local resource path +* `include` is a list of include patterns +* `exclude` is a list of exclude patterns +* `options` are download-specific options + * `check_file_md5` will integrity check downloaded files using the stored MD5 + * `chunk_size_bytes` is the maximum amount of data to download per request + * `delete_extraneous_destination` will cleanup any files locally that are + not found on the remote. Note that this interacts with include and + exclude filters. + * `mode` is the operating mode + * `overwrite` specifies clobber behavior + * `recursive` specifies if remote paths should be recursively searched for + entities to download + * `rename` will rename a single entity source path to the `destination` + * `restore_file_attributes` will restore POSIX file mode and ownership if + stored on the entity metadata + * `rsa_private_key` is the RSA private key PEM file to use to decrypt + encrypted blobs or files + * `rsa_private_key_passphrase` is the RSA private key passphrase, if required + * `skip_on` are skip on options to use + * `filesize_match` skip if file size match + * `lmt_ge` skip if local file has a last modified time greater than or + equal to the remote file + * `md5_match` skip if MD5 match + +### `upload` +The `upload` section specifies upload sources and destinations. Note +that `upload` refers to a list of objects, thus you may specify as many +of these sub-configuration blocks on the `upload` property as you need. +When the `upload` command with the YAML config is specified, the list +is iterated and all specified sources are uploaded. + +```yaml +upload: + - source: + - /path/to/hugefile1 + - /path/to/hugefile2 + destination: + - mystorageaccount0: mycontainer/vdir + - mystorageaccount1: someothercontainer/vdir2 + include: + - "*.bin" + exclude: + - "*.tmp" + options: + mode: auto + chunk_size_bytes: 0 + delete_extraneous_destination: true + one_shot_bytes: 33554432 + overwrite: true + recursive: true + rename: false + rsa_public_key: mypublickey.pem + skip_on: + filesize_match: false + lmt_ge: false + md5_match: true + store_file_properties: + attributes: true + md5: true + strip_components: 1 + vectored_io: + stripe_chunk_size_bytes: 1000000 + distribution_mode: stripe + - source: + # next if needed... +``` + +* `source` is a list of local resource paths +* `destination` is a list of storage account to remote path mappings +* `include` is a list of include patterns +* `exclude` is a list of exclude patterns +* `options` are upload-specific options + * `mode` is the operating mode + * `chunk_size_bytes` is the maximum amount of data to upload per request. + This corresponds to the block size for block and append blobs, page size + for page blobs, and the file chunk for files. Only block blobs can have + a block size of up to 100MiB, all others have a maximum of 4MiB. + * `one_shot_bytes` is the size limit to upload block blobs in a single + request. 
+ * `overwrite` specifies clobber behavior + * `recursive` specifies if local paths should be recursively searched for + files to upload + * `rename` will rename a single entity destination path to a single `source` + * `rsa_public_key` is the RSA public key PEM file to use to encrypt files + * `skip_on` are skip on options to use + * `filesize_match` skip if file size match + * `lmt_ge` skip if remote file has a last modified time greater than or + equal to the local file + * `md5_match` skip if MD5 match + * `store_file_properties` stores the following file properties if enabled + * `attributes` will store POSIX file mode and ownership + * `md5` will store the MD5 of the file + * `strip_components` is the number of leading path components to strip + * `vectored_io` are the Vectored IO options to apply to the upload + * `stripe_chunk_size_bytes` is the stripe width for each chunk if `stripe` + `distribution_mode` is selected + * `distribution_mode` is the Vectored IO mode to use which can be one of + * `disabled` will disable Vectored IO + * `replica` which will replicate source files to target destinations on + upload. Note that more than one destination should be specified. + * `stripe` which will stripe source files to target destinations on + upload. If more than one destination is specified, striping occurs in + round-robin order amongst the destinations listed. + +### `synccopy` +TODO: not yet implemented. From b7782619d1503d2f10e4ab37a26f16a4ed71eef3 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Fri, 2 Jun 2017 08:01:24 -0700 Subject: [PATCH 46/47] Add Vectored IO docs --- docs/30-vectored-io.md | 96 +++++++++++++++++++++++++++++++++++++++++- docs/README.md | 2 +- 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/docs/30-vectored-io.md b/docs/30-vectored-io.md index 0eb67fd..a007b7d 100644 --- a/docs/30-vectored-io.md +++ b/docs/30-vectored-io.md @@ -1,3 +1,95 @@ -# blobxfer Vectored IO +# blobxfer Vectored I/O +`blobxfer` supports Vectored I/O (scatter/gather) which can help alleviate +problems associated with +[single blob or single fileshare throughput limits](https://docs.microsoft.com/en-us/azure/storage/storage-scalability-targets). +Additionally, `blobxfer` has the ability to replicate a single source to +multiple destinations to allow for increased resiliency or throughput for +consumption later. -## TODO +## Distribution Modes +`blobxfer` supports two distribution modes: `replica` and `stripe`. The +following sections describe each. + +### Replica +`replica` mode replicates an entire file (or set of files) across all +specified destinations. This allows for multiple backups, resiliency, +and potentially increased download throughput later if the clients understand +how to download from multiple sources. + +The logic is fairly simple in how this is accomplished. Each source file +has portions of the file read from disk, buffered in memory and then +replicated across multiple storage accounts. 
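
As a concrete illustration, and as noted after the diagram below, multiple replication destinations are defined through a YAML configuration file. A minimal sketch of such an `upload` entry follows; the account names, container and file path are placeholders, and the corresponding `azure_storage` credentials section (see the YAML configuration document) is assumed to be present:

```yaml
# sketch: replicate a single local file to two storage accounts
upload:
  - source:
    - /path/to/hugefile1
    destination:
    - mystorageaccount0: mycontainer
    - mystorageaccount1: mycontainer
    options:
      vectored_io:
        distribution_mode: replica
```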
+ +``` + Whole File +---------------------+ + Replication | | + +------------------------------> | Destination 0: | + | | Storage Account A | + | | | + | +---------------------+ + | + | ++------------+---------------+ Whole File +---------------------+ +| | Replication | | +| 10 GiB VHD on Local Disk +--------------> | Destination 1: | +| | | Storage Account B | ++------------+---------------+ | | + | +---------------------+ + | + | + | +---------------------+ + | Whole File | | + | Replication | Destination 2: | + +------------------------------> | Storage Account C | + | | + +---------------------+ +``` + +In order to take advantage of `replica` Vectored IO, you must use a YAML +configuration file to define multiple destinations. + +### Stripe +`stripe` mode will splice a file into multiple chunks and scatter these +chunks across destinations specified. These destinations can be different +containers within the same storage account or even containers distributed +across multiple storage accounts if single storage account bandwidth limits +are insufficient. + +`blobxfer` will slice the source file into multiple chunks where the +`stripe_chunk_size_bytes` is the stripe width of each chunk. This parameter +will allow you to effectively control how many blobs/files are created on +Azure. `blobxfer` will then round-robin through all of the destinations +specified to store the slices. Information required to reconstruct the +original file is stored on the blob or file metadata. It is important to +keep this metadata in-tact or reconstruction will fail. + +``` + +---------------------+ + | | <-----------------------------------+ + | Destination 1: | | + | Storage Account B | <---------------------+ | + | | | | + +---------------------+ <-------+ | | + | | | + ^ ^ | | | + | | | | | + 1 GiB Stripe | | | | | ++-----------------------------+ Width +------+---+--+------+---+--+------+---+--+------+---+--+------+---+--+ +| | | | | | | | | | | | | +| 10 GiB File on Local Disk | +-----------> | D0 | D1 | D0 | D1 | D0 | D1 | D0 | D1 | D0 | D1 | +| | | | | | | | | | | | | ++-----------------------------+ 10 Vectored +---+--+------+---+--+------+---+--+------+---+--+------+---+--+------+ + Slices | | | | | + | | | | | + | v | | | + | | | | + +> +---------------------+ <+ | | + | | | | + | Destination 0: | <--------------+ | + | Storage Account A | | + | | <----------------------------+ + +---------------------+ +``` + +In order to take advantage of `stripe` Vectored IO, you must use a YAML +configuration file to define multiple destinations. diff --git a/docs/README.md b/docs/README.md index b05fcc4..eb1b4f5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,7 +9,7 @@ Please refer to the following documents detailing the usage of `blobxfer`. 1. [Installation](01-installation.md) 2. [Command-Line Usage](10-cli-usage.md) 3. [YAML Configuration](20-yaml-configuration.md) -4. [Vectored IO](30-vectored-io.md) +4. [Vectored I/O](30-vectored-io.md) 5. [Client-side Encryption](40-client-side-encryption.md) 6. [blobxfer Data Movement Library](80-blobxfer-python-library.md) 7. 
[Performance Considerations](98-performance-considerations.md) From e1d97fa3cb813559cff15661aa6d13d12dc30175 Mon Sep 17 00:00:00 2001 From: Fred Park Date: Fri, 2 Jun 2017 08:16:46 -0700 Subject: [PATCH 47/47] Tag for 1.0.0a3 release - Rename some options - Make thread join more robust on Python2 --- CHANGELOG.md | 6 ++-- blobxfer/models/options.py | 6 +++- blobxfer/operations/download.py | 8 ++--- blobxfer/operations/upload.py | 4 +-- blobxfer/util.py | 14 +++++++++ blobxfer/version.py | 2 +- cli/cli.py | 42 ++++++++++++++------------ cli/settings.py | 12 ++++---- docs/01-installation.md | 7 +++-- docs/10-cli-usage.md | 43 +++++++++++++++------------ docs/30-vectored-io.md | 4 +-- docs/98-performance-considerations.md | 42 ++++++++++++++++++++++++-- 12 files changed, 129 insertions(+), 61 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ba7b442..fd71b89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## [Unreleased] -## [1.0.0a2] - 2017-06-02 +## [1.0.0a3] - 2017-06-02 ### Changed - From scratch rewrite providing a consistent CLI experience and a vast array of new and advanced features. Please see the @@ -201,8 +201,8 @@ usage documentation carefully when upgrading from 0.12.1. `--no-skiponmatch`. - 0.8.2: performance regression fixes -[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a2...HEAD -[1.0.0a2]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a2 +[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a3...HEAD +[1.0.0a3]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a3 [0.12.1]: https://github.com/Azure/blobxfer/compare/0.12.0...0.12.1 [0.12.0]: https://github.com/Azure/blobxfer/compare/0.11.5...0.12.0 [0.11.5]: https://github.com/Azure/blobxfer/compare/0.11.4...0.11.5 diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py index c516d01..2a17c1a 100644 --- a/blobxfer/models/options.py +++ b/blobxfer/models/options.py @@ -105,13 +105,14 @@ class Concurrency(object): """Concurrency Options""" def __init__( self, crypto_processes, md5_processes, disk_threads, - transfer_threads): + transfer_threads, is_download=None): """Ctor for Concurrency Options :param Concurrency self: this :param int crypto_processes: number of crypto procs :param int md5_processes: number of md5 procs :param int disk_threads: number of disk threads :param int transfer_threads: number of transfer threads + :param bool is_download: download hint """ self.crypto_processes = crypto_processes self.md5_processes = md5_processes @@ -131,6 +132,9 @@ def __init__( # cap maximum number of disk threads from cpu count to 64 if self.disk_threads > 64: self.disk_threads = 64 + # for downloads, cap disk threads to lower value + if is_download and self.disk_threads > 16: + self.disk_threads = 16 auto_disk = True if self.transfer_threads is None or self.transfer_threads < 1: if auto_disk: diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py index 3cbef61..9d51d32 100644 --- a/blobxfer/operations/download.py +++ b/blobxfer/operations/download.py @@ -430,7 +430,7 @@ def _wait_for_disk_threads(self, terminate): if terminate: self._download_terminate = terminate for thr in self._disk_threads: - thr.join() + blobxfer.util.join_thread(thr) def _wait_for_transfer_threads(self, terminate): # type: (Downloader, bool) -> None @@ -441,7 +441,7 @@ def _wait_for_transfer_threads(self, terminate): if terminate: self._download_terminate = terminate for thr in self._transfer_threads: - thr.join() + blobxfer.util.join_thread(thr) def 
_worker_thread_transfer(self): # type: (Downloader) -> None @@ -452,7 +452,7 @@ def _worker_thread_transfer(self): while not self.termination_check: try: if len(self._disk_set) > max_set_len: - time.sleep(0.2) + time.sleep(0.1) continue else: dd = self._transfer_queue.get(block=False, timeout=0.1) @@ -792,8 +792,8 @@ def start(self): 'KeyboardInterrupt detected, force terminating ' 'processes and threads (this may take a while)...') try: - self._wait_for_transfer_threads(terminate=True) self._wait_for_disk_threads(terminate=True) + self._wait_for_transfer_threads(terminate=True) finally: self._cleanup_temporary_files() raise diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py index 232e8ba..9db2863 100644 --- a/blobxfer/operations/upload.py +++ b/blobxfer/operations/upload.py @@ -447,10 +447,10 @@ def _worker_thread_upload(self): while not self.termination_check: try: if len(self._transfer_set) > max_set_len: - time.sleep(0.2) + time.sleep(0.1) continue else: - ud = self._upload_queue.get(False, 0.1) + ud = self._upload_queue.get(block=False, timeout=0.1) except queue.Empty: continue try: diff --git a/blobxfer/util.py b/blobxfer/util.py index 166b98f..a17b8a5 100644 --- a/blobxfer/util.py +++ b/blobxfer/util.py @@ -124,6 +124,20 @@ def is_not_empty(obj): return obj is not None and len(obj) > 0 +def join_thread(thr): + # type: (threading.Thread) -> None + """Join a thread + :type threading.Thread thr: thread to join + """ + if on_python2(): + while True: + thr.join(timeout=1) + if not thr.isAlive(): + break + else: + thr.join() + + def merge_dict(dict1, dict2): # type: (dict, dict) -> dict """Recursively merge dictionaries: dict2 on to dict1. This differs diff --git a/blobxfer/version.py b/blobxfer/version.py index 6280e0b..0f2a584 100644 --- a/blobxfer/version.py +++ b/blobxfer/version.py @@ -22,4 +22,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
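The `blobxfer.util.join_thread` helper added above addresses a Python 2 quirk: a `Thread.join()` call with no timeout can block in a way that delays `KeyboardInterrupt` delivery until the worker finishes, which is why the download and upload operations now join threads through this helper. Below is a minimal standalone sketch of the same timed-join pattern; it is illustrative only (it is not the blobxfer implementation and uses the Python 3 `is_alive` spelling).

```python
import threading
import time


def join_interruptibly(thr):
    # Join in short slices so the joining thread regains control between
    # attempts; a bare join() on Python 2 can block uninterruptibly and
    # postpone KeyboardInterrupt until the worker thread exits.
    while thr.is_alive():
        thr.join(timeout=1)


worker = threading.Thread(target=time.sleep, args=(5,))
worker.start()
join_interruptibly(worker)  # Ctrl-C stays responsive while waiting
```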
-__version__ = '1.0.0a2' +__version__ = '1.0.0a3' diff --git a/cli/cli.py b/cli/cli.py index 4b1e211..4c12bbd 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -62,13 +62,15 @@ def __init__(self): self.credentials = None self.general_options = None - def initialize(self): - # type: (CliContext) -> None + def initialize(self, action): + # type: (CliContext, settings.TransferAction) -> None """Initialize context :param CliContext self: this + :param settings.TransferAction action: transfer action """ self._init_config() - self.general_options = settings.create_general_options(self.config) + self.general_options = settings.create_general_options( + self.config, action) self.credentials = settings.create_azure_storage_credentials( self.config, self.general_options) @@ -164,7 +166,8 @@ def callback(ctx, param, value): '--log-file', expose_value=False, default=None, - help='Log to file specified', + help='Log to file specified; this must be specified for progress ' + 'bar to show', callback=callback)(f) @@ -191,7 +194,8 @@ def callback(ctx, param, value): '--progress-bar/--no-progress-bar', expose_value=False, default=True, - help='Display progress bar instead of console logs [True]', + help='Display progress bar instead of console logs; log file must ' + 'be specified [True]', callback=callback)(f) @@ -254,22 +258,22 @@ def callback(ctx, param, value): clictx.cli_options['local_resource'] = value return value return click.option( - '--local-resource', + '--local-path', expose_value=False, - help='Local resource; use - for stdin', + help='Local path; use - for stdin', callback=callback)(f) -def _storage_account_name_option(f): +def _storage_account_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) clictx.cli_options['storage_account'] = value return value return click.option( - '--storage-account-name', + '--storage-account', expose_value=False, help='Storage account name', - envvar='BLOBXFER_STORAGE_ACCOUNT_NAME', + envvar='BLOBXFER_STORAGE_ACCOUNT', callback=callback)(f) @@ -301,7 +305,7 @@ def common_options(f): def upload_download_options(f): f = _remote_path_option(f) - f = _storage_account_name_option(f) + f = _storage_account_option(f) f = _local_resource_option(f) return f @@ -633,16 +637,16 @@ def callback(ctx, param, value): callback=callback)(f) -def _sync_copy_dest_storage_account_name_option(f): +def _sync_copy_dest_storage_account_option(f): def callback(ctx, param, value): clictx = ctx.ensure_object(CliContext) clictx.cli_options['sync_copy_dest_storage_account'] = value return value return click.option( - '--sync-copy-dest-storage-account-name', + '--sync-copy-dest-storage-account', expose_value=False, help='Storage account name for synccopy destination', - envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_NAME', + envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT', callback=callback)(f) @@ -721,11 +725,11 @@ def download_options(f): def sync_copy_options(f): - f = _sync_copy_dest_storage_account_name_option(f) + f = _sync_copy_dest_storage_account_option(f) f = _sync_copy_dest_sas_option(f) f = _sync_copy_dest_remote_path_option(f) f = _sync_copy_dest_access_key_option(f) - f = _storage_account_name_option(f) + f = _storage_account_option(f) f = _skip_on_md5_match_option(f) f = _skip_on_lmt_ge_option(f) f = _skip_on_filesize_match_option(f) @@ -757,7 +761,7 @@ def cli(ctx): def download(ctx): """Download blobs or files from Azure Storage""" settings.add_cli_options(ctx.cli_options, settings.TransferAction.Download) - ctx.initialize() + 
ctx.initialize(settings.TransferAction.Download) specs = settings.create_download_specifications(ctx.config) for spec in specs: blobxfer.api.Downloader( @@ -773,7 +777,7 @@ def synccopy(ctx): """Synchronously copy blobs between Azure Storage accounts""" raise NotImplementedError() settings.add_cli_options(ctx.cli_options, settings.TransferAction.Synccopy) - ctx.initialize() + ctx.initialize(settings.TransferAction.Synccopy) @cli.command('upload') @@ -784,7 +788,7 @@ def synccopy(ctx): def upload(ctx): """Upload files to Azure Storage""" settings.add_cli_options(ctx.cli_options, settings.TransferAction.Upload) - ctx.initialize() + ctx.initialize(settings.TransferAction.Upload) specs = settings.create_upload_specifications(ctx.config) for spec in specs: blobxfer.api.Uploader( diff --git a/cli/settings.py b/cli/settings.py index 1507c82..24d1a7f 100644 --- a/cli/settings.py +++ b/cli/settings.py @@ -61,13 +61,13 @@ def add_cli_options(cli_options, action): if blobxfer.util.is_none_or_empty(local_resource): raise KeyError() except KeyError: - raise ValueError('--local-resource must be specified') + raise ValueError('--local-path must be specified') try: storage_account = cli_options['storage_account'] if blobxfer.util.is_none_or_empty(storage_account): raise KeyError() except KeyError: - raise ValueError('--storage-account-name must be specified') + raise ValueError('--storage-account must be specified') try: remote_path = cli_options['remote_path'] if blobxfer.util.is_none_or_empty(remote_path): @@ -167,7 +167,7 @@ def add_cli_options(cli_options, action): raise KeyError() except KeyError: raise ValueError( - '--sync-copy-dest-storage-account-name must be specified') + '--sync-copy-dest-storage-account must be specified') try: sync_copy_dest_remote_path = \ cli_options['sync_copy_dest_remote_path'] @@ -278,10 +278,11 @@ def create_azure_storage_credentials(config, general_options): return creds -def create_general_options(config): - # type: (dict) -> blobxfer.models.options.General +def create_general_options(config, action): + # type: (dict, TransferAction) -> blobxfer.models.options.General """Create a General Options object from configuration :param dict config: config dict + :param TransferAction action: transfer action :rtype: blobxfer.models.options.General :return: general options object """ @@ -292,6 +293,7 @@ def create_general_options(config): disk_threads=conc.get('disk_threads', 0), md5_processes=conc.get('md5_processes', 0), transfer_threads=conc.get('transfer_threads', 0), + is_download=action == TransferAction.Download, ), log_file=config['options'].get('log_file', None), progress_bar=config['options'].get('progress_bar', True), diff --git a/docs/01-installation.md b/docs/01-installation.md index 2609f07..9a3fd74 100644 --- a/docs/01-installation.md +++ b/docs/01-installation.md @@ -72,9 +72,10 @@ docker pull alfpark/blobxfer ## Troubleshooting #### `azure.storage` dependency not found -If you get an error that `azure.storage` cannot be found or loaded, then -most likely there was a conflict with this package with other `azure` packages -that share the same base namespace. You can correct this by issuing: +If you get an error such as `ImportError: No module named storage` or that +`azure.storage` cannot be found or loaded, then most likely there was a +conflict with this package with other `azure` packages that share the same +base namespace. 
You can correct this by issuing: ```shell # for Python3 pip3 install --upgrade --force-reinstall azure-storage diff --git a/docs/10-cli-usage.md b/docs/10-cli-usage.md index 8931464..2f3aad4 100644 --- a/docs/10-cli-usage.md +++ b/docs/10-cli-usage.md @@ -12,9 +12,9 @@ command will be detailed along with all options available. ### `download` Downloads a remote Azure path, which may contain many resources, to the local machine. This command requires at the minimum, the following options: -* `--storage-account-name` +* `--storage-account` * `--remote-path` -* `--local-resource` +* `--local-path` Additionally, an authentication option for the storage account is required. Please see the Authentication sub-section below under Options. @@ -23,14 +23,14 @@ Please see the Authentication sub-section below under Options. Uploads a local path to a remote Azure path. The local path may contain many resources on the local machine. This command requires at the minimum, the following options: -* `--local-resource` -* `--storage-account-name` +* `--local-path` +* `--storage-account` * `--remote-path` Additionally, an authentication option for the storage account is required. Please see the Authentication sub-section below under Options. -If piping from `stdin`, `--local-resource` should be set to `-` as per +If piping from `stdin`, `--local-path` should be set to `-` as per convention. ### `synccopy` @@ -49,9 +49,10 @@ of up to 100MiB, all others have a maximum of 4MiB. attributes (mode and ownership) should be stored or restored. Note that to restore uid/gid, `blobxfer` must be run as root or under sudo. * `--file-md5` or `--no-file-md5` controls if the file MD5 should be computed. -* `--local-resource` is the local resource path. Set to `-` if piping from +* `--local-path` is the local resource path. Set to `-` if piping from `stdin`. -* `--log-file` specifies the log file to write to. +* `--log-file` specifies the log file to write to. This must be specified +for a progress bar to be output to console. * `--mode` is the operating mode. The default is `auto` but may be set to `append`, `block`, `file`, or `page`. If specified with the `upload` command, then all files will be uploaded as the specified `mode` type. @@ -61,12 +62,16 @@ with Azure File shares. * `--overwrite` or `--no-overwrite` controls clobber semantics at the destination. * `--progress-bar` or `--no-progress-bar` controls if a progress bar is -output to the console. +output to the console. `--log-file` must be specified for a progress bar +to be output. * `--recursive` or `--no-recursive` controls if the source path should be recursively uploaded or downloaded. * `--remote-path` is the remote Azure path. This path must contain the Blob container or File share at the begining, e.g., `mycontainer/vdir` * `--resume-file` specifies the resume file to write to. +* `--storage-account` specifies the storage account to use. This can be +optionally provided through an environment variable `BLOBXFER_STORAGE_ACCOUNT` +instead. * `--timeout` is the integral timeout value in seconds to use. * `-h` or `--help` can be passed at every command level to receive context sensitive help. @@ -96,7 +101,7 @@ to/from Azure Storage. ### Connection * `--endpoint` is the Azure Storage endpoint to connect to; the default is Azure Public regions, or `core.windows.net`. -* `--storage-account-name` is the storage account to connect to. +* `--storage-account` is the storage account to connect to. 
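As noted in the option list above, `--storage-account` may also be supplied through the `BLOBXFER_STORAGE_ACCOUNT` environment variable. The CLI wires this up with `click`'s `envvar` support (see the `_storage_account_option` decorator in `cli/cli.py` earlier in this patch). The following is a minimal standalone `click` sketch of the same pattern; it is illustrative only and is not blobxfer's actual CLI code.

```python
import click


@click.command()
@click.option('--storage-account', envvar='BLOBXFER_STORAGE_ACCOUNT',
              help='Storage account name')
def main(storage_account):
    # click resolves the value from the command line first and falls back
    # to the BLOBXFER_STORAGE_ACCOUNT environment variable if the option
    # was not passed explicitly.
    click.echo('storage account: {}'.format(storage_account))


if __name__ == '__main__':
    main()
```

Invoked as `BLOBXFER_STORAGE_ACCOUNT=mystorageaccount python example.py` (a hypothetical file name for this sketch), the option resolves from the environment without being passed on the command line.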
### Encryption * `--rsa-private-key` is the RSA private key in PEM format to use. This can @@ -161,27 +166,27 @@ file path. The default is `1`. ### `download` Examples #### Download an Entire Encrypted Blob Container to Current Working Directory ```shell -blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-public-key ~/mypubkey.pem +blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-public-key ~/mypubkey.pem ``` #### Download an Entire File Share to Designated Path and Skip On Filesize Matches ```shell -blobxfer download --mode file --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-resource /my/path --skip-on-filesize-match +blobxfer download --mode file --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-path /my/path --skip-on-filesize-match ``` #### Download only Page Blobs in Blob Container Virtual Directory Non-recursively and Cleanup Local Path to Match Remote Path ```shell -blobxfer download --mode page --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource /my/pageblobs --no-recursive --delete +blobxfer download --mode page --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path /my/pageblobs --no-recursive --delete ``` #### Resume Incomplete Downloads Matching an Include Pattern and Log to File and Restore POSIX File Attributes ```shell -blobxfer download --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes +blobxfer download --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes ``` #### Download a Blob Snapshot ```shell -blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-resource . +blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-path . ``` #### Download using a YAML Configuration File @@ -192,27 +197,27 @@ blobxfer download --config myconfig.yaml ### `upload` Examples #### Upload Current Working Directory as Encrypted Block Blobs Non-recursively ```shell -blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-private-key ~/myprivatekey.pem --no-recursive +blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-private-key ~/myprivatekey.pem --no-recursive ``` #### Upload Specific Path Recursively to a File Share, Store File MD5 and POSIX File Attributes to a File Share and Exclude Some Files ```shell -blobxfer upload --mode file --storage-account-name mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-resource . --file-md5 --file-attributes --exclude '*.bak' +blobxfer upload --mode file --storage-account mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-path . 
--file-md5 --file-attributes --exclude '*.bak'
 ```
 
 #### Upload Single File with Resume and Striped Vectored IO into 512MiB Chunks
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
 ```
 
 #### Upload Specific Path but Skip On Any MD5 Matches, Store File MD5 and Cleanup Remote Path to Match Local Path
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /my/path --file-md5 --skip-on-md5-match --delete
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /my/path --file-md5 --skip-on-md5-match --delete
 ```
 
 #### Upload From Piped `stdin`
 ```shell
-curl -fSsL https://some.uri | blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource -
+curl -fSsL https://some.uri | blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path -
 ```
 
 #### Upload using a YAML Configuration File
diff --git a/docs/30-vectored-io.md b/docs/30-vectored-io.md
index a007b7d..1d17c40 100644
--- a/docs/30-vectored-io.md
+++ b/docs/30-vectored-io.md
@@ -91,5 +91,5 @@ keep this metadata in-tact or reconstruction will fail.
    +---------------------+
 ```
 
-In order to take advantage of `stripe` Vectored IO, you must use a YAML
-configuration file to define multiple destinations.
+In order to take advantage of `stripe` Vectored IO across multiple
+destinations, you must use a YAML configuration file.
diff --git a/docs/98-performance-considerations.md b/docs/98-performance-considerations.md
index 89ff0c6..8a511fc 100644
--- a/docs/98-performance-considerations.md
+++ b/docs/98-performance-considerations.md
@@ -3,8 +3,8 @@ Please read the following carefully regarding considerations that should be
 applied with regard to performance and `blobxfer`. Additionally, please
 review the
 [Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/)
-for an overview of general performance targets that apply to Azure Blobs
-and File shares.
+for an overview of general performance targets that apply to Azure Blobs,
+File shares, and Storage Account types (GRS, LRS, ZRS, etc.).
 
 ## Concurrency
 * `blobxfer` offers four concurrency knobs. Each one should be tuned for
@@ -23,6 +23,44 @@ maximum performance according to your system and network characteristics.
 * The thread concurrency options (disk and transfer) can be set to a
 non-positive number to be automatically set as a multiple of the number
 of cores available on the machine.
+* For uploads, there should be a sufficient number of disk threads to ensure
+that all transfer threads have work to do. For downloads, there should be a
+sufficient number of disk threads writing data to disk so that transfer
+threads are not artificially blocked.
+
+## Chunk Sizing
+Chunk sizing refers to the `chunk_size_bytes` option; its meaning differs
+depending upon whether you are uploading or downloading.
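As a concrete illustration of the block-count constraint discussed in the Uploads subsection below, the smallest usable chunk size for a block blob upload can be derived directly from the file size and the 50,000-block limit. The helper below is hypothetical (it is not part of blobxfer) and simply rounds the result up to a whole MiB.

```python
MAX_BLOCKS = 50000              # service limit on blocks per block blob
MAX_BLOCK_SIZE = 100 * 2 ** 20  # 100MiB maximum block size
MiB = 2 ** 20


def min_chunk_size_bytes(file_size):
    """Smallest whole-MiB chunk that fits file_size within MAX_BLOCKS blocks."""
    chunk = -(-file_size // MAX_BLOCKS)   # ceiling division: bytes per block
    chunk = -(-chunk // MiB) * MiB        # round up to a whole MiB
    if chunk > MAX_BLOCK_SIZE:
        raise ValueError('file too large for a single block blob')
    return chunk


# A 1TiB file needs chunks of at least 21MiB to stay within 50000 blocks.
print(min_chunk_size_bytes(2 ** 40) // MiB)  # -> 21
```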
+
+### Uploads
+For uploads, chunk sizes correspond to the maximum amount of data to transfer
+with a single request. The Azure Storage service imposes maximums depending
+upon the type of entity that is being written. For block blobs, the maximum
+is 100MiB per block (although you may "one-shot" a blob of up to 256MiB in a
+single request). For page blobs, append blobs, and Azure Files, the maximum
+is 4MiB.
+
+For block blobs, setting the chunk size to something greater than 4MiB will
+not only allow you to create larger objects (recall that the maximum number
+of blocks for a block blob is 50000, thus at 100MiB blocks, you can create
+a block blob of roughly 4.75TiB) but will also allow you to amortize larger
+portions of data transfer over each request/response overhead. `blobxfer`
+can automatically select the proper block size given your file, but will
+not automatically tune the chunk size as that depends upon your system and
+network characteristics.
+
+### Downloads
+For downloads, chunk sizes correspond to the maximum amount of data to
+retrieve from the server with each request. It is important to keep a balance
+between the chunk size and the number of in-flight operations afforded by
+the `transfer_threads` concurrency control. `blobxfer` does not automatically
+tune this (but can automatically set it to a value that should work for
+most situations) due to varying system and network conditions.
+
+Additionally, disk write performance is typically lower than disk read
+performance, so ensure that the number of `disk_threads` is not set too
+high; an excessively large value can cause disk thrashing and highly
+random write patterns.
 
 ## Azure File Share Performance
 File share performance can be "slow" or become a bottleneck, especially for