diff --git a/.coveragerc b/.coveragerc
index b710cba..5fc34c3 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -6,6 +6,7 @@ omit =
 exclude_lines =
     # Have to re-enable the standard pragma
     pragma: no cover
+    noqa
 
     # Don't complain about missing debug-only code:
     def __repr__
diff --git a/.gitignore b/.gitignore
index ddc86bb..21d27b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,6 +43,7 @@ htmlcov/
 nosetests.xml
 coverage.xml
 *,cover
+junit-*.xml
 
 # Translations
 *.mo
diff --git a/.travis.yml b/.travis.yml
index cdf9217..b183124 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,28 +5,10 @@ python:
   - 3.3
   - 3.4
   - 3.5
-  - pypy
-  # disable pypy3 until 3.3 compliance
-  #- pypy3
+  - 3.6
 install:
-  - |
-    if [ "$TRAVIS_PYTHON_VERSION" = "pypy" ]; then
-      export PYENV_ROOT="$HOME/.pyenv"
-      if [ -f "$PYENV_ROOT/bin/pyenv" ]; then
-        pushd "$PYENV_ROOT" && git pull && popd
-      else
-        rm -rf "$PYENV_ROOT" && git clone --depth 1 https://github.com/yyuu/pyenv.git "$PYENV_ROOT"
-      fi
-      export PYPY_VERSION="5.4.1"
-      "$PYENV_ROOT/bin/pyenv" install --skip-existing "pypy-$PYPY_VERSION"
-      virtualenv --python="$PYENV_ROOT/versions/pypy-$PYPY_VERSION/bin/python" "$HOME/virtualenvs/pypy-$PYPY_VERSION"
-      source "$HOME/virtualenvs/pypy-$PYPY_VERSION/bin/activate"
-    fi
-  - travis_retry pip install -e .
-  - travis_retry pip install coveralls flake8 mock pytest pytest-cov requests_mock
+  - travis_retry pip install tox-travis coveralls
 script:
-  - flake8 blobxfer.py test/test_blobxfer.py
-  - PYTHONPATH=. py.test -l --full-trace --cov-config .coveragerc --cov-report term-missing --cov blobxfer test/test_blobxfer.py
+  - tox
 after_success:
   - coveralls --rcfile=.coveragerc --verbose
-
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d1e4233..fd71b89 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,158 +2,176 @@
 
 ## [Unreleased]
 
+## [1.0.0a3] - 2017-06-02
+### Changed
+- From-scratch rewrite providing a consistent CLI experience and a vast
+array of new and advanced features. Please see the
+[1.0.0 Milestone](https://github.com/Azure/blobxfer/milestone/1) for a
+catalog of changes.
+- **Breaking Changes:** the rewrite introduces a significant number of
+breaking changes, ranging from the command-line invocation of `blobxfer`
+itself to the option and environment variable names. Please review the
+usage documentation carefully when upgrading from 0.12.1.
+- All dependencies updated to latest versions
+
+### Removed
+- Azure Service Management certificate support
+
+### Security
+- Update cryptography requirement to 1.9
+
 ## [0.12.1] - 2016-12-09
-#### Changed
+### Changed
 - Update all dependencies to latest versions
 
-#### Fixed
+### Fixed
 - Allow page blobs up to 1TB
 
-#### Security
+### Security
 - Update cryptography requirement to 1.6
 
 ## [0.12.0] - 2016-10-17
-#### Added
+### Added
 - Support for Account-level SAS keys
 - Update README regarding non-normalized exceptions being thrown (#5)
 
 ## [0.11.5] - 2016-10-03
-#### Changed
+### Changed
 - Update all dependencies to latest versions
 
-#### Fixed
+### Fixed
 - Fix incorrect fileshare path splitting (#3)
 
-#### Security
+### Security
 - Update cryptography requirement to 1.5.2
 
 ## [0.11.4] - 2016-09-12
-#### Added
+### Added
 - Created [Docker image](https://hub.docker.com/r/alfpark/blobxfer)
 
-#### Changed
+### Changed
 - Update all dependencies to latest versions
 
-#### Fixed
+### Fixed
 - Fix `--delete` and blob listing with azure-storage (#1)
 
-#### Security
+### Security
 - Update cryptography requirement to 1.5
 
 ## [0.11.2] - 2016-07-28
-#### Added
+### Added
 - Allow rsakeypassphrase to be passed as an environment variable
 
 ## 0.11.1 - 2016-07-05
-#### Added
+### Added
 - Allow storage account or sas key credentials to be passed as environment variables
 
 ## 0.11.0 - 2016-06-09
-#### Added
+### Added
 - Azure Files support, please refer to the General Notes section for limitations
 
-#### Changed
+### Changed
 - `--blobep` option has been renamed to `--endpoint`
 
 ## 0.10.1 - 2016-06-06
-#### Changed
+### Changed
 - Update all dependencies to latest versions
 - Add flag for block/page level md5 computation which is now disabled by default
 
-#### Fixed
+### Fixed
 - Update against breaking changes from azure-storage 0.32.0
 
-#### Removed
+### Removed
 - Remove RC designation from encryption/decryption functionality
 
-#### Security
+### Security
 - Update cryptography requirement to 1.4
 
 ## 0.10.0 - 2016-03-22
-#### Added
+### Added
 - Added ``--disable-urllib-warnings`` option to suppress urllib3 warnings (use with care)
 
-#### Changed
+### Changed
 - Update script for compatibility with azure-storage 0.30.0 which is now a required dependency
 - Promote encryption to RC status
 - `--blobep` now refers to endpoint suffix rather than blob endpoint (e.g., core.windows.net rather than blob.core.windows.net)
 
-#### Security
+### Security
 - Update cryptography requirement to 1.3
 
 ## 0.9.9.11 - 2016-02-22
-#### Changed
+### Changed
 - Pin azure dependencies due to breaking changes
 
-#### Fixed
+### Fixed
 - Minor bug fixes
 
-#### Security
+### Security
 - Update cryptography requirement to 1.2.2
 
 ## 0.9.9.10 - 2016-01-31
-#### Fixed
+### Fixed
 - Fix regression in blob name encoding with Python3
 
 ## 0.9.9.9 - 2016-01-29
-#### Added
+### Added
 - Emit warning when attempting to use remoteresource with a directory upload
 
-#### Changed
+### Changed
 - Update setup.py dependencies to latest available versions
 
-#### Fixed
+### Fixed
 - Fix regression in single file upload and remoteresource renaming
 - Replace socket exception handling with requests ConnectionError handling
 - Properly handle blob names containing `?` if using SAS
 
 ## 0.9.9.8 - 2016-01-06
-#### Fixed
+### Fixed
 - Disable unnecessary thread daemonization
 - Gracefully handle KeyboardInterrupts
 - Explicitly add azure-common to setup.py install reqs
 
 ## 0.9.9.7 - 2016-01-05
-#### Added
+### Added
 - Add python environment and package info to parameter dump to aid issue/bug reports
 
-#### Changed
+### Changed
 - Reduce number of default concurrent workers to 3x CPU count
 - Change azure\_request backoff mechanism
 
-#### Fixed
+### Fixed
 - Make base requirements non-optional in import process
 - Update azure\_request exception handling to support new Azure Storage Python SDK errors
 
 ## 0.9.9.6 - 2016-01-04
-#### Added
+### Added
 - Encryption support
 - No file overwrite on download option
 - Auto-detection of file mimetype
 - Remote delete option
 - Include pattern option
 
-#### Changed
+### Changed
 - Replace keeprootdir with strip-components option
 - Reduce the number of default concurrent workers to 4x CPU count
 
-#### Fixed
+### Fixed
 - Fix shared key upload with non-existent container
 - Fix zero-byte blob download issue
 
 ## 0.9.9.5 - 2015-09-27
-#### Added
+### Added
 - File collation support
 
-#### Fixed
+### Fixed
 - Fix page alignment bug
 - Reduce memory usage
@@ -183,7 +201,8 @@
 `--no-skiponmatch`.
 - 0.8.2: performance regression fixes
 
-[Unreleased]: https://github.com/Azure/blobxfer/compare/0.12.1...HEAD
+[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a3...HEAD
+[1.0.0a3]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a3
 [0.12.1]: https://github.com/Azure/blobxfer/compare/0.12.0...0.12.1
 [0.12.0]: https://github.com/Azure/blobxfer/compare/0.11.5...0.12.0
 [0.11.5]: https://github.com/Azure/blobxfer/compare/0.11.4...0.11.5
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fd2c904
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+[![Build Status](https://travis-ci.org/Azure/blobxfer.svg?branch=master)](https://travis-ci.org/Azure/blobxfer)
+[![Coverage Status](https://coveralls.io/repos/github/Azure/blobxfer/badge.svg?branch=master)](https://coveralls.io/github/Azure/blobxfer?branch=master)
+[![PyPI](https://img.shields.io/pypi/v/blobxfer.svg)](https://pypi.python.org/pypi/blobxfer)
+[![PyPI](https://img.shields.io/pypi/pyversions/blobxfer.svg)](https://pypi.python.org/pypi/blobxfer)
+[![Docker Pulls](https://img.shields.io/docker/pulls/alfpark/blobxfer.svg)](https://hub.docker.com/r/alfpark/blobxfer)
+[![Image Layers](https://images.microbadger.com/badges/image/alfpark/blobxfer:latest.svg)](http://microbadger.com/images/alfpark/blobxfer)
+
+# blobxfer
+`blobxfer` is an advanced data movement tool and library for Azure Storage
+Blob and Files. With `blobxfer` you can copy your files into or out of Azure
+Storage with the CLI or integrate the `blobxfer` data movement library into
+your own Python scripts.
+
+## Major Features
+* Command-line interface (CLI) providing data movement capability to and
+from Azure Blob and File Storage
+* Standalone library for integration with scripts or other Python packages
+* High-performance design with asynchronous transfers and disk I/O
+* YAML configuration driven execution support
+* Resume support
+* Vectored IO support
+  * `stripe` mode allows striping a single file across multiple blobs (even
+    to multiple storage accounts) to break through single blob or fileshare
+    throughput limits
+  * `replica` mode allows replication of a file across multiple destinations
+    including to multiple storage accounts
+* Client-side encryption support
+* Support for all blob types for both upload and download
+* Advanced skip options for rsync-like operations
+* Store/restore POSIX filemode and uid/gid
+* Support for reading/piping from `stdin`
+* Support for reading from blob snapshots
+* Configurable one-shot block upload support
+* Configurable chunk size for both upload and download
+* Automatic block blob size adjustment for uploading
+* Automatic uploading of VHD/VHDX files as page blobs
+* Include and exclude filtering support
+* Rsync-like delete support
+* No clobber support in either direction
+* File logging support
+
+## Installation
+`blobxfer` is on [PyPI](https://pypi.python.org/pypi/blobxfer) and on
+[Docker Hub](https://hub.docker.com/r/alfpark/blobxfer/). Please refer to
+the [installation guide](https://github.com/Azure/blobxfer/blob/master/docs/01-installation.md)
+on how to install `blobxfer`.
+
+## Documentation
+Please refer to the [blobxfer Documentation](https://github.com/Azure/blobxfer/blob/master/docs)
+for more details and usage information.
+
+## Change Log
+For recent changes, please refer to the
+[CHANGELOG.md](https://github.com/Azure/blobxfer/blob/master/CHANGELOG.md)
+file.
+
+------------------------------------------------------------------------
+
+This project has adopted the
+[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+For more information see the
+[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any
+additional questions or comments.
diff --git a/README.rst b/README.rst
deleted file mode 100644
index 882d883..0000000
--- a/README.rst
+++ /dev/null
@@ -1,426 +0,0 @@
-.. image:: https://travis-ci.org/Azure/blobxfer.svg?branch=master
-   :target: https://travis-ci.org/Azure/blobxfer
-.. image:: https://coveralls.io/repos/github/Azure/blobxfer/badge.svg?branch=master
-   :target: https://coveralls.io/github/Azure/blobxfer?branch=master
-.. image:: https://img.shields.io/pypi/v/blobxfer.svg
-   :target: https://pypi.python.org/pypi/blobxfer
-.. image:: https://img.shields.io/pypi/pyversions/blobxfer.svg
-   :target: https://pypi.python.org/pypi/blobxfer
-.. image:: https://img.shields.io/pypi/l/blobxfer.svg
-   :target: https://pypi.python.org/pypi/blobxfer
-.. image:: https://img.shields.io/docker/pulls/alfpark/blobxfer.svg
-   :target: https://hub.docker.com/r/alfpark/blobxfer
-.. image:: https://images.microbadger.com/badges/image/alfpark/blobxfer.svg
-   :target: https://microbadger.com/images/alfpark/blobxfer
-
-blobxfer
-========
-AzCopy-like OS independent Azure storage blob and file share transfer tool
-
-Installation
-------------
-`blobxfer`_ is on PyPI and can be installed via:
-
-::
-
-   pip install blobxfer
-
-blobxfer is compatible with Python 2.7 and 3.3+.
To install for Python 3, some -distributions may use ``pip3`` instead. If you do not want to install blobxfer -as a system-wide binary and modify system-wide python packages, use the -``--user`` flag with ``pip`` or ``pip3``. - -blobxfer is also on `Docker Hub`_, and the Docker image for Linux can be -pulled with the following command: - -:: - - docker pull alfpark/blobxfer - -Please see example usage below on how to use the docker image. - -If you encounter difficulties installing the script, it may be due to the -``cryptography`` dependency. Please ensure that your system is able to install -binary wheels provided by these dependencies (e.g., on Windows) or is able to -compile the dependencies (i.e., ensure you have a C compiler, python, ssl, -and ffi development libraries/headers installed prior to invoking pip). For -instance, to install blobxfer on a fresh Ubuntu 14.04/16.04 installation for -Python 2.7, issue the following commands: - -:: - - apt-get update - apt-get install -y build-essential libssl-dev libffi-dev libpython-dev python-dev python-pip - pip install --upgrade blobxfer - -If you need more fine-grained control on installing dependencies, continue -reading this section. Depending upon the desired mode of authentication with -Azure and options, the script will require the following packages, some of -which will automatically pull required dependent packages. Below is a list of -dependent packages: - -- Base Requirements - - - `azure-common`_ - - `azure-storage`_ - - `requests`_ - -- Encryption Support - - - `cryptography`_ - -- Service Management Certificate Support - - - `azure-servicemanagement-legacy`_ - -You can install these packages using pip, easy_install or through standard -setup.py procedures. These dependencies will be automatically installed if -using a package-based install or setup.py. The required versions of these -dependent packages can be found in ``setup.py``. - -.. _blobxfer: https://pypi.python.org/pypi/blobxfer -.. _Docker Hub: https://hub.docker.com/r/alfpark/blobxfer -.. _azure-common: https://pypi.python.org/pypi/azure-common -.. _azure-storage: https://pypi.python.org/pypi/azure-storage -.. _requests: https://pypi.python.org/pypi/requests -.. _cryptography: https://pypi.python.org/pypi/cryptography -.. _azure-servicemanagement-legacy: https://pypi.python.org/pypi/azure-servicemanagement-legacy - -Introduction ------------- - -The blobxfer.py script allows interacting with storage accounts using any of -the following methods: (1) management certificate, (2) shared account key, -(3) SAS key. The script can, in addition to working with single files, mirror -entire directories into and out of containers or file shares from Azure -Storage, respectively. File and block/page level MD5 integrity checking is -supported along with various transfer optimizations, built-in retries, -user-specified timeouts, and client-side encryption. - -Program parameters and command-line options can be listed via the ``-h`` -switch. Please invoke this first if you are unfamiliar with blobxfer operation -as not all options are explained below. At the minimum, three positional -arguments are required: storage account name, container or share name, and -local resource. Additionally, one of the following authentication switches -must be supplied: ``--subscriptionid`` with ``--managementcert``, -``--storageaccountkey``, or ``--saskey``. Do not combine different -authentication schemes together. 
- -Environment variables ``BLOBXFER_STORAGEACCOUNTKEY``, ``BLOBXFER_SASKEY``, -and ``BLOBXFER_RSAKEYPASSPHRASE`` can take the place of -``--storageaccountkey``, ``--saskey``, and ``--rsakeypassphrase`` respectively -if you do not want to expose credentials on a command line. - -It is generally recommended to use SAS keys wherever appropriate; only HTTPS -transport is used in the script. Please note that when using SAS keys that -only container- or fileshare-level SAS keys will allow for entire directory -uploading or container/fileshare downloading. The container/fileshare must -also have been created beforehand if using a service SAS, as -containers/fileshares cannot be created using service SAS keys. Account-level -SAS keys with a signed resource type of ``c`` or container will allow -containers/fileshares to be created with SAS keys. - -Example Usage -------------- - -The following examples show how to invoke the script with commonly used -options. Note that the authentication parameters are missing from the below -examples. You will need to select a preferred method of authenticating with -Azure and add the authentication switches (or as environment variables) as -noted above. - -The script will attempt to perform a smart transfer, by detecting if the local -resource exists. For example: - -:: - - blobxfer mystorageacct container0 mylocalfile.txt - -Note: if you downloaded the script directly from github, then you should append -``.py`` to the blobxfer command. - -If mylocalfile.txt exists locally, then the script will attempt to upload the -file to container0 on mystorageacct. If the file does not exist, then it will -attempt to download the resource. If the desired behavior is to download the -file from Azure even if the local file exists, one can override the detection -mechanism with ``--download``. ``--upload`` is available to force the transfer -to Azure storage. Note that specifying a particular direction does not force -the actual operation to occur as that depends on other options specified such -as skipping on MD5 matches. Note that you may use the ``--remoteresource`` flag -to rename the local file as the blob name on Azure storage if uploading, -however, ``--remoteresource`` has no effect if uploading a directory of files. -Please refer to the ``--collate`` option as explained below. - -If the local resource is a directory that exists, the script will attempt to -mirror (recursively copy) the entire directory to Azure storage while -maintaining subdirectories as virtual directories in Azure storage. You can -disable the recursive copy (i.e., upload only the files in the directory) -using the ``--no-recursive`` flag. - -To upload a directory with files only matching a Unix-style shell wildcard -pattern, an example commandline would be: - -:: - - blobxfer mystorageacct container0 mylocaldir --upload --include '**/*.txt' - -This would attempt to recursively upload the contents of mylocaldir -to container0 for any file matching the wildcard pattern ``*.txt`` within -all subdirectories. Include patterns can be applied for uploads as well as -downloads. Note that you will need to prevent globbing by your shell such -that wildcard expansion does not take place before script interprets the -argument. If ``--include`` is not specified, all files will be uploaded -or downloaded for the specific context. - -To download an entire container from your storage account, an example -commandline would be: - -:: - - blobxfer mystorageacct container0 mylocaldir --remoteresource . 
- -Assuming mylocaldir directory does not exist, the script will attempt to -download all of the contents in container0 because “.” is set with -``--remoteresource`` flag. To download individual blobs, one would specify the -blob name instead of “.” with the ``--remoteresource`` flag. If mylocaldir -directory exists, the script will attempt to upload the directory instead of -downloading it. If you want to force the download direction even if the -directory exists, indicate that with the ``--download`` flag. When downloading -an entire container, the script will attempt to pre-allocate file space and -recreate the sub-directory structure as needed. - -To collate files into specified virtual directories or local paths, use -the ``--collate`` flag with the appropriate parameter. For example, the -following commandline: - -:: - - blobxfer mystorageacct container0 myvhds --upload --collate vhds --autovhd - -If the directory ``myvhds`` had two vhd files a.vhd and subdir/b.vhd, these -files would be uploaded into ``container0`` under the virtual directory named -``vhds``, and b.vhd would not contain the virtual directory subdir; thus, -flattening the directory structure. The ``--autovhd`` flag would automatically -enable page blob uploads for these files. If you wish to collate all files -into the container directly, you would replace ``--collate vhds`` with -``--collate .`` - -To strip leading components of a path on upload, use ``--strip-components`` -with a number argument which will act similarly to tar's -``--strip-components=NUMBER`` parameter. This parameter is only applied -during an upload. - -To encrypt or decrypt files, the option ``--rsapublickey`` and -``--rsaprivatekey`` is available. This option requires a file location for a -PEM encoded RSA public or private key. An optional parameter, -``--rsakeypassphrase`` is available for passphrase protected RSA private keys. - -To encrypt and upload, only the RSA public key is required although an RSA -private key may be specified. To download and decrypt blobs which are -encrypted, the RSA private key is required. - -:: - - blobxfer mystorageacct container0 myblobs --upload --rsapublickey mypublickey.pem - -The above example commandline would encrypt and upload files contained in -``myblobs`` using an RSA public key named ``mypublickey.pem``. An RSA private -key may be specified instead for uploading (public parts will be used). - -:: - - blobxfer mystorageacct container0 myblobs --remoteresource . --download --rsaprivatekey myprivatekey.pem - -The above example commandline would download and decrypt all blobs in the -container ``container0`` using an RSA private key named ``myprivatekey.pem``. -An RSA private key must be specified for downloading and decryption of -encrypted blobs. - -Currently only the ``FullBlob`` encryption mode is supported for the -parameter ``--encmode``. The ``FullBlob`` encryption mode either uploads or -downloads Azure Storage .NET/Java compatible client-side encrypted block blobs. - -Please read important points in the Encryption Notes below for more -information. - -To transfer to an Azure Files share, specify the ``--fileshare`` option and -specify the share name as the second positional argument. - -:: - - blobxfer mystorageacct myshare localfiles --fileshare --upload - -The above example would upload all files in the ``localfiles`` directory to -the share named ``myshare``. Encryption/decryption options are compatible with -Azure Files as the destination or source. 
Please refer to this `MSDN article`_ -for features not supported by the Azure File Service. - -.. _MSDN article: https://msdn.microsoft.com/en-us/library/azure/dn744326.aspx - -Docker Usage ------------- - -An example execution for uploading the host path ``/example/host/path`` -to a storage container named ``container0`` would be: - -:: - - docker run --rm -t -v /example/host/path:/path/in/container alfpark/blobxfer mystorageacct container0 /path/in/container --upload - -Note that docker volume mount mappings must be crafted with care to ensure -consistency with directory depth between the host and the container. -Optionally, you can utilize the ``--strip-components`` flag to remove leading -path components as desired. - -General Notes -------------- - -- If the pyOpenSSL package is present, urllib3/requests may use this package - (as discussed in the Performance Notes below), which may result in - exceptions being thrown that are not normalized by urllib3. This may - result in exceptions that should be retried, but are not. It is recommended - to upgrade your Python where pyOpenSSL is not required for fully validating - peers and such that blobxfer can operate without pyOpenSSL in a secure - fashion. You can also run blobxfer via Docker or in a virtualenv - environment without pyOpenSSL. -- blobxfer does not take any leases on blobs or containers. It is up to - the user to ensure that blobs are not modified while download/uploads - are being performed. -- No validation is performed regarding container and file naming and length - restrictions. -- blobxfer will attempt to download from blob storage as-is. If the source - filename is incompatible with the destination operating system, then - failure may result. -- When using SAS, the SAS key must be a container- or share-level SAS if - performing recursive directory upload or container/file share download. -- If uploading via service-level SAS keys, the container or file share must - already be created in Azure storage prior to upload. Account-level SAS keys - with the signed resource type of ``c`` or container-level permission will - allow conatiner or file share creation. -- For non-SAS requests, timeouts may not be properly honored due to - limitations of the Azure Python SDK. -- By default, files with matching MD5 checksums will be skipped for both - download (if MD5 information is present on the blob) and upload. Specify - ``--no-skiponmatch`` to disable this functionality. -- When uploading files as page blobs, the content is page boundary - byte-aligned. The MD5 for the blob is computed using the final aligned - data if the source is not page boundary byte-aligned. This enables these - page blobs or files to be skipped during subsequent download or upload by - default (i.e., ``--no-skiponmatch`` parameter is not specified). -- If ``--delete`` is specified, any remote files found that have no - corresponding local file in directory upload mode will be deleted. Deletion - occurs prior to any transfers, analogous to the delete-before rsync option. - Please note that this parameter will interact with ``--include`` and any - file not included from the include pattern will be deleted. -- ``--include`` has no effect when specifying a single file to upload or - blob to download. When specifying ``--include`` on container download, - the pattern will be applied to the blob name without the container name. 
- Globbing of wildcards must be disabled such that the script can read - the include pattern without the shell expanding the wildcards, if specified. -- Empty directories are not created locally when downloading from an Azure - file share which has empty directories. -- Empty directories are not deleted if ``--delete`` is specified and no - files remain in the directory on the Azure file share. - -Performance Notes ------------------ - -- Most likely, you will need to tweak the ``--numworkers`` argument that best - suits your environment. The default is the number of CPUs on the running - machine multiplied by 3 (except when transferring to/from file shares). - Increasing this number (or even using the default) may not provide the - optimal balance between concurrency and your network conditions. - Additionally, this number may not work properly if you are attempting to - run multiple blobxfer sessions in parallel from one machine or IP address. - Futhermore, this number may be defaulted to be set too high if encryption - is enabled and the machine cannot handle processing multiple threads in - parallel. -- Computing file MD5 can be time consuming for large files. If integrity - checking or rsync-like capability is not required, specify - ``--no-computefilemd5`` to disable MD5 computation for files. -- File share performance can be "slow" or become a bottleneck, especially for - file shares containing thousands of files as multiple REST calls must be - performed for each file. Currently, a single file share has a limit of up - to 60 MB/s and 1000 8KB IOPS. Please refer to the - `Azure Storage Scalability and Performance Targets`_ for performance targets - and limits regarding Azure Storage Blobs and Files. If scalable high - performance is required, consider using blob storage or multiple file - shares. -- Using SAS keys may provide the best performance as the script bypasses - the Azure Storage Python SDK and uses requests/urllib3 directly with - Azure Storage endpoints. Transfers to/from Azure Files will always use - the Azure Storage Python SDK even with SAS keys. -- As of requests 2.6.0 and Python versions < 2.7.9 (i.e., interpreter found - on default Ubuntu 14.04 installations), if certain packages are installed, - as those found in ``requests[security]`` then the underlying ``urllib3`` - package will utilize the ``ndg-httpsclient`` package which will use - `pyOpenSSL`_. This will ensure the peers are `fully validated`_. However, - this incurs a rather larger performance penalty. If you understand the - potential security risks for disabling this behavior due to high performance - requirements, you can either remove ``ndg-httpsclient`` or use the script - in a ``virtualenv`` environment without the ``ndg-httpsclient`` package. - Python versions >= 2.7.9 are not affected by this issue. These warnings can - be suppressed using ``--disable-urllib-warnings``, but is not recommended - unless you understand the security implications. - -.. _Azure Storage Scalability and Performance Targets: https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/ -.. _pyOpenSSL: https://urllib3.readthedocs.org/en/latest/security.html#pyopenssl -.. _fully validated: https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning - - -Encryption Notes ----------------- - -- All required information regarding the encryption process is stored on - each blob's ``encryptiondata`` and ``encryptiondata_authentication`` - metadata. 
These metadata entries are used on download to configure the proper - download and parameters for the decryption process as well as to authenticate - the encryption. Encryption metadata set by blobxfer (or the Azure Storage - .NET/Java client library) should not be modified or blobs/files may be - unrecoverable. -- Local files can be encrypted by blobxfer and stored in Azure Files and, - correspondingly, remote files on Azure File shares can be decrypted by - blobxfer as long as the metdata portions remain in-tact. -- Keys for AES256 block cipher are generated on a per-blob/file basis. These - keys are encrypted using RSAES-OAEP. -- MD5 for both the pre-encrypted and encrypted version of the file is stored - in blob/file metadata. Rsync-like synchronization is still supported - transparently with encrypted blobs/files. -- Whole file MD5 checks are skipped if a message authentication code is found - to validate the integrity of the encrypted data. -- Attempting to upload the same file as an encrypted blob with a different RSA - key or under a different encryption mode will not occur if the file content - MD5 is the same. This behavior can be overridden by including the option - ``--no-skiponmatch``. -- If one wishes to apply encryption to a blob/file already uploaded to Azure - Storage that has not changed, the upload will not occur since the underlying - file content MD5 has not changed; this behavior can be overriden by - including the option ``--no-skiponmatch``. -- Encryption is only applied to block blobs (or fileshare files). Encrypted - page blobs appear to be of minimal value stored in Azure Storage via - blobxfer. Thus, if uploading VHDs while enabling encryption in the script, - do not enable the option ``--pageblob``. ``--autovhd`` will continue to work - transparently where vhd files will be uploaded as page blobs in unencrypted - form while other files will be uploaded as encrypted block blobs. Note that - using ``--autovhd`` with encryption will force set the max chunk size to - 4 MiB for non-encrypted vhd files. -- Downloading encrypted blobs/files may not fully preallocate each file due to - padding. Script failure can result during transfer if there is insufficient - disk space. -- Zero-byte (empty) files are not encrypted. - -Change Log ----------- - -See the `CHANGELOG.md`_ file. - -.. _CHANGELOG.md: https://github.com/Azure/blobxfer/blob/master/CHANGELOG.md - ----- - -This project has adopted the -`Microsoft Open Source Code of Conduct `__. -For more information see the -`Code of Conduct FAQ `__ -or contact `opencode@microsoft.com `__ with any -additional questions or comments. diff --git a/blobxfer.py b/blobxfer.py deleted file mode 100755 index 5cadcba..0000000 --- a/blobxfer.py +++ /dev/null @@ -1,3033 +0,0 @@ -#!/usr/bin/env python - -# blobxfer Tool -# -# Copyright (c) Microsoft Corporation -# -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -""" -Data transfer tool for Azure blob and file storage - -See notes in the README.rst file. - -TODO list: -- convert from threading to multiprocessing -- move instruction queue data to class -- migrate connections with sas to azure-storage -""" - -# pylint: disable=R0913,R0914 - -# stdlib imports -from __future__ import print_function -import argparse -import base64 -import errno -import fnmatch -import hashlib -import hmac -import json -import mimetypes -import multiprocessing -import os -import platform -# pylint: disable=F0401 -try: - import queue -except ImportError: # pragma: no cover - import Queue as queue -# pylint: enable=F0401 -import socket -import sys -import threading -import time -import traceback -try: - from urllib.parse import quote as urlquote -except ImportError: # pramga: no cover - from urllib import quote as urlquote -import xml.etree.ElementTree as ET -# non-stdlib imports -import azure.common -try: - import azure.servicemanagement -except ImportError: # pragma: no cover - pass -import azure.storage.blob -import azure.storage.file -try: - import cryptography.hazmat.backends - import cryptography.hazmat.primitives.asymmetric.padding - import cryptography.hazmat.primitives.asymmetric.rsa - import cryptography.hazmat.primitives.ciphers - import cryptography.hazmat.primitives.ciphers.algorithms - import cryptography.hazmat.primitives.ciphers.modes - import cryptography.hazmat.primitives.constant_time - import cryptography.hazmat.primitives.hashes - import cryptography.hazmat.primitives.padding - import cryptography.hazmat.primitives.serialization -except ImportError: # pragma: no cover - pass -import requests - -# remap keywords for Python3 -# pylint: disable=W0622,C0103 -try: - xrange -except NameError: # pragma: no cover - xrange = range -try: - long -except NameError: # pragma: no cover - long = int -# pylint: enable=W0622,C0103 - -# global defines -_SCRIPT_VERSION = '0.12.1' -_PY2 = sys.version_info.major == 2 -_DEFAULT_MAX_STORAGEACCOUNT_WORKERS = multiprocessing.cpu_count() * 3 -_MAX_BLOB_CHUNK_SIZE_BYTES = 4194304 -_EMPTY_MAX_PAGE_SIZE_MD5 = 'tc+p1sj+vWGPkawoQ9UKHA==' -_MAX_LISTBLOBS_RESULTS = 1000 -_PAGEBLOB_BOUNDARY = 512 -_DEFAULT_STORAGE_ENDPOINT = 'core.windows.net' -_DEFAULT_MANAGEMENT_ENDPOINT = 'management.core.windows.net' -_ENVVAR_STORAGEACCOUNTKEY = 'BLOBXFER_STORAGEACCOUNTKEY' -_ENVVAR_SASKEY = 'BLOBXFER_SASKEY' -_ENVVAR_RSAKEYPASSPHRASE = 'BLOBXFER_RSAKEYPASSPHRASE' -# encryption defines -_AES256_KEYLENGTH_BYTES = 32 -_AES256_BLOCKSIZE_BYTES = 16 -_HMACSHA256_DIGESTSIZE_BYTES = 32 -_AES256CBC_HMACSHA256_OVERHEAD_BYTES = _AES256_BLOCKSIZE_BYTES + \ - _HMACSHA256_DIGESTSIZE_BYTES -_ENCRYPTION_MODE_FULLBLOB = 'FullBlob' -_ENCRYPTION_MODE_CHUNKEDBLOB = 'ChunkedBlob' -_DEFAULT_ENCRYPTION_MODE = _ENCRYPTION_MODE_FULLBLOB -_ENCRYPTION_PROTOCOL_VERSION = '1.0' -_ENCRYPTION_ALGORITHM = 'AES_CBC_256' -_ENCRYPTION_AUTH_ALGORITHM = 'HMAC-SHA256' -_ENCRYPTION_CHUNKSTRUCTURE = 'IV || EncryptedData || Signature' -_ENCRYPTION_ENCRYPTED_KEY_SCHEME = 'RSA-OAEP' -_ENCRYPTION_METADATA_NAME = 
'encryptiondata' -_ENCRYPTION_METADATA_MODE = 'EncryptionMode' -_ENCRYPTION_METADATA_ALGORITHM = 'Algorithm' -_ENCRYPTION_METADATA_MAC = 'MessageAuthenticationCode' -_ENCRYPTION_METADATA_LAYOUT = 'EncryptedDataLayout' -_ENCRYPTION_METADATA_CHUNKOFFSETS = 'ChunkByteOffsets' -_ENCRYPTION_METADATA_CHUNKSTRUCTURE = 'ChunkStructure' -_ENCRYPTION_METADATA_AGENT = 'EncryptionAgent' -_ENCRYPTION_METADATA_PROTOCOL = 'Protocol' -_ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM = 'EncryptionAlgorithm' -_ENCRYPTION_METADATA_INTEGRITY_AUTH = 'EncryptionAuthentication' -_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY = 'WrappedContentKey' -_ENCRYPTION_METADATA_ENCRYPTEDKEY = 'EncryptedKey' -_ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY = 'EncryptedAuthenticationKey' -_ENCRYPTION_METADATA_CONTENT_IV = 'ContentEncryptionIV' -_ENCRYPTION_METADATA_KEYID = 'KeyId' -_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS = 'BlobxferExtensions' -_ENCRYPTION_METADATA_PREENCRYPTED_MD5 = 'PreEncryptedContentMD5' -_ENCRYPTION_METADATA_AUTH_NAME = 'encryptiondata_authentication' -_ENCRYPTION_METADATA_AUTH_METAAUTH = 'EncryptionMetadataAuthentication' -_ENCRYPTION_METADATA_AUTH_ENCODING = 'Encoding' -_ENCRYPTION_METADATA_AUTH_ENCODING_TYPE = 'UTF-8' - - -class EncryptionMetadataJson(object): - """Class for handling encryption metadata json""" - def __init__( - self, args, symkey, signkey, iv, encdata_signature, - preencrypted_md5, rsakeyid=None): - """Ctor for EncryptionMetadataJson - Parameters: - args - program arguments - symkey - symmetric key - signkey - signing key - iv - initialization vector - encdata_signature - encrypted data signature (MAC) - preencrypted_md5 - pre-encrypted md5 hash - rsakeyid - symmetric key id - Returns: - Nothing - Raises: - Nothing - """ - self.encmode = args.encmode - self.rsaprivatekey = args.rsaprivatekey - self.rsapublickey = args.rsapublickey - self.chunksizebytes = args.chunksizebytes - self.symkey = symkey - self.signkey = signkey - if rsakeyid is None: - self.rsakeyid = 'private:key1' - else: - self.rsakeyid = rsakeyid - self.iv = iv - self.hmac = encdata_signature - self.md5 = preencrypted_md5 - - def construct_metadata_json(self): - """Constructs encryptiondata metadata - Paramters: - None - Returns: - dict of encryptiondata and encryptiondata_authentiation json - Raises: - Nothing - """ - encsymkey, _ = rsa_encrypt_key( - self.rsaprivatekey, self.rsapublickey, self.symkey) - encsignkey, _ = rsa_encrypt_key( - self.rsaprivatekey, self.rsapublickey, self.signkey) - encjson = { - _ENCRYPTION_METADATA_MODE: self.encmode, - _ENCRYPTION_METADATA_WRAPPEDCONTENTKEY: { - _ENCRYPTION_METADATA_KEYID: self.rsakeyid, - _ENCRYPTION_METADATA_ENCRYPTEDKEY: encsymkey, - _ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY: encsignkey, - _ENCRYPTION_METADATA_ALGORITHM: - _ENCRYPTION_ENCRYPTED_KEY_SCHEME, - }, - _ENCRYPTION_METADATA_AGENT: { - _ENCRYPTION_METADATA_PROTOCOL: _ENCRYPTION_PROTOCOL_VERSION, - _ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM: - _ENCRYPTION_ALGORITHM - }, - _ENCRYPTION_METADATA_INTEGRITY_AUTH: { - _ENCRYPTION_METADATA_ALGORITHM: - _ENCRYPTION_AUTH_ALGORITHM, - }, - 'KeyWrappingMetadata': {}, - } - if self.md5 is not None: - encjson[_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS] = { - _ENCRYPTION_METADATA_PREENCRYPTED_MD5: self.md5 - } - if self.encmode == _ENCRYPTION_MODE_FULLBLOB: - encjson[_ENCRYPTION_METADATA_CONTENT_IV] = base64encode(self.iv) - encjson[_ENCRYPTION_METADATA_INTEGRITY_AUTH][ - _ENCRYPTION_METADATA_MAC] = base64encode(self.hmac) - elif self.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - 
encjson[_ENCRYPTION_METADATA_LAYOUT] = {} - encjson[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKOFFSETS] = \ - self.chunksizebytes + _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - encjson[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKSTRUCTURE] = \ - _ENCRYPTION_CHUNKSTRUCTURE - else: - raise RuntimeError( - 'Unknown encryption mode: {}'.format(self.encmode)) - bencjson = json.dumps( - encjson, sort_keys=True, ensure_ascii=False).encode( - _ENCRYPTION_METADATA_AUTH_ENCODING_TYPE) - encjson = {_ENCRYPTION_METADATA_NAME: - json.dumps(encjson, sort_keys=True)} - # compute MAC over encjson - hmacsha256 = hmac.new(self.signkey, digestmod=hashlib.sha256) - hmacsha256.update(bencjson) - authjson = { - _ENCRYPTION_METADATA_AUTH_METAAUTH: { - _ENCRYPTION_METADATA_ALGORITHM: _ENCRYPTION_AUTH_ALGORITHM, - _ENCRYPTION_METADATA_AUTH_ENCODING: - _ENCRYPTION_METADATA_AUTH_ENCODING_TYPE, - _ENCRYPTION_METADATA_MAC: base64encode(hmacsha256.digest()), - } - } - encjson[_ENCRYPTION_METADATA_AUTH_NAME] = json.dumps( - authjson, sort_keys=True) - return encjson - - def parse_metadata_json( - self, blobname, rsaprivatekey, rsapublickey, mddict): - """Parses a meta data dictionary containing the encryptiondata - metadata - Parameters: - blobname - name of blob - rsaprivatekey - RSA private key - rsapublickey - RSA public key - mddict - metadata dictionary - Returns: - Nothing - Raises: - RuntimeError if encryptiondata metadata contains invalid or - unknown fields - """ - if _ENCRYPTION_METADATA_NAME not in mddict: - return - # json parse internal dict - meta = json.loads(mddict[_ENCRYPTION_METADATA_NAME]) - # populate preencryption md5 - if (_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS in meta and - _ENCRYPTION_METADATA_PREENCRYPTED_MD5 in meta[ - _ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS]): - self.md5 = meta[_ENCRYPTION_METADATA_BLOBXFER_EXTENSIONS][ - _ENCRYPTION_METADATA_PREENCRYPTED_MD5] - else: - self.md5 = None - # if RSA key is not present return - if rsaprivatekey is None and rsapublickey is None: - return - # check for required metadata fields - if (_ENCRYPTION_METADATA_MODE not in meta or - _ENCRYPTION_METADATA_AGENT not in meta): - return - # populate encryption mode - self.encmode = meta[_ENCRYPTION_METADATA_MODE] - # validate known encryption metadata is set to proper values - if self.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - chunkstructure = meta[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKSTRUCTURE] - if chunkstructure != _ENCRYPTION_CHUNKSTRUCTURE: - raise RuntimeError( - '{}: unknown encrypted chunk structure {}'.format( - blobname, chunkstructure)) - protocol = meta[_ENCRYPTION_METADATA_AGENT][ - _ENCRYPTION_METADATA_PROTOCOL] - if protocol != _ENCRYPTION_PROTOCOL_VERSION: - raise RuntimeError('{}: unknown encryption protocol: {}'.format( - blobname, protocol)) - blockcipher = meta[_ENCRYPTION_METADATA_AGENT][ - _ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM] - if blockcipher != _ENCRYPTION_ALGORITHM: - raise RuntimeError('{}: unknown block cipher: {}'.format( - blobname, blockcipher)) - if _ENCRYPTION_METADATA_INTEGRITY_AUTH in meta: - intauth = meta[_ENCRYPTION_METADATA_INTEGRITY_AUTH][ - _ENCRYPTION_METADATA_ALGORITHM] - if intauth != _ENCRYPTION_AUTH_ALGORITHM: - raise RuntimeError( - '{}: unknown integrity/auth method: {}'.format( - blobname, intauth)) - symkeyalg = meta[_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - _ENCRYPTION_METADATA_ALGORITHM] - if symkeyalg != _ENCRYPTION_ENCRYPTED_KEY_SCHEME: - raise RuntimeError('{}: unknown key encryption scheme: {}'.format( 
- blobname, symkeyalg)) - # populate iv and hmac - if self.encmode == _ENCRYPTION_MODE_FULLBLOB: - self.iv = base64.b64decode(meta[_ENCRYPTION_METADATA_CONTENT_IV]) - # don't base64 decode hmac - if _ENCRYPTION_METADATA_INTEGRITY_AUTH in meta: - self.hmac = meta[_ENCRYPTION_METADATA_INTEGRITY_AUTH][ - _ENCRYPTION_METADATA_MAC] - else: - self.hmac = None - # populate chunksize - if self.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - self.chunksizebytes = long( - meta[_ENCRYPTION_METADATA_LAYOUT][ - _ENCRYPTION_METADATA_CHUNKOFFSETS]) - # if RSA key is a public key, stop here as keys cannot be decrypted - if rsaprivatekey is None: - return - # decrypt symmetric key - self.symkey = rsa_decrypt_key( - rsaprivatekey, - meta[_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - _ENCRYPTION_METADATA_ENCRYPTEDKEY], None) - # decrypt signing key, if it exists - if _ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY in meta[ - _ENCRYPTION_METADATA_WRAPPEDCONTENTKEY]: - self.signkey = rsa_decrypt_key( - rsaprivatekey, - meta[_ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - _ENCRYPTION_METADATA_ENCRYPTEDAUTHKEY], None) - else: - self.signkey = None - # validate encryptiondata metadata using the signing key - if (self.signkey is not None and - _ENCRYPTION_METADATA_AUTH_NAME in mddict): - authmeta = json.loads(mddict[_ENCRYPTION_METADATA_AUTH_NAME]) - if _ENCRYPTION_METADATA_AUTH_METAAUTH not in authmeta: - raise RuntimeError( - '{}: encryption metadata auth block not found'.format( - blobname)) - if _ENCRYPTION_METADATA_AUTH_ENCODING not in authmeta[ - _ENCRYPTION_METADATA_AUTH_METAAUTH]: - raise RuntimeError( - '{}: encryption metadata auth encoding not found'.format( - blobname)) - intauth = authmeta[_ENCRYPTION_METADATA_AUTH_METAAUTH][ - _ENCRYPTION_METADATA_ALGORITHM] - if intauth != _ENCRYPTION_AUTH_ALGORITHM: - raise RuntimeError( - '{}: unknown integrity/auth method: {}'.format( - blobname, intauth)) - authhmac = base64.b64decode( - authmeta[_ENCRYPTION_METADATA_AUTH_METAAUTH][ - _ENCRYPTION_METADATA_MAC]) - bmeta = mddict[_ENCRYPTION_METADATA_NAME].encode( - authmeta[_ENCRYPTION_METADATA_AUTH_METAAUTH][ - _ENCRYPTION_METADATA_AUTH_ENCODING]) - hmacsha256 = hmac.new(self.signkey, digestmod=hashlib.sha256) - hmacsha256.update(bmeta) - if hmacsha256.digest() != authhmac: - raise RuntimeError( - '{}: encryption metadata authentication failed'.format( - blobname)) - - -class PqTupleSort(tuple): - """Priority Queue tuple sorter: handles priority collisions. 
- 0th item in the tuple is the priority number.""" - def __lt__(self, rhs): - return self[0] < rhs[0] - - def __gt__(self, rhs): - return self[0] > rhs[0] - - def __le__(self, rhs): - return self[0] <= rhs[0] - - def __ge__(self, rhs): - return self[0] >= rhs[0] - - -class SasBlobList(object): - """Sas Blob listing object""" - def __init__(self): - """Ctor for SasBlobList""" - self.blobs = [] - self.next_marker = None - - def __iter__(self): - """Iterator""" - return iter(self.blobs) - - def __len__(self): - """Length""" - return len(self.blobs) - - def __getitem__(self, index): - """Accessor""" - return self.blobs[index] - - def add_blob(self, name, content_length, content_md5, blobtype, mddict): - """Adds a blob to the list - Parameters: - name - blob name - content_length - content length - content_md5 - content md5 - blobtype - blob type - mddict - metadata dictionary - Returns: - Nothing - Raises: - Nothing - """ - obj = type('bloblistobject', (object,), {}) - obj.name = name - obj.metadata = mddict - obj.properties = type('properties', (object,), {}) - obj.properties.content_length = content_length - obj.properties.content_settings = azure.storage.blob.ContentSettings() - if content_md5 is not None and len(content_md5) > 0: - obj.properties.content_settings.content_md5 = content_md5 - obj.properties.blobtype = blobtype - self.blobs.append(obj) - - def set_next_marker(self, marker): - """Set the continuation token - Parameters: - marker - next marker - Returns: - Nothing - Raises: - Nothing - """ - if marker is not None and len(marker) > 0: - self.next_marker = marker - - -class SasBlobService(object): - """BlobService supporting SAS for functions used in the Python SDK. - create_container method does not exist because it is not a supported - operation under SAS""" - def __init__(self, endpoint, saskey, timeout): - """SAS Blob Service ctor - Parameters: - endpoint - storage endpoint - saskey - saskey - timeout - timeout - Returns: - Nothing - Raises: - Nothing - """ - self.endpoint = endpoint - # normalize sas key - if saskey[0] != '?': - self.saskey = '?' 
+ saskey - else: - self.saskey = saskey - self.timeout = timeout - - def _parse_blob_list_xml(self, content): - """Parse blob list in xml format to an attribute-based object - Parameters: - content - http response content in xml - Returns: - attribute-based object - Raises: - No special exception handling - """ - result = SasBlobList() - root = ET.fromstring(content) - blobs = root.find('Blobs') - for blob in blobs.iter('Blob'): - name = blob.find('Name').text - props = blob.find('Properties') - cl = long(props.find('Content-Length').text) - md5 = props.find('Content-MD5').text - bt = props.find('BlobType').text - metadata = blob.find('Metadata') - mddict = {} - for md in metadata: - mddict[md.tag] = md.text - result.add_blob(name, cl, md5, bt, mddict) - try: - result.set_next_marker(root.find('NextMarker').text) - except Exception: - pass - return result - - def list_blobs( - self, container_name, marker=None, - max_results=_MAX_LISTBLOBS_RESULTS, include=None): - """List blobs in container - Parameters: - container_name - container name - marker - marker - max_results - max results - include - `azure.storage.models.Include` include object - Returns: - List of blobs - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - saskey=self.saskey) - reqparams = { - 'restype': 'container', - 'comp': 'list', - 'maxresults': str(max_results)} - if marker is not None: - reqparams['marker'] = marker - if include is not None and include.metadata: - reqparams['include'] = 'metadata' - response = azure_request( - requests.get, url=url, params=reqparams, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError( - 'incorrect status code returned for list_blobs: {}'.format( - response.status_code)) - return self._parse_blob_list_xml(response.content) - - def _get_blob(self, container_name, blob_name, start_range, end_range): - """Get blob - Parameters: - container_name - container name - blob_name - name of blob - start_range - start range of bytes - end_range - end range of bytes - Returns: - `azure.storage.blob.Blob` object - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = { - 'x-ms-range': 'bytes={}-{}'.format(start_range, end_range) - } - response = azure_request( - requests.get, url=url, headers=reqheaders, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200 and response.status_code != 206: - raise IOError( - 'incorrect status code returned for get_blob: {}'.format( - response.status_code)) - return azure.storage.blob.Blob(content=response.content) - - def get_blob_properties(self, container_name, blob_name): - """Get blob properties - Parameters: - container_name - container name - blob_name - name of blob - Returns: - `azure.storage.blob.Blob` object - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - response = azure_request( - requests.head, url=url, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError('incorrect status code returned for ' - 'get_blob_properties: {}'.format( - response.status_code)) - # parse response 
headers into blob object - blob = azure.storage.blob.Blob() - blob.propertes = azure.storage.blob.BlobProperties() - blob.properties.content_length = \ - long(response.headers['content-length']) - blob.properties.content_settings = azure.storage.blob.ContentSettings() - if 'content-md5' in response.headers: - blob.properties.content_settings.content_md5 = \ - response.headers['content-md5'] - # read meta values, all meta values are lowercased - mddict = {} - for res in response.headers: - if res.startswith('x-ms-meta-'): - mddict[res[10:]] = response.headers[res] - blob.metadata = mddict - return blob - - def set_blob_metadata( - self, container_name, blob_name, metadata): - """Set blob metadata. Clearing is not supported. - Parameters: - container_name - container name - blob_name - name of blob - metadata - blob metadata dictionary - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - if metadata is None or len(metadata) == 0: - return - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqparams = {'comp': 'metadata'} - reqheaders = {} - for key in metadata: - reqheaders['x-ms-meta-' + key] = metadata[key] - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError( - 'incorrect status code returned for ' - 'set_blob_metadata: {}'.format(response.status_code)) - - def create_blob( - self, container_name, blob_name, content_length, content_settings): - """Create blob for initializing page blobs - Parameters: - container_name - container name - blob_name - name of blob - content_length - content length aligned to 512-byte boundary - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - response content - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = { - 'x-ms-blob-type': 'PageBlob', - 'x-ms-blob-content-length': str(content_length), - } - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - if content_settings.content_type is not None: - reqheaders['x-ms-blob-content-type'] = \ - content_settings.content_type - response = azure_request( - requests.put, url=url, headers=reqheaders, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for create_blob: {}'.format( - response.status_code)) - return response.content - - def _put_blob( - self, container_name, blob_name, blob, content_settings): - """Put blob for creating/updated block blobs - Parameters: - container_name - container name - blob_name - name of blob - blob - blob content - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - response content - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = {'x-ms-blob-type': 'BlockBlob'} - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - if 
content_settings.content_type is not None: - reqheaders['x-ms-blob-content-type'] = \ - content_settings.content_type - response = azure_request( - requests.put, url=url, headers=reqheaders, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for put_blob: {}'.format( - response.status_code)) - return response.content - - def update_page( - self, container_name, blob_name, page, start_range, end_range, - validate_content=False, content_md5=None): - """Put page for page blob. This API differs from the Python storage - sdk to maintain efficiency for block md5 computation. - Parameters: - container_name - container name - blob_name - name of blob - page - page data - start_range - start range of bytes - end_range - end range of bytes - validate_content - validate content - content_md5 - md5 hash for page data - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = { - 'x-ms-range': 'bytes={}-{}'.format(start_range, end_range), - 'x-ms-page-write': 'update'} - if validate_content and content_md5 is not None: - reqheaders['Content-MD5'] = content_md5 - reqparams = {'comp': 'page'} - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - data=page, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for update_page: {}'.format( - response.status_code)) - - def put_block( - self, container_name, blob_name, block, block_id, - validate_content=False): - """Put block for blob - Parameters: - container_name - container name - blob_name - name of blob - block - block data - block_id - block id - validate_content - validate content - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - # compute block md5 - if validate_content: - reqheaders = {'Content-MD5': compute_md5_for_data_asbase64(block)} - else: - reqheaders = None - reqparams = {'comp': 'block', 'blockid': block_id} - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - data=block, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for put_block: {}'.format( - response.status_code)) - - def put_block_list( - self, container_name, blob_name, block_list, - content_settings): - """Put block list for blob - Parameters: - container_name - container name - blob_name - name of blob - block_list - list of `azure.storage.blob.BlobBlock` - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = {} - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - if content_settings.content_type is not None: - reqheaders['x-ms-blob-content-type'] = \ - content_settings.content_type - reqparams = {'comp': 
'blocklist'} - body = [''] - for block in block_list: - body.append('{}'.format(block.id)) - body.append('') - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - data=''.join(body), timeout=self.timeout) - response.raise_for_status() - if response.status_code != 201: - raise IOError( - 'incorrect status code returned for put_block_list: {}'.format( - response.status_code)) - - def set_blob_properties( - self, container_name, blob_name, content_settings): - """Sets blob properties (MD5 only) - Parameters: - container_name - container name - blob_name - name of blob - content_settings - `azure.storage.blob.ContentSettings` object - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - reqheaders = {} - if content_settings is not None: - if content_settings.content_md5 is not None: - reqheaders['x-ms-blob-content-md5'] = \ - content_settings.content_md5 - reqparams = {'comp': 'properties'} - response = azure_request( - requests.put, url=url, params=reqparams, headers=reqheaders, - timeout=self.timeout) - response.raise_for_status() - if response.status_code != 200: - raise IOError('incorrect status code returned for ' - 'set_blob_properties: {}'.format( - response.status_code)) - - def delete_blob( - self, container_name, blob_name): - """Deletes a blob - Parameters: - container_name - container name - blob_name - name of blob - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}/{blob_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - blob_name=blob_name, saskey=self.saskey) - response = azure_request( - requests.delete, url=url, timeout=self.timeout) - response.raise_for_status() - if response.status_code != 202: - raise IOError( - 'incorrect status code returned for delete_blob: {}'.format( - response.status_code)) - - def create_container( - self, container_name, fail_on_exist=False): - """Create a container - Parameters: - container_name - container name - Returns: - Nothing - Raises: - IOError if unexpected status code - """ - url = '{endpoint}{container_name}{saskey}'.format( - endpoint=self.endpoint, container_name=container_name, - saskey=self.saskey) - reqparams = {'restype': 'container'} - response = azure_request( - requests.put, url=url, params=reqparams, timeout=self.timeout) - if response.status_code != 201: - if response.status_code == 409: - if fail_on_exist: - response.raise_for_status() - else: - return - raise IOError('incorrect status code returned for ' - 'create_container: {}'.format( - response.status_code)) - - -class StorageChunkWorker(threading.Thread): - """Chunk worker for a storage entity""" - def __init__( - self, exc, s_in_queue, s_out_queue, args, xfertoazure, - blob_service, file_service): - """Storage Chunk worker Thread ctor - Parameters: - exc - exception list - s_in_queue - storage in queue - s_out_queue - storage out queue - args - program arguments - xfertoazure - xfer to azure (direction) - blob_service - blob service - file_service - file service - Returns: - Nothing - Raises: - Nothing - """ - threading.Thread.__init__(self) - self.terminate = False - self._exc = exc - self._in_queue = s_in_queue - self._out_queue = s_out_queue - self.args = args - self.xfertoazure = xfertoazure - self.blob_service = blob_service - self.file_service = 
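# Sketch of the Put Block List payload that put_block_list above joins and
# sends with comp=blocklist; the element names follow the Blob service REST
# API, and block_list is assumed to be a list of objects with an .id, as in
# the removed code:
def block_list_body(block_list):
    body = ['<?xml version="1.0" encoding="utf-8"?><BlockList>']
    for block in block_list:
        body.append('<Latest>{}</Latest>'.format(block.id))
    body.append('</BlockList>')
    return ''.join(body)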
file_service - - def run(self): - """Thread code - Parameters: - Nothing - Returns: - Nothing - Raises: - Nothing - """ - while not self.terminate: - try: - pri, (localresource, container, remoteresource, blockid, - offset, bytestoxfer, encparam, flock, filedesc) = \ - self._in_queue.get_nowait() - except queue.Empty: - break - # detect termination early and break if necessary - if self.terminate: - break - try: - if self.xfertoazure: - # if iv is not ready for this chunk, re-add back to queue - if (not as_page_blob(self.args, localresource) and - ((self.args.rsaprivatekey is not None or - self.args.rsapublickey is not None) and - self.args.encmode == _ENCRYPTION_MODE_FULLBLOB)): - _iblockid = int(blockid) - if _iblockid not in encparam[2]: - self._in_queue.put( - PqTupleSort(( - pri, - (localresource, container, remoteresource, - blockid, offset, bytestoxfer, encparam, - flock, filedesc)))) - continue - # upload block/page - self.put_storage_data( - localresource, container, remoteresource, blockid, - offset, bytestoxfer, encparam, flock, filedesc) - else: - # download range - self.get_storage_range( - localresource, container, remoteresource, blockid, - offset, bytestoxfer, encparam, flock, filedesc) - # pylint: disable=W0703 - except Exception: - # pylint: enable=W0703 - self._exc.append(traceback.format_exc()) - self._out_queue.put((localresource, encparam)) - if len(self._exc) > 0: - break - - def put_storage_data( - self, localresource, container, remoteresource, blockid, offset, - bytestoxfer, encparam, flock, filedesc): - """Puts data (blob, page or file bits) into Azure storage - Parameters: - localresource - name of local resource - container - blob container - remoteresource - name of remote resource - blockid - block id (ignored for page blobs) - offset - file offset - bytestoxfer - number of bytes to xfer - encparam - encryption metadata: (symkey, signkey, ivmap, pad) - flock - file lock - filedesc - file handle - Returns: - Nothing - Raises: - IOError if file cannot be read - """ - # if bytestoxfer is zero, then we're transferring a zero-byte - # file, use put blob instead of page/block ops - if bytestoxfer == 0: - contentmd5 = compute_md5_for_data_asbase64(b'') - if as_page_blob(self.args, localresource): - azure_request( - self.blob_service[1].create_blob, container_name=container, - blob_name=remoteresource, content_length=bytestoxfer, - content_settings=azure.storage.blob.ContentSettings( - content_type=get_mime_type(localresource), - content_md5=contentmd5)) - elif self.args.fileshare: - fsfile = split_fileshare_path_into_parts(remoteresource) - azure_request( - self.file_service.create_file, share_name=container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=bytestoxfer, - content_settings=azure.storage.file.ContentSettings( - content_type=get_mime_type(localresource), - content_md5=contentmd5)) - else: - azure_request( - self.blob_service[0]._put_blob, container_name=container, - blob_name=remoteresource, blob=None, - content_settings=azure.storage.blob.ContentSettings( - content_type=get_mime_type(localresource), - content_md5=contentmd5)) - return - # read the file at specified offset, must take lock - data = None - with flock: - closefd = False - if not filedesc: - filedesc = open(localresource, 'rb') - closefd = True - filedesc.seek(offset, 0) - data = filedesc.read(bytestoxfer) - if closefd: - filedesc.close() - if not data: - raise IOError('could not read {}: {} -> {}'.format( - localresource, offset, offset + bytestoxfer)) - # issue REST put 
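# Sketch of the locked ranged read performed by put_storage_data above: a
# shared threading.Lock guards the (optionally shared) file descriptor while
# each worker seeks to its own chunk offset; read_chunk is a stand-in name:
def read_chunk(localresource, offset, nbytes, flock, filedesc=None):
    with flock:
        closefd = filedesc is None
        fd = filedesc if filedesc else open(localresource, 'rb')
        fd.seek(offset, 0)
        data = fd.read(nbytes)
        if closefd:
            fd.close()
    if not data:
        raise IOError('could not read {}: {} -> {}'.format(
            localresource, offset, offset + nbytes))
    return data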
- if as_page_blob(self.args, localresource): - aligned = page_align_content_length(bytestoxfer) - # fill data to boundary - if aligned != bytestoxfer: - data = data.ljust(aligned, b'\0') - # compute page md5 - contentmd5 = compute_md5_for_data_asbase64(data) - # check if this page is empty - if contentmd5 == _EMPTY_MAX_PAGE_SIZE_MD5: - return - elif len(data) != _MAX_BLOB_CHUNK_SIZE_BYTES: - data_chk = b'\0' * len(data) - data_chk_md5 = compute_md5_for_data_asbase64(data_chk) - del data_chk - if data_chk_md5 == contentmd5: - return - del data_chk_md5 - # upload page range - if self.args.saskey: - azure_request( - self.blob_service[1].update_page, container_name=container, - blob_name=remoteresource, page=data, start_range=offset, - end_range=offset + aligned - 1, - validate_content=self.args.computeblockmd5, - content_md5=contentmd5, timeout=self.args.timeout) - else: - azure_request( - self.blob_service[1].update_page, container_name=container, - blob_name=remoteresource, page=data, start_range=offset, - end_range=offset + aligned - 1, - validate_content=self.args.computeblockmd5, - timeout=self.args.timeout) - else: - # encrypt block if required - if (encparam is not None and - (self.args.rsaprivatekey is not None or - self.args.rsapublickey is not None)): - symkey = encparam[0] - signkey = encparam[1] - if self.args.encmode == _ENCRYPTION_MODE_FULLBLOB: - _blkid = int(blockid) - iv = encparam[2][_blkid] - pad = encparam[3] - else: - iv = None - pad = True - data = encrypt_chunk( - symkey, signkey, data, self.args.encmode, iv=iv, pad=pad) - with flock: - if self.args.encmode == _ENCRYPTION_MODE_FULLBLOB: - # compute hmac for chunk - if _blkid == 0: - encparam[2]['hmac'].update(iv + data) - else: - encparam[2]['hmac'].update(data) - # store iv for next chunk - encparam[2][_blkid + 1] = data[ - len(data) - _AES256_BLOCKSIZE_BYTES:] - # compute md5 for encrypted data chunk - encparam[2]['md5'].update(data) - if self.args.fileshare: - bytestoxfer = len(data) - encparam[2]['filesize'] += bytestoxfer - if self.args.fileshare: - fsfile = split_fileshare_path_into_parts(remoteresource) - # subtract 1 from end_range - azure_request( - self.file_service.update_range, share_name=container, - directory_name=fsfile[0], file_name=fsfile[1], - data=data, start_range=offset, - end_range=offset + bytestoxfer - 1, - validate_content=self.args.computeblockmd5, - timeout=self.args.timeout) - else: - azure_request( - self.blob_service[0].put_block, container_name=container, - blob_name=remoteresource, block=data, block_id=blockid, - validate_content=self.args.computeblockmd5, - timeout=self.args.timeout) - del data - - def get_storage_range( - self, localresource, container, remoteresource, blockid, offset, - bytestoxfer, encparam, flock, filedesc): - """Get a segment of a blob/page/file using range offset downloading - Parameters: - localresource - name of local resource - container - blob container - remoteresource - name of remote resource - blockid - block id (integral) - offset - file offset - bytestoxfer - number of bytes to xfer - encparam - decryption metadata: - (symkey, signkey, offset_mod, encmode, ivmap, unpad) - flock - file lock - filedesc - file handle - Returns: - Nothing - Raises: - Nothing - """ - if (encparam[0] is not None and - encparam[3] == _ENCRYPTION_MODE_FULLBLOB): - if offset == 0: - start_range = offset - end_range = offset + bytestoxfer - else: - # retrieve block size data prior for IV - start_range = offset - _AES256_BLOCKSIZE_BYTES - end_range = offset + bytestoxfer - else: - 
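# Sketch of the full-blob chaining performed above when encryption is on:
# each chunk is AES-256-CBC encrypted, the last cipher block becomes the IV
# of the next chunk, and a running HMAC-SHA256 covers iv||ciphertext for the
# first chunk and the ciphertext alone afterwards. encrypt_chunk/encmode
# refer to the removed helper and its _ENCRYPTION_MODE_FULLBLOB constant;
# chain_chunks is a stand-in name:
import hashlib
import hmac
import os

_AES_BLOCKSIZE = 16  # AES block size in bytes

def chain_chunks(symkey, signkey, chunks, encrypt_chunk, encmode):
    ivmap = {0: os.urandom(_AES_BLOCKSIZE)}
    mac = hmac.new(signkey, digestmod=hashlib.sha256)
    for blkid, chunk in enumerate(chunks):
        iv = ivmap[blkid]
        # PKCS7 padding is only applied to the final chunk
        encdata = encrypt_chunk(symkey, signkey, chunk, encmode,
                                iv=iv, pad=(blkid == len(chunks) - 1))
        mac.update((iv + encdata) if blkid == 0 else encdata)
        # IV for the next chunk: trailing cipher block of this one
        ivmap[blkid + 1] = encdata[-_AES_BLOCKSIZE:]
        yield encdata
    # mac.digest() is what gets recorded in the encryption metadata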
start_range = offset - end_range = offset + bytestoxfer - if self.args.fileshare: - fsfile = split_fileshare_path_into_parts(remoteresource) - _blob = azure_request( - self.file_service._get_file, share_name=container, - directory_name=fsfile[0], file_name=fsfile[1], - start_range=start_range, end_range=end_range, - timeout=self.args.timeout) - else: - if as_page_blob(self.args, localresource): - blob_service = self.blob_service[1] - else: - blob_service = self.blob_service[0] - _blob = azure_request( - blob_service._get_blob, timeout=self.args.timeout, - container_name=container, blob_name=remoteresource, - start_range=start_range, end_range=end_range) - blobdata = _blob.content - # decrypt block if required - if encparam[0] is not None: - if encparam[3] == _ENCRYPTION_MODE_FULLBLOB: - if offset == 0: - iv = encparam[4][0] - else: - iv = blobdata[:_AES256_BLOCKSIZE_BYTES] - blobdata = blobdata[_AES256_BLOCKSIZE_BYTES:] - unpad = encparam[5] - # update any buffered data to hmac - hmacdict = encparam[4]['hmac'] - if hmacdict['hmac'] is not None: - # grab file lock to manipulate hmac - with flock: - # include iv in first hmac calculation - if offset == 0: - hmacdict['buffered'][blockid] = iv + blobdata - else: - hmacdict['buffered'][blockid] = blobdata - # try to process hmac data - while True: - curr = hmacdict['curr'] - if curr in hmacdict['buffered']: - hmacdict['hmac'].update( - hmacdict['buffered'][curr]) - hmacdict['buffered'].pop(curr) - hmacdict['curr'] = curr + 1 - else: - break - else: - iv = None - unpad = True - blobdata = decrypt_chunk( - encparam[0], encparam[1], blobdata, encparam[3], iv=iv, - unpad=unpad) - if blobdata is not None: - with flock: - closefd = False - if not filedesc: - filedesc = open(localresource, 'r+b') - closefd = True - filedesc.seek(offset - (encparam[2] or 0), 0) - filedesc.write(blobdata) - if closefd: - filedesc.close() - del blobdata - del _blob - - -def pad_pkcs7(buf): - """Appends PKCS7 padding to an input buffer. - Parameters: - buf - buffer to add padding - Returns: - buffer with PKCS7_PADDING - Raises: - No special exception handling - """ - padder = cryptography.hazmat.primitives.padding.PKCS7( - cryptography.hazmat.primitives.ciphers. - algorithms.AES.block_size).padder() - return padder.update(buf) + padder.finalize() - - -def unpad_pkcs7(buf): - """Removes PKCS7 padding a decrypted object. - Parameters: - buf - buffer to remove padding - Returns: - buffer without PKCS7_PADDING - Raises: - No special exception handling - """ - unpadder = cryptography.hazmat.primitives.padding.PKCS7( - cryptography.hazmat.primitives.ciphers. 
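# Sketch of the ordered HMAC drain used by get_storage_range above: range
# downloads complete out of order, so each worker buffers its ciphertext by
# block id (block 0 includes the IV) and the shared HMAC only consumes the
# next contiguous block; drain_hmac is a stand-in name:
def drain_hmac(hmacdict, blockid, data):
    hmacdict['buffered'][blockid] = data
    while hmacdict['curr'] in hmacdict['buffered']:
        curr = hmacdict['curr']
        hmacdict['hmac'].update(hmacdict['buffered'].pop(curr))
        hmacdict['curr'] = curr + 1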
- algorithms.AES.block_size).unpadder() - return unpadder.update(buf) + unpadder.finalize() - - -def generate_aes256_keys(): - """Generate AES256 symmetric key and signing key - Parameters: - None - Returns: - Tuple of symmetric key and signing key - Raises: - Nothing - """ - symkey = os.urandom(_AES256_KEYLENGTH_BYTES) - signkey = os.urandom(_AES256_KEYLENGTH_BYTES) - return symkey, signkey - - -def rsa_encrypt_key(rsaprivatekey, rsapublickey, plainkey, asbase64=True): - """Encrypt a plaintext key using RSA and PKCS1_OAEP padding - Parameters: - rsaprivatekey - rsa private key for encryption - rsapublickey - rsa public key for encryption - plainkey - plaintext key - asbase64 - encode as base64 - Returns: - Tuple of encrypted key and signature (if RSA private key is given) - Raises: - Nothing - """ - if rsapublickey is None: - rsapublickey = rsaprivatekey.public_key() - if rsaprivatekey is None: - signature = None - else: - signer = rsaprivatekey.signer( - cryptography.hazmat.primitives.asymmetric.padding.PSS( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - cryptography.hazmat.primitives.hashes.SHA256()), - salt_length=cryptography.hazmat.primitives.asymmetric. - padding.PSS.MAX_LENGTH), - cryptography.hazmat.primitives.hashes.SHA256()) - signer.update(plainkey) - signature = signer.finalize() - enckey = rsapublickey.encrypt( - plainkey, cryptography.hazmat.primitives.asymmetric.padding.OAEP( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - algorithm=cryptography.hazmat.primitives.hashes.SHA1()), - algorithm=cryptography.hazmat.primitives.hashes.SHA1(), - label=None)) - if asbase64: - return base64encode(enckey), base64encode( - signature) if signature is not None else signature - else: - return enckey, signature - - -def rsa_decrypt_key(rsaprivatekey, enckey, signature, isbase64=True): - """Decrypt an RSA encrypted key and optional signature verification - Parameters: - rsaprivatekey - rsa private key for decryption - enckey - encrypted key - signature - optional signature to verify encrypted data - isbase64 - if keys are base64 encoded - Returns: - Decrypted key - Raises: - RuntimeError if RSA signature validation fails - """ - if isbase64: - enckey = base64.b64decode(enckey) - deckey = rsaprivatekey.decrypt( - enckey, cryptography.hazmat.primitives.asymmetric.padding.OAEP( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - algorithm=cryptography.hazmat.primitives.hashes.SHA1()), - algorithm=cryptography.hazmat.primitives.hashes.SHA1(), - label=None)) - if signature is not None and len(signature) > 0: - rsapublickey = rsaprivatekey.public_key() - if isbase64: - signature = base64.b64decode(signature) - verifier = rsapublickey.verifier( - signature, cryptography.hazmat.primitives.asymmetric.padding.PSS( - mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( - cryptography.hazmat.primitives.hashes.SHA256()), - salt_length=cryptography.hazmat.primitives.asymmetric. 
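# Condensed sketch of the key wrapping in rsa_encrypt_key above: the plaintext
# key is encrypted with RSA OAEP-SHA1 and, when a private key is available,
# signed with RSA-PSS/SHA256. The removed code uses the pre-2.0 cryptography
# signer()/verifier() interface; sign() below is the current equivalent call,
# and wrap_key is a stand-in name:
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding

def wrap_key(rsaprivatekey, plainkey):
    oaep = padding.OAEP(mgf=padding.MGF1(algorithm=hashes.SHA1()),
                        algorithm=hashes.SHA1(), label=None)
    pss = padding.PSS(mgf=padding.MGF1(hashes.SHA256()),
                      salt_length=padding.PSS.MAX_LENGTH)
    enckey = rsaprivatekey.public_key().encrypt(plainkey, oaep)
    signature = rsaprivatekey.sign(plainkey, pss, hashes.SHA256())
    return enckey, signature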
- padding.PSS.MAX_LENGTH), - cryptography.hazmat.primitives.hashes.SHA256()) - verifier.update(deckey) - verifier.verify() - return deckey - - -def encrypt_chunk(symkey, signkey, data, encmode, iv=None, pad=False): - """Encrypt a chunk of data - Parameters: - symkey - symmetric key - signkey - signing key - data - data to encrypt - encmode - encryption mode - iv - initialization vector - pad - pad data - Returns: - iv and hmac not specified: iv || encrypted data || signature - else: encrypted data - Raises: - No special exception handling - """ - # create iv - if encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - iv = os.urandom(_AES256_BLOCKSIZE_BYTES) - # force padding on since this will be an individual encrypted chunk - pad = True - # encrypt data - cipher = cryptography.hazmat.primitives.ciphers.Cipher( - cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), - cryptography.hazmat.primitives.ciphers.modes.CBC(iv), - backend=cryptography.hazmat.backends.default_backend()).encryptor() - if pad: - encdata = cipher.update(pad_pkcs7(data)) + cipher.finalize() - else: - encdata = cipher.update(data) + cipher.finalize() - # sign encrypted data - if encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - hmacsha256 = hmac.new(signkey, digestmod=hashlib.sha256) - hmacsha256.update(iv + encdata) - return iv + encdata + hmacsha256.digest() - else: - return encdata - - -def decrypt_chunk( - symkey, signkey, encchunk, encmode, iv=None, unpad=False): - """Decrypt a chunk of data - Parameters: - symkey - symmetric key - signkey - signing key - encchunk - data to decrypt - encmode - encryption mode - blockid - block id - iv - initialization vector - unpad - unpad data - Returns: - decrypted data - Raises: - RuntimeError if signature verification fails - """ - # if chunked blob, then preprocess for iv and signature - if encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - # retrieve iv - iv = encchunk[:_AES256_BLOCKSIZE_BYTES] - # retrieve encrypted data - encdata = encchunk[ - _AES256_BLOCKSIZE_BYTES:-_HMACSHA256_DIGESTSIZE_BYTES] - # retrieve signature - sig = encchunk[-_HMACSHA256_DIGESTSIZE_BYTES:] - # validate integrity of data - hmacsha256 = hmac.new(signkey, digestmod=hashlib.sha256) - # compute hmac over iv + encdata - hmacsha256.update(encchunk[:-_HMACSHA256_DIGESTSIZE_BYTES]) - if not cryptography.hazmat.primitives.constant_time.bytes_eq( - hmacsha256.digest(), sig): - raise RuntimeError( - 'Encrypted data integrity check failed for chunk') - else: - encdata = encchunk - # decrypt data - cipher = cryptography.hazmat.primitives.ciphers.Cipher( - cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), - cryptography.hazmat.primitives.ciphers.modes.CBC(iv), - backend=cryptography.hazmat.backends.default_backend()).decryptor() - decrypted = cipher.update(encdata) + cipher.finalize() - if unpad: - return unpad_pkcs7(decrypted) - else: - return decrypted - - -def azure_request(req, timeout=None, *args, **kwargs): - """Wrapper method to issue/retry requests to Azure, works with both - the Azure Python SDK and Requests - Parameters: - req - request to issue - timeout - timeout in seconds - args - positional args to req - kwargs - keyworded args to req - Returns: - result of request - Raises: - Any uncaught exceptions - IOError if timeout - """ - start = time.clock() - lastwait = None - while True: - try: - return req(*args, **kwargs) - except requests.Timeout: - pass - except (requests.ConnectionError, - requests.exceptions.ChunkedEncodingError) as exc: - if (isinstance(exc.args[0], requests.packages.urllib3. 
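# Layout sketch for the chunked-blob mode handled by decrypt_chunk above:
# each stored chunk is iv (16 bytes) || AES-CBC ciphertext || HMAC-SHA256 tag
# (32 bytes), with the tag computed over iv||ciphertext. hmac.compare_digest
# stands in for cryptography's constant_time.bytes_eq used by the removed
# code, and split_and_verify is a stand-in name:
import hashlib
import hmac

def split_and_verify(signkey, encchunk):
    iv, encdata, tag = encchunk[:16], encchunk[16:-32], encchunk[-32:]
    mac = hmac.new(signkey, encchunk[:-32], hashlib.sha256)
    if not hmac.compare_digest(mac.digest(), tag):
        raise RuntimeError('Encrypted data integrity check failed for chunk')
    return iv, encdata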
- exceptions.ProtocolError) and - isinstance(exc.args[0].args[1], socket.error)): - err = exc.args[0].args[1].errno - if (err != errno.ECONNRESET and - err != errno.ECONNREFUSED and - err != errno.ECONNABORTED and - err != errno.ENETRESET and - err != errno.ETIMEDOUT): - raise - except requests.HTTPError as exc: - if (exc.response.status_code < 500 or - exc.response.status_code == 501 or - exc.response.status_code == 505): - raise - except azure.common.AzureHttpError as exc: - if (exc.status_code < 500 or - exc.status_code == 501 or - exc.status_code == 505): - raise - if timeout is not None and time.clock() - start > timeout: - raise IOError( - 'waited {} sec for request {}, exceeded timeout of {}'.format( - time.clock() - start, req.__name__, timeout)) - if lastwait is None or lastwait > 8: - wait = 1 - else: - wait = lastwait << 1 - lastwait = wait - time.sleep(wait) - - -def create_dir_ifnotexists(dirname): - """Create a directory if it doesn't exist - Parameters: - dirname - name of directory to create - Returns: - Nothing - Raises: - Unhandled exceptions - """ - try: - os.makedirs(dirname) - print('created local directory: {}'.format(dirname)) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise # pragma: no cover - - -def get_mime_type(filename): - """Guess the type of a file based on its filename - Parameters: - filename - filename to guess the content-type - Returns: - A string of the form 'type/subtype', - usable for a MIME content-type header - Raises: - Nothing - """ - return (mimetypes.guess_type(filename)[0] or 'application/octet-stream') - - -def encode_blobname(args, blobname): - """Encode blob name: url encode. Due to current Azure Python Storage SDK - limitations, does not apply to non-SAS requests. - Parameters: - args - program arguments - Returns: - urlencoded blob name - Raises: - Nothing - """ - if args.saskey is None or args.fileshare: - return blobname - else: - return urlquote(blobname) - - -def base64encode(obj): - """Encode object to base64 - Parameters: - obj - object to encode - Returns: - base64 encoded string - Raises: - Nothing - """ - if _PY2: - return base64.b64encode(obj) - else: - return str(base64.b64encode(obj), 'ascii') - - -def compute_md5_for_file_asbase64(filename, pagealign=False, blocksize=65536): - """Compute MD5 hash for file and encode as Base64 - Parameters: - filename - filename to compute md5 - pagealign - align bytes for page boundary - blocksize - block size in bytes - Returns: - MD5 for file encoded as Base64 - Raises: - Nothing - """ - hasher = hashlib.md5() - with open(filename, 'rb') as filedesc: - while True: - buf = filedesc.read(blocksize) - if not buf: - break - buflen = len(buf) - if pagealign and buflen < blocksize: - aligned = page_align_content_length(buflen) - if aligned != buflen: - buf = buf.ljust(aligned, b'\0') - hasher.update(buf) - return base64encode(hasher.digest()) - - -def compute_md5_for_data_asbase64(data): - """Compute MD5 hash for bits and encode as Base64 - Parameters: - data - data to compute MD5 hash over - Returns: - MD5 for data encoded as Base64 - Raises: - Nothing - """ - hasher = hashlib.md5() - hasher.update(data) - return base64encode(hasher.digest()) - - -def page_align_content_length(length): - """Compute page boundary alignment - Parameters: - length - content length - Returns: - aligned byte boundary - Raises: - Nothing - """ - mod = length % _PAGEBLOB_BOUNDARY - if mod != 0: - return length + (_PAGEBLOB_BOUNDARY - mod) - return length - - -def as_page_blob(args, name): - """Determines 
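# Sketch of the retry cadence in azure_request above: transient failures are
# retried with a wait that doubles from 1 s and resets to 1 s once it passes
# 8 s, until an optional overall timeout is exceeded. time.monotonic replaces
# the removed code's time.clock (gone in Python 3.8+), and the broad except
# stands in for the specific requests/azure error filtering shown above;
# retry_with_backoff is a stand-in name:
import time

def retry_with_backoff(req, timeout=None, *args, **kwargs):
    start, lastwait = time.monotonic(), None
    while True:
        try:
            return req(*args, **kwargs)
        except Exception:
            pass
        if timeout is not None and time.monotonic() - start > timeout:
            raise IOError('waited {} sec for request {}, exceeded timeout '
                          'of {}'.format(time.monotonic() - start,
                                         req.__name__, timeout))
        wait = 1 if (lastwait is None or lastwait > 8) else lastwait << 1
        lastwait = wait
        time.sleep(wait)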
if the file should be a pageblob depending upon args - Parameters: - args - program args - name - file name - Returns: - True if file should be a pageblob - Raises: - Nothing - """ - if not args.fileshare and ( - args.pageblob or (args.autovhd and name.lower().endswith('.vhd'))): - return True - return False - - -def get_blob_listing(blob_service, args, metadata=True): - """Convenience method for generating a blob listing of a container - Parameters: - blob_service - blob service - args - program arguments - metadata - include metadata - Returns: - dictionary of blob -> list [content length, content md5, enc metadata] - Raises: - Nothing - """ - marker = None - blobdict = {} - if metadata: - incl = azure.storage.blob.Include.METADATA - else: - incl = None - while True: - try: - result = azure_request( - blob_service.list_blobs, timeout=args.timeout, - container_name=args.container, marker=marker, include=incl) - except azure.common.AzureMissingResourceHttpError: - break - for blob in result: - blobdict[blob.name] = [ - blob.properties.content_length, - blob.properties.content_settings.content_md5, None] - if (blob.metadata is not None and - _ENCRYPTION_METADATA_NAME in blob.metadata): - encmeta = EncryptionMetadataJson( - args, None, None, None, None, None) - encmeta.parse_metadata_json( - blob.name, args.rsaprivatekey, args.rsapublickey, - blob.metadata) - blobdict[blob.name][1] = encmeta.md5 - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - blobdict[blob.name][2] = encmeta - marker = result.next_marker - if marker is None or len(marker) < 1: - break - return blobdict - - -def get_fileshare_listing(file_service, args, metadata=True): - """Retrieve all files and directories under a file share - Parameters: - file_service - file service - args - program args - metadata - retrieve metadata - Returns: - dictionary of files -> list [content length, content md5, enc metadata] - Raises: - Nothing - """ - blobdict = {} - dirs = [None] - while len(dirs) > 0: - dir = dirs.pop() - fsfiles = file_service.list_directories_and_files( - share_name=args.container, directory_name=dir, - timeout=args.timeout) - if dir is None: - dir = '' - for fsfile in fsfiles: - fspath = os.path.join(dir, fsfile.name) - if isinstance(fsfile, azure.storage.file.File): - fsprop = get_fileshare_file_properties( - file_service, args, fspath) - blobdict[fspath] = fsprop[1] - else: - dirs.append(fspath) - return blobdict - - -def split_fileshare_path_into_parts(remotefname): - """Split fileshare name into parts - Parameters: - remotefname - remote file name - Returns: - tuple of (directory name, file name) - Raises: - Nothing - """ - parts = remotefname.split(os.path.sep) - dirname = os.path.sep.join(parts[:len(parts) - 1]) - return (dirname, parts[-1]) - - -def get_fileshare_file_properties(file_service, args, remotefname): - """Convenience method for retrieving a file share file's properties and - metadata - Parameters: - file_service - file service - args - program arguments - remotefname - remote file name - Returns: - blobdict entry tuple (file name, blobdict value) - Raises: - Nothing - """ - # split directory and file name - dirname, fname = split_fileshare_path_into_parts(remotefname) - try: - fsfile = file_service.get_file_properties( - args.container, dirname, fname, timeout=args.timeout) - except azure.common.AzureMissingResourceHttpError: - return None - fsmeta = file_service.get_file_metadata( - args.container, dirname, fname, timeout=args.timeout) - entry = [ - 
fsfile.properties.content_length, - fsfile.properties.content_settings.content_md5, None] - if fsmeta is not None and _ENCRYPTION_METADATA_NAME in fsmeta: - encmeta = EncryptionMetadataJson( - args, None, None, None, None, None) - encmeta.parse_metadata_json( - fsfile.name, args.rsaprivatekey, args.rsapublickey, - fsmeta) - entry[1] = encmeta.md5 - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - entry[2] = encmeta - return (fsfile.name, entry) - - -def create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated): - """Create all parent directories of a given file share path - Parameters - file_service - file service - args - program args - fsfile - file share path - dirscreated - directories created set - Returns: - Nothing - Raises: - Nothing - """ - dirs = fsfile[0].split(os.path.sep) - for i in xrange(0, len(dirs)): - dir = os.path.join(*(dirs[0:i + 1])) - if dir not in dirscreated: - file_service.create_directory( - share_name=args.container, - directory_name=dir, fail_on_exist=False, - timeout=args.timeout) - dirscreated.add(dir) - - -def generate_xferspec_download( - blob_service, file_service, args, storage_in_queue, localfile, - remoteresource, addfd, blobprop): - """Generate an xferspec for download - Parameters: - blob_service - blob service - file_service - file service - args - program arguments - storage_in_queue - storage input queue - localfile - name of local resource - remoteresource - name of remote resource - addfd - create and add file handle - blobprop - blob properties list [length, md5, metadatadict] - Returns: - xferspec containing instructions - Raises: - ValueError if get_blob_properties returns an invalid result or - contentlength is invalid - """ - contentlength = blobprop[0] - contentmd5 = blobprop[1] - encmeta = blobprop[2] - remoteresource = encode_blobname(args, remoteresource) - # get the blob metadata if missing - if not args.fileshare and ( - contentlength is None or contentmd5 is None or - (args.rsaprivatekey is not None and encmeta is None)): - result = azure_request( - blob_service.get_blob_properties, timeout=args.timeout, - container_name=args.container, blob_name=remoteresource) - if not result: - raise ValueError( - 'unexpected result for get_blob_properties is None') - contentmd5 = result.properties.content_settings.content_md5 - contentlength = result.properties.content_length - if (args.rsaprivatekey is not None and - _ENCRYPTION_METADATA_NAME in result.metadata): - encmeta = EncryptionMetadataJson( - args, None, None, None, None, None) - encmeta.parse_metadata_json( - remoteresource, args.rsaprivatekey, args.rsapublickey, - result.metadata) - if contentlength < 0: - raise ValueError( - 'contentlength is invalid for {}'.format(remoteresource)) - # overwrite content md5 if encryption metadata exists - if encmeta is not None: - contentmd5 = encmeta.md5 - # check if download is needed - if (args.skiponmatch and contentmd5 is not None and - os.path.exists(localfile)): - print('computing file md5 on: {} length: {}'.format( - localfile, contentlength)) - lmd5 = compute_md5_for_file_asbase64(localfile) - print(' >> {} {} {} '.format( - lmd5, contentmd5, remoteresource), end='') - if lmd5 != contentmd5: - print('MISMATCH: re-download') - else: - print('match: skip') - return None, None, None, None - else: - print('remote blob: {} length: {} bytes, md5: {}'.format( - remoteresource, contentlength, contentmd5)) - tmpfilename = localfile + '.blobtmp' - if encmeta is not None: - chunksize = 
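# Sketch of the parent-directory bootstrapping in
# create_all_parent_directories_fileshare above: every prefix of the remote
# directory path is created exactly once, tracked in a shared set so files in
# the same directory do not re-issue the call; create_directory is whatever
# share client callback is in use, and ensure_parents is a stand-in name:
import os

def ensure_parents(dirpath, dirscreated, create_directory):
    parts = dirpath.split(os.path.sep)
    for i in range(len(parts)):
        prefix = os.path.join(*parts[:i + 1])
        if prefix and prefix not in dirscreated:
            create_directory(prefix)
            dirscreated.add(prefix)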
encmeta.chunksizebytes - symkey = encmeta.symkey - signkey = encmeta.signkey - if encmeta.encmode == _ENCRYPTION_MODE_FULLBLOB: - ivmap = { - 0: encmeta.iv, - 'hmac': { - 'hmac': None, - 'buffered': {}, - 'curr': 0, - 'sig': encmeta.hmac, - } - } - if signkey is not None: - ivmap['hmac']['hmac'] = hmac.new( - signkey, digestmod=hashlib.sha256) - offset_mod = 0 - elif encmeta.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - ivmap = None - offset_mod = _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - else: - raise RuntimeError('Unknown encryption mode: {}'.format( - encmeta.encmode)) - else: - chunksize = args.chunksizebytes - offset_mod = 0 - symkey = None - signkey = None - ivmap = None - nchunks = contentlength // chunksize - # compute allocation size, if encrypted this will be an - # underallocation estimate - if contentlength > 0: - if encmeta is not None: - if encmeta.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - allocatesize = contentlength - ((nchunks + 2) * offset_mod) - else: - allocatesize = contentlength - _AES256_BLOCKSIZE_BYTES - else: - allocatesize = contentlength - if allocatesize < 0: - allocatesize = 0 - else: - allocatesize = 0 - currfileoffset = 0 - nstorageops = 0 - flock = threading.Lock() - filedesc = None - # preallocate file - with flock: - filedesc = open(tmpfilename, 'wb') - if allocatesize > 0: - filedesc.seek(allocatesize - 1) - filedesc.write(b'\0') - filedesc.close() - if addfd: - # reopen under r+b mode - filedesc = open(tmpfilename, 'r+b') - else: - filedesc = None - chunktoadd = min(chunksize, contentlength) - for i in xrange(nchunks + 1): - if chunktoadd + currfileoffset > contentlength: - chunktoadd = contentlength - currfileoffset - # on download, chunktoadd must be offset by 1 as the x-ms-range - # header expects it that way. x -> y bytes means first bits of the - # (x+1)th byte to the last bits of the (y+1)th byte. 
for example, - # 0 -> 511 means byte 1 to byte 512 - encparam = [ - symkey, signkey, i * offset_mod, - encmeta.encmode if encmeta is not None else None, ivmap, False] - xferspec = (tmpfilename, args.container, remoteresource, i, - currfileoffset, chunktoadd - 1, encparam, flock, filedesc) - currfileoffset = currfileoffset + chunktoadd - nstorageops = nstorageops + 1 - storage_in_queue.put(PqTupleSort((i, xferspec))) - if currfileoffset >= contentlength: - encparam[5] = True - break - return contentlength, nstorageops, contentmd5, filedesc - - -def generate_xferspec_upload( - args, storage_in_queue, blobskipdict, blockids, localfile, - remoteresource, addfd): - """Generate an xferspec for upload - Parameters: - args - program arguments - storage_in_queue - storage input queue - blobskipdict - blob skip dictionary - blockids - block id dictionary - localfile - name of local resource - remoteresource - name of remote resource - addfd - create and add file handle - Returns: - xferspec containing instructions - Raises: - Nothing - """ - # compute md5 hash - md5digest = None - if args.computefilemd5: - print('computing file md5 on: {}'.format(localfile)) - md5digest = compute_md5_for_file_asbase64( - localfile, as_page_blob(args, localfile)) - # check if upload is needed - if args.skiponmatch and remoteresource in blobskipdict: - print(' >> {} {} {} '.format( - md5digest, blobskipdict[remoteresource][1], - remoteresource), end='') - if md5digest != blobskipdict[remoteresource][1]: - print('MISMATCH: re-upload') - else: - print('match: skip') - return None, 0, None, None - else: - print(' >> md5: {}'.format(md5digest)) - # create blockids entry - if localfile not in blockids: - blockids[localfile] = [] - # partition local file into chunks - filesize = os.path.getsize(localfile) - if as_page_blob(args, localfile) and ( - args.rsaprivatekey is not None or - args.rsapublickey is not None): - chunksizebytes = _MAX_BLOB_CHUNK_SIZE_BYTES - nchunks = filesize // chunksizebytes - if nchunks > 250000: - raise RuntimeError( - '{} chunks for file {} exceeds Azure Storage limits for a ' - 'single page blob'.format(nchunks, localfile)) - else: - chunksizebytes = args.chunksizebytes - nchunks = filesize // chunksizebytes - if nchunks > 50000: - raise RuntimeError( - '{} chunks for file {} exceeds Azure Storage limits for a ' - 'single block blob'.format(nchunks, localfile)) - chunktoadd = min(chunksizebytes, filesize) - currfileoffset = 0 - nstorageops = 0 - flock = threading.Lock() - filedesc = None - if addfd: - with flock: - filedesc = open(localfile, 'rb') - symkey = None - signkey = None - ivmap = None - for i in xrange(nchunks + 1): - if chunktoadd + currfileoffset > filesize: - chunktoadd = filesize - currfileoffset - blockid = '{0:08d}'.format(currfileoffset // chunksizebytes) - # generate the ivmap for the first block - if (not as_page_blob(args, localfile) and - (args.rsaprivatekey is not None or - args.rsapublickey is not None) and currfileoffset == 0): - # generate sym/signing keys - symkey, signkey = generate_aes256_keys() - if args.encmode == _ENCRYPTION_MODE_FULLBLOB: - ivmap = { - i: os.urandom(_AES256_BLOCKSIZE_BYTES), - 'hmac': hmac.new(signkey, digestmod=hashlib.sha256), - } - else: - ivmap = {} - ivmap['md5'] = hashlib.md5() - ivmap['filesize'] = 0 - blockids[localfile].append(blockid) - encparam = [symkey, signkey, ivmap, False] - xferspec = (localfile, args.container, - encode_blobname(args, remoteresource), blockid, - currfileoffset, chunktoadd, encparam, flock, filedesc) - 
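# Sketch of the upload partitioning in generate_xferspec_upload above: a file
# is cut into fixed-size chunks, each block id is the zero-padded chunk index
# derived from its offset, and the 50,000-block limit for a single block blob
# is enforced up front; partition_file is a stand-in name:
def partition_file(filesize, chunksizebytes):
    nchunks = filesize // chunksizebytes
    if nchunks > 50000:
        raise RuntimeError(
            '{} chunks exceeds Azure Storage limits for a single '
            'block blob'.format(nchunks))
    currfileoffset = 0
    chunktoadd = min(chunksizebytes, filesize)
    for _ in range(nchunks + 1):
        if chunktoadd + currfileoffset > filesize:
            chunktoadd = filesize - currfileoffset
        blockid = '{0:08d}'.format(currfileoffset // chunksizebytes)
        yield blockid, currfileoffset, chunktoadd
        currfileoffset += chunktoadd
        if currfileoffset >= filesize:
            break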
currfileoffset = currfileoffset + chunktoadd - nstorageops = nstorageops + 1 - storage_in_queue.put(PqTupleSort((i, xferspec))) - if currfileoffset >= filesize: - encparam[3] = True - break - return filesize, nstorageops, md5digest, filedesc - - -def apply_file_collation_and_strip(args, fname): - """Apply collation path or component strip to a remote filename - Parameters: - args - arguments - fname - file name - Returns: - remote filename - Raises: - No special exception handling - """ - remotefname = fname.strip(os.path.sep) - if args.collate is not None: - remotefname = remotefname.split(os.path.sep)[-1] - if args.collate != '.': - remotefname = os.path.sep.join((args.collate, remotefname)) - elif args.stripcomponents > 0: - rtmp = remotefname.split(os.path.sep) - nsc = min((len(rtmp) - 1, args.stripcomponents)) - if nsc > 0: - remotefname = os.path.sep.join(rtmp[nsc:]) - return remotefname - - -def main(): - """Main function - Parameters: - None - Returns: - Nothing - Raises: - ValueError for invalid arguments - """ - # get command-line args - args = parseargs() - - # populate args from env vars - if args.storageaccountkey is None: - args.storageaccountkey = os.getenv(_ENVVAR_STORAGEACCOUNTKEY) - if args.saskey is None: - args.saskey = os.getenv(_ENVVAR_SASKEY) - if args.rsakeypassphrase is None: - args.rsakeypassphrase = os.getenv(_ENVVAR_RSAKEYPASSPHRASE) - - # check some parameters - if (len(args.localresource) < 1 or len(args.storageaccount) < 1 or - len(args.container) < 1): - raise ValueError('invalid positional arguments') - if len(args.endpoint) < 1: - raise ValueError('storage endpoint is invalid') - if args.upload and args.download: - raise ValueError( - 'cannot specify both download and upload transfer direction ' - 'within the same invocation') - if args.subscriptionid is not None and args.managementcert is None: - raise ValueError( - 'cannot specify subscription id without a management cert') - if args.subscriptionid is None and args.managementcert is not None: - raise ValueError( - 'cannot specify a management cert without a subscription id') - if args.storageaccountkey is not None and args.saskey is not None: - raise ValueError('cannot use both a sas key and storage account key') - if args.pageblob and args.fileshare: - raise ValueError( - 'cannot specify both page blob and file share destinations') - if args.autovhd and args.fileshare: - raise ValueError( - 'cannot specify both autovhd and file share destination') - if args.pageblob and args.autovhd: - raise ValueError('cannot specify both pageblob and autovhd parameters') - if args.collate is not None and args.stripcomponents is not None: - raise ValueError( - 'cannot specify collate and non-default component ' - 'strip: {}'.format(args.stripcomponents)) - if args.stripcomponents is None: - args.stripcomponents = 1 - if args.stripcomponents < 0: - raise ValueError('invalid component strip number: {}'.format( - args.stripcomponents)) - if args.rsaprivatekey is not None and args.rsapublickey is not None: - raise ValueError('cannot specify both RSA private and public keys') - if args.rsapublickey is not None and args.rsakeypassphrase is not None: - raise ValueError('cannot specify an RSA public key and passphrase') - if args.timeout is not None and args.timeout <= 0: - args.timeout = None - - # get key if we don't have a handle on one - sms = None - if args.saskey is not None: - if len(args.saskey) < 1: - raise ValueError('invalid sas key specified') - elif args.storageaccountkey is None: - if (args.managementcert is not 
None and - args.subscriptionid is not None): - # check to ensure management cert is valid - if len(args.managementcert) == 0 or \ - args.managementcert.split('.')[-1].lower() != 'pem': - raise ValueError('management cert appears to be invalid') - if args.managementep is None or len(args.managementep) == 0: - raise ValueError('management endpoint is invalid') - # expand management cert path out if contains ~ - args.managementcert = os.path.abspath(args.managementcert) - # get sms reference - sms = azure.servicemanagement.ServiceManagementService( - args.subscriptionid, args.managementcert, args.managementep) - # get keys - service_keys = azure_request( - sms.get_storage_account_keys, timeout=args.timeout, - service_name=args.storageaccount) - args.storageaccountkey = service_keys.storage_service_keys.primary - else: - raise ValueError('could not determine authentication to use') - - # check storage account key validity - if args.storageaccountkey is not None and \ - len(args.storageaccountkey) < 1: - raise ValueError('storage account key is invalid') - - # set valid num workers - if args.numworkers < 1: - args.numworkers = 1 - if (args.fileshare and - args.numworkers == _DEFAULT_MAX_STORAGEACCOUNT_WORKERS): - args.numworkers //= 2 - - # expand any paths - args.localresource = os.path.expanduser(args.localresource) - - # sanitize remote file name - if args.remoteresource: - args.remoteresource = args.remoteresource.strip(os.path.sep) - - # set chunk size - if (args.chunksizebytes is None or args.chunksizebytes < 64 or - args.chunksizebytes > _MAX_BLOB_CHUNK_SIZE_BYTES): - args.chunksizebytes = _MAX_BLOB_CHUNK_SIZE_BYTES - - # set storage ep - endpoint = None - if sms: - storage_acct = azure_request( - sms.get_storage_account_properties, timeout=args.timeout, - service_name=args.storageaccount) - if args.fileshare: - endpoint = storage_acct.storage_service_properties.endpoints[3] - else: - endpoint = storage_acct.storage_service_properties.endpoints[0] - else: - if args.fileshare: - endpoint = 'https://{}.file.{}/'.format( - args.storageaccount, args.endpoint) - else: - endpoint = 'https://{}.blob.{}/'.format( - args.storageaccount, args.endpoint) - - # create master block blob, page blob and file service - blob_service = None - if args.storageaccountkey: - if args.endpoint[0] == '.': - args.endpoint = args.endpoint[1:] - block_blob_service = azure.storage.blob.BlockBlobService( - account_name=args.storageaccount, - account_key=args.storageaccountkey, - endpoint_suffix=args.endpoint) - page_blob_service = azure.storage.blob.PageBlobService( - account_name=args.storageaccount, - account_key=args.storageaccountkey, - endpoint_suffix=args.endpoint) - file_service = azure.storage.file.FileService( - account_name=args.storageaccount, - account_key=args.storageaccountkey, - endpoint_suffix=args.endpoint) - blob_service = (block_blob_service, page_blob_service) - elif args.saskey: - _bs = SasBlobService(endpoint, args.saskey, args.timeout) - blob_service = (_bs, _bs) - # normalize sas key for python sdk - if args.saskey[0] == '?': - args.saskey = args.saskey[1:] - file_service = azure.storage.file.FileService( - account_name=args.storageaccount, - sas_token=args.saskey, - endpoint_suffix=args.endpoint) - # disable container/share creation if SAS is not account-level and - # does not contain a signed resource type with container-level access - if args.createcontainer: - args.createcontainer = False - sasparts = args.saskey.split('&') - for part in sasparts: - tmp = part.split('=') - if tmp[0] == 
'srt': - if 'c' in tmp[1]: - args.createcontainer = True - break - del sasparts - if blob_service is None: - raise ValueError('blob_service is invalid') - if args.fileshare and file_service is None: - raise ValueError('file_service is invalid') - - # check which way we're transfering - xfertoazure = False - if (args.upload or - (not args.download and os.path.exists(args.localresource))): - xfertoazure = True - else: - if args.remoteresource is None: - raise ValueError('cannot download remote file if not specified') - - # import rsa key - if args.rsaprivatekey is not None: - rsakeyfile = args.rsaprivatekey - elif args.rsapublickey is not None: - rsakeyfile = args.rsapublickey - else: - rsakeyfile = None - if rsakeyfile is not None: - # check for conflicting options - if args.pageblob: - raise ValueError( - 'cannot operate in page blob mode with encryption enabled') - # check for supported encryption modes - if (args.encmode != _ENCRYPTION_MODE_FULLBLOB and - args.encmode != _ENCRYPTION_MODE_CHUNKEDBLOB): - raise RuntimeError( - 'Unknown encryption mode: {}'.format(args.encmode)) - # only allow full blob encryption mode for now due to - # possible compatibility issues - if args.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - raise RuntimeError( - '{} encryption mode not allowed'.format(args.encmode)) - with open(rsakeyfile, 'rb') as keyfile: - if args.rsaprivatekey is not None: - args.rsaprivatekey = cryptography.hazmat.primitives.\ - serialization.load_pem_private_key( - keyfile.read(), args.rsakeypassphrase, - backend=cryptography.hazmat.backends.default_backend()) - else: - args.rsapublickey = cryptography.hazmat.primitives.\ - serialization.load_pem_public_key( - keyfile.read(), - backend=cryptography.hazmat.backends.default_backend()) - if args.rsaprivatekey is None and not xfertoazure: - raise ValueError('imported RSA key does not have a private key') - # adjust chunk size for padding for chunked mode - if xfertoazure: - if args.encmode == _ENCRYPTION_MODE_CHUNKEDBLOB: - args.chunksizebytes -= _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - elif args.encmode == _ENCRYPTION_MODE_FULLBLOB: - nchunks = args.chunksizebytes // \ - _AES256CBC_HMACSHA256_OVERHEAD_BYTES - args.chunksizebytes = (nchunks - 1) * \ - _AES256CBC_HMACSHA256_OVERHEAD_BYTES - del nchunks - # ensure chunk size is greater than overhead - if args.chunksizebytes <= ( - _AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1) << 1: - raise ValueError('chunksizebytes {} <= encryption min {}'.format( - args.chunksizebytes, - (_AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1) << 1)) - - # disable urllib3 warnings if specified - if args.disableurllibwarnings: - print('!!! 
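# Sketch of the account-SAS gate applied above: container/share creation is
# only left enabled when the SAS token carries a signed resource type (srt)
# that includes container-level access ('c'); sas_allows_container_create is
# a stand-in name:
def sas_allows_container_create(saskey):
    for part in saskey.lstrip('?').split('&'):
        name, _, value = part.partition('=')
        if name == 'srt':
            return 'c' in value
    return False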
WARNING: DISABLING URLLIB3 WARNINGS !!!') - requests.packages.urllib3.disable_warnings( - requests.packages.urllib3.exceptions.InsecurePlatformWarning) - requests.packages.urllib3.disable_warnings( - requests.packages.urllib3.exceptions.SNIMissingWarning) - - # collect package versions - packages = ['az.common=' + azure.common.__version__] - try: - packages.append('az.sml=' + azure.servicemanagement.__version__) - except Exception: - pass - try: - packages.append('az.stor=' + azure.storage.__version__) - except Exception: - pass - try: - packages.append('crypt=' + cryptography.__version__) - except Exception: - pass - packages.append( - 'req=' + requests.__version__) - - # print all parameters - print('=====================================') - print(' azure blobxfer parameters [v{}]'.format(_SCRIPT_VERSION)) - print('=====================================') - print(' platform: {}'.format(platform.platform())) - print(' python interpreter: {} {}'.format( - platform.python_implementation(), platform.python_version())) - print(' package versions: {}'.format(' '.join(packages))) - del packages - print(' subscription id: {}'.format(args.subscriptionid)) - print(' management cert: {}'.format(args.managementcert)) - print(' transfer direction: {}'.format( - 'local->Azure' if xfertoazure else 'Azure->local')) - print(' local resource: {}'.format(args.localresource)) - print(' include pattern: {}'.format(args.include)) - print(' remote resource: {}'.format(args.remoteresource)) - print(' max num of workers: {}'.format(args.numworkers)) - print(' timeout: {}'.format(args.timeout)) - print(' storage account: {}'.format(args.storageaccount)) - print(' use SAS: {}'.format(True if args.saskey else False)) - print(' upload as page blob: {}'.format(args.pageblob)) - print(' auto vhd->page blob: {}'.format(args.autovhd)) - print(' upload to file share: {}'.format(args.fileshare)) - print(' container/share name: {}'.format(args.container)) - print(' container/share URI: {}'.format(endpoint + args.container)) - print(' compute block MD5: {}'.format(args.computeblockmd5)) - print(' compute file MD5: {}'.format(args.computefilemd5)) - print(' skip on MD5 match: {}'.format(args.skiponmatch)) - print(' chunk size (bytes): {}'.format(args.chunksizebytes)) - print(' create container: {}'.format(args.createcontainer)) - print(' keep mismatched MD5: {}'.format(args.keepmismatchedmd5files)) - print(' recursive if dir: {}'.format(args.recursive)) - print('component strip on up: {}'.format(args.stripcomponents)) - print(' remote delete: {}'.format(args.delete)) - print(' collate to: {}'.format(args.collate or 'disabled')) - print(' local overwrite: {}'.format(args.overwrite)) - print(' encryption mode: {}'.format( - (args.encmode or 'disabled' if xfertoazure else 'file dependent') - if args.rsaprivatekey is not None or args.rsapublickey is not None - else 'disabled')) - print(' RSA key file: {}'.format(rsakeyfile or 'disabled')) - print(' RSA key type: {}'.format( - 'private' if args.rsaprivatekey is not None else 'public' - if args.rsapublickey is not None else 'disabled')) - print('=======================================\n') - - # mark start time after init - print('script start time: {}'.format(time.strftime("%Y-%m-%d %H:%M:%S"))) - start = time.time() - - # populate instruction queues - allfilesize = 0 - storage_in_queue = queue.PriorityQueue() - nstorageops = 0 - blockids = {} - completed_blockids = {} - filemap = {} - filesizes = {} - delblobs = None - md5map = {} - filedesc = None - if xfertoazure: - # if 
skiponmatch is enabled, list blobs first and check - if args.skiponmatch and not args.fileshare: - blobskipdict = get_blob_listing(blob_service[0], args) - else: - blobskipdict = {} - if os.path.isdir(args.localresource): - if args.remoteresource is not None: - print('WARNING: ignorning specified remoteresource {} for ' - 'directory upload'.format(args.remoteresource)) - _remotefiles = set() - # mirror directory - if args.recursive: - for root, _, files in os.walk(args.localresource): - for dirfile in files: - fname = os.path.join(root, dirfile) - if args.include is not None and not fnmatch.fnmatch( - fname, args.include): - continue - remotefname = apply_file_collation_and_strip( - args, fname) - _remotefiles.add(remotefname) - # manually pull file properties for file service - if args.fileshare and args.skiponmatch: - fsfile = get_fileshare_file_properties( - file_service, args, remotefname) - if fsfile is not None: - blobskipdict[fsfile[0]] = fsfile[1] - filesize, ops, md5digest, filedesc = \ - generate_xferspec_upload( - args, storage_in_queue, blobskipdict, - blockids, fname, remotefname, False) - if filesize is not None: - completed_blockids[fname] = 0 - md5map[fname] = md5digest - filemap[fname] = encode_blobname(args, remotefname) - filesizes[fname] = filesize - allfilesize = allfilesize + filesize - nstorageops = nstorageops + ops - else: - # copy just directory contents, non-recursively - for lfile in os.listdir(args.localresource): - fname = os.path.join(args.localresource, lfile) - if os.path.isdir(fname) or ( - args.include is not None and not fnmatch.fnmatch( - fname, args.include)): - continue - remotefname = apply_file_collation_and_strip(args, fname) - _remotefiles.add(remotefname) - # manually pull file properties for file service - if args.fileshare and args.skiponmatch: - fsfile = get_fileshare_file_properties( - file_service, args, remotefname) - if fsfile is not None: - blobskipdict[fsfile[0]] = fsfile[1] - filesize, ops, md5digest, filedesc = \ - generate_xferspec_upload( - args, storage_in_queue, blobskipdict, - blockids, fname, remotefname, False) - if filesize is not None: - completed_blockids[fname] = 0 - md5map[fname] = md5digest - filemap[fname] = encode_blobname(args, remotefname) - filesizes[fname] = filesize - allfilesize = allfilesize + filesize - nstorageops = nstorageops + ops - # fill deletion list - if args.delete: - # get blob skip dict if it hasn't been populated - if len(blobskipdict) == 0: - if args.fileshare: - blobskipdict = get_fileshare_listing( - file_service, args) - else: - blobskipdict = get_blob_listing( - blob_service[0], args, metadata=False) - delblobs = [x for x in blobskipdict if x not in _remotefiles] - del _remotefiles - else: - # upload single file - if args.remoteresource is None: - args.remoteresource = args.localresource - else: - if args.stripcomponents > 0: - args.stripcomponents -= 1 - args.remoteresource = apply_file_collation_and_strip( - args, args.remoteresource) - # manually pull file properties for file service - if args.fileshare and args.skiponmatch: - fsfile = get_fileshare_file_properties( - file_service, args, args.remoteresource) - if fsfile is not None: - blobskipdict[fsfile[0]] = fsfile[1] - filesize, nstorageops, md5digest, filedesc = \ - generate_xferspec_upload( - args, storage_in_queue, blobskipdict, blockids, - args.localresource, args.remoteresource, True) - if filesize is not None: - completed_blockids[args.localresource] = 0 - md5map[args.localresource] = md5digest - filemap[args.localresource] = 
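# Sketch of the recursive upload enumeration above: os.walk plus the optional
# fnmatch --include filter, with the remote name derived per file (the
# removed apply_file_collation_and_strip helper fills that role);
# enumerate_uploads is a stand-in name:
import fnmatch
import os

def enumerate_uploads(localresource, include, remote_name_for):
    for root, _, files in os.walk(localresource):
        for dirfile in files:
            fname = os.path.join(root, dirfile)
            if include is not None and not fnmatch.fnmatch(fname, include):
                continue
            yield fname, remote_name_for(fname)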
encode_blobname( - args, args.remoteresource) - filesizes[args.localresource] = filesize - allfilesize = allfilesize + filesize - del blobskipdict - # create container/file share if needed - if args.createcontainer: - if args.fileshare: - print('creating file share, if needed: {}'.format( - args.container)) - try: - azure_request( - file_service.create_share, share_name=args.container, - fail_on_exist=False, timeout=args.timeout) - except azure.common.AzureConflictHttpError: - pass - else: - print('creating container, if needed: {}'.format( - args.container)) - try: - azure_request( - blob_service[0].create_container, timeout=args.timeout, - container_name=args.container, fail_on_exist=False) - except azure.common.AzureConflictHttpError: - pass - # initialize page blobs or file share files - if len(filemap) > 0: - if args.pageblob or args.autovhd: - print('initializing page blobs') - for key in filemap: - if as_page_blob(args, key): - blob_service[1].create_blob( - container_name=args.container, - blob_name=filemap[key], - content_length=page_align_content_length( - filesizes[key]), content_settings=None) - elif args.fileshare: - print('initializing files on fileshare') - dirscreated = set() - for key in filemap: - fsfile = split_fileshare_path_into_parts(filemap[key]) - if args.rsaprivatekey or args.rsapublickey: - fspad = _AES256_BLOCKSIZE_BYTES - else: - fspad = 0 - # try to create the file first, if preconditon failure - # then try creating the parent directory - try: - file_service.create_file( - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=filesizes[key] + fspad, - content_settings=None, timeout=args.timeout) - except azure.common.AzureMissingResourceHttpError as exc: - create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - file_service.create_file( - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=filesizes[key] + fspad, - content_settings=None, timeout=args.timeout) - del dirscreated - else: - if args.remoteresource == '.': - print('attempting to copy entire {} {} to {}'.format( - 'file share' if args.fileshare else 'container', - args.container, args.localresource)) - if args.fileshare: - blobdict = get_fileshare_listing(file_service, args) - else: - blobdict = get_blob_listing(blob_service[0], args) - else: - if args.fileshare: - fsfile = get_fileshare_file_properties( - file_service, args, args.remoteresource) - if fsfile is None: - raise RuntimeError('file {} not found on share {}'.format( - args.remoteresource, args.container)) - blobdict = {args.remoteresource: fsfile[1]} - else: - blobdict = {args.remoteresource: [None, None, None]} - if len(blobdict) > 0: - print('generating local directory structure and ' - 'pre-allocating space') - # make the localresource directory - created_dirs = set() - create_dir_ifnotexists(args.localresource) - created_dirs.add(args.localresource) - # generate xferspec for all blobs - for blob in blobdict: - # filter results - if args.include is not None and not fnmatch.fnmatch( - blob, args.include): - continue - if args.collate is not None: - localfile = os.path.join( - args.localresource, args.collate, blob) - else: - localfile = os.path.join(args.localresource, blob) - # create any subdirectories if required - localdir = os.path.dirname(localfile) - if localdir not in created_dirs: - create_dir_ifnotexists(localdir) - created_dirs.add(localdir) - # add instructions - filesize, ops, md5digest, filedesc = \ - 
generate_xferspec_download( - blob_service[0], file_service, args, storage_in_queue, - localfile, blob, False, blobdict[blob]) - if filesize is not None: - md5map[localfile] = md5digest - filemap[localfile] = localfile + '.blobtmp' - allfilesize = allfilesize + filesize - nstorageops = nstorageops + ops - if len(blobdict) > 0: - del created_dirs - del blobdict - - # delete any remote blobs if specified - if xfertoazure and delblobs is not None: - if args.fileshare: - print('deleting {} remote files'.format(len(delblobs))) - for blob in delblobs: - fsfile = split_fileshare_path_into_parts(blob) - azure_request( - file_service.delete_file, - share_name=args.container, directory_name=fsfile[0], - file_name=fsfile[1], timeout=args.timeout) - else: - print('deleting {} remote blobs'.format(len(delblobs))) - for blob in delblobs: - azure_request( - blob_service[0].delete_blob, timeout=args.timeout, - container_name=args.container, blob_name=blob) - print('deletion complete.') - - if nstorageops == 0: - print('detected no transfer actions needed to be taken, exiting...') - sys.exit(0) - - if xfertoazure: - # count number of empty files - emptyfiles = 0 - for fsize in filesizes.items(): - if fsize[1] == 0: - emptyfiles += 1 - print('detected {} empty files to upload'.format(emptyfiles)) - if args.fileshare: - print('performing {} put ranges and {} set file properties'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'ranges' - elif args.pageblob: - print('performing {} put pages/blobs and {} set blob ' - 'properties'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'pages' - elif args.autovhd: - print('performing {} mixed page/block operations with {} ' - 'finalizing operations'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'chunks' - else: - print('performing {} put blocks/blobs and {} put block ' - 'lists'.format( - nstorageops, len(blockids) - emptyfiles)) - progress_text = 'blocks' - else: - print('performing {} range-gets'.format(nstorageops)) - progress_text = 'range-gets' - - # spawn workers - storage_out_queue = queue.Queue(nstorageops) - maxworkers = min((args.numworkers, nstorageops)) - print('spawning {} worker threads'.format(maxworkers)) - exc_list = [] - threads = [] - for _ in xrange(maxworkers): - thr = StorageChunkWorker( - exc_list, storage_in_queue, storage_out_queue, args, xfertoazure, - blob_service, file_service) - thr.start() - threads.append(thr) - - done_ops = 0 - hmacs = {} - storage_start = time.time() - progress_bar( - args.progressbar, 'xfer', progress_text, nstorageops, - done_ops, storage_start) - while True: - try: - localresource, encparam = storage_out_queue.get() - except KeyboardInterrupt: - print('\n\nKeyboardInterrupt detected, force terminating ' - 'threads (this may take a while)...') - for thr in threads: - thr.terminate = True - for thr in threads: - thr.join() - raise - if len(exc_list) > 0: - for exc in exc_list: - print(exc) - sys.exit(1) - if xfertoazure: - completed_blockids[localresource] = completed_blockids[ - localresource] + 1 - if completed_blockids[localresource] == len( - blockids[localresource]): - if as_page_blob(args, localresource): - if args.computefilemd5: - azure_request( - blob_service[1].set_blob_properties, - timeout=args.timeout, - container_name=args.container, - blob_name=filemap[localresource], - content_settings=azure.storage.blob. 
- ContentSettings(content_md5=md5map[localresource])) - elif args.fileshare: - fsfile = split_fileshare_path_into_parts( - filemap[localresource]) - # set file metadata for encrypted files - if filesizes[localresource] > 0 and ( - args.rsaprivatekey is not None or - args.rsapublickey is not None): - if args.encmode == _ENCRYPTION_MODE_FULLBLOB: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], - encparam[2][0], - encparam[2]['hmac'].digest(), - md5map[localresource] - ).construct_metadata_json() - else: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], None, - None, md5map[localresource] - ).construct_metadata_json() - azure_request( - file_service.set_file_metadata, - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - metadata=encmetadata, - timeout=args.timeout) - # resize file to final encrypted size if required - if (filesizes[localresource] + - _AES256_BLOCKSIZE_BYTES != - encparam[2]['filesize']): - azure_request( - file_service.resize_file, - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_length=encparam[2]['filesize'], - timeout=args.timeout) - if args.computefilemd5: - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - md5 = base64encode(encparam[2]['md5'].digest()) - else: - md5 = md5map[localresource] - azure_request( - file_service.set_file_properties, - share_name=args.container, - directory_name=fsfile[0], file_name=fsfile[1], - content_settings=azure.storage.file. - ContentSettings(content_md5=md5), - timeout=args.timeout) - else: - # only perform put block list on non-zero byte files - if filesizes[localresource] > 0: - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - md5 = base64encode(encparam[2]['md5'].digest()) - else: - md5 = md5map[localresource] - block_list = [] - for bid in blockids[localresource]: - block_list.append( - azure.storage.blob.BlobBlock(id=bid)) - azure_request( - blob_service[0].put_block_list, - timeout=args.timeout, - container_name=args.container, - blob_name=filemap[localresource], - block_list=block_list, - content_settings=azure.storage.blob. - ContentSettings( - content_type=get_mime_type(localresource), - content_md5=md5)) - # set blob metadata for encrypted blobs - if (args.rsaprivatekey is not None or - args.rsapublickey is not None): - if args.encmode == _ENCRYPTION_MODE_FULLBLOB: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], - encparam[2][0], - encparam[2]['hmac'].digest(), - md5map[localresource] - ).construct_metadata_json() - else: - encmetadata = EncryptionMetadataJson( - args, encparam[0], encparam[1], None, - None, md5map[localresource] - ).construct_metadata_json() - azure_request( - blob_service[0].set_blob_metadata, - timeout=args.timeout, - container_name=args.container, - blob_name=filemap[localresource], - metadata=encmetadata) - else: - if (args.rsaprivatekey is not None and - encparam[3] == _ENCRYPTION_MODE_FULLBLOB and - not as_page_blob(args, localresource) and - encparam[4]['hmac']['hmac'] is not None): - hmacs[localresource] = encparam[4]['hmac'] - done_ops += 1 - progress_bar( - args.progressbar, 'xfer', progress_text, nstorageops, - done_ops, storage_start) - if done_ops == nstorageops: - break - endtime = time.time() - if filedesc is not None: - filedesc.close() - progress_bar( - args.progressbar, 'xfer', progress_text, nstorageops, - done_ops, storage_start) - print('\n\n{} MiB transfered, elapsed {} sec. 
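
# Illustrative sketch, not part of the diff: the put_block_list finalization
# above commits the IDs of all previously uploaded blocks. The service requires
# every block ID in a blob to be base64-encoded and of identical length, so IDs
# are commonly derived from a zero-padded sequence number (make_block_id is a
# hypothetical helper, not blobxfer's own ID scheme):
import base64

def make_block_id(chunk_num, width=8):
    # zero-pad so every encoded ID has the same length
    raw = '{0:0{1}d}'.format(chunk_num, width).encode('ascii')
    return base64.b64encode(raw).decode('ascii')

block_ids = [make_block_id(i) for i in range(1000)]
assert len(set(len(b) for b in block_ids)) == 1
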
' - 'Throughput = {} Mbit/sec\n'.format( - allfilesize / 1048576.0, endtime - storage_start, - (8.0 * allfilesize / 1048576.0) / (endtime - storage_start))) - - # finalize files/blobs - if not xfertoazure: - print( - 'performing finalization (if applicable): {}: {}, MD5: {}'.format( - _ENCRYPTION_AUTH_ALGORITHM, - args.rsaprivatekey is not None, args.computefilemd5)) - for localfile in filemap: - tmpfilename = filemap[localfile] - finalizefile = True - skipmd5 = False - # check hmac - if (args.rsaprivatekey is not None and - args.encmode == _ENCRYPTION_MODE_FULLBLOB): - if tmpfilename in hmacs: - hmacdict = hmacs[tmpfilename] - # process any remaining hmac data - while len(hmacdict['buffered']) > 0: - curr = hmacdict['curr'] - if curr in hmacdict['buffered']: - hmacdict['hmac'].update(hmacdict['buffered'][curr]) - hmacdict['buffered'].pop(curr) - hmacdict['curr'] = curr + 1 - else: - break - digest = base64encode(hmacdict['hmac'].digest()) - res = 'OK' - if digest != hmacdict['sig']: - res = 'MISMATCH' - finalizefile = False - else: - skipmd5 = True - print('[{}: {}, {}] {} {}'.format( - _ENCRYPTION_AUTH_ALGORITHM, res, localfile, - digest, hmacdict['sig'])) - # compare md5 hash - if args.computefilemd5 and not skipmd5: - lmd5 = compute_md5_for_file_asbase64(tmpfilename) - if md5map[localfile] is None: - print('[MD5: SKIPPED, {}] {} {}'.format( - localfile, lmd5, md5map[localfile])) - else: - if lmd5 != md5map[localfile]: - res = 'MISMATCH' - if not args.keepmismatchedmd5files: - finalizefile = False - else: - res = 'OK' - print('[MD5: {}, {}] {} {}'.format( - res, localfile, lmd5, md5map[localfile])) - if finalizefile: - # check for existing file first - if os.path.exists(localfile): - if args.overwrite: - os.remove(localfile) - else: - raise IOError( - 'cannot overwrite existing file: {}'.format( - localfile)) - # move tmp file to real file - os.rename(tmpfilename, localfile) - else: - os.remove(tmpfilename) - print('finalization complete.') - - # output final log lines - print('\nscript elapsed time: {} sec'.format(time.time() - start)) - print('script end time: {}'.format(time.strftime("%Y-%m-%d %H:%M:%S"))) - - -def progress_bar(display, sprefix, rtext, value, qsize, start): - """Display a progress bar - Parameters: - display - display bar - sprefix - progress prefix - rtext - rate text - value - value input value - qsize - queue size - start - start time - Returns: - Nothing - Raises: - Nothing - """ - if not display: - return - done = float(qsize) / value - diff = time.time() - start - if diff <= 0: - # arbitrarily give a small delta - diff = 1e-6 - rate = float(qsize) / (diff / 60) - sys.stdout.write( - '\r{0} progress: [{1:30s}] {2:.2f}% {3:10.2f} {4}/min '.format( - sprefix, '>' * int(done * 30), done * 100, rate, rtext)) - sys.stdout.flush() - - -def parseargs(): # pragma: no cover - """Sets up command-line arguments and parser - Parameters: - Nothing - Returns: - Parsed command line arguments - Raises: - Nothing - """ - parser = argparse.ArgumentParser( - description='Transfer files/blobs to/from Azure blob or file storage') - parser.set_defaults( - autovhd=False, endpoint=_DEFAULT_STORAGE_ENDPOINT, - chunksizebytes=_MAX_BLOB_CHUNK_SIZE_BYTES, collate=None, - computeblockmd5=False, computefilemd5=True, createcontainer=True, - delete=False, disableurllibwarnings=False, - encmode=_DEFAULT_ENCRYPTION_MODE, fileshare=False, include=None, - managementep=_DEFAULT_MANAGEMENT_ENDPOINT, - numworkers=_DEFAULT_MAX_STORAGEACCOUNT_WORKERS, overwrite=True, - pageblob=False, progressbar=True, 
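
# Illustrative sketch, not part of the diff: the render math used by
# progress_bar above -- fraction complete, a 30-character bar, and a rate
# expressed per minute:
import time

def render_progress(done_ops, total_ops, start_time, now=None):
    now = now if now is not None else time.time()
    frac = float(done_ops) / total_ops
    elapsed = max(now - start_time, 1e-6)  # avoid divide-by-zero
    rate_per_min = done_ops / (elapsed / 60.0)
    bar = '>' * int(frac * 30)
    return '[{:30s}] {:.2f}% {:.2f}/min'.format(bar, frac * 100, rate_per_min)

print(render_progress(50, 200, time.time() - 30))
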
recursive=True, rsaprivatekey=None, - rsapublickey=None, rsakeypassphrase=None, skiponmatch=True, - stripcomponents=None, timeout=None) - parser.add_argument('storageaccount', help='name of storage account') - parser.add_argument( - 'container', - help='name of blob container or file share') - parser.add_argument( - 'localresource', - help='name of the local file or directory, if mirroring. "."=use ' - 'current directory') - parser.add_argument( - '--autovhd', action='store_true', - help='automatically upload files ending in .vhd as page blobs') - parser.add_argument( - '--collate', nargs='?', - help='collate all files into a specified path') - parser.add_argument( - '--computeblockmd5', dest='computeblockmd5', action='store_true', - help='compute block/page level MD5 during upload') - parser.add_argument( - '--chunksizebytes', type=int, - help='maximum chunk size to transfer in bytes [{}]'.format( - _MAX_BLOB_CHUNK_SIZE_BYTES)) - parser.add_argument( - '--delete', action='store_true', - help='delete extraneous remote blobs that have no corresponding ' - 'local file when uploading directories') - parser.add_argument( - '--disable-urllib-warnings', action='store_true', - dest='disableurllibwarnings', - help='disable urllib warnings (not recommended)') - parser.add_argument( - '--download', action='store_true', - help='force transfer direction to download from Azure') - parser.add_argument( - '--encmode', - help='encryption mode [{}]'.format(_DEFAULT_ENCRYPTION_MODE)) - parser.add_argument( - '--endpoint', - help='storage endpoint [{}]'.format(_DEFAULT_STORAGE_ENDPOINT)) - parser.add_argument( - '--fileshare', action='store_true', - help='transfer to a file share rather than block/page blob') - parser.add_argument( - '--include', type=str, - help='include pattern (Unix shell-style wildcards)') - parser.add_argument( - '--keepmismatchedmd5files', action='store_true', - help='keep files with MD5 mismatches') - parser.add_argument( - '--managementcert', - help='path to management certificate .pem file') - parser.add_argument( - '--managementep', - help='management endpoint [{}]'.format(_DEFAULT_MANAGEMENT_ENDPOINT)) - parser.add_argument( - '--no-computefilemd5', dest='computefilemd5', action='store_false', - help='do not compute file MD5 and either upload as metadata ' - 'or validate on download') - parser.add_argument( - '--no-createcontainer', dest='createcontainer', action='store_false', - help='do not create container if it does not exist') - parser.add_argument( - '--no-overwrite', dest='overwrite', action='store_false', - help='do not overwrite local files on download') - parser.add_argument( - '--no-progressbar', dest='progressbar', action='store_false', - help='disable progress bar') - parser.add_argument( - '--no-recursive', dest='recursive', action='store_false', - help='do not mirror local directory recursively') - parser.add_argument( - '--no-skiponmatch', dest='skiponmatch', action='store_false', - help='do not skip upload/download on MD5 match') - parser.add_argument( - '--numworkers', type=int, - help='max number of workers [{}]'.format( - _DEFAULT_MAX_STORAGEACCOUNT_WORKERS)) - parser.add_argument( - '--pageblob', action='store_true', - help='upload as page blob rather than block blob, blobs will ' - 'be page-aligned in Azure storage') - parser.add_argument( - '--rsaprivatekey', - help='RSA private key file in PEM format. Specifying an RSA private ' - 'key will turn on decryption (or encryption). 
An RSA private key is ' - 'required for downloading and decrypting blobs and may be specified ' - 'for encrypting and uploading blobs.') - parser.add_argument( - '--rsapublickey', - help='RSA public key file in PEM format. Specifying an RSA public ' - 'key will turn on encryption. An RSA public key can only be used ' - 'for encrypting and uploading blobs.') - parser.add_argument( - '--rsakeypassphrase', - help='Optional passphrase for decrypting an RSA private key; can be ' - 'specified as {} environment variable instead'.format( - _ENVVAR_RSAKEYPASSPHRASE)) - parser.add_argument( - '--remoteresource', - help='name of remote resource on Azure storage. "."=container ' - 'copy recursive implied') - parser.add_argument( - '--saskey', - help='SAS key to use, if recursive upload or container download, ' - 'this must be a container SAS; can be specified as ' - '{} environment variable instead'.format(_ENVVAR_SASKEY)) - parser.add_argument( - '--storageaccountkey', - help='storage account shared key; can be specified as ' - '{} environment variable instead'.format(_ENVVAR_STORAGEACCOUNTKEY)) - parser.add_argument( - '--strip-components', dest='stripcomponents', type=int, - help='strip N leading components from path on upload [1]') - parser.add_argument('--subscriptionid', help='subscription id') - parser.add_argument( - '--timeout', type=float, - help='timeout in seconds for any operation to complete') - parser.add_argument( - '--upload', action='store_true', - help='force transfer direction to upload to Azure') - parser.add_argument('--version', action='version', version=_SCRIPT_VERSION) - return parser.parse_args() - - -if __name__ == '__main__': - main() diff --git a/blobxfer/__init__.py b/blobxfer/__init__.py new file mode 100644 index 0000000..0a8432f --- /dev/null +++ b/blobxfer/__init__.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
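
# Illustrative sketch, not part of the diff: the --rsaprivatekey and
# --rsakeypassphrase options above take a PEM file and an optional passphrase.
# Loading such a key with the `cryptography` package (a blobxfer dependency)
# looks roughly like this; load_rsa_private_key is a hypothetical helper:
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization

def load_rsa_private_key(pem_path, passphrase=None):
    with open(pem_path, 'rb') as fd:
        return serialization.load_pem_private_key(
            fd.read(),
            password=passphrase.encode('utf8') if passphrase else None,
            backend=default_backend())
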
+ +import sys +from .version import __version__ # noqa + +# monkeypatch User-Agent string +import azure.storage +azure.storage._constants.USER_AGENT_STRING = 'blobxfer/{} {}'.format( + __version__, azure.storage._constants.USER_AGENT_STRING) + +# monkeypatch SOCKET_TIMEOUT value in Azure Storage SDK +azure.storage._constants.SOCKET_TIMEOUT = (5, 300) + +# set stdin source +if sys.version_info >= (3, 0): + STDIN = sys.stdin.buffer +else: + # set stdin to binary mode on Windows + if sys.platform == 'win32': # noqa + import msvcrt + import os + msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY) + STDIN = sys.stdin diff --git a/blobxfer/api.py b/blobxfer/api.py new file mode 100644 index 0000000..9034b2e --- /dev/null +++ b/blobxfer/api.py @@ -0,0 +1,53 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function # noqa +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +# non-stdlib imports +# local imports + +from .operations.azure.blob.append import ( # noqa + create_client as create_append_blob_client +) +from .operations.azure.blob.block import ( # noqa + create_client as create_block_blob_client +) +from .operations.azure.blob.page import ( # noqa + create_client as create_page_blob_client +) +from .operations.azure.file import ( # noqa + create_client as create_file_client +) + +from .operations.download import ( # noqa + Downloader +) +from .operations.upload import ( # noqa + Uploader +) diff --git a/blobxfer/models/__init__.py b/blobxfer/models/__init__.py new file mode 100644 index 0000000..1d2e850 --- /dev/null +++ b/blobxfer/models/__init__.py @@ -0,0 +1,123 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
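
# Illustrative sketch, not part of the diff: the package __init__ above exposes
# a binary STDIN handle (sys.stdin.buffer on Python 3, O_BINARY mode on Windows
# Python 2), presumably so data can be streamed from a pipe as raw bytes.
# Reading from the selected handle is then version-agnostic:
import sys

STDIN = sys.stdin.buffer if sys.version_info >= (3, 0) else sys.stdin

def read_stdin_chunk(nbytes=4194304):
    # returns b'' at end of stream
    return STDIN.read(nbytes)
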
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import fnmatch +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# local imports + + +class _BaseSourcePaths(object): + """Base Source Paths""" + def __init__(self): + # type: (_BaseSourcePaths) -> None + """Ctor for _BaseSourcePaths + :param _BaseSourcePaths self: this + """ + self._include = None + self._exclude = None + self._paths = [] + + @property + def paths(self): + # type: (_BaseSourcePaths) -> List[pathlib.Path] + """Stored paths + :param _BaseSourcePaths self: this + :rtype: list + :return: list of pathlib.Path + """ + return self._paths + + def add_includes(self, includes): + # type: (_BaseSourcePaths, list) -> None + """Add a list of includes + :param _BaseSourcePaths self: this + :param list includes: list of includes + """ + if not isinstance(includes, list): + includes = [includes] + if self._include is None: + self._include = includes + else: + self._include.extend(includes) + + def add_excludes(self, excludes): + # type: (_BaseSourcePaths, list) -> None + """Add a list of excludes + :param _BaseSourcePaths self: this + :param list excludes: list of excludes + """ + if not isinstance(excludes, list): + excludes = [excludes] + if self._exclude is None: + self._exclude = excludes + else: + self._exclude.extend(excludes) + + def add_path(self, path): + # type: (_BaseSourcePaths, str) -> None + """Add a local path + :param _BaseSourcePaths self: this + :param str path: path to add + """ + if isinstance(path, pathlib.Path): + self._paths.append(path) + else: + self._paths.append(pathlib.Path(path)) + + def add_paths(self, paths): + # type: (_BaseSourcePaths, list) -> None + """Add a list of local paths + :param _BaseSourcePaths self: this + :param list paths: paths to add + """ + for path in paths: + self.add_path(path) + + def _inclusion_check(self, path): + # type: (_BaseSourcePaths, pathlib.Path) -> bool + """Check file for inclusion against filters + :param _BaseSourcePaths self: this + :param pathlib.Path path: path to check + :rtype: bool + :return: if file should be included + """ + _spath = str(path) + inc = True + if self._include is not None: + inc = 
any([fnmatch.fnmatch(_spath, x) for x in self._include]) + if inc and self._exclude is not None: + inc = not any([fnmatch.fnmatch(_spath, x) for x in self._exclude]) + return inc diff --git a/blobxfer/models/azure.py b/blobxfer/models/azure.py new file mode 100644 index 0000000..b9eb088 --- /dev/null +++ b/blobxfer/models/azure.py @@ -0,0 +1,353 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import enum +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +from azure.storage.blob.models import _BlobTypes as BlobTypes +# local imports +import blobxfer.models.metadata + + +# enums +class StorageModes(enum.Enum): + Auto = 10 + Append = 20 + Block = 30 + File = 40 + Page = 50 + + +class StorageEntity(object): + """Azure Storage Entity""" + def __init__(self, container, ed=None, fileattr=None): + # type: (StorageEntity, str + # blobxfer.models.crypto.EncryptionMetadata) -> None + """Ctor for StorageEntity + :param StorageEntity self: this + :param str container: container name + :param blobxfer.models.crypto.EncryptionMetadata ed: + encryption metadata + """ + self._create_containers = None + self._client = None + self._container = container + self._name = None + self._mode = None + self._lmt = None + self._size = None + self._snapshot = None + self._md5 = None + self._encryption = ed + self._from_local = False + self._append_create = True + self._vio = None + self._fileattr = None + self.replica_targets = None + + @property + def create_containers(self): + # type: (StorageEntity) -> bool + """Create containers + :param StorageEntity self: this + :rtype: bool + :return: create containers + """ + return self._create_containers + + @property + def client(self): + # type: (StorageEntity) -> object + """Associated storage client + :param StorageEntity self: this + :rtype: object + :return: associated storage client + """ + return self._client + + @property + def container(self): + # type: (StorageEntity) -> str + """Container name + :param StorageEntity self: this + :rtype: str + :return: name of container or file share + """ + return self._container + + @property + def 
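
# Illustrative sketch, not part of the diff: the include/exclude semantics of
# _inclusion_check above -- include patterns are OR'd together and evaluated
# first, then exclude patterns veto. In isolation:
import fnmatch

def included(path, includes=None, excludes=None):
    ok = True
    if includes:
        ok = any(fnmatch.fnmatch(path, pat) for pat in includes)
    if ok and excludes:
        ok = not any(fnmatch.fnmatch(path, pat) for pat in excludes)
    return ok

assert included('data/a.bin', includes=['*.bin'])
assert not included('data/a.bin', includes=['*.bin'], excludes=['data/*'])
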
name(self): + # type: (StorageEntity) -> str + """Entity name + :param StorageEntity self: this + :rtype: str + :return: name of entity + """ + return self._name + + @property + def path(self): + # type: (StorageEntity) -> str + """Entity path + :param StorageEntity self: this + :rtype: str + :return: remote path of entity + """ + return '{}/{}'.format(self._container, self._name) + + @property + def lmt(self): + # type: (StorageEntity) -> datetime.datetime + """Entity last modified time + :param StorageEntity self: this + :rtype: datetime.datetime + :return: LMT of entity + """ + return self._lmt + + @property + def size(self): + # type: (StorageEntity) -> int + """Entity size + :param StorageEntity self: this + :rtype: int + :return: size of entity + """ + return self._size + + @size.setter + def size(self, value): + # type: (StorageEntity, int) -> None + """Set entity size + :param StorageEntity self: this + :param int value: value + """ + self._size = value + + @property + def snapshot(self): + # type: (StorageEntity) -> str + """Entity snapshot + :param StorageEntity self: this + :rtype: str + :return: snapshot of entity + """ + return self._snapshot + + @property + def md5(self): + # type: (StorageEntity) -> str + """Base64-encoded MD5 + :param StorageEntity self: this + :rtype: str + :return: md5 of entity + """ + return self._md5 + + @property + def mode(self): + # type: (StorageEntity) -> blobxfer.models.azure.StorageModes + """Entity mode (type) + :param StorageEntity self: this + :rtype: blobxfer.models.azure.StorageModes + :return: type of entity + """ + return self._mode + + @property + def from_local(self): + # type: (StorageEntity) -> bool + """If entity was created from a local file (no remote exists) + :param StorageEntity self: this + :rtype: bool + :return: if entity is from local (no remote exists) + """ + return self._from_local + + @property + def append_create(self): + # type: (StorageEntity) -> bool + """If append blob should be created + :param StorageEntity self: this + :rtype: bool + :return: if append blob should be created + """ + return self._append_create + + @append_create.setter + def append_create(self, value): + # type: (StorageEntity, bool) -> None + """Set append create option + :param StorageEntity self: this + :param bool value: value to set + """ + self._append_create = value + + @property + def is_encrypted(self): + # type: (StorageEntity) -> bool + """If data is encrypted + :param StorageEntity self: this + :rtype: bool + :return: if encryption metadata is present + """ + return self._encryption is not None + + @property + def encryption_metadata(self): + # type: (StorageEntity) -> + # blobxfer.models.crypto.EncryptionMetadata + """Get encryption metadata + :param StorageEntity self: this + :rtype: blobxfer.models.crypto.EncryptionMetadata + :return: encryption metadata of entity + """ + return self._encryption + + @encryption_metadata.setter + def encryption_metadata(self, value): + # type: (StorageEntity, + # blobxfer.models.crypto.EncryptionMetadata) -> None + """Set encryption metadata + :param StorageEntity self: this + :param blobxfer.models.crypto.EncryptionMetadata value: value + """ + self._encryption = value + + @property + def file_attributes(self): + # type: (StorageEntity) -> object + """Return file attributes collection + :param StorageEntity self: this + :rtype: blobxfer.models.metadata.PosixFileAttr or + blobxfer.models.metadata.WindowsFileAttr or None + :return: file attributes + """ + return self._fileattr + + @property + def 
vectored_io(self): + # type: (StorageEntity) -> object + """Return vectored io metadata, currently stripe only + :param StorageEntity self: this + :rtype: blobxfer.models.metadata.VectoredStripe or None + :return: vectored io metadata + """ + return self._vio + + def populate_from_blob(self, sa, blob, vio=None): + # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, + # azure.storage.blob.models.Blob) -> None + """Populate properties from Blob + :param StorageEntity self: this + :param blobxfer.operations.azure.StorageAccount sa: storage account + :param azure.storage.blob.models.Blob blob: blob to populate from + """ + # set props from metadata + self._fileattr = blobxfer.models.metadata.fileattr_from_metadata( + blob.metadata) + self._vio = vio + self._create_containers = sa.create_containers + self._name = blob.name + self._snapshot = blob.snapshot + self._lmt = blob.properties.last_modified + self._size = blob.properties.content_length + self._md5 = blob.properties.content_settings.content_md5 + if blob.properties.blob_type == BlobTypes.AppendBlob: + self._mode = StorageModes.Append + self._client = sa.append_blob_client + elif blob.properties.blob_type == BlobTypes.BlockBlob: + self._mode = StorageModes.Block + self._client = sa.block_blob_client + elif blob.properties.blob_type == BlobTypes.PageBlob: + self._mode = StorageModes.Page + self._client = sa.page_blob_client + + def populate_from_file(self, sa, file, path, vio=None): + # type: (StorageEntity, blobxfer.operations.azure.StorageAccount, + # azure.storage.file.models.File, str) -> None + """Populate properties from File + :param StorageEntity self: this + :param blobxfer.operations.azure.StorageAccount sa: storage account + :param azure.storage.file.models.File file: file to populate from + :param str path: full path to file + """ + # set props from metadata + self._fileattr = blobxfer.models.metadata.fileattr_from_metadata( + file.metadata) + self._vio = vio + self._create_containers = sa.create_containers + if path is not None: + self._name = str(pathlib.Path(path) / file.name) + else: + self._name = file.name + self._snapshot = None + self._lmt = file.properties.last_modified + self._size = file.properties.content_length + self._md5 = file.properties.content_settings.content_md5 + self._mode = StorageModes.File + self._client = sa.file_client + + def populate_from_local(self, sa, container, path, mode): + # type: (StorageEntity, blobxfer.operations.azure.StorageAccount + # str, str, blobxfer.models.azure.StorageModes) -> None + """Populate properties from local + :param StorageEntity self: this + :param blobxfer.operations.azure.StorageAccount sa: storage account + :param str container: container + :param str path: full path to file + :param blobxfer.models.azure.StorageModes mode: storage mode + """ + self._create_containers = sa.create_containers + self._container = container + self._name = path + self._mode = mode + self._from_local = True + if mode == StorageModes.Append: + self._client = sa.append_blob_client + elif mode == StorageModes.Block: + self._client = sa.block_blob_client + elif mode == StorageModes.File: + self._client = sa.file_client + elif mode == StorageModes.Page: + self._client = sa.page_blob_client + elif mode == StorageModes.Auto: + name = self.name.lower() + if name.endswith('.vhd') or name.endswith('.vhdx'): + self._client = sa.page_blob_client + self._mode = StorageModes.Page + else: + self._client = sa.block_blob_client + self._mode = StorageModes.Block diff --git 
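
# Illustrative sketch, not part of the diff: the StorageModes.Auto branch at the
# end of populate_from_local above routes VHD/VHDX images to page blobs and
# everything else to block blobs. The decision in isolation:
def auto_storage_mode(name):
    lowered = name.lower()
    if lowered.endswith('.vhd') or lowered.endswith('.vhdx'):
        return 'page'
    return 'block'

assert auto_storage_mode('os-disk.VHD') == 'page'
assert auto_storage_mode('notes.txt') == 'block'
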
a/blobxfer/models/crypto.py b/blobxfer/models/crypto.py new file mode 100644 index 0000000..b7b0004 --- /dev/null +++ b/blobxfer/models/crypto.py @@ -0,0 +1,405 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import base64 +import collections +import hashlib +import hmac +import json +import os +# non-stdlib imports +# local imports +import blobxfer.models.offload +import blobxfer.operations.crypto +import blobxfer.util + +# encryption constants +AES256_BLOCKSIZE_BYTES = 16 + +# named tuples +EncryptionBlobxferExtensions = collections.namedtuple( + 'EncryptionBlobxferExtensions', [ + 'pre_encrypted_content_md5', + ] +) +EncryptionAgent = collections.namedtuple( + 'EncryptionAgent', [ + 'encryption_algorithm', + 'protocol', + ] +) +EncryptionAuthentication = collections.namedtuple( + 'EncryptionAuthentication', [ + 'algorithm', + 'message_authentication_code', + ] +) +EncryptionWrappedContentKey = collections.namedtuple( + 'EncryptionWrappedContentKey', [ + 'algorithm', + 'encrypted_authentication_key', + 'encrypted_key', + 'key_id', + ] +) +EncryptionMetadataAuthentication = collections.namedtuple( + 'EncryptionMetadataAuthentication', [ + 'algorithm', + 'encoding', + 'message_authentication_code', + ] +) + + +class EncryptionMetadata(object): + """EncryptionMetadata""" + + # constants + _ENCRYPTION_MODE = 'FullBlob' + _ENCRYPTION_PROTOCOL_VERSION = '1.0' + _ENCRYPTION_ALGORITHM = 'AES_CBC_256' + _ENCRYPTED_KEY_SCHEME = 'RSA-OAEP' + _AUTH_ALGORITHM = 'HMAC-SHA256' + _AUTH_ENCODING_TYPE = 'UTF-8' + + _METADATA_KEY_NAME = 'encryptiondata' + _METADATA_KEY_AUTH_NAME = 'encryptiondata_authentication' + + _JSON_KEY_ENCRYPTION_MODE = 'EncryptionMode' + _JSON_KEY_ALGORITHM = 'Algorithm' + _JSON_KEY_MAC = 'MessageAuthenticationCode' + _JSON_KEY_ENCRYPTION_AGENT = 'EncryptionAgent' + _JSON_KEY_PROTOCOL = 'Protocol' + _JSON_KEY_ENCRYPTION_ALGORITHM = 'EncryptionAlgorithm' + _JSON_KEY_INTEGRITY_AUTH = 'EncryptionAuthentication' + _JSON_KEY_WRAPPEDCONTENTKEY = 'WrappedContentKey' + _JSON_KEY_ENCRYPTED_KEY = 'EncryptedKey' + _JSON_KEY_ENCRYPTED_AUTHKEY = 'EncryptedAuthenticationKey' + _JSON_KEY_CONTENT_IV = 'ContentEncryptionIV' + 
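
# Illustrative sketch, not part of the diff: the JSON key constants defined here
# map to an 'encryptiondata' metadata document shaped roughly as follows (the
# values are placeholders; the real document is assembled later by
# convert_to_json_with_mac):
example_encryptiondata = {
    'EncryptionMode': 'FullBlob',
    'ContentEncryptionIV': '<base64 IV>',
    'WrappedContentKey': {
        'KeyId': 'private:pem',
        'EncryptedKey': '<base64 RSA-OAEP wrapped AES-256 key>',
        'EncryptedAuthenticationKey': '<base64 RSA-OAEP wrapped HMAC key>',
        'Algorithm': 'RSA-OAEP',
    },
    'EncryptionAgent': {
        'Protocol': '1.0',
        'EncryptionAlgorithm': 'AES_CBC_256',
    },
    'EncryptionAuthentication': {
        'Algorithm': 'HMAC-SHA256',
        'MessageAuthenticationCode': '<base64 HMAC-SHA256 of the data>',
    },
    'KeyWrappingMetadata': {},
    'BlobxferExtensions': {
        'PreEncryptedContentMD5': '<base64 MD5 of the plaintext>',
    },
}
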
_JSON_KEY_KEYID = 'KeyId' + _JSON_KEY_KEY_WRAPPING_METADATA = 'KeyWrappingMetadata' + _JSON_KEY_BLOBXFER_EXTENSIONS = 'BlobxferExtensions' + _JSON_KEY_PREENCRYPTED_MD5 = 'PreEncryptedContentMD5' + + _JSON_KEY_AUTH_METAAUTH = 'EncryptionMetadataAuthentication' + _JSON_KEY_AUTH_ENCODING = 'Encoding' + + def __init__(self): + # type: (EncryptionMetadata) -> None + """Ctor for EncryptionMetadata + :param EncryptionMetadata self: this + """ + self.blobxfer_extensions = None + self.content_encryption_iv = None + self.encryption_agent = None + self.encryption_authentication = None + self.encryption_mode = None + self.key_wrapping_metadata = {} + self.wrapped_content_key = None + self.encryption_metadata_authentication = None + self._symkey = None + self._signkey = None + self._rsa_public_key = None + + @property + def symmetric_key(self): + # type: (EncryptionMetadata) -> bytes + """Get symmetric key + :param EncryptionMetadata self: this + :rtype: bytes + :return: symmetric key + """ + return self._symkey + + @property + def signing_key(self): + # type: (EncryptionMetadata) -> bytes + """Get singing key + :param EncryptionMetadata self: this + :rtype: bytes + :return: signing key + """ + return self._signkey + + @staticmethod + def encryption_metadata_exists(md): + # type: (dict) -> bool + """Check if encryption metadata exists in json metadata + :param dict md: metadata dictionary + :rtype: bool + :return: if encryption metadata exists + """ + try: + if blobxfer.util.is_not_empty( + md[EncryptionMetadata._METADATA_KEY_NAME]): + return True + except (KeyError, TypeError): + pass + return False + + def create_new_metadata(self, rsa_public_key): + # type: (EncryptionMetadata, + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey) + # -> None + """Create new metadata entries for encryption (upload) + :param EncryptionMetadata self: this + :param cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey: + rsa public key + """ + self._rsa_public_key = rsa_public_key + self._symkey = os.urandom( + blobxfer.operations.crypto._AES256_KEYLENGTH_BYTES) + self._signkey = os.urandom( + blobxfer.operations.crypto._AES256_KEYLENGTH_BYTES) + self.content_encryption_iv = os.urandom(AES256_BLOCKSIZE_BYTES) + self.encryption_agent = EncryptionAgent( + encryption_algorithm=EncryptionMetadata._ENCRYPTION_ALGORITHM, + protocol=EncryptionMetadata._ENCRYPTION_PROTOCOL_VERSION, + ) + self.encryption_mode = EncryptionMetadata._ENCRYPTION_MODE + + def convert_from_json(self, md, entityname, rsaprivatekey): + # type: (EncryptionMetadata, dict, str, + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey) + # -> None + """Read metadata json into objects + :param EncryptionMetadata self: this + :param dict md: metadata dictionary + :param str entityname: entity name + :param rsaprivatekey: RSA private key + :type rsaprivatekey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + """ + # populate from encryption data + ed = json.loads(md[EncryptionMetadata._METADATA_KEY_NAME]) + try: + self.blobxfer_extensions = EncryptionBlobxferExtensions( + pre_encrypted_content_md5=ed[ + EncryptionMetadata._JSON_KEY_BLOBXFER_EXTENSIONS][ + EncryptionMetadata._JSON_KEY_PREENCRYPTED_MD5], + ) + except KeyError: + pass + self.content_encryption_iv = base64.b64decode( + ed[EncryptionMetadata._JSON_KEY_CONTENT_IV]) + self.encryption_agent = EncryptionAgent( + encryption_algorithm=ed[ + EncryptionMetadata._JSON_KEY_ENCRYPTION_AGENT][ + EncryptionMetadata._JSON_KEY_ENCRYPTION_ALGORITHM], + protocol=ed[ + 
EncryptionMetadata._JSON_KEY_ENCRYPTION_AGENT][ + EncryptionMetadata._JSON_KEY_PROTOCOL], + ) + if (self.encryption_agent.encryption_algorithm != + EncryptionMetadata._ENCRYPTION_ALGORITHM): + raise RuntimeError('{}: unknown block cipher: {}'.format( + entityname, self.encryption_agent.encryption_algorithm)) + if (self.encryption_agent.protocol != + EncryptionMetadata._ENCRYPTION_PROTOCOL_VERSION): + raise RuntimeError('{}: unknown encryption protocol: {}'.format( + entityname, self.encryption_agent.protocol)) + self.encryption_authentication = EncryptionAuthentication( + algorithm=ed[ + EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH][ + EncryptionMetadata._JSON_KEY_ALGORITHM], + message_authentication_code=ed[ + EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH][ + EncryptionMetadata._JSON_KEY_MAC], + ) + if (self.encryption_authentication.algorithm != + EncryptionMetadata._AUTH_ALGORITHM): + raise RuntimeError( + '{}: unknown integrity/auth method: {}'.format( + entityname, self.encryption_authentication.algorithm)) + self.encryption_mode = ed[ + EncryptionMetadata._JSON_KEY_ENCRYPTION_MODE] + if self.encryption_mode != EncryptionMetadata._ENCRYPTION_MODE: + raise RuntimeError( + '{}: unknown encryption mode: {}'.format( + entityname, self.encryption_mode)) + try: + _eak = ed[EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_ENCRYPTED_AUTHKEY] + except KeyError: + _eak = None + self.wrapped_content_key = EncryptionWrappedContentKey( + algorithm=ed[ + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_ALGORITHM], + encrypted_authentication_key=_eak, + encrypted_key=ed[ + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_ENCRYPTED_KEY], + key_id=ed[ + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY][ + EncryptionMetadata._JSON_KEY_KEYID], + ) + if (self.wrapped_content_key.algorithm != + EncryptionMetadata._ENCRYPTED_KEY_SCHEME): + raise RuntimeError('{}: unknown key encryption scheme: {}'.format( + entityname, self.wrapped_content_key.algorithm)) + # if RSA key is a public key, stop here as keys cannot be decrypted + if rsaprivatekey is None: + return + # decrypt symmetric key + self._symkey = blobxfer.operations.crypto.\ + rsa_decrypt_base64_encoded_key( + rsaprivatekey, self.wrapped_content_key.encrypted_key) + # decrypt signing key, if it exists + if blobxfer.util.is_not_empty( + self.wrapped_content_key.encrypted_authentication_key): + self._signkey = blobxfer.operations.crypto.\ + rsa_decrypt_base64_encoded_key( + rsaprivatekey, + self.wrapped_content_key.encrypted_authentication_key) + else: + self._signkey = None + # populate from encryption data authentication + try: + eda = json.loads(md[EncryptionMetadata._METADATA_KEY_AUTH_NAME]) + except KeyError: + pass + else: + self.encryption_metadata_authentication = \ + EncryptionMetadataAuthentication( + algorithm=eda[ + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH][ + EncryptionMetadata._JSON_KEY_ALGORITHM], + encoding=eda[ + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH][ + EncryptionMetadata._JSON_KEY_AUTH_ENCODING], + message_authentication_code=eda[ + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH][ + EncryptionMetadata._JSON_KEY_MAC], + ) + if (self.encryption_metadata_authentication.algorithm != + EncryptionMetadata._AUTH_ALGORITHM): + raise RuntimeError( + '{}: unknown integrity/auth method: {}'.format( + entityname, + self.encryption_metadata_authentication.algorithm)) + # verify hmac + authhmac = base64.b64decode( + self.encryption_metadata_authentication. 
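
# Illustrative sketch, not part of the diff: the metadata authentication check
# here hashes the UTF-8 encoded 'encryptiondata' JSON string with HMAC-SHA256
# under the decrypted signing key and compares it to the stored, base64-encoded
# MAC. Standalone equivalents (using a constant-time comparison):
import base64
import hashlib
import hmac

def sign_encryptiondata(signing_key, encryptiondata_json):
    mac = hmac.new(signing_key, digestmod=hashlib.sha256)
    mac.update(encryptiondata_json.encode('utf-8'))
    return base64.b64encode(mac.digest()).decode('ascii')

def verify_encryptiondata(signing_key, encryptiondata_json, stored_b64_mac):
    return hmac.compare_digest(
        sign_encryptiondata(signing_key, encryptiondata_json), stored_b64_mac)
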
+ message_authentication_code) + bmeta = md[EncryptionMetadata._METADATA_KEY_NAME].encode( + self.encryption_metadata_authentication.encoding) + hmacsha256 = hmac.new(self._signkey, digestmod=hashlib.sha256) + hmacsha256.update(bmeta) + if hmacsha256.digest() != authhmac: + raise RuntimeError( + '{}: encryption metadata authentication failed'.format( + entityname)) + + def convert_to_json_with_mac(self, md5digest, hmacdigest): + # type: (EncryptionMetadata, str, str) -> dict + """Constructs metadata for encryption + :param EncryptionMetadata self: this + :param str md5digest: md5 digest + :param str hmacdigest: hmac-sha256 digest (data) + :rtype: dict + :return: encryption metadata + """ + # encrypt keys + enc_content_key = blobxfer.operations.crypto.\ + rsa_encrypt_key_base64_encoded( + None, self._rsa_public_key, self.symmetric_key) + enc_sign_key = blobxfer.operations.crypto.\ + rsa_encrypt_key_base64_encoded( + None, self._rsa_public_key, self.signing_key) + # generate json + encjson = { + EncryptionMetadata._JSON_KEY_ENCRYPTION_MODE: + EncryptionMetadata._ENCRYPTION_MODE, + EncryptionMetadata._JSON_KEY_CONTENT_IV: + blobxfer.util.base64_encode_as_string(self.content_encryption_iv), + EncryptionMetadata._JSON_KEY_WRAPPEDCONTENTKEY: { + EncryptionMetadata._JSON_KEY_KEYID: 'private:pem', + EncryptionMetadata._JSON_KEY_ENCRYPTED_KEY: enc_content_key, + EncryptionMetadata._JSON_KEY_ENCRYPTED_AUTHKEY: enc_sign_key, + EncryptionMetadata._JSON_KEY_ALGORITHM: + EncryptionMetadata._ENCRYPTED_KEY_SCHEME, + }, + EncryptionMetadata._JSON_KEY_ENCRYPTION_AGENT: { + EncryptionMetadata._JSON_KEY_PROTOCOL: + EncryptionMetadata._ENCRYPTION_PROTOCOL_VERSION, + EncryptionMetadata._JSON_KEY_ENCRYPTION_ALGORITHM: + EncryptionMetadata._ENCRYPTION_ALGORITHM, + }, + EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH: { + EncryptionMetadata._JSON_KEY_ALGORITHM: + EncryptionMetadata._AUTH_ALGORITHM, + }, + EncryptionMetadata._JSON_KEY_KEY_WRAPPING_METADATA: {}, + } + if md5digest is not None: + encjson[EncryptionMetadata._JSON_KEY_BLOBXFER_EXTENSIONS] = { + EncryptionMetadata._JSON_KEY_PREENCRYPTED_MD5: md5digest + } + if hmacdigest is not None: + encjson[EncryptionMetadata._JSON_KEY_INTEGRITY_AUTH][ + EncryptionMetadata._JSON_KEY_MAC] = hmacdigest + bencjson = json.dumps( + encjson, sort_keys=True, ensure_ascii=False).encode( + EncryptionMetadata._AUTH_ENCODING_TYPE) + encjson = { + EncryptionMetadata._METADATA_KEY_NAME: + json.dumps(encjson, sort_keys=True) + } + # compute MAC over encjson + hmacsha256 = hmac.new(self._signkey, digestmod=hashlib.sha256) + hmacsha256.update(bencjson) + authjson = { + EncryptionMetadata._JSON_KEY_AUTH_METAAUTH: { + EncryptionMetadata._JSON_KEY_ALGORITHM: + EncryptionMetadata._AUTH_ALGORITHM, + EncryptionMetadata._JSON_KEY_AUTH_ENCODING: + EncryptionMetadata._AUTH_ENCODING_TYPE, + EncryptionMetadata._JSON_KEY_MAC: + blobxfer.util.base64_encode_as_string(hmacsha256.digest()), + } + } + encjson[EncryptionMetadata._METADATA_KEY_AUTH_NAME] = json.dumps( + authjson, sort_keys=True) + return encjson + + def initialize_hmac(self): + # type: (EncryptionMetadata) -> hmac.HMAC + """Initialize an hmac from a signing key if it exists + :param EncryptionMetadata self: this + :rtype: hmac.HMAC or None + :return: hmac + """ + if self._signkey is not None: + return hmac.new(self._signkey, digestmod=hashlib.sha256) + else: + return None diff --git a/blobxfer/models/download.py b/blobxfer/models/download.py new file mode 100644 index 0000000..cc363ef --- /dev/null +++ b/blobxfer/models/download.py @@ 
-0,0 +1,796 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import logging +import math +import os +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +import tempfile +import threading +# non-stdlib imports +# local imports +import blobxfer.models.azure +import blobxfer.models.crypto +import blobxfer.models.options +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) +# global defines +_AUTO_SELECT_CHUNKSIZE_BYTES = 16777216 +# named tuples +Offsets = collections.namedtuple( + 'Offsets', [ + 'chunk_num', + 'fd_start', + 'num_bytes', + 'range_end', + 'range_start', + 'unpad', + ] +) +UncheckedChunk = collections.namedtuple( + 'UncheckedChunk', [ + 'data_len', + 'fd_start', + 'file_path', + 'temp', + ] +) +LocalPathView = collections.namedtuple( + 'LocalPathView', [ + 'fd_end', + 'fd_start', + ] +) + + +class LocalDestinationPath(object): + """Local Destination Path""" + def __init__(self, path=None): + # type: (LocalDestinationPath, str) -> None + """Ctor for LocalDestinationPath + :param LocalDestinationPath self: this + :param str path: path + """ + self._is_dir = None + if path is not None: + self.path = path + + @property + def path(self): + # type: (LocalDestinationPath) -> pathlib.Path + """Path property + :param LocalDestinationPath self: this + :rtype: pathlib.Path + :return: local destination path + """ + return self._path + + @path.setter + def path(self, value): + # type: (LocalDestinationPath, str) -> None + """Path property setter + :param LocalDestinationPath self: this + :param str value: value to set path to + """ + self._path = pathlib.Path(value) + + @property + def is_dir(self): + # type: (LocalDestinationPath) -> bool + """is_dir property + :param LocalDestinationPath self: this + :rtype: bool + :return: if local destination path is a directory + """ + return self._is_dir + + @is_dir.setter + def is_dir(self, value): + # type: (LocalDestinationPath, bool) -> None + """is_dir property setter + :param LocalDestinationPath self: this + :param bool value: value to set is_dir to + """ + self._is_dir = value + + def 
ensure_path_exists(self): + # type: (LocalDestinationPath) -> None + """Ensure path exists + :param LocalDestinationPath self: this + """ + if self._is_dir is None: + raise RuntimeError('is_dir not set') + if self._is_dir: + self._path.mkdir(mode=0o750, parents=True, exist_ok=True) + else: + if self._path.exists() and self._path.is_dir(): + raise RuntimeError( + ('destination path {} already exists and is a ' + 'directory').format(self._path)) + else: + # ensure parent path exists and is created + self._path.parent.mkdir( + mode=0o750, parents=True, exist_ok=True) + + +class Specification(object): + """Download Specification""" + def __init__( + self, download_options, skip_on_options, local_destination_path): + # type: (Specification, blobxfer.models.options.Download, + # blobxfer.models.options.SkipOn, LocalDestinationPath) -> None + """Ctor for Specification + :param DownloadSpecification self: this + :param blobxfer.models.options.Download download_options: + download options + :param blobxfer.models.options.SkipOn skip_on_options: skip on options + :param LocalDestinationPath local_destination_path: local dest path + """ + self.options = download_options + self.skip_on = skip_on_options + self.destination = local_destination_path + self.sources = [] + # validate compatible options + if not self.options.check_file_md5 and self.skip_on.md5_match: + raise ValueError( + 'cannot specify skip on MD5 match without file MD5 enabled') + if (self.options.restore_file_attributes and + not blobxfer.util.on_windows() and os.getuid() != 0): + logger.warning('cannot set file uid/gid without root privileges') + if self.options.chunk_size_bytes < 0: + raise ValueError('chunk size cannot be negative') + + def add_azure_source_path(self, source): + # type: (Specification, blobxfer.operations.azure.SourcePath) -> None + """Add an Azure Source Path + :param DownloadSpecification self: this + :param blobxfer.operations.Azure.SourcePath source: + Azure source path to add + """ + self.sources.append(source) + + +class Descriptor(object): + """Download Descriptor""" + + _AES_BLOCKSIZE = blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES + + def __init__(self, lpath, ase, options, resume_mgr): + # type: (Descriptior, pathlib.Path, + # blobxfer.models.azure.StorageEntity, + # blobxfer.models.options.Download, + # blobxfer.operations.resume.DownloadResumeManager) -> None + """Ctor for Descriptor + :param Descriptor self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :param blobxfer.models.options.Download options: download options + :param blobxfer.operations.resume.DownloadResumeManager resume_mgr: + download resume manager + """ + self._offset = 0 + self._chunk_num = 0 + self._next_integrity_chunk = 0 + self._unchecked_chunks = {} + self._allocated = False + self._finalized = False + self._meta_lock = threading.Lock() + self._hasher_lock = threading.Lock() + self._resume_mgr = resume_mgr + self._ase = ase + # set paths + self.final_path = lpath + self.view = None + # auto-select chunk size + if options.chunk_size_bytes == 0: + chunk_size_bytes = _AUTO_SELECT_CHUNKSIZE_BYTES + else: + chunk_size_bytes = options.chunk_size_bytes + self._chunk_size = min((chunk_size_bytes, self._ase.size)) + # calculate the total number of ops required for transfer + self._total_chunks = self._compute_total_chunks(self._chunk_size) + self._outstanding_ops = self._total_chunks + # initialize integrity checkers + self.hmac = None + self.md5 = None + 
self._integrity_failed = False + self._initialize_integrity_checkers(options) + + @property + def entity(self): + # type: (Descriptor) -> blobxfer.models.azure.StorageEntity + """Get linked blobxfer.models.azure.StorageEntity + :param Descriptor self: this + :rtype: blobxfer.models.azure.StorageEntity + :return: blobxfer.models.azure.StorageEntity + """ + return self._ase + + @property + def must_compute_md5(self): + # type: (Descriptor) -> bool + """Check if MD5 must be computed + :param Descriptor self: this + :rtype: bool + :return: if MD5 must be computed + """ + return self.md5 is not None + + @property + def all_operations_completed(self): + # type: (Descriptor) -> bool + """All operations are completed + :param Descriptor self: this + :rtype: bool + :return: if all operations completed + """ + with self._meta_lock: + return (self._outstanding_ops == 0 and + len(self._unchecked_chunks) == 0) + + @property + def is_resumable(self): + # type: (Descriptor) -> bool + """Download is resume capable + :param Descriptor self: this + :rtype: bool + :return: if resumable + """ + return self._resume_mgr is not None and self.hmac is None + + def _compute_total_chunks(self, chunk_size): + # type: (Descriptor, int) -> int + """Compute total number of chunks for entity + :param Descriptor self: this + :param int chunk_size: chunk size + :rtype: int + :return: num chunks + """ + try: + return int(math.ceil(self._ase.size / chunk_size)) + except ZeroDivisionError: + return 0 + + def _initialize_integrity_checkers(self, options): + # type: (Descriptor, blobxfer.models.options.Download) -> None + """Initialize file integrity checkers + :param Descriptor self: this + :param blobxfer.models.options.Download options: download options + """ + if self._ase.is_encrypted: + # ensure symmetric key exists + if blobxfer.util.is_none_or_empty( + self._ase.encryption_metadata.symmetric_key): + raise RuntimeError( + 'symmetric key is invalid: provide RSA private key ' + 'or metadata corrupt') + self.hmac = self._ase.encryption_metadata.initialize_hmac() + if (self.hmac is None and options.check_file_md5 and + blobxfer.util.is_not_empty(self._ase.md5)): + self.md5 = blobxfer.util.new_md5_hasher() + + @staticmethod + def compute_allocated_size(size, is_encrypted): + # type: (int, bool) -> int + """Compute allocated size on disk + :param int size: size (content length) + :param bool is_ecrypted: if entity is encrypted + :rtype: int + :return: required size on disk + """ + # compute size + if size > 0: + if is_encrypted: + # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs + allocatesize = ( + size // + blobxfer.models.download.Descriptor._AES_BLOCKSIZE - 1 + ) * blobxfer.models.download.Descriptor._AES_BLOCKSIZE + else: + allocatesize = size + if allocatesize < 0: + allocatesize = 0 + else: + allocatesize = 0 + return allocatesize + + @staticmethod + def generate_view(ase): + # type: (blobxfer.models.azure.StorageEntity) -> + # Tuple[LocalPathView, int] + """Generate local path view and total size required + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :rtype: tuple + :return: (local path view, allocation size) + """ + slicesize = blobxfer.models.download.Descriptor.compute_allocated_size( + ase.size, ase.is_encrypted) + if ase.vectored_io is None: + view = LocalPathView( + fd_start=0, + fd_end=slicesize, + ) + total_size = slicesize + else: + view = LocalPathView( + fd_start=ase.vectored_io.offset_start, + fd_end=ase.vectored_io.offset_start + slicesize, + ) + total_size = 
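
# Illustrative sketch, not part of the diff: compute_allocated_size above leans
# on the fact that AES-CBC with PKCS7 padding always rounds the plaintext up to
# the next full 16-byte block (adding 1..16 bytes of padding):
AES_BS = 16

def padded_ciphertext_len(clear_len):
    return (clear_len // AES_BS + 1) * AES_BS

assert padded_ciphertext_len(0) == 16
assert padded_ciphertext_len(15) == 16
assert padded_ciphertext_len(16) == 32
assert padded_ciphertext_len(1000) == 1008
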
ase.vectored_io.total_size + return view, total_size + + @staticmethod + def convert_vectored_io_slice_to_final_path_name(local_path, ase): + # type: (pathlib.Path, + # blobxfer.models.azure.StorageEntity) -> pathlib.Path + """Convert vectored io slice to final path name + :param pathlib.Path local_path: local path + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :rtype: pathlib.Path + :return: converted final path + """ + name = local_path.name + name = blobxfer.models.metadata.\ + remove_vectored_io_slice_suffix_from_name( + name, ase.vectored_io.slice_id) + _tmp = list(local_path.parts[:-1]) + _tmp.append(name) + return pathlib.Path(*_tmp) + + def _set_final_path_view(self): + # type: (Descriptor) -> int + """Set final path view and return required space on disk + :param Descriptor self: this + :rtype: int + :return: required size on disk + """ + # set final path if vectored io stripe + if self._ase.vectored_io is not None: + self.final_path = blobxfer.models.download.Descriptor.\ + convert_vectored_io_slice_to_final_path_name( + self.final_path, self._ase) + # generate view + view, total_size = blobxfer.models.download.Descriptor.generate_view( + self._ase) + self.view = view + return total_size + + def _allocate_disk_space(self): + # type: (Descriptor) -> None + """Perform file allocation (possibly sparse) + :param Descriptor self: this + """ + with self._meta_lock: + if self._allocated or self._offset != 0: + return + # set local path view + allocatesize = self._set_final_path_view() + # check if path already exists and is of sufficient size + if (not self.final_path.exists() or + self.final_path.stat().st_size != allocatesize): + # create parent path + self.final_path.parent.mkdir( + mode=0o750, parents=True, exist_ok=True) + # allocate file + with self.final_path.open('wb') as fd: + if allocatesize > 0: + try: + os.posix_fallocate(fd.fileno(), 0, allocatesize) + except AttributeError: + fd.seek(allocatesize - 1) + fd.write(b'\0') + self._allocated = True + + def _resume(self): + # type: (Descriptor) -> int + """Resume a download, if possible + :param Descriptor self: this + :rtype: int or None + :return: verified download offset + """ + if self._resume_mgr is None or self._offset > 0 or self._finalized: + return None + # check if path exists in resume db + rr = self._resume_mgr.get_record(self._ase) + if rr is None: + logger.debug('no resume record for {}'.format(self.final_path)) + return None + # ensure lengths are the same + if rr.length != self._ase.size: + logger.warning('resume length mismatch {} -> {}'.format( + rr.length, self._ase.size)) + return None + # calculate current chunk and offset + if rr.next_integrity_chunk == 0: + logger.debug('nothing to resume for {}'.format(self.final_path)) + return None + curr_chunk = rr.next_integrity_chunk + # set offsets if completed and the final path exists + if rr.completed and self.final_path.exists(): + with self._meta_lock: + logger.debug('{} download already completed'.format( + self.final_path)) + self._offset = self._ase.size + self._chunk_num = curr_chunk + self._chunk_size = rr.chunk_size + self._total_chunks = self._compute_total_chunks(rr.chunk_size) + self._next_integrity_chunk = rr.next_integrity_chunk + self._outstanding_ops = 0 + self._finalized = True + return self._ase.size + # encrypted files are not resumable due to hmac requirement + if self._ase.is_encrypted: + logger.debug('cannot resume encrypted entity {}'.format( + self._ase.path)) + return None + self._allocate_disk_space() + # check if 
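
# Illustrative sketch, not part of the diff: _allocate_disk_space above
# pre-allocates the destination file, preferring os.posix_fallocate and falling
# back to seek-and-write of a single zero byte where it is unavailable:
import os
import tempfile

def preallocate(path, size):
    with open(path, 'wb') as fd:
        if size > 0:
            try:
                os.posix_fallocate(fd.fileno(), 0, size)
            except (AttributeError, OSError):
                # not available on this platform or filesystem
                fd.seek(size - 1)
                fd.write(b'\0')

tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.close()
preallocate(tmp.name, 4096)
assert os.path.getsize(tmp.name) == 4096
os.remove(tmp.name)
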
final path exists + if not self.final_path.exists(): + logger.warning('download path {} does not exist'.format( + self.final_path)) + return None + if self.hmac is not None: + raise RuntimeError( + 'unexpected hmac object for entity {}'.format(self._ase.path)) + # re-hash from 0 to offset if needed + _fd_offset = 0 + _end_offset = min((curr_chunk * rr.chunk_size, rr.length)) + if self.md5 is not None and curr_chunk > 0: + _blocksize = blobxfer.util.MEGABYTE << 2 + logger.debug( + 'integrity checking existing file {} offset {} -> {}'.format( + self.final_path, + self.view.fd_start, + self.view.fd_start + _end_offset) + ) + with self._hasher_lock: + with self.final_path.open('rb') as filedesc: + filedesc.seek(self.view.fd_start, 0) + while _fd_offset < _end_offset: + if (_fd_offset + _blocksize) > _end_offset: + _blocksize = _end_offset - _fd_offset + _buf = filedesc.read(_blocksize) + self.md5.update(_buf) + _fd_offset += _blocksize + del _blocksize + # compare hashes + hexdigest = self.md5.hexdigest() + if rr.md5hexdigest != hexdigest: + logger.warning( + 'MD5 mismatch resume={} computed={} for {}'.format( + rr.md5hexdigest, hexdigest, self.final_path)) + # reset hasher + self.md5 = blobxfer.util.new_md5_hasher() + return None + # set values from resume + with self._meta_lock: + self._offset = _end_offset + self._chunk_num = curr_chunk + self._chunk_size = rr.chunk_size + self._total_chunks = self._compute_total_chunks(rr.chunk_size) + self._next_integrity_chunk = rr.next_integrity_chunk + self._outstanding_ops = ( + self._total_chunks - self._next_integrity_chunk + ) + logger.debug( + ('resuming file {} from byte={} chunk={} chunk_size={} ' + 'total_chunks={} next_integrity_chunk={} ' + 'outstanding_ops={}').format( + self.final_path, self._offset, self._chunk_num, + self._chunk_size, self._total_chunks, + self._next_integrity_chunk, self._outstanding_ops)) + return _end_offset + + def cleanup_all_temporary_files(self): + # type: (Descriptor) -> None + """Cleanup all temporary files in case of an exception or interrupt. + This function is not thread-safe. + :param Descriptor self: this + """ + # delete local file + try: + self.final_path.unlink() + except OSError: + pass + # iterate unchecked chunks and delete + for key in self._unchecked_chunks: + ucc = self._unchecked_chunks[key]['ucc'] + if ucc.temp: + try: + ucc.file_path.unlink() + except OSError: + pass + + def next_offsets(self): + # type: (Descriptor) -> Offsets + """Retrieve the next offsets + :param Descriptor self: this + :rtype: Offsets + :return: download offsets + """ + resume_bytes = self._resume() + if resume_bytes is None and not self._allocated: + self._allocate_disk_space() + with self._meta_lock: + if self._offset >= self._ase.size: + return None, resume_bytes + if self._offset + self._chunk_size > self._ase.size: + chunk = self._ase.size - self._offset + else: + chunk = self._chunk_size + # on download, num_bytes must be offset by -1 as the x-ms-range + # header expects it that way. x -> y bytes means first bits of the + # (x+1)th byte to the last bits of the (y+1)th byte. 
for example, + # 0 -> 511 means byte 1 to byte 512 + num_bytes = chunk - 1 + chunk_num = self._chunk_num + fd_start = self._offset + range_start = self._offset + if self._ase.is_encrypted: + # ensure start is AES block size aligned + range_start = range_start - \ + (range_start % self._AES_BLOCKSIZE) - \ + self._AES_BLOCKSIZE + if range_start <= 0: + range_start = 0 + range_end = self._offset + num_bytes + self._offset += chunk + self._chunk_num += 1 + if self._ase.is_encrypted and self._offset >= self._ase.size: + unpad = True + else: + unpad = False + return Offsets( + chunk_num=chunk_num, + fd_start=fd_start, + num_bytes=chunk, + range_start=range_start, + range_end=range_end, + unpad=unpad, + ), resume_bytes + + def hmac_iv(self, iv): + # type: (Descriptor, bytes) -> None + """Send IV through hasher + :param Descriptor self: this + :param bytes iv: iv + """ + with self._hasher_lock: + self.hmac.update(iv) + + def write_unchecked_data(self, offsets, data): + # type: (Descriptor, Offsets, bytes) -> None + """Write unchecked data to disk + :param Descriptor self: this + :param Offsets offsets: download offsets + :param bytes data: data + """ + self.write_data(offsets, data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=self.view.fd_start + offsets.fd_start, + file_path=self.final_path, + temp=False, + ) + with self._meta_lock: + self._unchecked_chunks[offsets.chunk_num] = { + 'ucc': unchecked, + 'decrypted': True, + } + + def write_unchecked_hmac_data(self, offsets, data): + # type: (Descriptor, Offsets, bytes) -> None + """Write unchecked encrypted data to disk + :param Descriptor self: this + :param Offsets offsets: download offsets + :param bytes data: hmac/encrypted data + """ + fname = None + with tempfile.NamedTemporaryFile(mode='wb', delete=False) as fd: + fname = fd.name + fd.write(data) + unchecked = UncheckedChunk( + data_len=len(data), + fd_start=0, + file_path=pathlib.Path(fname), + temp=True, + ) + with self._meta_lock: + self._unchecked_chunks[offsets.chunk_num] = { + 'ucc': unchecked, + 'decrypted': False, + } + return str(unchecked.file_path) + + def mark_unchecked_chunk_decrypted(self, chunk_num): + # type: (Descriptor, int) -> None + """Mark an unchecked chunk as decrypted + :param Descriptor self: this + :param int chunk_num: unchecked chunk number + """ + with self._meta_lock: + self._unchecked_chunks[chunk_num]['decrypted'] = True + + def perform_chunked_integrity_check(self): + # type: (Descriptor) -> None + """Hash data against stored hasher safely + :param Descriptor self: this + """ + hasher = self.hmac or self.md5 + # iterate from next chunk to be checked + while True: + ucc = None + with self._meta_lock: + chunk_num = self._next_integrity_chunk + # check if the next chunk is ready + if (chunk_num in self._unchecked_chunks and + self._unchecked_chunks[chunk_num]['decrypted']): + ucc = self._unchecked_chunks.pop(chunk_num)['ucc'] + else: + break + # hash data and set next integrity chunk + md5hexdigest = None + if hasher is not None: + with ucc.file_path.open('rb') as fd: + if not ucc.temp: + fd.seek(ucc.fd_start, 0) + chunk = fd.read(ucc.data_len) + if ucc.temp: + ucc.file_path.unlink() + with self._hasher_lock: + hasher.update(chunk) + if hasher == self.md5: + md5hexdigest = hasher.hexdigest() + with self._meta_lock: + # update integrity counter and resume db + self._next_integrity_chunk += 1 + if self.is_resumable: + self._resume_mgr.add_or_update_record( + self.final_path, self._ase, self._chunk_size, + self._next_integrity_chunk, False, 
md5hexdigest, + ) + # decrement outstanding op counter + self._outstanding_ops -= 1 + + def _update_resume_for_completed(self): + # type: (Descriptor) -> None + """Update resume for completion + :param Descriptor self: this + """ + if not self.is_resumable: + return + with self._meta_lock: + self._resume_mgr.add_or_update_record( + self.final_path, self._ase, self._chunk_size, + self._next_integrity_chunk, True, None, + ) + + def write_data(self, offsets, data): + # type: (Descriptor, Offsets, bytes) -> None + """Postpone integrity check for chunk + :param Descriptor self: this + :param Offsets offsets: download offsets + :param bytes data: data + """ + if len(data) > 0: + with self.final_path.open('r+b') as fd: + # offset some internal view + fd.seek(self.view.fd_start + offsets.fd_start, 0) + fd.write(data) + + def finalize_integrity(self): + # type: (Descriptor) -> None + """Finalize integrity check for download + :param Descriptor self: this + """ + with self._meta_lock: + if self._finalized: + return + # check final file integrity + check = False + msg = None + if self.hmac is not None: + mac = self._ase.encryption_metadata.encryption_authentication.\ + message_authentication_code + digest = blobxfer.util.base64_encode_as_string(self.hmac.digest()) + if digest == mac: + check = True + msg = '{}: {}, {} {} {}'.format( + self._ase.encryption_metadata.encryption_authentication. + algorithm, + 'OK' if check else 'MISMATCH', + self._ase.path, + digest, + mac, + ) + elif self.md5 is not None: + digest = blobxfer.util.base64_encode_as_string(self.md5.digest()) + if digest == self._ase.md5: + check = True + msg = 'MD5: {}, {} {} {}'.format( + 'OK' if check else 'MISMATCH', + self._ase.path, + digest, + self._ase.md5, + ) + else: + check = True + msg = 'MD5: SKIPPED, {} None {}'.format( + self._ase.path, + self._ase.md5 + ) + # cleanup if download failed + if not check: + self._integrity_failed = True + logger.error(msg) + logger.info(msg) + + def _restore_file_attributes(self): + # type: (Descriptor) -> None + """Restore file attributes for file + :param Descriptor self: this + """ + if self._ase.file_attributes is None: + return + # set file uid/gid and mode + if blobxfer.util.on_windows(): + # TODO not implemented yet + pass + else: + self.final_path.chmod(int(self._ase.file_attributes.mode, 8)) + if os.getuid() == 0: + os.chown( + str(self.final_path), + self._ase.file_attributes.uid, + self._ase.file_attributes.gid + ) + + def finalize_file(self): + # type: (Descriptor) -> None + """Finalize file for download + :param Descriptor self: this + """ + # delete bad file if integrity failed + if self._integrity_failed: + self.final_path.unlink() + else: + self._restore_file_attributes() + # update resume file + self._update_resume_for_completed() + with self._meta_lock: + self._finalized = True diff --git a/blobxfer/models/metadata.py b/blobxfer/models/metadata.py new file mode 100644 index 0000000..ead4b79 --- /dev/null +++ b/blobxfer/models/metadata.py @@ -0,0 +1,282 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
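
To make the range arithmetic in `Descriptor.next_offsets` above concrete, here is a minimal standalone sketch (the function and variable names are illustrative, not part of blobxfer's API): the x-ms-range header is inclusive, so a chunk of N bytes ends at `offset + N - 1`, and for encrypted entities the start is aligned down one extra AES block, presumably so the decryptor has the preceding ciphertext block available, with the final chunk flagged for unpadding.

```python
# Minimal sketch of the encrypted-range math in Descriptor.next_offsets above.
# Names are illustrative only; 16 is the AES block size in bytes.
AES_BLOCKSIZE = 16


def encrypted_range(offset, chunk, total_size):
    # inclusive HTTP range: a chunk of `chunk` bytes ends at offset + chunk - 1
    range_start = offset - (offset % AES_BLOCKSIZE) - AES_BLOCKSIZE
    if range_start <= 0:
        range_start = 0
    range_end = offset + chunk - 1
    # the last chunk of an encrypted entity must be unpadded after decryption
    unpad = offset + chunk >= total_size
    return range_start, range_end, unpad


# second 4 MiB chunk of an 8 MiB encrypted blob
print(encrypted_range(4194304, 4194304, 8388608))
# (4194288, 8388607, True)
```
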
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import json +import logging +# non-stdlib imports +# local imports +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) +# global defines +JSON_KEY_BLOBXFER_METADATA = 'blobxfer_metadata' +# file attributes +_JSON_KEY_FILE_ATTRIBUTES = 'FileAttributes' +_JSON_KEY_FILE_ATTRIBUTES_POSIX = 'POSIX' +_JSON_KEY_FILE_ATTRIBUTES_WINDOWS = 'Windows' +_JSON_KEY_FILE_ATTRIBUTES_MODE = 'mode' +_JSON_KEY_FILE_ATTRIBUTES_UID = 'uid' +_JSON_KEY_FILE_ATTRIBUTES_GID = 'gid' +# vectored io +_JSON_KEY_VECTORED_IO = 'VectoredIO' +_JSON_KEY_VECTORED_IO_MODE = 'Mode' +_JSON_KEY_VECTORED_IO_STRIPE = 'Stripe' +_JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SIZE = 'TotalSize' +_JSON_KEY_VECTORED_IO_STRIPE_OFFSET_START = 'OffsetStart' +_JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SLICES = 'TotalSlices' +_JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID = 'SliceId' +_JSON_KEY_VECTORED_IO_STRIPE_NEXT = 'Next' +# named tuples +PosixFileAttr = collections.namedtuple( + 'PosixFileAttr', [ + 'gid', + 'mode', + 'uid', + ] +) +WindowsFileAttr = collections.namedtuple( + 'WindowsFileAttr', [ + ] +) +VectoredStripe = collections.namedtuple( + 'VectoredStripe', [ + 'next', + 'offset_start', + 'slice_id', + 'total_size', + 'total_slices', + ] +) +VectoredNextEntry = collections.namedtuple( + 'VectoredNextEntry', [ + 'storage_account_name', + 'endpoint', + 'container', + 'name', + ] +) + + +def get_md5_from_metadata(ase): + # type: (blobxfer.models.azure.StorageEntity) -> str + """Get MD5 from properties or metadata + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :rtype: str or None + :return: md5 + """ + # if encryption metadata is present, check for pre-encryption + # md5 in blobxfer extensions + md5 = None + if ase.is_encrypted: + md5 = ase.encryption_metadata.blobxfer_extensions.\ + pre_encrypted_content_md5 + if blobxfer.util.is_none_or_empty(md5): + md5 = ase.md5 + return md5 + + +def generate_fileattr_metadata(local_path, metadata): + # type: (blobxfer.models.upload.LocalPath, dict) -> dict + """Generate file attribute metadata dict + :param blobxfer.models.upload.LocalPath local_path: local path + :param dict metadata: existing 
metadata dict + :rtype: dict + :return: merged metadata dictionary + """ + if blobxfer.util.on_windows(): + logger.warning( + 'file attributes store/restore on Windows is not supported yet') + return None + else: + md = { + _JSON_KEY_FILE_ATTRIBUTES: { + _JSON_KEY_FILE_ATTRIBUTES_POSIX: { + _JSON_KEY_FILE_ATTRIBUTES_MODE: local_path.mode, + _JSON_KEY_FILE_ATTRIBUTES_UID: local_path.uid, + _JSON_KEY_FILE_ATTRIBUTES_GID: local_path.gid, + } + } + } + return blobxfer.util.merge_dict(metadata, md) + + +def fileattr_from_metadata(md): + # type: (dict) -> collections.namedtuple + """Convert fileattr metadata in json metadata + :param dict md: metadata dictionary + :rtype: PosixFileAttr or WindowsFileAttr or None + :return: fileattr metadata + """ + try: + mdattr = json.loads( + md[JSON_KEY_BLOBXFER_METADATA])[_JSON_KEY_FILE_ATTRIBUTES] + except (KeyError, TypeError): + return None + else: + if blobxfer.util.on_windows(): + logger.warning( + 'file attributes store/restore on Windows is not supported ' + 'yet') + fileattr = None + else: + try: + fileattr = PosixFileAttr( + mode=mdattr[_JSON_KEY_FILE_ATTRIBUTES_POSIX][ + _JSON_KEY_FILE_ATTRIBUTES_MODE], + uid=mdattr[_JSON_KEY_FILE_ATTRIBUTES_POSIX][ + _JSON_KEY_FILE_ATTRIBUTES_UID], + gid=mdattr[_JSON_KEY_FILE_ATTRIBUTES_POSIX][ + _JSON_KEY_FILE_ATTRIBUTES_GID], + ) + except KeyError: + fileattr = None + return fileattr + + +def restore_fileattr(path, metadata): + # type: (pathlib.Path, dict) -> None + """Restore file attributes from metadata + :param pathlib.Path path: path to modify + :param dict metadata: existing metadata dict + """ + if blobxfer.util.on_windows(): + logger.warning( + 'file attributes store/restore on Windows is not supported yet') + raise NotImplementedError() + + +def create_vectored_io_next_entry(ase): + # type: (blobxfer.models.azure.StorageEntity) -> str + """Create Vectored IO next entry id + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :rtype: str + :return: vectored io next entry + """ + return ';'.join( + (ase.client.primary_endpoint, ase.container, ase.name) + ) + + +def explode_vectored_io_next_entry(entry): + # type: (str, int) -> str + """Explode next vectored io entry + :param str entry: next entry + :rtype: VectoredNextEntry + :return: vectored next entry + """ + tmp = entry.split(';') + _sa = tmp[0].split('.') + return VectoredNextEntry( + storage_account_name=_sa[0], + endpoint='.'.join(_sa[2:]), + container=tmp[1], + name=tmp[2], + ) + + +def remove_vectored_io_slice_suffix_from_name(name, slice): + # type: (str, int) -> str + """Remove vectored io (stripe) slice suffix from a given name + :param str name: entity name + :param int slice: slice num + :rtype: str + :return: name without suffix + """ + suffix = '.bxslice-{}'.format(slice) + if name.endswith(suffix): + return name[:-len(suffix)] + else: + return name + + +def generate_vectored_io_stripe_metadata(local_path, metadata): + # type: (blobxfer.models.upload.LocalPath, dict) -> dict + """Generate vectored io stripe metadata dict + :param blobxfer.models.upload.LocalPath local_path: local path + :param dict metadata: existing metadata dict + :rtype: dict + :return: merged metadata dictionary + """ + md = { + _JSON_KEY_VECTORED_IO: { + _JSON_KEY_VECTORED_IO_MODE: _JSON_KEY_VECTORED_IO_STRIPE, + _JSON_KEY_VECTORED_IO_STRIPE: { + _JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SIZE: local_path.total_size, + _JSON_KEY_VECTORED_IO_STRIPE_OFFSET_START: + local_path.view.fd_start, + _JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SLICES: + 
local_path.view.total_slices, + _JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID: + local_path.view.slice_num, + _JSON_KEY_VECTORED_IO_STRIPE_NEXT: local_path.view.next, + } + } + } + return blobxfer.util.merge_dict(metadata, md) + + +def vectored_io_from_metadata(md): + # type: (dict) -> collections.namedtuple + """Convert vectored io metadata in json metadata + :param dict md: metadata dictionary + :rtype: VectoredStripe or None + :return: vectored io metadata + """ + try: + mdattr = json.loads( + md[JSON_KEY_BLOBXFER_METADATA])[_JSON_KEY_VECTORED_IO] + except (KeyError, TypeError): + pass + else: + if mdattr[_JSON_KEY_VECTORED_IO_MODE] == _JSON_KEY_VECTORED_IO_STRIPE: + mdstripe = mdattr[_JSON_KEY_VECTORED_IO_STRIPE] + try: + nextptr = explode_vectored_io_next_entry( + mdstripe[_JSON_KEY_VECTORED_IO_STRIPE_NEXT]) + except (KeyError, AttributeError): + nextptr = None + vio = VectoredStripe( + total_size=mdstripe[_JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SIZE], + offset_start=mdstripe[ + _JSON_KEY_VECTORED_IO_STRIPE_OFFSET_START], + total_slices=mdstripe[ + _JSON_KEY_VECTORED_IO_STRIPE_TOTAL_SLICES], + slice_id=mdstripe[_JSON_KEY_VECTORED_IO_STRIPE_SLICE_ID], + next=nextptr, + ) + return vio + else: + raise RuntimeError('Cannot handle Vectored IO mode: {}'.format( + mdattr[_JSON_KEY_VECTORED_IO_MODE])) + return None diff --git a/blobxfer/models/offload.py b/blobxfer/models/offload.py new file mode 100644 index 0000000..ca2cc85 --- /dev/null +++ b/blobxfer/models/offload.py @@ -0,0 +1,129 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
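
The helpers above, `create_vectored_io_next_entry` and `explode_vectored_io_next_entry`, define a simple semicolon-delimited pointer between stripe slices, and `remove_vectored_io_slice_suffix_from_name` strips the slice suffix to recover the final name. A small roundtrip sketch (the account, container, and blob names are invented for illustration):

```python
# Illustrative roundtrip of the Vectored IO 'Next' entry format used above:
# 'primary_endpoint;container;name', where the first host label is the storage
# account and the service label (e.g. 'blob') is dropped.
entry = 'mysa.blob.core.windows.net;mycont;data/file0.bin.bxslice-1'


def explode(entry):
    endpoint, container, name = entry.split(';')
    labels = endpoint.split('.')
    return {
        'storage_account_name': labels[0],
        'endpoint': '.'.join(labels[2:]),
        'container': container,
        'name': name,
    }


print(explode(entry))
# {'storage_account_name': 'mysa', 'endpoint': 'core.windows.net',
#  'container': 'mycont', 'name': 'data/file0.bin.bxslice-1'}

# stripe slices carry a '.bxslice-<n>' suffix that is stripped for the final name
suffix = '.bxslice-1'
print('data/file0.bin.bxslice-1'[:-len(suffix)])   # data/file0.bin
```
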
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import logging +import multiprocessing +import threading +try: + import queue +except ImportError: # noqa + import Queue as queue + +# create logger +logger = logging.getLogger(__name__) + + +class _MultiprocessOffload(object): + def __init__(self, target, num_workers, description=None): + # type: (_MultiprocessOffload, function, int, str) -> None + """Ctor for Crypto Offload + :param _MultiprocessOffload self: this + :param function target: target function for process + :param int num_workers: number of worker processes + :param str description: description + """ + self._task_queue = multiprocessing.Queue() + self._done_queue = multiprocessing.Queue() + self._done_cv = multiprocessing.Condition() + self._term_signal = multiprocessing.Value('i', 0) + self._procs = [] + self._check_thread = None + self._initialize_processes(target, num_workers, description) + + @property + def done_cv(self): + # type: (_MultiprocessOffload) -> multiprocessing.Condition + """Get Done condition variable + :param _MultiprocessOffload self: this + :rtype: multiprocessing.Condition + :return: cv for download done + """ + return self._done_cv + + @property + def terminated(self): + # type: (_MultiprocessOffload) -> bool + """Check if terminated + :param _MultiprocessOffload self: this + :rtype: bool + :return: if terminated + """ + return self._term_signal.value == 1 + + def _initialize_processes(self, target, num_workers, description): + # type: (_MultiprocessOffload, function, int, str) -> None + """Initialize processes + :param _MultiprocessOffload self: this + :param function target: target function for process + :param int num_workers: number of worker processes + :param str description: description + """ + if num_workers is None or num_workers < 1: + raise ValueError('invalid num_workers: {}'.format(num_workers)) + logger.debug('initializing {}{} processes'.format( + num_workers, ' ' + description if not None else '')) + for _ in range(num_workers): + proc = multiprocessing.Process(target=target) + proc.start() + self._procs.append(proc) + + def finalize_processes(self): + # type: (_MultiprocessOffload) -> None + """Finalize processes + :param _MultiprocessOffload self: this + """ + self._term_signal.value = 1 + if self._check_thread is not None: + self._check_thread.join() + for proc in self._procs: + proc.join() + + def pop_done_queue(self): + # type: (_MultiprocessOffload) -> object + """Get item from done queue + :param _MultiprocessOffload self: this + :rtype: object or None + :return: object from done queue, if exists + """ + try: + return self._done_queue.get_nowait() + except queue.Empty: + return None + + def initialize_check_thread(self, check_func): + # type: (_MultiprocessOffload, function) -> None + """Initialize the crypto done queue check thread + :param Downloader self: this + :param function check_func: check function + """ + self._check_thread = threading.Thread(target=check_func) + self._check_thread.start() diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py new file mode 100644 index 0000000..2a17c1a --- /dev/null +++ b/blobxfer/models/options.py @@ -0,0 +1,173 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
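
The `_MultiprocessOffload` class above only provides the scaffolding; the actual crypto and MD5 offload targets are bound methods defined elsewhere that take no arguments and reference the instance's queues directly. The following hypothetical worker loop only illustrates the shape such a target follows: poll the task queue, honor the termination signal, and post results to the done queue.

```python
# Hypothetical offload worker loop; names and signature are illustrative only.
import multiprocessing
import queue


def worker(term_signal, task_queue, done_queue):
    # run until the shared termination signal is raised
    while term_signal.value != 1:
        try:
            item = task_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        # ... perform the offloaded work here (e.g. hashing or decryption) ...
        done_queue.put(item)
```

The parent side then drains results via `pop_done_queue` from a check thread started with `initialize_check_thread`, notifying waiters on `done_cv`.
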
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import logging +import multiprocessing +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# local imports +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + +# named tuples +VectoredIo = collections.namedtuple( + 'VectoredIoOptions', [ + 'stripe_chunk_size_bytes', + 'distribution_mode', + ] +) +SkipOn = collections.namedtuple( + 'SkipOn', [ + 'filesize_match', + 'lmt_ge', + 'md5_match', + ] +) +FileProperties = collections.namedtuple( + 'FileProperties', [ + 'attributes', + 'md5', + ] +) +Upload = collections.namedtuple( + 'Upload', [ + 'chunk_size_bytes', + 'delete_extraneous_destination', + 'mode', + 'one_shot_bytes', + 'overwrite', + 'recursive', + 'rename', + 'rsa_public_key', + 'store_file_properties', + 'strip_components', + 'vectored_io', + ] +) +Download = collections.namedtuple( + 'Download', [ + 'check_file_md5', + 'chunk_size_bytes', + 'delete_extraneous_destination', + 'mode', + 'overwrite', + 'recursive', + 'rename', + 'restore_file_attributes', + 'rsa_private_key', + ] +) +SyncCopy = collections.namedtuple( + 'SyncCopy', [ + 'chunk_size_bytes', + 'mode', + 'overwrite', + ] +) + + +class Concurrency(object): + """Concurrency Options""" + def __init__( + self, crypto_processes, md5_processes, disk_threads, + transfer_threads, is_download=None): + """Ctor for Concurrency Options + :param Concurrency self: this + :param int crypto_processes: number of crypto procs + :param int md5_processes: number of md5 procs + :param int disk_threads: number of disk threads + :param int transfer_threads: number of transfer threads + :param bool is_download: download hint + """ + self.crypto_processes = crypto_processes + self.md5_processes = md5_processes + self.disk_threads = disk_threads + self.transfer_threads = transfer_threads + # allow crypto processes to be zero (which will inline crypto + # routines with main process) + if self.crypto_processes is None or self.crypto_processes < 1: + self.crypto_processes = 0 + if self.md5_processes is None or self.md5_processes < 1: + self.md5_processes = multiprocessing.cpu_count() >> 1 + if 
self.md5_processes < 1: + self.md5_processes = 1 + auto_disk = False + if self.disk_threads is None or self.disk_threads < 1: + self.disk_threads = multiprocessing.cpu_count() << 1 + # cap maximum number of disk threads from cpu count to 64 + if self.disk_threads > 64: + self.disk_threads = 64 + # for downloads, cap disk threads to lower value + if is_download and self.disk_threads > 16: + self.disk_threads = 16 + auto_disk = True + if self.transfer_threads is None or self.transfer_threads < 1: + if auto_disk: + self.transfer_threads = self.disk_threads << 1 + else: + self.transfer_threads = multiprocessing.cpu_count() << 2 + # cap maximum number of threads from cpu count to 96 + if self.transfer_threads > 96: + self.transfer_threads = 96 + + +class General(object): + """General Options""" + def __init__( + self, concurrency, log_file=None, progress_bar=True, + resume_file=None, timeout_sec=None, verbose=False): + """Ctor for General Options + :param General self: this + :param Concurrency concurrency: concurrency options + :param bool progress_bar: progress bar + :param str log_file: log file + :param str resume_file: resume file + :param int timeout_sec: timeout in seconds + :param bool verbose: verbose output + """ + if concurrency is None: + raise ValueError('concurrency option is unspecified') + self.concurrency = concurrency + self.log_file = log_file + self.progress_bar = progress_bar + if blobxfer.util.is_not_empty(resume_file): + self.resume_file = pathlib.Path(resume_file) + else: + self.resume_file = None + self.timeout_sec = timeout_sec + self.verbose = verbose diff --git a/blobxfer/models/resume.py b/blobxfer/models/resume.py new file mode 100644 index 0000000..a0108cb --- /dev/null +++ b/blobxfer/models/resume.py @@ -0,0 +1,299 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
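
To summarize the auto-selection logic in `Concurrency` above, here is a sketch of the effective defaults when no explicit values are given (values only, not the class itself): crypto stays inline (0), MD5 gets half the cores (minimum 1), disk threads get twice the cores capped at 64 (16 for downloads), and transfer threads are twice the auto-selected disk thread count.

```python
import multiprocessing

# Sketch of the automatic Concurrency defaults described above.
def auto_concurrency(is_download=False):
    cpus = multiprocessing.cpu_count()
    crypto_processes = 0
    md5_processes = max(cpus >> 1, 1)
    disk_threads = min(cpus << 1, 64)
    if is_download:
        disk_threads = min(disk_threads, 16)
    transfer_threads = disk_threads << 1
    return crypto_processes, md5_processes, disk_threads, transfer_threads


print(auto_concurrency(is_download=True))  # (0, 4, 16, 32) on an 8-core machine
```
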
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +# non-stdlib imports +# local imports + + +class Download(object): + """Download resume object""" + def __init__( + self, final_path, length, chunk_size, next_integrity_chunk, + completed, md5): + # type: (Download, str, int, int, int, bool, str) -> None + """Ctor for Download + :param Download self: this + :param str final_path: final path + :param int length: total bytes + :param int chunk_size: chunk size in bytes + :param int next_integrity_chunk: next integrity chunk + :param bool completed: completed + :param str md5: md5 hex digest + """ + self._final_path = final_path + self._length = length + self._chunk_size = chunk_size + self._next_integrity_chunk = next_integrity_chunk + self._completed = completed + self._md5hexdigest = md5 if md5 is not None else None + + @property + def final_path(self): + # type: (Download) -> str + """Final path + :param Download self: this + :rtype: str + :return: final path + """ + return self._final_path + + @property + def length(self): + # type: (Download) -> int + """Content length + :param Download self: this + :rtype: int + :return: number of bytes + """ + return self._length + + @property + def chunk_size(self): + # type: (Download) -> int + """Chunk size + :param Download self: this + :rtype: int + :return: chunk size in bytes + """ + return self._chunk_size + + @property + def next_integrity_chunk(self): + # type: (Download) -> int + """Get Next integrity chunk + :param Download self: this + :rtype: int + :return: next integrity chunk + """ + return self._next_integrity_chunk + + @next_integrity_chunk.setter + def next_integrity_chunk(self, value): + # type: (Download) -> None + """Set Next integrity chunk + :param Download self: this + :param int value: next chunk num + """ + self._next_integrity_chunk = value + + @property + def completed(self): + # type: (Download) -> bool + """Get Completed + :param Download self: this + :rtype: bool + :return: if completed + """ + return self._completed + + @completed.setter + def completed(self, value): + # type: (Download) -> None + """Set Completed + :param Download self: this + :param bool value: completion value + """ + self._completed = value + + @property + def md5hexdigest(self): + # type: (Download) -> str + """Get md5 hex digest + :param Download self: this + :rtype: str + :return: md5 hex digest + """ + return self._md5hexdigest + + @md5hexdigest.setter + def md5hexdigest(self, value): + # type: (Download) -> None + """Set md5 hex digest value if value is not None + :param Download self: this + :param str value: md5 hex digest + """ + if value is None: + return + self._md5hexdigest = value + + def __repr__(self): + # type: (Download) -> str + """Return representation + :param Download self: this + :rtype: str + :return: representation string + """ + return ('Download').format( + self.final_path, self.length, self.chunk_size, + self.next_integrity_chunk, self.completed, + self.md5hexdigest, + ) + + +class Upload(object): + """Upload resume object""" + def __init__( + self, local_path, length, chunk_size, total_chunks, + completed_chunks, completed, md5): + # type: (Upload, str, int, int, int, int, bool, str) -> None + """Ctor for Upload + :param Upload self: this + :param str local_path: local path + :param int length: 
total bytes + :param int chunk_size: chunk size in bytes + :param int total_chunks: total chunks + :param int completed_chunks: completed chunks + :param bool completed: completed + :param str md5: md5 hex digest + """ + self._local_path = local_path + self._length = length + self._chunk_size = chunk_size + self._total_chunks = total_chunks + self._completed_chunks = completed_chunks + self._completed = completed + self._md5hexdigest = md5 if md5 is not None else None + + @property + def local_path(self): + # type: (Upload) -> str + """Local path + :param Upload self: this + :rtype: str + :return: local path + """ + return self._local_path + + @property + def length(self): + # type: (Upload) -> int + """Content length + :param Upload self: this + :rtype: int + :return: number of bytes + """ + return self._length + + @property + def chunk_size(self): + # type: (Upload) -> int + """Chunk size + :param Upload self: this + :rtype: int + :return: chunk size in bytes + """ + return self._chunk_size + + @property + def total_chunks(self): + # type: (Upload) -> int + """Get total number of chunks + :param Upload self: this + :rtype: int + :return: total chunks + """ + return self._total_chunks + + @property + def completed_chunks(self): + # type: (Upload) -> int + """Get Completed chunks + :param Upload self: this + :rtype: int + :return: completed chunks + """ + return self._completed_chunks + + @completed_chunks.setter + def completed_chunks(self, value): + # type: (Upload, int) -> None + """Set Completed chunks + :param Upload self: this + :param int value: completed chunks + """ + self._completed_chunks = value + + @property + def completed(self): + # type: (Upload) -> bool + """Get Completed + :param Upload self: this + :rtype: bool + :return: if completed + """ + return self._completed + + @completed.setter + def completed(self, value): + # type: (Upload) -> None + """Set Completed + :param Upload self: this + :param bool value: completion value + """ + self._completed = value + + @property + def md5hexdigest(self): + # type: (Upload) -> str + """Get md5 hex digest + :param Upload self: this + :rtype: str + :return: md5 hex digest + """ + return self._md5hexdigest + + @md5hexdigest.setter + def md5hexdigest(self, value): + # type: (Upload) -> None + """Set md5 hex digest value if value is not None + :param Upload self: this + :param str value: md5 hex digest + """ + if value is None: + return + self._md5hexdigest = value + + def __repr__(self): + # type: (Upload) -> str + """Return representation + :param Upload self: this + :rtype: str + :return: representation string + """ + return ('Upload').format( + self.local_path, self.length, self.chunk_size, + self.total_chunks, self.completed_chunks, self.completed, + self.md5hexdigest, + ) diff --git a/blobxfer/models/upload.py b/blobxfer/models/upload.py new file mode 100644 index 0000000..d5151b4 --- /dev/null +++ b/blobxfer/models/upload.py @@ -0,0 +1,915 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
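
The `completed_chunks` value stored in the Upload resume record above is a packed bit field, one bit per chunk, persisted as an integer; the upload descriptor (next file in this diff) materializes it with the `bitstring` library and resumes at the first zero bit. A small illustration with invented values:

```python
import bitstring  # third-party dependency used by the upload path

# One bit per chunk: set bits are completed, the first zero bit is where to resume.
total_chunks = 8
completed = bitstring.BitArray(length=total_chunks)
completed.set(True, [0, 1, 2])          # chunks 0-2 finished
stored = completed.int                   # integer form kept in the resume db

restored = bitstring.BitArray(length=total_chunks)
restored.int = stored
next_chunk = restored.find('0b0')[0]
print(next_chunk)                        # 3, the first unfinished chunk
```
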
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import collections +import enum +import json +import logging +import math +import os +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +import threading +# non-stdlib imports +import bitstring +# local imports +import blobxfer.models +import blobxfer.models.azure +import blobxfer.models.crypto +import blobxfer.models.metadata +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) +# global defines +_MAX_BLOCK_BLOB_ONESHOT_BYTES = 268435456 +_MAX_BLOCK_BLOB_CHUNKSIZE_BYTES = 104857600 +_MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304 +_MAX_NUM_CHUNKS = 50000 +_DEFAULT_AUTO_CHUNKSIZE_BYTES = 16777216 +_MAX_MD5_CACHE_RESUME_ENTRIES = 25 + + +# named tuples +Offsets = collections.namedtuple( + 'Offsets', [ + 'chunk_num', + 'num_bytes', + 'range_end', + 'range_start', + 'pad', + ] +) +LocalPathView = collections.namedtuple( + 'LocalPathView', [ + 'fd_end', + 'fd_start', + 'mode', + 'next', + 'slice_num', + 'total_slices', + ] +) + + +class VectoredIoDistributionMode(enum.Enum): + Disabled = 'disabled' + Stripe = 'stripe' + Replica = 'replica' + + def __str__(self): + return self.value + + +class LocalPath(object): + """Local Path""" + + def __init__(self, parent_path, relative_path, use_stdin=False, view=None): + # type: (LocalPath, pathlib.Path, pathlib.Path, bool, + # LocalPathView) -> None + """Ctor for LocalPath + :param LocalPath self: this + :param pathlib.Path parent_path: parent path + :param pathlib.Path relative_path: relative path + :param bool use_stdin: use stdin + :param LocalPathView view: local path view + """ + self.parent_path = parent_path + self.relative_path = relative_path + self.use_stdin = use_stdin + # populate properties + if self.use_stdin: + # create dummy stat object + self._stat = type('stat', (object,), {}) + self._stat.st_size = 0 + self._stat.st_mtime = 0 + self._stat.st_mode = 0 + self._stat.st_uid = 0 + self._stat.st_gid = 0 + else: + self._stat = self.absolute_path.stat() + if view is None: + self.view = LocalPathView( + fd_start=0, + fd_end=self._stat.st_size, + slice_num=0, + mode=VectoredIoDistributionMode.Disabled, + total_slices=1, + next=None, + ) + else: 
+ self.view = view + self._size = self.view.fd_end - self.view.fd_start + + @property + def absolute_path(self): + # type: (LocalPath) -> pathlib.Path + """Absolute path + :param LocalPath self: this + :rtype: pathlib.Path + :return: absolute path + """ + return self.parent_path / self.relative_path + + @property + def size(self): + # type: (LocalPath) -> int + """Size of view + :param LocalPath self: this + :rtype: int + :return: size of view portion of the file + """ + return self._size + + @property + def total_size(self): + # type: (LocalPath) -> int + """Total Size of file + :param LocalPath self: this + :rtype: int + :return: total size of file (non-view) + """ + return self._stat.st_size + + @property + def lmt(self): + # type: (LocalPath) -> int + """mtime of file + :param LocalPath self: this + :rtype: int + :return: mtime of file + """ + return self._stat.st_mtime + + @property + def mode(self): + # type: (LocalPath) -> str + """Octal file mode + :param LocalPath self: this + :rtype: str + :return: octal file mode + """ + return str(oct(self._stat.st_mode)) + + @property + def uid(self): + # type: (LocalPath) -> int + """Uid of file + :param LocalPath self: this + :rtype: int + :return: uid of file + """ + return self._stat.st_uid + + @property + def gid(self): + # type: (LocalPath) -> int + """Gid of file + :param LocalPath self: this + :rtype: int + :return: gid of file + """ + return self._stat.st_gid + + +class LocalSourcePath(blobxfer.models._BaseSourcePaths): + """Local Source Path""" + + def can_rename(self): + # type: (LocalSourcePaths) -> bool + """Check if source can be renamed + :param LocalSourcePath self: this + :rtype: bool + :return: if rename possible + """ + return len(self._paths) == 1 and self._paths[0].is_file() + + @staticmethod + def is_stdin(path): + # type: (str) -> bool + """Check if path is stdin + :param str path: path to check + :rtype: bool + :return: if path is stdin + """ + if path == '-' or path == '/dev/stdin': + return True + return False + + def files(self): + # type: (LocalSourcePaths) -> LocalPath + """Generator for files in paths + :param LocalSourcePath self: this + :rtype: LocalPath + :return: LocalPath + """ + for _path in self._paths: + _ppath = os.path.expandvars(os.path.expanduser(str(_path))) + # check of path is stdin + if blobxfer.models.upload.LocalSourcePath.is_stdin(_ppath): + yield LocalPath( + parent_path=pathlib.Path(), + relative_path=pathlib.Path('stdin'), + use_stdin=True, + ) + continue + # resolve path + _expath = pathlib.Path(_ppath).resolve() + # check if path is a single file + tmp = pathlib.Path(_ppath) + if tmp.is_file(): + if self._inclusion_check(tmp.name): + yield LocalPath( + parent_path=tmp.parent, + relative_path=pathlib.Path(tmp.name), + use_stdin=False, + ) + continue + del tmp + for entry in blobxfer.util.scantree(_ppath): + _rpath = pathlib.Path(entry.path).relative_to(_ppath) + if not self._inclusion_check(_rpath): + continue + yield LocalPath( + parent_path=_expath, + relative_path=_rpath, + use_stdin=False, + ) + + +class Specification(object): + """Upload Specification""" + def __init__( + self, upload_options, skip_on_options, local_source_path): + # type: (Specification, blobxfer.models.options.Upload, + # blobxfer.models.options.SkipOn, LocalSourcePath) -> None + """Ctor for Specification + :param UploadSpecification self: this + :param blobxfer.models.options.Upload upload_options: upload options + :param blobxfer.models.options.SkipOn skip_on_options: skip on options + :param LocalSourcePath 
local_source_path: local source path + """ + self.options = upload_options + self.skip_on = skip_on_options + self.destinations = [] + self.sources = local_source_path + # validate options + if self.options.rename: + # ensure only one internal path is present + if len(self.sources.paths) > 1: + raise ValueError( + 'cannot add more than one internal source path if rename ' + 'is specified') + # check if internal source path is directory and rename is enabled + if self.sources.paths[0].is_dir(): + raise ValueError( + 'cannot rename a directory of files to upload') + if self.options.chunk_size_bytes < 0: + raise ValueError('chunk size cannot be negative') + if self.options.chunk_size_bytes > _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: + raise ValueError( + ('chunk size value of {} exceeds maximum allowable ' + 'of {}').format( + self.options.chunk_size_bytes, + _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES)) + if self.options.one_shot_bytes < 0: + raise ValueError('one shot bytes value must be at least 0') + if self.options.one_shot_bytes > _MAX_BLOCK_BLOB_ONESHOT_BYTES: + raise ValueError( + ('one shot bytes value of {} exceeds maximum allowable ' + 'of {}').format( + self.options.chunk_size_bytes, + _MAX_BLOCK_BLOB_ONESHOT_BYTES)) + + def add_azure_destination_path(self, dest): + # type: (Specification, + # blobxfer.operations.azure.DestinationPath) -> None + """Add a remote Azure Destination path + :param UploadSpecification self: this + :param blobxfer.operations.azure.DestinationPath dest: + Remote destination path + """ + self.destinations.append(dest) + + +class Descriptor(object): + """Upload Descriptor""" + + _AES_BLOCKSIZE = blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES + + def __init__(self, lpath, ase, uid, options, resume_mgr): + # type: (Descriptior, LocalPath, + # blobxfer.models.azure.StorageEntity, str, + # blobxfer.models.options.Upload, + # blobxfer.operations.resume.UploadResumeManager) -> None + """Ctor for Descriptor + :param Descriptor self: this + :param LocalPath lpath: local path + :param blobxfer.models.azure.StorageEntity ase: Azure Storage Entity + :param str uid: unique id + :param blobxfer.models.options.Upload options: download options + :param blobxfer.operations.resume.UploadResumeManager resume_mgr: + upload resume manager + """ + self.local_path = lpath + self.unique_id = uid + self._offset = 0 + self._chunk_num = 0 + self._next_integrity_chunk = 0 + self._finalized = False + self._meta_lock = threading.Lock() + self._hasher_lock = threading.Lock() + self._resume_mgr = resume_mgr + self._ase = ase + self._store_file_attr = options.store_file_properties.attributes + self.current_iv = None + self._initialize_encryption(options) + # calculate the total number of ops required for transfer + self._compute_remote_size() + self._adjust_chunk_size(options) + self._total_chunks = self._compute_total_chunks(self._chunk_size) + self._outstanding_ops = self._total_chunks + if blobxfer.util.is_not_empty(self._ase.replica_targets): + self._outstanding_ops *= len(self._ase.replica_targets) + 1 + if self._resume_mgr: + self._completed_chunks = bitstring.BitArray( + length=self._total_chunks) + self._md5_cache = {} + # initialize integrity checkers + self.hmac = None + self.md5 = None + self._initialize_integrity_checkers(options) + + @property + def entity(self): + # type: (Descriptor) -> blobxfer.models.azure.StorageEntity + """Get linked blobxfer.models.azure.StorageEntity + :param Descriptor self: this + :rtype: blobxfer.models.azure.StorageEntity + :return: blobxfer.models.azure.StorageEntity 
+ """ + return self._ase + + @property + def must_compute_md5(self): + # type: (Descriptor) -> bool + """Check if MD5 must be computed + :param Descriptor self: this + :rtype: bool + :return: if MD5 must be computed + """ + return self.md5 is not None + + @property + def all_operations_completed(self): + # type: (Descriptor) -> bool + """All operations are completed + :param Descriptor self: this + :rtype: bool + :return: if all operations completed + """ + with self._meta_lock: + return self._outstanding_ops == 0 + + @property + def last_block_num(self): + # type: (Descriptor) -> bool + """Last used block number for block id, should only be called for + finalize operation + :param Descriptor self: this + :rtype: int + :return: block number + """ + with self._meta_lock: + return self._chunk_num - 1 + + @property + def is_resumable(self): + # type: (Descriptor) -> bool + """Upload is resume capable + :param Descriptor self: this + :rtype: bool + :return: if resumable + """ + return (self._resume_mgr is not None and self.hmac is None and + not self.remote_is_append_blob) + + @property + def remote_is_file(self): + # type: (Descriptor) -> bool + """Remote destination is an Azure File + :param Descriptor self: this + :rtype: bool + :return: remote is an Azure File + """ + return self.entity.mode == blobxfer.models.azure.StorageModes.File + + @property + def remote_is_page_blob(self): + # type: (Descriptor) -> bool + """Remote destination is an Azure Page Blob + :param Descriptor self: this + :rtype: bool + :return: remote is an Azure Page Blob + """ + return self.entity.mode == blobxfer.models.azure.StorageModes.Page + + @property + def remote_is_append_blob(self): + # type: (Descriptor) -> bool + """Remote destination is an Azure Append Blob + :param Descriptor self: this + :rtype: bool + :return: remote is an Azure Append Blob + """ + return self.entity.mode == blobxfer.models.azure.StorageModes.Append + + @property + def is_one_shot_block_blob(self): + # type: (Descriptor) -> bool + """Is one shot block blob + :param Descriptor self: this + :rtype: bool + :return: if upload is a one-shot block blob + """ + return (self._ase.mode == blobxfer.models.azure.StorageModes.Block and + self._total_chunks == 1) + + @property + def requires_put_block_list(self): + # type: (Descriptor) -> bool + """Requires a put block list operation to finalize + :param Descriptor self: this + :rtype: bool + :return: if finalize requires a put block list + """ + return (self._ase.mode == blobxfer.models.azure.StorageModes.Block and + self._total_chunks > 1) + + @property + def requires_non_encrypted_md5_put(self): + # type: (Descriptor) -> bool + """Requires a set file properties for md5 to finalize + :param Descriptor self: this + :rtype: bool + :return: if finalize requires a put file properties + """ + return (not self.entity.is_encrypted and self.must_compute_md5 and + not self.remote_is_append_blob) + + @property + def requires_set_file_properties_md5(self): + # type: (Descriptor) -> bool + """Requires a set file properties for md5 to finalize + :param Descriptor self: this + :rtype: bool + :return: if finalize requires a put file properties + """ + return (not self.entity.is_encrypted and self.must_compute_md5 and + self.remote_is_file) + + def complete_offset_upload(self, chunk_num): + # type: (Descriptor, int) -> None + """Complete the upload for the offset + :param Descriptor self: this + :param int chunk_num: chunk num completed + """ + with self._meta_lock: + self._outstanding_ops -= 1 + # save resume 
state + if self.is_resumable: + self._completed_chunks.set(True, chunk_num) + completed = self._outstanding_ops == 0 + if not completed and self.must_compute_md5: + last_consecutive = ( + self._completed_chunks.find('0b0')[0] - 1 + ) + md5digest = self._md5_cache[last_consecutive] + else: + md5digest = None + if completed: + last_consecutive = None + self._md5_cache.clear() + self._resume_mgr.add_or_update_record( + self.local_path.absolute_path, self._ase, self._chunk_size, + self._total_chunks, self._completed_chunks.int, completed, + md5digest, + ) + # prune md5 cache + if len(self._md5_cache) > _MAX_MD5_CACHE_RESUME_ENTRIES: + mkeys = sorted(list(self._md5_cache.keys())) + for key in mkeys: + if key >= last_consecutive: + break + self._md5_cache.pop(key) + + def hmac_data(self, data): + # type: (Descriptor, bytes) -> None + """Send data through hmac hasher + :param Descriptor self: this + :param bytes data: data + """ + with self._hasher_lock: + self.hmac.update(data) + + def _initialize_encryption(self, options): + # type: (Descriptor, blobxfer.models.options.Upload) -> None + """Download is resume capable + :param Descriptor self: this + :param blobxfer.models.options.Upload options: upload options + """ + if (options.rsa_public_key is not None and self.local_path.size > 0 and + (self._ase.mode == blobxfer.models.azure.StorageModes.Block or + self._ase.mode == blobxfer.models.azure.StorageModes.File)): + em = blobxfer.models.crypto.EncryptionMetadata() + em.create_new_metadata(options.rsa_public_key) + self.current_iv = em.content_encryption_iv + self._ase.encryption_metadata = em + + def _compute_remote_size(self): + # type: (Descriptor, int) -> None + """Compute total remote file size + :param Descriptor self: this + :rtype: int + :return: remote file size + """ + size = self.local_path.size + if size > 0: + if self._ase.is_encrypted: + # cipher_len_without_iv = (clear_len / aes_bs + 1) * aes_bs + allocatesize = (size // self._AES_BLOCKSIZE + 1) * \ + self._AES_BLOCKSIZE + else: + allocatesize = size + if allocatesize < 0: + allocatesize = 0 + else: + allocatesize = 0 + self._ase.size = allocatesize + if blobxfer.util.is_not_empty(self._ase.replica_targets): + for rt in self._ase.replica_targets: + rt.size = allocatesize + logger.debug('remote size for {} is {} bytes'.format( + self._ase.path, self._ase.size)) + + def _adjust_chunk_size(self, options): + # type: (Descriptor, blobxfer.models.options.Upload) -> None + """Adjust chunk size for entity mode + :param Descriptor self: this + :param blobxfer.models.options.Upload options: upload options + """ + chunk_size = options.chunk_size_bytes + # auto-select chunk size + if chunk_size == 0: + if self._ase.mode != blobxfer.models.azure.StorageModes.Block: + chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES + else: + if self._ase.size == 0: + chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES + else: + chunk_size = _DEFAULT_AUTO_CHUNKSIZE_BYTES + while chunk_size < _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: + chunks = int(math.ceil(self._ase.size / chunk_size)) + if chunks <= _MAX_NUM_CHUNKS: + break + chunk_size = chunk_size << 1 + logger.debug( + 'auto-selected chunk size of {} for {}'.format( + chunk_size, self.local_path.absolute_path)) + if self.local_path.use_stdin: + self._chunk_size = max( + (chunk_size, _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES) + ) + else: + self._chunk_size = min((chunk_size, self._ase.size)) + # ensure chunk sizes are compatible with mode + if self._ase.mode == blobxfer.models.azure.StorageModes.Append: + if self._chunk_size > 
_MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES + logger.debug( + ('adjusting chunk size to {} for append blob ' + 'from {}').format( + self._chunk_size, self.local_path.absolute_path)) + elif self._ase.mode == blobxfer.models.azure.StorageModes.Block: + if (not self.local_path.use_stdin and + self._ase.size <= options.one_shot_bytes): + self._chunk_size = min( + (self._ase.size, options.one_shot_bytes) + ) + else: + if self._chunk_size > _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES + logger.debug( + ('adjusting chunk size to {} for block blob ' + 'from {}').format( + self._chunk_size, self.local_path.absolute_path)) + elif self._ase.mode == blobxfer.models.azure.StorageModes.File: + if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES + logger.debug( + 'adjusting chunk size to {} for file from {}'.format( + self._chunk_size, self.local_path.absolute_path)) + elif self._ase.mode == blobxfer.models.azure.StorageModes.Page: + if self._chunk_size > _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + self._chunk_size = _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES + logger.debug( + 'adjusting chunk size to {} for page blob from {}'.format( + self._chunk_size, self.local_path.absolute_path)) + + def _compute_total_chunks(self, chunk_size): + # type: (Descriptor, int) -> int + """Compute total number of chunks for entity + :param Descriptor self: this + :param int chunk_size: chunk size + :rtype: int + :return: num chunks + """ + try: + chunks = int(math.ceil(self._ase.size / chunk_size)) + except ZeroDivisionError: + chunks = 1 + if self.local_path.use_stdin and chunks == 0: + chunks = 1 + if chunks > 50000: + max_vector = False + if self._ase.mode == blobxfer.models.azure.StorageModes.Block: + if self._chunk_size == _MAX_BLOCK_BLOB_CHUNKSIZE_BYTES: + max_vector = True + elif self._chunk_size == _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES: + max_vector = True + if max_vector: + raise RuntimeError( + ('number of chunks {} exceeds maximum permissible ' + 'limit and chunk size is set at the maximum value ' + 'for {}. 
Please try using stripe mode ' + 'vectorization to overcome this limitation').format( + chunks, self.local_path.absolute_path)) + else: + raise RuntimeError( + ('number of chunks {} exceeds maximum permissible ' + 'limit for {}, please adjust chunk size higher or ' + 'set to -1 for automatic chunk size selection').format( + chunks, self.local_path.absolute_path)) + return chunks + + def _initialize_integrity_checkers(self, options): + # type: (Descriptor, blobxfer.models.options.Upload) -> None + """Initialize file integrity checkers + :param Descriptor self: this + :param blobxfer.models.options.Upload options: upload options + """ + if self._ase.is_encrypted: + # ensure symmetric key exists + if blobxfer.util.is_none_or_empty( + self._ase.encryption_metadata.symmetric_key): + raise RuntimeError( + ('symmetric key is invalid: provide RSA private key ' + 'or metadata corrupt for {}').format( + self.local_path.absolute_path)) + self.hmac = self._ase.encryption_metadata.initialize_hmac() + # both hmac and md5 can be enabled + if (options.store_file_properties.md5 and + not self.remote_is_append_blob): + self.md5 = blobxfer.util.new_md5_hasher() + + def _resume(self): + if self._resume_mgr is None or self._offset > 0: + return None + # check if path exists in resume db + rr = self._resume_mgr.get_record(self._ase) + if rr is None: + logger.debug('no resume record for {}'.format(self._ase.path)) + return None + # ensure lengths are the same + if rr.length != self._ase.size: + logger.warning('resume length mismatch {} -> {}'.format( + rr.length, self._ase.size)) + return None + # set offsets if completed + if rr.completed: + with self._meta_lock: + logger.debug('{} upload already completed'.format( + self._ase.path)) + self._offset = rr.total_chunks * rr.chunk_size + self._chunk_num = rr.total_chunks + self._chunk_size = rr.chunk_size + self._total_chunks = rr.total_chunks + self._completed_chunks.int = rr.completed_chunks + self._outstanding_ops = 0 + return self._ase.size + # encrypted files are not resumable due to hmac requirement + if self._ase.is_encrypted: + logger.debug('cannot resume encrypted entity {}'.format( + self._ase.path)) + return None + # check if path exists + if not pathlib.Path(rr.local_path).exists(): + logger.warning('resume from local path {} does not exist'.format( + rr.local_path)) + return None + # re-hash from 0 to offset if needed + _cc = bitstring.BitArray(length=rr.total_chunks) + _cc.int = rr.completed_chunks + curr_chunk = _cc.find('0b0')[0] + del _cc + _fd_offset = 0 + _end_offset = min((curr_chunk * rr.chunk_size, rr.length)) + if self.md5 is not None and curr_chunk > 0: + _blocksize = blobxfer.util.MEGABYTE << 2 + logger.debug( + 'integrity checking existing file {} offset {} -> {}'.format( + self._ase.path, + self.local_path.view.fd_start, + self.local_path.view.fd_start + _end_offset) + ) + with self._hasher_lock: + with self.local_path.absolute_path.open('rb') as filedesc: + filedesc.seek(self.local_path.view.fd_start, 0) + while _fd_offset < _end_offset: + if (_fd_offset + _blocksize) > _end_offset: + _blocksize = _end_offset - _fd_offset + _buf = filedesc.read(_blocksize) + self.md5.update(_buf) + _fd_offset += _blocksize + del _blocksize + # compare hashes + hexdigest = self.md5.hexdigest() + if rr.md5hexdigest != hexdigest: + logger.warning( + 'MD5 mismatch resume={} computed={} for {}'.format( + rr.md5hexdigest, hexdigest, self._ase.path)) + # reset hasher + self.md5 = blobxfer.util.new_md5_hasher() + return None + # set values from resume + with 
self._meta_lock: + self._offset = _end_offset + self._chunk_num = curr_chunk + self._chunk_size = rr.chunk_size + self._total_chunks = rr.total_chunks + self._completed_chunks = bitstring.BitArray(length=rr.total_chunks) + self._completed_chunks.set(True, range(0, curr_chunk + 1)) + self._outstanding_ops = rr.total_chunks - curr_chunk + logger.debug( + ('resuming file {} from byte={} chunk={} chunk_size={} ' + 'total_chunks={} outstanding_ops={}').format( + self._ase.path, self._offset, self._chunk_num, + self._chunk_size, self._total_chunks, + self._outstanding_ops)) + return _end_offset + + def next_offsets(self): + # type: (Descriptor) -> Offsets + """Retrieve the next offsets + :param Descriptor self: this + :rtype: Offsets + :return: upload offsets + """ + resume_bytes = self._resume() + with self._meta_lock: + if self._chunk_num >= self._total_chunks: + return None, resume_bytes + if self._offset + self._chunk_size > self._ase.size: + num_bytes = self._ase.size - self._offset + else: + num_bytes = self._chunk_size + chunk_num = self._chunk_num + range_start = self._offset + range_end = self._offset + num_bytes - 1 + self._offset += num_bytes + self._chunk_num += 1 + if self._ase.is_encrypted and self._offset >= self._ase.size: + pad = True + else: + pad = False + return Offsets( + chunk_num=chunk_num, + num_bytes=num_bytes, + range_start=range_start, + range_end=range_end, + pad=pad, + ), resume_bytes + + def read_data(self, offsets): + # type: (Descriptor, Offsets) -> Tuple[bytes, Offsets] + """Read data from file + :param Descriptor self: this + :param Offsets offsets: offsets + :rtype: tuple + :return: (file data bytes, new Offsets if stdin) + """ + newoffset = None + if not self.local_path.use_stdin: + if offsets.num_bytes == 0: + return None, None + # compute start from view + start = self.local_path.view.fd_start + offsets.range_start + # encrypted offsets will read past the end of the file due + # to padding, but will be accounted for after encryption+padding + with self.local_path.absolute_path.open('rb') as fd: + fd.seek(start, 0) + data = fd.read(offsets.num_bytes) + else: + data = blobxfer.STDIN.read(self._chunk_size) + if not data: + with self._meta_lock: + self._total_chunks -= 1 + self._chunk_num -= 1 + self._outstanding_ops -= 1 + else: + num_bytes = len(data) + with self._meta_lock: + newoffset = Offsets( + chunk_num=self._chunk_num - 1, + num_bytes=num_bytes, + range_start=self._offset, + range_end=self._offset + num_bytes - 1, + pad=False, + ) + self._total_chunks += 1 + self._outstanding_ops += 1 + self._offset += num_bytes + self._ase.size += num_bytes + if self.must_compute_md5 and data: + with self._hasher_lock: + self.md5.update(data) + if self.is_resumable: + self._md5_cache[self._chunk_num - 1] = self.md5.hexdigest() + return data, newoffset + + def generate_metadata(self): + # type: (Descriptor) -> dict + """Generate metadata for descriptor + :param Descriptor self: this + :rtype: dict or None + :return: kv metadata dict + """ + genmeta = {} + encmeta = {} + # generate encryption metadata + if self._ase.is_encrypted: + if self.must_compute_md5: + md5digest = blobxfer.util.base64_encode_as_string( + self.md5.digest()) + else: + md5digest = None + if self.hmac is not None: + hmacdigest = blobxfer.util.base64_encode_as_string( + self.hmac.digest()) + else: + hmacdigest = None + encmeta = self._ase.encryption_metadata.convert_to_json_with_mac( + md5digest, hmacdigest) + # generate file attribute metadata + if self._store_file_attr and not 
self.local_path.use_stdin: + merged = blobxfer.models.metadata.generate_fileattr_metadata( + self.local_path, genmeta) + if merged is not None: + genmeta = merged + # generate vectored io metadata + if self.local_path.view.mode == VectoredIoDistributionMode.Stripe: + merged = blobxfer.models.metadata.\ + generate_vectored_io_stripe_metadata(self.local_path, genmeta) + if merged is not None: + genmeta = merged + if len(encmeta) > 0: + metadata = encmeta + else: + metadata = {} + if len(genmeta) > 0: + metadata[blobxfer.models.metadata.JSON_KEY_BLOBXFER_METADATA] = \ + json.dumps(genmeta, ensure_ascii=False, sort_keys=True) + if len(metadata) == 0: + return None + return metadata diff --git a/blobxfer/operations/__init__.py b/blobxfer/operations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/blobxfer/operations/azure/__init__.py b/blobxfer/operations/azure/__init__.py new file mode 100644 index 0000000..67d531f --- /dev/null +++ b/blobxfer/operations/azure/__init__.py @@ -0,0 +1,453 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
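The chunk accounting in the upload descriptor above reduces to a ceiling division, with a floor of one chunk for stdin sources and a hard cap at the 50,000 block limit checked for block blobs. A minimal standalone sketch of that calculation, using hypothetical names rather than the descriptor's internals:

import math

_MAX_NUM_CHUNKS = 50000  # cap checked by the descriptor for block blobs

def compute_total_chunks(size, chunk_size, use_stdin=False):
    """Ceiling division with a floor of one chunk for stdin sources."""
    try:
        chunks = int(math.ceil(size / chunk_size))
    except ZeroDivisionError:
        chunks = 1
    if use_stdin and chunks == 0:
        chunks = 1
    if chunks > _MAX_NUM_CHUNKS:
        raise RuntimeError(
            'number of chunks {} exceeds {}'.format(chunks, _MAX_NUM_CHUNKS))
    return chunks

# a 100 MiB file split into 4 MiB chunks yields 25 chunks
assert compute_total_chunks(100 * 1048576, 4 * 1048576) == 25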
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +# non-stdlib imports +import requests +# local imports +import blobxfer.models +import blobxfer.models.metadata +import blobxfer.operations.azure.blob.append +import blobxfer.operations.azure.blob.block +import blobxfer.operations.azure.blob.page +import blobxfer.operations.azure.file + + +class StorageCredentials(object): + """Azure Storage Credentials""" + def __init__(self, general_options): + # type: (StorageCredentials, blobxfer.models.options.General) -> None + """Ctor for StorageCredentials + :param StorageCredentials self: this + :param blobxfer.models.options.General: general options + """ + self._storage_accounts = {} + self._general_options = general_options + + def add_storage_account(self, name, key, endpoint): + # type: (StorageCredentials, str, str, str) -> None + """Add a storage account + :param StorageCredentials self: this + :param str name: name of storage account to store + :param str key: storage key or sas + :param str endpoint: endpoint + """ + if name in self._storage_accounts: + raise ValueError( + '{} already exists in storage accounts'.format(name)) + self._storage_accounts[name] = StorageAccount( + name, key, endpoint, + self._general_options.concurrency.transfer_threads + ) + + def get_storage_account(self, name): + # type: (StorageCredentials, str) -> StorageAccount + """Get storage account details + :param StorageCredentials self: this + :param str name: name of storage account to retrieve + :rtype: StorageAccount + :return: storage account details + """ + return self._storage_accounts[name] + + +class StorageAccount(object): + """Azure Storage Account""" + def __init__(self, name, key, endpoint, transfer_threads): + # type: (StorageAccount, str, str, str, int) -> None + """Ctor for StorageAccount + :param str name: name of storage account + :param str key: storage key or sas + :param str endpoint: endpoint + :param int transfer_threads: number of transfer threads + """ + self._append_blob_client = None + self._block_blob_client = None + self._file_client = None + self._page_blob_client = None + self.name = name + self.key = key + self.endpoint = endpoint + self.is_sas = StorageAccount._key_is_sas(self.key) + self.create_containers = self._container_creation_allowed() + # normalize sas keys + if self.is_sas and self.key.startswith('?'): + self.key = self.key[1:] + # create requests session for connection pooling + self.session = requests.Session() + self.session.mount( + 'https://', + requests.adapters.HTTPAdapter( + pool_connections=transfer_threads, + pool_maxsize=transfer_threads << 1, + ) + ) + self._create_clients() + + @staticmethod + def _key_is_sas(key): + # type: (str) -> bool + """Determine if key is a sas + :param str key: key to parse + :rtype: bool + :return: if key is a sas + """ + # keys starting with ? are sas keys as ? is not in the base-64 + # character range + if key.startswith('?'): + return True + else: + # & is not in the base-64 character range, so technically + # the presence of this character means the key is a sas. however, + # perform a stronger check for the sig= parameter. 
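+            # for example, an account SAS key is a query string such as
+            #   sv=2017-04-17&ss=bf&srt=sco&sp=rw&sig=...
+            # which splits on '&' into several parts, one of which starts
+            # with 'sig=', whereas a base64 storage account key contains
+            # neither '?' nor '&' and yields only a single part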
+ tmp = key.split('&') + if len(tmp) == 1: + return False + elif any(x.startswith('sig=') for x in tmp): + return True + return False + + def _container_creation_allowed(self): + # # type: (StorageAccount) -> bool + """Check if container creation is allowed + :param StorageAccount self: this + :rtype: bool + :return: if container creation is allowed + """ + if self.is_sas: + # search for account sas "c" resource + sasparts = self.key.split('&') + for part in sasparts: + tmp = part.split('=') + if tmp[0] == 'srt': + if 'c' in tmp[1]: + return True + else: + # storage account key always allows container creation + return True + return False + + def _create_clients(self): + # type: (StorageAccount) -> None + """Create Azure Storage clients + :param StorageAccount self: this + """ + self._append_blob_client = \ + blobxfer.operations.azure.blob.append.create_client(self) + self._block_blob_client = \ + blobxfer.operations.azure.blob.block.create_client(self) + self._file_client = blobxfer.operations.azure.file.create_client(self) + self._page_blob_client = \ + blobxfer.operations.azure.blob.page.create_client(self) + + @property + def append_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.AppendBlobService + """Get append blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.AppendBlobService + :return: append blob client + """ + return self._append_blob_client + + @property + def block_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.BlockBlobService + """Get block blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.BlockBlobService + :return: block blob client + """ + return self._block_blob_client + + @property + def file_client(self): + # type: (StorageAccount) -> azure.storage.file.FileService + """Get file client + :param StorageAccount self: this + :rtype: azure.storage.file.FileService + :return: file client + """ + return self._file_client + + @property + def page_blob_client(self): + # type: (StorageAccount) -> azure.storage.blob.PageBlobService + """Get page blob client + :param StorageAccount self: this + :rtype: azure.storage.blob.PageBlobService + :return: page blob client + """ + return self._page_blob_client + + +class SourcePath(blobxfer.models._BaseSourcePaths): + """Azure Source Path""" + def __init__(self): + # type: (SourcePath) -> None + """Ctor for SourcePath + :param SourcePath self: this + """ + super(SourcePath, self).__init__() + self._path_map = {} + + def add_path_with_storage_account(self, remote_path, storage_account): + # type: (SourcePath, str, str) -> None + """Add a path with an associated storage account + :param SourcePath self: this + :param str remote_path: remote path + :param str storage_account: storage account to associate with path + """ + if len(self._path_map) >= 1: + raise RuntimeError( + 'cannot add multiple remote paths to SourcePath objects') + rpath = blobxfer.util.normalize_azure_path(remote_path) + self.add_path(rpath) + self._path_map[rpath] = storage_account + + def lookup_storage_account(self, remote_path): + # type: (SourcePath, str) -> str + """Lookup the storage account associated with the remote path + :param SourcePath self: this + :param str remote_path: remote path + :rtype: str + :return: storage account associated with path + """ + return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] + + def files(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # 
blobxfer.models.options.General) -> StorageEntity + """Generator of Azure remote files or blobs + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + if options.mode == blobxfer.models.azure.StorageModes.File: + for file in self._populate_from_list_files( + creds, options, general_options): + yield file + else: + for blob in self._populate_from_list_blobs( + creds, options, general_options): + yield blob + + def _convert_to_storage_entity_with_encryption_metadata( + self, options, sa, entity, vio, is_file, container, dir): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, StorageAccount, object, + # blobxfer.models.metadata.VectoredStripe, bool, str, + # str) -> StorageEntity + """Convert entity into StorageEntity with encryption metadata if avail + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param StorageAccount sa: storage account + :param object entity: Storage File or Blob object + :param blobxfer.models.metadata.VectoredStripe vio: Vectored stripe + :param bool is_file: is a file object + :param str container: container + :param str dir: Azure File directory structure + :rtype: StorageEntity + :return: Azure storage entity object + """ + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(entity.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json( + entity.metadata, entity.name, options.rsa_private_key) + else: + ed = None + ase = blobxfer.models.azure.StorageEntity(container, ed) + if is_file: + ase.populate_from_file(sa, entity, dir, vio) + else: + ase.populate_from_blob(sa, entity, vio) + return ase + + def _handle_vectored_io_stripe( + self, creds, options, general_options, sa, entity, is_file, + container, dir=None): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General, StorageAccount, object, + # bool, str, str) -> StorageEntity + """Handle Vectored IO stripe entries + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :param StorageAccount sa: storage account + :param object entity: Storage File or Blob object + :param bool is_file: is a file object + :param str container: container + :param str dir: Azure File directory structure + :rtype: StorageEntity + :return: Azure storage entity object + """ + vio = blobxfer.models.metadata.vectored_io_from_metadata( + entity.metadata) + if not isinstance(vio, blobxfer.models.metadata.VectoredStripe): + ase = self._convert_to_storage_entity_with_encryption_metadata( + options, sa, entity, None, is_file, container, dir) + yield ase + return + # if this slice is not the first, ignore. the reason for this is + # 1. Ensures direct get on a slice does nothing unless the + # zero-th blob is retrieved/accessed (eliminates partial data + # download), which will reconstruct all of the stripes via next + # pointers + # 2. 
Data is not retrieved multiple times for the same slice without + # having to maintain a fetched map + if vio.slice_id != 0: + yield None + return + # yield this entity + ase = self._convert_to_storage_entity_with_encryption_metadata( + options, sa, entity, vio, is_file, container, dir) + yield ase + # iterate all slices + while vio.next is not None: + # follow next pointer + sa = creds.get_storage_account(vio.next.storage_account_name) + if is_file: + entity = blobxfer.operations.azure.file.get_file_properties( + sa.file_client, vio.next.container, vio.next.name, + timeout=general_options.timeout_sec) + _, dir = blobxfer.util.explode_azure_path(vio.next.name) + else: + entity = blobxfer.operations.azure.blob.get_blob_properties( + sa.block_blob_client, vio.next.container, vio.next.name, + ase.mode, timeout=general_options.timeout_sec) + vio = blobxfer.models.metadata.vectored_io_from_metadata( + entity.metadata) + # yield next + ase = self._convert_to_storage_entity_with_encryption_metadata( + options, sa, entity, vio, is_file, container, dir) + yield ase + + def _populate_from_list_files(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Internal generator for Azure remote files + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for file in blobxfer.operations.azure.file.list_files( + sa.file_client, cont, dir, options.recursive, + general_options.timeout_sec): + if not self._inclusion_check(file.name): + continue + if dir is not None: + dir, _ = blobxfer.operations.azure.file.parse_file_path( + dir) + for ase in self._handle_vectored_io_stripe( + creds, options, general_options, sa, file, True, cont, + dir): + if ase is None: + continue + yield ase + + def _populate_from_list_blobs(self, creds, options, general_options): + # type: (SourcePath, StorageCredentials, + # blobxfer.models.options.Download, + # blobxfer.models.options.General) -> StorageEntity + """Internal generator for Azure remote blobs + :param SourcePath self: this + :param StorageCredentials creds: storage creds + :param blobxfer.models.options.Download options: download options + :param blobxfer.models.options.General general_options: general options + :rtype: StorageEntity + :return: Azure storage entity object + """ + for _path in self._paths: + rpath = str(_path) + cont, dir = blobxfer.util.explode_azure_path(rpath) + sa = creds.get_storage_account(self.lookup_storage_account(rpath)) + for blob in blobxfer.operations.azure.blob.list_blobs( + sa.block_blob_client, cont, dir, options.mode, + options.recursive, general_options.timeout_sec): + if not self._inclusion_check(blob.name): + continue + for ase in self._handle_vectored_io_stripe( + creds, options, general_options, sa, blob, False, + cont): + if ase is None: + continue + yield ase + + +class DestinationPath(blobxfer.models._BaseSourcePaths): + """Azure Destination Path""" + def __init__(self): + # type: (SourcePath) -> None + """Ctor for SourcePath + :param SourcePath self: this + """ + super(DestinationPath, self).__init__() + 
self._path_map = {} + + def add_path_with_storage_account(self, remote_path, storage_account): + # type: (SourcePath, str, str) -> None + """Add a path with an associated storage account + :param SourcePath self: this + :param str remote_path: remote path + :param str storage_account: storage account to associate with path + """ + if len(self._path_map) >= 1: + raise RuntimeError( + 'cannot add multiple remote paths to SourcePath objects') + rpath = blobxfer.util.normalize_azure_path(remote_path) + self.add_path(rpath) + self._path_map[rpath] = storage_account + + def lookup_storage_account(self, remote_path): + # type: (SourcePath, str) -> str + """Lookup the storage account associated with the remote path + :param SourcePath self: this + :param str remote_path: remote path + :rtype: str + :return: storage account associated with path + """ + return self._path_map[blobxfer.util.normalize_azure_path(remote_path)] diff --git a/blobxfer/operations/azure/blob/__init__.py b/blobxfer/operations/azure/blob/__init__.py new file mode 100644 index 0000000..e256319 --- /dev/null +++ b/blobxfer/operations/azure/blob/__init__.py @@ -0,0 +1,252 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
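The Vectored IO stripe handling above is a linked-list walk over remote slices: only the zero-th slice starts a reconstruction, and each slice's next pointer names the storage account, container, and blob or file holding the following slice. A simplified sketch of that traversal, with a hypothetical Slice tuple standing in for the stripe metadata:

from collections import namedtuple

# hypothetical stand-in for the vectored IO stripe metadata on each slice
Slice = namedtuple('Slice', ['slice_id', 'data', 'next'])

def reconstruct(first_slice):
    """Yield slice data in order by walking next pointers from slice 0."""
    if first_slice.slice_id != 0:
        return  # non-zero slices are ignored to avoid partial downloads
    current = first_slice
    while current is not None:
        yield current.data
        current = current.next

s2 = Slice(2, b'baz', None)
s1 = Slice(1, b'bar', s2)
s0 = Slice(0, b'foo', s1)
assert b''.join(reconstruct(s0)) == b'foobarbaz'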
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +import azure.common +import azure.storage.blob.models +# local imports +import blobxfer.models.azure +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +def check_if_single_blob(client, container, prefix, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, str, int) -> bool + """Check if prefix is a single blob or multiple blobs + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str prefix: path prefix + :param int timeout: timeout + :rtype: bool + :return: if prefix in container is a single blob + """ + if blobxfer.util.blob_is_snapshot(prefix): + return True + try: + client.get_blob_properties( + container_name=container, blob_name=prefix, timeout=timeout) + except azure.common.AzureMissingResourceHttpError: + return False + return True + + +def get_blob_properties(client, container, prefix, mode, timeout=None): + if mode == blobxfer.models.azure.StorageModes.File: + raise RuntimeError('cannot list Azure Files from blob client') + try: + blob = client.get_blob_properties( + container_name=container, blob_name=prefix, timeout=timeout) + except azure.common.AzureMissingResourceHttpError: + return None + if (mode == blobxfer.models.azure.StorageModes.Append and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.AppendBlob): + raise RuntimeError( + 'existing blob type {} mismatch with mode {}'.format( + blob.properties.blob_type, mode)) + elif (mode == blobxfer.models.azure.StorageModes.Block and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.BlockBlob): + raise RuntimeError( + 'existing blob type {} mismatch with mode {}'.format( + blob.properties.blob_type, mode)) + elif (mode == blobxfer.models.azure.StorageModes.Page and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.PageBlob): + raise RuntimeError( + 'existing blob type {} mismatch with mode {}'.format( + blob.properties.blob_type, mode)) + return blob + + +def list_blobs(client, container, prefix, mode, recursive, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, str, + # blobxfer.models.azure.StorageModes, bool, int) -> + # azure.storage.blob.models.Blob + """List blobs in path conforming to mode + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str prefix: path prefix + :param blobxfer.models.azure.StorageModes mode: storage mode + :param bool recursive: recursive + :param int timeout: timeout + :rtype: azure.storage.blob.models.Blob + :return: generator of blobs + """ + if mode == blobxfer.models.azure.StorageModes.File: + raise RuntimeError('cannot list Azure Files from blob client') + if blobxfer.util.blob_is_snapshot(prefix): + base_blob, snapshot = blobxfer.util.parse_blob_snapshot_parameter( + prefix) + blob = client.get_blob_properties( + container_name=container, blob_name=base_blob, snapshot=snapshot, + timeout=timeout) + yield blob + return + blobs = client.list_blobs( + container_name=container, + prefix=prefix if blobxfer.util.is_not_empty(prefix) else None, + include=azure.storage.blob.models.Include.METADATA, + timeout=timeout, + ) + for blob in blobs: + if (mode == 
blobxfer.models.azure.StorageModes.Append and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.AppendBlob): + continue + elif (mode == blobxfer.models.azure.StorageModes.Block and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.BlockBlob): + continue + elif (mode == blobxfer.models.azure.StorageModes.Page and + blob.properties.blob_type != + azure.storage.blob.models._BlobTypes.PageBlob): + continue + if not recursive and '/' in blob.name: + continue + # auto or match, yield the blob + yield blob + + +def list_all_blobs(client, container, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, int) -> + # azure.storage.blob.models.Blob + """List all blobs in a container + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param int timeout: timeout + :rtype: azure.storage.blob.models.Blob + :return: generator of blobs + """ + blobs = client.list_blobs( + container_name=container, + prefix=None, + timeout=timeout, + ) + for blob in blobs: + yield blob + + +def delete_blob(client, container, name, timeout=None): + # type: (azure.storage.blob.BaseBlobService, str, str, int) -> None + """Delete blob, including all associated snapshots + :param azure.storage.blob.BaseBlobService client: blob client + :param str container: container + :param str name: blob name + :param int timeout: timeout + """ + client.delete_blob( + container_name=container, + blob_name=name, + delete_snapshots=azure.storage.blob.models.DeleteSnapshot.Include, + timeout=timeout, + ) + + +def get_blob_range(ase, offsets, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.download.Offsets, int) -> bytes + """Retrieve blob range + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param blobxfer.models.download.Offsets offsets: download offsets + :param int timeout: timeout + :rtype: bytes + :return: content for blob range + """ + return ase.client._get_blob( + container_name=ase.container, + blob_name=ase.name, + snapshot=ase.snapshot, + start_range=offsets.range_start, + end_range=offsets.range_end, + validate_content=False, # HTTPS takes care of integrity during xfer + timeout=timeout, + ).content + + +def create_container(ase, containers_created, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Create blob container + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict containers_created: containers already created map + :param int timeout: timeout + """ + # check if auth allows create container + if not ase.create_containers: + return + key = ase.client.account_name + ':blob=' + ase.container + if key not in containers_created: + try: + ase.client.create_container( + container_name=ase.container, + fail_on_exist=True, + timeout=timeout) + except azure.common.AzureConflictHttpError: + pass + else: + containers_created.add(key) + logger.info( + 'created blob container {} on storage account {}'.format( + ase.container, ase.client.account_name)) + + +def set_blob_md5(ase, md5, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, str, int) -> None + """Set blob properties MD5 + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param str md5: md5 as base64 + :param int timeout: timeout + """ + ase.client.set_blob_properties( + container_name=ase.container, + blob_name=ase.name, + content_settings=azure.storage.blob.models.ContentSettings( + 
content_type=blobxfer.util.get_mime_type(ase.name), + content_md5=md5, + ), + timeout=timeout) + + +def set_blob_metadata(ase, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Set blob metadata + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + ase.client.set_blob_metadata( + container_name=ase.container, + blob_name=ase.name, + metadata=metadata, + timeout=timeout) diff --git a/blobxfer/operations/azure/blob/append.py b/blobxfer/operations/azure/blob/append.py new file mode 100644 index 0000000..abc276a --- /dev/null +++ b/blobxfer/operations/azure/blob/append.py @@ -0,0 +1,94 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
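`create_container` above keeps container creation idempotent under concurrency: a local set keyed on account and container prevents repeat calls from this process, while `fail_on_exist=True` combined with swallowing the conflict error absorbs races with other clients. The same pattern reduced to a sketch, with a hypothetical `create` callable in place of the Azure client:

class ConflictError(Exception):
    """stand-in for azure.common.AzureConflictHttpError"""

def ensure_container(create, account, container, created):
    # create: callable that raises ConflictError if the container exists
    key = account + ':blob=' + container
    if key in created:
        return
    try:
        create(container)
    except ConflictError:
        pass  # another client created it concurrently; nothing more to do
    else:
        created.add(key)

created = set()
ensure_container(lambda c: None, 'mysa', 'data', created)
assert 'mysa:blob=data' in created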
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +import azure.storage.blob +# local imports +import blobxfer.retry + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.operations.azure.StorageAccount) -> AppendBlobService + """Create Append blob client + :param blobxfer.operations.azure.StorageAccount storage_account: + storage account + :rtype: AppendBlobService + :return: append blob service client + """ + if storage_account.is_sas: + client = azure.storage.blob.AppendBlobService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + else: + client = azure.storage.blob.AppendBlobService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + # set retry policy + client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry + return client + + +def create_blob(ase, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, int) -> None + """Create append blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int timeout: timeout + """ + ase.client.create_blob( + container_name=ase.container, + blob_name=ase.name, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name) + ), + timeout=timeout) + + +def append_block(ase, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, bytes, int) -> None + """Appends a block into remote blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param bytes data: data + :param int timeout: timeout + """ + ase.client.append_block( + container_name=ase.container, + blob_name=ase.name, + block=data, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) diff --git a/blobxfer/operations/azure/blob/block.py b/blobxfer/operations/azure/blob/block.py new file mode 100644 index 0000000..b8e5bc5 --- /dev/null +++ b/blobxfer/operations/azure/blob/block.py @@ -0,0 +1,143 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +import azure.storage.blob +# local imports +import blobxfer.retry + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.operations.azure.StorageAccount) -> BlockBlobService + """Create block blob client + :param blobxfer.operations.azure.StorageAccount storage_account: + storage account + :rtype: azure.storage.blob.BlockBlobService + :return: block blob service client + """ + if storage_account.is_sas: + client = azure.storage.blob.BlockBlobService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + else: + client = azure.storage.blob.BlockBlobService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + # set retry policy + client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry + return client + + +def create_blob(ase, data, md5, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, bytes, str, dict, + # int) -> None + """Create one shot block blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param bytes data: blob data + :param str md5: md5 as base64 + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + ase.client._put_blob( + container_name=ase.container, + blob_name=ase.name, + blob=data, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name), + content_md5=md5, + ), + metadata=metadata, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) + + +def _format_block_id(chunk_num): + # type: (int) -> str + """Create a block id given a block (chunk) number + :param int chunk_num: chunk number + :rtype: str + :return: block id + """ + return '{0:08d}'.format(chunk_num) + + +def put_block(ase, offsets, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes, int) -> None + """Puts a block into remote blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param blobxfer.models.upload.Offsets offsets: upload offsets + :param bytes data: data + :param int timeout: timeout + """ + ase.client.put_block( + container_name=ase.container, + blob_name=ase.name, + block=data, + block_id=_format_block_id(offsets.chunk_num), + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) + + +def put_block_list(ase, last_block_num, md5, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, bytes, str, dict, + # int) -> None + """Create block blob from blocks + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int last_block_num: last block number (chunk_num) + :param str md5: md5 as base64 + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + # 
construct block list + block_list = [ + azure.storage.blob.BlobBlock(id=_format_block_id(x)) + for x in range(0, last_block_num + 1) + ] + ase.client.put_block_list( + container_name=ase.container, + blob_name=ase.name, + block_list=block_list, + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name), + content_md5=md5, + ), + metadata=metadata, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) diff --git a/blobxfer/operations/azure/blob/page.py b/blobxfer/operations/azure/blob/page.py new file mode 100644 index 0000000..aa92b14 --- /dev/null +++ b/blobxfer/operations/azure/blob/page.py @@ -0,0 +1,100 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
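Block blob uploads above stage each chunk with `put_block` under a deterministic block id and commit them with `put_block_list`; zero-padding the chunk number gives every id the same fixed width, which block ids within a blob are required to share. A standalone sketch of the id scheme and the commit list it produces, using hypothetical helper names:

def format_block_id(chunk_num):
    # fixed width so every block id in the blob has the same length
    return '{0:08d}'.format(chunk_num)

def build_block_list(last_block_num):
    # commit list covering blocks 0 through last_block_num, in order
    return [format_block_id(x) for x in range(0, last_block_num + 1)]

assert format_block_id(7) == '00000007'
assert build_block_list(2) == ['00000000', '00000001', '00000002']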
+ +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +# non-stdlib imports +import azure.storage.blob +# local imports +import blobxfer.retry + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.operations.azure.StorageAccount) -> PageBlobService + """Create block blob client + :param blobxfer.operations.azure.StorageAccount storage_account: + storage account + :rtype: PageBlobService + :return: block blob service client + """ + if storage_account.is_sas: + client = azure.storage.blob.PageBlobService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + else: + client = azure.storage.blob.PageBlobService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + # set retry policy + client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry + return client + + +def create_blob(ase, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, int) -> None + """Create page blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int timeout: timeout + """ + ase.client.create_blob( + container_name=ase.container, + blob_name=ase.name, + content_length=blobxfer.util.page_align_content_length(ase.size), + content_settings=azure.storage.blob.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(ase.name) + ), + timeout=timeout) + + +def put_page(ase, page_start, page_end, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # int, int, bytes, int) -> None + """Puts a page into remote blob + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int page_start: page range start + :param int page_end: page range end + :param bytes data: data + :param int timeout: timeout + """ + ase.client.update_page( + container_name=ase.container, + blob_name=ase.name, + page=data, + start_range=page_start, + end_range=page_end, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) diff --git a/blobxfer/operations/azure/file.py b/blobxfer/operations/azure/file.py new file mode 100644 index 0000000..3a905bf --- /dev/null +++ b/blobxfer/operations/azure/file.py @@ -0,0 +1,364 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +import azure.common +import azure.storage.file +# local imports +import blobxfer.retry +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +def create_client(storage_account): + # type: (blobxfer.operations.azure.StorageAccount) -> FileService + """Create file client + :param blobxfer.operations.azure.StorageAccount storage_account: + storage account + :rtype: FileService + :return: file service client + """ + if storage_account.is_sas: + client = azure.storage.file.FileService( + account_name=storage_account.name, + sas_token=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + else: + client = azure.storage.file.FileService( + account_name=storage_account.name, + account_key=storage_account.key, + endpoint_suffix=storage_account.endpoint, + request_session=storage_account.session) + # set retry policy + client.retry = blobxfer.retry.ExponentialRetryWithMaxWait().retry + return client + + +def parse_file_path(filepath): + # type: (pathlib.Path) -> Tuple[str, str] + """Parse file path from file path + :param str filepath: file path + :rtype: tuple + :return: (dirname, rest of path) + """ + if not isinstance(filepath, pathlib.Path): + filepath = pathlib.Path(filepath) + dirname = '/'.join(filepath.parts[:len(filepath.parts) - 1]) + if len(dirname) == 0: + dirname = None + if len(filepath.parts) > 0: + fname = filepath.parts[-1] + else: + fname = None + return (dirname, fname) + + +def get_file_properties(client, fileshare, prefix, timeout=None): + # type: (azure.storage.file.FileService, str, str, int) -> + # azure.storage.file.models.File + """Get file properties + :param FileService client: blob client + :param str fileshare: file share name + :param str prefix: path prefix + :param int timeout: timeout + :rtype: azure.storage.file.models.File + :return: file properties + """ + dirname, fname = parse_file_path(prefix) + try: + return client.get_file_properties( + share_name=fileshare, + directory_name=dirname, + file_name=fname, + timeout=timeout, + ) + except azure.common.AzureMissingResourceHttpError: + return None + + +def check_if_single_file(client, fileshare, prefix, timeout=None): + # type: (azure.storage.file.FileService, str, str, int) -> + # Tuple[bool, azure.storage.file.models.File] + """Check if prefix is a single file or multiple files + :param FileService client: blob client + :param str fileshare: file share name + :param str prefix: path prefix + :param int timeout: timeout + :rtype: tuple + :return: (if prefix in fileshare is a single file, file) + """ + if blobxfer.util.is_none_or_empty(prefix): + return (False, None) + file = get_file_properties(client, fileshare, prefix, timeout) + if file is None: + return (False, file) + else: + return (True, file) + + +def list_files(client, fileshare, prefix, recursive, timeout=None): + # type: 
(azure.storage.file.FileService, str, str, bool, int) -> + # azure.storage.file.models.File + """List files in path + :param azure.storage.file.FileService client: file client + :param str fileshare: file share + :param str prefix: path prefix + :param bool recursive: recursive + :param int timeout: timeout + :rtype: azure.storage.file.models.File + :return: generator of files + """ + # if single file, then yield file and return + _check = check_if_single_file(client, fileshare, prefix, timeout) + if _check[0]: + yield _check[1] + return + # else recursively list from prefix path + dirs = [prefix] + while len(dirs) > 0: + dir = dirs.pop() + files = client.list_directories_and_files( + share_name=fileshare, + directory_name=dir, + timeout=timeout, + ) + for file in files: + fspath = str( + pathlib.Path(dir if dir is not None else '') / file.name) + if type(file) == azure.storage.file.models.File: + fsprop = client.get_file_properties( + share_name=fileshare, + directory_name=None, + file_name=fspath, + timeout=timeout, + ) + yield fsprop + else: + if recursive: + dirs.append(fspath) + + +def list_all_files(client, fileshare, timeout=None): + # type: (azure.storage.file.FileService, str, int) -> str + """List all files in share + :param azure.storage.file.FileService client: file client + :param str fileshare: file share + :param int timeout: timeout + :rtype: str + :return: file name + """ + dirs = [None] + while len(dirs) > 0: + dir = dirs.pop() + files = client.list_directories_and_files( + share_name=fileshare, + directory_name=dir, + timeout=timeout, + ) + for file in files: + fspath = str( + pathlib.Path(dir if dir is not None else '') / file.name) + if type(file) == azure.storage.file.models.File: + yield fspath + else: + dirs.append(fspath) + + +def delete_file(client, fileshare, name, timeout=None): + # type: (azure.storage.file.FileService, str, str, int) -> None + """Delete file from share + :param azure.storage.file.FileService client: file client + :param str fileshare: file share + :param str name: file name + :param int timeout: timeout + """ + dir, fpath = parse_file_path(name) + client.delete_file( + share_name=fileshare, + directory_name=dir, + file_name=fpath, + timeout=timeout, + ) + + +def get_file_range(ase, offsets, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.download.Offsets, int) -> bytes + """Retrieve file range + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param blobxfer.models.download.Offsets offsets: download offsets + :param int timeout: timeout + :rtype: bytes + :return: content for file range + """ + dir, fpath = parse_file_path(ase.name) + return ase.client._get_file( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + start_range=offsets.range_start, + end_range=offsets.range_end, + validate_content=False, # HTTPS takes care of integrity during xfer + timeout=timeout, + ).content + + +def create_share(ase, containers_created, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Create file share + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict containers_created: containers already created map + :param int timeout: timeout + """ + # check if auth allows create container + if not ase.create_containers: + return + key = ase.client.account_name + ':file=' + ase.container + if key not in containers_created: + try: + ase.client.create_share( + share_name=ase.container, + fail_on_exist=True, + timeout=timeout) + 
except azure.common.AzureConflictHttpError: + pass + else: + containers_created.add(key) + logger.info('created file share {} on storage account {}'.format( + ase.container, ase.client.account_name)) + + +def create_all_parent_directories(ase, dirs_created, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Create all parent directories for a file + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict dirs_created: directories already created map + :param int timeout: timeout + """ + dirs = pathlib.Path(ase.name).parts + if len(dirs) <= 1: + return + # remove last part (which is the file) + dirs = dirs[:-1] + dk = ase.client.account_name + ':' + ase.container + for i in range(0, len(dirs)): + dir = str(pathlib.Path(*(dirs[0:i + 1]))) + if dk not in dirs_created or dir not in dirs_created[dk]: + ase.client.create_directory( + share_name=ase.container, + directory_name=dir, + fail_on_exist=False, + timeout=timeout) + if dk not in dirs_created: + dirs_created[dk] = set() + dirs_created[dk].add(dir) + + +def create_file(ase, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, int) -> None + """Create file remotely + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + ase.client.create_file( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + content_length=ase.size, + content_settings=azure.storage.file.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(fpath) + ), + timeout=timeout) + + +def put_file_range(ase, offsets, data, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes, int) -> None + """Puts a range of bytes into the remote file + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param blobxfer.models.upload.Offsets offsets: upload offsets + :param bytes data: data + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + ase.client.update_range( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + data=data, + start_range=offsets.range_start, + end_range=offsets.range_end, + validate_content=False, # integrity is enforced with HTTPS + timeout=timeout) + + +def set_file_md5(ase, md5, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, str, int) -> None + """Set file properties MD5 + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param str md5: md5 as base64 + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + ase.client.set_file_properties( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + content_settings=azure.storage.file.models.ContentSettings( + content_type=blobxfer.util.get_mime_type(fpath), + content_md5=md5, + ), + timeout=timeout) + + +def set_file_metadata(ase, metadata, timeout=None): + # type: (blobxfer.models.azure.StorageEntity, dict, int) -> None + """Set file metadata + :param blobxfer.models.azure.StorageEntity ase: Azure StorageEntity + :param dict metadata: metadata kv pairs + :param int timeout: timeout + """ + dir, fpath = parse_file_path(ase.name) + ase.client.set_file_metadata( + share_name=ase.container, + directory_name=dir, + file_name=fpath, + metadata=metadata, + timeout=timeout) diff --git a/blobxfer/operations/crypto.py b/blobxfer/operations/crypto.py new file mode 100644 index 0000000..76cf001 --- /dev/null +++ b/blobxfer/operations/crypto.py @@ -0,0 +1,316 
@@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import base64 +import enum +import logging +import os +try: + import queue +except ImportError: # noqa + import Queue as queue +import tempfile +# non-stdlib imports +import cryptography.hazmat.backends +import cryptography.hazmat.primitives.asymmetric.padding +import cryptography.hazmat.primitives.asymmetric.rsa +import cryptography.hazmat.primitives.ciphers +import cryptography.hazmat.primitives.ciphers.algorithms +import cryptography.hazmat.primitives.ciphers.modes +import cryptography.hazmat.primitives.constant_time +import cryptography.hazmat.primitives.hashes +import cryptography.hazmat.primitives.padding +import cryptography.hazmat.primitives.serialization +# local imports +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + +# encryption constants +_AES256_KEYLENGTH_BYTES = 32 + + +# enums +class CryptoAction(enum.Enum): + Encrypt = 1 + Decrypt = 2 + + +def load_rsa_private_key_file(rsakeyfile, passphrase): + # type: (str, str) -> + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + """Load an RSA Private key PEM file with passphrase if specified + :param str rsakeyfile: RSA private key PEM file to load + :param str passphrase: optional passphrase + :rtype: cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + :return: RSAPrivateKey + """ + keypath = os.path.expandvars(os.path.expanduser(rsakeyfile)) + with open(keypath, 'rb') as keyfile: + return cryptography.hazmat.primitives.serialization.\ + load_pem_private_key( + keyfile.read(), + passphrase.encode('utf8') if passphrase is not None else None, + backend=cryptography.hazmat.backends.default_backend() + ) + + +def load_rsa_public_key_file(rsakeyfile): + # type: (str, str) -> + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey + """Load an RSA Public key PEM file + :param str rsakeyfile: RSA public key PEM file to load + :rtype: cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey + :return: RSAPublicKey + """ + keypath = os.path.expandvars(os.path.expanduser(rsakeyfile)) + with open(keypath, 'rb') as keyfile: + return 
cryptography.hazmat.primitives.serialization.\ + load_pem_public_key( + keyfile.read(), + backend=cryptography.hazmat.backends.default_backend() + ) + + +def rsa_decrypt_base64_encoded_key(rsaprivatekey, enckey): + # type: (cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey, + # str) -> bytes + """Decrypt an RSA encrypted key encoded as base64 + :param rsaprivatekey: RSA private key + :type rsaprivatekey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + :param str enckey: base64-encoded key + :rtype: bytes + :return: decrypted key + """ + return rsaprivatekey.decrypt( + base64.b64decode(enckey), + cryptography.hazmat.primitives.asymmetric.padding.OAEP( + mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( + algorithm=cryptography.hazmat.primitives.hashes.SHA1() + ), + algorithm=cryptography.hazmat.primitives.hashes.SHA1(), + label=None, + ) + ) + + +def rsa_encrypt_key_base64_encoded(rsaprivatekey, rsapublickey, plainkey): + # type: (cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey, + # cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey, + # bytes) -> str + """Encrypt a plaintext key using RSA and PKCS1_OAEP padding + :param rsaprivatekey: RSA private key + :type rsaprivatekey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPrivateKey + :param rsapublickey: RSA public key + :type rsapublickey: + cryptography.hazmat.primitives.asymmetric.rsa.RSAPublicKey + :param bytes plainkey: plain key + :rtype: str + :return: encrypted key + """ + if rsapublickey is None: + rsapublickey = rsaprivatekey.public_key() + enckey = rsapublickey.encrypt( + plainkey, cryptography.hazmat.primitives.asymmetric.padding.OAEP( + mgf=cryptography.hazmat.primitives.asymmetric.padding.MGF1( + algorithm=cryptography.hazmat.primitives.hashes.SHA1()), + algorithm=cryptography.hazmat.primitives.hashes.SHA1(), + label=None)) + return blobxfer.util.base64_encode_as_string(enckey) + + +def pkcs7_pad(buf): + # type: (bytes) -> bytes + """Appends PKCS7 padding to an input buffer + :param bytes buf: buffer to add padding + :rtype: bytes + :return: buffer with PKCS7_PADDING + """ + padder = cryptography.hazmat.primitives.padding.PKCS7( + cryptography.hazmat.primitives.ciphers. + algorithms.AES.block_size).padder() + return padder.update(buf) + padder.finalize() + + +def pkcs7_unpad(buf): + # type: (bytes) -> bytes + """Removes PKCS7 padding a decrypted object + :param bytes buf: buffer to remove padding + :rtype: bytes + :return: buffer without PKCS7_PADDING + """ + unpadder = cryptography.hazmat.primitives.padding.PKCS7( + cryptography.hazmat.primitives.ciphers. 
+ algorithms.AES.block_size).unpadder() + return unpadder.update(buf) + unpadder.finalize() + + +def aes256_generate_random_key(): + # type: (None) -> bytes + """Generate random AES256 key + :rtype: bytes + :return: random key + """ + return os.urandom(_AES256_KEYLENGTH_BYTES) + + +def aes_cbc_decrypt_data(symkey, iv, encdata, unpad): + # type: (bytes, bytes, bytes, bool) -> bytes + """Decrypt data using AES CBC + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param bytes encdata: data to decrypt + :param bool unpad: unpad data + :rtype: bytes + :return: decrypted data + """ + cipher = cryptography.hazmat.primitives.ciphers.Cipher( + cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), + cryptography.hazmat.primitives.ciphers.modes.CBC(iv), + backend=cryptography.hazmat.backends.default_backend()).decryptor() + decrypted = cipher.update(encdata) + cipher.finalize() + if unpad: + return pkcs7_unpad(decrypted) + else: + return decrypted + + +def aes_cbc_encrypt_data(symkey, iv, data, pad): + # type: (bytes, bytes, bytes, bool) -> bytes + """Encrypt data using AES CBC + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param bytes data: data to encrypt + :param bool pad: pad data + :rtype: bytes + :return: encrypted data + """ + cipher = cryptography.hazmat.primitives.ciphers.Cipher( + cryptography.hazmat.primitives.ciphers.algorithms.AES(symkey), + cryptography.hazmat.primitives.ciphers.modes.CBC(iv), + backend=cryptography.hazmat.backends.default_backend()).encryptor() + if pad: + return cipher.update(pkcs7_pad(data)) + cipher.finalize() + else: + return cipher.update(data) + cipher.finalize() + + +class CryptoOffload(blobxfer.models.offload._MultiprocessOffload): + def __init__(self, num_workers): + # type: (CryptoOffload, int) -> None + """Ctor for Crypto Offload + :param CryptoOffload self: this + :param int num_workers: number of worker processes + """ + super(CryptoOffload, self).__init__( + self._worker_process, num_workers, 'Crypto') + + def _worker_process(self): + # type: (CryptoOffload) -> None + """Crypto worker + :param CryptoOffload self: this + """ + while not self.terminated: + try: + inst = self._task_queue.get(True, 0.1) + except queue.Empty: + continue + # UNUSED due to AES256-CBC FullBlob mode + if inst[0] == CryptoAction.Encrypt: # noqa + local_file, offsets, symkey, iv = \ + inst[1], inst[2], inst[3], inst[4] + with open(local_file, 'rb') as fd: + data = fd.read() + encdata = blobxfer.operations.crypto.aes_cbc_encrypt_data( + symkey, iv, data, offsets.pad) + with tempfile.NamedTemporaryFile( + mode='wb', delete=False) as fd: + fpath = fd.name + fd.write(encdata) + self._done_cv.acquire() + self._done_queue.put(fpath) + elif inst[0] == CryptoAction.Decrypt: + final_path, internal_fdstart, offsets, symkey, iv, \ + hmac_datafile = \ + inst[1], inst[2], inst[3], inst[4], inst[5], inst[6] + # read encrypted data from disk + with open(hmac_datafile, 'rb') as fd: + encdata = fd.read() + data = blobxfer.operations.crypto.aes_cbc_decrypt_data( + symkey, iv, encdata, offsets.unpad) + # write decrypted data to disk + if len(data) > 0: + with open(final_path, 'r+b') as fd: + fd.seek(internal_fdstart + offsets.fd_start, 0) + fd.write(data) + self._done_cv.acquire() + self._done_queue.put((final_path, offsets)) + # notify and release condition var + self._done_cv.notify() + self._done_cv.release() + + def add_decrypt_chunk( + self, final_path, internal_fdstart, offsets, symkey, iv, + hmac_datafile): + # type: 
(CryptoOffload, str, int, blobxfer.models.download.Offsets, + # bytes, bytes, str) -> None + """Add a chunk to decrypt + :param CryptoOffload self: this + :param str final_path: final path + :param int internal_fdstart: internal fd offset start + :param blobxfer.models.download.Offsets offsets: offsets + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + :param str hmac_datafile: encrypted data file + """ + self._task_queue.put( + (CryptoAction.Decrypt, final_path, internal_fdstart, offsets, + symkey, iv, hmac_datafile) + ) + + # UNUSED due to AES256-CBC FullBlob mode + def add_encrypt_chunk(self, local_file, offsets, symkey, iv): # noqa + # type: (CryptoOffload, pathlib.Path, blobxfer.models.upload.Offsets, + # bytes, bytes) -> None + """Add a chunk to encrypt + :param CryptoOffload self: this + :param pathlib.Path local_file: local file + :param blobxfer.models.upload.Offsets offsets: offsets + :param bytes symkey: symmetric key + :param bytes iv: initialization vector + """ + self._task_queue.put( + (CryptoAction.Encrypt, str(local_file), offsets, symkey, iv) + ) diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py new file mode 100644 index 0000000..9d51d32 --- /dev/null +++ b/blobxfer/operations/download.py @@ -0,0 +1,808 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import enum +import logging +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +try: + import queue +except ImportError: # noqa + import Queue as queue +import threading +import time +# non-stdlib imports +# local imports +import blobxfer.models.crypto +import blobxfer.models.metadata +import blobxfer.operations.azure.blob +import blobxfer.operations.azure.file +import blobxfer.operations.crypto +import blobxfer.operations.md5 +import blobxfer.operations.progress +import blobxfer.operations.resume +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +class DownloadAction(enum.Enum): + Skip = 1 + CheckMd5 = 2 + Download = 3 + + +class Downloader(object): + """Downloader""" + def __init__(self, general_options, creds, spec): + # type: (Downloader, blobxfer.models.options.General, + # blobxfer.operations.azure.StorageCredentials, + # blobxfer.models.download.Specification) -> None + """Ctor for Downloader + :param Downloader self: this + :param blobxfer.models.options.General general_options: general opts + :param blobxfer.operations.azure.StorageCredentials creds: creds + :param blobxfer.models.download.Specification spec: download spec + """ + self._all_remote_files_processed = False + self._crypto_offload = None + self._md5_meta_lock = threading.Lock() + self._md5_map = {} + self._md5_offload = None + self._transfer_lock = threading.Lock() + self._transfer_queue = queue.Queue() + self._transfer_set = set() + self._transfer_threads = [] + self._disk_operation_lock = threading.Lock() + self._disk_queue = queue.Queue() + self._disk_set = set() + self._disk_threads = [] + self._download_start_time = None + self._download_total = 0 + self._download_sofar = 0 + self._download_bytes_total = 0 + self._download_bytes_sofar = 0 + self._download_terminate = False + self._start_time = None + self._delete_after = set() + self._dd_map = {} + self._vio_map = {} + self._general_options = general_options + self._creds = creds + self._spec = spec + self._resume = None + self._exceptions = [] + + @property + def termination_check(self): + # type: (Downloader) -> bool + """Check if terminated + :param Downloader self: this + :rtype: bool + :return: if terminated + """ + with self._transfer_lock: + with self._disk_operation_lock: + return (self._download_terminate or + len(self._exceptions) > 0 or + (self._all_remote_files_processed and + len(self._transfer_set) == 0 and + len(self._disk_set) == 0)) + + @property + def termination_check_md5(self): + # type: (Downloader) -> bool + """Check if terminated from MD5 context + :param Downloader self: this + :rtype: bool + :return: if terminated from MD5 context + """ + with self._md5_meta_lock: + with self._transfer_lock: + return (self._download_terminate or + (self._all_remote_files_processed and + len(self._md5_map) == 0 and + len(self._transfer_set) == 0)) + + @staticmethod + def ensure_local_destination(creds, spec): + # type: (blobxfer.operations.azure.StorageCredentials, + # blobxfer.models.download.Specification) -> None + """Ensure a local destination path given a download spec + :param blobxfer.operations.azure.StorageCredentials creds: creds + :param blobxfer.models.download.Specification spec: download spec + """ + # 
ensure destination path is writable given the source + if len(spec.sources) < 1: + raise RuntimeError('no sources to download from specified') + # set is_dir for destination + spec.destination.is_dir = True + if len(spec.sources) == 1: + # we need to query the source to see if this is a directory + rpath = str(spec.sources[0].paths[0]) + cont, dir = blobxfer.util.explode_azure_path(rpath) + if not blobxfer.util.is_none_or_empty(dir): + sa = creds.get_storage_account( + spec.sources[0].lookup_storage_account(rpath)) + if (spec.options.mode == + blobxfer.models.azure.StorageModes.File): + if (blobxfer.operations.azure.file.check_if_single_file( + sa.file_client, cont, dir)[0] and + spec.options.rename): + spec.destination.is_dir = False + else: + if (blobxfer.operations.azure.blob.check_if_single_blob( + sa.block_blob_client, cont, dir) and + spec.options.rename): + spec.destination.is_dir = False + logger.debug('dest is_dir={} for {} specs'.format( + spec.destination.is_dir, len(spec.sources))) + # ensure destination path + spec.destination.ensure_path_exists() + + @staticmethod + def create_unique_transfer_operation_id(ase): + # type: (blobxfer.models.azure.StorageEntity) -> str + """Create a unique transfer operation id + :param blobxfer.models.azure.StorageEntity ase: storage entity + :rtype: str + :return: unique transfer id + """ + return ';'.join( + (ase._client.primary_endpoint, ase.path, str(ase.vectored_io)) + ) + + @staticmethod + def create_unique_disk_operation_id(dd, offsets): + # type: (blobxfer.models.download.Descriptor, + # blobxfer.models.download.Offsets) -> str + """Create a unique disk operation id + :param blobxfer.models.download.Descriptor dd: download descriptor + :param blobxfer.models.download.Offsets offsets: download offsets + :rtype: str + :return: unique disk id + """ + return ';'.join( + (str(dd.final_path), dd.entity._client.primary_endpoint, + dd.entity.path, str(offsets.range_start)) + ) + + def _update_progress_bar(self): + # type: (Downloader) -> None + """Update progress bar + :param Downloader self: this + """ + blobxfer.operations.progress.update_progress_bar( + self._general_options, + 'download', + self._download_start_time, + self._download_total, + self._download_sofar, + self._download_bytes_total, + self._download_bytes_sofar, + ) + + def _check_download_conditions(self, lpath, rfile): + # type: (Downloader, pathlib.Path, + # blobxfer.models.azure.StorageEntity) -> DownloadAction + """Check for download conditions + :param Downloader self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + :rtype: DownloadAction + :return: download action + """ + if not lpath.exists(): + if rfile.vectored_io is not None: + fpath = blobxfer.models.download.Descriptor.\ + convert_vectored_io_slice_to_final_path_name(lpath, rfile) + if not fpath.exists(): + return DownloadAction.Download + else: + return DownloadAction.Download + if not self._spec.options.overwrite: + logger.info( + 'not overwriting local file: {} (remote: {})'.format( + lpath, rfile.path)) + return DownloadAction.Skip + # check skip on options, MD5 match takes priority + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) + if self._spec.skip_on.md5_match and blobxfer.util.is_not_empty(md5): + return DownloadAction.CheckMd5 + # if neither of the remaining skip on actions are activated, download + if (not self._spec.skip_on.filesize_match and + not self._spec.skip_on.lmt_ge): + return DownloadAction.Download + # check skip on file 
size match + dl_fs = None + if self._spec.skip_on.filesize_match: + lsize = lpath.stat().st_size + if rfile.mode == blobxfer.models.azure.StorageModes.Page: + lsize = blobxfer.util.page_align_content_length(lsize) + if rfile.size == lsize: + dl_fs = False + else: + dl_fs = True + # check skip on lmt ge + dl_lmt = None + if self._spec.skip_on.lmt_ge: + mtime = blobxfer.util.datetime_from_timestamp( + lpath.stat().st_mtime) + if mtime >= rfile.lmt: + dl_lmt = False + else: + dl_lmt = True + # download if either skip on mismatch is True + if dl_fs or dl_lmt: + return DownloadAction.Download + else: + return DownloadAction.Skip + + def _pre_md5_skip_on_check(self, lpath, rfile): + # type: (Downloader, pathlib.Path, + # blobxfer.models.azure.StorageEntity) -> None + """Perform pre MD5 skip on check + :param Downloader self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + """ + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) + key = blobxfer.operations.download.Downloader.\ + create_unique_transfer_operation_id(rfile) + with self._md5_meta_lock: + self._md5_map[key] = rfile + slpath = str(lpath) + # temporarily create a download descriptor view for vectored io + if rfile.vectored_io is not None: + view, _ = blobxfer.models.download.Descriptor.generate_view(rfile) + fpath = str( + blobxfer.models.download.Descriptor. + convert_vectored_io_slice_to_final_path_name(lpath, rfile) + ) + else: + view = None + fpath = slpath + self._md5_offload.add_localfile_for_md5_check( + key, slpath, fpath, md5, rfile.mode, view) + + def _post_md5_skip_on_check(self, key, filename, size, md5_match): + # type: (Downloader, str, str, int, bool) -> None + """Perform post MD5 skip on check + :param Downloader self: this + :param str key: md5 map key + :param str filename: local filename + :param int size: size of checked data + :param bool md5_match: if MD5 matches + """ + with self._md5_meta_lock: + rfile = self._md5_map.pop(key) + lpath = pathlib.Path(filename) + if md5_match: + if size is None: + size = lpath.stat().st_size + with self._transfer_lock: + self._transfer_set.remove( + blobxfer.operations.download.Downloader. 
+ create_unique_transfer_operation_id(rfile)) + self._download_total -= 1 + self._download_bytes_total -= size + else: + self._add_to_download_queue(lpath, rfile) + + def _check_for_downloads_from_md5(self): + # type: (Downloader) -> None + """Check queue for a file to download + :param Downloader self: this + """ + cv = self._md5_offload.done_cv + while not self.termination_check_md5: + result = None + cv.acquire() + while True: + result = self._md5_offload.pop_done_queue() + if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(1) + # check for terminating conditions + if self.termination_check_md5: + break + else: + break + cv.release() + if result is not None: + self._post_md5_skip_on_check( + result[0], result[1], result[2], result[3]) + + def _check_for_crypto_done(self): + # type: (Downloader) -> None + """Check queue for crypto done + :param Downloader self: this + """ + cv = self._crypto_offload.done_cv + while not self.termination_check: + result = None + cv.acquire() + while True: + result = self._crypto_offload.pop_done_queue() + if result is None: + # use cv timeout due to possible non-wake while running + cv.wait(0.1) + # check for terminating conditions + if self.termination_check: + break + else: + break + cv.release() + if result is not None: + try: + final_path, offsets = result + with self._transfer_lock: + dd = self._dd_map[final_path] + self._finalize_chunk(dd, offsets) + except KeyError: + # this can happen if all of the last integrity + # chunks are processed at once + pass + + def _add_to_download_queue(self, lpath, rfile): + # type: (Downloader, pathlib.Path, + # blobxfer.models.azure.StorageEntity) -> None + """Add remote file to download queue + :param Downloader self: this + :param pathlib.Path lpath: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + """ + # prepare remote file for download + dd = blobxfer.models.download.Descriptor( + lpath, rfile, self._spec.options, self._resume) + if dd.entity.is_encrypted: + with self._transfer_lock: + self._dd_map[str(dd.final_path)] = dd + # add download descriptor to queue + self._transfer_queue.put(dd) + if self._download_start_time is None: + with self._transfer_lock: + if self._download_start_time is None: + self._download_start_time = blobxfer.util.datetime_now() + + def _initialize_disk_threads(self): + # type: (Downloader) -> None + """Initialize download threads + :param Downloader self: this + """ + logger.debug('spawning {} disk threads'.format( + self._general_options.concurrency.disk_threads)) + for _ in range(self._general_options.concurrency.disk_threads): + thr = threading.Thread(target=self._worker_thread_disk) + self._disk_threads.append(thr) + thr.start() + + def _initialize_transfer_threads(self): + # type: (Downloader) -> None + """Initialize transfer threads + :param Downloader self: this + """ + logger.debug('spawning {} transfer threads'.format( + self._general_options.concurrency.transfer_threads)) + for _ in range(self._general_options.concurrency.transfer_threads): + thr = threading.Thread(target=self._worker_thread_transfer) + self._transfer_threads.append(thr) + thr.start() + + def _wait_for_disk_threads(self, terminate): + # type: (Downloader, bool) -> None + """Wait for disk threads + :param Downloader self: this + :param bool terminate: terminate threads + """ + if terminate: + self._download_terminate = terminate + for thr in self._disk_threads: + blobxfer.util.join_thread(thr) + + def _wait_for_transfer_threads(self, terminate): + 
# type: (Downloader, bool) -> None + """Wait for download threads + :param Downloader self: this + :param bool terminate: terminate threads + """ + if terminate: + self._download_terminate = terminate + for thr in self._transfer_threads: + blobxfer.util.join_thread(thr) + + def _worker_thread_transfer(self): + # type: (Downloader) -> None + """Worker thread download + :param Downloader self: this + """ + max_set_len = self._general_options.concurrency.disk_threads << 2 + while not self.termination_check: + try: + if len(self._disk_set) > max_set_len: + time.sleep(0.1) + continue + else: + dd = self._transfer_queue.get(block=False, timeout=0.1) + except queue.Empty: + continue + try: + self._process_download_descriptor(dd) + except Exception as e: + with self._transfer_lock: + self._exceptions.append(e) + + def _worker_thread_disk(self): + # type: (Downloader) -> None + """Worker thread for disk + :param Downloader self: this + """ + while not self.termination_check: + try: + dd, offsets, data = self._disk_queue.get( + block=False, timeout=0.1) + except queue.Empty: + continue + try: + self._process_data(dd, offsets, data) + except Exception as e: + with self._transfer_lock: + self._exceptions.append(e) + + def _process_download_descriptor(self, dd): + # type: (Downloader, blobxfer.models.download.Descriptor) -> None + """Process download descriptor + :param Downloader self: this + :param blobxfer.models.download.Descriptor dd: download descriptor + """ + # update progress bar + self._update_progress_bar() + # get download offsets + offsets, resume_bytes = dd.next_offsets() + # add resume bytes to counter + if resume_bytes is not None: + with self._disk_operation_lock: + self._download_bytes_sofar += resume_bytes + logger.debug('adding {} sofar {} from {}'.format( + resume_bytes, self._download_bytes_sofar, dd._ase.name)) + del resume_bytes + # check if all operations completed + if offsets is None and dd.all_operations_completed: + finalize = True + # finalize integrity + dd.finalize_integrity() + # accounting + with self._transfer_lock: + sfpath = str(dd.final_path) + if dd.entity.is_encrypted: + self._dd_map.pop(sfpath) + self._transfer_set.remove( + blobxfer.operations.download.Downloader. + create_unique_transfer_operation_id(dd.entity)) + self._download_sofar += 1 + if dd.entity.vectored_io is not None: + if sfpath not in self._vio_map: + self._vio_map[sfpath] = 1 + else: + self._vio_map[sfpath] += 1 + if (self._vio_map[sfpath] == + dd.entity.vectored_io.total_slices): + self._vio_map.pop(sfpath) + else: + finalize = False + del sfpath + # finalize file + if finalize: + dd.finalize_file() + # remove from delete after set + try: + self._delete_after.remove(dd.final_path) + except KeyError: + pass + return + # re-enqueue for other threads to download + self._transfer_queue.put(dd) + if offsets is None: + return + # issue get range + if dd.entity.mode == blobxfer.models.azure.StorageModes.File: + data = blobxfer.operations.azure.file.get_file_range( + dd.entity, offsets, self._general_options.timeout_sec) + else: + data = blobxfer.operations.azure.blob.get_blob_range( + dd.entity, offsets, self._general_options.timeout_sec) + # enqueue data for processing + with self._disk_operation_lock: + self._disk_set.add( + blobxfer.operations.download.Downloader. 
+ create_unique_disk_operation_id(dd, offsets)) + self._disk_queue.put((dd, offsets, data)) + + def _process_data(self, dd, offsets, data): + # type: (Downloader, blobxfer.models.download.Descriptor, + # blobxfer.models.download.Offsets, bytes) -> None + """Process downloaded data for disk + :param Downloader self: this + :param blobxfer.models.download.Descriptor dd: download descriptor + :param blobxfer.models.download.Offsets offsets: offsets + :param bytes data: data to process + """ + # decrypt if necessary + if dd.entity.is_encrypted: + # slice data to proper bounds and get iv for chunk + if offsets.chunk_num == 0: + # set iv + iv = dd.entity.encryption_metadata.content_encryption_iv + # set data to decrypt + encdata = data + # send iv through hmac + dd.hmac_iv(iv) + else: + # set iv + iv = data[:blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES] + # set data to decrypt + encdata = data[blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] + # write encdata to disk for hmac later + _hmac_datafile = dd.write_unchecked_hmac_data( + offsets, encdata) + # decrypt data + if self._crypto_offload is not None: + self._crypto_offload.add_decrypt_chunk( + str(dd.final_path), dd.view.fd_start, offsets, + dd.entity.encryption_metadata.symmetric_key, + iv, _hmac_datafile) + # data will be integrity checked and written once + # retrieved from crypto queue + return + else: + data = blobxfer.operations.crypto.aes_cbc_decrypt_data( + dd.entity.encryption_metadata.symmetric_key, + iv, encdata, offsets.unpad) + dd.write_data(offsets, data) + else: + # write data to disk + dd.write_unchecked_data(offsets, data) + # finalize chunk + self._finalize_chunk(dd, offsets) + + def _finalize_chunk(self, dd, offsets): + # type: (Downloader, blobxfer.models.download.Descriptor, + # blobxfer.models.download.Offsets) -> None + """Finalize written chunk + :param Downloader self: this + :param blobxfer.models.download.Descriptor dd: download descriptor + :param blobxfer.models.download.Offsets offsets: offsets + """ + if dd.entity.is_encrypted: + dd.mark_unchecked_chunk_decrypted(offsets.chunk_num) + # integrity check data and write to disk (this is called + # regardless of md5/hmac enablement for resume purposes) + dd.perform_chunked_integrity_check() + # remove from disk set and add bytes to counter + with self._disk_operation_lock: + self._disk_set.remove( + blobxfer.operations.download.Downloader. + create_unique_disk_operation_id(dd, offsets)) + self._download_bytes_sofar += offsets.num_bytes + + def _cleanup_temporary_files(self): + # type: (Downloader) -> None + """Cleanup temporary files in case of an exception or interrupt. + This function is not thread-safe. 
+ :param Downloader self: this + """ + # iterate through dd map and cleanup files + for key in self._dd_map: + dd = self._dd_map[key] + try: + dd.cleanup_all_temporary_files() + except Exception as e: + logger.exception(e) + + def _catalog_local_files_for_deletion(self): + # type: (Downloader) -> None + """Catalog all local files if delete extraneous enabled + :param Downloader self: this + """ + if not (self._spec.options.delete_extraneous_destination and + self._spec.destination.is_dir): + return + dst = str(self._spec.destination.path) + for file in blobxfer.util.scantree(dst): + self._delete_after.add(pathlib.Path(file.path)) + + def _delete_extraneous_files(self): + # type: (Downloader) -> None + """Delete extraneous files cataloged + :param Downloader self: this + """ + logger.info('attempting to delete {} extraneous files'.format( + len(self._delete_after))) + for file in self._delete_after: + try: + file.unlink() + except OSError: + pass + + def _run(self): + # type: (Downloader) -> None + """Execute Downloader + :param Downloader self: this + """ + # mark start + self._start_time = blobxfer.util.datetime_now() + logger.info('blobxfer start time: {0}'.format(self._start_time)) + # ensure destination path + blobxfer.operations.download.Downloader.ensure_local_destination( + self._creds, self._spec) + logger.info('downloading blobs/files to local path: {}'.format( + self._spec.destination.path)) + self._catalog_local_files_for_deletion() + # initialize resume db if specified + if self._general_options.resume_file is not None: + self._resume = blobxfer.operations.resume.DownloadResumeManager( + self._general_options.resume_file) + # initialize MD5 processes + if (self._spec.options.check_file_md5 and + self._general_options.concurrency.md5_processes > 0): + self._md5_offload = blobxfer.operations.md5.LocalFileMd5Offload( + num_workers=self._general_options.concurrency.md5_processes) + self._md5_offload.initialize_check_thread( + self._check_for_downloads_from_md5) + # initialize crypto processes + if self._general_options.concurrency.crypto_processes > 0: + self._crypto_offload = blobxfer.operations.crypto.CryptoOffload( + num_workers=self._general_options.concurrency.crypto_processes) + self._crypto_offload.initialize_check_thread( + self._check_for_crypto_done) + # initialize download threads + self._initialize_transfer_threads() + self._initialize_disk_threads() + # initialize local counters + skipped_files = 0 + skipped_size = 0 + # iterate through source paths to download + for src in self._spec.sources: + for rfile in src.files( + self._creds, self._spec.options, self._general_options): + # form local path for remote file + if (not self._spec.destination.is_dir and + self._spec.options.rename): + lpath = pathlib.Path(self._spec.destination.path) + else: + lpath = pathlib.Path( + self._spec.destination.path, rfile.name) + # check on download conditions + action = self._check_download_conditions(lpath, rfile) + if action == DownloadAction.Skip: + skipped_files += 1 + skipped_size += rfile.size + continue + # add potential download to set + with self._transfer_lock: + self._transfer_set.add( + blobxfer.operations.download.Downloader. 
+ create_unique_transfer_operation_id(rfile)) + self._download_total += 1 + self._download_bytes_total += rfile.size + # either MD5 check or download now + if action == DownloadAction.CheckMd5: + self._pre_md5_skip_on_check(lpath, rfile) + elif action == DownloadAction.Download: + self._add_to_download_queue(lpath, rfile) + # set remote files processed + with self._md5_meta_lock: + self._all_remote_files_processed = True + with self._transfer_lock: + self._download_total -= skipped_files + self._download_bytes_total -= skipped_size + download_size_mib = ( + self._download_bytes_total / blobxfer.util.MEGABYTE + ) + logger.debug( + ('{0} remote files processed, waiting for download ' + 'completion of approx. {1:.4f} MiB').format( + self._download_total, download_size_mib)) + del skipped_files + del skipped_size + # wait for downloads to complete + self._wait_for_transfer_threads(terminate=False) + self._wait_for_disk_threads(terminate=False) + end_time = blobxfer.util.datetime_now() + # update progress bar + self._update_progress_bar() + # check for exceptions + if len(self._exceptions) > 0: + logger.error('exceptions encountered while downloading') + # raise the first one + raise self._exceptions[0] + # check for mismatches + if (self._download_sofar != self._download_total or + self._download_bytes_sofar != self._download_bytes_total): + raise RuntimeError( + 'download mismatch: [count={}/{} bytes={}/{}]'.format( + self._download_sofar, self._download_total, + self._download_bytes_sofar, self._download_bytes_total)) + # delete all remaining local files not accounted for if + # delete extraneous enabled + self._delete_extraneous_files() + # delete resume file if we've gotten this far + if self._resume is not None: + self._resume.delete() + # output throughput + if self._download_start_time is not None: + dltime = (end_time - self._download_start_time).total_seconds() + download_size_mib = ( + self._download_bytes_total / blobxfer.util.MEGABYTE + ) + dlmibspeed = download_size_mib / dltime + logger.info( + ('elapsed download + verify time and throughput of {0:.4f} ' + 'GiB: {1:.3f} sec, {2:.4f} Mbps ({3:.3f} MiB/sec)').format( + download_size_mib / 1024, dltime, dlmibspeed * 8, + dlmibspeed)) + end_time = blobxfer.util.datetime_now() + logger.info('blobxfer end time: {0} (elapsed: {1:.3f} sec)'.format( + end_time, (end_time - self._start_time).total_seconds())) + + def start(self): + # type: (Downloader) -> None + """Start the Downloader + :param Downloader self: this + """ + try: + blobxfer.operations.progress.output_parameters( + self._general_options, self._spec) + self._run() + except (KeyboardInterrupt, Exception) as ex: + if isinstance(ex, KeyboardInterrupt): + logger.info( + 'KeyboardInterrupt detected, force terminating ' + 'processes and threads (this may take a while)...') + try: + self._wait_for_disk_threads(terminate=True) + self._wait_for_transfer_threads(terminate=True) + finally: + self._cleanup_temporary_files() + raise + finally: + # shutdown processes + if self._md5_offload is not None: + self._md5_offload.finalize_processes() + if self._crypto_offload is not None: + self._crypto_offload.finalize_processes() + # close resume file + if self._resume is not None: + self._resume.close() diff --git a/blobxfer/operations/md5.py b/blobxfer/operations/md5.py new file mode 100644 index 0000000..f14431f --- /dev/null +++ b/blobxfer/operations/md5.py @@ -0,0 +1,177 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import logging +try: + import queue +except ImportError: # noqa + import Queue as queue +# non-stdlib imports +# local imports +import blobxfer.models.azure +import blobxfer.models.offload +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) +# global defines +_EMPTY_MAX_PAGE_SIZE_MD5 = 'tc+p1sj+vWGPkawoQ9UKHA==' +_MAX_PAGE_SIZE_BYTES = 4194304 + + +def compute_md5_for_file_asbase64( + filename, pagealign=False, start=None, end=None, blocksize=65536): + # type: (str, bool, int) -> str + """Compute MD5 hash for file and encode as Base64 + :param str filename: file to compute MD5 for + :param bool pagealign: page align data + :param int blocksize: block size + :rtype: str + :return: MD5 for file encoded as Base64 + """ + hasher = blobxfer.util.new_md5_hasher() + with open(filename, 'rb') as filedesc: + if start is not None: + filedesc.seek(start) + curr = start + else: + curr = 0 + while True: + if end is not None and curr + blocksize > end: + blocksize = end - curr + if blocksize == 0: + break + buf = filedesc.read(blocksize) + if not buf: + break + buflen = len(buf) + if pagealign and buflen < blocksize: + aligned = blobxfer.util.page_align_content_length(buflen) + if aligned != buflen: + buf = buf.ljust(aligned, b'\0') + hasher.update(buf) + curr += blocksize + return blobxfer.util.base64_encode_as_string(hasher.digest()) + + +def compute_md5_for_data_asbase64(data): + # type: (obj) -> str + """Compute MD5 hash for bits and encode as Base64 + :param any data: data to compute MD5 for + :rtype: str + :return: MD5 for data + """ + hasher = blobxfer.util.new_md5_hasher() + hasher.update(data) + return blobxfer.util.base64_encode_as_string(hasher.digest()) + + +def check_data_is_empty(data): + # type: (bytes) -> bool + """Check if data is empty via MD5 + :param bytes data: data to check + :rtype: bool + :return: if data is empty + """ + contentmd5 = compute_md5_for_data_asbase64(data) + datalen = len(data) + if datalen == _MAX_PAGE_SIZE_BYTES: + if contentmd5 == _EMPTY_MAX_PAGE_SIZE_MD5: + return True + else: + data_chk = b'\0' * datalen + if compute_md5_for_data_asbase64(data_chk) == contentmd5: + return True + return False + + +class 
LocalFileMd5Offload(blobxfer.models.offload._MultiprocessOffload): + """LocalFileMd5Offload""" + def __init__(self, num_workers): + # type: (LocalFileMd5Offload, int) -> None + """Ctor for Local File Md5 Offload + :param LocalFileMd5Offload self: this + :param int num_workers: number of worker processes + """ + super(LocalFileMd5Offload, self).__init__( + self._worker_process, num_workers, 'MD5') + + def _worker_process(self): + # type: (LocalFileMd5Offload) -> None + """Compute MD5 for local file + :param LocalFileMd5Offload self: this + """ + while not self.terminated: + try: + key, lpath, fpath, remote_md5, pagealign, lpview = \ + self._task_queue.get(True, 0.1) + except queue.Empty: + continue + if lpview is None: + start = None + end = None + size = None + else: + start = lpview.fd_start + end = lpview.fd_end + size = end - start + md5 = blobxfer.operations.md5.compute_md5_for_file_asbase64( + fpath, pagealign, start, end) + logger.debug('pre-transfer MD5 check: {} {} {}'.format( + md5, remote_md5, fpath)) + self._done_cv.acquire() + self._done_queue.put((key, lpath, size, md5 == remote_md5)) + self._done_cv.notify() + self._done_cv.release() + + def add_localfile_for_md5_check( + self, key, lpath, fpath, remote_md5, mode, lpview): + # type: (LocalFileMd5Offload, str, str, str, str, + # blobxfer.models.azure.StorageModes, object) -> None + """Add a local file to MD5 check queue + :param LocalFileMd5Offload self: this + :param str key: md5 map key + :param str lpath: "local" path for descriptor + :param str fpath: "final" path for/where file + :param str remote_md5: remote MD5 to compare against + :param blobxfer.models.azure.StorageModes mode: mode + :param object lpview: local path view + """ + if blobxfer.util.is_none_or_empty(remote_md5): + raise ValueError('comparison MD5 is empty for file {}'.format( + lpath)) + if mode == blobxfer.models.azure.StorageModes.Page: + pagealign = True + else: + pagealign = False + self._task_queue.put( + (key, lpath, fpath, remote_md5, pagealign, lpview) + ) diff --git a/blobxfer/operations/progress.py b/blobxfer/operations/progress.py new file mode 100644 index 0000000..0bf132e --- /dev/null +++ b/blobxfer/operations/progress.py @@ -0,0 +1,201 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import logging +import os +import platform +import sys +# non-stdlib imports +import azure.storage +import cryptography +import requests +# local imports +import blobxfer.models.download +import blobxfer.models.upload +import blobxfer.util +import blobxfer.version + +# create logger +logger = logging.getLogger(__name__) + + +def update_progress_bar( + go, optext, start, total_files, files_sofar, total_bytes, + bytes_sofar, stdin_upload=False): + # type: (blobxfer.models.options.General, str, datetime.datetime, int, + # int, int, int, bool) -> None + """Update the progress bar + :param blobxfer.models.options.General go: general options + :param str optext: operation prefix text + :param datetime.datetime start: start time + :param int total_files: total number of files + :param int files_sofar: files transfered so far + :param int total_bytes: total number of bytes + :param int bytes_sofar: bytes transferred so far + :param bool stdin_upload: stdin upload + """ + if (not go.progress_bar or blobxfer.util.is_none_or_empty(go.log_file) or + start is None): + return + diff = (blobxfer.util.datetime_now() - start).total_seconds() + if diff <= 0: + # arbitrarily give a small delta + diff = 1e-9 + if total_bytes is None or total_bytes == 0 or bytes_sofar > total_bytes: + done = 0 + else: + done = float(bytes_sofar) / total_bytes + rate = bytes_sofar / blobxfer.util.MEGABYTE / diff + if optext == 'synccopy': + rtext = 'sync-copied' + else: + rtext = optext + 'ed' + if total_files is None: + fprog = 'n/a' + else: + fprog = '{}/{}'.format(files_sofar, total_files) + if stdin_upload: + sys.stdout.write( + ('\r{0} progress: [{1:30s}] n/a % {2:12.3f} MiB/sec, ' + '{3} {4}').format( + optext, '>' * int(total_bytes % 30), rate, fprog, rtext) + ) + else: + sys.stdout.write( + ('\r{0} progress: [{1:30s}] {2:.2f}% {3:12.3f} MiB/sec, ' + '{4} {5}').format( + optext, '>' * int(done * 30), done * 100, rate, fprog, rtext) + ) + if files_sofar == total_files: + sys.stdout.write(os.linesep) + sys.stdout.flush() + + +def output_parameters(general_options, spec): + # type: (blobxfer.models.options.General, object) -> None + """Output parameters + :param blobxfer.models.options.General general_options: general options + :param object spec: upload or download spec + """ + sep = '============================================' + log = [] + log.append(sep) + log.append(' Azure blobxfer parameters') + log.append(sep) + log.append(' blobxfer version: {}'.format( + blobxfer.version.__version__)) + log.append(' platform: {}'.format(platform.platform())) + log.append( + ' components: {}={} az.stor={} crypt={} req={}'.format( + platform.python_implementation(), + platform.python_version(), + azure.storage._constants.__version__, + cryptography.__version__, + requests.__version__,)) + # specific preamble + if isinstance(spec, blobxfer.models.download.Specification): + log.append(' transfer direction: {}'.format('Azure -> local')) + log.append( + (' workers: disk={} xfer={} md5={} ' + 'crypto={}').format( + general_options.concurrency.disk_threads, + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.options.check_file_md5 else 0, + general_options.concurrency.crypto_processes)) + elif isinstance(spec, 
blobxfer.models.upload.Specification): + log.append(' transfer direction: {}'.format('local -> Azure')) + log.append( + (' workers: disk={} xfer={} md5={} ' + 'crypto={}').format( + general_options.concurrency.disk_threads, + general_options.concurrency.transfer_threads, + general_options.concurrency.md5_processes + if spec.skip_on.md5_match or + spec.options.store_file_properties.md5 else 0, + 0)) + + # TODO handle synccopy spec + + # common block + log.append(' log file: {}'.format( + general_options.log_file)) + log.append(' resume file: {}'.format( + general_options.resume_file)) + log.append(' timeout: {}'.format( + general_options.timeout_sec)) + log.append(' mode: {}'.format( + spec.options.mode)) + log.append( + ' skip on: fs_match={} lmt_ge={} md5={}'.format( + spec.skip_on.filesize_match, + spec.skip_on.lmt_ge, + spec.skip_on.md5_match)) + log.append(' chunk size bytes: {}'.format( + spec.options.chunk_size_bytes)) + log.append(' delete extraneous: {}'.format( + spec.options.delete_extraneous_destination)) + log.append(' overwrite: {}'.format( + spec.options.overwrite)) + log.append(' recursive: {}'.format( + spec.options.recursive)) + + # TODO only output rename single if not synccopy + log.append(' rename single: {}'.format( + spec.options.rename)) + + # specific epilog + if isinstance(spec, blobxfer.models.download.Specification): + log.append(' compute file md5: {}'.format( + spec.options.check_file_md5)) + log.append(' restore file attributes: {}'.format( + spec.options.restore_file_attributes)) + log.append(' rsa private key: {}'.format( + 'Loaded' if spec.options.rsa_private_key else 'None')) + log.append(' local destination: {}'.format( + spec.destination.path)) + elif isinstance(spec, blobxfer.models.upload.Specification): + log.append(' one shot bytes: {}'.format( + spec.options.one_shot_bytes)) + log.append(' store properties: attr={} md5={}'.format( + spec.options.store_file_properties.attributes, + spec.options.store_file_properties.md5)) + log.append(' rsa public key: {}'.format( + 'Loaded' if spec.options.rsa_public_key else 'None')) + log.append(' local source paths: {}'.format( + ' '.join([str(src) for src in spec.sources.paths]))) + log.append(sep) + log = os.linesep.join(log) + if blobxfer.util.is_not_empty(general_options.log_file): + print(log) + else: + logger.info('{}{}'.format(os.linesep, log)) diff --git a/blobxfer/operations/resume.py b/blobxfer/operations/resume.py new file mode 100644 index 0000000..88172e4 --- /dev/null +++ b/blobxfer/operations/resume.py @@ -0,0 +1,229 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import contextlib +import logging +import pickle +import shelve +import threading +# non-stdlib imports +# local imports +import blobxfer.models.resume +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +class _BaseResumeManager(object): + """Base Resume Manager""" + def __init__(self, resume_file): + # type: (_BaseResumeManager, str) -> None + """Ctor for _BaseResumeManager + :param _BaseResumeManager self: this + :param pathlib.Path resume_file: resume file + """ + self._lock = threading.Lock() + self._resume_file = resume_file + self._data = shelve.open( + str(resume_file), protocol=pickle.HIGHEST_PROTOCOL) + + def close(self): + # type: (_BaseResumeManager) -> None + """Close the internal data store + :param _BaseResumeManager self: this + """ + if self._data is not None: + self._data.close() + self._data = None + + def delete(self): + # type: (_BaseResumeManager) -> None + """Delete the resume file db + :param _BaseResumeManager self: this + """ + self.close() + try: + self._resume_file.unlink() + except OSError as e: + logger.warning('could not unlink resume db: {}'.format(e)) + + @contextlib.contextmanager + def datalock(self, acquire=True): + # type: (_BaseResumeManager) -> None + """Delete the resume file db + :param _BaseResumeManager self: this + :param bool acquire: acquire lock + """ + if acquire: + self._lock.acquire() + try: + yield + finally: + if acquire: + self._lock.release() + + @staticmethod + def generate_record_key(ase): + # type: (blobxfer.models.azure.StorageEntity) -> str + """Generate a record key + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :rtype: str + :return: record key + """ + key = '{}:{}'.format(ase._client.primary_endpoint, ase.path) + if blobxfer.util.on_python2(): + return key.encode('utf8') + else: + return key + + def get_record(self, ase, key=None, lock=True): + # type: (_BaseResumeManager, str, bool) -> object + """Get a resume record + :param _BaseResumeManager self: this + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :param str key: record key + :param bool lock: acquire lock + :rtype: object + :return: resume record object + """ + if key is None: + key = blobxfer.operations.resume._BaseResumeManager.\ + generate_record_key(ase) + with self.datalock(lock): + try: + return self._data[key] + except KeyError: + return None + + +class DownloadResumeManager(_BaseResumeManager): + """Download Resume Manager""" + def __init__(self, resume_file): + # type: (DownloadResumeManager, str) -> None + """Ctor for DownloadResumeManager + :param DownloadResumeManager self: this + :param pathlib.Path resume_file: resume file + """ + super(DownloadResumeManager, self).__init__(resume_file) + + def add_or_update_record( + self, final_path, ase, chunk_size, next_integrity_chunk, + completed, md5): + # type: (DownloadResumeManager, pathlib.Path, + # blobxfer.models.azure.StorageEntity, int, int, bool, + # str) -> None + """Add or update a resume record 
+ :param DownloadResumeManager self: this + :param pathlib.Path final_path: final path + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :param int chunk_size: chunk size in bytes + :param int next_integrity_chunk: next integrity chunk + :param bool completed: if completed + :param str md5: md5 hex digest + """ + key = blobxfer.operations.resume._BaseResumeManager.\ + generate_record_key(ase) + with self.datalock(): + dl = self.get_record(ase, key=key, lock=False) + if dl is None: + dl = blobxfer.models.resume.Download( + final_path=str(final_path), + length=ase._size, + chunk_size=chunk_size, + next_integrity_chunk=next_integrity_chunk, + completed=completed, + md5=md5, + ) + else: + if (dl.completed or + next_integrity_chunk < dl.next_integrity_chunk): + return + if completed: + dl.completed = completed + else: + dl.next_integrity_chunk = next_integrity_chunk + dl.md5hexdigest = md5 + self._data[key] = dl + self._data.sync() + + +class UploadResumeManager(_BaseResumeManager): + """Upload Resume Manager""" + def __init__(self, resume_file): + # type: (UploadResumeManager, str) -> None + """Ctor for UploadResumeManager + :param UploadResumeManager self: this + :param pathlib.Path resume_file: resume file + """ + super(UploadResumeManager, self).__init__(resume_file) + + def add_or_update_record( + self, local_path, ase, chunk_size, total_chunks, completed_chunks, + completed, md5): + # type: (UploadResumeManager, pathlib.Path, + # blobxfer.models.azure.StorageEntity, int, int, int, bool, + # str) -> None + """Add or update a resume record + :param UploadResumeManager self: this + :param pathlib.Path local_path: local path + :param blobxfer.models.azure.StorageEntity ase: Storage Entity + :param int chunk_size: chunk size in bytes + :param int total_chunks: total chunks + :param int completed_chunks: completed chunks bitarray + :param bool completed: if completed + :param str md5: md5 hex digest + """ + key = blobxfer.operations.resume._BaseResumeManager.\ + generate_record_key(ase) + with self.datalock(): + ul = self.get_record(ase, key=key, lock=False) + if ul is None: + ul = blobxfer.models.resume.Upload( + local_path=str(local_path), + length=ase._size, + chunk_size=chunk_size, + total_chunks=total_chunks, + completed_chunks=completed_chunks, + completed=completed, + md5=md5, + ) + else: + if ul.completed or completed_chunks == ul.completed_chunks: + return + if completed: + ul.completed = completed + else: + ul.completed_chunks = completed_chunks + ul.md5hexdigest = md5 + self._data[key] = ul + self._data.sync() diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py new file mode 100644 index 0000000..9db2863 --- /dev/null +++ b/blobxfer/operations/upload.py @@ -0,0 +1,1156 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import enum +import logging +import math +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +try: + import queue +except ImportError: # noqa + import Queue as queue +import threading +import time +# non-stdlib imports +# local imports +import blobxfer.models.crypto +import blobxfer.models.metadata +import blobxfer.operations.azure.blob +import blobxfer.operations.azure.blob.append +import blobxfer.operations.azure.blob.block +import blobxfer.operations.azure.blob.page +import blobxfer.operations.azure.file +import blobxfer.operations.crypto +import blobxfer.operations.md5 +import blobxfer.operations.progress +import blobxfer.operations.resume +import blobxfer.util + +# create logger +logger = logging.getLogger(__name__) + + +class UploadAction(enum.Enum): + Skip = 1 + CheckMd5 = 2 + Upload = 3 + + +class Uploader(object): + """Uploader""" + def __init__(self, general_options, creds, spec): + # type: (Uploader, blobxfer.models.options.General, + # blobxfer.operations.azure.StorageCredentials, + # blobxfer.models.upload.Specification) -> None + """Ctor for Uploader + :param Uploader self: this + :param blobxfer.models.options.General general_options: general opts + :param blobxfer.operations.azure.StorageCredentials creds: creds + :param blobxfer.models.uplaod.Specification spec: upload spec + """ + self._all_files_processed = False + self._crypto_offload = None + self._md5_meta_lock = threading.Lock() + self._md5_map = {} + self._md5_offload = None + self._upload_lock = threading.Lock() + self._upload_queue = queue.Queue() + self._upload_set = set() + self._upload_start_time = None + self._disk_threads = [] + self._upload_total = 0 + self._upload_sofar = 0 + self._upload_bytes_total = 0 + self._upload_bytes_sofar = 0 + self._upload_terminate = False + self._transfer_lock = threading.Lock() + self._transfer_queue = queue.Queue() + self._transfer_set = set() + self._transfer_threads = [] + self._start_time = None + self._delete_exclude = set() + self._ud_map = {} + self._containers_created = set() + self._fileshare_dir_lock = threading.Lock() + self._dirs_created = {} + self._general_options = general_options + self._creds = creds + self._spec = spec + self._resume = None + self._exceptions = [] + + @property + def termination_check(self): + # type: (Uploader) -> bool + """Check if terminated + :param Uploader self: this + :rtype: bool + :return: if terminated + """ + with self._upload_lock: + with self._transfer_lock: + return (self._upload_terminate or + len(self._exceptions) > 0 or + (self._all_files_processed and + len(self._upload_set) == 0 and + len(self._transfer_set) == 0)) + + @property + def termination_check_md5(self): + # type: (Uploader) -> bool + """Check if terminated from MD5 context + :param 
Uploader self: this + :rtype: bool + :return: if terminated from MD5 context + """ + with self._md5_meta_lock: + with self._upload_lock: + return (self._upload_terminate or + (self._all_files_processed and + len(self._md5_map) == 0 and + len(self._upload_set) == 0)) + + @staticmethod + def create_unique_id(src, ase): + # type: (blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity) -> str + """Create a unique id given a LocalPath and StorageEntity + :param blobxfer.models.upload.LocalPath src: local path + :param blobxfer.models.azure.StorageEntity ase: azure storage entity + :rtype: str + :return: unique id for pair + """ + return ';'.join( + (str(src.absolute_path), ase._client.primary_endpoint, ase.path) + ) + + @staticmethod + def create_unique_transfer_id(local_path, ase, offsets): + # type: (blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity) -> str + """Create a unique transfer id given a offsets + :param blobxfer.models.upload.LocalPath local_path: local path + :param blobxfer.models.azure.StorageEntity ase: azure storage entity + :param blobxfer.models.upload.Offsets offsets: upload offsets + :rtype: str + :return: unique id for transfer + """ + return ';'.join( + (str(local_path.absolute_path), ase._client.primary_endpoint, + ase.path, str(local_path.view.fd_start), str(offsets.range_start)) + ) + + @staticmethod + def create_deletion_id(client, container, name): + # type: (azure.storage.StorageClient, str, str) -> str + """Create a unique deletion id + :param azure.storage.StorageClient client: storage client + :param str container: container name + :param str name: entity name + :rtype: str + :return: unique id for deletion + """ + return ';'.join((client.primary_endpoint, container, name)) + + @staticmethod + def append_slice_suffix_to_name(name, slice): + # type: (str, int) -> str + """Append a vectored io (stripe) slice suffix to a given name + :param str name: entity name + :param int slice: slice num + :rtype: str + :return: name with appended suffix + """ + return '{}.bxslice-{}'.format(name, slice) + + def _update_progress_bar(self, stdin=False): + # type: (Uploader, bool) -> None + """Update progress bar + :param Uploader self: this + :param bool stdin: stdin upload + """ + if not self._all_files_processed: + return + blobxfer.operations.progress.update_progress_bar( + self._general_options, + 'upload', + self._upload_start_time, + self._upload_total, + self._upload_sofar, + self._upload_bytes_total, + self._upload_bytes_sofar, + stdin_upload=stdin, + ) + + def _pre_md5_skip_on_check(self, src, rfile): + # type: (Uploader, blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity) -> None + """Perform pre MD5 skip on check + :param Uploader self: this + :param blobxfer.models.upload.LocalPath src: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + """ + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) + key = blobxfer.operations.upload.Uploader.create_unique_id(src, rfile) + with self._md5_meta_lock: + self._md5_map[key] = (src, rfile) + self._md5_offload.add_localfile_for_md5_check( + key, None, str(src.absolute_path), md5, rfile.mode, src.view) + + def _post_md5_skip_on_check(self, key, md5_match): + # type: (Uploader, str, bool) -> None + """Perform post MD5 skip on check + :param Uploader self: this + :param str key: md5 map key + :param bool md5_match: if MD5 matches + """ + with self._md5_meta_lock: + src, rfile = self._md5_map.pop(key) + uid = 
blobxfer.operations.upload.Uploader.create_unique_id(src, rfile)
+        if md5_match:
+            with self._upload_lock:
+                self._upload_set.remove(uid)
+                self._upload_total -= 1
+        else:
+            self._add_to_upload_queue(src, rfile, uid)
+
+    def _check_for_uploads_from_md5(self):
+        # type: (Uploader) -> None
+        """Check queue for a file to upload
+        :param Uploader self: this
+        """
+        cv = self._md5_offload.done_cv
+        while not self.termination_check_md5:
+            result = None
+            cv.acquire()
+            while True:
+                result = self._md5_offload.pop_done_queue()
+                if result is None:
+                    # use cv timeout due to possible non-wake while running
+                    cv.wait(1)
+                    # check for terminating conditions
+                    if self.termination_check_md5:
+                        break
+                else:
+                    break
+            cv.release()
+            if result is not None:
+                self._post_md5_skip_on_check(result[0], result[3])
+
+    def _add_to_upload_queue(self, src, rfile, uid):
+        # type: (Uploader, blobxfer.models.upload.LocalPath,
+        #        blobxfer.models.azure.StorageEntity, str) -> None
+        """Add a file to the upload queue
+        :param Uploader self: this
+        :param blobxfer.models.upload.LocalPath src: local path
+        :param blobxfer.models.azure.StorageEntity rfile: remote file
+        :param str uid: unique id
+        """
+        # prepare local file for upload
+        ud = blobxfer.models.upload.Descriptor(
+            src, rfile, uid, self._spec.options, self._resume)
+        if ud.entity.is_encrypted:
+            with self._upload_lock:
+                self._ud_map[uid] = ud
+        # add upload descriptor to queue
+        self._upload_queue.put(ud)
+        if self._upload_start_time is None:
+            with self._upload_lock:
+                if self._upload_start_time is None:
+                    self._upload_start_time = blobxfer.util.datetime_now()
+
+    def _initialize_disk_threads(self):
+        # type: (Uploader) -> None
+        """Initialize disk threads
+        :param Uploader self: this
+        """
+        logger.debug('spawning {} disk threads'.format(
+            self._general_options.concurrency.disk_threads))
+        for _ in range(self._general_options.concurrency.disk_threads):
+            thr = threading.Thread(target=self._worker_thread_upload)
+            self._disk_threads.append(thr)
+            thr.start()
+
+    def _initialize_transfer_threads(self):
+        # type: (Uploader) -> None
+        """Initialize transfer threads
+        :param Uploader self: this
+        """
+        logger.debug('spawning {} transfer threads'.format(
+            self._general_options.concurrency.transfer_threads))
+        for _ in range(self._general_options.concurrency.transfer_threads):
+            thr = threading.Thread(target=self._worker_thread_transfer)
+            self._transfer_threads.append(thr)
+            thr.start()
+
+    def _wait_for_disk_threads(self, terminate):
+        # type: (Uploader, bool) -> None
+        """Wait for disk threads
+        :param Uploader self: this
+        :param bool terminate: terminate threads
+        """
+        if terminate:
+            self._upload_terminate = terminate
+        for thr in self._disk_threads:
+            thr.join()
+
+    def _wait_for_transfer_threads(self, terminate):
+        # type: (Uploader, bool) -> None
+        """Wait for transfer threads
+        :param Uploader self: this
+        :param bool terminate: terminate threads
+        """
+        if terminate:
+            self._upload_terminate = terminate
+        for thr in self._transfer_threads:
+            thr.join()
+
+    def _worker_thread_transfer(self):
+        # type: (Uploader) -> None
+        """Worker thread transfer
+        :param Uploader self: this
+        """
+        while not self.termination_check:
+            try:
+                ud, ase, offsets, data = self._transfer_queue.get(
+                    block=False, timeout=0.1)
+            except queue.Empty:
+                continue
+            try:
+                self._process_transfer(ud, ase, offsets, data)
+            except Exception as e:
+                with self._upload_lock:
+                    self._exceptions.append(e)
+
+    def _process_transfer(self, ud, ase, offsets, data):
+        # type: 
(Uploader, blobxfer.models.upload.Descriptor, + # blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes) -> None + """Process transfer instructions + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param blobxfer.models.azure.StorageEntity ase: Storage entity + :param blobxfer.models.upload.Offsets offsets: offsets + :param bytes data: data to upload + """ + # issue put range + self._put_data(ud, ase, offsets, data) + # accounting + with self._transfer_lock: + if ud.local_path.use_stdin: + self._upload_bytes_total += offsets.num_bytes + elif offsets.chunk_num == 0: + self._upload_bytes_total += ase.size + self._upload_bytes_sofar += offsets.num_bytes + self._transfer_set.remove( + blobxfer.operations.upload.Uploader.create_unique_transfer_id( + ud.local_path, ase, offsets)) + ud.complete_offset_upload(offsets.chunk_num) + # add descriptor back to upload queue only for append blobs + if ud.entity.mode == blobxfer.models.azure.StorageModes.Append: + self._upload_queue.put(ud) + # update progress bar + self._update_progress_bar(stdin=ud.local_path.use_stdin) + + def _put_data(self, ud, ase, offsets, data): + # type: (Uploader, blobxfer.models.upload.Descriptor, + # blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets, bytes) -> None + """Put data in Azure + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param blobxfer.models.azure.StorageEntity ase: Storage entity + :param blobxfer.models.upload.Offsets offsets: offsets + :param bytes data: data to upload + """ + if ase.mode == blobxfer.models.azure.StorageModes.Append: + # append block + if data is not None: + blobxfer.operations.azure.blob.append.append_block( + ase, data, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.Block: + # handle one-shot uploads + if ud.is_one_shot_block_blob: + metadata = ud.generate_metadata() + if not ud.entity.is_encrypted and ud.must_compute_md5: + digest = blobxfer.util.base64_encode_as_string( + ud.md5.digest()) + else: + digest = None + blobxfer.operations.azure.blob.block.create_blob( + ase, data, digest, metadata, + timeout=self._general_options.timeout_sec) + return + # upload block + blobxfer.operations.azure.blob.block.put_block( + ase, offsets, data, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.File: + # upload range + if data is not None: + blobxfer.operations.azure.file.put_file_range( + ase, offsets, data, + timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.Page: + if data is None: + return + # compute aligned size + aligned = blobxfer.util.page_align_content_length( + offsets.num_bytes) + # align page + if aligned != offsets.num_bytes: + data = data.ljust(aligned, b'\0') + if blobxfer.operations.md5.check_data_is_empty(data): + return + # upload page + blobxfer.operations.azure.blob.page.put_page( + ase, offsets.range_start, offsets.range_start + aligned - 1, + data, timeout=self._general_options.timeout_sec) + + def _worker_thread_upload(self): + # type: (Uploader) -> None + """Worker thread upload + :param Uploader self: this + """ + max_set_len = self._general_options.concurrency.transfer_threads << 2 + while not self.termination_check: + try: + if len(self._transfer_set) > max_set_len: + time.sleep(0.1) + continue + else: + ud = self._upload_queue.get(block=False, timeout=0.1) + except queue.Empty: + continue + 
try: + self._process_upload_descriptor(ud) + except Exception as e: + with self._upload_lock: + self._exceptions.append(e) + + def _prepare_upload(self, ase, offsets): + # type: (Uploader, blobxfer.models.azure.StorageEntity, + # blobxfer.models.upload.Offsets) -> None + """Prepare upload + :param Uploader self: this + :param blobxfer.models.azure.StorageEntity ase: Storage entity + :param blobxfer.models.upload.Offsets offsets: offsets + """ + if ase.mode == blobxfer.models.azure.StorageModes.Append: + if ase.append_create: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # create remote blob + blobxfer.operations.azure.blob.append.create_blob( + ase, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.Block: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.File: + # create share directory structure + with self._fileshare_dir_lock: + # create container if necessary + blobxfer.operations.azure.file.create_share( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # create parent directories + blobxfer.operations.azure.file.create_all_parent_directories( + ase, self._dirs_created, + timeout=self._general_options.timeout_sec) + # create remote file + blobxfer.operations.azure.file.create_file( + ase, timeout=self._general_options.timeout_sec) + elif ase.mode == blobxfer.models.azure.StorageModes.Page: + # create container if necessary + blobxfer.operations.azure.blob.create_container( + ase, self._containers_created, + timeout=self._general_options.timeout_sec) + # create remote blob + blobxfer.operations.azure.blob.page.create_blob( + ase, timeout=self._general_options.timeout_sec) + + def _process_upload_descriptor(self, ud): + # type: (Uploader, blobxfer.models.upload.Descriptor) -> None + """Process upload descriptor + :param Uploader self: this + :param blobxfer.models.upload.Descriptor: upload descriptor + """ + # get upload offsets + offsets, resume_bytes = ud.next_offsets() + # add resume bytes to counter + if resume_bytes is not None: + with self._transfer_lock: + self._upload_bytes_total += ud.entity.size + self._upload_bytes_sofar += resume_bytes + logger.debug('adding {} sofar {} from {}'.format( + resume_bytes, self._upload_bytes_sofar, ud._ase.name)) + del resume_bytes + # check if all operations completed + if offsets is None and ud.all_operations_completed: + # finalize file + self._finalize_upload(ud) + # accounting + with self._upload_lock: + if ud.entity.is_encrypted: + self._ud_map.pop(ud.unique_id) + self._upload_set.remove(ud.unique_id) + self._upload_sofar += 1 + return + # if nothing to upload, re-enqueue for finalization + if offsets is None: + self._upload_queue.put(ud) + return + # prepare upload + if offsets.chunk_num == 0: + self._prepare_upload(ud.entity, offsets) + # encrypt if necessary + if ud.entity.is_encrypted and ud.entity.size > 0: + # send iv through hmac if first chunk + if offsets.chunk_num == 0: + ud.hmac_data(ud.current_iv) + # encrypt data + if self._crypto_offload is None: + # read data from file and encrypt + data, _ = ud.read_data(offsets) + encdata = blobxfer.operations.crypto.aes_cbc_encrypt_data( + ud.entity.encryption_metadata.symmetric_key, + ud.current_iv, data, offsets.pad) + # send 
encrypted data through hmac + ud.hmac_data(encdata) + data = encdata + # save last 16 encrypted bytes for next IV + ud.current_iv = \ + encdata[-blobxfer.models.crypto.AES256_BLOCKSIZE_BYTES:] + else: # noqa + # crypto offload is not supported with AES256-CBC FullBlob + raise NotImplementedError() + # self._crypto_offload.add_encrypt_chunk( + # str(ud.local_path.absolute_path), offsets, + # ud.entity.encryption_metadata.symmetric_key, + # ud.current_iv) + # encrypted data will be retrieved from a temp file once + # retrieved from crypto queue + # return_early = True + else: + data, newoffset = ud.read_data(offsets) + # set new offset if stdin + if newoffset is not None: + offsets = newoffset + # re-enqueue for other threads to upload if not append + if ud.entity.mode != blobxfer.models.azure.StorageModes.Append: + self._upload_queue.put(ud) + # no data can be returned on stdin uploads + if not data: + return + # add data to transfer queue + with self._transfer_lock: + self._transfer_set.add( + blobxfer.operations.upload.Uploader.create_unique_transfer_id( + ud.local_path, ud.entity, offsets)) + self._transfer_queue.put((ud, ud.entity, offsets, data)) + # iterate replicas + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + with self._transfer_lock: + self._transfer_set.add( + blobxfer.operations.upload.Uploader. + create_unique_transfer_id(ud.local_path, ase, offsets) + ) + self._transfer_queue.put((ud, ase, offsets, data)) + + def _finalize_block_blob(self, ud, metadata): + """Finalize Block blob + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + if not ud.entity.is_encrypted and ud.must_compute_md5: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + else: + digest = None + blobxfer.operations.azure.blob.block.put_block_list( + ud.entity, ud.last_block_num, digest, metadata, + timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.block.put_block_list( + ase, ud.last_block_num, digest, metadata, + timeout=self._general_options.timeout_sec) + + def _set_blob_md5(self, ud): + """Set blob MD5 + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + """ + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.blob.set_blob_md5( + ud.entity, digest, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.set_blob_md5( + ase, digest, timeout=self._general_options.timeout_sec) + + def _set_blob_metadata(self, ud, metadata): + """Set blob metadata + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + blobxfer.operations.azure.blob.set_blob_metadata( + ud.entity, metadata, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.blob.set_blob_metadata( + ase, metadata, timeout=self._general_options.timeout_sec) + + def _finalize_nonblock_blob(self, ud, metadata): + """Finalize Non-Block blob + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + # set md5 
page blob property if required + if ud.requires_non_encrypted_md5_put: + self._set_blob_md5(ud) + # set metadata if needed + if blobxfer.util.is_not_empty(metadata): + self._set_blob_metadata(ud, metadata) + + def _finalize_azure_file(self, ud, metadata): + # type: (Uploader, blobxfer.models.upload.Descriptor, dict) -> None + """Finalize Azure File + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + :param dict metadata: metadata dict + """ + # set md5 file property if required + if ud.requires_non_encrypted_md5_put: + digest = blobxfer.util.base64_encode_as_string(ud.md5.digest()) + blobxfer.operations.azure.file.set_file_md5( + ud.entity, digest, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.file.set_file_md5( + ase, digest, timeout=self._general_options.timeout_sec) + # set file metadata if needed + if blobxfer.util.is_not_empty(metadata): + blobxfer.operations.azure.file.set_file_metadata( + ud.entity, metadata, timeout=self._general_options.timeout_sec) + if blobxfer.util.is_not_empty(ud.entity.replica_targets): + for ase in ud.entity.replica_targets: + blobxfer.operations.azure.file.set_file_metadata( + ase, metadata, + timeout=self._general_options.timeout_sec) + + def _finalize_upload(self, ud): + # type: (Uploader, blobxfer.models.upload.Descriptor) -> None + """Finalize file upload + :param Uploader self: this + :param blobxfer.models.upload.Descriptor ud: upload descriptor + """ + metadata = ud.generate_metadata() + if ud.requires_put_block_list: + # put block list for non one-shot block blobs + self._finalize_block_blob(ud, metadata) + elif ud.remote_is_page_blob or ud.remote_is_append_blob: + # append and page blob finalization + self._finalize_nonblock_blob(ud, metadata) + elif ud.remote_is_file: + # azure file finalization + self._finalize_azure_file(ud, metadata) + + def _get_destination_paths(self): + # type: (Uploader) -> + # Tuple[blobxfer.operations.azure.StorageAccount, str, str] + """Get destination paths + :param Uploader self: this + :rtype: tuple + :return: (storage account, container, name) + """ + for dst in self._spec.destinations: + for dpath in dst.paths: + sdpath = str(dpath) + cont, dir = blobxfer.util.explode_azure_path(sdpath) + sa = self._creds.get_storage_account( + dst.lookup_storage_account(sdpath)) + yield sa, cont, dir, dpath + + def _delete_extraneous_files(self): + # type: (Uploader) -> None + """Delete extraneous files on the remote + :param Uploader self: this + """ + if not self._spec.options.delete_extraneous_destination: + return + # list blobs for all destinations + checked = set() + deleted = 0 + for sa, container, _, _ in self._get_destination_paths(): + key = ';'.join((sa.name, sa.endpoint, container)) + if key in checked: + continue + logger.debug( + 'attempting to delete extraneous blobs/files from: {}'.format( + key)) + if (self._spec.options.mode == + blobxfer.models.azure.StorageModes.File): + files = blobxfer.operations.azure.file.list_all_files( + sa.file_client, container, + timeout=self._general_options.timeout_sec) + for file in files: + id = blobxfer.operations.upload.Uploader.\ + create_deletion_id(sa.file_client, container, file) + if id not in self._delete_exclude: + blobxfer.operations.azure.file.delete_file( + sa.file_client, container, file, + timeout=self._general_options.timeout_sec) + deleted += 1 + else: + blobs = 
blobxfer.operations.azure.blob.list_all_blobs( + sa.block_blob_client, container, + timeout=self._general_options.timeout_sec) + for blob in blobs: + id = blobxfer.operations.upload.Uploader.\ + create_deletion_id( + sa.block_blob_client, container, blob.name) + if id not in self._delete_exclude: + blobxfer.operations.azure.blob.delete_blob( + sa.block_blob_client, container, blob.name, + timeout=self._general_options.timeout_sec) + deleted += 1 + checked.add(key) + logger.info('deleted {} extraneous blobs/files'.format(deleted)) + + def _check_upload_conditions(self, local_path, rfile): + # type: (Uploader, blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity) -> UploadAction + """Check for upload conditions + :param Uploader self: this + :param blobxfer.models.LocalPath local_path: local path + :param blobxfer.models.azure.StorageEntity rfile: remote file + :rtype: UploadAction + :return: upload action + """ + lpath = local_path.absolute_path + # check if local file still exists + if not local_path.use_stdin and not lpath.exists(): + return UploadAction.Skip + # if remote file doesn't exist, upload + if rfile is None or rfile.from_local: + return UploadAction.Upload + # check overwrite option + if not self._spec.options.overwrite: + if rfile.mode == blobxfer.models.azure.StorageModes.Append: + rfile.append_create = False + return UploadAction.Upload + logger.info( + 'not overwriting remote file: {} (local: {})'.format( + rfile.path, lpath)) + return UploadAction.Skip + # check skip on options, MD5 match takes priority + md5 = blobxfer.models.metadata.get_md5_from_metadata(rfile) + if self._spec.skip_on.md5_match and blobxfer.util.is_not_empty(md5): + return UploadAction.CheckMd5 + # if neither of the remaining skip on actions are activated, upload + if (not self._spec.skip_on.filesize_match and + not self._spec.skip_on.lmt_ge): + return UploadAction.Upload + # check skip on file size match + ul_fs = None + if self._spec.skip_on.filesize_match: + lsize = local_path.size + if rfile.mode == blobxfer.models.azure.StorageModes.Page: + lsize = blobxfer.util.page_align_content_length(lsize) + if rfile.size == lsize: + ul_fs = False + else: + ul_fs = True + # check skip on lmt ge + ul_lmt = None + if self._spec.skip_on.lmt_ge: + mtime = blobxfer.util.datetime_from_timestamp(local_path.lmt) + if rfile.lmt >= mtime: + ul_lmt = False + else: + ul_lmt = True + # upload if either skip on mismatch is True + if ul_fs or ul_lmt: + return UploadAction.Upload + else: + return UploadAction.Skip + + def _check_for_existing_remote(self, sa, cont, name): + # type: (Uploader, blobxfer.operations.azure.StorageAccount, + # str, str) -> bobxfer.models.azure.StorageEntity + """Check for an existing remote file + :param Uploader self: this + :param blobxfer.operations.azure.StorageAccount sa: storage account + :param str cont: container + :param str name: entity name + """ + if self._spec.options.mode == blobxfer.models.azure.StorageModes.File: + fp = blobxfer.operations.azure.file.get_file_properties( + sa.file_client, cont, name, + timeout=self._general_options.timeout_sec) + else: + fp = blobxfer.operations.azure.blob.get_blob_properties( + sa.block_blob_client, cont, name, self._spec.options.mode, + timeout=self._general_options.timeout_sec) + if fp is not None: + if blobxfer.models.crypto.EncryptionMetadata.\ + encryption_metadata_exists(fp.metadata): + ed = blobxfer.models.crypto.EncryptionMetadata() + ed.convert_from_json(fp.metadata, fp.name, None) + else: + ed = None + ase = 
blobxfer.models.azure.StorageEntity(cont, ed) + if (self._spec.options.mode == + blobxfer.models.azure.StorageModes.File): + dir, _ = blobxfer.operations.azure.file.parse_file_path(name) + ase.populate_from_file(sa, fp, dir) + else: + ase.populate_from_blob(sa, fp) + else: + ase = None + return ase + + def _generate_destination_for_source(self, local_path): + # type: (Uploader, blobxfer.models.upload.LocalSourcePath) -> + # Tuple[blobxfer.operations.azure.StorageAccount, + # blobxfer.models.azure.StorageEntity) + """Generate entities for source path + :param Uploader self: this + :param blobxfer.models.upload.LocalSourcePath local_path: local path + :rtype: tuple + :return: storage account, storage entity + """ + # construct stripped destination path + spath = local_path.relative_path + # apply strip components + if self._spec.options.strip_components > 0: + _rparts = local_path.relative_path.parts + _strip = min( + (len(_rparts) - 1, self._spec.options.strip_components) + ) + if _strip > 0: + spath = pathlib.Path(*_rparts[_strip:]) + # create a storage entity for each destination + for sa, cont, name, dpath in self._get_destination_paths(): + # if not renaming, form name from with spath + if not self._spec.options.rename: + name = str(name / spath) + if blobxfer.util.is_none_or_empty(name): + raise ValueError( + ('invalid destination, must specify a container or ' + 'fileshare and remote file name: {}').format(dpath)) + # do not check for existing remote right now if striped + # vectored io mode + if (self._spec.options.vectored_io.distribution_mode == + blobxfer.models.upload. + VectoredIoDistributionMode.Stripe): + ase = None + else: + ase = self._check_for_existing_remote(sa, cont, name) + if ase is None: + # encryption metadata will be populated later, if required + ase = blobxfer.models.azure.StorageEntity(cont, ed=None) + ase.populate_from_local( + sa, cont, name, self._spec.options.mode) + yield sa, ase + + def _vectorize_and_bind(self, local_path, dest): + # type: (Uploader, blobxfer.models.upload.LocalPath, + # List[blobxfer.models.azure.StorageEntity]) -> + # Tuple[blobxfer.operations.upload.UploadAction, + # blobxfer.models.upload.LocalPath, + # blobxfer.models.azure.StorageEntity] + """Vectorize local path to destinations, if necessary, and bind + :param Uploader self: this + :param blobxfer.models.LocalPath local_path: local path + :param list dest: list of destination tuples (sa, ase) + :rtype: tuple + :return: action, LocalPath, ase + """ + if (self._spec.options.vectored_io.distribution_mode == + blobxfer.models.upload.VectoredIoDistributionMode.Stripe and + not local_path.use_stdin): + # compute total number of slices + slices = int(math.ceil( + local_path.total_size / + self._spec.options.vectored_io.stripe_chunk_size_bytes)) + # check if vectorization is possible + if slices == 1: + sa, ase = dest[0] + action = self._check_upload_conditions(local_path, ase) + yield action, local_path, ase + return + num_dest = len(dest) + logger.debug( + '{} slices for vectored out of {} to {} destinations'.format( + slices, local_path.absolute_path, num_dest)) + # pre-populate slice map for next pointers + slice_map = {} + for i in range(0, slices): + sa, ase = dest[i % num_dest] + name = blobxfer.operations.upload.Uploader.\ + append_slice_suffix_to_name(ase.name, i) + sase = self._check_for_existing_remote(sa, ase.container, name) + if sase is None: + # encryption metadata will be populated later, if required + sase = blobxfer.models.azure.StorageEntity( + ase.container, ed=None) + 
sase.populate_from_local( + sa, ase.container, name, self._spec.options.mode) + slice_map[i] = sase + # create new local path to ase mappings + curr = 0 + for i in range(0, slices): + start = curr + end = ( + curr + + self._spec.options.vectored_io.stripe_chunk_size_bytes + ) + if end > local_path.total_size: + end = local_path.total_size + ase = slice_map[i] + if i < slices - 1: + next_entry = blobxfer.models.metadata.\ + create_vectored_io_next_entry(slice_map[i+1]) + else: + next_entry = None + lp_slice = blobxfer.models.upload.LocalPath( + parent_path=local_path.parent_path, + relative_path=local_path.relative_path, + use_stdin=False, + view=blobxfer.models.upload.LocalPathView( + fd_start=start, + fd_end=end, + slice_num=i, + mode=self._spec.options.vectored_io.distribution_mode, + total_slices=slices, + next=next_entry, + ) + ) + action = self._check_upload_conditions(lp_slice, ase) + yield action, lp_slice, ase + curr = end + elif (self._spec.options.vectored_io.distribution_mode == + blobxfer.models.upload.VectoredIoDistributionMode.Replica): + action_map = {} + for _, ase in dest: + action = self._check_upload_conditions(local_path, ase) + if action not in action_map: + action_map[action] = [] + action_map[action].append(ase) + for action in action_map: + dst = action_map[action] + if len(dst) == 1: + yield action, local_path, dst[0] + else: + if (action == UploadAction.CheckMd5 or + action == UploadAction.Skip): + for ase in dst: + yield action, local_path, ase + else: + primary_ase = dst[0] + if primary_ase.replica_targets is None: + primary_ase.replica_targets = [] + primary_ase.replica_targets.extend(dst[1:]) + # add replica targets to deletion exclusion set + if self._spec.options.delete_extraneous_destination: + for rt in primary_ase.replica_targets: + self._delete_exclude.add( + blobxfer.operations.upload.Uploader. + create_deletion_id( + rt._client, rt.container, rt.name) + ) + yield action, local_path, primary_ase + else: + for _, ase in dest: + action = self._check_upload_conditions(local_path, ase) + yield action, local_path, ase + + def _run(self): + # type: (Uploader) -> None + """Execute Uploader + :param Uploader self: this + """ + # mark start + self._start_time = blobxfer.util.datetime_now() + logger.info('blobxfer start time: {0}'.format(self._start_time)) + # initialize resume db if specified + if self._general_options.resume_file is not None: + self._resume = blobxfer.operations.resume.UploadResumeManager( + self._general_options.resume_file) + # initialize MD5 processes + if ((self._spec.options.store_file_properties.md5 or + self._spec.skip_on.md5_match) and + self._general_options.concurrency.md5_processes > 0): + self._md5_offload = blobxfer.operations.md5.LocalFileMd5Offload( + num_workers=self._general_options.concurrency.md5_processes) + self._md5_offload.initialize_check_thread( + self._check_for_uploads_from_md5) + # initialize crypto processes + if (self._spec.options.rsa_public_key is not None and + self._general_options.concurrency.crypto_processes > 0): + logger.warning( + 'crypto offload for upload is not possible due to ' + 'sequential nature of {} and FullBlob encryption mode'.format( + blobxfer.models.crypto.EncryptionMetadata. 
+ _ENCRYPTION_ALGORITHM) + ) + # initialize worker threads + self._initialize_disk_threads() + self._initialize_transfer_threads() + # initialize local counters + skipped_files = 0 + skipped_size = 0 + approx_total_bytes = 0 + if not self._spec.sources.can_rename() and self._spec.options.rename: + raise RuntimeError( + 'cannot rename to specified destination with multiple sources') + # iterate through source paths to upload + for src in self._spec.sources.files(): + # create a destination array for the source + dest = [ + (sa, ase) for sa, ase in + self._generate_destination_for_source(src) + ] + for action, lp, ase in self._vectorize_and_bind(src, dest): + if self._spec.options.delete_extraneous_destination: + self._delete_exclude.add( + blobxfer.operations.upload.Uploader.create_deletion_id( + ase._client, ase.container, ase.name) + ) + if action == UploadAction.Skip: + skipped_files += 1 + skipped_size += ase.size if ase.size is not None else 0 + continue + approx_total_bytes += lp.size + if blobxfer.util.is_not_empty(ase.replica_targets): + approx_total_bytes += lp.size * len(ase.replica_targets) + # add to potential upload set + uid = blobxfer.operations.upload.Uploader.create_unique_id( + lp, ase) + with self._upload_lock: + self._upload_set.add(uid) + self._upload_total += 1 + if action == UploadAction.CheckMd5: + self._pre_md5_skip_on_check(lp, ase) + elif action == UploadAction.Upload: + self._add_to_upload_queue(lp, ase, uid) + # set remote files processed + with self._md5_meta_lock: + self._all_files_processed = True + with self._upload_lock: + self._upload_total -= skipped_files + self._upload_bytes_total -= skipped_size + upload_size_mib = approx_total_bytes / blobxfer.util.MEGABYTE + logger.debug( + ('{0} local/remote files processed, waiting for upload ' + 'completion of approx. 
{1:.4f} MiB').format( + self._upload_total, upload_size_mib)) + del skipped_files + del skipped_size + del upload_size_mib + del approx_total_bytes + # wait for uploads to complete + self._wait_for_disk_threads(terminate=False) + self._wait_for_transfer_threads(terminate=False) + end_time = blobxfer.util.datetime_now() + # update progress bar + self._update_progress_bar() + # check for exceptions + if len(self._exceptions) > 0: + logger.error('exceptions encountered while uploading') + # raise the first one + raise self._exceptions[0] + # check for mismatches + if (self._upload_sofar != self._upload_total or + self._upload_bytes_sofar != self._upload_bytes_total): + raise RuntimeError( + 'upload mismatch: [count={}/{} bytes={}/{}]'.format( + self._upload_sofar, self._upload_total, + self._upload_bytes_sofar, self._upload_bytes_total)) + # delete all remaining local files not accounted for if + # delete extraneous enabled + self._delete_extraneous_files() + # delete resume file if we've gotten this far + if self._resume is not None: + self._resume.delete() + # output throughput + if self._upload_start_time is not None: + ultime = (end_time - self._upload_start_time).total_seconds() + mibup = self._upload_bytes_total / blobxfer.util.MEGABYTE + mibps = mibup / ultime + logger.info( + ('elapsed upload + verify time and throughput of {0:.4f} ' + 'GiB: {1:.3f} sec, {2:.4f} Mbps ({3:.3f} MiB/s)').format( + mibup / 1024, ultime, mibps * 8, mibps)) + end_time = blobxfer.util.datetime_now() + logger.info('blobxfer end time: {0} (elapsed: {1:.3f} sec)'.format( + end_time, (end_time - self._start_time).total_seconds())) + + def start(self): + # type: (Uploader) -> None + """Start the Uploader + :param Uploader self: this + """ + try: + blobxfer.operations.progress.output_parameters( + self._general_options, self._spec) + self._run() + except (KeyboardInterrupt, Exception) as ex: + if isinstance(ex, KeyboardInterrupt): + logger.info( + 'KeyboardInterrupt detected, force terminating ' + 'processes and threads (this may take a while)...') + try: + self._wait_for_transfer_threads(terminate=True) + self._wait_for_disk_threads(terminate=True) + finally: + raise + finally: + # shutdown processes + if self._md5_offload is not None: + self._md5_offload.finalize_processes() + if self._crypto_offload is not None: + self._crypto_offload.finalize_processes() + # close resume file + if self._resume is not None: + self._resume.close() diff --git a/blobxfer/retry.py b/blobxfer/retry.py new file mode 100644 index 0000000..daee22a --- /dev/null +++ b/blobxfer/retry.py @@ -0,0 +1,97 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +# non-stdlib imports +import azure.storage.retry +# local imports + + +class ExponentialRetryWithMaxWait(azure.storage.retry._Retry): + """Exponential Retry with Max Wait (infinite retries)""" + def __init__( + self, initial_backoff=0.1, max_backoff=1, max_retries=None, + reset_at_max=True): + # type: (ExponentialRetryWithMaxWait, int, int, int, bool) -> None + """Ctor for ExponentialRetryWithMaxWait + :param ExponentialRetryWithMaxWait self: this + :param int initial_backoff: initial backoff + :param int max_backoff: max backoff + :param int max_retries: max retries + :param bool reset_at_max: reset after reaching max wait + """ + if max_backoff <= 0: + raise ValueError( + 'max backoff is non-positive: {}'.format(max_backoff)) + if max_retries is not None and max_retries < 0: + raise ValueError( + 'max retries is invalid: {}'.format(max_retries)) + if max_backoff < initial_backoff: + raise ValueError( + 'max backoff {} less than initial backoff {}'.format( + max_backoff, initial_backoff)) + self._backoff_count = 0 + self._last_backoff = initial_backoff + self.initial_backoff = initial_backoff + self.max_backoff = max_backoff + self.reset_at_max = reset_at_max + super(ExponentialRetryWithMaxWait, self).__init__( + max_retries if max_retries is not None else 2147483647, False) + + def retry(self, context): + # type: (ExponentialRetryWithMaxWait, + # azure.storage.models.RetryContext) -> int + """Retry handler + :param ExponentialRetryWithMaxWait self: this + :param azure.storage.models.RetryContext context: retry context + :rtype: int or None + :return: int + """ + return self._retry(context, self._backoff) + + def _backoff(self, context): + # type: (ExponentialRetryWithMaxWait, + # azure.storage.models.RetryContext) -> int + """Backoff calculator + :param ExponentialRetryWithMaxWait self: this + :param azure.storage.models.RetryContext context: retry context + :rtype: int + :return: backoff amount + """ + self._backoff_count += 1 + if self._backoff_count == 1: + self._last_backoff = self.initial_backoff + else: + self._last_backoff *= 2 + if self._last_backoff > self.max_backoff and self.reset_at_max: + self._backoff_count = 1 + self._last_backoff = self.initial_backoff + return self._last_backoff diff --git a/blobxfer/util.py b/blobxfer/util.py new file mode 100644 index 0000000..a17b8a5 --- /dev/null +++ b/blobxfer/util.py @@ -0,0 +1,327 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
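# Illustrative sketch (not part of the blobxfer patch above): a standalone
# simulation of the wait sequence produced by the ExponentialRetryWithMaxWait
# backoff logic defined earlier in blobxfer/retry.py. With the defaults
# (initial_backoff=0.1, max_backoff=1, reset_at_max=True) the wait doubles
# each retry and wraps back to the initial value once it would exceed the max.
def simulate_backoff(initial=0.1, maximum=1.0, reset_at_max=True, retries=10):
    waits = []
    last = initial
    count = 0
    for _ in range(retries):
        count += 1
        if count == 1:
            last = initial
        else:
            last *= 2
            if last > maximum and reset_at_max:
                count = 1
                last = initial
        waits.append(last)
    return waits


if __name__ == '__main__':
    # expected: [0.1, 0.2, 0.4, 0.8, 0.1, 0.2, 0.4, 0.8, 0.1, 0.2]
    print(simulate_backoff())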
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip +) +# stdlib imports +import base64 +import copy +import datetime +import hashlib +import logging +import logging.handlers +import mimetypes +try: + from os import scandir as scandir +except ImportError: # noqa + from scandir import scandir as scandir +import platform +import re +import sys +# non-stdlib imports +import dateutil +import dateutil.tz +import future.utils +# local imports + +# global defines +MEGABYTE = 1048576 +_ON_WINDOWS = platform.system() == 'Windows' +_REGISTERED_LOGGER_HANDLERS = [] +_PAGEBLOB_BOUNDARY = 512 + + +def on_python2(): + # type: (None) -> bool + """Execution on python2 + :rtype: bool + :return: if on Python2 + """ + return future.utils.PY2 + + +def on_windows(): # noqa + # type: (None) -> bool + """Execution on Windows + :rtype: bool + :return: if on Windows + """ + return _ON_WINDOWS + + +def setup_logger(logger, logfile): # noqa + # type: (logger, str) -> None + """Set up logger""" + global _REGISTERED_LOGGER_HANDLERS + logger.setLevel(logging.DEBUG) + if is_none_or_empty(logfile): + handler = logging.StreamHandler() + else: + handler = logging.FileHandler(logfile, encoding='utf-8') + logging.getLogger().addHandler(handler) + formatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + formatter.default_msec_format = '%s.%03d' + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.propagate = False + _REGISTERED_LOGGER_HANDLERS.append(handler) + + +def set_verbose_logger_handlers(): # noqa + # type: (None) -> None + """Set logger handler formatters to more detail""" + global _REGISTERED_LOGGER_HANDLERS + formatter = logging.Formatter( + '%(asctime)s %(levelname)s %(name)s:%(funcName)s:%(lineno)d ' + '%(message)s') + formatter.default_msec_format = '%s.%03d' + for handler in _REGISTERED_LOGGER_HANDLERS: + handler.setFormatter(formatter) + + +def is_none_or_empty(obj): + # type: (any) -> bool + """Determine if object is None or empty + :type any obj: object + :rtype: bool + :return: if object is None or empty + """ + return obj is None or len(obj) == 0 + + +def is_not_empty(obj): + # type: (any) -> bool + """Determine if object is not None and is length is > 0 + :type any obj: object + :rtype: bool + :return: if object is 
not None and length is > 0 + """ + return obj is not None and len(obj) > 0 + + +def join_thread(thr): + # type: (threading.Thread) -> None + """Join a thread + :type threading.Thread thr: thread to join + """ + if on_python2(): + while True: + thr.join(timeout=1) + if not thr.isAlive(): + break + else: + thr.join() + + +def merge_dict(dict1, dict2): + # type: (dict, dict) -> dict + """Recursively merge dictionaries: dict2 on to dict1. This differs + from dict.update() in that values that are dicts are recursively merged. + Note that only dict value types are merged, not lists, etc. + + :param dict dict1: dictionary to merge to + :param dict dict2: dictionary to merge with + :rtype: dict + :return: merged dictionary + """ + if not isinstance(dict1, dict) or not isinstance(dict2, dict): + raise ValueError('dict1 or dict2 is not a dictionary') + result = copy.deepcopy(dict1) + for k, v in dict2.items(): + if k in result and isinstance(result[k], dict): + result[k] = merge_dict(result[k], v) + else: + result[k] = copy.deepcopy(v) + return result + + +def datetime_now(): + # type: (None) -> datetime.datetime + """Return a timezone-aware datetime instance with local offset + :rtype: datetime.datetime + :return: datetime now with local tz + """ + return datetime.datetime.now(tz=dateutil.tz.tzlocal()) + + +def datetime_from_timestamp(ts, tz=None): + # type: (int, dateutil.tz) -> datetime.datetime + """Convert a timestamp into datetime with offset + :param int ts: timestamp + :param dateutil.tz tz: time zone or local tz if not specified + :rtype: datetime.datetime + :return: converted timestamp to datetime + """ + if tz is None: + tz = dateutil.tz.tzlocal() + return datetime.datetime.fromtimestamp(ts, tz=tz) + + +def scantree(path): + # type: (str) -> os.DirEntry + """Recursively scan a directory tree + :param str path: path to scan + :rtype: DirEntry + :return: DirEntry via generator + """ + for entry in scandir(path): + if entry.is_dir(follow_symlinks=True): + # due to python2 compat, cannot use yield from here + for t in scantree(entry.path): + yield t + else: + yield entry + + +def replace_file(src, dst): + # type: (pathlib.Path, pathlib.Path) -> None + """Replace a file, using atomic replace if available + :param pathlib.Path src: source path + :param pathlib.Path dst: destination path + """ + if sys.version_info < (3, 3): + if dst.exists(): + dst.unlink() + src.rename(dst) + else: + src.replace(dst) + + +def get_mime_type(filename): + # type: (str) -> str + """Guess the type of a file based on its filename + :param str filename: filename to guess the content-type + :rtype: str + :rturn: string of form 'class/type' for MIME content-type header + """ + return (mimetypes.guess_type(filename)[0] or 'application/octet-stream') + + +def base64_encode_as_string(obj): # noqa + # type: (any) -> str + """Encode object to base64 + :param any obj: object to encode + :rtype: str + :return: base64 encoded string + """ + if on_python2(): + return base64.b64encode(obj) + else: + return str(base64.b64encode(obj), 'ascii') + + +def base64_decode_string(string): + # type: (str) -> str + """Base64 decode a string + :param str string: string to decode + :rtype: str + :return: decoded string + """ + return base64.b64decode(string) + + +def new_md5_hasher(): + # type: (None) -> md5.MD5 + """Create a new MD5 hasher + :rtype: md5.MD5 + :return: new MD5 hasher + """ + return hashlib.md5() + + +def page_align_content_length(length): + # type: (int) -> int + """Compute page boundary alignment + :param int length: 
content length + :rtype: int + :return: aligned byte boundary + """ + mod = length % _PAGEBLOB_BOUNDARY + if mod != 0: + return length + (_PAGEBLOB_BOUNDARY - mod) + return length + + +def normalize_azure_path(path): + # type: (str) -> str + """Normalize remote path (strip slashes and use forward slashes) + :param str path: path to normalize + :rtype: str + :return: normalized path + """ + if is_none_or_empty(path): + raise ValueError('provided path is invalid') + _path = path.strip('/').strip('\\') + return '/'.join(re.split('/|\\\\', _path)) + + +def explode_azure_path(path): + # type: (str) -> Tuple[str, str] + """Explodes an azure path into a container or fileshare and the + remaining virtual path + :param str path: path to explode + :rtype: tuple + :return: container, vpath + """ + rpath = normalize_azure_path(path).split('/') + container = str(rpath[0]) + if len(rpath) > 1: + rpath = '/'.join(rpath[1:]) + else: + rpath = '' + return container, rpath + + +def blob_is_snapshot(url): + # type: (str) -> bool + """Checks if the blob is a snapshot blob + :param url str: blob url + :rtype: bool + :return: if blob is a snapshot blob + """ + if '?snapshot=' in url: + try: + dateutil.parser.parse(url.split('?snapshot=')[-1]) + return True + except (ValueError, OverflowError): + pass + return False + + +def parse_blob_snapshot_parameter(url): + # type: (str) -> str + """Retrieves the blob snapshot parameter from a url + :param url str: blob url + :rtype: str + :return: snapshot parameter + """ + if blob_is_snapshot(url): + tmp = url.split('?snapshot=') + if len(tmp) == 2: + return tmp[0], tmp[1] + return None diff --git a/blobxfer/version.py b/blobxfer/version.py new file mode 100644 index 0000000..0f2a584 --- /dev/null +++ b/blobxfer/version.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +__version__ = '1.0.0a3' diff --git a/cli/__init__.py b/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cli/cli.py b/cli/cli.py new file mode 100644 index 0000000..4c12bbd --- /dev/null +++ b/cli/cli.py @@ -0,0 +1,800 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. 
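# Illustrative usage (not part of the blobxfer patch above): a few calls
# against the path and page-alignment helpers defined in blobxfer/util.py
# earlier in this diff; assumes this version of blobxfer is importable.
# The expected values follow directly from the code as written.
import blobxfer.util as util

# backslash and forward-slash remote paths normalize to forward slashes
assert util.normalize_azure_path('\\container\\dir\\file.bin') == \
    'container/dir/file.bin'
# the first path component is the container/fileshare, the rest is the vpath
assert util.explode_azure_path('container/dir/file.bin') == \
    ('container', 'dir/file.bin')
assert util.explode_azure_path('container') == ('container', '')
# page blob content lengths are aligned up to the next 512-byte boundary
assert util.page_align_content_length(0) == 0
assert util.page_align_content_length(1) == 512
assert util.page_align_content_length(512) == 512
assert util.page_align_content_length(513) == 1024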
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# compat imports +from __future__ import absolute_import, division, print_function +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import json +import logging +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +import click +import ruamel.yaml +# blobxfer library imports +import blobxfer.api +import blobxfer.util +# local imports +try: + from . import settings +except (SystemError, ImportError): # noqa + # for local testing + import settings + +# create logger +logger = logging.getLogger('blobxfer') +# global defines +_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) + + +class CliContext(object): + """CliContext class: holds context for CLI commands""" + def __init__(self): + """Ctor for CliContext""" + self.config = {} + self.cli_options = {} + self.credentials = None + self.general_options = None + + def initialize(self, action): + # type: (CliContext, settings.TransferAction) -> None + """Initialize context + :param CliContext self: this + :param settings.TransferAction action: transfer action + """ + self._init_config() + self.general_options = settings.create_general_options( + self.config, action) + self.credentials = settings.create_azure_storage_credentials( + self.config, self.general_options) + + def _read_yaml_file(self, yaml_file): + # type: (CliContext, pathlib.Path) -> None + """Read a yaml file into self.config + :param CliContext self: this + :param pathlib.Path yaml_file: yaml file to load + """ + with yaml_file.open('r') as f: + if self.config is None: + self.config = ruamel.yaml.load( + f, Loader=ruamel.yaml.RoundTripLoader) + else: + self.config = blobxfer.util.merge_dict( + ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader), + self.config) + + def _init_config(self): + # type: (CliContext) -> None + """Initializes configuration of the context + :param CliContext self: this + """ + # load yaml config file into memory + if blobxfer.util.is_not_empty(self.cli_options['yaml_config']): + yaml_config = pathlib.Path(self.cli_options['yaml_config']) + self._read_yaml_file(yaml_config) + else: + # merge cli options with config + settings.merge_settings(self.config, self.cli_options) + # set log file if specified + blobxfer.util.setup_logger( + logger, self.config['options'].get('log_file', None)) + # output config + 
if self.config['options'].get('verbose', False): + blobxfer.util.set_verbose_logger_handlers() + logger.debug('config: \n' + json.dumps(self.config, indent=4)) + # free mem + del self.cli_options + + +# create a pass decorator for shared context between commands +pass_cli_context = click.make_pass_decorator(CliContext, ensure=True) + + +def _config_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['yaml_config'] = value + return value + return click.option( + '--config', + expose_value=False, + help='YAML configuration file', + envvar='BLOBXFER_CONFIG_FILE', + callback=callback)(f) + + +def _crypto_processes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['crypto_processes'] = value + return value + return click.option( + '--crypto-processes', + expose_value=False, + type=int, + default=0, + help='Concurrent crypto processes (download only)', + callback=callback)(f) + + +def _disk_threads_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['disk_threads'] = value + return value + return click.option( + '--disk-threads', + expose_value=False, + type=int, + default=0, + help='Concurrent disk threads', + callback=callback)(f) + + +def _log_file_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['log_file'] = value + return value + return click.option( + '--log-file', + expose_value=False, + default=None, + help='Log to file specified; this must be specified for progress ' + 'bar to show', + callback=callback)(f) + + +def _md5_processes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['md5_processes'] = value + return value + return click.option( + '--md5-processes', + expose_value=False, + type=int, + default=0, + help='Concurrent MD5 processes', + callback=callback)(f) + + +def _progress_bar_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['progress_bar'] = value + return value + return click.option( + '--progress-bar/--no-progress-bar', + expose_value=False, + default=True, + help='Display progress bar instead of console logs; log file must ' + 'be specified [True]', + callback=callback)(f) + + +def _resume_file_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['resume_file'] = value + return value + return click.option( + '--resume-file', + expose_value=False, + default=None, + help='Save or use resume file specified', + callback=callback)(f) + + +def _timeout_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['timeout'] = value + return value + return click.option( + '--timeout', + expose_value=False, + type=int, + help='Individual chunk transfer timeout', + callback=callback)(f) + + +def _transfer_threads_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['transfer_threads'] = value + return value + return click.option( + '--transfer-threads', + expose_value=False, + type=int, + default=0, + help='Concurrent transfer threads', + callback=callback)(f) + + +def _verbose_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['verbose'] = value + return value + return click.option( + '-v', '--verbose', + expose_value=False, + 
is_flag=True, + help='Verbose output', + callback=callback)(f) + + +def _local_resource_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['local_resource'] = value + return value + return click.option( + '--local-path', + expose_value=False, + help='Local path; use - for stdin', + callback=callback)(f) + + +def _storage_account_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['storage_account'] = value + return value + return click.option( + '--storage-account', + expose_value=False, + help='Storage account name', + envvar='BLOBXFER_STORAGE_ACCOUNT', + callback=callback)(f) + + +def _remote_path_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['remote_path'] = value + return value + return click.option( + '--remote-path', + expose_value=False, + help='Remote path on Azure Storage', + callback=callback)(f) + + +def common_options(f): + f = _verbose_option(f) + f = _transfer_threads_option(f) + f = _timeout_option(f) + f = _resume_file_option(f) + f = _progress_bar_option(f) + f = _md5_processes_option(f) + f = _log_file_option(f) + f = _disk_threads_option(f) + f = _crypto_processes_option(f) + f = _config_option(f) + return f + + +def upload_download_options(f): + f = _remote_path_option(f) + f = _storage_account_option(f) + f = _local_resource_option(f) + return f + + +def _access_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['access_key'] = value + return value + return click.option( + '--storage-account-key', + expose_value=False, + help='Storage account access key', + envvar='BLOBXFER_STORAGE_ACCOUNT_KEY', + callback=callback)(f) + + +def _chunk_size_bytes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['chunk_size_bytes'] = value + return value + return click.option( + '--chunk-size-bytes', + expose_value=False, + type=int, + default=0, + help='Block or chunk size in bytes; set to 0 for auto-select ' + 'on upload [0]', + callback=callback)(f) + + +def _delete_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['delete'] = value + return value + return click.option( + '--delete', + expose_value=False, + is_flag=True, + help='Delete extraneous files on target [False]', + callback=callback)(f) + + +def _distribution_mode(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['distribution_mode'] = value.lower() + return value + return click.option( + '--distribution-mode', + expose_value=False, + default='disabled', + help='Vectored IO distribution mode: disabled, replica, ' + 'stripe [disabled]', + callback=callback)(f) + + +def _endpoint_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['endpoint'] = value + return value + return click.option( + '--endpoint', + expose_value=False, + default='core.windows.net', + help='Azure Storage endpoint [core.windows.net]', + callback=callback)(f) + + +def _exclude_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['exclude'] = value + return value + return click.option( + '--exclude', + expose_value=False, + default=None, + multiple=True, + help='Exclude pattern', + callback=callback)(f) + + +def _file_attributes(f): + def callback(ctx, 
param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['file_attributes'] = value + return value + return click.option( + '--file-attributes/--no-file-attributes', + expose_value=False, + default=False, + help='Store or restore file attributes [False]', + callback=callback)(f) + + +def _file_md5_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['file_md5'] = value + return value + return click.option( + '--file-md5/--no-file-md5', + expose_value=False, + default=False, + help='Compute file MD5 [False]', + callback=callback)(f) + + +def _include_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['include'] = value + return value + return click.option( + '--include', + expose_value=False, + default=None, + multiple=True, + help='Include pattern', + callback=callback)(f) + + +def _mode_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['mode'] = value + return value + return click.option( + '--mode', + expose_value=False, + default='auto', + help='Transfer mode: auto, append, block, file, page [auto]', + callback=callback)(f) + + +def _one_shot_bytes_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['one_shot_bytes'] = value + return value + return click.option( + '--one-shot-bytes', + expose_value=False, + type=int, + default=0, + help='File sizes less than or equal to the specified byte threshold ' + 'will be uploaded as one-shot for block blobs; the valid range that ' + 'can be specified is 0 to 256MiB [0]', + callback=callback)(f) + + +def _overwrite_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['overwrite'] = value + return value + return click.option( + '--overwrite/--no-overwrite', + expose_value=False, + default=True, + help='Overwrite destination if exists. For append blobs, ' + '--no-overwrite will append to any existing blob. 
[True]', + callback=callback)(f) + + +def _recursive_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['recursive'] = value + return value + return click.option( + '--recursive/--no-recursive', + expose_value=False, + default=True, + help='Recursive [True]', + callback=callback)(f) + + +def _rename_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rename'] = value + return value + return click.option( + '--rename', + expose_value=False, + is_flag=True, + default=False, + help='Rename a single file upload or download [False]', + callback=callback)(f) + + +def _rsa_private_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rsa_private_key'] = value + return value + return click.option( + '--rsa-private-key', + expose_value=False, + default=None, + help='RSA private key PEM file', + envvar='BLOBXFER_RSA_PRIVATE_KEY', + callback=callback)(f) + + +def _rsa_private_key_passphrase_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rsa_private_key_passphrase'] = value + return value + return click.option( + '--rsa-private-key-passphrase', + expose_value=False, + default=None, + help='RSA private key passphrase', + envvar='BLOBXFER_RSA_PRIVATE_KEY_PASSPHRASE', + callback=callback)(f) + + +def _rsa_public_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['rsa_public_key'] = value + return value + return click.option( + '--rsa-public-key', + expose_value=False, + default=None, + help='RSA public key PEM file', + envvar='BLOBXFER_RSA_PUBLIC_KEY', + callback=callback)(f) + + +def _sas_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sas'] = value + return value + return click.option( + '--sas', + expose_value=False, + help='Shared access signature', + envvar='BLOBXFER_SAS', + callback=callback)(f) + + +def _skip_on_filesize_match_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['skip_on_filesize_match'] = value + return value + return click.option( + '--skip-on-filesize-match', + expose_value=False, + is_flag=True, + help='Skip on equivalent file size [False]', + callback=callback)(f) + + +def _skip_on_lmt_ge_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['skip_on_lmt_ge'] = value + return value + return click.option( + '--skip-on-lmt-ge', + expose_value=False, + is_flag=True, + help='Skip on last modified time greater than or equal to [False]', + callback=callback)(f) + + +def _skip_on_md5_match_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['skip_on_md5_match'] = value + return value + return click.option( + '--skip-on-md5-match', + expose_value=False, + is_flag=True, + help='Skip on MD5 match [False]', + callback=callback)(f) + + +def _strip_components_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['strip_components'] = value + return value + return click.option( + '--strip-components', + expose_value=False, + type=int, + default=1, + help='Strip leading file path components on upload [1]', + callback=callback)(f) + + +def _stripe_chunk_size_bytes_option(f): + def callback(ctx, param, value): + clictx = 
ctx.ensure_object(CliContext) + clictx.cli_options['stripe_chunk_size_bytes'] = value + return value + return click.option( + '--stripe-chunk-size-bytes', + expose_value=False, + type=int, + default=1073741824, + help='Vectored IO stripe width in bytes [1073741824]', + callback=callback)(f) + + +def _sync_copy_dest_access_key_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_access_key'] = value + return value + return click.option( + '--sync-copy-dest-storage-account-key', + expose_value=False, + help='Storage account access key for synccopy destination', + envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_KEY', + callback=callback)(f) + + +def _sync_copy_dest_storage_account_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_storage_account'] = value + return value + return click.option( + '--sync-copy-dest-storage-account', + expose_value=False, + help='Storage account name for synccopy destination', + envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT', + callback=callback)(f) + + +def _sync_copy_dest_remote_path_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_remote_path'] = value + return value + return click.option( + '--sync-copy-dest-remote-path', + expose_value=False, + help='Remote path on Azure Storage for synccopy destination', + callback=callback)(f) + + +def _sync_copy_dest_sas_option(f): + def callback(ctx, param, value): + clictx = ctx.ensure_object(CliContext) + clictx.cli_options['sync_copy_dest_sas'] = value + return value + return click.option( + '--sync-copy-dest-sas', + expose_value=False, + help='Shared access signature for synccopy destination', + envvar='BLOBXFER_SYNC_COPY_DEST_SAS', + callback=callback)(f) + + +def upload_options(f): + f = _stripe_chunk_size_bytes_option(f) + f = _strip_components_option(f) + f = _skip_on_md5_match_option(f) + f = _skip_on_lmt_ge_option(f) + f = _skip_on_filesize_match_option(f) + f = _sas_option(f) + f = _rsa_public_key_option(f) + f = _rsa_private_key_passphrase_option(f) + f = _rsa_private_key_option(f) + f = _rename_option(f) + f = _recursive_option(f) + f = _overwrite_option(f) + f = _one_shot_bytes_option(f) + f = _mode_option(f) + f = _include_option(f) + f = _file_md5_option(f) + f = _file_attributes(f) + f = _exclude_option(f) + f = _endpoint_option(f) + f = _distribution_mode(f) + f = _delete_option(f) + f = _chunk_size_bytes_option(f) + f = _access_key_option(f) + return f + + +def download_options(f): + f = _skip_on_md5_match_option(f) + f = _skip_on_lmt_ge_option(f) + f = _skip_on_filesize_match_option(f) + f = _sas_option(f) + f = _rsa_private_key_passphrase_option(f) + f = _rsa_private_key_option(f) + f = _rename_option(f) + f = _recursive_option(f) + f = _overwrite_option(f) + f = _mode_option(f) + f = _include_option(f) + f = _file_md5_option(f) + f = _file_attributes(f) + f = _exclude_option(f) + f = _endpoint_option(f) + f = _delete_option(f) + f = _chunk_size_bytes_option(f) + f = _access_key_option(f) + return f + + +def sync_copy_options(f): + f = _sync_copy_dest_storage_account_option(f) + f = _sync_copy_dest_sas_option(f) + f = _sync_copy_dest_remote_path_option(f) + f = _sync_copy_dest_access_key_option(f) + f = _storage_account_option(f) + f = _skip_on_md5_match_option(f) + f = _skip_on_lmt_ge_option(f) + f = _skip_on_filesize_match_option(f) + f = _sas_option(f) + f = 
_remote_path_option(f) + f = _overwrite_option(f) + f = _mode_option(f) + f = _include_option(f) + f = _exclude_option(f) + f = _endpoint_option(f) + f = _chunk_size_bytes_option(f) + f = _access_key_option(f) + return f + + +@click.group(context_settings=_CONTEXT_SETTINGS) +@click.version_option(version=blobxfer.__version__) +@click.pass_context +def cli(ctx): + """Blobxfer: Azure Storage transfer tool""" + pass + + +@cli.command('download') +@upload_download_options +@download_options +@common_options +@pass_cli_context +def download(ctx): + """Download blobs or files from Azure Storage""" + settings.add_cli_options(ctx.cli_options, settings.TransferAction.Download) + ctx.initialize(settings.TransferAction.Download) + specs = settings.create_download_specifications(ctx.config) + for spec in specs: + blobxfer.api.Downloader( + ctx.general_options, ctx.credentials, spec + ).start() + + +@cli.command('synccopy') +@sync_copy_options +@common_options +@pass_cli_context +def synccopy(ctx): + """Synchronously copy blobs between Azure Storage accounts""" + raise NotImplementedError() + settings.add_cli_options(ctx.cli_options, settings.TransferAction.Synccopy) + ctx.initialize(settings.TransferAction.Synccopy) + + +@cli.command('upload') +@upload_download_options +@upload_options +@common_options +@pass_cli_context +def upload(ctx): + """Upload files to Azure Storage""" + settings.add_cli_options(ctx.cli_options, settings.TransferAction.Upload) + ctx.initialize(settings.TransferAction.Upload) + specs = settings.create_upload_specifications(ctx.config) + for spec in specs: + blobxfer.api.Uploader( + ctx.general_options, ctx.credentials, spec + ).start() + + +if __name__ == '__main__': + cli() diff --git a/cli/settings.py b/cli/settings.py new file mode 100644 index 0000000..24d1a7f --- /dev/null +++ b/cli/settings.py @@ -0,0 +1,475 @@ +# Copyright (c) Microsoft Corporation +# +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
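+
+# Note: the helpers in this module translate CLI options and YAML
+# configuration into the credential, general option, and transfer
+# specification objects consumed by the blobxfer library (see the
+# create_* and merge_settings functions below).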
+ +# compat imports +from __future__ import ( + absolute_import, division, print_function, unicode_literals +) +from builtins import ( # noqa + bytes, dict, int, list, object, range, ascii, chr, hex, input, + next, oct, open, pow, round, super, filter, map, zip) +# stdlib imports +import enum +# non-stdlib imports +# local imports +import blobxfer.models.azure +import blobxfer.models.download +import blobxfer.models.options +import blobxfer.models.upload +import blobxfer.operations.azure +import blobxfer.operations.crypto +import blobxfer.util + + +# enums +class TransferAction(enum.Enum): + Download = 1, + Upload = 2, + Synccopy = 3, + + +def add_cli_options(cli_options, action): + # type: (dict, str) -> None + """Adds CLI options to the configuration object + :param dict cli_options: CLI options dict + :param TransferAction action: action + """ + cli_options['_action'] = action.name.lower() + try: + local_resource = cli_options['local_resource'] + if blobxfer.util.is_none_or_empty(local_resource): + raise KeyError() + except KeyError: + raise ValueError('--local-path must be specified') + try: + storage_account = cli_options['storage_account'] + if blobxfer.util.is_none_or_empty(storage_account): + raise KeyError() + except KeyError: + raise ValueError('--storage-account must be specified') + try: + remote_path = cli_options['remote_path'] + if blobxfer.util.is_none_or_empty(remote_path): + raise KeyError() + except KeyError: + raise ValueError('--remote-path must be specified') + if blobxfer.util.is_not_empty(storage_account): + # add credentials + try: + key = cli_options['access_key'] + if blobxfer.util.is_none_or_empty(key): + raise KeyError() + except KeyError: + try: + key = cli_options['sas'] + if blobxfer.util.is_none_or_empty(key): + raise KeyError() + except KeyError: + raise RuntimeError('access key or sas must be provided') + azstorage = { + 'endpoint': cli_options['endpoint'], + 'accounts': { + storage_account: key + } + } + del key + # construct "argument" from cli options + sa_rp = {storage_account: remote_path} + if action == TransferAction.Upload: + arg = { + 'source': [local_resource], + 'destination': [sa_rp], + 'include': cli_options['include'], + 'exclude': cli_options['exclude'], + 'options': { + 'chunk_size_bytes': cli_options['chunk_size_bytes'], + 'delete_extraneous_destination': cli_options['delete'], + 'mode': cli_options['mode'], + 'one_shot_bytes': cli_options['one_shot_bytes'], + 'overwrite': cli_options['overwrite'], + 'recursive': cli_options['recursive'], + 'rename': cli_options['rename'], + 'rsa_private_key': cli_options['rsa_private_key'], + 'rsa_private_key_passphrase': cli_options[ + 'rsa_private_key_passphrase'], + 'rsa_public_key': cli_options['rsa_public_key'], + 'skip_on': { + 'filesize_match': cli_options[ + 'skip_on_filesize_match'], + 'lmt_ge': cli_options['skip_on_lmt_ge'], + 'md5_match': cli_options['skip_on_md5_match'], + }, + 'store_file_properties': { + 'attributes': cli_options['file_attributes'], + 'md5': cli_options['file_md5'], + }, + 'strip_components': cli_options['strip_components'], + 'vectored_io': { + 'stripe_chunk_size_bytes': cli_options[ + 'stripe_chunk_size_bytes'], + 'distribution_mode': cli_options['distribution_mode'], + }, + }, + } + elif action == TransferAction.Download: + arg = { + 'source': [sa_rp], + 'destination': local_resource, + 'include': cli_options['include'], + 'exclude': cli_options['exclude'], + 'options': { + 'check_file_md5': cli_options['file_md5'], + 'chunk_size_bytes': cli_options['chunk_size_bytes'], 
+ 'delete_extraneous_destination': cli_options['delete'], + 'mode': cli_options['mode'], + 'overwrite': cli_options['overwrite'], + 'recursive': cli_options['recursive'], + 'rename': cli_options['rename'], + 'rsa_private_key': cli_options['rsa_private_key'], + 'rsa_private_key_passphrase': cli_options[ + 'rsa_private_key_passphrase'], + 'restore_file_attributes': cli_options['file_attributes'], + 'skip_on': { + 'filesize_match': cli_options[ + 'skip_on_filesize_match'], + 'lmt_ge': cli_options['skip_on_lmt_ge'], + 'md5_match': cli_options['skip_on_md5_match'], + }, + }, + } + elif action == TransferAction.Synccopy: + try: + sync_copy_dest_storage_account = \ + cli_options['sync_copy_dest_storage_account'] + if blobxfer.util.is_none_or_empty( + sync_copy_dest_storage_account): + raise KeyError() + except KeyError: + raise ValueError( + '--sync-copy-dest-storage-account must be specified') + try: + sync_copy_dest_remote_path = \ + cli_options['sync_copy_dest_remote_path'] + if blobxfer.util.is_none_or_empty(sync_copy_dest_remote_path): + raise KeyError() + except KeyError: + raise ValueError( + '--sync-copy-dest-remote-path must be specified') + arg = { + 'source': sa_rp, + 'destination': [ + { + sync_copy_dest_storage_account: + sync_copy_dest_remote_path + } + ], + 'include': cli_options['include'], + 'exclude': cli_options['exclude'], + 'options': { + 'chunk_size_bytes': cli_options['chunk_size_bytes'], + 'mode': cli_options['mode'], + 'overwrite': cli_options['overwrite'], + 'skip_on': { + 'filesize_match': cli_options[ + 'skip_on_filesize_match'], + 'lmt_ge': cli_options['skip_on_lmt_ge'], + 'md5_match': cli_options['skip_on_md5_match'], + }, + }, + } + try: + destkey = cli_options['sync_copy_dest_access_key'] + if blobxfer.util.is_none_or_empty(destkey): + raise KeyError() + except KeyError: + try: + destkey = cli_options['sync_copy_dest_sas'] + if blobxfer.util.is_none_or_empty(destkey): + raise KeyError() + except KeyError: + raise RuntimeError( + 'destination access key or sas must be provided') + azstorage['accounts'][ + cli_options['sync_copy_dest_storage_account']] = destkey + del destkey + cli_options[action.name.lower()] = arg + cli_options['azure_storage'] = azstorage + + +def merge_settings(config, cli_options): + # type: (dict, dict) -> None + """Merge CLI options into main config + :param dict config: config dict + :param dict cli_options: cli options + """ + action = cli_options['_action'] + if (action != TransferAction.Upload.name.lower() and + action != TransferAction.Download.name.lower() and + action == TransferAction.Synccopy.name.lower()): + raise ValueError('invalid action: {}'.format(action)) + # create action options + if action not in config: + config[action] = [] + # merge any argument options + if action in cli_options: + config[action].append(cli_options[action]) + # merge credentials + if 'azure_storage' in cli_options: + if 'azure_storage' not in config: + config['azure_storage'] = {} + config['azure_storage'] = blobxfer.util.merge_dict( + config['azure_storage'], cli_options['azure_storage']) + # merge general options + if 'options' not in config: + config['options'] = {} + config['options']['log_file'] = cli_options['log_file'] + config['options']['progress_bar'] = cli_options['progress_bar'] + config['options']['resume_file'] = cli_options['resume_file'] + config['options']['timeout_sec'] = cli_options['timeout'] + config['options']['verbose'] = cli_options['verbose'] + # merge concurrency options + if 'concurrency' not in config['options']: + 
config['options']['concurrency'] = {} + config['options']['concurrency']['crypto_processes'] = \ + cli_options['crypto_processes'] + config['options']['concurrency']['disk_threads'] = \ + cli_options['disk_threads'] + config['options']['concurrency']['md5_processes'] = \ + cli_options['md5_processes'] + config['options']['concurrency']['transfer_threads'] = \ + cli_options['transfer_threads'] + + +def create_azure_storage_credentials(config, general_options): + # type: (dict, blobxfer.models.options.General) -> + # blobxfer.operations.azure.StorageCredentials + """Create an Azure StorageCredentials object from configuration + :param dict config: config dict + :param blobxfer.models.options.General: general options + :rtype: blobxfer.operations.azure.StorageCredentials + :return: credentials object + """ + creds = blobxfer.operations.azure.StorageCredentials(general_options) + endpoint = config['azure_storage']['endpoint'] + for name in config['azure_storage']['accounts']: + key = config['azure_storage']['accounts'][name] + creds.add_storage_account(name, key, endpoint) + return creds + + +def create_general_options(config, action): + # type: (dict, TransferAction) -> blobxfer.models.options.General + """Create a General Options object from configuration + :param dict config: config dict + :param TransferAction action: transfer action + :rtype: blobxfer.models.options.General + :return: general options object + """ + conc = config['options'].get('concurrency', {}) + return blobxfer.models.options.General( + concurrency=blobxfer.models.options.Concurrency( + crypto_processes=conc.get('crypto_processes', 0), + disk_threads=conc.get('disk_threads', 0), + md5_processes=conc.get('md5_processes', 0), + transfer_threads=conc.get('transfer_threads', 0), + is_download=action == TransferAction.Download, + ), + log_file=config['options'].get('log_file', None), + progress_bar=config['options'].get('progress_bar', True), + resume_file=config['options'].get('resume_file', None), + timeout_sec=config['options'].get('timeout_sec', None), + verbose=config['options'].get('verbose', False), + ) + + +def create_download_specifications(config): + # type: (dict) -> List[blobxfer.models.download.Specification] + """Create a list of Download Specification objects from configuration + :param dict config: config dict + :rtype: list + :return: list of Download Specification objects + """ + specs = [] + for conf in config['download']: + # create download options + confmode = conf['options'].get('mode', 'auto').lower() + if confmode == 'auto': + mode = blobxfer.models.azure.StorageModes.Auto + elif confmode == 'append': + mode = blobxfer.models.azure.StorageModes.Append + elif confmode == 'block': + mode = blobxfer.models.azure.StorageModes.Block + elif confmode == 'file': + mode = blobxfer.models.azure.StorageModes.File + elif confmode == 'page': + mode = blobxfer.models.azure.StorageModes.Page + else: + raise ValueError('unknown mode: {}'.format(confmode)) + # load RSA private key PEM file if specified + rpk = conf['options'].get('rsa_private_key', None) + if blobxfer.util.is_not_empty(rpk): + rpkp = conf['options'].get('rsa_private_key_passphrase', None) + rpk = blobxfer.operations.crypto.load_rsa_private_key_file( + rpk, rpkp) + else: + rpk = None + # create specification + sod = conf['options'].get('skip_on', {}) + ds = blobxfer.models.download.Specification( + download_options=blobxfer.models.options.Download( + check_file_md5=conf['options'].get('check_file_md5', False), + 
chunk_size_bytes=conf['options'].get('chunk_size_bytes', 0), + delete_extraneous_destination=conf['options'].get( + 'delete_extraneous_destination', False), + mode=mode, + overwrite=conf['options'].get('overwrite', True), + recursive=conf['options'].get('recursive', True), + rename=conf['options'].get('rename', False), + restore_file_attributes=conf[ + 'options'].get('restore_file_attributes', False), + rsa_private_key=rpk, + ), + skip_on_options=blobxfer.models.options.SkipOn( + filesize_match=sod.get('filesize_match', False), + lmt_ge=sod.get('lmt_ge', False), + md5_match=sod.get('md5_match', False), + ), + local_destination_path=blobxfer.models.download. + LocalDestinationPath( + conf['destination'] + ) + ) + # create remote source paths + for src in conf['source']: + if len(src) != 1: + raise RuntimeError( + 'invalid number of source pairs specified per entry') + sa = next(iter(src)) + asp = blobxfer.operations.azure.SourcePath() + asp.add_path_with_storage_account(src[sa], sa) + incl = conf.get('include', None) + if blobxfer.util.is_not_empty(incl): + asp.add_includes(incl) + excl = conf.get('exclude', None) + if blobxfer.util.is_not_empty(excl): + asp.add_excludes(excl) + ds.add_azure_source_path(asp) + # append spec to list + specs.append(ds) + return specs + + +def create_upload_specifications(config): + # type: (dict) -> List[blobxfer.models.upload.Specification] + """Create a list of Upload Specification objects from configuration + :param dict config: config dict + :rtype: list + :return: list of Upload Specification objects + """ + specs = [] + for conf in config['upload']: + # create upload options + confmode = conf['options'].get('mode', 'auto').lower() + if confmode == 'auto': + mode = blobxfer.models.azure.StorageModes.Auto + elif confmode == 'append': + mode = blobxfer.models.azure.StorageModes.Append + elif confmode == 'block': + mode = blobxfer.models.azure.StorageModes.Block + elif confmode == 'file': + mode = blobxfer.models.azure.StorageModes.File + elif confmode == 'page': + mode = blobxfer.models.azure.StorageModes.Page + else: + raise ValueError('unknown mode: {}'.format(confmode)) + # load RSA public key PEM if specified + rpk = conf['options'].get('rsa_public_key', None) + if blobxfer.util.is_not_empty(rpk): + rpk = blobxfer.operations.crypto.load_rsa_public_key_file(rpk) + if rpk is None: + # load RSA private key PEM file if specified + rpk = conf['options'].get('rsa_private_key', None) + if blobxfer.util.is_not_empty(rpk): + rpkp = conf['options'].get('rsa_private_key_passphrase', None) + rpk = blobxfer.operations.crypto.load_rsa_private_key_file( + rpk, rpkp) + rpk = rpk.public_key() + else: + rpk = None + # create local source paths + lsp = blobxfer.models.upload.LocalSourcePath() + lsp.add_paths(conf['source']) + incl = conf.get('include', None) + if blobxfer.util.is_not_empty(incl): + lsp.add_includes(incl) + excl = conf.get('exclude', None) + if blobxfer.util.is_not_empty(excl): + lsp.add_excludes(excl) + # create specification + sfp = conf['options'].get('store_file_properties', {}) + vio = conf['options'].get('vectored_io', {}) + sod = conf['options'].get('skip_on', {}) + us = blobxfer.models.upload.Specification( + upload_options=blobxfer.models.options.Upload( + chunk_size_bytes=conf['options'].get('chunk_size_bytes', 0), + delete_extraneous_destination=conf['options'].get( + 'delete_extraneous_destination', False), + mode=mode, + one_shot_bytes=conf['options'].get('one_shot_bytes', 0), + overwrite=conf['options'].get('overwrite', True), + 
recursive=conf['options'].get('recursive', True), + rename=conf['options'].get('rename', False), + rsa_public_key=rpk, + store_file_properties=blobxfer.models.options.FileProperties( + attributes=sfp.get('attributes', False), + md5=sfp.get('md5', False), + ), + strip_components=conf['options'].get('strip_components', 1), + vectored_io=blobxfer.models.options.VectoredIo( + stripe_chunk_size_bytes=vio.get( + 'stripe_chunk_size_bytes', 1073741824), + distribution_mode=blobxfer. + models.upload.VectoredIoDistributionMode( + vio.get('distribution_mode', 'disabled').lower()), + ), + ), + skip_on_options=blobxfer.models.options.SkipOn( + filesize_match=sod.get('filesize_match', False), + lmt_ge=sod.get('lmt_ge', False), + md5_match=sod.get('md5_match', False), + ), + local_source_path=lsp, + ) + # create remote destination paths + for dst in conf['destination']: + if len(dst) != 1: + raise RuntimeError( + 'invalid number of destination pairs specified per entry') + sa = next(iter(dst)) + adp = blobxfer.operations.azure.DestinationPath() + adp.add_path_with_storage_account(dst[sa], sa) + us.add_azure_destination_path(adp) + # append spec to list + specs.append(us) + return specs diff --git a/docker/Dockerfile b/docker/Dockerfile index 42e8b2d..a713e15 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,12 +1,11 @@ # Dockerfile for Azure/blobxfer -FROM gliderlabs/alpine:3.4 +FROM alpine:3.6 MAINTAINER Fred Park -RUN apk add --update --no-cache musl build-base python3 python3-dev openssl-dev libffi-dev ca-certificates \ - && pip3 install --no-cache-dir --upgrade pip \ +RUN apk add --update --no-cache musl build-base python3 python3-dev libressl-dev libffi-dev ca-certificates \ && pip3 install --no-cache-dir --upgrade blobxfer \ - && apk del --purge build-base python3-dev openssl-dev libffi-dev \ + && apk del --purge build-base python3-dev libressl-dev libffi-dev \ && rm /var/cache/apk/* ENTRYPOINT ["blobxfer"] diff --git a/docs/01-installation.md b/docs/01-installation.md new file mode 100644 index 0000000..9a3fd74 --- /dev/null +++ b/docs/01-installation.md @@ -0,0 +1,84 @@ +# blobxfer Installation +`blobxfer` is a pure Python package, however, some dependencies require a C +compiler and supporting libraries if there is no binary wheel for that +dependency and your platform. Please follow the pre-requisites section first +prior to invoking installation via `pip`. Alternatively, you can use the +[blobxfer Docker image](https://hub.docker.com/r/alfpark/blobxfer/). + +## Pre-requisites +`blobxfer` depends on `cryptography` and `ruamel.yaml` which require a +C compiler if your platform does not have a pre-made binary wheel. Please +follow the instructions below for your platform. + +### Ubuntu +```shell +apt-get update +# for Python3 (recommended) +apt-get install -y build-essential libssl-dev libffi-dev python3-dev python3-pip +# for Python2 +apt-get install -y build-essential libssl-dev libffi-dev python-dev python-pip +``` + +### CentOS/RHEL +```shell +# for Python2 +yum install -y gcc openssl-dev libffi-devel python-devel +curl -fSsL https://bootstrap.pypa.io/get-pip.py | python +``` + +### SLES/OpenSUSE +```shell +zypper ref +# for Python2 +zypper -n in gcc libopenssl-devel libffi48-devel python-devel +curl -fSsL https://bootstrap.pypa.io/get-pip.py | python +``` + +### Mac OS X +Python 2.7 should come pre-installed. 
However, if you want to install +`blobxfer` for Python 3.5+ (recommended), please follow the steps outlined on +[this guide](http://docs.python-guide.org/en/latest/starting/install/osx/) +to ensure that you have the latest version of Python, a compiler and pip. + +### Windows +Please install at least Python 3.5 or higher to avoid requiring a +compiler. If you must use Python 2.7, you can download the necessary +development headers and compiler [from Microsoft](http://aka.ms/vcpython27). + +## Installation via `pip` +[blobxfer](https://pypi.python.org/pypi/blobxfer) is on PyPI and can be +installed via: + +```shell +# for Python3 (recommended) +pip3 install blobxfer +# for Python2 +pip install blobxfer +``` + +`blobxfer` is compatible with Python 2.7 and 3.3+. To install for Python 3 +(which is recommended), some distributions may use `pip3` instead of `pip`. +Installing into your user area via `--user` or via a virtual environment +is recommended to avoid installation issues with system-wide Python +packages. + +## Installation via Docker +[blobxfer](https://hub.docker.com/r/alfpark/blobxfer/) is also on Docker +Hub and can be retrieved via: + +```shell +docker pull alfpark/blobxfer +``` + +## Troubleshooting +#### `azure.storage` dependency not found +If you get an error such as `ImportError: No module named storage` or that +`azure.storage` cannot be found or loaded, then most likely there was a +conflict with this package with other `azure` packages that share the same +base namespace. You can correct this by issuing: +```shell +# for Python3 +pip3 install --upgrade --force-reinstall azure-storage +# for Python2 +pip install --upgrade --force-reinstall azure-storage +``` diff --git a/docs/10-cli-usage.md b/docs/10-cli-usage.md new file mode 100644 index 0000000..2f3aad4 --- /dev/null +++ b/docs/10-cli-usage.md @@ -0,0 +1,255 @@ +# blobxfer Command-Line Usage +`blobxfer` operates using a command followed by options. Each +command will be detailed along with all options available. + +### Quick Navigation +1. [Commands](#commands) +2. [Options](#options) +3. [Example Invocations](#examples) +4. [General Notes](#general-notes) + +## Commands +### `download` +Downloads a remote Azure path, which may contain many resources, to the +local machine. This command requires at the minimum, the following options: +* `--storage-account` +* `--remote-path` +* `--local-path` + +Additionally, an authentication option for the storage account is required. +Please see the Authentication sub-section below under Options. + +### `upload` +Uploads a local path to a remote Azure path. The local path may contain +many resources on the local machine. This command requires at the minimum, +the following options: +* `--local-path` +* `--storage-account` +* `--remote-path` + +Additionally, an authentication option for the storage account is required. +Please see the Authentication sub-section below under Options. + +If piping from `stdin`, `--local-path` should be set to `-` as per +convention. + +### `synccopy` +TODO: not yet implemented. + +## Options +### General +* `--config` specifies the YAML configuration file to use. This can be +optionally provided through an environment variable `BLOBXFER_CONFIG_FILE`. +* `--chunk-size-bytes` is the chunk size in bytes. For downloads, this +is the maximum length of data to transfer per request. For uploads, this +corresponds to one of block size for append and block blobs, page size for +page blobs, or file chunk for files. 
Only block blobs can have a block size +of up to 100MiB, all others have a maximum of 4MiB. +* `--file-attributes` or `--no-file-attributes` controls if POSIX file +attributes (mode and ownership) should be stored or restored. Note that to +restore uid/gid, `blobxfer` must be run as root or under sudo. +* `--file-md5` or `--no-file-md5` controls if the file MD5 should be computed. +* `--local-path` is the local resource path. Set to `-` if piping from +`stdin`. +* `--log-file` specifies the log file to write to. This must be specified +for a progress bar to be output to console. +* `--mode` is the operating mode. The default is `auto` but may be set to +`append`, `block`, `file`, or `page`. If specified with the `upload` +command, then all files will be uploaded as the specified `mode` type. +If specified with `download`, then only remote entities with that `mode` +type are downloaded. Note that `file` should be specified if interacting +with Azure File shares. +* `--overwrite` or `--no-overwrite` controls clobber semantics at the +destination. +* `--progress-bar` or `--no-progress-bar` controls if a progress bar is +output to the console. `--log-file` must be specified for a progress bar +to be output. +* `--recursive` or `--no-recursive` controls if the source path should be +recursively uploaded or downloaded. +* `--remote-path` is the remote Azure path. This path must contain the +Blob container or File share at the beginning, e.g., `mycontainer/vdir`. +* `--resume-file` specifies the resume file to write to. +* `--storage-account` specifies the storage account to use. This can be +optionally provided through an environment variable `BLOBXFER_STORAGE_ACCOUNT` +instead. +* `--timeout` is the integral timeout value in seconds to use. +* `-h` or `--help` can be passed at every command level to receive context +sensitive help. +* `-v` will output verbose messages including the configuration used. + +### Authentication +`blobxfer` supports both Storage Account access keys and Shared Access +Signature (SAS) tokens. One type must be supplied with all commands in +order to successfully authenticate against Azure Storage. These options are: +* `--storage-account-key` is the storage account access key. This can be +optionally provided through an environment variable +`BLOBXFER_STORAGE_ACCOUNT_KEY` instead. +* `--sas` is a shared access signature (sas) token. This can be +optionally provided through an environment variable `BLOBXFER_SAS` instead. + +### Concurrency +Please see the [performance considerations](98-performance-considerations.md) +document for more information regarding concurrency options. +* `--crypto-processes` is the number of decryption offload processes to spawn. +`0` will in-line the decryption routine with the main thread. +* `--disk-threads` is the number of threads to create for disk I/O. +* `--md5-processes` is the number of MD5 offload processes to spawn for +comparing files with `skip_on` `md5_match`. +* `--transfer-threads` is the number of threads to create for transferring +to/from Azure Storage. + +### Connection +* `--endpoint` is the Azure Storage endpoint to connect to; the default is +Azure Public regions, or `core.windows.net`. +* `--storage-account` is the storage account to connect to. + +### Encryption +* `--rsa-private-key` is the RSA private key in PEM format to use. This can +be provided for uploads but must be specified to decrypt encrypted remote +entities. This can be optionally provided through an environment variable +`BLOBXFER_RSA_PRIVATE_KEY`.
+* `--rsa-private-key-passphrase` is the RSA private key passphrase. This can +be optionally provided through an environment variable +`BLOBXFER_RSA_PRIVATE_KEY_PASSPHRASE`. +* `--rsa-public-key` is the RSA public key in PEM format to use. This +can only be provided for uploads. This can be optionally provided through an +environment variable `BLOBXFER_RSA_PUBLIC_KEY`. + +### Filtering +* `--exclude` is an exclude pattern to use; this can be specified multiple +times. Exclude patterns are applied after include patterns. If both an exclude +and an include pattern match a target, the target is excluded. +* `--include` is an include pattern to use; this can be specified multiple +times. + +### Skip On +* `--skip-on-filesize-match` will skip the transfer action if the filesizes +match between source and destination. This should not be specified for +encrypted files. +* `--skip-on-lmt-ge` will skip the transfer action: + * On upload if the last modified time of the remote file is greater than + or equal to that of the local file. + * On download if the last modified time of the local file is greater than + or equal to that of the remote file. +* `--skip-on-md5-match` will skip the transfer action if the MD5 hashes match +between source and destination. This works transparently with +encrypted files that have been uploaded with `blobxfer`. + +### Vectored IO +Please see the [Vectored IO](30-vectored-io.md) document for more information +regarding Vectored IO operations in `blobxfer`. +* `--distribution-mode` is the Vectored IO distribution mode: + * `disabled`, which is the default (no Vectored IO) + * `replica` which will replicate source files to target destinations on + upload. Note that replicating across multiple destinations will require + a YAML configuration file. + * `stripe` which will stripe source files to target destinations on upload. + Note that striping across multiple destinations will require a YAML + configuration file. +* `--stripe-chunk-size-bytes` is the stripe chunk width for stripe-based +Vectored IO operations. + +### Other +* `--delete` deletes extraneous files at the remote destination path on +uploads and at the local resource on downloads. These actions occur after the +transfer has taken place. +* `--one-shot-bytes` controls the number of bytes to "one shot" a block +Blob upload. The maximum value that can be specified is 256MiB. This may +be useful when using account-level SAS keys and enforcing non-overwrite +behavior. +* `--rename` renames a single file upload or download to the target +destination or source path, respectively. +* `--strip-components N` will strip the leading `N` components from the +file path. The default is `1`. + +## Example Invocations +### `download` Examples +#### Download an Entire Encrypted Blob Container to Current Working Directory +```shell +blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path .
--rsa-public-key ~/mypubkey.pem +``` + +#### Download an Entire File Share to Designated Path and Skip On Filesize Matches +```shell +blobxfer download --mode file --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-path /my/path --skip-on-filesize-match +``` + +#### Download only Page Blobs in Blob Container Virtual Directory Non-recursively and Cleanup Local Path to Match Remote Path +```shell +blobxfer download --mode page --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path /my/pageblobs --no-recursive --delete +``` + +#### Resume Incomplete Downloads Matching an Include Pattern and Log to File and Restore POSIX File Attributes +```shell +blobxfer download --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes +``` + +#### Download a Blob Snapshot +```shell +blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-path . +``` + +#### Download using a YAML Configuration File +```shell +blobxfer download --config myconfig.yaml +``` + +### `upload` Examples +#### Upload Current Working Directory as Encrypted Block Blobs Non-recursively +```shell +blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-private-key ~/myprivatekey.pem --no-recursive +``` + +#### Upload Specific Path Recursively to a File Share, Store File MD5 and POSIX File Attributes to a File Share and Exclude Some Files +```shell +blobxfer upload --mode file --storage-account mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-path . --file-md5 --file-attributes --exclude '*.bak' +``` + +#### Upload Single File with Resume and Striped Vectored IO into 512MiB Chunks +```shell +blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912 +``` + +#### Upload Specific Path but Skip On Any MD5 Matches, Store File MD5 and Cleanup Remote Path to Match Local Path +```shell +blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /my/path --file-md5 --skip-on-md5-match --delete +``` + +#### Upload From Piped `stdin` +```shell +curl -fSsL https://some.uri | blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path - +``` + +#### Upload using a YAML Configuration File +```shell +blobxfer upload --config myconfig.yaml +``` + +### `synccopy` Examples +TODO: not implemented yet. + +## General Notes +* `blobxfer` does not take any leases on blobs or containers. It is up to the +user to ensure that blobs are not modified while download/uploads are being +performed. +* No validation is performed regarding container and file naming and length +restrictions. +* `blobxfer` will attempt to download from blob storage as-is. If the source +filename is incompatible with the destination operating system, then failure +may result. +* When using SAS, the SAS key must be a container- or share-level SAS if +performing recursive directory upload or container/file share download. 
+* If uploading via service-level SAS keys, the container or file share must +already be created in Azure Storage prior to upload. Account-level SAS keys +with the signed resource type of `c` (i.e., container-level permission) are +required to allow container or file share creation. +* When uploading files as page blobs, the content is page boundary +byte-aligned. The MD5 for the blob is computed using the final aligned data +if the source is not page boundary byte-aligned. This enables these page +blobs or files to be skipped during subsequent download or upload with the +appropriate `skip_on` option, respectively. +* Globbing of wildcards must be disabled by your shell (or properly quoted) +when invoking `blobxfer` such that include and exclude patterns can be +read verbatim without the shell expanding the wildcards. +* The `--delete` option operates similarly to `--delete-after` in rsync. Please +note that this option interacts with `--include` and `--exclude` filters. diff --git a/docs/20-yaml-configuration.md b/docs/20-yaml-configuration.md new file mode 100644 index 0000000..78437b0 --- /dev/null +++ b/docs/20-yaml-configuration.md @@ -0,0 +1,211 @@ +# blobxfer YAML Configuration +`blobxfer` accepts YAML configuration files to drive the transfer. YAML +configuration files are specified with the `--config` option to any +`blobxfer` command. + +## Schema +The `blobxfer` YAML schema consists of 5 distinct "sections". The following +sub-sections will describe each. You may combine all 5 sections into the +same YAML file if desired as `blobxfer` will only read the required sections +to execute the specified command. + +#### Configuration Sections +1. [`azure_storage`](#azure-storage) +2. [`options`](#options) +3. [`download`](#download) +4. [`upload`](#upload) +5. [`synccopy`](#synccopy) + +### `azure_storage` +The `azure_storage` section specifies Azure Storage credentials that will +be referenced for any transfer while processing the YAML file. This section +is required. + +```yaml +azure_storage: + endpoint: core.windows.net + accounts: + mystorageaccount0: ABCDEF... + mystorageaccount1: ?se... +``` + +* `endpoint` specifies which Azure Storage endpoint to connect to. +Generally this can be omitted if using Public Azure regions. +* `accounts` is a dictionary of storage account names and either a +storage account key or a shared access signature token. + +### `options` +The `options` section specifies general options that may be applied across +all other sections in the YAML configuration.
+ +```yaml +options: + log_file: /path/to/blobxfer.log + resume_file: /path/to/resumefile.db + progress_bar: true + verbose: true + timeout_sec: null + concurrency: + md5_processes: 2 + crypto_processes: 2 + disk_threads: 16 + transfer_threads: 32 +``` + +* `log_file` is the location of the log file to write to +* `resume_file` is the location of the resume database to create +* `progress_bar` controls display of a progress bar output to the console +* `verbose` controls if verbose logging is enabled +* `timeout_sec` is the timeout to apply to requests/responses +* `concurrency` is a dictionary of concurrency limits + * `md5_processes` is the number of MD5 offload processes to create for + MD5 comparison checking + * `crypto_processes` is the number of decryption offload processes to create + * `disk_threads` is the number of threads for disk I/O + * `transfer_threads` is the number of threads for network transfers + +### `download` +The `download` section specifies download sources and destination. Note +that `download` refers to a list of objects, thus you may specify as many +of these sub-configuration blocks on the `download` property as you need. +When the `download` command with the YAML config is specified, the list +is iterated and all specified sources are downloaded. + +```yaml +download: + - source: + - mystorageaccount0: mycontainer + - mystorageaccount1: someothercontainer/vpath + destination: /path/to/store/downloads + include: + - "*.txt" + - "*.bxslice-*" + exclude: + - "*.bak" + options: + check_file_md5: true + chunk_size_bytes: 16777216 + delete_extraneous_destination: false + mode: auto + overwrite: true + recursive: true + rename: false + restore_file_attributes: true + rsa_private_key: myprivatekey.pem + rsa_private_key_passphrase: myoptionalpassword + skip_on: + filesize_match: false + lmt_ge: false + md5_match: true + - source: + # next if needed... +``` + +* `source` is a list of storage account to remote path mappings +* `destination` is the local resource path +* `include` is a list of include patterns +* `exclude` is a list of exclude patterns +* `options` are download-specific options + * `check_file_md5` will integrity check downloaded files using the stored MD5 + * `chunk_size_bytes` is the maximum amount of data to download per request + * `delete_extraneous_destination` will cleanup any files locally that are + not found on the remote. Note that this interacts with include and + exclude filters. + * `mode` is the operating mode + * `overwrite` specifies clobber behavior + * `recursive` specifies if remote paths should be recursively searched for + entities to download + * `rename` will rename a single entity source path to the `destination` + * `restore_file_attributes` will restore POSIX file mode and ownership if + stored on the entity metadata + * `rsa_private_key` is the RSA private key PEM file to use to decrypt + encrypted blobs or files + * `rsa_private_key_passphrase` is the RSA private key passphrase, if required + * `skip_on` are skip on options to use + * `filesize_match` skip if file size match + * `lmt_ge` skip if local file has a last modified time greater than or + equal to the remote file + * `md5_match` skip if MD5 match + +### `upload` +The `upload` section specifies upload sources and destinations. Note +that `upload` refers to a list of objects, thus you may specify as many +of these sub-configuration blocks on the `upload` property as you need. 
+When the `upload` command with the YAML config is specified, the list +is iterated and all specified sources are uploaded. + +```yaml +upload: + - source: + - /path/to/hugefile1 + - /path/to/hugefile2 + destination: + - mystorageaccount0: mycontainer/vdir + - mystorageaccount1: someothercontainer/vdir2 + include: + - "*.bin" + exclude: + - "*.tmp" + options: + mode: auto + chunk_size_bytes: 0 + delete_extraneous_destination: true + one_shot_bytes: 33554432 + overwrite: true + recursive: true + rename: false + rsa_public_key: mypublickey.pem + skip_on: + filesize_match: false + lmt_ge: false + md5_match: true + store_file_properties: + attributes: true + md5: true + strip_components: 1 + vectored_io: + stripe_chunk_size_bytes: 1000000 + distribution_mode: stripe + - source: + # next if needed... +``` + +* `source` is a list of local resource paths +* `destination` is a list of storage account to remote path mappings +* `include` is a list of include patterns +* `exclude` is a list of exclude patterns +* `options` are upload-specific options + * `mode` is the operating mode + * `chunk_size_bytes` is the maximum amount of data to upload per request. + This corresponds to the block size for block and append blobs, page size + for page blobs, and the file chunk for files. Only block blobs can have + a block size of up to 100MiB, all others have a maximum of 4MiB. + * `one_shot_bytes` is the size limit to upload block blobs in a single + request. + * `overwrite` specifies clobber behavior + * `recursive` specifies if local paths should be recursively searched for + files to upload + * `rename` will rename a single entity destination path to a single `source` + * `rsa_public_key` is the RSA public key PEM file to use to encrypt files + * `skip_on` are skip on options to use + * `filesize_match` skip if file size match + * `lmt_ge` skip if remote file has a last modified time greater than or + equal to the local file + * `md5_match` skip if MD5 match + * `store_file_properties` stores the following file properties if enabled + * `attributes` will store POSIX file mode and ownership + * `md5` will store the MD5 of the file + * `strip_components` is the number of leading path components to strip + * `vectored_io` are the Vectored IO options to apply to the upload + * `stripe_chunk_size_bytes` is the stripe width for each chunk if `stripe` + `distribution_mode` is selected + * `distribution_mode` is the Vectored IO mode to use which can be one of + * `disabled` will disable Vectored IO + * `replica` which will replicate source files to target destinations on + upload. Note that more than one destination should be specified. + * `stripe` which will stripe source files to target destinations on + upload. If more than one destination is specified, striping occurs in + round-robin order amongst the destinations listed. + +### `synccopy` +TODO: not yet implemented. diff --git a/docs/30-vectored-io.md b/docs/30-vectored-io.md new file mode 100644 index 0000000..1d17c40 --- /dev/null +++ b/docs/30-vectored-io.md @@ -0,0 +1,95 @@ +# blobxfer Vectored I/O +`blobxfer` supports Vectored I/O (scatter/gather) which can help alleviate +problems associated with +[single blob or single fileshare throughput limits](https://docs.microsoft.com/en-us/azure/storage/storage-scalability-targets). +Additionally, `blobxfer` has the ability to replicate a single source to +multiple destinations to allow for increased resiliency or throughput for +consumption later. 
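+
+Both modes are driven by the `upload` section of a YAML configuration file.
+As a point of reference, the minimal sketch below (the storage account and
+container names are placeholders) shows how two destinations might be
+declared for the distribution modes described in the following sections;
+refer to the [YAML configuration](20-yaml-configuration.md) document for
+the full schema.
+
+```yaml
+# sketch only: credentials for both accounts must also be present in the
+# azure_storage section of the configuration file
+upload:
+  - source:
+      - /path/to/hugefile
+    destination:
+      - mystorageaccount0: mycontainer
+      - mystorageaccount1: someothercontainer
+    options:
+      vectored_io:
+        # set to replica or stripe as described below
+        distribution_mode: stripe
+        stripe_chunk_size_bytes: 1073741824
+```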
+ +## Distribution Modes +`blobxfer` supports two distribution modes: `replica` and `stripe`. The +following sections describe each. + +### Replica +`replica` mode replicates an entire file (or set of files) across all +specified destinations. This allows for multiple backups, resiliency, +and potentially increased download throughput later if the clients understand +how to download from multiple sources. + +The logic is fairly simple in how this is accomplished. Each source file +has portions of the file read from disk, buffered in memory and then +replicated across multiple storage accounts. + +``` + Whole File +---------------------+ + Replication | | + +------------------------------> | Destination 0: | + | | Storage Account A | + | | | + | +---------------------+ + | + | ++------------+---------------+ Whole File +---------------------+ +| | Replication | | +| 10 GiB VHD on Local Disk +--------------> | Destination 1: | +| | | Storage Account B | ++------------+---------------+ | | + | +---------------------+ + | + | + | +---------------------+ + | Whole File | | + | Replication | Destination 2: | + +------------------------------> | Storage Account C | + | | + +---------------------+ +``` + +In order to take advantage of `replica` Vectored IO, you must use a YAML +configuration file to define multiple destinations. + +### Stripe +`stripe` mode will splice a file into multiple chunks and scatter these +chunks across destinations specified. These destinations can be different +containers within the same storage account or even containers distributed +across multiple storage accounts if single storage account bandwidth limits +are insufficient. + +`blobxfer` will slice the source file into multiple chunks where the +`stripe_chunk_size_bytes` is the stripe width of each chunk. This parameter +will allow you to effectively control how many blobs/files are created on +Azure. `blobxfer` will then round-robin through all of the destinations +specified to store the slices. Information required to reconstruct the +original file is stored on the blob or file metadata. It is important to +keep this metadata in-tact or reconstruction will fail. + +``` + +---------------------+ + | | <-----------------------------------+ + | Destination 1: | | + | Storage Account B | <---------------------+ | + | | | | + +---------------------+ <-------+ | | + | | | + ^ ^ | | | + | | | | | + 1 GiB Stripe | | | | | ++-----------------------------+ Width +------+---+--+------+---+--+------+---+--+------+---+--+------+---+--+ +| | | | | | | | | | | | | +| 10 GiB File on Local Disk | +-----------> | D0 | D1 | D0 | D1 | D0 | D1 | D0 | D1 | D0 | D1 | +| | | | | | | | | | | | | ++-----------------------------+ 10 Vectored +---+--+------+---+--+------+---+--+------+---+--+------+---+--+------+ + Slices | | | | | + | | | | | + | v | | | + | | | | + +> +---------------------+ <+ | | + | | | | + | Destination 0: | <--------------+ | + | Storage Account A | | + | | <----------------------------+ + +---------------------+ +``` + +In order to take advantage of `stripe` Vectored IO across multiple +destinations, you must use a YAML configuration file. diff --git a/docs/40-client-side-encryption.md b/docs/40-client-side-encryption.md new file mode 100644 index 0000000..e16d87a --- /dev/null +++ b/docs/40-client-side-encryption.md @@ -0,0 +1,29 @@ +# blobxfer Client-side Encryption Notes +Please read the following carefully regarding client-side encryption support +in `blobxfer`. 
Additionally, current limitations for client-side encryption
+can be found [here](99-current-limitations.md).
+
+* Encryption is performed using AES256-CBC. MACs are generated using
+HMAC-SHA256.
+* All required information regarding the encryption process is stored on
+each blob's `encryptiondata` and `encryptiondata_authentication` metadata
+fields. These metadata entries are used on download to configure the proper
+download parameters for the decryption process as well as to authenticate
+the `encryptiondata` metadata and the encrypted entity. Encryption metadata
+set by `blobxfer` (or any Azure Storage SDK) should not be modified or
+the blob/file may be unrecoverable.
+* Keys for the AES256 block cipher are generated on a per-blob/file basis.
+These keys are encrypted using RSAES-OAEP and encoded in the metadata.
+* The MD5 for both the pre-encrypted and encrypted version of the file is
+stored in the entity metadata, if enabled. `skip_on` options will still work
+transparently with encrypted blobs/files.
+* MAC integrity checks are preferred over MD5 to validate encrypted data.
+* Uploading a file that already exists in Azure Storage, but is not
+encrypted there, will be skipped if any `skip_on` match condition succeeds.
+This behavior can be overridden by deleting the target file in Azure Storage
+or disabling the `skip_on` behavior.
+* Re-uploading a file as an encrypted blob with a different RSA key will be
+skipped if the file content MD5 matches. This behavior can be overridden by
+deleting the target file in Azure Storage or disabling the `skip_on`
+`md5_match` behavior.
+* Zero-byte files are not encrypted.
diff --git a/docs/80-blobxfer-python-library.md b/docs/80-blobxfer-python-library.md
new file mode 100644
index 0000000..e0d74a2
--- /dev/null
+++ b/docs/80-blobxfer-python-library.md
@@ -0,0 +1,3 @@
+# blobxfer Python Library
+
+## TODO
diff --git a/docs/98-performance-considerations.md b/docs/98-performance-considerations.md
new file mode 100644
index 0000000..8a511fc
--- /dev/null
+++ b/docs/98-performance-considerations.md
@@ -0,0 +1,106 @@
+# blobxfer Performance Considerations
+Please read the following carefully regarding performance considerations
+for `blobxfer`. Additionally,
+please review the
+[Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/)
+for an overview of general performance targets that apply to Azure Blobs,
+File shares and Storage Account types (GRS, LRS, ZRS, etc).
+
+## Concurrency
+* `blobxfer` offers four concurrency knobs. Each one should be tuned for
+maximum performance according to your system and network characteristics.
+  * Disk threads: concurrency in reading (uploads) and writing (downloads) to
+    disk is controlled by the number of disk threads.
+  * Transfer threads: concurrency in the number of threads from/to Azure
+    Storage is controlled by the number of transfer threads.
+  * MD5 processes: MD5 computation, used to potentially omit transfers when
+    `skip_on` `md5_match` is specified, is offloaded to the specified
+    number of processes.
+  * Crypto processes: decrypting encrypted blobs and files can be offloaded
+    to the specified number of processes. Due to the inherently
+    non-parallelizable encryption algorithm used, this option is ignored for
+    encryption (uploads).
+* The thread concurrency options (disk and transfer) can be set to a
+non-positive number, in which case they are automatically set to a multiple
+of the number of cores available on the machine.
+* For uploads, there should be a sufficient number of disk threads to ensure
+that all transfer threads have work to do. For downloads, there should be a
+sufficient number of disk threads to write data to disk so that transfer
+threads are not artificially blocked.
+
+## Chunk Sizing
+Chunk sizing refers to the `chunk_size_bytes` option, the meaning of which
+varies depending on whether you are uploading or downloading.
+
+### Uploads
+For uploads, chunk sizes correspond to the maximum amount of data to transfer
+with a single request. The Azure Storage service imposes maximums depending
+upon the type of entity that is being written. For block blobs, the maximum
+is 100MiB (although you may "one-shot" up to 256MiB). For page blobs, append
+blobs, and Azure Files, the maximum is 4MiB.
+
+For block blobs, setting the chunk size to something greater than 4MiB will
+not only allow for larger maximum file sizes (recall that the maximum number
+of blocks for a block blob is 50,000, thus at 100MiB blocks, you can create
+a block blob of approximately 4.75TiB) but will also amortize the
+per-request/response overhead over larger portions of the data transfer.
+`blobxfer` can automatically select the proper block size given your file,
+but will not automatically tune the chunk size as that depends upon your
+system and network characteristics.
+
+### Downloads
+For downloads, chunk sizes correspond to the maximum amount of data to
+request from the server for each request. It is important to keep a balance
+between the chunk size and the number of in-flight operations afforded by
+the `transfer_threads` concurrency control. `blobxfer` does not automatically
+tune this (but can automatically set it to a value that should work for
+most situations) due to varying system and network conditions.
+
+Additionally, disk write performance is typically lower than disk read
+performance, so ensure that `disk_threads` is not set too high in order to
+prevent thrashing and highly random write patterns.
+
+## Azure File Share Performance
+File share performance can be "slow" or become a bottleneck, especially for
+file shares containing thousands of files, as multiple REST calls must be
+performed for each file. Currently, a single file share has a limit of up
+to 60 MB/s and 1000 8KB IOPS. Please refer to the
+[Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/)
+for performance targets and limits regarding Azure Storage File shares.
+If scalable high performance is required, consider using Blob storage
+instead.
+
+## MD5 Hashing
+MD5 hashing imposes a performance penalty when checking if a file should be
+uploaded or downloaded. For instance, if uploading and the local file is
+determined to be different from its remote counterpart, then the time spent
+performing the MD5 comparison is lost.
+
+## Client-side Encryption
+Client-side encryption will naturally impose a performance penalty on
+`blobxfer` both for uploads (encrypting) and downloads (decrypting) depending
+upon the processor speed and number of cores available. Additionally, for
+uploads, encryption is not parallelizable and is performed in-line with the
+main process.
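+
+As a worked example of the chunk sizing guidance earlier in this document,
+the sketch below picks the smallest block size from a set of hypothetical
+candidates that keeps a block blob within the 50,000 block limit. It only
+illustrates the arithmetic involved and is not how `blobxfer` itself selects
+block sizes.
+
+```python
+# Illustrative arithmetic only: candidate sizes and selection logic are
+# hypothetical and do not reflect blobxfer's implementation.
+import math
+
+_MAX_BLOCKS = 50000
+_MIB = 1024 ** 2
+_CANDIDATE_BLOCK_SIZES = [
+    4 * _MIB, 8 * _MIB, 16 * _MIB, 32 * _MIB, 64 * _MIB, 100 * _MIB,
+]
+
+
+def pick_block_size(file_size):
+    """Return the smallest candidate block size that fits the file."""
+    for block_size in _CANDIDATE_BLOCK_SIZES:
+        if int(math.ceil(file_size / float(block_size))) <= _MAX_BLOCKS:
+            return block_size
+    raise ValueError('file too large for a single block blob')
+
+
+if __name__ == '__main__':
+    # a 1 TiB file needs blocks larger than 4 MiB (50,000 x 4 MiB ~ 195 GiB)
+    print('%d MiB blocks' % (pick_block_size(1024 ** 4) // _MIB))
+```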
+
+## pyOpenSSL
+As of requests 2.6.0 and Python versions < 2.7.9 (i.e., the interpreter found
+on default Ubuntu 14.04 installations; 16.04 is not affected), if certain
+packages are installed, such as those found in `requests[security]`, then the
+underlying urllib3 package will utilize the `ndg-httpsclient` package, which
+in turn uses `pyOpenSSL`. This will ensure the peers are fully validated.
+However, this incurs a rather large performance penalty. If you understand
+the potential security risks of disabling this behavior and have high
+performance requirements, you can either remove `ndg-httpsclient` or use
+`blobxfer` in a virtualenv environment without the `ndg-httpsclient` package.
+Python versions >= 2.7.9 are not affected by this issue.
+
+Additionally, `urllib3` (which `requests` uses) may use `pyOpenSSL`, which
+can result in exceptions being thrown that are not normalized by `urllib3`.
+Such exceptions may not be retried even when they should be. It is
+recommended to upgrade to a Python version that does not require `pyOpenSSL`
+for fully validating peers so that `blobxfer` can operate securely without
+`pyOpenSSL`. You can also run `blobxfer` via Docker or in a virtualenv
+environment without `pyOpenSSL`.
diff --git a/docs/99-current-limitations.md b/docs/99-current-limitations.md
new file mode 100644
index 0000000..9b8a3f7
--- /dev/null
+++ b/docs/99-current-limitations.md
@@ -0,0 +1,24 @@
+# blobxfer Current Limitations
+Please read this section carefully for the current known limitations of
+`blobxfer`.
+
+### Client-side Encryption
+* Client-side encryption is currently only available for block blobs and
+Azure Files.
+* `stdin` sources cannot be encrypted.
+* Azure KeyVault key references are currently not supported.
+
+### Platform-specific Issues
+* File attribute store/restore is not supported on Windows.
+
+### Resume Support
+* Encrypted uploads/downloads cannot currently be resumed as the Python
+SHA256 object cannot be pickled.
+* Append blobs currently cannot be resumed for upload.
+
+### Other Limitations
+* MD5 is not computed for append blobs.
+* Empty directories are not created locally when downloading from an Azure
+File share which has empty directories.
+* Empty directories are not deleted if `--delete` is specified and no files
+remain in the directory on the Azure File share.
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..eb1b4f5
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,16 @@
+# blobxfer Documentation
+`blobxfer` is a transfer tool and library to move data between local file
+systems and Azure Storage. The `blobxfer` command-line interface is powered
+by a high-performance Python data movement library of the same name. The
+`blobxfer` data movement library is built on the
+[Azure Storage Python SDK](https://github.com/Azure/azure-storage-python).
+Please refer to the following documents for details on using `blobxfer`.
+
+1. [Installation](01-installation.md)
+2. [Command-Line Usage](10-cli-usage.md)
+3. [YAML Configuration](20-yaml-configuration.md)
+4. [Vectored I/O](30-vectored-io.md)
+5. [Client-side Encryption](40-client-side-encryption.md)
+6. [blobxfer Data Movement Library](80-blobxfer-python-library.md)
+7. [Performance Considerations](98-performance-considerations.md)
+8. 
[Current Limitations](99-current-limitations.md) diff --git a/setup.py b/setup.py index 7709e0e..1f36502 100644 --- a/setup.py +++ b/setup.py @@ -1,41 +1,82 @@ +from codecs import open +import os import re try: from setuptools import setup -except ImportError: +except ImportError: # noqa from distutils.core import setup +import sys -with open('blobxfer.py', 'r') as fd: +if sys.argv[-1] == 'publish': + os.system('rm -rf blobxfer.egg-info/ build dist __pycache__/') + os.system('python setup.py sdist bdist_wheel') + os.unlink('README.rst') + sys.exit() +elif sys.argv[-1] == 'upload': + os.system('twine upload dist/*') + sys.exit() +elif sys.argv[-1] == 'sdist' or sys.argv[-1] == 'bdist_wheel': + import pypandoc + long_description = pypandoc.convert('README.md', 'rst') +else: + long_description = '' + +with open('blobxfer/version.py', 'r', 'utf-8') as fd: version = re.search( - r'^_SCRIPT_VERSION\s*=\s*[\'"]([^\'"]*)[\'"]', + r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) -with open('README.rst') as readme: - long_description = ''.join(readme).strip() +if not version or len(version) == 0: + raise RuntimeError('Cannot find version') + +packages = [ + 'blobxfer', + 'blobxfer.models', + 'blobxfer.operations', + 'blobxfer.operations.azure', + 'blobxfer.operations.azure.blob', + 'blobxfer_cli', +] + +install_requires = [ + 'azure-storage==0.34.2', + 'bitstring==3.1.5', + 'click==6.7', + 'cryptography>=1.9', + 'future==0.16.0', + 'python-dateutil==2.6.0', + 'requests==2.14.2', + 'ruamel.yaml==0.14.12', +] + +if sys.version_info < (3, 4): + install_requires.append('enum34==1.1.6') + +if sys.version_info < (3, 5): + install_requires.append('pathlib2==2.2.1') + install_requires.append('scandir==1.5') setup( name='blobxfer', version=version, author='Microsoft Corporation, Azure Batch and HPC Team', author_email='', - description='Azure storage transfer tool with AzCopy-like features', + description='Azure storage transfer tool and library', long_description=long_description, platforms='any', url='https://github.com/Azure/blobxfer', license='MIT', - py_modules=['blobxfer'], + packages=packages, + package_data={'blobxfer': ['LICENSE']}, + package_dir={'blobxfer': 'blobxfer', 'blobxfer_cli': 'cli'}, entry_points={ - 'console_scripts': 'blobxfer=blobxfer:main', + 'console_scripts': 'blobxfer=blobxfer_cli.cli:cli', }, - install_requires=[ - 'azure-common==1.1.4', - 'azure-storage==0.33.0', - 'azure-servicemanagement-legacy==0.20.5', - 'cryptography>=1.6', - 'requests==2.12.3' - ], + zip_safe=False, + install_requires=install_requires, tests_require=['pytest'], classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 3 - Alpha', 'Environment :: Console', 'Intended Audience :: Developers', 'Intended Audience :: System Administrators', @@ -47,7 +88,8 @@ 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Utilities', ], - keywords='azcopy azure storage blob files transfer copy smb', + keywords='azcopy azure storage blob files transfer copy smb cifs', ) diff --git a/test/test_blobxfer.py b/test/test_blobxfer.py deleted file mode 100644 index 28208af..0000000 --- a/test/test_blobxfer.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -"""Tests for blobxfer""" - -# stdlib imports -import base64 -import copy -import errno -import json -import math -import os -try: - import queue -except ImportError: - import Queue as queue -import socket -import 
sys -import threading -import uuid -# non-stdlib imports -import azure.common -import azure.storage.blob -import cryptography.exceptions -import cryptography.hazmat.backends -import cryptography.hazmat.primitives.asymmetric.rsa -import cryptography.hazmat.primitives.serialization -from mock import (MagicMock, Mock, patch) -import pytest -import requests -import requests_mock -# module under test -sys.path.append('..') -import blobxfer # noqa - - -# global defines -_RSAKEY = cryptography.hazmat.primitives.asymmetric.rsa.generate_private_key( - public_exponent=65537, key_size=2048, - backend=cryptography.hazmat.backends.default_backend()) - - -def test_encrypt_decrypt_chunk(): - enckey, signkey = blobxfer.generate_aes256_keys() - assert len(enckey) == blobxfer._AES256_KEYLENGTH_BYTES - assert len(signkey) == blobxfer._AES256_KEYLENGTH_BYTES - - # test random binary data, unaligned - iv = os.urandom(16) - plaindata = os.urandom(31) - encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB, - pad=True) - assert encdata != plaindata - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB, - unpad=True) - assert decdata == plaindata - with pytest.raises(RuntimeError): - badsig = base64.b64encode(b'0') - blobxfer.decrypt_chunk( - enckey, badsig, encdata, blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB, - unpad=True) - - encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, pad=True) - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, unpad=True) - assert decdata == plaindata - - # test random binary data aligned on boundary - plaindata = os.urandom(32) - encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, pad=True) - assert encdata != plaindata - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv=iv, unpad=True) - assert decdata == plaindata - - # test text data - plaindata = b'attack at dawn!' 
- encdata = blobxfer.encrypt_chunk( - enckey, signkey, plaindata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv, pad=True) - assert encdata != plaindata - decdata = blobxfer.decrypt_chunk( - enckey, signkey, encdata, blobxfer._ENCRYPTION_MODE_FULLBLOB, - iv, unpad=True) - assert decdata == plaindata - - -def test_rsa_keys(): - symkey = os.urandom(32) - enckey, sig = blobxfer.rsa_encrypt_key( - _RSAKEY, None, symkey, asbase64=False) - assert enckey is not None - assert sig is not None - plainkey = blobxfer.rsa_decrypt_key(_RSAKEY, enckey, sig, isbase64=False) - assert symkey == plainkey - - with pytest.raises(cryptography.exceptions.InvalidSignature): - badsig = base64.b64encode(b'0') - blobxfer.rsa_decrypt_key(_RSAKEY, enckey, badsig, isbase64=False) - - enckey, sig = blobxfer.rsa_encrypt_key( - _RSAKEY, None, symkey, asbase64=True) - assert enckey is not None - assert sig is not None - plainkey = blobxfer.rsa_decrypt_key(_RSAKEY, enckey, sig, isbase64=True) - assert symkey == plainkey - - with pytest.raises(cryptography.exceptions.InvalidSignature): - badsig = base64.b64encode(b'0') - blobxfer.rsa_decrypt_key(_RSAKEY, enckey, badsig, isbase64=True) - - -def test_compute_md5(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - testdata = str(uuid.uuid4()) - with open(lpath, 'wt') as f: - f.write(testdata) - md5_file = blobxfer.compute_md5_for_file_asbase64(lpath) - md5_data = blobxfer.compute_md5_for_data_asbase64(testdata.encode('utf8')) - assert md5_file == md5_data - - # test non-existent file - with pytest.raises(IOError): - blobxfer.compute_md5_for_file_asbase64(testdata) - - -def test_page_align_content_length(): - assert 0 == blobxfer.page_align_content_length(0) - assert 512 == blobxfer.page_align_content_length(511) - assert 512 == blobxfer.page_align_content_length(512) - assert 1024 == blobxfer.page_align_content_length(513) - - -def _func_successful_requests_call(timeout=None): - response = MagicMock() - response.raise_for_status = lambda: None - return response - - -def _func_raise_requests_exception_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - response.raise_for_status = lambda: None - return response - val.append(0) - ex = requests.Timeout() - raise ex - - -def _func_raise_requests_connection_error_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - response.raise_for_status = lambda: None - return response - val.append(0) - ex = requests.ConnectionError( - requests.packages.urllib3.exceptions.ProtocolError( - 'Connection aborted.', - socket.error(errno.ECONNRESET, 'Connection reset by peer'))) - raise ex - - -def _func_raise_requests_chunked_encoding_error_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - response.raise_for_status = lambda: None - return response - val.append(0) - ex = requests.exceptions.ChunkedEncodingError( - requests.packages.urllib3.exceptions.ProtocolError( - 'Connection aborted.', - socket.error(errno.ECONNRESET, 'Connection reset by peer'))) - raise ex - - -def _func_raise_azurehttperror_once(val, timeout=None): - if len(val) > 0: - response = MagicMock() - return response - val.append(0) - ex = azure.common.AzureHttpError('ServerBusy', 503) - raise ex - - -@patch('time.sleep', return_value=None) -def test_azure_request(patched_time_sleep): - azcomerr = azure.common.AzureHttpError('ServerBusy', 503) - - with pytest.raises(IOError): - mock = Mock(side_effect=azcomerr) - mock.__name__ = 'name' - blobxfer.azure_request(mock, timeout=0.001) - - with pytest.raises(Exception): - ex = Exception() - 
ex.message = 'Uncaught' - blobxfer.azure_request(Mock(side_effect=ex)) - - with pytest.raises(Exception): - ex = Exception() - ex.__delattr__('message') - blobxfer.azure_request(Mock(side_effect=ex)) - - blobxfer.azure_request( - _func_raise_requests_connection_error_once, val=[], timeout=1) - - blobxfer.azure_request( - _func_raise_requests_chunked_encoding_error_once, val=[], timeout=1) - - blobxfer.azure_request( - _func_raise_azurehttperror_once, val=[], timeout=1) - - with pytest.raises(requests.HTTPError): - exc = requests.HTTPError() - exc.response = MagicMock() - exc.response.status_code = 404 - mock = Mock(side_effect=exc) - blobxfer.azure_request(mock) - - try: - blobxfer.azure_request( - _func_raise_requests_exception_once, val=[], timeout=1) - except Exception: - pytest.fail('unexpected Exception raised') - - try: - blobxfer.azure_request(_func_successful_requests_call) - except Exception: - pytest.fail('unexpected Exception raised') - - -def test_sasblobservice_listblobs(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - content = b'string-value' + \ - b'string-valueint-value' + \ - b'string-valueblob-name' + \ - b'date-time-value' + \ - b'date-time-valueetag' + \ - b'2147483648' + \ - b'blob-content-type' + \ - b'abc' + \ - b'sequence-number' + \ - b'BlockBlob' + \ - b'locked|unlocked' + \ - b'available | leased | expired | breaking | broken' + \ - b'infinite | fixedid' + \ - b'pending | success | aborted | failed' + \ - b'source url' + \ - b'bytes copied/bytes total' + \ - b'datetime' + \ - b'error string' + \ - b'value' + \ - b'blob-prefixnm' + \ - b'' - - with requests_mock.mock() as m: - m.get('mock://blobepcontainer?saskey', content=content) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - metamock = MagicMock() - metamock.metadata = True - result = sbs.list_blobs('container', 'marker', include=metamock) - assert len(result) == 1 - assert result[0].name == 'blob-name' - assert result[0].properties.content_length == 2147483648 - assert result[0].properties.content_settings.content_md5 == 'abc' - assert result[0].properties.blobtype == 'BlockBlob' - assert result[0].metadata['Name'] == 'value' - assert result.next_marker == 'nm' - - m.get('mock://blobepcontainer?saskey', content=b'', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.list_blobs('container', 'marker') - - -def test_sasblobservice_setblobmetadata(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey') - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - sbs.set_blob_metadata('container', 'blob', None) - sbs.set_blob_metadata('container', 'blob', {'name': 'value'}) - - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - with pytest.raises(IOError): - sbs.set_blob_metadata('container', 'blob', {'name': 'value'}) - - -def test_sasblobservice_getblob(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.get('mock://blobepcontainer/blob?saskey', content=b'data') - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - results = sbs._get_blob('container', 'blob', 0, 1) - assert results.content == b'data' - - m.get('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', 
None) - with pytest.raises(IOError): - sbs._get_blob('container', 'blob', 0, 1) - - -def test_sasblobservice_getblobproperties(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.head('mock://blobepcontainer/blob?saskey', - headers={'x-ms-meta-hello': 'world', 'content-length': '1'}) - sbs = blobxfer.SasBlobService('mock://blobep', '?saskey', None) - results = sbs.get_blob_properties('container', 'blob') - assert results.metadata['hello'] == 'world' - - m.head('mock://blobepcontainer/blob?saskey', text='', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.get_blob_properties('container', 'blob') - - -def test_sasblobservice_putblock(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', '?saskey', None) - try: - sbs.put_block( - 'container', 'blob', 'block', 'blockid', - validate_content=False) - except Exception: - pytest.fail('unexpected Exception raised') - - m.put('mock://blobepcontainer/blob?saskey', text='', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.put_block( - 'container', 'blob', 'block', 'blockid', - validate_content=False) - - -def test_sasblobservice_putblocklist(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - block_list = [ - azure.storage.blob.BlobBlock(id='1'), - azure.storage.blob.BlobBlock(id='2') - ] - cs = azure.storage.blob.ContentSettings(content_md5='md5') - sbs.put_block_list('container', 'blob', block_list, cs) - - m.put('mock://blobepcontainer/blob?saskey', text='', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.put_block_list('container', 'blob', block_list, cs) - - -def test_sasblobservice_setblobproperties(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - cs = azure.storage.blob.ContentSettings(content_md5='md5') - sbs.set_blob_properties('container', 'blob', cs) - - m.put('mock://blobepcontainer/blob?saskey', text='', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.set_blob_properties('container', 'blob', cs) - - -def test_sasblobservice_putblob(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - cs = azure.storage.blob.ContentSettings( - content_type='a', content_md5='md5') - sbs._put_blob('container', 'blob', None, cs) - - m.put('mock://blobepcontainer/blob?saskey', content=b'', - status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs._put_blob('container', 'blob', None, cs) - - -def 
test_sasblobservice_createblob(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', content=b'', - status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - cs = azure.storage.blob.ContentSettings(content_md5='md5') - sbs.create_blob('container', 'blob', 0, cs) - - m.put('mock://blobepcontainer/blob?saskey', content=b'', - status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(IOError): - sbs.create_blob('container', 'blob', 0, cs) - - -def test_sasblobservice_createcontainer(): - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.put('mock://blobepcontainer?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - sbs.create_container('container', fail_on_exist=False) - - m.put('mock://blobepcontainer?saskey', status_code=409) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(requests.exceptions.HTTPError): - sbs.create_container('container', fail_on_exist=True) - - -def test_storagechunkworker_run(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - args = MagicMock() - args.rsakey = None - args.pageblob = True - args.autovhd = False - args.timeout = None - args.fileshare = False - - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - exc_list = [] - flock = threading.Lock() - sa_in_queue = queue.PriorityQueue() - sa_out_queue = queue.Queue() - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=200) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, True, (sbs, sbs), None) - with pytest.raises(IOError): - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4, None, flock, None) - - args.pageblob = False - with requests_mock.mock() as m: - m.put('mock://blobepcontainer/blob?saskey', status_code=201) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, True, (sbs, sbs), None) - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4, None, flock, None) - - m.get('mock://blobepcontainer/blob?saskey', status_code=200) - bcw.get_storage_range( - lpath, 'container', 'blob', 0, 0, 4, - [None, None, None, None, None, False], flock, None) - - # test zero-length putblob - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 0, None, flock, None) - bcw._pageblob = True - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 0, None, flock, None) - - # test empty page - with open(lpath, 'wb') as f: - f.write(b'\0' * 4 * 1024 * 1024) - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4 * 1024 * 1024, - None, flock, None) - with open(lpath, 'wb') as f: - f.write(b'\0' * 4 * 1024) - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4 * 1024, - None, flock, None) - - sa_in_queue.put((0, (lpath, 'container', 'blob', 'blockid', 0, 4, - [None, None, None, None], flock, None))) - with requests_mock.mock() as m: - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, 
sa_out_queue, args, False, (sbs, sbs), None) - m.get('mock://blobepcontainer/blob?saskey', status_code=201) - bcw.run() - assert len(exc_list) > 0 - - -@patch('azure.storage.file.FileService.update_range') -@patch('azure.storage.file.FileService._get_file') -def test_storagechunkworker_files_run( - patched_get_file, patched_update_range, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - args = MagicMock() - args.rsakey = None - args.pageblob = False - args.autovhd = False - args.timeout = None - args.fileshare = True - - exc_list = [] - flock = threading.Lock() - sa_in_queue = queue.PriorityQueue() - sa_out_queue = queue.Queue() - fs = azure.storage.file.FileService(account_name='sa', account_key='key') - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, True, None, fs) - patched_update_range.return_value = MagicMock() - bcw.put_storage_data( - lpath, 'container', 'blob', 'blockid', 0, 4, None, flock, None) - - bcw = blobxfer.StorageChunkWorker( - exc_list, sa_in_queue, sa_out_queue, args, False, None, fs) - patched_get_file.return_value = MagicMock() - patched_get_file.return_value.content = b'' - bcw.get_storage_range( - lpath, 'container', 'blob', 0, 0, 4, - [None, None, None, None, None, False], flock, None) - - -@patch('blobxfer.azure_request', return_value=None) -def test_generate_xferspec_download_invalid(patched_azure_request): - args = MagicMock() - args.storageaccount = 'blobep' - args.container = 'container' - args.storageaccountkey = 'saskey' - args.chunksizebytes = 5 - args.timeout = None - args.fileshare = False - sa_in_queue = queue.PriorityQueue() - - with requests_mock.mock() as m: - m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '-1', 'content-md5': 'md5'}) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(ValueError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, 'tmppath', 'blob', True, - [None, None, None]) - - -def test_generate_xferspec_download(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - args.rsakey = None - args.storageaccount = 'blobep' - args.container = 'container' - args.storageaccountkey = 'saskey' - args.chunksizebytes = 5 - args.timeout = None - args.fileshare = False - sa_in_queue = queue.PriorityQueue() - - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with requests_mock.mock() as m: - m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '-1', 'content-md5': 'md5'}) - sbs = blobxfer.SasBlobService('mock://blobep', 'saskey', None) - with pytest.raises(ValueError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', True, - [None, None, None]) - assert sa_in_queue.qsize() == 0 - m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '6', 'content-md5': 'md5'}) - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', True, - [None, None, None]) - assert sa_in_queue.qsize() == 2 - assert 2 == nsops - assert 6 == cl - assert 2 == nsops - assert 'md5' == md5 - assert fd is not None - fd.close() - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - assert 2 == nsops - assert fd is None - assert sa_in_queue.qsize() == 4 - with open(lpath, 'wt') as f: - f.write('012345') - 
m.head('mock://blobepcontainer/blob?saskey', headers={ - 'content-length': '6', 'content-md5': '1qmpM8iq/FHlWsBmK25NSg=='}) - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', True, - [None, None, None]) - assert nsops is None - assert cl is None - assert sa_in_queue.qsize() == 4 - - sa_in_queue = queue.PriorityQueue() - args.rsaprivatekey = _RSAKEY - args.rsapublickey = None - symkey, signkey = blobxfer.generate_aes256_keys() - args.encmode = blobxfer._ENCRYPTION_MODE_CHUNKEDBLOB - metajson = blobxfer.EncryptionMetadataJson( - args, symkey, signkey, iv=b'0', encdata_signature=b'0', - preencrypted_md5=None) - encmeta = metajson.construct_metadata_json() - goodencjson = json.loads(encmeta[blobxfer._ENCRYPTION_METADATA_NAME]) - goodauthjson = json.loads( - encmeta[blobxfer._ENCRYPTION_METADATA_AUTH_NAME]) - metajson2 = blobxfer.EncryptionMetadataJson( - args, None, None, None, None, None) - metajson2.parse_metadata_json( - 'blob', args.rsaprivatekey, args.rsapublickey, encmeta) - assert metajson2.symkey == symkey - assert metajson2.signkey == signkey - assert metajson2.encmode == args.encmode - assert metajson2.chunksizebytes == args.chunksizebytes + \ - blobxfer._AES256CBC_HMACSHA256_OVERHEAD_BYTES + 1 - encjson = json.loads(encmeta[blobxfer._ENCRYPTION_METADATA_NAME]) - encjson[blobxfer._ENCRYPTION_METADATA_LAYOUT][ - blobxfer._ENCRYPTION_METADATA_CHUNKSTRUCTURE] = 'X' - headers = { - 'content-length': '64', - 'content-md5': 'md5', - 'x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME: - json.dumps(encjson), - 'x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME: - json.dumps(goodauthjson), - } - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - # switch to full blob mode tests - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - metajson = blobxfer.EncryptionMetadataJson( - args, symkey, signkey, iv=b'0', encdata_signature=b'0', - preencrypted_md5=None) - encmeta = metajson.construct_metadata_json() - goodencjson = json.loads(encmeta[blobxfer._ENCRYPTION_METADATA_NAME]) - goodauthjson = json.loads( - encmeta[blobxfer._ENCRYPTION_METADATA_AUTH_NAME]) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(goodauthjson) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_AGENT][ - blobxfer._ENCRYPTION_METADATA_PROTOCOL] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_AGENT][ - blobxfer._ENCRYPTION_METADATA_ENCRYPTION_ALGORITHM] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_INTEGRITY_AUTH][ - blobxfer._ENCRYPTION_METADATA_ALGORITHM] = 'X' - 
headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - encjson = copy.deepcopy(goodencjson) - encjson[blobxfer._ENCRYPTION_METADATA_WRAPPEDCONTENTKEY][ - blobxfer._ENCRYPTION_METADATA_ALGORITHM] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson.pop(blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH, None) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson[blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH].pop( - blobxfer._ENCRYPTION_METADATA_AUTH_ENCODING, None) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson[blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH][ - blobxfer._ENCRYPTION_METADATA_ALGORITHM] = 'X' - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - authjson = copy.deepcopy(goodauthjson) - authjson[blobxfer._ENCRYPTION_METADATA_AUTH_METAAUTH][ - blobxfer._ENCRYPTION_METADATA_MAC] = blobxfer.base64encode(b'X') - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(authjson) - m.head('mock://blobepcontainer/blob?saskey', headers=headers) - with pytest.raises(RuntimeError): - blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [None, None, None]) - - args.chunksizebytes = 5 - metajson.chunksizebytes = args.chunksizebytes - metajson.md5 = headers['content-md5'] - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - encjson = copy.deepcopy(goodencjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_NAME] = \ - json.dumps(encjson) - headers['x-ms-meta-' + blobxfer._ENCRYPTION_METADATA_AUTH_NAME] = \ - json.dumps(goodauthjson) - hcl = int(headers['content-length']) - cl, nsops, md5, fd = blobxfer.generate_xferspec_download( - sbs, None, args, sa_in_queue, lpath, 'blob', False, - [hcl, headers['content-md5'], metajson]) - assert hcl == cl - calcops = hcl // 
args.chunksizebytes - hclmod = hcl % args.chunksizebytes - if hclmod > 0: - calcops += 1 - assert calcops == nsops - assert headers['content-md5'] == md5 - assert fd is None - assert sa_in_queue.qsize() == nsops - data = sa_in_queue.get() - assert data is not None - - -def test_generate_xferspec_upload(tmpdir): - lpath = str(tmpdir.join('test.tmp')) - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - args = MagicMock() - args.storageaccount = 'sa' - args.container = 'container' - args.storageaccountkey = 'key' - args.chunksizebytes = 5 - args.skiponmatch = False - args.pageblob = False - args.autovhd = False - sa_in_queue = queue.PriorityQueue() - fs, nsops, md5, fd = blobxfer.generate_xferspec_upload( - args, sa_in_queue, {}, {}, lpath, 'rr', True) - stat = os.stat(lpath) - assert stat.st_size == fs - assert math.ceil(stat.st_size / 5.0) == nsops - assert fd is not None - fd.close() - args.skiponmatch = True - with open(lpath, 'wt') as f: - f.write('012345') - sd = {} - sd['rr'] = [6, '1qmpM8iq/FHlWsBmK25NSg=='] - fs, nsops, md5, fd = blobxfer.generate_xferspec_upload( - args, sa_in_queue, sd, {}, lpath, 'rr', False) - assert fs is None - - -def test_apply_file_collation_and_strip(): - args = MagicMock() - args.collate = 'collatedir' - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'collatedir/file0' - - args.collate = None - args.stripcomponents = 0 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'tmpdir/file0' - args.stripcomponents = 1 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'file0' - args.stripcomponents = 2 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/file0') - assert rfname == 'file0' - args.stripcomponents = 1 - rfname = blobxfer.apply_file_collation_and_strip( - args, '/tmpdir/tmpdir2/file0') - assert rfname == 'tmpdir2/file0' - args.stripcomponents = 2 - rfname = blobxfer.apply_file_collation_and_strip( - args, 'tmpdir/tmpdir2/file0') - assert rfname == 'file0' - - -@patch('azure.storage.file.FileService.create_directory') -def test_create_all_parent_directories_fileshare(patched_cd): - patched_cd.return_value = MagicMock() - fsfile = ['tmp/a/b', None] - file_service = MagicMock() - args = MagicMock() - args.container = 'fshare' - args.timeout = None - dirscreated = set() - blobxfer.create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - assert len(dirscreated) == 3 - assert 'tmp' in dirscreated - assert 'tmp/a' in dirscreated - assert 'tmp/a/b' in dirscreated - fsfile = ['tmp/a/b/c', None] - blobxfer.create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - assert len(dirscreated) == 4 - assert 'tmp/a/b/c' in dirscreated - fsfile = ['x/a/b/c', None] - blobxfer.create_all_parent_directories_fileshare( - file_service, args, fsfile, dirscreated) - assert len(dirscreated) == 8 - assert 'x/a/b/c' in dirscreated - - -def _mock_get_storage_account_keys(timeout=None, service_name=None): - ret = MagicMock() - ret.storage_service_keys.primary = 'mmkey' - return ret - - -def _mock_get_storage_account_properties(timeout=None, service_name=None): - ret = MagicMock() - ret.storage_service_properties.endpoints = [None] - return ret - - -def _mock_blobservice_create_container(timeout=None, container_name=None, - fail_on_exist=None): - raise azure.common.AzureConflictHttpError('conflict', 409) - - -@patch('blobxfer.parseargs') 
-@patch('azure.servicemanagement.ServiceManagementService.' - 'get_storage_account_keys') -@patch('azure.servicemanagement.ServiceManagementService.' - 'get_storage_account_properties') -def test_main1( - patched_sms_saprops, patched_sms_sakeys, patched_parseargs, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - args.include = None - args.stripcomponents = 0 - args.delete = False - args.rsaprivatekey = None - args.rsapublickey = None - args.rsakeypassphrase = None - args.numworkers = 0 - args.localresource = '' - args.storageaccount = 'blobep' - args.container = 'container' - args.storageaccountkey = None - os.environ[blobxfer._ENVVAR_STORAGEACCOUNTKEY] = 'saskey' - args.chunksizebytes = 5 - args.pageblob = False - args.autovhd = False - args.fileshare = False - patched_parseargs.return_value = args - with pytest.raises(ValueError): - blobxfer.main() - args.localresource = lpath - args.endpoint = '' - with pytest.raises(ValueError): - blobxfer.main() - args.endpoint = 'blobep' - args.upload = True - args.download = True - with pytest.raises(ValueError): - blobxfer.main() - args.upload = None - args.download = None - with pytest.raises(ValueError): - blobxfer.main() - os.environ.pop(blobxfer._ENVVAR_STORAGEACCOUNTKEY) - args.storageaccountkey = None - args.timeout = -1 - args.saskey = '' - with pytest.raises(ValueError): - blobxfer.main() - args.saskey = None - args.storageaccountkey = None - args.managementcert = 'cert.spam' - args.subscriptionid = '1234' - with pytest.raises(ValueError): - blobxfer.main() - args.managementcert = 'cert.pem' - args.managementep = None - with pytest.raises(ValueError): - blobxfer.main() - args.managementep = 'mep' - args.subscriptionid = None - with pytest.raises(ValueError): - blobxfer.main() - args.subscriptionid = '1234' - args.pageblob = True - args.autovhd = True - with pytest.raises(ValueError): - blobxfer.main() - args.autovhd = False - args.fileshare = True - with pytest.raises(ValueError): - blobxfer.main() - args.pageblob = False - args.autovhd = True - with pytest.raises(ValueError): - blobxfer.main() - args.autovhd = False - args.fileshare = False - with patch('azure.servicemanagement.ServiceManagementService') as mock: - mock.return_value = MagicMock() - mock.return_value.get_storage_account_keys = \ - _mock_get_storage_account_keys - mock.return_value.get_storage_account_properties = \ - _mock_get_storage_account_properties - with pytest.raises(ValueError): - blobxfer.main() - args.managementep = None - args.managementcert = None - args.subscriptionid = None - args.remoteresource = 'blob' - args.chunksizebytes = None - with patch('azure.storage.blob.BlockBlobService') as mock: - mock.return_value = None - with pytest.raises(ValueError): - blobxfer.main() - args.storageaccountkey = None - args.saskey = None - os.environ[blobxfer._ENVVAR_SASKEY] = 'saskey' - args.remoteresource = None - args.download = True - with pytest.raises(ValueError): - blobxfer.main() - - args.download = False - args.upload = True - args.remoteresource = None - args.storageaccountkey = '' - args.saskey = None - with pytest.raises(ValueError): - blobxfer.main() - - args.collate = 'collatetmp' - with pytest.raises(ValueError): - blobxfer.main() - - args.collate = None - args.storageaccountkey = None - args.saskey = '' - with pytest.raises(ValueError): - blobxfer.main() - - args.saskey = None - os.environ.pop(blobxfer._ENVVAR_SASKEY) - with pytest.raises(ValueError): - blobxfer.main() - args.managementcert = '0' - args.managementep = '' - 
args.subscriptionid = '0' - with pytest.raises(ValueError): - blobxfer.main() - args.managementcert = 'test.pem' - with pytest.raises(ValueError): - blobxfer.main() - args.managementep = 'mep.mep' - ssk = MagicMock() - ssk.storage_service_keys = MagicMock() - ssk.storage_service_keys.primary = '' - patched_sms_sakeys.return_value = ssk - ssp = MagicMock() - ssp.storage_service_properties = MagicMock() - ssp.storage_service_properties.endpoints = ['blobep'] - patched_sms_saprops.return_value = ssp - with pytest.raises(ValueError): - blobxfer.main() - ssk.storage_service_keys.primary = 'key1' - args.storageaccountkey = None - args.rsaprivatekey = '' - args.rsapublickey = '' - with pytest.raises(ValueError): - blobxfer.main() - args.rsaprivatekey = '' - args.rsapublickey = None - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - with pytest.raises(IOError): - blobxfer.main() - - args.rsaprivatekey = None - args.storageaccountkey = None - args.managementcert = None - args.managementep = None - args.subscriptionid = None - - args.upload = False - args.download = True - args.remoteresource = None - args.saskey = 'saskey&srt=c' - with pytest.raises(ValueError): - blobxfer.main() - args.upload = True - args.download = False - args.saskey = None - - os.environ[blobxfer._ENVVAR_SASKEY] = 'saskey' - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - - session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - with requests_mock.mock() as m: - m.put('https://blobep.blob.blobep/container/blob?saskey' - '&comp=block&blockid=00000000', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&blockid=00000000&comp=block', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=block&blockid=00000000', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=metadata', status_code=200) - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - '' + lpath + '' - '6' - 'md5BlockBlob' - '') - args.progressbar = False - args.skiponmatch = True - blobxfer.main() - - args.progressbar = True - args.download = True - args.upload = False - args.remoteresource = None - with pytest.raises(ValueError): - blobxfer.main() - - args.remoteresource = 'blob' - args.localresource = str(tmpdir) - m.head('https://blobep.blob.blobep/container/blob?saskey', headers={ - 'content-length': '6', 'content-md5': '1qmpM8iq/FHlWsBmK25NSg=='}) - m.get('https://blobep.blob.blobep/container/blob?saskey', - content=b'012345') - blobxfer.main() - - args.pageblob = False - args.autovhd = False - args.skiponmatch = False - pemcontents = _RSAKEY.private_bytes( - encoding=cryptography.hazmat.primitives.serialization. - Encoding.PEM, - format=cryptography.hazmat.primitives.serialization. - PrivateFormat.PKCS8, - encryption_algorithm=cryptography.hazmat.primitives. - serialization.NoEncryption()) - pempath = str(tmpdir.join('rsa.pem')) - with open(pempath, 'wb') as f: - f.write(pemcontents) - args.rsaprivatekey = pempath - blobxfer.main() - os.remove(pempath) - - args.rsaprivatekey = None - args.skiponmatch = True - args.remoteresource = '.' 
- args.keepmismatchedmd5files = False - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - 'blob' - '6' - 'BlockBlob' - '') - m.get('https://blobep.blob.blobep/container/?saskey') - with pytest.raises(SystemExit): - blobxfer.main() - - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - 'blob' - '6md5' - 'BlockBlob' - '') - blobxfer.main() - - tmplpath = str(tmpdir.join('test', 'test2', 'test3')) - args.localresource = tmplpath - blobxfer.main() - - args.localresource = str(tmpdir) - notmp_lpath = '/'.join(lpath.strip('/').split('/')[1:]) - - with requests_mock.mock() as m: - args.delete = True - args.download = False - args.upload = True - args.remoteresource = None - args.skiponmatch = False - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container' + lpath + - '?saskey&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/' + notmp_lpath + - '?saskey&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container/' + notmp_lpath + - '?saskey&comp=blocklist', status_code=201) - m.get('https://blobep.blob.blobep/container?saskey&comp=list' - '&restype=container&maxresults=1000', - text='' - 'blob' - '6md5' - 'BlockBlob' - '') - m.delete('https://blobep.blob.blobep/container/blob?saskey', - status_code=202) - with pytest.raises(SystemExit): - blobxfer.main() - - args.recursive = False - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/test.tmp.blobtmp?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=block&blockid=00000000', status_code=200) - m.put('https://blobep.blob.blobep/container/blob?saskey' - '&comp=blocklist', status_code=201) - with pytest.raises(SystemExit): - blobxfer.main() - - args.stripcomponents = None - args.collate = '.' 
- args.pageblob = True - args.upload = True - args.download = False - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey', - status_code=201) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey', - status_code=201) - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=properties', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=properties', status_code=200) - m.put('https://blobep.blob.blobep/container/blob?saskey', - status_code=201) - with pytest.raises(IOError): - blobxfer.main() - - args.stripcomponents = None - m.put('https://blobep.blob.blobep/container/blobsaskey', - status_code=200) - with pytest.raises(IOError): - blobxfer.main() - - args.stripcomponents = None - args.pageblob = False - m.put('https://blobep.blob.blobep/container/' + notmp_lpath + - '?saskey&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/blob?saskey', - status_code=201) - blobxfer.main() - - args.stripcomponents = None - args.autovhd = True - blobxfer.main() - - args.stripcomponents = None - args.include = 'nofiles' - with pytest.raises(SystemExit): - blobxfer.main() - - args.stripcomponents = None - args.include = '*' - blobxfer.main() - - args.include = None - args.stripcomponents = None - args.pageblob = False - args.autovhd = False - pempath = str(tmpdir.join('rsa.pem')) - with open(pempath, 'wb') as f: - f.write(pemcontents) - args.rsaprivatekey = pempath - m.put('https://blobep.blob.blobep/container/rsa.pem?saskey&comp=block' - '&blockid=00000000', status_code=201) - m.put('https://blobep.blob.blobep/container/rsa.pem?saskey' - '&comp=blocklist', status_code=201) - m.put('https://blobep.blob.blobep/container/rsa.pem?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/blob?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/blob.blobtmp?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp.blobtmp?saskey' - '&comp=metadata', status_code=200) - m.put('https://blobep.blob.blobep/container/test.tmp?saskey' - '&comp=metadata', status_code=200) - blobxfer.main() - - args.stripcomponents = None - args.download = True - args.upload = False - args.rsaprivatekey = pempath - args.remoteresource = 'blob' - args.localresource = str(tmpdir) - m.head('https://blobep.blob.blobep/container/blob?saskey', headers={ - 'content-length': '6', 'content-md5': '1qmpM8iq/FHlWsBmK25NSg=='}) - m.get('https://blobep.blob.blobep/container/blob?saskey', - content=b'012345') - # TODO add encrypted data json - blobxfer.main() - - os.environ.pop(blobxfer._ENVVAR_SASKEY) - - -@patch('blobxfer.parseargs') -def test_main2(patched_parseargs, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - patched_parseargs.return_value = args - args.include = None - args.stripcomponents = 1 - args.delete = False - args.rsaprivatekey = None - args.rsapublickey = None - args.numworkers = 64 - args.storageaccount = 'blobep' - args.container = 'container' - args.chunksizebytes = 5 - args.localresource = lpath - args.endpoint = '.blobep' - args.timeout = 10 - args.managementep = None - args.managementcert = None - args.subscriptionid = None - args.chunksizebytes = None - args.download = False - args.upload = True - args.remoteresource = None - args.collate = None - args.saskey = None - args.storageaccountkey = 'key' - args.fileshare = False - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - - 
session = requests.Session() - adapter = requests_mock.Adapter() - session.mount('mock', adapter) - - with patch('azure.storage.blob.BlockBlobService') as mock: - args.createcontainer = True - args.pageblob = False - args.autovhd = False - mock.return_value = MagicMock() - mock.return_value.create_container = _mock_blobservice_create_container - blobxfer.main() - - -@patch('azure.storage.file.FileService.create_share') -@patch('azure.storage.file.FileService.create_file') -@patch('azure.storage.file.FileService.create_directory') -@patch('azure.storage.file.FileService.get_file_properties') -@patch('azure.storage.file.FileService.get_file_metadata') -@patch('azure.storage.file.FileService.list_directories_and_files') -@patch('azure.storage.file.FileService.update_range') -@patch('azure.storage.file.FileService._get_file') -@patch('azure.storage.file.FileService.set_file_properties') -@patch('azure.storage.file.FileService.set_file_metadata') -@patch('azure.storage.file.FileService.resize_file') -@patch('blobxfer.parseargs') -def test_main3( - patched_parseargs, patched_rf, patched_sfm, patched_sfp, - patched_get_file, patched_update_range, patched_ldaf, patched_gfm, - patched_gfp, patched_cd, patched_cf, patched_cs, tmpdir): - lpath = str(tmpdir.join('test.tmp')) - args = MagicMock() - patched_parseargs.return_value = args - args.include = None - args.stripcomponents = 1 - args.delete = False - args.rsaprivatekey = None - args.rsapublickey = None - args.numworkers = 64 - args.storageaccount = 'sa' - args.container = 'myshare' - args.chunksizebytes = 5 - args.localresource = lpath - args.endpoint = 'core.windows.net' - args.timeout = 10 - args.managementep = None - args.managementcert = None - args.subscriptionid = None - args.chunksizebytes = None - args.download = False - args.upload = True - args.remoteresource = None - args.collate = None - args.saskey = None - args.storageaccountkey = 'key' - args.pageblob = False - args.autovhd = False - args.fileshare = True - args.computefilemd5 = True - args.skiponmatch = True - with open(lpath, 'wt') as f: - f.write(str(uuid.uuid4())) - - patched_cs.return_value = MagicMock() - patched_cf.return_value = MagicMock() - patched_gfp.return_value = MagicMock() - patched_update_range.return_value = MagicMock() - patched_get_file.return_value = MagicMock() - patched_get_file.return_value.content = b'\0' * 8 - - pemcontents = _RSAKEY.private_bytes( - encoding=cryptography.hazmat.primitives.serialization. - Encoding.PEM, - format=cryptography.hazmat.primitives.serialization. - PrivateFormat.PKCS8, - encryption_algorithm=cryptography.hazmat.primitives. - serialization.NoEncryption()) - pempath = str(tmpdir.join('rsa.pem')) - with open(pempath, 'wb') as f: - f.write(pemcontents) - - args.rsaprivatekey = pempath - args.rsakeypassphrase = None - args.encmode = blobxfer._ENCRYPTION_MODE_FULLBLOB - blobxfer.main() - - args.download = True - args.upload = False - args.rsaprivatekey = pempath - args.remoteresource = '.' 
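Side note on the removed test_main3 above (the pattern recurs in the new tests added below): stacked mock.patch decorators inject their mocks bottom-up, which is why patched_parseargs, from the decorator closest to the function, arrives as the first argument. A tiny illustration with hypothetical patch targets, not code from this diff:

```python
try:
    import unittest.mock as mock
except ImportError:  # noqa
    import mock

import os.path


@mock.patch('os.path.isdir')    # outermost decorator -> injected last
@mock.patch('os.path.exists')   # closest to the function -> injected first
def _demo(patched_exists, patched_isdir):
    patched_exists.return_value = True
    patched_isdir.return_value = False
    assert os.path.exists('anything') and not os.path.isdir('anything')


_demo()
```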
- with pytest.raises(SystemExit): - blobxfer.main() - - patched_ldaf.return_value = [azure.storage.file.File(name='test.tmp')] - patched_gfp.return_value = MagicMock() - patched_gfp.return_value.properties = MagicMock() - patched_gfp.return_value.properties.content_length = 1 - patched_gfp.return_value.properties.content_settings = MagicMock() - patched_gfp.return_value.properties.content_settings.content_md5 = 'md5' - args.rsaprivatekey = pempath - args.localresource = lpath.rstrip(os.path.sep + 'test.tmp') - blobxfer.main() - - os.remove(pempath) diff --git a/test_requirements.txt b/test_requirements.txt new file mode 100644 index 0000000..c576b44 --- /dev/null +++ b/test_requirements.txt @@ -0,0 +1,5 @@ +flake8>=3.3.0 +mock>=2.0.0; python_version < '3.3' +pypandoc>=1.4 +pytest>=3.1.1 +pytest-cov>=2.5.1 diff --git a/tests/test_blobxfer.py b/tests/test_blobxfer.py new file mode 100644 index 0000000..f64c084 --- /dev/null +++ b/tests/test_blobxfer.py @@ -0,0 +1,13 @@ +# coding=utf-8 +"""Tests for miscellaneous""" + +# stdlib imports +# non-stdlib imports +import azure.storage +# module under test +import blobxfer.version + + +def test_user_agent_monkey_patch(): + verstr = 'blobxfer/{}'.format(blobxfer.version.__version__) + assert azure.storage._constants.USER_AGENT_STRING.startswith(verstr) diff --git a/tests/test_blobxfer_models_azure.py b/tests/test_blobxfer_models_azure.py new file mode 100644 index 0000000..f075092 --- /dev/null +++ b/tests/test_blobxfer_models_azure.py @@ -0,0 +1,54 @@ +# coding=utf-8 +"""Tests for models azure""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +# non-stdlib imports +import azure.storage +import azure.storage.blob +import azure.storage.file +# module under test +import blobxfer.models.azure as azmodels + + +def test_azurestorageentity(): + ase = azmodels.StorageEntity('cont') + assert ase.container == 'cont' + assert ase.encryption_metadata is None + + blob = mock.MagicMock() + blob.name = 'name' + blob.snapshot = None + blob.properties = mock.MagicMock() + blob.properties.last_modified = 'lmt' + blob.properties.content_length = 123 + blob.properties.content_settings = mock.MagicMock() + blob.properties.content_settings.content_md5 = 'abc' + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.BlockBlob + ase.populate_from_blob(mock.MagicMock(), blob) + + assert ase.client is not None + assert ase.name == 'name' + assert ase.lmt == 'lmt' + assert ase.size == 123 + assert ase.md5 == 'abc' + assert ase.snapshot is None + assert ase.mode == azmodels.StorageModes.Block + + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.AppendBlob + ase.populate_from_blob(mock.MagicMock(), blob) + assert ase.mode == azmodels.StorageModes.Append + + blob.properties.blob_type = azure.storage.blob.models._BlobTypes.PageBlob + blob.snapshot = 'abc' + ase.populate_from_blob(mock.MagicMock(), blob) + assert ase.mode == azmodels.StorageModes.Page + assert ase.snapshot is not None + + blob.snapshot = None + ase.populate_from_file(mock.MagicMock(), blob, 'path') + assert ase.mode == azmodels.StorageModes.File + assert ase.snapshot is None diff --git a/tests/test_blobxfer_models_crypto.py b/tests/test_blobxfer_models_crypto.py new file mode 100644 index 0000000..8503a71 --- /dev/null +++ b/tests/test_blobxfer_models_crypto.py @@ -0,0 +1,208 @@ +# coding=utf-8 +"""Tests for crypto models""" + +# stdlib imports +import copy +import json +# non-stdlib imports +import pytest +# local imports +# module 
under test +import blobxfer.models.crypto as models +import blobxfer.operations.crypto as ops + + +_SAMPLE_RSA_KEY = """ +-----BEGIN RSA PRIVATE KEY----- +MIICXQIBAAKBgQDwlQ0W6O2ixhZM+LYl/ZtUi4lpjFu6+Kt/fyim/LQojaa389yD +e3lqWnAitj13n8uLpv1XuysG2fL+G0AvzT9JJj8gageJRC/8uffhOlxvH/vzfFqU +wQEgwhuv9LXdFcl+mON4TiHqbKsUmggNNPNzSN/P0aohMG8pG8ihyO3uOQIDAQAB +AoGBAIkaKA96RpKQmHzc79DOqgqQSorf9hajR/ismpovQOwrbWs/iddUMmktiOH/ +QSA+7Fx1mcK5Y1fQNO4i0X1sVjdasoPvmU7iGVgHQ9TX6F5LGQtDqAKXAH6GpjkF +V7I7nEBs2vtetpzzq8up2nY7fuwPwse44jdLGZjh1pc0HcFRAkEA/F5XdWq5ZYVo +hMyxxhdb+6J8NKZTsWn92tW0s/pGlkgDwrryglpLqNf9MR+Mm906UUVh6ZmsKoxD +kZzA+4S3bwJBAPQLSryk8CUE0uFviYYANq3asn9sDDTGcvEceSGGwbaZOTDVQNQg +7BhLL5vA8Be/xvkXfEaWa1XipmaBI+4WINcCQGQLEiid0jkIldJvQtoAUJqEYzCL +7wmZtuSVazkdsfXJPpRnf9Nk8DFSzjA3DYqMPJ4THyl3neSQDgkfVvFeP0kCQQDu +0OIJKwsJ3ueSznhw1mKrzTkh8pUbTBwNEQUEpv+H9fd+byGqtLD1sRXcwHjzdKt8 +9Nubo/VTraGS68tCYQsvAkAYxzwSeX7Gj9/mMBFx1Y5v9sSCqLZQeF7q1ltzkwlK +n3by7Z7RvxXXPjv1YoFQPV0WlA6zo4sm0HwFzA0sbOql +-----END RSA PRIVATE KEY----- +""" + +_SAMPLE_ED = \ + { + "BlobxferExtensions": { + "PreEncryptedContentMD5": "tc+p1sj+vWGPkawoQ9UKHA==" + }, + "ContentEncryptionIV": "KjA4Y14+J1p7EJcYWhnKNQ==", + "EncryptionAgent": { + "EncryptionAlgorithm": "AES_CBC_256", + "Protocol": "1.0" + }, + "EncryptionAuthentication": { + "Algorithm": "HMAC-SHA256", + "MessageAuthenticationCode": + "9oKt5Ett7t1AWahxNq3qcGd5NbZMxLtzSN8Lwqy3PgU=" + }, + "EncryptionMode": "FullBlob", + "KeyWrappingMetadata": {}, + "WrappedContentKey": { + "Algorithm": "RSA-OAEP", + "EncryptedAuthenticationKey": + "1kO63RxIqIyUp1EW+v2o5VwyhAlrrJiLc+seXnNcVRm0YLHzJYqOrBCz2+" + "c2do2dJKhzTOXyPsJSwkvQVJ0NuYVUTxf6bzDNip2Ge1jTHnsd5IsljMKy" + "rSAvHaKs9NxdvDu5Ex6lhKEChnuMtJBq52zCML5+LUd98WkBxdB2az4=", + "EncryptedKey": + "yOuWT2txNNzOITtDcjV1Uf3/V+TRn5AKjvOtHt+PRuBgMhq6fOFV8kcJhO" + "zPxh8bHqydIFM2OQ+ktiETQ5Ibg7OA24hhr+n8Y6nJNpw3cGtP6L/23n8a" + "a7RMKhmactl3sToFM3xvaXRO0DYuDZeQtPR/DDKPgi2gK641y1THAoc=", + "KeyId": "private:key1" + } + } + +_SAMPLE_EDA = \ + { + "EncryptionMetadataAuthentication": { + "Algorithm": "HMAC-SHA256", + "Encoding": "UTF-8", + "MessageAuthenticationCode": + "BhJjehtHxgSRIBaITDB6o6ZUt6mdehN0PDkhHtwXTP8=" + } + } + + +def test_encryption_metadata_exists(): + md = None + assert not models.EncryptionMetadata.encryption_metadata_exists(md) + + md = {} + assert not models.EncryptionMetadata.encryption_metadata_exists(md) + + md = {'encryptiondata': {}} + assert not models.EncryptionMetadata.encryption_metadata_exists(md) + + md = {'encryptiondata': {'key': 'value'}} + assert models.EncryptionMetadata.encryption_metadata_exists(md) + + +def test_convert_from_json(tmpdir): + keyfile = tmpdir.join('keyfile') + keyfile.write(_SAMPLE_RSA_KEY) + rsaprivatekey = ops.load_rsa_private_key_file(str(keyfile), None) + + # test various missing metadata fields + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionAgent']['EncryptionAlgorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionAgent']['Protocol'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) 
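The copy-tamper-assert pattern in test_convert_from_json repeats once per metadata field. A condensed sketch of the same idea using pytest.mark.parametrize (hypothetical test and fixture names, assuming the module's _SAMPLE_ED/_SAMPLE_EDA constants and imports) could look like:

```python
import copy
import json

import pytest


@pytest.mark.parametrize('section, field', [
    ('EncryptionAgent', 'EncryptionAlgorithm'),
    ('EncryptionAgent', 'Protocol'),
    ('EncryptionAuthentication', 'Algorithm'),
])
def test_convert_from_json_rejects_tampered_field(
        section, field, rsaprivatekey):
    # rsaprivatekey is assumed to be a fixture wrapping
    # ops.load_rsa_private_key_file, as constructed at the top of the test
    ced = copy.deepcopy(_SAMPLE_ED)
    ced[section][field] = 'OOPS'
    md = {
        'encryptiondata': json.dumps(ced, sort_keys=True, ensure_ascii=False),
        'encryptiondata_authentication': json.dumps(_SAMPLE_EDA),
    }
    em = models.EncryptionMetadata()
    with pytest.raises(RuntimeError):
        em.convert_from_json(md, 'blob', rsaprivatekey)
```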
+ + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionAuthentication']['Algorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ced = copy.deepcopy(_SAMPLE_ED) + ced['EncryptionMode'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ced = copy.deepcopy(_SAMPLE_ED) + ced['WrappedContentKey'].pop('EncryptedAuthenticationKey') + ced['WrappedContentKey']['Algorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + ceda = copy.deepcopy(_SAMPLE_EDA) + ceda['EncryptionMetadataAuthentication']['Algorithm'] = 'OOPS' + md = { + 'encryptiondata': json.dumps( + _SAMPLE_ED, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(ceda) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + # test failed hmac + ced = copy.deepcopy(_SAMPLE_ED) + ced.pop('BlobxferExtensions') + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + with pytest.raises(RuntimeError): + em.convert_from_json(md, 'blob', rsaprivatekey) + + # test correct path + md = { + 'encryptiondata': json.dumps( + _SAMPLE_ED, sort_keys=True, ensure_ascii=False), + 'encryptiondata_authentication': json.dumps(_SAMPLE_EDA) + } + em = models.EncryptionMetadata() + em.convert_from_json(md, 'blob', rsaprivatekey) + hmac = em.initialize_hmac() + assert em.wrapped_content_key is not None + assert em._symkey == em.symmetric_key + assert em._signkey == em.signing_key + assert em._symkey is not None + assert em._signkey is not None + assert hmac is not None + + em = models.EncryptionMetadata() + em.convert_from_json(md, 'blob', None) + assert em.wrapped_content_key is not None + assert em._symkey is None + assert em._signkey is None + + ced = copy.deepcopy(_SAMPLE_ED) + ced['WrappedContentKey'].pop('EncryptedAuthenticationKey') + md = { + 'encryptiondata': json.dumps( + ced, sort_keys=True, ensure_ascii=False) + } + em = models.EncryptionMetadata() + em.convert_from_json(md, 'blob', rsaprivatekey) + hmac = em.initialize_hmac() + assert em.wrapped_content_key is not None + assert em._symkey is not None + assert em._signkey is None + assert hmac is None diff --git a/tests/test_blobxfer_models_download.py b/tests/test_blobxfer_models_download.py new file mode 100644 index 0000000..c1b568e --- /dev/null +++ b/tests/test_blobxfer_models_download.py @@ -0,0 +1,744 @@ +# coding=utf-8 +"""Tests for download models""" + +# stdlib imports +import hashlib +import hmac +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +import os +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +import unittest +# non-stdlib imports +import pytest +# local imports +import blobxfer.models.azure as azmodels 
+import blobxfer.models.options as options +import blobxfer.operations.azure as azops +import blobxfer.operations.resume as rops +import blobxfer.util as util +# module under test +import blobxfer.models.download as models + + +def test_localdestinationpath(tmpdir): + tmpdir.mkdir('1') + path = tmpdir.join('1') + + a = models.LocalDestinationPath(str(path)) + a.is_dir = True + assert str(a.path) == str(path) + assert a.is_dir + + a.ensure_path_exists() + assert os.path.exists(str(a.path)) + + b = models.LocalDestinationPath() + b.is_dir = False + b.path = str(path) + with pytest.raises(RuntimeError): + b.ensure_path_exists() + assert not b.is_dir + + path2 = tmpdir.join('2') + path3 = path2.join('3') + c = models.LocalDestinationPath(str(path3)) + with pytest.raises(RuntimeError): + c.ensure_path_exists() + c.is_dir = False + c.ensure_path_exists() + assert os.path.exists(str(path2)) + assert os.path.isdir(str(path2)) + assert not c.is_dir + + +def test_downloadspecification(): + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=options.SkipOn( + filesize_match=True, + lmt_ge=False, + md5_match=True, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + + asp = azops.SourcePath() + p = 'some/remote/path' + asp.add_path_with_storage_account(p, 'sa') + + ds.add_azure_source_path(asp) + + assert ds.options.check_file_md5 + assert not ds.skip_on.lmt_ge + assert ds.destination.path == pathlib.Path('dest') + assert len(ds.sources) == 1 + assert p in ds.sources[0]._path_map + assert ds.sources[0]._path_map[p] == 'sa' + + +def test_downloaddescriptor(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 1024 + ase._encryption = mock.MagicMock() + with pytest.raises(RuntimeError): + d = models.Descriptor(lp, ase, opts, None) + + ase._encryption.symmetric_key = b'123' + d = models.Descriptor(lp, ase, opts, None) + assert not d._allocated + d._allocate_disk_space() + + assert d.entity == ase + assert d.entity.is_encrypted + assert not d.must_compute_md5 + assert d.hmac is not None + assert d._total_chunks == 64 + assert d._offset == 0 + assert d.final_path == lp + assert d._allocated + assert d.final_path.stat().st_size == ase._size - 16 + + d._allocate_disk_space() + assert d._allocated + + d.final_path.unlink() + ase._size = 32 + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + assert d._total_chunks == 2 + assert d._allocated + assert d.final_path.stat().st_size == ase._size - 16 + + d.final_path.unlink() + ase._encryption = None + ase._size = 1024 + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + assert d._allocated + assert d.final_path.stat().st_size == ase._size + + # pre-existing file check + ase._size = 0 + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + assert d._total_chunks == 0 + assert d._allocated + assert d.final_path.stat().st_size == ase._size + + +@unittest.skipIf(util.on_python2(), 'fallocate does not exist') +def test_downloaddescriptor_allocate_disk_space_via_seek(tmpdir): + fp = pathlib.Path(str(tmpdir.join('fp'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + 
opts.chunk_size_bytes = 256 + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + d = models.Descriptor(fp, ase, opts, None) + + with mock.patch('os.posix_fallocate') as patched_fallocate: + patched_fallocate.side_effect = [AttributeError()] + d._allocate_disk_space() + assert d._allocated + assert fp.exists() + assert fp.stat().st_size == ase._size + + +def test_downloaddescriptor_resume(tmpdir): + resumefile = pathlib.Path(str(tmpdir.join('resume'))) + fp = pathlib.Path(str(tmpdir.join('fp'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + ase._client = mock.MagicMock() + + # test no record + rmgr = rops.DownloadResumeManager(resumefile) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # test length mismatch + rmgr.add_or_update_record(str(fp), ase, 0, 0, False, None) + rb = d._resume() + assert rb is None + + # test nothing to resume + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + rmgr.add_or_update_record(str(fp), ase, 0, 0, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # test completion + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + rmgr.add_or_update_record(str(fp), ase, 32, 1, True, None) + d = models.Descriptor(fp, ase, opts, rmgr) + fp.touch() + rb = d._resume() + assert rb == ase._size + + # test encrypted no resume + fp.unlink() + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # test up to chunk + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + ase._client = mock.MagicMock() + + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb == 32 + + # ensure hmac not populated + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + ase._client = mock.MagicMock() + fp.touch() + + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, None) + d = models.Descriptor(fp, ase, opts, rmgr) + d.hmac = True + with pytest.raises(RuntimeError): + d._resume() + + # md5 hash check + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + + data = os.urandom(32) + with fp.open('wb') as f: + f.write(data) + md5 = util.new_md5_hasher() + md5.update(data) + + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, md5.hexdigest()) + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb == 32 + + # md5 hash mismatch + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, 'abc') + ase._md5 = 'abc' + d = models.Descriptor(fp, ase, opts, rmgr) + rb = d._resume() + assert rb is None + + # md5 hash check as page file + rmgr.delete() + rmgr = rops.DownloadResumeManager(resumefile) + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._name = 'blob' + ase._client = mock.MagicMock() + ase._mode = azmodels.StorageModes.Page + + rmgr.add_or_update_record(str(fp), ase, 32, 1, False, md5.hexdigest()) + d = models.Descriptor(fp, ase, opts, rmgr) 
+ rb = d._resume() + assert rb == 32 + + +def test_downloaddescriptor_next_offsets(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = azmodels.StorageEntity('cont') + ase._size = 128 + d = models.Descriptor(lp, ase, opts, None) + + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 128 + assert offsets.range_start == 0 + assert offsets.range_end == 127 + assert not offsets.unpad + assert d.next_offsets() == (None, None) + + ase._size = 0 + d = models.Descriptor(lp, ase, opts, None) + assert d._total_chunks == 0 + assert d.next_offsets() == (None, None) + + ase._size = 1 + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 1 + assert offsets.range_start == 0 + assert offsets.range_end == 0 + assert not offsets.unpad + assert d.next_offsets() == (None, None) + + ase._size = 256 + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + assert d.next_offsets() == (None, None) + + ase._size = 256 + 16 + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert d._total_chunks == 2 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert offsets.chunk_num == 1 + assert offsets.fd_start == 256 + assert offsets.num_bytes == 16 + assert offsets.range_start == 256 + assert offsets.range_end == 256 + 15 + assert not offsets.unpad + assert d.next_offsets() == (None, None) + + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + ase._size = 128 + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 128 + assert offsets.range_start == 0 + assert offsets.range_end == 127 + assert offsets.unpad + assert d.next_offsets() == (None, None) + + ase._size = 256 + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert d._total_chunks == 1 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert offsets.unpad + assert d.next_offsets() == (None, None) + + ase._size = 256 + 32 # 16 bytes over + padding + d = models.Descriptor(lp, ase, opts, None) + offsets, resume_bytes = d.next_offsets() + assert resume_bytes is None + assert d._total_chunks == 2 + assert offsets.chunk_num == 0 + assert offsets.fd_start == 0 + assert offsets.num_bytes == 256 + assert offsets.range_start == 0 + assert offsets.range_end == 255 + assert not offsets.unpad + offsets, 
resume_bytes = d.next_offsets() + assert resume_bytes is None + assert offsets.chunk_num == 1 + assert offsets.fd_start == 256 + assert offsets.num_bytes == 32 + assert offsets.range_start == 256 - 16 + assert offsets.range_end == 256 + 31 + assert offsets.unpad + assert d.next_offsets() == (None, None) + + +def test_hmac_iv(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 256 + ase = azmodels.StorageEntity('cont') + ase._size = 128 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + ase._size = 128 + d = models.Descriptor(lp, ase, opts, None) + + iv = b'abc' + d.hmac_iv(iv) + assert d.hmac.update.call_count == 1 + + +def test_write_unchecked_data(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 32 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + d = models.Descriptor(lp, ase, opts, None) + + offsets, _ = d.next_offsets() + d.write_unchecked_data(offsets, b'0' * ase._size) + + assert offsets.chunk_num in d._unchecked_chunks + ucc = d._unchecked_chunks[offsets.chunk_num] + assert ucc['ucc'].data_len == ase._size + assert ucc['ucc'].fd_start == offsets.fd_start + assert ucc['ucc'].file_path == d.final_path + assert not ucc['ucc'].temp + assert ucc['decrypted'] + + +def test_write_unchecked_hmac_data(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 32 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + d = models.Descriptor(lp, ase, opts, None) + + offsets, _ = d.next_offsets() + d.write_unchecked_hmac_data(offsets, b'0' * ase._size) + + assert offsets.chunk_num in d._unchecked_chunks + ucc = d._unchecked_chunks[offsets.chunk_num] + assert ucc['ucc'].data_len == ase._size + assert ucc['ucc'].fd_start == offsets.fd_start + assert ucc['ucc'].file_path != d.final_path + assert ucc['ucc'].temp + assert not ucc['decrypted'] + + +def test_perform_chunked_integrity_check(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + d = models.Descriptor(lp, ase, opts, None) + + offsets, _ = d.next_offsets() + data = b'0' * opts.chunk_size_bytes + d.write_unchecked_data(offsets, data) + d.perform_chunked_integrity_check() + + assert d._next_integrity_chunk == 1 + assert 0 not in d._unchecked_chunks + assert len(d._unchecked_chunks) == 0 + + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'123' + d = models.Descriptor(lp, ase, opts, None) + + data = b'0' * opts.chunk_size_bytes + offsets, _ = d.next_offsets() + d.write_unchecked_hmac_data(offsets, data) + ucc = d._unchecked_chunks[offsets.chunk_num] + offsets1, _ = d.next_offsets() + d.write_unchecked_hmac_data(offsets1, data) + ucc1 = d._unchecked_chunks[offsets1.chunk_num] + ucc['decrypted'] = True + ucc1['decrypted'] = True + d.perform_chunked_integrity_check() + + assert ucc['ucc'].file_path != d.final_path + assert ucc1['ucc'].file_path != d.final_path + assert d._next_integrity_chunk == 2 + assert 0 not in d._unchecked_chunks + assert 1 not in d._unchecked_chunks + assert len(d._unchecked_chunks) == 0 + + # check integrity with resume + resumefile = 
pathlib.Path(str(tmpdir.join('resume'))) + fp = pathlib.Path(str(tmpdir.join('fp'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + + data = b'0' * opts.chunk_size_bytes + md5 = util.new_md5_hasher() + md5.update(data) + + ase = azmodels.StorageEntity('cont') + ase._size = 32 + ase._name = 'blob' + ase._client = mock.MagicMock() + ase._md5 = md5.hexdigest() + + rmgr = rops.DownloadResumeManager(resumefile) + d = models.Descriptor(fp, ase, opts, rmgr) + + offsets, _ = d.next_offsets() + d.write_unchecked_data(offsets, data) + d.perform_chunked_integrity_check() + assert d._next_integrity_chunk == 1 + assert len(d._unchecked_chunks) == 0 + dr = rmgr.get_record(ase) + assert dr.next_integrity_chunk == 1 + assert dr.md5hexdigest == md5.hexdigest() + + +def test_update_resume_for_completed(tmpdir): + resumefile = pathlib.Path(str(tmpdir.join('resume'))) + fp = pathlib.Path(str(tmpdir.join('fp'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + ase._name = 'blob' + ase._client = mock.MagicMock() + rmgr = rops.DownloadResumeManager(resumefile) + d = models.Descriptor(fp, ase, opts, rmgr) + offsets, _ = d.next_offsets() + d._update_resume_for_completed() + dr = rmgr.get_record(ase) + assert dr.completed + + +def test_cleanup_all_temporary_files(tmpdir): + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 16 + lp = pathlib.Path(str(tmpdir.join('a'))) + d = models.Descriptor(lp, ase, opts, None) + + offsets, _ = d.next_offsets() + data = b'0' * opts.chunk_size_bytes + d.write_unchecked_data(offsets, data) + assert len(d._unchecked_chunks) == 1 + d.cleanup_all_temporary_files() + assert not d.final_path.exists() + assert not d._unchecked_chunks[0]['ucc'].file_path.exists() + + lp = pathlib.Path(str(tmpdir.join('b'))) + d = models.Descriptor(lp, ase, opts, None) + + offsets, _ = d.next_offsets() + data = b'0' * opts.chunk_size_bytes + d.write_unchecked_hmac_data(offsets, data) + assert len(d._unchecked_chunks) == 1 + d._unchecked_chunks[0]['ucc'].file_path.unlink() + d.cleanup_all_temporary_files() + assert not d.final_path.exists() + assert not d._unchecked_chunks[0]['ucc'].file_path.exists() + + +def test_write_data(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + d = models.Descriptor(lp, ase, opts, None) + + offsets, _ = d.next_offsets() + data = b'0' * ase._size + d.write_data(offsets, data) + + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) + + +def test_finalize_integrity_and_file(tmpdir): + # already finalized + lp = pathlib.Path(str(tmpdir.join('af'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + d._finalized = True + d.finalize_integrity() + d.finalize_file() + + assert d.final_path.exists() + assert d.final_path.stat().st_size == ase._size + d.final_path.unlink() + + # hmac check success + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + ase._encryption = 
mock.MagicMock() + ase._encryption.symmetric_key = b'123' + signkey = os.urandom(32) + ase._encryption.initialize_hmac = mock.MagicMock() + ase._encryption.initialize_hmac.return_value = hmac.new( + signkey, digestmod=hashlib.sha256) + + data = b'0' * (ase._size - 16) + _hmac = hmac.new(signkey, digestmod=hashlib.sha256) + _hmac.update(data) + ase._encryption.encryption_authentication.\ + message_authentication_code = util.base64_encode_as_string( + _hmac.digest()) + + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + d.hmac.update(data) + d.finalize_integrity() + d.finalize_file() + + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) + + # md5 check success + lp = pathlib.Path(str(tmpdir.join('b'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + md5 = util.new_md5_hasher() + md5.update(data) + ase._md5 = util.base64_encode_as_string(md5.digest()) + + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + d.md5.update(data) + d.finalize_integrity() + d.finalize_file() + + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) + + # no check + lp = pathlib.Path(str(tmpdir.join('c'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + d.finalize_integrity() + d.finalize_file() + + assert d.final_path.exists() + assert d.final_path.stat().st_size == len(data) + + # md5 mismatch + lp = pathlib.Path(str(tmpdir.join('d'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + + data = b'0' * ase._size + ase._md5 = 'oops' + + d = models.Descriptor(lp, ase, opts, None) + d._allocate_disk_space() + d.md5.update(data) + d.finalize_integrity() + d.finalize_file() + + assert not d.final_path.exists() + + +def test_operations(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 32 + + d = models.Descriptor(lp, ase, opts, None) + d._outstanding_ops = 1 + d._unchecked_chunks = {0: None} + assert not d.all_operations_completed + + d._outstanding_ops -= 1 + d._unchecked_chunks.pop(0) + assert d.all_operations_completed diff --git a/tests/test_blobxfer_models_offload.py b/tests/test_blobxfer_models_offload.py new file mode 100644 index 0000000..24351e3 --- /dev/null +++ b/tests/test_blobxfer_models_offload.py @@ -0,0 +1,39 @@ +# coding=utf-8 +"""Tests for offload""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +# non-stdlib imports +import pytest +# local imports +# module under test +import blobxfer.models.offload as offload + + +def test_multiprocess_offload(): + with pytest.raises(ValueError): + a = offload._MultiprocessOffload(None, None) + + target = mock.MagicMock() + a = offload._MultiprocessOffload(target, 1, 'test') + assert len(a._procs) == 1 + assert not a.terminated + assert a._done_cv == a.done_cv + assert a._check_thread is None + assert a.pop_done_queue() is None + + item = (0, 'abc') + a._done_queue.put(item) + + check_func = mock.MagicMock() + a.initialize_check_thread(check_func) + + a.finalize_processes() + assert 
a.terminated + for proc in a._procs: + assert not proc.is_alive() + + assert a.pop_done_queue() == item diff --git a/tests/test_blobxfer_models_options.py b/tests/test_blobxfer_models_options.py new file mode 100644 index 0000000..31edde7 --- /dev/null +++ b/tests/test_blobxfer_models_options.py @@ -0,0 +1,108 @@ +# coding=utf-8 +"""Tests for models options""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +import pytest +# module under test +import blobxfer.models.options as options + + +@mock.patch('multiprocessing.cpu_count', return_value=1) +def test_concurrency_options(patched_cc): + a = options.Concurrency( + crypto_processes=-1, + md5_processes=0, + disk_threads=-1, + transfer_threads=-2, + ) + + assert a.crypto_processes == 0 + assert a.md5_processes == 1 + assert a.disk_threads == 2 + assert a.transfer_threads == 4 + + a = options.Concurrency( + crypto_processes=-1, + md5_processes=0, + disk_threads=1, + transfer_threads=-1, + ) + + assert a.crypto_processes == 0 + assert a.md5_processes == 1 + assert a.disk_threads == 1 + assert a.transfer_threads == 4 + + +@mock.patch('multiprocessing.cpu_count', return_value=64) +def test_concurrency_options_max_disk_and_transfer_threads(patched_cc): + a = options.Concurrency( + crypto_processes=1, + md5_processes=1, + disk_threads=None, + transfer_threads=None, + ) + + assert a.disk_threads == 64 + assert a.transfer_threads == 96 + + +def test_general_options(): + a = options.General( + concurrency=options.Concurrency( + crypto_processes=1, + md5_processes=2, + disk_threads=3, + transfer_threads=4, + ), + log_file='abc.log', + progress_bar=False, + resume_file='abc', + timeout_sec=1, + verbose=True, + ) + + assert a.concurrency.crypto_processes == 1 + assert a.concurrency.md5_processes == 2 + assert a.concurrency.disk_threads == 3 + assert a.concurrency.transfer_threads == 4 + assert a.log_file == 'abc.log' + assert not a.progress_bar + assert a.resume_file == pathlib.Path('abc') + assert a.timeout_sec == 1 + assert a.verbose + + a = options.General( + concurrency=options.Concurrency( + crypto_processes=1, + md5_processes=2, + disk_threads=3, + transfer_threads=4, + ), + progress_bar=False, + resume_file=None, + timeout_sec=1, + verbose=True, + ) + + assert a.concurrency.crypto_processes == 1 + assert a.concurrency.md5_processes == 2 + assert a.concurrency.disk_threads == 3 + assert a.concurrency.transfer_threads == 4 + assert a.log_file is None + assert not a.progress_bar + assert a.resume_file is None + assert a.timeout_sec == 1 + assert a.verbose + + with pytest.raises(ValueError): + a = options.General(None) diff --git a/tests/test_blobxfer_models_resume.py b/tests/test_blobxfer_models_resume.py new file mode 100644 index 0000000..7fb12a3 --- /dev/null +++ b/tests/test_blobxfer_models_resume.py @@ -0,0 +1,31 @@ +# coding=utf-8 +"""Tests for models resume""" + +# stdlib imports +# non-stdlib imports +# module under test +import blobxfer.models.resume as rmodels + + +def test_download(): + d = rmodels.Download('fp', 1, 2, 0, False, '') + assert d.final_path == 'fp' + assert d.length == 1 + assert d.chunk_size == 2 + assert d.next_integrity_chunk == 0 + assert not d.completed + assert d.md5hexdigest == '' + + d.md5hexdigest = None + assert d.md5hexdigest == '' + + d.md5hexdigest = 'abc' + assert d.md5hexdigest == 'abc' + + d.next_integrity_chunk = 1 + assert d.next_integrity_chunk == 1 + + 
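The assertions just above pin down a small contract on the resume Download record: assigning None to md5hexdigest reads back as ''. A minimal property sketch with that behavior, illustrative only and not the blobxfer implementation:

```python
class _DownloadRecordSketch(object):
    """Mirrors only the None-coercion the assertions above imply."""

    def __init__(self):
        self._md5 = ''

    @property
    def md5hexdigest(self):
        return self._md5

    @md5hexdigest.setter
    def md5hexdigest(self, value):
        # the test expects a None assignment to read back as ''
        self._md5 = value if value is not None else ''


rec = _DownloadRecordSketch()
rec.md5hexdigest = None
assert rec.md5hexdigest == ''
rec.md5hexdigest = 'abc'
assert rec.md5hexdigest == 'abc'
```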
d.completed = True + assert d.completed + + assert len(str(d)) > 0 diff --git a/tests/test_blobxfer_models_upload.py b/tests/test_blobxfer_models_upload.py new file mode 100644 index 0000000..7d9e057 --- /dev/null +++ b/tests/test_blobxfer_models_upload.py @@ -0,0 +1,51 @@ +# coding=utf-8 +"""Tests for models upload""" + +# stdlib imports +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# module under test +import blobxfer.models.upload as upload + + +def test_localsourcepaths_files(tmpdir): + tmpdir.mkdir('abc') + tmpdir.join('moo.cow').write('z') + abcpath = tmpdir.join('abc') + abcpath.join('hello.txt').write('hello') + abcpath.join('blah.x').write('x') + abcpath.join('blah.y').write('x') + abcpath.join('blah.z').write('x') + abcpath.mkdir('def') + defpath = abcpath.join('def') + defpath.join('world.txt').write('world') + defpath.join('moo.cow').write('y') + + a = upload.LocalSourcePath() + a.add_includes('*.txt') + a.add_includes(['moo.cow', '*blah*']) + a.add_excludes('**/blah.x') + a.add_excludes(['world.txt']) + a.add_path(str(tmpdir)) + a_set = set() + for file in a.files(): + sfile = str(file.parent_path / file.relative_path) + a_set.add(sfile) + + assert len(a.paths) == 1 + assert str(abcpath.join('blah.x')) not in a_set + assert str(defpath.join('world.txt')) in a_set + assert str(defpath.join('moo.cow')) not in a_set + + b = upload.LocalSourcePath() + b.add_includes(['moo.cow', '*blah*']) + b.add_includes('*.txt') + b.add_excludes(['world.txt']) + b.add_excludes('**/blah.x') + b.add_paths([pathlib.Path(str(tmpdir))]) + for file in a.files(): + sfile = str(file.parent_path / file.relative_path) + assert sfile in a_set diff --git a/tests/test_blobxfer_operations_azure.py b/tests/test_blobxfer_operations_azure.py new file mode 100644 index 0000000..0322aa4 --- /dev/null +++ b/tests/test_blobxfer_operations_azure.py @@ -0,0 +1,164 @@ +# coding=utf-8 +"""Tests for operations azure""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +# non-stdlib imports +import azure.storage +import azure.storage.blob +import azure.storage.file +import pytest +# module under test +import blobxfer.models.azure as azmodels +import blobxfer.operations.azure as azops + + +def test_storage_credentials(): + creds = azops.StorageCredentials(mock.MagicMock()) + creds.add_storage_account('sa1', 'somekey1', 'endpoint') + + a = creds.get_storage_account('sa1') + assert a.name == 'sa1' + assert a.key == 'somekey1' + assert a.endpoint == 'endpoint' + assert isinstance( + a.append_blob_client, azure.storage.blob.AppendBlobService) + assert isinstance( + a.block_blob_client, azure.storage.blob.BlockBlobService) + assert isinstance( + a.file_client, azure.storage.file.FileService) + assert isinstance( + a.page_blob_client, azure.storage.blob.PageBlobService) + + with pytest.raises(KeyError): + a = creds.get_storage_account('sa2') + + with pytest.raises(ValueError): + creds.add_storage_account('sa1', 'somekeyxx', 'endpoint') + + creds.add_storage_account('sa2', 'somekey2', 'endpoint2') + a = creds.get_storage_account('sa1') + b = creds.get_storage_account('sa2') + assert a.name == 'sa1' + assert a.key == 'somekey1' + assert a.endpoint == 'endpoint' + assert b.name == 'sa2' + assert b.key == 'somekey2' + assert b.endpoint == 'endpoint2' + + +def test_key_is_sas(): + a = azops.StorageAccount('name', 'abcdef', 'endpoint', 10) + assert not a.is_sas + + a = azops.StorageAccount('name', 'abcdef&blah', 'endpoint', 10) + 
assert not a.is_sas + + a = azops.StorageAccount('name', '?abcdef', 'endpoint', 10) + assert a.is_sas + + a = azops.StorageAccount( + 'name', '?sv=0&sr=1&sig=2', 'endpoint', 10) + assert a.is_sas + + a = azops.StorageAccount( + 'name', 'sv=0&sr=1&sig=2', 'endpoint', 10) + assert a.is_sas + + a = azops.StorageAccount( + 'name', 'sig=0&sv=0&sr=1&se=2', 'endpoint', 10) + assert a.is_sas + + +def test_azuresourcepath(): + p = '/cont/remote/path' + asp = azops.SourcePath() + asp.add_path_with_storage_account(p, 'sa') + + with pytest.raises(RuntimeError): + asp.add_path_with_storage_account('x', 'x') + + assert 'sa' == asp.lookup_storage_account(p) + + +@mock.patch('blobxfer.models.crypto.EncryptionMetadata') +@mock.patch('blobxfer.operations.azure.file.list_files') +def test_azuresourcepath_files(patched_lf, patched_em): + p = '/cont/remote/path' + asp = azops.SourcePath() + asp.add_path_with_storage_account(p, 'sa') + + options = mock.MagicMock() + options.mode = azmodels.StorageModes.File + creds = mock.MagicMock() + creds.get_storage_account = mock.MagicMock() + sa = mock.MagicMock() + sa.file_client = mock.MagicMock() + creds.get_storage_account.return_value = sa + f = azure.storage.file.models.File(name='name') + patched_lf.side_effect = [[f]] + patched_em.encryption_metadata_exists = mock.MagicMock() + patched_em.encryption_metadata_exists.return_value = False + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'remote/name' + assert file.encryption_metadata is None + assert i == 1 + + fe = azure.storage.file.models.File(name='name') + fe.metadata = {'encryptiondata': {'a': 'b'}} + patched_lf.side_effect = [[fe]] + patched_em.encryption_metadata_exists.return_value = True + patched_em.convert_from_json = mock.MagicMock() + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'remote/name' + assert file.encryption_metadata is not None + assert i == 1 + + +@mock.patch('blobxfer.models.crypto.EncryptionMetadata') +@mock.patch('blobxfer.operations.azure.blob.list_blobs') +def test_azuresourcepath_blobs(patched_lb, patched_em): + p = '/cont/remote/path' + asp = azops.SourcePath() + asp.add_path_with_storage_account(p, 'sa') + + options = mock.MagicMock() + options.mode = azmodels.StorageModes.Auto + creds = mock.MagicMock() + creds.get_storage_account = mock.MagicMock() + sa = mock.MagicMock() + sa.block_blob_client = mock.MagicMock() + creds.get_storage_account.return_value = sa + b = azure.storage.blob.models.Blob(name='name') + patched_lb.side_effect = [[b]] + patched_em.encryption_metadata_exists = mock.MagicMock() + patched_em.encryption_metadata_exists.return_value = False + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is None + assert i == 1 + + be = azure.storage.blob.models.Blob(name='name') + be.metadata = {'encryptiondata': {'a': 'b'}} + patched_lb.side_effect = [[be]] + patched_em.encryption_metadata_exists.return_value = True + patched_em.convert_from_json = mock.MagicMock() + + i = 0 + for file in asp.files(creds, options, mock.MagicMock()): + i += 1 + assert file.name == 'name' + assert file.encryption_metadata is not None + assert i == 1 diff --git a/tests/test_blobxfer_operations_azure_blob.py b/tests/test_blobxfer_operations_azure_blob.py new file mode 100644 index 0000000..0ed626a --- /dev/null +++ b/tests/test_blobxfer_operations_azure_blob.py @@ -0,0 +1,115 @@ +# coding=utf-8 +"""Tests for 
general blob operations""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +# non-stdlib imports +import azure.common +import azure.storage.blob +import pytest +# local imports +import blobxfer.models.azure as azmodels +# module under test +import blobxfer.operations.azure.blob as ops + + +def test_check_if_single_blob(): + client = mock.MagicMock() + client.get_blob_properties.return_value = True + + result = ops.check_if_single_blob(client, 'a', 'b/c') + assert result + + result = ops.check_if_single_blob( + client, 'a', 'a?snapshot=2017-02-23T22:21:14.8121864Z') + assert result + + client = mock.MagicMock() + client.get_blob_properties = mock.MagicMock() + client.get_blob_properties.side_effect = \ + azure.common.AzureMissingResourceHttpError('msg', 404) + + result = ops.check_if_single_blob(client, 'a', 'b/c') + assert not result + + +def test_list_blobs(): + with pytest.raises(RuntimeError): + for blob in ops.list_blobs( + None, 'cont', 'prefix', azmodels.StorageModes.File, True): + pass + + _blob = azure.storage.blob.models.Blob(name='dir/name') + _blob.properties = azure.storage.blob.models.BlobProperties() + client = mock.MagicMock() + client.list_blobs.return_value = [_blob] + + i = 0 + for blob in ops.list_blobs( + client, 'cont', 'prefix', azmodels.StorageModes.Auto, False): + i += 1 + assert blob.name == _blob.name + assert i == 0 + + i = 0 + for blob in ops.list_blobs( + client, 'cont', 'prefix', azmodels.StorageModes.Auto, True): + i += 1 + assert blob.name == _blob.name + assert i == 1 + + _blob.properties.blob_type = \ + azure.storage.blob.models._BlobTypes.AppendBlob + i = 0 + for blob in ops.list_blobs( + client, 'dir', 'prefix', azmodels.StorageModes.Block, True): + i += 1 + assert blob.name == _blob.name + assert i == 0 + + i = 0 + for blob in ops.list_blobs( + client, 'dir', 'prefix', azmodels.StorageModes.Page, True): + i += 1 + assert blob.name == _blob.name + assert i == 0 + + _blob.properties.blob_type = \ + azure.storage.blob.models._BlobTypes.BlockBlob + i = 0 + for blob in ops.list_blobs( + client, 'dir', 'prefix', azmodels.StorageModes.Append, True): + i += 1 + assert blob.name == _blob.name + assert i == 0 + + _blob.snapshot = '2017-02-23T22:21:14.8121864Z' + client.get_blob_properties.return_value = _blob + i = 0 + for blob in ops.list_blobs( + client, 'cont', + 'a?snapshot=2017-02-23T22:21:14.8121864Z', + azmodels.StorageModes.Auto, + True): + i += 1 + assert blob.name == _blob.name + assert blob.snapshot == _blob.snapshot + assert i == 1 + + +def test_get_blob_range(): + ase = mock.MagicMock() + ret = mock.MagicMock() + ret.content = b'\0' + ase.client._get_blob.return_value = ret + ase.container = 'cont' + ase.name = 'name' + ase.snapshot = None + offsets = mock.MagicMock() + offsets.start_range = 0 + offsets.end_range = 1 + + assert ops.get_blob_range(ase, offsets) == ret.content diff --git a/tests/test_blobxfer_operations_azure_blob_append.py b/tests/test_blobxfer_operations_azure_blob_append.py new file mode 100644 index 0000000..f6e8c23 --- /dev/null +++ b/tests/test_blobxfer_operations_azure_blob_append.py @@ -0,0 +1,28 @@ +# coding=utf-8 +"""Tests for operations: blob append""" + +# stdlib imports +# non-stdlib imports +import azure.storage +# local imports +# module under test +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.blob.append as ops + + +def test_create_client(): + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) + client = ops.create_client(sa) + 
assert client is not None + assert isinstance(client, azure.storage.blob.AppendBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSharedKeyAuthentication) + + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.AppendBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) diff --git a/tests/test_blobxfer_operations_azure_blob_block.py b/tests/test_blobxfer_operations_azure_blob_block.py new file mode 100644 index 0000000..2af2f6f --- /dev/null +++ b/tests/test_blobxfer_operations_azure_blob_block.py @@ -0,0 +1,28 @@ +# coding=utf-8 +"""Tests for operations: block blob""" + +# stdlib imports +# non-stdlib imports +import azure.storage +# local imports +# module under test +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.blob.block as ops + + +def test_create_client(): + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.BlockBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSharedKeyAuthentication) + + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.BlockBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) diff --git a/tests/test_blobxfer_operations_azure_blob_page.py b/tests/test_blobxfer_operations_azure_blob_page.py new file mode 100644 index 0000000..f1b4d8c --- /dev/null +++ b/tests/test_blobxfer_operations_azure_blob_page.py @@ -0,0 +1,28 @@ +# coding=utf-8 +"""Tests for models""" + +# stdlib imports +# non-stdlib imports +import azure.storage +# local imports +# module under test +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.blob.page as ops + + +def test_create_client(): + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.PageBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSharedKeyAuthentication) + + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.blob.PageBlobService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) diff --git a/tests/test_blobxfer_operations_azure_file.py b/tests/test_blobxfer_operations_azure_file.py new file mode 100644 index 0000000..2a45428 --- /dev/null +++ b/tests/test_blobxfer_operations_azure_file.py @@ -0,0 +1,140 @@ +# coding=utf-8 +"""Tests for file operations""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +# non-stdlib imports +import azure.common +import azure.storage +# local imports +import blobxfer.util as util +# module under test +import blobxfer.operations.azure as azops +import blobxfer.operations.azure.file as ops + + +def test_create_client(): + sa = azops.StorageAccount('name', 'key', 'endpoint', 10) + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.file.FileService) + assert isinstance( + client.authentication, + 
azure.storage._auth._StorageSharedKeyAuthentication) + + sa = azops.StorageAccount('name', '?key&sig=key', 'endpoint', 10) + client = ops.create_client(sa) + assert client is not None + assert isinstance(client, azure.storage.file.FileService) + assert isinstance( + client.authentication, + azure.storage._auth._StorageSASAuthentication) + + +def test_parse_file_path(): + rpath = '/a/b/c' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir == 'b' + assert fname == 'c' + + rpath = 'a/b/c/d' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir == 'b/c' + assert fname == 'd' + + rpath = 'a/b' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir is None + assert fname == 'b' + + rpath = 'a' + fshare, path = util.explode_azure_path(util.normalize_azure_path(rpath)) + dir, fname = ops.parse_file_path(path) + assert fshare == 'a' + assert dir is None + assert fname is None + + +def test_check_if_single_file(): + client = mock.MagicMock() + client.get_file_properties = mock.MagicMock() + client.get_file_properties.return_value = mock.MagicMock() + + result = ops.check_if_single_file(client, 'a', 'b/c') + assert result[0] + + result = ops.check_if_single_file(client, 'a', '') + assert not result[0] + + client = mock.MagicMock() + client.get_file_properties = mock.MagicMock() + client.get_file_properties.side_effect = \ + azure.common.AzureMissingResourceHttpError('msg', 404) + + result = ops.check_if_single_file(client, 'a', 'b/c') + assert not result[0] + + +def test_list_files_single_file(): + client = mock.MagicMock() + client.get_file_properties = mock.MagicMock() + client.get_file_properties.return_value = 'fp' + + i = 0 + for file in ops.list_files(client, 'a', 'b/c', True): + i += 1 + assert file == 'fp' + assert i == 1 + + +@mock.patch( + 'blobxfer.operations.azure.file.check_if_single_file', + return_value=(False, None) +) +def test_list_files_directory(patched_cisf): + _file = azure.storage.file.models.File(name='name') + client = mock.MagicMock() + client.list_directories_and_files.return_value = [_file] + client.get_file_properties.return_value = _file + + i = 0 + for file in ops.list_files(client, 'dir', '', True): + i += 1 + assert file.name == 'name' + assert i == 1 + + _dir = azure.storage.file.models.Directory(name='dirname') + _file = azure.storage.file.models.File(name='dirname/name') + client = mock.MagicMock() + client.list_directories_and_files.side_effect = [[_dir, _file]] + client.get_file_properties.side_effect = [_file] + + i = 0 + for file in ops.list_files(client, '', '', True): + i += 1 + assert file.name == _file.name + assert type(file) == azure.storage.file.models.File + assert i == 1 + + +def test_get_file_range(): + ase = mock.MagicMock() + ret = mock.MagicMock() + ret.content = b'\0' + ase.client._get_file.return_value = ret + ase.container = 'cont' + ase.name = 'name' + offsets = mock.MagicMock() + offsets.start_range = 0 + offsets.end_range = 1 + + assert ops.get_file_range(ase, offsets) == ret.content diff --git a/tests/test_blobxfer_operations_crypto.py b/tests/test_blobxfer_operations_crypto.py new file mode 100644 index 0000000..f3dfc61 --- /dev/null +++ b/tests/test_blobxfer_operations_crypto.py @@ -0,0 +1,139 @@ +# coding=utf-8 +"""Tests for crypto operations""" + 
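Stepping back to test_parse_file_path above: its assertions imply a share/<directory>/<filename> split for remote Azure Files paths. A hedged restatement of that observed contract as standalone code (not the library implementation):

```python
def split_remote_file_path(remote_path):
    # Mirrors only the behavior the assertions above exercise: the first
    # component is the share, the last is the file, and anything in between
    # is the (possibly absent) intermediate directory.
    parts = remote_path.strip('/').split('/')
    share = parts[0]
    if len(parts) == 1:
        return share, None, None
    if len(parts) == 2:
        return share, None, parts[1]
    return share, '/'.join(parts[1:-1]), parts[-1]


assert split_remote_file_path('/a/b/c') == ('a', 'b', 'c')
assert split_remote_file_path('a/b/c/d') == ('a', 'b/c', 'd')
assert split_remote_file_path('a/b') == ('a', None, 'b')
assert split_remote_file_path('a') == ('a', None, None)
```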
+# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +import os +import time +# non-stdlib imports +import cryptography.hazmat.primitives.asymmetric.rsa +# local imports +import blobxfer.models.download +# module under test +import blobxfer.operations.crypto as ops + + +_RSAKEY = cryptography.hazmat.primitives.asymmetric.rsa.generate_private_key( + public_exponent=65537, key_size=2048, + backend=cryptography.hazmat.backends.default_backend()) + + +@mock.patch( + 'cryptography.hazmat.primitives.serialization.load_pem_private_key') +def test_load_rsa_private_key_file(patched_load, tmpdir): + keyfile = tmpdir.join('keyfile') + keyfile.write('a') + patched_load.return_value = _RSAKEY + + rv = ops.load_rsa_private_key_file(str(keyfile), None) + assert rv == _RSAKEY + + +@mock.patch('cryptography.hazmat.primitives.serialization.load_pem_public_key') +def test_load_rsa_public_key_file(patched_load, tmpdir): + keyfile = tmpdir.join('keyfile') + keyfile.write('b') + patched_load.return_value = 'rv' + + rv = ops.load_rsa_public_key_file(str(keyfile)) + assert rv == 'rv' + + +def test_rsa_encrypt_decrypt_keys(): + symkey = os.urandom(32) + enckey = ops.rsa_encrypt_key_base64_encoded(_RSAKEY, None, symkey) + assert enckey is not None + plainkey = ops.rsa_decrypt_base64_encoded_key(_RSAKEY, enckey) + assert symkey == plainkey + + +def test_pkcs7_padding(): + buf = os.urandom(32) + pbuf = ops.pkcs7_pad(buf) + buf2 = ops.pkcs7_unpad(pbuf) + assert buf == buf2 + + +def test_aes_cbc_encryption(): + enckey = ops.aes256_generate_random_key() + assert len(enckey) == ops._AES256_KEYLENGTH_BYTES + + # test random binary data, unaligned + iv = os.urandom(16) + plaindata = os.urandom(31) + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, True) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, True) + assert decdata == plaindata + + # test random binary data aligned on boundary + plaindata = os.urandom(32) + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, True) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, True) + assert decdata == plaindata + + # test "text" data + plaintext = 'attack at dawn!' + plaindata = plaintext.encode('utf8') + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, True) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, True) + assert decdata == plaindata + assert plaindata.decode('utf8') == plaintext + + # test unpadded + plaindata = os.urandom(32) + encdata = ops.aes_cbc_encrypt_data(enckey, iv, plaindata, False) + assert encdata != plaindata + decdata = ops.aes_cbc_decrypt_data(enckey, iv, encdata, False) + assert decdata == plaindata + + +def test_cryptooffload_decrypt(tmpdir): + symkey = ops.aes256_generate_random_key() + iv = os.urandom(16) + plainlen = 16 + plaindata = os.urandom(plainlen) + encdata = ops.aes_cbc_encrypt_data(symkey, iv, plaindata, False) + + afile = tmpdir.join('a') + afile.write(encdata, mode='wb') + hmacfile = str(afile) + bfile = tmpdir.join('b') + bfile.ensure(file=True) + + a = None + try: + a = ops.CryptoOffload(1) + offsets = blobxfer.models.download.Offsets( + chunk_num=0, + fd_start=0, # this matters! 
+ num_bytes=2, + range_end=3, + range_start=4, + unpad=False, + ) + a.add_decrypt_chunk( + str(bfile), 0, offsets, symkey, iv, hmacfile) + i = 33 + checked = False + while i > 0: + result = a.pop_done_queue() + if result is None: + time.sleep(0.3) + i -= 1 + continue + assert result == (str(bfile), offsets) + checked = True + break + assert checked + assert bfile.stat().size == plainlen + decdata = bfile.read(mode='rb') + assert decdata == plaindata + finally: + if a is not None: + a.finalize_processes() diff --git a/tests/test_blobxfer_operations_download.py b/tests/test_blobxfer_operations_download.py new file mode 100644 index 0000000..90b83fe --- /dev/null +++ b/tests/test_blobxfer_operations_download.py @@ -0,0 +1,902 @@ +# coding=utf-8 +"""Tests for download operations""" + +# stdlib imports +import datetime +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +import multiprocessing +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +try: + import queue +except ImportError: # noqa + import Queue as queue +# non-stdlib imports +import azure.storage.blob +import dateutil.tz +import pytest +# local imports +import blobxfer.models.azure as azmodels +import blobxfer.models.download as models +import blobxfer.models.options as options +import blobxfer.operations.azure as azops +import blobxfer.util as util +# module under test +import blobxfer.operations.download as ops + + +@mock.patch('blobxfer.operations.azure.file.check_if_single_file') +@mock.patch('blobxfer.operations.azure.blob.check_if_single_blob') +def test_ensure_local_destination(patched_blob, patched_file, tmpdir): + downdir = tmpdir.join('down') + downdir.mkdir() + + # no spec sources + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + local_destination_path=models.LocalDestinationPath( + str(downdir) + ), + ) + with pytest.raises(RuntimeError): + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + + # blob directory + asp = azops.SourcePath() + p = 'cont/remote/path' + asp.add_path_with_storage_account(p, 'sa') + ds.add_azure_source_path(asp) + patched_blob.return_value = False + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + assert ds.destination.is_dir + + # blob single file + rename + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + local_destination_path=models.LocalDestinationPath( + str(downdir) + ), + ) + ds.add_azure_source_path(asp) + patched_blob.return_value = True + with pytest.raises(RuntimeError): + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + + # file directory + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.File, + overwrite=True, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + 
local_destination_path=models.LocalDestinationPath( + str(downdir) + ), + ) + ds.add_azure_source_path(asp) + patched_file.return_value = (False, None) + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + assert ds.destination.is_dir + + # file single + rename + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.File, + overwrite=True, + recursive=True, + rename=True, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=mock.MagicMock(), + local_destination_path=models.LocalDestinationPath( + str(downdir) + ), + ) + ds.add_azure_source_path(asp) + patched_file.return_value = (True, mock.MagicMock()) + with pytest.raises(RuntimeError): + ops.Downloader.ensure_local_destination(mock.MagicMock(), ds) + + +def test_check_download_conditions(tmpdir): + ap = tmpdir.join('a') + ap.write('abc') + ep = pathlib.Path(str(ap)) + nep = pathlib.Path(str(tmpdir.join('nep'))) + + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=False, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=options.SkipOn( + filesize_match=True, + lmt_ge=True, + md5_match=True, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + result = d._check_download_conditions(nep, mock.MagicMock()) + assert result == ops.DownloadAction.Download + result = d._check_download_conditions(ep, mock.MagicMock()) + assert result == ops.DownloadAction.Skip + + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=options.SkipOn( + filesize_match=True, + lmt_ge=True, + md5_match=True, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + rfile = mock.MagicMock() + rfile.md5 = 'abc' + rfile._encryption = None + result = d._check_download_conditions(ep, rfile) + assert result == ops.DownloadAction.CheckMd5 + + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=options.SkipOn( + filesize_match=False, + lmt_ge=False, + md5_match=False, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + result = d._check_download_conditions(ep, mock.MagicMock()) + assert result == ops.DownloadAction.Download + + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=options.SkipOn( + filesize_match=True, + lmt_ge=False, + md5_match=False, + ), + 
local_destination_path=models.LocalDestinationPath('dest'), + ) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + rfile = azmodels.StorageEntity('cont') + rfile._size = util.page_align_content_length(ep.stat().st_size) + rfile._mode = azmodels.StorageModes.Page + rfile._encryption = None + result = d._check_download_conditions(ep, rfile) + assert result == ops.DownloadAction.Skip + + rfile._size = ep.stat().st_size + rfile._mode = azmodels.StorageModes.Page + result = d._check_download_conditions(ep, rfile) + assert result == ops.DownloadAction.Download + + ds = models.Specification( + download_options=options.Download( + check_file_md5=True, + chunk_size_bytes=4194304, + delete_extraneous_destination=False, + mode=azmodels.StorageModes.Auto, + overwrite=True, + recursive=True, + rename=False, + restore_file_attributes=False, + rsa_private_key=None, + ), + skip_on_options=options.SkipOn( + filesize_match=False, + lmt_ge=True, + md5_match=False, + ), + local_destination_path=models.LocalDestinationPath('dest'), + ) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), ds) + rfile = azmodels.StorageEntity('cont') + rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) + \ + datetime.timedelta(days=1) + rfile._encryption = None + result = d._check_download_conditions(ep, rfile) + assert result == ops.DownloadAction.Download + + rfile._lmt = datetime.datetime.now(dateutil.tz.tzutc()) - \ + datetime.timedelta(days=1) + result = d._check_download_conditions(ep, rfile) + assert result == ops.DownloadAction.Skip + + +def test_pre_md5_skip_on_check(): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_offload = mock.MagicMock() + + rfile = azmodels.StorageEntity('cont') + rfile._encryption = mock.MagicMock() + rfile._encryption.blobxfer_extensions = mock.MagicMock() + rfile._encryption.blobxfer_extensions.pre_encrypted_content_md5 = 'abc' + rfile._client = mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None + + lpath = 'lpath' + key = ops.Downloader.create_unique_transfer_operation_id(rfile) + d._pre_md5_skip_on_check(lpath, rfile) + assert key in d._md5_map + + rfile._name = 'name2' + lpath = 'lpath2' + rfile._encryption = None + rfile._md5 = 'abc' + key = ops.Downloader.create_unique_transfer_operation_id(rfile) + d._pre_md5_skip_on_check(lpath, rfile) + assert key in d._md5_map + + assert len(d._md5_map) == 2 + + +def test_post_md5_skip_on_check(tmpdir): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._download_total = 0 + d._download_bytes_total = 0 + d._md5_offload = mock.MagicMock() + + lp = tmpdir.join('lpath').ensure(file=True) + lpath = str(lp) + rfile = azmodels.StorageEntity('cont') + rfile._md5 = 'abc' + rfile._client = mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None + rfile._size = 256 + d._pre_md5_skip_on_check(lpath, rfile) + key = ops.Downloader.create_unique_transfer_operation_id(rfile) + d._transfer_set.add(key) + assert key in d._md5_map + + d._post_md5_skip_on_check(key, lpath, rfile._size, True) + assert key not in d._md5_map + + d._add_to_download_queue = mock.MagicMock() + d._pre_md5_skip_on_check(lpath, rfile) + d._transfer_set.add(key) + d._post_md5_skip_on_check(key, lpath, rfile._size, False) + assert d._add_to_download_queue.call_count == 1 + + +def test_check_for_downloads_from_md5(): + lpath = 'lpath' + rfile = azmodels.StorageEntity('cont') + rfile._md5 = 'abc' + rfile._client = 
mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None + rfile._size = 256 + key = ops.Downloader.create_unique_transfer_operation_id(rfile) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_map[key] = rfile + d._transfer_set.add(key) + d._md5_offload = mock.MagicMock() + d._md5_offload.done_cv = multiprocessing.Condition() + d._md5_offload.pop_done_queue.side_effect = [ + None, + (key, lpath, rfile._size, False), + ] + d._add_to_download_queue = mock.MagicMock() + d._all_remote_files_processed = False + d._download_terminate = True + d._check_for_downloads_from_md5() + assert d._add_to_download_queue.call_count == 0 + + with mock.patch( + 'blobxfer.operations.download.Downloader.' + 'termination_check_md5', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_map[key] = rfile + d._transfer_set.add(key) + d._md5_offload = mock.MagicMock() + d._md5_offload.done_cv = multiprocessing.Condition() + d._md5_offload.pop_done_queue.side_effect = [ + None, + (key, lpath, rfile._size, False), + ] + d._add_to_download_queue = mock.MagicMock() + patched_tc.side_effect = [False, False, True] + d._check_for_downloads_from_md5() + assert d._add_to_download_queue.call_count == 1 + + with mock.patch( + 'blobxfer.operations.download.Downloader.' + 'termination_check_md5', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._md5_map[key] = rfile + d._transfer_set.add(key) + d._md5_offload = mock.MagicMock() + d._md5_offload.done_cv = multiprocessing.Condition() + d._md5_offload.pop_done_queue.side_effect = [None] + d._add_to_download_queue = mock.MagicMock() + patched_tc.side_effect = [False, True, True] + d._check_for_downloads_from_md5() + assert d._add_to_download_queue.call_count == 0 + + +def test_check_for_crypto_done(): + lpath = 'lpath' + rfile = azmodels.StorageEntity('cont') + rfile._md5 = 'abc' + rfile._client = mock.MagicMock() + rfile._client.primary_endpoint = 'ep' + rfile._name = 'name' + rfile._vio = None + rfile._size = 256 + key = ops.Downloader.create_unique_transfer_operation_id(rfile) + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._transfer_set.add(key) + dd = mock.MagicMock() + d._dd_map[lpath] = dd + offsets = mock.MagicMock() + offsets.range_start = 0 + d._crypto_offload = mock.MagicMock() + d._crypto_offload.done_cv = multiprocessing.Condition() + d._crypto_offload.pop_done_queue.side_effect = [ + None, + (lpath, offsets) + ] + d._all_remote_files_processed = False + d._download_terminate = True + d._check_for_crypto_done() + assert dd.perform_chunked_integrity_check.call_count == 0 + + # check successful integrity check call + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._transfer_set.add(key) + dd = mock.MagicMock() + dd.entity = rfile + dd.final_path = lpath + d._dd_map[lpath] = dd + d._crypto_offload = mock.MagicMock() + d._crypto_offload.done_cv = multiprocessing.Condition() + d._crypto_offload.pop_done_queue.side_effect = [ + None, + (lpath, offsets), + ] + patched_tc.side_effect = [False, False, True] + d._complete_chunk_download = mock.MagicMock() + d._check_for_crypto_done() + assert dd.perform_chunked_integrity_check.call_count == 1 + + # 
check KeyError on result + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._transfer_set.add(key) + dd = mock.MagicMock() + dd.entity = rfile + dd.final_path = lpath + d._crypto_offload = mock.MagicMock() + d._crypto_offload.done_cv = multiprocessing.Condition() + d._crypto_offload.pop_done_queue.side_effect = [ + None, + (lpath, offsets), + ] + patched_tc.side_effect = [False, False, True] + d._complete_chunk_download = mock.MagicMock() + d._check_for_crypto_done() + assert dd.perform_chunked_integrity_check.call_count == 0 + + +def test_add_to_download_queue(tmpdir): + path = tmpdir.join('a') + lpath = pathlib.Path(str(path)) + ase = azmodels.StorageEntity('cont') + ase._size = 1 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._spec.options.chunk_size_bytes = 1 + + d._add_to_download_queue(lpath, ase) + assert d._transfer_queue.qsize() == 1 + assert path in d._dd_map + + +def test_initialize_and_terminate_transfer_threads(): + opts = mock.MagicMock() + opts.concurrency.transfer_threads = 2 + d = ops.Downloader(opts, mock.MagicMock(), mock.MagicMock()) + d._worker_thread_transfer = mock.MagicMock() + + d._initialize_transfer_threads() + assert len(d._transfer_threads) == 2 + + d._wait_for_transfer_threads(terminate=True) + assert d._download_terminate + for thr in d._transfer_threads: + assert not thr.is_alive() + + +@mock.patch('blobxfer.operations.crypto.aes_cbc_decrypt_data') +@mock.patch('blobxfer.operations.azure.file.get_file_range') +@mock.patch('blobxfer.operations.azure.blob.get_blob_range') +def test_worker_thread_transfer( + patched_gbr, patched_gfr, patched_acdd, tmpdir): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._complete_chunk_download = mock.MagicMock() + d._download_terminate = True + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 + d._worker_thread_transfer() + assert d._complete_chunk_download.call_count == 0 + + d._download_terminate = False + d._all_remote_files_processed = True + d._worker_thread_transfer() + assert d._complete_chunk_download.call_count == 0 + + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + patched_tc.side_effect = [False, False, True] + ase = azmodels.StorageEntity('cont') + ase._size = 16 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + lp = pathlib.Path(str(tmpdir.join('exc'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + dd = models.Descriptor(lp, ase, opts, None) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [queue.Empty, dd] + d._process_download_descriptor = mock.MagicMock() + d._process_download_descriptor.side_effect = RuntimeError('oops') + d._worker_thread_transfer() + assert len(d._exceptions) == 1 + assert d._process_download_descriptor.call_count == 1 + + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + with mock.patch( + 'blobxfer.models.download.Descriptor.' 
+ 'all_operations_completed', + new_callable=mock.PropertyMock) as patched_aoc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + lp = pathlib.Path(str(tmpdir.join('a'))) + dd = models.Descriptor(lp, ase, opts, None) + dd.next_offsets = mock.MagicMock( + side_effect=[(None, 1), (None, 2)]) + dd.finalize_integrity = mock.MagicMock() + dd.finalize_file = mock.MagicMock() + dd.perform_chunked_integrity_check = mock.MagicMock() + dd.all_operations_completed.side_effect = [False, True] + patched_aoc.side_effect = [False, True] + patched_tc.side_effect = [False, False, False, True] + d._dd_map[str(lp)] = dd + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [queue.Empty, dd, dd] + d._worker_thread_transfer() + assert str(lp) not in d._dd_map + assert dd.finalize_file.call_count == 1 + assert d._download_sofar == 1 + assert d._download_bytes_sofar == 3 + + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 + opts = mock.MagicMock() + opts.check_file_md5 = True + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._mode = azmodels.StorageModes.File + ase._size = 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) + patched_gfr.return_value = b'0' * ase._size + lp = pathlib.Path(str(tmpdir.join('b'))) + dd = models.Descriptor(lp, ase, opts, None) + dd.finalize_file = mock.MagicMock() + dd.perform_chunked_integrity_check = mock.MagicMock() + d._dd_map[str(lp)] = mock.MagicMock() + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [dd] + patched_tc.side_effect = [False, True] + d._worker_thread_transfer() + assert len(d._disk_set) == 1 + a, b, c = d._disk_queue.get() + d._process_data(a, b, c) + assert dd.perform_chunked_integrity_check.call_count == 1 + + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._mode = azmodels.StorageModes.Auto + ase._size = 32 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + ase._encryption.content_encryption_iv = b'0' * 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) + patched_gfr.return_value = b'0' * ase._size 
+ lp = pathlib.Path(str(tmpdir.join('c'))) + dd = models.Descriptor(lp, ase, opts, None) + dd.finalize_file = mock.MagicMock() + dd.write_unchecked_hmac_data = mock.MagicMock() + dd.perform_chunked_integrity_check = mock.MagicMock() + d._crypto_offload = mock.MagicMock() + d._crypto_offload.add_decrypt_chunk = mock.MagicMock() + d._dd_map[str(lp)] = dd + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [dd] + patched_tc.side_effect = [False, True] + d._worker_thread_transfer() + assert len(d._disk_set) == 1 + a, b, c = d._disk_queue.get() + d._process_data(a, b, c) + assert d._crypto_offload.add_decrypt_chunk.call_count == 1 + assert dd.write_unchecked_hmac_data.call_count == 1 + + with mock.patch( + 'blobxfer.operations.download.Downloader.termination_check', + new_callable=mock.PropertyMock) as patched_tc: + d = ops.Downloader( + mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.concurrency.crypto_processes = 0 + d._general_options.concurrency.transfer_threads = 1 + d._general_options.concurrency.disk_threads = 1 + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._mode = azmodels.StorageModes.Auto + ase._size = 32 + ase._encryption = mock.MagicMock() + ase._encryption.symmetric_key = b'abc' + ase._encryption.content_encryption_iv = b'0' * 16 + ase._client = mock.MagicMock() + ase._client.primary_endpoint = 'ep' + ase._name = 'name' + ase._vio = None + key = ops.Downloader.create_unique_transfer_operation_id(ase) + patched_gfr.return_value = b'0' * ase._size + lp = pathlib.Path(str(tmpdir.join('d'))) + dd = models.Descriptor(lp, ase, opts, None) + dd.next_offsets() + dd.write_unchecked_hmac_data = mock.MagicMock() + dd.perform_chunked_integrity_check = mock.MagicMock() + dd.mark_unchecked_chunk_decrypted = mock.MagicMock() + patched_acdd.return_value = b'0' * 16 + d._dd_map[str(lp)] = mock.MagicMock() + d._transfer_set.add(key) + d._transfer_queue = mock.MagicMock() + d._transfer_queue.get.side_effect = [dd, dd] + patched_tc.side_effect = [False, True] + d._worker_thread_transfer() + assert len(d._disk_set) == 1 + a, b, c = d._disk_queue.get() + d._process_data(a, b, c) + assert patched_acdd.call_count == 1 + assert dd.write_unchecked_hmac_data.call_count == 1 + assert dd.perform_chunked_integrity_check.call_count == 1 + + +def test_cleanup_temporary_files(tmpdir): + lp = pathlib.Path(str(tmpdir.join('a'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 16 + dd = models.Descriptor(lp, ase, opts, None) + dd._allocate_disk_space() + dd.cleanup_all_temporary_files = mock.MagicMock() + dd.cleanup_all_temporary_files.side_effect = Exception + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = pathlib.Path('abc') + d._dd_map[0] = dd + d._cleanup_temporary_files() + assert dd.final_path.exists() + + lp = pathlib.Path(str(tmpdir.join('b'))) + opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 16 + dd = models.Descriptor(lp, ase, opts, None) + dd._allocate_disk_space() + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = None + d._dd_map[0] = dd + d._cleanup_temporary_files() + assert not dd.final_path.exists() + + lp = pathlib.Path(str(tmpdir.join('c'))) + 
opts = mock.MagicMock() + opts.check_file_md5 = False + opts.chunk_size_bytes = 16 + ase = azmodels.StorageEntity('cont') + ase._size = 16 + dd = models.Descriptor(lp, ase, opts, None) + dd._allocate_disk_space() + dd.cleanup_all_temporary_files = mock.MagicMock() + dd.cleanup_all_temporary_files.side_effect = Exception + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = None + d._dd_map[0] = dd + d._cleanup_temporary_files() + assert dd.final_path.exists() + + +def test_catalog_local_files_for_deletion(tmpdir): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._spec.options.delete_extraneous_destination = False + + d._catalog_local_files_for_deletion() + assert len(d._delete_after) == 0 + + a = tmpdir.join('a') + a.write('abc') + d._spec.destination.path = tmpdir + d._spec.options.delete_extraneous_destination = True + d._spec.destination.is_dir = True + + d._catalog_local_files_for_deletion() + assert len(d._delete_after) == 1 + assert pathlib.Path(str(a)) in d._delete_after + + +def test_delete_extraneous_files(tmpdir): + a = tmpdir.join('a') + a.write('abc') + fp = pathlib.Path(str(a)) + + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._spec.options.delete_extraneous_destination = True + d._spec.destination.is_dir = True + d._delete_after.add(fp) + + d._delete_extraneous_files() + assert not fp.exists() + + # following should not throw exception + d._delete_extraneous_files() + + +def _create_downloader_for_start(td): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._cleanup_temporary_files = mock.MagicMock() + d._download_start = datetime.datetime.now(tz=dateutil.tz.tzlocal()) + d._initialize_transfer_threads = mock.MagicMock() + d._general_options.concurrency.crypto_processes = 0 + d._general_options.concurrency.md5_processes = 1 + d._general_options.concurrency.disk_threads = 1 + d._general_options.concurrency.transfer_threads = 1 + d._general_options.resume_file = pathlib.Path(str(td.join('rf'))) + d._spec.sources = [] + d._spec.options = mock.MagicMock() + d._spec.options.chunk_size_bytes = 1 + d._spec.options.mode = azmodels.StorageModes.Auto + d._spec.options.overwrite = True + d._spec.options.rename = False + d._spec.skip_on = mock.MagicMock() + d._spec.skip_on.md5_match = False + d._spec.skip_on.lmt_ge = False + d._spec.skip_on.filesize_match = False + d._spec.destination = mock.MagicMock() + d._spec.destination.path = pathlib.Path(str(td)) + d._download_start_time = util.datetime_now() + d._pre_md5_skip_on_check = mock.MagicMock() + d._check_download_conditions = mock.MagicMock() + d._all_remote_files_processed = False + + p = '/cont/remote/path' + asp = azops.SourcePath() + asp.add_path_with_storage_account(p, 'sa') + d._spec.sources.append(asp) + + return d + + +@mock.patch('blobxfer.operations.md5.LocalFileMd5Offload') +@mock.patch('blobxfer.operations.azure.blob.list_blobs') +@mock.patch( + 'blobxfer.operations.download.Downloader.ensure_local_destination', + return_value=True +) +@mock.patch( + 'blobxfer.operations.download.Downloader.' 
+ 'create_unique_transfer_operation_id', + return_value='id' +) +@mock.patch( + 'blobxfer.operations.download.Downloader._wait_for_transfer_threads', + return_value=None +) +@mock.patch( + 'blobxfer.operations.download.Downloader._wait_for_disk_threads', + return_value=None +) +def test_start( + patched_wdt, patched_wtt, patched_cutoi, patched_eld, patched_lb, + patched_lfmo, tmpdir): + patched_lfmo._check_thread = mock.MagicMock() + + b = azure.storage.blob.models.Blob(name='remote/path/name') + b.properties.content_length = 1 + patched_lb.side_effect = [[b]] + d = _create_downloader_for_start(tmpdir) + d._check_download_conditions.return_value = ops.DownloadAction.Skip + d._download_sofar = -1 + d._download_bytes_sofar = -1 + d.start() + assert d._pre_md5_skip_on_check.call_count == 0 + + patched_lb.side_effect = [[b]] + d = _create_downloader_for_start(tmpdir) + d._check_download_conditions.return_value = ops.DownloadAction.CheckMd5 + d._download_sofar = -1 + with pytest.raises(RuntimeError): + d.start() + d._download_terminate = True + assert d._pre_md5_skip_on_check.call_count == 1 + + b.properties.content_length = 0 + patched_lb.side_effect = [[b]] + d = _create_downloader_for_start(tmpdir) + d._check_download_conditions.return_value = ops.DownloadAction.Download + with pytest.raises(RuntimeError): + d.start() + d._download_terminate = True + assert d._transfer_queue.qsize() == 1 + + # test exception count + b = azure.storage.blob.models.Blob(name='name') + b.properties.content_length = 1 + patched_lb.side_effect = [[b]] + d = _create_downloader_for_start(tmpdir) + d._spec.destination.is_dir = False + d._spec.options.rename = True + d._check_download_conditions.return_value = ops.DownloadAction.Skip + d._exceptions = [RuntimeError('oops')] + with pytest.raises(RuntimeError): + d.start() + d._download_terminate = True + assert d._pre_md5_skip_on_check.call_count == 0 + + +def test_start_keyboard_interrupt(): + d = ops.Downloader(mock.MagicMock(), mock.MagicMock(), mock.MagicMock()) + d._general_options.resume_file = None + d._run = mock.MagicMock(side_effect=KeyboardInterrupt) + d._wait_for_transfer_threads = mock.MagicMock() + d._cleanup_temporary_files = mock.MagicMock() + d._md5_offload = mock.MagicMock() + + with pytest.raises(KeyboardInterrupt): + d.start() + assert d._wait_for_transfer_threads.call_count == 1 + assert d._cleanup_temporary_files.call_count == 1 diff --git a/tests/test_blobxfer_operations_md5.py b/tests/test_blobxfer_operations_md5.py new file mode 100644 index 0000000..02be647 --- /dev/null +++ b/tests/test_blobxfer_operations_md5.py @@ -0,0 +1,128 @@ +# coding=utf-8 +"""Tests for md5""" + +# stdlib imports +import time +import uuid +# non-stdlib imports +import pytest +# local imports +import blobxfer.models.azure as azmodels +# module under test +import blobxfer.operations.md5 as ops + + +def test_compute_md5(tmpdir): + lpath = str(tmpdir.join('test.tmp')) + testdata = str(uuid.uuid4()) + with open(lpath, 'wt') as f: + f.write(testdata) + md5_file = ops.compute_md5_for_file_asbase64(lpath) + md5_data = ops.compute_md5_for_data_asbase64(testdata.encode('utf8')) + assert md5_file == md5_data + + md5_file_page = ops.compute_md5_for_file_asbase64(lpath, True) + assert md5_file != md5_file_page + + # test non-existent file + with pytest.raises(IOError): + ops.compute_md5_for_file_asbase64(testdata) + + +def test_done_cv(): + a = None + try: + a = ops.LocalFileMd5Offload(num_workers=1) + assert a.done_cv == a._done_cv + finally: + if a: + a.finalize_processes() + + 
+def test_finalize_md5_processes(): + with pytest.raises(ValueError): + ops.LocalFileMd5Offload(num_workers=0) + + a = None + try: + a = ops.LocalFileMd5Offload(num_workers=1) + finally: + if a: + a.finalize_processes() + + for proc in a._procs: + assert not proc.is_alive() + + +def test_from_add_to_done_non_pagealigned(tmpdir): + file = tmpdir.join('a') + file.write('abc') + fpath = str(file) + key = 'key' + + remote_md5 = ops.compute_md5_for_file_asbase64(str(file)) + + a = None + try: + a = ops.LocalFileMd5Offload(num_workers=1) + result = a.pop_done_queue() + assert result is None + + a.add_localfile_for_md5_check( + key, fpath, fpath, remote_md5, azmodels.StorageModes.Block, None) + i = 33 + checked = False + while i > 0: + result = a.pop_done_queue() + if result is None: + time.sleep(0.3) + i -= 1 + continue + assert len(result) == 4 + assert result[0] == key + assert result[1] == str(file) + assert result[2] is None + assert result[3] + checked = True + break + assert checked + finally: + if a: + a.finalize_processes() + + +def test_from_add_to_done_pagealigned(tmpdir): + file = tmpdir.join('a') + file.write('abc') + fpath = str(file) + key = 'key' + + remote_md5 = ops.compute_md5_for_file_asbase64(str(file), True) + + a = None + try: + a = ops.LocalFileMd5Offload(num_workers=1) + result = a.pop_done_queue() + assert result is None + + a.add_localfile_for_md5_check( + key, fpath, fpath, remote_md5, azmodels.StorageModes.Page, None) + i = 33 + checked = False + while i > 0: + result = a.pop_done_queue() + if result is None: + time.sleep(0.3) + i -= 1 + continue + assert len(result) == 4 + assert result[0] == key + assert result[1] == str(file) + assert result[2] is None + assert result[3] + checked = True + break + assert checked + finally: + if a: + a.finalize_processes() diff --git a/tests/test_blobxfer_operations_progress.py b/tests/test_blobxfer_operations_progress.py new file mode 100644 index 0000000..721501e --- /dev/null +++ b/tests/test_blobxfer_operations_progress.py @@ -0,0 +1,41 @@ +# coding=utf-8 +"""Tests for progress operations""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +# non-stdlib imports +# local imports +import blobxfer.util as util +# module under test +import blobxfer.operations.progress as ops + + +def test_output_parameters(): + go = mock.MagicMock() + spec = mock.MagicMock() + go.log_file = 'abc' + + ops.output_parameters(go, spec) + + assert util.is_not_empty(go.log_file) + + +def test_update_progress_bar(): + go = mock.MagicMock() + go.progress_bar = True + go.log_file = 'abc' + + start = util.datetime_now() + + ops.update_progress_bar( + go, 'download', start, None, 1, None, 1) + + with mock.patch('blobxfer.util.datetime_now') as patched_dt: + patched_dt.return_value = start + ops.update_progress_bar( + go, 'synccopy', start, 1, 1, 1, 1) + + assert util.is_not_empty(go.log_file) diff --git a/tests/test_blobxfer_operations_resume.py b/tests/test_blobxfer_operations_resume.py new file mode 100644 index 0000000..9894d3b --- /dev/null +++ b/tests/test_blobxfer_operations_resume.py @@ -0,0 +1,74 @@ +# coding=utf-8 +"""Tests for operations resume""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +# non-stdlib imports +# module under test +import blobxfer.operations.resume as ops + + +def test_download_resume_manager(tmpdir): + tmpdb = pathlib.Path(str(tmpdir.join('tmp.db'))) 
+ + drm = ops.DownloadResumeManager(tmpdb) + assert drm._data is not None + drm.close() + assert drm._data is None + assert tmpdb.exists() + drm.delete() + assert drm._data is None + assert not tmpdb.exists() + + ase = mock.MagicMock() + ase._name = 'name' + ase._client.primary_endpoint = 'ep' + ase._size = 16 + + final_path = 'fp' + drm = ops.DownloadResumeManager(tmpdb) + drm.add_or_update_record(final_path, ase, 2, 0, False, None) + d = drm.get_record(ase) + + assert d.final_path == final_path + + drm.add_or_update_record(final_path, ase, 2, 1, False, 'abc') + d = drm.get_record(ase) + + assert d.final_path == final_path + assert not d.completed + assert d.next_integrity_chunk == 1 + assert d.md5hexdigest == 'abc' + + drm.add_or_update_record(final_path, ase, 2, 1, True, None) + d = drm.get_record(ase) + + assert d.final_path == final_path + assert d.completed + assert d.next_integrity_chunk == 1 + assert d.md5hexdigest == 'abc' + + # idempotent check after completed + drm.add_or_update_record(final_path, ase, 2, 1, True, None) + d = drm.get_record(ase) + + assert d.final_path == final_path + assert d.completed + assert d.next_integrity_chunk == 1 + assert d.md5hexdigest == 'abc' + + drm.close() + assert drm._data is None + assert tmpdb.exists() + + tmpdb.unlink() + drm.delete() + assert drm._data is None + assert not tmpdb.exists() diff --git a/tests/test_blobxfer_retry.py b/tests/test_blobxfer_retry.py new file mode 100644 index 0000000..d44fa21 --- /dev/null +++ b/tests/test_blobxfer_retry.py @@ -0,0 +1,50 @@ +# coding=utf-8 +"""Tests for retry""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +# non-stdlib imports +import pytest +# module under test +import blobxfer.retry as retry + + +def test_exponentialretrywithmaxwait(): + with pytest.raises(ValueError): + er = retry.ExponentialRetryWithMaxWait( + initial_backoff=1, max_backoff=0) + + with pytest.raises(ValueError): + er = retry.ExponentialRetryWithMaxWait( + initial_backoff=1, max_backoff=1, max_retries=-1) + + with pytest.raises(ValueError): + er = retry.ExponentialRetryWithMaxWait( + initial_backoff=2, max_backoff=1) + + er = retry.ExponentialRetryWithMaxWait() + context = mock.MagicMock() + context.count = 0 + context.response.status = 500 + bo = er.retry(context) + assert context.count == 1 + assert bo == 0.1 + + bo = er.retry(context) + assert context.count == 2 + assert bo == 0.2 + + bo = er.retry(context) + assert context.count == 3 + assert bo == 0.4 + + bo = er.retry(context) + assert context.count == 4 + assert bo == 0.8 + + bo = er.retry(context) + assert context.count == 5 + assert bo == 0.1 diff --git a/tests/test_blobxfer_util.py b/tests/test_blobxfer_util.py new file mode 100644 index 0000000..e294a0e --- /dev/null +++ b/tests/test_blobxfer_util.py @@ -0,0 +1,224 @@ +# coding=utf-8 +"""Tests for util""" + +# stdlib imports +try: + import unittest.mock as mock +except ImportError: # noqa + import mock +try: + import pathlib2 as pathlib +except ImportError: # noqa + import pathlib +import sys +# non-stdlib imports +import pytest +# module under test +import blobxfer.util + + +def test_on_python2(): + py2 = sys.version_info.major == 2 + assert py2 == blobxfer.util.on_python2() + + +def test_is_none_or_empty(): + a = None + assert blobxfer.util.is_none_or_empty(a) + a = [] + assert blobxfer.util.is_none_or_empty(a) + a = {} + assert blobxfer.util.is_none_or_empty(a) + a = '' + assert blobxfer.util.is_none_or_empty(a) + a = 'asdf' + assert not 
blobxfer.util.is_none_or_empty(a) + a = ['asdf'] + assert not blobxfer.util.is_none_or_empty(a) + a = {'asdf': 0} + assert not blobxfer.util.is_none_or_empty(a) + a = [None] + assert not blobxfer.util.is_none_or_empty(a) + + +def test_is_not_empty(): + a = None + assert not blobxfer.util.is_not_empty(a) + a = [] + assert not blobxfer.util.is_not_empty(a) + a = {} + assert not blobxfer.util.is_not_empty(a) + a = '' + assert not blobxfer.util.is_not_empty(a) + a = 'asdf' + assert blobxfer.util.is_not_empty(a) + a = ['asdf'] + assert blobxfer.util.is_not_empty(a) + a = {'asdf': 0} + assert blobxfer.util.is_not_empty(a) + a = [None] + assert blobxfer.util.is_not_empty(a) + + +def test_merge_dict(): + with pytest.raises(ValueError): + blobxfer.util.merge_dict(1, 2) + + a = {'a_only': 42, 'a_and_b': 43, + 'a_only_dict': {'a': 44}, 'a_and_b_dict': {'a_o': 45, 'a_a_b': 46}} + b = {'b_only': 45, 'a_and_b': 46, + 'b_only_dict': {'a': 47}, 'a_and_b_dict': {'b_o': 48, 'a_a_b': 49}} + c = blobxfer.util.merge_dict(a, b) + assert c['a_only'] == 42 + assert c['b_only'] == 45 + assert c['a_and_b_dict']['a_o'] == 45 + assert c['a_and_b_dict']['b_o'] == 48 + assert c['a_and_b_dict']['a_a_b'] == 49 + assert c['b_only_dict']['a'] == 47 + assert c['a_and_b'] == 46 + assert a['a_only'] == 42 + assert a['a_and_b'] == 43 + assert b['b_only'] == 45 + assert b['a_and_b'] == 46 + + +def test_scantree(tmpdir): + tmpdir.mkdir('abc') + abcpath = tmpdir.join('abc') + abcpath.join('hello.txt').write('hello') + abcpath.mkdir('def') + defpath = abcpath.join('def') + defpath.join('world.txt').write('world') + found = set() + for de in blobxfer.util.scantree(str(tmpdir)): + if de.name != '.lock': + found.add(de.name) + assert 'hello.txt' in found + assert 'world.txt' in found + assert len(found) == 2 + + +def test_replace_file(tmpdir): + src = pathlib.Path(str(tmpdir.join('src'))) + dst = pathlib.Path(str(tmpdir.join('dst'))) + src.touch() + dst.touch() + + replace_avail = sys.version_info >= (3, 3) + + with mock.patch( + 'sys.version_info', + new_callable=mock.PropertyMock(return_value=(3, 2, 0))): + blobxfer.util.replace_file(src, dst) + assert not src.exists() + assert dst.exists() + + dst.unlink() + src.touch() + dst.touch() + + with mock.patch( + 'sys.version_info', + new_callable=mock.PropertyMock(return_value=(3, 3, 0))): + if replace_avail: + blobxfer.util.replace_file(src, dst) + assert not src.exists() + assert dst.exists() + else: + src = mock.MagicMock() + blobxfer.util.replace_file(src, dst) + assert src.replace.call_count == 1 + + +def test_get_mime_type(): + a = 'b.txt' + mt = blobxfer.util.get_mime_type(a) + assert mt == 'text/plain' + a = 'c.probably_cant_determine_this' + mt = blobxfer.util.get_mime_type(a) + assert mt == 'application/octet-stream' + + +def test_base64_encode_as_string(): + a = b'abc' + enc = blobxfer.util.base64_encode_as_string(a) + if blobxfer.util.on_python2(): + assert type(enc) == str + else: + assert type(enc) != bytes + dec = blobxfer.util.base64_decode_string(enc) + assert a == dec + + +def test_page_align_content_length(): + assert 0 == blobxfer.util.page_align_content_length(0) + assert 512 == blobxfer.util.page_align_content_length(1) + assert 512 == blobxfer.util.page_align_content_length(511) + assert 512 == blobxfer.util.page_align_content_length(512) + assert 1024 == blobxfer.util.page_align_content_length(513) + assert 1024 == blobxfer.util.page_align_content_length(1023) + assert 1024 == blobxfer.util.page_align_content_length(1024) + assert 1536 == 
blobxfer.util.page_align_content_length(1025) + + +def test_normalize_azure_path(): + a = '\\cont\\r1\\r2\\r3\\' + b = blobxfer.util.normalize_azure_path(a) + assert b == 'cont/r1/r2/r3' + + a = '/cont/r1/r2/r3/' + b = blobxfer.util.normalize_azure_path(a) + assert b == 'cont/r1/r2/r3' + + a = '/cont\\r1/r2\\r3/' + b = blobxfer.util.normalize_azure_path(a) + assert b == 'cont/r1/r2/r3' + + with pytest.raises(ValueError): + blobxfer.util.normalize_azure_path('') + + +def test_explode_azure_path(): + p = 'cont' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'cont' + assert rpath == '' + + p = 'cont/' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'cont' + assert rpath == '' + + p = 'cont/a/' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'cont' + assert rpath == 'a' + + p = '/some/remote/path' + cont, rpath = blobxfer.util.explode_azure_path(p) + assert cont == 'some' + assert rpath == 'remote/path' + + +def test_blob_is_snapshot(): + a = '/cont/a?snapshot=2017-02-23T22:21:14.8121864Z' + assert blobxfer.util.blob_is_snapshot(a) + + a = '/cont/a?snapshot=abc' + assert not blobxfer.util.blob_is_snapshot(a) + + a = '/cont/a?snapshot=' + assert not blobxfer.util.blob_is_snapshot(a) + + a = '/cont/a?snapshot=2017-02-23T22:21:14.8121864Z?snapshot=' + assert not blobxfer.util.blob_is_snapshot(a) + + +def test_parse_blob_snapshot_parameter(): + base = '/cont/a' + param = '2017-02-23T22:21:14.8121864Z' + a = base + '?snapshot=' + param + assert blobxfer.util.parse_blob_snapshot_parameter(a) == (base, param) + + a = '/cont/a?snapshot=' + assert blobxfer.util.parse_blob_snapshot_parameter(a) is None diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..d05615f --- /dev/null +++ b/tox.ini @@ -0,0 +1,18 @@ +[tox] +envlist = py27, py35 + +[testenv] +deps = -rtest_requirements.txt +commands = + flake8 {envsitepackagesdir}/blobxfer_cli/ + flake8 {envsitepackagesdir}/blobxfer/ + py.test \ + -x -l -s \ + --ignore venv/ \ + --cov-config .coveragerc \ + --cov-report term-missing \ + --cov {envsitepackagesdir}/blobxfer + +[flake8] +max-line-length = 79 +select = F,E,W
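
Note on the retry behavior exercised by the new tests/test_blobxfer_retry.py above: test_exponentialretrywithmaxwait asserts the successive wait values 0.1, 0.2, 0.4, 0.8 and then 0.1 again, which is consistent with a backoff that doubles on each retry and resets to the initial value once the next wait would exceed the configured maximum. The snippet below is only a minimal illustrative sketch of that arithmetic, not blobxfer's implementation; the function name and default values are assumptions chosen to mirror what the test asserts.

    def backoff_sequence(initial=0.1, maximum=1.0, retries=5):
        """Illustrative (hypothetical) helper reproducing the
        doubling-then-reset wait times asserted in
        test_exponentialretrywithmaxwait."""
        waits = []
        current = initial
        for _ in range(retries):
            waits.append(current)
            current *= 2
            if current > maximum:
                # reset to the initial backoff once the next wait
                # would exceed the maximum
                current = initial
        return waits

    # matches the sequence asserted in the test above
    assert backoff_sequence() == [0.1, 0.2, 0.4, 0.8, 0.1]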