From e1d97fa3cb813559cff15661aa6d13d12dc30175 Mon Sep 17 00:00:00 2001
From: Fred Park
Date: Fri, 2 Jun 2017 08:16:46 -0700
Subject: [PATCH] Tag for 1.0.0a3 release

- Rename some options
- Make thread join more robust on Python2
---
 CHANGELOG.md                          |  6 ++--
 blobxfer/models/options.py            |  6 +++-
 blobxfer/operations/download.py       |  8 ++---
 blobxfer/operations/upload.py         |  4 +--
 blobxfer/util.py                      | 14 +++++++++
 blobxfer/version.py                   |  2 +-
 cli/cli.py                            | 42 ++++++++++++++------------
 cli/settings.py                       | 12 ++++----
 docs/01-installation.md               |  7 +++--
 docs/10-cli-usage.md                  | 43 +++++++++++++++------------
 docs/30-vectored-io.md                |  4 +--
 docs/98-performance-considerations.md | 42 ++++++++++++++++++++++++--
 12 files changed, 129 insertions(+), 61 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba7b442..fd71b89 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ## [Unreleased]
 
-## [1.0.0a2] - 2017-06-02
+## [1.0.0a3] - 2017-06-02
 ### Changed
 - From scratch rewrite providing a consistent CLI experience and a vast
   array of new and advanced features. Please see the
@@ -201,8 +201,8 @@ usage documentation carefully when upgrading from 0.12.1.
   `--no-skiponmatch`.
 - 0.8.2: performance regression fixes
 
-[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a2...HEAD
-[1.0.0a2]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a2
+[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a3...HEAD
+[1.0.0a3]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a3
 [0.12.1]: https://github.com/Azure/blobxfer/compare/0.12.0...0.12.1
 [0.12.0]: https://github.com/Azure/blobxfer/compare/0.11.5...0.12.0
 [0.11.5]: https://github.com/Azure/blobxfer/compare/0.11.4...0.11.5

diff --git a/blobxfer/models/options.py b/blobxfer/models/options.py
index c516d01..2a17c1a 100644
--- a/blobxfer/models/options.py
+++ b/blobxfer/models/options.py
@@ -105,13 +105,14 @@ class Concurrency(object):
     """Concurrency Options"""
     def __init__(
             self, crypto_processes, md5_processes, disk_threads,
-            transfer_threads):
+            transfer_threads, is_download=None):
         """Ctor for Concurrency Options
         :param Concurrency self: this
         :param int crypto_processes: number of crypto procs
         :param int md5_processes: number of md5 procs
         :param int disk_threads: number of disk threads
         :param int transfer_threads: number of transfer threads
+        :param bool is_download: download hint
         """
         self.crypto_processes = crypto_processes
         self.md5_processes = md5_processes
@@ -131,6 +132,9 @@ def __init__(
         # cap maximum number of disk threads from cpu count to 64
         if self.disk_threads > 64:
             self.disk_threads = 64
+        # for downloads, cap disk threads to lower value
+        if is_download and self.disk_threads > 16:
+            self.disk_threads = 16
         auto_disk = True
         if self.transfer_threads is None or self.transfer_threads < 1:
             if auto_disk:

diff --git a/blobxfer/operations/download.py b/blobxfer/operations/download.py
index 3cbef61..9d51d32 100644
--- a/blobxfer/operations/download.py
+++ b/blobxfer/operations/download.py
@@ -430,7 +430,7 @@ def _wait_for_disk_threads(self, terminate):
         if terminate:
             self._download_terminate = terminate
         for thr in self._disk_threads:
-            thr.join()
+            blobxfer.util.join_thread(thr)
 
     def _wait_for_transfer_threads(self, terminate):
         # type: (Downloader, bool) -> None
@@ -441,7 +441,7 @@ def _wait_for_transfer_threads(self, terminate):
         if terminate:
             self._download_terminate = terminate
         for thr in self._transfer_threads:
-            thr.join()
+            blobxfer.util.join_thread(thr)
 
     def _worker_thread_transfer(self):
         # type: (Downloader) -> None
@@ -452,7 +452,7 @@ def _worker_thread_transfer(self):
         while not self.termination_check:
             try:
                 if len(self._disk_set) > max_set_len:
-                    time.sleep(0.2)
+                    time.sleep(0.1)
                     continue
                 else:
                     dd = self._transfer_queue.get(block=False, timeout=0.1)
@@ -792,8 +792,8 @@ def start(self):
                 'KeyboardInterrupt detected, force terminating '
                 'processes and threads (this may take a while)...')
             try:
-                self._wait_for_transfer_threads(terminate=True)
                 self._wait_for_disk_threads(terminate=True)
+                self._wait_for_transfer_threads(terminate=True)
             finally:
                 self._cleanup_temporary_files()
             raise

diff --git a/blobxfer/operations/upload.py b/blobxfer/operations/upload.py
index 232e8ba..9db2863 100644
--- a/blobxfer/operations/upload.py
+++ b/blobxfer/operations/upload.py
@@ -447,10 +447,10 @@ def _worker_thread_upload(self):
         while not self.termination_check:
             try:
                 if len(self._transfer_set) > max_set_len:
-                    time.sleep(0.2)
+                    time.sleep(0.1)
                     continue
                 else:
-                    ud = self._upload_queue.get(False, 0.1)
+                    ud = self._upload_queue.get(block=False, timeout=0.1)
             except queue.Empty:
                 continue
             try:

diff --git a/blobxfer/util.py b/blobxfer/util.py
index 166b98f..a17b8a5 100644
--- a/blobxfer/util.py
+++ b/blobxfer/util.py
@@ -124,6 +124,20 @@ def is_not_empty(obj):
     return obj is not None and len(obj) > 0
 
 
+def join_thread(thr):
+    # type: (threading.Thread) -> None
+    """Join a thread
+    :param threading.Thread thr: thread to join
+    """
+    if on_python2():
+        while True:
+            thr.join(timeout=1)
+            if not thr.isAlive():
+                break
+    else:
+        thr.join()
+
+
 def merge_dict(dict1, dict2):
     # type: (dict, dict) -> dict
     """Recursively merge dictionaries: dict2 on to dict1. This differs

diff --git a/blobxfer/version.py b/blobxfer/version.py
index 6280e0b..0f2a584 100644
--- a/blobxfer/version.py
+++ b/blobxfer/version.py
@@ -22,4 +22,4 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-__version__ = '1.0.0a2'
+__version__ = '1.0.0a3'

diff --git a/cli/cli.py b/cli/cli.py
index 4b1e211..4c12bbd 100644
--- a/cli/cli.py
+++ b/cli/cli.py
@@ -62,13 +62,15 @@ def __init__(self):
         self.credentials = None
         self.general_options = None
 
-    def initialize(self):
-        # type: (CliContext) -> None
+    def initialize(self, action):
+        # type: (CliContext, settings.TransferAction) -> None
         """Initialize context
         :param CliContext self: this
+        :param settings.TransferAction action: transfer action
         """
         self._init_config()
-        self.general_options = settings.create_general_options(self.config)
+        self.general_options = settings.create_general_options(
+            self.config, action)
         self.credentials = settings.create_azure_storage_credentials(
             self.config, self.general_options)
@@ -164,7 +166,8 @@ def callback(ctx, param, value):
         '--log-file',
         expose_value=False,
         default=None,
-        help='Log to file specified',
+        help='Log to file specified; this must be specified for progress '
+        'bar to show',
         callback=callback)(f)
@@ -191,7 +194,8 @@ def callback(ctx, param, value):
         '--progress-bar/--no-progress-bar',
         expose_value=False,
         default=True,
-        help='Display progress bar instead of console logs [True]',
+        help='Display progress bar instead of console logs; log file must '
+        'be specified [True]',
         callback=callback)(f)
@@ -254,22 +258,22 @@ def callback(ctx, param, value):
         clictx.cli_options['local_resource'] = value
         return value
     return click.option(
-        '--local-resource',
+        '--local-path',
         expose_value=False,
-        help='Local resource; use - for stdin',
+        help='Local path; use - for stdin',
         callback=callback)(f)
 
 
-def _storage_account_name_option(f):
+def _storage_account_option(f):
     def callback(ctx, param, value):
         clictx = ctx.ensure_object(CliContext)
         clictx.cli_options['storage_account'] = value
         return value
     return click.option(
-        '--storage-account-name',
+        '--storage-account',
         expose_value=False,
         help='Storage account name',
-        envvar='BLOBXFER_STORAGE_ACCOUNT_NAME',
+        envvar='BLOBXFER_STORAGE_ACCOUNT',
         callback=callback)(f)
@@ -301,7 +305,7 @@ def common_options(f):
 
 def upload_download_options(f):
     f = _remote_path_option(f)
-    f = _storage_account_name_option(f)
+    f = _storage_account_option(f)
     f = _local_resource_option(f)
     return f
@@ -633,16 +637,16 @@ def callback(ctx, param, value):
         callback=callback)(f)
 
 
-def _sync_copy_dest_storage_account_name_option(f):
+def _sync_copy_dest_storage_account_option(f):
     def callback(ctx, param, value):
         clictx = ctx.ensure_object(CliContext)
         clictx.cli_options['sync_copy_dest_storage_account'] = value
         return value
     return click.option(
-        '--sync-copy-dest-storage-account-name',
+        '--sync-copy-dest-storage-account',
         expose_value=False,
         help='Storage account name for synccopy destination',
-        envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_NAME',
+        envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT',
         callback=callback)(f)
@@ -721,11 +725,11 @@ def download_options(f):
 
 
 def sync_copy_options(f):
-    f = _sync_copy_dest_storage_account_name_option(f)
+    f = _sync_copy_dest_storage_account_option(f)
     f = _sync_copy_dest_sas_option(f)
     f = _sync_copy_dest_remote_path_option(f)
     f = _sync_copy_dest_access_key_option(f)
-    f = _storage_account_name_option(f)
+    f = _storage_account_option(f)
     f = _skip_on_md5_match_option(f)
     f = _skip_on_lmt_ge_option(f)
     f = _skip_on_filesize_match_option(f)
@@ -757,7 +761,7 @@ def cli(ctx):
 def download(ctx):
     """Download blobs or files from Azure Storage"""
     settings.add_cli_options(ctx.cli_options, settings.TransferAction.Download)
-    ctx.initialize()
+    ctx.initialize(settings.TransferAction.Download)
     specs = settings.create_download_specifications(ctx.config)
     for spec in specs:
         blobxfer.api.Downloader(
@@ -773,7 +777,7 @@ def synccopy(ctx):
     """Synchronously copy blobs between Azure Storage accounts"""
     raise NotImplementedError()
     settings.add_cli_options(ctx.cli_options, settings.TransferAction.Synccopy)
-    ctx.initialize()
+    ctx.initialize(settings.TransferAction.Synccopy)
 
 
 @cli.command('upload')
@@ -784,7 +788,7 @@ def synccopy(ctx):
 def upload(ctx):
     """Upload files to Azure Storage"""
     settings.add_cli_options(ctx.cli_options, settings.TransferAction.Upload)
-    ctx.initialize()
+    ctx.initialize(settings.TransferAction.Upload)
     specs = settings.create_upload_specifications(ctx.config)
     for spec in specs:
         blobxfer.api.Uploader(

diff --git a/cli/settings.py b/cli/settings.py
index 1507c82..24d1a7f 100644
--- a/cli/settings.py
+++ b/cli/settings.py
@@ -61,13 +61,13 @@ def add_cli_options(cli_options, action):
         if blobxfer.util.is_none_or_empty(local_resource):
             raise KeyError()
     except KeyError:
-        raise ValueError('--local-resource must be specified')
+        raise ValueError('--local-path must be specified')
     try:
         storage_account = cli_options['storage_account']
         if blobxfer.util.is_none_or_empty(storage_account):
             raise KeyError()
     except KeyError:
-        raise ValueError('--storage-account-name must be specified')
+        raise ValueError('--storage-account must be specified')
     try:
         remote_path = cli_options['remote_path']
         if blobxfer.util.is_none_or_empty(remote_path):
@@ -167,7 +167,7 @@
             raise KeyError()
     except KeyError:
         raise ValueError(
-            '--sync-copy-dest-storage-account-name must be specified')
+            '--sync-copy-dest-storage-account must be specified')
     try:
         sync_copy_dest_remote_path = \
             cli_options['sync_copy_dest_remote_path']
@@ -278,10 +278,11 @@ def create_azure_storage_credentials(config, general_options):
     return creds
 
 
-def create_general_options(config):
-    # type: (dict) -> blobxfer.models.options.General
+def create_general_options(config, action):
+    # type: (dict, TransferAction) -> blobxfer.models.options.General
     """Create a General Options object from configuration
     :param dict config: config dict
+    :param TransferAction action: transfer action
     :rtype: blobxfer.models.options.General
     :return: general options object
     """
@@ -292,6 +293,7 @@ def create_general_options(config):
             disk_threads=conc.get('disk_threads', 0),
             md5_processes=conc.get('md5_processes', 0),
             transfer_threads=conc.get('transfer_threads', 0),
+            is_download=action == TransferAction.Download,
         ),
         log_file=config['options'].get('log_file', None),
         progress_bar=config['options'].get('progress_bar', True),

diff --git a/docs/01-installation.md b/docs/01-installation.md
index 2609f07..9a3fd74 100644
--- a/docs/01-installation.md
+++ b/docs/01-installation.md
@@ -72,9 +72,10 @@ docker pull alfpark/blobxfer
 
 ## Troubleshooting
 #### `azure.storage` dependency not found
-If you get an error that `azure.storage` cannot be found or loaded, then
-most likely there was a conflict with this package with other `azure` packages
-that share the same base namespace. You can correct this by issuing:
+If you get an error such as `ImportError: No module named storage` or that
+`azure.storage` cannot be found or loaded, then most likely there was a
+conflict between this package and other `azure` packages that share the
+same base namespace. You can correct this by issuing:
 ```shell
 # for Python3
 pip3 install --upgrade --force-reinstall azure-storage

diff --git a/docs/10-cli-usage.md b/docs/10-cli-usage.md
index 8931464..2f3aad4 100644
--- a/docs/10-cli-usage.md
+++ b/docs/10-cli-usage.md
@@ -12,9 +12,9 @@ command will be detailed along with all options available.
 ### `download`
 Downloads a remote Azure path, which may contain many resources, to the
 local machine. This command requires, at the minimum, the following options:
-* `--storage-account-name`
+* `--storage-account`
 * `--remote-path`
-* `--local-resource`
+* `--local-path`
 
 Additionally, an authentication option for the storage account is required.
 Please see the Authentication sub-section below under Options.
@@ -23,14 +23,14 @@ Please see the Authentication sub-section below under Options.
 ### `upload`
 Uploads a local path to a remote Azure path. The local path may contain
 many resources on the local machine. This command requires, at the minimum,
 the following options:
-* `--local-resource`
-* `--storage-account-name`
+* `--local-path`
+* `--storage-account`
 * `--remote-path`
 
 Additionally, an authentication option for the storage account is required.
 Please see the Authentication sub-section below under Options.
 
-If piping from `stdin`, `--local-resource` should be set to `-` as per
+If piping from `stdin`, `--local-path` should be set to `-` as per
 convention.
 
 ### `synccopy`
@@ -49,9 +49,10 @@ of up to 100MiB; all others have a maximum of 4MiB.
 attributes (mode and ownership) should be stored or restored. Note that to
 restore uid/gid, `blobxfer` must be run as root or under sudo.
 * `--file-md5` or `--no-file-md5` controls if the file MD5 should be computed.
-* `--local-resource` is the local resource path. Set to `-` if piping from
+* `--local-path` is the local resource path. Set to `-` if piping from
 `stdin`.
-* `--log-file` specifies the log file to write to.
+* `--log-file` specifies the log file to write to. This must be specified
+for a progress bar to be output to the console.
 * `--mode` is the operating mode. The default is `auto` but may be set to
 `append`, `block`, `file`, or `page`. If specified with the `upload` command,
 then all files will be uploaded as the specified `mode` type.
@@ -61,12 +62,16 @@ with Azure File shares.
 * `--overwrite` or `--no-overwrite` controls clobber semantics at the
 destination.
 * `--progress-bar` or `--no-progress-bar` controls if a progress bar is
-output to the console.
+output to the console. `--log-file` must be specified for a progress bar
+to be output.
 * `--recursive` or `--no-recursive` controls if the source path should be
 recursively uploaded or downloaded.
 * `--remote-path` is the remote Azure path. This path must contain the
 Blob container or File share at the beginning, e.g., `mycontainer/vdir`.
 * `--resume-file` specifies the resume file to write to.
+* `--storage-account` specifies the storage account to use. This can
+optionally be provided through the environment variable
+`BLOBXFER_STORAGE_ACCOUNT` instead.
 * `--timeout` is the integral timeout value in seconds to use.
 * `-h` or `--help` can be passed at every command level to receive
 context-sensitive help.
@@ -96,7 +101,7 @@ to/from Azure Storage.
 ### Connection
 * `--endpoint` is the Azure Storage endpoint to connect to; the default is
 Azure Public regions, or `core.windows.net`.
-* `--storage-account-name` is the storage account to connect to.
+* `--storage-account` is the storage account to connect to.
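+
+For example, the storage account may be supplied via the
+`BLOBXFER_STORAGE_ACCOUNT` environment variable rather than the
+`--storage-account` option. A hypothetical invocation (the account name
+and SAS token below are placeholders):
+```shell
+# equivalent to passing: --storage-account mystorageaccount
+export BLOBXFER_STORAGE_ACCOUNT=mystorageaccount
+blobxfer download --sas "mysastoken" --remote-path mycontainer --local-path .
+```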
 
 ### Encryption
 * `--rsa-private-key` is the RSA private key in PEM format to use. This can
@@ -161,27 +166,27 @@ file path. The default is `1`.
 ### `download` Examples
 #### Download an Entire Encrypted Blob Container to Current Working Directory
 ```shell
-blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-public-key ~/mypubkey.pem
+blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-public-key ~/mypubkey.pem
 ```
 
 #### Download an Entire File Share to Designated Path and Skip On Filesize Matches
 ```shell
-blobxfer download --mode file --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-resource /my/path --skip-on-filesize-match
+blobxfer download --mode file --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-path /my/path --skip-on-filesize-match
 ```
 
 #### Download only Page Blobs in Blob Container Virtual Directory Non-recursively and Cleanup Local Path to Match Remote Path
 ```shell
-blobxfer download --mode page --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource /my/pageblobs --no-recursive --delete
+blobxfer download --mode page --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path /my/pageblobs --no-recursive --delete
 ```
 
 #### Resume Incomplete Downloads Matching an Include Pattern and Log to File and Restore POSIX File Attributes
 ```shell
-blobxfer download --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes
+blobxfer download --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes
 ```
 
 #### Download a Blob Snapshot
 ```shell
-blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-resource .
+blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-path .
 ```
 
 #### Download using a YAML Configuration File
@@ -192,27 +197,27 @@ blobxfer download --config myconfig.yaml
 ```
 
 ### `upload` Examples
 #### Upload Current Working Directory as Encrypted Block Blobs Non-recursively
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-private-key ~/myprivatekey.pem --no-recursive
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-private-key ~/myprivatekey.pem --no-recursive
 ```
 
 #### Upload Specific Path Recursively to a File Share, Store File MD5 and POSIX File Attributes to a File Share and Exclude Some Files
 ```shell
-blobxfer upload --mode file --storage-account-name mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-resource . --file-md5 --file-attributes --exclude '*.bak'
+blobxfer upload --mode file --storage-account mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-path . --file-md5 --file-attributes --exclude '*.bak'
 ```
 
 #### Upload Single File with Resume and Striped Vectored IO into 512MiB Chunks
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
 ```
 
 #### Upload Specific Path but Skip On Any MD5 Matches, Store File MD5 and Cleanup Remote Path to Match Local Path
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /my/path --file-md5 --skip-on-md5-match --delete
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /my/path --file-md5 --skip-on-md5-match --delete
 ```
 
 #### Upload From Piped `stdin`
 ```shell
-curl -fSsL https://some.uri | blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource -
+curl -fSsL https://some.uri | blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path -
 ```
 
 #### Upload using a YAML Configuration File

diff --git a/docs/30-vectored-io.md b/docs/30-vectored-io.md
index a007b7d..1d17c40 100644
--- a/docs/30-vectored-io.md
+++ b/docs/30-vectored-io.md
@@ -91,5 +91,5 @@ keep this metadata intact or reconstruction will fail.
 +---------------------+
 ```
 
-In order to take advantage of `stripe` Vectored IO, you must use a YAML
-configuration file to define multiple destinations.
+In order to take advantage of `stripe` Vectored IO across multiple
+destinations, you must use a YAML configuration file.

diff --git a/docs/98-performance-considerations.md b/docs/98-performance-considerations.md
index 89ff0c6..8a511fc 100644
--- a/docs/98-performance-considerations.md
+++ b/docs/98-performance-considerations.md
@@ -3,8 +3,8 @@
 Please read the following carefully regarding considerations that should
 be applied with regard to performance and `blobxfer`. Additionally, please
 review the [Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/)
-for an overview of general performance targets that apply to Azure Blobs
-and File shares.
+for an overview of general performance targets that apply to Azure Blobs,
+File shares, and Storage Account types (GRS, LRS, ZRS, etc.).
 
 ## Concurrency
 * `blobxfer` offers four concurrency knobs. Each one should be tuned for
@@ -23,6 +23,44 @@ maximum performance according to your system and network characteristics.
 * The thread concurrency options (disk and transfer) can be set to a
 non-positive number to be automatically set as a multiple of the number
 of cores available on the machine.
+* For uploads, there should be a sufficient number of disk threads to ensure
+that all transfer threads have work to do. For downloads, there should be a
+sufficient number of disk threads to write data to disk so that transfer
+threads are not artificially blocked.
+
+## Chunk Sizing
+Chunk sizing refers to the `chunk_size_bytes` option, the meaning of which
+varies depending upon the context of uploading or downloading.
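+
+As a rough illustration of how the chunk size bounds the maximum size of a
+single block blob (using the 50,000-block limit detailed in the Uploads
+subsection below; the numbers here are back-of-the-envelope only):
+```shell
+# maximum blob size = maximum block count * chunk size
+echo "$((50000 * 4 / 1024)) GiB at 4 MiB chunks"      # 195 GiB
+echo "$((50000 * 100 / 1024)) GiB at 100 MiB chunks"  # 4882 GiB (~4.75 TiB)
+```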
+
+### Uploads
+For uploads, the chunk size corresponds to the maximum amount of data to
+transfer with a single request. The Azure Storage service imposes maximums
+depending upon the type of entity that is being written. For block blobs,
+the maximum is 100MiB (although you may "one-shot" up to 256MiB). For page
+blobs, append blobs, and Azure Files, the maximum is 4MiB.
+
+For block blobs, setting the chunk size to something greater than 4MiB will
+not only allow larger file sizes (recall that the maximum number of blocks
+for a block blob is 50,000; thus, with 100MiB blocks, you can create a block
+blob of approximately 4.75TiB) but will also amortize each request/response
+overhead over larger portions of data transfer. `blobxfer` can automatically
+select the proper block size given your file, but will not automatically
+tune the chunk size as that depends upon your system and network
+characteristics.
+
+### Downloads
+For downloads, the chunk size corresponds to the maximum amount of data to
+request from the server in a single request. It is important to keep a
+balance between the chunk size and the number of in-flight operations
+afforded by the `transfer_threads` concurrency control. `blobxfer` does not
+automatically tune this (but can automatically set it to a value that should
+work for most situations) due to varying system and network conditions.
+
+Additionally, disk write performance is typically lower than disk read
+performance, so ensure that `disk_threads` is not set too high in order to
+prevent disk thrashing and highly random write patterns.
 
 ## Azure File Share Performance
 File share performance can be "slow" or become a bottleneck, especially for