From 5ed1e2eb726cc7d462a6324e3246aa3d9f8b4892 Mon Sep 17 00:00:00 2001 From: Chris Kuehl Date: Sat, 14 Apr 2018 22:39:30 -0700 Subject: [PATCH] Add example instructions on using Lambda --- README.md | 5 +- lambda/.gitignore | 2 + lambda/Makefile | 13 +++ lambda/README.md | 206 +++++++++++++++++++++++++++++++++++++++++++++ lambda/config.json | 8 ++ lambda/handler.py | 71 ++++++++++++++++ vendor/venv-update | 20 +++-- 7 files changed, 316 insertions(+), 9 deletions(-) create mode 100644 lambda/.gitignore create mode 100644 lambda/Makefile create mode 100644 lambda/README.md create mode 100644 lambda/config.json create mode 100644 lambda/handler.py diff --git a/README.md b/README.md index 1e18bd23e9d..09ec248f48b 100644 --- a/README.md +++ b/README.md @@ -58,12 +58,14 @@ To use dumb-pypi, you need two things: My recommended high-availability (but still quite simple) deployment is: -* Store all of the packages in S3. +* Store all of the packages in an S3 bucket. * Have a cronjob (or equivalent) which rebuilds the index based on the packages in S3. This is incredibly fast—it would not be unreasonable to do it every sixty seconds. After building the index, sync it into a separate S3 bucket. + (You can also use AWS Lambda for this step; [instructions here!][lambda]) + * Have a webserver (or set of webservers behind a load balancer) running nginx (with the config provided below), with the source being that second S3 bucket. @@ -172,6 +174,7 @@ To run the tests, call `make test`. To run an individual test, you can do `py.test -k name_of_test tests` (with the virtualenv activated). 
+[lambda]: https://github.com/chriskuehl/dumb-pypi/blob/master/lambda/README.md [rationale]: https://github.com/chriskuehl/dumb-pypi/blob/master/RATIONALE.md [pep503]: https://www.python.org/dev/peps/pep-0503/#normalized-names [s3-metadata]: https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html#UserMetadata diff --git a/lambda/.gitignore b/lambda/.gitignore new file mode 100644 index 00000000000..6f777825321 --- /dev/null +++ b/lambda/.gitignore @@ -0,0 +1,2 @@ +/bundle +/bundle.zip diff --git a/lambda/Makefile b/lambda/Makefile new file mode 100644 index 00000000000..ae2d4adefe7 --- /dev/null +++ b/lambda/Makefile @@ -0,0 +1,13 @@ +bundle: config.json handler.py ../setup.py + rm -rf $@ + mkdir $@ + cp handler.py config.json $@/ + pip install .. -t $@/ + +bundle.zip: bundle + rm -f $@ + cd bundle && zip -r ../bundle.zip . + +.PHONY: clean +clean: + rm -rf bundle bundle.zip diff --git a/lambda/README.md b/lambda/README.md new file mode 100644 index 00000000000..5929110eba0 --- /dev/null +++ b/lambda/README.md @@ -0,0 +1,206 @@ +# Integrating dumb-pypi with AWS Lambda + +[AWS Lambda][lambda] is a way to run code ("functions") in response to triggers +(like a change in an S3 bucket) without running any servers yourself. + +dumb-pypi works very well with Lambda; you only need to regenerate the index +when your list of packages changes (relatively rare), and you can serve the +generated index without involving dumb-pypi at all. + +The steps below walk you through an example AWS Lambda setup where a change in +a "source" bucket (containing all your packages) automatically triggers +dumb-pypi to regenerate the index and store it in the "output" bucket. + +Depending on if you need to support old pip versions, you may even be able to +serve your index directly from S3, avoiding running any servers entirely. + + +## Initial deployment + +These instructions use the sample code in this directory as the base for the +Lambda handler. 
The specifics of your setup will likely vary
For example: + + ```bash + aws lambda add-permission \ + --region us-west-1 \ + --function-name dumb-pypi \ + --statement-id AllowSourceBucketToTriggerDumbPyPI \ + --action lambda:InvokeFunction \ + --principal s3.amazonaws.com \ + --source-arn arn:aws:s3:::dumb-pypi-source \ + --source-account XXXXXXXXXXXX + ``` + +7. Set up a trigger so that changes to the source bucket cause the `dumb-pypi` + function to run and regenerate the index. + + The AWS cli is very awkward, the easiest way to do this is to make a file + like `policy.json` with contents like: + + ```json + { + "LambdaFunctionConfigurations": [ + { + "Id": "NotifyDumbPyPI", + "LambdaFunctionArn": "arn:aws:lambda:us-west-1:XXXXXXXXXXXX:function:dumb-pypi", + "Events": ["s3:ObjectCreated:*", "s3:ObjectRemoved:*"] + } + ] + } + ``` + + (Again, replacing the function's ARN as appropriate for your account.) + + Then, using the AWS cli, add a "notification configuration" to the source + bucket: + + ```bash + aws s3api put-bucket-notification-configuration \ + --bucket dumb-pypi-source \ + --notification-configuration "$(< policy.json)" + ``` + + +## Serving from the S3 buckets directly + +The whole point of Lambda is to avoid running your own servers, so you might as +well serve directly from S3 :) + +Keep in mind that if you need to support old pip versions, you [can't yet serve +directly from S3][rationale] because these old versions rely on the PyPI server +to do package name normalization; see [the README][README] for suggestions on +how to use nginx to do this normalization. + +If you **do** want to serve from S3 directly, it's pretty easy: + +1. Enable read access to your source bucket. You can enable this to the public, + whitelisted only to your company's IPs, etc. 
+ + Here's an example policy which whitelists your bucket to everyone: + + ```json + { + "Version": "2008-10-17", + "Id": "AllowReadOnlyAccess", + "Statement": [ + { + "Sid": "AllowReadOnlyAccess", + "Effect": "Allow", + "Principal": { + "AWS": "*" + }, + "Action": "s3:GetObject", + "Resource": "arn:aws:s3:::dumb-pypi-source/*" + } + ] + } + ``` + + This will make your source bucket available at a URL like + `https://dumb-pypi-source.s3.amazonaws.com`. + +2. Enable read access to your output bucket. Again, it's up to you who you + allow; you can use the same example policy from above (just adjust the + bucket name). + +3. Enable static website hosting for your output bucket, and set `index.html` + as your "Index document". This appears to be the only way to get + `index.html` to show up when accessing the root of a "directory" in S3. + + This will make your output bucket available at a URL like + `http://dumb-pypi-output.s3-website-us-west-1.amazonaws.com/`. + + +## Updating the code or config + +Any time you update the code or config, you need to re-deploy the bundle to +Lambda. + +1. Run `make deploy.zip` to build a new deployment bundle. + +2. Use the AWS cli to update the code for the function: + + ```bash. 
+ aws lambda update-function-code \ + --function-name dumb-pypi \ + --zip-file fileb://bundle.zip + ``` + +[lambda]: https://aws.amazon.com/lambda/ +[rationale]: https://github.com/chriskuehl/dumb-pypi/blob/master/RATIONALE.md +[s3-allow-trigger]: https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html#grant-destinations-permissions-to-s3 +[README]: https://github.com/chriskuehl/dumb-pypi/blob/master/README.md diff --git a/lambda/config.json b/lambda/config.json new file mode 100644 index 00000000000..7118eb18ef6 --- /dev/null +++ b/lambda/config.json @@ -0,0 +1,8 @@ +{ + "source-bucket": "dumb-pypi-source", + "output-bucket": "dumb-pypi-output", + "args": [ + "--packages-url", "https://dumb-pypi-source.s3.amazonaws.com/", + "--title", "My Cool PyPI on S3" + ] +} diff --git a/lambda/handler.py b/lambda/handler.py new file mode 100644 index 00000000000..21fc5c0945c --- /dev/null +++ b/lambda/handler.py @@ -0,0 +1,71 @@ +import json +import mimetypes +import os +import os.path +import tempfile +import time + +import boto3 + +import dumb_pypi.main + + +def _load_config(): + with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f: + return json.load(f) + + +def _list_bucket(bucket): + s3 = boto3.client('s3') + paginator = s3.get_paginator('list_objects_v2') + for page in paginator.paginate(Bucket=bucket): + yield from ( + json.dumps( + { + 'filename': package['Key'], + 'upload_timestamp': time.mktime(package['LastModified'].timetuple()), + }, + sort_keys=True, + ) + for package in page.get('Contents', ()) + ) + + +def _sync_bucket(localdir, bucket_name): + # TODO: should also delete removed files + s3 = boto3.resource('s3') + bucket = s3.Bucket(bucket_name) + for dirpath, _, filenames in os.walk(localdir): + for filename in filenames: + path_on_disk = os.path.join(dirpath, filename) + key = os.path.relpath(path_on_disk, localdir) + print(f'Uploading {path_on_disk} => s3://{bucket_name}/{key}') + with open(path_on_disk, 'rb') as f: + 
bucket.put_object( + Key=key, + Body=f, + ContentType=mimetypes.guess_type(filename)[0] + ) + + +def main(event, context): + config = _load_config() + + with tempfile.TemporaryDirectory() as td: + with tempfile.NamedTemporaryFile(mode='w') as tf: + for line in _list_bucket(config['source-bucket']): + tf.write(line + '\n') + tf.flush() + + dumb_pypi.main.main(( + '--package-list-json', tf.name, + '--output-dir', td, + *config['args'], + )) + + _sync_bucket(td, config['output-bucket']) + + +# Strictly for testing; we don't look at the event or context anyway. +if __name__ == '__main__': + exit(main(None, None)) diff --git a/vendor/venv-update b/vendor/venv-update index 0d6108dcfc4..b4a974ccbbd 100755 --- a/vendor/venv-update +++ b/vendor/venv-update @@ -49,7 +49,7 @@ See https://pip.readthedocs.org/en/stable/user_guide/#environment-variables For example: PIP_INDEX_URL=https://pypi.example.com/simple venv-update -Please send issues to: https://github.com/yelp/pip-faster +Please send issues to: https://github.com/yelp/venv-update ''' from __future__ import absolute_import from __future__ import print_function @@ -59,7 +59,7 @@ from os.path import exists from os.path import join from subprocess import CalledProcessError -__version__ = '2.0.0' +__version__ = '3.0.0' DEFAULT_VIRTUALENV_PATH = 'venv' DEFAULT_OPTION_VALUES = { 'venv=': (DEFAULT_VIRTUALENV_PATH,), @@ -67,8 +67,8 @@ DEFAULT_OPTION_VALUES = { 'pip-command=': ('pip-faster', 'install', '--upgrade', '--prune'), 'bootstrap-deps=': ('venv-update==' + __version__,), } -__doc__ = __doc__.format( # pylint:disable=redefined-builtin - **dict((key, ' '.join(val)) for key, val in DEFAULT_OPTION_VALUES.items()) +__doc__ = __doc__.format( + **{key: ' '.join(val) for key, val in DEFAULT_OPTION_VALUES.items()} ) # This script must not rely on anything other than @@ -89,10 +89,10 @@ def parseargs(argv): else: options[key] += (arg,) - if set(args) & set(('-h', '--help')): + if set(args) & {'-h', '--help'}: print(__doc__, 
end='') exit(0) - elif set(args) & set(('-V', '--version')): + elif set(args) & {'-V', '--version'}: print(__version__) exit(0) elif args: @@ -169,7 +169,7 @@ def exec_(argv): # never returns # in python3, sys.exitfunc has gone away, and atexit._run_exitfuncs seems to be the only pubic-ish interface # https://hg.python.org/cpython/file/3.4/Modules/atexitmodule.c#l289 import atexit - atexit._run_exitfuncs() # pylint:disable=protected-access + atexit._run_exitfuncs() from os import execv execv(argv[0], argv) @@ -307,6 +307,10 @@ def ensure_virtualenv(args, return_values): argv[:] = ('virtualenv',) + args info(colorize(argv)) raise_on_failure(virtualenv.main) + # There might not be a venv_path if doing something like "venv= --version" + # and not actually asking virtualenv to make a venv. + if return_values.venv_path is not None: + run(('rm', '-rf', join(return_values.venv_path, 'local'))) def wait_for_all_subprocesses(): @@ -398,7 +402,7 @@ def venv_update( def execfile_(filename): with open(filename) as code: code = compile(code.read(), filename, 'exec') - exec(code, {'__file__': filename}) # pylint:disable=exec-used + exec(code, {'__file__': filename}) def pip_faster(venv_path, pip_command, install, bootstrap_deps):