diff --git a/infrastructure/flash_lambda/main.tf b/infrastructure/flash_lambda/main.tf
index be37732..cefd9c0 100644
--- a/infrastructure/flash_lambda/main.tf
+++ b/infrastructure/flash_lambda/main.tf
@@ -24,4 +24,22 @@ resource "aws_lambda_layer_version" "vysnc_lambda_layer" {
     "python3.10",
   ]
   source_code_hash = data.archive_file.lambda_poetry_dependencies.output_base64sha256
+}
+
+data "archive_file" "flash_rank_zip" {
+  type        = "zip"
+  source_dir  = "${path.module}/${var.lambda_deploy}/src"
+  output_path = "${path.module}/${var.lambda_deploy}.zip"
+}
+
+# Create the Lambda function
+resource "aws_lambda_function" "flash_rank_lambda" {
+  filename         = data.archive_file.flash_rank_zip.output_path
+  function_name    = "flash_rank_lambda"
+  role             = aws_iam_role.vysnc_lambda_role.arn
+  handler          = "flash_rank.rank_query_lambda_handler"
+  runtime          = "python3.10"
+  layers           = [aws_lambda_layer_version.vysnc_lambda_layer.arn]
+  depends_on       = [aws_iam_role_policy_attachment.vsync_attach_iam_policy_to_iam_role]
+  source_code_hash = data.archive_file.flash_rank_zip.output_base64sha256
 }
\ No newline at end of file
diff --git a/infrastructure/flash_lambda/role_policy.tf b/infrastructure/flash_lambda/role_policy.tf
index 13ec58e..a0d9aba 100644
--- a/infrastructure/flash_lambda/role_policy.tf
+++ b/infrastructure/flash_lambda/role_policy.tf
@@ -12,7 +12,7 @@ resource "aws_iam_policy" "vsync_iam_policy_for_lambda" {
 }
 
 # Policy attachment on the role.
-resource "aws_iam_role_policy_attachment" "magna_attach_iam_policy_to_iam_role" {
+resource "aws_iam_role_policy_attachment" "vsync_attach_iam_policy_to_iam_role" {
   role       = aws_iam_role.vysnc_lambda_role.name
   policy_arn = aws_iam_policy.vsync_iam_policy_for_lambda.arn
 }
diff --git a/infrastructure/flash_lambda/src/flash_rank.py b/infrastructure/flash_lambda/src/flash_rank.py
new file mode 100644
index 0000000..7cd983e
--- /dev/null
+++ b/infrastructure/flash_lambda/src/flash_rank.py
@@ -0,0 +1,42 @@
+from flashrank import Ranker, RerankRequest
+
+# Nano (~4MB), blazing fast model & competitive performance
+ranker = Ranker()
+# query = "How to speedup LLMs?"
+# passages = [
+#     {
+#         "id": 1,
+#         "text": "Introduce *lookahead decoding*: - a parallel decoding algo to accelerate LLM inference - w/o the need for a draft model or a data store - linearly decreases # decoding steps relative to log(FLOPs) used per decoding step.",
+#         "meta": {"additional": "info1"}
+#     },
+#     {
+#         "id": 2,
+#         "text": "LLM inference efficiency will be one of the most crucial topics for both industry and academia, simply because the more efficient you are, the more $$$ you will save. vllm project is a must-read for this direction, and now they have just released the paper",
+#         "meta": {"additional": "info2"}
+#     },
+#     {
+#         "id": 3,
+#         "text": "There are many ways to increase LLM inference throughput (tokens/second) and decrease memory footprint, sometimes at the same time. Here are a few methods I’ve found effective when working with Llama 2. These methods are all well-integrated with Hugging Face. This list is far from exhaustive; some of these techniques can be used in combination with each other and there are plenty of others to try. - Bettertransformer (Optimum Library): Simply call `model.to_bettertransformer()` on your Hugging Face model for a modest improvement in tokens per second. - Fp4 Mixed-Precision (Bitsandbytes): Requires minimal configuration and dramatically reduces the model's memory footprint. - AutoGPTQ: Time-consuming but leads to a much smaller model and faster inference. The quantization is a one-time cost that pays off in the long run.",
+#         "meta": {"additional": "info3"}
+
+#     },
+#     {
+#         "id": 4,
+#         "text": "Ever want to make your LLM inference go brrrrr but got stuck at implementing speculative decoding and finding the suitable draft model? No more pain! Thrilled to unveil Medusa, a simple framework that removes the annoying draft model while getting 2x speedup.",
+#         "meta": {"additional": "info4"}
+#     },
+#     {
+#         "id": 5,
+#         "text": "vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is fast with: State-of-the-art serving throughput Efficient management of attention key and value memory with PagedAttention Continuous batching of incoming requests Optimized CUDA kernels",
+#         "meta": {"additional": "info5"}
+#     }
+# ]
+
+def rank_query_passages(ranker, query, passages):
+    rerankrequest = RerankRequest(query=query, passages=passages)
+    results = ranker.rerank(rerankrequest)
+    print(results)
+    return results
+
+def rank_query_lambda_handler(event, context):
+    return rank_query_passages(ranker, event['query'], event['passages'])
\ No newline at end of file
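Review note: below is a minimal local sketch (not part of the diff) for exercising the new handler outside AWS. It assumes the flashrank package is importable, that Ranker() can download its default nano model on first use, and that the reranked passages come back with an added score field; the event shape mirrors the commented example in flash_rank.py.

# Local smoke test for rank_query_lambda_handler; importing the module
# instantiates Ranker(), which may download the default nano model.
from flash_rank import rank_query_lambda_handler

event = {
    "query": "How to speedup LLMs?",
    "passages": [
        {"id": 1, "text": "vLLM is a fast and easy-to-use library for LLM inference and serving.", "meta": {"additional": "info5"}},
        {"id": 2, "text": "Introduce *lookahead decoding*: a parallel decoding algo to accelerate LLM inference.", "meta": {"additional": "info1"}},
    ],
}

# The handler never touches the Lambda context, so None is fine for a local run.
results = rank_query_lambda_handler(event, None)
for passage in results:
    # Each passage is expected to carry a relevance score added by flashrank.
    print(passage["id"], passage.get("score"))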