Test lambda deployment
mayankkom-dev committed Feb 13, 2024
1 parent 58e579b commit c6c323a
Showing 3 changed files with 61 additions and 1 deletion.
18 changes: 18 additions & 0 deletions infrastructure/flash_lambda/main.tf
@@ -24,4 +24,22 @@ resource "aws_lambda_layer_version" "vysnc_lambda_layer" {
"python3.10",
]
source_code_hash = data.archive_file.lambda_poetry_dependencies.output_base64sha256
}

data "archive_file" "flash_rank_zip" {
type = "zip"
source_dir = " ${path.module}/${var.lambda_deploy}/src"
output_path = "${path.module}/${var.lambda_deploy}.zip"
}

# Create a lambda function
resource "aws_lambda_function" "flash_rank_lambda" {
filename = data.archive_file.flash_rank_zip.output_path
function_name = "Flash Rank Lambda"
role = aws_iam_role.vysnc_lambda_role.arn
handler = "flash_rank.rank_query_lambda_handler"
runtime = "python3.10"
layers = [aws_lambda_layer_version.vysnc_lambda_layer.arn]
depends_on = [aws_iam_role_policy_attachment.vsync_attach_iam_policy_to_iam_role]
source_code_hash = data.archive_file.flash_rank_zip.output_base64sha256
}
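Once applied, the deployed function can be invoked with a JSON event carrying "query" and "passages", the shape consumed by the handler in flash_rank.py further down. A minimal invocation sketch with boto3, assuming the function name "flash_rank_lambda" from main.tf and default AWS credentials/region (assumptions for illustration, not part of this commit):

# Hypothetical client-side invocation; the function name and event shape are
# assumed from main.tf and flash_rank.py in this commit.
import json

import boto3

lambda_client = boto3.client("lambda")

event = {
    "query": "How to speedup LLMs?",
    "passages": [
        {"id": 5, "text": "vLLM is a fast and easy-to-use library for LLM inference and serving.", "meta": {"additional": "info5"}},
    ],
}

response = lambda_client.invoke(
    FunctionName="flash_rank_lambda",
    Payload=json.dumps(event).encode("utf-8"),
)
print(json.loads(response["Payload"].read()))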
2 changes: 1 addition & 1 deletion infrastructure/flash_lambda/role_policy.tf
@@ -12,7 +12,7 @@ resource "aws_iam_policy" "vsync_iam_policy_for_lambda" {
}

# Policy Attachment on the role.
resource "aws_iam_role_policy_attachment" "magna_attach_iam_policy_to_iam_role" {
resource "aws_iam_role_policy_attachment" "vsync_attach_iam_policy_to_iam_role" {
role = aws_iam_role.vysnc_lambda_role.name
policy_arn = aws_iam_policy.vsync_iam_policy_for_lambda.arn
}
42 changes: 42 additions & 0 deletions infrastructure/flash_lambda/src/flash_rank.py
@@ -0,0 +1,42 @@
from flashrank import Ranker, RerankRequest

# Nano (~4MB), blazing fast model & competitive performance
# Instantiate the reranker once at module load so warm Lambda invocations reuse it
ranker = Ranker()
# query = "How to speedup LLMs?"
# passages = [
# {
# "id":1,
# "text":"Introduce *lookahead decoding*: - a parallel decoding algo to accelerate LLM inference - w/o the need for a draft model or a data store - linearly decreases # decoding steps relative to log(FLOPs) used per decoding step.",
# "meta": {"additional": "info1"}
# },
# {
# "id":2,
# "text":"LLM inference efficiency will be one of the most crucial topics for both industry and academia, simply because the more efficient you are, the more $$$ you will save. vllm project is a must-read for this direction, and now they have just released the paper",
# "meta": {"additional": "info2"}
# },
# {
# "id":3,
# "text":"There are many ways to increase LLM inference throughput (tokens/second) and decrease memory footprint, sometimes at the same time. Here are a few methods I’ve found effective when working with Llama 2. These methods are all well-integrated with Hugging Face. This list is far from exhaustive; some of these techniques can be used in combination with each other and there are plenty of others to try. - Bettertransformer (Optimum Library): Simply call `model.to_bettertransformer()` on your Hugging Face model for a modest improvement in tokens per second. - Fp4 Mixed-Precision (Bitsandbytes): Requires minimal configuration and dramatically reduces the model's memory footprint. - AutoGPTQ: Time-consuming but leads to a much smaller model and faster inference. The quantization is a one-time cost that pays off in the long run.",
# "meta": {"additional": "info3"}

# },
# {
# "id":4,
# "text":"Ever want to make your LLM inference go brrrrr but got stuck at implementing speculative decoding and finding the suitable draft model? No more pain! Thrilled to unveil Medusa, a simple framework that removes the annoying draft model while getting 2x speedup.",
# "meta": {"additional": "info4"}
# },
# {
# "id":5,
# "text":"vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is fast with: State-of-the-art serving throughput Efficient management of attention key and value memory with PagedAttention Continuous batching of incoming requests Optimized CUDA kernels",
# "meta": {"additional": "info5"}
# }
# ]

def rank_query_passages(ranker, query, passages):
    # Build a rerank request and return the passages scored against the query.
    rerankrequest = RerankRequest(query=query, passages=passages)
    results = ranker.rerank(rerankrequest)
    print(results)
    return results

def rank_query_lambda_handler(event, context):
    # Lambda entry point: expects an event with "query" and "passages" keys.
    return rank_query_passages(ranker, event['query'], event['passages'])
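For a quick local smoke test before deploying, the handler can be called directly with an event that mirrors the commented-out sample above; a minimal sketch, assuming flashrank is installed in the local environment (illustrative only, not part of the commit):

# Hypothetical local test of the handler, reusing the sample query/passages
# documented in the comments above.
if __name__ == "__main__":
    sample_event = {
        "query": "How to speedup LLMs?",
        "passages": [
            {"id": 1, "text": "Introduce lookahead decoding: a parallel decoding algo to accelerate LLM inference.", "meta": {"additional": "info1"}},
            {"id": 5, "text": "vLLM is a fast and easy-to-use library for LLM inference and serving.", "meta": {"additional": "info5"}},
        ],
    }
    print(rank_query_lambda_handler(sample_event, context=None))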
