Commit c6c323a (1 parent: 58e579b)
Showing 3 changed files with 61 additions and 1 deletion.
@@ -0,0 +1,42 @@
from flashrank import Ranker, RerankRequest

# Nano (~4MB), blazing fast model with competitive ranking performance
ranker = Ranker()

# query = "How to speedup LLMs?"
# passages = [
#     {
#         "id": 1,
#         "text": "Introduce *lookahead decoding*: - a parallel decoding algo to accelerate LLM inference - w/o the need for a draft model or a data store - linearly decreases # decoding steps relative to log(FLOPs) used per decoding step.",
#         "meta": {"additional": "info1"}
#     },
#     {
#         "id": 2,
#         "text": "LLM inference efficiency will be one of the most crucial topics for both industry and academia, simply because the more efficient you are, the more $$$ you will save. vllm project is a must-read for this direction, and now they have just released the paper",
#         "meta": {"additional": "info2"}
#     },
#     {
#         "id": 3,
#         "text": "There are many ways to increase LLM inference throughput (tokens/second) and decrease memory footprint, sometimes at the same time. Here are a few methods I’ve found effective when working with Llama 2. These methods are all well-integrated with Hugging Face. This list is far from exhaustive; some of these techniques can be used in combination with each other and there are plenty of others to try. - Bettertransformer (Optimum Library): Simply call `model.to_bettertransformer()` on your Hugging Face model for a modest improvement in tokens per second. - Fp4 Mixed-Precision (Bitsandbytes): Requires minimal configuration and dramatically reduces the model's memory footprint. - AutoGPTQ: Time-consuming but leads to a much smaller model and faster inference. The quantization is a one-time cost that pays off in the long run.",
#         "meta": {"additional": "info3"}
#     },
#     {
#         "id": 4,
#         "text": "Ever want to make your LLM inference go brrrrr but got stuck at implementing speculative decoding and finding the suitable draft model? No more pain! Thrilled to unveil Medusa, a simple framework that removes the annoying draft model while getting 2x speedup.",
#         "meta": {"additional": "info4"}
#     },
#     {
#         "id": 5,
#         "text": "vLLM is a fast and easy-to-use library for LLM inference and serving. vLLM is fast with: State-of-the-art serving throughput Efficient management of attention key and value memory with PagedAttention Continuous batching of incoming requests Optimized CUDA kernels",
#         "meta": {"additional": "info5"}
#     }
# ]


def rank_query_passages(ranker, query, passages):
    """Rerank the given passages against the query and return the scored results."""
    rerankrequest = RerankRequest(query=query, passages=passages)
    results = ranker.rerank(rerankrequest)
    print(results)
    return results


def rank_query_lambda_handler(event, context):
    """AWS Lambda entry point: expects 'query' and 'passages' keys in the event."""
    return rank_query_passages(ranker, event['query'], event['passages'])
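
For context, a minimal local-invocation sketch (not part of the commit): it assumes the handler receives an event dict with 'query' and 'passages' keys, as the code above expects, and reuses abbreviated versions of the commented-out sample passages. The __main__ guard and sample_event are illustrative only.

if __name__ == "__main__":
    # Hypothetical local test of the Lambda handler; sample_event is made up
    # for illustration and mirrors the 'query'/'passages' shape the handler reads.
    sample_event = {
        "query": "How to speedup LLMs?",
        "passages": [
            {"id": 1, "text": "vLLM is a fast and easy-to-use library for LLM inference and serving.", "meta": {"additional": "info1"}},
            {"id": 2, "text": "Medusa is a simple framework that removes the draft model while getting 2x speedup.", "meta": {"additional": "info2"}},
        ],
    }
    ranked = rank_query_lambda_handler(sample_event, context=None)
    # flashrank returns the passages reordered by relevance, each annotated with a score.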