from sagemaker.serve.spec.inference_spec import InferenceSpec
from torchvision.transforms import transforms
from torchvision.models.squeezenet import squeezenet1_1
-
+import tests.integ
from tests.integ.sagemaker.serve.constants import (
    PYTORCH_SQUEEZENET_RESOURCE_DIR,
    SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
    PYTHON_VERSION_IS_NOT_310,
)
from tests.integ.timeout import timeout
-from tests.integ.utils import cleanup_model_resources
+from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
import logging

logger = logging.getLogger(__name__)
@@ -140,6 +140,8 @@ def model_builder_inference_spec_schema_builder(squeezenet_inference_spec, squee
        model_path=PYTORCH_SQUEEZENET_RESOURCE_DIR,
        inference_spec=squeezenet_inference_spec,
        schema_builder=squeezenet_schema,
+        image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:2.0.0-cpu-py310",
+        instance_type="ml.c6i.xlarge",
    )
@@ -148,69 +150,39 @@ def model_builder(request):
    return request.getfixturevalue(request.param)


-# @pytest.mark.skipif(
-#     PYTHON_VERSION_IS_NOT_310,
-#     reason="The goal of these test are to test the serving components of our feature",
-# )
-# @pytest.mark.parametrize(
-#     "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
-# )
-# @pytest.mark.slow_test
-# @pytest.mark.flaky(reruns=5, reruns_delay=2)
-# def test_happy_pytorch_local_container(sagemaker_session, model_builder, test_image):
-#     logger.info("Running in LOCAL_CONTAINER mode...")
-#     caught_ex = None
-#
-#     model = model_builder.build(mode=Mode.LOCAL_CONTAINER, sagemaker_session=sagemaker_session)
-#
-#     with timeout(minutes=SERVE_LOCAL_CONTAINER_TIMEOUT):
-#         try:
-#             logger.info("Deploying and predicting in LOCAL_CONTAINER mode...")
-#             predictor = model.deploy()
-#             logger.info("Local container successfully deployed.")
-#             predictor.predict(test_image)
-#         except Exception as e:
-#             logger.exception("test failed")
-#             caught_ex = e
-#         finally:
-#             if model.modes[str(Mode.LOCAL_CONTAINER)].container:
-#                 model.modes[str(Mode.LOCAL_CONTAINER)].container.kill()
-#     if caught_ex:
-#         assert (
-#             False
-#         ), f"{caught_ex} was thrown when running pytorch squeezenet local container test"
-
-
@pytest.mark.skipif(
-    PYTHON_VERSION_IS_NOT_310,  # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE,
-    reason="The goal of these test are to test the serving components of our feature",
+    PYTHON_VERSION_IS_NOT_310,
+    tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
+    and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
+    reason="no ml.p2 or ml.p3 instances in this region",
)
+@retry_with_instance_list(gpu_list(tests.integ.test_region()))
@pytest.mark.parametrize(
    "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
)
@pytest.mark.slow_test
def test_happy_pytorch_sagemaker_endpoint(
    sagemaker_session,
    model_builder,
-    cpu_instance_type,
    test_image,
+    **kwargs
):
    logger.info("Running in SAGEMAKER_ENDPOINT mode...")
    caught_ex = None
-
-    # iam_client = sagemaker_session.boto_session.client("iam")
-    # role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
-
-    model = model_builder.build(mode=Mode.LOCAL_CONTAINER, sagemaker_session=sagemaker_session)
-
+    iam_client = sagemaker_session.boto_session.client("iam")
+    role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
+    model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
        try:
            logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
-            predictor = model.deploy()
+            predictor = model.deploy(
+                mode=Mode.SAGEMAKER_ENDPOINT,
+                instance_type="ml.c6i.xlarge",
+                initial_instance_count=1,
+            )
            logger.info("Endpoint successfully deployed.")
            predictor.predict(test_image)
        except Exception as e:
-            logger.exception("test failed")
            caught_ex = e
        finally:
            cleanup_model_resources(
@@ -223,89 +195,3 @@ def test_happy_pytorch_sagemaker_endpoint(
        assert (
            False
        ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"
-
-
-# @pytest.mark.skipif(
-#     PYTHON_VERSION_IS_NOT_310,
-#     reason="The goal of these test are to test the serving components of our feature",
-# )
-# @pytest.mark.parametrize(
-#     "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
-# )
-# @pytest.mark.slow_test
-# def test_happy_pytorch_local_container_overwrite_to_sagemaker_endpoint(
-#     sagemaker_session, model_builder, cpu_instance_type, test_image
-# ):
-#     logger.info("Building model in LOCAL_CONTAINER mode...")
-#     caught_ex = None
-#
-#     iam_client = sagemaker_session.boto_session.client("iam")
-#     role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
-#     logger.debug("Role arn: %s", role_arn)
-#
-#     model = model_builder.build(
-#         mode=Mode.LOCAL_CONTAINER, role_arn=role_arn, sagemaker_session=sagemaker_session
-#     )
-#
-#     with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
-#         try:
-#             logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
-#             predictor = model.deploy(
-#                 instance_type=cpu_instance_type,
-#                 initial_instance_count=1,
-#                 mode=Mode.SAGEMAKER_ENDPOINT,
-#             )
-#             logger.info("Endpoint successfully deployed.")
-#             predictor.predict(test_image)
-#         except Exception as e:
-#             caught_ex = e
-#         finally:
-#             cleanup_model_resources(
-#                 sagemaker_session=model_builder.sagemaker_session,
-#                 model_name=model.name,
-#                 endpoint_name=model.endpoint_name,
-#             )
-#     if caught_ex:
-#         logger.exception(caught_ex)
-#         assert (
-#             False
-#         ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"
-
-
-# @pytest.mark.skipif(
-#     PYTHON_VERSION_IS_NOT_310,
-#     reason="The goal of these test are to test the serving components of our feature",
-# )
-# @pytest.mark.parametrize(
-#     "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
-# )
-# @pytest.mark.slow_test
-# def test_happy_pytorch_sagemaker_endpoint_overwrite_to_local_container(
-#     sagemaker_session, model_builder, test_image
-# ):
-#     logger.info("Building model in SAGEMAKER_ENDPOINT mode...")
-#     caught_ex = None
-#
-#     iam_client = sagemaker_session.boto_session.client("iam")
-#     role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
-#
-#     model = model_builder.build(
-#         mode=Mode.SAGEMAKER_ENDPOINT, role_arn=role_arn, sagemaker_session=sagemaker_session
-#     )
-#
-#     with timeout(minutes=SERVE_LOCAL_CONTAINER_TIMEOUT):
-#         try:
-#             logger.info("Deploying and predicting in LOCAL_CONTAINER mode...")
-#             predictor = model.deploy(mode=Mode.LOCAL_CONTAINER)
-#             logger.info("Local container successfully deployed.")
-#             predictor.predict(test_image)
-#         except Exception as e:
-#             logger.exception("test failed")
-#             caught_ex = e
-#         finally:
-#             if model.modes[str(Mode.LOCAL_CONTAINER)].container:
-#                 model.modes[str(Mode.LOCAL_CONTAINER)].container.kill()
-#     if caught_ex:
-#         assert (
-#             False
-#         ), f"{caught_ex} was thrown when running pytorch squeezenet local container test"
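
Note on the new helpers: gpu_list and retry_with_instance_list are imported from tests.integ.utils, whose implementations are outside this diff. The sketch below is a minimal, assumed illustration of the pattern the names suggest: the decorator re-runs the test once per candidate GPU instance type, injecting each one as an instance_type kwarg that the test's new **kwargs parameter absorbs. The region mapping, retry policy, and helper bodies here are illustrative, not the SDK's actual code.

import functools

def gpu_list(region):
    # Illustrative only: the real region -> available GPU instance type
    # mapping is maintained in tests.integ.utils.
    return ["ml.p3.2xlarge", "ml.p2.xlarge"]

def retry_with_instance_list(instance_list):
    """Run the wrapped test once per instance type until one succeeds."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            last_error = None
            for instance_type in instance_list:
                try:
                    # Injected as a kwarg; the decorated test's **kwargs
                    # absorbs it (or forwards it to deploy()).
                    return func(*args, instance_type=instance_type, **kwargs)
                except Exception as e:  # e.g. capacity errors for this type
                    last_error = e
            if last_error is not None:
                raise last_error
        return wrapper
    return decorator

Since pytest.mark.skipif treats each positional argument as a separate condition and skips when any is true, the rewritten guard skips the test either off Python 3.10 or in regions lacking both ml.p2 and ml.p3 capacity.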