From 69f87b57320519f03e59367d62a2846f7c516e20 Mon Sep 17 00:00:00 2001
From: Akash Gupta
Date: Wed, 10 Sep 2025 21:44:57 +0530
Subject: [PATCH 1/7] add code for hf model importer

---
 hf-model-import-job/README.md        |  53 +++++++++++++
 hf-model-import-job/main.py          | 109 +++++++++++++++++++++++++++
 hf-model-import-job/requirements.txt |   4 +
 3 files changed, 166 insertions(+)
 create mode 100644 hf-model-import-job/README.md
 create mode 100644 hf-model-import-job/main.py
 create mode 100644 hf-model-import-job/requirements.txt

diff --git a/hf-model-import-job/README.md b/hf-model-import-job/README.md
new file mode 100644
index 0000000..96199fb
--- /dev/null
+++ b/hf-model-import-job/README.md
@@ -0,0 +1,53 @@
+# Hugging Face Model Import Job
+
+This script downloads a Hugging Face model by its model ID and logs it to TrueFoundry's model registry.
+
+## Installation
+
+1. Create a virtual environment (recommended):
+
+```bash
+python3 -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+2. Install the required dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+3. Make sure you have TrueFoundry credentials configured (via `tfy login` or environment variables).
+
+## Usage
+
+### Basic Usage
+
+```bash
+# Make sure to activate your virtual environment first
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+python main.py --model-id "microsoft/DialoGPT-medium" --ml-repo "my-ml-repo" --model-name "dialogpt-medium" --model-type "text-generation"
+```
+
+## Arguments
+
+- `--model-id` (required): Hugging Face model ID or repository ID
+- `--ml-repo` (required): TrueFoundry ML repository name
+- `--model-name` (required): Name for the model in TrueFoundry
+- `--model-type` (required): Type of the model (e.g., 'text-generation', 'fill-mask')
+- `--hf-token` (optional): Hugging Face token for private models
+
+## Examples
+
+### Import a popular language model:
+
+```bash
+python main.py --model-id "gpt2" --ml-repo "language-models" --model-name "gpt2-small" --model-type "text-generation"
+```
+
+### Import a BERT model:
+
+```bash
+python main.py --model-id "bert-base-uncased" --ml-repo "nlp-models" --model-name "bert-base" --model-type "fill-mask"
+```
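+
+### Import a private or gated model:
+
+For private or gated models, pass a Hugging Face token via `--hf-token` (the model ID below is a placeholder):
+
+```bash
+python main.py --model-id "your-org/private-model" --ml-repo "private-models" --model-name "private-model" --model-type "text-generation" --hf-token "hf_xxx"
+```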
+""" + +import argparse +import os +import tempfile +import shutil + +from huggingface_hub import snapshot_download +from truefoundry.ml import get_client, TransformersFramework + +def main(): + """Main function to handle command line arguments and orchestrate the process.""" + parser = argparse.ArgumentParser( + description="Download a Hugging Face model and log it to TrueFoundry", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python main.py --model-id "microsoft/DialoGPT-medium" --ml-repo "my-ml-repo" --model-name "dialogpt-medium" --model-type "text-generation" + python main.py --model-id "gpt2" --ml-repo "my-repo" --model-name "gpt2-model" --model-type "text-generation" + """ + ) + + parser.add_argument( + "--model-id", + required=True, + help="Hugging Face model ID (e.g., 'microsoft/DialoGPT-medium')" + ) + + parser.add_argument( + "--ml-repo", + required=True, + help="TrueFoundry ML repository name" + ) + + parser.add_argument( + "--model-name", + required=True, + help="Name for the model in TrueFoundry" + ) + + parser.add_argument( + "--model-type", + required=True, + help="Type of the model (e.g., 'text-generation')" + ) + + parser.add_argument( + "--hf-token", + required=False, + help="Hugging Face token for private models" + ) + + args = parser.parse_args() + + # Create temporary directory for download + temp_dir = tempfile.mkdtemp() + model_download_path = temp_dir + + try: + + snapshot_download( + args.model_id, + revision=None, + cache_dir=None, + local_dir=model_download_path, + ignore_patterns=["*.h5", "*.ot"], + local_dir_use_symlinks=False, + token=args.hf_token, + ) + + if os.path.exists(os.path.join(model_download_path, '.cache')): + shutil.rmtree(os.path.join(model_download_path, '.cache')) + + + ML_REPO = args.ml_repo # ML Repo to upload to + MODEL_NAME = args.model_name # Model Name to upload as + + client = get_client() + model_version = client.log_model( + ml_repo=ML_REPO, + name=MODEL_NAME, + model_file_or_folder=model_download_path, + framework=TransformersFramework( + model_id=args.model_id, + pipeline_tag=args.model_type + ), + ) + + print(f"\n✅ Success! 
+        snapshot_download(
+            args.model_id,
+            revision=None,
+            cache_dir=None,
+            local_dir=model_download_path,
+            ignore_patterns=["*.h5", "*.ot"],
+            local_dir_use_symlinks=False,
+            token=args.hf_token,
+        )
+
+        if os.path.exists(os.path.join(model_download_path, '.cache')):
+            shutil.rmtree(os.path.join(model_download_path, '.cache'))
+
+
+        ML_REPO = args.ml_repo  # ML Repo to upload to
+        MODEL_NAME = args.model_name  # Model Name to upload as
+
+        client = get_client()
+        model_version = client.log_model(
+            ml_repo=ML_REPO,
+            name=MODEL_NAME,
+            model_file_or_folder=model_download_path,
+            framework=TransformersFramework(
+                model_id=args.model_id,
+                pipeline_tag=args.model_type
+            ),
+        )
+
+        print(f"\n✅ Success! Model logged to TrueFoundry with FQN: {model_version.fqn}")
+
+        # Clean up temporary files
+        print("Cleaning up temporary files...")
+        shutil.rmtree(temp_dir)
+
+    except Exception as e:
+        print(f"\n❌ Error: {str(e)}")
+        shutil.rmtree(temp_dir)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/hf-model-import-job/requirements.txt b/hf-model-import-job/requirements.txt
new file mode 100644
index 0000000..118e1a3
--- /dev/null
+++ b/hf-model-import-job/requirements.txt
@@ -0,0 +1,4 @@
+truefoundry==0.11.12
+huggingface_hub>=0.19.0
+transformers>=4.30.0
+torch>=2.0.0
\ No newline at end of file

From 1010610f66492fbd4786b11e320acfe087e2d5ad Mon Sep 17 00:00:00 2001
From: Akash Gupta
Date: Fri, 12 Sep 2025 19:21:15 +0530
Subject: [PATCH 2/7] update deploy file

---
 hf-model-import-job/deploy.py | 69 +++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 hf-model-import-job/deploy.py

diff --git a/hf-model-import-job/deploy.py b/hf-model-import-job/deploy.py
new file mode 100644
index 0000000..2c94689
--- /dev/null
+++ b/hf-model-import-job/deploy.py
@@ -0,0 +1,69 @@
+import logging
+from truefoundry.deploy import (
+    Param,
+    Manual,
+    Build,
+    Resources,
+    Job,
+    PythonBuild,
+    NodeSelector,
+    LocalSource,
+)
+import argparse
+
+logging.basicConfig(level=logging.INFO)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--workspace_fqn", required=True, type=str)
+args = parser.parse_args()
+
+job = Job(
+    name="hf-model-importer",
+    image=Build(
+        # Use build_source=LocalSource(local_build=False) to deploy code from your local machine.
+        # With local_build=False, the Docker image is built on the cloud instead of locally;
+        # otherwise, the Docker installed on your local machine is used to build the image.
+        build_source=LocalSource(local_build=False),
+        build_spec=PythonBuild(
+            python_version="3.11",
+            build_context_path="./hf-model-import-job",
+            requirements_path="requirements.txt",
+            command="python main.py --model-id {{model_id}} --model-type {{model_type}} --ml-repo {{ml_repo}} --model-name {{model_name}}",
+        ),
+    ),
+    trigger=Manual(),
+    params=[
+        Param(
+            name="model_id", description="Hugging Face model ID", param_type="string"
+        ),
+        Param(
+            name="model_type",
+            description="Model type from Hugging Face",
+            default="text-generation",
+            param_type="string",
+        ),
+        Param(
+            name="ml_repo",
+            description="ML repo name to import the model to",
+            param_type="ml_repo",
+        ),
+        Param(
+            name="model_name",
+            description="Model name in the TrueFoundry model registry",
+            param_type="string",
+        ),
+    ],
+    resources=Resources(
+        cpu_request=1.0,
+        cpu_limit=2.0,
+        memory_request=2000,
+        memory_limit=4000,
+        ephemeral_storage_request=10000,
+        ephemeral_storage_limit=20000,
+    ),
+    retries=0,
+    workspace_fqn=args.workspace_fqn,
+)
+
+
+job.deploy(workspace_fqn=args.workspace_fqn, wait=False)
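+
+# Example invocation (the workspace FQN below is a placeholder):
+#   python deploy.py --workspace_fqn "my-cluster:my-workspace"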
From f7b204179abb25a7af2de4cbeafaa19e78cbc225 Mon Sep 17 00:00:00 2001
From: Sourav Gupta
Date: Fri, 12 Sep 2025 17:21:22 -0700
Subject: [PATCH 3/7] Cloning Model

---
 hf-model-import-job/clone_model.py | 103 +++++++++++++++++++++++++++++
 hf-model-import-job/main.py        |   3 +-
 2 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 hf-model-import-job/clone_model.py

diff --git a/hf-model-import-job/clone_model.py b/hf-model-import-job/clone_model.py
new file mode 100644
index 0000000..4eed41e
--- /dev/null
+++ b/hf-model-import-job/clone_model.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+Model Clone Script for TrueFoundry
+
+This script clones a model from one TrueFoundry ML repository and logs it to another TrueFoundry ML repository.
+"""
+
+import argparse
+import os
+import tempfile
+import shutil
+
+from truefoundry.ml import get_client, TransformersFramework
+
+def main():
+    """Main function to handle command line arguments and orchestrate the process."""
+    parser = argparse.ArgumentParser(
+        description="Clone a model from one TrueFoundry ML repository and log it to another",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    python clone_model.py --source-ml-repo "wns-testing" --source-model-name "liquid-ai" --source-model-version "1" --target-ml-repo "wns-testing-dest" --target-model-name "liquid-ai"
+        """
+    )
+
+    parser.add_argument(
+        "--source-ml-repo",
+        required=True,
+        help="TrueFoundry source ML repository name"
+    )
+
+    parser.add_argument(
+        "--source-model-name",
+        required=True,
+        help="TrueFoundry source model name"
+    )
+
+    parser.add_argument(
+        "--source-model-version",
+        required=True,
+        help="TrueFoundry source model version"
+    )
+
+    parser.add_argument(
+        "--target-ml-repo",
+        required=True,
+        help="TrueFoundry target ML repository name"
+    )
+
+    parser.add_argument(
+        "--target-model-name",
+        required=True,
+        help="Name for the model in TrueFoundry target ML repository"
+    )
+
+    args = parser.parse_args()
+
+    # Create temporary directory for download
+    temp_dir = tempfile.mkdtemp()
+    model_download_path = temp_dir
+
+    try:
+
+        client = get_client()
+
+        source_model_version = client.get_model_version(
+            ml_repo=args.source_ml_repo,
+            name=args.source_model_name,
+            version=args.source_model_version
+        )
+
+        download_info = source_model_version.download(path=model_download_path, overwrite=True)
+
+        if os.path.exists(os.path.join(model_download_path, '.cache')):
+            shutil.rmtree(os.path.join(model_download_path, '.cache'))
+
+        print(f"Model downloaded to {model_download_path}")
+
+        destination_model_version = client.log_model(
+            ml_repo=args.target_ml_repo,
+            name=args.target_model_name,
+            description=source_model_version.description,
+            metadata=source_model_version.metadata,
+            model_file_or_folder=model_download_path,
+            framework=source_model_version.framework,
+        )
+
+        print(f"\n✅ Success! Model logged to TrueFoundry with FQN: {source_model_version.fqn}")
+
+        # Clean up temporary files
+        print("Cleaning up temporary files...")
+        shutil.rmtree(temp_dir)
+
+    except Exception as e:
+        print(f"\n❌ Error: {str(e)}")
+        shutil.rmtree(temp_dir)
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    exit(main())
diff --git a/hf-model-import-job/main.py b/hf-model-import-job/main.py
index 9b170e4..264f7f4 100644
--- a/hf-model-import-job/main.py
+++ b/hf-model-import-job/main.py
@@ -22,6 +22,7 @@ def main():
 Examples:
     python main.py --model-id "microsoft/DialoGPT-medium" --ml-repo "my-ml-repo" --model-name "dialogpt-medium" --model-type "text-generation"
     python main.py --model-id "gpt2" --ml-repo "my-repo" --model-name "gpt2-model" --model-type "text-generation"
+    python main.py --model-id "LiquidAI/LFM2-350M" --ml-repo "wns-testing" --model-name "liquid-ai" --model-type "text-generation"
         """
     )
@@ -76,7 +77,7 @@ def main():
 
         if os.path.exists(os.path.join(model_download_path, '.cache')):
             shutil.rmtree(os.path.join(model_download_path, '.cache'))
-
+        print(f"Model downloaded to {model_download_path}")
 
         ML_REPO = args.ml_repo  # ML Repo to upload to
         MODEL_NAME = args.model_name  # Model Name to upload as
From fbf92f1462332d2ce8f6145578baa8cfcb813c0f Mon Sep 17 00:00:00 2001
From: Sourav Gupta
Date: Sun, 14 Sep 2025 17:31:06 -0700
Subject: [PATCH 4/7] Model Clone

---
 hf-model-import-job/clone_model.py | 2 ++
 hf-model-import-job/main.py        | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hf-model-import-job/clone_model.py b/hf-model-import-job/clone_model.py
index 4eed41e..926fed4 100644
--- a/hf-model-import-job/clone_model.py
+++ b/hf-model-import-job/clone_model.py
@@ -63,6 +63,8 @@ def main():
 
         client = get_client()
 
+        print("List ML Repos:", client.list_ml_repos())
+
         source_model_version = client.get_model_version(
             ml_repo=args.source_ml_repo,
             name=args.source_model_name,
             version=args.source_model_version
diff --git a/hf-model-import-job/main.py b/hf-model-import-job/main.py
index 264f7f4..cf857dd 100644
--- a/hf-model-import-job/main.py
+++ b/hf-model-import-job/main.py
@@ -22,7 +22,6 @@ def main():
 Examples:
     python main.py --model-id "microsoft/DialoGPT-medium" --ml-repo "my-ml-repo" --model-name "dialogpt-medium" --model-type "text-generation"
     python main.py --model-id "gpt2" --ml-repo "my-repo" --model-name "gpt2-model" --model-type "text-generation"
-    python main.py --model-id "LiquidAI/LFM2-350M" --ml-repo "wns-testing" --model-name "liquid-ai" --model-type "text-generation"
         """
     )
@@ -77,7 +76,6 @@ def main():
 
         if os.path.exists(os.path.join(model_download_path, '.cache')):
             shutil.rmtree(os.path.join(model_download_path, '.cache'))
-        print(f"Model downloaded to {model_download_path}")
 
         ML_REPO = args.ml_repo  # ML Repo to upload to
         MODEL_NAME = args.model_name  # Model Name to upload as

From a1a70463ad5b4e62f7342bee2644b5d6b653c2e5 Mon Sep 17 00:00:00 2001
From: Sourav Gupta
Date: Mon, 15 Sep 2025 12:38:30 -0700
Subject: [PATCH 5/7] Retry and backoff

---
 hf-model-import-job/clone_model.py | 106 +++++++++++++++++++----------
 1 file changed, 71 insertions(+), 35 deletions(-)

diff --git a/hf-model-import-job/clone_model.py b/hf-model-import-job/clone_model.py
index 926fed4..ffb3c0b 100644
--- a/hf-model-import-job/clone_model.py
+++ b/hf-model-import-job/clone_model.py
@@ -9,49 +9,38 @@
 import os
 import tempfile
 import shutil
+import time
 
 from truefoundry.ml import get_client, TransformersFramework
 
print("Starting model cloning process...") + + """Main function to handle cloning a model from one TrueFoundry's Repo and logging it to another TrueFoundry's Repo""" + parser = argparse.ArgumentParser( description="Clone a model from one TrueFoundry's Repo and log it to another TrueFoundry's Repo", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" -Examples: - python clone_model.py --source-ml-repo "wns-testing" --source-model-name "liquid-ai" --source-model-version "1" --target-ml-repo "wns-testing-dest" --target-model-name "liquid-ai" + Examples: + python clone_model.py --source-model-fqn=model:truefoundry/wns-testing/liquid-ai:1 --target-ml-repo=wns-testing-dest """ ) parser.add_argument( - "--source-ml-repo", - required=True, - help="TrueFoundry source ML repository name" - ) - - parser.add_argument( - "--source-model-name", + "--source-model-fqn", required=True, - help="TrueFoundry source model name" + help="TrueFoundry source model FQN" ) - parser.add_argument( - "--source-model-version", - required=True, - help="TrueFoundry source model version" - ) - parser.add_argument( "--target-ml-repo", required=True, help="TrueFoundry target ML repository name" ) - - parser.add_argument( - "--target-model-name", - required=True, - help="Name for the model in TrueFoundry target ML repository" - ) + MAX_RETRIES = os.getenv("MAX_RETRIES", 5) + INITIAL_BACKOFF_SECONDS = os.getenv("INITIAL_BACKOFF_SECONDS", 2) args = parser.parse_args() @@ -60,34 +49,49 @@ def main(): model_download_path = temp_dir try: - client = get_client() + print(f"Successfully connected to TrueFoundry") - print("List ML Repos:", client.list_ml_repos()) - - source_model_version = client.get_model_version( - ml_repo=args.source_ml_repo, - name=args.source_model_name, - version=args.source_model_version + print(f"Getting source model version for FQN: {args.source_model_fqn}") + source_model_version = client.get_model_version_by_fqn( + fqn=args.source_model_fqn ) - download_info = source_model_version.download(path=model_download_path, overwrite=True) + ml_repos = client.list_ml_repos() + if args.target_ml_repo not in ml_repos: + raise ValueError(f"ML Repo {args.target_ml_repo} not found") + + print(f"Downloading model to {model_download_path}") + + run_with_retry( + source_model_version.download, + max_retries=MAX_RETRIES, + initial_backoff=INITIAL_BACKOFF_SECONDS, + path=model_download_path, + overwrite=True + ) if os.path.exists(os.path.join(model_download_path, '.cache')): shutil.rmtree(os.path.join(model_download_path, '.cache')) - print(f"Model downloaded to {model_download_path}") + print(f"Model downloaded successfully to {model_download_path}") + source_model_version.metadata["source_model_fqn"] = args.source_model_fqn + + print(f"Uploading model to {args.target_ml_repo}") - destination_model_version = client.log_model( + destination_model_version = run_with_retry( + client.log_model, + max_retries=MAX_RETRIES, + initial_backoff=INITIAL_BACKOFF_SECONDS, ml_repo=args.target_ml_repo, - name=args.target_model_name, + name=source_model_version.name, description=source_model_version.description, metadata=source_model_version.metadata, model_file_or_folder=model_download_path, framework=source_model_version.framework, ) - print(f"\n✅ Success! Model logged to TrueFoundry with FQN: {source_model_version.fqn}") + print(f"\n✅ Success! 
 
 if __name__ == "__main__":
     exit(main())

From 17a34fc25307c1585682df325e6c60dde53ba0f8 Mon Sep 17 00:00:00 2001
From: Sourav Gupta
Date: Mon, 15 Sep 2025 12:58:55 -0700
Subject: [PATCH 6/7] Env variable bugfix

---
 hf-model-import-job/clone_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hf-model-import-job/clone_model.py b/hf-model-import-job/clone_model.py
index ffb3c0b..86c06b8 100644
--- a/hf-model-import-job/clone_model.py
+++ b/hf-model-import-job/clone_model.py
@@ -39,8 +39,8 @@ def main():
         required=True,
         help="TrueFoundry target ML repository name"
     )
-    MAX_RETRIES = os.getenv("MAX_RETRIES", 5)
-    INITIAL_BACKOFF_SECONDS = os.getenv("INITIAL_BACKOFF_SECONDS", 2)
+    MAX_RETRIES = int(os.getenv("MAX_RETRIES", 5))
+    INITIAL_BACKOFF_SECONDS = int(os.getenv("INITIAL_BACKOFF_SECONDS", 2))
 
     args = parser.parse_args()

From dd414757658beb27e433bff9527d90d33c3c5c1c Mon Sep 17 00:00:00 2001
From: Sourav Gupta
Date: Mon, 15 Sep 2025 13:11:14 -0700
Subject: [PATCH 7/7] Finalizing changes

---
 model-clone-job/README.md        | 56 +++++++++++++++++++
 .../clone_model.py               |  0
 model-clone-job/requirements.txt |  4 ++
 3 files changed, 60 insertions(+)
 create mode 100644 model-clone-job/README.md
 rename {hf-model-import-job => model-clone-job}/clone_model.py (100%)
 create mode 100644 model-clone-job/requirements.txt

diff --git a/model-clone-job/README.md b/model-clone-job/README.md
new file mode 100644
index 0000000..e0cb1fd
--- /dev/null
+++ b/model-clone-job/README.md
@@ -0,0 +1,56 @@
+# Model Clone Job
+
+This script clones a model from one TrueFoundry repository and logs it to another TrueFoundry repository. It downloads the source model, preserves its metadata and framework information, and uploads it to the target repository.
+
+## Installation
+
+1. Create a virtual environment (recommended):
+
+```bash
+python3 -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+
+2. Install the required dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+3. Make sure you have TrueFoundry credentials configured (via `tfy login` or environment variables).
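+
+   For non-interactive runs (e.g., inside a scheduled job), credentials can be
+   supplied via environment variables instead of `tfy login`. A minimal sketch,
+   assuming the `TFY_API_KEY` and `TFY_HOST` variables read by the TrueFoundry
+   SDK (both values below are placeholders):
+
+   ```bash
+   export TFY_HOST="https://your-org.truefoundry.cloud"
+   export TFY_API_KEY="<your-api-key>"
+   ```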
+
+## Usage
+
+### Basic Usage
+
+```bash
+# Make sure to activate your virtual environment first
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+python clone_model.py --source-model-fqn "model:truefoundry/source-repo/my-model:1" --target-ml-repo "target-repo"
+```
+
+## Arguments
+
+- `--source-model-fqn` (required): TrueFoundry source model FQN (Fully Qualified Name) in the format `model:workspace/source-repo/model-name:version`
+- `--target-ml-repo` (required): TrueFoundry target ML repository name where the model will be cloned
+
+## Examples
+
+### Clone a model from one repository to another:
+
+```bash
+python clone_model.py --source-model-fqn "model:truefoundry/production-models/bert-classifier:2" --target-ml-repo "staging-models"
+```
+
+### Clone a model with a specific version:
+
+```bash
+python clone_model.py --source-model-fqn "model:truefoundry/ml-team/image-classifier:v1.0" --target-ml-repo "backup-models"
+```
+
+### Clone a model across different workspaces:
+
+```bash
+python clone_model.py --source-model-fqn "model:truefoundry/dev-workspace/experimental-model:latest" --target-ml-repo "prod-workspace-models"
+```
diff --git a/hf-model-import-job/clone_model.py b/model-clone-job/clone_model.py
similarity index 100%
rename from hf-model-import-job/clone_model.py
rename to model-clone-job/clone_model.py
diff --git a/model-clone-job/requirements.txt b/model-clone-job/requirements.txt
new file mode 100644
index 0000000..118e1a3
--- /dev/null
+++ b/model-clone-job/requirements.txt
@@ -0,0 +1,4 @@
+truefoundry==0.11.12
+huggingface_hub>=0.19.0
+transformers>=4.30.0
+torch>=2.0.0
\ No newline at end of file
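
The series adds a deploy script for the import job but not for the clone job. A deployment spec analogous to `hf-model-import-job/deploy.py` could look like the sketch below; the job name, parameter descriptions, and resource sizes are illustrative assumptions rather than part of these patches:

```python
import argparse
import logging

from truefoundry.deploy import (
    Build,
    Job,
    LocalSource,
    Manual,
    Param,
    PythonBuild,
    Resources,
)

logging.basicConfig(level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("--workspace_fqn", required=True, type=str)
args = parser.parse_args()

# Mirrors hf-model-import-job/deploy.py, adapted to clone_model.py's arguments.
job = Job(
    name="model-cloner",  # illustrative job name
    image=Build(
        # Build the Docker image on the cloud from local sources, as in the import job.
        build_source=LocalSource(local_build=False),
        build_spec=PythonBuild(
            python_version="3.11",
            build_context_path="./model-clone-job",
            requirements_path="requirements.txt",
            command="python clone_model.py --source-model-fqn {{source_model_fqn}} --target-ml-repo {{target_ml_repo}}",
        ),
    ),
    trigger=Manual(),
    params=[
        Param(
            name="source_model_fqn",
            description="FQN of the source model version to clone",
            param_type="string",
        ),
        Param(
            name="target_ml_repo",
            description="ML repo to clone the model into",
            param_type="ml_repo",
        ),
    ],
    resources=Resources(  # sizes copied from the import job; tune as needed
        cpu_request=1.0,
        cpu_limit=2.0,
        memory_request=2000,
        memory_limit=4000,
        ephemeral_storage_request=10000,
        ephemeral_storage_limit=20000,
    ),
    retries=0,
    workspace_fqn=args.workspace_fqn,
)

job.deploy(workspace_fqn=args.workspace_fqn, wait=False)
```

As with the import job's deploy script, this would be run once with `--workspace_fqn` pointing at the target workspace.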