diff --git a/nbs/s3_loader2.ipynb b/nbs/s3_loader2.ipynb
new file mode 100644
index 0000000..0bf6cba
--- /dev/null
+++ b/nbs/s3_loader2.ipynb
@@ -0,0 +1,485 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | default_exp s3_loader2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | export\n",
+    "import boto3\n",
+    "import os\n",
+    "import datetime\n",
+    "from datetime import tzinfo\n",
+    "from dateutil.tz import tzutc\n",
+    "from torch_snippets import stem, fname"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch_snippets.s3_loader2 import S3FileHandler\n",
+    "\n",
+    "aws_access_key_id = \"AKIAQFXXXXXXXX6CN\"\n",
+    "aws_secret_access_key = \"AC3AJsZ6XXXXXXXXXXXXXXXXXejfNN9h\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# | export\n",
+    "# | hide\n",
+    "class S3FileHandler:\n",
+    "    def __init__(self, aws_access_key, aws_secret_access_key):\n",
+    "        self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_access_key)\n",
+    "\n",
+    "    def list_s3_buckets(self):\n",
+    "        \"\"\"\n",
+    "        List all S3 buckets accessible with the given credentials.\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            # Call S3 to list current buckets\n",
+    "            response = self.s3_client.list_buckets()\n",
+    "            buckets = [bucket['Name'] for bucket in response['Buckets']]\n",
+    "            return buckets\n",
+    "        except Exception as e:\n",
+    "            print(e)\n",
+    "\n",
+    "    def list_s3_objects(self, bucket_name, key=\"\"):\n",
+    "        \"\"\"\n",
+    "        List all files in an S3 bucket or within a specific prefix.\n",
+    "\n",
+    "        :param bucket_name: str. Name of the S3 bucket.\n",
+    "        :param key: str. Specific prefix to list files from; defaults to \"\" (the whole bucket).\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            # Initialize a paginator for listing objects\n",
+    "            paginator = self.s3_client.get_paginator('list_objects_v2')\n",
+    "            # Use the paginator to fetch all objects in the specified bucket and prefix if provided\n",
+    "            files = dict()\n",
+    "            for page in paginator.paginate(Bucket=bucket_name, Prefix=key):\n",
+    "                # Access the 'Contents' from the page, which lists the objects\n",
+    "                if 'Contents' in page:\n",
+    "                    for obj in page['Contents']:\n",
+    "                        files[obj['Key']] = obj['Size']\n",
+    "                        # print(f\"{obj['Key']} ({obj['Size']} bytes)\")\n",
+    "            return files\n",
+    "        except Exception as e:\n",
+    "            print(f\"An error occurred: {e}\")\n",
+    "\n",
+    "    def download_s3_folder(self, bucket_name, local_dir, prefix=\"\", verbose=0):\n",
+    "        \"\"\"\n",
+    "        Download all files from an S3 bucket prefix to a local directory.\n",
+    "\n",
+    "        :param bucket_name: str. Name of the S3 bucket.\n",
+    "        :param local_dir: str. Local directory to which files will be downloaded.\n",
+    "        :param prefix: str. Prefix path of the folder in the bucket. If empty, the whole bucket is downloaded.\n",
+    "        :param verbose: bool. If True, print the download status of each file.\n",
+    "        \"\"\"\n",
+    "        # Ensure local directory exists\n",
+    "        if prefix == \"\":\n",
+    "            local_dir = os.path.join(local_dir, bucket_name)\n",
+    "        else:\n",
+    "            if not prefix.endswith(\"/\"):\n",
+    "                prefix = prefix + \"/\"\n",
+    "            local_dir = os.path.join(local_dir, stem(prefix))\n",
+    "        if not os.path.exists(local_dir):\n",
+    "            os.makedirs(local_dir)\n",
+    "\n",
+    "        # List objects within the specified prefix\n",
+    "        paginator = self.s3_client.get_paginator('list_objects_v2')\n",
+    "        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):\n",
+    "            for obj in page.get('Contents', []):\n",
+    "                key = obj['Key']\n",
+    "                if not key.endswith('/'):  # skip directories/folders\n",
+    "                    # Define file path locally in same structure\n",
+    "                    local_file_path = os.path.join(local_dir, key[len(prefix):])\n",
+    "                    local_file_dir = os.path.dirname(local_file_path)\n",
+    "\n",
+    "                    # Ensure local file directory exists\n",
+    "                    if not os.path.exists(local_file_dir):\n",
+    "                        os.makedirs(local_file_dir)\n",
+    "\n",
+    "                    # Download the file\n",
+    "                    self.s3_client.download_file(bucket_name, key, local_file_path)\n",
+    "                    if verbose:\n",
+    "                        print(f\"Downloaded {key} to {local_file_path}\")\n",
+    "\n",
+    "    def download_s3_file(self, bucket_name, key, local_dir, metadata=False, verbose=0):\n",
+    "        \"\"\"\n",
+    "        Download a specific file from an S3 bucket and optionally return its metadata.\n",
+    "\n",
+    "        :param bucket_name: str. Name of the S3 bucket.\n",
+    "        :param key: str. The key of the file in the S3 bucket.\n",
+    "        :param local_dir: str. Local directory to which the file will be downloaded.\n",
+    "        :param metadata: bool. If True, return the file's metadata; otherwise, return None.\n",
+    "        :param verbose: bool. If True, print the download status.\n",
+    "        :return: dict or None. Returns metadata of the file if metadata is True, otherwise None.\n",
+    "        \"\"\"\n",
+    "        # Define the local file path\n",
+    "        local_file_path = os.path.join(local_dir, os.path.basename(key))\n",
+    "\n",
+    "        # Ensure the local directory exists\n",
+    "        if not os.path.exists(local_dir):\n",
+    "            os.makedirs(local_dir)\n",
+    "\n",
+    "        # Download the file\n",
+    "        self.s3_client.download_file(bucket_name, key, local_file_path)\n",
+    "        if verbose:\n",
+    "            print(f\"Downloaded {key} to {local_file_path}\")\n",
+    "\n",
+    "        # Optionally retrieve and return metadata\n",
+    "        if metadata:\n",
+    "            response = self.s3_client.head_object(Bucket=bucket_name, Key=key)\n",
+    "            return response  # Return the metadata dictionary\n",
+    "        return None\n",
+    "\n",
+    "    def upload_file_to_s3(self, bucket_name, localfile_path, s3_key, metadata=None):\n",
+    "        \"\"\"\n",
+    "        Upload a file to an S3 bucket with optional metadata.\n",
+    "\n",
+    "        :param bucket_name: str. Name of the S3 bucket.\n",
+    "        :param localfile_path: str. Local path to the file to be uploaded.\n",
+    "        :param s3_key: str. S3 key (path within the bucket, including the file name) where the file will be stored.\n",
+    "        :param metadata: dict or None. Optional metadata for the file. Defaults to None.\n",
+    "        \"\"\"\n",
+    "        try:\n",
+    "            # Setup the file upload options\n",
+    "            extra_args = {}\n",
+    "            if metadata:\n",
+    "                extra_args[\"Metadata\"] = metadata\n",
+    "\n",
+    "            # Perform the file upload\n",
+    "            with open(localfile_path, 'rb') as file_data:\n",
+    "                self.s3_client.upload_fileobj(\n",
+    "                    Fileobj=file_data,\n",
+    "                    Bucket=bucket_name,\n",
+    "                    Key=s3_key,\n",
+    "                    ExtraArgs=extra_args\n",
+    "                )\n",
+    "            if metadata:\n",
+    "                print(f\"File uploaded successfully to {bucket_name}/{s3_key} with metadata {metadata}\")\n",
+    "            else:\n",
+    "                print(f\"File uploaded successfully to {bucket_name}/{s3_key}\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Failed to upload file: {e}\")\n",
+    "\n",
+    "    def inmemory_download_s3(self, bucket_name, key):\n",
+    "        \"\"\"\n",
+    "        Download a file from an S3 bucket directly into memory and return its raw bytes.\n",
+    "\n",
+    "        :param bucket_name: str. Name of the S3 bucket.\n",
+    "        :param key: str. The S3 object key of the file to download.\n",
+    "        :return: bytes. The file content.\n",
+    "        \"\"\"\n",
+    "        response = self.s3_client.get_object(Bucket=bucket_name, Key=key)\n",
+    "        file_content = response['Body'].read()\n",
+    "        return file_content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mys3 = S3FileHandler(aws_access_key_id, aws_secret_access_key)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### List all Buckets\n",
+    "Lists all S3 buckets accessible with the given credentials"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['buckettest0011',\n",
+       " 'candidate-proctoring',\n",
+       " 'sagemaker-ap-south-1-011528263565',\n",
+       " 'sagemaker-studio-011528263565-u1h3juay9nd',\n",
+       " 'sentiment-classification-fastapi']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mys3.list_s3_buckets()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### List all file objects\n",
+    "List all files in an S3 bucket, or within a specific prefix of the given bucket, along with each file's size.\n",
+    "\n",
+    ":param bucket_name: str. Name of the S3 bucket. \n",
+    ":param key: str. Specific prefix to list files from; defaults to \"\" (the whole bucket)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'test/test/line_profiling_results.txt': 921,\n",
+       " 'test/test/outer_function_profile.txt': 2845}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mys3.list_s3_objects(bucket_name=\"buckettest0011\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### S3 Folder Download\n",
+    "Download all files from an S3 bucket prefix to a local directory.\n",
+    "\n",
+    ":param bucket_name: str. Name of the S3 bucket. \n",
+    ":param local_dir: str. Local directory to which files will be downloaded. \n",
+    ":param prefix: str. Prefix path of the folder in the bucket. If empty, the whole bucket is downloaded. \n",
+    ":param verbose: bool. If True, display the download status"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloaded test/test/line_profiling_results.txt to ./test/line_profiling_results.txt\n",
+      "Downloaded test/test/outer_function_profile.txt to ./test/outer_function_profile.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "mys3.download_s3_folder(bucket_name=\"buckettest0011\", local_dir='.', prefix=\"test/test\", verbose=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### S3 File Download\n",
+    "Download a specific file from an S3 bucket and optionally return its metadata.\n",
+    "\n",
+    ":param bucket_name: str. Name of the S3 bucket. \n",
+    ":param key: str. The key of the file in the S3 bucket. \n",
+    ":param local_dir: str. Local directory to which the file will be downloaded. \n",
+    ":param metadata: bool. If True, return the file's metadata; otherwise, return None. \n",
+    ":param verbose: bool. If True, display the download status. \n",
+    ":return: dict or None. Returns metadata of the file if metadata is True, otherwise None."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'ResponseMetadata': {'RequestId': '4RT5YGB089R8ER6Y',\n",
+       "  'HostId': 'JJCRUZzdH+CUZ5enf4O4r1O2oqr7QFgbmff21q7d8NEgeDDTFTjYl2kH75m3vLp5FaTeA3syDNl8G73FW52w8g==',\n",
+       "  'HTTPStatusCode': 200,\n",
+       "  'HTTPHeaders': {'x-amz-id-2': 'JJCRUZzdH+CUZ5enf4O4r1O2oqr7QFgbmff21q7d8NEgeDDTFTjYl2kH75m3vLp5FaTeA3syDNl8G73FW52w8g==',\n",
+       "   'x-amz-request-id': '4RT5YGB089R8ER6Y',\n",
+       "   'date': 'Tue, 15 Oct 2024 11:47:02 GMT',\n",
+       "   'last-modified': 'Tue, 15 Oct 2024 09:40:40 GMT',\n",
+       "   'etag': '\"7c49753bd7d2109ce96bd2568ad8fbef\"',\n",
+       "   'x-amz-server-side-encryption': 'AES256',\n",
+       "   'x-amz-meta-author': 'XXXXX',\n",
+       "   'accept-ranges': 'bytes',\n",
+       "   'content-type': 'binary/octet-stream',\n",
+       "   'server': 'AmazonS3',\n",
+       "   'content-length': '2845'},\n",
+       "  'RetryAttempts': 0},\n",
+       " 'AcceptRanges': 'bytes',\n",
+       " 'LastModified': datetime.datetime(2024, 10, 15, 9, 40, 40, tzinfo=tzutc()),\n",
+       " 'ContentLength': 2845,\n",
+       " 'ETag': '\"7c49753bd7d2109ce96bd2568ad8fbef\"',\n",
+       " 'ContentType': 'binary/octet-stream',\n",
+       " 'ServerSideEncryption': 'AES256',\n",
+       " 'Metadata': {'author': 'XXXXX'}}"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mys3.download_s3_file(bucket_name=\"buckettest0011\", key=\"test/test/outer_function_profile.txt\", local_dir=\".\", metadata=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Uploading a file from local to S3, with or without metadata\n",
+    "Upload a file to an S3 bucket with optional metadata.\n",
+    "\n",
+    ":param bucket_name: str. Name of the S3 bucket. \n",
+    ":param localfile_path: str. Local path to the file to be uploaded. \n",
+    ":param s3_key: str. S3 key (path within the bucket, including the file name) where the file will be stored. \n",
+    ":param metadata: dict or None. Optional metadata for the file. Defaults to None.\n",
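+    "\n",
+    "Note: S3 stores user-defined metadata as `x-amz-meta-*` headers and returns the keys lowercased (as the outputs below show), so prefer lowercase metadata keys."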
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File uploaded successfully to buckettest0011/test/test/line_profiling_results.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "mys3.upload_file_to_s3(bucket_name=\"buckettest0011\",\n",
+    "                       localfile_path=\"/home/user/Documents/line_profiling_results.txt\",\n",
+    "                       s3_key=\"test/test/line_profiling_results.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File uploaded successfully to buckettest0011/test/test/line_profiling_results.txt with metadata {'author': 'xxxxx'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "metadata = {\"author\": \"xxxxx\"}\n",
+    "mys3.upload_file_to_s3(bucket_name=\"buckettest0011\",\n",
+    "                       localfile_path=\"/home/yravi/Documents/line_profiling_results.txt\",\n",
+    "                       s3_key=\"test/test/line_profiling_results.txt\",\n",
+    "                       metadata=metadata)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's download the uploaded file and check whether the metadata is present"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloaded test/test/line_profiling_results.txt to ./line_profiling_results.txt\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'ResponseMetadata': {'RequestId': '4RTFDPRWMCY0V3KB',\n",
+       "  'HostId': '7xhJWRbpiSDCoBpCusjp6HisKzqnC2ofYgK51LHD9lw+NYtromEd0wAipoM3qC8eXfdBmHKnOxSV8jkwz0yi1w==',\n",
+       "  'HTTPStatusCode': 200,\n",
+       "  'HTTPHeaders': {'x-amz-id-2': '7xhJWRbpiSDCoBpCusjp6HisKzqnC2ofYgK51LHD9lw+NYtromEd0wAipoM3qC8eXfdBmHKnOxSV8jkwz0yi1w==',\n",
+       "   'x-amz-request-id': '4RTFDPRWMCY0V3KB',\n",
+       "   'date': 'Tue, 15 Oct 2024 11:47:02 GMT',\n",
+       "   'last-modified': 'Tue, 15 Oct 2024 11:47:02 GMT',\n",
+       "   'etag': '\"5a627cd11fe9a0ec5877b4a4f0f33a62\"',\n",
+       "   'x-amz-server-side-encryption': 'AES256',\n",
+       "   'x-amz-meta-author': 'xxxxx',\n",
+       "   'accept-ranges': 'bytes',\n",
+       "   'content-type': 'binary/octet-stream',\n",
+       "   'server': 'AmazonS3',\n",
+       "   'content-length': '921'},\n",
+       "  'RetryAttempts': 0},\n",
+       " 'AcceptRanges': 'bytes',\n",
+       " 'LastModified': datetime.datetime(2024, 10, 15, 11, 47, 2, tzinfo=tzutc()),\n",
+       " 'ContentLength': 921,\n",
+       " 'ETag': '\"5a627cd11fe9a0ec5877b4a4f0f33a62\"',\n",
+       " 'ContentType': 'binary/octet-stream',\n",
+       " 'ServerSideEncryption': 'AES256',\n",
+       " 'Metadata': {'author': 'xxxxx'}}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mys3.download_s3_file(bucket_name=\"buckettest0011\", key=\"test/test/line_profiling_results.txt\", local_dir=\".\", metadata=True, verbose=1)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "luminaml",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/torch_snippets/_modidx.py b/torch_snippets/_modidx.py
index 58c6c5b..055f520 100644
--- a/torch_snippets/_modidx.py
+++ b/torch_snippets/_modidx.py
@@ -413,6 +413,22 @@
                                                                      'torch_snippets/registry.py'),
                                 'torch_snippets.registry.tryeval': ('registry.html#tryeval', 'torch_snippets/registry.py')},
             'torch_snippets.s3_loader': {},
+            'torch_snippets.s3_loader2': { 'torch_snippets.s3_loader2.S3FileHandler': ( 's3_loader2.html#s3filehandler',
+                                                                                        'torch_snippets/s3_loader2.py'),
+                                           'torch_snippets.s3_loader2.S3FileHandler.__init__': ( 's3_loader2.html#s3filehandler.__init__',
+                                                                                                 'torch_snippets/s3_loader2.py'),
+                                           'torch_snippets.s3_loader2.S3FileHandler.download_s3_file': ( 's3_loader2.html#s3filehandler.download_s3_file',
+                                                                                                          'torch_snippets/s3_loader2.py'),
+                                           'torch_snippets.s3_loader2.S3FileHandler.download_s3_folder': ( 's3_loader2.html#s3filehandler.download_s3_folder',
+                                                                                                            'torch_snippets/s3_loader2.py'),
+                                           'torch_snippets.s3_loader2.S3FileHandler.inmemory_download_s3': ( 's3_loader2.html#s3filehandler.inmemory_download_s3',
+                                                                                                              'torch_snippets/s3_loader2.py'),
+                                           'torch_snippets.s3_loader2.S3FileHandler.list_s3_buckets': ( 's3_loader2.html#s3filehandler.list_s3_buckets',
+                                                                                                         'torch_snippets/s3_loader2.py'),
+                                           'torch_snippets.s3_loader2.S3FileHandler.list_s3_objects': ( 's3_loader2.html#s3filehandler.list_s3_objects',
+                                                                                                         'torch_snippets/s3_loader2.py'),
+                                           'torch_snippets.s3_loader2.S3FileHandler.upload_file_to_s3': ( 's3_loader2.html#s3filehandler.upload_file_to_s3',
+                                                                                                           'torch_snippets/s3_loader2.py')},
             'torch_snippets.scp': {},
             'torch_snippets.sklegos': { 'torch_snippets.sklegos.Cat2Num': ('sklegos.html#cat2num', 'torch_snippets/sklegos.py'),
                                         'torch_snippets.sklegos.Cat2Num.__init__': ( 'sklegos.html#cat2num.__init__',
diff --git a/torch_snippets/adapters.py b/torch_snippets/adapters.py
index e00e699..f98be3c 100644
--- a/torch_snippets/adapters.py
+++ b/torch_snippets/adapters.py
@@ -1,17 +1,8 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/adapters.ipynb.
 
 # %% auto 0
-__all__ = [
-    "np_2_b64",
-    "b64_2_np",
-    "b64_2_file",
-    "bytes_2_file",
-    "file_2_bytes",
-    "csvs_2_cvat",
-    "cvat_2_csvs",
-    "df_2_yolo",
-    "yolo_2_df",
-]
+__all__ = ['np_2_b64', 'b64_2_np', 'b64_2_file', 'bytes_2_file', 'file_2_bytes', 'csvs_2_cvat', 'cvat_2_csvs', 'df_2_yolo',
+           'yolo_2_df']
 
 # %% ../nbs/adapters.ipynb 2
 import base64, cv2, numpy as np
@@ -20,7 +11,6 @@
 from .paths import P, stems, stem, parent, makedir
 from .markup import write_xml, read_xml, AttrDict
 
-
 # %% ../nbs/adapters.ipynb 3
 def np_2_b64(image: np.ndarray) -> str:
     """Convert a numpy image to base64 string"""
@@ -75,7 +65,6 @@
         output = f.read()
     return output
 
-
 # %% ../nbs/adapters.ipynb 4
 def _process(
     df: pd.DataFrame, label_column="readable_label", default_label="Background"
@@ -296,7 +285,6 @@
         except Exception as e:
             Warn(f'{e} @ {item["@name"]}')
 
-
 # %% ../nbs/adapters.ipynb 5
 def df_2_yolo(df, h, w, class2id, class_column):
     yolo_data = []
diff --git a/torch_snippets/bb_utils.py b/torch_snippets/bb_utils.py
index 444c07f..d4090c3 100644
--- a/torch_snippets/bb_utils.py
+++ b/torch_snippets/bb_utils.py
@@ -1,28 +1,9 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/bounding_boxes.ipynb.
 
 # %% auto 0
-__all__ = [
-    "randint",
-    "BB",
-    "df2bbs",
-    "bbs2df",
-    "bbfy",
-    "jitter",
-    "compute_eps",
-    "enlarge_bbs",
-    "shrink_bbs",
-    "iou",
-    "compute_distance_matrix",
-    "compute_distances",
-    "split_bb_to_xyXY",
-    "combine_xyXY_to_bb",
-    "is_absolute",
-    "is_relative",
-    "to_relative",
-    "to_absolute",
-    "merge_by_bb",
-    "isin",
-]
+__all__ = ['randint', 'BB', 'df2bbs', 'bbs2df', 'bbfy', 'jitter', 'compute_eps', 'enlarge_bbs', 'shrink_bbs', 'iou',
+           'compute_distance_matrix', 'compute_distances', 'split_bb_to_xyXY', 'combine_xyXY_to_bb', 'is_absolute',
+           'is_relative', 'to_relative', 'to_absolute', 'merge_by_bb', 'isin']
 
 # %% ../nbs/bounding_boxes.ipynb 2
 import numpy as np
@@ -168,7 +149,6 @@ def distances(self, other_bbs, threshold=None, direction=None):
         raise NotImplementedError("")
     return sorted(other_bbs, key=lambda obj: self.l2(obj[1]))
 
-
 # %% ../nbs/bounding_boxes.ipynb 8
 def df2bbs(df):
     """
@@ -303,7 +283,6 @@ def shrink_bbs(bbs, eps=0.2):
         for (x, y, X, Y), (h, w) in zip(bbs, shs)
     ]
 
-
 # %% ../nbs/bounding_boxes.ipynb 9
 def iou(bboxes1, bboxes2):
     """
@@ -384,7 +363,6 @@ def compute_distances(df1, df2, shrink_factors=(1, 1)):
     distances = compute_distance_matrix(bbs1, bbs2)
     return distances
 
-
 # %% ../nbs/bounding_boxes.ipynb 10
 def split_bb_to_xyXY(df):
     """
@@ -526,7 +504,6 @@ def to_absolute(df, height, width, force=False):
         df = combine_xyXY_to_bb(df)
     return df
 
-
 # %% ../nbs/bounding_boxes.ipynb 18
 def merge_by_bb(df1, df2, suffixes=("_x", "_y"), iou_threshold=0.1):
     """Merge df1 columns to df2 by using iou
diff --git a/torch_snippets/bokeh_loader.py b/torch_snippets/bokeh_loader.py
index 7bfd42c..23789c1 100644
--- a/torch_snippets/bokeh_loader.py
+++ b/torch_snippets/bokeh_loader.py
@@ -1,7 +1,9 @@
+"""For rapid prototyping bokeh charts"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/bokeh_plotting.ipynb.
 
 # %% auto 0
-__all__ = ["parse_sz", "get_bplot"]
+__all__ = ['parse_sz', 'get_bplot']
 
 # %% ../nbs/bokeh_plotting.ipynb 2
 from bokeh.io import output_notebook, show as bshow
@@ -23,7 +25,6 @@
 from bokeh.palettes import Spectral7
 import numpy as np
 
-
 # %% ../nbs/bokeh_plotting.ipynb 3
 def parse_sz(size):
     """
diff --git a/torch_snippets/charts.py b/torch_snippets/charts.py
index 171533a..3009dea 100644
--- a/torch_snippets/charts.py
+++ b/torch_snippets/charts.py
@@ -1,16 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/charts.ipynb.
 
 # %% auto 0
-__all__ = [
-    "alt",
-    "Chart",
-    "CM",
-    "radar",
-    "confusion_matrix",
-    "spider",
-    "upsetaltair_top_level_configuration",
-    "UpSetAltair",
-]
+__all__ = ['alt', 'Chart', 'CM', 'radar', 'confusion_matrix', 'spider', 'upsetaltair_top_level_configuration', 'UpSetAltair']
 
 # %% ../nbs/charts.ipynb 2
 import altair as alt
@@ -22,7 +13,6 @@
 
 init_plt()
 
-
 # %% ../nbs/charts.ipynb 5
 def confusion_matrix(df=None, truth=None, pred=None, mapping=None, save_to=None):
     """
@@ -96,7 +86,6 @@ def confusion_matrix(df=None, truth=None, pred=None, mapping=None, save_to=None):
 
 CM = confusion_matrix
 
-
 # %% ../nbs/charts.ipynb 12
 def spider(
     df,
@@ -193,7 +182,6 @@ def spider(
 
 radar = spider
 
-
 # %% ../nbs/charts.ipynb 14
 # Top-level altair configuration
 def upsetaltair_top_level_configuration(
diff --git a/torch_snippets/decorators.py b/torch_snippets/decorators.py
index f7fe43c..7aaa918 100644
--- a/torch_snippets/decorators.py
+++ b/torch_snippets/decorators.py
@@ -1,7 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/decorators.ipynb.
 
 # %% auto 0
-__all__ = ["format", "warn_on_fail", "timeit", "io", "check_kwargs_not_none"]
+__all__ = ['format', 'warn_on_fail', 'timeit', 'io', 'check_kwargs_not_none']
 
 # %% ../nbs/decorators.ipynb 2
 from functools import wraps
@@ -11,7 +11,6 @@
 from .markup import AD
 import time
 
-
 # %% ../nbs/decorators.ipynb 3
 def format(input):
     if isinstance(input, (list, tuple, set, L)):
diff --git a/torch_snippets/imgaug_loader.py b/torch_snippets/imgaug_loader.py
index 21094cb..50216f5 100644
--- a/torch_snippets/imgaug_loader.py
+++ b/torch_snippets/imgaug_loader.py
@@ -1,17 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/imgaug_loader.ipynb.
 
 # %% auto 0
-__all__ = [
-    "do",
-    "bw",
-    "rotate",
-    "pad",
-    "get_size",
-    "rescale",
-    "crop",
-    "imgaugbbs2bbs",
-    "bbs2imgaugbbs",
-]
+__all__ = ['do', 'bw', 'rotate', 'pad', 'get_size', 'rescale', 'crop', 'imgaugbbs2bbs', 'bbs2imgaugbbs']
 
 # %% ../nbs/imgaug_loader.ipynb 2
 import imgaug.augmenters as iaa
@@ -24,7 +14,6 @@
     to_absolute,
 )
 
-
 # %% ../nbs/imgaug_loader.ipynb 3
 def do(img, bbs=None, aug=None, cval=255):
     """
diff --git a/torch_snippets/inspector.py b/torch_snippets/inspector.py
index e745feb..c408cc2 100644
--- a/torch_snippets/inspector.py
+++ b/torch_snippets/inspector.py
@@ -1,13 +1,14 @@
+"""View statistics of tensors and other python containers"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/inspector.ipynb.
 
 # %% auto 0
-__all__ = ["inspect"]
+__all__ = ['inspect']
 
 # %% ../nbs/inspector.ipynb 2
 from .loader import *
 from .registry import AttrDict
 
-
 # %% ../nbs/inspector.ipynb 3
 def inspect(*arrays, **kwargs):
     """
diff --git a/torch_snippets/interactive_show.py b/torch_snippets/interactive_show.py
index c8a1dce..497cf83 100644
--- a/torch_snippets/interactive_show.py
+++ b/torch_snippets/interactive_show.py
@@ -1,18 +1,8 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/interactive_show.ipynb.
 
 # %% auto 0
-__all__ = [
-    "COLORS",
-    "to_networkx",
-    "plot_image",
-    "plot_graph",
-    "tonp",
-    "tolist",
-    "convert_to_nx",
-    "viz2",
-    "df2graph_nodes",
-    "ishow",
-]
+__all__ = ['COLORS', 'to_networkx', 'plot_image', 'plot_graph', 'tonp', 'tolist', 'convert_to_nx', 'viz2', 'df2graph_nodes',
+           'ishow']
 
 # %% ../nbs/interactive_show.ipynb 1
 from pathlib import Path
@@ -44,7 +34,6 @@
 
 output_notebook()
 
-
 # %% ../nbs/interactive_show.ipynb 2
 def to_networkx(
     data,
diff --git a/torch_snippets/ipython.py b/torch_snippets/ipython.py
index 10b412e..5504e6a 100644
--- a/torch_snippets/ipython.py
+++ b/torch_snippets/ipython.py
@@ -1,22 +1,10 @@
+"""Utilities specific to jupyter"""
+
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/jupyter_notebook.ipynb.
 
 # %% auto 0
-__all__ = [
-    "save_notebook",
-    "backup_this_notebook",
-    "backup_all_notebooks",
-    "backup_folders_of_nbs",
-    "display_dfs_side_by_side",
-    "show_big_dataframe",
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "store_scrap",
-    "shutdown_current_notebook",
-]
+__all__ = ['save_notebook', 'backup_this_notebook', 'backup_all_notebooks', 'backup_folders_of_nbs', 'display_dfs_side_by_side',
+           'show_big_dataframe', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'store_scrap', 'shutdown_current_notebook']
 
 # %% ../nbs/jupyter_notebook.ipynb 2
 import os, sys, json, time, hashlib
@@ -30,7 +18,6 @@
 # %% ../nbs/jupyter_notebook.ipynb 3
 # | export
 
-
 # %% ../nbs/jupyter_notebook.ipynb 4
 def save_notebook(file_path):
     from IPython.display import display, Javascript
@@ -124,7 +111,6 @@ def backup_folders_of_nbs(src, dest):
     for f in Glob(f"{dest}/*/changelog.md"):
         f.rm(confirm_prompt=False)
 
-
 # %% ../nbs/jupyter_notebook.ipynb 6
 def display_dfs_side_by_side(*args, titles=cycle([""]), max_rows=50):
     from IPython.display import display_html
@@ -151,7 +137,6 @@ def show_big_dataframe(df, max_rows=30):
     ):
         show(df, max_rows=max_rows, frame_count=2)
 
-
 # %% ../nbs/jupyter_notebook.ipynb 7
 def h1(text):
     from IPython.display import Markdown
@@ -188,7 +173,6 @@ def h6(text):
 
     show(Markdown(f"###### {text}"))
 
-
 # %% ../nbs/jupyter_notebook.ipynb 8
 @contextmanager
 def store_scrap(at):
@@ -203,7 +187,6 @@ def store_scrap(at):
     )
     return scrap
 
-
 # %% ../nbs/jupyter_notebook.ipynb 9
 # Function to shut down the current notebook session
 def shutdown_current_notebook(delay: int = None):
diff --git a/torch_snippets/load_defaults.py b/torch_snippets/load_defaults.py
index f92e6d2..ca88894 100644
--- a/torch_snippets/load_defaults.py
+++ b/torch_snippets/load_defaults.py
@@ -1,7 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/load_defautls.ipynb.
 
 # %% auto 0
-__all__ = ["ifnone", "exists", "loadifexists"]
+__all__ = ['ifnone', 'exists', 'loadifexists']
 
 # %% ../nbs/load_defautls.ipynb 2
 from .loader import os
diff --git a/torch_snippets/logger.py b/torch_snippets/logger.py
index 54dd6da..09202bd 100644
--- a/torch_snippets/logger.py
+++ b/torch_snippets/logger.py
@@ -1,33 +1,10 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/logging.ipynb.
 
 # %% auto 0
-__all__ = [
-    "console",
-    "reset_logger_width",
-    "logger",
-    "Trace",
-    "Debug",
-    "Info",
-    "Warn",
-    "Excep",
-    "warn_mode",
-    "info_mode",
-    "debug_mode",
-    "trace_mode",
-    "excep_mode",
-    "in_warn_mode",
-    "in_info_mode",
-    "in_debug_mode",
-    "in_trace_mode",
-    "in_excep_mode",
-    "frames",
-    "get_console",
-    "reset_logger",
-    "get_logger_level",
-    "logger_mode",
-    "in_logger_mode",
-    "notify_waiting",
-]
+__all__ = ['console', 'reset_logger_width', 'logger', 'Trace', 'Debug', 'Info', 'Warn', 'Excep', 'warn_mode', 'info_mode',
+           'debug_mode', 'trace_mode', 'excep_mode', 'in_warn_mode', 'in_info_mode', 'in_debug_mode', 'in_trace_mode',
+           'in_excep_mode', 'frames', 'get_console', 'reset_logger', 'get_logger_level', 'logger_mode',
+           'in_logger_mode', 'notify_waiting']
 
 # %% ../nbs/logging.ipynb 2
 import logging
@@ -47,7 +24,6 @@
 from functools import wraps
 import time
 
-
 # %% ../nbs/logging.ipynb 4
 def get_console(width=None):
     return Console(
@@ -70,7 +46,6 @@
 console = get_console()
 # print = console.print
 
-
 # %% ../nbs/logging.ipynb 5
 @patch_to(RichHandler)
 def render(
@@ -162,7 +137,6 @@ def reset_logger(
             exception=print_stack_trace, depth=depth + 1
         ).log("ERROR", x[0] if len(x) == 1 else "; ".join([str(i) for i in x]))
 
-
 # %% ../nbs/logging.ipynb 15
 def get_logger_level():
     """
diff --git a/torch_snippets/markup.py b/torch_snippets/markup.py
index 1e556d2..76d0994 100644
--- a/torch_snippets/markup.py
+++ b/torch_snippets/markup.py
@@ -1,26 +1,9 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/markups.ipynb.
 
 # %% auto 0
-__all__ = [
-    "AttrDict",
-    "json",
-    "Config",
-    "isnamedtupleinstance",
-    "unpack",
-    "hash_tensor",
-    "hash_pandas_dataframe",
-    "AttrDictDeprecated",
-    "decompose",
-    "pretty_json",
-    "read_json",
-    "write_json",
-    "write_jsonl",
-    "read_jsonl",
-    "read_yaml",
-    "write_yaml",
-    "read_xml",
-    "write_xml",
-]
+__all__ = ['AttrDict', 'json', 'Config', 'isnamedtupleinstance', 'unpack', 'hash_tensor', 'hash_pandas_dataframe',
+           'AttrDictDeprecated', 'decompose', 'pretty_json', 'read_json', 'write_json', 'write_jsonl', 'read_jsonl',
+           'read_yaml', 'write_yaml', 'read_xml', 'write_xml']
 
 # %% ../nbs/markups.ipynb 3
 import json, os
@@ -37,7 +20,6 @@
 from typing import Union
 from .thinc_parser.parser import Config
 
-
 # %% ../nbs/markups.ipynb 4
 def _default(self, obj):
     import numpy as np
@@ -433,7 +415,6 @@ def set_default(obj):
         return dump
     print(dump)
 
-
 # %% ../nbs/markups.ipynb 9
 json = json
 
@@ -459,7 +440,6 @@ def write_json(obj, fpath, silent=False):
         json.dump(obj, f, indent=4)
     return P(fpath)
 
-
 # %% ../nbs/markups.ipynb 12
 def write_jsonl(items, dest, mode="a"):
     makedir(parent(dest))
@@ -474,7 +454,6 @@
 def read_jsonl(file):
     return [json.loads(line) for line in readlines(file, silent=True)]
 
-
 # %% ../nbs/markups.ipynb 13
 def read_yaml(file):
     with open(file, "r") as stream:
@@ -488,7 +467,6 @@ def write_yaml(content, fpath):
     with open(fpath, "w") as outfile:
         yaml.dump(content, outfile, default_flow_style=False)
 
-
 # %% ../nbs/markups.ipynb 14
 def read_xml(file_path: Union[str, P]) -> AttrDict:
     "Read xml data as a dictionary"
@@ -506,6 +484,5 @@ def write_xml(data: Union[AttrDict, dict], file_path: Union[str, P]):
         data = xmltodict.unparse(data, pretty=True)
         xml_file.write(data)
 
-
 # %% ../nbs/markups.ipynb 15
 Config = Config
diff --git a/torch_snippets/misc.py b/torch_snippets/misc.py
index 364a6cf..a5a404c 100644
--- a/torch_snippets/misc.py
+++ b/torch_snippets/misc.py
@@ -1,7 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/misc.ipynb.
 
 # %% auto 0
-__all__ = ["Timer", "track2", "summarize_input", "timeit", "io", "tryy"]
+__all__ = ['Timer', 'track2', 'summarize_input', 'timeit', 'io', 'tryy']
 
 # %% ../nbs/misc.ipynb 2
 import time
@@ -11,7 +11,6 @@
 from fastcore.basics import ifnone
 from fastcore.foundation import L
 
-
 # %% ../nbs/misc.ipynb 3
 class Timer:
     def __init__(self, N, smooth=True, mode=1):
@@ -76,7 +75,6 @@ def track2(iterable, *, total=None):
         if info is not None:
             yield  # Just to ensure the send operation stops
 
-
 # %% ../nbs/misc.ipynb 10
 def summarize_input(args, kwargs, outputs=None):
     o = AD(args, kwargs)
@@ -135,7 +133,6 @@ def inner(*args, **kwargs):
     o = decorator(func)
     return o
 
-
 # %% ../nbs/misc.ipynb 18
 def tryy(
     func=None,
diff --git a/torch_snippets/paths.py b/torch_snippets/paths.py
index 1bcb00b..a7c9b92 100644
--- a/torch_snippets/paths.py
+++ b/torch_snippets/paths.py
@@ -1,45 +1,11 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/paths.ipynb.
 
 # %% auto 0
-__all__ = [
-    "valid_methods",
-    "P",
-    "ls",
-    "print_folder_summary",
-    "dill",
-    "input_to_str",
-    "output_to_path",
-    "process_f",
-    "get_fs",
-    "P0",
-    "stem",
-    "stems",
-    "extn",
-    "remove_file",
-    "isdir",
-    "makedir",
-    "fname",
-    "fname2",
-    "parent",
-    "Glob",
-    "find",
-    "zip_files",
-    "unzip_file",
-    "list_zip",
-    "md5",
-    "remove_duplicates",
-    "common_items",
-    "folder_summary",
-    "readlines",
-    "readfile",
-    "writelines",
-    "tree",
-    "folder_structure_to_dict",
-    "folder_structure_to_json",
-    "rename_batch",
-    "dumpdill",
-    "loaddill",
-]
+__all__ = ['valid_methods', 'P', 'ls', 'print_folder_summary', 'dill', 'input_to_str', 'output_to_path', 'process_f', 'get_fs',
+           'P0', 'stem', 'stems', 'extn', 'remove_file', 'isdir', 'makedir', 'fname', 'fname2', 'parent', 'Glob',
+           'find', 'zip_files', 'unzip_file', 'list_zip', 'md5', 'remove_duplicates', 'common_items', 'folder_summary',
+           'readlines', 'readfile', 'writelines', 'tree', 'folder_structure_to_dict', 'folder_structure_to_json',
+           'rename_batch', 'dumpdill', 'loaddill']
 
 # %% ../nbs/paths.ipynb 2
 from fastcore.basics import patch_to
@@ -65,7 +31,6 @@
 import subprocess
 import shutil
 
-
 # %% ../nbs/paths.ipynb 3
 def input_to_str(func):
     @wraps(func)
@@ -93,7 +58,6 @@ def inner(input, *args, **kwargs):
 
     return inner
 
-
 # %% ../nbs/paths.ipynb 4
 def process_f(f):
     f = f.replace("-", "_").replace(".", "__")
@@ -198,7 +162,6 @@ def __dir__(self):
     fs = get_fs(self)
     return self.__og_dir__() + list(fs.keys())
 
-
 # %% ../nbs/paths.ipynb 8
 @patch_to(P)
 def rmtree(self, prompt="Really remove `{self}` and its contents? [y/n] ", force=False):
@@ -284,7 +247,6 @@ def rm(
         if not silent:
             logger.info(f"Aborting delete: {self}")
 
-
 # %% ../nbs/paths.ipynb 21
 def isdir(fpath):
     return os.path.isdir(fpath)
@@ -364,7 +326,6 @@ def find(
     else:
         return filtered_items
 
-
 # %% ../nbs/paths.ipynb 23
 import zipfile
 import tarfile
@@ -403,7 +364,6 @@ def list_zip(file):
         elements.append(elem)
     return elements
 
-
 # %% ../nbs/paths.ipynb 25
 def md5(fname):
     hash_md5 = hashlib.md5()
@@ -451,7 +411,6 @@
 
 print_folder_summary = lambda x: print(folder_summary(x))
 
-
 # %% ../nbs/paths.ipynb 27
 def readlines(fpath, silent=False, encoding=None, _strip=True):
     with open(fpath, "r", encoding=encoding) as f:
@@ -498,7 +457,6 @@ def writelines(lines, file, mode):
 def write_lines(self, lines, mode):
     return writelines(lines, self, mode)
 
-
 # %% ../nbs/paths.ipynb 29
 def tree(directory="./", filelimit=50, to=None):
     from builtins import print
@@ -527,7 +485,6 @@ def _tree(self, filelimit=50, to=None):
 
 P.tree = P._tree
 
-
 # %% ../nbs/paths.ipynb 31
 def folder_structure_to_dict(path):
     """
@@ -554,7 +511,6 @@ def folder_structure_to_json(path, output_file=None):
     with open(output_file, "w") as f:
         json.dump(folder_dict, f, indent=4)
 
-
 # %% ../nbs/paths.ipynb 33
 def rename_batch(folder, func, debug=False, one_file=False):
     from torch_snippets.loader import now
@@ -580,7 +536,6 @@
         if one_file:
             break
 
-
 # %% ../nbs/paths.ipynb 34
 dill = dill
 
diff --git a/torch_snippets/pdf_loader.py b/torch_snippets/pdf_loader.py
index 110db1c..0e61d43 100644
--- a/torch_snippets/pdf_loader.py
+++ b/torch_snippets/pdf_loader.py
@@ -1,7 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/pdf.ipynb.
 
 # %% auto 0
-__all__ = ["PDF", "dump_pdf_images", "preview_pdf"]
+__all__ = ['PDF', 'dump_pdf_images', 'preview_pdf']
 
 # %% ../nbs/pdf.ipynb 2
 from .loader import np, subplots, show, resize, L, Image
@@ -10,7 +10,6 @@
 from fastcore.basics import ifnone
 from .cli import cli
 
-
 # %% ../nbs/pdf.ipynb 3
 class PDF:
     """Load a PDF file from `path` as a list of images
diff --git a/torch_snippets/profiler.py b/torch_snippets/profiler.py
index 6a1317f..3aca960 100644
--- a/torch_snippets/profiler.py
+++ b/torch_snippets/profiler.py
@@ -1,34 +1,31 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/profiler.ipynb.
 
 # %% auto 0
-__all__ = ["time_profiler"]
+__all__ = ['time_profiler']
 
 # %% ../nbs/profiler.ipynb 2
 import cProfile
 import pstats
 from io import StringIO
 
-
 # %% ../nbs/profiler.ipynb 3
-def time_profiler(filename="profiling_results.txt"):
+def time_profiler(filename='profiling_results.txt'):
     # The outer function that allows customization of the filename
     def decorator(func):
         # The middle function which receives the function to be wrapped
         def wrapper(*args, **kwargs):
             # The inner function that actually runs the profiling
             profiler = cProfile.Profile()
-            profiler.enable()  # Start profiling
+            profiler.enable() # Start profiling
             result = func(*args, **kwargs)
-            profiler.disable()  # End profiling
+            profiler.disable() # End profiling
 
             # Create a StringIO stream to capture profiling results
             s = StringIO()
-            ps = pstats.Stats(profiler, stream=s).sort_stats("cumulative")
+            ps = pstats.Stats(profiler, stream=s).sort_stats('cumulative')
             ps.print_stats()
 
             # Write the profiling results to the specified file
-            with open(filename, "w") as f:
+            with open(filename, 'w') as f:
                 f.write(s.getvalue())
 
             return result
-
         return wrapper
-
     return decorator
diff --git a/torch_snippets/registry.py b/torch_snippets/registry.py
index fe76c41..caaaafe 100644
--- a/torch_snippets/registry.py
+++ b/torch_snippets/registry.py
@@ -1,16 +1,7 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/registry.ipynb.
 
 # %% auto 0
-__all__ = [
-    "Config",
-    "AttrDict",
-    "registry",
-    "tryeval",
-    "parse_base",
-    "parse",
-    "parse_and_resolve",
-    "parse_string",
-]
+__all__ = ['Config', 'AttrDict', 'registry', 'tryeval', 'parse_base', 'parse', 'parse_and_resolve', 'parse_string']
 
 # %% ../nbs/registry.ipynb 2
 from .markup import Config, AttrDict, L
diff --git a/torch_snippets/s3_loader2.py b/torch_snippets/s3_loader2.py
new file mode 100644
index 0000000..e6cda31
--- /dev/null
+++ b/torch_snippets/s3_loader2.py
@@ -0,0 +1,160 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/s3_loader2.ipynb.
+
+# %% auto 0
+__all__ = ['S3FileHandler']
+
+# %% ../nbs/s3_loader2.ipynb 1
+import boto3
+import os
+import datetime
+from datetime import tzinfo
+from dateutil.tz import tzutc
+from . import stem, fname
+
+# %% ../nbs/s3_loader2.ipynb 3
+class S3FileHandler:
+    def __init__(self, aws_access_key, aws_secret_access_key):
+        self.s3_client = boto3.client('s3', aws_access_key_id=aws_access_key, aws_secret_access_key=aws_secret_access_key)
+
+    def list_s3_buckets(self):
+        """
+        List all S3 buckets accessible with the given credentials.
+        """
+        try:
+            # Call S3 to list current buckets
+            response = self.s3_client.list_buckets()
+            buckets = [bucket['Name'] for bucket in response['Buckets']]
+            return buckets
+        except Exception as e:
+            print(e)
+
+    def list_s3_objects(self, bucket_name, key=""):
+        """
+        List all files in an S3 bucket or within a specific prefix.
+
+        :param bucket_name: str. Name of the S3 bucket.
+        :param key: str. Specific prefix to list files from; defaults to "" (the whole bucket).
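+        :return: dict. Mapping of object key to its size in bytes.
+
+        Example (a minimal sketch; bucket and prefix are hypothetical)::
+
+            files = handler.list_s3_objects("my-bucket", key="reports/")
+            # {'reports/a.txt': 921, 'reports/b.txt': 2845}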
+ """ + try: + # Initialize a paginator for listing objects + paginator = self.s3_client.get_paginator('list_objects_v2') + # Use the paginator to fetch all objects in the specified bucket and prefix if provided + files = dict() + for page in paginator.paginate(Bucket=bucket_name, Prefix=key): + # Access the 'Contents' from the page, which lists the objects + if 'Contents' in page: + for obj in page['Contents']: + files[obj['Key']] = obj['Size'] + # print(f"{obj['Key']} ({obj['Size']} bytes)") + return files + except Exception as e: + print(f"An error occurred: {e}") + + def download_s3_folder(self, bucket_name, local_dir, prefix="", verbose=0): + """ + Download all files from an S3 bucket prefix to a local directory. + + :param bucket_name: str. Name of the S3 bucket. + :param local_dir: str. Local directory to which files will be downloaded. + :param prefix: str or None. Prefix path of the folder in the bucket. If None, the whole bucket is downloaded. + """ + if not prefix.endswith("/"): + prefix = prefix + "/" + # Ensure local directory exists + if prefix == "": + local_dir = os.path.join(local_dir, bucket_name) + else: + local_dir = os.path.join(local_dir, stem(prefix)) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + # List objects within the specified prefix + paginator = self.s3_client.get_paginator('list_objects_v2') + for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix): + for obj in page.get('Contents', []): + key = obj['Key'] + if not key.endswith('/'): # skip directories/folders + # Define file path locally in same structure + local_file_path = os.path.join(local_dir, key[len(prefix):]) + local_file_dir = os.path.dirname(local_file_path) + + # Ensure local file directory exists + if not os.path.exists(local_file_dir): + os.makedirs(local_file_dir) + + # Download the file + self.s3_client.download_file(bucket_name, key, local_file_path) + if verbose: + print(f"Downloaded {key} to {local_file_path}") + + def download_s3_file(self, bucket_name, key, local_dir, metadata=False, verbose=0): + """ + Download a specific file from an S3 bucket and optionally return its metadata. + + :param bucket_name: str. Name of the S3 bucket. + :param key: str. The key of the file in the S3 bucket. + :param local_dir: str. Local directory to which the file will be downloaded. + :param metadata: bool. If True, return the file's metadata; otherwise, return None. + :param verbose: bool. + :return: dict or None. Returns metadata of the file if metadata is True, otherwise None. + """ + # Define the local file path + local_file_path = os.path.join(local_dir, os.path.basename(key)) + + # Ensure the local directory exists + if not os.path.exists(local_dir): + os.makedirs(local_dir) + + # Download the file + self.s3_client.download_file(bucket_name, key, local_file_path) + if verbose: + print(f"Downloaded {key} to {local_file_path}") + + # Optionally retrieve and return metadata + if metadata: + response = self.s3_client.head_object(Bucket=bucket_name, Key=key) + return response # Return the metadata dictionary + return None + + def upload_file_to_s3(self, bucket_name, localfile_path, s3_key, metadata=None): + """ + Upload a file to an S3 bucket with optional metadata. + + :param bucket_name: str. Name of the S3 bucket. + :param localfile_path: str. Local path to the file to be uploaded. + :param s3_key: str. S3 key (path within the bucket) where the file will be stored with file name included. + :param metadata: dict or None. Optional metadata for the file. Defaults to None. 
+ """ + try: + # Setup the file upload options + extra_args = {} + if metadata: + extra_args["Metadata"] = metadata + + # Perform the file upload + with open(localfile_path, 'rb') as file_data: + self.s3_client.upload_fileobj( + Fileobj=file_data, + Bucket=bucket_name, + Key=s3_key, + ExtraArgs=extra_args + ) + if metadata: + print(f"File uploaded successfully to {bucket_name}/{s3_key} with metadata {metadata}") + else: + print(f"File uploaded successfully to {bucket_name}/{s3_key}") + except Exception as e: + print(f"Failed to upload file: {e}") + + def inmemory_download_s3(bucket_name, key): + """ + Downloads a file from an Amazon S3 bucket and loads it directly into a pandas DataFrame. + The function automatically detects the file format based on its extension. + + Parameters: + key (str): The S3 object key of the file to download. + bucket (str, optional): The name of the S3 bucket. Defaults to AWS_BUCKET from .env if not provided. + """ + response = self.s3_client.get_object(Bucket=bucket_name, Key=key) + file_content = response['Body'].read() + return file_content diff --git a/torch_snippets/sklegos.py b/torch_snippets/sklegos.py index c3a1b1c..c3afa41 100644 --- a/torch_snippets/sklegos.py +++ b/torch_snippets/sklegos.py @@ -1,17 +1,8 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/sklegos.ipynb. # %% auto 0 -__all__ = [ - "ColumnSelector", - "GroupedPredictor", - "EstimatorTransformer", - "train_test_split", - "MakeFrame", - "ImputeMissingValues", - "LambdaTransformer", - "Cat2Num", - "SplitDateColumn", -] +__all__ = ['ColumnSelector', 'GroupedPredictor', 'EstimatorTransformer', 'train_test_split', 'MakeFrame', 'ImputeMissingValues', + 'LambdaTransformer', 'Cat2Num', 'SplitDateColumn'] # %% ../nbs/sklegos.ipynb 3 from . import * @@ -28,7 +19,6 @@ def train_test_split(*args, **kwargs): outputs = [i.reset_index(drop=True) for i in outputs] return outputs - # %% ../nbs/sklegos.ipynb 4 from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin @@ -63,7 +53,6 @@ def transform(self, X, y=None): EstimatorTransformer = EstimatorTransformer - # %% ../nbs/sklegos.ipynb 6 class MakeFrame(BaseEstimator, TransformerMixin): """Convert sklearn's output to a pandas dataframe @@ -79,7 +68,6 @@ def fit(self, X, y=None): def transform(self, X, y=None): return pd.DataFrame(X, columns=self.column_names) - # %% ../nbs/sklegos.ipynb 8 class ImputeMissingValues(BaseEstimator, TransformerMixin): """DataFrame input - DataFrame output @@ -127,7 +115,6 @@ def transform(self, X, y=None): def fit_transform(self, trn_df, y=None): return self.transform(self.fit(trn_df, y)) - # %% ../nbs/sklegos.ipynb 9 class LambdaTransformer(BaseEstimator, TransformerMixin): def __init__(self, fn): @@ -148,7 +135,6 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): return self.fit(self.transform(X)) - # %% ../nbs/sklegos.ipynb 10 class MakeFrame(BaseEstimator, TransformerMixin): def __init__(self, column_names): @@ -166,7 +152,6 @@ def predict_proba(self, X, y=None): def predict(self, X, y=None): return self.transform(X) - # %% ../nbs/sklegos.ipynb 11 class Cat2Num(BaseEstimator, TransformerMixin): def __init__(self): ... 
@@ -188,7 +173,6 @@ def transform(self, df, y=None):
     def fit_transform(self, trn_df, y=None):
         return self.transform(self.fit(trn_df, y))
 
-
 # %% ../nbs/sklegos.ipynb 12
 class SplitDateColumn(BaseEstimator, TransformerMixin):
     def __init__(self, column_names, has_date, has_time, date_format=None):
diff --git a/torch_snippets/trainer/capsule.py b/torch_snippets/trainer/capsule.py
index 1de2c0e..928cf83 100644
--- a/torch_snippets/trainer/capsule.py
+++ b/torch_snippets/trainer/capsule.py
@@ -1,11 +1,10 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/capsule.ipynb.
 
 # %% auto 0
-__all__ = ["to", "train", "validate", "predict", "Capsule"]
+__all__ = ['to', 'train', 'validate', 'predict', 'Capsule']
 
 # %% ../../nbs/capsule.ipynb 2
 from .. import init_torch
-
 init_torch()
 
 from functools import wraps
@@ -21,7 +20,6 @@
 except ImportError:
     DataContainer = None
 
-
 # %% ../../nbs/capsule.ipynb 3
 def to(item, device):
     if item is None:
@@ -80,7 +78,6 @@ def _predict(self, *args, **kwargs):
 
     return _predict
 
-
 # %% ../../nbs/capsule.ipynb 4
 class Capsule(nn.Module):
     """
diff --git a/torch_snippets/trainer/config.py b/torch_snippets/trainer/config.py
index bb23b87..cb6f8ad 100644
--- a/torch_snippets/trainer/config.py
+++ b/torch_snippets/trainer/config.py
@@ -1,14 +1,13 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/config.ipynb.
 
 # %% auto 0
-__all__ = ["DeepLearningConfig", "GenericConfig"]
+__all__ = ['DeepLearningConfig', 'GenericConfig']
 
 # %% ../../nbs/config.ipynb 1
 from ..registry import parse
 from .. import store_attr, ifnone
 import inspect as inspect_builtin
 
-
 # %% ../../nbs/config.ipynb 2
 class DeepLearningConfig:
     """
@@ -58,7 +57,6 @@ def from_ini_file(cls, filepath, *, config_root=None):
     def __repr__(self):
         return f"{self.__class__.__name__}:\n" + str({**self})
 
-
 # %% ../../nbs/config.ipynb 7
 class GenericConfig(DeepLearningConfig):
     def __init__(self, **kwargs):