Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ ci:
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/kynan/nbstripout
rev: 0.7.1
rev: 0.8.1
hooks:
- id: nbstripout
# - repo: https://github.com/cmhughes/latexindent.pl.git
# rev: V3.19.1
# hooks:
# - id: latexindent
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
Expand All @@ -43,7 +43,7 @@ repos:
# # Similar to: https://stackoverflow.com/a/73603491/5755604
# additional_dependencies: ['types-PyYAML']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.9
rev: v0.8.0
hooks:
- id: ruff
args:
Expand Down
22 changes: 10 additions & 12 deletions notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"ProgressBar.enable()\n",
"\n",
"import wandb\n",
"from tqdm.auto import tqdm\n"
"from tqdm.auto import tqdm"
]
},
{
Expand All @@ -47,7 +47,7 @@
"FILE_PATH_INPUT = (\n",
" \"gs://thesis-bucket-option-trade-classification/data/raw/matched_cboe_quotes.csv\"\n",
")\n",
"FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\"\n"
"FILE_PATH_OUTPUT = \"gs://thesis-bucket-option-trade-classification/data/preprocessed/\""
]
},
{
Expand All @@ -58,7 +58,7 @@
"source": [
"os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n",
"credentials, _ = google.auth.default()\n",
"fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)\n"
"fs = gcsfs.GCSFileSystem(project=\"thesis\", token=credentials)"
]
},
{
Expand All @@ -76,7 +76,7 @@
"source": [
"# connect to weights and biases\n",
"run = wandb.init(project=\"thesis\", job_type=\"dataset-creation\", entity=\"fbv\")\n",
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")\n"
"dataset = wandb.Artifact(name=f\"{EXCHANGE}_{STRATEGY}_csv\", type=\"raw_data\")"
]
},
{
Expand All @@ -88,8 +88,7 @@
"outputs": [],
"source": [
"def import_data(input_file: str) -> pd.DataFrame:\n",
" \"\"\"\n",
" create a dataframe and optimize its memory usage.\n",
" \"\"\"Create a dataframe and optimize its memory usage.\n",
"\n",
" I.e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n",
" of unique columns and chunking to enable import.\n",
Expand Down Expand Up @@ -189,7 +188,7 @@
"\n",
" format = \"%d%b%y:%H:%M:%S\"\n",
" df[\"QUOTE_DATETIME\"] = pd.to_datetime(df[\"QUOTE_DATETIME\"], format=format)\n",
" return df\n"
" return df"
]
},
{
Expand All @@ -203,8 +202,7 @@
"def df_to_parquet(\n",
" x: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n",
") -> None:\n",
" \"\"\"\n",
" Write pd.DataFrame to parquet format.\n",
" \"\"\"Write pd.DataFrame to parquet format.\n",
"\n",
" Args:\n",
" x (pd.DataFrame): input dataframe.\n",
Expand All @@ -222,7 +220,7 @@
" slc.to_parquet(output_path, **parquet_wargs)\n",
"\n",
" # log in w & b\n",
" dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")\n"
" dataset.add_reference(output_path, name=f\"raw_parquet_{chunk:04d}\")"
]
},
{
Expand Down Expand Up @@ -805,7 +803,7 @@
"client = Client()\n",
"\n",
"df = import_data(FILE_PATH_INPUT)\n",
"df_to_parquet(df, FILE_PATH_OUTPUT)\n"
"df_to_parquet(df, FILE_PATH_OUTPUT)"
]
},
{
Expand Down Expand Up @@ -833,7 +831,7 @@
"source": [
"# Log the artifact to save it as an output of this run\n",
"run.log_artifact(dataset)\n",
"wandb.finish()\n"
"wandb.finish()"
]
}
],
Expand Down
Loading