# %%
from typing import Callable, List, Optional, Sequence

import pandas as pd


# %%
def choose_stream(
    df: pd.DataFrame,
    excluded: Sequence[str] = ("created_at", "entry_id"),
    min_selection: int = 3,
    *,
    input_fn: Optional[Callable[[str], str]] = None,
) -> List[str]:
    """
    Interactively ask the user to pick data streams (columns) from a dataframe.

    - Skips excluded columns (e.g., metadata like 'created_at', 'entry_id')
    - Enforces a minimum number of selected columns
    - Validates every entered name against the selectable columns
    - Trims whitespace, drops empty entries and removes duplicates
      (first occurrence wins, order is preserved)
    - Re-prompts until the input passes all checks

    Parameters:
        df (pd.DataFrame): The dataset; only ``df.columns`` is read
        excluded (Sequence[str]): Columns to hide from the selection
        min_selection (int): Minimum number of columns the user must pick
        input_fn (Callable[[str], str] | None): Function used to read one
            line of user input for a given prompt. Defaults to the builtin
            ``input``; pass a custom callable for non-interactive use.

    Returns:
        List[str]: Validated list of selected column names

    Raises:
        ValueError: If fewer than ``min_selection`` columns remain after
            exclusion, since no user input could ever be accepted.
    """
    # Resolve the reader at call time so the builtin is only a fallback.
    read_line = input if input_fn is None else input_fn

    # Filter available columns
    available_fields = [col for col in df.columns if col not in excluded]

    # Fail fast: otherwise the prompt loop below could never terminate.
    if len(available_fields) < min_selection:
        raise ValueError(
            f"Only {len(available_fields)} selectable field(s) available "
            f"({', '.join(map(str, available_fields))}), "
            f"but min_selection is {min_selection}."
        )

    # Display fields (the banner reflects the real minimum, not a fixed 3)
    print(f"\nAvailable fields (select at least {min_selection}):")
    print("-" * 40)
    for col in available_fields:
        print(f"- {col}")
    print("-" * 40)

    # Interactive loop: only a fully valid answer leaves it
    while True:
        user_input = read_line(
            f"Enter at least {min_selection} field names separated by commas: "
        )

        # Clean and normalize input
        stripped = (part.strip() for part in user_input.split(","))
        # dict.fromkeys removes duplicates while preserving first-seen order
        selected = list(dict.fromkeys(name for name in stripped if name))

        # Validation checks
        if len(selected) < min_selection:
            print(f"Error: Please select at least {min_selection} fields.")
            continue

        invalid = [col for col in selected if col not in available_fields]
        if invalid:
            print(f"Error: Invalid field(s): {', '.join(invalid)}")
            print(f"Valid options are: {', '.join(map(str, available_fields))}")
            continue

        return selected


# %%
# Guarded so that importing this code does not trigger file I/O or a prompt;
# inside a notebook kernel __name__ is "__main__", so the cell still runs.
if __name__ == "__main__":
    df = pd.read_csv("./datasets/2881821.csv")
    selected_streams = choose_stream(df)
    print("Selected Streams : ", selected_streams)