import pandas as pd


def choose_stream(df):
    """
    Interactively ask the user to pick at least three columns of a dataframe.

    The available column names are printed once, then the user is prompted
    repeatedly until the answer passes every check:

    - surrounding whitespace is trimmed and empty entries (e.g. from a
      trailing comma) are ignored
    - repeated names are collapsed, so the minimum applies to *distinct*
      columns; the order of first appearance is kept
    - at least three names must remain
    - every name must be an existing column of ``df``

    Parameters:
        df (pd.DataFrame): The dataset; only ``df.columns`` is read

    Returns:
        list: The validated column names, in the order the user typed them

    Raises:
        ValueError: If ``df`` has fewer than three columns, because no answer
            could ever satisfy the prompt
    """
    min_columns = 3
    available = list(df.columns)

    # Fail fast instead of trapping the user in a prompt that cannot succeed.
    if len(available) < min_columns:
        raise ValueError(
            f"At least {min_columns} columns are required, "
            f"but only {len(available)} are available."
        )

    # display available column names
    print("Available columns")
    for col in available:
        print(f"- {col}")

    while True:
        # get user input for column names
        user_input = input("Enter at least 3 column names separated by commas : ")
        stripped = (part.strip() for part in user_input.split(","))
        # dict.fromkeys drops repeats while keeping first-seen order
        selected_columns = list(dict.fromkeys(name for name in stripped if name))

        # validate selected columns
        if len(selected_columns) < min_columns:
            print("Please select at least 3 columns.")
            continue
        if len(selected_columns) > len(available):
            print(f"Error: Cannot select more than available columns ({len(available)}).")
            continue

        # validate column names
        invalid_columns = [col for col in selected_columns if col not in available]
        if invalid_columns:
            print(f"Error: Invalid column names ({', '.join(invalid_columns)}).")
            continue

        return selected_columns
"outputs": [], + "source": [ + "from typing import List\n", + "\n", + "def choose_stream(\n", + " df,\n", + " excluded: List[str] = [\"created_at\", \"entry_id\"],\n", + " min_selection: int = 3\n", + ") -> List[str]:\n", + " \"\"\"\n", + " Allows user to select data streams (columns) from a dataframe interactively.\n", + "\n", + " - Skips excluded columns (e.g., metadata like 'created_at', 'entry_id')\n", + " - Enforces minimum selection (default = 3)\n", + " - Validates input against available fields\n", + " - Removes duplicates and trims whitespace\n", + "\n", + " Parameters:\n", + " df (pd.DataFrame): The dataset\n", + " excluded (List[str]): Columns to ignore for selection\n", + " min_selection (int): Minimum required columns\n", + "\n", + " Returns:\n", + " List[str]: Validated list of selected column names\n", + " \"\"\"\n", + "\n", + " # Filter available columns\n", + " available_fields = [col for col in df.columns if col not in excluded]\n", + "\n", + " # Display fields\n", + " print(\"\\nAvailable fields (select at least 3):\")\n", + " print(\"-\" * 40)\n", + " for col in available_fields:\n", + " print(f\"- {col}\")\n", + " print(\"-\" * 40)\n", + "\n", + " # Interactive loop\n", + " while True:\n", + " user_input = input(f\"Enter at least {min_selection} field names separated by commas: \")\n", + "\n", + " # Clean and normalize input\n", + " selected = [col.strip() for col in user_input.split(\",\") if col.strip()]\n", + " selected = list(dict.fromkeys(selected)) # Remove duplicates (preserve order)\n", + "\n", + " # Validation checks\n", + " if len(selected) < min_selection:\n", + " print(f\"Error: Please select at least {min_selection} fields.\")\n", + " continue\n", + "\n", + " invalid = [col for col in selected if col not in available_fields]\n", + " if invalid:\n", + " print(f\"Error: Invalid field(s): {', '.join(invalid)}\")\n", + " print(f\"Valid options are: {', '.join(available_fields)}\")\n", + " continue\n", + "\n", + " return selected\n" + ] 
# Read the sample dataset into a dataframe.
df = pd.read_csv("./datasets/2881821.csv")

# Prompt for the streams, keep the answer in `selected_streams`, and echo it.
print("Selected Streams : ", (selected_streams := choose_stream(df)))
True:\n", - " \n", - " # get user input for column names\n", - " user_input = input(\"Enter atleast 3 column names seperated by commas : \")\n", - " selected_columns = [col.strip() for col in user_input.split(\",\")]\n", - "\n", - " # validate selected columns\n", - " if(len(selected_columns)<3):\n", - " print(\"Please select atleast 3 columns.\")\n", - " continue\n", - " if(len(selected_columns)>len(df.columns)):\n", - " print(f\"Error: Cannot select more than available columns ({len(df.columns)}).\")\n", - " continue\n", - "\n", - " #validate column names\n", - " invalid_columns = [col for col in selected_columns if col not in df.columns]\n", - " if invalid_columns:\n", - " print(f\"Error: Invalid column names ({', '.join(invalid_columns)}).\")\n", - " continue\n", - "\n", - " return selected_columns" - ] - }, { "cell_type": "code", "execution_count": 5,