Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions data_science/development/choose_stream.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "5ddebc79",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "20a6c94b",
"metadata": {},
"outputs": [],
"source": [
"from typing import List\n",
"\n",
"def choose_stream(\n",
" df,\n",
" excluded: List[str] = [\"created_at\", \"entry_id\"],\n",
" min_selection: int = 3\n",
") -> List[str]:\n",
" \"\"\"\n",
" Allows user to select data streams (columns) from a dataframe interactively.\n",
"\n",
" - Skips excluded columns (e.g., metadata like 'created_at', 'entry_id')\n",
" - Enforces minimum selection (default = 3)\n",
" - Validates input against available fields\n",
" - Removes duplicates and trims whitespace\n",
"\n",
" Parameters:\n",
" df (pd.DataFrame): The dataset\n",
" excluded (List[str]): Columns to ignore for selection\n",
" min_selection (int): Minimum required columns\n",
"\n",
" Returns:\n",
" List[str]: Validated list of selected column names\n",
" \"\"\"\n",
"\n",
" # Filter available columns\n",
" available_fields = [col for col in df.columns if col not in excluded]\n",
"\n",
" # Display fields\n",
" print(\"\\nAvailable fields (select at least 3):\")\n",
" print(\"-\" * 40)\n",
" for col in available_fields:\n",
" print(f\"- {col}\")\n",
" print(\"-\" * 40)\n",
"\n",
" # Interactive loop\n",
" while True:\n",
" user_input = input(f\"Enter at least {min_selection} field names separated by commas: \")\n",
"\n",
" # Clean and normalize input\n",
" selected = [col.strip() for col in user_input.split(\",\") if col.strip()]\n",
" selected = list(dict.fromkeys(selected)) # Remove duplicates (preserve order)\n",
"\n",
" # Validation checks\n",
" if len(selected) < min_selection:\n",
" print(f\"Error: Please select at least {min_selection} fields.\")\n",
" continue\n",
"\n",
" invalid = [col for col in selected if col not in available_fields]\n",
" if invalid:\n",
" print(f\"Error: Invalid field(s): {', '.join(invalid)}\")\n",
" print(f\"Valid options are: {', '.join(available_fields)}\")\n",
" continue\n",
"\n",
" return selected\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9ccb8657",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Available fields (select at least 3):\n",
"----------------------------------------\n",
"- field1\n",
"- field2\n",
"- field3\n",
"- field4\n",
"- field5\n",
"- field6\n",
"- field7\n",
"- field8\n",
"----------------------------------------\n",
"Selected Streams : ['field1', 'field2', 'field3']\n"
]
}
],
"source": [
"df = pd.read_csv(\"./datasets/2881821.csv\")\n",
"selected_streams = choose_stream(df)\n",
"print(\"Selected Streams : \", selected_streams)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "sit720_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.21"
}
},
"nbformat": 4,
"nbformat_minor": 5
}