From 857bb2fd8198b4d1f07803d6ba3caada7125bc32 Mon Sep 17 00:00:00 2001 From: sjasthi Date: Mon, 8 May 2023 11:00:11 -0500 Subject: [PATCH] FP #3: data validation, duplicate recognition - are integrated --- python101_2023_gas_prices_project.ipynb | 562 +++++++++++++----------- 1 file changed, 313 insertions(+), 249 deletions(-) diff --git a/python101_2023_gas_prices_project.ipynb b/python101_2023_gas_prices_project.ipynb index 77021bb..d8cd6c1 100644 --- a/python101_2023_gas_prices_project.ipynb +++ b/python101_2023_gas_prices_project.ipynb @@ -4,7 +4,6 @@ "metadata": { "colab": { "provenance": [], - "toc_visible": true, "include_colab_link": true }, "kernelspec": { @@ -47,32 +46,10 @@ "!wget https://raw.githubusercontent.com/sjasthi/python_input_files/main/gasprices.txt" ], "metadata": { - "id": "cB1aEknPb3bT", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "f3abc923-48da-494d-be92-b7a5855f4572" + "id": "cB1aEknPb3bT" }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2023-05-08 15:49:56-- https://raw.githubusercontent.com/sjasthi/python_input_files/main/gasprices.txt\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 19059 (19K) [text/plain]\n", - "Saving to: ‘gasprices.txt’\n", - "\n", - "gasprices.txt 100%[===================>] 18.61K --.-KB/s in 0.001s \n", - "\n", - "2023-05-08 15:49:57 (12.4 MB/s) - ‘gasprices.txt’ saved [19059/19059]\n", - "\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -131,7 +108,7 @@ "metadata": { "id": "37w-n6jcjlM_" }, - "execution_count": 2, + "execution_count": null, "outputs": [] }, { @@ -181,6 +158,12 @@ " for s in file_lines:\n", " m_d_y_list = s.split('-')\n", " y_p_list = m_d_y_list[2].split(':')\n", + "\n", + " # Once we split by :, we should be getting exactly two tokens.\n", + " # Otherwise, ignore the line\n", + " if (len(y_p_list)!=2):\n", + " continue\n", + "\n", " price = y_p_list[1].strip('\\n')\n", "\n", " m_d_y_p_list = []\n", @@ -195,6 +178,13 @@ " m_d_y_p_list.append(price)\n", "\n", " #update the prices master list\n", + " # only if the m_d_y_p_list doesn't exist in the master list\n", + " # this helps us to avoid the duplicates\n", + " if m_d_y_p_list in prices_master_list:\n", + " continue\n", + "\n", + " # if it is not in the master list,\n", + " # we will continue updating our data structures\n", " prices_master_list.append(m_d_y_p_list)\n", "\n", " # update the year dictionary\n", @@ -230,22 +220,10 @@ " print(\"gasprices.txt data is processed\")" ], "metadata": { - "id": "uqkxR9bLitmI", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "fa9ef299-b12d-43d6-f439-082d044b1e49" + "id": "uqkxR9bLitmI" }, - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "gasprices.txt data is processed\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -296,22 +274,10 @@ " print(\"get_average_price_per_year( ) is regisered\")" ], "metadata": { - "id": "a-23MFiOixer", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "bce076d8-a24e-47fe-e37b-2c335a1b53b2" + "id": "a-23MFiOixer" }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "get_average_price_per_year( ) is regisered\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -330,42 +296,10 @@ " print(year,average_price)" ], "metadata": { - "id": "BlNmOIfegF_O", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "c4487c8e-1e4b-47d4-fd24-198d9ec69bd8" + "id": "BlNmOIfegF_O" }, - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "1993 1.0711538461538466\n", - "1994 1.0778653846153845\n", - "1995 1.1577115384615386\n", - "1996 1.2445283018867925\n", - "1997 1.2442499999999999\n", - "1998 1.071711538461538\n", - "1999 1.1760576923076924\n", - "2000 1.522730769230769\n", - "2001 1.4603018867924529\n", - "2002 1.385961538461538\n", - "2003 1.603019230769231\n", - "2004 1.8946923076923083\n", - "2005 2.314461538461538\n", - "2006 2.6182692307692315\n", - "2007 2.8434716981132078\n", - "2008 3.2989038461538462\n", - "2009 2.4058269230769236\n", - "2010 2.835057692307693\n", - "2011 3.576423076923077\n", - "2012 3.6796415094339627\n", - "2013 3.651441176470588\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -412,33 +346,10 @@ "# Bug 2 : The average price needs to be rounded off to a.bc (2 digts after the decimal)" ], "metadata": { - "id": "obufOlRXixoV", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "8a9f5b2c-da58-4c85-9320-4da07c585316" + "id": "obufOlRXixoV" }, - "execution_count": 6, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "1 1.8996666666666664\n", - "2 1.9232705882352943\n", - "3 1.9622209302325573\n", - "4 2.0024971264367806\n", - "5 2.031773755656108\n", - "6 2.052775894538607\n", - "7 2.0668349358974365\n", - "8 2.075001394700141\n", - "9 2.074043640897756\n", - "10 2.0694983164983176\n", - "11 2.0559222108495407\n", - "12 2.0399230046948365\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -497,34 +408,10 @@ "get_average_prices_per_month_dictionary()" ], "metadata": { - "id": "pJG1MDwegMz7", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "2a770297-ef7c-4434-b9ee-d5ba6e3b6805" + "id": "pJG1MDwegMz7" }, - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Month - Average Price\n", - "April : 2.12\n", - "May : 2.14\n", - "June : 2.16\n", - "July : 2.15\n", - "August : 2.13\n", - "September : 2.07\n", - "October : 2.03\n", - "November : 1.92\n", - "December : 1.86\n", - "January : 1.9\n", - "February : 1.95\n", - "March : 2.04\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -569,43 +456,10 @@ "#Bug 3: The output could use escape strings /t (tabbed separation) " ], "metadata": { - "id": "6MfaQaIGixzT", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "610aa542-2c18-452a-8e1f-399b53a972cc" + "id": "6MfaQaIGixzT" }, - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Year Highest Lowest\n", - "1993 1.107 0.999\n", - "1994 1.165 0.992\n", - "1995 1.246 1.106\n", - "1996 1.33 1.126\n", - "1997 1.288 1.158\n", - "1998 1.113 0.979\n", - "1999 1.315 0.949\n", - "2000 1.711 1.304\n", - "2001 1.748 1.101\n", - "2002 1.499 1.142\n", - "2003 1.787 1.487\n", - "2004 2.104 1.552\n", - "2005 3.117 1.824\n", - "2006 3.083 2.246\n", - "2007 3.258 2.213\n", - "2008 4.165 1.67\n", - "2009 2.746 1.737\n", - "2010 3.106 2.664\n", - "2011 4.018 3.124\n", - "2012 3.997 3.324\n", - "2013 3.851 3.373\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -634,43 +488,10 @@ "get_highest_and_lowest_prices_per_year_dictionary() " ], "metadata": { - "id": "p6njOXJ0gTOa", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "1f043d18-5e2c-4e6a-afe4-224703bf11e9" + "id": "p6njOXJ0gTOa" }, - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Year\t\tHighest Price\t\tLowest Price\n", - "1993 \t\t 1.107 \t\t\t 0.999\n", - "1994 \t\t 1.165 \t\t\t 0.992\n", - "1995 \t\t 1.246 \t\t\t 1.106\n", - "1996 \t\t 1.33 \t\t\t 1.126\n", - "1997 \t\t 1.288 \t\t\t 1.158\n", - "1998 \t\t 1.148 \t\t\t 0.979\n", - "1999 \t\t 1.315 \t\t\t 0.949\n", - "2000 \t\t 1.711 \t\t\t 1.304\n", - "2001 \t\t 1.748 \t\t\t 1.101\n", - "2002 \t\t 1.499 \t\t\t 1.142\n", - "2003 \t\t 1.787 \t\t\t 1.487\n", - "2004 \t\t 2.104 \t\t\t 1.552\n", - "2005 \t\t 3.117 \t\t\t 1.824\n", - "2006 \t\t 3.083 \t\t\t 2.246\n", - "2007 \t\t 3.258 \t\t\t 2.213\n", - "2008 \t\t 4.165 \t\t\t 1.67\n", - "2009 \t\t 2.746 \t\t\t 1.737\n", - "2010 \t\t 3.106 \t\t\t 2.664\n", - "2011 \t\t 4.018 \t\t\t 3.124\n", - "2012 \t\t 3.997 \t\t\t 3.324\n", - "2013 \t\t 3.851 \t\t\t 3.373\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", @@ -715,7 +536,7 @@ "metadata": { "id": "nTLqjQzliyCb" }, - "execution_count": 10, + "execution_count": null, "outputs": [] }, { @@ -761,7 +582,7 @@ "metadata": { "id": "2UnxtLVmi203" }, - "execution_count": 11, + "execution_count": null, "outputs": [] }, { @@ -813,7 +634,7 @@ "metadata": { "id": "vdCAP2O9EFmP" }, - "execution_count": 12, + "execution_count": null, "outputs": [] }, { @@ -849,31 +670,15 @@ "\n" ], "metadata": { - "id": "i1PTYudzg1OO", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "7cdd20fd-6091-4226-b32c-f7d11e7be7af" + "id": "i1PTYudzg1OO" }, - "execution_count": 13, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Printing the Frequency Table\n", - "Low ==> 569\n", - "Medium ==> 157\n", - "High ==> 188\n", - "Very High ==> 151\n" - ] - } - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", "source": [ - "#@title get_missing_dates_report\n", + "#@title get_missing_dates_list_report\n", "def get_missing_dates_list_report():\n", " import datetime\n", "\n", @@ -918,7 +723,7 @@ "#@title get_missing_dates_summary_report() \n", "import datetime\n", "\n", - "def get_missing_dates_summary_report():\n", + "def get_missing_dates_report():\n", " file_path = 'gasprices.txt'\n", " # Read the gas prices from the file\n", " with open(file_path) as f:\n", @@ -949,13 +754,270 @@ " year, month = year_month.split()\n", " print(f\"{month}\\t{year}\\t{count}\")\n", "\n", - "get_missing_dates_summary_report()" + "get_missing_dates_report()" ], "metadata": { - "id": "VoX8G_DD8qMM" + "id": "VoX8G_DD8qMM", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3f94a3eb-e730-4ec6-883f-f8ac03b7e170" }, "execution_count": null, - "outputs": [] + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Month\tYear\tCount of missing days\n", + "Apr\t1993\t22\n", + "May\t1993\t26\n", + "Jun\t1993\t26\n", + "Jul\t1993\t27\n", + "Aug\t1993\t26\n", + "Sep\t1993\t26\n", + "Oct\t1993\t27\n", + "Nov\t1993\t25\n", + "Dec\t1993\t27\n", + "Jan\t1994\t26\n", + "Feb\t1994\t24\n", + "Mar\t1994\t27\n", + "Apr\t1994\t26\n", + "May\t1994\t26\n", + "Jun\t1994\t26\n", + "Jul\t1994\t27\n", + "Aug\t1994\t26\n", + "Sep\t1994\t26\n", + "Oct\t1994\t26\n", + "Nov\t1994\t26\n", + "Dec\t1994\t27\n", + "Jan\t1995\t26\n", + "Feb\t1995\t24\n", + "Mar\t1995\t27\n", + "Apr\t1995\t26\n", + "May\t1995\t26\n", + "Jun\t1995\t26\n", + "Jul\t1995\t26\n", + "Aug\t1995\t27\n", + "Sep\t1995\t26\n", + "Oct\t1995\t26\n", + "Nov\t1995\t26\n", + "Dec\t1995\t27\n", + "Jan\t1996\t26\n", + "Feb\t1996\t25\n", + "Mar\t1996\t27\n", + "Apr\t1996\t25\n", + "May\t1996\t27\n", + "Jun\t1996\t26\n", + "Jul\t1996\t26\n", + "Aug\t1996\t27\n", + "Sep\t1996\t25\n", + "Oct\t1996\t27\n", + "Nov\t1996\t26\n", + "Dec\t1996\t26\n", + "Jan\t1997\t27\n", + "Feb\t1997\t24\n", + "Mar\t1997\t26\n", + "Apr\t1997\t26\n", + "May\t1997\t27\n", + "Jun\t1997\t25\n", + "Jul\t1997\t27\n", + "Aug\t1997\t27\n", + "Sep\t1997\t25\n", + "Oct\t1997\t27\n", + "Nov\t1997\t26\n", + "Dec\t1997\t26\n", + "Jan\t1998\t27\n", + "Feb\t1998\t24\n", + "Mar\t1998\t26\n", + "Apr\t1998\t26\n", + "May\t1998\t27\n", + "Jun\t1998\t25\n", + "Jul\t1998\t27\n", + "Aug\t1998\t26\n", + "Sep\t1998\t26\n", + "Oct\t1998\t27\n", + "Nov\t1998\t25\n", + "Dec\t1998\t27\n", + "Jan\t1999\t27\n", + "Feb\t1999\t24\n", + "Mar\t1999\t26\n", + "Apr\t1999\t26\n", + "May\t1999\t26\n", + "Jun\t1999\t26\n", + "Jul\t1999\t27\n", + "Aug\t1999\t26\n", + "Sep\t1999\t26\n", + "Oct\t1999\t27\n", + "Nov\t1999\t25\n", + "Dec\t1999\t27\n", + "Jan\t2000\t26\n", + "Feb\t2000\t25\n", + "Mar\t2000\t27\n", + "Apr\t2000\t26\n", + "May\t2000\t26\n", + "Jun\t2000\t26\n", + "Jul\t2000\t26\n", + "Aug\t2000\t27\n", + "Sep\t2000\t26\n", + "Oct\t2000\t26\n", + "Nov\t2000\t26\n", + "Dec\t2000\t27\n", + "Jan\t2001\t26\n", + "Feb\t2001\t24\n", + "Mar\t2001\t27\n", + "Apr\t2001\t25\n", + "May\t2001\t27\n", + "Jun\t2001\t26\n", + "Jul\t2001\t26\n", + "Aug\t2001\t27\n", + "Sep\t2001\t26\n", + "Oct\t2001\t26\n", + "Nov\t2001\t26\n", + "Dec\t2001\t26\n", + "Jan\t2002\t27\n", + "Feb\t2002\t24\n", + "Mar\t2002\t27\n", + "Apr\t2002\t25\n", + "May\t2002\t27\n", + "Jun\t2002\t26\n", + "Jul\t2002\t26\n", + "Aug\t2002\t27\n", + "Sep\t2002\t25\n", + "Oct\t2002\t27\n", + "Nov\t2002\t26\n", + "Dec\t2002\t26\n", + "Jan\t2003\t27\n", + "Feb\t2003\t24\n", + "Mar\t2003\t26\n", + "Apr\t2003\t26\n", + "May\t2003\t27\n", + "Jun\t2003\t25\n", + "Jul\t2003\t27\n", + "Aug\t2003\t27\n", + "Sep\t2003\t25\n", + "Oct\t2003\t27\n", + "Nov\t2003\t26\n", + "Dec\t2003\t26\n", + "Jan\t2004\t27\n", + "Feb\t2004\t25\n", + "Mar\t2004\t26\n", + "Apr\t2004\t26\n", + "May\t2004\t26\n", + "Jun\t2004\t26\n", + "Jul\t2004\t27\n", + "Aug\t2004\t26\n", + "Sep\t2004\t26\n", + "Oct\t2004\t27\n", + "Nov\t2004\t25\n", + "Dec\t2004\t27\n", + "Jan\t2005\t26\n", + "Feb\t2005\t24\n", + "Mar\t2005\t27\n", + "Apr\t2005\t26\n", + "May\t2005\t26\n", + "Jun\t2005\t26\n", + "Jul\t2005\t27\n", + "Aug\t2005\t26\n", + "Sep\t2005\t26\n", + "Oct\t2005\t26\n", + "Nov\t2005\t26\n", + "Dec\t2005\t27\n", + "Jan\t2006\t26\n", + "Feb\t2006\t24\n", + "Mar\t2006\t27\n", + "Apr\t2006\t26\n", + "May\t2006\t26\n", + "Jun\t2006\t26\n", + "Jul\t2006\t26\n", + "Aug\t2006\t27\n", + "Sep\t2006\t26\n", + "Oct\t2006\t26\n", + "Nov\t2006\t26\n", + "Dec\t2006\t27\n", + "Jan\t2007\t26\n", + "Feb\t2007\t24\n", + "Mar\t2007\t27\n", + "Apr\t2007\t25\n", + "May\t2007\t27\n", + "Jun\t2007\t26\n", + "Jul\t2007\t26\n", + "Aug\t2007\t27\n", + "Sep\t2007\t26\n", + "Oct\t2007\t26\n", + "Nov\t2007\t26\n", + "Dec\t2007\t26\n", + "Jan\t2008\t27\n", + "Feb\t2008\t25\n", + "Mar\t2008\t26\n", + "Apr\t2008\t26\n", + "May\t2008\t27\n", + "Jun\t2008\t25\n", + "Jul\t2008\t27\n", + "Aug\t2008\t27\n", + "Sep\t2008\t25\n", + "Oct\t2008\t27\n", + "Nov\t2008\t26\n", + "Dec\t2008\t26\n", + "Jan\t2009\t27\n", + "Feb\t2009\t24\n", + "Mar\t2009\t26\n", + "Apr\t2009\t26\n", + "May\t2009\t27\n", + "Jun\t2009\t25\n", + "Jul\t2009\t27\n", + "Aug\t2009\t26\n", + "Sep\t2009\t26\n", + "Oct\t2009\t27\n", + "Nov\t2009\t25\n", + "Dec\t2009\t27\n", + "Jan\t2010\t27\n", + "Feb\t2010\t24\n", + "Mar\t2010\t26\n", + "Apr\t2010\t26\n", + "May\t2010\t26\n", + "Jun\t2010\t26\n", + "Jul\t2010\t27\n", + "Aug\t2010\t26\n", + "Sep\t2010\t26\n", + "Oct\t2010\t27\n", + "Nov\t2010\t25\n", + "Dec\t2010\t27\n", + "Jan\t2011\t26\n", + "Feb\t2011\t24\n", + "Mar\t2011\t27\n", + "Apr\t2011\t26\n", + "May\t2011\t26\n", + "Jun\t2011\t26\n", + "Jul\t2011\t27\n", + "Aug\t2011\t26\n", + "Sep\t2011\t26\n", + "Oct\t2011\t26\n", + "Nov\t2011\t26\n", + "Dec\t2011\t27\n", + "Jan\t2012\t26\n", + "Feb\t2012\t25\n", + "Mar\t2012\t27\n", + "Apr\t2012\t25\n", + "May\t2012\t27\n", + "Jun\t2012\t26\n", + "Jul\t2012\t26\n", + "Aug\t2012\t27\n", + "Sep\t2012\t26\n", + "Oct\t2012\t26\n", + "Nov\t2012\t26\n", + "Dec\t2012\t26\n", + "Jan\t2013\t27\n", + "Feb\t2013\t24\n", + "Mar\t2013\t27\n", + "Apr\t2013\t25\n", + "May\t2013\t27\n", + "Jun\t2013\t26\n", + "Jul\t2013\t26\n", + "Aug\t2013\t22\n" + ] + } + ] }, { "cell_type": "code", @@ -1047,7 +1109,7 @@ "metadata": { "id": "gcKx4uxOlu0u" }, - "execution_count": 16, + "execution_count": null, "outputs": [] }, { @@ -1135,20 +1197,23 @@ " break\n", " \n", " else:\n", - " print('Please enter a valid number between 1 to 16.')" + " print('Please enter a valid number between 1 to 15.')" ], "metadata": { "id": "ujel_5P1sqj3", "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "9eefb370-16dc-4232-8968-8fd1a8c93176" + "outputId": "5a754102-d95b-47be-d03a-88c5581bd0dc" }, "execution_count": null, "outputs": [ { - "output_type": "stream", + "metadata": { + "tags": null + }, "name": "stdout", + "output_type": "stream", "text": [ "\n", "===========================================================\n", @@ -1172,7 +1237,6 @@ "\n", "===========================================================\n", "\n", - "Please choose one of the method numbers: 15\n", "\n", "\n", "\n",