From 8736e43de11c209953276ab9cb09a3a724137aaf Mon Sep 17 00:00:00 2001 From: AyseSubasi <86684300+AyseSubasi@users.noreply.github.com> Date: Fri, 10 Sep 2021 09:09:18 +0200 Subject: [PATCH] final version --- Olist-Data cleaning.ipynb | 338 +++++--------------------------------- 1 file changed, 38 insertions(+), 300 deletions(-) diff --git a/Olist-Data cleaning.ipynb b/Olist-Data cleaning.ipynb index 80a6c98..983a1dc 100644 --- a/Olist-Data cleaning.ipynb +++ b/Olist-Data cleaning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c5229c99", + "id": "5a27f39e", "metadata": {}, "source": [ "## Table of Contents" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "9a57f8ed", + "id": "f9a0f82e", "metadata": {}, "source": [ "+ 1. [Libaries](#Libaries)\n", @@ -100324,9 +100324,8 @@ "source": [ "### 3. Retriving data from MYSQL into Pandas Data Frame \n", "\n", - "For the first, we consider the following tables as important to explore and clean:\n", + "First, we consider the following tables as important to explore and clean:\n", "\n", - "#HIER KOMMT NOCH EIN SCREENSHOT VON EER DIAGRAMM!!!!! NICHT VERGESSEN\n", " - customers\n", " - sellers\n", " - order_payments\n", @@ -100459,7 +100458,7 @@ } ], "source": [ - "# Reading Customers dataset\n", + "# Reading customers dataset\n", "\n", "query = \"SELECT * FROM olist.customers\"\n", "customers_df = pd.read_sql_query(query, db_connection)\n", @@ -101462,7 +101461,7 @@ }, { "cell_type": "markdown", - "id": "93a5cf9d", + "id": "e1ebfaf6", "metadata": {}, "source": [ "### 4. 
Data Transformation " @@ -101470,7 +101469,7 @@ }, { "cell_type": "markdown", - "id": "b665c299", + "id": "50645f6f", "metadata": {}, "source": [ "#### 4.1 Translating data " @@ -101711,17 +101710,18 @@ }, { "cell_type": "markdown", - "id": "23e67708", + "id": "0eaec331", "metadata": {}, "source": [ - "Why we decided not using a pipeline:\n", + "Why we decided to work with multiple dataframes:\n", "\n", - "* we dont work with one dataset\n", - "* lack of time to build pipelines for each table\n", - "* creating functions-> problem we need to put the function into a new dataframe\n", + "* the dataset is large and complex\n", + "* we apply different functions to specific tables\n", "\n", - "Type of the tables - > checked in Workbench EER Diagramm everything was correct\n", - "\n" + "\n", + "Data types of the tables -> checked in the Workbench EER diagram; everything was correct\n", + "\n", + "We used Tableau to merge the tables for our specific needs." ] }, { @@ -101933,7 +101933,7 @@ }, { "cell_type": "markdown", - "id": "6c7411f7", + "id": "c9b3f9f4", "metadata": {}, "source": [ "#### 5.2 Handling empty values " @@ -101941,7 +101941,7 @@ }, { "cell_type": "markdown", - "id": "51f8b179", + "id": "543c5825", "metadata": {}, "source": [ "Product dataframe - we consider the following columns as columns of interest:\n", @@ -102075,7 +102075,7 @@ }, { "cell_type": "markdown", - "id": "0d31b614", + "id": "2fcf78e5", "metadata": {}, "source": [ "##### Now we fill in the missing values.. " @@ -102133,41 +102133,6 @@ "orders_df.dropna(subset=[\"order_purchase_timestamp\",\"order_approved_at\",\"order_delivered_carrier_date\",\"order_delivered_customer_date\",\"order_estimated_delivery_date\"],inplace=True)" ] }, - { - "cell_type": "code", - "execution_count": 71, - "id": "90d258bd", - "metadata": {}, - "outputs": [], - "source": [ - "#not sure if i should delete these functions maybe at the end?"
- ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "d638e8d2", - "metadata": {}, - "outputs": [], - "source": [ - "# Data cleaning pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "65543d4c", - "metadata": {}, - "outputs": [], - "source": [ - "# #data cleaning\n", - "# def data_clean(df):\n", - "# df = df.drop_duplicates().reset_index()\n", - "# df.columns = [i.replace(\" \", \"None\") for i in df.columns]\n", - "# df = df.drop(columns = ['index','product_name_length','product_description_length','product_photos_qty'])\n", - "# df = df.dropna()" - ] - }, { "cell_type": "markdown", "id": "2084f02a", @@ -102176,171 +102141,9 @@ "----" ] }, - { - "cell_type": "code", - "execution_count": 36, - "id": "21acdb75", - "metadata": {}, - "outputs": [], - "source": [ - "# \n", - "# def drop_na(orders_df):\n", - "# #dropna dates\n", - "# orders_df[\"order_purchase_timestamp\"].dropna(inplace=True)\n", - "# orders_df[\"order_approved_at\"].dropna(inplace=True)\n", - "# orders_df[\"order_delivered_carrier_date\"].dropna(inplace=True)\n", - "# orders_df[\"order_delivered_customer_date\"].dropna(inplace=True)\n", - "# orders_df[\"order_estimated_delivery_date\"].dropna(inplace=True)\n", - "# return(orders_df)" - ] - }, - { - "cell_type": "markdown", - "id": "80b80827", - "metadata": {}, - "source": [ - "##### Checking orders column.. " - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "df0b7907", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
order_idcustomer_idorder_statusorder_purchase_timestamporder_approved_atorder_delivered_carrier_dateorder_delivered_customer_dateorder_estimated_delivery_date
0e481f51cbdc54678b7cc49136f2d6af79ef432eb6251297304e76186b10a928ddelivered2017-10-02 12:56:332017-10-02 13:07:152017-10-04 21:55:002017-10-10 23:25:132017-10-18 02:00:00
153cdb2fc8bc7dce0b6741e2150273451b0830fb4747a6c6d20dea0b8c802d7efdelivered2018-07-24 22:41:372018-07-26 05:24:272018-07-26 16:31:002018-08-07 17:27:452018-08-13 02:00:00
247770eb9100c2d0c44946d9cf07ec65d41ce2a54c0b03bf3443c3d931a367089delivered2018-08-08 10:38:492018-08-08 10:55:232018-08-08 15:50:002018-08-17 20:06:292018-09-04 02:00:00
3949d5b44dbf5de918fe9c16f97b45f8af88197465ea7920adcdbec7375364d82delivered2017-11-18 20:28:062017-11-18 20:45:592017-11-22 14:39:592017-12-02 01:28:422017-12-15 01:00:00
4ad21c59c0840e6cb83a9ceb5573f81598ab97904e6daea8866dbdbc4fb7aad2cdelivered2018-02-13 22:18:392018-02-13 23:20:292018-02-14 20:46:342018-02-16 19:17:022018-02-26 01:00:00
\n", - "
" - ], - "text/plain": [ - " order_id customer_id \\\n", - "0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d \n", - "1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef \n", - "2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 \n", - "3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 \n", - "4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c \n", - "\n", - " order_status order_purchase_timestamp order_approved_at \\\n", - "0 delivered 2017-10-02 12:56:33 2017-10-02 13:07:15 \n", - "1 delivered 2018-07-24 22:41:37 2018-07-26 05:24:27 \n", - "2 delivered 2018-08-08 10:38:49 2018-08-08 10:55:23 \n", - "3 delivered 2017-11-18 20:28:06 2017-11-18 20:45:59 \n", - "4 delivered 2018-02-13 22:18:39 2018-02-13 23:20:29 \n", - "\n", - " order_delivered_carrier_date order_delivered_customer_date \\\n", - "0 2017-10-04 21:55:00 2017-10-10 23:25:13 \n", - "1 2018-07-26 16:31:00 2018-08-07 17:27:45 \n", - "2 2018-08-08 15:50:00 2018-08-17 20:06:29 \n", - "3 2017-11-22 14:39:59 2017-12-02 01:28:42 \n", - "4 2018-02-14 20:46:34 2018-02-16 19:17:02 \n", - "\n", - " order_estimated_delivery_date \n", - "0 2017-10-18 02:00:00 \n", - "1 2018-08-13 02:00:00 \n", - "2 2018-09-04 02:00:00 \n", - "3 2017-12-15 01:00:00 \n", - "4 2018-02-26 01:00:00 " - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "orders_df.head()" - ] - }, { "cell_type": "markdown", - "id": "6dfe5620", + "id": "7e8da8d3", "metadata": {}, "source": [ "#### 5.3 Checking for duplicates " @@ -102348,7 +102151,7 @@ }, { "cell_type": "markdown", - "id": "a0db1936", + "id": "f65e99a9", "metadata": {}, "source": [ "Checking for duplicate rows with - DataFrame.duplicated(subset=None, keep='first') - " @@ -102524,7 +102327,7 @@ }, { "cell_type": "markdown", - "id": "17123296", + "id": "b229da96", "metadata": {}, "source": [ "* geolocation is the only table with duplicates " @@ 
-102532,12 +102335,26 @@ }, { "cell_type": "markdown", - "id": "355927d0", + "id": "d47d32e5", "metadata": {}, "source": [ "#### 5.4 Drop duplicates " ] }, + { + "cell_type": "code", + "execution_count": 48, + "id": "165b9c9e", + "metadata": {}, + "outputs": [], + "source": [ + "# function for removing duplicate rows\n", + "def dup(df):\n", + " df = df.drop_duplicates().reset_index()\n", + " df = df.drop(columns = [\"index\"])\n", + " return(df)" + ] + }, { "cell_type": "code", "execution_count": 49, @@ -102561,47 +102378,9 @@ "cleaned_geo.shape" ] }, - { - "cell_type": "code", - "execution_count": 48, - "id": "165b9c9e", - "metadata": {}, - "outputs": [], - "source": [ - "#\n", - "def dup(df):\n", - " df = df.drop_duplicates().reset_index()\n", - " df = df.drop(columns = [\"index\"])\n", - " return(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "29910464", - "metadata": {}, - "outputs": [], - "source": [ - "# def drop_columns(df):\n", - "# products_df.drop(columns = ['product_name_length','product_description_length','product_photos_qty'], axis = 1, inplace = True)\n", - "# return df" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "35f0a8ec", - "metadata": {}, - "outputs": [], - "source": [ - "# def standardising(df):\n", - "# products_df.product_category_name.str.replace(\" \", \"None\")\n", - "# return(df)" - ] - }, { "cell_type": "markdown", - "id": "aef2559f", + "id": "7ef72289", "metadata": {}, "source": [ "### 6. Applying functions " @@ -102618,19 +102397,9 @@ "products_df = data_cleaning(products_df)" ] }, - { - "cell_type": "code", - "execution_count": 57, - "id": "8c8c51fb", - "metadata": {}, - "outputs": [], - "source": [ - "orders_df = drop_na(orders_df)" - ] - }, { "cell_type": "markdown", - "id": "d6100051", + "id": "6059e2a6", "metadata": {}, "source": [ "### 7. 
Save to csv" @@ -102639,7 +102408,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bc011e83", + "id": "ee29109e", "metadata": {}, "outputs": [], "source": [ @@ -102725,37 +102494,6 @@ "source": [ "sellers_df.to_csv('sellers_df.csv', index=False)" ] - }, - { - "cell_type": "markdown", - "id": "aeab2241", - "metadata": {}, - "source": [ - "\n", - "#### Save Pandas Data Frame to MYSQL Database" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "3c0e799d", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'test_table' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtest_table\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_sql\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"clean_df\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcon\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdb_connection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mschema\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"lab_db_python_sql\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'test_table' is not defined" - ] - } - ], - "source": [ - "test_table.to_sql(name=\"clean_df\", con=db_connection, schema=\"lab_db_python_sql\")" - ] } ], "metadata": {