From 8736e43de11c209953276ab9cb09a3a724137aaf Mon Sep 17 00:00:00 2001 From: AyseSubasi <86684300+AyseSubasi@users.noreply.github.com> Date: Fri, 10 Sep 2021 09:09:18 +0200 Subject: [PATCH] final version --- Olist-Data cleaning.ipynb | 338 +++++--------------------------------- 1 file changed, 38 insertions(+), 300 deletions(-) diff --git a/Olist-Data cleaning.ipynb b/Olist-Data cleaning.ipynb index 80a6c98..983a1dc 100644 --- a/Olist-Data cleaning.ipynb +++ b/Olist-Data cleaning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c5229c99", + "id": "5a27f39e", "metadata": {}, "source": [ "## Table of Contents" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "9a57f8ed", + "id": "f9a0f82e", "metadata": {}, "source": [ "+ 1. [Libaries](#Libaries)\n", @@ -100324,9 +100324,8 @@ "source": [ "### 3. Retriving data from MYSQL into Pandas Data Frame \n", "\n", - "For the first, we consider the following tables as important to explore and clean:\n", + "First, we consider the following tables as important to explore and clean:\n", "\n", - "#HIER KOMMT NOCH EIN SCREENSHOT VON EER DIAGRAMM!!!!! NICHT VERGESSEN\n", " - customers\n", " - sellers\n", " - order_payments\n", @@ -100459,7 +100458,7 @@ } ], "source": [ - "# Reading Customers dataset\n", + "# Reading customers dataset\n", "\n", "query = \"SELECT * FROM olist.customers\"\n", "customers_df = pd.read_sql_query(query, db_connection)\n", @@ -101462,7 +101461,7 @@ }, { "cell_type": "markdown", - "id": "93a5cf9d", + "id": "e1ebfaf6", "metadata": {}, "source": [ "### 4. 
Data Transformation " @@ -101470,7 +101469,7 @@ }, { "cell_type": "markdown", - "id": "b665c299", + "id": "50645f6f", "metadata": {}, "source": [ "#### 4.1 Translating data " @@ -101711,17 +101710,18 @@ }, { "cell_type": "markdown", - "id": "23e67708", + "id": "0eaec331", "metadata": {}, "source": [ - "Why we decided not using a pipeline:\n", + "Why we decided to work with multiple dataframes:\n", "\n", - "* we dont work with one dataset\n", - "* lack of time to build pipelines for each table\n", - "* creating functions-> problem we need to put the function into a new dataframe\n", + "* the dataset is large and complex\n", + "* we apply different functions to specific tables\n", "\n", - "Type of the tables - > checked in Workbench EER Diagramm everything was correct\n", - "\n" + "\n", + "Data types of the tables -> checked in the Workbench EER diagram; everything was correct\n", + "\n", + "We used Tableau to merge the tables for our specific needs" ] }, { @@ -101933,7 +101933,7 @@ }, { "cell_type": "markdown", - "id": "6c7411f7", + "id": "c9b3f9f4", "metadata": {}, "source": [ "#### 5.2 Handling empty values " @@ -101941,7 +101941,7 @@ }, { "cell_type": "markdown", - "id": "51f8b179", + "id": "543c5825", "metadata": {}, "source": [ "Product dataframe - we consider the following columns as columns of interest:\n", @@ -102075,7 +102075,7 @@ }, { "cell_type": "markdown", - "id": "0d31b614", + "id": "2fcf78e5", "metadata": {}, "source": [ "##### Now we fill in the missing values.. " @@ -102133,41 +102133,6 @@ "orders_df.dropna(subset=[\"order_purchase_timestamp\",\"order_approved_at\",\"order_delivered_carrier_date\",\"order_delivered_customer_date\",\"order_estimated_delivery_date\"],inplace=True)" ] }, - { - "cell_type": "code", - "execution_count": 71, - "id": "90d258bd", - "metadata": {}, - "outputs": [], - "source": [ - "#not sure if i should delete these functions maybe at the end?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "d638e8d2", - "metadata": {}, - "outputs": [], - "source": [ - "# Data cleaning pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "65543d4c", - "metadata": {}, - "outputs": [], - "source": [ - "# #data cleaning\n", - "# def data_clean(df):\n", - "# df = df.drop_duplicates().reset_index()\n", - "# df.columns = [i.replace(\" \", \"None\") for i in df.columns]\n", - "# df = df.drop(columns = ['index','product_name_length','product_description_length','product_photos_qty'])\n", - "# df = df.dropna()" - ] - }, { "cell_type": "markdown", "id": "2084f02a", @@ -102176,171 +102141,9 @@ "----" ] }, - { - "cell_type": "code", - "execution_count": 36, - "id": "21acdb75", - "metadata": {}, - "outputs": [], - "source": [ - "# \n", - "# def drop_na(orders_df):\n", - "# #dropna dates\n", - "# orders_df[\"order_purchase_timestamp\"].dropna(inplace=True)\n", - "# orders_df[\"order_approved_at\"].dropna(inplace=True)\n", - "# orders_df[\"order_delivered_carrier_date\"].dropna(inplace=True)\n", - "# orders_df[\"order_delivered_customer_date\"].dropna(inplace=True)\n", - "# orders_df[\"order_estimated_delivery_date\"].dropna(inplace=True)\n", - "# return(orders_df)" - ] - }, - { - "cell_type": "markdown", - "id": "80b80827", - "metadata": {}, - "source": [ - "##### Checking orders column.. " - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "df0b7907", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | order_id | \n", - "customer_id | \n", - "order_status | \n", - "order_purchase_timestamp | \n", - "order_approved_at | \n", - "order_delivered_carrier_date | \n", - "order_delivered_customer_date | \n", - "order_estimated_delivery_date | \n", - "
---|---|---|---|---|---|---|---|---|
0 | \n", - "e481f51cbdc54678b7cc49136f2d6af7 | \n", - "9ef432eb6251297304e76186b10a928d | \n", - "delivered | \n", - "2017-10-02 12:56:33 | \n", - "2017-10-02 13:07:15 | \n", - "2017-10-04 21:55:00 | \n", - "2017-10-10 23:25:13 | \n", - "2017-10-18 02:00:00 | \n", - "
1 | \n", - "53cdb2fc8bc7dce0b6741e2150273451 | \n", - "b0830fb4747a6c6d20dea0b8c802d7ef | \n", - "delivered | \n", - "2018-07-24 22:41:37 | \n", - "2018-07-26 05:24:27 | \n", - "2018-07-26 16:31:00 | \n", - "2018-08-07 17:27:45 | \n", - "2018-08-13 02:00:00 | \n", - "
2 | \n", - "47770eb9100c2d0c44946d9cf07ec65d | \n", - "41ce2a54c0b03bf3443c3d931a367089 | \n", - "delivered | \n", - "2018-08-08 10:38:49 | \n", - "2018-08-08 10:55:23 | \n", - "2018-08-08 15:50:00 | \n", - "2018-08-17 20:06:29 | \n", - "2018-09-04 02:00:00 | \n", - "
3 | \n", - "949d5b44dbf5de918fe9c16f97b45f8a | \n", - "f88197465ea7920adcdbec7375364d82 | \n", - "delivered | \n", - "2017-11-18 20:28:06 | \n", - "2017-11-18 20:45:59 | \n", - "2017-11-22 14:39:59 | \n", - "2017-12-02 01:28:42 | \n", - "2017-12-15 01:00:00 | \n", - "
4 | \n", - "ad21c59c0840e6cb83a9ceb5573f8159 | \n", - "8ab97904e6daea8866dbdbc4fb7aad2c | \n", - "delivered | \n", - "2018-02-13 22:18:39 | \n", - "2018-02-13 23:20:29 | \n", - "2018-02-14 20:46:34 | \n", - "2018-02-16 19:17:02 | \n", - "2018-02-26 01:00:00 | \n", - "