From 1f3200daf4df00c14af9249cb9e21eba3710c77d Mon Sep 17 00:00:00 2001 From: AyseSubasi <86684300+AyseSubasi@users.noreply.github.com> Date: Tue, 7 Sep 2021 21:50:05 +0200 Subject: [PATCH] toc+doc --- Olist-Data cleaning.ipynb | 45 +++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/Olist-Data cleaning.ipynb b/Olist-Data cleaning.ipynb index 5442cc1..80a6c98 100644 --- a/Olist-Data cleaning.ipynb +++ b/Olist-Data cleaning.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "eb3e9893", + "id": "c5229c99", "metadata": {}, "source": [ "## Table of Contents" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "324573fc", + "id": "9a57f8ed", "metadata": {}, "source": [ "+ 1. [Libaries](#Libaries)\n", @@ -101462,7 +101462,7 @@ }, { "cell_type": "markdown", - "id": "d5385a52", + "id": "93a5cf9d", "metadata": {}, "source": [ "### 4. Data Transformation " @@ -101470,7 +101470,7 @@ }, { "cell_type": "markdown", - "id": "4ca0e598", + "id": "b665c299", "metadata": {}, "source": [ "#### 4.1 Translating data " @@ -101711,14 +101711,17 @@ }, { "cell_type": "markdown", - "id": "acf7df20", + "id": "23e67708", "metadata": {}, "source": [ "Why we decided not using a pipeline:\n", "\n", - "we dont work with one dataset\n", - "lack of time to build pipelines for each table\n", - "creating functions-> put it into new dataframes\n" + "* we dont work with one dataset\n", + "* lack of time to build pipelines for each table\n", + "* creating functions-> problem we need to put the function into a new dataframe\n", + "\n", + "Type of the tables - > checked in Workbench EER Diagramm everything was correct\n", + "\n" ] }, { @@ -101930,7 +101933,7 @@ }, { "cell_type": "markdown", - "id": "5982ad79", + "id": "6c7411f7", "metadata": {}, "source": [ "#### 5.2 Handling empty values " @@ -101938,7 +101941,7 @@ }, { "cell_type": "markdown", - "id": "41188677", + "id": "51f8b179", "metadata": {}, "source": [ "Product dataframe - we consider the following columns as columns of interest:\n", @@ -102072,7 +102075,7 @@ }, { "cell_type": "markdown", - "id": "7d4326c1", + "id": "0d31b614", "metadata": {}, "source": [ "##### Now we fill in the missing values.. " @@ -102133,7 +102136,7 @@ { "cell_type": "code", "execution_count": 71, - "id": "e0da08c2", + "id": "90d258bd", "metadata": {}, "outputs": [], "source": [ @@ -102143,7 +102146,7 @@ { "cell_type": "code", "execution_count": 70, - "id": "61dda57f", + "id": "d638e8d2", "metadata": {}, "outputs": [], "source": [ @@ -102193,7 +102196,7 @@ }, { "cell_type": "markdown", - "id": "1352b8c3", + "id": "80b80827", "metadata": {}, "source": [ "##### Checking orders column.. " @@ -102337,7 +102340,7 @@ }, { "cell_type": "markdown", - "id": "d747a410", + "id": "6dfe5620", "metadata": {}, "source": [ "#### 5.3 Checking for duplicates " @@ -102345,7 +102348,7 @@ }, { "cell_type": "markdown", - "id": "071940ba", + "id": "a0db1936", "metadata": {}, "source": [ "Checking for duplicate rows with - DataFrame.duplicated(subset=None, keep='first') - " @@ -102521,7 +102524,7 @@ }, { "cell_type": "markdown", - "id": "aa32732d", + "id": "17123296", "metadata": {}, "source": [ "* geolocation is the only table with duplicates " @@ -102529,7 +102532,7 @@ }, { "cell_type": "markdown", - "id": "30c015fe", + "id": "355927d0", "metadata": {}, "source": [ "#### 5.4 Drop duplicates " @@ -102598,7 +102601,7 @@ }, { "cell_type": "markdown", - "id": "e7a3304d", + "id": "aef2559f", "metadata": {}, "source": [ "### 6. Applying functions " @@ -102627,7 +102630,7 @@ }, { "cell_type": "markdown", - "id": "ec66da90", + "id": "d6100051", "metadata": {}, "source": [ "### 7. Save to csv" @@ -102636,7 +102639,7 @@ { "cell_type": "code", "execution_count": null, - "id": "406d2b59", + "id": "bc011e83", "metadata": {}, "outputs": [], "source": [