EDA_R_toolkit.Rmd

---
title: "EDA_R_toolkit"
author: "Miao Wang"
date: "9/18/2018"
output:
  html_document:
---
# Toy Data
```{r, include=F}
# Attach dplyr and the tidyverse; the examples below use the `storms`
# toy dataset (shipped with dplyr).
library(dplyr)
library(tidyverse)
# Show the column names and types of the toy dataset.
str(storms)
```

# 1. Use `here` for setting relative-path in Rproj

We want to create an R project and always use relative paths for files. This makes version control easier (for example, if the data file changes location).
```{r, eval=F}
# Packages used by the vendor-data workflow
library(magrittr)
library(dplyr)
library(tidyverse)
library(zipcode)

# Read the vendor sample workbook through a project-relative path built by
# here::here(), treating "" and "NULL" cells as NA, then normalise the
# column names with janitor::clean_names().
dt1 <- janitor::clean_names(
  readxl::read_xlsx(
    path = here::here(
      "data", "001-vendor-data-sample", "01", "cln",
      "d001-vendor-data-sample-20180914.xlsx"
    ),
    na = c("", "NULL")
  )
)
```

# 2. Check Missing for single fields
Missing values can be encoded in many forms, so check the unique values of a data field before using it.

```{r}
check_missing <- function(.data, .field) {
  # Count how many rows of one column hold a placeholder for "missing".
  #
  # Args:
  #   .data:  data frame / tibble to inspect.
  #   .field: name of the column to check, as a single string.
  #
  # Returns:
  #   A one-row tibble with `miss_tot` (number of placeholder rows) and
  #   `miss_porp` (share of placeholder rows). If the column already
  #   contains real NA values, a message is printed and 0 is returned
  #   instead, because this helper is meant for columns without NA.

  # Guard: columns that already contain NA are not handled here.
  if (anyNA(.data[[.field]])) {
    print("Already have NA")
    return(0)
  }

  # Placeholder strings that are treated as "missing".
  missing_tokens <- c("(Unknown)", "NULL", "", "Unknown")

  .field <- sym(.field)
  .data %>%
    # A value can equal at most one token, so this 0/1 flag matches the
    # sum of the individual `==` comparisons.
    mutate(miss_n = as.numeric(!!.field %in% missing_tokens)) %>%
    summarise(miss_tot = sum(miss_n),
              miss_porp = mean(miss_n))
}
```

## Example 
```{r}
# Number of NA values in every column of storms
colSums(is.na(storms))

# check_missing() is meant for columns without NA, so keep only those
sub_var <- names(storms)[colSums(is.na(storms)) == 0]

# Run check_missing() once per retained column; the element names become
# the `variable` id column of the row-bound result.
map_dfr(
  set_names(sub_var, sub_var),
  check_missing,
  .data = storms,
  .id = "variable"
)
```
# 2.2 Check Missing pattern for entire data 
```{r}
# Per column of dt1, count how often each known "missing" encoding occurs,
# print the full per-column table, then show the grand totals.
tibble(
  variable = colnames(dt1),
  n_NA = colSums(is.na(dt1)),
  n_Unknown = colSums(dt1 == 'Unknown', na.rm = TRUE),
  n_par_Unknown = colSums(dt1 == '(Unknown)', na.rm = TRUE),
  n_blank = colSums(dt1 == '', na.rm = TRUE),
  n_NULL = colSums(dt1 == 'NULL', na.rm = TRUE),
  n_9999 = colSums(dt1 == '-9999', na.rm = TRUE),
  n_dot = colSums(dt1 == '.', na.rm = TRUE)) %>%
  # NOTE(review): n_tot is a share of rows (not a count) and does not
  # include n_dot -- confirm whether '.' should be part of the total.
  mutate(n_tot = (n_NA + n_Unknown + n_par_Unknown + n_blank + n_NULL + n_9999) / nrow(dt1)) %>%
  # print() returns its input invisibly, so the pipe continues afterwards
  print(n = Inf) %>%
  summarise(sum(n_NA),
            sum(n_Unknown),
            sum(n_par_Unknown),
            sum(n_blank),
            sum(n_NULL),
            sum(n_9999),
            sum(n_dot)) %>%
  glimpse()

# select the one you want to turn as NA
# replace_with_na_all() comes from the naniar package, which is not attached
# anywhere in this file, so it is called with an explicit namespace.
dt1 <-
  dt1 %>%
  naniar::replace_with_na_all(condition = ~.x %in% c('Unknown', '(Unknown)', '-9999'))
```

# 3. Confirm 1-1 Relationship
Imagine the data is at the transaction level, but it also carries bank-level information. We should be careful when summarising bank-level information, to avoid double counting.

This is a code to identify and check whether the given variable is a high-level information. (Do you have multiple possible value for the same unique bank? If so, then that variable might not be bank-level info)

This is very useful for pre-checking the data before building a hierarchical data model.

```{r}
# Check whether a field takes a single value within each value of an id
# column (i.e. whether the field is "id-level" information).
#
# Args:
#   .data: data frame / tibble containing both columns.
#   .X:    name of the field to test, as a string.
#   .id:   name of the identifier column, as a string.
#
# Returns:
#   A one-row tibble with `One_to_one` (TRUE when every id has exactly one
#   distinct value of the field) and `Prop_one_to_many` (share of ids with
#   more than one distinct value). NA counts as a value of its own.
check_unique_within_id <- function(.data, .X, .id) {
  f1 <- sym(.X) # convert to symbol
  f2 <- sym(.id)

  .data %>%
    group_by(!!f2) %>%
    # Name the per-id count explicitly: chaining two count() calls relies on
    # an auto-generated column name (`nn` vs `n`) that differs between
    # dplyr versions.
    summarise(n_values = n_distinct(!!f1)) %>%
    mutate(non_unique = (n_values != 1)) %>%
    summarise(One_to_one = sum(non_unique) == 0,
              Prop_one_to_many = mean(non_unique))
}
```

## Examples
```{r}
# The first column of storms serves as the identifier
colnames(storms)[1]

# Every remaining column is tested against that identifier
range_colnames <- colnames(storms)[-1]

# One result row per tested column, labelled through the `Variable` column
map(
  range_colnames,
  function(col) {
    check_unique_within_id(.data = storms, .X = col, .id = colnames(storms)[1])
  }
) %>%
  set_names(range_colnames) %>%
  bind_rows(.id = "Variable")
```

# 4. Plot hist with sorted y axis
```{r}
plot_top_all <- function(.data, .field, plot_str) {
  # Horizontal bar chart of the frequency of every value of one column,
  # with the bars ordered by frequency.
  #
  # Args:
  #   .data:    data frame / tibble.
  #   .field:   column name, as a string.
  #   plot_str: plot title.
  #
  # Returns: a ggplot object.
  field_sym <- sym(.field)

  # Frequency table with fixed column names X (value) and n (count)
  tally <- set_names(count(.data, !!field_sym), c("X", "n"))

  # Factor levels follow ascending count so the bars appear sorted
  ranked_values <- tally$X[order(tally$n)]
  tally$new_X <- factor(tally$X, levels = ranked_values)

  ggplot(tally, aes(new_X, n)) +
    geom_col() +
    ggtitle(plot_str) +
    coord_flip() +
    xlab(.field)
}
```

## Example 
```{r}
# Frequency of every observed wind value in storms, sorted by count
plot_top_all(.data = storms, .field = "wind", plot_str = "Wind Hist")
```


# 5. Plot top n levels of a Categorical Variable (or Sparse Continuous Variable)
The count of missing values (NA) is included at the bottom.
```{r}
plot_top_n <- function(.data, .field, n_top_val) {
  # Horizontal bar chart of the `n_top_val` most frequent values of one
  # column, plus a separate "NA" bar at the bottom when the column has
  # missing values.
  #
  # Args:
  #   .data:     data frame / tibble.
  #   .field:    column name, as a string.
  #   n_top_val: number of top levels to keep (ties are kept by top_n()).
  #
  # Returns: a ggplot object.
  f <- sym(.field) # turn str to symbol
  n_missing <- sum(is.na(.data[[.field]]))

  df <- .data %>%
    # Missing values get their own bar below; excluding them here stops them
    # from taking a top-n slot and from being counted twice.
    filter(!is.na(!!f)) %>%
    count(!!f) %>%
    set_names(c("X", "n")) %>%
    top_n(n_top_val, n) %>%
    # Values become labels so the "NA" row can be added without coercing
    # the count column to character.
    mutate(X = as.character(X)) %>%
    arrange(n)

  if (n_missing != 0) {
    # First level => bottom bar after coord_flip()
    df <- bind_rows(tibble(X = "NA", n = n_missing), df)
  }

  df$new_X <- factor(df$X, levels = unique(df$X))

  df %>%
    ggplot(aes(new_X, n)) +
    geom_col() +
    ggtitle(paste0("Top ", n_top_val, " Levels")) +
    coord_flip() +
    xlab(.field)
}
```

## Example
```{r}
# The 20 most frequent wind values in storms
plot_top_n(.data = storms, .field = "wind", n_top_val = 20)
```


# 6. Vendor & Customer industry relationship

```{r, echo=F}
# Map a SIC code to a coarse industry segment using its first two digits.
# Intervals are closed on the left: [1,10), [10,15), ..., [99,Inf].
sic_segment <- function(sic_cd) {
  cut(as.numeric(substr(sic_cd, start = 1, stop = 2)),
      breaks = c(1, 10, 15, 18, 20, 40, 50, 52, 60, 70, 91, 99, Inf),
      labels = c("Agr_Fore_Fish",
                 "Mining",
                 "Constu",
                 "notused",
                 "Manufact",
                 "Transp_Commu_Electric_Gas_San",
                 "WS_Trade",
                 "RT_Trade",
                 "Fin_Insu_REst",
                 "Services",
                 "Pub_Adminn",
                 "Nonclass"),
      include.lowest = TRUE,
      right = FALSE)
}

# Heat map of record counts by customer segment (x) and vendor segment (y),
# after dropping records whose SIC code is a missing-value placeholder.
dt1 %>%
  filter(vendor_sic_cd != '-9999',
         vendor_sic_cd != '(Unknown)',
         cust_sic_cd != '-9999',
         cust_sic_cd != '(Unknown)') %>%
  mutate(vendor_segment = sic_segment(vendor_sic_cd),
         cust_segment = sic_segment(cust_sic_cd)) %>%
  count(cust_segment, vendor_segment) %>%
  ggplot(mapping = aes(x = cust_segment, y = vendor_segment)) +
  geom_tile(mapping = aes(fill = n), colour = "black") +
  scale_fill_gradient(low = "white", high = "blue")

```

# 7. Vendor & Customer Distance (Calculated Based on Zip-code)

We will take out opportunities with an unknown/missing zip code. We get the longitude and latitude using the `zipcode` package and then get the distance between vendor and customer using `gdist()` from the `Imap` package. To save time, only 50,000 randomly selected records are used.

```{r, echo=F}
# create a variable called gdist_vendor_cust
library(zipcode)
library(Imap)
data(zipcode)

# 1. get latitude, longitude
# Sample at most 50,000 rows; sample() errors when asked for more rows than
# the data has.
n_sample <- min(50000, nrow(dt1))
tempdt <-
    dt1[sample(nrow(dt1), n_sample), ] %>%
    left_join(zipcode, by = c("vendor_location_zip_code" = "zip")) %>%
    rename(vendor_latitude = latitude,
           vendor_longitude = longitude) %>%
    left_join(zipcode, by = c("cust_zip_code" = "zip")) %>%
    rename(cust_latitude = latitude,
           cust_longitude = longitude)

# 2. calculate distance; gdist() is called one coordinate pair at a time
#    (as in the original row loop). Results are collected in a vector
#    instead of being written into the tibble row by row.
#    units = "miles": gdist() defaults to nautical miles, while the plot
#    below is labelled and binned in miles.
tempdt$gdist_vendor_cust <- as.numeric(mapply(
  gdist,
  lon.1 = tempdt$vendor_longitude,
  lat.1 = tempdt$vendor_latitude,
  lon.2 = tempdt$cust_longitude,
  lat.2 = tempdt$cust_latitude,
  MoreArgs = list(units = "miles")
))

# 3. draw general distance
tempdt %>%
    ggplot() +
    geom_histogram(mapping = aes(x = gdist_vendor_cust), binwidth = 20) + # by 20 miles
    ggtitle("Distiance between Vendor & Customer By mile (random sample)")

    
# Results Different From Below !!!
# # 4. compared with simpling subtracting zip-code value 
# dt1 %>% 
#     head(10000) %>% 
#     filter(!is.na(as.numeric(cust_zip_code)), # remove NA,null
#            !is.na(as.numeric(vendor_location_zip_code))) %>%
#     mutate(zip_dist = abs(as.numeric(cust_zip_code)-
#                               as.numeric(vendor_location_zip_code))) %>% 
#     ggplot() +
#     geom_histogram(mapping = aes(x = zip_dist),binwidth = 50)

```


# 8. Scraping information from an online PDF file

2-digit sic code is downloaded from

https://www.dnb.com/content/dam/english/economic-and.../sic_2_digit_codes.xls. 

3-digit sic code is downloaded from

https://www.fax-list.com/forms/SICCodeTable(2and3_digits).pdf


Those results were compared (selectively) with the official sic code web

https://siccode.com/en/siccode/list/directory/search_keyword


```{r}
library(pdftools)

# Download the SIC code table. mode = "wb" writes the file in binary mode;
# the default text mode can corrupt a PDF on Windows.
download.file(
  "https://www.fax-list.com/forms/SICCodeTable(2and3_digits).pdf",
  "./sic_code.pdf",
  mode = "wb"
)

# One string per page, then one string per printed line
text <- pdf_text("sic_code.pdf")
text2 <- strsplit(text, "\n") %>% unlist() %>% as.vector()
# one element contains 3 sets of value (for that rows)
# splitting on a run of 7 spaces separates the sets on one line
text3 <- strsplit(text2, "       ") %>% unlist() %>% as.vector()
text4 <- text3[grepl("[0-9]", text3)] # contains at least one digit
text4 <- text4[grepl("[a-z]", text4)] # contains at least one lower-case letter
text4 <- c(text4, "491 Electric Services") # manual check and found break 184

# cleans out each element
text5 <- trimws(text4, which = "left") # remove the leading space

clean_str <- function(x_str) {
  # Split one "<code> <label words...>" string into its code and label.
  #
  # Args:
  #   x_str: a single string such as "491 Electric Services".
  #
  # Returns:
  #   Character vector of length 2: the first token (the code) and the
  #   remaining tokens joined with "_" ("" when there are none).
  #
  # Splitting on runs of whitespace (after trimming) keeps repeated spaces
  # from producing empty tokens, which would otherwise show up as leading
  # or doubled underscores in the label.
  tokens <- unlist(strsplit(trimws(x_str), "[[:space:]]+"))
  c(tokens[1], paste0(tokens[-1], collapse = "_"))
}

# Turn every cleaned "<code> <label>" string into a (code, label) pair and
# stack the pairs into a two-column character matrix with explicit names.
sic_pairs <- do.call(rbind, map(text5, clean_str))
colnames(sic_pairs) <- c("sic_code", "label")

text6 <- as_tibble(sic_pairs) %>%
  # Drop a single leading underscore left over from the split. The previous
  # substr(stop = length(V2)) used the number of rows, not the string length.
  mutate(label = sub("^_", "", label)) %>%
  arrange(sic_code)

write_csv(text6, "SIC_code_d3.csv")
```