-
Notifications
You must be signed in to change notification settings - Fork 0
/
KFW_Panel_Dataset_Building.R
125 lines (92 loc) · 4.22 KB
/
KFW_Panel_Dataset_Building.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#clear variables and values
rm(list=ls())
#set the working directory to where the files are stored - !CHANGE THIS TO YOUR OWN DIRECTORY!
#setwd("/home/aiddata/Desktop/Github/kfw2_amazon_conflict/")
setwd("C:/Users/jflak/OneDrive/GitHub/kfw2_amazon_conflict/")
#setwd("/Users/rbtrichler/Documents/AidData/Git Repos/kfw2_amazon_conflict")
#essential spatial view packages (load and project shapefiles etc...)
library(rgdal)
library(sp)
#tools for spatial objects
library(rgeos)
library(maptools)
#tools for data manipulation
library(reshape2)
#library that handles matching
library(MatchIt)
#library that has old sci functions (like timeRangeTrend)
library(SCI)
shp_file <- "Processed_Data/shpfilecross.shp"
dta_shp = readShapePoly(shp_file)
#df_dta_shp = as.data.frame(dta_shp)
#deletes the Pop_2000_x variable and renames Pop_2000_y to Pop_2000
dta_shp@data$Pop_2000_x <- NULL
names(dta_shp@data)[names(dta_shp@data)=="Pop_2000_y"] <- "Pop_2000"
#fills in 0s for NAs in lfreq_tota and ifreq_tota
dta_shp@data$lfreq_tota[is.na(dta_shp@data$lfreq_tota)] <- 0
dta_shp@data$ifreq_tota[is.na(dta_shp@data$ifreq_tota)]<-0
#Impute 2014 ntl value
dta_shp@data$ntl_2013[is.na(dta_shp@data$ntl_2013)] <- 5555
dta_shp@data$ntl2014trend <- timeRangeTrend(dta_shp,"ntl_[0-9][0-9][0-9][0-9]",2009,2013,"id")
dta_shp@data$ntl2014trend[dta_shp@data$ntl2014trend == "NaN"] <- NA
dta_shp@data$ntl_2014 <- (dta_shp@data$ntl2014trend + dta_shp@data$ntl_2013)
dta_shp@data$ntl_2014[dta_shp@data$ntl_2014 < 0] <- 0
dta_shp@data$ntl_2013[dta_shp@data$ntl_2013 == 5555] <- NA
#Fills in the previous Pop value for the missing Pop values
dta_shp@data$Pop_2003 <- dta_shp@data$Pop_2000
dta_shp@data$Pop_2004 <- dta_shp@data$Pop_2000
dta_shp@data$Pop_2006 <- dta_shp@data$Pop_2005
dta_shp@data$Pop_2007 <- dta_shp@data$Pop_2005
dta_shp@data$Pop_2008 <- dta_shp@data$Pop_2005
dta_shp@data$Pop_2009 <- dta_shp@data$Pop_2005
dta_shp@data$Pop_2011 <- dta_shp@data$Pop_2010
dta_shp@data$Pop_2012 <- dta_shp@data$Pop_2010
dta_shp@data$Pop_2013 <- dta_shp@data$Pop_2010
dta_shp@data$Pop_2014 <- dta_shp@data$Pop_2010
#Turn dta_shp@data into dataframe
df<-dta_shp@data
#Drop all variables for years outside of 2003-2014
df<- df[, -grep("(19[0-9][0-9])", names(df))]
df<- df[, -grep("(2000)", names(df))]
df<- df[, -grep("(2001)", names(df))]
df<- df[, -grep("(2002)", names(df))]
df<- df[, -grep("(2015)", names(df))]
df<- df[, -grep("(2020)", names(df))]
#Drop all AvgD and AvgDist varialbes (many NAs, not used for analysis)
df<-df[,-grep("(AvgD)",names(df))]
#Add underscore to ifreq and lfreq variables to match other variables with yearly measures
for (i in 2:length(df))
{
colnames(df)[i] <- sub("lfreq","lfreq_",colnames(df)[i])
}
for (i in 2:length(df))
{
colnames(df)[i] <- sub("ifreq","ifreq_",colnames(df)[i])
}
#Orders the dataframe columns alphabetically - DO NOT REMOVE, this is key to getting reshape to work correctly
df <- as.data.frame(df)[order(names(df))]
#List the names of the variables in dta_shp
#print(names(df))
#shows what variables are taken by reshape in varying
print(grep("(.+)[0-9][0-9][0-9][0-9]$", names(df), value = TRUE))
#-----------------------------------
#Reshapes the dataframe from wide to long
#-----------------------------------
# panel_data <- reshape(dta_shp@data, dir = "long", varying = grep("(.+)[0-9][0-9][0-9][0-9]$", names(dta_shp), value = TRUE),
# v.names = c("Pop_", "MeanL_", "MaxL_", "MeanT_", "MaxT_", "MinT_", "MeanP_", "MaxP_", "MinP_",
# "ifreq", "lfreq", "ntl_"), times = 1982:2020)
panel_data<-reshape(df,dir="long",varying=grep("(.+)[0-9][0-9][0-9][0-9]$", names(df), value = TRUE),sep="_")
View(as.data.frame(panel_data)[1:100])
View(as.data.frame(panel_data)[101:150])
#Rename variable "time" to "year"
names(panel_data)[names(panel_data)=="time"]="year"
#Replace NAs in ifreq and lfreq with 0
panel_data$ifreq[is.na(panel_data$ifreq)]<-0
panel_data$lfreq[is.na(panel_data$lfreq)]<-0
#Save panel data
write.csv(panel_data,"C:/Users/jflak/OneDrive/GitHub/kfw2_amazon_conflict/Processed_Data/panel_data.csv")
#View(as.data.frame(dta_shp)[1:100])
#View(as.data.frame(dta_shp)[101:200])
#View(as.data.frame(dta_shp)[201:300])
#View(as.data.frame(dta_shp)[301:400])
#View(as.data.frame(dta_shp)[401:469])