neighborhood_diversities_no_timeofday.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 13 12:10:25 2021
This script calculates grid cell diversities for the whole social media data without separating times of day.
@author: waeiski
"""
import geopandas as gpd
import pandas as pd
import libpysal
import skbio.diversity.alpha as sk
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import argparse
# Set up the argument parser
ap = argparse.ArgumentParser()
# Get path to register input file
ap.add_argument("-r", "--register", required=True,
                help="Path to input register file (type geopackage).")
# Get path to social media point data input file
ap.add_argument("-i", "--input", required=True,
                help="Path to combined Twitter and Instagram point feature file"
                     " (type geopackage).")
# Get path to output file
ap.add_argument("-o", "--output", required=True,
                help="Path to output file (type geopackage).")
# parse arguments
args = vars(ap.parse_args())
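# Example invocation (a sketch; the file names below are illustrative, not
# part of the repository):
#   python neighborhood_diversities_no_timeofday.py -r grid.gpkg \
#       -i some_posts.gpkg -o diversities.gpkg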
# function to scale values
def scale_minmax(series):
    '''
    Parameters
    ----------
    series : pandas Series of numeric values to scale
             (here, the Shannon diversity values).

    Returns
    -------
    Min-Max scaled values as a numpy array.
    '''
    # get values
    series = series.values
    # reshape for scikit
    series = series.reshape(-1, 1)
    # scale
    scaled = MinMaxScaler().fit_transform(series)
    return scaled
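# Illustrative example (not part of the pipeline): MinMaxScaler maps the
# values linearly onto [0, 1], so scale_minmax(pd.Series([1, 3, 5]))
# returns array([[0. ], [0.5], [1. ]]).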
# read grid in
grid = gpd.read_file(args['register'])
# reduce column clutter
grid = grid[['NRO', 'KUNTA', 'KUNTANRO', 'geometry']]
# convert grid id to integer
grid['NRO'] = grid['NRO'].astype(int)
# get contiguity
gW = libpysal.weights.Queen.from_dataframe(grid, idVariable='NRO')
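# Queen contiguity counts any two cells sharing an edge or a corner as
# neighbors, so on a regular grid each cell has up to eight neighbors.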
# read social media point data in
df = gpd.read_file(args['input'])
# join with grid
df = gpd.sjoin(df, grid, op='intersects')
# get ids of grid cells containing at least one post
idlist = df['NRO'].value_counts().index.tolist()
# set language dictionary
langdict = {}
# loop over grid ids
for gid in idlist:
    # get neighboring grid ids of current grid
    neigh_ids = list(gW[gid].keys())
    # extend list with current grid id
    neigh_ids.extend([gid])
    # fetch neighborhood grid cells for current grid id into dataframe
    ndf = df[df['NRO'].isin(neigh_ids)]
    # extract languages used in neighborhood grid cells
    langs = list(zip(ndf['language'].value_counts().index,
                     ndf['language'].value_counts().values))
    # add language use to language dictionary
    langdict[gid] = langs
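# After the loop, langdict maps each grid id to (language, count) tuples
# pooled over the cell and its queen-contiguous neighbors, for example
# (values illustrative): {847: [('fi', 120), ('en', 45), ('ru', 3)], ...}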
# generate a language use dataframe
langdf = pd.DataFrame(langdict.items(), columns=['grid_id', 'langs'])
# loop over language dataframe
print('[INFO] - Calculating diversity indices..')
for i, row in langdf.iterrows():
    # get language counts from tuples
    counts = [x[1] for x in row['langs']]
    # calculate diversities
    langdf.at[i, 'sents'] = sum(counts)
    langdf.at[i, 'unique'] = sk.observed_otus(counts)
    langdf.at[i, 'singletons'] = sk.singles(counts)
    langdf.at[i, 'berger'] = sk.berger_parker_d(counts)
    langdf.at[i, 'dominance'] = sk.dominance(counts)
    langdf.at[i, 'mcintosh_d'] = sk.mcintosh_d(counts)
    langdf.at[i, 'strong'] = sk.strong(counts)
    langdf.at[i, 'shannon'] = sk.shannon(counts, base=np.e)
    langdf.at[i, 'brillouin'] = sk.brillouin_d(counts)
    langdf.at[i, 'pielou'] = sk.pielou_e(counts)
    langdf.at[i, 'heip'] = sk.heip_e(counts)
    langdf.at[i, 'simpson_e'] = sk.simpson_e(counts)
    langdf.at[i, 'mcintosh_e'] = sk.mcintosh_e(counts)
    langdf.at[i, 'menhinick'] = sk.menhinick(counts)
    langdf.at[i, 'margalef'] = sk.margalef(counts)
    langdf.at[i, 'gini'] = sk.gini_index(counts)
    langdf.at[i, 'enspie'] = sk.enspie(counts)
    langdf.at[i, 'simpson'] = sk.simpson(counts)
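# Note: shannon is called with base=np.e, so entropy is measured in nats
# rather than scikit-bio's historical default of base=2 (bits).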
# scale shannon
langdf['shannon_scaled'] = scale_minmax(langdf['shannon'])
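# shannon_scaled rescales the Shannon values across all grid cells onto
# [0, 1]: 0 marks the least diverse cell in the data, 1 the most diverse.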
# merge language diversity dataframe with grid
langrid = pd.merge(grid, langdf, how='outer', left_on='NRO', right_on='grid_id')
# drop rows of grid cells without any posts
langrid = langrid.dropna(subset=['sents'])
# drop list column
langrid = langrid.drop(columns=['langs'])
# save to file
print('[INFO] - Saving results...')
langrid.to_file(args['output'], driver='GPKG')
# print done
print('[INFO] - ... done!')