-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_high_clusters.py
103 lines (74 loc) · 2.59 KB
/
extract_high_clusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 4 13:27:19 2021
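This script reads GeoPackage files with identified clusters, extracts the high
and low clusters, and saves how many of the files each NRO value appears in as
a high or low cluster (its "stability").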
@author: tuomvais
"""
import argparse
import glob
import os

import geopandas as gpd
import pandas as pd
# Values of the 'sha_sim_cl' column that mark high and low clusters
HIGH_CLUSTER = 1
LOW_CLUSTER = 3


def parse_args(argv=None):
    """Parse the command-line arguments.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse; defaults to ``sys.argv[1:]`` when None.

    Returns
    -------
    dict
        Mapping with the keys 'input', 'inputfolder' and 'outputfolder'.
    """
    # Set up the argument parser
    ap = argparse.ArgumentParser()

    # Path to the database file. The script never reads this file, but the
    # argument stays required so that existing invocations keep working.
    ap.add_argument("-i", "--input", required=True,
                    help="Path to input RTK database file (type geopackage).")

    # Get path to input folder
    ap.add_argument("-if", "--inputfolder", required=True,
                    help="Path to folder with geopackagee files containing cluster"
                    " information. Example: /path/to/folder/ ")

    # Get path to output folder
    ap.add_argument("-of", "--outputfolder", required=True,
                    help="Path to output folder Example: /path/to/outputfolder/.")

    return vars(ap.parse_args(argv))


def find_geopackages(folder):
    """Return the paths of all '*.gpkg' files directly inside *folder*.

    The pattern is built with ``os.path.join`` so the folder may be given
    with or without a trailing path separator.
    """
    return glob.glob(os.path.join(folder, '*.gpkg'))


def split_clusters(paths):
    """Read every file once and pick out its high and low cluster rows.

    Parameters
    ----------
    paths : iterable of str
        Paths to geopackage files that contain a 'sha_sim_cl' column.

    Returns
    -------
    tuple of (list, list)
        The high-cluster frames and the low-cluster frames, one entry per
        input file and in the same order as *paths*.
    """
    highs = []
    lows = []
    for path in paths:
        clusters = gpd.read_file(path)
        labels = clusters['sha_sim_cl']
        highs.append(clusters[labels == HIGH_CLUSTER])
        lows.append(clusters[labels == LOW_CLUSTER])
    return highs, lows


def count_stability(frames):
    """Combine *frames* and count how often each 'NRO' value occurs.

    Parameters
    ----------
    frames : list of DataFrame
        Frames that all contain an 'NRO' column.

    Returns
    -------
    DataFrame
        One row per distinct 'NRO' value (the first occurrence is kept) with
        an added 'stability' column holding the number of occurrences.
    """
    combined = pd.concat(frames)

    # Name the index explicitly: the column name that reset_index() gives the
    # value_counts() index differs between pandas versions ('index' before
    # 2.0, the original column name from 2.0 on).
    counts = (
        combined['NRO']
        .value_counts()
        .rename('stability')
        .rename_axis('NRO')
        .reset_index()
    )

    # keep one row per NRO value and attach its count
    unique_rows = combined.drop_duplicates(subset=['NRO'])
    return unique_rows.merge(counts, on='NRO')


def main():
    """Extract high and low clusters and save their stability counts.

    Raises
    ------
    FileNotFoundError
        If the input folder contains no '*.gpkg' files.
    """
    args = parse_args()

    # collect geopackage file paths to files with bivariate local morans i results
    paths = find_geopackages(args['inputfolder'])
    if not paths:
        raise FileNotFoundError(
            f"No .gpkg files found in input folder: {args['inputfolder']}")

    highs, lows = split_clusters(paths)

    # save stability to geopackage (extension is '.gpkg'; the earlier
    # '.gpgk' spelling was a typo)
    out_dir = args['outputfolder']
    count_stability(highs).to_file(
        os.path.join(out_dir, 'stability_high.gpkg'), driver='GPKG')
    count_stability(lows).to_file(
        os.path.join(out_dir, 'stability_low.gpkg'), driver='GPKG')


if __name__ == '__main__':
    main()