Skip to content

Commit

Permalink
Combined 4 first releases into a single executable
Browse files Browse the repository at this point in the history
  • Loading branch information
OneStone2 committed Jul 27, 2016
1 parent 7b9d183 commit a195621
Show file tree
Hide file tree
Showing 15 changed files with 1,202 additions and 460 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,21 @@ workon mcmc_growth

# Operation
```
python run.py [state code] [number of clusters]
python run.py [--online] state
```
state code is a 2-letter code for the state (e.g. ME)
number of clusters is used in the clustering step

Displaying graphs is WIP.
Pass `--online` to download the required files on the fly.

# Obtaining the data

This project uses an FIA dataset for the state of Maine. To download them:
Sometimes, the connection to the FIA database is unstable.

To download them manually:
```
wget http://apps.fs.fed.us/fiadb-downloads/CSV/ME_PLOT.csv -O data/ME_PLOT.csv
wget http://apps.fs.fed.us/fiadb-downloads/CSV/ME_TREE.csv -O data/ME_TREE.csv
wget http://apps.fs.fed.us/fiadb-downloads/CSV/ME_COND.csv -O data/ME_COND.csv
```
For other states, replace ME with the corresponding state code.

Expand Down
179 changes: 179 additions & 0 deletions analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""
Contains code for revisions 1-4
All functions return RMSE
"""
import math
import random
from collections import Counter

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import linear_model

def analyze_r01(state, human, time):
    """
    Revision 1: single 90/10 train/test split with simple linear regression.

    Parameters:
        state: two-letter state code used to locate the data CSV.
        human: if True read 'data/<state>_2a.csv', else 'data/<state>_2b.csv'.
        time:  if True the target is growth per period,
               (next carb - carb) / (next py - py); if False the target is
               simply the next carbon value.

    Returns the RMSE of the fit on the held-out 10% test split.
    """
    if human:
        DATA_CSV = 'data/'+state+'_2a.csv'
    else:
        DATA_CSV = 'data/'+state+'_2b.csv'
    data_points = pd.read_csv(DATA_CSV)
    # Pair each row with the following measurement by shifting both columns
    # up one row and re-aligning the indices.
    nr1 = data_points['carb'].iloc[1:]
    nr2 = data_points['py'].iloc[1:]
    data_points = data_points.iloc[0:len(data_points.index)-1]
    nr1.index = np.arange(len(nr1.index))
    nr2.index = np.arange(len(nr2.index))
    if time:
        data_points.loc[:, 'growth'] = (nr1 - data_points['carb']) / (nr2 - data_points['py'])
    else:
        data_points.loc[:, 'growth'] = nr1
    data_points.loc[:, 'post_py'] = nr2
    # Keep only consecutive measurements whose 'py' values fall in the same
    # 10000-block (presumably the same survey period -- TODO confirm).
    data_points = data_points[data_points.post_py // 10000 == data_points.py // 10000]
    data_points.drop(['py', 'post_py'], axis=1, inplace=True)
    data_points.index = np.arange(len(data_points.index))
    data_points = data_points.loc[:, ['carb', 'growth']]
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    data_points = data_points.values.tolist()
    random.shuffle(data_points)
    # Floor division keeps the cutoff an int under Python 3 as well
    # (plain '/' was Python-2-only here and breaks slicing on Python 3).
    cutoff = 9 * len(data_points) // 10
    training = data_points[:cutoff]
    test = data_points[cutoff:]
    # Fit once instead of recomputing the regression for each coefficient.
    fit = stats.linregress(np.array(training))
    m = fit.slope
    n = fit.intercept
    sq_error = 0.0
    for carb, actual in test:
        predicted = m * carb + n
        sq_error += (actual - predicted) ** 2
    # Root mean squared error over the test split.
    return math.sqrt(sq_error / len(test))

def analyze_r02(state, human, time):
    """
    Revision 2: 10-fold cross-validation with simple linear regression.

    Parameters:
        state: two-letter state code used to locate the data CSV.
        human: if True read 'data/<state>_2a.csv', else 'data/<state>_2b.csv'.
        time:  if True the target is growth per period,
               (next carb - carb) / (next py - py); if False the target is
               simply the next carbon value.

    Returns the minimum RMSE observed across the ten folds.
    """
    if human:
        DATA_CSV = 'data/'+state+'_2a.csv'
    else:
        DATA_CSV = 'data/'+state+'_2b.csv'
    data_points = pd.read_csv(DATA_CSV)
    # Pair each row with the following measurement by shifting both columns
    # up one row and re-aligning the indices.
    nr1 = data_points['carb'].iloc[1:]
    nr2 = data_points['py'].iloc[1:]
    data_points = data_points.iloc[0:len(data_points.index)-1]
    nr1.index = np.arange(len(nr1.index))
    nr2.index = np.arange(len(nr2.index))
    if time:
        data_points.loc[:, 'growth'] = (nr1 - data_points['carb']) / (nr2 - data_points['py'])
    else:
        data_points.loc[:, 'growth'] = nr1
    data_points.loc[:, 'post_py'] = nr2
    # Keep only consecutive measurements whose 'py' values fall in the same
    # 10000-block (presumably the same survey period -- TODO confirm).
    data_points = data_points[data_points.post_py // 10000 == data_points.py // 10000]
    data_points.drop(['py', 'post_py'], axis=1, inplace=True)
    data_points.index = np.arange(len(data_points.index))
    data_points = data_points.loc[:, ['carb', 'growth']]
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    data_points = data_points.values.tolist()
    N_GROUPS = 10
    random.shuffle(data_points)
    # Partition the shuffled points into N_GROUPS contiguous folds; floor
    # division keeps the cutoffs ints under Python 3 as well.
    groups = []
    prev_cutoff = 0
    for i in range(N_GROUPS):
        next_cutoff = (i + 1) * len(data_points) // N_GROUPS
        groups.append(data_points[prev_cutoff:next_cutoff])
        prev_cutoff = next_cutoff
    min_rmse = float("inf")
    for i in range(N_GROUPS):
        test = groups[i]
        training = [point for j, group in enumerate(groups) if j != i
                    for point in group]
        # Fit once instead of recomputing the regression for each coefficient.
        fit = stats.linregress(np.array(training))
        m = fit.slope
        n = fit.intercept
        sq_error = 0.0
        for carb, actual in test:
            predicted = m * carb + n
            sq_error += (actual - predicted) ** 2
        rmse = math.sqrt(sq_error / len(test))
        if rmse < min_rmse:
            min_rmse = rmse
    return min_rmse

def analyze_r03(state, human, time, called_as_r04=False):
    """
    Revision 3: 10-fold cross-validation with multivariate linear regression
    on (carb, lat, lon).

    Parameters:
        state: two-letter state code used to locate the data CSV.
        human: if True read 'data/<state>_2a.csv', else 'data/<state>_2b.csv'.
        time:  if True the target is growth per period,
               (next carb - carb) / (next py - py); if False the target is
               simply the next carbon value.
        called_as_r04: if True (revision 4) keep only pairs whose 'py' gap
               equals the most common gap; otherwise keep pairs within the
               same 10000-block of 'py'.

    Returns the minimum RMSE observed across the ten folds.
    """
    if human:
        DATA_CSV = 'data/'+state+'_2a.csv'
    else:
        DATA_CSV = 'data/'+state+'_2b.csv'
    data_points = pd.read_csv(DATA_CSV)
    # Pair each row with the following measurement by shifting both columns
    # up one row and re-aligning the indices.
    nr1 = data_points['carb'].iloc[1:]
    nr2 = data_points['py'].iloc[1:]
    data_points = data_points.iloc[0:len(data_points.index)-1]
    nr1.index = np.arange(len(nr1.index))
    nr2.index = np.arange(len(nr2.index))
    if time:
        data_points.loc[:, 'growth'] = (nr1 - data_points['carb']) / (nr2 - data_points['py'])
    else:
        data_points.loc[:, 'growth'] = nr1
    data_points.loc[:, 'post_py'] = nr2
    if called_as_r04:
        # Most frequent 'py' gap, breaking ties by the smallest value --
        # same result as scipy.stats.mode, but immune to the scipy>=1.9
        # ModeResult API change that broke `.mode[0]` on plain lists.
        gaps = (data_points['post_py'] - data_points['py']).tolist()
        counts = Counter(gaps)
        top = max(counts.values())
        mode = min(g for g, c in counts.items() if c == top)
        data_points = data_points[data_points.post_py - data_points.py == mode]
    else:
        data_points = data_points[data_points.post_py // 10000 == data_points.py // 10000]
    data_points.drop(['py', 'post_py'], axis=1, inplace=True)
    data_points.index = np.arange(len(data_points.index))
    data_points = data_points.loc[:, ['carb', 'lat', 'lon', 'growth']]
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    # Reshape each row into a [features, target] pair.
    rows = data_points.values.tolist()
    data_points = [[row[:-1], row[-1]] for row in rows]
    N_GROUPS = 10
    random.shuffle(data_points)
    # Partition the shuffled points into N_GROUPS contiguous folds; floor
    # division keeps the cutoffs ints under Python 3 as well.
    groups = []
    prev_cutoff = 0
    for i in range(N_GROUPS):
        next_cutoff = (i + 1) * len(data_points) // N_GROUPS
        groups.append(data_points[prev_cutoff:next_cutoff])
        prev_cutoff = next_cutoff
    min_rmse = float("inf")
    for i in range(N_GROUPS):
        test = groups[i]
        training = [point for j, group in enumerate(groups) if j != i
                    for point in group]
        X = np.array([features for features, _ in training], dtype=float)
        y = np.array([target for _, target in training], dtype=float)
        # Ordinary least squares with an intercept column -- the same fit
        # sklearn.linear_model.LinearRegression computes, without needing
        # sklearn at all.
        design = np.hstack([X, np.ones((len(X), 1))])
        beta = np.linalg.lstsq(design, y, rcond=None)[0]
        coef = beta[:-1]
        cons = beta[-1]
        sq_error = 0.0
        for features, actual in test:
            predicted = float(np.dot(coef, features)) + cons
            sq_error += (actual - predicted) ** 2
        rmse = math.sqrt(sq_error / len(test))
        if rmse < min_rmse:
            min_rmse = rmse
    return min_rmse

def analyze_r04(state, human, time):
    """
    Revision 4: identical to revision 3 except that measurement pairs are
    filtered by the most common 'py' gap rather than the 10000-block rule.

    Returns the minimum RMSE observed across the cross-validation folds.
    """
    return analyze_r03(state, human, time, True)
105 changes: 105 additions & 0 deletions analyze_r01.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from scipy import stats\n",
"import seaborn as sns\n",
"import random\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def analyze_r01(state, human, time):\n",
" if human:\n",
" DATA_CSV = 'data/'+state+'_2a.csv'\n",
" else:\n",
" DATA_CSV = 'data/'+state+'_2b.csv'\n",
" data_points = pd.read_csv(DATA_CSV)\n",
" nr1 = data_points['carb']\n",
" nr1 = nr1.iloc[1:len(nr1)]\n",
" nr2 = data_points['py']\n",
" nr2 = nr2.iloc[1:len(nr2)]\n",
" data_points = data_points.iloc[0:len(data_points.index)-1]\n",
" nr1.index = np.arange(len(nr1.index))\n",
" nr2.index = np.arange(len(nr2.index))\n",
" if time:\n",
" data_points.loc[:,'growth'] = (nr1 - data_points['carb']) / (nr2 - data_points['py'])\n",
" else:\n",
" data_points.loc[:,'growth'] = nr1\n",
" data_points.loc[:,'post_py'] = nr2\n",
" data_points = data_points[data_points.post_py // 10000 == data_points.py // 10000]\n",
" data_points.drop(['py','post_py'], axis=1, inplace=True)\n",
" data_points.index = np.arange(len(data_points.index))\n",
" data_points = data_points.loc[:,['carb','growth']]\n",
" training = []\n",
" test = []\n",
" prob = 10\n",
" for i, point in data_points.iterrows():\n",
" if random.randint(1, prob) == 1:\n",
" prob = 10\n",
" test.append(point)\n",
" else:\n",
" prob -= 1\n",
" training.append(point)\n",
" training = np.array(training)\n",
" m = stats.linregress(training).slope\n",
" n = stats.linregress(training).intercept\n",
" sq_error = 0.0\n",
" for elem in test:\n",
" predicted = m * elem[0] + n\n",
" actual = elem[1]\n",
" sq_error += (actual - predicted) ** 2\n",
" mse = math.sqrt(sq_error/len(test))\n",
" return mse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"STATE = 'ME'\n",
"print analyze_r01(STATE, True, True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading

0 comments on commit a195621

Please sign in to comment.