
Commit

mode in tfms
jph00 committed Mar 7, 2018
2 parents 5bf1f43 + 6d0e116 commit 3ff8bed
Showing 16 changed files with 183 additions and 38 deletions.
2 changes: 1 addition & 1 deletion courses/dl1/lesson1.ipynb
@@ -1100,7 +1100,7 @@
     "hidden": true
    },
    "source": [
-    "If you try training for more epochs, you'll notice that we start to *overfit*, which means that our model is learning to recognize the specific images in the training set, rather than generalizaing such that we also get good results on the validation set. One way to fix this is to effectively create more data, through *data augmentation*. This refers to randomly changing the images in ways that shouldn't impact their interpretation, such as horizontal flipping, zooming, and rotating.\n",
+    "If you try training for more epochs, you'll notice that we start to *overfit*, which means that our model is learning to recognize the specific images in the training set, rather than generalizing such that we also get good results on the validation set. One way to fix this is to effectively create more data, through *data augmentation*. This refers to randomly changing the images in ways that shouldn't impact their interpretation, such as horizontal flipping, zooming, and rotating.\n",
     "\n",
     "We can do this by passing `aug_tfms` (*augmentation transforms*) to `tfms_from_model`, with a list of functions to apply that randomly change the image however we wish. For photos that are largely taken from the side (e.g. most photos of dogs and cats, as opposed to photos taken from the top down, such as satellite imagery) we can use the pre-defined list of functions `transforms_side_on`. We can also specify random zooming of images up to specified scale by adding the `max_zoom` parameter."
    ]
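A minimal sketch of how these pieces fit together in the fastai 0.7-era API (the dataset path and `sz` here are illustrative placeholders, not part of the notebook cell above):

    from fastai.conv_learner import *

    PATH = 'data/dogscats/'  # placeholder dataset path
    sz = 224
    # side-on flips plus random zooms of up to 1.1x
    tfms = tfms_from_model(resnet34, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
    data = ImageClassifierData.from_paths(PATH, tfms=tfms)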
1 change: 1 addition & 0 deletions courses/dl1/lesson2-image_models.ipynb
@@ -56,6 +56,7 @@
     "os.makedirs('/cache/planet/tmp', exist_ok=True)\n",
     "\n",
     "!ln -s /datasets/kaggle/planet-understanding-the-amazon-from-space/train-jpg {PATH}\n",
+    "!ln -s /datasets/kaggle/planet-understanding-the-amazon-from-space/test-jpg {PATH}\n",
     "!ln -s /datasets/kaggle/planet-understanding-the-amazon-from-space/train_v2.csv {PATH}\n",
     "!ln -s /cache/planet/tmp {PATH}"
    ]
2 changes: 1 addition & 1 deletion courses/dl1/lesson7-cifar10.ipynb
@@ -63,7 +63,7 @@
    "outputs": [],
    "source": [
     "def get_data(sz,bs):\n",
-    "    tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomFlipXY()], pad=sz//8)\n",
+    "    tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomFlip()], pad=sz//8)\n",
     "    return ImageClassifierData.from_paths(PATH, val_name='test', tfms=tfms, bs=bs)"
    ]
   },
2 changes: 1 addition & 1 deletion courses/ml1/Ethics in Data Science.ipynb
@@ -20,7 +20,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Not everything your employer asks you to do may be legal.** An engineer at Volkswagon was [sentenced to 3.5 years in prison](https://www.nytimes.com/2017/08/25/business/volkswagen-engineer-prison-diesel-cheating.html) for helping develop the software to cheat on federal emissions tests. Your boss asking you to do something is not an excuse that will protect you in court."
+    "**Not everything your employer asks you to do may be legal.** An engineer at Volkswagen was [sentenced to 3.5 years in prison](https://www.nytimes.com/2017/08/25/business/volkswagen-engineer-prison-diesel-cheating.html) for helping develop the software to cheat on federal emissions tests. Your boss asking you to do something is not an excuse that will protect you in court."
    ]
   },
   {
11 changes: 5 additions & 6 deletions courses/ml1/lesson1-rf.ipynb
@@ -437,9 +437,8 @@
    "outputs": [],
    "source": [
     "def display_all(df):\n",
-    "    with pd.option_context(\"display.max_rows\", 1000): \n",
-    "        with pd.option_context(\"display.max_columns\", 1000): \n",
-    "            display(df)"
+    "    with pd.option_context(\"display.max_rows\", 1000, \"display.max_columns\", 1000): \n",
+    "        display(df)"
    ]
   },
   {
@@ -1189,7 +1188,7 @@
    }
   ],
   "source": [
-    "display_all(df_raw.tail().transpose())"
+    "display_all(df_raw.tail().T)"
   ]
  },
  {
@@ -2371,7 +2370,7 @@
    }
   ],
   "source": [
-    "display_all(df_raw.describe(include='all').transpose())"
+    "display_all(df_raw.describe(include='all').T)"
   ]
  },
  {
@@ -3491,7 +3490,7 @@
   },
   "outputs": [],
   "source": [
-    "df_trn, y_trn = proc_df(df_raw, 'SalePrice')\n",
+    "df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')\n",
     "X_train, X_valid = split_vals(df_trn, n_trn)\n",
     "y_train, y_valid = split_vals(y_trn, n_trn)"
   ]
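For context: `proc_df` now also returns the dict of NA fill values it recorded (`nas`). A hedged sketch of the intended round-trip, assuming the `na_dict` parameter of the fastai 0.7 structured-data API (`df_raw_test` is a hypothetical test-set dataframe):

    df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
    # reuse the recorded fill values so the test set gets identical NA handling
    df_test, _, _ = proc_df(df_raw_test, 'SalePrice', na_dict=nas)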
2 changes: 1 addition & 1 deletion courses/ml1/lesson2-rf_interpretation.ipynb
@@ -3434,7 +3434,7 @@
     "df_ext = df_keep.copy()\n",
     "df_ext['is_valid'] = 1\n",
     "df_ext.is_valid[:n_trn] = 0\n",
-    "x, y = proc_df(df_ext, 'is_valid')"
+    "x, y, nas = proc_df(df_ext, 'is_valid')"
    ]
   },
   {
2 changes: 1 addition & 1 deletion fastai/core.py
@@ -43,7 +43,7 @@ def to_gpu(x, *args, **kwargs):
 def noop(*args, **kwargs): return

 def split_by_idxs(seq, idxs):
-    last, sl = 0, len(seq)
+    last = 0
     for idx in idxs:
         yield seq[last:idx]
         last = idx
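The visible part of the generator yields successive slices of `seq` split at each index; the rest of the function is not shown in this diff, so the sketch below only covers the lines above:

    gen = split_by_idxs(list(range(6)), [2, 4])
    next(gen)  # [0, 1]
    next(gen)  # [2, 3]
    # the truncated remainder presumably yields the final slice, [4, 5]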
93 changes: 83 additions & 10 deletions fastai/dataset.py
@@ -1,3 +1,5 @@
+import csv
+
 from .imports import *
 from .torch_imports import *
 from .core import *
@@ -41,12 +43,50 @@ def read_dirs(path, folder):
     labels, filenames, all_labels = [], [], []
     full_path = os.path.join(path, folder)
     for label in sorted(os.listdir(full_path)):
-        all_labels.append(label)
-        for fname in os.listdir(os.path.join(full_path, label)):
-            filenames.append(os.path.join(folder, label, fname))
-            labels.append(label)
+        if label not in ('.ipynb_checkpoints',):
+            all_labels.append(label)
+            for fname in os.listdir(os.path.join(full_path, label)):
+                filenames.append(os.path.join(folder, label, fname))
+                labels.append(label)
     return filenames, labels, all_labels

+def create_sample(path, r):
+    """ Takes a path to a dataset and creates a sample of specified size at <path>_sample
+
+    Parameters:
+    -----------
+    path: dataset path
+    r (float): proportion of examples to use as sample, in the range from 0 to 1
+    """
+    sample_path = path + '_sample'
+    shutil.rmtree(sample_path, ignore_errors=True)
+    subdirs = [os.path.split(p)[1] for p in glob(os.path.join(path, '*'))]
+    copy_or_move_with_subdirs(subdirs, path, sample_path, r, move=False)
+
+def create_val(path, r):
+    """ Takes a path to a dataset and creates a validation set of specified size
+
+    Note - this changes the dataset at <path> by moving files to the val set
+
+    Parameters:
+    -----------
+    path: dataset path
+    r (float): proportion of examples to use for validation, in the range from 0 to 1
+    """
+    val_path = os.path.join(os.path.split(path)[0], 'valid')
+    subdirs = [os.path.split(p)[1] for p in glob(os.path.join(path, '*'))]
+    copy_or_move_with_subdirs(subdirs, path, val_path, r, move=True)
+
+def copy_or_move_with_subdirs(subdir_lst, src, dst, r, move=False):
+    do = shutil.move if move else shutil.copy
+    for subdir in subdir_lst:
+        os.makedirs(os.path.join(dst, subdir))
+        files = glob(os.path.join(src, subdir, '*'))
+        np.random.shuffle(files)
+        for f in files[:int(len(files) * r)]:
+            do(f, os.path.join(dst, subdir, os.path.split(f)[1]))
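A usage sketch for the three helpers above (paths hypothetical):

    # copy 10% of each class sub-folder into data/dogscats/train_sample
    create_sample('data/dogscats/train', 0.1)
    # move 20% of each class sub-folder into the sibling data/dogscats/valid
    create_val('data/dogscats/train', 0.2)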

 def n_hot(ids, c):
     res = np.zeros((c,), dtype=np.float32)
     res[ids] = 1
@@ -61,8 +101,33 @@ def folder_source(path, folder):
     return fnames, label_arr, all_labels

 def parse_csv_labels(fn, skip_header=True):
-    skip = 1 if skip_header else 0
-    csv_lines = [o.strip().split(',') for o in open(fn)][skip:]
+    """Parse filenames and label sets from a CSV file.
+
+    This method expects that the csv file at path :fn: has two columns. If it
+    has a header, :skip_header: should be set to True. The labels in the
+    label set are expected to be space separated.
+
+    Arguments:
+        fn: Path to a CSV file.
+        skip_header: A boolean flag indicating whether to skip the header.
+
+    Returns:
+        a two-tuple of (
+            sorted filenames,
+            a dictionary mapping each filename to its list of labels
+        )
+    """
+    with open(fn) as fileobj:
+        reader = csv.reader(fileobj)
+        if skip_header:
+            next(reader)
+        csv_lines = [l for l in reader]
+
     fnames = [fname for fname, _ in csv_lines]
     csv_labels = {a:b.split(' ') for a,b in csv_lines}
     return sorted(fnames), csv_labels
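For example, given a hypothetical labels.csv:

    id,tags
    img_001.jpg,clear primary
    img_002.jpg,haze

parse_csv_labels('labels.csv') returns (['img_001.jpg', 'img_002.jpg'], {'img_001.jpg': ['clear', 'primary'], 'img_002.jpg': ['haze']}).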
@@ -273,7 +338,11 @@ def get_ds(fn, trn, val, tfms, test=None, **kwargs):
         fn(val[0], val[1], tfms[0], **kwargs) # aug
     ]
     if test is not None:
-        test_lbls = np.zeros((len(test),1))
+        if isinstance(test, tuple):
+            test_lbls = test[1]
+            test = test[0]
+        else:
+            test_lbls = np.zeros((len(test),1))
         res += [
             fn(test, test_lbls, tfms[1], **kwargs), # test
             fn(test, test_lbls, tfms[0], **kwargs)  # test_aug
@@ -308,7 +377,7 @@ def from_arrays(cls, path, trn, val, bs=64, tfms=(None,None), classes=None, num_
         return cls(path, datasets, bs, num_workers, classes=classes)

     @classmethod
-    def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='valid', test_name=None, num_workers=8):
+    def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='valid', test_name=None, test_with_labels=False, num_workers=8):
         """ Read in images and their labels given as sub-folder names

         Arguments:
@@ -323,9 +392,13 @@ def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='v
         Returns:
             ImageClassifierData
         """
+        assert isinstance(tfms[0], Transforms) and isinstance(tfms[1], Transforms), \
+            "please provide transformations for your train and validation sets"
         trn,val = [folder_source(path, o) for o in (trn_name, val_name)]
-        test_fnames = read_dir(path, test_name) if test_name else None
-        datasets = cls.get_ds(FilesIndexArrayDataset, trn, val, tfms, path=path, test=test_fnames)
+        if test_name:
+            test = folder_source(path, test_name) if test_with_labels else read_dir(path, test_name)
+        else: test = None
+        datasets = cls.get_ds(FilesIndexArrayDataset, trn, val, tfms, path=path, test=test)
         return cls(path, datasets, bs, num_workers, classes=trn[2])

@classmethod
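A sketch of the new labelled-test-set path through from_paths (directory layout hypothetical; assumes test/ is organized into class sub-folders like train/ and valid/):

    tfms = tfms_from_model(resnet34, 224)
    data = ImageClassifierData.from_paths('data/dogscats', tfms=tfms,
                                          test_name='test', test_with_labels=True)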
12 changes: 9 additions & 3 deletions fastai/learner.py
@@ -26,8 +26,8 @@ def __init__(self, data, models, opt_fn=None, tmp_name='tmp', models_name='model
         self.crit,self.reg_fn = None,None

     @classmethod
-    def from_model_data(cls, m, data):
-        self = cls(data, BasicModel(to_gpu(m)))
+    def from_model_data(cls, m, data, **kwargs):
+        self = cls(data, BasicModel(to_gpu(m)), **kwargs)
         self.unfreeze()
         return self
@@ -72,7 +72,7 @@ def get_cycle_end(self, name):
     def save_cycle(self, name, cycle): self.save(f'{name}_cyc_{cycle}')
     def load_cycle(self, name, cycle): self.load(f'{name}_cyc_{cycle}')

-    def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1, cycle_save_name=None,
+    def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1, cycle_save_name=None, best_save_name=None,
                 use_clr=None, metrics=None, callbacks=None, use_wd_sched=False, norm_wds=False, wds_sched_mult=None, **kwargs):

         """Method does some preparation before finally delegating to the 'fit' method for
@@ -101,6 +101,8 @@ def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1,
             https://github.com/fastai/fastai/blob/master/courses/dl1/lesson1.ipynb

             cycle_save_name (str): use to save the weights at end of each cycle
+
+            best_save_name (str): use to save weights of best model during training.

             metrics (function): some function for evaluating a desired metric. Eg. accuracy.
@@ -151,6 +153,10 @@ def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1,
             self.sched = CosAnneal(layer_opt, cycle_batches, on_cycle_end=cycle_end, cycle_mult=cycle_mult)
         elif not self.sched: self.sched=LossRecorder(layer_opt)
         callbacks+=[self.sched]
+
+        if best_save_name is not None:
+            callbacks+=[SaveBestModel(self, layer_opt, best_save_name)]
+
         n_epoch = sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle)
         return fit(model, data, n_epoch, layer_opt.opt, self.crit,
                    metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, **kwargs)
4 changes: 2 additions & 2 deletions fastai/model.py
@@ -61,7 +61,7 @@ def set_train_mode(m):
     else: m.train()


-def fit(model, data, epochs, opt, crit, metrics=None, callbacks=None, **kwargs):
+def fit(model, data, epochs, opt, crit, metrics=None, callbacks=None, stepper=Stepper, **kwargs):
     """ Fits a model

     Arguments:
@@ -72,7 +72,7 @@ def fit(model, data, epochs, opt, crit, metrics=None, callbacks=None, **kwargs):
        epochs(int): number of epochs
        crit: loss function to optimize. Example: F.cross_entropy
     """
-    stepper = Stepper(model, opt, crit, **kwargs)
+    stepper = stepper(model, opt, crit, **kwargs)
     metrics = metrics or []
     callbacks = callbacks or []
     avg_mom=0.98
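The new stepper argument lets callers substitute a custom training-step class. A hedged sketch, assuming Stepper exposes the step(xs, y, epoch) method that fit calls per batch (LoggingStepper is hypothetical):

    class LoggingStepper(Stepper):
        def step(self, xs, y, epoch):
            # delegate the forward/backward/update to the stock Stepper
            loss = super().step(xs, y, epoch)
            print(f'batch loss: {loss:.4f}')
            return loss

    fit(model, data, epochs, opt, crit, stepper=LoggingStepper)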
2 changes: 1 addition & 1 deletion fastai/nlp.py
@@ -312,7 +312,7 @@ class TextDataLoader():
     def __init__(self, src, x_fld, y_fld):
         self.src,self.x_fld,self.y_fld = src,x_fld,y_fld

-    def __len__(self): return len(self.src)-1
+    def __len__(self): return len(self.src)

     def __iter__(self):
         it = iter(self.src)
4 changes: 2 additions & 2 deletions fastai/plots.py
@@ -39,13 +39,13 @@ def plots_from_files(imspaths, figsize=(10,5), rows=1, titles=None, maintitle=No
         plt.imshow(img)


-def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
+def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues, figsize=None):
     """
     This function prints and plots the confusion matrix.
     Normalization can be applied by setting `normalize=True`.
     (This function is copied from the scikit docs.)
     """
-    plt.figure()
+    plt.figure(figsize=figsize)
     plt.imshow(cm, interpolation='nearest', cmap=cmap)
     plt.title(title)
     plt.colorbar()
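Callers can now size the plot directly, e.g. plot_confusion_matrix(cm, classes, figsize=(8, 8)); the default figsize=None keeps the old behaviour, since plt.figure(figsize=None) falls back to matplotlib's rcParams default.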
66 changes: 65 additions & 1 deletion fastai/sgdr.py
@@ -9,7 +9,33 @@ def on_batch_begin(self): pass
     def on_epoch_end(self, metrics): pass
     def on_batch_end(self, metrics): pass
     def on_train_end(self): pass
+
+
+# Useful for maintaining status of a long-running job.
+#
+# Usage:
+#   learn.fit(0.01, 1, callbacks = [LoggingCallback(save_path="/tmp/log")])
+class LoggingCallback(Callback):
+    def __init__(self, save_path):
+        super().__init__()
+        self.save_path=save_path
+    def on_train_begin(self):
+        self.batch = 0
+        self.epoch = 0
+        self.f = open(self.save_path, "a", 1)
+        self.log("\ton_train_begin")
+    def on_batch_begin(self):
+        self.log(str(self.batch)+"\ton_batch_begin")
+    def on_epoch_end(self, metrics):
+        self.log(str(self.epoch)+"\ton_epoch_end: "+str(metrics))
+        self.epoch += 1
+    def on_batch_end(self, metrics):
+        self.log(str(self.batch)+"\ton_batch_end: "+str(metrics))
+        self.batch += 1
+    def on_train_end(self):
+        self.log("\ton_train_end")
+        self.f.close()
+    def log(self, string):
+        self.f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+"\t"+string+"\n")

 class LossRecorder(Callback):
     def __init__(self, layer_opt, save_path=''):
@@ -143,6 +169,44 @@ def calc_lr(self, init_lrs):
         return res


+class SaveBestModel(LossRecorder):
+    """ Save weights of the model with
+        the best accuracy during training.
+
+        Args:
+            model: the fastai model
+            layer_opt: the layer optimizer from the learner
+            name: the filename (without '.h5') to save the weights to
+
+        Usage:
+            Briefly, you have your model 'learn' variable and call fit.
+            >>> learn.fit(lr, 2, cycle_len=2, cycle_mult=1, best_save_name='mybestmodel')
+            ....
+            >>> learn.load('mybestmodel')
+
+            For more details see http://forums.fast.ai/t/a-code-snippet-to-save-the-best-model-during-training/12066
+    """
+    def __init__(self, model, layer_opt, name='best_model'):
+        super().__init__(layer_opt)
+        self.name = name
+        self.model = model
+        self.best_loss = None
+        self.best_acc = None
+
+    def on_epoch_end(self, metrics):
+        super().on_epoch_end(metrics)
+        loss, acc = metrics
+        if self.best_acc is None or acc > self.best_acc:
+            self.best_acc = acc
+            self.best_loss = loss
+            self.model.save(f'{self.name}')
+        elif acc == self.best_acc and loss < self.best_loss:
+            self.best_loss = loss
+            self.model.save(f'{self.name}')

 class WeightDecaySchedule(Callback):
     def __init__(self, layer_opt, batch_per_epoch, cycle_len, cycle_mult, n_cycles, norm_wds=False, wds_sched_mult=None):
         """
2 changes: 1 addition & 1 deletion fastai/structured.py
@@ -67,8 +67,8 @@ def get_sample(df,n):

     >>> get_sample(df, 2)
        col1 col2
-    2    3    a
     1    2    b
+    2    3    a
     """
     idxs = sorted(np.random.permutation(len(df))[:n])
     return df.iloc[idxs].copy()
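The example output changed because get_sample sorts the sampled indices before indexing, so the returned rows always appear in their original dataframe order.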