
Commit

mode in tfms
jph00 committed Mar 7, 2018
2 parents 5bf1f43 + 6d0e116 commit 3ff8bed
Showing 16 changed files with 183 additions and 38 deletions.
2 changes: 1 addition & 1 deletion courses/dl1/lesson1.ipynb
@@ -1100,7 +1100,7 @@
     "hidden": true
    },
    "source": [
-    "If you try training for more epochs, you'll notice that we start to *overfit*, which means that our model is learning to recognize the specific images in the training set, rather than generalizaing such that we also get good results on the validation set. One way to fix this is to effectively create more data, through *data augmentation*. This refers to randomly changing the images in ways that shouldn't impact their interpretation, such as horizontal flipping, zooming, and rotating.\n",
+    "If you try training for more epochs, you'll notice that we start to *overfit*, which means that our model is learning to recognize the specific images in the training set, rather than generalizing such that we also get good results on the validation set. One way to fix this is to effectively create more data, through *data augmentation*. This refers to randomly changing the images in ways that shouldn't impact their interpretation, such as horizontal flipping, zooming, and rotating.\n",
     "\n",
     "We can do this by passing `aug_tfms` (*augmentation transforms*) to `tfms_from_model`, with a list of functions to apply that randomly change the image however we wish. For photos that are largely taken from the side (e.g. most photos of dogs and cats, as opposed to photos taken from the top down, such as satellite imagery) we can use the pre-defined list of functions `transforms_side_on`. We can also specify random zooming of images up to specified scale by adding the `max_zoom` parameter."
    ]
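A minimal sketch of how these pieces fit together in the fastai 0.7-era API (the dataset path and `sz` here are illustrative placeholders, not part of the notebook cell above):

    from fastai.conv_learner import *

    PATH = 'data/dogscats/'  # placeholder dataset path
    sz = 224
    # side-on flips plus random zooms of up to 1.1x
    tfms = tfms_from_model(resnet34, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
    data = ImageClassifierData.from_paths(PATH, tfms=tfms)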
1 change: 1 addition & 0 deletions courses/dl1/lesson2-image_models.ipynb
@@ -56,6 +56,7 @@
     "os.makedirs('/cache/planet/tmp', exist_ok=True)\n",
     "\n",
     "!ln -s /datasets/kaggle/planet-understanding-the-amazon-from-space/train-jpg {PATH}\n",
+    "!ln -s /datasets/kaggle/planet-understanding-the-amazon-from-space/test-jpg {PATH}\n",
     "!ln -s /datasets/kaggle/planet-understanding-the-amazon-from-space/train_v2.csv {PATH}\n",
     "!ln -s /cache/planet/tmp {PATH}"
    ]
2 changes: 1 addition & 1 deletion courses/dl1/lesson7-cifar10.ipynb
@@ -63,7 +63,7 @@
    "outputs": [],
    "source": [
     "def get_data(sz,bs):\n",
-    "    tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomFlipXY()], pad=sz//8)\n",
+    "    tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomFlip()], pad=sz//8)\n",
     "    return ImageClassifierData.from_paths(PATH, val_name='test', tfms=tfms, bs=bs)"
    ]
   },
2 changes: 1 addition & 1 deletion courses/ml1/Ethics in Data Science.ipynb
@@ -20,7 +20,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Not everything your employer asks you to do may be legal.** An engineer at Volkswagon was [sentenced to 3.5 years in prison](https://www.nytimes.com/2017/08/25/business/volkswagen-engineer-prison-diesel-cheating.html) for helping develop the software to cheat on federal emissions tests. Your boss asking you to do something is not an excuse that will protect you in court."
+    "**Not everything your employer asks you to do may be legal.** An engineer at Volkswagen was [sentenced to 3.5 years in prison](https://www.nytimes.com/2017/08/25/business/volkswagen-engineer-prison-diesel-cheating.html) for helping develop the software to cheat on federal emissions tests. Your boss asking you to do something is not an excuse that will protect you in court."
    ]
   },
   {
11 changes: 5 additions & 6 deletions courses/ml1/lesson1-rf.ipynb
@@ -437,9 +437,8 @@
    "outputs": [],
    "source": [
     "def display_all(df):\n",
-    "    with pd.option_context(\"display.max_rows\", 1000): \n",
-    "        with pd.option_context(\"display.max_columns\", 1000): \n",
-    "            display(df)"
+    "    with pd.option_context(\"display.max_rows\", 1000, \"display.max_columns\", 1000): \n",
+    "        display(df)"
    ]
   },
   {
@@ -1189,7 +1188,7 @@
    }
   ],
   "source": [
-    "display_all(df_raw.tail().transpose())"
+    "display_all(df_raw.tail().T)"
   ]
  },
  {
@@ -2371,7 +2370,7 @@
    }
   ],
   "source": [
-    "display_all(df_raw.describe(include='all').transpose())"
+    "display_all(df_raw.describe(include='all').T)"
   ]
  },
  {
@@ -3491,7 +3490,7 @@
   },
   "outputs": [],
   "source": [
-    "df_trn, y_trn = proc_df(df_raw, 'SalePrice')\n",
+    "df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')\n",
     "X_train, X_valid = split_vals(df_trn, n_trn)\n",
     "y_train, y_valid = split_vals(y_trn, n_trn)"
   ]
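For context: `proc_df` now also returns the dict of NA fill values it recorded (`nas`). A hedged sketch of the intended round-trip, assuming the `na_dict` parameter of the fastai 0.7 structured-data API (`df_raw_test` is a hypothetical test-set dataframe):

    df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
    # reuse the recorded fill values so the test set gets identical NA handling
    df_test, _, _ = proc_df(df_raw_test, 'SalePrice', na_dict=nas)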
2 changes: 1 addition & 1 deletion courses/ml1/lesson2-rf_interpretation.ipynb
@@ -3434,7 +3434,7 @@
     "df_ext = df_keep.copy()\n",
     "df_ext['is_valid'] = 1\n",
     "df_ext.is_valid[:n_trn] = 0\n",
-    "x, y = proc_df(df_ext, 'is_valid')"
+    "x, y, nas = proc_df(df_ext, 'is_valid')"
    ]
   },
   {
2 changes: 1 addition & 1 deletion fastai/core.py
@@ -43,7 +43,7 @@ def to_gpu(x, *args, **kwargs):
 def noop(*args, **kwargs): return

 def split_by_idxs(seq, idxs):
-    last, sl = 0, len(seq)
+    last = 0
     for idx in idxs:
         yield seq[last:idx]
         last = idx
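The visible part of the generator yields successive slices of `seq` split at each index; the rest of the function is not shown in this diff, so the sketch below only covers the lines above:

    gen = split_by_idxs(list(range(6)), [2, 4])
    next(gen)  # [0, 1]
    next(gen)  # [2, 3]
    # the truncated remainder presumably yields the final slice, [4, 5]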
93 changes: 83 additions & 10 deletions fastai/dataset.py
@@ -1,3 +1,5 @@
+import csv
+
 from .imports import *
 from .torch_imports import *
 from .core import *
@@ -41,12 +43,50 @@ def read_dirs(path, folder):
     labels, filenames, all_labels = [], [], []
     full_path = os.path.join(path, folder)
     for label in sorted(os.listdir(full_path)):
-        all_labels.append(label)
-        for fname in os.listdir(os.path.join(full_path, label)):
-            filenames.append(os.path.join(folder, label, fname))
-            labels.append(label)
+        if label not in ('.ipynb_checkpoints',):
+            all_labels.append(label)
+            for fname in os.listdir(os.path.join(full_path, label)):
+                filenames.append(os.path.join(folder, label, fname))
+                labels.append(label)
     return filenames, labels, all_labels

+def create_sample(path, r):
+    """ Takes a path to a dataset and creates a sample of specified size at <path>_sample
+
+    Parameters:
+    -----------
+    path: dataset path
+    r (float): proportion of examples to use as sample, in the range from 0 to 1
+    """
+    sample_path = path + '_sample'
+    shutil.rmtree(sample_path, ignore_errors=True)
+    subdirs = [os.path.split(p)[1] for p in glob(os.path.join(path, '*'))]
+    copy_or_move_with_subdirs(subdirs, path, sample_path, r, move=False)
+
+def create_val(path, r):
+    """ Takes a path to a dataset and creates a validation set of specified size
+
+    Note - this changes the dataset at <path> by moving files to the val set
+
+    Parameters:
+    -----------
+    path: dataset path
+    r (float): proportion of examples to use for validation, in the range from 0 to 1
+    """
+    val_path = os.path.join(os.path.split(path)[0], 'valid')
+    subdirs = [os.path.split(p)[1] for p in glob(os.path.join(path, '*'))]
+    copy_or_move_with_subdirs(subdirs, path, val_path, r, move=True)
+
+def copy_or_move_with_subdirs(subdir_lst, src, dst, r, move=False):
+    do = shutil.move if move else shutil.copy
+    for subdir in subdir_lst:
+        os.makedirs(os.path.join(dst, subdir))
+        files = glob(os.path.join(src, subdir, '*'))
+        np.random.shuffle(files)
+        for f in files[:int(len(files) * r)]:
+            do(f, os.path.join(dst, subdir, os.path.split(f)[1]))
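A usage sketch for the three helpers above (paths hypothetical):

    # copy 10% of each class sub-folder into data/dogscats/train_sample
    create_sample('data/dogscats/train', 0.1)
    # move 20% of each class sub-folder into the sibling data/dogscats/valid
    create_val('data/dogscats/train', 0.2)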

 def n_hot(ids, c):
     res = np.zeros((c,), dtype=np.float32)
     res[ids] = 1
@@ -61,8 +101,33 @@ def folder_source(path, folder):
     return fnames, label_arr, all_labels

 def parse_csv_labels(fn, skip_header=True):
-    skip = 1 if skip_header else 0
-    csv_lines = [o.strip().split(',') for o in open(fn)][skip:]
+    """Parse filenames and label sets from a CSV file.
+
+    This method expects that the csv file at path :fn: has two columns. If it
+    has a header, :skip_header: should be set to True. The labels in the
+    label set are expected to be space separated.
+
+    Arguments:
+        fn: Path to a CSV file.
+        skip_header: A boolean flag indicating whether to skip the header.
+
+    Returns:
+        a two-tuple of (
+            sorted filenames,
+            a dictionary mapping each filename to its list of labels
+        )
+    """
+    with open(fn) as fileobj:
+        reader = csv.reader(fileobj)
+        if skip_header:
+            next(reader)
+        csv_lines = [l for l in reader]
+
     fnames = [fname for fname, _ in csv_lines]
     csv_labels = {a:b.split(' ') for a,b in csv_lines}
     return sorted(fnames), csv_labels
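For example, given a hypothetical labels.csv:

    id,tags
    img_001.jpg,clear primary
    img_002.jpg,haze

parse_csv_labels('labels.csv') returns (['img_001.jpg', 'img_002.jpg'], {'img_001.jpg': ['clear', 'primary'], 'img_002.jpg': ['haze']}).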
@@ -273,7 +338,11 @@ def get_ds(fn, trn, val, tfms, test=None, **kwargs):
         fn(val[0], val[1], tfms[0], **kwargs) # aug
     ]
     if test is not None:
-        test_lbls = np.zeros((len(test),1))
+        if isinstance(test, tuple):
+            test_lbls = test[1]
+            test = test[0]
+        else:
+            test_lbls = np.zeros((len(test),1))
         res += [
             fn(test, test_lbls, tfms[1], **kwargs), # test
             fn(test, test_lbls, tfms[0], **kwargs)  # test_aug
@@ -308,7 +377,7 @@ def from_arrays(cls, path, trn, val, bs=64, tfms=(None,None), classes=None, num_
         return cls(path, datasets, bs, num_workers, classes=classes)

     @classmethod
-    def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='valid', test_name=None, num_workers=8):
+    def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='valid', test_name=None, test_with_labels=False, num_workers=8):
         """ Read in images and their labels given as sub-folder names

         Arguments:
@@ -323,9 +392,13 @@ def from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='v
         Returns:
             ImageClassifierData
         """
+        assert isinstance(tfms[0], Transforms) and isinstance(tfms[1], Transforms), \
+            "please provide transformations for your train and validation sets"
         trn,val = [folder_source(path, o) for o in (trn_name, val_name)]
-        test_fnames = read_dir(path, test_name) if test_name else None
-        datasets = cls.get_ds(FilesIndexArrayDataset, trn, val, tfms, path=path, test=test_fnames)
+        if test_name:
+            test = folder_source(path, test_name) if test_with_labels else read_dir(path, test_name)
+        else: test = None
+        datasets = cls.get_ds(FilesIndexArrayDataset, trn, val, tfms, path=path, test=test)
         return cls(path, datasets, bs, num_workers, classes=trn[2])

@classmethod
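A sketch of the new labelled-test-set path through from_paths (directory layout hypothetical; assumes test/ is organized into class sub-folders like train/ and valid/):

    tfms = tfms_from_model(resnet34, 224)
    data = ImageClassifierData.from_paths('data/dogscats', tfms=tfms,
                                          test_name='test', test_with_labels=True)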
12 changes: 9 additions & 3 deletions fastai/learner.py
@@ -26,8 +26,8 @@ def __init__(self, data, models, opt_fn=None, tmp_name='tmp', models_name='model
         self.crit,self.reg_fn = None,None

     @classmethod
-    def from_model_data(cls, m, data):
-        self = cls(data, BasicModel(to_gpu(m)))
+    def from_model_data(cls, m, data, **kwargs):
+        self = cls(data, BasicModel(to_gpu(m)), **kwargs)
         self.unfreeze()
         return self
@@ -72,7 +72,7 @@ def get_cycle_end(self, name):
     def save_cycle(self, name, cycle): self.save(f'{name}_cyc_{cycle}')
     def load_cycle(self, name, cycle): self.load(f'{name}_cyc_{cycle}')

-    def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1, cycle_save_name=None,
+    def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1, cycle_save_name=None, best_save_name=None,
                 use_clr=None, metrics=None, callbacks=None, use_wd_sched=False, norm_wds=False, wds_sched_mult=None, **kwargs):

         """Method does some preparation before finally delegating to the 'fit' method for
@@ -101,6 +101,8 @@ def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1,
             https://github.com/fastai/fastai/blob/master/courses/dl1/lesson1.ipynb

             cycle_save_name (str): use to save the weights at end of each cycle
+
+            best_save_name (str): use to save weights of best model during training.

             metrics (function): some function for evaluating a desired metric. Eg. accuracy.
@@ -151,6 +153,10 @@ def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1,
             self.sched = CosAnneal(layer_opt, cycle_batches, on_cycle_end=cycle_end, cycle_mult=cycle_mult)
         elif not self.sched: self.sched=LossRecorder(layer_opt)
         callbacks+=[self.sched]
+
+        if best_save_name is not None:
+            callbacks+=[SaveBestModel(self, layer_opt, best_save_name)]
+
         n_epoch = sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle)
         return fit(model, data, n_epoch, layer_opt.opt, self.crit,
                    metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, **kwargs)
4 changes: 2 additions & 2 deletions fastai/model.py
@@ -61,7 +61,7 @@ def set_train_mode(m):
     else: m.train()


-def fit(model, data, epochs, opt, crit, metrics=None, callbacks=None, **kwargs):
+def fit(model, data, epochs, opt, crit, metrics=None, callbacks=None, stepper=Stepper, **kwargs):
     """ Fits a model

     Arguments:
@@ -72,7 +72,7 @@ def fit(model, data, epochs, opt, crit, metrics=None, callbacks=None, **kwargs):
        epochs(int): number of epochs
        crit: loss function to optimize. Example: F.cross_entropy
     """
-    stepper = Stepper(model, opt, crit, **kwargs)
+    stepper = stepper(model, opt, crit, **kwargs)
     metrics = metrics or []
     callbacks = callbacks or []
     avg_mom=0.98
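The new stepper argument lets callers substitute a custom training-step class. A hedged sketch, assuming Stepper exposes the step(xs, y, epoch) method that fit calls per batch (LoggingStepper is hypothetical):

    class LoggingStepper(Stepper):
        def step(self, xs, y, epoch):
            # delegate the forward/backward/update to the stock Stepper
            loss = super().step(xs, y, epoch)
            print(f'batch loss: {loss:.4f}')
            return loss

    fit(model, data, epochs, opt, crit, stepper=LoggingStepper)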
2 changes: 1 addition & 1 deletion fastai/nlp.py
@@ -312,7 +312,7 @@ class TextDataLoader():
     def __init__(self, src, x_fld, y_fld):
         self.src,self.x_fld,self.y_fld = src,x_fld,y_fld

-    def __len__(self): return len(self.src)-1
+    def __len__(self): return len(self.src)

     def __iter__(self):
         it = iter(self.src)
4 changes: 2 additions & 2 deletions fastai/plots.py
@@ -39,13 +39,13 @@ def plots_from_files(imspaths, figsize=(10,5), rows=1, titles=None, maintitle=No
         plt.imshow(img)


-def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
+def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues, figsize=None):
     """
     This function prints and plots the confusion matrix.
     Normalization can be applied by setting `normalize=True`.
     (This function is copied from the scikit docs.)
     """
-    plt.figure()
+    plt.figure(figsize=figsize)
     plt.imshow(cm, interpolation='nearest', cmap=cmap)
     plt.title(title)
     plt.colorbar()
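Callers can now size the plot directly, e.g. plot_confusion_matrix(cm, classes, figsize=(8, 8)); the default figsize=None keeps the old behaviour, since plt.figure(figsize=None) falls back to matplotlib's rcParams default.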
66 changes: 65 additions & 1 deletion fastai/sgdr.py
@@ -9,7 +9,33 @@ def on_batch_begin(self): pass
     def on_epoch_end(self, metrics): pass
     def on_batch_end(self, metrics): pass
     def on_train_end(self): pass
+
+
+# Useful for maintaining status of a long-running job.
+#
+# Usage:
+#   learn.fit(0.01, 1, callbacks = [LoggingCallback(save_path="/tmp/log")])
+class LoggingCallback(Callback):
+    def __init__(self, save_path):
+        super().__init__()
+        self.save_path=save_path
+    def on_train_begin(self):
+        self.batch = 0
+        self.epoch = 0
+        self.f = open(self.save_path, "a", 1)
+        self.log("\ton_train_begin")
+    def on_batch_begin(self):
+        self.log(str(self.batch)+"\ton_batch_begin")
+    def on_epoch_end(self, metrics):
+        self.log(str(self.epoch)+"\ton_epoch_end: "+str(metrics))
+        self.epoch += 1
+    def on_batch_end(self, metrics):
+        self.log(str(self.batch)+"\ton_batch_end: "+str(metrics))
+        self.batch += 1
+    def on_train_end(self):
+        self.log("\ton_train_end")
+        self.f.close()
+    def log(self, string):
+        self.f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+"\t"+string+"\n")

 class LossRecorder(Callback):
     def __init__(self, layer_opt, save_path=''):
@@ -143,6 +169,44 @@ def calc_lr(self, init_lrs):
         return res


+class SaveBestModel(LossRecorder):
+    """ Save weights of the model with
+        the best accuracy during training.
+
+        Args:
+            model: the fastai model
+            layer_opt: the layer optimizer from the learner
+            name: the filename (without '.h5') to save the weights to
+
+        Usage:
+            Briefly, you have your model 'learn' variable and call fit.
+            >>> learn.fit(lr, 2, cycle_len=2, cycle_mult=1, best_save_name='mybestmodel')
+            ....
+            >>> learn.load('mybestmodel')
+
+            For more details see http://forums.fast.ai/t/a-code-snippet-to-save-the-best-model-during-training/12066
+    """
+    def __init__(self, model, layer_opt, name='best_model'):
+        super().__init__(layer_opt)
+        self.name = name
+        self.model = model
+        self.best_loss = None
+        self.best_acc = None
+
+    def on_epoch_end(self, metrics):
+        super().on_epoch_end(metrics)
+        loss, acc = metrics
+        if self.best_acc is None or acc > self.best_acc:
+            self.best_acc = acc
+            self.best_loss = loss
+            self.model.save(f'{self.name}')
+        elif acc == self.best_acc and loss < self.best_loss:
+            self.best_loss = loss
+            self.model.save(f'{self.name}')

 class WeightDecaySchedule(Callback):
     def __init__(self, layer_opt, batch_per_epoch, cycle_len, cycle_mult, n_cycles, norm_wds=False, wds_sched_mult=None):
         """
2 changes: 1 addition & 1 deletion fastai/structured.py
@@ -67,8 +67,8 @@ def get_sample(df,n):

     >>> get_sample(df, 2)
        col1 col2
-    2    3    a
     1    2    b
+    2    3    a
     """
     idxs = sorted(np.random.permutation(len(df))[:n])
     return df.iloc[idxs].copy()
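The example output changed because get_sample sorts the sampled indices before indexing, so the returned rows always appear in their original dataframe order.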