template.py
class BaseTemplate(object):
    def __init__(self):
        self.hparams = {
            ### User-specified directories ###
            "workdir": "~/sockeye_trial/",
            "datadir": "$workdir/data",
            "modeldir": "$workdir/model",
            "rootdir": "/home/hltcoe/kduh/src/mt/sockeye-recipes/",
            ### Language pair (source and target) ###
            # Defaults match the bundled sample-de-en (German-English) data
            "src_lang": "de",
            "trg_lang": "en",
            ### Tokenized training and validation data ###
            "train_tok": "$workdir/sample-de-en/train",
            "valid_tok": "$workdir/sample-de-en/valid",
            ### Number of symbols to use for BPE ###
            "bpe_symbols_src": "4000",
            "bpe_symbols_trg": "4000",
            ### Filename for BPE-processed bitext file ###
            # Note: the following default names should be fine for most use cases
            "train_bpe": "$datadir/train.bpe-${bpe_symbols_src}",
            "valid_bpe": "$datadir/valid.bpe-${bpe_symbols_src}",
            ### Filename for BPE vocabulary ###
            # Note: the following default names should be fine for most use cases
            # Note: bpe_vocab_src will be needed for applying BPE to test, in translate.sh
            "bpe_vocab_src": "${train_bpe}.$src.bpe_vocab",
            "bpe_vocab_trg": "${train_bpe}.$trg.bpe_vocab",
            # Model architecture
            "num_embed": "50:50",
            "rnn_num_hidden": "50",
            "attention_type": "dot",
            "num_layers": "1",
            "rnn_cell_type": "lstm",
            # Training configuration
            "max_seq_len": "100:100",
            "num_words": "4000:4000",
            "word_min_count": "1:1",
            "batch_size": "64",
            "min_num_epochs": "0",
            "embed_dropout": ".0:.0",
            # Logging and stopping condition
            "checkpoint_frequency": "100",
            "max_updates": "400",
            "keep_last_params": "-1",
        }
        self.template = '''
#####################################################################
# Hyperparameters for preprocess-bpe.sh and train.sh #
# #
# Overview: #
# - "workdir" corresponds a group of preprocessed bitext and models #
# for a given dataset. Each "workdir" can contain multiple #
# "datadir" and "modeldir" if desired #
# - "datadir" stores the BPE-preprocessed training and validation #
# bitext files #
# - "modeldir" is generated by Sockeye and stores all training info #
# - "rootdir" is path to your installation of sockeye-recipes, #
# e.g. ~/src/sockeye-recipes #
# #
# preprocess-bpe.sh: #
# - input: Tokenized bitext for training ("train_tok") and #
# validation ("valid_tok") #
# - output: BPE-preprocessed bitext ("train_bpe", "valid_bpe") #
# and vocabulary ("bpe_vocab_src", "bpe_vocab_trg") #
# - main hyperparameters: number of BPE symbols for source & target #
# #
# train.sh: #
# - input: BPE-preprocessed bitext ("train_bpe", "valid_bpe") #
# - output: "modeldir", which contains all training info and can #
# be used to translate #
# - main hyperparameters: many! see below #
#####################################################################
#####################################################################
# (0) General settings (to be modified for each project) #
#####################################################################
### User-specified directories ###
workdir=%s
datadir=%s
modeldir=%s
rootdir=%s
### Language pair (source and target) ###
# Note: We assume all bitext files use these as suffixes.
# e.g. $train_tok.$src and $train_tok.$trg refer to the source and target sides
src=%s
trg=%s
### Tokenized training and validation data ###
# Note: we assume tokenization is already done and will only run BPE
# For tokenization and other preprocessing, see preprocess-tokenize.sh
# which does not use this hyperparam.txt file
train_tok=%s
valid_tok=%s
#####################################################################
# (1) preprocess-bpe.sh settings (modify if needed) #
#####################################################################
### Number of symbols to use for BPE ###
# Note: we perform source and target BPE separately
# This corresponds to initial source (src) and target (trg) vocab size
bpe_symbols_src=%s
bpe_symbols_trg=%s
### Filename for BPE-processed bitext file ###
# Note: the following default names should be fine for most use cases
train_bpe=%s
valid_bpe=%s
### Filename for BPE vocabulary ###
# Note: the following default names should be fine for most use cases
# Note: bpe_vocab_src will be needed for applying BPE to test, in translate.sh
bpe_vocab_src=%s
bpe_vocab_trg=%s
#####################################################################
# (2) train.sh settings (modify if needed) #
#####################################################################
# Model architecture
num_embed=%s
rnn_num_hidden=%s
attention_type=%s
num_layers=%s
rnn_cell_type=%s
# Training configuration
max_seq_len=%s
num_words=%s
word_min_count=%s
batch_size=%s
min_num_epochs=%s
embed_dropout=%s
# Logging and stopping condition
checkpoint_frequency=%s
max_updates=%s
keep_last_params=%s
'''

    def hparam_list(self):
        # Names of all hyperparameters, in the order they appear in the template.
        return list(self.hparams.keys())

    def render(self):
        # Fill the %s placeholders; relies on dict insertion order (Python 3.7+).
        hparams = tuple(self.hparams.values())
        return self.template % hparams

    def update(self, updated_dict):
        # Override the default hyperparameters with user-supplied values.
        self.hparams.update(updated_dict)
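In the recipe this class is normally driven by a wrapper script, but a minimal standalone sketch of how it might be used follows. The output filename hyperparams.txt and the override values are assumptions for illustration; the rendered text is shell-style assignments that appear intended to be sourced by preprocess-bpe.sh and train.sh.

if __name__ == "__main__":
    # Illustrative sketch (not part of the recipe): filename and values are assumptions.
    t = BaseTemplate()
    # Any key returned by hparam_list() can be overridden before rendering.
    t.update({
        "bpe_symbols_src": "8000",
        "bpe_symbols_trg": "8000",
        "batch_size": "128",
    })
    # render() substitutes the values into the %s placeholders, producing a
    # shell-style hyperparameter file for the preprocessing and training scripts.
    with open("hyperparams.txt", "w") as f:
        f.write(t.render())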