src/model.py

# CNN-LSTM-CTC-OCR
# Copyright (C) 2017 Jerod Weinman
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# model.py -- Constructs the graph representing the network
#   model. Inputs start from convnet_layers(), whose outputs hook into
#   rnn_layers(), which produces the logits for CTC loss (for
#   training) and decoding (for prediction/evaluation).

import tensorflow as tf
from tensorflow.contrib import learn

# Convolutional layer table, one row per layer, consumed by conv_layer().
# Row layout: [filters, kernel size, padding, layer name, batch norm?]
# Trailing notes mark where convnet_layers() inserts a pooling step.
layer_params = [
    [64, 3, 'valid', 'conv1', False],
    [64, 3, 'same', 'conv2', True],    # followed by pool
    [128, 3, 'same', 'conv3', False],
    [128, 3, 'same', 'conv4', True],   # followed by hpool
    [256, 3, 'same', 'conv5', False],
    [256, 3, 'same', 'conv6', True],   # followed by hpool
    [512, 3, 'same', 'conv7', False],
    [512, 3, 'same', 'conv8', True],   # followed by hpool 3
]

# Hidden-layer dimensionality shared by all RNN elements (2**9)
rnn_size = 512

# Intended for the RNN layers. Not applied by default: the DropoutWrapper
# lines in rnn_layer() are commented out and must be uncommented to use it.
dropout_rate = 0.5

def conv_layer(bottom, params, training):
    """Build one convolutional layer from a single row of layer_params.

    Args:
        bottom: Input tensor fed to the convolution.
        params: Row laid out as [filters, kernel_size, padding, name,
            batch_norm], matching the entries of layer_params.
        training: Passed through to batch normalization as its
            `training` argument (only used when batch_norm is set).

    Returns:
        The layer's output tensor. ReLU is always applied: inside the
        convolution when batch norm is off, or after normalization when
        batch norm is on.
    """
    use_batch_norm = params[4]  # Boolean
    layer_name = params[3]

    # With batch norm the convolution stays linear; ReLU follows the norm.
    fused_activation = None if use_batch_norm else tf.nn.relu

    weights_init = tf.contrib.layers.variance_scaling_initializer()
    zeros_init = tf.constant_initializer(value=0.0)

    conv_out = tf.layers.conv2d(
        bottom,
        filters=params[0],
        kernel_size=params[1],
        padding=params[2],
        activation=fused_activation,
        kernel_initializer=weights_init,
        bias_initializer=zeros_init,
        name=layer_name)

    if not use_batch_norm:
        return conv_out

    normalized = norm_layer(conv_out, training, layer_name + '/batch_norm')
    return tf.nn.relu(normalized, name=layer_name + '/relu')


def pool_layer(bottom, wpool, padding, name):
    """Build a max-pooling layer with a 2x2 window.

    Args:
        bottom: Input tensor to pool.
        wpool: Stride along the second spatial dimension; the stride along
            the first spatial dimension is fixed at 2.
        padding: Padding mode passed to the pooling op.
        name: Name given to the pooling layer.

    Returns:
        The pooled tensor.
    """
    return tf.layers.max_pooling2d(inputs=bottom,
                                   pool_size=2,
                                   strides=[2, wpool],
                                   padding=padding,
                                   name=name)


def norm_layer(bottom, training, name):
    """Build a batch normalization layer over axis 3 (channels last).

    Args:
        bottom: Input tensor to normalize.
        training: Passed as the `training` argument of batch normalization.
        name: Name given to the normalization layer.

    Returns:
        The normalized tensor.
    """
    return tf.layers.batch_normalization(inputs=bottom,
                                         axis=3,  # channels last
                                         training=training,
                                         name=name)


def convnet_layers(inputs, widths, mode):
    """Attach the convolutional stack to `inputs` and compute sequence lengths.

    Args:
        inputs: Image batch; should have shape [ ?, 32, ?, 1 ].
        widths: Original image widths, used to derive the valid length of
            each output feature sequence.
        mode: Compared against learn.ModeKeys.TRAIN to decide whether batch
            normalization runs in training mode.

    Returns:
        A pair (features, sequence_length): the conv features with the row
        dimension squeezed out, and the per-image sequence lengths
        flattened to a vector.
    """
    is_training = mode == learn.ModeKeys.TRAIN

    with tf.variable_scope("convnet"):
        # Trailing comments give (h, w) for a 32x32 example input.
        net = conv_layer(inputs, layer_params[0], is_training)   # 30,30
        net = conv_layer(net, layer_params[1], is_training)      # 30,30
        net = pool_layer(net, 2, 'valid', 'pool2')               # 15,15

        net = conv_layer(net, layer_params[2], is_training)      # 15,15
        net = conv_layer(net, layer_params[3], is_training)      # 15,15
        net = pool_layer(net, 1, 'valid', 'pool4')               # 7,14

        net = conv_layer(net, layer_params[4], is_training)      # 7,14
        net = conv_layer(net, layer_params[5], is_training)      # 7,14
        net = pool_layer(net, 1, 'valid', 'pool6')               # 3,13

        net = conv_layer(net, layer_params[6], is_training)      # 3,13
        net = conv_layer(net, layer_params[7], is_training)      # 3,13
        # Final pool collapses the remaining 3 rows into 1
        net = tf.layers.max_pooling2d(net,
                                      [3, 1],
                                      [3, 1],
                                      padding='valid',
                                      name='pool8')              # 1,13

        # Drop the now-singleton row dimension
        features = tf.squeeze(net, axis=1, name='features')

        # Flatten the per-image lengths into a vector
        lengths = get_sequence_lengths(widths)
        sequence_length = tf.reshape(lengths, [-1], name='seq_len')

    return features, sequence_length


def get_sequence_lengths(widths):
    """Build a tensor mapping original image widths to output sequence lengths.

    Mirrors the width changes made in convnet_layers(): the first
    convolution trims the border, pool2 halves the width, and pool4 and
    pool6 each remove one column; pool8 leaves the width unchanged.

    Args:
        widths: Tensor of original image widths (combined with int32
            constants below).

    Returns:
        Tensor of sequence lengths after the final pooling layer.
    """
    kernel_widths = [spec[1] for spec in layer_params]
    # Columns removed by the first convolution: k//2 on each side
    border_loss = 2 * (kernel_widths[0] // 2)

    with tf.variable_scope("sequence_length"):
        conv1_trim = tf.constant(border_loss,
                                 dtype=tf.int32,
                                 name='conv1_trim')
        one = tf.constant(1, dtype=tf.int32, name='one')
        two = tf.constant(2, dtype=tf.int32, name='two')

        length = tf.subtract(widths, conv1_trim, name='after_conv1')
        length = tf.floor_div(length, two, name='after_pool2')
        length = tf.subtract(length, one, name='after_pool4')
        length = tf.subtract(length, one, name='after_pool6')
        length = tf.identity(length, name='after_pool8')

    return length


def rnn_layer( bottom_sequence, sequence_length, rnn_size, scope ):
    """Build bidirectional (concatenated output) RNN layer

    Args:
        bottom_sequence: Input sequence in time-major order (the RNN below
            is built with time_major=True).
        sequence_length: Per-example sequence lengths forwarded to the RNN.
        rnn_size: Number of hidden units in each direction's LSTM cell.
        scope: Variable scope passed to the bidirectional RNN.

    Returns:
        Forward and backward outputs concatenated along axis 2:
        [ paddedSeqLen batchSize 2*rnn_size ]
    """

    # Default activation is tanh
    cell_fw = tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell( rnn_size )
    cell_bw = tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell( rnn_size )

    # Pre-CUDNN (slower) alternative. Default activation is tanh.
    # The initializer is only needed by this alternative, so it lives here
    # rather than being constructed (and discarded) on every call.
    #weight_initializer = tf.truncated_normal_initializer( stddev=0.01 )
    #cell_fw = tf.contrib.rnn.LSTMCell( rnn_size, 
    #                                   initializer=weight_initializer)
    #cell_bw = tf.contrib.rnn.LSTMCell( rnn_size, 
    #                                   initializer=weight_initializer)

    # Include?
    #cell_fw = tf.contrib.rnn.DropoutWrapper( cell_fw, 
    #                                         input_keep_prob=dropout_rate )
    #cell_bw = tf.contrib.rnn.DropoutWrapper( cell_bw, 
    #                                         input_keep_prob=dropout_rate )
    
    # Only the (forward, backward) outputs are needed; final states are
    # discarded.
    rnn_output,_ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, 
        cell_bw, 
        bottom_sequence,
        sequence_length=sequence_length,
        time_major=True,
        dtype=tf.float32,
        scope=scope )
    
    # Concatenation allows a single output op because [A B]*[x;y] = Ax+By
    # [ paddedSeqLen batchSize 2*rnn_size]
    rnn_output_stack = tf.concat( rnn_output, 2, name='output_stack' )
    
    return rnn_output_stack


def rnn_layers(features, sequence_length, num_classes):
    """Stack two bidirectional RNN layers and a dense logit layer on `features`.

    Args:
        features: Input features as [batchSize paddedSeqLen numFeatures].
        sequence_length: Per-example sequence lengths given to both RNN
            layers.
        num_classes: Number of output classes; the logit layer has
            num_classes+1 units.

    Returns:
        The logits tensor, in time-major order.
    """
    kernel_init = tf.contrib.layers.variance_scaling_initializer()
    bias_init = tf.constant_initializer(value=0.0)

    with tf.variable_scope("rnn"):
        # Switch to time-major order for efficiency
        time_major_features = tf.transpose(features,
                                           perm=[1, 0, 2],
                                           name='time_major')

        hidden = rnn_layer(time_major_features, sequence_length,
                           rnn_size, 'bdrnn1')
        hidden = rnn_layer(hidden, sequence_length, rnn_size, 'bdrnn2')

        # The logit layer uses a ReLU activation
        rnn_logits = tf.layers.dense(hidden,
                                     num_classes + 1,
                                     activation=tf.nn.relu,
                                     kernel_initializer=kernel_init,
                                     bias_initializer=bias_init,
                                     name='logits')

    return rnn_logits
    

def ctc_loss_layer(rnn_logits, sequence_labels, sequence_length,
                   reduce_mean=True):
    """Build the CTC loss op used for training.

    Args:
        rnn_logits: Time-major logits (the loss is built with
            time_major=True).
        sequence_labels: Target label sequences passed as the CTC labels.
        sequence_length: Per-example lengths of the logit sequences.
        reduce_mean: If truthy, average the per-example losses; otherwise
            sum them.

    Returns:
        A scalar loss tensor reduced over all per-example CTC losses.
    """
    per_example = tf.nn.ctc_loss(
        sequence_labels,
        rnn_logits,
        sequence_length,
        time_major=True,
        ignore_longer_outputs_than_inputs=True)

    reducer = tf.reduce_mean if reduce_mean else tf.reduce_sum
    return reducer(per_example)