#!/usr/bin/env th
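--[[
laia-force-align computes, for each line image in a list, the forced
alignment between the image frames and its reference transcript, using a
trained Laia model. Optionally, it also writes the horizontal padding that
was applied to each image and the label priors estimated from the network
posteriors (useful, for instance, to convert posteriors into scaled
likelihoods).

Example invocation (file names are illustrative):

  laia-force-align --batch_size 8 --output_hpad hpad.txt \
    model.t7 syms.txt images.lst transcripts.txt align.txt priors.txt
]]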
require 'laia'
local batcher = laia.RandomBatcher()
local parser = laia.argparse(){
  name = 'laia-force-align',
  description = 'Compute the forced alignment between transcripts and line ' ..
    'images, using a trained Laia model.'
}
-- Register laia.Version options
laia.Version():registerOptions(parser)
-- Register laia.log options
laia.log.registerOptions(parser)
-- Register cudnn options, only if available
if cudnn then cudnn.registerOptions(parser, true) end
-- Register batcher options
batcher:registerOptions(parser)
parser:argument('checkpoint',
  'Path of the file containing the trained checkpoint/model.')
parser:argument('symbols_table', 'Table mapping from symbols to integer IDs.')
parser:argument('image_list', 'File containing the list of images to align.')
parser:argument('text_table',
  'File containing the table of transcripts to align.')
parser:argument('output_align', 'File containing the output alignments.')
parser:argument('output_prior', 'File containing the output label priors.')
  :args('?') -- This argument is optional.

parser:option(
  '--seed -s', 'Seed for random number generation.',
  0x012345, laia.toint)
parser:option(
  '--gpu', 'If gpu>0, uses the specified GPU, otherwise uses the CPU.',
  1, laia.toint)
parser:option(
  '--smoothing_additive', 'If > 0, adds this amount to all symbols for the ' ..
  'prior computation.',
  0, tonumber)
  :argname('<float>')
parser:option(
  '--auto_width_factor', 'If true, sets the width factor for the batcher ' ..
  'automatically, from the size of the pooling layers.',
  false, laia.toboolean)
  :argname('<bool>')
parser:option(
  '--batch_size -b', 'Batch size.', 16, laia.toint)
  :ge(1)
parser:option(
  '--output_hpad', 'If given, write the horizontal padding applied to each ' ..
  'image to this file.', '')
  :argname('<file>')
parser:option(
  '--skip_alignments', 'If true, does not output the alignments.',
  false, laia.toboolean)
  :argname('<bool>')
-- Parse options
local opts = parser:parse()
-- Initialize random seeds
laia.manualSeed(opts.seed)
-- Load *BEST* model from the checkpoint.
local model = laia.Checkpoint():load(opts.checkpoint):Best():getModel()
assert(model ~= nil, 'No model was found in the checkpoint file!')
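-- A checkpoint may store more than one model; Best() selects the one that
-- achieved the best validation performance during training.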
-- Append a LogSoftMax layer, so that the model outputs log-probabilities.
model:add(nn.LogSoftMax())
-- If a GPU is requested, check that we have everything necessary.
if opts.gpu > 0 then
  assert(cutorch ~= nil, 'Package cutorch is required in order to use the GPU.')
  assert(cunn ~= nil, 'Package cunn is required in order to use the GPU.')
  cutorch.setDevice(opts.gpu)
  model = model:cuda()
  -- If cudnn_force_convert=true, force all possible layers to use the cuDNN
  -- implementation.
  if cudnn and cudnn.force_convert then
    cudnn.convert(model, cudnn)
  end
else
  -- This should not be necessary, but just in case.
  model = model:float()
end
-- Put the model in evaluation mode (e.g. disables dropout).
model:evaluate()
-- Prepare batcher
if opts.auto_width_factor then
  local width_factor = laia.getWidthFactor(model)
  batcher:setOptions({width_factor = width_factor})
  laia.log.info('Batcher width factor was automatically set to %d',
    width_factor)
end
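-- Load the dataset (image list, transcript table and symbol table) into the
-- batcher; epochReset() starts a fresh epoch, so the loop below makes a
-- single full pass over the samples.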
batcher:load(opts.image_list, opts.text_table, opts.symbols_table)
batcher:epochReset()
-- Open the file to output the alignments.
local output_align = nil
if not opts.skip_alignments then
  output_align = opts.output_align == '-' and io.stdout or
    io.open(opts.output_align, 'w')
  assert(output_align ~= nil,
    ('File %q could not be opened for writing!'):format(opts.output_align))
end
-- Open the file to write the horizontal padding of each sample.
local output_hpad = nil
if opts.output_hpad ~= '' then
  output_hpad = opts.output_hpad == '-' and io.stdout or
    io.open(opts.output_hpad, 'w')
  assert(output_hpad ~= nil,
    ('File %q could not be opened for writing!'):format(opts.output_hpad))
end
-- Open the file to write the label priors (optional argument, may be nil).
local output_prior = opts.output_prior == '-' and io.stdout or
  (opts.output_prior ~= nil and opts.output_prior ~= '' and
    io.open(opts.output_prior, 'w')) or nil
local prior_count = nil
for b=1,batcher:numSamples(),opts.batch_size do
  -- Prepare the batch.
  local batch_img, batch_gt, _, batch_ids, batch_hpad =
    batcher:next(opts.batch_size)
  if opts.gpu > 0 then batch_img = batch_img:cuda() end
  -- Forward through the network, and copy the output to the CPU.
  local output = model:forward(batch_img):float()
  -- Put the output in batch x frame x label layout.
  local num_frames = output:size(1) / opts.batch_size
  output = output:view(num_frames, opts.batch_size, output:size(2))
  output = output:permute(2, 1, 3):contiguous()
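  -- The raw activations come in CTC layout: a (num_frames * batch_size) x L
  -- matrix in which row 1 is frame 1 of sample 1, row 2 is frame 1 of
  -- sample 2, and so on. The view/permute above turns this into one
  -- frames x labels matrix per sample, so that output[i] can be aligned
  -- independently.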
  -- Write the horizontal padding of each sample.
  if output_hpad then
    for i=1,opts.batch_size do
      if i+b-1 > batcher:numSamples() then break end
      output_hpad:write(('%s %d %d %d\n'):format(
        batch_ids[i], batch_hpad[i][1], batch_hpad[i][2], batch_hpad[i][3]))
    end
    output_hpad:flush()
  end
  -- Initialize the prior counts, one per output label.
  if output_prior and not prior_count then
    prior_count = torch.FloatTensor(output:size(3)):zero()
  end
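  -- The priors are accumulated as soft counts: for every sample, the frame
  -- posteriors are summed over all frames, so that prior_count[n] is the
  -- expected number of frames emitting label n. A hard-count variant, based
  -- on the forced alignment itself, is kept commented out below.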
  for i=1,opts.batch_size do
    -- The batch may be padded with extra images; stop at the true number of
    -- samples.
    if i+b-1 > batcher:numSamples() then break end
    if not opts.skip_alignments then
      -- Compute the forced alignment of the sample w.r.t. batch_gt[i].
      laia.log.info('Performing forced alignment of sample %q', batch_ids[i])
      local alignment = laia.force_alignment(output[i], batch_gt[i])
      -- Write the alignment: the sample ID followed by one label per frame.
      output_align:write(batch_ids[i])
      for f=1,#alignment do
        output_align:write(' ' .. alignment[f])
      end
      output_align:write('\n')
      output_align:flush()
    end
    if output_prior then
      -- Accumulate the per-label posterior mass of this sample.
      prior_count = prior_count + output[i]:exp():sum(1)
      --[[ Alternative: hard counts taken from the forced alignment.
      for _,v in ipairs(alignment) do
        prior_count[v] = prior_count[v] + 1
      end
      ]]
    end
  end
end
-- Close the output files.
if not opts.skip_alignments then
  output_align:close()
end
if output_hpad then
  output_hpad:close()
end
-- Output the label priors and total counts.
if output_prior and prior_count then
  local prior_total = prior_count:sum()
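  -- Each output line contains: the label ID, its soft count, the total
  -- count, and the estimated prior probability. With additive smoothing
  -- a > 0, the prior is (a + count[n]) / (L * a + total), where L is the
  -- number of labels; otherwise it is simply count[n] / total.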
  if opts.smoothing_additive > 0 then
    for n=1,prior_count:size(1) do
      output_prior:write(('%d\t%.6f\t%d\t%.10e\n'):format(
        n, prior_count[n], prior_total,
        (opts.smoothing_additive + prior_count[n]) /
        (prior_count:size(1) * opts.smoothing_additive + prior_total)))
    end
  else
    for n=1,prior_count:size(1) do
      output_prior:write(('%d\t%.6f\t%d\t%.10e\n'):format(
        n, prior_count[n], prior_total, prior_count[n] / prior_total))
    end
  end
  output_prior:close()
end