diff --git a/graph/B_ConcatRealImag.m b/graph/B_ConcatRealImag.m
new file mode 100644
index 0000000..ecdd06f
--- /dev/null
+++ b/graph/B_ConcatRealImag.m
@@ -0,0 +1,17 @@
+function grad = B_ConcatRealImag(prev_layer, curr_layer, future_layers)
+% B_ConcatRealImag  Backward pass of the 'concatrealimag' node.
+%
+% The gradient collected from the future layers is cut in half along the
+% first dimension; the top half is used as the real part and the bottom
+% half as the imaginary part of the returned gradient.
+% prev_layer is not used.
+
+incoming = GetFutureGrad(future_layers, curr_layer);
+
+half = size(incoming, 1) / 2;
+topRows = incoming(1:half, :, :);
+bottomRows = incoming(half+1:end, :, :);
+
+grad = topRows + 1i*bottomRows;
+
+end
\ No newline at end of file
diff --git a/graph/DNN_Cost10.m b/graph/DNN_Cost10.m
index ee405f1..019f9e7 100644
--- a/graph/DNN_Cost10.m
+++ b/graph/DNN_Cost10.m
@@ -140,14 +140,20 @@
             [layer{i}.a, layer{i}.validFrameMask] = F_comp_gcc(prev_layers{1}, layer{i});
         case 'stft'
             [layer{i}.a, layer{i}.validFrameMask] = F_stft(prev_layers{1}, layer{i});
-
+        case 'spatialcov'
+            [layer{i}.a, layer{i}.validFrameMask] = F_SpatialCov(prev_layers{1}, layer{i}); % do not support variable length yet
         case 'spatialcovmask'
-            layer{i}.a = F_SpatialCovMask(prev_layers, layer{i}); % do not support variable length yet
+            [layer{i}.a, layer{i}.validFrameMask] = F_SpatialCovMask(prev_layers, layer{i}); % do not support variable length yet
         case 'spatialcovsplitmask'
             layer{i}.a = F_SpatialCovSplitMask(prev_layers, layer{i}); % do not support variable length yet
         case 'mvdr_spatialcov'
             layer{i} = F_MVDR_spatialCov(prev_layers{1}, layer{i}); % do not support variable length yet
-
+        case 'extspatialcovfeat'
+            layer{i}.a = F_ExtSpatialCovFeat(prev_layers{1}, layer{i}); % extract up triangle real and imag parts, diagonal part from spatial cov
+        case 'spatialnorm'
+            layer{i}.a = F_SpatialNorm(prev_layers{1}, layer{i});
+        case 'concatrealimag'
+            layer{i}.a = F_ConcatRealImag(prev_layers{1});
         case 'cov'
             layer{i}.a = F_cov(prev_layers{1}.a); % do not support variable length yet
         case 'logdet'
@@ -247,7 +253,7 @@
                 if isfield(layer{i}, 'mask') % the mask defines what values can be tuned and what cannot be tuned.
tmp = tmp .* layer{i}.mask; end - cost_func.cost = cost_func.cost + 0.5* L2weight * sum(sum(tmp.*tmp)); + cost_func.cost = cost_func.cost + 0.5* L2weight * sum(sum(real(tmp.*conj(tmp)))); end end end @@ -453,6 +459,8 @@ layer{i}.grad = B_inner_product_normalized(prev_layers, future_layers); case 'concatenate' layer{i}.grad = B_concatenate(prev_layers, layer{i}, future_layers); + case 'concatrealimag' + layer{i}.grad = B_ConcatRealImag(prev_layers, layer{i}, future_layers); otherwise fprintf('Error: unknown output node type %s!\n', layer{i}.name); diff --git a/graph/DNN_update.m b/graph/DNN_update.m index 01d0685..84f1512 100644 --- a/graph/DNN_update.m +++ b/graph/DNN_update.m @@ -30,7 +30,7 @@ end end - if para.NET.gradientClipThreshold > 0 + if para.NET.gradientClipThreshold > 0 && isreal(grad_W) grad_W = max(-para.NET.gradientClipThreshold, grad_W); grad_W = min(para.NET.gradientClipThreshold, grad_W); end @@ -65,7 +65,7 @@ layer{Lidx(1)}.W = layer{Lidx(1)}.W - update{i}.W; end - if para.NET.weight_clip + if para.NET.weight_clip && isreal(layer{Lidx(1)}.W) % sometimes the weight will explode, so we need to add a limit to the value of the weights, e.g. 
+-10 layer{Lidx(1)}.W = max(-para.NET.weight_clip,layer{Lidx(1)}.W); layer{Lidx(1)}.W = min(para.NET.weight_clip,layer{Lidx(1)}.W); diff --git a/graph/F_ConcatRealImag.m b/graph/F_ConcatRealImag.m new file mode 100644 index 0000000..045f504 --- /dev/null +++ b/graph/F_ConcatRealImag.m @@ -0,0 +1,7 @@ +function output = F_ConcatRealImag(prev_layer) + +covMat = prev_layer.a; + +output = [real(covMat); imag(covMat)]; + +end diff --git a/graph/F_ExtSpatialCovFeat.m b/graph/F_ExtSpatialCovFeat.m new file mode 100644 index 0000000..3f282ae --- /dev/null +++ b/graph/F_ExtSpatialCovFeat.m @@ -0,0 +1,160 @@ +function feat = F_ExtSpatialCovFeat(prev_layer, curr_layer) + +covMat = prev_layer.a; +prev_mask = prev_layer.validFrameMask; +nCh = curr_layer.nCh; +nBin = curr_layer.nBin; +[~, nf, N] = size(covMat); + +if isfield(curr_layer, 'scm_select') + scm_select = curr_layer.scm_select; +else + scm_select = 'uptriangle'; +end +if isfield(curr_layer, 'scm_select_diag') + scm_select_diag = curr_layer.scm_select_diag; +else + scm_select_diag = 1; +end +if isfield(curr_layer, 'scm_select_bin') + scm_select_bin = curr_layer.scm_select_bin; + scm_bin_shift = curr_layer.scm_bin_shift; +else + scm_select_bin = 0; +end + +if N == 1 + + % normalize the cov matrix by their diagonal elements, remove the effect of + % spectral power and only retains the phase information + dimSelectMask1 = bsxfun(@times, eye(nCh, nCh), ones(nCh, nCh, nBin)); + dimSelectIdx1 = find(reshape(dimSelectMask1, numel(dimSelectMask1),1) == 1); % diag elements index + diag_mean = squeeze(mean(reshape(covMat(dimSelectIdx1,:), nCh, nBin, nf), 1)); + if nf ==1 + diag_mean = diag_mean.'; + end + normCovMat = bsxfun(@times, permute(reshape(covMat, nCh, nCh, nBin, nf), [3 4 1 2]), 1./diag_mean); + normCovMat = reshape(permute(normCovMat, [3 4 1 2]), nCh^2*nBin, nf); + + % get the upper triangle off-diagonal elements which are complex-valued + if strcmpi(scm_select, 'uptriangle') + selectMat = triu(ones(nCh, nCh),1); % 1. 
up-trialgle + elseif strcmpi(scm_select, 'row') + selectMat = zeros(nCh, nCh); selectMat(1,2:end) = ones(1, nCh-1); % 2. first row + else + fprintf('Error: unknown scm feature select type: %s', lower(scm_select)) + end + + dimSelectMask2 = bsxfun(@times, selectMat, ones(nCh, nCh, nBin)); + dimSelectIdx2 = find(reshape(dimSelectMask2, numel(dimSelectMask2),1) == 1); + real_part = real(normCovMat(dimSelectIdx2,:)); + % imag_part = imag(normCovMat(dimSelectIdx2,:)); + % for freq bin 1 and 257, no imag part + dimSelectMask3 = bsxfun(@times, selectMat, cat(3,zeros(nCh, nCh, 1), ones(nCh, nCh, nBin-2), zeros(nCh, nCh, 1))); + dimSelectIdx3 = find(reshape(dimSelectMask3, numel(dimSelectMask3),1) == 1); + imag_part = imag(normCovMat(dimSelectIdx3,:)); + + % get the diagonal elements which are real values + % diag_part = covMat(dimSelectIdx1,:); + % diag_part = log(max(eps,abs(diag_part))); + if scm_select_diag + diag_part = real(normCovMat(dimSelectIdx1,:)); + end +else + % select 1 bin by average every scm_bin_shift bins + if scm_select_bin + covMat1 = reshape(covMat, nCh^2, nBin, nf, N); + covMat2 = reshape(permute(covMat1, [1 3 4 2]), nCh^2*nf*N, nBin); + covMat3 = conv2(covMat2, ones(1,scm_bin_shift, 'like', covMat2(1))/scm_bin_shift, 'valid'); + covMat4 = covMat3(:, 1:scm_bin_shift:end); + nBin = size(covMat4, 2); + covMat = reshape(permute(reshape(covMat4, nCh^2, nf, N, nBin), [1 4 2 3]), nCh^2*nBin, nf, N); + + end + + % normalize the cov matrix by their diagonal elements, remove the effect of + % spectral power and only retains the phase information + dimSelectMask1 = bsxfun(@times, eye(nCh, nCh), ones(nCh, nCh, nBin)); + dimSelectIdx1 = find(reshape(dimSelectMask1, numel(dimSelectMask1),1) == 1); % diag elements index + diag_mean = squeeze(mean(reshape(covMat(dimSelectIdx1,:,:), nCh, nBin, nf, N), 1)); + if nf ==1 + diag_mean = reshape(diag_mean, size(diag_mean,1), 1, size(diag_mean, 2)); + end + % minibatch padding makes some frames zero, mean of that still be 
zero, can not be divided. + diag_mean1 = permute(bsxfun(@plus, permute(diag_mean, [2 3 1]), -1e10.*prev_mask), [3 1 2]); + normCovMat = bsxfun(@times, permute(reshape(covMat, nCh, nCh, nBin, nf, N), [3 4 5 1 2]), 1./diag_mean1); + normCovMat = reshape(permute(normCovMat, [4 5 1 2 3]), nCh^2*nBin, nf, N); + +% % select 1 bin by average every scm_bin_shift bins +% if scm_select_bin +% normCovMat1 = reshape(normCovMat, nCh^2, nBin, nf, N); +% normCovMat2 = reshape(permute(normCovMat1, [1 3 4 2]), nCh^2*nf*N, nBin); +% normCovMat3 = conv2(normCovMat2, ones(1,scm_bin_shift, 'like', normCovMat2(1))/scm_bin_shift, 'valid'); +% normCovMat4 = normCovMat3(:, 1:scm_bin_shift:end); +% nBin = size(normCovMat4, 2); +% normCovMat = reshape(permute(reshape(normCovMat4, nCh^2, nf, N, nBin), [1 4 2 3]), nCh^2*nBin, nf, N); +% +% end + + % get the upper triangle off-diagonal elements which are complex-valued + if strcmpi(scm_select, 'uptriangle') + selectMat = triu(ones(nCh, nCh),1); % 1. up-trialgle + elseif strcmpi(scm_select, 'row') + selectMat = zeros(nCh, nCh); selectMat(1,2:end) = ones(1, nCh-1); % 2. 
first row + else + fprintf('Error: unknown scm feature select type: %s', lower(scm_select)) + end + + dimSelectMask2 = bsxfun(@times, selectMat, ones(nCh, nCh, nBin)); + dimSelectIdx2 = find(reshape(dimSelectMask2, numel(dimSelectMask2),1) == 1); + real_part = real(normCovMat(dimSelectIdx2,:,:)); + % imag_part = imag(normCovMat(dimSelectIdx2,:)); + % for freq bin 1 and 257, no imag part + if scm_select_bin + dimSelectMask3 = bsxfun(@times, selectMat, ones(nCh, nCh, nBin)); + else + dimSelectMask3 = bsxfun(@times, selectMat, cat(3,zeros(nCh, nCh, 1), ones(nCh, nCh, nBin-2), zeros(nCh, nCh, 1))); + end + dimSelectIdx3 = find(reshape(dimSelectMask3, numel(dimSelectMask3),1) == 1); + imag_part = imag(normCovMat(dimSelectIdx3,:,:)); + + % get the diagonal elements which are real values + if scm_select_diag + dimSelectMask1 = bsxfun(@times, eye(nCh, nCh), ones(nCh, nCh, nBin)); + dimSelectIdx1 = find(reshape(dimSelectMask1, numel(dimSelectMask1),1) == 1); + diag_part = real(normCovMat(dimSelectIdx1,:,:)); + end +end + +% get the final feature vector +if scm_select_diag + feat = [real_part; imag_part; diag_part]; +else + feat = [real_part; imag_part]; +end +% real_part = reshape(real_part, 7, 257, nf, N); +% imag_part = reshape(imag_part, 7, 255, nf, N); +% real_part = real_part(:, 6:5:end,:,:); +% imag_part = imag_part(:, 5:5:end,:,:); +% +% feat = [reshape(real_part, 7*51, nf, N); reshape(imag_part, 7*51, nf, N)]; + + +% covMat = reshape(covMat(:,:,:), nCh, nCh, nBin, nf, N); +% covMatCell = num2cell(covMat, [1 2]); +% omegaTau = cellfun(@GetPrincVec, covMatCell, 'UniformOutput', 0); +% output = permute(cell2mat(omegaTau), [1 3 4 5 2]); +% +% feat = output(2:8, 5:5:end, :,:); +% [d1,d2,d3,d4] = size(feat); +% feat = reshape(feat, d1*d2, d3, d4); + +end + +function omegaTau = GetPrincVec(A) +[V,D] = eig(A); +D = diag(D); +[~, idx] = max(D); +ev = V(:,idx); +omegaTau = gather(angle(ev/ev(1))); +end diff --git a/graph/F_SpatialCov.m b/graph/F_SpatialCov.m index 
5c8e67c..62d253f 100644 --- a/graph/F_SpatialCov.m +++ b/graph/F_SpatialCov.m @@ -1,4 +1,4 @@ -function output = F_SpatialCov(input_layer, curr_layer) +function [output, mask] = F_SpatialCov(input_layer, curr_layer) input = input_layer.a; [D,T,N] = size(input); @@ -6,16 +6,87 @@ curr_layer = SetDefaultValue(curr_layer, 'winSize', 0); curr_layer = SetDefaultValue(curr_layer, 'winShift', 1); +if T <= curr_layer.winSize + windowSize = 0; + windowShift = 1; +else + windowSize = curr_layer.winSize; + windowShift = curr_layer.winShift; +end + nBin = length(curr_layer.freqBin); nCh = D/nBin; +input2 = reshape(input, nBin, nCh, T, N); + +if windowSize == 0 + nf = 1; +else + nf = fix((T-windowSize+windowShift)/windowShift); +end +mask = zeros(nf, N, 'like', real(input2(1))); + if N==1 - input2 = reshape(input, nBin, nCh, T,N); - R = ComplexSpectrum2SpatialCov(input2, curr_layer.winSize, curr_layer.winShift); - output = permute(R, [3 1 2 4]); - output = reshape(output, nBin*nCh^2, size(output,4),N); +% R = ComplexSpectrum2SpatialCov(input2, windowSize, windowShift); +% % output = permute(R, [3 1 2 4]); +% % output = reshape(output, nBin*nCh^2, size(output,4),N); +% output = reshape(R, nCh^2*nBin, size(R,4),N); + + X2 = permute(input2, [2 1 3]); + XX = outProdND(X2); + XX2 = reshape(XX, nCh^2*nBin, T); + + if windowSize == 0 + output = squeeze(mean(XX2, 2)); + else +% idx = [ones(1,half_ctx) 1:T ones(1,half_ctx)*T]; +% SCM = conv2(XX2, ones(1,windowSize, class(gather(input2)))/windowSize, 'valid'); + SCM = conv2(XX2, ones(1,windowSize, class(gather(input2))), 'valid'); + output = SCM(:, 1:windowShift:end); + end + else - % to be implemented + X2 = permute(input2, [2 1 3 4]); + XX = outProdND(X2); + XX2 = reshape(XX, nCh^2*nBin, T, N); + + if windowSize == 0 + output = mean(XX2, 2); + else +% % idx = [ones(1,half_ctx) 1:T ones(1,half_ctx)*T]; +% XX3 = reshape(permute(XX2, [1 3 2]), nCh^2*nBin*N, T); +% SCM = conv2(XX3, ones(1,windowSize, class(gather(input2)))/windowSize, 
'valid'); +% output = SCM(:, 1:windowShift:end); +% output = permute(reshape(output, nCh^2*nBin, N, size(output, 2)), [1 3 2]); + +% % Version 1 +% prev_mask = input_layer.validFrameMask; +% output = zeros(nCh^2*nBin, nf, N, 'like', XX2); +% for i=1:N +% idx = find(prev_mask(:,i) == 0, 1, 'last'); +% idx2 = fix((idx-windowSize+windowShift)/windowShift); +% XX3 = squeeze(XX2(:,1:idx,i)); +% SCM = conv2(XX3, ones(1,windowSize, class(gather(input2)))/windowSize, 'valid'); +% output(:, 1:idx2, i) = SCM(:, 1:windowShift:end); +% mask(idx2+1:end, i) = 1; +% end + + % Version 2, much fast + prev_mask = input_layer.validFrameMask; + idx = arrayfun(@(x) find(gather(prev_mask(:,x)) == 0, 1, 'last'), 1:size(prev_mask,2)); + idx2 = arrayfun(@(x) fix((idx(x)-windowSize+windowShift)/windowShift), 1:length(idx)); + XX31 = reshape(permute(XX2, [1 3 2]), nCh^2*nBin*N, T); +% SCM1 = conv2(XX31, ones(1,windowSize, class(gather(input2)))/windowSize, 'valid'); + SCM1 = conv2(XX31, ones(1,windowSize, class(gather(input2))), 'valid'); + output1 = SCM1(:, 1:windowShift:end); + output2 = permute(reshape(output1, nCh^2*nBin, N, size(output1, 2)), [1 3 2]); + output = zeros(nCh^2*nBin, nf, N, 'like', XX2); + for i = 1:N + output(:, 1:idx2(i), i) = output2(:, 1:idx2(i), i); + mask(idx2(i)+1:end, i) = 1; + end + + end end -end \ No newline at end of file +end diff --git a/graph/F_SpatialCovMask.m b/graph/F_SpatialCovMask.m index 45473e0..4db29ad 100644 --- a/graph/F_SpatialCovMask.m +++ b/graph/F_SpatialCovMask.m @@ -1,24 +1,39 @@ -% Estimate spatial covariance matrix for sentences using a mask. The mask +% Estimate spatial covariance matrix for sentences using a speechMask. The speechMask % specifies speech presense probability at all time frequency locations, % with a 1 means speech present and 0 means speech absent. 
% -function output = F_SpatialCovMask(prev_layers, curr_layer) -mask = prev_layers{1}.a; +function [output, uttMask] = F_SpatialCovMask(prev_layers, curr_layer) +speechMask = prev_layers{1}.a; data = prev_layers{2}.a; +prev_mask =prev_layers{2}.validFrameMask; -if isfield(curr_layer, 'windowSize') - windowSize = curr_layer.windowSize; -else - windowSize = 0; -end - -[D,T,N] = size(mask); +[D,T,N] = size(speechMask); [D2,T,N] = size(data); nCh = D2/D; data = reshape(data, D, nCh, T, N); data = permute(data, [2 3 1 4]); % data = abs(data); +if isfield(curr_layer, 'winSize') && T > curr_layer.winSize + windowSize = curr_layer.winSize; + windowShift = curr_layer.winShift; +else + windowSize = 0; +end + +if isfield(curr_layer, 'speechOnly') + speechOnly = curr_layer.speechOnly; +else + speechOnly = false; +end + +% if windowSize == 0 +% nf = 1; +% else +% nf = fix((T-windowSize+windowShift)/windowShift); +% end +% uttMask = zeros(nf, N, 'like', real(data(1))); + if windowSize == 0 % utterance mode, estimate two spatial covariance matrixes for each utterance, one is speech and the other is noise. 
if 0 % for loop version if IsInGPU(data) @@ -27,38 +42,50 @@ else scm_speech = zeros(nCh, nCh, D, N); scm_noise = zeros(nCh, nCh, D, N); - end + end for d=1:D for n=1:N for t=1:T - scm_speech(:,:,d,n) = scm_speech(:,:,d,n) + mask(d,t,n) * data(:,t,d) * data(:,t,d)'; - scm_noise(:,:,d,n) = scm_noise(:,:,d,n) + (1-mask(d,t,n)) * data(:,t,d) * data(:,t,d)'; + scm_speech(:,:,d,n) = scm_speech(:,:,d,n) + speechMask(d,t,n) * data(:,t,d) * data(:,t,d)'; + scm_noise(:,:,d,n) = scm_noise(:,:,d,n) + (1-speechMask(d,t,n)) * data(:,t,d) * data(:,t,d)'; end - scm_speech(:,:,d,n) = scm_speech(:,:,d,n) / sum(mask(d,:,n)); - scm_noise(:,:,d,n) = scm_noise(:,:,d,n) / (T-sum(mask(d,:,n))); + scm_speech(:,:,d,n) = scm_speech(:,:,d,n) / sum(speechMask(d,:,n)); + scm_noise(:,:,d,n) = scm_noise(:,:,d,n) / (T-sum(speechMask(d,:,n))); end end - else % vectorized + else +% vectorized: version 1 % data_cell = num2cell(data, [1]); -% mask_cell = num2cell(permute(mask, [3 2 1]), [1]); +% mask_cell = num2cell(permute(speechMask, [3 2 1]), [1]); % scm_speech_cell = cellfun(@(x,y) (reshape(x*y*y',nCh^2,1)), mask_cell, data_cell, 'UniformOutput', 0); % scm_noise_cell = cellfun(@(x,y) (reshape((1-x)*y*y',nCh^2,1)), mask_cell, data_cell, 'UniformOutput', 0); % scm_speech = reshape(sum(cell2mat(scm_speech_cell),2),nCh,nCh,D); -% scm_speech = bsxfun(@times, scm_speech, permute(1./sum(mask,2), [3 2 1])); +% scm_speech = bsxfun(@times, scm_speech, permute(1./sum(speechMask,2), [3 2 1])); % scm_noise = reshape(sum(cell2mat(scm_noise_cell),2),nCh,nCh,D); -% scm_noise = bsxfun(@times, scm_noise, permute(1./sum(1-mask,2), [3 2 1])); - - mask2 = permute(mask, [4 2 1 3]); +% scm_noise = bsxfun(@times, scm_noise, permute(1./sum(1-speechMask,2), [3 2 1])); +% + % version 2 + mask2 = permute(speechMask, [4 2 1 3]); scm_speech = ComputeCovMask(data, mask2); - scm_noise = ComputeCovMask(data, 1-mask2); + if speechOnly + output = scm_speech; + else + scm_noise = ComputeCovMask(data, 1-mask2); + output = 
[scm_speech; scm_noise]; + end + end + uttMask = zeros(1, N, 'like', real(data(1))); +else % online mode, estiamte covariance matrices for a sliding window of frames. + % to be implemented. + % frame number after moving window + mask2 = permute(speechMask, [4 2 1 3]); + [scm_speech, uttMask] = ComputeWinCovMask(data, mask2, prev_mask, windowSize, windowShift); + if speechOnly + output = scm_speech; + else + scm_noise = ComputeWinCovMask(data, 1-mask2, prev_mask, windowSize, windowShift); + output = [scm_speech; scm_noise]; end - - scm_speech2 = reshape(scm_speech, nCh^2*D, 1, N); - scm_noise2 = reshape(scm_noise, nCh^2*D, 1, N); - output = [scm_speech2; scm_noise2]; -else % online mode, estiamte covariance matrices for a sliding window of frames. - % to be implemented. end - end diff --git a/graph/F_SpatialNorm.m b/graph/F_SpatialNorm.m new file mode 100644 index 0000000..ac5c0ed --- /dev/null +++ b/graph/F_SpatialNorm.m @@ -0,0 +1,38 @@ +function normCovMat = F_SpatialNorm(prev_layer, curr_layer) + +covMat = prev_layer.a; +prev_mask = prev_layer.validFrameMask; +nCh = curr_layer.nCh; +nBin = curr_layer.nBin; +[~, nf, N] = size(covMat); + +if N == 1 + + % normalize the cov matrix by their diagonal elements, remove the effect of + % spectral power and only retains the phase information + dimSelectMask1 = bsxfun(@times, eye(nCh, nCh), ones(nCh, nCh, nBin)); + dimSelectIdx1 = find(reshape(dimSelectMask1, numel(dimSelectMask1),1) == 1); % diag elements index + diag_mean = squeeze(mean(reshape(covMat(dimSelectIdx1,:), nCh, nBin, nf), 1)); + if nf ==1 + diag_mean = diag_mean.'; + end + normCovMat = bsxfun(@times, permute(reshape(covMat, nCh, nCh, nBin, nf), [3 4 1 2]), 1./diag_mean); + normCovMat = reshape(permute(normCovMat, [3 4 1 2]), nCh^2*nBin, nf); + +else + % normalize the cov matrix by their diagonal elements, remove the effect of + % spectral power and only retains the phase information + dimSelectMask1 = bsxfun(@times, eye(nCh, nCh), ones(nCh, nCh, nBin)); + 
dimSelectIdx1 = find(reshape(dimSelectMask1, numel(dimSelectMask1),1) == 1); % diag elements index + diag_mean = squeeze(mean(reshape(covMat(dimSelectIdx1,:,:), nCh, nBin, nf, N), 1)); + if nf ==1 + diag_mean = reshape(diag_mean, size(diag_mean,1), 1, size(diag_mean, 2)); + end + % minibatch padding makes some frames zero, mean of that still be zero, can not be divided. + diag_mean1 = permute(bsxfun(@plus, permute(diag_mean, [2 3 1]), -1e10.*prev_mask), [3 1 2]); + normCovMat = bsxfun(@times, permute(reshape(covMat, nCh, nCh, nBin, nf, N), [3 4 5 1 2]), 1./diag_mean1); + normCovMat = reshape(permute(normCovMat, [4 5 1 2 3]), nCh^2*nBin, nf, N); + +end + +end diff --git a/graph/F_tdoa2weight.m b/graph/F_tdoa2weight.m index 7529aa9..8568860 100644 --- a/graph/F_tdoa2weight.m +++ b/graph/F_tdoa2weight.m @@ -1,8 +1,10 @@ -function output = F_tdoa2weight(input, freq_bin) +function output = F_tdoa2weight(input_layer, curr_layer) % assume input is an array of time delay of C microphone channels. % freq_bin is an array of center frequencies of N FFT bins. 
+input = input_layer.a;
+freq_bin = curr_layer.freq_bin;
 [D,T,N] = size(input);
 nCh = D+1;
 delay = [zeros(1,T); input];
diff --git a/prototypes/computePCA.m b/prototypes/computePCA.m
new file mode 100644
index 0000000..e16c9d5
--- /dev/null
+++ b/prototypes/computePCA.m
@@ -0,0 +1,81 @@
+
+function [W, b] = computePCA(Visible, nUttUsed, para, layer)
+% computePCA  Estimate a PCA projection from the output of a feature network.
+%
+% Inputs:
+%   Visible  - struct array; the .data field of each element is indexed per utterance
+%   nUttUsed - target number of utterances to use; 500 when omitted or empty
+%   para     - option struct handed to ParseOptions2 and FeatureTree2
+%   layer    - cell array of layers; features are taken from the last one
+% Outputs:
+%   W - projection matrix, one principal direction per row (at most 1000 rows)
+%   b - offset -W*mean(feat,2), so that W*x + b projects mean-removed features
+% Side effect: W, b and latent are saved to PCA_U<nUttUsed>_W_B.mat.
+
+if nargin < 2 || isempty(nUttUsed)
+    nUttUsed = 500;
+end
+
+% Subsample the utterances evenly when more are available than requested.
+nUtt = length(Visible(1).data);
+if nUtt > nUttUsed
+    step = ceil(nUtt/nUttUsed);
+    for i = 1:length(Visible)
+        Visible(i).data = Visible(i).data(1:step:end);
+    end
+end
+
+para.out_layer_idx = length(layer);
+para.output = 'dummy';
+para = ParseOptions2(para);
+output = FeatureTree2(Visible, para, layer);
+
+% Flatten every network output into a 2-D matrix before concatenation.
+feat = cell(1, length(output));
+if para.NET.variableLengthMinibatch
+    for i = 1:length(output)
+        featTmp = gather(output{i}{1});
+        [featTmp2, ~, ~] = ExtractVariableLengthTrajectory(featTmp);
+        feat{i} = cell2mat(featTmp2);
+    end
+else
+    for i = 1:length(output)
+        feat{i} = gather(output{i}{1});
+        [D,T,N] = size(feat{i});
+        if N > 1
+            feat{i} = reshape(feat{i}, D, T*N);
+        end
+    end
+end
+feat = cell2mat(feat);
+
+% [coeff, scores, latent] = princomp(feat','econ');
+% tmp=cumsum(latent)./sum(latent);
+% coeff = princomp(feat','econ');
+% W = coeff(:,1:para.topology.pcaDim)';
+% b = -W*mean(feat,2);
+%
+% [coeff1, scores1, latent1] = pca(feat');
+% cov1 = cov(feat');
+% [V1,D1] = eig(cov1);
+% D2 = diag(D1);
+
+fprintf('Load %d utts feats, begin pca ...', nUttUsed);
+[coeff, ~, latent] = pca(feat');
+
+% Cumulative explained variance is a float ratio, so test it with a threshold
+% (an exact == 0.95 comparison practically never matches and left idx empty).
+varianceTarget = 0.95;
+explained = cumsum(latent)./sum(latent);
+idx = find(explained >= varianceTarget, 1, 'first');
+fprintf('End of PCA, first %d components cover %g percent of the variance\n', idx, 100*varianceTarget);
+
+% W = coeff(:,1:para.topology.pcaDim)';
+% Keep up to 1000 directions, but never more than pca returned.
+nKeep = min(1000, size(coeff, 2));
+W = coeff(:,1:nKeep)';
+b = -W*mean(feat,2);
+
+save(['PCA_U' num2str(nUttUsed) '_W_B.mat'], 'W', 'b', 'latent');
+
+end
diff --git a/signal/feature/sfft_multi.m b/signal/feature/sfft_multi.m
index 635983d..3847484 100644
--- a/signal/feature/sfft_multi.m
+++ b/signal/feature/sfft_multi.m
@@ -17,9 +17,9 @@
     useGPU = 0;
end -if exist('doDithering')==0 || length(doDithering)==0 - x = x + randn(size(x))/2^32; -end +% if exist('doDithering')==0 || length(doDithering)==0 +% x = x + randn(size(x))/2^32; +% end % produce the hamming windowm if exist('window_type')==0 || length(window_type)==0 diff --git a/signal/gmm/ComputeCovMask.m b/signal/gmm/ComputeCovMask.m index 4d3c09f..1be3d07 100644 --- a/signal/gmm/ComputeCovMask.m +++ b/signal/gmm/ComputeCovMask.m @@ -5,12 +5,21 @@ % feature vector to the covariance matrix % function covMat = ComputeCovMask(data, mask) - +[nCh, ~, nBin, N] = size(data); weight = sqrt(bsxfun(@times, mask, 1./sum(mask))); data_scaled = bsxfun(@times, data, weight); -data_cell = num2cell(data_scaled, [1 2]); % convert to cell array and call cellfun for speed -tmp = cellfun(@(x) gather(x*x'), data_cell, 'UniformOutput', 0); -covMat = cell2mat(tmp); -% covMat = cell2mat_gpu(tmp); + +% % version 1 +% data_cell = num2cell(data_scaled, [1 2]); % convert to cell array and call cellfun for speed +% tmp = cellfun(@(x) gather(x*x'), data_cell, 'UniformOutput', 0); +% covMat = cell2mat(tmp); +% % covMat = cell2mat_gpu(tmp); +% covMat = reshape(covMat, nCh^2*nBin, 1, N); + +% version 2 + +covMat1 = outProdND(data_scaled); +covMat2 = squeeze(mean(covMat1, 3)); +covMat = reshape(covMat2, nCh^2*nBin, 1, N); end diff --git a/signal/gmm/ComputeWinCovMask.m b/signal/gmm/ComputeWinCovMask.m new file mode 100644 index 0000000..cc05ddd --- /dev/null +++ b/signal/gmm/ComputeWinCovMask.m @@ -0,0 +1,68 @@ +function [winCovMat, winMask] = ComputeWinCovMask(data, mask, prev_mask, windowSize, windowShift) +[nCh, nf_stft, nBin, N] = size(data); +weight = sqrt(bsxfun(@times, mask, 1./sum(mask))); +data_scaled = bsxfun(@times, data, weight); + +covMat = outProdND(data_scaled); + +nf = fix((nf_stft-windowSize+windowShift)/windowShift); +winMask = zeros(nf, N, 'like', real(covMat(1))); + +if N == 1 + covMat1 = reshape(permute(covMat, [1 2 4 3]), nCh^2*nBin, nf_stft); +% covMat1 = 
repmat(mean(covMat1,2), 1, size(covMat1, 2)); + % % Version 1: fast, but consume memory when windowSize is large +% nf = fix((nf_stft-windowSize+windowShift)/windowShift); +% covMat2 = ExpandContext_v2(covMat1, 0:windowSize-1); +% nf_idx = 1:windowShift:nf_stft-windowSize+1; +% covMat3 = covMat2(:, nf_idx, :); +% covMat3 = reshape(covMat3, nCh^2*nBin, windowSize, nf, N); +% winCovMat = squeeze(mean(covMat3, 2)); +% + % Version 2: less fast than version 1 +% SCM1 = conv2(covMat1, ones(1,windowSize, class(gather(covMat)))/windowSize, 'valid'); + SCM1 = conv2(covMat1, ones(1,windowSize, class(gather(covMat))), 'valid'); + winCovMat = SCM1(:, 1:windowShift:end); +% +% % Version 3: slowest in repmat and not support multiple sentences +% if IsInGPU(data) +% winCovMat11 = gpuArray.zeros(nf, nCh*nCh*nBin*windowSize); +% else +% winCovMat11 = zeros(nf, nCh*nCh*nBin*windowSize); +% end +% covMat11 = reshape(permute(covMat, [1 2 4 3]), 1, nCh*nCh*nBin*nf_stft); +% indf = nCh*nCh*nBin*windowShift*(0:(nf-1)).'; +% inds = (1:nCh*nCh*nBin*windowSize); +% % winCovMat(:) = covMat(indf(:,ones(1,nCh*nCh*nBin*windowSize))+inds(ones(nf,1),:)); % slow +% winCovMat11(:) = covMat11(repmat(indf,1,nCh*nCh*nBin*windowSize)+repmat(inds,nf,1)); +% winCovMat11 = permute(reshape(winCovMat11, nf, nCh*nCh*nBin, windowSize), [2 3 1]); +% winCovMat = squeeze(mean(winCovMat11, 2)); +else +% % version 1 +% covMat1 = reshape(permute(covMat, [1 2 4 3 5]), nCh^2*nBin, nf_stft, N); +% winCovMat = zeros(nCh^2*nBin, nf, N, 'like', covMat1(1)); +% for i=1:N +% idx = find(prev_mask(:,i) == 0, 1, 'last'); +% idx2 = fix((idx-windowSize+windowShift)/windowShift); +% covMat2 = squeeze(covMat1(:,1:idx,i)); +% SCM = conv2(covMat2, ones(1,windowSize, 'like', covMat1(1))/windowSize, 'valid'); +% winCovMat(:, 1:idx2, i) = SCM(:, 1:windowShift:end); +% winMask(idx2+1:end, i) = 1; +% end + + % Version 2, much fast + covMat2 = reshape(permute(covMat, [1 2 4 5 3]), nCh^2*nBin*N, nf_stft); + idx = arrayfun(@(x) 
find(gather(prev_mask(:,x)) == 0, 1, 'last'), 1:size(prev_mask,2)); + idx2 = arrayfun(@(x) fix((idx(x)-windowSize+windowShift)/windowShift), 1:length(idx)); +% covMat3 = conv2(covMat2, ones(1,windowSize, 'like', covMat2(1))/windowSize, 'valid'); + covMat3 = conv2(covMat2, ones(1,windowSize, 'like', covMat2(1)), 'valid'); + winCovMat1 = covMat3(:, 1:windowShift:end); + winCovMat2 = permute(reshape(winCovMat1, nCh^2*nBin, N, size(winCovMat1, 2)), [1 3 2]); + winCovMat = zeros(nCh^2*nBin, nf, N, 'like', winCovMat2(1)); + for i = 1:N + winCovMat(:, 1:idx2(i), i) = winCovMat2(:, 1:idx2(i), i); + winMask(idx2(i)+1:end, i) = 1; + end + +end +end \ No newline at end of file