diff --git a/graph/B_ConcatRealImag.m b/graph/B_ConcatRealImag.m
new file mode 100644
index 0000000..ecdd06f
--- /dev/null
+++ b/graph/B_ConcatRealImag.m
@@ -0,0 +1,13 @@
+function grad = B_ConcatRealImag(prev_layer, curr_layer, future_layers)
+% Backward pass of the 'concatrealimag' node: rebuild one complex-valued
+% gradient from the stacked [real; imag] gradient of the future layers.
+% prev_layer is accepted for interface uniformity but is not used here.
+stackedGrad = GetFutureGrad(future_layers, curr_layer);
+
+% F_ConcatRealImag stacks real(x) above imag(x) along dim 1, so the top half
+% of the rows belongs to the real part and the bottom half to the imaginary part.
+halfRows = size(stackedGrad, 1) / 2;
+gradRe = stackedGrad(1:halfRows, :, :);
+gradIm = stackedGrad(halfRows+1:end, :, :);
+grad = gradRe + 1i*gradIm;
+end
\ No newline at end of file
diff --git a/graph/DNN_Cost10.m b/graph/DNN_Cost10.m
index ee405f1..019f9e7 100644
--- a/graph/DNN_Cost10.m
+++ b/graph/DNN_Cost10.m
@@ -140,14 +140,20 @@
             [layer{i}.a, layer{i}.validFrameMask] = F_comp_gcc(prev_layers{1}, layer{i});
         case 'stft'
             [layer{i}.a, layer{i}.validFrameMask] = F_stft(prev_layers{1}, layer{i});
-            
+        case 'spatialcov'
+            [layer{i}.a, layer{i}.validFrameMask] = F_SpatialCov(prev_layers{1}, layer{i});       % do not support variable length yet
         case 'spatialcovmask'
-            layer{i}.a = F_SpatialCovMask(prev_layers, layer{i});       % do not support variable length yet
+            [layer{i}.a, layer{i}.validFrameMask] = F_SpatialCovMask(prev_layers, layer{i});       % do not support variable length yet
         case 'spatialcovsplitmask'
             layer{i}.a = F_SpatialCovSplitMask(prev_layers, layer{i});       % do not support variable length yet
         case 'mvdr_spatialcov'
             layer{i} = F_MVDR_spatialCov(prev_layers{1}, layer{i});       % do not support variable length yet
-            
+        case 'extspatialcovfeat'
+            layer{i}.a = F_ExtSpatialCovFeat(prev_layers{1}, layer{i});   % extract up triangle real and imag parts, diagonal part from spatial cov
+        case 'spatialnorm'
+            layer{i}.a = F_SpatialNorm(prev_layers{1}, layer{i});
+        case 'concatrealimag'
+            layer{i}.a = F_ConcatRealImag(prev_layers{1}); 
         case 'cov'
             layer{i}.a = F_cov(prev_layers{1}.a);       % do not support variable length yet
         case 'logdet'
@@ -247,7 +253,7 @@
             if isfield(layer{i}, 'mask')        % the mask defines what values can be tuned and what cannot be tuned. 
                 tmp = tmp .* layer{i}.mask;
             end
-			cost_func.cost = cost_func.cost + 0.5* L2weight * sum(sum(tmp.*tmp));
+			cost_func.cost = cost_func.cost + 0.5* L2weight * sum(sum(real(tmp.*conj(tmp))));
 		end
     end
 end
@@ -453,6 +459,8 @@
             layer{i}.grad = B_inner_product_normalized(prev_layers, future_layers);
         case 'concatenate'
             layer{i}.grad = B_concatenate(prev_layers, layer{i}, future_layers);
+        case 'concatrealimag'
+            layer{i}.grad = B_ConcatRealImag(prev_layers, layer{i}, future_layers);
 
         otherwise
             fprintf('Error: unknown output node type %s!\n', layer{i}.name);
diff --git a/graph/DNN_update.m b/graph/DNN_update.m
index 01d0685..84f1512 100644
--- a/graph/DNN_update.m
+++ b/graph/DNN_update.m
@@ -30,7 +30,7 @@
         end
     end
     
-    if para.NET.gradientClipThreshold > 0
+    if para.NET.gradientClipThreshold > 0 && isreal(grad_W)
         grad_W = max(-para.NET.gradientClipThreshold, grad_W);
         grad_W = min(para.NET.gradientClipThreshold, grad_W);
     end
@@ -65,7 +65,7 @@
         layer{Lidx(1)}.W = layer{Lidx(1)}.W - update{i}.W;
     end
     
-    if para.NET.weight_clip
+    if para.NET.weight_clip && isreal(layer{Lidx(1)}.W)
         % sometimes the weight will explode, so we need to add a limit to the value of the weights, e.g. +-10
         layer{Lidx(1)}.W = max(-para.NET.weight_clip,layer{Lidx(1)}.W);
         layer{Lidx(1)}.W = min(para.NET.weight_clip,layer{Lidx(1)}.W);
diff --git a/graph/F_ConcatRealImag.m b/graph/F_ConcatRealImag.m
new file mode 100644
index 0000000..045f504
--- /dev/null
+++ b/graph/F_ConcatRealImag.m
@@ -0,0 +1,7 @@
+function output = F_ConcatRealImag(prev_layer)
+% Forward pass: stack the real part of prev_layer.a above its imaginary part (along dim 1).
+activation = prev_layer.a;
+realRows = real(activation);
+imagRows = imag(activation);
+output = cat(1, realRows, imagRows);
+end
diff --git a/graph/F_ExtSpatialCovFeat.m b/graph/F_ExtSpatialCovFeat.m
new file mode 100644
index 0000000..3f282ae
--- /dev/null
+++ b/graph/F_ExtSpatialCovFeat.m
@@ -0,0 +1,160 @@
+function feat = F_ExtSpatialCovFeat(prev_layer, curr_layer)
+% F_ExtSpatialCovFeat  Real-valued features from diagonal-normalized spatial covariance matrices.
+%   prev_layer.a holds the covariance matrices as (nCh^2*nBin) x nf x N and prev_layer.validFrameMask
+%   the frame mask; curr_layer supplies nCh, nBin and the optional fields scm_select ('uptriangle' or
+%   'row'), scm_select_diag and scm_select_bin/scm_bin_shift. feat = [real; imag; (diag)] along dim 1.
+covMat = prev_layer.a;
+prev_mask = prev_layer.validFrameMask;
+nCh = curr_layer.nCh;
+nBin = curr_layer.nBin;
+[~, nf, N] = size(covMat);
+if isfield(curr_layer, 'scm_select')
+    scm_select = curr_layer.scm_select;
+else
+    scm_select = 'uptriangle';
+end
+if isfield(curr_layer, 'scm_select_diag')
+    scm_select_diag = curr_layer.scm_select_diag;
+else
+    scm_select_diag = 1;
+end
+if isfield(curr_layer, 'scm_select_bin')
+    scm_select_bin = curr_layer.scm_select_bin;
+    scm_bin_shift = curr_layer.scm_bin_shift;
+else
+    scm_select_bin = 0;
+end
+if N == 1
+    % normalize the cov matrix by their diagonal elements, remove the effect of
+    % spectral power and only retains the phase information
+    dimSelectMask1 = bsxfun(@times, eye(nCh, nCh), ones(nCh, nCh, nBin));
+    dimSelectIdx1 = find(reshape(dimSelectMask1, numel(dimSelectMask1),1) == 1); % diag elements index
+    diag_mean = squeeze(mean(reshape(covMat(dimSelectIdx1,:), nCh, nBin, nf), 1));   % nBin x nf (1 x nBin when nf == 1)
+    if nf ==1
+        diag_mean = diag_mean.';   % make the single-frame case nBin x 1
+    end
+    normCovMat = bsxfun(@times, permute(reshape(covMat, nCh, nCh, nBin, nf), [3 4 1 2]), 1./diag_mean);
+    normCovMat = reshape(permute(normCovMat, [3 4 1 2]), nCh^2*nBin, nf);
+    
+    % get the upper triangle off-diagonal elements which are complex-valued
+    if strcmpi(scm_select, 'uptriangle')
+        selectMat = triu(ones(nCh, nCh),1); % 1. upper triangle
+    elseif strcmpi(scm_select, 'row')
+        selectMat = zeros(nCh, nCh); selectMat(1,2:end) = ones(1, nCh-1); % 2. first row
+    else
+        error('F_ExtSpatialCovFeat:unknownSelect', 'Error: unknown scm feature select type: %s', lower(scm_select));   % stop here: selectMat would be undefined below
+    end
+    
+    dimSelectMask2 = bsxfun(@times, selectMat, ones(nCh, nCh, nBin));
+    dimSelectIdx2 = find(reshape(dimSelectMask2, numel(dimSelectMask2),1) == 1);
+    real_part = real(normCovMat(dimSelectIdx2,:));
+    % imag_part = imag(normCovMat(dimSelectIdx2,:));
+    % the first and last frequency bins (e.g. 1 and 257) are excluded from the imaginary part
+    dimSelectMask3 = bsxfun(@times, selectMat, cat(3,zeros(nCh, nCh, 1), ones(nCh, nCh, nBin-2), zeros(nCh, nCh, 1)));
+    dimSelectIdx3 = find(reshape(dimSelectMask3, numel(dimSelectMask3),1) == 1);
+    imag_part = imag(normCovMat(dimSelectIdx3,:));
+    
+    % get the diagonal elements which are real values
+    % diag_part = covMat(dimSelectIdx1,:);
+    % diag_part = log(max(eps,abs(diag_part)));
+    if scm_select_diag
+        diag_part = real(normCovMat(dimSelectIdx1,:));
+    end
+else
+    % select 1 bin by average every scm_bin_shift bins (NOTE(review): only done on this N > 1 path, not when N == 1 - confirm intended)
+    if scm_select_bin
+        covMat1 = reshape(covMat, nCh^2, nBin, nf, N);
+        covMat2 = reshape(permute(covMat1, [1 3 4 2]), nCh^2*nf*N, nBin);
+        covMat3 = conv2(covMat2, ones(1,scm_bin_shift, 'like', covMat2(1))/scm_bin_shift, 'valid');
+        covMat4 = covMat3(:, 1:scm_bin_shift:end);
+        nBin = size(covMat4, 2);   % number of bins after averaging and decimation
+        covMat = reshape(permute(reshape(covMat4, nCh^2, nf, N, nBin), [1 4 2 3]), nCh^2*nBin, nf, N);
+        
+    end
+    
+    % normalize the cov matrix by their diagonal elements, remove the effect of
+    % spectral power and only retains the phase information
+    dimSelectMask1 = bsxfun(@times, eye(nCh, nCh), ones(nCh, nCh, nBin));
+    dimSelectIdx1 = find(reshape(dimSelectMask1, numel(dimSelectMask1),1) == 1); % diag elements index
+    diag_mean = squeeze(mean(reshape(covMat(dimSelectIdx1,:,:), nCh, nBin, nf, N), 1));
+    if nf ==1
+        diag_mean = reshape(diag_mean, size(diag_mean,1), 1, size(diag_mean, 2));   % restore the squeezed frame dim: nBin x 1 x N
+    end
+    % minibatch padding makes some frames all zero, so their diagonal mean is zero and cannot be divided by; offset it by -1e10 where prev_mask is set
+    diag_mean1 = permute(bsxfun(@plus, permute(diag_mean, [2 3 1]), -1e10.*prev_mask), [3 1 2]);
+    normCovMat = bsxfun(@times, permute(reshape(covMat, nCh, nCh, nBin, nf, N), [3 4 5 1 2]), 1./diag_mean1);
+    normCovMat = reshape(permute(normCovMat, [4 5 1 2 3]), nCh^2*nBin, nf, N);
+    
+%     % select 1 bin by average every scm_bin_shift bins
+%     if scm_select_bin
+%         normCovMat1 = reshape(normCovMat, nCh^2, nBin, nf, N);
+%         normCovMat2 = reshape(permute(normCovMat1, [1 3 4 2]), nCh^2*nf*N, nBin);
+%         normCovMat3 = conv2(normCovMat2, ones(1,scm_bin_shift, 'like', normCovMat2(1))/scm_bin_shift, 'valid');
+%         normCovMat4 = normCovMat3(:, 1:scm_bin_shift:end);
+%         nBin = size(normCovMat4, 2);
+%         normCovMat = reshape(permute(reshape(normCovMat4, nCh^2, nf, N, nBin), [1 4 2 3]), nCh^2*nBin, nf, N);
+%         
+%     end
+    
+    % get the upper triangle off-diagonal elements which are complex-valued
+    if strcmpi(scm_select, 'uptriangle')
+        selectMat = triu(ones(nCh, nCh),1); % 1. upper triangle
+    elseif strcmpi(scm_select, 'row')
+        selectMat = zeros(nCh, nCh); selectMat(1,2:end) = ones(1, nCh-1); % 2. first row
+    else
+        error('F_ExtSpatialCovFeat:unknownSelect', 'Error: unknown scm feature select type: %s', lower(scm_select));   % stop here: selectMat would be undefined below
+    end
+    
+    dimSelectMask2 = bsxfun(@times, selectMat, ones(nCh, nCh, nBin));
+    dimSelectIdx2 = find(reshape(dimSelectMask2, numel(dimSelectMask2),1) == 1);
+    real_part = real(normCovMat(dimSelectIdx2,:,:));
+    % imag_part = imag(normCovMat(dimSelectIdx2,:));
+    % without bin averaging, the first and last frequency bins (e.g. 1 and 257) are excluded from the imaginary part
+    if scm_select_bin
+        dimSelectMask3 = bsxfun(@times, selectMat, ones(nCh, nCh, nBin));
+    else
+        dimSelectMask3 = bsxfun(@times, selectMat, cat(3,zeros(nCh, nCh, 1), ones(nCh, nCh, nBin-2), zeros(nCh, nCh, 1)));
+    end
+    dimSelectIdx3 = find(reshape(dimSelectMask3, numel(dimSelectMask3),1) == 1);
+    imag_part = imag(normCovMat(dimSelectIdx3,:,:));
+    
+    % get the diagonal elements which are real values
+    if scm_select_diag
+        % dimSelectIdx1 computed above is still valid here: nBin has not
+        % changed since the diagonal index was built for the normalization.
+        diag_part = real(normCovMat(dimSelectIdx1,:,:));
+    end
+end
+
+% get the final feature vector
+if scm_select_diag
+    feat = [real_part; imag_part; diag_part];
+else
+    feat = [real_part; imag_part];
+end
+% real_part = reshape(real_part, 7, 257, nf, N);
+% imag_part = reshape(imag_part, 7, 255, nf, N);
+% real_part = real_part(:, 6:5:end,:,:);
+% imag_part = imag_part(:, 5:5:end,:,:);
+% 
+% feat = [reshape(real_part, 7*51, nf, N); reshape(imag_part, 7*51, nf, N)];
+
+
+% covMat = reshape(covMat(:,:,:), nCh, nCh, nBin, nf, N);
+% covMatCell = num2cell(covMat, [1 2]);
+% omegaTau = cellfun(@GetPrincVec, covMatCell, 'UniformOutput', 0);
+% output = permute(cell2mat(omegaTau), [1 3 4 5 2]);
+% 
+% feat = output(2:8, 5:5:end, :,:);
+% [d1,d2,d3,d4] = size(feat);
+% feat = reshape(feat, d1*d2, d3, d4);
+
+end
+
+function omegaTau = GetPrincVec(A)
+% Phase of each entry of A's principal eigenvector, relative to its first entry.
+[eigVecs, eigVals] = eig(A);
+[~, top] = max(diag(eigVals));
+principal = eigVecs(:, top);
+omegaTau = gather(angle(principal ./ principal(1)));
+end
diff --git a/graph/F_SpatialCov.m b/graph/F_SpatialCov.m
index 5c8e67c..62d253f 100644
--- a/graph/F_SpatialCov.m
+++ b/graph/F_SpatialCov.m
@@ -1,4 +1,4 @@
-function output = F_SpatialCov(input_layer, curr_layer)
+function [output, mask] = F_SpatialCov(input_layer, curr_layer)
 
 input = input_layer.a;
 [D,T,N] = size(input);
@@ -6,16 +6,87 @@
 curr_layer = SetDefaultValue(curr_layer, 'winSize', 0);
 curr_layer = SetDefaultValue(curr_layer, 'winShift', 1);
 
+if T <= curr_layer.winSize
+    windowSize = 0;
+    windowShift = 1;
+else
+    windowSize = curr_layer.winSize;
+    windowShift = curr_layer.winShift;
+end
+
 nBin = length(curr_layer.freqBin);
 nCh = D/nBin;
 
+input2 = reshape(input, nBin, nCh, T, N);
+
+if windowSize == 0
+    nf = 1;
+else
+    nf = fix((T-windowSize+windowShift)/windowShift);
+end
+mask = zeros(nf, N, 'like', real(input2(1)));
+
 if N==1
-    input2 = reshape(input, nBin, nCh, T,N);
-    R = ComplexSpectrum2SpatialCov(input2, curr_layer.winSize, curr_layer.winShift);
-    output = permute(R, [3 1 2 4]);
-    output = reshape(output, nBin*nCh^2, size(output,4),N);
+%     R = ComplexSpectrum2SpatialCov(input2, windowSize, windowShift);
+% %     output = permute(R, [3 1 2 4]);
+% %     output = reshape(output, nBin*nCh^2, size(output,4),N);
+%     output = reshape(R, nCh^2*nBin, size(R,4),N);
+    
+    X2 = permute(input2, [2 1 3]);
+    XX = outProdND(X2);
+    XX2 = reshape(XX, nCh^2*nBin, T);
+    
+    if windowSize == 0
+        output = squeeze(mean(XX2, 2));
+    else
+%         idx = [ones(1,half_ctx) 1:T ones(1,half_ctx)*T];
+%         SCM = conv2(XX2, ones(1,windowSize, class(gather(input2)))/windowSize, 'valid');
+        SCM = conv2(XX2, ones(1,windowSize, class(gather(input2))), 'valid');
+        output = SCM(:, 1:windowShift:end);
+    end
+    
 else
-    % to be implemented
+    X2 = permute(input2, [2 1 3 4]);
+    XX = outProdND(X2);
+    XX2 = reshape(XX, nCh^2*nBin, T, N);
+    
+    if windowSize == 0
+        output = mean(XX2, 2);
+    else
+% %         idx = [ones(1,half_ctx) 1:T ones(1,half_ctx)*T];
+%         XX3 = reshape(permute(XX2, [1 3 2]), nCh^2*nBin*N, T);
+%         SCM = conv2(XX3, ones(1,windowSize, class(gather(input2)))/windowSize, 'valid');
+%         output = SCM(:, 1:windowShift:end);
+%         output = permute(reshape(output, nCh^2*nBin, N, size(output, 2)), [1 3 2]);
+        
+%         % Version 1
+%         prev_mask = input_layer.validFrameMask;
+%         output = zeros(nCh^2*nBin, nf, N, 'like', XX2);
+%         for i=1:N
+%             idx = find(prev_mask(:,i) == 0, 1, 'last');
+%             idx2 = fix((idx-windowSize+windowShift)/windowShift);
+%             XX3 = squeeze(XX2(:,1:idx,i));
+%             SCM = conv2(XX3, ones(1,windowSize, class(gather(input2)))/windowSize, 'valid');
+%             output(:, 1:idx2, i) = SCM(:, 1:windowShift:end);
+%             mask(idx2+1:end, i) = 1;
+%         end
+        
+        % Version 2, much fast
+        prev_mask = input_layer.validFrameMask;
+        idx = arrayfun(@(x) find(gather(prev_mask(:,x)) == 0, 1, 'last'), 1:size(prev_mask,2));
+        idx2 = arrayfun(@(x) fix((idx(x)-windowSize+windowShift)/windowShift), 1:length(idx));
+        XX31 = reshape(permute(XX2, [1 3 2]), nCh^2*nBin*N, T);
+%         SCM1 = conv2(XX31, ones(1,windowSize, class(gather(input2)))/windowSize, 'valid');
+        SCM1 = conv2(XX31, ones(1,windowSize, class(gather(input2))), 'valid');
+        output1 = SCM1(:, 1:windowShift:end);
+        output2 = permute(reshape(output1, nCh^2*nBin, N, size(output1, 2)), [1 3 2]);
+        output = zeros(nCh^2*nBin, nf, N, 'like', XX2);
+        for i = 1:N
+            output(:, 1:idx2(i), i) = output2(:, 1:idx2(i), i);
+            mask(idx2(i)+1:end, i) = 1;
+        end
+        
+    end
 end
 
-end
\ No newline at end of file
+end
diff --git a/graph/F_SpatialCovMask.m b/graph/F_SpatialCovMask.m
index 45473e0..4db29ad 100644
--- a/graph/F_SpatialCovMask.m
+++ b/graph/F_SpatialCovMask.m
@@ -1,24 +1,39 @@
-% Estimate spatial covariance matrix for sentences using a mask. The mask
+% Estimate spatial covariance matrix for sentences using a speechMask. The speechMask
 % specifies speech presense probability at all time frequency locations,
 % with a 1 means speech present and 0 means speech absent. 
 %
-function output = F_SpatialCovMask(prev_layers, curr_layer)
-mask = prev_layers{1}.a;
+function [output, uttMask] = F_SpatialCovMask(prev_layers, curr_layer)
+speechMask = prev_layers{1}.a;
 data = prev_layers{2}.a;
+prev_mask =prev_layers{2}.validFrameMask;
 
-if isfield(curr_layer, 'windowSize')
-    windowSize = curr_layer.windowSize;
-else
-    windowSize = 0;
-end
-
-[D,T,N] = size(mask);
+[D,T,N] = size(speechMask);
 [D2,T,N] = size(data);
 nCh = D2/D;
 data = reshape(data, D, nCh, T, N);
 data = permute(data, [2 3 1 4]);
 % data = abs(data);
 
+if isfield(curr_layer, 'winSize') && T > curr_layer.winSize
+    windowSize = curr_layer.winSize;
+    windowShift = curr_layer.winShift;
+else
+    windowSize = 0;
+end
+
+if isfield(curr_layer, 'speechOnly')
+    speechOnly = curr_layer.speechOnly;
+else
+    speechOnly = false;
+end
+
+% if windowSize == 0
+%     nf = 1;
+% else
+%     nf = fix((T-windowSize+windowShift)/windowShift);
+% end
+% uttMask = zeros(nf, N, 'like', real(data(1)));
+
 if windowSize == 0      % utterance mode, estimate two spatial covariance matrixes for each utterance, one is speech and the other is noise.
     if 0    % for loop version
         if IsInGPU(data)
@@ -27,38 +42,50 @@
         else
             scm_speech = zeros(nCh, nCh, D, N);
             scm_noise = zeros(nCh, nCh, D, N);
-        end    
+        end
         for d=1:D
             for n=1:N
                 for t=1:T
-                    scm_speech(:,:,d,n) = scm_speech(:,:,d,n) + mask(d,t,n) * data(:,t,d) * data(:,t,d)';
-                    scm_noise(:,:,d,n) = scm_noise(:,:,d,n) + (1-mask(d,t,n)) * data(:,t,d) * data(:,t,d)';
+                    scm_speech(:,:,d,n) = scm_speech(:,:,d,n) + speechMask(d,t,n) * data(:,t,d) * data(:,t,d)';
+                    scm_noise(:,:,d,n) = scm_noise(:,:,d,n) + (1-speechMask(d,t,n)) * data(:,t,d) * data(:,t,d)';
                 end
-                scm_speech(:,:,d,n) = scm_speech(:,:,d,n) / sum(mask(d,:,n));
-                scm_noise(:,:,d,n) = scm_noise(:,:,d,n) / (T-sum(mask(d,:,n)));
+                scm_speech(:,:,d,n) = scm_speech(:,:,d,n) / sum(speechMask(d,:,n));
+                scm_noise(:,:,d,n) = scm_noise(:,:,d,n) / (T-sum(speechMask(d,:,n)));
             end
         end
-    else        % vectorized
+    else
+%         vectorized: version 1
 %         data_cell = num2cell(data, [1]);
-%         mask_cell = num2cell(permute(mask, [3 2 1]), [1]);
+%         mask_cell = num2cell(permute(speechMask, [3 2 1]), [1]);
 %         scm_speech_cell = cellfun(@(x,y) (reshape(x*y*y',nCh^2,1)), mask_cell, data_cell, 'UniformOutput', 0);
 %         scm_noise_cell = cellfun(@(x,y) (reshape((1-x)*y*y',nCh^2,1)), mask_cell, data_cell, 'UniformOutput', 0);
 %         scm_speech = reshape(sum(cell2mat(scm_speech_cell),2),nCh,nCh,D);
-%         scm_speech = bsxfun(@times, scm_speech, permute(1./sum(mask,2), [3 2 1]));
+%         scm_speech = bsxfun(@times, scm_speech, permute(1./sum(speechMask,2), [3 2 1]));
 %         scm_noise = reshape(sum(cell2mat(scm_noise_cell),2),nCh,nCh,D);
-%         scm_noise = bsxfun(@times, scm_noise, permute(1./sum(1-mask,2), [3 2 1]));
-        
-        mask2 = permute(mask, [4 2 1 3]);
+%         scm_noise = bsxfun(@times, scm_noise, permute(1./sum(1-speechMask,2), [3 2 1]));
+%         
+        % version 2
+        mask2 = permute(speechMask, [4 2 1 3]);
         scm_speech = ComputeCovMask(data, mask2);
-        scm_noise = ComputeCovMask(data, 1-mask2);
+        if speechOnly
+            output = scm_speech;
+        else
+            scm_noise = ComputeCovMask(data, 1-mask2);
+            output = [scm_speech; scm_noise];
+        end
+    end
+    uttMask = zeros(1, N, 'like', real(data(1)));
+else        % online mode, estiamte covariance matrices for a sliding window of frames.
+    % to be implemented.
+    % frame number after moving window
+    mask2 = permute(speechMask, [4 2 1 3]);
+    [scm_speech, uttMask] = ComputeWinCovMask(data, mask2, prev_mask, windowSize, windowShift);
+    if speechOnly
+        output = scm_speech;
+    else
+        scm_noise = ComputeWinCovMask(data, 1-mask2, prev_mask, windowSize, windowShift);
+        output = [scm_speech; scm_noise];
     end
-    
-    scm_speech2 = reshape(scm_speech, nCh^2*D, 1, N);
-    scm_noise2 = reshape(scm_noise, nCh^2*D, 1, N);
-    output = [scm_speech2; scm_noise2];    
-else        % online mode, estiamte covariance matrices for a sliding window of frames. 
-    % to be implemented.    
 end
 
-
 end
diff --git a/graph/F_SpatialNorm.m b/graph/F_SpatialNorm.m
new file mode 100644
index 0000000..ac5c0ed
--- /dev/null
+++ b/graph/F_SpatialNorm.m
@@ -0,0 +1,38 @@
+function normCovMat = F_SpatialNorm(prev_layer, curr_layer)
+% F_SpatialNorm  Scale every spatial covariance matrix by the mean of its diagonal.
+%   prev_layer.a is (nCh^2*nBin) x nf x N; the result has the same size. Dividing by the
+%   per-bin, per-frame diagonal mean removes spectral power and keeps the phase information.
+scm = prev_layer.a;
+frameMask = prev_layer.validFrameMask;
+nCh = curr_layer.nCh;
+nBin = curr_layer.nBin;
+[~, nf, N] = size(scm);
+
+% Row indices of the diagonal entries of each nCh x nCh matrix, for all bins.
+diagRows = find(reshape(repmat(eye(nCh, nCh), [1 1 nBin]), [], 1) == 1);
+
+if N == 1
+    % Single sequence: work on a (nCh^2*nBin) x nf matrix.
+    diagAvg = squeeze(mean(reshape(scm(diagRows,:), nCh, nBin, nf), 1));
+    if nf == 1
+        diagAvg = diagAvg.';    % a single frame leaves a 1 x nBin row; make it nBin x 1
+    end
+    invScale = 1 ./ diagAvg;
+    scmByBin = permute(reshape(scm, nCh, nCh, nBin, nf), [3 4 1 2]);
+    scaled = bsxfun(@times, scmByBin, invScale);
+    normCovMat = reshape(permute(scaled, [3 4 1 2]), nCh^2*nBin, nf);
+else
+    % Minibatch: keep the sequence dimension N throughout.
+    diagAvg = squeeze(mean(reshape(scm(diagRows,:,:), nCh, nBin, nf, N), 1));
+    if nf == 1
+        diagAvg = reshape(diagAvg, size(diagAvg,1), 1, size(diagAvg, 2));
+    end
+    % Padded frames are all zero, so their diagonal mean is zero and cannot be
+    % divided by; offset it by -1e10 wherever the frame mask is set.
+    maskOffset = -1e10 .* frameMask;
+    diagAvg = permute(bsxfun(@plus, permute(diagAvg, [2 3 1]), maskOffset), [3 1 2]);
+    scmByBin = permute(reshape(scm, nCh, nCh, nBin, nf, N), [3 4 5 1 2]);
+    scaled = bsxfun(@times, scmByBin, 1 ./ diagAvg);
+    normCovMat = reshape(permute(scaled, [4 5 1 2 3]), nCh^2*nBin, nf, N);
+end
+end
diff --git a/graph/F_tdoa2weight.m b/graph/F_tdoa2weight.m
index 7529aa9..8568860 100644
--- a/graph/F_tdoa2weight.m
+++ b/graph/F_tdoa2weight.m
@@ -1,8 +1,10 @@
 
 
-function output = F_tdoa2weight(input, freq_bin)
+function output = F_tdoa2weight(input_layer, curr_layer)
 % assume input is an array of time delay of C microphone channels. 
 % freq_bin is an array of center frequencies of N FFT bins. 
+input = input_layer.a;
+freq_bin = curr_layer.freq_bin;
 [D,T,N] = size(input);
 nCh = D+1;
 delay = [zeros(1,T); input];
diff --git a/prototypes/computePCA.m b/prototypes/computePCA.m
new file mode 100644
index 0000000..e16c9d5
--- /dev/null
+++ b/prototypes/computePCA.m
@@ -0,0 +1,58 @@
+
+function [W, b] = computePCA(Visible, nUttUsed, para, layer)
+% computePCA  Estimate a PCA projection from the features produced by FeatureTree2.
+%   nUttUsed (default 500) caps how many utterances are used; Visible is subsampled evenly.
+%   W holds the leading principal directions as rows (at most 1000) and b = -W*mean(feat,2),
+%   so W*x + b projects a mean-removed feature x. W, b and latent are also saved to a .mat file.
+if ~exist('nUttUsed', 'var') || isempty(nUttUsed)
+    nUttUsed = 500;
+end
+nUtt = length(Visible(1).data);
+if nUtt>nUttUsed
+    step = ceil(nUtt/nUttUsed);
+    for i=1:length(Visible)
+        Visible(i).data = Visible(i).data(1:step:end);
+    end
+end
+
+para.out_layer_idx = length(layer);
+para.output = 'dummy';
+para = ParseOptions2(para);
+output = FeatureTree2(Visible, para, layer);
+
+% Collect every feature frame as one column of a D x (total frames) matrix.
+feat = cell(1, length(output));
+for i=1:length(output)
+    featTmp = gather(output{i}{1});
+    if para.NET.variableLengthMinibatch
+        [featTmp2, ~, ~] = ExtractVariableLengthTrajectory(featTmp);
+        feat{i} = cell2mat(featTmp2);
+    else
+        feat{i} = reshape(featTmp, size(featTmp,1), []);   % D x T x N -> D x (T*N)
+    end
+end
+feat = cell2mat(feat);
+
+% [coeff, scores, latent] = princomp(feat','econ');
+% tmp=cumsum(latent)./sum(latent);
+% coeff = princomp(feat','econ');
+% W = coeff(:,1:para.topology.pcaDim)';
+% b = -W*mean(feat,2);
+% 
+% [coeff1, scores1, latent1] = pca(feat');
+% cov1 = cov(feat');
+% [V1,D1] = eig(cov1);
+% D2 = diag(D1);
+
+fprintf('Load %d utts feats, begin pca ...', nUttUsed);
+[coeff, ~, latent] = pca(feat');
+% Smallest number of components whose cumulative variance ratio reaches 95%. Use a threshold
+% (>=): the cumulative ratio almost never equals 0.95 exactly in floating point.
+idx = find(cumsum(latent)./sum(latent) >= 0.95, 1, 'first');
+fprintf('End of PCA, select %d can cover 95 percent\n', idx);
+% W = coeff(:,1:para.topology.pcaDim)';
+W = coeff(:,1:min(1000, size(coeff,2)))';   % never ask for more components than pca returned
+b = -W*mean(feat,2);
+
+save(['PCA_U' num2str(nUttUsed) '_W_B.mat'], 'W', 'b', 'latent');
+end
diff --git a/signal/feature/sfft_multi.m b/signal/feature/sfft_multi.m
index 635983d..3847484 100644
--- a/signal/feature/sfft_multi.m
+++ b/signal/feature/sfft_multi.m
@@ -17,9 +17,9 @@
     useGPU = 0;
 end
 
-if exist('doDithering')==0 || length(doDithering)==0
-    x = x + randn(size(x))/2^32;
-end
+% if exist('doDithering')==0 || length(doDithering)==0
+%     x = x + randn(size(x))/2^32;
+% end
 
 % produce the hamming windowm
 if exist('window_type')==0 || length(window_type)==0
diff --git a/signal/gmm/ComputeCovMask.m b/signal/gmm/ComputeCovMask.m
index 4d3c09f..1be3d07 100644
--- a/signal/gmm/ComputeCovMask.m
+++ b/signal/gmm/ComputeCovMask.m
@@ -5,12 +5,21 @@
 %   feature vector to the covariance matrix
 %
 function covMat = ComputeCovMask(data, mask)
-
+[nCh, ~, nBin, N] = size(data);
 weight = sqrt(bsxfun(@times, mask, 1./sum(mask)));
 data_scaled = bsxfun(@times, data, weight);
-data_cell = num2cell(data_scaled, [1 2]);       % convert to cell array and call cellfun for speed
-tmp = cellfun(@(x) gather(x*x'), data_cell, 'UniformOutput', 0);
-covMat = cell2mat(tmp);
-% covMat = cell2mat_gpu(tmp);
+
+% % version 1
+% data_cell = num2cell(data_scaled, [1 2]);       % convert to cell array and call cellfun for speed
+% tmp = cellfun(@(x) gather(x*x'), data_cell, 'UniformOutput', 0);
+% covMat = cell2mat(tmp);
+% % covMat = cell2mat_gpu(tmp);
+% covMat = reshape(covMat, nCh^2*nBin, 1, N);
+
+% version 2
+
+covMat1 = outProdND(data_scaled);
+covMat2 = squeeze(mean(covMat1, 3));
+covMat = reshape(covMat2, nCh^2*nBin, 1, N);
 
 end
diff --git a/signal/gmm/ComputeWinCovMask.m b/signal/gmm/ComputeWinCovMask.m
new file mode 100644
index 0000000..cc05ddd
--- /dev/null
+++ b/signal/gmm/ComputeWinCovMask.m
@@ -0,0 +1,68 @@
+function [winCovMat, winMask] = ComputeWinCovMask(data, mask, prev_mask, windowSize, windowShift)
+% ComputeWinCovMask  Mask-weighted covariance matrices accumulated over sliding windows of frames.
+%   data is nCh x nFrames x nBin x N and mask holds the per-frame weights (normalized below by
+%   their sum). prev_mask is indexed as (frame, sequence); frames where it is 0 are presumably the
+%   valid ones - confirm with the caller. winCovMat is (nCh^2*nBin) x nf x N, windows of windowSize
+%   frames hopped by windowShift; winMask is nf x N and is 1 for windows beyond the valid frames.
+[nCh, nf_stft, nBin, N] = size(data);
+weight = sqrt(bsxfun(@times, mask, 1./sum(mask)));
+data_scaled = bsxfun(@times, data, weight);
+covMat = outProdND(data_scaled);   % assumed to be nCh x nCh x nFrames x nBin x N (outProdND is defined elsewhere) - TODO confirm
+nf = fix((nf_stft-windowSize+windowShift)/windowShift);
+winMask = zeros(nf, N, 'like', real(covMat(1)));
+if N == 1
+    covMat1 = reshape(permute(covMat, [1 2 4 3]), nCh^2*nBin, nf_stft);
+%     covMat1 = repmat(mean(covMat1,2), 1, size(covMat1, 2));
+    % % Version 1: fast, but consume memory when windowSize is large
+%     nf = fix((nf_stft-windowSize+windowShift)/windowShift);
+%     covMat2 = ExpandContext_v2(covMat1, 0:windowSize-1);
+%     nf_idx = 1:windowShift:nf_stft-windowSize+1;
+%     covMat3 = covMat2(:, nf_idx, :);
+%     covMat3 = reshape(covMat3, nCh^2*nBin, windowSize, nf, N);
+%     winCovMat = squeeze(mean(covMat3, 2));
+    % Version 2: less fast than version 1
+%     SCM1 = conv2(covMat1, ones(1,windowSize, class(gather(covMat)))/windowSize, 'valid');
+    SCM1 = conv2(covMat1, ones(1,windowSize, class(gather(covMat))), 'valid');   % running sum over windowSize frames
+    winCovMat = SCM1(:, 1:windowShift:end);   % keep one window every windowShift frames
+%     % Version 3: slowest in repmat and not support multiple sentences
+%     if IsInGPU(data)
+%         winCovMat11 = gpuArray.zeros(nf, nCh*nCh*nBin*windowSize);
+%     else
+%         winCovMat11 = zeros(nf, nCh*nCh*nBin*windowSize);
+%     end
+%     covMat11 = reshape(permute(covMat, [1 2 4 3]), 1, nCh*nCh*nBin*nf_stft);
+%     indf = nCh*nCh*nBin*windowShift*(0:(nf-1)).';
+%     inds = (1:nCh*nCh*nBin*windowSize);
+%     % winCovMat(:) = covMat(indf(:,ones(1,nCh*nCh*nBin*windowSize))+inds(ones(nf,1),:)); % slow
+%     winCovMat11(:) = covMat11(repmat(indf,1,nCh*nCh*nBin*windowSize)+repmat(inds,nf,1));
+%     winCovMat11 = permute(reshape(winCovMat11, nf, nCh*nCh*nBin, windowSize), [2 3 1]);
+%     winCovMat = squeeze(mean(winCovMat11, 2));
+else
+%     % version 1
+%     covMat1 = reshape(permute(covMat, [1 2 4 3 5]), nCh^2*nBin, nf_stft, N);
+%     winCovMat = zeros(nCh^2*nBin, nf, N, 'like', covMat1(1));
+%     for i=1:N
+%         idx = find(prev_mask(:,i) == 0, 1, 'last');
+%         idx2 = fix((idx-windowSize+windowShift)/windowShift);
+%         covMat2 = squeeze(covMat1(:,1:idx,i));
+%         SCM = conv2(covMat2, ones(1,windowSize, 'like', covMat1(1))/windowSize, 'valid');
+%         winCovMat(:, 1:idx2, i) = SCM(:, 1:windowShift:end);
+%         winMask(idx2+1:end, i) = 1;
+%     end
+    % Version 2, much fast
+    covMat2 = reshape(permute(covMat, [1 2 4 5 3]), nCh^2*nBin*N, nf_stft);
+    % idx: last frame with prev_mask == 0 per sequence (0 when there is none, so arrayfun never sees an empty result).
+    % idx2: number of complete windows inside those frames, clamped at 0 so the indexing below stays valid for short sequences.
+    idx = arrayfun(@(x) max([0; find(gather(prev_mask(:,x)) == 0, 1, 'last')]), 1:size(prev_mask,2));
+    idx2 = max(0, fix((idx-windowSize+windowShift)/windowShift));
+%     covMat3 = conv2(covMat2, ones(1,windowSize, 'like', covMat2(1))/windowSize, 'valid');
+    covMat3 = conv2(covMat2, ones(1,windowSize, 'like', covMat2(1)), 'valid');
+    winCovMat1 = covMat3(:, 1:windowShift:end);
+    winCovMat2 = permute(reshape(winCovMat1, nCh^2*nBin, N, size(winCovMat1, 2)), [1 3 2]);
+    winCovMat = zeros(nCh^2*nBin, nf, N, 'like', winCovMat2(1));
+    for i = 1:N
+        winCovMat(:, 1:idx2(i), i) = winCovMat2(:, 1:idx2(i), i);   % copy only windows that lie inside the valid frames
+        winMask(idx2(i)+1:end, i) = 1;   % flag the remaining windows
+    end
+end
+end
\ No newline at end of file