BUTSpeechFIT · Aug 30, 2021
diff --git a/VBx/VB_diarization.py b/VBx/VB_diarization.py
+148-196
+148-196
diff --git a/VBx/diarization_lib.py b/VBx/diarization_lib.py
+61-61
+61-61
diff --git a/VBx/features.py b/VBx/features.py
+28-31
+28-31
@@ -20,14 +20,14 @@ def twoGMMcalib_lin(s, niters=20):
     var = np.var(s)
     threshold = np.inf
     for _ in range(niters):
-        lls = np.log(weights)-0.5*np.log(var) - 0.5*(s[:,np.newaxis]-means)**2/var
+        lls = np.log(weights) - 0.5 * np.log(var) - 0.5 * (s[:, np.newaxis] - means)**2 / var
         gammas = softmax(lls, axis=1)
         cnts = np.sum(gammas, axis=0)
         weights = cnts / cnts.sum()
         means = s.dot(gammas) / cnts
         var = ((s**2).dot(gammas) / cnts - means**2).dot(weights)
-        threshold =  -0.5*(np.log(weights**2/var)-means**2/var).dot([1,-1])/(means/var).dot([1,-1])
-    return threshold, lls[:,means.argmax()]-lls[:,means.argmin()]
+        threshold = -0.5 * (np.log(weights**2 / var) - means**2 / var).dot([1, -1]) / (means/var).dot([1, -1])
+    return threshold, lls[:, means.argmax()] - lls[:, means.argmin()]
 
 
 def AHC(sim_mx, threshold=0):
@@ -41,18 +41,19 @@ def AHC(sim_mx, threshold=0):
         cluster labels stored in an array of length N containing (integers in
         the range from 0 to C-1, where C is the number of dicovered clusters)
     """
-    dist = -sim_mx;
+    dist = -sim_mx
     dist[np.diag_indices_from(dist)] = np.inf
     clsts = [[i] for i in range(len(dist))]
     while True:
         mi, mj = np.sort(np.unravel_index(dist.argmin(), dist.shape))
         if dist[mi, mj] > -threshold:
             break
-        dist[:, mi] = dist[mi,:] = (dist[mi,:]*len(clsts[mi])+dist[mj,:]*len(clsts[mj]))/(len(clsts[mi])+len(clsts[mj]))
-        dist[:, mj] = dist[mj,:] = np.inf
+        dist[:, mi] = dist[mi, :] = (dist[mi, :]*len(clsts[mi]) + dist[mj, :]*len(clsts[mj])) / \
+                                    (len(clsts[mi]) + len(clsts[mj]))
+        dist[:, mj] = dist[mj, :] = np.inf
         clsts[mi].extend(clsts[mj])
         clsts[mj] = None
-    labs= np.empty(len(dist), dtype=int)
+    labs = np.empty(len(dist), dtype=int)
     for i, c in enumerate([e for e in clsts if e]):
         labs[c] = i
     return labs
@@ -73,14 +74,14 @@ def PLDA_scoring_in_LDA_space(Fe, Ft, diagAC):
     """
     # See (7-8) in L. Burget et al.: "Discriminatively trained probabilistic
     # linear discriminant analysis for speaker verification", in ICASSP 2011.
-    iTC      = 1.0 / (1 +   diagAC)
-    iWC2AC   = 1.0 / (1 + 2*diagAC)
-    ldTC    = np.sum(np.log(1 +   diagAC))
+    iTC = 1.0 / (1 + diagAC)
+    iWC2AC = 1.0 / (1 + 2*diagAC)
+    ldTC = np.sum(np.log(1 + diagAC))
     ldWC2AC = np.sum(np.log(1 + 2*diagAC))
-    Gamma = -0.25*(iWC2AC + 1 - 2*iTC)
-    Lambda= -0.5 *(iWC2AC - 1)
-    k = - 0.5*(ldWC2AC - 2*ldTC)
-    return  np.dot(Fe * Lambda, Ft.T) + (Fe**2).dot(Gamma)[:,np.newaxis] + (Ft**2).dot(Gamma) + k
+    Gamma = -0.25 * (iWC2AC + 1 - 2*iTC)
+    Lambda = -0.5 * (iWC2AC - 1)
+    k = -0.5 * (ldWC2AC - 2*ldTC)
+    return np.dot(Fe * Lambda, Ft.T) + (Fe**2).dot(Gamma)[:, np.newaxis] + (Ft**2).dot(Gamma) + k
 
 
 def kaldi_ivector_plda_scoring_dense(kaldi_plda, x, target_energy=0.1, pca_dim=None):
@@ -102,23 +103,21 @@ def kaldi_ivector_plda_scoring_dense(kaldi_plda, x, target_energy=0.1, pca_dim=N
         matrix of pairwise similarities between the input x-vectors
     """
     plda_mu, plda_tr, plda_psi = kaldi_plda
-    [energy,PCA]=spl.eigh(np.cov(x.T, bias=True))
+    energy, PCA = spl.eigh(np.cov(x.T, bias=True))
     if pca_dim is None:
-      energy=np.cumsum(energy[::-1])
-      pca_dim=np.sum(energy/energy[-1]<=target_energy) + 2
-      # we need at least 2 dimensions, so 2 more dimensions are always added
+        energy = np.cumsum(energy[::-1])
+        pca_dim = np.sum(energy/energy[-1] <= target_energy) + 2
+        # we need at least 2 dimensions, so 2 more dimensions are always added
 
-    PCA=PCA[:,:-pca_dim-1:-1]
+    PCA = PCA[:, :-pca_dim-1:-1]
     print("pca_dim:", pca_dim)
 
-    plda_tr_inv_pca=PCA.T.dot(np.linalg.inv(plda_tr))
+    plda_tr_inv_pca = PCA.T.dot(np.linalg.inv(plda_tr))
     W = plda_tr_inv_pca.dot(plda_tr_inv_pca.T)
     B = (plda_tr_inv_pca*plda_psi).dot(plda_tr_inv_pca.T)
-    acvar, wccn = spl.eigh(B,  W)
-    x = np.dot(x-plda_mu,PCA).dot(wccn)
-    x *= np.sqrt(x.shape[1] / np.dot(x**2, 1.0 / (acvar + 1.0)))[:,np.newaxis] # kaldi style length-norm
-    #Lambda, Gamma, c, k = PLDA_params_to_bilinear_form(np.eye(pca_dim), np.diag(acvar), np.zeros((pca_dim,)))
-    #return bilinear_scoring(Lambda, Gamma, c, k, x, x)
+    acvar, wccn = spl.eigh(B, W)
+    x = np.dot(x-plda_mu, PCA).dot(wccn)
+    x *= np.sqrt(x.shape[1] / np.dot(x**2, 1.0 / (acvar + 1.0)))[:, np.newaxis]  # kaldi style length-norm
     return PLDA_scoring_in_LDA_space(x, x, acvar)
 
 
@@ -135,8 +134,8 @@ def read_xvector_timing_dict(kaldi_segments):
          segs_dict[recording_file_name] = (array_of_xvector_names, array_of_start_and_end_times)
     """
     segs = np.loadtxt(kaldi_segments, dtype=object)
-    split_by_filename = np.nonzero(segs[1:,1]!=segs[:-1,1])[0]+1
-    return {s[0,1]: (s[:,0], s[:,2:].astype(float)) for s in np.split(segs, split_by_filename)}
+    split_by_filename = np.nonzero(segs[1:, 1] != segs[:-1, 1])[0] + 1
+    return {s[0, 1]: (s[:, 0], s[:, 2:].astype(float)) for s in np.split(segs, split_by_filename)}
 
 
 def merge_adjacent_labels(starts, ends, labels):
@@ -154,13 +153,13 @@ def merge_adjacent_labels(starts, ends, labels):
     # Merge neighbouring (or overlaping) segments with the same label
     adjacent_or_overlap = np.logical_or(np.isclose(ends[:-1], starts[1:]), ends[:-1] > starts[1:])
     to_split = np.nonzero(np.logical_or(~adjacent_or_overlap, labels[1:] != labels[:-1]))[0]
-    starts  = starts[np.r_[0, to_split+1]]
-    ends    = ends[np.r_[to_split, -1]]
-    labels  = labels[np.r_[0, to_split+1]]
-  
+    starts = starts[np.r_[0, to_split+1]]
+    ends = ends[np.r_[to_split, -1]]
+    labels = labels[np.r_[0, to_split+1]]
+
     # Fix starts and ends times for overlapping segments
-    overlaping = np.nonzero(starts[1:]<ends[:-1])[0]
-    ends[overlaping] = starts[overlaping+1] = (ends[overlaping]+starts[overlaping+1]) / 2.0
+    overlaping = np.nonzero(starts[1:] < ends[:-1])[0]
+    ends[overlaping] = starts[overlaping+1] = (ends[overlaping] + starts[overlaping+1]) / 2.0
     return starts, ends, labels
 
 
@@ -178,12 +177,12 @@ def segment_to_frame_labels(starts, ends, labels, length=0, frame_rate=100., emp
         frms  - array of frame-by-frame labels
     """
     min_len, max_len = (length, length) if length > 0 else (-length, None)
-    starts = np.rint(frame_rate*starts).astype(int)
-    ends   = np.rint(frame_rate*ends  ).astype(int)
+    starts = np.rint(frame_rate * starts).astype(int)
+    ends = np.rint(frame_rate * ends).astype(int)
     if not ends.size:
-      return np.full(min_len, empty_label)
+        return np.full(min_len, empty_label)
 
-    frms = np.repeat(np.r_[np.c_[[empty_label]*len(labels),    labels     ].flat, empty_label],
+    frms = np.repeat(np.r_[np.c_[[empty_label]*len(labels), labels].flat, empty_label],
                      np.r_[np.c_[starts - np.r_[0, ends[:-1]], ends-starts].flat, max(0, min_len-ends[-1])])
     return frms[:max_len]
 
@@ -194,7 +193,8 @@ def mkdir_p(path):
     except OSError as exc:
         if exc.errno == errno.EEXIST and os.path.isdir(path):
             pass
-        else: raise
+        else:
+            raise
 
 
 def l2_norm(vec_or_matrix):
@@ -216,26 +216,26 @@ def l2_norm(vec_or_matrix):
 
 
 def cos_similarity(x):
-        """Compute cosine similarity matrix in CPU & memory sensitive way
-
-        Args:
-            x (np.ndarray): embeddings, 2D array, embeddings are in rows
-
-        Returns:
-            np.ndarray: cosine similarity matrix
-
-        """
-        assert x.ndim == 2, f'x has {x.ndim} dimensions, it must be matrix'
-        x = x / (np.sqrt(np.sum(np.square(x), axis=1, keepdims=True)) + 1.0e-32)
-        assert np.allclose(np.ones_like(x[:, 0]), np.sum(np.square(x), axis=1))
-        max_n_elm = 200000000
-        step = max(max_n_elm // (x.shape[0] * x.shape[0]), 1)
-        retval = np.zeros(shape=(x.shape[0], x.shape[0]), dtype=np.float64)
-        x0 = np.expand_dims(x, 0)
-        x1 = np.expand_dims(x, 1)
-        for i in range(0, x.shape[1], step):
-            product = x0[:, :, i:i+step] * x1[:, :, i:i+step]
-            retval += np.sum(product, axis=2, keepdims=False)
-        assert np.all(retval >= -1.0001), retval
-        assert np.all(retval <= 1.0001), retval
-        return retval
+    """Compute cosine similarity matrix in CPU & memory sensitive way
+
+    Args:
+        x (np.ndarray): embeddings, 2D array, embeddings are in rows
+
+    Returns:
+        np.ndarray: cosine similarity matrix
+
+    """
+    assert x.ndim == 2, f'x has {x.ndim} dimensions, it must be matrix'
+    x = x / (np.sqrt(np.sum(np.square(x), axis=1, keepdims=True)) + 1.0e-32)
+    assert np.allclose(np.ones_like(x[:, 0]), np.sum(np.square(x), axis=1))
+    max_n_elm = 200000000
+    step = max(max_n_elm // (x.shape[0] * x.shape[0]), 1)
+    retval = np.zeros(shape=(x.shape[0], x.shape[0]), dtype=np.float64)
+    x0 = np.expand_dims(x, 0)
+    x1 = np.expand_dims(x, 1)
+    for i in range(0, x.shape[1], step):
+        product = x0[:, :, i:i+step] * x1[:, :, i:i+step]
+        retval += np.sum(product, axis=2, keepdims=False)
+    assert np.all(retval >= -1.0001), retval
+    assert np.all(retval <= 1.0001), retval
+    return retval
@@ -8,17 +8,17 @@
 
 def framing(a, window, shift=1):
     shape = ((a.shape[0] - window) // shift + 1, window) + a.shape[1:]
-    strides = (a.strides[0]*shift,a.strides[0]) + a.strides[1:]
+    strides = (a.strides[0]*shift, a.strides[0]) + a.strides[1:]
     return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
 
 
 # Mel and inverse Mel scale warping functions
 def mel_inv(x):
-    return (np.exp(x/1127.)-1.)*700.
+    return (np.exp(x/1127.) - 1.) * 700.
 
 
 def mel(x):
-    return 1127.*np.log(1. + x/700.)
+    return 1127. * np.log(1. + x/700.)
 
 
 def preemphasis(x, coef=0.97):
@@ -31,7 +31,7 @@ def mel_fbank_mx(winlen_nfft, fs, NUMCHANS=20, LOFREQ=0.0, HIFREQ=None, warp_fn=
                   used to determine number of samples for FFT computation (NFFT).
                   If positive, the value (window lenght) is rounded up to the
                   next higher power of two to obtain HTK-compatible NFFT.
-                  If negative, NFFT is set to -winlen_nfft. In such case, the 
+                  If negative, NFFT is set to -winlen_nfft. In such case, the
                   parameter nfft in mfcc_htk() call should be set likewise.
     fs          - sampling frequency (Hz, i.e. 1e7/SOURCERATE)
     NUMCHANS    - number of filter bank bands
@@ -40,30 +40,32 @@ def mel_fbank_mx(winlen_nfft, fs, NUMCHANS=20, LOFREQ=0.0, HIFREQ=None, warp_fn=
     warp_fn     - function for frequency warping and its inverse
     inv_warp_fn - inverse function to warp_fn
     """
-    if not HIFREQ: HIFREQ = 0.5 * fs
+    HIFREQ = 0.5 * fs if not HIFREQ else HIFREQ
     nfft = 2**int(np.ceil(np.log2(winlen_nfft))) if winlen_nfft > 0 else -int(winlen_nfft)
 
     fbin_mel = warp_fn(np.arange(nfft / 2 + 1, dtype=float) * fs / nfft)
     cbin_mel = np.linspace(warp_fn(LOFREQ), warp_fn(HIFREQ), NUMCHANS + 2)
     cind = np.floor(inv_warp_fn(cbin_mel) / fs * nfft).astype(int) + 1
     mfb = np.zeros((len(fbin_mel), NUMCHANS))
     for i in range(NUMCHANS):
-        mfb[cind[i]  :cind[i+1], i] = (cbin_mel[i]  -fbin_mel[cind[i]  :cind[i+1]]) / (cbin_mel[i]  -cbin_mel[i+1])
-        mfb[cind[i+1]:cind[i+2], i] = (cbin_mel[i+2]-fbin_mel[cind[i+1]:cind[i+2]]) / (cbin_mel[i+2]-cbin_mel[i+1])
-    if LOFREQ > 0.0 and float(LOFREQ)/fs*nfft+0.5 > cind[0] and htk_bug: mfb[cind[0],:] = 0.0 # Just to be HTK compatible
+        mfb[cind[i]:cind[i+1], i] = (cbin_mel[i] - fbin_mel[cind[i]:cind[i+1]]) / (cbin_mel[i] - cbin_mel[i+1])
+        mfb[cind[i+1]:cind[i+2], i] = (cbin_mel[i+2] - fbin_mel[cind[i+1]:cind[i+2]]) / \
+                                      (cbin_mel[i+2] - cbin_mel[i+1])
+    if LOFREQ > 0.0 and float(LOFREQ) / fs * nfft + 0.5 > cind[0] and htk_bug:
+        mfb[cind[0], :] = 0.0  # Just to be HTK compatible
     return mfb
 
 
 def fbank_htk(x, window, noverlap, fbank_mx, nfft=None, _E=None,
-             USEPOWER=False, RAWENERGY=True, PREEMCOEF=0.97, ZMEANSOURCE=False,
-             ENORMALISE=True, ESCALE=0.1, SILFLOOR=50.0, USEHAMMING=True):
+              USEPOWER=False, RAWENERGY=True, PREEMCOEF=0.97, ZMEANSOURCE=False,
+              ENORMALISE=True, ESCALE=0.1, SILFLOOR=50.0, USEHAMMING=True):
     """Mel log Mel-filter bank channel outputs
     Returns NUMCHANS-by-M matrix of log Mel-filter bank outputs extracted from
     signal x, where M is the number of extracted frames, which can be computed
     as floor((length(x)-noverlap)/(window-noverlap)). Remaining parameters
     have the following meaning:
     x         - input signal
-    window    - frame window length (in samples, i.e. WINDOWSIZE/SOURCERATE) 
+    window    - frame window length (in samples, i.e. WINDOWSIZE/SOURCERATE)
                 or vector of window weights override default windowing function
                 (see option USEHAMMING)
     noverlap  - overlapping between frames (in samples, i.e window-TARGETRATE/SOURCERATE)
@@ -80,13 +82,11 @@ def fbank_htk(x, window, noverlap, fbank_mx, nfft=None, _E=None,
     See also:
       mel_fbank_mx:
           to obtain the matrix for the parameter fbank_mx
-      add_deriv: 
+      add_deriv:
           for adding delta, double delta, ... coefficients
       add_dither:
           for adding dithering in HTK-like fashion
     """
-    from time import time
-    tm = time()
     if type(USEPOWER) == bool:
         USEPOWER += 1
     if np.isscalar(window):
@@ -95,40 +95,37 @@ def fbank_htk(x, window, noverlap, fbank_mx, nfft=None, _E=None,
         nfft = 2**int(np.ceil(np.log2(window.size)))
     x = framing(x.astype("float"), window.size, window.size-noverlap).copy()
     if ZMEANSOURCE:
-        x -= x.mean(axis=1)[:,np.newaxis]
+        x -= x.mean(axis=1)[:, np.newaxis]
     if _E is not None and RAWENERGY:
         energy = np.log((x**2).sum(axis=1))
     if PREEMCOEF is not None:
         x = preemphasis(x, PREEMCOEF)
     x *= window
     if _E is not None and not RAWENERGY:
         energy = np.log((x**2).sum(axis=1))
-    #x = np.abs(scipy.fftpack.fft(x, nfft))
-    #x = x[:,:x.shape[1]/2+1]
     x = np.fft.rfft(x, nfft)
-    #x = np.abs(x)
     x = x.real**2 + x.imag**2
     if USEPOWER != 2:
         x **= 0.5 * USEPOWER
     x = np.log(np.maximum(1.0, np.dot(x, fbank_mx)))
     if _E is not None and ENORMALISE:
-        energy = (energy - energy.max())       * ESCALE + 1.0
-        min_val  = -np.log(10**(SILFLOOR/10.)) * ESCALE + 1.0
+        energy = (energy - energy.max()) * ESCALE + 1.0
+        min_val = -np.log(10**(SILFLOOR/10.)) * ESCALE + 1.0
         energy[energy < min_val] = min_val
 
-    return np.hstack(([energy[:,np.newaxis]] if _E == "first" else []) + [x] +
-                     ([energy[:,np.newaxis]] if (_E in ["last", True])  else []))
-                     
+    return np.hstack(([energy[:, np.newaxis]] if _E == "first" else []) + [x] +
+                     ([energy[:, np.newaxis]] if (_E in ["last", True]) else []))
+
 
 def povey_window(winlen):
-  return np.power(0.5 - 0.5*np.cos(np.linspace(0,2*np.pi, winlen)), 0.85)
+    return np.power(0.5 - 0.5*np.cos(np.linspace(0, 2*np.pi, winlen)), 0.85)
 
 
 def add_dither(x, level=8):
-    return x + level * (np.random.rand(*x.shape)*2-1) 
+    return x + level * (np.random.rand(*x.shape)*2 - 1)
 
 
-def cmvn_floating_kaldi(x, LC,RC, norm_vars=True):
+def cmvn_floating_kaldi(x, LC, RC, norm_vars=True):
     """Mean and variance normalization over a floating window.
     x is the feature matrix (nframes x dim)
     LC, RC are the number of frames to the left and right defining the floating
@@ -139,11 +136,11 @@ def cmvn_floating_kaldi(x, LC,RC, norm_vars=True):
     Global normalization is used if nframes is less than LC+RC+1.
     """
     N, dim = x.shape
-    win_len = min(len(x),  LC+RC+1)
-    win_start = np.maximum(np.minimum(np.arange(-LC,N-LC), N-win_len), 0)
+    win_len = min(len(x), LC+RC+1)
+    win_start = np.maximum(np.minimum(np.arange(-LC, N-LC), N-win_len), 0)
     f = np.r_[np.zeros((1, dim)), np.cumsum(x, 0)]
-    x = x - (f[win_start+win_len]-f[win_start])/win_len
+    x = x - (f[win_start+win_len] - f[win_start]) / win_len
     if norm_vars:
-      f = np.r_[np.zeros((1, dim)), np.cumsum(x**2, 0)]
-      x /= np.sqrt((f[win_start+win_len]-f[win_start])/win_len)
+        f = np.r_[np.zeros((1, dim)), np.cumsum(x**2, 0)]
+        x /= np.sqrt((f[win_start+win_len] - f[win_start]) / win_len)
     return x