
Commit

Some merge conflicts

Merge branch 'master' of https://github.com/GauravWaghmare/Phoneme

Conflicts:
	kerasmodelNN.py
	silent_zones.py
ManviG committed Jun 14, 2016
2 parents 6ef5f7c + a2d1e1a commit 7802033
Showing 10 changed files with 818 additions and 641 deletions.
26 changes: 26 additions & 0 deletions LSTM.py
@@ -0,0 +1,26 @@
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, 20-dimensional vectors.
model.add(Dense(64, input_dim=20, init='uniform'))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(64, init='uniform'))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(10, init='uniform'))
model.add(Activation('softmax'))

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
optimizer=sgd,
metrics=['accuracy'])
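
# NOTE: X_train / y_train / X_test / y_test are not defined in this file.
# A minimal stand-in (an assumption, not part of the original commit) using
# random 20-dimensional inputs and 10 one-hot classes could look like this:
import numpy as np
from keras.utils import np_utils

X_train = np.random.random((1000, 20))
y_train = np_utils.to_categorical(np.random.randint(10, size=(1000,)), 10)
X_test = np.random.random((100, 20))
y_test = np_utils.to_categorical(np.random.randint(10, size=(100,)), 10)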

model.fit(X_train, y_train,
nb_epoch=20,
batch_size=16)
score = model.evaluate(X_test, y_test, batch_size=16)
94 changes: 94 additions & 0 deletions LTSD_silence_detection.py
@@ -0,0 +1,94 @@
#!/usr/bin/python2
# -*- coding: utf-8 -*-
# $File: ltsd.py
# $Date: Sun Jul 19 17:53:59 2015 +0800
# $Author: Xinyu Zhou <zxytim[at]gmail[dot]com>

import sys
from scipy.io import wavfile
import matplotlib
matplotlib.use("Qt4Agg")
import matplotlib.pyplot as plt
import numpy as np

from pyssp.vad.ltsd import LTSD


MAGIC_NUMBER = 0.04644

class LTSD_VAD(object):
ltsd = None
order = 5

fs = 0
window_size = 0
window = 0

lambda0 = 0
lambda1 = 0

noise_signal = None

def init_params_by_noise(self, fs, noise_signal):
noise_signal = self._mononize_signal(noise_signal)
self.noise_signal = np.array(noise_signal)
self._init_window(fs)
ltsd = LTSD(self.window_size, self.window, self.order)
res, ltsds = ltsd.compute_with_noise(noise_signal,
noise_signal)
max_ltsd = max(ltsds)
self.lambda0 = max_ltsd * 1.1
self.lambda1 = self.lambda0 * 2.0
print 'max_ltsd =', max_ltsd
print 'lambda0 =', self.lambda0
print 'lambda1 =', self.lambda1

def plot_ltsd(self, fs, signal):
signal = self._mononize_signal(signal)
res, ltsds = self._get_ltsd().compute_with_noise(signal, self.noise_signal)
plt.plot(ltsds)
plt.show()

def filter(self, signal):
signal = self._mononize_signal(signal)
res, ltsds = self._get_ltsd().compute_with_noise(signal, self.noise_signal)
voice_signals = []
res = [(start * self.window_size / 2, (finish + 1) * self.window_size
/ 2) for start, finish in res]
print res, len(ltsds) * self.window_size / 2
for start, finish in res:
voice_signals.append(signal[start:finish])
try:
return np.concatenate(voice_signals), res
        except ValueError:  # numpy.concatenate raises ValueError when voice_signals is empty
            return np.array([]), []

def _init_window(self, fs):
self.fs = fs
self.window_size = int(MAGIC_NUMBER * fs)
self.window = np.hanning(self.window_size)

def _get_ltsd(self, fs=None):
if fs is not None and fs != self.fs:
self._init_window(fs)
return LTSD(self.window_size, self.window, self.order,
lambda0=self.lambda0, lambda1=self.lambda1)

def _mononize_signal(self, signal):
if signal.ndim > 1:
signal = signal[:,0]
return signal


def main():
fs, bg_signal = wavfile.read(sys.argv[1])
ltsd = LTSD_VAD()
ltsd.init_params_by_noise(fs, bg_signal)

fs, signal = wavfile.read(sys.argv[2])
    vaded_signal, segments = ltsd.filter(signal)  # filter() returns (voiced signal, segment list)

wavfile.write('/home/gaurav/Documents/Phoneme/trainset/2/3_1.wav', fs, vaded_signal)

if __name__ == '__main__':
main()
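
# Usage sketch (inferred from main() above): the first argument is a wav file
# containing only background noise, used to calibrate lambda0/lambda1; the
# second is the recording to filter. The result is written to the hard-coded
# output path in main(), e.g.:
#   python2 LTSD_silence_detection.py background_noise.wav recording.wav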
247 changes: 247 additions & 0 deletions NN/src/Removesilence.py
@@ -0,0 +1,247 @@
import numpy
import mlpy
import time
import scipy
import os
import os.path
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.spatial import distance
from sklearn.lda import LDA
import csv
import sklearn
import sklearn.hmm
import cPickle
import glob
import featureExtraction as fe
import scipy.io.wavfile as wavfile
from features import mfcc
from sklearn import svm

def listOfFeatures2Matrix(features):
'''
listOfFeatures2Matrix(features)
This function takes a list of feature matrices as argument and returns a single concatenated feature matrix and the respective class labels.
ARGUMENTS:
- features: a list of feature matrices
RETURNS:
- X: a concatenated matrix of features
        - Y:            a vector of class indices
'''

X = numpy.array([])
Y = numpy.array([])
for i, f in enumerate(features):
if i == 0:
X = f
Y = i * numpy.ones((len(f), 1))
else:
X = numpy.vstack((X, f))
Y = numpy.append(Y, i * numpy.ones((len(f), 1)))
return (X, Y)
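
# Worked example (hypothetical data, not part of the original commit):
#   f0 = numpy.array([[1., 2.], [3., 4.]])   # class 0, two samples
#   f1 = numpy.array([[5., 6.]])             # class 1, one sample
#   X, Y = listOfFeatures2Matrix([f0, f1])
#   # X stacks the three rows, Y == array([0., 0., 1.])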


def trainSVM(features, Cparam):
'''
    Train a multi-class probabilistic SVM classifier.
Note: This function is simply a wrapper to the mlpy-LibSVM functionality for SVM training
See function trainSVM_feature() to use a wrapper on both the feature extraction and the SVM training (and parameter tuning) processes.
ARGUMENTS:
        - features:         a list ([numOfClasses x 1]) whose elements contain numpy matrices of features
each matrix features[i] of class i is [numOfSamples x numOfDimensions]
- Cparam: SVM parameter C (cost of constraints violation)
RETURNS:
- svm: the trained SVM variable
NOTE:
This function trains a linear-kernel SVM for a given C value. For a different kernel, other types of parameters should be provided.
For example, gamma for a polynomial, rbf or sigmoid kernel. Furthermore, Nu should be provided for a nu_SVM classifier.
See MLPY documentation for more details (http://mlpy.sourceforge.net/docs/3.4/svm.html)
'''

[X, Y] = listOfFeatures2Matrix(features)
svm = mlpy.LibSvm(svm_type='c_svc', kernel_type='linear', eps=0.0000001, C=Cparam, probability=True)
svm.learn(X, Y)
return svm
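
# Usage note: silenceRemoval() below calls trainSVM(featuresNormSS, 1.0) on the
# normalized low/high-energy feature sets and then queries the returned model
# frame by frame via svm.pred_probability().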



def normalizeFeatures(features):
'''
This function normalizes a feature set to 0-mean and 1-std.
    Used in most classifier training cases.
ARGUMENTS:
- features: list of feature matrices (each one of them is a numpy matrix)
RETURNS:
- featuresNorm: list of NORMALIZED feature matrices
- MEAN: mean vector
- STD: std vector
'''
X = numpy.array([])

for count, f in enumerate(features):
if f.shape[0] > 0:
if count == 0:
X = f
else:
X = numpy.vstack((X, f))
count += 1

MEAN = numpy.mean(X, axis=0)
STD = numpy.std(X, axis=0)

featuresNorm = []
for f in features:
ft = f.copy()
for nSamples in range(f.shape[0]):
ft[nSamples, :] = (ft[nSamples, :] - MEAN) / STD
featuresNorm.append(ft)
return (featuresNorm, MEAN, STD)
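
# Worked example (hypothetical data, not part of the original commit): with
#   features = [numpy.array([[0.], [2.]]), numpy.array([[4.]])]
# the stacked matrix is [0, 2, 4], so MEAN = 2 and STD = sqrt(8/3) ~ 1.63,
# and each returned matrix holds the same rows rescaled by those values.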



def smoothMovingAvg(inputSignal, windowLen=11):
windowLen = int(windowLen)
if inputSignal.ndim != 1:
raise ValueError("")
if inputSignal.size < windowLen:
raise ValueError("Input vector needs to be bigger than window size.")
if windowLen < 3:
return inputSignal
s = numpy.r_[2*inputSignal[0] - inputSignal[windowLen-1::-1], inputSignal, 2*inputSignal[-1]-inputSignal[-1:-windowLen:-1]]
w = numpy.ones(windowLen, 'd')
y = numpy.convolve(w/w.sum(), s, mode='same')
return y[windowLen:-windowLen+1]
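
# Sanity check (an added example, not part of the original commit):
# smoothMovingAvg(numpy.ones(100), windowLen=11) returns 100 ones, because the
# reflected padding keeps a constant signal unchanged by the moving average.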



def silenceRemoval(x, Fs, stWin, stStep, smoothWindow=0.5, Weight=0.5, plot=False):
'''
Event Detection (silence removal)
ARGUMENTS:
- x: the input audio signal
- Fs: sampling freq
- stWin, stStep: window size and step in seconds
        - smoothWindow:     (optional) smooth window (in seconds)
        - Weight:           (optional) weight factor (0 < Weight < 1); the higher, the more strict
        - plot:             (optional) True if results are to be plotted
    RETURNS:
        - segmentLimits:    list of segment limits in seconds (e.g. [[0.1, 0.9], [1.4, 3.0]] means that
                            the resulting segments are (0.1 - 0.9) seconds and (1.4 - 3.0) seconds)
'''

if Weight >= 1:
Weight = 0.99
if Weight <= 0:
Weight = 0.01

# Step 1: feature extraction
ShortTermFeatures = fe.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs) # extract short-term features

# Step 2: train binary SVM classifier of low vs high energy frames
EnergySt = ShortTermFeatures[1, :] # keep only the energy short-term sequence (2nd feature)
E = numpy.sort(EnergySt) # sort the energy feature values:
L1 = int(len(E) / 10) # number of 10% of the total short-term windows
T1 = numpy.mean(E[0:L1]) # compute "lower" 10% energy threshold
T2 = numpy.mean(E[-L1:-1]) # compute "higher" 10% energy threshold
Class1 = ShortTermFeatures[:, numpy.where(EnergySt < T1)[0]] # get all features that correspond to low energy
Class2 = ShortTermFeatures[:, numpy.where(EnergySt > T2)[0]] # get all features that correspond to high energy
featuresSS = [Class1.T, Class2.T] # form the binary classification task and ...
[featuresNormSS, MEANSS, STDSS] = normalizeFeatures(featuresSS) # normalize and ...
SVM = trainSVM(featuresNormSS, 1.0) # train the respective SVM probabilistic model (ONSET vs SILENCE)

# Step 3: compute onset probability based on the trained SVM
ProbOnset = []
for i in range(ShortTermFeatures.shape[1]): # for each frame
curFV = (ShortTermFeatures[:, i] - MEANSS) / STDSS # normalize feature vector
ProbOnset.append(SVM.pred_probability(curFV)[1]) # get SVM probability (that it belongs to the ONSET class)
ProbOnset = numpy.array(ProbOnset)
ProbOnset = smoothMovingAvg(ProbOnset, smoothWindow / stStep) # smooth probability

# Step 4A: detect onset frame indices:
ProbOnsetSorted = numpy.sort(ProbOnset) # find probability Threshold as a weighted average of top 10% and lower 10% of the values
Nt = ProbOnsetSorted.shape[0] / 10
T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) + Weight * numpy.mean(ProbOnsetSorted[-Nt::]))

MaxIdx = numpy.where(ProbOnset > T)[0] # get the indices of the frames that satisfy the thresholding
i = 0
timeClusters = []
segmentLimits = []

# Step 4B: group frame indices to onset segments
while i < len(MaxIdx): # for each of the detected onset indices
curCluster = [MaxIdx[i]]
if i == len(MaxIdx)-1:
break
while MaxIdx[i+1] - curCluster[-1] <= 2:
curCluster.append(MaxIdx[i+1])
i += 1
if i == len(MaxIdx)-1:
break
i += 1
timeClusters.append(curCluster)
segmentLimits.append([curCluster[0] * stStep, curCluster[-1] * stStep])

# Step 5: Post process: remove very small segments:
minDuration = 0.2
segmentLimits2 = []
for s in segmentLimits:
if s[1] - s[0] > minDuration:
segmentLimits2.append(s)
segmentLimits = segmentLimits2

if plot:
timeX = numpy.arange(0, x.shape[0] / float(Fs), 1.0 / Fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep), ProbOnset)
        plt.title('SVM Probability')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.show()

return segmentLimits



def nonsilentRegions(segmentLimits, fs, data):
segmentLimits *= fs
wave = numpy.array([])
flag = False

    for i in segmentLimits:
        start = int(i[0])   # cast to int so the values can be used as array indices
        end = int(i[1])
        a = data[start:end]
        if not flag:
            wave = a
            flag = True
        else:
            wave = numpy.concatenate((wave, a))

# wavfile.write(file, fs, wave)
return wave


# fs, data = wavfile.read("/home/manvi/Desktop/voicebiometric/Phoneme/trainset/1/662892_age_reco.wav")

# stWin = 0.025
# stStep = 0.01

# segmentLimits = silenceRemoval(data, fs, stWin, stStep)
# segmentLimits = numpy.asarray(segmentLimits)
# wave = nonsilentRegions(segmentLimits, fs, data)