
Commit

Some merge conflicts

Merge branch 'master' of https://github.com/GauravWaghmare/Phoneme

Conflicts:
	kerasmodelNN.py
	silent_zones.py
ManviG committed Jun 14, 2016
2 parents 6ef5f7c + a2d1e1a commit 7802033
Showing 10 changed files with 818 additions and 641 deletions.
26 changes: 26 additions & 0 deletions LSTM.py
@@ -0,0 +1,26 @@
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, 20-dimensional vectors.
model.add(Dense(64, input_dim=20, init='uniform'))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(64, init='uniform'))
model.add(Activation('tanh'))
model.add(Dropout(0.5))
model.add(Dense(10, init='uniform'))
model.add(Activation('softmax'))

sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
optimizer=sgd,
metrics=['accuracy'])
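
# NOTE: X_train / y_train / X_test / y_test are not defined in this file.
# A minimal stand-in (an assumption, not part of the original commit) using
# random 20-dimensional inputs and 10 one-hot classes could look like this:
import numpy as np
from keras.utils import np_utils

X_train = np.random.random((1000, 20))
y_train = np_utils.to_categorical(np.random.randint(10, size=(1000,)), 10)
X_test = np.random.random((100, 20))
y_test = np_utils.to_categorical(np.random.randint(10, size=(100,)), 10)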

model.fit(X_train, y_train,
nb_epoch=20,
batch_size=16)
score = model.evaluate(X_test, y_test, batch_size=16)
94 changes: 94 additions & 0 deletions LTSD_silence_detection.py
@@ -0,0 +1,94 @@
#!/usr/bin/python2
# -*- coding: utf-8 -*-
# $File: ltsd.py
# $Date: Sun Jul 19 17:53:59 2015 +0800
# $Author: Xinyu Zhou <zxytim[at]gmail[dot]com>

import sys
from scipy.io import wavfile
import matplotlib
matplotlib.use("Qt4Agg")
import matplotlib.pyplot as plt
import numpy as np

from pyssp.vad.ltsd import LTSD


MAGIC_NUMBER = 0.04644

class LTSD_VAD(object):
ltsd = None
order = 5

fs = 0
window_size = 0
window = 0

lambda0 = 0
lambda1 = 0

noise_signal = None

def init_params_by_noise(self, fs, noise_signal):
noise_signal = self._mononize_signal(noise_signal)
self.noise_signal = np.array(noise_signal)
self._init_window(fs)
ltsd = LTSD(self.window_size, self.window, self.order)
res, ltsds = ltsd.compute_with_noise(noise_signal,
noise_signal)
max_ltsd = max(ltsds)
self.lambda0 = max_ltsd * 1.1
self.lambda1 = self.lambda0 * 2.0
print 'max_ltsd =', max_ltsd
print 'lambda0 =', self.lambda0
print 'lambda1 =', self.lambda1

def plot_ltsd(self, fs, signal):
signal = self._mononize_signal(signal)
res, ltsds = self._get_ltsd().compute_with_noise(signal, self.noise_signal)
plt.plot(ltsds)
plt.show()

def filter(self, signal):
signal = self._mononize_signal(signal)
res, ltsds = self._get_ltsd().compute_with_noise(signal, self.noise_signal)
voice_signals = []
res = [(start * self.window_size / 2, (finish + 1) * self.window_size
/ 2) for start, finish in res]
print res, len(ltsds) * self.window_size / 2
for start, finish in res:
voice_signals.append(signal[start:finish])
try:
return np.concatenate(voice_signals), res
        except ValueError:  # numpy.concatenate raises ValueError when voice_signals is empty
            return np.array([]), []

def _init_window(self, fs):
self.fs = fs
self.window_size = int(MAGIC_NUMBER * fs)
self.window = np.hanning(self.window_size)

def _get_ltsd(self, fs=None):
if fs is not None and fs != self.fs:
self._init_window(fs)
return LTSD(self.window_size, self.window, self.order,
lambda0=self.lambda0, lambda1=self.lambda1)

def _mononize_signal(self, signal):
if signal.ndim > 1:
signal = signal[:,0]
return signal


def main():
fs, bg_signal = wavfile.read(sys.argv[1])
ltsd = LTSD_VAD()
ltsd.init_params_by_noise(fs, bg_signal)

fs, signal = wavfile.read(sys.argv[2])
    vaded_signal, segments = ltsd.filter(signal)  # filter() returns (voiced signal, segment list)

wavfile.write('/home/gaurav/Documents/Phoneme/trainset/2/3_1.wav', fs, vaded_signal)

if __name__ == '__main__':
main()
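
# Usage sketch (inferred from main() above): the first argument is a wav file
# containing only background noise, used to calibrate lambda0/lambda1; the
# second is the recording to filter. The result is written to the hard-coded
# output path in main(), e.g.:
#   python2 LTSD_silence_detection.py background_noise.wav recording.wav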
247 changes: 247 additions & 0 deletions NN/src/Removesilence.py
@@ -0,0 +1,247 @@
import numpy
import mlpy
import time
import scipy
import os
import os.path
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.spatial import distance
from sklearn.lda import LDA
import csv
import sklearn
import sklearn.hmm
import cPickle
import glob
import featureExtraction as fe
import scipy.io.wavfile as wavfile
from features import mfcc
from sklearn import svm

def listOfFeatures2Matrix(features):
'''
listOfFeatures2Matrix(features)
This function takes a list of feature matrices as argument and returns a single concatenated feature matrix and the respective class labels.
ARGUMENTS:
- features: a list of feature matrices
RETURNS:
- X: a concatenated matrix of features
        - Y:            a vector of class indices
'''

X = numpy.array([])
Y = numpy.array([])
for i, f in enumerate(features):
if i == 0:
X = f
Y = i * numpy.ones((len(f), 1))
else:
X = numpy.vstack((X, f))
Y = numpy.append(Y, i * numpy.ones((len(f), 1)))
return (X, Y)
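
# Worked example (hypothetical data, not part of the original commit):
#   f0 = numpy.array([[1., 2.], [3., 4.]])   # class 0, two samples
#   f1 = numpy.array([[5., 6.]])             # class 1, one sample
#   X, Y = listOfFeatures2Matrix([f0, f1])
#   # X stacks the three rows, Y == array([0., 0., 1.])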


def trainSVM(features, Cparam):
'''
    Train a multi-class probabilistic SVM classifier.
Note: This function is simply a wrapper to the mlpy-LibSVM functionality for SVM training
See function trainSVM_feature() to use a wrapper on both the feature extraction and the SVM training (and parameter tuning) processes.
ARGUMENTS:
        - features:         a list ([numOfClasses x 1]) whose elements contain numpy matrices of features
each matrix features[i] of class i is [numOfSamples x numOfDimensions]
- Cparam: SVM parameter C (cost of constraints violation)
RETURNS:
- svm: the trained SVM variable
NOTE:
This function trains a linear-kernel SVM for a given C value. For a different kernel, other types of parameters should be provided.
For example, gamma for a polynomial, rbf or sigmoid kernel. Furthermore, Nu should be provided for a nu_SVM classifier.
See MLPY documentation for more details (http://mlpy.sourceforge.net/docs/3.4/svm.html)
'''

[X, Y] = listOfFeatures2Matrix(features)
svm = mlpy.LibSvm(svm_type='c_svc', kernel_type='linear', eps=0.0000001, C=Cparam, probability=True)
svm.learn(X, Y)
return svm
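
# Usage note: silenceRemoval() below calls trainSVM(featuresNormSS, 1.0) on the
# normalized low/high-energy feature sets and then queries the returned model
# frame by frame via svm.pred_probability().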



def normalizeFeatures(features):
'''
This function normalizes a feature set to 0-mean and 1-std.
    Used in most classifier training cases.
ARGUMENTS:
- features: list of feature matrices (each one of them is a numpy matrix)
RETURNS:
- featuresNorm: list of NORMALIZED feature matrices
- MEAN: mean vector
- STD: std vector
'''
X = numpy.array([])

for count, f in enumerate(features):
if f.shape[0] > 0:
if count == 0:
X = f
else:
X = numpy.vstack((X, f))
count += 1

MEAN = numpy.mean(X, axis=0)
STD = numpy.std(X, axis=0)

featuresNorm = []
for f in features:
ft = f.copy()
for nSamples in range(f.shape[0]):
ft[nSamples, :] = (ft[nSamples, :] - MEAN) / STD
featuresNorm.append(ft)
return (featuresNorm, MEAN, STD)
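
# Worked example (hypothetical data, not part of the original commit): with
#   features = [numpy.array([[0.], [2.]]), numpy.array([[4.]])]
# the stacked matrix is [0, 2, 4], so MEAN = 2 and STD = sqrt(8/3) ~ 1.63,
# and each returned matrix holds the same rows rescaled by those values.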



def smoothMovingAvg(inputSignal, windowLen=11):
windowLen = int(windowLen)
if inputSignal.ndim != 1:
raise ValueError("")
if inputSignal.size < windowLen:
raise ValueError("Input vector needs to be bigger than window size.")
if windowLen < 3:
return inputSignal
s = numpy.r_[2*inputSignal[0] - inputSignal[windowLen-1::-1], inputSignal, 2*inputSignal[-1]-inputSignal[-1:-windowLen:-1]]
w = numpy.ones(windowLen, 'd')
y = numpy.convolve(w/w.sum(), s, mode='same')
return y[windowLen:-windowLen+1]
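
# Sanity check (an added example, not part of the original commit):
# smoothMovingAvg(numpy.ones(100), windowLen=11) returns 100 ones, because the
# reflected padding keeps a constant signal unchanged by the moving average.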



def silenceRemoval(x, Fs, stWin, stStep, smoothWindow=0.5, Weight=0.5, plot=False):
'''
Event Detection (silence removal)
ARGUMENTS:
- x: the input audio signal
- Fs: sampling freq
- stWin, stStep: window size and step in seconds
        - smoothWindow:     (optional) smooth window (in seconds)
        - Weight:           (optional) weight factor (0 < Weight < 1); the higher, the more strict
        - plot:             (optional) True if results are to be plotted
    RETURNS:
        - segmentLimits:    list of segment limits in seconds (e.g. [[0.1, 0.9], [1.4, 3.0]] means that
                            the resulting segments are (0.1 - 0.9) seconds and (1.4 - 3.0) seconds)
'''

if Weight >= 1:
Weight = 0.99
if Weight <= 0:
Weight = 0.01

# Step 1: feature extraction
ShortTermFeatures = fe.stFeatureExtraction(x, Fs, stWin * Fs, stStep * Fs) # extract short-term features

# Step 2: train binary SVM classifier of low vs high energy frames
EnergySt = ShortTermFeatures[1, :] # keep only the energy short-term sequence (2nd feature)
E = numpy.sort(EnergySt) # sort the energy feature values:
L1 = int(len(E) / 10) # number of 10% of the total short-term windows
T1 = numpy.mean(E[0:L1]) # compute "lower" 10% energy threshold
T2 = numpy.mean(E[-L1:-1]) # compute "higher" 10% energy threshold
Class1 = ShortTermFeatures[:, numpy.where(EnergySt < T1)[0]] # get all features that correspond to low energy
Class2 = ShortTermFeatures[:, numpy.where(EnergySt > T2)[0]] # get all features that correspond to high energy
featuresSS = [Class1.T, Class2.T] # form the binary classification task and ...
[featuresNormSS, MEANSS, STDSS] = normalizeFeatures(featuresSS) # normalize and ...
SVM = trainSVM(featuresNormSS, 1.0) # train the respective SVM probabilistic model (ONSET vs SILENCE)

# Step 3: compute onset probability based on the trained SVM
ProbOnset = []
for i in range(ShortTermFeatures.shape[1]): # for each frame
curFV = (ShortTermFeatures[:, i] - MEANSS) / STDSS # normalize feature vector
ProbOnset.append(SVM.pred_probability(curFV)[1]) # get SVM probability (that it belongs to the ONSET class)
ProbOnset = numpy.array(ProbOnset)
ProbOnset = smoothMovingAvg(ProbOnset, smoothWindow / stStep) # smooth probability

# Step 4A: detect onset frame indices:
ProbOnsetSorted = numpy.sort(ProbOnset) # find probability Threshold as a weighted average of top 10% and lower 10% of the values
Nt = ProbOnsetSorted.shape[0] / 10
T = (numpy.mean((1 - Weight) * ProbOnsetSorted[0:Nt]) + Weight * numpy.mean(ProbOnsetSorted[-Nt::]))

MaxIdx = numpy.where(ProbOnset > T)[0] # get the indices of the frames that satisfy the thresholding
i = 0
timeClusters = []
segmentLimits = []

# Step 4B: group frame indices to onset segments
while i < len(MaxIdx): # for each of the detected onset indices
curCluster = [MaxIdx[i]]
if i == len(MaxIdx)-1:
break
while MaxIdx[i+1] - curCluster[-1] <= 2:
curCluster.append(MaxIdx[i+1])
i += 1
if i == len(MaxIdx)-1:
break
i += 1
timeClusters.append(curCluster)
segmentLimits.append([curCluster[0] * stStep, curCluster[-1] * stStep])

# Step 5: Post process: remove very small segments:
minDuration = 0.2
segmentLimits2 = []
for s in segmentLimits:
if s[1] - s[0] > minDuration:
segmentLimits2.append(s)
segmentLimits = segmentLimits2

if plot:
timeX = numpy.arange(0, x.shape[0] / float(Fs), 1.0 / Fs)

        plt.subplot(2, 1, 1)
        plt.plot(timeX, x)
        plt.title('Signal')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.subplot(2, 1, 2)
        plt.plot(numpy.arange(0, ProbOnset.shape[0] * stStep, stStep), ProbOnset)
        plt.title('SVM Probability')
        for s in segmentLimits:
            plt.axvline(x=s[0])
            plt.axvline(x=s[1])
        plt.show()

return segmentLimits



def nonsilentRegions(segmentLimits, fs, data):
segmentLimits *= fs
wave = numpy.array([])
flag = False

    for i in segmentLimits:
        start = int(i[0])   # cast to int so the values can be used as array indices
        end = int(i[1])
        a = data[start:end]
        if not flag:
            wave = a
            flag = True
        else:
            wave = numpy.concatenate((wave, a))

# wavfile.write(file, fs, wave)
return wave


# fs, data = wavfile.read("/home/manvi/Desktop/voicebiometric/Phoneme/trainset/1/662892_age_reco.wav")

# stWin = 0.025
# stStep = 0.01

# segmentLimits = silenceRemoval(data, fs, stWin, stStep)
# segmentLimits = numpy.asarray(segmentLimits)
# wave = nonsilentRegions(segmentLimits, fs, data)