extract_spectrogram.py (forked from haixpham/end2end_AU_speech)
'''
Copyright (c) 2017 Hai Pham, Rutgers University
http://www.cs.rutgers.edu/~hxp1/
This code is free to use for academic/research purpose.
'''
import math
import pathlib as plb
import librosa
import csv
import numpy as np
from extract_feature import write_csv, get_fps, extract_one_frame_data
FREQ_DIM = 128
TIME_DIM = 32
NFFT = FREQ_DIM*2
nFrameSize = (TIME_DIM - 3) * FREQ_DIM + NFFT
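
# Sanity check (added for illustration, not in the original file): with
# librosa's centered STFT, a clip of nFrameSize samples and hop FREQ_DIM
# yields 1 + nFrameSize // FREQ_DIM = 1 + (29*128 + 256) // 128 = 32 columns,
# i.e. exactly TIME_DIM spectrogram frames per video frame; n_fft = 256 gives
# 129 frequency bins, reduced to FREQ_DIM after the last row is dropped below.
assert 1 + nFrameSize // FREQ_DIM == TIME_DIM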
def extract_one_file(videofile, audiofile):
    print(" --- " + audiofile)
    # get video frame count and FPS
    nFrames, fps = get_fps(videofile)
    # load audio
    data, sr = librosa.load(audiofile, sr=44100)  # data is np.float32
    # number of audio samples per video frame
    nSamPerFrame = int(math.floor(float(sr) / fps))
    # number of samples per 20ms
    #nSamPerFFTWindow = NFFT #int(math.ceil(float(sr) * 0.02))
    # number of samples per step 8ms
    #nSamPerStep = FREQ_DIM #int(math.floor(float(sr) * 0.008))
    # number of steps per frame
    #nStepsPerFrame = TIME_DIM #int(math.floor(float(nSamPerFrame) / float(nSamPerStep)))
    # real frame size
    #nFrameSize = (nStepsPerFrame - 1) * nSamPerStep + nSamPerFFTWindow
    # initial position in the sound stream
    # (a negative curPos means the first window needs zero padding at the front)
    curPos = nSamPerFrame - nFrameSize
    dbspecs = []
    for f in range(0, nFrames):
        frameData, nextPos = extract_one_frame_data(data, curPos, nFrameSize, nSamPerFrame)
        curPos = nextPos
        # spectrogram transform
        FD = librosa.core.stft(y=frameData, n_fft=NFFT, hop_length=FREQ_DIM)
        FD, phase = librosa.magphase(FD)
        DB = librosa.core.amplitude_to_db(FD, ref=np.max)
        # amplitude_to_db with ref=np.max yields values in [-80, 0] dB
        # (default top_db=80), so |DB| / 80 scales the spectrogram into [0, 1]
        DB = np.divide(np.absolute(DB), 80.0)
        # drop the last (Nyquist) row so the output has FREQ_DIM rows
        newDB = DB[0:-1, :]
        # store the flattened frame
        dbspecs.append(newDB.flatten().tolist())
    return dbspecs
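
# Hypothetical sketch (not the original extract_feature implementation):
# given how extract_one_frame_data is called above, it is expected to slice
# nFrameSize samples starting at curPos, zero-padding anything that falls
# before the start (or past the end) of the signal, and to advance the read
# position by one video frame's worth of samples.
def _extract_one_frame_data_sketch(data, curPos, nFrameSize, nSamPerFrame):
    frame = np.zeros(nFrameSize, dtype=data.dtype)
    src_start = max(curPos, 0)
    src_end = min(curPos + nFrameSize, len(data))
    if src_end > src_start:
        dst_start = src_start - curPos
        frame[dst_start:dst_start + (src_end - src_start)] = data[src_start:src_end]
    return frame, curPos + nSamPerFrame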
# dataset locations: RAVDESS videos, extracted wav files, and output features
video_root = "H:/Speech_data/RAVDESS"
audio_root = "H:/Speech_data/RAVDESS_wav"
feat_root = "H:/Speech_data/RAVDESS_feat"
def process_all():
    video_dir = plb.Path(video_root)
    feat_dir = plb.Path(feat_root)
    feat_dir.mkdir(parents=True, exist_ok=True)
    for actor in video_dir.iterdir():
        # only process the listed actor folders
        if actor.name not in ("s8", "s9"):
            continue
        for video_file in actor.iterdir():
            #if video_file.name[len(video_file.name)-4:] != '.mp4' or video_file.name[0:2] != '01':
            #    continue
            # only process sequences whose feature directory already exists
            seq_dir = plb.Path(feat_root + "/" + actor.name + "/" + video_file.stem)
            if not seq_dir.exists():
                continue
            video_path = str(video_file)
            audio_path = audio_root + "/" + actor.name + "/" + video_file.stem + ".wav"
            dbspecs = extract_one_file(video_path, audio_path)
            feature_path = feat_root + "/" + actor.name + "/" + video_file.stem + "/dbspectrogram.csv"
            write_csv(feature_path, dbspecs)
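
# Illustration only (added; not part of the original pipeline): assuming
# write_csv stores one comma-separated flattened frame per row, each row of
# dbspectrogram.csv can be reshaped back into a FREQ_DIM x TIME_DIM patch.
def _load_dbspec_sketch(feature_path):
    with open(feature_path, "r") as f:
        rows = [list(map(float, row)) for row in csv.reader(f) if row]
    # each video frame becomes one (128, 32) dB-spectrogram image
    return [np.asarray(row).reshape(FREQ_DIM, TIME_DIM) for row in rows]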
if __name__ == "__main__":
    process_all()