Fix batch synthesis for Mulaw-Quantize

tuong-olli · Aug 12, 2018 · b143893 · b143893
1 parent ffeca51
commit b143893
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 21 deletions.
diff --git a/hparams.py b/hparams.py
@@ -99,14 +99,14 @@
 	# If input_type is raw or mulaw, network assumes scalar input and
 	# discretized mixture of logistic distributions output, otherwise one-hot
 	# input and softmax output are assumed.
-	input_type="raw",
-	quantize_channels=65536,  # 65536 (16-bit) (raw) or 256 (8-bit) (mulaw or mulaw-quantize) // number of classes = 256 <=> mu = 255
+	input_type="mulaw-quantize",
+	quantize_channels=2 ** 8,  # 65536 (16-bit) (raw) or 256 (8-bit) (mulaw or mulaw-quantize) // number of classes = 256 <=> mu = 255
 
 	log_scale_min=float(np.log(1e-14)), #Mixture of logistic distributions minimal log scale
 	log_scale_min_gauss = float(np.log(1e-7)), #Gaussian distribution minimal allowed log scale
 
 	#To use a Gaussian distribution as the output distribution instead of a mixture of logistics, set "out_channels = 2" instead of "out_channels = 10 * 3". (UNDER TEST)
-	out_channels = 2, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale).
+	out_channels = 2 ** 8, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale).
 	layers = 30, #Number of dilated convolutions (Default: Simplified Wavenet of Tacotron-2 paper)
 	stacks = 3, #Number of dilated convolution stacks (Default: Simplified Wavenet of Tacotron-2 paper)
 	residual_channels = 512,
@@ -116,7 +116,7 @@
 
 	cin_channels = 80, #Set this to -1 to disable local conditioning, else it must be equal to num_mels!!
 	upsample_conditional_features = True, #Whether to repeat conditional features or upsample them (The latter is recommended)
-	upsample_scales = [15, 20], #prod(scales) should be equal to hop size
+	upsample_scales = [15, 20], #prod(upsample_scales) should be equal to hop_size
 	freq_axis_kernel_size = 3,
 	leaky_alpha = 0.4,
 
@@ -213,28 +213,14 @@
 	'Basilar membrane and otolaryngology are not auto-correlations.',
 	'He has read the whole thing.',
 	'He reads books.',
-	"Don't desert me here in the desert!",
 	'He thought it was time to present the present.',
 	'Thisss isrealy awhsome.',
 	'Punctuation sensitivity, is working.',
 	'Punctuation sensitivity is working.',
-	"The buses aren't the problem, they actually provide a solution.",
-	"The buses aren't the PROBLEM, they actually provide a SOLUTION.",
-	"The quick brown fox jumps over the lazy dog.",
-	"does the quick brown fox jump over the lazy dog?",
 	"Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?",
 	"She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.",
-	"The blue lagoon is a nineteen eighty American romance adventure film.",
 	"Tajima Airport serves Toyooka.",
-	'Talib Kweli confirmed to AllHipHop that he will be releasing an album in the next year.',
-	#From Training data:
-	'the rest being provided with barrack beds, and in dimensions varying from thirty feet by fifteen to fifteen feet by ten.',
-	'in giltspur street compter, where he was first lodged.',
-	'a man named burnett came with his wife and took up his residence at whitchurch, hampshire, at no great distance from laverstock,',
-	'it appears that oswald had only one caller in response to all of his fpcc activities,',
-	'he relied on the absence of the strychnia.',
-	'scoggins thought it was lighter.',
-	'would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, and would have possessed so much moral dignity',
+	#From the web (random long utterance)
 	'Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.\
 	This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that\
 	the adopted architecture is able to perform this task with wild success.',

diff --git a/sentences.txt b/sentences.txt
@@ -0,0 +1,19 @@
+Scientists at the CERN laboratory say they have discovered a new particle.
+There's a way to measure the acute emotional intelligence that has never gone out of style.
+President Trump met with other leaders at the Group of 20 conference.
+The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.
+
+Generative adversarial network or variational auto-encoder.
+Basilar membrane and otolaryngology are not auto-correlations.
+He has read the whole thing.
+He reads books.
+He thought it was time to present the present.
+Thisss isrealy awhsome.
+Punctuation sensitivity, is working.
+Punctuation sensitivity is working.
+Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
+She sells sea-shells on the sea-shore. The shells she sells are sea-shells I'm sure.
+Tajima Airport serves Toyooka.
+
+Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that the adopted architecture is able to perform this task with wild success.
+Thank you so much for your support!
diff --git a/train.py b/train.py
@@ -102,7 +102,7 @@ def main():
 	parser.add_argument('--restore', type=bool, default=True, help='Set this to False to do a fresh training')
 	parser.add_argument('--summary_interval', type=int, default=250,
 		help='Steps between running summary ops')
-	parser.add_argument('--checkpoint_interval', type=int, default=5000,
+	parser.add_argument('--checkpoint_interval', type=int, default=20,
 		help='Steps between writing checkpoints')
 	parser.add_argument('--eval_interval', type=int, default=10000,
 		help='Steps between eval on test data')

diff --git a/wavenet_vocoder/models/wavenet.py b/wavenet_vocoder/models/wavenet.py
@@ -615,7 +615,7 @@ def body(time, outputs_ta, current_input, loss_outputs_ta, queues):
 				x = tf.nn.softmax(tf.reshape(x, [batch_size, -1]), axis=1) if softmax \
 					else tf.reshape(x, [batch_size, -1])
 				if quantize:
-					sample = tf.multinomial(tf.reshape(x, [batch_size, -1]), 1)[0] #Pick a sample using x as probability
+					sample = tf.multinomial(tf.reshape(x, [batch_size, -1]), batch_size)[0] #Pick a sample using x as probability
 					x = tf.one_hot(sample, depth=self._hparams.quantize_channels)
 
 			outputs_ta = outputs_ta.write(time, x)