Skip to content

Commit

Permalink
Padding adjustment
Browse files Browse the repository at this point in the history
  • Loading branch information
maxrmorrison committed Mar 1, 2023
1 parent f366f31 commit 27e23a9
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 25 deletions.
28 changes: 28 additions & 0 deletions config/radtts-trf1-mrf9-512-4x512-sil63-interp-adjust.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Module name
MODULE = 'pyfoal'

# Name identifying this configuration
CONFIG = 'radtts-trf1-mrf9-512-4x512-sil63-interp-adjust'

# Compensate for the padding that was applied to the mels
ADJUST_PADDING = True

# Do not permit silent tokens to be assigned to loud frames
ALLOW_LOUD_SILENCE = False

# Apply local interpolation over time
INTERPOLATE = True

# Mel encoder kernel sizes (four entries, all of size 3)
MEL_ENCODER_KERNEL_SIZES = [3] * 4

# Mel encoder layer widths (three entries, all of width 512)
MEL_ENCODER_WIDTHS = [512] * 3

# Size of each phoneme embedding
PHONEME_EMBEDDING_SIZE = 512

# Audio below this level (in dB) is treated as silent
SILENCE_THRESHOLD = -63.0

# Text encoder kernel sizes (five entries, all of size 1)
TEXT_ENCODER_KERNEL_SIZES = [1] * 5
7 changes: 5 additions & 2 deletions pyfoal/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@
###############################################################################


# Whether to account for the padding applied to the mels
ADJUST_PADDING = False

# Whether to allow silent tokens on loud frames
ALLOW_LOUD_SILENCE = True

Expand All @@ -72,8 +75,8 @@

# Root location for saving outputs
# TEMPORARY
# ROOT_DIR = Path(__file__).parent.parent.parent
ROOT_DIR = Path('/data/max/pyfoal')
ROOT_DIR = Path(__file__).parent.parent.parent
# ROOT_DIR = Path('/data/max/pyfoal')

# Location to save assets to be bundled with pip release
ASSETS_DIR = Path(__file__).parent.parent / 'assets'
Expand Down
7 changes: 7 additions & 0 deletions pyfoal/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,13 @@ def postprocess(phonemes, logits, audio):
# Get per-phoneme frame counts from network output
indices, counts = pyfoal.viterbi.decode(phonemes, logits, loudness)

# Account for padding applied to mels
if pyfoal.ADJUST_PADDING:
pad_count = pyfoal.convert.samples_to_frames(
(pyfoal.WINDOW_SIZE - pyfoal.HOPSIZE) // 2)
counts[0] -= pad_count
counts[-1] -= pad_count

# Convert phoneme indices to phonemes
phonemes = pyfoal.convert.indices_to_phonemes(
phonemes[0, indices.to(torch.long)])
Expand Down
2 changes: 1 addition & 1 deletion pyfoal/viterbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def decode(phonemes, logits, loudness=None):

# Count consecutive indices
indices, counts = torch.unique_consecutive(indices, return_counts=True)
counts = counts.to(torch.float)

# Maybe interpolate
if pyfoal.INTERPOLATE:
Expand All @@ -90,7 +91,6 @@ def decode(phonemes, logits, loudness=None):
dim=0)[0]

# Apply to counts
counts = counts.to(torch.float)
counts[:-1] += weight

return indices, counts
Expand Down
33 changes: 11 additions & 22 deletions test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 3,
"metadata": {
"tags": []
},
Expand Down Expand Up @@ -69,7 +69,7 @@
},
{
"cell_type": "code",
"execution_count": 198,
"execution_count": 4,
"metadata": {
"tags": []
},
Expand All @@ -83,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 199,
"execution_count": 5,
"metadata": {
"tags": []
},
Expand All @@ -102,7 +102,7 @@
},
{
"cell_type": "code",
"execution_count": 200,
"execution_count": 6,
"metadata": {
"tags": []
},
Expand All @@ -126,7 +126,7 @@
},
{
"cell_type": "code",
"execution_count": 225,
"execution_count": 7,
"metadata": {
"tags": []
},
Expand All @@ -139,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 226,
"execution_count": 8,
"metadata": {
"tags": []
},
Expand All @@ -162,18 +162,18 @@
},
{
"cell_type": "code",
"execution_count": 227,
"execution_count": 9,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[<matplotlib.lines.Line2D at 0x7f3b9d7d1f90>]"
"[<matplotlib.lines.Line2D at 0x7fae40b06890>]"
]
},
"execution_count": 227,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
Expand All @@ -195,7 +195,7 @@
},
{
"cell_type": "code",
"execution_count": 228,
"execution_count": 11,
"metadata": {
"tags": []
},
Expand All @@ -212,18 +212,7 @@
}
],
"source": [
"pyfoal.plot.alignment(audio, alignment, target).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"pyfoal.plot.alignment(audio, alignment, target).show()"
"pyfoal.plot.alignments(audio, alignment, target).show()"
]
},
{
Expand Down

0 comments on commit 27e23a9

Please sign in to comment.