Skip to content

Commit 0a2e367

Browse files
authored
[TTS]clean starganv2 vc model code and add docstring (#2987)
* clean code * add docstring
1 parent 880c172 commit 0a2e367

File tree

4 files changed

+176
-433
lines changed

4 files changed

+176
-433
lines changed

paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py

Lines changed: 5 additions & 223 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
import random
15-
1614
import paddle
1715
import paddle.nn.functional as F
1816
import paddleaudio.functional as audio_F
@@ -46,7 +44,8 @@ def __init__(self,
4644
self.linear_layer.weight, gain=_calculate_gain(w_init_gain))
4745

4846
def forward(self, x: paddle.Tensor):
49-
return self.linear_layer(x)
47+
out = self.linear_layer(x)
48+
return out
5049

5150

5251
class ConvNorm(nn.Layer):
@@ -82,85 +81,6 @@ def forward(self, signal: paddle.Tensor):
8281
return conv_signal
8382

8483

85-
class CausualConv(nn.Layer):
86-
def __init__(self,
87-
in_channels: int,
88-
out_channels: int,
89-
kernel_size: int=1,
90-
stride: int=1,
91-
padding: int=1,
92-
dilation: int=1,
93-
bias: bool=True,
94-
w_init_gain: str='linear',
95-
param=None):
96-
super().__init__()
97-
if padding is None:
98-
assert (kernel_size % 2 == 1)
99-
padding = int(dilation * (kernel_size - 1) / 2) * 2
100-
else:
101-
self.padding = padding * 2
102-
self.conv = nn.Conv1D(
103-
in_channels,
104-
out_channels,
105-
kernel_size=kernel_size,
106-
stride=stride,
107-
padding=self.padding,
108-
dilation=dilation,
109-
bias_attr=bias)
110-
111-
xavier_uniform_(
112-
self.conv.weight, gain=_calculate_gain(w_init_gain, param=param))
113-
114-
def forward(self, x: paddle.Tensor):
115-
x = self.conv(x)
116-
x = x[:, :, :-self.padding]
117-
return x
118-
119-
120-
class CausualBlock(nn.Layer):
121-
def __init__(self,
122-
hidden_dim: int,
123-
n_conv: int=3,
124-
dropout_p: float=0.2,
125-
activ: str='lrelu'):
126-
super().__init__()
127-
self.blocks = nn.LayerList([
128-
self._get_conv(
129-
hidden_dim=hidden_dim,
130-
dilation=3**i,
131-
activ=activ,
132-
dropout_p=dropout_p) for i in range(n_conv)
133-
])
134-
135-
def forward(self, x):
136-
for block in self.blocks:
137-
res = x
138-
x = block(x)
139-
x += res
140-
return x
141-
142-
def _get_conv(self,
143-
hidden_dim: int,
144-
dilation: int,
145-
activ: str='lrelu',
146-
dropout_p: float=0.2):
147-
layers = [
148-
CausualConv(
149-
in_channels=hidden_dim,
150-
out_channels=hidden_dim,
151-
kernel_size=3,
152-
padding=dilation,
153-
dilation=dilation), _get_activation_fn(activ),
154-
nn.BatchNorm1D(hidden_dim), nn.Dropout(p=dropout_p), CausualConv(
155-
in_channels=hidden_dim,
156-
out_channels=hidden_dim,
157-
kernel_size=3,
158-
padding=1,
159-
dilation=1), _get_activation_fn(activ), nn.Dropout(p=dropout_p)
160-
]
161-
return nn.Sequential(*layers)
162-
163-
16484
class ConvBlock(nn.Layer):
16585
def __init__(self,
16686
hidden_dim: int,
@@ -264,13 +184,14 @@ def get_alignment_energies(self,
264184
"""
265185
Args:
266186
query:
267-
decoder output (batch, n_mel_channels * n_frames_per_step)
187+
decoder output (B, n_mel_channels * n_frames_per_step)
268188
processed_memory:
269189
processed encoder outputs (B, T_in, attention_dim)
270190
attention_weights_cat:
271191
cumulative and prev. att weights (B, 2, max_time)
272192
Returns:
273-
Tensor: alignment (batch, max_time)
193+
Tensor:
194+
alignment (B, max_time)
274195
"""
275196

276197
processed_query = self.query_layer(query.unsqueeze(1))
@@ -316,144 +237,6 @@ def forward(self,
316237
return attention_context, attention_weights
317238

318239

319-
class ForwardAttentionV2(nn.Layer):
320-
def __init__(self,
321-
attention_rnn_dim: int,
322-
embedding_dim: int,
323-
attention_dim: int,
324-
attention_location_n_filters: int,
325-
attention_location_kernel_size: int):
326-
super().__init__()
327-
self.query_layer = LinearNorm(
328-
in_dim=attention_rnn_dim,
329-
out_dim=attention_dim,
330-
bias=False,
331-
w_init_gain='tanh')
332-
self.memory_layer = LinearNorm(
333-
in_dim=embedding_dim,
334-
out_dim=attention_dim,
335-
bias=False,
336-
w_init_gain='tanh')
337-
self.v = LinearNorm(in_dim=attention_dim, out_dim=1, bias=False)
338-
self.location_layer = LocationLayer(
339-
attention_n_filters=attention_location_n_filters,
340-
attention_kernel_size=attention_location_kernel_size,
341-
attention_dim=attention_dim)
342-
self.score_mask_value = -float(1e20)
343-
344-
def get_alignment_energies(self,
345-
query: paddle.Tensor,
346-
processed_memory: paddle.Tensor,
347-
attention_weights_cat: paddle.Tensor):
348-
"""
349-
Args:
350-
query:
351-
decoder output (batch, n_mel_channels * n_frames_per_step)
352-
processed_memory:
353-
processed encoder outputs (B, T_in, attention_dim)
354-
attention_weights_cat:
355-
prev. and cumulative att weights (B, 2, max_time)
356-
Returns:
357-
Tensor: alignment (batch, max_time)
358-
"""
359-
360-
processed_query = self.query_layer(query.unsqueeze(1))
361-
processed_attention_weights = self.location_layer(attention_weights_cat)
362-
energies = self.v(
363-
paddle.tanh(processed_query + processed_attention_weights +
364-
processed_memory))
365-
366-
energies = energies.squeeze(-1)
367-
return energies
368-
369-
def forward(self,
370-
attention_hidden_state: paddle.Tensor,
371-
memory: paddle.Tensor,
372-
processed_memory: paddle.Tensor,
373-
attention_weights_cat: paddle.Tensor,
374-
mask: paddle.Tensor,
375-
log_alpha: paddle.Tensor):
376-
"""
377-
Args:
378-
attention_hidden_state:
379-
attention rnn last output
380-
memory:
381-
encoder outputs
382-
processed_memory:
383-
processed encoder outputs
384-
attention_weights_cat:
385-
previous and cumulative attention weights
386-
mask:
387-
binary mask for padded data
388-
"""
389-
log_energy = self.get_alignment_energies(
390-
query=attention_hidden_state,
391-
processed_memory=processed_memory,
392-
attention_weights_cat=attention_weights_cat)
393-
394-
if mask is not None:
395-
log_energy[:] = paddle.where(
396-
mask,
397-
paddle.full(log_energy.shape, self.score_mask_value,
398-
log_energy.dtype), log_energy)
399-
log_alpha_shift_padded = []
400-
max_time = log_energy.shape[1]
401-
for sft in range(2):
402-
shifted = log_alpha[:, :max_time - sft]
403-
shift_padded = F.pad(shifted, (sft, 0), 'constant',
404-
self.score_mask_value)
405-
log_alpha_shift_padded.append(shift_padded.unsqueeze(2))
406-
407-
biased = paddle.logsumexp(paddle.conat(log_alpha_shift_padded, 2), 2)
408-
log_alpha_new = biased + log_energy
409-
attention_weights = F.softmax(log_alpha_new, axis=1)
410-
attention_context = paddle.bmm(attention_weights.unsqueeze(1), memory)
411-
attention_context = attention_context.squeeze(1)
412-
413-
return attention_context, attention_weights, log_alpha_new
414-
415-
416-
class PhaseShuffle2D(nn.Layer):
417-
def __init__(self, n: int=2):
418-
super().__init__()
419-
self.n = n
420-
self.random = random.Random(1)
421-
422-
def forward(self, x: paddle.Tensor, move: int=None):
423-
# x.size = (B, C, M, L)
424-
if move is None:
425-
move = self.random.randint(-self.n, self.n)
426-
427-
if move == 0:
428-
return x
429-
else:
430-
left = x[:, :, :, :move]
431-
right = x[:, :, :, move:]
432-
shuffled = paddle.concat([right, left], axis=3)
433-
return shuffled
434-
435-
436-
class PhaseShuffle1D(nn.Layer):
437-
def __init__(self, n: int=2):
438-
super().__init__()
439-
self.n = n
440-
self.random = random.Random(1)
441-
442-
def forward(self, x: paddle.Tensor, move: int=None):
443-
# x.size = (B, C, M, L)
444-
if move is None:
445-
move = self.random.randint(-self.n, self.n)
446-
447-
if move == 0:
448-
return x
449-
else:
450-
left = x[:, :, :move]
451-
right = x[:, :, move:]
452-
shuffled = paddle.concat([right, left], axis=2)
453-
454-
return shuffled
455-
456-
457240
class MFCC(nn.Layer):
458241
def __init__(self, n_mfcc: int=40, n_mels: int=80):
459242
super().__init__()
@@ -473,7 +256,6 @@ def forward(self, mel_specgram: paddle.Tensor):
473256
# -> (channel, time, n_mfcc).transpose(...)
474257
mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]),
475258
self.dct_mat).transpose([0, 2, 1])
476-
477259
# unpack batch
478260
if unsqueezed:
479261
mfcc = mfcc.squeeze(0)

paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_future_mask(self, out_length: int, unmask_future_steps: int=0):
9999
unmask_future_steps (int):
100100
unmasking future step size.
101101
Return:
102-
mask (paddle.BoolTensor):
102+
Tensor (paddle.Tensor(bool)):
103103
mask future timesteps mask[i, j] = True if i > j + unmask_future_steps else False
104104
"""
105105
index_tensor = paddle.arange(out_length).unsqueeze(0).expand(
@@ -194,9 +194,8 @@ def forward(self,
194194
logit_outputs += [logit]
195195
alignments += [attention_weights]
196196

197-
hidden_outputs, logit_outputs, alignments = \
198-
self.parse_decoder_outputs(
199-
hidden_outputs, logit_outputs, alignments)
197+
hidden_outputs, logit_outputs, alignments = self.parse_decoder_outputs(
198+
hidden_outputs, logit_outputs, alignments)
200199

201200
return hidden_outputs, logit_outputs, alignments
202201

0 commit comments

Comments
 (0)