 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import random
-
 import paddle
 import paddle.nn.functional as F
 import paddleaudio.functional as audio_F
@@ -46,7 +44,8 @@ def __init__(self,
             self.linear_layer.weight, gain=_calculate_gain(w_init_gain))

     def forward(self, x: paddle.Tensor):
-        return self.linear_layer(x)
+        out = self.linear_layer(x)
+        return out


 class ConvNorm(nn.Layer):
@@ -82,85 +81,6 @@ def forward(self, signal: paddle.Tensor):
         return conv_signal


-class CausualConv(nn.Layer):
-    def __init__(self,
-                 in_channels: int,
-                 out_channels: int,
-                 kernel_size: int=1,
-                 stride: int=1,
-                 padding: int=1,
-                 dilation: int=1,
-                 bias: bool=True,
-                 w_init_gain: str='linear',
-                 param=None):
-        super().__init__()
-        if padding is None:
-            assert (kernel_size % 2 == 1)
-            padding = int(dilation * (kernel_size - 1) / 2) * 2
-        else:
-            self.padding = padding * 2
-        self.conv = nn.Conv1D(
-            in_channels,
-            out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=self.padding,
-            dilation=dilation,
-            bias_attr=bias)
-
-        xavier_uniform_(
-            self.conv.weight, gain=_calculate_gain(w_init_gain, param=param))
-
-    def forward(self, x: paddle.Tensor):
-        x = self.conv(x)
-        x = x[:, :, :-self.padding]
-        return x
-
-
-class CausualBlock(nn.Layer):
-    def __init__(self,
-                 hidden_dim: int,
-                 n_conv: int=3,
-                 dropout_p: float=0.2,
-                 activ: str='lrelu'):
-        super().__init__()
-        self.blocks = nn.LayerList([
-            self._get_conv(
-                hidden_dim=hidden_dim,
-                dilation=3**i,
-                activ=activ,
-                dropout_p=dropout_p) for i in range(n_conv)
-        ])
-
-    def forward(self, x):
-        for block in self.blocks:
-            res = x
-            x = block(x)
-            x += res
-        return x
-
-    def _get_conv(self,
-                  hidden_dim: int,
-                  dilation: int,
-                  activ: str='lrelu',
-                  dropout_p: float=0.2):
-        layers = [
-            CausualConv(
-                in_channels=hidden_dim,
-                out_channels=hidden_dim,
-                kernel_size=3,
-                padding=dilation,
-                dilation=dilation), _get_activation_fn(activ),
-            nn.BatchNorm1D(hidden_dim), nn.Dropout(p=dropout_p), CausualConv(
-                in_channels=hidden_dim,
-                out_channels=hidden_dim,
-                kernel_size=3,
-                padding=1,
-                dilation=1), _get_activation_fn(activ), nn.Dropout(p=dropout_p)
-        ]
-        return nn.Sequential(*layers)
-
-
 class ConvBlock(nn.Layer):
     def __init__(self,
                  hidden_dim: int,
@@ -264,13 +184,14 @@ def get_alignment_energies(self,
         """
         Args:
             query:
-                decoder output (batch, n_mel_channels * n_frames_per_step)
+                decoder output (B, n_mel_channels * n_frames_per_step)
             processed_memory:
                 processed encoder outputs (B, T_in, attention_dim)
            attention_weights_cat:
                 cumulative and prev. att weights (B, 2, max_time)
         Returns:
-            Tensor: alignment (batch, max_time)
+            Tensor:
+                alignment (B, max_time)
         """

         processed_query = self.query_layer(query.unsqueeze(1))
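
For context on the docstring change above (batch → B), here is a minimal, self-contained shape sketch of the alignment-energy contract it describes. It is not part of this PR; the dimension values and the random stand-in for the location-layer output are illustrative assumptions only.

    # Hypothetical shape check for the attention alignment described above.
    import paddle
    import paddle.nn.functional as F

    B, T_in, attention_rnn_dim, attention_dim = 2, 50, 1024, 128
    query = paddle.randn([B, attention_rnn_dim])                   # decoder output
    processed_memory = paddle.randn([B, T_in, attention_dim])      # processed encoder outputs
    processed_attention = paddle.randn([B, T_in, attention_dim])   # stand-in for the location-layer output

    query_layer = paddle.nn.Linear(attention_rnn_dim, attention_dim, bias_attr=False)
    v = paddle.nn.Linear(attention_dim, 1, bias_attr=False)

    processed_query = query_layer(query.unsqueeze(1))              # (B, 1, attention_dim)
    energies = v(paddle.tanh(processed_query + processed_attention +
                             processed_memory)).squeeze(-1)        # (B, max_time), max_time == T_in
    alignment = F.softmax(energies, axis=1)                        # attention weights over encoder steps
    print(alignment.shape)                                         # [2, 50]
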
@@ -316,144 +237,6 @@ def forward(self,
         return attention_context, attention_weights


-class ForwardAttentionV2(nn.Layer):
-    def __init__(self,
-                 attention_rnn_dim: int,
-                 embedding_dim: int,
-                 attention_dim: int,
-                 attention_location_n_filters: int,
-                 attention_location_kernel_size: int):
-        super().__init__()
-        self.query_layer = LinearNorm(
-            in_dim=attention_rnn_dim,
-            out_dim=attention_dim,
-            bias=False,
-            w_init_gain='tanh')
-        self.memory_layer = LinearNorm(
-            in_dim=embedding_dim,
-            out_dim=attention_dim,
-            bias=False,
-            w_init_gain='tanh')
-        self.v = LinearNorm(in_dim=attention_dim, out_dim=1, bias=False)
-        self.location_layer = LocationLayer(
-            attention_n_filters=attention_location_n_filters,
-            attention_kernel_size=attention_location_kernel_size,
-            attention_dim=attention_dim)
-        self.score_mask_value = -float(1e20)
-
-    def get_alignment_energies(self,
-                               query: paddle.Tensor,
-                               processed_memory: paddle.Tensor,
-                               attention_weights_cat: paddle.Tensor):
-        """
-        Args:
-            query:
-                decoder output (batch, n_mel_channels * n_frames_per_step)
-            processed_memory:
-                processed encoder outputs (B, T_in, attention_dim)
-            attention_weights_cat:
-                prev. and cumulative att weights (B, 2, max_time)
-        Returns:
-            Tensor: alignment (batch, max_time)
-        """
-
-        processed_query = self.query_layer(query.unsqueeze(1))
-        processed_attention_weights = self.location_layer(attention_weights_cat)
-        energies = self.v(
-            paddle.tanh(processed_query + processed_attention_weights +
-                        processed_memory))
-
-        energies = energies.squeeze(-1)
-        return energies
-
-    def forward(self,
-                attention_hidden_state: paddle.Tensor,
-                memory: paddle.Tensor,
-                processed_memory: paddle.Tensor,
-                attention_weights_cat: paddle.Tensor,
-                mask: paddle.Tensor,
-                log_alpha: paddle.Tensor):
-        """
-        Args:
-            attention_hidden_state:
-                attention rnn last output
-            memory:
-                encoder outputs
-            processed_memory:
-                processed encoder outputs
-            attention_weights_cat:
-                previous and cumulative attention weights
-            mask:
-                binary mask for padded data
-        """
-        log_energy = self.get_alignment_energies(
-            query=attention_hidden_state,
-            processed_memory=processed_memory,
-            attention_weights_cat=attention_weights_cat)
-
-        if mask is not None:
-            log_energy[:] = paddle.where(
-                mask,
-                paddle.full(log_energy.shape, self.score_mask_value,
-                            log_energy.dtype), log_energy)
-        log_alpha_shift_padded = []
-        max_time = log_energy.shape[1]
-        for sft in range(2):
-            shifted = log_alpha[:, :max_time - sft]
-            shift_padded = F.pad(shifted, (sft, 0), 'constant',
-                                 self.score_mask_value)
-            log_alpha_shift_padded.append(shift_padded.unsqueeze(2))
-
-        biased = paddle.logsumexp(paddle.concat(log_alpha_shift_padded, 2), 2)
-        log_alpha_new = biased + log_energy
-        attention_weights = F.softmax(log_alpha_new, axis=1)
-        attention_context = paddle.bmm(attention_weights.unsqueeze(1), memory)
-        attention_context = attention_context.squeeze(1)
-
-        return attention_context, attention_weights, log_alpha_new
-
-
-class PhaseShuffle2D(nn.Layer):
-    def __init__(self, n: int=2):
-        super().__init__()
-        self.n = n
-        self.random = random.Random(1)
-
-    def forward(self, x: paddle.Tensor, move: int=None):
-        # x.size = (B, C, M, L)
-        if move is None:
-            move = self.random.randint(-self.n, self.n)
-
-        if move == 0:
-            return x
-        else:
-            left = x[:, :, :, :move]
-            right = x[:, :, :, move:]
-            shuffled = paddle.concat([right, left], axis=3)
-            return shuffled
-
-
-class PhaseShuffle1D(nn.Layer):
-    def __init__(self, n: int=2):
-        super().__init__()
-        self.n = n
-        self.random = random.Random(1)
-
-    def forward(self, x: paddle.Tensor, move: int=None):
-        # x.size = (B, C, M, L)
-        if move is None:
-            move = self.random.randint(-self.n, self.n)
-
-        if move == 0:
-            return x
-        else:
-            left = x[:, :, :move]
-            right = x[:, :, move:]
-            shuffled = paddle.concat([right, left], axis=2)
-
-            return shuffled
-
-
 class MFCC(nn.Layer):
     def __init__(self, n_mfcc: int=40, n_mels: int=80):
         super().__init__()
@@ -473,7 +256,6 @@ def forward(self, mel_specgram: paddle.Tensor):
         # -> (channel, time, n_mfcc).transpose(...)
         mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]),
                              self.dct_mat).transpose([0, 2, 1])
-
         # unpack batch
         if unsqueezed:
             mfcc = mfcc.squeeze(0)
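
As an aside for reviewers, a minimal, self-contained sketch of the transpose/matmul step in MFCC.forward above; it is not part of this PR, and the DCT matrix here is a random stand-in with assumed shape (n_mels, n_mfcc) rather than the layer's real DCT basis.

    # Hypothetical shape walk-through of the DCT projection above.
    import paddle

    B, n_mels, n_mfcc, T = 2, 80, 40, 100
    mel_specgram = paddle.randn([B, n_mels, T])   # batch of mel spectrograms
    dct_mat = paddle.randn([n_mels, n_mfcc])      # stand-in for the layer's DCT basis

    # (B, n_mels, T) -> (B, T, n_mels) @ (n_mels, n_mfcc) -> (B, T, n_mfcc) -> (B, n_mfcc, T)
    mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]),
                         dct_mat).transpose([0, 2, 1])
    print(mfcc.shape)                             # [2, 40, 100]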