Convolutional-Forced-Alignment/inference.py at main · sfarhat/Convolutional-Forced-Alignment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
import torch
from ctcdecode import CTCBeamDecoder
from grad_cam import Sequential_GRAD_CAM
import matplotlib.pyplot as plt
from loss import calculate_loss
from pronouncing import generate_spaces_in_guess, pronunciation_model
from dataset_utils import spec_time_to_waveform_time, SPACE_TOKEN
from config import hparams

def test_accuracy(model, test_loader, criterion, device, transformer):
    """
    Evaluation for model.

    Args:
        model (nn.Module): Network to test on
        test_loader (torch.utils.data.dataloader): DataLoader for test dataset
        criterion (nn.modules.loss): Loss function
        device (torch.device): Device (cpu or cuda)
        transformer (timit_utils.PhonemeTransformer or utils.TextTransfomer): Transformer that handles all labels <-> text
    """

    model.eval()
    data_len = len(test_loader.dataset)
    with torch.no_grad():
        phon_err_rates = []
        for batch_num, data in enumerate(test_loader):

            inputs, input_lengths, targets, target_lengths = data
            inputs, targets = inputs.to(device), targets.to(device)
            # output of shape batch x time x classes
            output = model(inputs)
            loss = calculate_loss(criterion, output, targets, input_lengths, target_lengths)

            for log_probs, true_target, target_len, input in zip(output, targets, target_lengths, inputs):

                # For TIMIT, moving to 39 test labels occurs in target_to_text()
                guessed_text = timit_decode(log_probs, target_len, transformer)
                true_text = transformer.target_to_text(true_target[:target_len])

                per = phoneme_error_rate(guessed_text, true_text).item()
                phon_err_rates.append(per)

                print(f"[{(batch_num+1) * len(inputs)}/{data_len} ({100. * (batch_num+1) / len(test_loader):.2f}%)]\tPER: {per:.6f}")

    avg_per = sum(phon_err_rates) / len(phon_err_rates) * 100

    print('Average PER: {}%'.format(avg_per))

    return avg_per

def test_alignment(model, loader, device, transformer):

    model.eval()
    data_len = len(loader.dataset)

    with torch.no_grad():
        alignment_errs = []
        within_10, within_20, within_30, within_40, within_50 = 0, 0, 0, 0, 0
        for batch_num, data in enumerate(loader):

            inputs, samples, spectrogram_generator = data
            # Loaded w/o collator, so we get sample with
            # sample = {'audio': [], 'phonemes': [], 'words': [], 'transcript': ''}

            for input, sample in zip(inputs, samples):

                guessed_word_alignments = force_align(model, transformer, device, input, spectrogram_generator, sample['transcript'])
                true_word_alignments = sample['words']

                ae = alignment_error(guessed_word_alignments, true_word_alignments)
                alignment_errs.append(ae)

                print(f"[{(batch_num+1) * len(inputs)}/{data_len} ({100. * (batch_num+1) / len(loader):.2f}%)]\tAE: {ae:.6f}")

                if ae <= 0.010:
                    within_10 += 1
                    within_20 += 1
                    within_30 += 1
                    within_40 += 1
                    within_50 += 1
                elif ae <= 0.020:
                    within_20 += 1
                    within_30 += 1
                    within_40 += 1
                    within_50 += 1
                elif ae <= 0.030:
                    within_30 += 1
                    within_40 += 1
                    within_50 += 1
                elif ae <= 0.040:
                    within_40 += 1
                    within_50 += 1
                elif ae <= 0.050:
                    within_50 += 1
                else:
                    pass

    avg_ae = sum(alignment_errs) / len(alignment_errs)
    within_10 = within_10 / len(alignment_errs)
    within_20 = within_20 / len(alignment_errs)
    within_30 = within_30 / len(alignment_errs)
    within_40 = within_40 / len(alignment_errs)
    within_50 = within_50 / len(alignment_errs)

    print(within_10, within_20, within_30, within_40, within_50)

    print('Average AE: {}%'.format(avg_ae))

    return avg_ae

def show_activation_map(model, device, input, desired_phone_indices):
    """Creates and displays an activation map using GRAD-CAM on top of an input spectrogram. Given that in our model the time dimension exists, a target class (desired phoneme)
    can span multiple time-steps, so when asking for a map wrt a class, the specific phoneme must be provided in the form of its index in the (non-artificially-repeating) original phoneme transcript.
    In turn, a map will be generated by taking the element-wise maximum among all maps corresponding to the desired phoneme.

    For example, if the non-duration-including phoneme transcript is ['sil', 'ae', 's', 'ae'], then providing desired_phone_idx = 1 will generate a CAM
    across all timesteps covered by the first 'ae'.

    Args:
        model (nn.Module): Network to put input into
        device (torch.device): Device (cpu or cuda)
        input (Tensor): Input to be fed into model of shape channel x features x time
        desired_phone_idx (list): The indices of the desired phonemes in the output transcript to generate the activation map with respect to
    """

    gcam = Sequential_GRAD_CAM(model)

    # Artifically create batch dimension
    input = input.to(device).unsqueeze(0)
    # output of shape batch x time x classes
    log_probs = model(input).squeeze(0)

    guessed_labels = torch.argmax(log_probs, dim=1)

    # guessed_labels don't requires_grad, so we index into log_probs to get target_classes
    target_classes = gcam.get_target_classes(log_probs, guessed_labels, desired_phone_indices)
    # Squeeze out batch and channel dimensions when providing interpolation size
    cam = gcam.generate_cam(input.squeeze(0).squeeze(0).shape, target_classes)

    plotted_cam = cam.squeeze(0).permute(1, 2, 0).cpu()
    plotted_input = input.squeeze(0).permute(1, 2, 0).cpu()
    plt.imshow(plotted_cam, alpha=1, cmap='jet')
    plt.imshow(plotted_input, alpha=0.5, cmap='binary')
    plt.savefig('cam.png')
    plt.show()

def force_align(model, transformer, device, input, spectrogram_generator, transcript):
    """(Pseudo-)Force aligns waveform with transcript on a word-level by using Levenshtein distance to compute where
    word separations belong in generated phonetic transcript.

    Args:
        model ([type]): [description]
        transformer ([type]): [description]
        device ([type]): [description]
        input ([type]): [description]
        spectrogram_generator ([type]): [description]
        transcript (list): List of words making up transcript
    """

    pronounced_transcript = pronunciation_model(transcript, transformer)

    input = input.to(device).unsqueeze(0)
    log_probs = model(input).squeeze(0)
    guessed_labels = torch.argmax(log_probs, dim=1)

    # print("Guessed transcript")

    # Must do int -> 39 txt phones -> collapsing repeats
    # Going from int to 39 can introduce extra repeats as well
    guess_pre_collapsing = transformer.target_to_text(guessed_labels)
    guess = collapse_repeats(guess_pre_collapsing)

    path = edit_distance_path(guess, pronounced_transcript)

    guess_with_spaces = generate_spaces_in_guess(guess, pronounced_transcript, path)

    # Re-extend guess with repeats to map space_indices to waveform times

    # Want space indices wrt length of original transcript
    space_indices = []
    pre_collapsed_idx = 0
    for phon in guess_with_spaces:
        if phon == SPACE_TOKEN:
            space_indices.append(pre_collapsed_idx)
            continue
        while pre_collapsed_idx < len(guess_pre_collapsing) and phon == guess_pre_collapsing[pre_collapsed_idx]:
            pre_collapsed_idx += 1

    # print("Found where spaces belong")

    # print(space_indices)

    # pass desired words into CAM generator via space_indices since they're in spectrogram space
    if hparams['cam_word'] >= len(space_indices):
        print(f"Invalid word index to compute CAM over. Please choose a value less than {len(space_indices)}")
    else:
        if hparams['cam_word'] == 0:
            desired_cam_range = list(range(0, space_indices[hparams['cam_word']]))
        else:
            desired_cam_range = list(range(space_indices[hparams['cam_word']-1], space_indices[hparams['cam_word']]))
        desired_cam_indices = []
        prev = None
        num_phones_seen = -1
        for phon_idx, phon in enumerate(guess_pre_collapsing):
            if phon != prev:
                num_phones_seen += 1
            if phon_idx in desired_cam_range and num_phones_seen not in desired_cam_indices:
                desired_cam_indices.append(num_phones_seen)
            prev = phon

        # squeezing necessary since it is unsqueezed again in this fn, super clean lmao
        show_activation_map(model, device, input.squeeze(0), desired_cam_indices)

    word_alignments = []

    # split true transcript into words, use that to index into space_indices to get start and end
    # TODO: there is a space at the very end, leads to ending of final word to be out of bounds
    end, prev_end = 0, 0
    for i in range(len(transcript)):
        # print("Computing alignment for: ", transcript[i])
        end = spec_time_to_waveform_time(space_indices[i], spectrogram_generator) / 16500
        word_alignment = {'word': transcript[i], 'start': prev_end, 'end': end}
        word_alignments.append(word_alignment)
        prev_end = end

    # print(word_alignments)
    return word_alignments

def timit_decode(log_probs, target_len, transformer):
    """Generates 39-label phoneme sequence from output of network for a single sample"""

    phon_indices = torch.argmax(log_probs, dim=1)
    return transformer.target_to_text(phon_indices[:target_len])

def alignment_error(guess, truth):
    # Take end times of words, compute difference, average them

    error = 0

    # Don't need to do last word
    num_words = len(guess) - 1
    for i in range(num_words):
        guess_end = guess[i]['end']
        true_end = truth[i]['end']

        diff = abs(guess_end - true_end)
        error += diff

    ae = error / num_words

    return ae

def phoneme_error_rate(guess, truth):
    """Phoneme Error Rate of sequence"""

    # collapsed_guess, collapsed_true = collapse_repeats(guess), collapse_repeats(truth)
    # levenshtein_dist = edit_distance(collapsed_guess, collapsed_true)
    # per = levenshtein_dist / len(collapsed_true)

    levenshtein_dist = edit_distance(guess, truth)
    per = levenshtein_dist / len(truth)

    return per

def collapse_repeats(sequence):
    """Collapse repeats from sequence to be used for PER"""

    result = []
    prev = None

    for x in sequence:
        if x == prev:
            continue

        result.append(x)
        prev = x

    return result

def generate_edit_distance_matrix(a, b):
    """Generates DP matrix for Levenshtein distance"""

    # add 1 for blank beginning
    m, n = len(a)+1, len(b)+1
    d = torch.empty(m, n)

    for i in range(m):
        d[i, 0] = i

    for j in range(n):
        d[0, j] = j

    for i in range(1, m):
        for j in range(1, n):
            # off-by-one for first char not starting at index 0 of matrix
            if a[i-1] == b[j-1]:
                sub = 0
            else:
                sub = 1
            d[i, j] = min(d[i-1, j] + 1,
                        d[i, j-1] + 1,
                        d[i-1, j-1] + sub)

    return d

def edit_distance(a, b):
    """Levenshtein Distance"""

    return generate_edit_distance_matrix(a, b)[-1,-1]

def edit_distance_path(a, b):

    d = generate_edit_distance_matrix(a, b)

    i, j = d.size(0)-1, d.size(1)-1
    path = [(i, j)]

    while i >= 0 and j >= 0:
        if i == 0 and j == 0:
            break
        elif i == 0:
            path.append((i, j-1))
            i, j = i, j-1
        elif j == 0:
            path.append((i-1, j))
            i, j = i-1, j
        else:
            up, left, diagonal = d[i-1,j], d[i, j-1], d[i-1, j-1]
            min_direction = min(up, left, diagonal)
            # If multiple possible paths, prefers non-diagonal ones
            if min_direction == up:
                path.append((i-1, j))
                i, j = i-1, j
            elif min_direction == left:
                path.append((i, j-1))
                i, j = i, j-1
            else:
                path.append((i-1, j-1))
                i, j = i-1, j-1

    return path[::-1]

def test_Librispeech(model, test_loader, criterion, device, transformer):
    """
    Evaluation for model.

    Args:
        model (nn.Module): Network to test on
        test_loader (torch.utils.data.dataloader): DataLoader for test dataset
        criterion (nn.modules.loss): Loss function
        device (torch.device): Device (cpu or cuda)
        transformer (timit_utils.PhonemeTransformer or utils.TextTransfomer): Transformer that handles all labels <-> text
    """

    model.eval()
    data_len = len(test_loader.dataset)
    with torch.no_grad():
        char_err_rates = []
        for batch_num, data in enumerate(test_loader):

            inputs, input_lengths, targets, target_lengths = data
            inputs, targets = inputs.to(device), targets.to(device)
            # output of shape batch x time x classes
            output = model(inputs)
            loss = calculate_loss(criterion, output, targets, input_lengths, target_lengths)

            for log_probs, true_target, target_len, input in zip(output, targets, target_lengths, inputs):

                guessed_text = greedy_decode(log_probs, transformer)
                true_text = transformer.target_to_text(true_target[:target_len])

                # Phoneme error rate fn doubles as Character Error Rate as well.
                cer = phoneme_error_rate(guessed_text, true_text).item()
                char_err_rates.append(cer)

                print(f"[{(batch_num+1) * len(inputs)}/{data_len} ({100. * (batch_num+1) / len(test_loader):.2f}%)]\tCER: {cer:.6f}")

    avg_cer = sum(char_err_rates) / len(char_err_rates) * 100

    print('Average CER: {}%'.format(avg_cer))

    return avg_cer

def beam_search_decode(log_probs, transformer):

    # Using this ctc decoder: https://github.com/parlance/ctcdecode
    # Labels come from order specified in utils.py, _ represents blank
    labels = list("_ abcdefghijklmnopqrstuvwxyz'")

    decoder = CTCBeamDecoder(
        labels,
        model_path=None,
        alpha=0,
        beta=0,
        cutoff_top_n=40,
        cutoff_prob=1.0,
        beam_width=100,
        num_processes=16,
        blank_id=0,
        log_probs_input=True
    )

    # input to decoder needs to be of shape BATCHSIZE x N_TIMESTEPS x N_LABELS
    # Currently doing single samples, so unsqueeze to create batch of 1
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(log_probs.unsqueeze(dim=0))
    # beam_results is of shape (num_batches, num_beams, time), so to get top beam, index [0][0]
    # cut it off by the appropriate length out_lens with same index
    return transformer.target_to_text(beam_results[0][0][:out_lens[0][0]])


def greedy_decode(log_probs, transformer):

    char_indices = torch.argmax(log_probs, dim=1)
    transcript = []
    blank_label = 0
    prev = None

    for idx in range(len(char_indices)):
        char = char_indices[idx].item()
        if char != blank_label:
            if char != prev:
                transcript.append(char)
        prev = char

    return transformer.target_to_text(transcript)