Skip to content

Commit 66800d6

Browse files
committed
Support FLAC format
1 parent 76620b3 commit 66800d6

2 files changed

Lines changed: 29 additions & 5 deletions

File tree

preprocessing/acoustic_binarizer.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@
4747
'key_shift',
4848
'speed',
4949
]
50+
WAV_CANDIDATE_EXTENSIONS = ['.wav', '.flac']
5051

5152
pitch_extractor: BasePE = None
5253
energy_smooth: SinusoidalSmoothingConv1d = None
@@ -73,8 +74,19 @@ def load_meta_data(self, raw_data_dir: pathlib.Path, ds_id, spk, lang):
7374
with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf-8') as f:
7475
for utterance_label in csv.DictReader(f):
7576
item_name = utterance_label['name']
77+
wav_fn = None
78+
for ext in WAV_CANDIDATE_EXTENSIONS:
79+
candidate_fn = raw_data_dir / 'wavs' / f'{item_name}{ext}'
80+
if candidate_fn.exists():
81+
wav_fn = candidate_fn
82+
break
83+
if wav_fn is None:
84+
raise FileNotFoundError(
85+
f'Waveform file not found for item \'{item_name}\'. '
86+
f'Candidate extensions: {WAV_CANDIDATE_EXTENSIONS}'
87+
)
7688
temp_dict = {
77-
'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'),
89+
'wav_fn': str(wav_fn),
7890
'spk_id': self.spk_map[spk],
7991
'spk_name': spk,
8092
'lang_seq': [

preprocessing/variance_binarizer.py

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@
4949
'voicing', # frame-level RMS of harmonic parts (dB), float32[T_s,]
5050
'tension', # frame-level tension (logit), float32[T_s,]
5151
]
52+
WAV_CANDIDATE_EXTENSIONS = ['.wav', '.flac']
5253
DS_INDEX_SEP = '#'
5354

5455
# These operators are used as global variables due to a PyTorch shared memory bug on Windows platforms.
@@ -129,13 +130,26 @@ def require(attr, optional=False):
129130
raise ValueError(f'Missing required attribute {attr} of item \'{item_name}\'.')
130131
return value
131132

133+
wav_fn = None
134+
for ext in WAV_CANDIDATE_EXTENSIONS:
135+
candidate_fn = raw_data_dir / 'wavs' / f'{item_name}{ext}'
136+
if candidate_fn.exists():
137+
wav_fn = candidate_fn
138+
break
139+
if wav_fn is None and not self.prefer_ds:
140+
raise FileNotFoundError(
141+
f'Waveform file not found for item \'{item_name}\'. '
142+
f'Candidate extensions: {WAV_CANDIDATE_EXTENSIONS}\n'
143+
f'If you are using DS files instead of waveform files, please set \'prefer_ds\' to true.'
144+
)
145+
132146
temp_dict = {
133147
'ds_idx': item_idx,
134148
'spk_id': self.spk_map[spk],
135149
'spk_name': spk,
136150
'language_id': self.lang_map[lang],
137151
'language_name': lang,
138-
'wav_fn': str(raw_data_dir / 'wavs' / f'{item_name}.wav'),
152+
'wav_fn': str(wav_fn) if wav_fn is not None else None,
139153
'lang_seq': [
140154
(
141155
self.lang_map[lang if '/' not in p else p.split('/', maxsplit=1)[0]]
@@ -288,10 +302,8 @@ def process_item(self, item_name, meta_data, binarization_args):
288302
processed_input['mel2ph'] = mel2ph.cpu().numpy()
289303

290304
# Below: extract actual f0, convert to pitch and calculate delta pitch
291-
if pathlib.Path(meta_data['wav_fn']).exists():
305+
if meta_data['wav_fn'] is not None:
292306
waveform, _ = librosa.load(meta_data['wav_fn'], sr=hparams['audio_sample_rate'], mono=True)
293-
elif not self.prefer_ds:
294-
raise FileNotFoundError(meta_data['wav_fn'])
295307
else:
296308
waveform = None
297309

0 commit comments

Comments
 (0)