-
Notifications
You must be signed in to change notification settings - Fork 46
/
prepare-dataset.py
77 lines (64 loc) · 2.71 KB
/
prepare-dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File : prepare-dataset.py
@Date : 2021/01/05, Tue
@Author : Atomicoo
@Version : 1.0
@Contact : atomicoo95@gmail.com
@License : (C)Copyright 2020-2021, ShiGroup-NLP-XMU
@Desc : Download and preprocess datasets. Supported datasets are:
* English female: LJSpeech
* Mandarin female: BBSpeech (BIAOBEI)
* Tibetan female: TBSpeech (Non-public)
* Mongolian male: MBSpeech (Mongolian Bible)
* Korean female: KSSpeech (Kaggle Korean Single Speech)
* Cantonese male: HKSpeech (Common Voice, Hong Kong)
* Japanese female: JPSpeech (JSUT Speech Corpus)
'''
__author__ = 'Atomicoo'
import argparse
import os
import os.path as osp
import subprocess
import sys

import pandas as pd

from utils.hparams import HParam
from utils.utils import download_file
from helpers.processor import Processor
from datasets.dataset import SpeechDataset
# ---------------------------------------------------------------------------
# Command line: an optional hyper-parameter file and a switch that selects
# which of the two Processor entry points is run at the end of the script.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--config', default=None, type=str, help='Config file path')
parser.add_argument('--compute', action='store_true', help='Pre-compute dataset statistics')
args = parser.parse_args()

# Without --config, fall back to config/default.yaml under the current
# working directory.
config_path = args.config if args.config \
    else osp.join(osp.abspath(os.getcwd()), 'config', 'default.yaml')
hparams = HParam(config_path)

# ---------------------------------------------------------------------------
# Paths. The archive is named after hparams.data.dataset_dir and is stored
# next to the folder it unpacks into, inside hparams.data.datasets_path.
# ---------------------------------------------------------------------------
ARCHIVE_SUFFIX = '.tar.bz2'

datasets_path = hparams.data.datasets_path
dataset_file_url = \
    f'https://open-speech-data.oss-cn-hangzhou.aliyuncs.com/{hparams.data.dataset_dir}{ARCHIVE_SUFFIX}'
dataset_file_name = osp.basename(dataset_file_url)
# Folder name == archive name without its suffix (replaces a hard-coded [:-8]).
dataset_dir = dataset_file_name[:-len(ARCHIVE_SUFFIX)]
dataset_path = osp.join(datasets_path, dataset_dir)
wavfile_path = osp.join(dataset_path, "wavs")

# NOTE: processing always runs, even when the dataset folder already holds a
# "mels" directory; the former early-exit check for that case was hard-wired
# off (`... and False`) and has been dropped as unreachable code.

# ---------------------------------------------------------------------------
# Download the archive unless it is already on disk.
# ---------------------------------------------------------------------------
dataset_file_path = osp.join(datasets_path, dataset_file_name)
if not osp.isfile(dataset_file_path):
    download_file(dataset_file_url, dataset_file_path)
else:
    print("'%s' already exists" % dataset_file_name)

# ---------------------------------------------------------------------------
# Extract the archive unless the "wavs" folder is already present.
# ---------------------------------------------------------------------------
if not osp.isdir(wavfile_path):
    print("extracting '%s'..." % dataset_file_name)
    try:
        # Argument list + cwd instead of a `cd ...; tar ...` shell string:
        # paths containing spaces or shell metacharacters are passed through
        # verbatim. An empty datasets_path means "current directory", which
        # matches how dataset_file_path was joined above.
        subprocess.run(
            ['tar', 'xvjf', dataset_file_name],
            cwd=datasets_path or None,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as err:
        # Stop here instead of going on to process a missing or partial dataset.
        sys.exit("failed to extract '%s': %s" % (dataset_file_name, err))
else:
    print("'%s' already exists" % wavfile_path)

# ---------------------------------------------------------------------------
# Build the dataset/processor objects and run the selected mode.
# ---------------------------------------------------------------------------
dataset_root = osp.join(hparams.data.datasets_path, hparams.data.dataset_dir)
dataset = SpeechDataset([], dataset_root, hparams.text)
processor = Processor(hparams=hparams.audio)

# --compute selects Processor.precompute; the default is Processor.preprocess.
if args.compute:
    processor.precompute(dataset_path, dataset)
else:
    processor.preprocess(dataset_path, dataset)