Skip to content

Commit

Permalink
black with 120 char line-length
Browse files: browse the repository at this point in the history
Fix #118
  • Loading branch information
albertz committed Feb 28, 2023
1 parent f82d114 commit bcaf65b
Show file tree
Hide file tree
Showing 47 changed files with 406 additions and 910 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ jobs:
with:
python-version: 3.8
- run: pip install black==22.3.0
- run: black --diff common/
- run: black --check common/
- run: black --diff .
- run: black --check .
4 changes: 1 addition & 3 deletions common/baselines/librispeech/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@ def get_corpus_data_inputs(
"""

# Dictionary containing all LibriSpeech CorpusObject entries
corpus_object_dict = get_corpus_object_dict(
audio_format="wav", output_prefix="corpora"
)
corpus_object_dict = get_corpus_object_dict(audio_format="wav", output_prefix="corpora")

# Definition of the official 4-gram LM to be used as default LM
lm = {
Expand Down
4 changes: 1 addition & 3 deletions common/baselines/librispeech/default_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
# RASR_BINARY_PATH = None
# RASR_BINARY_PATH = compile_rasr_binaries_i6mode(commit="907eec4f4e36c11153f6ab6b5dd7675116f909f6") # use tested RASR
RASR_BINARY_PATH = compile_rasr_binaries_i6mode() # use most recent RASR
assert (
RASR_BINARY_PATH
), "Please set a specific RASR_BINARY_PATH before running the pipeline"
assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline"
RASR_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_RASR_BINARY_PATH"


Expand Down
8 changes: 2 additions & 6 deletions common/baselines/librispeech/ls100/gmm/baseline_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,7 @@ def get_monophone_args():
"use_gpu": False,
}

return util.GmmMonophoneArgs(
linear_alignment_args, monophone_training_args, monophone_recognition_args
)
return util.GmmMonophoneArgs(linear_alignment_args, monophone_training_args, monophone_recognition_args)


def get_cart_args(
Expand All @@ -163,9 +161,7 @@ def get_cart_args(
:return:
"""

CartQuestions = (
CartQuestionsWithStress if use_stress_marker else CartQuestionsWithoutStress
)
CartQuestions = CartQuestionsWithStress if use_stress_marker else CartQuestionsWithoutStress

cart_questions_class = CartQuestions(
max_leaves=max_leaves,
Expand Down
4 changes: 1 addition & 3 deletions common/baselines/librispeech/ls100/gmm/baseline_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ def run_librispeech_100_common_baseline(
steps.add_step("vtln+sat", vtln_sat_args)
steps.add_step("output", final_output_args)

corpus_data = get_corpus_data_inputs(
corpus_key="train-clean-100", use_g2p_training=True, use_stress_marker=False
)
corpus_data = get_corpus_data_inputs(corpus_key="train-clean-100", use_g2p_training=True, use_stress_marker=False)

system = gmm_system.GmmSystem(rasr_binary_path=RASR_BINARY_PATH)
system.init_system(
Expand Down
8 changes: 2 additions & 6 deletions common/baselines/librispeech/ls960/gmm/baseline_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,7 @@ def get_monophone_args(feature_flow: str = "mfcc+deriv+norm") -> util.GmmMonopho
"use_gpu": False,
}

return util.GmmMonophoneArgs(
linear_alignment_args, monophone_training_args, monophone_recognition_args
)
return util.GmmMonophoneArgs(linear_alignment_args, monophone_training_args, monophone_recognition_args)


def get_cart_args(
Expand All @@ -168,9 +166,7 @@ def get_cart_args(
:param add_unknown: set to true if an unknown phoneme exists
"""

CartQuestions = (
CartQuestionsWithStress if use_stress_marker else CartQuestionsWithoutStress
)
CartQuestions = CartQuestionsWithStress if use_stress_marker else CartQuestionsWithoutStress

cart_questions_class = CartQuestions(
max_leaves=max_leaves,
Expand Down
4 changes: 1 addition & 3 deletions common/baselines/librispeech/ls960/gmm/baseline_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,7 @@ def run_librispeech_960_common_baseline(

# ******************** Data ********************

corpus_data = get_corpus_data_inputs(
corpus_key="train-other-960", use_g2p_training=True, use_stress_marker=False
)
corpus_data = get_corpus_data_inputs(corpus_key="train-other-960", use_g2p_training=True, use_stress_marker=False)

# ******************** GMM System ********************

Expand Down
8 changes: 2 additions & 6 deletions common/baselines/librispeech/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,10 @@ def gmm_example_report_format(report: _Report_Type) -> str:
+ str(results["dev-other"]["VTLN"]["10"])
)
out.append(
"SAT 08".ljust(23)
+ str(results["dev-clean"]["SAT"]["08"]).ljust(14)
+ str(results["dev-other"]["SAT"]["08"])
"SAT 08".ljust(23) + str(results["dev-clean"]["SAT"]["08"]).ljust(14) + str(results["dev-other"]["SAT"]["08"])
)
out.append(
"SAT 10".ljust(23)
+ str(results["dev-clean"]["SAT"]["10"]).ljust(14)
+ str(results["dev-other"]["SAT"]["10"])
"SAT 10".ljust(23) + str(results["dev-clean"]["SAT"]["10"]).ljust(14) + str(results["dev-other"]["SAT"]["10"])
)
out.append(
"VTLN+SAT 08".ljust(17)
Expand Down
20 changes: 5 additions & 15 deletions common/datasets/librispeech/cart.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,16 +139,12 @@ def __init__(
},
{
"type": "for-each-key",
"keys": (" ").join(
"history[0] central future[0]".split(" ")[:n_phones]
),
"keys": (" ").join("history[0] central future[0]".split(" ")[:n_phones]),
"questions": [
{
"type": "for-each-value",
"values": self.phonemes_boundary_str,
"questions": [
{"type": "question", "description": "context-phone"}
],
"questions": [{"type": "question", "description": "context-phone"}],
},
{
"type": "question",
Expand Down Expand Up @@ -374,9 +370,7 @@ def __init__(


class CartQuestionsWithStress:
def __init__(
self, max_leaves=12001, min_obs=1000, add_unknown: bool = True, n_phones=3
):
def __init__(self, max_leaves=12001, min_obs=1000, add_unknown: bool = True, n_phones=3):
self.max_leaves = max_leaves
self.min_obs = min_obs
self.boundary = "#"
Expand Down Expand Up @@ -528,16 +522,12 @@ def __init__(
},
{
"type": "for-each-key",
"keys": (" ").join(
"history[0] central future[0]".split(" ")[:n_phones]
),
"keys": (" ").join("history[0] central future[0]".split(" ")[:n_phones]),
"questions": [
{
"type": "for-each-value",
"values": self.phonemes_boundary_str,
"questions": [
{"type": "question", "description": "context-phone"}
],
"questions": [{"type": "question", "description": "context-phone"}],
},
{
"type": "question",
Expand Down
24 changes: 6 additions & 18 deletions common/datasets/librispeech/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,11 @@
"test-other": 10,
}

durations["train-clean-460"] = (
durations["train-clean-100"] + durations["train-clean-360"]
)
durations["train-other-960"] = (
durations["train-clean-460"] + durations["train-other-500"]
)
durations["train-clean-460"] = durations["train-clean-100"] + durations["train-clean-360"]
durations["train-other-960"] = durations["train-clean-460"] + durations["train-other-500"]

num_segments["train-clean-460"] = (
num_segments["train-clean-100"] + num_segments["train-clean-360"]
)
num_segments["train-other-960"] = (
num_segments["train-clean-460"] + num_segments["train-other-500"]
)
num_segments["train-clean-460"] = num_segments["train-clean-100"] + num_segments["train-clean-360"]
num_segments["train-other-960"] = num_segments["train-clean-460"] + num_segments["train-other-500"]

concurrent["train-clean-460"] = (
concurrent["train-clean-100"] + concurrent["train-clean-360"]
)
concurrent["train-other-960"] = (
concurrent["train-clean-460"] + concurrent["train-other-500"]
)
concurrent["train-clean-460"] = concurrent["train-clean-100"] + concurrent["train-clean-360"]
concurrent["train-other-960"] = concurrent["train-clean-460"] + concurrent["train-other-500"]
40 changes: 10 additions & 30 deletions common/datasets/librispeech/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,16 @@ def get_bliss_corpus_dict(audio_format="flac", output_prefix="datasets"):
output_prefix = os.path.join(output_prefix, "LibriSpeech")

download_metadata_job = DownloadLibriSpeechMetadataJob()
download_metadata_job.add_alias(
os.path.join(output_prefix, "download", "metadata_job")
)
download_metadata_job.add_alias(os.path.join(output_prefix, "download", "metadata_job"))

def _get_corpus(corpus_name):
download_corpus_job = DownloadLibriSpeechCorpusJob(corpus_key=corpus_name)
create_bliss_corpus_job = LibriSpeechCreateBlissCorpusJob(
corpus_folder=download_corpus_job.out_corpus_folder,
speaker_metadata=download_metadata_job.out_speakers,
)
download_corpus_job.add_alias(
os.path.join(output_prefix, "download", corpus_name)
)
create_bliss_corpus_job.add_alias(
os.path.join(output_prefix, "create_bliss", corpus_name)
)
download_corpus_job.add_alias(os.path.join(output_prefix, "download", corpus_name))
create_bliss_corpus_job.add_alias(os.path.join(output_prefix, "create_bliss", corpus_name))
return create_bliss_corpus_job.out_corpus

corpus_names = [
Expand All @@ -67,9 +61,7 @@ def _get_corpus(corpus_name):
"train-other-500",
]

bliss_corpus_dict = {
corpus_name: _get_corpus(corpus_name) for corpus_name in corpus_names
}
bliss_corpus_dict = {corpus_name: _get_corpus(corpus_name) for corpus_name in corpus_names}

audio_format_options = {
"wav": {
Expand All @@ -94,19 +86,13 @@ def _get_corpus(corpus_name):
corpus_name,
)
)
converted_bliss_corpus_dict[
corpus_name
] = bliss_change_encoding_job.out_corpus
converted_bliss_corpus_dict[corpus_name] = bliss_change_encoding_job.out_corpus
else:
converted_bliss_corpus_dict = bliss_corpus_dict

def _merge_corpora(corpora, name):
merge_job = MergeCorporaJob(
bliss_corpora=corpora, name=name, merge_strategy=MergeStrategy.FLAT
)
merge_job.add_alias(
os.path.join(output_prefix, "%s_merge" % audio_format, name)
)
merge_job = MergeCorporaJob(bliss_corpora=corpora, name=name, merge_strategy=MergeStrategy.FLAT)
merge_job.add_alias(os.path.join(output_prefix, "%s_merge" % audio_format, name))
return merge_job.out_merged_corpus

converted_bliss_corpus_dict["train-clean-460"] = _merge_corpora(
Expand Down Expand Up @@ -151,9 +137,7 @@ def get_corpus_object_dict(audio_format="flac", output_prefix="datasets"):
- 'train-other-960'
:rtype: dict[str, CorpusObject]
"""
bliss_corpus_dict = get_bliss_corpus_dict(
audio_format=audio_format, output_prefix=output_prefix
)
bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)

corpus_object_dict = {}

Expand Down Expand Up @@ -196,19 +180,15 @@ def get_ogg_zip_dict(
from i6_core.returnn.oggzip import BlissToOggZipJob

ogg_zip_dict = {}
bliss_corpus_dict = get_bliss_corpus_dict(
audio_format="ogg", output_prefix=output_prefix
)
bliss_corpus_dict = get_bliss_corpus_dict(audio_format="ogg", output_prefix=output_prefix)
for name, bliss_corpus in bliss_corpus_dict.items():
ogg_zip_job = BlissToOggZipJob(
bliss_corpus,
no_conversion=True,
returnn_python_exe=returnn_python_exe,
returnn_root=returnn_root,
)
ogg_zip_job.add_alias(
os.path.join(output_prefix, "LibriSpeech", "%s_ogg_zip_job" % name)
)
ogg_zip_job.add_alias(os.path.join(output_prefix, "LibriSpeech", "%s_ogg_zip_job" % name))
ogg_zip_dict[name] = ogg_zip_job.out_ogg_zip

return ogg_zip_dict
40 changes: 10 additions & 30 deletions common/datasets/librispeech/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,17 @@ def _export_datasets(output_prefix):

# export all bliss corpora
for audio_format in ["flac", "ogg", "wav"]:
bliss_corpus_dict = get_bliss_corpus_dict(
audio_format=audio_format, output_prefix=output_prefix
)
bliss_corpus_dict = get_bliss_corpus_dict(audio_format=audio_format, output_prefix=output_prefix)
for name, bliss_corpus in bliss_corpus_dict.items():
tk.register_output(
os.path.join(
output_prefix, "LibriSpeech", "%s-%s.xml.gz" % (name, audio_format)
),
os.path.join(output_prefix, "LibriSpeech", "%s-%s.xml.gz" % (name, audio_format)),
bliss_corpus,
)

# export all ogg zip corpora
ogg_corpus_dict = get_ogg_zip_dict(output_prefix=output_prefix)
for name, ogg_corpus in ogg_corpus_dict.items():
tk.register_output(
os.path.join(output_prefix, "LibriSpeech", "%s.ogg.zip" % name), ogg_corpus
)
tk.register_output(os.path.join(output_prefix, "LibriSpeech", "%s.ogg.zip" % name), ogg_corpus)


def _export_lm_data(output_prefix):
Expand All @@ -57,37 +51,27 @@ def _export_lexicon_and_vocab(output_prefix):
lexicon_output_prefix = os.path.join(output_prefix, "LibriSpeech", "lexicon")

# folded / without stress marker
bliss_lexicon = get_bliss_lexicon(
output_prefix=output_prefix, use_stress_marker=False
)
bliss_lexicon = get_bliss_lexicon(output_prefix=output_prefix, use_stress_marker=False)
tk.register_output(
os.path.join(lexicon_output_prefix, "librispeech.lexicon.folded.xml.gz"),
bliss_lexicon,
)

g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict(
use_stress_marker=True, output_prefix=output_prefix
)
g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict(use_stress_marker=True, output_prefix=output_prefix)
for k, lexicon in g2p_lexicon_dict.items():
tk.register_output(
os.path.join(
lexicon_output_prefix, "%s.lexicon_with_g2p.folded.xml.gz" % k
),
os.path.join(lexicon_output_prefix, "%s.lexicon_with_g2p.folded.xml.gz" % k),
lexicon,
)

# with stress marker
bliss_lexicon = get_bliss_lexicon(
output_prefix=output_prefix, use_stress_marker=True
)
bliss_lexicon = get_bliss_lexicon(output_prefix=output_prefix, use_stress_marker=True)
tk.register_output(
os.path.join(lexicon_output_prefix, "librispeech.lexicon.xml.gz"),
bliss_lexicon,
)

g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict(
use_stress_marker=False, output_prefix=output_prefix
)
g2p_lexicon_dict = get_g2p_augmented_bliss_lexicon_dict(use_stress_marker=False, output_prefix=output_prefix)
for k, lexicon in g2p_lexicon_dict.items():
tk.register_output(
os.path.join(lexicon_output_prefix, "%s.lexicon_with_g2p.xml.gz" % k),
Expand All @@ -102,12 +86,8 @@ def _export_legacy_bpe(output_prefix):
:param str output_prefix
"""
lexicon_output_prefix = os.path.join(output_prefix, "LibriSpeech", "bpe")
ls960_bpe_settings = get_subword_nmt_bpe(
corpus_key="train-other-960", bpe_size=10000, output_prefix=output_prefix
)
ls100_bpe_settings = get_subword_nmt_bpe(
corpus_key="train-clean-100", bpe_size=2000, output_prefix=output_prefix
)
ls960_bpe_settings = get_subword_nmt_bpe(corpus_key="train-other-960", bpe_size=10000, output_prefix=output_prefix)
ls100_bpe_settings = get_subword_nmt_bpe(corpus_key="train-clean-100", bpe_size=2000, output_prefix=output_prefix)
tk.register_output(
os.path.join(lexicon_output_prefix, "train-other-960", "bpe_10k.codes"),
ls960_bpe_settings.bpe_codes,
Expand Down
16 changes: 4 additions & 12 deletions common/datasets/librispeech/language_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,8 @@ def get_arpa_lm_dict(output_prefix="datasets"):
lm_dict["3gram"] = download_arpa_3gram_lm_job.out_file

lm_prefix = os.path.join(output_prefix, "LibriSpeech", "lm")
download_arpa_3gram_lm_job.add_alias(
os.path.join(lm_prefix, "download_3gram_lm_job")
)
download_arpa_4gram_lm_job.add_alias(
os.path.join(lm_prefix, "download_4gram_lm_job")
)
download_arpa_3gram_lm_job.add_alias(os.path.join(lm_prefix, "download_3gram_lm_job"))
download_arpa_4gram_lm_job.add_alias(os.path.join(lm_prefix, "download_4gram_lm_job"))

return lm_dict

Expand All @@ -51,10 +47,6 @@ def get_librispeech_normalized_lm_data(output_prefix="datasets") -> tk.Path:
:param output_prefix:
:return: gzipped text file containing the LM training data
"""
download_job = DownloadJob(
url="https://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz"
)
download_job.add_alias(
os.path.join(output_prefix, "LibriSpeech", "lm", "download_lm_data")
)
download_job = DownloadJob(url="https://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz")
download_job.add_alias(os.path.join(output_prefix, "LibriSpeech", "lm", "download_lm_data"))
return download_job.out_file
Loading

0 comments on commit bcaf65b

Please sign in to comment.