Skip to content
Snippets Groups Projects
Commit a9348442 authored by Nicolas Denier's avatar Nicolas Denier
Browse files

add base documentation, update LibriSpeech datasets, add infos field to SpeechDataset

parent 72104edc
No related branches found
No related tags found
1 merge request!4Refactoring
Showing
with 738 additions and 107 deletions
*outputdir/
Manifest.toml
notebook-test.jl
docs/build/
image: julia:1.9
stages:
- test
- build-docs
- deploy
- build-badges
- deploy-badges
variables:
FAILED: "echo \"failed\" > .status"
PASSED: "echo \"passed\" > .status"
WRITE_ENV: "echo \"$${PREFIX}_STATUS=$$(cat .status)\" >> .env"
# Rule to run a job only on merge request on main
.only-on-merge-request:
rules:
- if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
# Run a job only when a version tag is pushed on main branch
.only-vtag-on-main:
rules:
- if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/ # ensure it corresponds to a version (dots escaped to match literal dots)
# $CI_COMMIT_BRANCH == "main"
# Run unit tests if provided on merge request
tests:
stage: test
variables:
SDL_VIDEODRIVER: "dummy"
SDL_AUDIODRIVER: "disk"
PREFIX: "TEST" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-on-merge-request
before_script:
- eval "$FAILED" # set status to failed by default
- apt update -y
- apt install -y libasound2-dev
- |
julia -e '
using Pkg
pkg"registry add https://github.com/JuliaRegistries/General"
pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
Pkg.activate(; temp = true)
Pkg.resolve()
Pkg.precompile()'
script:
- |
if [ -f test/runtests.jl ]; then
julia --project=./ -e 'using Pkg; Pkg.test()'
else
echo "[warning] no tests provided"
fi
- eval "$PASSED" # set status to passed
# Build documentation on merge request
build-docs:
stage: build-docs
extends:
- .dotenv # share job status in .env
- .only-on-merge-request
variables:
SDL_VIDEODRIVER: "dummy"
SDL_AUDIODRIVER: "disk"
PREFIX: "BUILD_DOCS" # badge prefix
before_script:
- eval "$FAILED" # set status to failed by default
- apt update -y
- apt install -y libasound2-dev
- apt clean
- |
julia --project=docs -e '
using Pkg
pkg"registry add https://github.com/JuliaRegistries/General"
pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
Pkg.develop(PackageSpec(path=pwd()))
Pkg.instantiate()'
script:
- julia --project=docs docs/make.jl
- eval "$PASSED" # set status to passed
# Deploy documentation once build-docs succeeded on new version tag
deploy-docs:
stage: deploy
variables:
PREFIX: "DEPLOY_DOCS" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-vtag-on-main
before_script:
- eval "$FAILED" # set status to failed by default
- apt update && apt install -y git
- git clone -b docs --single-branch "https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/${CI_PROJECT_NAME}.git"
- mkdir docs/build
- mv ${CI_PROJECT_NAME}/dev/* docs/build
script:
- cat docs/build/index.html
- echo "success"
- eval "$PASSED" # set status to passed
# Register a new version on new version tag
deploy-version:
stage: deploy
variables:
PREFIX: "DEPLOY_VERSION" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-vtag-on-main
before_script:
- eval "$FAILED" # set status to failed by default
- apt update && apt install -y git
- git config --global user.email "$GITLAB_USER_EMAIL"
- git config --global user.name "$GITLAB_USER_NAME"
- julia -e "using Pkg; Pkg.add(\"LocalRegistry\"); pkg\"registry add https://registry-token:${REGISTRY_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/Registry.git\""
# install release-cli
- BINARY_NAME=release-cli-linux-amd64
- curl --output /usr/local/bin/release-cli "https://gitlab.com/api/v4/projects/gitlab-org%2Frelease-cli/packages/generic/release-cli/latest/$BINARY_NAME"
- chmod +x /usr/local/bin/release-cli
- export PATH=$PATH:/usr/local/bin
script:
- julia --project=. -e 'using Pkg; Pkg.Registry.update(); using LocalRegistry; register(; registry = "PTAL")'
- eval "$PASSED" # set status to passed
release:
tag_name: '$CI_COMMIT_TAG'
description: '$CI_COMMIT_TAG'
### BADGES ###
# share .env
.dotenv:
after_script:
- eval "$WRITE_ENV"
artifacts:
reports:
dotenv: .env
# example job supporting a badge
# required lines are marked with ##
# example:
# stage: test
# variables: ##
# PREFIX: "TEST" ##
# extends: .dotenv ##
# before_script:
# - eval "$FAILED" ##
# script:
# - echo "passed"
# - eval "$PASSED" ##
# only:
# - main
# Generate a json artifact corresponding to a badge
.badge:
stage: build-badges
when: always # runs even if previous job failed
before_script:
- STATUS_VAR=${PREFIX}_STATUS
- STATUS="${!STATUS_VAR}"
# set color according to status
- |
case "$STATUS" in
"failed")
COLOR="red"
;;
"passed")
COLOR="brightgreen"
;;
*)
COLOR="grey"
;;
esac
script:
# https://shields.io/badges/endpoint-badge
- echo "{\"schemaVersion\":1, \"label\":\"$LABEL\", \"message\":\"$STATUS\", \"color\":\"$COLOR\"}" > "${PREFIX}_BADGE.json"
artifacts:
paths:
- "${PREFIX}_BADGE.json"
when: always
# Define a badge for each job
tests-badge:
extends:
- .badge
- .only-on-merge-request
variables:
PREFIX: "TEST"
LABEL: "Tests"
build-docs-badge:
extends:
- .badge
- .only-on-merge-request
variables:
PREFIX: "BUILD_DOCS"
LABEL: "Build Docs"
deploy-docs-badge:
extends:
- .badge
- .only-vtag-on-main
variables:
PREFIX: "DEPLOY_DOCS"
LABEL: "Deploy Docs"
deploy-version-badge:
extends:
- .badge
- .only-vtag-on-main
variables:
PREFIX: "DEPLOY_VERSION"
LABEL: "Deploy Version"
# Deploy badges to GitLab pages so they can be fetched with shields.io api
pages:
stage: deploy-badges
when: always
before_script:
# get all current badges if there isn't a new one
# ignore nonexistent badges (404)
- |
for PREFIX in "TEST" "BUILD_DOCS" "DEPLOY_DOCS" "DEPLOY_VERSION"
do
FILE="${PREFIX}_BADGE.json"
if [ ! -f "$FILE" ];then
URL=$(echo "$CI_PAGES_URL/$FILE" | sed "s/http/https/")
curl -o "$FILE" "$URL"
if cat "$FILE" | grep -q 404 ;then
rm "$FILE"
echo "removed $FILE"
fi
fi
done
script:
# new badges and previous ones not renewed are published to pages
- mkdir public
- mv *_BADGE.json public/
- echo "deploying badges"
artifacts:
paths:
- public
rules:
- if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/
- if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
......@@ -4,6 +4,7 @@ authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", "Simon DEVAUCHELLE <simon.d
version = "0.16.0"
[deps]
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
......@@ -11,4 +12,3 @@ MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
[compat]
JSON = "0.21"
julia = "1.10"
AudioSources = "0.3.0"
......@@ -4,7 +4,7 @@ A Julia package to download and prepare speech corpus.
## Installation
Make sure to add the [PTAL registry](https://gitlab.lisn.upsaclay.fr/ptal/registry)
Make sure to add the [PTAL registry](https://gitlab.lisn.upsaclay.fr/PTAL/Registry)
to your julia installation. Then, install the package as usual:
```
pkg> add SpeechDatasets
......
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
SpeechDatasets = "ae813453-fab8-46d9-ab8f-a64c05464021"
# Allows to connect to GitLab with HTTPS
# https://github.com/JuliaDocs/Documenter.jl/issues/2061#issuecomment-1607077792
# https://github.com/JuliaDocs/Documenter.jl/blob/master/src/deployconfig.jl
using Documenter: DeployConfig, DeployDecision, marker, env_nonempty, HTTPS
# Documenter deploy configuration for GitLab CI authenticating over HTTPS.
# Each field mirrors a predefined GitLab CI environment variable and falls
# back to the empty string when the variable is unset.
@kwdef struct GitLabHTTPS <: DeployConfig
    commit_branch::String = get(ENV, "CI_COMMIT_BRANCH", "")
    pull_request_iid::String = get(ENV, "CI_EXTERNAL_PULL_REQUEST_IID", "")
    repo_path::String = get(ENV, "CI_PROJECT_PATH", "")
    repo_slug::String = get(ENV, "CI_PROJECT_PATH_SLUG", "")
    commit_tag::String = get(ENV, "CI_COMMIT_TAG", "")
    pipeline_source::String = get(ENV, "CI_PIPELINE_SOURCE", "")
end

# Authenticate via HTTPS (token embedded in the remote URL) rather than SSH.
Documenter.authentication_method(::GitLabHTTPS) = HTTPS
# Build the clone URL with the CI bot token embedded for HTTPS authentication.
# Both the token and the server host come from the CI environment.
function Documenter.authenticated_repo_url(cfg::GitLabHTTPS)
    bot_token = get(ENV, "CI_BOT_TOKEN", "")
    server = get(ENV, "CI_SERVER_HOST", "")
    return string("https://documenter-ci:", bot_token, "@", server, "/", cfg.repo_path, ".git")
end
# Decide whether and where the documentation should be deployed for this
# GitLab CI build. The build type is derived from the CI environment in `cfg`:
#   - non-empty CI_EXTERNAL_PULL_REQUEST_IID -> :preview  (folder previews/PR<iid>)
#   - non-empty CI_COMMIT_TAG                -> :release  (folder named after the version tag)
#   - otherwise                              -> :devbranch (folder `devurl`, only when the
#                                               branch matches `devbranch`)
# Deployment additionally requires ENV["CI_BOT_TOKEN"] to be non-empty.
# Returns a `DeployDecision`.
function Documenter.deploy_folder(
    cfg::GitLabHTTPS;
    repo,
    repo_previews = repo,
    devbranch,
    push_preview,
    devurl,
    branch = "docs",
    branch_previews = branch,
    tag_prefix = "",
    kwargs...,
)
    io = IOBuffer()
    all_ok = true
    println(io, "\nGitLab config:")
    println(io, " Commit branch: \"", cfg.commit_branch, "\"")
    println(io, " Pull request IID: \"", cfg.pull_request_iid, "\"")
    println(io, " Repo slug: \"", cfg.repo_slug, "\"")
    println(io, " Commit tag: \"", cfg.commit_tag, "\"")
    println(io, " Pipeline source: \"", cfg.pipeline_source, "\"")
    build_type = if cfg.pull_request_iid != ""
        :preview
    elseif cfg.commit_tag != ""
        :release
    else
        :devbranch
    end
    println(io, "Detected build type: ", build_type)
    if build_type == :release
        # FIX: `version_tag_strip_build` is not among the names imported from
        # Documenter at the top of this file, so it must be qualified
        # explicitly — the bare name raised UndefVarError on tag builds.
        tag_nobuild = Documenter.version_tag_strip_build(cfg.commit_tag; tag_prefix)
        ## If a tag exist it should be a valid VersionNumber
        tag_ok = tag_nobuild !== nothing
        println(
            io,
            "- $(marker(tag_ok)) ENV[\"CI_COMMIT_TAG\"] contains a valid VersionNumber",
        )
        all_ok &= tag_ok
        is_preview = false
        subfolder = tag_nobuild
        deploy_branch = branch
        deploy_repo = repo
    elseif build_type == :preview
        pr_number = tryparse(Int, cfg.pull_request_iid)
        pr_ok = pr_number !== nothing
        all_ok &= pr_ok
        println(
            io,
            "- $(marker(pr_ok)) ENV[\"CI_EXTERNAL_PULL_REQUEST_IID\"]=\"$(cfg.pull_request_iid)\" is a number",
        )
        btype_ok = push_preview
        all_ok &= btype_ok
        is_preview = true
        println(
            io,
            "- $(marker(btype_ok)) `push_preview` keyword argument to deploydocs is `true`",
        )
        ## deploy to previews/PR
        subfolder = "previews/PR$(something(pr_number, 0))"
        deploy_branch = branch_previews
        deploy_repo = repo_previews
    else
        # NOTE(review): commit_tag is always empty in this branch (non-empty
        # tags take the :release path above), so the check reduces to the
        # branch comparison; kept as-is to mirror upstream Documenter logic.
        branch_ok = !isempty(cfg.commit_tag) || cfg.commit_branch == devbranch
        all_ok &= branch_ok
        println(
            io,
            "- $(marker(branch_ok)) ENV[\"CI_COMMIT_BRANCH\"] matches devbranch=\"$(devbranch)\"",
        )
        is_preview = false
        subfolder = devurl
        deploy_branch = branch
        deploy_repo = repo
    end
    key_ok = env_nonempty("CI_BOT_TOKEN")
    println(io, "- $(marker(key_ok)) ENV[\"CI_BOT_TOKEN\"] exists and is non-empty")
    all_ok &= key_ok
    print(io, "Deploying to folder $(repr(subfolder)): $(marker(all_ok))")
    @info String(take!(io))
    if all_ok
        return DeployDecision(;
            all_ok = true,
            branch = deploy_branch,
            repo = deploy_repo,
            subfolder = subfolder,
            is_preview = is_preview,
        )
    else
        return DeployDecision(; all_ok = false)
    end
end
\ No newline at end of file
push!(LOAD_PATH,"..")
using Documenter, SpeechDatasets
using Documenter.Remotes
include("deployconfig.jl")
makedocs(
sitename="SpeechDatasets",
repo = Remotes.GitLab("gitlab.lisn.upsaclay.fr", "PTAL", "Datasets/SpeechDatasets.jl"),
doctest = false,
)
# Create a single deploy configuration and reuse it, so that `devbranch` and
# the config passed to `deploydocs` come from the same snapshot of the CI
# environment (previously a second GitLabHTTPS() was constructed here).
config = GitLabHTTPS()
deploydocs(
    repo = "gitlab.lisn.upsaclay.fr/PTAL/Datasets/SpeechDatasets.jl",
    devbranch = config.commit_branch,
    branch = "docs",
    deploy_config = config,
)
\ No newline at end of file
<svg version="1.1" width="200" height="200" xmlns="http://www.w3.org/2000/svg">
<ellipse id="petal" cx="52.5" cy="100" rx="42.5" ry="30"
stroke="black" stroke-opacity="0"
fill-opacity="1" fill="#08d87b"/>
<use href="#petal" transform="rotate(45, 100, 100)"/>
<use href="#petal" transform="rotate(90, 100, 100)"/>
<use href="#petal" transform="rotate(135, 100, 100)"/>
<use href="#petal" transform="rotate(180, 100, 100)"/>
<use href="#petal" transform="rotate(225, 100, 100)"/>
<use href="#petal" transform="rotate(270, 100, 100)"/>
<use href="#petal" transform="rotate(315, 100, 100)"/>
</svg>
\ No newline at end of file
# SpeechDatasets.jl
## Contents
```@contents
Depth = 3
```
## Datasets
### AVID
"Aalto Vocal Intensity Database includes speech and EGG produced by 50 speakers (25 males, 25 females) who varied their vocal intensity in four categories (soft, normal, loud, and very loud)."
[source](https://zenodo.org/records/10524873)
```@docs
AVID(datadir::AbstractString, outputdir::AbstractString)
```
### INA Diachrony
Voice recordings and transcriptions sorted by time period, sex and speaker.
```@docs
INADIACHRONY(ina_wav_dir::AbstractString, outputdir::AbstractString, ina_csv_dir::Union{Nothing,AbstractString}=nothing)
```
### Mini LibriSpeech
"Subset of LibriSpeech corpus for purpose of regression testing."
[source](https://www.openslr.org/31/)
```@docs
MINILIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString)
```
### Multilingual LibriSpeech
"Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish."
[source](http://www.openslr.org/94)
```@docs
MLLIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString, lang::AbstractString)
```
### Speech2Tex
Recordings of read equations, literal transcriptions and latex transcriptions.
```@docs
SPEECH2TEX(datadir::AbstractString, outputdir::AbstractString)
```
### TIMIT
"The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems."
[source](https://catalog.ldc.upenn.edu/LDC93S1)
```@docs
TIMIT(timitdir::AbstractString, outputdir::AbstractString, subset::AbstractString, formantsdir::Union{Nothing,AbstractString})
```
## Index
```@index
```
\ No newline at end of file
# Installation
This package is part of the PTAL tool collection and requires the
[PTAL registry](https://gitlab.lisn.upsaclay.fr/ptal/registry) to be installed.
To add this registry to your Julia installation type `]` to enter the
package mode of the REPL and then type:
```
pkg> registry add "https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
```
Once the registry has been added, SpeechDatasets can be installed with the
Julia package manager by typing in Pkg REPL mode
```
pkg> add SpeechDatasets
```
......@@ -3,8 +3,9 @@
module SpeechDatasets
using JSON
import AudioSources
using SpeechFeatures
import MLUtils
using AudioSources
export
# ManifestItem
......@@ -17,13 +18,13 @@ export
readmanifest,
# Corpora interface
download,
lang,
name,
prepare,
# download,
# lang,
# name,
# prepare,
# Corpora
MultilingualLibriSpeech,
MLLIBRISPEECH,
MINILIBRISPEECH,
TIMIT,
INADIACHRONY,
......@@ -36,7 +37,8 @@ export
MFAFRDICT,
# Dataset
dataset
SpeechDatasetInfos,
SpeechDataset
include("speechcorpus.jl")
include("manifest_item.jl")
......
......@@ -130,11 +130,21 @@ function avid_prepare(datadir, outputdir)
end
function AVID(datadir, outputdir)
"""
AVID(datadir::AbstractString, outputdir::AbstractString)
Extract metadata and paths from AVID dataset.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `calibration_tones.jsonl` containing informations about calibration tones
- `annotations.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function AVID(datadir::AbstractString, outputdir::AbstractString)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "calibration_tones.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
avid_prepare(datadir, outputdir)
end
dataset(outputdir, "")
infos = SpeechDatasetInfos("AVID")
SpeechDataset(infos, outputdir, "")
end
[
{
"name": "AVID",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://zenodo.org/records/10524873",
"authors": ["Manila Kodali", "Paavo Alku", "Sudarsana Reddy Kadiri"],
"description": "Aalto Vocal Intensity Database includes speech and EGG produced by 50 speakers (25 males, 25 females) who varied their vocal intensity in four categories (soft, normal, loud, and very loud)."
},
{
"name": "INA Diachrony",
"lang": "fra",
"license": "proprietary",
"description": "Voice recordings and transcriptions sorted by time period, sex and speaker."
},
{
"name": "Mini LibriSpeech",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://www.openslr.org/31/",
"authors": ["Vassil Panayotov", "Daniel Povey"],
"description": "Subset of LibriSpeech corpus for purpose of regression testing."
},
{
"name": "Multilingual LibriSpeech",
"lang": ["eng", "fra", "prt", "esp", "deu", "eng", "nld", "ita", "pol"],
"license": "CC BY 4.0",
"source": "http://www.openslr.org/94",
"authors": ["Vineel Pratap", "Qiantong Xu", "Anuroop Sriram", "Gabriel Synnaeve", "Ronan Collobert"],
"description": "Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish"
},
{
"name": "TIMIT",
"lang": "eng",
"license": "LDC User Agreement for Non-Members",
"source": "https://catalog.ldc.upenn.edu/LDC93S1",
"authors": ["John S. Garofolo", "Lori F. Lamel", "William M. Fisher", "Jonathan G. Fiscus", "David S. Pallett", "Nancy L. Dahlgren", "Victor Zue"],
"description": "The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems."
},
{
"name": "Speech2Tex",
"lang": "fra",
"license": "proprietary",
"authors": ["Lorenzo Brucato"],
"description": "Recordings of read equations, literal transcriptions and latex transcriptions."
}
]
\ No newline at end of file
......@@ -151,10 +151,19 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end
end
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
"""
INADIACHRONY(ina_wav_dir::AbstractString, outputdir::AbstractString, ina_csv_dir::Union{Nothing,AbstractString}=nothing)
Extract metadata and paths from INA Diachrony dataset.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function INADIACHRONY(ina_wav_dir::AbstractString, outputdir::AbstractString, ina_csv_dir::Union{Nothing,AbstractString}=nothing)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end
dataset(outputdir, "")
infos = SpeechDatasetInfos("INA Diachrony")
SpeechDataset(infos, outputdir, "")
end
......@@ -31,7 +31,7 @@ function minils_recordings(dir, subset)
id = replace(basename(path), ".flac" => "")
r = Recording(
id,
CmdAudioSource(`sox $path -t wav -`);
AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1],
samplerate = 16000
)
......@@ -85,14 +85,16 @@ function minils_download(dir)
@debug "dataset in $dir"
end
function minils_prepare(dir)
function minils_prepare(inputdir, outputdir)
outputdir = mkpath(outputdir)
# 1. Recording manifest.
out = joinpath(dir, "recordings.jsonl")
out = joinpath(outputdir, "recordings.jsonl")
if ! isfile(out)
open(out, "w") do f
open(out, "a") do f
for subset in ["train", "dev"]
@debug "preparing recording manifest ($subset) $out"
recs = minils_recordings(dir, subset)
recs = minils_recordings(inputdir, subset)
writemanifest(f, recs)
end
end
......@@ -100,10 +102,10 @@ function minils_prepare(dir)
# 2. Annotation manifests.
for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
out = joinpath(dir, "annotations-$name.jsonl")
out = joinpath(outputdir, "annotations-$name.jsonl")
if ! isfile(out)
@debug "preparing annotation manifest ($subset) $out"
sups = minils_annotations(dir, subset)
sups = minils_annotations(inputdir, subset)
open(out, "w") do f
writemanifest(f, sups)
end
......@@ -112,9 +114,28 @@ function minils_prepare(dir)
end
function MINILIBRISPEECH(dir, subset)
minils_download(dir)
minils_prepare(dir)
dataset(dir, subset)
"""
MINILIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString)
Extract metadata and paths from Mini LibriSpeech dataset.\n
`subset` must be one of ["train", "dev"]\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function MINILIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString)
# download if no datadir
if ! isdir(datadir)
minils_download(datadir)
end
# prepare if not already
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations-train.jsonl")) &&
isfile(joinpath(outputdir, "annotations-dev.jsonl")) &&
isfile(joinpath(outputdir, "annotations-test.jsonl")))
minils_prepare(datadir, outputdir)
end
infos = SpeechDatasetInfos("Mini LibriSpeech")
SpeechDataset(infos, outputdir, subset)
end
# SPDX-License-Identifier: CECILL-C
struct MultilingualLibriSpeech <: SpeechCorpus
lang
name
function MultilingualLibriSpeech(lang)
new(lang, "multilingual_librispeech")
end
end
const MLS_LANG_CODE = Dict(
"deu" => "german",
"eng" => "english",
......@@ -42,8 +33,21 @@ const MLS_LM_URLS = Dict(
"prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz"
)
function Base.download(corpus::MultilingualLibriSpeech, outdir)
dir = path(corpus, outdir)
# Language codes for which an MLS corpus name is known.
const supported_lang = collect(keys(MLS_LANG_CODE))

# Descriptor for one language of the Multilingual LibriSpeech corpus.
struct MultilingualLibriSpeech <: SpeechCorpus
    lang::AbstractString
    name::AbstractString
end

# Validating outer constructor: reject language codes without an MLS mapping.
function MultilingualLibriSpeech(lang)
    lang in supported_lang ||
        throw(ArgumentError("lang must be one of $supported_lang"))
    return MultilingualLibriSpeech(lang, "multilingual_librispeech")
end
function Base.download(corpus::MultilingualLibriSpeech, dir)
donefile = joinpath(dir, ".download.done")
if ! isfile(donefile)
run(`mkdir -p $dir`)
......@@ -68,8 +72,8 @@ function Base.download(corpus::MultilingualLibriSpeech, outdir)
corpus
end
function recordings(corpus::MultilingualLibriSpeech, dir, subset)
subsetdir = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio")
function mlls_recordings(corpus::MultilingualLibriSpeech, inputdir, subset)
subsetdir = joinpath(inputdir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio")
recs = Dict()
for d1 in readdir(subsetdir; join = true)
......@@ -78,7 +82,7 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
id = replace(basename(path), ".flac" => "")
r = Recording(
id,
CmdAudioSource(`sox $path -t wav -`);
AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1],
samplerate = 16000
)
......@@ -89,13 +93,13 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
recs
end
function annotations(corpus::MultilingualLibriSpeech, dir, subset)
trans = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt")
function mlls_annotations(corpus::MultilingualLibriSpeech, inputdir, subset)
trans = joinpath(inputdir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt")
sups = Dict()
open(trans, "r") do f
for line in eachline(f)
tokens = split(line)
s = Annotation(tokens[1], tokens[1]; channel = 1,
s = Annotation(tokens[1], tokens[1]; channels = [1],
data = Dict("text" => join(tokens[2:end], " ")))
sups[s.id] = s
end
......@@ -103,16 +107,17 @@ function annotations(corpus::MultilingualLibriSpeech, dir, subset)
sups
end
function prepare(corpus::MultilingualLibriSpeech, outdir)
dir = path(corpus, outdir)
function mlls_prepare(corpus::MultilingualLibriSpeech, inputdir, outputdir)
outputdir = mkpath(outputdir)
# 1. Recording manifests.
for subset in ["train", "dev", "test"]
out = joinpath(dir, "recording-manifest-$subset.jsonl")
@info "preparing recording manifest ($subset) $out"
if ! isfile(out)
recs = recordings(corpus, dir, subset)
open(out, "w") do f
out = joinpath(outputdir, "recordings.jsonl")
@info "preparing recording manifest $out"
if ! isfile(out)
open(out, "a") do f
for subset in ["train", "dev", "test"]
recs = mlls_recordings(corpus, inputdir, subset)
writemanifest(f, recs)
end
end
......@@ -120,10 +125,10 @@ function prepare(corpus::MultilingualLibriSpeech, outdir)
# 2. Annotation manifests.
for subset in ["train", "dev", "test"]
out = joinpath(dir, "annotation-manifest-$subset.jsonl")
out = joinpath(outputdir, "annotations-$subset.jsonl")
@info "preparing annotation manifest ($subset) $out"
if ! isfile(out)
sups = annotations(corpus, dir, subset)
sups = mlls_annotations(corpus, inputdir, subset)
open(out, "w") do f
writemanifest(f, sups)
end
......@@ -133,3 +138,32 @@ function prepare(corpus::MultilingualLibriSpeech, outdir)
corpus
end
"""
MLLIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString, lang::AbstractString)
Extract metadata and paths from Multilingual LibriSpeech dataset.\n
# Arguments
- `datadir` is the name of dataset directory. If the directory does not exists, it is created and the data is downloaded.\n
- `outputdir` is the output directory for manifest files.
- `subset` must be one of ["train", "dev", "test"].\n
- `lang` is the language, must be one of $supported_lang.\n
Create the `outputdir` folder, with:
- `recordings-<subset>.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function MLLIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString, lang::AbstractString)
mlls = MultilingualLibriSpeech(lang)
# download if no datadir
if ! isdir(datadir)
download(mlls, datadir)
end
# prepare if not already
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations-train.jsonl")) &&
isfile(joinpath(outputdir, "annotations-dev.jsonl")) &&
isfile(joinpath(outputdir, "annotations-test.jsonl")))
mlls_prepare(mlls, datadir, outputdir)
end
infos = SpeechDatasetInfos("Multilingual LibriSpeech")
SpeechDataset(infos, outputdir, subset)
end
\ No newline at end of file
......@@ -114,10 +114,19 @@ function speech2tex_prepare(datadir, outputdir)
end
function SPEECH2TEX(datadir, outputdir)
"""
SPEECH2TEX(datadir::AbstractString, outputdir::AbstractString
Extract metadata and paths from Speech2Tex dataset.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function SPEECH2TEX(datadir::AbstractString, outputdir::AbstractString)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
speech2tex_prepare(datadir, outputdir)
end
dataset(outputdir, "")
infos = SpeechDatasetInfos("Speech2Tex")
SpeechDataset(infos, outputdir, "")
end
......@@ -11,7 +11,7 @@ const TIMIT_SUBSETS = Dict(
const TIMIT_DEV_SPK_LIST = Set([
"faks0",
"faks0",
"fdac1",
"fjem0",
"mgwt0",
......@@ -318,7 +318,7 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
id = "timit_$(spk)_$(name)"
audio_src = if fmt == "SPHERE"
CmdAudioSource(`sph2pipe -f wav $path`)
AudioSources.CmdAudioSource(`sph2pipe -f wav $path`)
else
FileAudioSource(path)
end
......@@ -423,14 +423,23 @@ function timit_annotations(dir, formantsdir=nothing)
annotations
end
function TIMIT(timitdir, dir, subset, formantsdir=nothing)
if ! (isfile(joinpath(dir, "recordings.jsonl")) &&
isfile(joinpath(dir, "annotations-train.jsonl")) &&
isfile(joinpath(dir, "annotations-dev.jsonl")) &&
isfile(joinpath(dir, "annotations-test.jsonl")))
timit_prepare(timitdir, dir, formantsdir)
"""
TIMIT(timitdir::AbstractString, outputdir::AbstractString, subset::AbstractString, formantsdir::Union{Nothing,AbstractString}=nothing)
Extract metadata and paths from TIMIT dataset.\n
`subset` must be one of ["train", "dev", "test"]\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function TIMIT(timitdir::AbstractString, outputdir::AbstractString, subset::AbstractString, formantsdir::Union{Nothing,AbstractString}=nothing)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations-train.jsonl")) &&
isfile(joinpath(outputdir, "annotations-dev.jsonl")) &&
isfile(joinpath(outputdir, "annotations-test.jsonl")))
timit_prepare(timitdir, outputdir, formantsdir)
end
dataset(dir, subset)
infos = SpeechDatasetInfos("TIMIT")
SpeechDataset(infos, outputdir, subset)
end
# SPDX-License-Identifier: CECILL-C
# Bundled metadata file describing the known corpora.
const corpora_file = joinpath(@__DIR__, "corpora", "corpora.json")

"""
    SpeechDatasetInfos(; name = "", lang = "", license = "", source = "", authors = AbstractString[], description = "")

Metadata describing a speech corpus. Every field defaults to an empty value.
"""
@kwdef struct SpeechDatasetInfos
    name::AbstractString = ""
    lang::Union{AbstractString, Vector{AbstractString}} = ""
    license::AbstractString = ""
    source::AbstractString = ""
    # Typed empty default instead of `[]` (which would be a Vector{Any}).
    authors::Vector{AbstractString} = AbstractString[]
    description::AbstractString = ""
end

"""
    SpeechDatasetInfos(infos::AbstractDict)

Build a `SpeechDatasetInfos` from a dictionary (e.g. a parsed JSON entry),
keeping the struct defaults for any field missing from `infos`.
"""
function SpeechDatasetInfos(infos::AbstractDict)
    kwargs = NamedTuple()
    for key in fieldnames(SpeechDatasetInfos)
        val = get(infos, String(key), nothing)
        # merge new (key=val) if key was found
        kwargs = !isnothing(val) ? (; kwargs..., key => val) : kwargs
    end
    # FIX: splat as keywords. The previous positional splat
    # `SpeechDatasetInfos(kwargs...)` required every field to be present and
    # in declaration order, raising a MethodError for partial entries
    # (e.g. corpora without "source"/"authors").
    SpeechDatasetInfos(; kwargs...)
end

"""
    SpeechDatasetInfos(name::AbstractString)

Look up the corpus called `name` in the bundled `corpora.json` and return its
`SpeechDatasetInfos`. Throws `BoundsError` when the name is unknown.
"""
function SpeechDatasetInfos(name::AbstractString)
    corpora_infos = JSON.parsefile(corpora_file)
    infos = filter(x -> x["name"]==name, corpora_infos)[1]
    SpeechDatasetInfos(infos)
end
# A speech dataset: corpus metadata plus the annotation/recording manifests.
# `idxs` fixes the iteration order over the manifest ids; annotations and
# recordings are keyed by the same ids.
struct SpeechDataset <: MLUtils.AbstractDataContainer
    infos::SpeechDatasetInfos
    idxs::Vector{AbstractString}
    annotations::Dict{AbstractString, Annotation}
    recordings::Dict{AbstractString, Recording}
end
"""
dataset(manifestroot)
Load `SpeechDataset` from manifest files stored in `manifestroot`.
Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.
See also [`Annotation`](@ref).
# Convenience constructor: derive the index vector from the annotation keys.
SpeechDataset(infos::SpeechDatasetInfos, annotations::Dict{AbstractString, Annotation}, recordings::Dict{AbstractString, Recording}) =
    SpeechDataset(infos, collect(keys(annotations)), annotations, recordings)
# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
"text" => "Annotation text here"
)
)
```
"""
function dataset(manifestroot::AbstractString, partition)
function SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, partition::AbstractString)
partition_name = partition == "" ? "" : "-$(partition)"
annot_path = joinpath(manifestroot, "annotations$(partition_name).jsonl")
rec_path = joinpath(manifestroot, "recordings.jsonl")
annotations = load(Annotation, annot_path)
recordings = load(Recording, rec_path)
dataset(annotations, recordings)
end
function dataset(annotations::AbstractDict, recordings::AbstractDict)
idxs = collect(keys(annotations))
SpeechDataset(idxs, annotations, recordings)
annotations = load_manifest(Annotation, annot_path)
recordings = load_manifest(Recording, rec_path)
SpeechDataset(infos, annotations, recordings)
end
# Index by manifest id: return the (recording, annotation) pair.
Base.getindex(d::SpeechDataset, key::AbstractString) = d.recordings[key], d.annotations[key]
# Integer indexing resolves the id through the ordered `idxs` vector.
Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx])
# Base.Fix1 -> partial function with the 1st argument fixed (to `d`)
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)
Base.length(d::SpeechDataset) = length(d.idxs)
......@@ -65,6 +69,6 @@ function Base.filter(fn, d::SpeechDataset)
k idset
end
SpeechDataset(fidxs, fannotations, frecs)
SpeechDataset(d.infos, fidxs, fannotations, frecs)
end
......@@ -3,7 +3,7 @@
#=====================================================================#
# JSON serialization of a manifest item
function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource)
function Base.show(io::IO, m::MIME"application/json", s::AudioSources.FileAudioSource)
compact = get(io, :compact, false)
indent = get(io, :indent, 0)
printfn = compact ? print : println
......@@ -13,7 +13,7 @@ function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource)
print(io, repeat(" ", indent), "}")
end
function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource)
function Base.show(io::IO, m::MIME"application/json", s::AudioSources.URLAudioSource)
compact = get(io, :compact, false)
indent = get(io, :indent, 0)
printfn = compact ? print : println
......@@ -23,7 +23,7 @@ function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource)
print(io, repeat(" ", indent), "}")
end
function Base.show(io::IO, m::MIME"application/json", s::CmdAudioSource)
function Base.show(io::IO, m::MIME"application/json", s::AudioSources.CmdAudioSource)
compact = get(io, :compact, false)
indent = get(io, :indent, 0)
printfn = compact ? print : println
......@@ -78,11 +78,11 @@ end
function AudioSource(d::Dict)
if d["type"] == "path"
T = FileAudioSource
T = AudioSources.FileAudioSource
elseif d["type"] == "url"
T = URLAudioSource
T = AudioSources.URLAudioSource
elseif d["type"] == "cmd"
T = CmdAudioSource
T = AudioSources.CmdAudioSource
else
throw(ArgumentError("invalid type: $(d["type"])"))
end
......@@ -116,7 +116,7 @@ function writemanifest(io::IO, manifest::Dict)
end
function readmanifest(io::IO, T)
manifest = Dict()
manifest = Dict{AbstractString, T}()
for line in eachline(io)
item = JSON.parse(line) |> T
manifest[item.id] = item
......@@ -129,12 +129,12 @@ manifestname(::Type{<:Recording}, name) = "recordings.jsonl"
manifestname(::Type{<:Annotation}, name) = "annotations-$name.jsonl"
"""
load(Annotation, path)
load(Recording, path)
load_manifest(Annotation, path)
load_manifest(Recording, path)
Load Recording/Annotation manifest from `path`.
"""
load(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r")
load_manifest(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r")
function checkdir(dir::AbstractString)
isdir(dir) || throw(ArgumentError("$dir is not an existing directory"))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment