Skip to content
Snippets Groups Projects
Commit a9348442 authored by Nicolas Denier's avatar Nicolas Denier
Browse files

add base documentation, update LibriSpeech datasets, add infos field to SpeechDataset

parent 72104edc
No related branches found
No related tags found
1 merge request!4Refactoring
Showing
with 738 additions and 107 deletions
*outputdir/
Manifest.toml
notebook-test.jl
docs/build/
image: julia:1.9
stages:
- test
- build-docs
- deploy
- build-badges
- deploy-badges
variables:
FAILED: "echo \"failed\" > .status"
PASSED: "echo \"passed\" > .status"
WRITE_ENV: "echo \"$${PREFIX}_STATUS=$$(cat .status)\" >> .env"
# Rule to run a job only on merge request on main
.only-on-merge-request:
rules:
- if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
# Run a job only when a version tag is pushed on main branch
.only-vtag-on-main:
rules:
- if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/ # ensure it corresponds to a version (dots escaped to match literal dots)
# $CI_COMMIT_BRANCH == "main"
# Run unit tests if provided on merge request
tests:
stage: test
variables:
SDL_VIDEODRIVER: "dummy"
SDL_AUDIODRIVER: "disk"
PREFIX: "TEST" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-on-merge-request
before_script:
- eval "$FAILED" # set status to failed by default
- apt update -y
- apt install -y libasound2-dev
- |
julia -e '
using Pkg
pkg"registry add https://github.com/JuliaRegistries/General"
pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
Pkg.activate(; temp = true)
Pkg.resolve()
Pkg.precompile()'
script:
- |
if [ -f test/runtests.jl ]; then
julia --project=./ -e 'using Pkg; Pkg.test()'
else
echo "[warning] no tests provided"
fi
- eval "$PASSED" # set status to passed
# Build documentation on merge request
build-docs:
stage: build-docs
extends:
- .dotenv # share job status in .env
- .only-on-merge-request
variables:
SDL_VIDEODRIVER: "dummy"
SDL_AUDIODRIVER: "disk"
PREFIX: "BUILD_DOCS" # badge prefix
before_script:
- eval "$FAILED" # set status to failed by default
- apt update -y
- apt install -y libasound2-dev
- apt clean
- |
julia --project=docs -e '
using Pkg
pkg"registry add https://github.com/JuliaRegistries/General"
pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
Pkg.develop(PackageSpec(path=pwd()))
Pkg.instantiate()'
script:
- julia --project=docs docs/make.jl
- eval "$PASSED" # set status to passed
# Deploy documentation once build-docs succeeded on new version tag
deploy-docs:
stage: deploy
variables:
PREFIX: "DEPLOY_DOCS" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-vtag-on-main
before_script:
- eval "$FAILED" # set status to failed by default
- apt update && apt install -y git
- git clone -b docs --single-branch "https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/${CI_PROJECT_NAME}.git"
- mkdir docs/build
- mv ${CI_PROJECT_NAME}/dev/* docs/build
script:
- cat docs/build/index.html
- echo "success"
- eval "$PASSED" # set status to passed
# Register a new version on new version tag
deploy-version:
stage: deploy
variables:
PREFIX: "DEPLOY_VERSION" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-vtag-on-main
before_script:
- eval "$FAILED" # set status to failed by default
- apt update && apt install -y git
- git config --global user.email "$GITLAB_USER_EMAIL"
- git config --global user.name "$GITLAB_USER_NAME"
- julia -e "using Pkg; Pkg.add(\"LocalRegistry\"); pkg\"registry add https://registry-token:${REGISTRY_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/Registry.git\""
# install release-cli
- BINARY_NAME=release-cli-linux-amd64
- curl --output /usr/local/bin/release-cli "https://gitlab.com/api/v4/projects/gitlab-org%2Frelease-cli/packages/generic/release-cli/latest/$BINARY_NAME"
- chmod +x /usr/local/bin/release-cli
- export PATH=$PATH:/usr/local/bin
script:
- julia --project=. -e 'using Pkg; Pkg.Registry.update(); using LocalRegistry; register(; registry = "PTAL")'
- eval "$PASSED" # set status to passed
release:
tag_name: '$CI_COMMIT_TAG'
description: '$CI_COMMIT_TAG'
### BADGES ###
# share .env
.dotenv:
after_script:
- eval "$WRITE_ENV"
artifacts:
reports:
dotenv: .env
# example job supporting a badge
# required lines are marked with ##
# example:
# stage: test
# variables: ##
# PREFIX: "TEST" ##
# extends: .dotenv ##
# before_script:
# - eval "$FAILED" ##
# script:
# - echo "passed"
# - eval "$PASSED" ##
# only:
# - main
# Generate a json artifact corresponding to a badge
.badge:
stage: build-badges
when: always # runs even if previous job failed
before_script:
- STATUS_VAR=${PREFIX}_STATUS
- STATUS="${!STATUS_VAR}"
# set color according to status
- |
case "$STATUS" in
"failed")
COLOR="red"
;;
"passed")
COLOR="brightgreen"
;;
*)
COLOR="grey"
;;
esac
script:
# https://shields.io/badges/endpoint-badge
- echo "{\"schemaVersion\":1, \"label\":\"$LABEL\", \"message\":\"$STATUS\", \"color\":\"$COLOR\"}" > "${PREFIX}_BADGE.json"
artifacts:
paths:
- "${PREFIX}_BADGE.json"
when: always
# Define a badge for each job
tests-badge:
extends:
- .badge
- .only-on-merge-request
variables:
PREFIX: "TEST"
LABEL: "Tests"
build-docs-badge:
extends:
- .badge
- .only-on-merge-request
variables:
PREFIX: "BUILD_DOCS"
LABEL: "Build Docs"
deploy-docs-badge:
extends:
- .badge
- .only-vtag-on-main
variables:
PREFIX: "DEPLOY_DOCS"
LABEL: "Deploy Docs"
deploy-version-badge:
extends:
- .badge
- .only-vtag-on-main
variables:
PREFIX: "DEPLOY_VERSION"
LABEL: "Deploy Version"
# Deploy badges to GitLab pages so they can be fetched with shields.io api
pages:
stage: deploy-badges
when: always
before_script:
# get all current badges if there isn't a new one
# ignore nonexistent badges (404)
- |
for PREFIX in "TEST" "BUILD_DOCS" "DEPLOY_DOCS" "DEPLOY_VERSION"
do
FILE="${PREFIX}_BADGE.json"
if [ ! -f "$FILE" ];then
URL=$(echo "$CI_PAGES_URL/$FILE" | sed "s/http/https/")
curl -o "$FILE" "$URL"
if cat "$FILE" | grep -q 404 ;then
rm "$FILE"
echo "removed $FILE"
fi
fi
done
script:
# new badges and previous ones not renewed are published to pages
- mkdir public
- mv *_BADGE.json public/
- echo "deploying badges"
artifacts:
paths:
- public
rules:
- if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/
- if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
......@@ -4,6 +4,7 @@ authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", "Simon DEVAUCHELLE <simon.d
version = "0.16.0"
[deps]
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
......@@ -11,4 +12,3 @@ MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
[compat]
JSON = "0.21"
julia = "1.10"
AudioSources = "0.3.0"
......@@ -4,7 +4,7 @@ A Julia package to download and prepare speech corpus.
## Installation
Make sure to add the [PTAL registry](https://gitlab.lisn.upsaclay.fr/ptal/registry)
Make sure to add the [PTAL registry](https://gitlab.lisn.upsaclay.fr/PTAL/Registry)
to your julia installation. Then, install the package as usual:
```
pkg> add SpeechDatasets
......
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
SpeechDatasets = "ae813453-fab8-46d9-ab8f-a64c05464021"
# Allows to connect to GitLab with HTTPS
# https://github.com/JuliaDocs/Documenter.jl/issues/2061#issuecomment-1607077792
# https://github.com/JuliaDocs/Documenter.jl/blob/master/src/deployconfig.jl
using Documenter: DeployConfig, DeployDecision, marker, env_nonempty, HTTPS
# Documenter deploy configuration for GitLab CI authenticating over HTTPS.
# Each field mirrors a predefined GitLab CI environment variable and falls
# back to the empty string when the variable is unset.
@kwdef struct GitLabHTTPS <: DeployConfig
    commit_branch::String = get(ENV, "CI_COMMIT_BRANCH", "")
    pull_request_iid::String = get(ENV, "CI_EXTERNAL_PULL_REQUEST_IID", "")
    repo_path::String = get(ENV, "CI_PROJECT_PATH", "")
    repo_slug::String = get(ENV, "CI_PROJECT_PATH_SLUG", "")
    commit_tag::String = get(ENV, "CI_COMMIT_TAG", "")
    pipeline_source::String = get(ENV, "CI_PIPELINE_SOURCE", "")
end

# Authenticate via HTTPS (token embedded in the remote URL) rather than SSH.
Documenter.authentication_method(::GitLabHTTPS) = HTTPS
# Build the clone URL with the CI bot token embedded for HTTPS authentication.
# Both the token and the server host come from the CI environment.
function Documenter.authenticated_repo_url(cfg::GitLabHTTPS)
    bot_token = get(ENV, "CI_BOT_TOKEN", "")
    server = get(ENV, "CI_SERVER_HOST", "")
    return string("https://documenter-ci:", bot_token, "@", server, "/", cfg.repo_path, ".git")
end
# Decide whether and where the documentation should be deployed for this
# GitLab CI build. The build type is derived from the CI environment in `cfg`:
#   - non-empty CI_EXTERNAL_PULL_REQUEST_IID -> :preview  (folder previews/PR<iid>)
#   - non-empty CI_COMMIT_TAG                -> :release  (folder named after the version tag)
#   - otherwise                              -> :devbranch (folder `devurl`, only when the
#                                               branch matches `devbranch`)
# Deployment additionally requires ENV["CI_BOT_TOKEN"] to be non-empty.
# Returns a `DeployDecision`.
function Documenter.deploy_folder(
    cfg::GitLabHTTPS;
    repo,
    repo_previews = repo,
    devbranch,
    push_preview,
    devurl,
    branch = "docs",
    branch_previews = branch,
    tag_prefix = "",
    kwargs...,
)
    io = IOBuffer()
    all_ok = true
    println(io, "\nGitLab config:")
    println(io, " Commit branch: \"", cfg.commit_branch, "\"")
    println(io, " Pull request IID: \"", cfg.pull_request_iid, "\"")
    println(io, " Repo slug: \"", cfg.repo_slug, "\"")
    println(io, " Commit tag: \"", cfg.commit_tag, "\"")
    println(io, " Pipeline source: \"", cfg.pipeline_source, "\"")
    build_type = if cfg.pull_request_iid != ""
        :preview
    elseif cfg.commit_tag != ""
        :release
    else
        :devbranch
    end
    println(io, "Detected build type: ", build_type)
    if build_type == :release
        # FIX: `version_tag_strip_build` is not among the names imported from
        # Documenter at the top of this file, so it must be qualified
        # explicitly — the bare name raised UndefVarError on tag builds.
        tag_nobuild = Documenter.version_tag_strip_build(cfg.commit_tag; tag_prefix)
        ## If a tag exist it should be a valid VersionNumber
        tag_ok = tag_nobuild !== nothing
        println(
            io,
            "- $(marker(tag_ok)) ENV[\"CI_COMMIT_TAG\"] contains a valid VersionNumber",
        )
        all_ok &= tag_ok
        is_preview = false
        subfolder = tag_nobuild
        deploy_branch = branch
        deploy_repo = repo
    elseif build_type == :preview
        pr_number = tryparse(Int, cfg.pull_request_iid)
        pr_ok = pr_number !== nothing
        all_ok &= pr_ok
        println(
            io,
            "- $(marker(pr_ok)) ENV[\"CI_EXTERNAL_PULL_REQUEST_IID\"]=\"$(cfg.pull_request_iid)\" is a number",
        )
        btype_ok = push_preview
        all_ok &= btype_ok
        is_preview = true
        println(
            io,
            "- $(marker(btype_ok)) `push_preview` keyword argument to deploydocs is `true`",
        )
        ## deploy to previews/PR
        subfolder = "previews/PR$(something(pr_number, 0))"
        deploy_branch = branch_previews
        deploy_repo = repo_previews
    else
        # NOTE(review): commit_tag is always empty in this branch (non-empty
        # tags take the :release path above), so the check reduces to the
        # branch comparison; kept as-is to mirror upstream Documenter logic.
        branch_ok = !isempty(cfg.commit_tag) || cfg.commit_branch == devbranch
        all_ok &= branch_ok
        println(
            io,
            "- $(marker(branch_ok)) ENV[\"CI_COMMIT_BRANCH\"] matches devbranch=\"$(devbranch)\"",
        )
        is_preview = false
        subfolder = devurl
        deploy_branch = branch
        deploy_repo = repo
    end
    key_ok = env_nonempty("CI_BOT_TOKEN")
    println(io, "- $(marker(key_ok)) ENV[\"CI_BOT_TOKEN\"] exists and is non-empty")
    all_ok &= key_ok
    print(io, "Deploying to folder $(repr(subfolder)): $(marker(all_ok))")
    @info String(take!(io))
    if all_ok
        return DeployDecision(;
            all_ok = true,
            branch = deploy_branch,
            repo = deploy_repo,
            subfolder = subfolder,
            is_preview = is_preview,
        )
    else
        return DeployDecision(; all_ok = false)
    end
end
\ No newline at end of file
push!(LOAD_PATH,"..")
using Documenter, SpeechDatasets
using Documenter.Remotes
include("deployconfig.jl")
makedocs(
sitename="SpeechDatasets",
repo = Remotes.GitLab("gitlab.lisn.upsaclay.fr", "PTAL", "Datasets/SpeechDatasets.jl"),
doctest = false,
)
# Create a single deploy configuration and reuse it, so that `devbranch` and
# the config passed to `deploydocs` come from the same snapshot of the CI
# environment (previously a second GitLabHTTPS() was constructed here).
config = GitLabHTTPS()
deploydocs(
    repo = "gitlab.lisn.upsaclay.fr/PTAL/Datasets/SpeechDatasets.jl",
    devbranch = config.commit_branch,
    branch = "docs",
    deploy_config = config,
)
\ No newline at end of file
<svg version="1.1" width="200" height="200" xmlns="http://www.w3.org/2000/svg">
<ellipse id="petal" cx="52.5" cy="100" rx="42.5" ry="30"
stroke="black" stroke-opacity="0"
fill-opacity="1" fill="#08d87b"/>
<use href="#petal" transform="rotate(45, 100, 100)"/>
<use href="#petal" transform="rotate(90, 100, 100)"/>
<use href="#petal" transform="rotate(135, 100, 100)"/>
<use href="#petal" transform="rotate(180, 100, 100)"/>
<use href="#petal" transform="rotate(225, 100, 100)"/>
<use href="#petal" transform="rotate(270, 100, 100)"/>
<use href="#petal" transform="rotate(315, 100, 100)"/>
</svg>
\ No newline at end of file
# SpeechDatasets.jl
## Contents
```@contents
Depth = 3
```
## Datasets
### AVID
"Aalto Vocal Intensity Database includes speech and EGG produced by 50 speakers (25 males, 25 females) who varied their vocal intensity in four categories (soft, normal, loud, and very loud)."
[source](https://zenodo.org/records/10524873)
```@docs
AVID(datadir::AbstractString, outputdir::AbstractString)
```
### INA Diachrony
Voice recordings and transcriptions sorted by time period, sex and speaker.
```@docs
INADIACHRONY(ina_wav_dir::AbstractString, outputdir::AbstractString, ina_csv_dir::Union{Nothing,AbstractString}=nothing)
```
### Mini LibriSpeech
"Subset of LibriSpeech corpus for purpose of regression testing."
[source](https://www.openslr.org/31/)
```@docs
MINILIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString)
```
### Multilingual LibriSpeech
"Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish."
[source](http://www.openslr.org/94)
```@docs
MLLIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString, lang::AbstractString)
```
### Speech2Tex
Recordings of read equations, literal transcriptions and latex transcriptions.
```@docs
SPEECH2TEX(datadir::AbstractString, outputdir::AbstractString)
```
### TIMIT
"The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems."
[source](https://catalog.ldc.upenn.edu/LDC93S1)
```@docs
TIMIT(timitdir::AbstractString, outputdir::AbstractString, subset::AbstractString, formantsdir::Union{Nothing,AbstractString})
```
## Index
```@index
```
\ No newline at end of file
# Installation
This package is part of the PTAL tool collection and requires the
[PTAL registry](https://gitlab.lisn.upsaclay.fr/ptal/registry) to be installed.
To add this registry to your Julia installation type `]` to enter the
package mode of the REPL and then type:
```
pkg> registry add "https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
```
Once the registry has been added, SpeechDatasets can be installed with the
Julia package manager by typing in Pkg REPL mode
```
pkg> add SpeechDatasets
```
......@@ -3,8 +3,9 @@
module SpeechDatasets
using JSON
import AudioSources
using SpeechFeatures
import MLUtils
using AudioSources
export
# ManifestItem
......@@ -17,13 +18,13 @@ export
readmanifest,
# Corpora interface
download,
lang,
name,
prepare,
# download,
# lang,
# name,
# prepare,
# Corpora
MultilingualLibriSpeech,
MLLIBRISPEECH,
MINILIBRISPEECH,
TIMIT,
INADIACHRONY,
......@@ -36,7 +37,8 @@ export
MFAFRDICT,
# Dataset
dataset
SpeechDatasetInfos,
SpeechDataset
include("speechcorpus.jl")
include("manifest_item.jl")
......
......@@ -130,11 +130,21 @@ function avid_prepare(datadir, outputdir)
end
function AVID(datadir, outputdir)
"""
AVID(datadir::AbstractString, outputdir::AbstractString)
Extract metadata and paths from AVID dataset.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `calibration_tones.jsonl` containing informations about calibration tones
- `annotations.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function AVID(datadir::AbstractString, outputdir::AbstractString)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "calibration_tones.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
avid_prepare(datadir, outputdir)
end
dataset(outputdir, "")
infos = SpeechDatasetInfos("AVID")
SpeechDataset(infos, outputdir, "")
end
[
{
"name": "AVID",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://zenodo.org/records/10524873",
"authors": ["Manila Kodali", "Paavo Alku", "Sudarsana Reddy Kadiri"],
"description": "Aalto Vocal Intensity Database includes speech and EGG produced by 50 speakers (25 males, 25 females) who varied their vocal intensity in four categories (soft, normal, loud, and very loud)."
},
{
"name": "INA Diachrony",
"lang": "fra",
"license": "proprietary",
"description": "Voice recordings and transcriptions sorted by time period, sex and speaker."
},
{
"name": "Mini LibriSpeech",
"lang": "eng",
"license": "CC BY 4.0",
"source": "https://www.openslr.org/31/",
"authors": ["Vassil Panayotov", "Daniel Povey"],
"description": "Subset of LibriSpeech corpus for purpose of regression testing."
},
{
"name": "Multilingual LibriSpeech",
"lang": ["eng", "fra", "prt", "esp", "deu", "eng", "nld", "ita", "pol"],
"license": "CC BY 4.0",
"source": "http://www.openslr.org/94",
"authors": ["Vineel Pratap", "Qiantong Xu", "Anuroop Sriram", "Gabriel Synnaeve", "Ronan Collobert"],
"description": "Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish"
},
{
"name": "TIMIT",
"lang": "eng",
"license": "LDC User Agreement for Non-Members",
"source": "https://catalog.ldc.upenn.edu/LDC93S1",
"authors": ["John S. Garofolo", "Lori F. Lamel", "William M. Fisher", "Jonathan G. Fiscus", "David S. Pallett", "Nancy L. Dahlgren", "Victor Zue"],
"description": "The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems."
},
{
"name": "Speech2Tex",
"lang": "fra",
"license": "proprietary",
"authors": ["Lorenzo Brucato"],
"description": "Recordings of read equations, literal transcriptions and latex transcriptions."
}
]
\ No newline at end of file
......@@ -151,10 +151,19 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end
end
function INADIACHRONY(ina_wav_dir, outputdir, ina_csv_dir=nothing)
"""
INADIACHRONY(ina_wav_dir::AbstractString, outputdir::AbstractString, ina_csv_dir::Union{Nothing,AbstractString}=nothing)
Extract metadata and paths from INA Diachrony dataset.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function INADIACHRONY(ina_wav_dir::AbstractString, outputdir::AbstractString, ina_csv_dir::Union{Nothing,AbstractString}=nothing)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end
dataset(outputdir, "")
infos = SpeechDatasetInfos("INA Diachrony")
SpeechDataset(infos, outputdir, "")
end
......@@ -31,7 +31,7 @@ function minils_recordings(dir, subset)
id = replace(basename(path), ".flac" => "")
r = Recording(
id,
CmdAudioSource(`sox $path -t wav -`);
AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1],
samplerate = 16000
)
......@@ -85,14 +85,16 @@ function minils_download(dir)
@debug "dataset in $dir"
end
function minils_prepare(dir)
function minils_prepare(inputdir, outputdir)
outputdir = mkpath(outputdir)
# 1. Recording manifest.
out = joinpath(dir, "recordings.jsonl")
out = joinpath(outputdir, "recordings.jsonl")
if ! isfile(out)
open(out, "w") do f
open(out, "a") do f
for subset in ["train", "dev"]
@debug "preparing recording manifest ($subset) $out"
recs = minils_recordings(dir, subset)
recs = minils_recordings(inputdir, subset)
writemanifest(f, recs)
end
end
......@@ -100,10 +102,10 @@ function minils_prepare(dir)
# 2. Annotation manifests.
for (subset, name) in [("train", "train"), ("dev", "dev"), ("dev", "test")]
out = joinpath(dir, "annotations-$name.jsonl")
out = joinpath(outputdir, "annotations-$name.jsonl")
if ! isfile(out)
@debug "preparing annotation manifest ($subset) $out"
sups = minils_annotations(dir, subset)
sups = minils_annotations(inputdir, subset)
open(out, "w") do f
writemanifest(f, sups)
end
......@@ -112,9 +114,28 @@ function minils_prepare(dir)
end
function MINILIBRISPEECH(dir, subset)
minils_download(dir)
minils_prepare(dir)
dataset(dir, subset)
"""
MINILIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString)
Extract metadata and paths from Mini LibriSpeech dataset.\n
`subset` must be one of ["train", "dev"]\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function MINILIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString)
# download if no datadir
if ! isdir(datadir)
minils_download(datadir)
end
# prepare if not already
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations-train.jsonl")) &&
isfile(joinpath(outputdir, "annotations-dev.jsonl")) &&
isfile(joinpath(outputdir, "annotations-test.jsonl")))
minils_prepare(datadir, outputdir)
end
infos = SpeechDatasetInfos("Mini LibriSpeech")
SpeechDataset(infos, outputdir, subset)
end
# SPDX-License-Identifier: CECILL-C
struct MultilingualLibriSpeech <: SpeechCorpus
lang
name
function MultilingualLibriSpeech(lang)
new(lang, "multilingual_librispeech")
end
end
const MLS_LANG_CODE = Dict(
"deu" => "german",
"eng" => "english",
......@@ -42,8 +33,21 @@ const MLS_LM_URLS = Dict(
"prt" => "https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz"
)
function Base.download(corpus::MultilingualLibriSpeech, outdir)
dir = path(corpus, outdir)
# Language codes for which an MLS corpus name is known.
const supported_lang = collect(keys(MLS_LANG_CODE))

# Descriptor for one language of the Multilingual LibriSpeech corpus.
struct MultilingualLibriSpeech <: SpeechCorpus
    lang::AbstractString
    name::AbstractString
end

# Validating outer constructor: reject language codes without an MLS mapping.
function MultilingualLibriSpeech(lang)
    lang in supported_lang ||
        throw(ArgumentError("lang must be one of $supported_lang"))
    return MultilingualLibriSpeech(lang, "multilingual_librispeech")
end
function Base.download(corpus::MultilingualLibriSpeech, dir)
donefile = joinpath(dir, ".download.done")
if ! isfile(donefile)
run(`mkdir -p $dir`)
......@@ -68,8 +72,8 @@ function Base.download(corpus::MultilingualLibriSpeech, outdir)
corpus
end
function recordings(corpus::MultilingualLibriSpeech, dir, subset)
subsetdir = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio")
function mlls_recordings(corpus::MultilingualLibriSpeech, inputdir, subset)
subsetdir = joinpath(inputdir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "audio")
recs = Dict()
for d1 in readdir(subsetdir; join = true)
......@@ -78,7 +82,7 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
id = replace(basename(path), ".flac" => "")
r = Recording(
id,
CmdAudioSource(`sox $path -t wav -`);
AudioSources.CmdAudioSource(`sox $path -t wav -`);
channels = [1],
samplerate = 16000
)
......@@ -89,13 +93,13 @@ function recordings(corpus::MultilingualLibriSpeech, dir, subset)
recs
end
function annotations(corpus::MultilingualLibriSpeech, dir, subset)
trans = joinpath(dir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt")
function mlls_annotations(corpus::MultilingualLibriSpeech, inputdir, subset)
trans = joinpath(inputdir, "mls_$(MLS_LANG_CODE[corpus.lang])", subset, "transcripts.txt")
sups = Dict()
open(trans, "r") do f
for line in eachline(f)
tokens = split(line)
s = Annotation(tokens[1], tokens[1]; channel = 1,
s = Annotation(tokens[1], tokens[1]; channels = [1],
data = Dict("text" => join(tokens[2:end], " ")))
sups[s.id] = s
end
......@@ -103,16 +107,17 @@ function annotations(corpus::MultilingualLibriSpeech, dir, subset)
sups
end
function prepare(corpus::MultilingualLibriSpeech, outdir)
dir = path(corpus, outdir)
function mlls_prepare(corpus::MultilingualLibriSpeech, inputdir, outputdir)
outputdir = mkpath(outputdir)
# 1. Recording manifests.
for subset in ["train", "dev", "test"]
out = joinpath(dir, "recording-manifest-$subset.jsonl")
@info "preparing recording manifest ($subset) $out"
if ! isfile(out)
recs = recordings(corpus, dir, subset)
open(out, "w") do f
out = joinpath(outputdir, "recordings.jsonl")
@info "preparing recording manifest $out"
if ! isfile(out)
open(out, "a") do f
for subset in ["train", "dev", "test"]
recs = mlls_recordings(corpus, inputdir, subset)
writemanifest(f, recs)
end
end
......@@ -120,10 +125,10 @@ function prepare(corpus::MultilingualLibriSpeech, outdir)
# 2. Annotation manifests.
for subset in ["train", "dev", "test"]
out = joinpath(dir, "annotation-manifest-$subset.jsonl")
out = joinpath(outputdir, "annotations-$subset.jsonl")
@info "preparing annotation manifest ($subset) $out"
if ! isfile(out)
sups = annotations(corpus, dir, subset)
sups = mlls_annotations(corpus, inputdir, subset)
open(out, "w") do f
writemanifest(f, sups)
end
......@@ -133,3 +138,32 @@ function prepare(corpus::MultilingualLibriSpeech, outdir)
corpus
end
"""
MLLIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString, lang::AbstractString)
Extract metadata and paths from Multilingual LibriSpeech dataset.\n
# Arguments
- `datadir` is the name of dataset directory. If the directory does not exists, it is created and the data is downloaded.\n
- `outputdir` is the output directory for manifest files.
- `subset` must be one of ["train", "dev", "test"].\n
- `lang` is the language, must be one of $supported_lang.\n
Create the `outputdir` folder, with:
- `recordings-<subset>.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function MLLIBRISPEECH(datadir::AbstractString, outputdir::AbstractString, subset::AbstractString, lang::AbstractString)
mlls = MultilingualLibriSpeech(lang)
# download if no datadir
if ! isdir(datadir)
download(mlls, datadir)
end
# prepare if not already
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations-train.jsonl")) &&
isfile(joinpath(outputdir, "annotations-dev.jsonl")) &&
isfile(joinpath(outputdir, "annotations-test.jsonl")))
mlls_prepare(mlls, datadir, outputdir)
end
infos = SpeechDatasetInfos("Multilingual LibriSpeech")
SpeechDataset(infos, outputdir, subset)
end
\ No newline at end of file
......@@ -114,10 +114,19 @@ function speech2tex_prepare(datadir, outputdir)
end
function SPEECH2TEX(datadir, outputdir)
"""
SPEECH2TEX(datadir::AbstractString, outputdir::AbstractString
Extract metadata and paths from Speech2Tex dataset.\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function SPEECH2TEX(datadir::AbstractString, outputdir::AbstractString)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations.jsonl")))
speech2tex_prepare(datadir, outputdir)
end
dataset(outputdir, "")
infos = SpeechDatasetInfos("Speech2Tex")
SpeechDataset(infos, outputdir, "")
end
......@@ -11,7 +11,7 @@ const TIMIT_SUBSETS = Dict(
const TIMIT_DEV_SPK_LIST = Set([
"faks0",
"faks0",
"fdac1",
"fjem0",
"mgwt0",
......@@ -318,7 +318,7 @@ function timit_recordings(dir::AbstractString; fmt="SPHERE")
id = "timit_$(spk)_$(name)"
audio_src = if fmt == "SPHERE"
CmdAudioSource(`sph2pipe -f wav $path`)
AudioSources.CmdAudioSource(`sph2pipe -f wav $path`)
else
FileAudioSource(path)
end
......@@ -423,14 +423,23 @@ function timit_annotations(dir, formantsdir=nothing)
annotations
end
function TIMIT(timitdir, dir, subset, formantsdir=nothing)
if ! (isfile(joinpath(dir, "recordings.jsonl")) &&
isfile(joinpath(dir, "annotations-train.jsonl")) &&
isfile(joinpath(dir, "annotations-dev.jsonl")) &&
isfile(joinpath(dir, "annotations-test.jsonl")))
timit_prepare(timitdir, dir, formantsdir)
"""
TIMIT(timitdir::AbstractString, outputdir::AbstractString, subset::AbstractString, formantsdir::Union{Nothing,AbstractString}=nothing)
Extract metadata and paths from TIMIT dataset.\n
`subset` must be one of ["train", "dev", "test"]\n
Create the `outputdir` folder, with:
- `recordings.jsonl` containing each audio file path and associated metadata
- `annotations-<subset>.jsonl` containing each annotation and associated metadata
Return a SpeechDataset object.
"""
function TIMIT(timitdir::AbstractString, outputdir::AbstractString, subset::AbstractString, formantsdir::Union{Nothing,AbstractString}=nothing)
if ! (isfile(joinpath(outputdir, "recordings.jsonl")) &&
isfile(joinpath(outputdir, "annotations-train.jsonl")) &&
isfile(joinpath(outputdir, "annotations-dev.jsonl")) &&
isfile(joinpath(outputdir, "annotations-test.jsonl")))
timit_prepare(timitdir, outputdir, formantsdir)
end
dataset(dir, subset)
infos = SpeechDatasetInfos("TIMIT")
SpeechDataset(infos, outputdir, subset)
end
# SPDX-License-Identifier: CECILL-C
# Bundled metadata file describing the known corpora.
const corpora_file = joinpath(@__DIR__, "corpora", "corpora.json")

"""
    SpeechDatasetInfos(; name = "", lang = "", license = "", source = "", authors = AbstractString[], description = "")

Metadata describing a speech corpus. Every field defaults to an empty value.
"""
@kwdef struct SpeechDatasetInfos
    name::AbstractString = ""
    lang::Union{AbstractString, Vector{AbstractString}} = ""
    license::AbstractString = ""
    source::AbstractString = ""
    # Typed empty default instead of `[]` (which would be a Vector{Any}).
    authors::Vector{AbstractString} = AbstractString[]
    description::AbstractString = ""
end

"""
    SpeechDatasetInfos(infos::AbstractDict)

Build a `SpeechDatasetInfos` from a dictionary (e.g. a parsed JSON entry),
keeping the struct defaults for any field missing from `infos`.
"""
function SpeechDatasetInfos(infos::AbstractDict)
    kwargs = NamedTuple()
    for key in fieldnames(SpeechDatasetInfos)
        val = get(infos, String(key), nothing)
        # merge new (key=val) if key was found
        kwargs = !isnothing(val) ? (; kwargs..., key => val) : kwargs
    end
    # FIX: splat as keywords. The previous positional splat
    # `SpeechDatasetInfos(kwargs...)` required every field to be present and
    # in declaration order, raising a MethodError for partial entries
    # (e.g. corpora without "source"/"authors").
    SpeechDatasetInfos(; kwargs...)
end

"""
    SpeechDatasetInfos(name::AbstractString)

Look up the corpus called `name` in the bundled `corpora.json` and return its
`SpeechDatasetInfos`. Throws `BoundsError` when the name is unknown.
"""
function SpeechDatasetInfos(name::AbstractString)
    corpora_infos = JSON.parsefile(corpora_file)
    infos = filter(x -> x["name"]==name, corpora_infos)[1]
    SpeechDatasetInfos(infos)
end
# A speech dataset: corpus metadata plus the annotation/recording manifests.
# `idxs` fixes the iteration order over the manifest ids; annotations and
# recordings are keyed by the same ids.
struct SpeechDataset <: MLUtils.AbstractDataContainer
    infos::SpeechDatasetInfos
    idxs::Vector{AbstractString}
    annotations::Dict{AbstractString, Annotation}
    recordings::Dict{AbstractString, Recording}
end
"""
dataset(manifestroot)
Load `SpeechDataset` from manifest files stored in `manifestroot`.
Each item of the dataset is a nested tuple `((samples, sampling_rate), Annotation.data)`.
See also [`Annotation`](@ref).
# Convenience constructor: derive the index vector from the annotation keys.
SpeechDataset(infos::SpeechDatasetInfos, annotations::Dict{AbstractString, Annotation}, recordings::Dict{AbstractString, Recording}) =
    SpeechDataset(infos, collect(keys(annotations)), annotations, recordings)
# Examples
```julia-repl
julia> ds = dataset("./manifests", :train)
SpeechDataset(
...
)
julia> ds[1]
(
(samples=[...], sampling_rate=16_000),
Dict(
"text" => "Annotation text here"
)
)
```
"""
function dataset(manifestroot::AbstractString, partition)
function SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, partition::AbstractString)
partition_name = partition == "" ? "" : "-$(partition)"
annot_path = joinpath(manifestroot, "annotations$(partition_name).jsonl")
rec_path = joinpath(manifestroot, "recordings.jsonl")
annotations = load(Annotation, annot_path)
recordings = load(Recording, rec_path)
dataset(annotations, recordings)
end
function dataset(annotations::AbstractDict, recordings::AbstractDict)
idxs = collect(keys(annotations))
SpeechDataset(idxs, annotations, recordings)
annotations = load_manifest(Annotation, annot_path)
recordings = load_manifest(Recording, rec_path)
SpeechDataset(infos, annotations, recordings)
end
# Index by manifest id: return the (recording, annotation) pair.
Base.getindex(d::SpeechDataset, key::AbstractString) = d.recordings[key], d.annotations[key]
# Integer indexing resolves the id through the ordered `idxs` vector.
Base.getindex(d::SpeechDataset, idx::Integer) = getindex(d, d.idxs[idx])
# Base.Fix1 -> partial function with the 1st argument fixed (to `d`)
Base.getindex(d::SpeechDataset, idxs::AbstractVector) = map(Base.Fix1(getindex, d), idxs)
Base.length(d::SpeechDataset) = length(d.idxs)
......@@ -65,6 +69,6 @@ function Base.filter(fn, d::SpeechDataset)
k idset
end
SpeechDataset(fidxs, fannotations, frecs)
SpeechDataset(d.infos, fidxs, fannotations, frecs)
end
......@@ -3,7 +3,7 @@
#=====================================================================#
# JSON serialization of a manifest item
function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource)
function Base.show(io::IO, m::MIME"application/json", s::AudioSources.FileAudioSource)
compact = get(io, :compact, false)
indent = get(io, :indent, 0)
printfn = compact ? print : println
......@@ -13,7 +13,7 @@ function Base.show(io::IO, m::MIME"application/json", s::FileAudioSource)
print(io, repeat(" ", indent), "}")
end
function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource)
function Base.show(io::IO, m::MIME"application/json", s::AudioSources.URLAudioSource)
compact = get(io, :compact, false)
indent = get(io, :indent, 0)
printfn = compact ? print : println
......@@ -23,7 +23,7 @@ function Base.show(io::IO, m::MIME"application/json", s::URLAudioSource)
print(io, repeat(" ", indent), "}")
end
function Base.show(io::IO, m::MIME"application/json", s::CmdAudioSource)
function Base.show(io::IO, m::MIME"application/json", s::AudioSources.CmdAudioSource)
compact = get(io, :compact, false)
indent = get(io, :indent, 0)
printfn = compact ? print : println
......@@ -78,11 +78,11 @@ end
function AudioSource(d::Dict)
if d["type"] == "path"
T = FileAudioSource
T = AudioSources.FileAudioSource
elseif d["type"] == "url"
T = URLAudioSource
T = AudioSources.URLAudioSource
elseif d["type"] == "cmd"
T = CmdAudioSource
T = AudioSources.CmdAudioSource
else
throw(ArgumentError("invalid type: $(d["type"])"))
end
......@@ -116,7 +116,7 @@ function writemanifest(io::IO, manifest::Dict)
end
function readmanifest(io::IO, T)
manifest = Dict()
manifest = Dict{AbstractString, T}()
for line in eachline(io)
item = JSON.parse(line) |> T
manifest[item.id] = item
......@@ -129,12 +129,12 @@ manifestname(::Type{<:Recording}, name) = "recordings.jsonl"
manifestname(::Type{<:Annotation}, name) = "annotations-$name.jsonl"
"""
load(Annotation, path)
load(Recording, path)
load_manifest(Annotation, path)
load_manifest(Recording, path)
Load Recording/Annotation manifest from `path`.
"""
load(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r")
load_manifest(T::Type{<:Union{Recording, Annotation}}, path) = open(f -> readmanifest(f, T), path, "r")
function checkdir(dir::AbstractString)
isdir(dir) || throw(ArgumentError("$dir is not an existing directory"))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment