Compare revisions

Nicolas Denier · Simon Devauchelle · Nicolas Denier · Nicolas Denier · Nicolas Denier · Nicolas Denier
--- a/.gitignore
+++ b/.gitignore
 *outputdir/
 Manifest.toml
 notebook-test.jl
+docs/build/
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+image: julia:1.9
+stages:
+  - test
+  - build-docs
+  - deploy
+  - build-badges
+  - deploy-badges
+variables:
+  FAILED: "echo \"failed\" > .status"
+  PASSED: "echo \"passed\" > .status"
+  WRITE_ENV: "echo \"$${PREFIX}_STATUS=$$(cat .status)\" >> .env"
+# Rule to run a job only on merge request on main
+.only-on-merge-request:
+  rules:
+    - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
+# Run a job only when a version tag (vMAJOR.MINOR.PATCH, optionally followed
+# by a suffix such as -rc1) is pushed.
+# NOTE(review): this rule only inspects the tag name; it does not check that
+# the tagged commit belongs to the main branch.
+.only-vtag-on-main:
+  rules:
+    # dots are escaped so that they only match a literal "."
+    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/ # ensure it corresponds to a version
+# $CI_COMMIT_BRANCH == "main"
+# Run unit tests if provided on merge request 
+tests:
+  stage: test
+  variables:
+    SDL_VIDEODRIVER: "dummy"
+    SDL_AUDIODRIVER: "disk"
+    PREFIX: "TEST" # badge prefix
+  extends: 
+    - .dotenv # share job status in .env
+    - .only-on-merge-request
+  before_script:
+   - eval "$FAILED" # set status to failed by default
+   - apt update -y
+   - apt install -y libasound2-dev
+   - |
+     julia -e '
+      using Pkg
+      pkg"registry add https://github.com/JuliaRegistries/General"
+      pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
+      Pkg.activate(; temp = true)
+      Pkg.resolve()
+      Pkg.precompile()'
+  script:
+    - |
+      if [ -f test/runtests.jl ]; then
+        julia --project=./ -e 'using Pkg; Pkg.test()'
+      else
+        echo "[warning] no tests provided"
+      fi
+    - eval "$PASSED" # set status to passed
+# Build documentation on merge request
+build-docs:
+  stage: build-docs    
+  extends: 
+    - .dotenv # share job status in .env
+    - .only-on-merge-request
+  variables:
+    SDL_VIDEODRIVER: "dummy"
+    SDL_AUDIODRIVER: "disk"
+    PREFIX: "BUILD_DOCS" # badge prefix
+  before_script:
+    - eval "$FAILED" # set status to failed by default
+    - apt update -y
+    - apt install -y libasound2-dev
+    - apt clean
+    - |
+      julia --project=docs -e '
+        using Pkg
+        pkg"registry add https://github.com/JuliaRegistries/General"
+        pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
+        Pkg.develop(PackageSpec(path=pwd()))
+        Pkg.instantiate()'
+  script:
+    - julia --project=docs docs/make.jl
+    - eval "$PASSED" # set status to passed
+# Deploy documentation once build-docs succeeded on new version tag
+deploy-docs:
+  stage: deploy
+  variables:
+    PREFIX: "DEPLOY_DOCS" # badge prefix
+  extends: 
+    - .dotenv # share job status in .env
+    - .only-vtag-on-main
+  before_script: 
+    - eval "$FAILED" # set status to failed by default
+    - apt update && apt install -y git
+    - git clone -b docs --single-branch "https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/${CI_PROJECT_NAME}.git"
+    - mkdir docs/build
+    - mv ${CI_PROJECT_NAME}/dev/* docs/build
+  script:
+    - cat docs/build/index.html 
+    - echo "success"
+    - eval "$PASSED" # set status to passed
+# Register a new version on new version tag
+deploy-version:
+  stage: deploy
+  variables:
+    PREFIX: "DEPLOY_VERSION" # badge prefix
+  extends: 
+    - .dotenv # share job status in .env
+    - .only-vtag-on-main
+  before_script:
+    - eval "$FAILED" # set status to failed by default
+    - apt update && apt install -y git
+    # identity used for the commit that LocalRegistry creates in the registry
+    - git config --global user.email "$GITLAB_USER_EMAIL"
+    - git config --global user.name "$GITLAB_USER_NAME"
+    - julia -e "using Pkg; Pkg.add(\"LocalRegistry\"); pkg\"registry add https://registry-token:${REGISTRY_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/Registry.git\""
+    # install release-cli
+    # --fail: abort on HTTP errors instead of saving the error page as the binary
+    # --location: follow redirects issued by the package registry
+    - BINARY_NAME=release-cli-linux-amd64
+    - curl --fail --location --output /usr/local/bin/release-cli "https://gitlab.com/api/v4/projects/gitlab-org%2Frelease-cli/packages/generic/release-cli/latest/$BINARY_NAME" 
+    - chmod +x /usr/local/bin/release-cli
+    - export PATH=$PATH:/usr/local/bin
+  script:
+    # register the current package version in the PTAL registry
+    - julia --project=. -e 'using Pkg; Pkg.Registry.update(); using LocalRegistry; register(; registry = "PTAL")' 
+    - eval "$PASSED" # set status to passed
+  release:
+    tag_name: '$CI_COMMIT_TAG'
+    description: '$CI_COMMIT_TAG'
+### BADGES ###
+# share .env
+.dotenv: 
+  after_script:
+    - eval "$WRITE_ENV"
+  artifacts:
+    reports:
+      dotenv: .env
+# example job supporting a badge
+# required lines are marked with ##
+# example:
+#   stage: test
+#   variables:          ##
+#     PREFIX: "TEST"    ##
+#   extends: .dotenv    ##
+#   before_script:
+#     - eval "$FAILED"  ##
+#   script:
+#     - echo "passed"
+#     - eval "$PASSED"  ##
+#   only:
+#     - main
+# Generate a json artifact corresponding to a badge
+.badge:
+  stage: build-badges
+  when: always # runs even if previous job failed
+  before_script:
+    - STATUS_VAR=${PREFIX}_STATUS
+    - STATUS="${!STATUS_VAR}"
+    # set color according to status
+    - |
+      case "$STATUS" in
+        "failed")
+          COLOR="red"
+          ;;
+        "passed")
+          COLOR="brightgreen"
+          ;;
+        *)
+          COLOR="grey"
+          ;;
+      esac
+  script:
+    # https://shields.io/badges/endpoint-badge
+    - echo "{\"schemaVersion\":1, \"label\":\"$LABEL\", \"message\":\"$STATUS\", \"color\":\"$COLOR\"}" > "${PREFIX}_BADGE.json"
+  artifacts:
+    paths:
+      - "${PREFIX}_BADGE.json"
+    when: always
+# Define a badge for each job
+tests-badge:
+  extends: 
+    - .badge
+    - .only-on-merge-request
+  variables:
+    PREFIX: "TEST"
+    LABEL: "Tests"
+build-docs-badge:
+  extends: 
+    - .badge
+    - .only-on-merge-request
+  variables:
+    PREFIX: "BUILD_DOCS"
+    LABEL: "Build Docs"
+deploy-docs-badge:
+  extends: 
+    - .badge
+    - .only-vtag-on-main
+  variables:
+    PREFIX: "DEPLOY_DOCS"
+    LABEL: "Deploy Docs"
+deploy-version-badge:
+  extends: 
+    - .badge
+    - .only-vtag-on-main
+  variables:
+    PREFIX: "DEPLOY_VERSION"
+    LABEL: "Deploy Version"
+# Deploy badges to GitLab Pages so they can be fetched with the shields.io API
+pages:
+  stage: deploy-badges
+  when: always
+  before_script:
+    # For every badge that was not regenerated in this pipeline, fetch the
+    # currently published one so that it is republished unchanged.
+    # Badges that were never published (HTTP error such as 404) are skipped.
+    - |
+      for PREFIX in "TEST" "BUILD_DOCS" "DEPLOY_DOCS" "DEPLOY_VERSION"
+      do 
+        FILE="${PREFIX}_BADGE.json"
+        if [ ! -f "$FILE" ];then 
+          # upgrade a plain http:// scheme only; an https:// URL is left untouched
+          URL=$(echo "$CI_PAGES_URL/$FILE" | sed "s|^http://|https://|")
+          # --fail makes curl exit non-zero on HTTP errors instead of saving
+          # the error page, so no content sniffing is needed
+          if ! curl --fail --silent --show-error --location -o "$FILE" "$URL"; then
+            rm -f "$FILE"
+            echo "removed $FILE"
+          fi
+        fi
+      done
+  script: 
+    # new badges and previous ones not renewed are published to pages
+    - mkdir -p public
+    # move badges one by one so that the job does not fail when none exists
+    - |
+      for FILE in *_BADGE.json
+      do
+        if [ -f "$FILE" ]; then
+          mv "$FILE" public/
+        fi
+      done
+    - echo "deploying badges"
+  artifacts:
+    paths:
+    - public
+  rules:
+    # dots are escaped so that they only match a literal "."
+    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/
+    - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Tags
+## [0.17.0](https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.17.0) - 26/08/2024
+### Changed
+- Datasets are now loadable with `dataset("name", inputdir, outputdir; <keyword arguments>)`
+- A new corpus can be added by creating a single file and implementing at least one function.
+### Added
+- Supported corpora are referenced in `corpora.json`
+- Added documentation, will update on new corpus.
 ## [0.15.0](https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.15.0) - 19/06/2024
 ### Changed
 - Added support for Speech2Tex dataset

--- a/Project.toml
+++ b/Project.toml
 name = "SpeechDatasets"
 uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
-authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>",
+authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", "Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>", "Nicolas DENIER <nicolas.denier@cnrs.fr>"]
-           "Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>",
+version = "0.17.2"
-           "Nicolas DENIER <nicolas.denier@lisn.fr>"]
-version = "0.15.0"
 [deps]
+SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
+AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
-SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
 [compat]
-julia = "1.10"
 JSON = "0.21"
-SpeechFeatures = "0.8"
+AudioSources = "0.3.0"
+SpeechFeatures = "0.10.4"
--- a/README.md
+++ b/README.md
@@ -4,36 +4,24 @@ A Julia package to download and prepare speech corpus.
 ## Installation
-Make sure to add the [FAST registry](https://gitlab.lisn.upsaclay.fr/fast/registry)
+Make sure to add the [PTAL registry](https://gitlab.lisn.upsaclay.fr/PTAL/Registry)
 to your julia installation. Then, install the package as usual:
-```
+```julia
 pkg> add SpeechDatasets
 ```
+## Usage
+```julia
+dataset("name", inputdir, outputdir; <keyword arguments>)
+```
 ## Example
-```
+```julia
 julia> using SpeechDatasets
-julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test
+julia> ds = dataset("TIMIT", "/path/to/timit/dir", "outputdir"; subset="train")
-...
-julia> dataset = TIMIT("/path/to/timit/dir", "outputdir", :train) # :dev | :test
-...
-julia> dataset = INADIACHRONY("/path/to/ina_wav/dir", "outputdir", "/path/to/ina_csv/dir") # ina_csv dir optional
-...
-julia> dataset = AVID("/path/to/avid/dir", "outputdir")
-...
-julia> dataset = SPEECH2TEX("/path/to/speech2tex/dir", "outputdir")
-...
-julia> for ((signal, fs), supervision) in dataset
+# Access any element
-           # do something
+julia> ds[5]
-       end
 # Lexicons
 julia> CMUDICT("outputfile")
@@ -46,5 +34,5 @@ julia> TIMITDICT("/path/to/timit/dir")
 ## License
-This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE))
+This software is provided under the [CeCILL-C license](https://cecill.info/licences.en.html) (see [`/license`](/license))
--- a/docs/Project.toml
+++ b/docs/Project.toml
+[deps]
+AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
+SpeechDatasets = "ae813453-fab8-46d9-ab8f-a64c05464021"
--- a/docs/deployconfig.jl
+++ b/docs/deployconfig.jl
+# Custom Documenter deploy configuration that authenticates to GitLab over HTTPS
+# https://github.com/JuliaDocs/Documenter.jl/issues/2061#issuecomment-1607077792
+# https://github.com/JuliaDocs/Documenter.jl/blob/master/src/deployconfig.jl
+using Documenter: DeployConfig, DeployDecision, marker, env_nonempty, HTTPS
+@kwdef struct GitLabHTTPS <: DeployConfig
+    commit_branch::String = get(ENV, "CI_COMMIT_BRANCH", "")
+    pull_request_iid::String = get(ENV, "CI_EXTERNAL_PULL_REQUEST_IID", "")
+    repo_path::String = get(ENV, "CI_PROJECT_PATH", "")
+    repo_slug::String = get(ENV, "CI_PROJECT_PATH_SLUG", "")
+    commit_tag::String = get(ENV, "CI_COMMIT_TAG", "")
+    pipeline_source::String = get(ENV, "CI_PIPELINE_SOURCE", "")
+end
+Documenter.authentication_method(::GitLabHTTPS) = HTTPS
+function Documenter.authenticated_repo_url(cfg::GitLabHTTPS) 
+    token = get(ENV,"CI_BOT_TOKEN","")
+    host = get(ENV,"CI_SERVER_HOST", "")
+    return "https://documenter-ci:$token@$host/$(cfg.repo_path).git"
+end
+# Decide whether and where the built documentation is deployed for a
+# `GitLabHTTPS` configuration.
+#
+# The build type is derived from the configuration fields:
+#   - `:preview`   when `pull_request_iid` is non-empty,
+#   - `:release`   when `commit_tag` is non-empty,
+#   - `:devbranch` otherwise.
+# Every check is logged through `@info`. A `DeployDecision` with
+# `all_ok = true` (together with branch, repo and subfolder) is returned only
+# if all checks pass; otherwise `DeployDecision(; all_ok = false)` is returned.
+function Documenter.deploy_folder(
+    cfg::GitLabHTTPS;
+    repo,
+    repo_previews = repo,
+    devbranch,
+    push_preview,
+    devurl,
+    branch = "docs",
+    branch_previews = branch,
+    tag_prefix = "",
+    kwargs...,
+)
+    io = IOBuffer()
+    all_ok = true
+    println(io, "\nGitLab config:")
+    println(io, "  Commit branch: \"", cfg.commit_branch, "\"")
+    println(io, "  Pull request IID: \"", cfg.pull_request_iid, "\"")
+    println(io, "  Repo slug: \"", cfg.repo_slug, "\"")
+    println(io, "  Commit tag: \"", cfg.commit_tag, "\"")
+    println(io, "  Pipeline source: \"", cfg.pipeline_source, "\"")
+    build_type = if cfg.pull_request_iid != ""
+        :preview
+    elseif cfg.commit_tag != ""
+        :release
+    else
+        :devbranch
+    end
+    println(io, "Detected build type: ", build_type)
+    if build_type == :release
+        # `version_tag_strip_build` is not part of the names imported from
+        # Documenter at the top of this file, so it must be module-qualified.
+        tag_nobuild = Documenter.version_tag_strip_build(cfg.commit_tag; tag_prefix)
+        ## If a tag exist it should be a valid VersionNumber
+        tag_ok = tag_nobuild !== nothing
+        println(
+            io,
+            "- $(marker(tag_ok)) ENV[\"CI_COMMIT_TAG\"] contains a valid VersionNumber",
+        )
+        all_ok &= tag_ok
+        is_preview = false
+        subfolder = tag_nobuild
+        deploy_branch = branch
+        deploy_repo = repo
+    elseif build_type == :preview
+        pr_number = tryparse(Int, cfg.pull_request_iid)
+        pr_ok = pr_number !== nothing
+        all_ok &= pr_ok
+        println(
+            io,
+            "- $(marker(pr_ok)) ENV[\"CI_EXTERNAL_PULL_REQUEST_IID\"]=\"$(cfg.pull_request_iid)\" is a number",
+        )
+        btype_ok = push_preview
+        all_ok &= btype_ok
+        is_preview = true
+        println(
+            io,
+            "- $(marker(btype_ok)) `push_preview` keyword argument to deploydocs is `true`",
+        )
+        ## deploy to previews/PR
+        # `something` falls back to 0 when the IID could not be parsed; in that
+        # case `all_ok` is already false and the folder is only used for logging
+        subfolder = "previews/PR$(something(pr_number, 0))"
+        deploy_branch = branch_previews
+        deploy_repo = repo_previews
+    else
+        branch_ok = !isempty(cfg.commit_tag) || cfg.commit_branch == devbranch
+        all_ok &= branch_ok
+        println(
+            io,
+            "- $(marker(branch_ok)) ENV[\"CI_COMMIT_BRANCH\"] matches devbranch=\"$(devbranch)\"",
+        )
+        is_preview = false
+        subfolder = devurl
+        deploy_branch = branch
+        deploy_repo = repo
+    end
+    # the token is required to build the authenticated repository URL
+    key_ok = env_nonempty("CI_BOT_TOKEN")
+    println(io, "- $(marker(key_ok)) ENV[\"CI_BOT_TOKEN\"] exists and is non-empty")
+    all_ok &= key_ok
+    print(io, "Deploying to folder $(repr(subfolder)): $(marker(all_ok))")
+    @info String(take!(io))
+    if all_ok
+        return DeployDecision(;
+            all_ok = true,
+            branch = deploy_branch,
+            repo = deploy_repo,
+            subfolder = subfolder,
+            is_preview = is_preview,
+        )
+    else
+        return DeployDecision(; all_ok = false)
+    end
+end
\ No newline at end of file
--- a/docs/make.jl
+++ b/docs/make.jl
+push!(LOAD_PATH,"..")
+using Documenter, SpeechDatasets, AudioSources
+using Documenter.Remotes
+include("deployconfig.jl")
+makedocs(
+    sitename="SpeechDatasets", 
+    repo = Remotes.GitLab("gitlab.lisn.upsaclay.fr", "PTAL", "Datasets/SpeechDatasets.jl"),
+    doctest = false,
+    pages = [
+        "Home" => "index.md",
+        "Installation" => "installation.md",
+        "Examples" => "examples.md",
+        "API" => "api.md",
+        "Supported datasets" => "datasets.md",
+        "Add a new dataset" => "newdataset.md",
+    ]
+)
+# Build the deploy configuration once from the CI environment and reuse it,
+# so that `devbranch` and `deploy_config` are derived from the same object.
+config = GitLabHTTPS()
+deploydocs(
+    repo = "gitlab.lisn.upsaclay.fr/PTAL/Datasets/SpeechDatasets.jl",
+    # the branch of the current pipeline is treated as the development branch
+    devbranch = config.commit_branch,
+    branch = "docs",
+    deploy_config = config,
+)
\ No newline at end of file
--- a/docs/src/api.md
+++ b/docs/src/api.md
+# API
+## Load a Dataset
+To get data from a supported dataset, you only need one function: 
+```@docs
+dataset(name::AbstractString, inputdir::AbstractString, outputdir::AbstractString)
+Base.summary(dataset::SpeechDataset)
+get_dataset_kwargs(name::String)
+```
+## Types
+### SpeechDataset
+```@docs
+SpeechDatasetInfos
+SpeechDatasetInfos(name::AbstractString)
+SpeechDataset
+SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, subset::AbstractString)
+```
+Access a single element with integer or id indexing
+```julia
+# ds::SpeechDataset
+ds[1]
+ds["1988-147956-0027"]
+```
+Access several elements by providing a list
+```julia
+ds[[1,4,7]]
+ds[[8, 2, "777-126732-0015"]]
+```
+Get all annotations
+```julia
+ds.annotations
+```
+### Manifest items
+```@docs
+SpeechDatasets.ManifestItem
+Recording
+Annotation
+AudioSources.load(r::Recording; start = -1, duration = -1, channels = r.channels)
+AudioSources.load(r::Recording, a::Annotation)
+SpeechDatasets.load_manifest(T::Type{<:Union{Recording, Annotation}}, path)
+```
+## Lexicons
+```@docs
+CMUDICT(path)
+TIMITDICT(timitdir)
+MFAFRDICT(path)
+```
+## Index
+```@index
+```
\ No newline at end of file
--- a/docs/src/assets/logo.svg
+++ b/docs/src/assets/logo.svg
+<svg version="1.1" width="200" height="200" xmlns="http://www.w3.org/2000/svg">
+	<mask id="myMask">
+		<rect x="0" y="0" width="200" height="200" fill="white" />
+		<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="black" transform="rotate(45,100,100)"/>
+	</mask>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(0.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(45.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(90.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(135.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(180.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(225.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(270.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(315.0, 100, 100)" mask="url(#myMask)"/>
+	<circle cx="100" cy="100" r="27.878" stroke="black" stroke-width="2.41" fill="yellow"/>
+</svg>
--- a/docs/src/assets/ptal-logo.svg
+++ b/docs/src/assets/ptal-logo.svg
+<svg version="1.1" width="200" height="200" xmlns="http://www.w3.org/2000/svg">
+	<mask id="myMask">
+		<rect x="0" y="0" width="200" height="200" fill="white" />
+		<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="black" transform="rotate(45,100,100)"/>
+	</mask>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(0.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(45.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(90.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(135.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(180.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(225.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(270.0, 100, 100)"/>
+	<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(315.0, 100, 100)" mask="url(#myMask)"/>
+	<circle cx="100" cy="100" r="27.878" stroke="black" stroke-width="2.41" fill="yellow"/>
+</svg>
--- a/docs/src/datasets.md
+++ b/docs/src/datasets.md
+# Supported Datasets
+```@eval
+using SpeechDatasets, JSON, Markdown
+corpora_infos = JSON.parsefile(SpeechDatasets.corpora_file)
+function write_corpora_docs(io::IO)
+    for corpus in corpora_infos
+        fields = keys(corpus)
+        println(io, "## $(corpus["name"])")
+        if "license" in fields
+            license = replace(corpus["license"], "-" => "--") # dash are escaped
+            color = license=="proprietary" ? "red" : "lightblue"
+            license_badge = "https://img.shields.io/badge/License-$license-$color"
+            println(io, "![License]($license_badge)")
+        end
+        if "lang" in fields
+            languages = corpus["lang"] isa String ? [corpus["lang"]] : corpus["lang"]
+            for lang in languages
+                lang_badge = "https://img.shields.io/badge/Lang-$lang-lightgreen"
+                println(io, "![Language]($lang_badge)")
+            end
+        end
+        if "description" in fields
+            println(io, "")
+            println(io, corpus["description"])
+        end
+        if "source" in fields
+            println(io, "")
+            println(io, "[Source]($(corpus["source"]))")
+        end
+        if "authors" in fields
+            println(io, "")
+            println(io, "### Authors")
+            println(io, join(corpus["authors"], ", "))
+        end
+        need_subset = false
+        if "subsets" in fields
+            need_subset = true
+            println(io, "")
+            println(io, "### Subsets")
+            println(io, join(corpus["subsets"], ", "))
+        end
+        kwargs = get_dataset_kwargs(corpus["name"])
+        if need_subset 
+            kwargs = merge(kwargs, (;subset=""))
+        end
+        if ! isempty(kwargs)
+            println(io, "### Keyword arguments")
+            println(io, "```julia")
+            println(io, kwargs)
+            println(io, "```")
+        end
+        println(io, "\n---")
+    end
+end
+Markdown.parse(sprint(write_corpora_docs))
+```
\ No newline at end of file
--- a/docs/src/examples.md
+++ b/docs/src/examples.md
+# Examples
+```julia
+using SpeechDatasets
+ds = dataset("Mini LibriSpeech", "path/to/minils", "minils_output")
+typeof(ds[26])
+```
+```@example
+println("Tuple{Recording, Annotation}") # hide
+```
\ No newline at end of file
--- a/docs/src/index.md
+++ b/docs/src/index.md
+# SpeechDatasets.jl
+Convenient and unified way to load a speech dataset. It can then be harnessed with other PTAL tools.  
+A `SpeechDataset` instance consists of a set of recordings (info about audio data) and annotations.
+## Contents 
+```@contents
+Pages = ["index.md", "installation.md", "examples.md", "api.md", "datasets.md", "newdataset.md"]
+```
+## License
+This software is provided under the [CeCILL-C license](https://cecill.info/licences.en.html)
+## Authors
+- Lucas Ondel Yang
+- Nicolas Denier
+- Simon Devauchelle
+![](https://ptal.lisn.upsaclay.fr/assets/lisn-ups-cnrs.png)
\ No newline at end of file
--- a/docs/src/installation.md
+++ b/docs/src/installation.md
+# Installation
+This package is part of the PTAL tool collection and requires the
+[PTAL registry](https://gitlab.lisn.upsaclay.fr/ptal/registry) to be installed.
+To add this registry to your Julia installation type `]` to enter the
+package mode of the REPL and then type:
+```
+pkg> registry add "https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
+```
+Once the registry has been added, SpeechDatasets can be installed with the
+Julia package manager by typing in Pkg REPL mode
+```
+pkg> add SpeechDatasets
+```
--- a/docs/src/newdataset.md
+++ b/docs/src/newdataset.md
+# Add a new dataset
+1. Add metadata in `src/corpora/corpora.json`  
+    Example:
+        {
+            "name": "TIMIT",
+            "lang": "eng",
+            "license": "LDC User Agreement for Non-Members",
+            "source": "https://catalog.ldc.upenn.edu/LDC93S1",
+            "authors": ["John S. Garofolo", "Lori F. Lamel", "William M. Fisher", "Jonathan G. Fiscus", "David S. Pallett", "Nancy L. Dahlgren", "Victor Zue"],
+            "description": "The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems.",
+            "subsets": ["train", "dev", "test"]
+        },
+2. Create a new `.jl` file in `src/corpora`
+3. Add the following line at the beginning of the file:  
+        const <idname> = get_nametype(<dataset name>)
+    - Replace `<idname>` with an identifier of your dataset (for example, `timit_id`).
+    - Replace `<dataset name>` with a string containing the name of the dataset (same as referenced in `corpora.json`).
+4. If your dataset is downloadable, you can implement
+        Base.download(::DatasetBuilder{<idname>}, dir::AbstractString)
+5. It is mandatory to implement the `prepare()` function as such: 
+        prepare(::DatasetBuilder{<idname>}, inputdir, outputdir; <keyword arguments>)
+    You can add any keyword argument.  
+    This function must create the following files in outputdir:
+      - `recordings.jsonl`
+      - `annotations.jsonl` or `annotations-<subset>.jsonl` for each subset
+That's it, you can now use 
+```julia
+dataset("name", inputdir, outputdir; <keyword arguments>)
+```
+## DatasetBuilder and utilities
+```@docs
+DatasetBuilder
+DatasetBuilder(name::Symbol)
+SpeechDatasets.declareBuilder(name::Symbol)
+get_kwargs(func_name::Function, args_types::Tuple)
+get_nametype(name::String)
+Base.download
+prepare
+```
\ No newline at end of file
--- a/examples/load_dataset.jl
+++ b/examples/load_dataset.jl
--- a/license/LICENCE-fra.txt
+++ b/license/LICENCE-fra.txt
--- a/license/LICENSE-eng.txt
+++ b/license/LICENSE-eng.txt
--- a/src/SpeechDatasets.jl
+++ b/src/SpeechDatasets.jl
-# SPDX-License-Identifier: CECILL-2.1
+# SPDX-License-Identifier: CECILL-C
 module SpeechDatasets
 using JSON
+import AudioSources
 using SpeechFeatures
 import MLUtils
@@ -16,36 +17,38 @@ export
    writemanifest,
    readmanifest,
-    # Corpora interface
-    download,
-    lang,
-    name,
-    prepare,
-    # Corpora
-    MultilingualLibriSpeech,
-    MINILIBRISPEECH,
-    TIMIT,
-    INADIACHRONY,
-    AVID,
-    SPEECH2TEX,
    # Lexicon
    CMUDICT,
    TIMITDICT,
    MFAFRDICT,
+    # Builder
+    DatasetBuilder,
+    get_kwargs,
+    get_dataset_kwargs,
+    get_nametype,
+    download,
+    prepare,
    # Dataset
+    SpeechDatasetInfos,
+    SpeechDataset,
+    summary,
    dataset
-include("speechcorpus.jl")
+const corpora_file = joinpath(@__DIR__, "corpora", "corpora.json")
+const corpora_names = map(c -> c["name"], JSON.parsefile(corpora_file))
 include("manifest_item.jl")
 include("manifest_io.jl")
+include("builder.jl")
 include("dataset.jl")
 # Supported corpora
-include.("corpora/".*filter(contains(r".jl$"), readdir("src/corpora/")))
+include.("corpora/".*filter(contains(r"\.jl$"), readdir(joinpath(@__DIR__, "corpora"))))
 include("lexicons.jl")
+# declare all supported builders
+declareBuilder.(get_nametype.(corpora_names))
 end
No results found