Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • fast/speechdatasets.jl
  • PTAL/Datasets/SpeechDatasets.jl
2 results
Show changes
Commits on Source (24)
Showing with 2604 additions and 49 deletions
*outputdir/ *outputdir/
Manifest.toml Manifest.toml
notebook-test.jl notebook-test.jl
docs/build/
image: julia:1.9
stages:
- test
- build-docs
- deploy
- build-badges
- deploy-badges
variables:
FAILED: "echo \"failed\" > .status"
PASSED: "echo \"passed\" > .status"
WRITE_ENV: "echo \"$${PREFIX}_STATUS=$$(cat .status)\" >> .env"
# Rule to run a job only on merge request on main
.only-on-merge-request:
rules:
- if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
# Run a job only when a version tag (e.g. v1.2.3 or v1.2.3-rc1) is pushed.
# NOTE: the branch check below is disabled, so the tag is not required to be on main.
.only-vtag-on-main:
  rules:
    # dots are escaped so they only match a literal "." between the version numbers
    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/ # ensure it corresponds to a version
    # $CI_COMMIT_BRANCH == "main"
# Run unit tests if provided on merge request
tests:
stage: test
variables:
SDL_VIDEODRIVER: "dummy"
SDL_AUDIODRIVER: "disk"
PREFIX: "TEST" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-on-merge-request
before_script:
- eval "$FAILED" # set status to failed by default
- apt update -y
- apt install -y libasound2-dev
- |
julia -e '
using Pkg
pkg"registry add https://github.com/JuliaRegistries/General"
pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
Pkg.activate(; temp = true)
Pkg.resolve()
Pkg.precompile()'
script:
- |
if [ -f test/runtests.jl ]; then
julia --project=./ -e 'using Pkg; Pkg.test()'
else
echo "[warning] no tests provided"
fi
- eval "$PASSED" # set status to passed
# Build documentation on merge request
build-docs:
stage: build-docs
extends:
- .dotenv # share job status in .env
- .only-on-merge-request
variables:
SDL_VIDEODRIVER: "dummy"
SDL_AUDIODRIVER: "disk"
PREFIX: "BUILD_DOCS" # badge prefix
before_script:
- eval "$FAILED" # set status to failed by default
- apt update -y
- apt install -y libasound2-dev
- apt clean
- |
julia --project=docs -e '
using Pkg
pkg"registry add https://github.com/JuliaRegistries/General"
pkg"registry add https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
Pkg.develop(PackageSpec(path=pwd()))
Pkg.instantiate()'
script:
- julia --project=docs docs/make.jl
- eval "$PASSED" # set status to passed
# Deploy documentation once build-docs succeeded on new version tag
deploy-docs:
stage: deploy
variables:
PREFIX: "DEPLOY_DOCS" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-vtag-on-main
before_script:
- eval "$FAILED" # set status to failed by default
- apt update && apt install -y git
- git clone -b docs --single-branch "https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/${CI_PROJECT_NAME}.git"
- mkdir docs/build
- mv ${CI_PROJECT_NAME}/dev/* docs/build
script:
- cat docs/build/index.html
- echo "success"
- eval "$PASSED" # set status to passed
# Register a new version on new version tag
deploy-version:
stage: deploy
variables:
PREFIX: "DEPLOY_VERSION" # badge prefix
extends:
- .dotenv # share job status in .env
- .only-vtag-on-main
before_script:
- eval "$FAILED" # set status to failed by default
- apt update && apt install -y git
- git config --global user.email "$GITLAB_USER_EMAIL"
- git config --global user.name "$GITLAB_USER_NAME"
- julia -e "using Pkg; Pkg.add(\"LocalRegistry\"); pkg\"registry add https://registry-token:${REGISTRY_TOKEN}@gitlab.lisn.upsaclay.fr/PTAL/Registry.git\""
# install release-cli
- BINARY_NAME=release-cli-linux-amd64
- curl --output /usr/local/bin/release-cli "https://gitlab.com/api/v4/projects/gitlab-org%2Frelease-cli/packages/generic/release-cli/latest/$BINARY_NAME"
- chmod +x /usr/local/bin/release-cli
- export PATH=$PATH:/usr/local/bin
script:
- julia --project=. -e 'using Pkg; Pkg.Registry.update(); using LocalRegistry; register(; registry = "PTAL")'
- eval "$PASSED" # set status to passed
release:
tag_name: '$CI_COMMIT_TAG'
description: '$CI_COMMIT_TAG'
### BADGES ###
# share .env
.dotenv:
after_script:
- eval "$WRITE_ENV"
artifacts:
reports:
dotenv: .env
# example job supporting a badge
# required lines are marked with ##
# example:
# stage: test
# variables: ##
# PREFIX: "TEST" ##
# extends: .dotenv ##
# before_script:
# - eval "$FAILED" ##
# script:
# - echo "passed"
# - eval "$PASSED" ##
# only:
# - main
# Generate a json artifact corresponding to a badge
.badge:
stage: build-badges
when: always # runs even if previous job failed
before_script:
- STATUS_VAR=${PREFIX}_STATUS
- STATUS="${!STATUS_VAR}"
# set color according to status
- |
case "$STATUS" in
"failed")
COLOR="red"
;;
"passed")
COLOR="brightgreen"
;;
*)
COLOR="grey"
;;
esac
script:
# https://shields.io/badges/endpoint-badge
- echo "{\"schemaVersion\":1, \"label\":\"$LABEL\", \"message\":\"$STATUS\", \"color\":\"$COLOR\"}" > "${PREFIX}_BADGE.json"
artifacts:
paths:
- "${PREFIX}_BADGE.json"
when: always
# Define a badge for each job
tests-badge:
extends:
- .badge
- .only-on-merge-request
variables:
PREFIX: "TEST"
LABEL: "Tests"
build-docs-badge:
extends:
- .badge
- .only-on-merge-request
variables:
PREFIX: "BUILD_DOCS"
LABEL: "Build Docs"
deploy-docs-badge:
extends:
- .badge
- .only-vtag-on-main
variables:
PREFIX: "DEPLOY_DOCS"
LABEL: "Deploy Docs"
deploy-version-badge:
extends:
- .badge
- .only-vtag-on-main
variables:
PREFIX: "DEPLOY_VERSION"
LABEL: "Deploy Version"
# Deploy badges to GitLab Pages so they can be fetched with the shields.io API
pages:
  stage: deploy-badges
  when: always
  before_script:
    # Badges that were not regenerated by this pipeline are fetched from the
    # currently published pages so they are kept.
    # Nonexistent badges (404) are ignored.
    - |
      for PREFIX in "TEST" "BUILD_DOCS" "DEPLOY_DOCS" "DEPLOY_VERSION"
      do
        FILE="${PREFIX}_BADGE.json"
        if [ ! -f "$FILE" ];then
          # Force https. The pattern is anchored on the scheme so that a URL
          # which is already https is left untouched (no "httpss://").
          URL=$(echo "$CI_PAGES_URL/$FILE" | sed "s#^http://#https://#")
          curl -o "$FILE" "$URL"
          if cat "$FILE" | grep -q 404 ;then
            rm "$FILE"
            echo "removed $FILE"
          fi
        fi
      done
  script:
    # new badges and previous ones not renewed are published to pages
    - mkdir public
    - mv *_BADGE.json public/
    - echo "deploying badges"
  artifacts:
    paths:
      - public
  rules:
    # same conditions as .only-vtag-on-main and .only-on-merge-request;
    # dots are escaped so they only match a literal "."
    - if: $CI_COMMIT_TAG =~ /^v\d+\.\d+\.\d+-?.*$/
    - if: $CI_MERGE_REQUEST_TARGET_BRANCH_NAME == "main"
# Tags # Tags
## [0.17.0](https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.17.0) - 26/08/2024
### Changed
- Datasets are now loadable with `dataset("name", inputdir, outputdir; <keyword arguments>)`
- A new corpus can be added by simply creating a single file and implementing at least one function.
### Added
- Supported corpora are referenced in `corpora.json`
- Added documentation; it will be updated when new corpora are added.
## [0.15.0](https://https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.15.0) - 19/06/2024 ## [0.15.0](https://https://gitlab.lisn.upsaclay.fr/fast/speechdatasets.jl/-/tags/v0.15.0) - 19/06/2024
### Changed ### Changed
- Added support for Speech2Tex dataset - Added support for Speech2Tex dataset
......
name = "SpeechDatasets" name = "SpeechDatasets"
uuid = "ae813453-fab8-46d9-ab8f-a64c05464021" uuid = "ae813453-fab8-46d9-ab8f-a64c05464021"
authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", authors = ["Lucas ONDEL YANG <lucas.ondel@cnrs.fr>", "Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>", "Nicolas DENIER <nicolas.denier@cnrs.fr>"]
"Simon DEVAUCHELLE <simon.devauchelle@universite-paris-saclay.fr>", version = "0.17.2"
"Nicolas DENIER <nicolas.denier@lisn.fr>"]
version = "0.15.0"
[deps] [deps]
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
SpeechFeatures = "6f3487c4-5ca2-4050-bfeb-2cf56df92307"
[compat] [compat]
julia = "1.10"
JSON = "0.21" JSON = "0.21"
SpeechFeatures = "0.8" AudioSources = "0.3.0"
SpeechFeatures = "0.10.4"
...@@ -4,36 +4,24 @@ A Julia package to download and prepare speech corpus. ...@@ -4,36 +4,24 @@ A Julia package to download and prepare speech corpus.
## Installation ## Installation
Make sure to add the [FAST registry](https://gitlab.lisn.upsaclay.fr/fast/registry) Make sure to add the [PTAL registry](https://gitlab.lisn.upsaclay.fr/PTAL/Registry)
to your julia installation. Then, install the package as usual: to your julia installation. Then, install the package as usual:
``` ```julia
pkg> add SpeechDatasets pkg> add SpeechDatasets
``` ```
## Usage
```julia
dataset("name", inputdir, outputdir; <keyword arguments>)
```
## Example ## Example
``` ```julia
julia> using SpeechDatasets julia> using SpeechDatasets
julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test julia> ds = dataset("TIMIT", "/path/to/timit/dir", "outputdir"; subset="train")
...
julia> dataset = TIMIT("/path/to/timit/dir", "outputdir", :train) # :dev | :test
...
julia> dataset = INADIACHRONY("/path/to/ina_wav/dir", "outputdir", "/path/to/ina_csv/dir") # ina_csv dir optional
...
julia> dataset = AVID("/path/to/avid/dir", "outputdir")
...
julia> dataset = SPEECH2TEX("/path/to/speech2tex/dir", "outputdir")
...
julia> for ((signal, fs), supervision) in dataset # Access any element
# do something julia> ds[5]
end
# Lexicons # Lexicons
julia> CMUDICT("outputfile") julia> CMUDICT("outputfile")
...@@ -46,5 +34,5 @@ julia> TIMITDICT("/path/to/timit/dir") ...@@ -46,5 +34,5 @@ julia> TIMITDICT("/path/to/timit/dir")
## License ## License
This software is provided under the CeCILL 2.1 license (see the [`/LICENSE`](/LICENSE)) This software is provided under the [CeCILL-C license](https://cecill.info/licences.en.html) (see [`/license`](/license))
[deps]
AudioSources = "09fc2aa8-47ce-428a-ad90-e701fa7ea67f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
SpeechDatasets = "ae813453-fab8-46d9-ab8f-a64c05464021"
# Allows to connect to GitLab with HTTPS
# https://github.com/JuliaDocs/Documenter.jl/issues/2061#issuecomment-1607077792
# https://github.com/JuliaDocs/Documenter.jl/blob/master/src/deployconfig.jl
using Documenter: DeployConfig, DeployDecision, marker, env_nonempty, HTTPS
# Deploy configuration for GitLab CI, used together with the HTTPS methods defined
# for it in this file.
# Every field defaults to the value of the GitLab CI environment variable named in
# its initializer, or to the empty string when that variable is not set.
Base.@kwdef struct GitLabHTTPS <: DeployConfig
    commit_branch::String    = get(ENV, "CI_COMMIT_BRANCH", "")
    pull_request_iid::String = get(ENV, "CI_EXTERNAL_PULL_REQUEST_IID", "")
    repo_path::String        = get(ENV, "CI_PROJECT_PATH", "")
    repo_slug::String        = get(ENV, "CI_PROJECT_PATH_SLUG", "")
    commit_tag::String       = get(ENV, "CI_COMMIT_TAG", "")
    pipeline_source::String  = get(ENV, "CI_PIPELINE_SOURCE", "")
end
# Select HTTPS as the authentication method for the `GitLabHTTPS` configuration.
function Documenter.authentication_method(::GitLabHTTPS)
    return HTTPS
end
# Build the HTTPS URL of the repository with credentials embedded.
# The token is read from ENV["CI_BOT_TOKEN"] and the host from ENV["CI_SERVER_HOST"];
# both fall back to the empty string when unset. The repository path comes from `cfg`.
function Documenter.authenticated_repo_url(cfg::GitLabHTTPS)
    host = get(ENV, "CI_SERVER_HOST", "")
    token = get(ENV, "CI_BOT_TOKEN", "")
    return "https://documenter-ci:$token@$host/$(cfg.repo_path).git"
end
# Decide whether and where the documentation must be deployed for a `GitLabHTTPS`
# configuration.
#
# The build type is derived from `cfg`:
#   - `:preview`   when `cfg.pull_request_iid` is non-empty,
#   - `:release`   when `cfg.commit_tag` is non-empty,
#   - `:devbranch` otherwise.
#
# Keyword arguments:
#   - `repo`, `repo_previews`: target repository for regular / preview deployments.
#   - `branch`, `branch_previews`: target branch for regular / preview deployments.
#   - `devbranch`: branch whose builds are deployed to `devurl`.
#   - `devurl`: subfolder used for development builds.
#   - `push_preview`: whether preview builds may be deployed.
#   - `tag_prefix`: prefix forwarded to `Documenter.version_tag_strip_build`.
#
# A summary of all checks is logged with `@info`. Returns a `DeployDecision`; when any
# check fails, only `all_ok = false` is set.
function Documenter.deploy_folder(
    cfg::GitLabHTTPS;
    repo,
    repo_previews = repo,
    devbranch,
    push_preview,
    devurl,
    branch = "docs",
    branch_previews = branch,
    tag_prefix = "",
    kwargs...,
)
    io = IOBuffer()
    all_ok = true
    println(io, "\nGitLab config:")
    println(io, " Commit branch: \"", cfg.commit_branch, "\"")
    println(io, " Pull request IID: \"", cfg.pull_request_iid, "\"")
    println(io, " Repo slug: \"", cfg.repo_slug, "\"")
    println(io, " Commit tag: \"", cfg.commit_tag, "\"")
    println(io, " Pipeline source: \"", cfg.pipeline_source, "\"")
    build_type = if cfg.pull_request_iid != ""
        :preview
    elseif cfg.commit_tag != ""
        :release
    else
        :devbranch
    end
    println(io, "Detected build type: ", build_type)
    if build_type == :release
        # `version_tag_strip_build` is neither exported by Documenter nor listed in this
        # file's `using Documenter: ...` statement, so it has to be module-qualified;
        # the unqualified call raised an UndefVarError on tag builds.
        tag_nobuild = Documenter.version_tag_strip_build(cfg.commit_tag; tag_prefix)
        # If a tag exists it should be a valid VersionNumber
        tag_ok = tag_nobuild !== nothing
        println(
            io,
            "- $(marker(tag_ok)) ENV[\"CI_COMMIT_TAG\"] contains a valid VersionNumber",
        )
        all_ok &= tag_ok
        is_preview = false
        subfolder = tag_nobuild
        deploy_branch = branch
        deploy_repo = repo
    elseif build_type == :preview
        pr_number = tryparse(Int, cfg.pull_request_iid)
        pr_ok = pr_number !== nothing
        all_ok &= pr_ok
        println(
            io,
            "- $(marker(pr_ok)) ENV[\"CI_EXTERNAL_PULL_REQUEST_IID\"]=\"$(cfg.pull_request_iid)\" is a number",
        )
        btype_ok = push_preview
        all_ok &= btype_ok
        is_preview = true
        println(
            io,
            "- $(marker(btype_ok)) `push_preview` keyword argument to deploydocs is `true`",
        )
        # Deploy to previews/PR<number>; 0 is only a placeholder when parsing failed,
        # in which case `all_ok` is already false and nothing is deployed.
        subfolder = "previews/PR$(something(pr_number, 0))"
        deploy_branch = branch_previews
        deploy_repo = repo_previews
    else
        # This branch is only reached with an empty commit tag, so the branch name is
        # the only thing left to check.
        branch_ok = cfg.commit_branch == devbranch
        all_ok &= branch_ok
        println(
            io,
            "- $(marker(branch_ok)) ENV[\"CI_COMMIT_BRANCH\"] matches devbranch=\"$(devbranch)\"",
        )
        is_preview = false
        subfolder = devurl
        deploy_branch = branch
        deploy_repo = repo
    end
    # The token is required by `authenticated_repo_url` to push over HTTPS.
    key_ok = env_nonempty("CI_BOT_TOKEN")
    println(io, "- $(marker(key_ok)) ENV[\"CI_BOT_TOKEN\"] exists and is non-empty")
    all_ok &= key_ok
    print(io, "Deploying to folder $(repr(subfolder)): $(marker(all_ok))")
    @info String(take!(io))
    if all_ok
        return DeployDecision(;
            all_ok = true,
            branch = deploy_branch,
            repo = deploy_repo,
            subfolder = subfolder,
            is_preview = is_preview,
        )
    else
        return DeployDecision(; all_ok = false)
    end
end
\ No newline at end of file
# Build the SpeechDatasets documentation with Documenter, then hand it to `deploydocs`
# using the `GitLabHTTPS` deploy configuration defined in deployconfig.jl.
push!(LOAD_PATH, "..")

using Documenter, SpeechDatasets, AudioSources
using Documenter.Remotes

include("deployconfig.jl")

# Navigation entries, in display order (title => source file).
doc_pages = [
    "Home" => "index.md",
    "Installation" => "installation.md",
    "Examples" => "examples.md",
    "API" => "api.md",
    "Supported datasets" => "datasets.md",
    "Add a new dataset" => "newdataset.md",
]

makedocs(;
    sitename = "SpeechDatasets",
    repo = Remotes.GitLab("gitlab.lisn.upsaclay.fr", "PTAL", "Datasets/SpeechDatasets.jl"),
    doctest = false,
    pages = doc_pages,
)

# The configuration is built once from the CI environment; the branch being built is
# passed as `devbranch`.
config = GitLabHTTPS()

deploydocs(;
    repo = "gitlab.lisn.upsaclay.fr/PTAL/Datasets/SpeechDatasets.jl",
    devbranch = config.commit_branch,
    branch = "docs",
    deploy_config = config,
)
\ No newline at end of file
# API
## Load a Dataset
To get data from a supported dataset, you only need one function:
```@docs
dataset(name::AbstractString, inputdir::AbstractString, outputdir::AbstractString)
Base.summary(dataset::SpeechDataset)
get_dataset_kwargs(name::String)
```
## Types
### SpeechDataset
```@docs
SpeechDatasetInfos
SpeechDatasetInfos(name::AbstractString)
SpeechDataset
SpeechDataset(infos::SpeechDatasetInfos, manifestroot::AbstractString, subset::AbstractString)
```
Access a single element with integer or id indexing
```julia
# ds::SpeechDataset
ds[1]
ds["1988-147956-0027"]
```
Access several elements by providing a list
```julia
ds[[1,4,7]]
ds[[8, 2, "777-126732-0015"]]
```
Get all annotations
```julia
ds.annotations
```
### Manifest items
```@docs
SpeechDatasets.ManifestItem
Recording
Annotation
AudioSources.load(r::Recording; start = -1, duration = -1, channels = r.channels)
AudioSources.load(r::Recording, a::Annotation)
SpeechDatasets.load_manifest(T::Type{<:Union{Recording, Annotation}}, path)
```
## Lexicons
```@docs
CMUDICT(path)
TIMITDICT(timitdir)
MFAFRDICT(path)
```
## Index
```@index
```
\ No newline at end of file
<svg version="1.1" width="200" height="200" xmlns="http://www.w3.org/2000/svg">
<mask id="myMask">
<rect x="0" y="0" width="200" height="200" fill="white" />
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="black" transform="rotate(45,100,100)"/>
</mask>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(0.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(45.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(90.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(135.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(180.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(225.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(270.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(315.0, 100, 100)" mask="url(#myMask)"/>
<circle cx="100" cy="100" r="27.878" stroke="black" stroke-width="2.41" fill="yellow"/>
</svg>
<svg version="1.1" width="200" height="200" xmlns="http://www.w3.org/2000/svg">
<mask id="myMask">
<rect x="0" y="0" width="200" height="200" fill="white" />
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="black" transform="rotate(45,100,100)"/>
</mask>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(0.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(45.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(90.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(135.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(180.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(225.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(270.0, 100, 100)"/>
<ellipse cx="52.5" cy="100" rx="42.5" ry="25" stroke="black" stroke-width="2.41" fill="white" transform="rotate(315.0, 100, 100)" mask="url(#myMask)"/>
<circle cx="100" cy="100" r="27.878" stroke="black" stroke-width="2.41" fill="yellow"/>
</svg>
# Supported Datasets
```@eval
using SpeechDatasets, JSON, Markdown
corpora_infos = JSON.parsefile(SpeechDatasets.corpora_file)
# Write a Markdown description of every corpus in `corpora_infos` to `io`.
#
# For each corpus a level-2 heading with its name is written, followed, when the
# corresponding key is present, by a license badge, one badge per language, the
# description, a link to the source, the authors and the subsets. The keyword
# arguments returned by `get_dataset_kwargs` (plus `subset` when the corpus has
# subsets) are printed in a code block, and each corpus ends with a horizontal rule.
function write_corpora_docs(io::IO)
    # shields.io static badge paths use "-" as field separator and "_" as a space:
    # literal dashes and underscores must be doubled and spaces written as "_".
    # A single multi-pattern `replace` avoids re-escaping the inserted characters.
    badge_escape(text) = replace(string(text), "-" => "--", "_" => "__", " " => "_")
    for corpus in corpora_infos
        fields = keys(corpus)
        println(io, "## $(corpus["name"])")
        if "license" in fields
            license = corpus["license"]
            color = license == "proprietary" ? "red" : "lightblue"
            license_badge = "https://img.shields.io/badge/License-$(badge_escape(license))-$color"
            println(io, "![License]($license_badge)")
        end
        if "lang" in fields
            # "lang" is either a single string or a list of strings
            languages = corpus["lang"] isa String ? [corpus["lang"]] : corpus["lang"]
            for lang in languages
                lang_badge = "https://img.shields.io/badge/Lang-$(badge_escape(lang))-lightgreen"
                println(io, "![Language]($lang_badge)")
            end
        end
        if "description" in fields
            println(io, "")
            println(io, corpus["description"])
        end
        if "source" in fields
            println(io, "")
            println(io, "[Source]($(corpus["source"]))")
        end
        if "authors" in fields
            println(io, "")
            println(io, "### Authors")
            println(io, join(corpus["authors"], ", "))
        end
        need_subset = false
        if "subsets" in fields
            need_subset = true
            println(io, "")
            println(io, "### Subsets")
            println(io, join(corpus["subsets"], ", "))
        end
        kwargs = get_dataset_kwargs(corpus["name"])
        if need_subset
            # corpora with subsets also accept a `subset` keyword argument
            kwargs = merge(kwargs, (; subset = ""))
        end
        if !isempty(kwargs)
            println(io, "### Keyword arguments")
            println(io, "```julia")
            println(io, kwargs)
            println(io, "```")
        end
        println(io, "\n---")
    end
end
Markdown.parse(sprint(write_corpora_docs))
```
\ No newline at end of file
# Examples
```julia
using SpeechDatasets
ds = dataset("Mini LibriSpeech", "path/to/minils", "minils_output")
typeof(ds[26])
```
```@example
println("Tuple{Recording, Annotation}") # hide
```
\ No newline at end of file
# SpeechDatasets.jl
Convenient and unified way to load a speech dataset. It can then be harnessed with other PTAL tools.
A `SpeechDataset` instance consists of a set of recordings (info about audio data) and annotations.
## Contents
```@contents
Pages = ["index.md", "installation.md", "examples.md", "api.md", "datasets.md", "newdataset.md"]
```
## License
This software is provided under the [CeCILL-C license](https://cecill.info/licences.en.html)
## Authors
- Lucas Ondel Yang
- Nicolas Denier
- Simon Devauchelle
![](https://ptal.lisn.upsaclay.fr/assets/lisn-ups-cnrs.png)
\ No newline at end of file
# Installation
This package is part of the PTAL tool collection and requires the
[PTAL registry](https://gitlab.lisn.upsaclay.fr/ptal/registry) to be installed.
To add this registry to your Julia installation type `]` to enter the
package mode of the REPL and then type:
```
pkg> registry add "https://gitlab.lisn.upsaclay.fr/PTAL/Registry"
```
Once the registry has been added, SpeechDatasets can be installed with the
Julia package manager by typing in Pkg REPL mode
```
pkg> add SpeechDatasets
```
# Add a new dataset
1. Add metadata to `src/corpora/corpora.json`
Example:
{
"name": "TIMIT",
"lang": "eng",
"license": "LDC User Agreement for Non-Members",
"source": "https://catalog.ldc.upenn.edu/LDC93S1",
"authors": ["John S. Garofolo", "Lori F. Lamel", "William M. Fisher", "Jonathan G. Fiscus", "David S. Pallett", "Nancy L. Dahlgren", "Victor Zue"],
"description": "The TIMIT corpus of read speech has been designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems.",
"subsets": ["train", "dev", "test"]
},
2. Create a new `.jl` file in `src/corpora`
3. Add the following line at the beginning of the file:
const <idname> = get_nametype(<dataset name>)
- Replace `<idname>` with an identifier of your dataset (for example, `timit_id`).
- Replace `<dataset name>` with a string containing the name of the dataset (same as referenced in `corpora.json`).
4. If your dataset is downloadable, you can implement
Base.download(::DatasetBuilder{<idname>}, dir::AbstractString)
5. It is mandatory to implement the `prepare()` function as follows:
prepare(::DatasetBuilder{<idname>}, inputdir, outputdir; <keyword arguments>)
You can add any keyword argument.
This function must create the following files in outputdir:
- `recordings.jsonl`
- `annotations.jsonl` or `annotations-<subset>.jsonl` for each subset
That's it, you can now use
```julia
dataset("name", inputdir, outputdir; <keyword arguments>)
```
## DatasetBuilder and utilities
```@docs
DatasetBuilder
DatasetBuilder(name::Symbol)
SpeechDatasets.declareBuilder(name::Symbol)
get_kwargs(func_name::Function, args_types::Tuple)
get_nametype(name::String)
Base.download
prepare
```
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# SPDX-License-Identifier: CECILL-2.1 # SPDX-License-Identifier: CECILL-C
module SpeechDatasets module SpeechDatasets
using JSON using JSON
import AudioSources
using SpeechFeatures using SpeechFeatures
import MLUtils import MLUtils
...@@ -16,36 +17,38 @@ export ...@@ -16,36 +17,38 @@ export
writemanifest, writemanifest,
readmanifest, readmanifest,
# Corpora interface
download,
lang,
name,
prepare,
# Corpora
MultilingualLibriSpeech,
MINILIBRISPEECH,
TIMIT,
INADIACHRONY,
AVID,
SPEECH2TEX,
# Lexicon # Lexicon
CMUDICT, CMUDICT,
TIMITDICT, TIMITDICT,
MFAFRDICT, MFAFRDICT,
# Builder
DatasetBuilder,
get_kwargs,
get_dataset_kwargs,
get_nametype,
download,
prepare,
# Dataset # Dataset
SpeechDatasetInfos,
SpeechDataset,
summary,
dataset dataset
include("speechcorpus.jl") const corpora_file = joinpath(@__DIR__, "corpora", "corpora.json")
const corpora_names = map(c -> c["name"], JSON.parsefile(corpora_file))
include("manifest_item.jl") include("manifest_item.jl")
include("manifest_io.jl") include("manifest_io.jl")
include("builder.jl")
include("dataset.jl") include("dataset.jl")
# Supported corpora # Supported corpora
include.("corpora/".*filter(contains(r".jl$"), readdir("src/corpora/"))) include.("corpora/".*filter(contains(r"\.jl$"), readdir(joinpath(@__DIR__, "corpora"))))
include("lexicons.jl") include("lexicons.jl")
# declare all supported builders
declareBuilder.(get_nametype.(corpora_names))
end end