Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
SpeechDatasets.jl
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
PTAL
Datasets
SpeechDatasets.jl
Commits
2fea769a
Commit
2fea769a
authored
9 months ago
by
Nicolas Denier
Browse files
Options
Downloads
Patches
Plain Diff
fix text file reading, update readme
parent
b24e4214
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
.gitignore
+1
-1
1 addition, 1 deletion
.gitignore
README.md
+7
-3
7 additions, 3 deletions
README.md
src/corpora/ina_diachrony.jl
+42
-32
42 additions, 32 deletions
src/corpora/ina_diachrony.jl
with
50 additions
and
36 deletions
.gitignore
+
1
−
1
View file @
2fea769a
outputdir/
*
outputdir/
This diff is collapsed.
Click to expand it.
README.md
+
7
−
3
View file @
2fea769a
...
@@ -7,7 +7,7 @@ A Julia package to download and prepare speech corpus.
...
@@ -7,7 +7,7 @@ A Julia package to download and prepare speech corpus.
Make sure to add the
[
FAST registry
](
https://gitlab.lisn.upsaclay.fr/fast/registry
)
Make sure to add the
[
FAST registry
](
https://gitlab.lisn.upsaclay.fr/fast/registry
)
to your julia installation. Then, install the package as usual:
to your julia installation. Then, install the package as usual:
```
```
pkg> add Speech
Corpora
pkg> add Speech
Datasets
```
```
## Example
## Example
...
@@ -18,9 +18,13 @@ julia> using SpeechDatasets
...
@@ -18,9 +18,13 @@ julia> using SpeechDatasets
julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test
julia> dataset = MINILIBRISPEECH("outputdir", :train) # :dev | :test
...
...
julia> dataset =
MINILIBRISPEECH
("/path/to/timit/dir", "outputdir", :train) # :dev | :test
julia> dataset =
TIMIT
("/path/to/timit/dir", "outputdir", :train) # :dev | :test
...
...
julia> dataset = INADIACHRONY("/path/to/ina_wav/dir", "outputdir", "/path/to/ina_csv/dir") # ina_csv dir optional
...
julia> for ((signal, fs), supervision) in dataset
julia> for ((signal, fs), supervision) in dataset
# do something
# do something
end
end
...
@@ -36,5 +40,5 @@ julia> TIMITDICT("/path/to/timit/dir")
...
@@ -36,5 +40,5 @@ julia> TIMITDICT("/path/to/timit/dir")
## License
## License
This software is provided under the CeCILL 2.1 license (see the
[
`/LICENSE`
](
/LICENSE
)
This software is provided under the CeCILL 2.1 license (see the
[
`/LICENSE`
](
/LICENSE
)
)
This diff is collapsed.
Click to expand it.
src/corpora/ina_diachrony.jl
+
42
−
32
View file @
2fea769a
...
@@ -3,9 +3,12 @@
...
@@ -3,9 +3,12 @@
const
AUDIO_PATH
=
"/vol/work1/rilliard/diachronie/normal"
const
AUDIO_PATH
=
"/vol/work1/rilliard/diachronie/normal"
const
TRANSCRIPTION_PATH
=
"/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"
const
TRANSCRIPTION_PATH
=
"/vol/work1/rilliard/diachronie/whisper_diachronik/fr/tc_trs__modified"
"""
    checkdir(dir::AbstractString)

Ensure that `dir` names an existing directory.

Return `true` when `isdir(dir)` holds; otherwise throw an `ArgumentError`
whose message names the offending path.
"""
function checkdir(dir::AbstractString)
    if !isdir(dir)
        throw(ArgumentError("$dir is not an existing directory"))
    end
    return true
end
function
ina_diachrony_recordings
(
dir
::
AbstractString
)
function
ina_diachrony_recordings
(
dir
::
AbstractString
)
!
isdir
(
dir
)
&&
throw
(
ArgumentError
(
"expected directory
$
dir
"
)
)
checkdir
(
dir
)
recordings
=
Dict
()
recordings
=
Dict
()
for
(
root
,
subdirs
,
files
)
in
walkdir
(
dir
)
for
(
root
,
subdirs
,
files
)
in
walkdir
(
dir
)
...
@@ -13,7 +16,7 @@ function ina_diachrony_recordings(dir::AbstractString)
...
@@ -13,7 +16,7 @@ function ina_diachrony_recordings(dir::AbstractString)
filename
,
ext
=
splitext
(
file
)
filename
,
ext
=
splitext
(
file
)
ext
!=
".wav"
&&
continue
ext
!=
".wav"
&&
continue
id
=
"ina_diachrony§
$
(
filename
)
"
id
=
"ina_diachrony§
$
filename"
path
=
joinpath
(
root
,
file
)
path
=
joinpath
(
root
,
file
)
audio_src
=
FileAudioSource
(
path
)
audio_src
=
FileAudioSource
(
path
)
...
@@ -30,8 +33,17 @@ function ina_diachrony_recordings(dir::AbstractString)
...
@@ -30,8 +33,17 @@ function ina_diachrony_recordings(dir::AbstractString)
end
end
"""
    get_metadata(filename)

Extract the metadata encoded in `filename`.

The name is split on `§`; the first field is the time period, the second
field is split again on `_` to yield age and sex, and the third field is the
speaker. Any further fields are ignored.

Return the tuple `(timeperiod, age, sex, speaker)`. A name with fewer fields
than accessed here raises a `BoundsError` from the indexing.
"""
function get_metadata(filename)
    fields = split(filename, "§")
    # Second field carries two values joined by an underscore: age then sex.
    demographics = split(fields[2], "_")
    age, sex = demographics[1], demographics[2]
    return fields[1], age, sex, fields[3]
end
function
ina_diachrony_annotations_whole
(
dir
)
function
ina_diachrony_annotations_whole
(
dir
)
!
isdir
(
dir
)
&&
throw
(
ArgumentError
(
"expected directory
$
dir
"
)
)
checkdir
(
dir
)
annotations
=
Dict
()
annotations
=
Dict
()
...
@@ -39,26 +51,24 @@ function ina_diachrony_annotations_whole(dir)
...
@@ -39,26 +51,24 @@ function ina_diachrony_annotations_whole(dir)
for
file
in
files
for
file
in
files
filename
,
ext
=
splitext
(
file
)
filename
,
ext
=
splitext
(
file
)
ext
!=
".wav"
&&
continue
ext
!=
".wav"
&&
continue
metadata
=
split
(
filename
,
"§"
)
timeperiod
=
metadata
[
1
]
age
,
sex
=
split
(
metadata
[
2
],
"_"
)
speaker
=
metadata
[
3
]
id
=
"ina_diachrony§
$(filename)
"
# extract text
textfilename
=
"
$(filename)
.txt"
text
=
isfile
(
textfilename
)
?
readlines
(
textfilename
)
:
""
# extract metadata from filename
timeperiod
,
age
,
sex
,
speaker
=
get_metadata
(
filename
)
# extract transcription text (same filename but .txt)
textfilepath
=
joinpath
(
root
,
"
$
filename.txt"
)
text
=
isfile
(
textfilepath
)
?
join
(
readlines
(
textfilepath
),
"
\n
"
)
:
""
id
=
"ina_diachrony§
$
filename"
annotation_id
=
id
*
"§0"
annotation_id
=
id
*
"§0"
# generate annotation
annotations
[
annotation_id
]
=
Annotation
(
annotations
[
annotation_id
]
=
Annotation
(
id
,
# audio id
id
,
# audio id
annotation_id
,
# annotation id
annotation_id
,
# annotation id
-
1
,
# start and duration is -1 means that we take the whole
-
1
,
# start and duration is -1 means that we take the whole
-
1
,
# recording
-
1
,
# recording
[
1
],
# only 1 channel (mono recording)
[
1
],
# only 1 channel (mono recording)
Dict
(
Dict
(
# additional informations
"text"
=>
text
,
"text"
=>
text
,
"speaker"
=>
speaker
,
"speaker"
=>
speaker
,
"timeperiod"
=>
timeperiod
,
"timeperiod"
=>
timeperiod
,
...
@@ -71,8 +81,9 @@ function ina_diachrony_annotations_whole(dir)
...
@@ -71,8 +81,9 @@ function ina_diachrony_annotations_whole(dir)
annotations
annotations
end
end
function
ina_diachrony_annotations_csv
(
dir
)
function
ina_diachrony_annotations_csv
(
dir
)
!
isdir
(
dir
)
&&
throw
(
ArgumentError
(
"expected directory
$
dir
"
)
)
checkdir
(
dir
)
annotations
=
Dict
()
annotations
=
Dict
()
...
@@ -81,13 +92,11 @@ function ina_diachrony_annotations_csv(dir)
...
@@ -81,13 +92,11 @@ function ina_diachrony_annotations_csv(dir)
filename
,
ext
=
splitext
(
file
)
filename
,
ext
=
splitext
(
file
)
ext
!=
".csv"
&&
continue
ext
!=
".csv"
&&
continue
metadata
=
split
(
filename
,
"§"
)
# extract metadata from filename
timeperiod
=
metadata
[
1
]
timeperiod
,
age
,
sex
,
speaker
=
get_metadata
(
filename
)
age
,
sex
=
split
(
metadata
[
2
],
"_"
)
speaker
=
metadata
[
3
]
id
=
"ina_diachrony§
$(filename)
"
id
=
"ina_diachrony§
$
filename"
# generate annotation for each line in csv
open
(
joinpath
(
root
,
file
))
do
f
open
(
joinpath
(
root
,
file
))
do
f
header
=
readline
(
f
)
header
=
readline
(
f
)
line
=
1
line
=
1
...
@@ -97,14 +106,14 @@ function ina_diachrony_annotations_csv(dir)
...
@@ -97,14 +106,14 @@ function ina_diachrony_annotations_csv(dir)
start_time
,
end_time
,
text
=
split
(
current_line
,
","
,
limit
=
3
)
start_time
,
end_time
,
text
=
split
(
current_line
,
","
,
limit
=
3
)
start_time
=
parse
(
Float64
,
start_time
)
start_time
=
parse
(
Float64
,
start_time
)
duration
=
parse
(
Float64
,
end_time
)
-
start_time
duration
=
parse
(
Float64
,
end_time
)
-
start_time
annotation_id
=
id
*
"§
$
(
line
)
"
annotation_id
=
id
*
"§
$
line"
annotations
[
id
]
=
Annotation
(
annotations
[
id
]
=
Annotation
(
id
,
# audio id
id
,
# audio id
annotation_id
,
# annotation id
annotation_id
,
# annotation id
start_time
,
# start
start_time
,
# start
duration
,
# duration
duration
,
# duration
[
1
],
# only 1 channel (mono recording)
[
1
],
# only 1 channel (mono recording)
Dict
(
Dict
(
# additional informations
"text"
=>
text
,
"text"
=>
text
,
"speaker"
=>
speaker
,
"speaker"
=>
speaker
,
"timeperiod"
=>
timeperiod
,
"timeperiod"
=>
timeperiod
,
...
@@ -125,7 +134,7 @@ end
...
@@ -125,7 +134,7 @@ end
function
ina_diachrony_prepare
(
ina_wav_dir
,
ina_csv_dir
,
outputdir
)
function
ina_diachrony_prepare
(
ina_wav_dir
,
ina_csv_dir
,
outputdir
)
# Validate the data directory
# Validate the data directory
for
d
in
[
ina_wav_dir
,
ina_csv_dir
]
for
d
in
[
ina_wav_dir
,
ina_csv_dir
]
!
isdir
(
d
)
&&
throw
(
ArgumentError
(
"invalid path
$(d)
"
)
)
isnothing
(
d
)
||
checkdir
(
d
)
end
end
# Create the output directory.
# Create the output directory.
...
@@ -143,12 +152,13 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
...
@@ -143,12 +152,13 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
# Annotations
# Annotations
@info
"Extracting annotations from
$
ina_wav_dir"
@info
"Extracting annotations from
$
ina_wav_dir"
whole_annotations
=
ina_diachrony_annotations_whole
(
ina_wav_dir
)
annotations
=
ina_diachrony_annotations_whole
(
ina_wav_dir
)
#@info "Extracting annotations from $ina_csv_dir"
if
!
isnothing
(
ina_csv_dir
)
#csv_annotations = ina_diachrony_annotations_csv(ina_csv_dir)
@info
"Extracting annotations from
$
ina_csv_dir"
#annotations = merge(whole_annotations, csv_annotations)
csv_annotations
=
ina_diachrony_annotations_csv
(
ina_csv_dir
)
annotations
=
whole_annotations
annotations
=
merge
(
annotations
,
csv_annotations
)
end
manifestpath
=
joinpath
(
outputdir
,
"annotations.jsonl"
)
manifestpath
=
joinpath
(
outputdir
,
"annotations.jsonl"
)
@info
"Creating
$
manifestpath"
@info
"Creating
$
manifestpath"
open
(
manifestpath
,
"w"
)
do
f
open
(
manifestpath
,
"w"
)
do
f
...
@@ -156,7 +166,7 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
...
@@ -156,7 +166,7 @@ function ina_diachrony_prepare(ina_wav_dir, ina_csv_dir, outputdir)
end
end
end
end
function
INADIACHRONY
(
ina_wav_dir
,
ina_csv_dir
,
outputdir
)
function
INADIACHRONY
(
ina_wav_dir
,
outputdir
,
ina_csv_dir
=
nothing
)
if
!
(
isfile
(
joinpath
(
outputdir
,
"recordings.jsonl"
))
&&
if
!
(
isfile
(
joinpath
(
outputdir
,
"recordings.jsonl"
))
&&
isfile
(
joinpath
(
outputdir
,
"annotations.jsonl"
)))
isfile
(
joinpath
(
outputdir
,
"annotations.jsonl"
)))
ina_diachrony_prepare
(
ina_wav_dir
,
ina_csv_dir
,
outputdir
)
ina_diachrony_prepare
(
ina_wav_dir
,
ina_csv_dir
,
outputdir
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment