Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# SPDX-License-Identifier: CECILL-2.1
"""
    abstract type AbstractAudioSource end

Base type for all audio sources. Possible audio sources are:
* `FileAudioSource`
* `URLAudioSource`
* `CmdAudioSource`

You can load the data of an audio source with the internal function

    loadsource(s::AbstractAudioSource, subrange)
"""
abstract type AbstractAudioSource end
"""
    struct FileAudioSource <: AbstractAudioSource
        path::AbstractString
    end

Audio source identified by a `path`. `loadsource` hands `path` to `wavread`.
"""
struct FileAudioSource <: AbstractAudioSource
    path::AbstractString
end
"""
    struct URLAudioSource <: AbstractAudioSource
        url::AbstractString
    end

Audio source identified by a `url`. `loadsource` fetches it with `HTTP.get`
and passes the response body to `wavread`.
"""
struct URLAudioSource <: AbstractAudioSource
    url::AbstractString
end
"""
    struct CmdAudioSource <: AbstractAudioSource
        cmd
    end

Audio source backed by a command. `loadsource` reads the bytes produced by
`pipeline(cmd)` and passes them to `wavread`. The `cmd` field is untyped; a
`String` argument is converted to a `Cmd` by the `CmdAudioSource(::String)`
constructor.
"""
struct CmdAudioSource <: AbstractAudioSource
    cmd
end
"""
    CmdAudioSource(c::String)

Build a `CmdAudioSource` from a command line given as a single string. The
string is split on whitespace and each piece becomes one argument of the
resulting `Cmd`; shell-style quoting is not interpreted.
"""
function CmdAudioSource(c::String)
    args = String[String(word) for word in split(c)]
    return CmdAudioSource(Cmd(args))
end
"""
    loadsource(s::AbstractAudioSource, subrange)

Read the WAV data behind the audio source `s` and return the result of
`wavread`. `subrange` is forwarded unchanged as `wavread`'s `subrange`
keyword.
"""
function loadsource(s::FileAudioSource, subrange)
    return wavread(s.path; subrange = subrange)
end

function loadsource(s::URLAudioSource, subrange)
    # Fetch the resource, then decode the WAV data from the in-memory body.
    response = HTTP.get(s.url)
    return wavread(IOBuffer(response.body); subrange = subrange)
end

function loadsource(s::CmdAudioSource, subrange)
    # Collect everything the command produces, then decode it from memory.
    bytes = read(pipeline(s.cmd))
    return wavread(IOBuffer(bytes); subrange = subrange)
end
"""
    abstract type ManifestItem end

Base type for all manifest items. Every manifest item should have an
`id` attribute.
"""
abstract type ManifestItem end
"""
    struct Recording{Ts<:AbstractAudioSource} <: ManifestItem
        id::AbstractString
        source::Ts
        channels::Vector{Int}
        samplerate::Int
    end

A recording is an audio source associated with an id.

# Constructors

    Recording(id, source, channels, samplerate)
    Recording(id, source[; channels = missing, samplerate = missing])

If the channels or the sample rate are not provided then they will be
read from `source`.

!!! warning
    When preparing a large corpus, not providing the channels and/or the
    sample rate can drastically reduce the speed as it forces reading the
    source.
"""
struct Recording{Ts<:AbstractAudioSource} <: ManifestItem
    id::AbstractString
    source::Ts
    channels::Vector{Int}
    samplerate::Int
end
# Keyword constructor: build a `Recording` for the audio source `s`, reading
# the source to fill in whichever of `channels` / `samplerate` was left
# `missing`. When both are supplied the source is not read at all.
#
# - `uttid`: identifier stored in the `id` field.
# - `s`: the audio source.
# - `channels`: channel indices; defaults to every column of the loaded signal.
# - `samplerate`: sampling rate; defaults to the rate reported by the source,
#   converted with `Int` (which throws `InexactError` for a non-integer rate).
function Recording(uttid, s::AbstractAudioSource; channels = missing, samplerate = missing)
    if ismissing(channels) || ismissing(samplerate)
        # Read through `loadsource`: the `load` methods defined in this file
        # take a `Recording`, not a bare audio source. `:` requests the whole
        # signal; only the first two returned values (samples, rate) are used.
        x, sr = loadsource(s, :)
        samplerate = ismissing(samplerate) ? Int(sr) : samplerate
        channels = ismissing(channels) ? collect(1:size(x, 2)) : channels
    end
    return Recording(uttid, s, channels, samplerate)
end
#function Base.show(io::IO, ::MIME"text/html", r::Recording)
# x, fs = load(r)
# wavwrite(x, "test.wav", Fs = fs)
# println(io, "<audio controls>")
# println(io, "<source src=\"file://test.wav\" type=\"audio/wav\">")
# #println(io, "Your browser does not support the audio element.")
# print(io, "</audio>")
#end
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
    struct Supervision <: ManifestItem
        id::AbstractString
        recording_id::AbstractString
        start::Float64
        duration::Float64
        channel::Int
        data::Dict
    end

A "supervision" defines a segment of a recording on a single channel.
The `data` field is an arbitrary dictionary holding the nature of the
supervision.

# Constructors

    Supervision(id, recording_id, start, duration, channel, data)
    Supervision(id, recording_id[; channel = missing, start = -1, duration = -1, data = missing])

If `start` and/or `duration` are negative, the segment is considered to
be the whole sequence length of the recording.
"""
struct Supervision <: ManifestItem
    id::AbstractString
    recording_id::AbstractString
    start::Float64
    duration::Float64
    channel::Int
    data::Dict
end
"""
    Supervision(id, recid; channel = missing, start = -1, duration = -1, data = missing)

Keyword constructor for `Supervision`.

A `missing` `channel` falls back to channel `1` and a `missing` `data` falls
back to an empty `Dict`. Explicitly passed values are forwarded unchanged.
"""
function Supervision(id, recid; channel = missing, start = -1, duration = -1, data = missing)
    # The `channel::Int` and `data::Dict` fields cannot hold `missing`;
    # forwarding the defaults as-is makes the field constructor throw a
    # conversion error, so substitute concrete values first.
    chan = ismissing(channel) ? 1 : channel
    payload = ismissing(data) ? Dict() : data
    return Supervision(id, recid, start, duration, chan, payload)
end
"""
    load(recording[; start = -1, duration = -1, channels = recording.channels])
    load(recording, supervision)

Load the signal from a recording. `start` and `duration` (in seconds) can
be used to load only a segment; if either one is negative the whole signal
is loaded. If a `supervision` is given, the function returns only the
portion of the signal corresponding to the supervision segment.

The function returns a tuple `(x, sr)` where `x` is a ``N \\times C`` array
- ``N`` is the length of the signal and ``C`` is the number of channels -
and `sr` is the sampling rate of the signal.
"""
function load(r::Recording; start = -1, duration = -1, channels = r.channels)
    if start >= 0 && duration >= 0
        # 1-based index of the first sample of the segment.
        first_sample = Int(floor(start * r.samplerate + 1))
        # The segment lasts `duration` seconds counted from `start`, so the
        # last sample is offset from `first_sample`, not from the beginning of
        # the signal. With `start == 0` this is `1:ceil(duration * samplerate)`.
        nsamples = Int(ceil(duration * r.samplerate))
        subrange = first_sample:(first_sample + nsamples - 1)
    else
        subrange = (:)
    end
    x, sr, _, _ = loadsource(r.source, subrange)
    return x[:, channels], sr
end
# Load the part of `r` described by the supervision `s`: its `start`,
# `duration` and single `channel` are forwarded to the keyword method.
function load(r::Recording, s::Supervision)
    return load(r; start = s.start, duration = s.duration, channels = [s.channel])
end