Skip to content
Snippets Groups Projects
Commit 80de9082 authored by Maria Guaranda-Cabezas's avatar Maria Guaranda-Cabezas
Browse files

organizes code for inference, in favor of 1D trajectories

parent 6db8ebe8
No related branches found
No related tags found
1 merge request!4organizes code for inference, in favor of 1D trajectories
......@@ -42,7 +42,7 @@ You should expect to hear intelligible (but noisy) speech by ~8k steps (~1.5h on
#### Training with trajectories
You don't need to run any preprocessing.
```
python src/diffwave/__main__.py /path/to/model/dir /path/to/file/containing/trajectories --data_type trajectories_x # for 1D data (use `trajectories` for 3D data)
```
#### Multi-GPU training
......
......@@ -31,8 +31,10 @@ from diffwave.model import DiffWave
models = {}
STD = 1.7358
MEAN = -0.0003
def predict(spectrogram=None, model_dir=None, params=None, device=torch.device('cuda'), fast_sampling=False):
def predict(spectrogram=None, model_dir=None, params=None, device=torch.device('cuda'), fast_sampling=False, clamp=True):
# Lazy load model.
if not model_dir in models:
if os.path.exists(f'{model_dir}/weights.pt'):
......@@ -91,41 +93,56 @@ def predict(spectrogram=None, model_dir=None, params=None, device=torch.device('
noise = torch.randn_like(audio)
sigma = ((1.0 - alpha_cum[n-1]) / (1.0 - alpha_cum[n]) * beta[n])**0.5
audio += sigma * noise
audio = torch.clamp(audio, -1.0, 1.0)
#audio = torch.clamp(audio, -1.0, 1.0) if(params.audio_len != 2000) else audio
if clamp: # originally done for audio
audio = torch.clamp(audio, -1.0, 1.0)
return audio, model.params.sample_rate
def predict_audio(spectrogram, args):
    '''
    Generate audio samples with predict() and save them to disk.

    A single spectrogram (or None for unconditional sampling) is reused for
    every sample, so with args.num_samples > 1 the model re-samples the same
    conditioning input.

    :param spectrogram: conditioning spectrogram tensor, or None.
    :param args: parsed CLI namespace (model_dir, output, num_samples, fast, cpu).
    '''
    device = torch.device('cpu' if args.cpu else 'cuda')
    for i in range(args.num_samples):
        audio, sr = predict(spectrogram, model_dir=args.model_dir,
                            fast_sampling=args.fast, params=base_params,
                            device=device)
        # Give each sample its own file: writing all samples to args.output
        # verbatim would overwrite the previous iteration and persist only
        # the last one.
        if args.num_samples > 1:
            root, ext = os.path.splitext(args.output)
            output_path = f'{root}_{i}{ext}'
        else:
            output_path = args.output
        torchaudio.save(output_path, audio.cpu(), sample_rate=sr)
def predict_trajectories(args):
    '''
    Generate 1D trajectory samples with predict() and save them as one
    numpy array of shape (num_samples, num_timesteps, 1) at args.output.

    The inverse of the preprocessing transforms (undoing standardization
    with the dataset MEAN/STD) is applied to each sample before saving.
    The transforms are fixed, not configurable.

    :param args: parsed CLI namespace (model_dir, output, num_samples,
                 fast, cpu, clamp).
    '''
    # Loop-invariant: build the inverse transform once, not per sample.
    # Undo standardization, then convert to a float64 numpy array with
    # time as the leading axis.
    reverse_transform = Compose([
        Lambda(lambda t: (t * STD) + MEAN),
        Lambda(lambda t: t.numpy(force=True).astype(np.float64).transpose()),
    ])
    device = torch.device('cpu' if args.cpu else 'cuda')
    samples = []
    for _ in range(args.num_samples):
        trajectory, _ = predict(model_dir=args.model_dir,
                                fast_sampling=args.fast, params=base_params,
                                device=device, clamp=args.clamp)
        samples.append(reverse_transform(trajectory))
    # Stack into (num_samples, num_timesteps, 1) and save as a .npy file.
    trajectories = np.stack(samples, axis=0)
    with open(args.output, 'wb') as f:
        np.save(f, trajectories)
def main(args):
    '''Entry point: load optional conditioning input, then dispatch on --data_type.'''
    spectrogram = None
    if args.spectrogram_path:
        spectrogram = torch.from_numpy(np.load(args.spectrogram_path))
    if args.data_type == 'audio':
        predict_audio(spectrogram, args)
    elif args.data_type == 'trajectories_x':
        predict_trajectories(args)
    else:
        raise NotImplementedError
if __name__ == '__main__':
parser = ArgumentParser(description='runs inference on a spectrogram file generated by diffwave.preprocess')
......@@ -141,4 +158,14 @@ if __name__ == '__main__':
help='use cpu instead of cuda')
parser.add_argument('--num_samples', default=1, type=int,
help='number of samples to generate')
parser.add_argument('--data_type', default='audio', type=str,
help='indicate what type of data is being trained on (audio or other with custom dataloader)')
parser.add_argument('--clamp', '-c', default=True, type=bool,
help='clamp in [-1,1] when generating data')
main(parser.parse_args())
'''
Example usage:
python src/diffwave/inference.py ./models/weigths.pt -o ./path_to_file/that/stores/new_samples.npy \
--data_type trajectories_x --cpu --num_samples 100 --clamp False
'''
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment