Skip to content
Snippets Groups Projects
Commit 80de9082 authored by Maria Guaranda-Cabezas's avatar Maria Guaranda-Cabezas
Browse files

organizes code for inference, in favor of 1D trajectories

parent 6db8ebe8
No related branches found
No related tags found
1 merge request!4organizes code for inference, in favor of 1D trajectories
......@@ -42,7 +42,7 @@ You should expect to hear intelligible (but noisy) speech by ~8k steps (~1.5h on
#### Training with trajectories
You don't need to run any preprocessing.
```
python src/diffwave/__main__.py /path/to/model/dir /path/to/file/containing/trajectories --data_type trajectories_x # for 1D data (use `trajectories` for 3D data)
```
#### Multi-GPU training
......
......@@ -31,8 +31,10 @@ from diffwave.model import DiffWave
models = {}
STD = 1.7358
MEAN = -0.0003
def predict(spectrogram=None, model_dir=None, params=None, device=torch.device('cuda'), fast_sampling=False):
def predict(spectrogram=None, model_dir=None, params=None, device=torch.device('cuda'), fast_sampling=False, clamp=True):
# Lazy load model.
if not model_dir in models:
if os.path.exists(f'{model_dir}/weights.pt'):
......@@ -91,41 +93,56 @@ def predict(spectrogram=None, model_dir=None, params=None, device=torch.device('
noise = torch.randn_like(audio)
sigma = ((1.0 - alpha_cum[n-1]) / (1.0 - alpha_cum[n]) * beta[n])**0.5
audio += sigma * noise
audio = torch.clamp(audio, -1.0, 1.0)
#audio = torch.clamp(audio, -1.0, 1.0) if(params.audio_len != 2000) else audio
if clamp: # originally done for audio
audio = torch.clamp(audio, -1.0, 1.0)
return audio, model.params.sample_rate
def predict_audio(spectrogram, args):
    '''
    Generate audio samples with predict() and save them to disk.

    A single spectrogram (or None for unconditional sampling) is reused for
    every sample, so with args.num_samples > 1 the model re-samples the same
    conditioning input.

    :param spectrogram: conditioning spectrogram tensor, or None.
    :param args: parsed CLI namespace (model_dir, output, num_samples, fast, cpu).
    '''
    device = torch.device('cpu' if args.cpu else 'cuda')
    for i in range(args.num_samples):
        audio, sr = predict(spectrogram, model_dir=args.model_dir,
                            fast_sampling=args.fast, params=base_params,
                            device=device)
        # Give each sample its own file: writing all samples to args.output
        # verbatim would overwrite the previous iteration and persist only
        # the last one.
        if args.num_samples > 1:
            root, ext = os.path.splitext(args.output)
            output_path = f'{root}_{i}{ext}'
        else:
            output_path = args.output
        torchaudio.save(output_path, audio.cpu(), sample_rate=sr)
def predict_trajectories(args):
    '''
    Generate 1D trajectory samples with predict() and save them as one
    numpy array of shape (num_samples, num_timesteps, 1) at args.output.

    The inverse of the preprocessing transforms (undoing standardization
    with the dataset MEAN/STD) is applied to each sample before saving.
    The transforms are fixed, not configurable.

    :param args: parsed CLI namespace (model_dir, output, num_samples,
                 fast, cpu, clamp).
    '''
    # Loop-invariant: build the inverse transform once, not per sample.
    # Undo standardization, then convert to a float64 numpy array with
    # time as the leading axis.
    reverse_transform = Compose([
        Lambda(lambda t: (t * STD) + MEAN),
        Lambda(lambda t: t.numpy(force=True).astype(np.float64).transpose()),
    ])
    device = torch.device('cpu' if args.cpu else 'cuda')
    samples = []
    for _ in range(args.num_samples):
        trajectory, _ = predict(model_dir=args.model_dir,
                                fast_sampling=args.fast, params=base_params,
                                device=device, clamp=args.clamp)
        samples.append(reverse_transform(trajectory))
    # Stack into (num_samples, num_timesteps, 1) and save as a .npy file.
    trajectories = np.stack(samples, axis=0)
    with open(args.output, 'wb') as f:
        np.save(f, trajectories)
def main(args):
    '''Entry point: load optional conditioning input, then dispatch on --data_type.'''
    spectrogram = None
    if args.spectrogram_path:
        spectrogram = torch.from_numpy(np.load(args.spectrogram_path))
    if args.data_type == 'audio':
        predict_audio(spectrogram, args)
    elif args.data_type == 'trajectories_x':
        predict_trajectories(args)
    else:
        raise NotImplementedError
if __name__ == '__main__':
parser = ArgumentParser(description='runs inference on a spectrogram file generated by diffwave.preprocess')
......@@ -141,4 +158,14 @@ if __name__ == '__main__':
help='use cpu instead of cuda')
parser.add_argument('--num_samples', default=1, type=int,
help='number of samples to generate')
parser.add_argument('--data_type', default='audio', type=str,
help='indicate what type of data is being trained on (audio or other with custom dataloader)')
parser.add_argument('--clamp', '-c', default=True, type=bool,
help='clamp in [-1,1] when generating data')
main(parser.parse_args())
'''
Example usage:
python src/diffwave/inference.py ./models/weigths.pt -o ./path_to_file/that/stores/new_samples.npy \
--data_type trajectories_x --cpu --num_samples 100 --clamp False
'''
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment