diff --git a/README.md b/README.md
index 111a5674beb9157057dbaa6dd1037b72b2e1e0ba..cf9522ead895a430f17869ce0f723c81ad675368 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # DiffWave
 **Note**
-This code is an adaptation of the original work by the LMNT team. [Original repo link](https://img.shields.io/github/license/lmnt-com/diffwave); the version used is 0.17.
+This code is an adaptation of the original work by the LMNT team. [Original repo link](https://github.com/lmnt-com/diffwave); version 0.17.
 
 **Part of the original README follows here:**
 
@@ -39,6 +39,12 @@
 tensorboard --logdir /path/to/model/dir --bind_all
 
 You should expect to hear intelligible (but noisy) speech by ~8k steps (~1.5h on a 2080 Ti).
 
+#### Training with trajectories
+You don't need to run any preprocessing; just point the training script at the `.npy` file containing the trajectories:
+```
+python src/diffwave/__main__.py /path/to/model/dir /path/to/file/containing/trajectories --data_type trajectories_x  # or trajectories if you want to use 3D data
+```
+
 #### Multi-GPU training
 By default, this implementation uses as many GPUs in parallel as returned by [`torch.cuda.device_count()`](https://pytorch.org/docs/stable/cuda.html#torch.cuda.device_count). You can specify which GPUs to use by setting the [`CUDA_DEVICES_AVAILABLE`](https://developer.nvidia.com/blog/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/) environment variable before running the training module.
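The loaders introduced below read that file with `np.load` as a single array of shape (N, T, 3): N trajectories, T timesteps, and the x/y/z components per step; `--data_type trajectories_x` keeps only component 0. A minimal sketch of writing such a file (the array sizes and the file name are placeholders, not part of the repo):

```python
import numpy as np

# Hypothetical example: 1000 trajectories of 2000 timesteps with (x, y, z) per step.
# The trajectory length here matches audio_len = 2000 from params.py.
trajectories = np.random.randn(1000, 2000, 3).astype(np.float32)

# ParticleDataset / ParticleDatasetVx call np.load on the path given on the command line,
# so that path should point at this single .npy file.
np.save('trajectories.npy', trajectories)
```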
diff --git a/src/data/lagrangian_datatools.py b/src/data/lagrangian_datatools.py
index c1810337330f760827a8b34dafd5ec3bd1f03a51..deda09744e4eed35b77b1ebf9c746d861fe1d80a 100644
--- a/src/data/lagrangian_datatools.py
+++ b/src/data/lagrangian_datatools.py
@@ -4,13 +4,22 @@
 from torch.utils.data import Dataset
 from glob import glob
 
+class StandardScaler(object):
+    """Standardize a sample with precomputed mean and std."""
+
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, sample):
+        return (sample - self.mean) / self.std
+
 class ParticleDataset(Dataset):
-    def __init__(self, path, transform=None, for_diffwave = False):
+    def __init__(self, path, transform=None):
-        self.npy_filepath = glob(f'{path}/**/*.npy', recursive=True)
+        self.npy_filepath = path
         self.transform = transform
-        self.data = torch.Tensor(np.load(self.npy_filepath, encoding="ASCII", allow_pickle=True, mmap_mode='r+'))
-        self.for_diffwave = for_diffwave
+        self.data = np.load(self.npy_filepath, encoding="ASCII", allow_pickle=True, mmap_mode='r+')
 
     def __len__(self):
         return self.data.shape[0]
@@ -23,24 +32,17 @@
         if self.transform:
             part_traj = self.transform(part_traj)
-        # 1D convolutions in Pytorch are in the form (batch, channels, length)
-        # so we need to permute the dimensions for all the N samples
-        # from (T,3) to (3, T)
-        # N: number of trajectories, T: number of timesteps, 3: x,y,z coords
-        if (self.for_diffwave):
-            return {
-                'audio': part_traj.permute(1,0),
-                'spectrogram': None
-            }
-        return part_traj.permute(1,0)
+        return part_traj
 
+# This class could probably be merged with ParticleDataset by receiving
+# the coordinate index as an argument instead of hard-coding component 0,
+# but it is kept separate for now.
 class ParticleDatasetVx(Dataset):
-    def __init__(self, path, transform=None, for_diffwave = False):
+    def __init__(self, path, transform):
-        self.npy_filepath = glob(f'{path}/**/*.npy', recursive=True)
+        self.npy_filepath = path
         self.transform = transform
-        self.data = torch.Tensor(np.load(self.npy_filepath, encoding="ASCII", allow_pickle=True, mmap_mode='r+')[:,:,0]).unsqueeze(1)
-        self.for_diffwave = for_diffwave
+        self.data = np.load(self.npy_filepath, encoding="ASCII", allow_pickle=True, mmap_mode='r+')[:,:,0]
 
     def __len__(self):
         return self.data.shape[0]
@@ -53,14 +55,33 @@
         if self.transform:
             part_traj = self.transform(part_traj)
-        # 1D convolutions in Pytorch are in the form (batch, channels, length)
-        # so we need to permute the dimensions for all the N samples
-        # from (T,3) to (3, T)
-        # N: number of trajectories, T: number of timesteps, 3: x,y,z coords
-        if (self.for_diffwave):
-            return {
-                'audio': part_traj.permute(1,0),
-                'spectrogram': None
-            }
-
-        return part_traj.permute(1,0)
\ No newline at end of file
+        return part_traj
+
+# This collator could be dropped by adapting the train() method in
+# learner.py, which would likely require a new parameter on the model
+# class to signal whether we are training on trajectories; for now a
+# command-line argument carries that information.
+class CollatorForDiffwave:
+    def __init__(self):
+        pass
+
+    def collate(self, minibatch):
+        trajectories = np.stack([record['audio'] for record in minibatch])
+        return {
+            'audio': torch.from_numpy(trajectories),
+            'spectrogram': None,
+        }
+
+
+class ToDiffwaveTensor(object):
+    """Convert ndarrays in sample to Tensors."""
+
+    def __call__(self, sample):
+        trajectory = sample
+
+        return {
+            'audio': torch.from_numpy(trajectory).float(),
+            'spectrogram': None
+        }
+
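Taken together, these pieces are meant to chain as dataset → transform → collator, mirroring what `from_path` in `dataset.py` does further down. A minimal standalone sketch (the file path and the normalization statistics are placeholders):

```python
from torch.utils.data import DataLoader
from torchvision import transforms

from data.lagrangian_datatools import (
    ParticleDatasetVx, StandardScaler, ToDiffwaveTensor, CollatorForDiffwave)

# Placeholder statistics and file name; the real values are wired up in dataset.py.
transform = transforms.Compose([
    StandardScaler(mean=0.0, std=1.0),
    ToDiffwaveTensor(),  # wraps each trajectory into {'audio': tensor, 'spectrogram': None}
])
dataset = ParticleDatasetVx(path='trajectories.npy', transform=transform)
loader = DataLoader(dataset, batch_size=16, collate_fn=CollatorForDiffwave().collate)

batch = next(iter(loader))
print(batch['audio'].shape)  # (16, T): one 1-D "audio" signal per trajectory
```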
diff --git a/src/diffwave/dataset.py b/src/diffwave/dataset.py
index 87ad6743f43c2d4422b9706ee1dd411fa860282e..31c7b148d62e19494b2e9c14fd03a556513840c9 100644
--- a/src/diffwave/dataset.py
+++ b/src/diffwave/dataset.py
@@ -22,10 +22,12 @@ import random
 import torch
 import torch.nn.functional as F
 import torchaudio
+from torchvision import transforms
 from glob import glob
 from torch.utils.data.distributed import DistributedSampler
 
-from data.lagrangian_datatools import ParticleDataset, ParticleDatasetVx
+from data.lagrangian_datatools import ParticleDataset, ParticleDatasetVx, StandardScaler, ToDiffwaveTensor
+from data.lagrangian_datatools import CollatorForDiffwave as LagrangianCollator
 
 class ConditionalDataset(torch.utils.data.Dataset):
   def __init__(self, paths):
@@ -60,7 +62,6 @@ class UnconditionalDataset(torch.utils.data.Dataset):
   def __getitem__(self, idx):
     audio_filename = self.filenames[idx]
-    spec_filename = f'{audio_filename}.spec.npy'
     signal, _ = torchaudio.load(audio_filename)
     return {
         'audio': signal[0],
@@ -85,7 +86,7 @@ class Collator:
        start = random.randint(0, record['audio'].shape[-1] - self.params.audio_len)
        end = start + self.params.audio_len
        record['audio'] = record['audio'][start:end]
-        record['audio'] = np.pad(record['audio'], (0, (end - start) - len(record['audio'])), mode='constant')
+        record['audio'] = np.pad(record['audio'], (0, (end - start) - len(record['audio'])), mode='constant') # unclear why this is still needed; the crop above already has length audio_len
      else:
        # Filter out records that aren't long enough.
        if len(record['spectrogram']) < self.params.crop_mel_frames:
@@ -142,21 +143,26 @@
 
 def from_path(args, params, is_distributed=False):
   data_dirs = args.data_dirs
-  if params.unconditional:
+  if params.unconditional and 'trajectories' not in args.data_type:
     dataset = UnconditionalDataset(data_dirs)
   else:
     if args.data_type == 'trajectories':
-      dataset = ParticleDataset(path = data_dirs)
+      dataset = ParticleDataset(path = data_dirs[0], transform=ToDiffwaveTensor())
     elif args.data_type == 'trajectories_x':
-      dataset = ParticleDatasetVx(path = data_dirs)
+      # the mean and standard deviation were previously computed
+      # in an exploration notebook
+      dataset = ParticleDatasetVx(path = data_dirs[0],
+                                  transform=transforms.Compose([StandardScaler(mean=-0.0003, std=1.7358),
+                                                                transforms.Lambda(lambda t: (t * 2) - 1),
+                                                                ToDiffwaveTensor()]))
     else: #with condition
       dataset = ConditionalDataset(data_dirs)
   return torch.utils.data.DataLoader(
       dataset,
       batch_size=params.batch_size,
-      collate_fn= None if args.data_type == 'trajectories' else Collator(params).collate,
+      collate_fn=LagrangianCollator().collate if 'trajectories' in args.data_type else Collator(params).collate,
       shuffle=not is_distributed,
-      num_workers=os.cpu_count(),
+      num_workers=2, # os.cpu_count()
       sampler=DistributedSampler(dataset) if is_distributed else None,
       pin_memory=True,
       drop_last=True)
@@ -173,3 +179,5 @@ def from_gtzan(params, is_distributed=False):
       sampler=DistributedSampler(dataset) if is_distributed else None,
       pin_memory=True,
       drop_last=True)
+
+
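The hard-coded statistics above (mean=-0.0003, std=1.7358) come from an exploration notebook that is not part of this diff. If they ever need to be recomputed for a different trajectory file, a sketch along these lines should be enough (the file name is a placeholder):

```python
import numpy as np

# x component only, matching what ParticleDatasetVx passes to StandardScaler
vx = np.load('trajectories.npy', allow_pickle=True)[:, :, 0]
print(f'mean={vx.mean():.4f}, std={vx.std():.4f}')
```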
diff --git a/src/diffwave/learner.py b/src/diffwave/learner.py
index 94ef21a3ffcd0089e675e747228f1abb625ecc79..5566138976a424ca39b3525df31a6240e6af8c08 100644
--- a/src/diffwave/learner.py
+++ b/src/diffwave/learner.py
@@ -111,7 +111,7 @@ class DiffWaveLearner:
         if self.is_master:
           if self.step % 50 == 0:
             self._write_summary(self.step, features, loss)
-          if self.step % len(self.dataset) == 0:
+          if self.step % self.params.checkpoints_hop == 0:
             self.save_to_checkpoint()
         self.step += 1
 
@@ -122,7 +122,7 @@
     audio = features['audio']
     spectrogram = features['spectrogram']
 
-    N, T = audio.shape
+    N, _ = audio.shape
     device = audio.device
     self.noise_level = self.noise_level.to(device)
@@ -149,6 +149,7 @@
     if not self.params.unconditional:
       writer.add_image('feature/spectrogram', torch.flip(features['spectrogram'][:1], [1]), step)
     writer.add_scalar('train/loss', loss, step)
+    # the following line will print a warning if the audio amplitude is out of range
     writer.add_scalar('train/grad_norm', self.grad_norm, step)
     writer.flush()
     self.summary_writer = writer
diff --git a/src/diffwave/model.py b/src/diffwave/model.py
index 40c834c2f00143e97098818134a4f863fa0edcf2..0691fe3371c461d81e1cfa6130e1c3453f7ca3a9 100644
--- a/src/diffwave/model.py
+++ b/src/diffwave/model.py
@@ -147,7 +147,7 @@ class DiffWave(nn.Module):
   def forward(self, audio, diffusion_step, spectrogram=None):
     assert (spectrogram is None and self.spectrogram_upsampler is None) or \
            (spectrogram is not None and self.spectrogram_upsampler is not None)
-    x = audio.unsqueeze(1)
+    x = audio.unsqueeze(1) # note: this unsqueeze could be moved into the dataloader instead
     x = self.input_projection(x)
     x = F.relu(x)
diff --git a/src/diffwave/params.py b/src/diffwave/params.py
index 4b672c2047501750164530033ccec1503c05a256..72458881df07e0b76599076df0e41155c5e9ece7 100644
--- a/src/diffwave/params.py
+++ b/src/diffwave/params.py
@@ -31,7 +31,7 @@ class AttrDict(dict):
       raise NotImplementedError
     return self
 
-"""
+
 params = AttrDict(
     # Training params
     batch_size=16,
@@ -39,6 +39,7 @@ params = AttrDict(
     max_grad_norm=None,
 
     # Data params
+    # these are actually not used for trajectories
     sample_rate=2000,
     n_mels=80,
     n_fft=1024,
@@ -46,19 +47,20 @@
     crop_mel_frames=62, # Probably an error in paper.
 
     # Model params
-    residual_layers=10,
-    residual_channels=64,
-    dilation_cycle_length=10, # with this config and residual layers = 10, we get r=2047
+    residual_layers=8,
+    residual_channels=32,
+    dilation_cycle_length=8, # with this config and residual layers = 8, we get r=511
     unconditional = True,
-    noise_schedule=np.linspace(1e-4, 0.01, 10).tolist(), # last param is num_timesteps
-    inference_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5], # for fast sampling
-    audio_len = 2000 # length of generated samples
+    noise_schedule=np.linspace(1e-4, 0.01, 200).tolist(), # last param is num_timesteps
+    inference_noise_schedule=[1e-4, 0.001, 0.01, 0.05, 0.2, 0.5], # for fast sampling
+    audio_len = 2000, # length of generated samples
+    checkpoints_hop = 50000 # how often to save a checkpoint (in steps)
 )
-    """
+
 
 # This is the original params dictionary
-params = AttrDict(
+""" params = AttrDict(
     # Training params
     batch_size=16,
     learning_rate=2e-4,
@@ -75,10 +77,10 @@
     residual_layers=30,
     residual_channels=64, # 64 or 128 for a larger model (diffwave large)
     dilation_cycle_length=10,
-    unconditional = False,
+    unconditional = True,
     noise_schedule=np.linspace(1e-4, 0.05, 50).tolist(),
     inference_noise_schedule=[0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
 
     # unconditional sample len
     audio_len = 22050*5, # unconditional_synthesis_samples
-)
+) """
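For reference, the `r` values quoted in the comments follow from the dilated convolutions in `model.py`: each residual layer uses a kernel of size 3 with dilation `2**(i % dilation_cycle_length)`, so the receptive field is 1 + 2 * sum(dilations). A quick check of both configurations (standalone sketch, not part of the repo):

```python
def receptive_field(residual_layers, dilation_cycle_length, kernel_size=3):
    # dilations cycle through 1, 2, 4, ... 2**(dilation_cycle_length - 1)
    dilations = [2 ** (i % dilation_cycle_length) for i in range(residual_layers)]
    return 1 + (kernel_size - 1) * sum(dilations)

print(receptive_field(10, 10))  # 2047, matching the old comment
print(receptive_field(8, 8))    # 511, the trajectory config
```

With r = 511, a single stack of residual layers never sees a whole 2000-step trajectory (audio_len = 2000) at once.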