import torch
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
from qa_mdt.audioldm_train.utilities.model_util import (
    exists,
    default,
    mean_flat,
    count_params,
    instantiate_from_config,
)
from torch.optim import *

from transformers import GPT2Config, GPT2Model, GPTJConfig, GPTJModel
import torch.optim.lr_scheduler as lr_scheduler


class Sequence2AudioMAE(pl.LightningModule):
    """Predict an AudioMAE embedding sequence from conditioning sequences.

    The conditioning embeddings named in ``sequence_input_key`` are projected
    to width 768, wrapped in learned start/end tokens, concatenated along the
    time axis and fed to a pretrained GPT-2 (``GPT2Model.from_pretrained("gpt2")``).
    The hidden states that follow the conditioning prefix are regressed with
    an L1 loss onto ``cond_dict["crossattn_audiomae_pooled"]``.
    """

    def __init__(
        self,
        base_learning_rate,
        sequence_gen_length,
        sequence_input_key,
        sequence_input_embed_dim,
        cond_stage_config,
        optimizer_type="AdamW",
        use_warmup=True,
        use_ar_gen_loss=False,
        use_audiomae_linear=False,
        target_tokens_mask_ratio=0.0,
        random_mask_ratio=False,
        **kwargs
    ):
        """Build the projection layers, conditioning models and GPT-2 backbone.

        Args:
            base_learning_rate: Learning rate handed to the optimizer and used
                as the warm-up target.
            sequence_gen_length: Number of target tokens to generate
                (stored as ``mae_token_num``).
            sequence_input_key: Iterable of ``cond_dict`` keys that form the
                input sequence, in order.
            sequence_input_embed_dim: Iterable of input embedding widths, one
                per entry of ``sequence_input_key``.
            cond_stage_config: Mapping from conditioning key to a config that
                ``instantiate_from_config`` can build; each entry must also
                carry ``cond_stage_key`` and ``conditioning_key``.
            optimizer_type: Name of a ``torch.optim`` optimizer class.
            use_warmup: Whether ``training_step`` applies ``warmup_step``.
            use_ar_gen_loss: Whether ``training_step`` adds the free-running
                loss from ``calculate_ahead_k_step_loss``.
            use_audiomae_linear: Must be falsy; kept for config compatibility.
            target_tokens_mask_ratio: Upper bound (or fixed value) of the
                masking ratio used by ``mask_target_sequence``.
            random_mask_ratio: If true, the mask ratio is drawn uniformly in
                ``[0, target_tokens_mask_ratio)`` on each call.
            **kwargs: Accepted and ignored.
        """
        super().__init__()
        assert not use_audiomae_linear, "use_audiomae_linear is not supported"
        self.random_mask_ratio = random_mask_ratio
        self.learning_rate = base_learning_rate
        self.cond_stage_config = cond_stage_config
        self.use_audiomae_linear = use_audiomae_linear
        self.optimizer_type = optimizer_type
        self.use_warmup = use_warmup
        self.use_ar_gen_loss = use_ar_gen_loss
        # Even though the LDM can be conditioned on multiple pooling rates,
        # our model always predicts the highest pooling rate.

        self.mae_token_num = sequence_gen_length
        self.sequence_input_key = sequence_input_key
        self.sequence_input_embed_dim = sequence_input_embed_dim
        self.target_tokens_mask_ratio = target_tokens_mask_ratio

        # One learned start/end token per input sequence index (up to 32
        # sequences); 768 is the embedding width fed to the GPT-2 model.
        self.start_of_sequence_tokens = nn.Embedding(32, 768)
        self.end_of_sequence_tokens = nn.Embedding(32, 768)

        self.input_sequence_embed_linear = nn.ModuleList([])
        self.initial_learning_rate = None

        for dim in self.sequence_input_embed_dim:
            self.input_sequence_embed_linear.append(nn.Linear(dim, 768))

        self.cond_stage_models = nn.ModuleList([])
        self.instantiate_cond_stage(cond_stage_config)
        self.initialize_param_check_toolkit()

        self.private_training_step = 0

        self.model = GPT2Model.from_pretrained("gpt2")

        self.loss_fn = nn.L1Loss()

        self.logger_save_dir = None
        self.logger_exp_name = None
        self.logger_exp_group_name = None
        self.logger_version = None

    def set_log_dir(self, save_dir, exp_group_name, exp_name):
        """Record where the experiment logger writes its output."""
        self.logger_save_dir = save_dir
        self.logger_exp_group_name = exp_group_name
        self.logger_exp_name = exp_name

    def cfg_uncond(self, batch_size):
        """Return the unconditional conditioning of every cond-stage model.

        The ``crossattn_audiomae_pooled`` entry is additionally exposed under
        ``crossattn_clap_to_audiomae_feature``.

        Raises:
            AssertionError: If no ``crossattn_audiomae_pooled`` model exists.
        """
        unconditional_conditioning = {}
        for key, metadata in self.cond_stage_model_metadata.items():
            cond_model = self.cond_stage_models[metadata["model_idx"]]
            unconditional_conditioning[key] = cond_model.get_unconditional_condition(
                batch_size
            )
        assert (
            "crossattn_audiomae_pooled" in unconditional_conditioning
        ), "The module is not initialized with AudioMAE"
        unconditional_conditioning[
            "crossattn_clap_to_audiomae_feature"
        ] = unconditional_conditioning["crossattn_audiomae_pooled"]
        return unconditional_conditioning

    def _resolve_optimizer_class(self):
        """Map ``self.optimizer_type`` to a ``torch.optim.Optimizer`` subclass.

        Accepts a plain class name ("AdamW"), a dotted one
        ("torch.optim.AdamW") or an optimizer class itself.

        Raises:
            ValueError: If the value does not name a ``torch.optim`` optimizer.
        """
        candidate = self.optimizer_type
        if isinstance(candidate, str):
            # Look the name up instead of eval()-ing a config-provided string.
            candidate = getattr(torch.optim, candidate.rsplit(".", 1)[-1], None)
        is_optimizer = isinstance(candidate, type) and issubclass(
            candidate, torch.optim.Optimizer
        )
        if not is_optimizer:
            raise ValueError(
                "Unknown optimizer_type %r; expected the name of a torch.optim "
                "optimizer class" % (self.optimizer_type,)
            )
        return candidate

    def configure_optimizers(self):
        """Create the optimizer over all parameters plus a StepLR scheduler."""
        lr = float(self.learning_rate)
        params = list(self.parameters())

        opt = self._resolve_optimizer_class()(params, lr=lr)
        scheduler = lr_scheduler.StepLR(opt, step_size=10, gamma=0.8)
        return [opt], [scheduler]

    def add_sos_eos_tokens(self, _id, sequence, attn_mask):
        """Wrap ``sequence`` in the learned start/end tokens of sequence ``_id``.

        Args:
            _id: Index of the input sequence; selects the token embeddings.
            sequence: Embeddings, concatenated along dim 1 (time).
            attn_mask: Attention mask matching ``sequence`` on dims 0 and 1.

        Returns:
            ``(new_sequence, new_attn_mask)``, each two steps longer on dim 1.
        """
        batchsize = sequence.size(0)

        new_attn_mask_step = torch.ones((batchsize, 1)).to(sequence.device)
        key_id = torch.tensor([_id]).to(sequence.device)

        # Add two more steps to the attention mask.
        new_attn_mask = torch.cat(
            [new_attn_mask_step, attn_mask, new_attn_mask_step], dim=1
        )

        # Add two more tokens to the sequence.
        sos_token = self.start_of_sequence_tokens(key_id).expand(batchsize, 1, -1)
        eos_token = self.end_of_sequence_tokens(key_id).expand(batchsize, 1, -1)
        new_sequence = torch.cat([sos_token, sequence, eos_token], dim=1)
        return new_sequence, new_attn_mask

    def truncate_sequence_and_mask(self, sequence, mask, max_len=512):
        """Clip ``sequence`` and ``mask`` to at most ``max_len`` time steps."""
        if sequence.size(1) > max_len:
            print(
                "The input sequence length to GPT-2 model is too long:",
                sequence.size(1),
            )
            return sequence[:, :max_len], mask[:, :max_len]
        return sequence, mask

    def get_input_sequence_and_mask(self, cond_dict):
        """Assemble the conditioning prefix that is fed to GPT-2.

        Each key in ``self.sequence_input_key`` is looked up in ``cond_dict``.
        A list value is treated as ``[embeds, attn_mask]``; a tensor value
        gets an all-ones mask. Every item is projected by its own linear
        layer, wrapped in start/end tokens and concatenated along dim 1.

        Returns:
            ``(input_embeds, input_embeds_attn_mask, cond_sequence_end_time_idx)``
            where the index is the prefix length, i.e. the position at which
            generated embeddings start.

        Raises:
            AssertionError: On a missing key, a list that is not of length 2,
                a value that is neither list nor tensor, or no input keys.
        """
        embeds_per_key = []
        masks_per_key = []

        for _id, sequence_key in enumerate(self.sequence_input_key):
            assert sequence_key in cond_dict.keys(), (
                "Invalid sequence key %s" % sequence_key
            )
            cond_embed = cond_dict[sequence_key]
            projection = self.input_sequence_embed_linear[_id]

            if isinstance(cond_embed, list):
                assert (
                    len(cond_embed) == 2
                ), "The crossattn returned list should have length 2, including embed and attn_mask"
                item_input_embeds, item_attn_mask = cond_embed
                item_input_embeds = projection(item_input_embeds)
            else:
                assert isinstance(cond_embed, torch.Tensor)
                item_input_embeds = projection(cond_embed)
                item_attn_mask = torch.ones(
                    (item_input_embeds.size(0), item_input_embeds.size(1))
                ).to(item_input_embeds.device)

            item_input_embeds, item_attn_mask = self.add_sos_eos_tokens(
                _id, item_input_embeds, item_attn_mask
            )
            embeds_per_key.append(item_input_embeds)
            masks_per_key.append(item_attn_mask)

        assert embeds_per_key and masks_per_key

        if len(embeds_per_key) == 1:
            input_embeds = embeds_per_key[0]
            input_embeds_attn_mask = masks_per_key[0]
        else:
            # Dim 1 is the time axis; a single cat avoids re-copying the
            # growing prefix once per key.
            input_embeds = torch.cat(embeds_per_key, dim=1)
            input_embeds_attn_mask = torch.cat(masks_per_key, dim=1)

        # Leave room for the tokens to be generated.
        # NOTE(review): 1024 is presumably GPT-2's maximum context length - confirm.
        input_embeds, input_embeds_attn_mask = self.truncate_sequence_and_mask(
            input_embeds, input_embeds_attn_mask, int(1024 - self.mae_token_num)
        )
        # The index at which we start to collect the output embeds.
        cond_sequence_end_time_idx = input_embeds.size(1)

        return input_embeds, input_embeds_attn_mask, cond_sequence_end_time_idx

    def warmup_step(self):
        """Linearly ramp the first param group's LR over the first 1000 steps."""
        if self.initial_learning_rate is None:
            self.initial_learning_rate = float(self.learning_rate)

        # Only the first parameter group is adjusted.
        first_group = self.trainer.optimizers[0].param_groups[0]
        if self.global_step <= 1000:
            if self.global_step == 0:
                print(
                    "Warming up learning rate start with %s"
                    % self.initial_learning_rate
                )
            first_group["lr"] = (self.global_step / 1000) * self.initial_learning_rate
        else:
            # TODO set learning rate here
            first_group["lr"] = self.initial_learning_rate

    def mask_target_sequence(self, target_embeds, target_embeds_attn_mask):
        """Randomly zero out time steps of the target sequence and its mask.

        Masking is skipped when ``target_tokens_mask_ratio`` is at most 1e-4.

        Returns:
            ``(target_embeds, target_embeds_attn_mask, time_seq_mask)``;
            ``time_seq_mask`` is ``None`` when masking is skipped, otherwise a
            boolean tensor that is ``True`` for the steps that were kept.
        """
        time_seq_mask = None
        if self.target_tokens_mask_ratio > 1e-4:
            batchsize, time_seq_len, embed_dim = target_embeds.size()
            _, time_seq_len = target_embeds_attn_mask.size()
            # Generate the random mask.
            if self.random_mask_ratio:
                mask_ratio = torch.rand(1).item() * self.target_tokens_mask_ratio
            else:
                mask_ratio = self.target_tokens_mask_ratio

            time_seq_mask = (torch.rand((batchsize, time_seq_len)) > mask_ratio).to(
                target_embeds.device
            )
            # Mask the target embedding.
            target_embeds = target_embeds * time_seq_mask.unsqueeze(-1)
            target_embeds_attn_mask = target_embeds_attn_mask * time_seq_mask
        return target_embeds, target_embeds_attn_mask, time_seq_mask

    def _teacher_forced_forward(self, cond_dict):
        """Run GPT-2 on prefix + ground-truth target; return ``(output, target)``."""
        target_embeds, target_embeds_attn_mask = (
            cond_dict["crossattn_audiomae_pooled"][0],
            cond_dict["crossattn_audiomae_pooled"][1],
        )

        (
            input_embeds,
            input_embeds_attn_mask,
            cond_sequence_end_time_idx,
        ) = self.get_input_sequence_and_mask(cond_dict)

        # Sometimes, if the pooling factor is random, the length of
        # crossattn_audiomae_pooled is not necessarily 32, so the target is
        # taken from a separate entry. The attention mask is left unchanged.
        if "crossattn_audiomae_pooled_44" in cond_dict.keys():
            target_embeds = cond_dict["crossattn_audiomae_pooled_44"][0]

        final_input_embeds = torch.cat([input_embeds, target_embeds], dim=1)
        final_input_embeds_attn_mask = torch.cat(
            [input_embeds_attn_mask, target_embeds_attn_mask], dim=1
        )

        output_embeds = self.model(
            inputs_embeds=final_input_embeds,
            attention_mask=final_input_embeds_attn_mask,
        )["last_hidden_state"]

        # Shift by one: the hidden state at position t predicts token t + 1,
        # so predictions start at the last prefix position.
        output = output_embeds[:, cond_sequence_end_time_idx - 1 : -1]
        return output, target_embeds

    def training_step(self, batch, batch_idx=None, cond_dict=None, return_output=False):
        """Compute and log the training loss for one batch.

        Args:
            batch: Data batch; only used when ``cond_dict`` is ``None``.
            batch_idx: Batch index, forwarded to the AR loss.
            cond_dict: Optional precomputed conditioning dictionary.
            return_output: If true, also return the predicted embeddings.

        Returns:
            ``loss + ar_gen_loss`` or ``(loss + ar_gen_loss, output)``. When
            ``use_ar_gen_loss`` is false, ``ar_gen_loss`` equals ``loss``.
        """
        # cond_dict['film_clap_cond1']: [2,1,512]
        # cond_dict['crossattn_audiomae_pooled']: [2, 128, 768]

        if self.use_warmup:
            self.warmup_step()

        if cond_dict is None:
            cond_dict = self.get_input(batch)

        output, target = self._teacher_forced_forward(cond_dict)

        assert target.size(1) == self.mae_token_num

        loss = self.loss_fn(output, target)

        if self.use_ar_gen_loss:
            ar_gen_loss = self.calculate_ahead_k_step_loss(batch, batch_idx, cond_dict)
        else:
            ar_gen_loss = loss

        if self.private_training_step % 500 == 0:
            print(
                "AudioMAE prediction module:", "loss", loss, "ar_gen_loss", ar_gen_loss
            )

        try:
            learning_rate = self.trainer.optimizers[0].param_groups[0]["lr"]

            self.log(
                "train/lr_audiomae_pred",
                learning_rate,
                prog_bar=True,
                logger=True,
                on_step=True,
                on_epoch=False,
                sync_dist=True,
            )
        except Exception:
            # Best effort: LR logging is skipped when no trainer/optimizer is
            # attached (e.g. the step is driven by another module).
            pass

        self.log(
            "train/loss_clap_2_audiomae",
            loss,
            prog_bar=True,
            logger=True,
            on_step=True,
            on_epoch=False,
            sync_dist=True,
        )

        self.log(
            "train/loss_ar_gen_loss",
            ar_gen_loss,
            prog_bar=True,
            logger=True,
            on_step=True,
            on_epoch=False,
            sync_dist=True,
        )

        self.log(
            "global_step_audiomae",
            float(self.global_step),
            prog_bar=True,
            logger=True,
            on_step=True,
            on_epoch=False,
            sync_dist=True,
        )
        self.private_training_step += 1
        if return_output:
            return loss + ar_gen_loss, output
        return loss + ar_gen_loss

    def _extend_one_step(self, model_input, model_input_mask):
        """Run GPT-2 once and append its last hidden state to the input.

        Returns:
            ``(model_input, model_input_mask, next_embed)`` where the first two
            are one step longer and ``next_embed`` is the appended step.
        """
        output = self.model(
            inputs_embeds=model_input, attention_mask=model_input_mask
        )["last_hidden_state"]
        next_embed = output[:, -1:, :]
        # Update the model input.
        model_input = torch.cat([model_input, next_embed], dim=1)
        # Update the attention mask.
        attention_mask_new_step = torch.ones((model_input_mask.size(0), 1)).to(
            model_input.device
        )
        model_input_mask = torch.cat(
            [model_input_mask, attention_mask_new_step], dim=1
        )
        return model_input, model_input_mask, next_embed

    def calculate_ahead_k_step_loss(self, batch, batch_idx=None, cond_dict=None):
        """Loss of a short free-running rollout from a random target position.

        Between 2 and 8 steps (capped by the target length, floor of 2) are
        generated autoregressively after a ground-truth prefix of random
        length and compared with the matching target slice.

        Raises:
            AssertionError: If the target attention mask is not all ones.
        """
        if cond_dict is None:
            cond_dict = self.get_input(batch)

        target_embeds, target_embeds_attn_mask = (
            cond_dict["crossattn_audiomae_pooled"][0],
            cond_dict["crossattn_audiomae_pooled"][1],
        )

        assert (
            torch.sum(target_embeds_attn_mask < 0.1) < 1
        ), "This function only works for AudioMAE prediction, which should have all one atten_mask"

        (
            input_embeds,
            input_embeds_attn_mask,
            cond_sequence_end_time_idx,
        ) = self.get_input_sequence_and_mask(cond_dict)

        target_total_time_steps = target_embeds.size(1)

        steps = min(round(torch.rand(1).item() * 8), target_total_time_steps)

        if steps < 2:
            steps = 2

        start_idx = max(
            0, round(torch.rand(1).item() * (target_total_time_steps - steps)) - 1
        )

        model_input = input_embeds
        model_input_mask = input_embeds_attn_mask
        target_embeds_ar_gen = target_embeds[:, start_idx : start_idx + steps, :]
        generation = []

        if start_idx > 0:
            # Feed the ground-truth steps that precede the rollout window.
            model_input = torch.cat(
                [input_embeds, target_embeds[:, :start_idx, :]], dim=1
            )
            attention_mask_known_steps = torch.ones(
                (model_input_mask.size(0), start_idx)
            ).to(model_input.device)
            model_input_mask = torch.cat(
                [input_embeds_attn_mask, attention_mask_known_steps], dim=1
            )

        for _ in range(steps):
            model_input, model_input_mask, next_embed = self._extend_one_step(
                model_input, model_input_mask
            )
            generation.append(next_embed)

        generation = torch.cat(generation, dim=1)

        return self.loss_fn(generation, target_embeds_ar_gen)

    def generate_partial(self, batch, cond_dict=None, no_grad=False):
        """Generate the last three quarters of the target from its first quarter.

        Args:
            batch: Data batch; only used when ``cond_dict`` is ``None``.
            cond_dict: Optional precomputed conditioning dictionary.
            no_grad: Unused; kept for interface compatibility.

        Returns:
            ``(output, cond_dict)`` where ``output`` is everything after the
            conditioning prefix (ground-truth quarter plus generated steps).
        """
        if cond_dict is None:
            cond_dict = self.get_input(batch)

        print("Generate partially prompted audio with in-context learning")

        target_embeds, target_embeds_attn_mask = (
            cond_dict["crossattn_audiomae_pooled"][0],
            cond_dict["crossattn_audiomae_pooled"][1],
        )

        target_time_steps = target_embeds.size(1)

        (
            input_embeds,
            input_embeds_attn_mask,
            cond_sequence_end_time_idx,
        ) = self.get_input_sequence_and_mask(cond_dict)

        model_input = torch.cat(
            [input_embeds, target_embeds[:, : target_time_steps // 4, :]], dim=1
        )
        model_input_mask = torch.cat(
            [
                input_embeds_attn_mask,
                target_embeds_attn_mask[:, : target_time_steps // 4],
            ],
            dim=1,
        )

        steps = self.mae_token_num

        for _ in range(3 * steps // 4):
            model_input, model_input_mask, _ = self._extend_one_step(
                model_input, model_input_mask
            )

        output = model_input[:, cond_sequence_end_time_idx:]

        return output, cond_dict

    def generate(self, batch, cond_dict=None, no_grad=False):
        """Autoregressively generate ``mae_token_num`` embeddings.

        Args:
            batch: Data batch; only used when ``cond_dict`` is ``None``.
            cond_dict: Optional precomputed conditioning dictionary.
            no_grad: Unused; kept for interface compatibility.

        Returns:
            ``(generated, cond_dict)`` where ``generated`` is the part of the
            sequence that follows the conditioning prefix.
        """
        if cond_dict is None:
            cond_dict = self.get_input(batch)

        (
            input_embeds,
            input_embeds_attn_mask,
            cond_sequence_end_time_idx,
        ) = self.get_input_sequence_and_mask(cond_dict)
        model_input = input_embeds
        model_input_mask = input_embeds_attn_mask

        steps = self.mae_token_num

        for _ in range(steps):
            model_input, model_input_mask, _ = self._extend_one_step(
                model_input, model_input_mask
            )

        return model_input[:, cond_sequence_end_time_idx:], cond_dict

    # def on_validation_epoch_start(self) -> None:
    #     # Use text as condition during validation
    #     for key in self.cond_stage_model_metadata.keys():
    #         metadata = self.cond_stage_model_metadata[key]
    #         model_idx, cond_stage_key, conditioning_key = metadata["model_idx"], metadata["cond_stage_key"], metadata["conditioning_key"]

    #         # If we use CLAP as condition, we might use audio for training, but we also must use text for evaluation
    #         # if(isinstance(self.cond_stage_models[model_idx], CLAPAudioEmbeddingClassifierFreev2)):
    #         #     self.cond_stage_model_metadata[key]["cond_stage_key_orig"] = self.cond_stage_model_metadata[key]["cond_stage_key"]
    #         #     self.cond_stage_model_metadata[key]["embed_mode_orig"] = self.cond_stage_models[model_idx].embed_mode
    #         #     print("Change the model original cond_keyand embed_mode %s, %s to text during evaluation" % (self.cond_stage_model_metadata[key]["cond_stage_key_orig"], self.cond_stage_model_metadata[key]["embed_mode_orig"]))
    #         #     self.cond_stage_model_metadata[key]["cond_stage_key"] = "text"
    #         #     self.cond_stage_models[model_idx].embed_mode = "text"

    #     return super().on_validation_epoch_start()

    def validation_step(self, batch, batch_idx):
        """Log the teacher-forced and the free-running validation losses.

        Returns:
            ``{"loss": ..., "ar_gen_loss": ...}``.
        """
        cond_dict = self.get_input(batch)
        # cond_dict['film_clap_cond1']: [2,1,512]
        # cond_dict['crossattn_audiomae_pooled']: [2, 128, 768]

        output, target = self._teacher_forced_forward(cond_dict)

        loss = self.loss_fn(output, target)

        self.log(
            "val/loss",
            loss,
            prog_bar=True,
            logger=True,
            on_step=True,
            sync_dist=True,
            on_epoch=True,
        )

        generation_output, _ = self.generate(batch)
        ar_gen_loss = self.loss_fn(generation_output, target)

        self.log(
            "val/ar_gen_loss",
            ar_gen_loss,
            prog_bar=True,
            logger=True,
            on_step=True,
            sync_dist=True,
            on_epoch=True,
        )

        return {"loss": loss, "ar_gen_loss": ar_gen_loss}

    def get_input_item(self, batch, k):
        """Return entry ``k`` of the normalised view of ``batch``.

        ``batch`` must contain ``fname``, ``text``, ``label_vector``,
        ``waveform``, ``stft`` and ``log_mel_spec``. Any other batch key is
        passed through unchanged.

        Raises:
            KeyError: If a required key is missing or ``k`` is unknown.
        """
        fname, text, label_indices, waveform, stft, fbank = (
            batch["fname"],
            batch["text"],
            batch["label_vector"],
            batch["waveform"],
            batch["stft"],
            batch["log_mel_spec"],
        )
        ret = {}

        ret["fbank"] = (
            fbank.unsqueeze(1).to(memory_format=torch.contiguous_format).float()
        )
        # NOTE(review): the right-hand sides for "stft" and "waveform" were
        # missing in the source; restored to mirror the fbank conversion
        # (without the extra channel dim) - confirm against upstream.
        ret["stft"] = stft.to(memory_format=torch.contiguous_format).float()
        ret["waveform"] = waveform.to(memory_format=torch.contiguous_format).float()
        ret["text"] = list(text)
        ret["fname"] = fname

        for key in batch.keys():
            if key not in ret.keys():
                ret[key] = batch[key]

        return ret[k]

    def get_input(self, batch):
        """Run every cond-stage model on its input and collect the results.

        Returns:
            Dict mapping each conditioning-model key to its output.
        """
        cond_dict = {}
        if self.cond_stage_model_metadata:
            unconditional_cfg = False

            for cond_model_key, metadata in self.cond_stage_model_metadata.items():
                cond_stage_key = metadata["cond_stage_key"]

                # The original data for conditioning.
                xc = self.get_input_item(batch, cond_stage_key)
                if isinstance(xc, torch.Tensor):
                    # NOTE(review): the assignment was missing in the source;
                    # restored as a move to the module's device - confirm.
                    xc = xc.to(self.device)

                c = self.get_learned_conditioning(
                    xc, key=cond_model_key, unconditional_cfg=unconditional_cfg
                )
                cond_dict[cond_model_key] = c

        return cond_dict

    def instantiate_cond_stage(self, config):
        """Build the conditioning models described by ``config``.

        Models are appended to ``self.cond_stage_models``; their index and
        keys are recorded in ``self.cond_stage_model_metadata``.
        """
        self.cond_stage_model_metadata = {}

        for i, cond_model_key in enumerate(config.keys()):
            model_config = config[cond_model_key]
            model = instantiate_from_config(model_config)
            self.cond_stage_models.append(model)
            self.cond_stage_model_metadata[cond_model_key] = {
                "model_idx": i,
                "cond_stage_key": model_config["cond_stage_key"],
                "conditioning_key": model_config["conditioning_key"],
            }

    def get_learned_conditioning(self, c, key, unconditional_cfg):
        """Encode ``c`` with the conditioning model registered under ``key``.

        With ``unconditional_cfg`` set (classifier-free guidance), the model's
        unconditional condition for the batch size of ``c`` is returned.

        Raises:
            AssertionError: If ``key`` is not a registered conditioning model.
            NotImplementedError: If ``unconditional_cfg`` is set and ``c`` is
                neither a tensor nor a list.
        """
        assert key in self.cond_stage_model_metadata.keys()
        cond_model = self.cond_stage_models[
            self.cond_stage_model_metadata[key]["model_idx"]
        ]

        # Classifier-free guidance
        if not unconditional_cfg:
            return cond_model(c)

        if isinstance(c, torch.Tensor):
            batchsize = c.size(0)
        elif isinstance(c, list):
            batchsize = len(c)
        else:
            raise NotImplementedError()
        return cond_model.get_unconditional_condition(batchsize)

    def initialize_param_check_toolkit(self):
        """Reset the state used by ``check_module_param_update``."""
        self.tracked_steps = 0
        self.param_dict = {}

    def statistic_require_grad_tensor_number(self, module, name=None):
        """Print how many parameter tensors of ``module`` require grad.

        Returns:
            The first parameter that requires grad, or ``None`` if none does.

        Raises:
            ZeroDivisionError: If ``module`` has no parameters at all; the
                callers in this class catch and report this.
        """
        requires_grad_num = 0
        total_num = 0
        require_grad_tensor = None
        for p in module.parameters():
            if p.requires_grad:
                requires_grad_num += 1
                if require_grad_tensor is None:
                    require_grad_tensor = p
            total_num += 1
        print(
            "Module: [%s] have %s trainable parameters out of %s total parameters (%.2f)"
            % (name, requires_grad_num, total_num, requires_grad_num / total_num)
        )
        return require_grad_tensor

    def check_module_param_update(self):
        """Debug helper that reports whether child modules are being updated.

        On the first call a snapshot of one trainable tensor per child module
        is stored. Every 5000 calls (including the first) the absolute
        difference between the snapshot and the current tensor is printed.
        """
        if self.tracked_steps == 0:
            print("Sequence2AudioMAE")
            for name, module in self.named_children():
                try:
                    require_grad_tensor = self.statistic_require_grad_tensor_number(
                        module, name=name
                    )
                    if require_grad_tensor is not None:
                        self.param_dict[name] = require_grad_tensor.clone()
                    else:
                        print("==> %s does not requires grad" % name)
                except Exception as e:
                    print("%s does not have trainable parameters: %s" % (name, e))
                    continue

        if self.tracked_steps % 5000 == 0:
            print("Sequence2AudioMAE")
            for name, module in self.named_children():
                try:
                    require_grad_tensor = self.statistic_require_grad_tensor_number(
                        module, name=name
                    )

                    if require_grad_tensor is not None:
                        print(
                            "===> Param diff %s: %s; Size: %s"
                            % (
                                name,
                                torch.sum(
                                    torch.abs(
                                        self.param_dict[name] - require_grad_tensor
                                    )
                                ),
                                require_grad_tensor.size(),
                            )
                        )
                    else:
                        print("%s does not requires grad" % name)
                except Exception as e:
                    print("%s does not have trainable parameters: %s" % (name, e))
                    continue

        self.tracked_steps += 1