
import inspect
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
from transformers import (
    ClapFeatureExtractor,
    ClapModel,
    GPT2LMHeadModel,
    RobertaTokenizer,
    RobertaTokenizerFast,
    SpeechT5HifiGan,
    T5EncoderModel,
    T5Tokenizer,
    T5TokenizerFast,
    VitsModel,
    VitsTokenizer,
)

from ...models import AutoencoderKL
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
    is_accelerate_available,
    is_accelerate_version,
    is_librosa_available,
    logging,
    replace_example_docstring,
)
from ...utils.import_utils import is_transformers_version
from ...utils.torch_utils import empty_device_cache, randn_tensor
from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel


if is_librosa_available():
    import librosa

from ...utils import is_torch_xla_available


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import scipy
        >>> import torch
        >>> from diffusers import AudioLDM2Pipeline

        >>> repo_id = "cvssp/audioldm2"
        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> # define the prompts
        >>> prompt = "The sound of a hammer hitting a wooden surface."
        >>> negative_prompt = "Low quality."

        >>> # set the seed for generator
        >>> generator = torch.Generator("cuda").manual_seed(0)

        >>> # run the generation
        >>> audio = pipe(
        ...     prompt,
        ...     negative_prompt=negative_prompt,
        ...     num_inference_steps=200,
        ...     audio_length_in_s=10.0,
        ...     num_waveforms_per_prompt=3,
        ...     generator=generator,
        ... ).audios

        >>> # save the best audio sample (index 0) as a .wav file
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
        ```
        ```py
        >>> # Using AudioLDM2 for text-to-speech
        >>> import scipy
        >>> import torch
        >>> from diffusers import AudioLDM2Pipeline

        >>> repo_id = "anhnct/audioldm2_gigaspeech"
        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> # define the prompts
        >>> prompt = "A female reporter is speaking"
        >>> transcript = "wish you have a good day"

        >>> # set the seed for generator
        >>> generator = torch.Generator("cuda").manual_seed(0)

        >>> # run the generation
        >>> audio = pipe(
        ...     prompt,
        ...     transcription=transcript,
        ...     num_inference_steps=200,
        ...     audio_length_in_s=10.0,
        ...     num_waveforms_per_prompt=2,
        ...     generator=generator,
        ...     max_new_tokens=512,  # must set max_new_tokens equal to 512 for TTS
        ... ).audios

        >>> # save the best audio sample (index 0) as a .wav file
        >>> scipy.io.wavfile.write("tts.wav", rate=16000, data=audio[0])
        ```
"""


def prepare_inputs_for_generation(
    inputs_embeds,
    attention_mask=None,
    past_key_values=None,
    **kwargs,
):
    if past_key_values is not None:
        # only the embedding of the most recent token is needed once past key/values are cached
        inputs_embeds = inputs_embeds[:, -1:]

    return {
        "inputs_embeds": inputs_embeds,
        "attention_mask": attention_mask,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
    }


class AudioLDM2Pipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-audio generation using AudioLDM2.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        text_encoder ([`~transformers.ClapModel`]):
            First frozen text-encoder. AudioLDM2 uses the joint audio-text embedding model
            [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModelWithProjection),
            specifically the [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. The
            text branch is used to encode the text prompt to a prompt embedding. The full audio-text model is used to
            rank generated waveforms against the text prompt by computing similarity scores.
        text_encoder_2 ([`~transformers.T5EncoderModel`, `~transformers.VitsModel`]):
            Second frozen text-encoder. AudioLDM2 uses the encoder of
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
            [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) variant. For text-to-speech (TTS)
            checkpoints, the second frozen text-encoder is instead the encoder of
            [Vits](https://huggingface.co/docs/transformers/model_doc/vits#transformers.VitsModel).
        projection_model ([`AudioLDM2ProjectionModel`]):
            A trained model used to linearly project the hidden-states from the first and second text encoder models
            and insert learned SOS and EOS token embeddings. The projected hidden-states from the two text encoders are
            concatenated to give the input to the language model. A learned position embedding is also applied to the
            Vits hidden-states.
        language_model ([`~transformers.GPT2Model`]):
            An auto-regressive language model used to generate a sequence of hidden-states conditioned on the projected
            outputs from the two text encoders.
        tokenizer ([`~transformers.RobertaTokenizer`]):
            Tokenizer to tokenize text for the first frozen text-encoder.
        tokenizer_2 ([`~transformers.T5Tokenizer`, `~transformers.VitsTokenizer`]):
            Tokenizer to tokenize text for the second frozen text-encoder.
        feature_extractor ([`~transformers.ClapFeatureExtractor`]):
            Feature extractor to pre-process generated audio waveforms to log-mel spectrograms for automatic scoring.
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded audio latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        vocoder ([`~transformers.SpeechT5HifiGan`]):
            Vocoder of class `SpeechT5HifiGan` to convert the mel-spectrogram latents to the final audio waveform.
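
    Whether a given checkpoint performs text-to-audio or text-to-speech is determined by the second text encoder it
    ships with, as described under `text_encoder_2` above. A minimal, illustrative way to check this at runtime is
    sketched below (it relies only on the component names documented in the arguments list; `cvssp/audioldm2` and
    `anhnct/audioldm2_gigaspeech` are the checkpoints used in the examples elsewhere in this file):

    ```py
    >>> from diffusers import AudioLDM2Pipeline

    >>> pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
    >>> pipe.text_encoder_2.__class__.__name__  # 'T5EncoderModel' -> text-to-audio checkpoint

    >>> tts_pipe = AudioLDM2Pipeline.from_pretrained("anhnct/audioldm2_gigaspeech")
    >>> tts_pipe.text_encoder_2.__class__.__name__  # 'VitsModel' -> text-to-speech checkpoint
    ```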
    """

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: ClapModel,
        text_encoder_2: Union[T5EncoderModel, VitsModel],
        projection_model: AudioLDM2ProjectionModel,
        language_model: GPT2LMHeadModel,
        tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
        tokenizer_2: Union[T5Tokenizer, T5TokenizerFast, VitsTokenizer],
        feature_extractor: ClapFeatureExtractor,
        unet: AudioLDM2UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        vocoder: SpeechT5HifiGan,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            projection_model=projection_model,
            language_model=language_model,
            tokenizer=tokenizer,
            tokenizer_2=tokenizer_2,
            feature_extractor=feature_extractor,
            unet=unet,
            scheduler=scheduler,
            vocoder=vocoder,
        )
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        )

    def enable_vae_slicing(self):
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
        self.vae.disable_slicing()

    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
        z>=z0.17.0.dev0r   )cpu_offload_with_hookzC`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.NzYou have passed both `gpu_id`=z4 and an index as part of the passed device `device`=zwCannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`=:cpuT)silence_dtype_warnings)prev_module_hook)r   r   
acceleraterT   ImportErrortorchrR   index
ValueErrortypetor   r4   
text_modeltext_projectionr5   r6   r7   r;   r3   r=   final_offload_hook)rH   rQ   rR   rT   torch_devicedevice_indexdevice_type
device_strmodel_sequencehookcpu_offloaded_model_s               r.   enable_model_cpu_offloadz*AudioLDM2Pipeline.enable_model_cpu_offload   s    #$)>t])S8cdd||F+#)),":08lmslt J  KW  K\  K\  J]^ 
 #'' 
\''&<q)E<3E3E(FGJj);;u$GGE$G7v{{+ ((--!!IIHHLL

 #1 	`+,?Z^_GAt	` #'r0   r)   max_new_tokensc           	      `   i }t        dd      r||d<   nD|j                  d   |d<   t        | dd      | j                  j                  n| j                  |d<   ||d	<   ||n| j                  j
                  j                  } | j                  j                  di |}t        |      D ]u  }t        |fi |} | j                  di |d
d
d}|j                  d   }t        j                  ||ddddddf   gd      }| j                  j                  ||      }w |dd| dddf   S )a  

        Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs.

        Parameters:
            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The sequence used as a prompt for the generation.
            max_new_tokens (`int`):
                Number of new tokens to generate.
            model_kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of additional model-specific kwargs that will be forwarded to the `forward`
                function of the model.

        Return:
            `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The sequence of generated hidden-states.
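
        Example (illustrative only; `projected_embeds` is a placeholder for the projected text-encoder hidden-states
        produced by [`AudioLDM2ProjectionModel`], and `pipe` is an instantiated [`AudioLDM2Pipeline`]):

        ```py
        >>> # projected_embeds: torch.Tensor of shape (batch_size, sequence_length, hidden_size)
        >>> generated = pipe.generate_language_model(projected_embeds, max_new_tokens=8)
        >>> generated.shape  # (batch_size, 8, hidden_size) -- one hidden-state per generated token
        ```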
        <z4.52.1	input_idsr   
seq_lengthr7   NrR   model_kwargsT)output_hidden_statesreturn_dictr'   r"   )dim )r   shaperC   r7   rR   rE   rl   _get_initial_cache_positionranger/   hidden_statesr[   cat#_update_model_kwargs_for_generation)	rH   r)   rl   rq   cache_position_kwargsrj   model_inputsoutputnext_hidden_statess	            r.   generate_language_modelz)AudioLDM2Pipeline.generate_language_model#  s^   . !#"311>!+.2?2E2Ea2H!,/.5d<Ld.S._##**eiepep "(+ 1=n-+9+E4K^K^KeKeKtKtFt**FF_I^_~& 	iA8W,WL )T((e<ed`deF!'!5!5b!9 "II}6HBCQR6S&TZ[\M  ..RRSY[ghL	i Q 0!344r0   prompt_embedsnegative_prompt_embedsgenerated_prompt_embeds negative_generated_prompt_embedsr*   negative_attention_maskc                    |t        |t              rd}n-|t        |t              rt        |      }n|j                  d   }| j
                  | j                  g}t        | j                  t              }|r#| j                  | j                  j                  g}n| j                  | j                  g}|}g }g }t        ||      D ]  \  }}t        |t        t        t        t        f      } ||r|n|t        |t        t        t        f      rdnd|j                   dd      }|j"                  }|j$                  } ||dd	      j"                  }|j                  d
   |j                  d
   k\  ryt'        j(                  ||      sc|j+                  |dd|j                   dz
  d
f         }t,        j/                  d|j0                  j2                   d|j                    d|        |j5                  |      }|j5                  |      }|j0                  j2                  dk(  r3|j7                  ||      }|dddddf   }|j9                  |df      }ni|rXt        ||      D ])  \  }}t;        |      D ]  \  }}|dk(  sd||<   d||<    ) +  ||||j=                  d
            }|d   }n |||      }|d   }|j?                  |       |j?                  |        | jA                  |d   |d   |d   |d         }|jB                  } |j$                  }!| jE                  | |!|      }	|j5                  | j                  jF                  |      }||j5                  |      n2t'        jH                  |j                  dd t&        jJ                  |      }|	j5                  | jL                  jF                  |      }	|j                  \  }"}#}$|jO                  d|d      }|jQ                  |"|z  |#|$      }|jO                  d|      }|jQ                  |"|z  |#      }|	j                  \  }"}#}$|	jO                  d|d      }	|	jQ                  |"|z  |#|$      }	|r||dg|z  }%ntS        |      tS        |      ur$tU        dtS        |       dtS        |       d      t        |t              r|g}%n1|t        |      k7  r!tW        d| dt        |       d| d| d	      |}%g }&g }'|j                  d   }(t        ||      D ]  \  }} ||%dt        |t        t        t        f      r|j                   n|(dd      })|)j"                  j5                  |      }*|)j$                  j5                  |      }|j0                  j2                  dk(  r3|j7                  |*|      }|dddddf   }|j9                  |df      }n|rt'        jX                  ||j                   |j0                  jZ                        j5                  | j                  jF                  |      }t'        jX                  ||j                         j5                  | j                  jF                  |      }n ||*|      }|d   }|&j?                  |       |'j?                  |        | jA                  |&d   |&d   |'d   |'d         }|jB                  }+|j$                  },| jE                  |+|,|      }
|rq|j                  d   }#|j5                  | j                  jF                  |      }||j5                  |      n2t'        jH                  |j                  dd t&        jJ                  |      }|
j5                  | jL                  jF                  |      }
|jO                  d|d      }|jQ                  ||z  |#d
      }|jO                  d|      }|jQ                  ||z  |#      }|
j                  d   }#|
jO                  d|d      }
|
jQ                  ||z  |#d
      }
t'        j\                  ||g      }t'        j\                  ||g      }t'        j\                  |
|	g      }	|||	fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            transcription (`str` or `List[str]`):
                transcription of text to speech
            device (`torch.device`):
                torch device
            num_waveforms_per_prompt (`int`):
                number of waveforms that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the audio generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.*
                prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs,
                *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
                `negative_prompt` input argument.
            generated_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
                 *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
                 argument.
            negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
                inputs, *e.g.* prompt weighting. If not provided, `negative_generated_prompt_embeds` will be computed from
                `negative_prompt` input argument.
            attention_mask (`torch.LongTensor`, *optional*):
                Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will
                be computed from `prompt` input argument.
            negative_attention_mask (`torch.LongTensor`, *optional*):
                Pre-computed attention mask to be applied to the `negative_prompt_embeds`. If not provided, attention
                mask will be computed from `negative_prompt` input argument.
            max_new_tokens (`int`, *optional*, defaults to None):
                The number of new tokens to generate with the GPT2 language model.
        Returns:
            prompt_embeds (`torch.Tensor`):
                Text embeddings from the Flan T5 model.
            attention_mask (`torch.LongTensor`):
                Attention mask to be applied to the `prompt_embeds`.
            generated_prompt_embeds (`torch.Tensor`):
                Text embeddings generated from the GPT2 language model.

        Example:

        ```python
        >>> import scipy
        >>> import torch
        >>> from diffusers import AudioLDM2Pipeline

        >>> repo_id = "cvssp/audioldm2"
        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> # Get text embedding vectors
        >>> prompt_embeds, attention_mask, generated_prompt_embeds = pipe.encode_prompt(
        ...     prompt="Techno music with a strong, upbeat tempo and high melodic riffs",
        ...     device="cuda",
        ...     do_classifier_free_guidance=True,
        ... )

        >>> # Pass text embeddings to pipeline for text-conditional audio generation
        >>> audio = pipe(
        ...     prompt_embeds=prompt_embeds,
        ...     attention_mask=attention_mask,
        ...     generated_prompt_embeds=generated_prompt_embeds,
        ...     num_inference_steps=200,
        ...     audio_length_in_s=10.0,
        ... ).audios[0]

        >>> # save generated audio sample
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
        ```Nr"   r   
max_lengthTpt)paddingr   
truncationreturn_tensorslongest)r   r   r'   z7The following part of your input was truncated because z! can only handle sequences up to z	 tokens: clap)r*      )r*   padding_mask)ry   hidden_states_1r*   attention_mask_1)r*   rl   )dtyperR   rR   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)/
isinstancestrlistrD   rv   r8   r9   r5   r   r4   zipr   r   r   r   r   model_max_lengthro   r*   r[   equalbatch_decodeloggerwarningrE   
model_typer_   get_text_featuresnew_ones	enumerate	unsqueezeappendr6   ry   r   r   oneslongr7   repeatviewr^   	TypeErrorr]   zeroshidden_sizerz   )-rH   promptrR   num_waveforms_per_promptdo_classifier_free_guidancetranscriptionnegative_promptr   r   r   r   r*   r   rl   
batch_size
tokenizersis_vits_text_encodertext_encodersprompt_embeds_listattention_mask_listr8   r4   
use_prompttext_inputstext_input_idsuntruncated_idsremoved_texttext_input_idtext_attention_maskidx
phoneme_idprojection_outputprojected_prompt_embedsprojected_attention_maskbs_embedseq_lenr   uncond_tokensnegative_prompt_embeds_listnegative_attention_mask_listr   uncond_inputuncond_input_ids negative_projected_prompt_embeds!negative_projected_attention_masks-                                                r.   encode_promptzAudioLDM2Pipeline.encode_promptW  s|	   ~ *VS"9JJvt$<VJ&,,Q/J nnd&6&67
)$*=*=yI!..0C0C0P0PQM!..0C0CDM !#"$+.z=+I :;'	<' 02FUde
 ((Fm!).>@TVc-de )(99##' "-!6!6!,!;!;"+FIVZ"["e"e"((,0D0DR0HHQVQ\Q\"OR $-#9#9/!YMgMgjkMknpMpJp:q#rLNNQR^ReReRpRpQq r77@7Q7Q6RR[\h[ik
 "0!2!26!:!/!2!26!:&&11V;$0$B$B&'5 %C %M
 %2!T1*$=M%3%<%<j!_%MN)>A.R`>a &:':/8/G &OC)Q58c 2;< 3C 8 %	&& %1&~TbTlTlmoTp%M %2!$4M$0&'5%M %2!$4M"))-8#**>:u:;x !% 5 503 21 5215!4Q!7	 !6 ! '8&E&E#'8'G'G$&*&B&B'7- 'C '# &((t/B/B/H/HQW(X ) V,M//35::fU 	
 #:"<"<4CVCVC\C\ek"<"l)6)<)<&';%,,Q0H!L%**86N+NPWYde (..q2JK',,X8P-PRYZ)@)F)F&';"9"@"@D\^_"`"9">">//+#

 '+A+I&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0*,'+-(&,,Q/J+.z=+I (M'	<(!(!).>@TVc-de  )99###'  $0#9#9#<#<V#D *6*E*E*H*H*P'&&11V;-9-K-K('> .L .*
 .DAtQJ-O*.E.N.NPZ\]._+)-2[["!22$++77. bt2288bH	 +
 /4kk*iF`F`.a.d.d"1177 /e /+ .:('>.* .DA-F*+223IJ,334KLQ(MT !% 5 59!< ;A >;A>!=a!@	 !6 ! 0A/N/N,0A0P0P-/3/K/K0@- 0L 0, ',2215G%;%>%>TEXEXE^E^gm%>%n" +6 (**&*9ZZ 6 < <Ra @

[ab $
 0P/R/R))// 0S 0,
 &<%B%B1F^`a%b"%;%@%@NfAfhoqs%t" '>&D&DQH`&a#&=&B&B:PhChjq&r# 7<<Q?G/O/V/VWXZrtu/v,/O/T/T55w0, "II'=}&MNM"YY(?'PQN&+ii1QSj0k&l#n.EEEr0   c                     |j                         dk(  r|j                  d      }| j                  |      }|j                         j	                         }|S )N   r"   )rt   squeezer=   rV   float)rH   mel_spectrogramwaveforms      r.   mel_spectrogram_to_waveformz-AudioLDM2Pipeline.mel_spectrogram_to_waveform  sJ     A%-55a8O<<0<<>'')r0   c                    t               st        j                  d       |S | j                  |dd      }t	        j
                  |j                         | j                  j                  j                  | j                  j                        }| j                  t        |      d| j                  j                        j                  j                  |      |d<   |j                  |      } | j                  di |j                   }t#        j$                  |dd	      d d d |f   }	t#        j&                  |d
|	j)                  d      j+                               }|S )Na  Automatic scoring of the generated audio waveforms against the input prompt text requires the `librosa` package to resample the generated waveforms. Returning the audios in the order they were generated. To enable automatic scoring, install `librosa` with: `pip install librosa`.r   T)r   r   )orig_sr	target_sr)r   sampling_rateinput_featuresr"   )rt   
descendingr   r'   ru   )r   r   infor8   librosaresamplenumpyr=   rE   r   r:   r   r   r^   r_   r4   logits_per_textr[   argsortindex_selectreshaperV   )
rH   textaudior   rR   r   inputsresampled_audior   indicess
             r.   score_waveformsz!AudioLDM2Pipeline.score_waveforms  s/   #%KKi
 LT4H!**KKM4<<#6#6#D#DPTPfPfPtPt
 $(#9#9!$dF\F\FjFj $: $

.e 	  6" ,$++5f5EE--Q4HLeMeLeIef""5!W__R-@-D-D-FGr0   c                 V   dt        t        j                  | j                  j                        j
                  j                               v }i }|r||d<   dt        t        j                  | j                  j                        j
                  j                               v }|r||d<   |S )Neta	generator)setinspect	signaturer<   step
parameterskeys)rH   r   r   accepts_etaextra_step_kwargsaccepts_generators         r.   prepare_extra_step_kwargsz+AudioLDM2Pipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r0   c                    || j                   z  }||k  rt        d| d| d      | j                  j                  j                  | j                   z  dk7  r:t        d| j                  j                  j                   d| j                    d      ||0t        |t              r|dk  rt        d| dt        |       d      ||t        d	| d
| d      |||	t        d      |7t        |t              s't        |t              st        dt        |             ||t        d| d| d      ||
t        d      |||j                  |j                  k7  r&t        d|j                   d|j                   d      |A|j                  |j                  d d k7  r%t        d|j                   d|j                         |.| j                  j                  j                  dk(  rDt        d      |7t        |t              s't        |t              st        dt        |             |	|
|	j                  |
j                  k7  r&t        d|	j                   d|
j                   d      |B|j                  |j                  d d k7  r%t        d|j                   d|j                         y y y y )NzH`audio_length_in_s` has to be a positive value greater than or equal to z	, but is r   r   zwThe number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got z bins and a scale factor of z5`callback_steps` has to be a positive integer but is z	 of type zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zProvide either `prompt`, or `prompt_embeds` and `generated_prompt_embeds`. Cannot leave `prompt` undefined without specifying both `prompt_embeds` and `generated_prompt_embeds`.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: z{Cannot forward `negative_prompt_embeds` without `negative_generated_prompt_embeds`. Ensure thatboth arguments are specifiedzu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` r   zq`attention_mask should have the same batch size and sequence length as `prompt_embeds`, but got:`attention_mask: z != `prompt_embeds` vitszLCannot forward without transcription. Please make sure to have transcriptionz9`transcription` has to be of type `str` or `list` but is z`generated_prompt_embeds` and `negative_generated_prompt_embeds` must have the same shape when passed directly, but got: `generated_prompt_embeds` z' != `negative_generated_prompt_embeds` )rG   r]   r=   rE   model_in_dimr   intr^   r   r   rv   r5   r   )rH   r   audio_length_in_svocoder_upsample_factorcallback_stepsr   r   r   r   r   r   r*   r   min_audio_length_in_ss                 r.   check_inputszAudioLDM2Pipeline.check_inputs  s    !8$:O:O O44Z[pZq r'(+ 
 <<++d.C.CCqH--1\\-@-@-M-M,NNj((),  "&
>30OSaefSfGGW X(), 
 -";08N}o ^0 0  ^!6:Q:Yl  FC)@TZ\`IaQRVW]R^Q_`aa&+A+M9/9J K*++]_  $/4T4\/ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8 
 )n.B.BmFYFYZ\[\F].] ((6(<(<'==QR_ReReQfh 
  ""))44> !opp&=#.z-QU7VXY]^kYlXmnoo".3S3_&,,0P0V0VV KKbKhKhJi j::Z:`:`9aabd  (3+115K5Q5QRTST5UU ((?(E(E'FFZ[q[w[wZxz  V 4 4`.r0   c                    ||t        |      | j                  z  t        | j                  j                  j                        | j                  z  f}t        |t              r)t        |      |k7  rt        dt        |       d| d      |t        ||||      }n|j                  |      }|| j                  j                  z  }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   rR   r   )r   rG   r=   rE   r   r   r   rD   r]   r   r_   r<   init_noise_sigma)	rH   r   num_channels_latentsheightr   rR   r   latentsrv   s	            r.   prepare_latentsz!AudioLDM2Pipeline.prepare_latents:  s     K4000##001T5J5JJ	
 i&3y>Z+GA#i.AQ R&<'gi 
 ?"5IfTYZGjj(G DNN;;;r0   r   r   r   num_inference_stepsguidance_scaler   r   r   r   r  rs   callbackr   cross_attention_kwargsoutput_typec                 \	   t        j                  | j                  j                  j                        | j                  j                  j
                  z  }|0| j                  j                  j                  | j                  z  |z  }t        ||z        }t        || j                  j                  j
                  z        }|| j                  z  dk7  rZt        t        j                  || j                  z              | j                  z  }t        j                  d| d||z   d| d       | j                  ||||||||||||       |t        |t              rd}n-|t        |t               rt#        |      }n|j$                  d   }| j&                  }|dkD  }| j)                  |||||||||||||	      \  }}}| j*                  j-                  ||
       | j*                  j.                  }| j                  j                  j0                  }| j3                  ||z  |||j4                  ||	|
      }
| j7                  |	|      }t#        |      || j*                  j8                  z  z
  } | j;                  |      5 }!t=        |      D ]5  \  }"}#|rt?        j@                  |
gdz        n|
}$| j*                  jC                  |$|#      }$| j                  |$|#|||d      d   }%|r|%jE                  d      \  }&}'|&||'|&z
  z  z   }% | j*                  jF                  |%|#|
fi |jH                  }
|"t#        |      dz
  k(  s'|"dz   | kD  r]|"dz   | j*                  j8                  z  dk(  r>|!jK                          |,|"|z  dk(  r$|"tM        | j*                  dd      z  }( ||(|#|
       tN        s"tQ        jR                          8 	 ddd       | jU                          |dk(  sLd| jV                  j                  jX                  z  |
z  }
| jV                  j[                  |
      j\                  })nt_        |
      S | ja                  |)      }*|*ddd|f   }*|dkD  r"| | jc                  ||*|||j4                        }*|dk(  r|*je                         }*|s|*fS t_        |*      S # 1 sw Y   xY w)u  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
            transcription (`str` or `List[str]`, *optional*):
                The transcript for text to speech.
            audio_length_in_s (`float`, *optional*, defaults to 10.24):
                The length of the generated audio sample in seconds.
            num_inference_steps (`int`, *optional*, defaults to 200):
                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 3.5):
                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
                The number of waveforms to generate per prompt. If `num_waveforms_per_prompt > 1`, then automatic
                scoring is performed between the generated outputs and the text prompt. This scoring ranks the
                generated waveforms based on their cosine similarity with the text input in the joint text-audio
                embedding space.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            generated_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings from the GPT2 language model. Can be used to easily tweak text inputs,
                 *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input
                 argument.
            negative_generated_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text
                inputs, *e.g.* prompt weighting. If not provided, `negative_generated_prompt_embeds` will be computed from
                `negative_prompt` input argument.
            attention_mask (`torch.LongTensor`, *optional*):
                Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will
                be computed from `prompt` input argument.
            negative_attention_mask (`torch.LongTensor`, *optional*):
                Pre-computed attention mask to be applied to the `negative_prompt_embeds`. If not provided, attention
                mask will be computed from `negative_prompt` input argument.
            max_new_tokens (`int`, *optional*, defaults to None):
                Number of new tokens to generate with the GPT2 language model. If not provided, number of tokens will
                be taken from the config of the model.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return an [`~pipelines.AudioPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            output_type (`str`, *optional*, defaults to `"np"`):
                The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or
                `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion
                model (LDM) output.

        Examples:

        Returns:
            [`~pipelines.AudioPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned,
                otherwise a `tuple` is returned where the first element is a list with the generated audio.
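
        The `output_type` and `num_waveforms_per_prompt` arguments can be combined as sketched below (illustrative
        only; `pipe` is assumed to be an [`AudioLDM2Pipeline`] loaded from `cvssp/audioldm2` as in the module-level
        examples):

        ```py
        >>> # request PyTorch tensors and keep every candidate waveform instead of only the best-ranked one
        >>> waveforms = pipe(
        ...     "Water drops echoing in a cave",
        ...     num_inference_steps=100,
        ...     num_waveforms_per_prompt=4,
        ...     output_type="pt",
        ... ).audios
        >>> waveforms.shape  # (4, num_samples) -- sorted best-first by CLAP text-audio similarity
        ```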
        Nr   zAudio length in seconds z is increased to z; so that it can be handled by the model. It will be cut to z after the denoising process.r"   g      ?)r   r   r   r   r*   r   rl   r   )totalr   F)encoder_hidden_statesencoder_hidden_states_1encoder_attention_mask_1rs   orderlatent)audios)r   r   r   rR   r   np)3r  prodr=   rE   upsample_ratesr   r;   sample_sizerG   r   ceilr   r   r   r   r   r   rD   rv   _execution_devicer   r<   set_timesteps	timestepsin_channelsr  r   r   r  progress_barr   r[   rz   scale_model_inputchunkr   prev_sampleupdaterC   XLA_AVAILABLExm	mark_stepmaybe_free_model_hooksr3   scaling_factordecodesampler    r   r   r   )+rH   r   r   r   r  r  r   r   r   r   r  r   r   r   r   r*   r   rl   rs   r  r   r  r  r   r  original_waveform_lengthr   rR   r   r  r   r   num_warmup_stepsr  itlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textstep_idxr   r   s+                                              r.   __call__zAudioLDM2Pipeline.__call__P  s   Z #%''$,,*=*=*L*L"MPTP\P\PcPcPqPq"q$ $		 0 0 < <t?T?T TWn n&)@@A#&'84<<;N;N;\;\'\#] D)))Q.$*?*?!?@ADDYDYYFKK*+<*==NvXoOoNp qMM^L_ `%& 	#"#,#	
  *VS"9JJvt$<VJ&,,Q/J'' '5s&:# BFASAS$''#9$;-M)$;) BT B
>~'>" 	$$%8$HNN,,	  $yy//;;&&11 
 !::9cJ y>,?$..BVBV,VV%89  	#\!), #1A\UYYy1}%=bi"%)^^%E%EFXZ[%\" "YY&*A,9-; % '  
 /9C9I9I!9L6%!2^YjGj5k!kJ .$..--j!WZHYZff I**A9I/IqSTuX\XfXfXlXlNlpqNq '')+N0Ba0G#$(K#K 1g6 LLN?# 	#D 	##% h&$((//8887BG"hhoog6==O&g6600Aa22223 $a'F,>(()A#)) ) E $KKME8O"%00 	#  	#s   >D,R",R""R+)Ncuda)Nr?   )	NNNNNNNNN)NNNNNNNN)N)NNN   g      @Nr"   g        NNNNNNNNNTNr"   Nr  )2__name__
__module____qualname____doc__r   r
   r   r   r   r#   r   r   r   r   r   r   r	   r$   r   r   rA   rM   rP   r   r   r[   rR   r   rk   Tensorr   
LongTensorr   r   r   r   r   r  no_gradr   EXAMPLE_DOC_STRINGr   r   	Generatorboolr   r   r   r/  __classcell__)rI   s   @r.   r2   r2      s   *Xxx  x ni78	x
 3x (x )+??@x ;FGx 0x ,x -x !x@"#0'x} 0'USXS_S_adSdMe 0'h '+25||25 25t 049=:>CG59>B(,JF  -JF !) 6JF "*%,,!7JF +35<<*@JF !!1!12JF "*%*:*:!;JF !JFZ
2!. # $)- $`F, U]]_12 )-/3-1#& #;?23MQ*.049=:>CG59>B(, GK();?%)/~1c49n%~1 S$s)^,~1 $E?	~1
 !~1 ~1 "%T#Y"78~1 #+3-~1 ~1 E%//43H"HIJ~1 %,,'~1  -~1 !) 6~1 "*%,,!7~1 +35<<*@~1  !!1!12!~1" "*%*:*:!;#~1$ !%~1& '~1( 8S#u||$<d$BCD)~1* !+~1, !)c3h 8-~1. c]/~1 3 ~1r0   r2   )NN)9r   typingr   r   r   r   r   r   r   r  r[   transformersr	   r
   r   r   r   r   r   r   r   r   r   modelsr   
schedulersr   utilsr   r   r   r   r   utils.import_utilsr   utils.torch_utilsr   r   pipeline_utilsr    r!   modeling_audioldm2r#   r$   r   r%   torch_xla.core.xla_modelcore	xla_modelr   r  
get_loggerr2  r   r9  r/   r2   ru   r0   r.   <module>rJ     s     = =      $ 3  : A C W  , ))MM			H	%> F ${1) {1r0   