
    big                     &   d dl Z d dlmZmZmZmZmZmZ d dlZ	d dl
Z
d dlmc mZ d dlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ d	d
lm Z m!Z!m"Z"m#Z#  e       rd dl$m%c m&Z' dZ(ndZ( ejR                  e*      Z+dZ, G d de!e"e#      Z-y)    N)AnyCallableDictListOptionalUnion)ClapTextModelWithProjectionRobertaTokenizerRobertaTokenizerFastSpeechT5HifiGan   )AutoencoderKLUNet2DConditionModel)KarrasDiffusionSchedulers)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )AudioPipelineOutputDeprecatedPipelineMixinDiffusionPipelineStableDiffusionMixinTFaj  
    Examples:
        ```py
        >>> from diffusers import AudioLDMPipeline
        >>> import torch
        >>> import scipy

        >>> repo_id = "cvssp/audioldm-s-full-v2"
        >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]
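        >>> # the call above samples fresh noise on every run; for reproducible outputs,
        >>> # a seeded `generator` (a documented argument of the pipeline) can be passed
        >>> generator = torch.Generator("cuda").manual_seed(0)
        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0, generator=generator).audios[0]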

        >>> # save the audio sample as a .wav file
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
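        >>> # the hard-coded 16 kHz rate above matches AudioLDM's vocoder; equivalently,
        >>> # the rate can be read from the model config instead of being hard-coded
        >>> scipy.io.wavfile.write("techno.wav", rate=pipe.vocoder.config.sampling_rate, data=audio)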
        ```
"""


class AudioLDMPipeline(DeprecatedPipelineMixin, DiffusionPipeline, StableDiffusionMixin):
    r"""
    Pipeline for text-to-audio generation using AudioLDM.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode mel spectrograms to and from latent
            representations.
        text_encoder ([`~transformers.ClapTextModelWithProjection`]):
            Frozen text-encoder (`ClapTextModelWithProjection`, specifically the
            [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant).
        tokenizer ([`PreTrainedTokenizer`]):
            A [`~transformers.RobertaTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded audio latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        vocoder ([`~transformers.SpeechT5HifiGan`]):
            Vocoder of class `SpeechT5HifiGan`.
    z0.33.1ztext_encoder->unet->vaevaetext_encoder	tokenizerunet	schedulervocoderc                     t         |           | j                  ||||||       t        | dd       r5dt	        | j
                  j                  j                        dz
  z  | _        y d| _        y )N)r   r   r   r   r    r!   r   r         )	super__init__register_modulesgetattrlenr   configblock_out_channelsvae_scale_factor)selfr   r   r   r   r    r!   	__class__s          i/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/audioldm/pipeline_audioldm.pyr&   zAudioLDMPipeline.__init__W   su     	% 	 	
 W^^bdikoVpc$((//*L*L&MPQ&Q Rvw    Nprompt_embedsnegative_prompt_embedsc                    |t        |t              rd}n-|t        |t              rt        |      }n|j                  d   }|S| j                  |d| j
                  j                  dd      }	|	j                  }
|	j                  }| j                  |dd	      j                  }|j                  d
   |
j                  d
   k\  rt        j                  |
|      sj| j
                  j                  |dd| j
                  j                  dz
  d
f         }t        j                  d| j
                  j                   d|        | j                  |
j                  |      |j                  |            }|j                   }t#        j$                  |d
      }|j                  | j                  j&                  |      }|j                  \  }}|j)                  d|      }|j+                  ||z  |      }|r||dg|z  }nt-        |      t-        |      ur$t/        dt-        |       dt-        |       d      t        |t              r|g}n1|t        |      k7  r!t1        d| dt        |       d| d| d	      |}|j                  d   }| j                  |d|dd      }|j                  j                  |      }|j                  j                  |      }| j                  ||      }|j                   }t#        j$                  |d
      }|rt|j                  d   }|j                  | j                  j&                  |      }|j)                  d|      }|j+                  ||z  |      }t        j2                  ||g      }|S )a`  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_waveforms_per_prompt (`int`):
                number of waveforms that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the audio generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
        """
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            attention_mask = text_inputs.attention_mask
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLAP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            prompt_embeds = self.text_encoder(
                text_input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            prompt_embeds = prompt_embeds.text_embeds
            # additional L_2 normalization over each hidden-state
            prompt_embeds = F.normalize(prompt_embeds, dim=-1)

        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

        bs_embed, seq_len = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt)
        prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            uncond_input_ids = uncond_input.input_ids.to(device)
            attention_mask = uncond_input.attention_mask.to(device)

            negative_prompt_embeds = self.text_encoder(
                uncond_input_ids,
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds.text_embeds
            # additional L_2 normalization over each hidden-state
            negative_prompt_embeds = F.normalize(negative_prompt_embeds, dim=-1)

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        return prompt_embeds

    def decode_latents(self, latents):
        latents = 1 / self.vae.config.scaling_factor * latents
        mel_spectrogram = self.vae.decode(latents).sample
        return mel_spectrogram

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        if mel_spectrogram.dim() == 4:
            mel_spectrogram = mel_spectrogram.squeeze(1)

        waveform = self.vocoder(mel_spectrogram)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        waveform = waveform.cpu().float()
        return waveform

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(
        self,
        prompt,
        audio_length_in_s,
        vocoder_upsample_factor,
        callback_steps,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
    ):
        min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor
        if audio_length_in_s < min_audio_length_in_s:
            raise ValueError(
                f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but "
                f"is {audio_length_in_s}."
            )

        if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0:
            raise ValueError(
                f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the "
                f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of "
                f"{self.vae_scale_factor}."
            )

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

    def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None):
        shape = (
            batch_size,
            num_channels_latents,
            int(height) // self.vae_scale_factor,
            int(self.vocoder.config.model_in_dim) // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        audio_length_in_s: Optional[float] = None,
        num_inference_steps: int = 10,
        guidance_scale: float = 2.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_waveforms_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: Optional[int] = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        output_type: Optional[str] = "np",
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
            audio_length_in_s (`int`, *optional*, defaults to 5.12):
                The length of the generated audio sample in seconds.
            num_inference_steps (`int`, *optional*, defaults to 10):
                The number of denoising steps. More denoising steps usually lead to higher quality audio at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 2.5):
                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
                The number of waveforms to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for audio
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that is called every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            output_type (`str`, *optional*, defaults to `"np"`):
                The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or
                `"pt"` to return a PyTorch `torch.Tensor` object.

        Examples:

        Returns:
            [`~pipelines.AudioPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated audio.
        """
        # 0. Convert audio input length from seconds to spectrogram height
        vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate

        if audio_length_in_s is None:
            audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor

        height = int(audio_length_in_s / vocoder_upsample_factor)

        original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate)
        if height % self.vae_scale_factor != 0:
            height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor
            logger.info(
                f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} "
                f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the "
                f"denoising process."
            )

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            audio_length_in_s,
            vocoder_upsample_factor,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_waveforms_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_waveforms_per_prompt,
            num_channels_latents,
            height,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual; AudioLDM conditions the UNet on the pooled CLAP
                # embedding via `class_labels` rather than via cross-attention
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=None,
                    class_labels=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

                if XLA_AVAILABLE:
                    xm.mark_step()

        # 8. Post-processing
        mel_spectrogram = self.decode_latents(latents)

        audio = self.mel_spectrogram_to_waveform(mel_spectrogram)

        audio = audio[:, :original_waveform_length]

        if output_type == "np":
            audio = audio.numpy()

        if not return_dict:
            return (audio,)

        return AudioPipelineOutput(audios=audio)