
import inspect
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from transformers import T5EncoderModel, T5Tokenizer

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...loaders import CogVideoXLoraLoaderMixin
from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from ...models.embeddings import get_3d_rotary_pos_embed
from ...pipelines.pipeline_utils import DiffusionPipeline
from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from .pipeline_output import CogVideoXPipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


EXAMPLE_DOC_STRING = """
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import CogVideoXPipeline
        >>> from diffusers.utils import export_to_video

        >>> # Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
        >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")
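        >>> # Optional: tiled VAE decoding lowers peak memory use when decoding the video
        >>> # (supported by the CogVideoX VAE): pipe.vae.enable_tiling()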
        >>> prompt = (
        ...     "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
        ...     "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
        ...     "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
        ...     "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
        ...     "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
        ...     "atmosphere of this unique musical performance."
        ... )
        >>> video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
        >>> export_to_video(video, "output.mp4", fps=8)
        ```
"""


# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
    tw = tgt_width
    th = tgt_height
    h, w = src
    r = h / w
    if r > (th / tw):
        resize_height = th
        resize_width = int(round(th / h * w))
    else:
        resize_width = tw
        resize_height = int(round(tw / w * h))

    crop_top = int(round((th - resize_height) / 2.0))
    crop_left = int(round((tw - resize_width) / 2.0))

    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
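
# For intuition: when the source and target aspect ratios already match, nothing is cropped,
# e.g. get_resize_crop_region_for_grid((480, 720), 720, 480) returns ((0, 0), (480, 720)).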


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
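
    Example (mirroring how `CogVideoXPipeline.__call__` uses this helper below):

    ```py
    timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps, device, timesteps)
    ```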
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
    r"""
    Pipeline for text-to-video generation using CogVideoX.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKLCogVideoX`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. CogVideoX uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`CogVideoXTransformer3DModel`]):
            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
    ztext_encoder->transformer->vae)latentsprompt_embedsnegative_prompt_embeds	tokenizertext_encodervaetransformerr>   c                    t         |           | j                  |||||       t        | dd       r/dt	        | j
                  j                  j                        dz
  z  nd| _        t        | dd       r | j
                  j                  j                  nd| _
        t        | dd       r | j
                  j                  j                  nd| _        t        | j                        | _        y )	N)rH   rI   rJ   rK   r>   rJ      r         gffffff?)vae_scale_factor)super__init__register_modulesgetattrr=   rJ   configblock_out_channelsvae_scale_factor_spatialtemporal_compression_ratiovae_scale_factor_temporalscaling_factorvae_scaling_factor_imager   video_processor)selfrH   rI   rJ   rK   r>   r<   s         r,   rR   zCogVideoXPipeline.__init__   s     	lQ\hq 	 	
 CJ$PUW[B\A#dhhoo889A=>bc 	% ;B$t:TDHHOO66Z[ 	& KRRVX]_cJd(F(Fjm%-t?\?\]r.   Nr      promptnum_videos_per_promptmax_sequence_lengthr0   dtypec                    |xs | j                   }|xs | j                  j                  }t        |t              r|gn|}t        |      }| j                  |d|ddd      }|j                  }| j                  |dd      j                  }	|	j                  d   |j                  d   k\  rXt        j                  ||	      sB| j                  j                  |	d d |dz
  df         }
t        j                  d	| d
|
        | j                  |j                  |            d   }|j                  ||      }|j                  \  }}}|j                  d|d      }|j!                  ||z  |d      }|S )N
max_lengthTpt)paddingrd   
truncationadd_special_tokensreturn_tensorslongest)rf   ri   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: r   )rb   r0   )_execution_devicerI   rb   
isinstancestrr=   rH   	input_idsshapetorchequalbatch_decodeloggerwarningtorepeatview)r]   r_   r`   ra   r0   rb   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrF   _seq_lens                 r,   _get_t5_prompt_embedsz'CogVideoXPipeline._get_t5_prompt_embeds   s    14110**00'4&&[
nn *# % 
 %....SW.Xbb  $(<(<R(@@UcetIu>>66qJ]`aJadfJfGf7ghLNN'(	,A
 )).*;*;F*CDQG%((uV(D &++7A%,,Q0EqI%**:8M+MwXZ[r.   Tnegative_promptdo_classifier_free_guidancerF   rG   c
                    |xs | j                   }t        |t              r|gn|}|t        |      }
n|j                  d   }
|| j                  |||||	      }|r||xs d}t        |t              r|
|gz  n|}|:t        |      t        |      ur$t        dt        |       dt        |       d      |
t        |      k7  r!t        d| dt        |       d	| d|
 d
	      | j                  |||||	      }||fS )a"  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        """
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_embeds = self._get_t5_prompt_embeds(
                prompt=prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        if do_classifier_free_guidance and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds = self._get_t5_prompt_embeds(
                prompt=negative_prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        return prompt_embeds, negative_prompt_embeds

    def prepare_latents(
        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
    ):
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        shape = (
            batch_size,
            (num_frames - 1) // self.vae_scale_factor_temporal + 1,
            num_channels_latents,
            height // self.vae_scale_factor_spatial,
            width // self.vae_scale_factor_spatial,
        )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
        latents = 1 / self.vae_scaling_factor_image * latents

        frames = self.vae.decode(latents).sample
        return frames

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
    def check_inputs(
        self,
        prompt,
        height,
        width,
        negative_prompt,
        callback_on_step_end_tensor_inputs,
        prompt_embeds=None,
        negative_prompt_embeds=None,
    ):
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

    def fuse_qkv_projections(self) -> None:
        r"""Enables fused QKV projections."""
        self.fusing_transformer = True
        self.transformer.fuse_qkv_projections()

    def unfuse_qkv_projections(self) -> None:
        r"""Disable QKV projection fusion if enabled."""
        if not self.fusing_transformer:
            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
        else:
            self.transformer.unfuse_qkv_projections()
            self.fusing_transformer = False

    def _prepare_rotary_positional_embeddings(
        self,
        height: int,
        width: int,
        num_frames: int,
        device: torch.device,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)

        p = self.transformer.config.patch_size
        p_t = self.transformer.config.patch_size_t

        base_size_width = self.transformer.config.sample_width // p
        base_size_height = self.transformer.config.sample_height // p

        if p_t is None:
            # CogVideoX 1.0
            grid_crops_coords = get_resize_crop_region_for_grid(
                (grid_height, grid_width), base_size_width, base_size_height
            )
            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
                embed_dim=self.transformer.config.attention_head_dim,
                crops_coords=grid_crops_coords,
                grid_size=(grid_height, grid_width),
                temporal_size=num_frames,
                device=device,
            )
        else:
            # CogVideoX 1.5
            base_num_frames = (num_frames + p_t - 1) // p_t

            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
                embed_dim=self.transformer.config.attention_head_dim,
                crops_coords=None,
                grid_size=(grid_height, grid_width),
                temporal_size=base_num_frames,
                grid_type="slice",
                max_size=(base_size_height, base_size_width),
                device=device,
            )

        return freqs_cos, freqs_sin

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def attention_kwargs(self):
        return self._attention_kwargs

    @property
    def current_timestep(self):
        return self._current_timestep

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_frames: Optional[int] = None,
        num_inference_steps: int = 50,
        timesteps: Optional[List[int]] = None,
        guidance_scale: float = 6,
        use_dynamic_cfg: bool = False,
        num_videos_per_prompt: int = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 226,
    ) -> Union[CogVideoXPipelineOutput, Tuple]:
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
                The height in pixels of the generated image. This is set to 480 by default for the best results.
            width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
                The width in pixels of the generated image. This is set to 720 by default for the best results.
            num_frames (`int`, *optional*, defaults to `self.transformer.config.sample_frames`):
                Number of frames to generate. `num_frames - 1` should be divisible by self.vae_scale_factor_temporal.
                The generated video will contain 1 extra frame because CogVideoX is conditioned with (num_seconds *
                fps + 1) frames, where num_seconds is 6 and fps is 8. However, since videos can be saved at any fps,
                the only condition that needs to be satisfied is that of the divisibility mentioned above.
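                With the default configuration this works out to 49 frames (6 * 8 + 1).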
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to `6`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of the [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are closely
                linked to the text `prompt`, usually at the expense of lower image quality.
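                Concretely, each denoising step applies
                `noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)`.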
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] instead
                of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, defaults to `226`):
                Maximum sequence length in encoded prompt. Must be consistent with
                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.
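                The released CogVideoX checkpoints expect a text sequence length of 226, which is also the default
                used here.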

        Examples:

        Returns:
            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] or `tuple`:
            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] or `tuple`:
            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        """

        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
        num_frames = num_frames or self.transformer.config.sample_frames

        num_videos_per_prompt = 1

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            height,
            width,
            negative_prompt,
            callback_on_step_end_tensor_inputs,
            prompt_embeds,
            negative_prompt_embeds,
        )
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._current_timestep = None
        self._interrupt = False

        # 2. Default call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            negative_prompt,
            do_classifier_free_guidance,
            num_videos_per_prompt=num_videos_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_sequence_length=max_sequence_length,
            device=device,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
        self._num_timesteps = len(timesteps)

        # 5. Prepare latents
        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1

        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
        patch_size_t = self.transformer.config.patch_size_t
        additional_frames = 0
        if patch_size_t is not None and latent_frames % patch_size_t != 0:
            additional_frames = patch_size_t - latent_frames % patch_size_t
            num_frames += additional_frames * self.vae_scale_factor_temporal

        latent_channels = self.transformer.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            latent_channels,
            num_frames,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Create rotary embeds if required
        image_rotary_emb = (
            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
            if self.transformer.config.use_rotary_positional_embeddings
            else None
        )

        # 8. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            # for DPM-solver++
            old_pred_original_sample = None
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latent_model_input.shape[0])

                with self.transformer.cache_context("cond_uncond"):
                    noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        encoder_hidden_states=prompt_embeds,
                        timestep=timestep,
                        image_rotary_emb=image_rotary_emb,
                        attention_kwargs=attention_kwargs,
                        return_dict=False,
                    )[0]
                noise_pred = noise_pred.float()

                # perform guidance
                if use_dynamic_cfg:
                    self._guidance_scale = 1 + guidance_scale * (
                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                    )
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
                else:
                    latents, old_pred_original_sample = self.scheduler.step(
                        noise_pred,
                        old_pred_original_sample,
                        t,
                        timesteps[i - 1] if i > 0 else None,
                        latents,
                        **extra_step_kwargs,
                        return_dict=False,
                    )
                latents = latents.to(prompt_embeds.dtype)

                # call the callback, if provided
                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self._current_timestep = None

        if not output_type == "latent":
            # Discard any padding frames that were added for CogVideoX 1.5
            latents = latents[:, additional_frames:]
            video = self.decode_latents(latents)
            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
        else:
            video = latents

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return CogVideoXPipelineOutput(frames=video)