import inspect
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import PIL
import torch
from transformers import T5EncoderModel, T5Tokenizer

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PipelineImageInput
from ...loaders import CogVideoXLoraLoaderMixin
from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from ...models.embeddings import get_3d_rotary_pos_embed
from ...pipelines.pipeline_utils import DiffusionPipeline
from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from .pipeline_output import CogVideoXPipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import CogVideoXImageToVideoPipeline
        >>> from diffusers.utils import export_to_video, load_image

        >>> pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
        >>> image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
        ... )
        >>> video = pipe(image, prompt, use_dynamic_cfg=True)
        >>> export_to_video(video.frames[0], "output.mp4", fps=8)
        ```
"""


def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
    tw = tgt_width
    th = tgt_height
    h, w = src
    r = h / w
    if r > (th / tw):
        resize_height = th
        resize_width = int(round(th / h * w))
    else:
        resize_width = tw
        resize_height = int(round(tw / w * h))

    crop_top = int(round((th - resize_height) / 2.0))
    crop_left = int(round((tw - resize_width) / 2.0))

    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
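
    Example (an illustrative sketch; assumes `pipe` is a loaded pipeline whose scheduler exposes the standard
    `set_timesteps` API):

    ```py
    >>> # default schedule of 50 steps, placed on the GPU
    >>> timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, num_inference_steps=50, device="cuda")
    >>> # custom schedule (only valid for schedulers whose `set_timesteps` accepts a `timesteps` argument)
    >>> timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, timesteps=[999, 749, 499, 249])
    ```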
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


class CogVideoXImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
    r"""
    Pipeline for image-to-video generation using CogVideoX.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKLCogVideoX`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. CogVideoX uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`CogVideoXTransformer3DModel`]):
            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
    ztext_encoder->transformer->vae)rL   prompt_embedsnegative_prompt_embeds	tokenizertext_encodervaetransformerr@   c                    t         |           | j                  |||||       t        | dd       r/dt	        | j
                  j                  j                        dz
  z  nd| _        t        | dd       r | j
                  j                  j                  nd| _
        t        | dd       r | j
                  j                  j                  nd| _        t        | j                        | _        y )	N)rU   rV   rW   rX   r@   rW      r         gffffff?)vae_scale_factor)super__init__register_modulesgetattrr?   rW   configblock_out_channelsvae_scale_factor_spatialtemporal_compression_ratiovae_scale_factor_temporalscaling_factorvae_scaling_factor_imager   video_processor)selfrU   rV   rW   rX   r@   r>   s         r.   r_   z&CogVideoXImageToVideoPipeline.__init__   s     	%# 	 	
 CJ$PUW[B\A#dhhoo889A=>bc 	% ;B$t:TDHHOO66Z[ 	& KRRVX]_cJd(F(Fjm%-t?\?\]r0   Nr      promptnum_videos_per_promptmax_sequence_lengthr2   dtypec                    |xs | j                   }|xs | j                  j                  }t        |t              r|gn|}t        |      }| j                  |d|ddd      }|j                  }| j                  |dd      j                  }	|	j                  d   |j                  d   k\  rXt        j                  ||	      sB| j                  j                  |	d d |dz
  df         }
t        j                  d	| d
|
        | j                  |j                  |            d   }|j                  ||      }|j                  \  }}}|j                  d|d      }|j!                  ||z  |d      }|S )N
max_lengthTpt)paddingrq   
truncationadd_special_tokensreturn_tensorslongest)rs   rv   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: r   )ro   r2   )_execution_devicerV   ro   
isinstancestrr?   rU   	input_idsshapetorchequalbatch_decodeloggerwarningtorepeatview)rj   rl   rm   rn   r2   ro   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrS   _seq_lens                 r.   _get_t5_prompt_embedsz3CogVideoXImageToVideoPipeline._get_t5_prompt_embeds   s    14110**00'4&&[
nn *# % 
 %....SW.Xbb  $(<(<R(@@UcetIu>>66qJ]`aJadfJfGf7ghLNN'(	,A
 )).*;*;F*CDQG%((uV(D &++7A%,,Q0EqI%**:8M+MwXZ[r0   Tnegative_promptdo_classifier_free_guidancerS   rT   c
                    |xs | j                   }t        |t              r|gn|}|t        |      }
n|j                  d   }
|| j                  |||||	      }|r||xs d}t        |t              r|
|gz  n|}|:t        |      t        |      ur$t        dt        |       dt        |       d      |
t        |      k7  r!t        d| dt        |       d	| d|
 d
	      | j                  |||||	      }||fS )a"  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device (`torch.device`, *optional*):
                The torch device on which to place the resulting embeddings.
            dtype (`torch.dtype`, *optional*):
                The torch dtype of the resulting embeddings.
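
        Example (an illustrative sketch; assumes `pipe` is an instantiated `CogVideoXImageToVideoPipeline`):

        ```py
        >>> prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
        ...     prompt="An astronaut riding a horse",
        ...     do_classifier_free_guidance=True,
        ...     num_videos_per_prompt=1,
        ... )
        ```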
        """
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_embeds = self._get_t5_prompt_embeds(
                prompt=prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        if do_classifier_free_guidance and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds = self._get_t5_prompt_embeds(
                prompt=negative_prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        return prompt_embeds, negative_prompt_embeds

    def prepare_latents(
        self,
        image: torch.Tensor,
        batch_size: int = 1,
        num_channels_latents: int = 16,
        num_frames: int = 13,
        height: int = 60,
        width: int = 90,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.Tensor] = None,
    ):
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
        shape = (
            batch_size,
            num_frames,
            num_channels_latents,
            height // self.vae_scale_factor_spatial,
            width // self.vae_scale_factor_spatial,
        )

        # For CogVideoX 1.5, pad the number of latent frames so it is divisible by patch_size_t
        if self.transformer.config.patch_size_t is not None:
            shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:]

        image = image.unsqueeze(2)  # [B, C, F, H, W]

        if isinstance(generator, list):
            image_latents = [
                retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
            ]
        else:
            image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image]

        image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]

        if not self.vae.config.invert_scale_latents:
            image_latents = self.vae_scaling_factor_image * image_latents
        else:
            image_latents = 1 / self.vae_scaling_factor_image * image_latents

        padding_shape = (
            batch_size,
            num_frames - 1,
            num_channels_latents,
            height // self.vae_scale_factor_spatial,
            width // self.vae_scale_factor_spatial,
        )

        latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype)
        image_latents = torch.cat([image_latents, latent_padding], dim=1)

        # Repeat the first frame for CogVideoX 1.5 so the frame dimension stays divisible by patch_size_t
        if self.transformer.config.patch_size_t is not None:
            first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...]
            image_latents = torch.cat([first_frame, image_latents], dim=1)

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents, image_latents

    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
        latents = 1 / self.vae_scaling_factor_image * latents

        frames = self.vae.decode(latents).sample
        return frames

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = timesteps[t_start * self.scheduler.order :]

        return timesteps, num_inference_steps - t_start

    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta corresponds to η in the DDIM paper and is only used by DDIM-style schedulers;
        # it is ignored by other schedulers and should be in [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(
        self,
        image,
        prompt,
        height,
        width,
        negative_prompt,
        callback_on_step_end_tensor_inputs,
        latents=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
    ):
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
                f" {type(image)}"
            )

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

    def fuse_qkv_projections(self) -> None:
        r"""Enables fused QKV projections."""
        self.fusing_transformer = True
        self.transformer.fuse_qkv_projections()

    def unfuse_qkv_projections(self) -> None:
        r"""Disable QKV projection fusion if enabled."""
        if not self.fusing_transformer:
            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
        else:
            self.transformer.unfuse_qkv_projections()
            self.fusing_transformer = False

    def _prepare_rotary_positional_embeddings(
        self,
        height: int,
        width: int,
        num_frames: int,
        device: torch.device,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)

        p = self.transformer.config.patch_size
        p_t = self.transformer.config.patch_size_t

        base_size_width = self.transformer.config.sample_width // p
        base_size_height = self.transformer.config.sample_height // p

        if p_t is None:
            # CogVideoX 1.0
            grid_crops_coords = get_resize_crop_region_for_grid(
                (grid_height, grid_width), base_size_width, base_size_height
            )
            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
                embed_dim=self.transformer.config.attention_head_dim,
                crops_coords=grid_crops_coords,
                grid_size=(grid_height, grid_width),
                temporal_size=num_frames,
                device=device,
            )
        else:
            # CogVideoX 1.5
            base_num_frames = (num_frames + p_t - 1) // p_t

            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
                embed_dim=self.transformer.config.attention_head_dim,
                crops_coords=None,
                grid_size=(grid_height, grid_width),
                temporal_size=base_num_frames,
                grid_type="slice",
                max_size=(base_size_height, base_size_width),
                device=device,
            )

        return freqs_cos, freqs_sin

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def attention_kwargs(self):
        return self._attention_kwargs

    @property
    def current_timestep(self):
        return self._current_timestep

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: PipelineImageInput,
        prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_frames: int = 49,
        num_inference_steps: int = 50,
        timesteps: Optional[List[int]] = None,
        guidance_scale: float = 6,
        use_dynamic_cfg: bool = False,
        num_videos_per_prompt: int = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 226,
    ) -> Union[CogVideoXPipelineOutput, Tuple]:
        """
        Function invoked when calling the pipeline for generation.

        Args:
            image (`PipelineImageInput`):
                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
                The height in pixels of the generated image. This is set to 480 by default for the best results.
            width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
                The width in pixels of the generated image. This is set to 720 by default for the best results.
            num_frames (`int`, defaults to `49`):
                Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
                num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
                needs to be satisfied is that of divisibility mentioned above.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to `6`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of the [Imagen paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked to
                the text `prompt`, usually at the expense of lower image quality.
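            use_dynamic_cfg (`bool`, *optional*, defaults to `False`):
                Whether to use a dynamic guidance scale instead of a fixed one. When enabled, the guidance applied at
                each timestep `t` follows the cosine schedule `1 + guidance_scale * ((1 - cos(pi *
                ((num_inference_steps - t) / num_inference_steps) ** 5.0)) / 2)` implemented in the denoising loop.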
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
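            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                forwarded to schedulers whose `step` method accepts an `eta` argument; it is ignored otherwise.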
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] instead
                of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, defaults to `226`):
                Maximum sequence length in encoded prompt. Must be consistent with
                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.

        Examples:

        Returns:
            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        r   )	r   rl   r   r   r   r   rL   rS   rT   NFr   g      ?)rl   r   r   rm   rS   rT   rn   r2   r   )r   r   )ro   rZ   )r   r   )
        """
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
        num_frames = num_frames or self.transformer.config.sample_frames

        num_videos_per_prompt = 1

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            image=image,
            prompt=prompt,
            height=height,
            width=width,
            negative_prompt=negative_prompt,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            latents=latents,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        self._guidance_scale = guidance_scale
        self._current_timestep = None
        self._attention_kwargs = attention_kwargs
        self._interrupt = False

        # 2. Default call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            num_videos_per_prompt=num_videos_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_sequence_length=max_sequence_length,
            device=device,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
        self._num_timesteps = len(timesteps)

        # 5. Prepare latents
        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1

        # For CogVideoX 1.5, the latent frames should be padded to make them divisible by patch_size_t
        patch_size_t = self.transformer.config.patch_size_t
        additional_frames = 0
        if patch_size_t is not None and latent_frames % patch_size_t != 0:
            additional_frames = patch_size_t - latent_frames % patch_size_t
            num_frames += additional_frames * self.vae_scale_factor_temporal

        image = self.video_processor.preprocess(image, height=height, width=width).to(
            device, dtype=prompt_embeds.dtype
        )

        latent_channels = self.transformer.config.in_channels // 2
        latents, image_latents = self.prepare_latents(
            image,
            batch_size * num_videos_per_prompt,
            latent_channels,
            num_frames,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Create rotary embeds if required
        image_rotary_emb = (
            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
            if self.transformer.config.use_rotary_positional_embeddings
            else None
        )

        # 8. Create ofs embeds if required
        ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0)

        # 9. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            # for DPM-solver++
            old_pred_original_sample = None
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
                latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2)

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latent_model_input.shape[0])

                # predict noise model_output
                with self.transformer.cache_context("cond_uncond"):
                    noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        encoder_hidden_states=prompt_embeds,
                        timestep=timestep,
                        ofs=ofs_emb,
                        image_rotary_emb=image_rotary_emb,
                        attention_kwargs=attention_kwargs,
                        return_dict=False,
                    )[0]
                noise_pred = noise_pred.float()

                # perform guidance
                if use_dynamic_cfg:
                    self._guidance_scale = 1 + guidance_scale * (
                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                    )
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
                else:
                    latents, old_pred_original_sample = self.scheduler.step(
                        noise_pred,
                        old_pred_original_sample,
                        t,
                        timesteps[i - 1] if i > 0 else None,
                        latents,
                        **extra_step_kwargs,
                        return_dict=False,
                    )
                latents = latents.to(prompt_embeds.dtype)

                # call the callback, if provided
                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self._current_timestep = None

        if not output_type == "latent":
            # Discard any padding frames that were added for CogVideoX 1.5
            latents = latents[:, additional_frames:]
            frames = self.decode_latents(latents)
            video = self.video_processor.postprocess_video(video=frames, output_type=output_type)
        else:
            video = latents

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return CogVideoXPipelineOutput(frames=video)