
import inspect
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from PIL import Image
from transformers import T5EncoderModel, T5Tokenizer

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...loaders import CogVideoXLoraLoaderMixin
from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from ...models.embeddings import get_3d_rotary_pos_embed
from ...pipelines.pipeline_utils import DiffusionPipeline
from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from .pipeline_output import CogVideoXPipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import CogVideoXDPMScheduler, CogVideoXVideoToVideoPipeline
        >>> from diffusers.utils import export_to_video, load_video

        >>> # Models: "THUDM/CogVideoX-2b" or "THUDM/CogVideoX-5b"
        >>> pipe = CogVideoXVideoToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")
        >>> pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config)
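        >>> # Optional: calling `pipe.enable_model_cpu_offload()` instead of `pipe.to("cuda")`
        >>> # (a standard DiffusionPipeline helper) lowers peak VRAM at some speed cost.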

        >>> input_video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hiker.mp4"
        ... )
        >>> prompt = (
        ...     "An astronaut stands triumphantly at the peak of a towering mountain. Panorama of rugged peaks and "
        ...     "valleys. Very futuristic vibe and animated aesthetic. Highlights of purple and golden colors in "
        ...     "the scene. The sky is looks like an animated/cartoonish dream of galaxies, nebulae, stars, planets, "
        ...     "moons, but the remainder of the scene is mostly realistic."
        ... )

        >>> video = pipe(
        ...     video=input_video, prompt=prompt, strength=0.8, guidance_scale=6, num_inference_steps=50
        ... ).frames[0]
        >>> export_to_video(video, "output.mp4", fps=8)
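        >>> # `strength` sets how far the input video is renoised before denoising: values
        >>> # near 0 stay close to `input_video`, values near 1 allow much larger changes.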
        ```
"""


def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
    tw = tgt_width
    th = tgt_height
    h, w = src
    r = h / w
    if r > (th / tw):
        resize_height = th
        resize_width = int(round(th / h * w))
    else:
        resize_width = tw
        resize_height = int(round(tw / w * h))

    crop_top = int(round((th - resize_height) / 2.0))
    crop_left = int(round((tw - resize_width) / 2.0))

    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
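
    Examples:
        A minimal usage sketch (illustrative only; assumes `pipe` is a loaded diffusers pipeline whose
        scheduler implements `set_timesteps`):

        ```py
        timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, num_inference_steps=50, device="cuda")
        ```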
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                " timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                " sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
    r"""
    Pipeline for video-to-video generation using CogVideoX.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKLCogVideoX`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. CogVideoX uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`CogVideoXTransformer3DModel`]):
            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
    """

    _optional_components = []
    model_cpu_offload_seq = "text_encoder->transformer->vae"
    _callback_tensor_inputs = [
        "latents",
        "prompt_embeds",
        "negative_prompt_embeds",
    ]

    def __init__(
        self,
        tokenizer: T5Tokenizer,
        text_encoder: T5EncoderModel,
        vae: AutoencoderKLCogVideoX,
        transformer: CogVideoXTransformer3DModel,
        scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler],
    ):
        super().__init__()

        self.register_modules(
            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
        )
        self.vae_scale_factor_spatial = (
            2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        )
        self.vae_scale_factor_temporal = (
            self.vae.config.temporal_compression_ratio if getattr(self, "vae", None) else 4
        )
        self.vae_scaling_factor_image = self.vae.config.scaling_factor if getattr(self, "vae", None) else 0.7

        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)

    def _get_t5_prompt_embeds(
        self,
        prompt: Union[str, List[str]] = None,
        num_videos_per_prompt: int = 1,
        max_sequence_length: int = 226,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        device = device or self._execution_device
        dtype = dtype or self.text_encoder.dtype

        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because `max_sequence_length` is set to "
                f" {max_sequence_length} tokens: {removed_text}"
            )

        prompt_embeds = self.text_encoder(text_input_ids.to(device))[0]
        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        # duplicate text embeddings for each generation per prompt, using mps friendly method
        _, seq_len, _ = prompt_embeds.shape
        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)

        return prompt_embeds

    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        do_classifier_free_guidance: bool = True,
        num_videos_per_prompt: int = 1,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        max_sequence_length: int = 226,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device (`torch.device`, *optional*):
                torch device
            dtype (`torch.dtype`, *optional*):
                torch dtype
        """
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_embeds = self._get_t5_prompt_embeds(
                prompt=prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        if do_classifier_free_guidance and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds = self._get_t5_prompt_embeds(
                prompt=negative_prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        return prompt_embeds, negative_prompt_embeds

    def prepare_latents(
        self,
        video: Optional[torch.Tensor] = None,
        batch_size: int = 1,
        num_channels_latents: int = 16,
        height: int = 60,
        width: int = 90,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.Tensor] = None,
        timestep: Optional[torch.Tensor] = None,
    ):
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        num_frames = (video.size(2) - 1) // self.vae_scale_factor_temporal + 1 if latents is None else latents.size(1)

        shape = (
            batch_size,
            num_frames,
            num_channels_latents,
            height // self.vae_scale_factor_spatial,
            width // self.vae_scale_factor_spatial,
        )

        if latents is None:
            if isinstance(generator, list):
                init_latents = [
                    retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size)
                ]
            else:
                init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]

            init_latents = torch.cat(init_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]
            init_latents = self.vae_scaling_factor_image * init_latents

            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            latents = self.scheduler.add_noise(init_latents, noise, timestep)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
        latents = latents.permute(0, 2, 1, 3, 4)  # [batch_size, num_channels, num_frames, height, width]
        latents = 1 / self.vae_scaling_factor_image * latents

        frames = self.vae.decode(latents).sample
        return frames

    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = timesteps[t_start * self.scheduler.order :]

        return timesteps, num_inference_steps - t_start

    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in the DDIM paper: https://huggingface.co/papers/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(
        self,
        prompt,
        height,
        width,
        strength,
        negative_prompt,
        callback_on_step_end_tensor_inputs,
        video=None,
        latents=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
    ):
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if strength < 0 or strength > 1:
            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )
        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

        if video is not None and latents is not None:
            raise ValueError("Only one of `video` or `latents` should be provided")

    def fuse_qkv_projections(self) -> None:
        r"""Enables fused QKV projections."""
        self.fusing_transformer = True
        self.transformer.fuse_qkv_projections()

    def unfuse_qkv_projections(self) -> None:
        r"""Disable QKV projection fusion if enabled."""
        if not self.fusing_transformer:
            logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.")
        else:
            self.transformer.unfuse_qkv_projections()
            self.fusing_transformer = False

    def _prepare_rotary_positional_embeddings(
        self,
        height: int,
        width: int,
        num_frames: int,
        device: torch.device,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)
        grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size)

        p = self.transformer.config.patch_size
        p_t = self.transformer.config.patch_size_t

        base_size_width = self.transformer.config.sample_width // p
        base_size_height = self.transformer.config.sample_height // p

        if p_t is None:
            # CogVideoX 1.0 checkpoints
            grid_crops_coords = get_resize_crop_region_for_grid(
                (grid_height, grid_width), base_size_width, base_size_height
            )
            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
                embed_dim=self.transformer.config.attention_head_dim,
                crops_coords=grid_crops_coords,
                grid_size=(grid_height, grid_width),
                temporal_size=num_frames,
                device=device,
            )
        else:
            # CogVideoX 1.5 checkpoints
            base_num_frames = (num_frames + p_t - 1) // p_t

            freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
                embed_dim=self.transformer.config.attention_head_dim,
                crops_coords=None,
                grid_size=(grid_height, grid_width),
                temporal_size=base_num_frames,
                grid_type="slice",
                max_size=(base_size_height, base_size_width),
                device=device,
            )

        return freqs_cos, freqs_sin

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def attention_kwargs(self):
        return self._attention_kwargs

    @property
    def current_timestep(self):
        return self._current_timestep

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        video: List[Image.Image] = None,
        prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        timesteps: Optional[List[int]] = None,
        strength: float = 0.8,
        guidance_scale: float = 6,
        use_dynamic_cfg: bool = False,
        num_videos_per_prompt: int = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 226,
    ) -> Union[CogVideoXPipelineOutput, Tuple]:
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            video (`List[PIL.Image.Image]`):
                The input video to condition the generation on. Must be a list of images/frames of the video.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
                The height in pixels of the generated image. This is set to 480 by default for the best results.
            width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
                The width in pixels of the generated image. This is set to 720 by default for the best results.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            strength (`float`, *optional*, defaults to 0.8):
                Higher strength leads to more differences between original video and generated video.
            guidance_scale (`float`, *optional*, defaults to 6):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages generating images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
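            use_dynamic_cfg (`bool`, *optional*, defaults to `False`):
                If `True`, rescales the guidance at every step (see the denoising loop below) as
                `1 + guidance_scale * (1 - cos(pi * ((num_inference_steps - t) / num_inference_steps) ** 5)) / 2`,
                so guidance starts weak and strengthens as denoising progresses.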
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
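            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper.
                Only forwarded to schedulers whose `step` method accepts an `eta` argument; ignored otherwise.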
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] instead
                of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, defaults to `226`):
                Maximum sequence length in encoded prompt. Must be consistent with
                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.

        Examples:

        Returns:
            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`:
            [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        """
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
        num_frames = len(video) if latents is None else latents.size(1)

        num_videos_per_prompt = 1

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt=prompt,
            height=height,
            width=width,
            strength=strength,
            negative_prompt=negative_prompt,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            video=video,
            latents=latents,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._current_timestep = None
        self._interrupt = False

        # 2. Default call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            num_videos_per_prompt=num_videos_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_sequence_length=max_sequence_length,
            device=device,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
        latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
        self._num_timesteps = len(timesteps)

        # 5. Prepare latents
        latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1

        # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t
        patch_size_t = self.transformer.config.patch_size_t
        if patch_size_t is not None and latent_frames % patch_size_t != 0:
            raise ValueError(
                f"The number of latent frames must be divisible by `{patch_size_t=}` but the given video contains"
                f" {latent_frames=}, which is not divisible."
            )

        if latents is None:
            video = self.video_processor.preprocess_video(video, height=height, width=width)
            video = video.to(device, dtype=prompt_embeds.dtype)

        latent_channels = self.transformer.config.in_channels
        latents = self.prepare_latents(
            video,
            batch_size * num_videos_per_prompt,
            latent_channels,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
            latent_timestep,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Create rotary embeds if required
        image_rotary_emb = (
            self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device)
            if self.transformer.config.use_rotary_positional_embeddings
            else None
        )

        # 8. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            # for DPM-solver++
            old_pred_original_sample = None
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latent_model_input.shape[0])

                # predict noise model_output
                with self.transformer.cache_context("cond_uncond"):
                    noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        encoder_hidden_states=prompt_embeds,
                        timestep=timestep,
                        image_rotary_emb=image_rotary_emb,
                        attention_kwargs=attention_kwargs,
                        return_dict=False,
                    )[0]
                noise_pred = noise_pred.float()

                # perform guidance
                if use_dynamic_cfg:
                    self._guidance_scale = 1 + guidance_scale * (
                        (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2
                    )
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                if not isinstance(self.scheduler, CogVideoXDPMScheduler):
                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
                else:
                    latents, old_pred_original_sample = self.scheduler.step(
                        noise_pred,
                        old_pred_original_sample,
                        t,
                        timesteps[i - 1] if i > 0 else None,
                        latents,
                        **extra_step_kwargs,
                        return_dict=False,
                    )
                latents = latents.to(prompt_embeds.dtype)

                # call the callback, if provided
                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self._current_timestep = None

        if not output_type == "latent":
            video = self.decode_latents(latents)
            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
        else:
            video = latents

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return CogVideoXPipelineOutput(frames=video)