
import inspect
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
from transformers import T5EncoderModel, T5TokenizerFast

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PipelineImageInput
from ...loaders import FromSingleFileMixin, LTXVideoLoraLoaderMixin
from ...models.autoencoders import AutoencoderKLLTXVideo
from ...models.transformers import LTXVideoTransformer3DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import LTXPipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTXImageToVideoPipeline
        >>> from diffusers.utils import export_to_video, load_image

        >>> pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> image = load_image(
        ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
        ... )
        >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

        >>> video = pipe(
        ...     image=image,
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=704,
        ...     height=480,
        ...     num_frames=161,
        ...     num_inference_steps=50,
        ... ).frames[0]
        >>> export_to_video(video, "output.mp4", fps=24)
        ```
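
        The settings above (704x480, 161 frames) follow the model card for this
        checkpoint. As a lower-memory variant (an assumption, not shown in this
        example), `pipe.enable_model_cpu_offload()` can replace `pipe.to("cuda")`.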
"""


def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.16,
):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu
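
# Worked example for the shift above (illustrative only, using the default
# parameters): for image_seq_len = 2048,
#     m  = (1.16 - 0.5) / (4096 - 256) ≈ 0.000172
#     b  = 0.5 - m * 256 ≈ 0.456
#     mu = 2048 * m + b ≈ 0.81
# i.e. longer token sequences receive a proportionally larger timestep shift.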

def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")

def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    r"""
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    """
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # rescale the results from guidance (fixes overexposure)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
    return noise_cfg

class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
    r"""
    Pipeline for image-to-video generation.

    Reference: https://github.com/Lightricks/LTX-Video

    Args:
        transformer ([`LTXVideoTransformer3DModel`]):
            Conditional Transformer architecture to denoise the encoded video latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKLLTXVideo`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
        tokenizer ([`T5TokenizerFast`]):
            Tokenizer of class
            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
    """
    ztext_encoder->transformer->vae)rE   prompt_embedsnegative_prompt_embedsr9   vaetext_encoder	tokenizertransformerc                 x   t         |           | j                  |||||       t        | dd       | j                  j
                  nd| _        t        | dd       | j                  j                  nd| _        t        | dd        | j                  j                  j                  nd| _        t        | d       | j                  j                  j                  nd| _        t        | j                        | _        t        | dd       | j"                  j$                  nd	| _        d
| _        d| _        d| _        y )N)r\   r]   r^   r_   r9   r\          r_   r   )vae_scale_factorr^           y   )super__init__register_modulesgetattrr\   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratior_   config
patch_sizetransformer_spatial_patch_sizepatch_size_ttransformer_temporal_patch_sizer   video_processorr^   model_max_lengthtokenizer_max_lengthdefault_heightdefault_widthdefault_frames)selfr9   r\   r]   r^   r_   r7   s         r(   ri   z LTXImageToVideoPipeline.__init__   s5    	%# 	 	
 3:$t2L2XDHH..^` 	* 4;43M3YDHH//_` 	+ 3:$t2T2`D##..fg 	+ 5<D-4P4\D##00bc 	,  .t?a?ab/6t[$/O/[DNN++ad 	! " !r*   Nr   rd   promptnum_videos_per_promptmax_sequence_lengthr,   dtypec                    |xs | j                   }|xs | j                  j                  }t        |t              r|gn|}t        |      }| j                  |d|ddd      }|j                  }|j                  }	|	j                         j                  |      }	| j                  |dd      j                  }
|
j                  d   |j                  d   k\  rXt        j                  ||
      sB| j                  j                  |
d d |dz
  df         }t        j!                  d	| d
|        | j                  |j                  |            d   }|j                  ||      }|j                  \  }}}|j#                  d|d      }|j%                  ||z  |d      }|	j%                  |d      }	|	j#                  |d      }	||	fS )N
max_lengthTpt)paddingr   
truncationadd_special_tokensreturn_tensorslongest)r   r   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: r   )r   r,   )_execution_devicer]   r   
isinstancestrr8   r^   	input_idsattention_maskbooltoshapetorchequalbatch_decodeloggerwarningrepeatview)r{   r|   r}   r~   r,   r   
batch_sizetext_inputstext_input_idsprompt_attention_maskuntruncated_idsremoved_textrZ   _seq_lens                  r(   _get_t5_prompt_embedsz-LTXImageToVideoPipeline._get_t5_prompt_embeds  s    14110**00'4&&[
nn *# % 
 %.. + : : 5 : : < ? ? G..SW.Xbb  $(<(<R(@@UcetIu>>66qJ]`aJadfJfGf7ghLNN'(	,A
 )).*;*;F*CDQG%((uV(D &++7A%,,Q0EqI%**:8M+MwXZ[ 5 : ::r J 5 < <=RTU V333r*   Tnegative_promptdo_classifier_free_guidancerZ   r[   r   negative_prompt_attention_maskc                    |
xs | j                   }
t        |t              r|gn|}|t        |      }n|j                  d   }|| j                  |||	|
|      \  }}|r||xs d}t        |t              r||gz  n|}|:t        |      t        |      ur$t        dt        |       dt        |       d      |t        |      k7  r!t        d| dt        |       d	| d| d
	      | j                  |||	|
|      \  }}||||fS )a"  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        """
        device = device or self._execution_device

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_embeds, prompt_attention_mask = self._get_t5_prompt_embeds(
                prompt=prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        if do_classifier_free_guidance and negative_prompt_embeds is None:
            negative_prompt = negative_prompt or ""
            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

            if prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )

            negative_prompt_embeds, negative_prompt_attention_mask = self._get_t5_prompt_embeds(
                prompt=negative_prompt,
                num_videos_per_prompt=num_videos_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                dtype=dtype,
            )

        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask

    def check_inputs(
        self,
        prompt,
        height,
        width,
        callback_on_step_end_tensor_inputs=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        prompt_attention_mask=None,
        negative_prompt_attention_mask=None,
    ):
        if height % 32 != 0 or width % 32 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and not isinstance(prompt, (str, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if prompt_embeds is not None and prompt_attention_mask is None:
            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")

        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )
            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
                raise ValueError(
                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed"
                    f" directly, but got: `prompt_attention_mask` {prompt_attention_mask.shape} !="
                    f" `negative_prompt_attention_mask` {negative_prompt_attention_mask.shape}."
                )

    @staticmethod
    def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor:
        # Unpacked latents of shape [B, C, F, H, W] are patched into tokens of shape
        # [B, C, F // p_t, p_t, H // p, p, W // p, p]. The patch dimensions are then permuted and collapsed into the
        # channel dimension, giving shape [B, F // p_t * H // p * W // p, C * p_t * p * p] (an ndim=3 tensor):
        # dim=0 is the batch size, dim=1 the effective video sequence length, dim=2 the effective feature count.
        batch_size, num_channels, num_frames, height, width = latents.shape
        post_patch_num_frames = num_frames // patch_size_t
        post_patch_height = height // patch_size
        post_patch_width = width // patch_size
        latents = latents.reshape(
            batch_size,
            -1,
            post_patch_num_frames,
            patch_size_t,
            post_patch_height,
            patch_size,
            post_patch_width,
            patch_size,
        )
        latents = latents.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7).flatten(1, 3)
        return latents

    @staticmethod
    def _unpack_latents(
        latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1
    ) -> torch.Tensor:
        # Packed latents of shape [B, S, D] (S is the effective video sequence length, D the effective feature
        # dimension) are unpacked and reshaped into a video tensor of shape [B, C, F, H, W]. This is the inverse
        # of the `_pack_latents` method.
        batch_size = latents.size(0)
        latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size)
        latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3)
        return latents

    @staticmethod
    def _normalize_latents(
        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
    ) -> torch.Tensor:
        # Normalize latents across the channel dimension [B, C, F, H, W]
        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
        latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
        latents = (latents - latents_mean) * scaling_factor / latents_std
        return latents

    @staticmethod
    def _denormalize_latents(
        latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
    ) -> torch.Tensor:
        # Denormalize latents across the channel dimension [B, C, F, H, W]
        latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
        latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype)
        latents = latents * latents_std / scaling_factor + latents_mean
        return latents

    def prepare_latents(
        self,
        image: Optional[torch.Tensor] = None,
        batch_size: int = 1,
        num_channels_latents: int = 128,
        height: int = 512,
        width: int = 704,
        num_frames: int = 161,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.Tensor] = None,
    ):
        height = height // self.vae_spatial_compression_ratio
        width = width // self.vae_spatial_compression_ratio
        num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1

        shape = (batch_size, num_channels_latents, num_frames, height, width)
        mask_shape = (batch_size, 1, num_frames, height, width)

        if latents is not None:
            conditioning_mask = latents.new_zeros(mask_shape)
            conditioning_mask[:, :, 0] = 1.0
            conditioning_mask = self._pack_latents(
                conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
            ).squeeze(-1)
            if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape:
                raise ValueError(
                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is"
                    f" {conditioning_mask.shape + (num_channels_latents,)}."
                )
            return latents.to(device=device, dtype=dtype), conditioning_mask

        if isinstance(generator, list):
            if len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective"
                    f" batch size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

            init_latents = [
                retrieve_latents(self.vae.encode(image[i].unsqueeze(0).unsqueeze(2)), generator[i])
                for i in range(batch_size)
            ]
        else:
            init_latents = [
                retrieve_latents(self.vae.encode(img.unsqueeze(0).unsqueeze(2)), generator) for img in image
            ]

        init_latents = torch.cat(init_latents, dim=0).to(dtype)
        init_latents = self._normalize_latents(init_latents, self.vae.latents_mean, self.vae.latents_std)
        init_latents = init_latents.repeat(1, 1, num_frames, 1, 1)

        conditioning_mask = torch.zeros(mask_shape, device=device, dtype=dtype)
        conditioning_mask[:, :, 0] = 1.0

        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        latents = init_latents * conditioning_mask + noise * (1 - conditioning_mask)

        conditioning_mask = self._pack_latents(
            conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
        ).squeeze(-1)
        latents = self._pack_latents(
            latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
        )

        return latents, conditioning_mask

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def guidance_rescale(self):
        return self._guidance_rescale

    @property
    def do_classifier_free_guidance(self):
        return self._guidance_scale > 1.0

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def current_timestep(self):
        return self._current_timestep

    @property
    def attention_kwargs(self):
        return self._attention_kwargs

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: PipelineImageInput = None,
        prompt: Union[str, List[str]] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: int = 512,
        width: int = 704,
        num_frames: int = 161,
        frame_rate: int = 25,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        guidance_scale: float = 3.0,
        guidance_rescale: float = 0.0,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        prompt_attention_mask: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
        decode_timestep: Union[float, List[float]] = 0.0,
        decode_noise_scale: Optional[Union[float, List[float]]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 128,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            image (`PipelineImageInput`):
                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass
                `prompt_embeds` instead.
            height (`int`, defaults to `512`):
                The height in pixels of the generated video.
            width (`int`, defaults to `704`):
                The width in pixels of the generated video.
            num_frames (`int`, defaults to `161`):
                The number of video frames to generate.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, defaults to `3.0`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are
                closely linked to the text `prompt`, usually at the expense of lower image quality.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16 of
                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
                Guidance rescale factor should fix overexposure when using zero terminal SNR.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask for text embeddings.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not
                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                Pre-generated attention mask for negative text embeddings.
            decode_timestep (`float`, defaults to `0.0`):
                The timestep at which generated video is decoded.
            decode_noise_scale (`float`, *optional*, defaults to `None`):
                The interpolation factor between random noise and denoised latents at the decode timestep.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ltx.LTXPipelineOutput`] instead of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, *optional*, defaults to `128`):
                Maximum sequence length to use with the `prompt`.

        Examples:

        Returns:
            [`~pipelines.ltx.LTXPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ltx.LTXPipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt=prompt,
            height=height,
            width=width,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            prompt_attention_mask=prompt_attention_mask,
            negative_prompt_attention_mask=negative_prompt_attention_mask,
        )

        self._guidance_scale = guidance_scale
        self._guidance_rescale = guidance_rescale
        self._attention_kwargs = attention_kwargs
        self._interrupt = False
        self._current_timestep = None

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Prepare text embeddings
        (
            prompt_embeds,
            prompt_attention_mask,
            negative_prompt_embeds,
            negative_prompt_attention_mask,
        ) = self.encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
            num_videos_per_prompt=num_videos_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            prompt_attention_mask=prompt_attention_mask,
            negative_prompt_attention_mask=negative_prompt_attention_mask,
            max_sequence_length=max_sequence_length,
            device=device,
        )
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)

        # 4. Prepare latent variables
        if latents is None:
            image = self.video_processor.preprocess(image, height=height, width=width)
            image = image.to(device=device, dtype=prompt_embeds.dtype)

        num_channels_latents = self.transformer.config.in_channels
        latents, conditioning_mask = self.prepare_latents(
            image,
            batch_size * num_videos_per_prompt,
            num_channels_latents,
            height,
            width,
            num_frames,
            torch.float32,
            device,
            generator,
            latents,
        )

        if self.do_classifier_free_guidance:
            conditioning_mask = torch.cat([conditioning_mask, conditioning_mask])

        # 5. Prepare timesteps
        latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
        latent_height = height // self.vae_spatial_compression_ratio
        latent_width = width // self.vae_spatial_compression_ratio
        video_sequence_length = latent_num_frames * latent_height * latent_width
        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
        mu = calculate_shift(
            video_sequence_length,
            self.scheduler.config.get("base_image_seq_len", 256),
            self.scheduler.config.get("max_image_seq_len", 4096),
            self.scheduler.config.get("base_shift", 0.5),
            self.scheduler.config.get("max_shift", 1.16),
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            timesteps,
            sigmas=sigmas,
            mu=mu,
        )
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        self._num_timesteps = len(timesteps)

        # 6. Prepare micro-conditions
        latent_frame_rate = frame_rate / self.vae_temporal_compression_ratio
        rope_interpolation_scale = (
            1 / latent_frame_rate,
            self.vae_spatial_compression_ratio,
            self.vae_spatial_compression_ratio,
        )

        # 7. Denoising loop
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                latent_model_input = latent_model_input.to(prompt_embeds.dtype)

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latent_model_input.shape[0])
                timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask)

                with self.transformer.cache_context("cond_uncond"):
                    noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        encoder_hidden_states=prompt_embeds,
                        timestep=timestep,
                        encoder_attention_mask=prompt_attention_mask,
                        num_frames=latent_num_frames,
                        height=latent_height,
                        width=latent_width,
                        rope_interpolation_scale=rope_interpolation_scale,
                        attention_kwargs=attention_kwargs,
                        return_dict=False,
                    )[0]
                noise_pred = noise_pred.float()

                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
                    timestep, _ = timestep.chunk(2)

                if self.guidance_rescale > 0:
                    # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)

                noise_pred = self._unpack_latents(
                    noise_pred,
                    latent_num_frames,
                    latent_height,
                    latent_width,
                    self.transformer_spatial_patch_size,
                    self.transformer_temporal_patch_size,
                )
                latents = self._unpack_latents(
                    latents,
                    latent_num_frames,
                    latent_height,
                    latent_width,
                    self.transformer_spatial_patch_size,
                    self.transformer_temporal_patch_size,
                )

                # the first latent frame holds the conditioning image; only the remaining frames are denoised
                noise_pred = noise_pred[:, :, 1:]
                noise_latents = latents[:, :, 1:]
                pred_latents = self.scheduler.step(noise_pred, t, noise_latents, return_dict=False)[0]

                latents = torch.cat([latents[:, :, :1], pred_latents], dim=2)
                latents = self._pack_latents(
                    latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
                )

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        if output_type == "latent":
            video = latents
        else:
            latents = self._unpack_latents(
                latents,
                latent_num_frames,
                latent_height,
                latent_width,
                self.transformer_spatial_patch_size,
                self.transformer_temporal_patch_size,
            )
            latents = self._denormalize_latents(
                latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
            )
            latents = latents.to(prompt_embeds.dtype)

            if not self.vae.config.timestep_conditioning:
                timestep = None
            else:
                noise = torch.randn(latents.shape, generator=generator, device=device, dtype=latents.dtype)
                if not isinstance(decode_timestep, list):
                    decode_timestep = [decode_timestep] * batch_size
                if decode_noise_scale is None:
                    decode_noise_scale = decode_timestep
                elif not isinstance(decode_noise_scale, list):
                    decode_noise_scale = [decode_noise_scale] * batch_size
                timestep = torch.tensor(decode_timestep, device=device, dtype=latents.dtype)
                decode_noise_scale = torch.tensor(decode_noise_scale, device=device, dtype=latents.dtype)[
                    :, None, None, None, None
                ]
                latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise

            video = self.vae.decode(latents, timestep, return_dict=False)[0]
            video = self.video_processor.postprocess_video(video, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return LTXPipelineOutput(frames=video)