
    bi"                     H   d dl Z d dlmZmZmZmZmZmZ d dlZd dl	m
Z
mZmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7  e'       rd dl8m9c m:Z; dZ<ndZ< e(jz                  e>      Z?dZ@	 d"dej                  deej                     deCfdZD	 	 	 	 d#deeE   deeeCej                  f      deeeE      deeeG      fdZH G d  d!e4e5eeee0e2e
      ZIy)$    N)AnyCallableDictListOptionalUnion)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)FromSingleFileMixinIPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModelUNetMotionModel)adjust_lora_scale_text_encoder)MotionAdapter)DDIMSchedulerDPMSolverMultistepSchedulerEulerAncestralDiscreteSchedulerEulerDiscreteSchedulerLMSDiscreteSchedulerPNDMScheduler)USE_PEFT_BACKENDis_torch_xla_availableloggingscale_lora_layersunscale_lora_layers)randn_tensor)VideoProcessor   )FreeInitMixin)AnimateDiffFreeNoiseMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputTFa  
    Examples:
        ```py
        >>> import imageio
        >>> import requests
        >>> import torch
        >>> from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter
        >>> from diffusers.utils import export_to_gif
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> adapter = MotionAdapter.from_pretrained(
        ...     "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
        ... )
        >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
        ...     "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter
        ... ).to("cuda")
        >>> pipe.scheduler = DDIMScheduler(
        ...     beta_schedule="linear", steps_offset=1, clip_sample=False, timestep_spacing="linspace"
        ... )


        >>> def load_video(file_path: str):
        ...     images = []

        ...     if file_path.startswith(("http://", "https://")):
        ...         # If the file_path is a URL
        ...         response = requests.get(file_path)
        ...         response.raise_for_status()
        ...         content = BytesIO(response.content)
        ...         vid = imageio.get_reader(content)
        ...     else:
        ...         # Assuming it's a local file path
        ...         vid = imageio.get_reader(file_path)

        ...     for frame in vid:
        ...         pil_image = Image.fromarray(frame)
        ...         images.append(pil_image)

        ...     return images


        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
        ... )
        >>> output = pipe(
        ...     video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5
        ... )
        >>> frames = output.frames[0]
        >>> export_to_gif(frames, "animation.gif")
        ```
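
        If the whole float16 pipeline does not fit on your GPU, the standard `DiffusionPipeline` memory helpers can
        be used instead of calling `.to("cuda")` directly (a minimal sketch, assuming the usual offloading and VAE
        slicing helpers; tune to your hardware):

        ```py
        >>> pipe.enable_model_cpu_offload()  # keep each sub-model on the GPU only while it is being used
        >>> pipe.enable_vae_slicing()  # decode the generated frames in slices to lower peak memory
        ```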
"""


def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
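
    Example:
        A rough usage sketch (not part of the original docstring; assumes a pipeline whose scheduler is already
        configured, e.g. the `DDIMScheduler` from the module-level example):

        ```py
        >>> timesteps, num_inference_steps = retrieve_timesteps(
        ...     pipe.scheduler, num_inference_steps=25, device="cuda"
        ... )
        ```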
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                " timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                " sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


class AnimateDiffVideoToVideoPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FreeInitMixin,
    AnimateDiffFreeNoiseMixin,
):
    r"""
    Pipeline for video-to-video generation.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)r4   prompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetrS   rJ   rQ   rR   c	           
      \   t         	|           t        |t              rt	        j
                  ||      }| j                  ||||||||       t        | dd       r/dt        | j                  j                  j                        dz
  z  nd| _        t        | j                        | _        y )N)rV   rW   rX   rY   rS   rJ   rQ   rR   rV   r&   r+      )vae_scale_factor)super__init__
isinstancer   r   from_unet2dregister_modulesgetattrrI   rV   configblock_out_channelsr\   r%   video_processor)
selfrV   rW   rX   rY   rS   rJ   rQ   rR   rH   s
            r8   r^   z(AnimateDiffVideoToVideoPipeline.__init__   s    $ 	d01"..t^DD%)/' 	 		
 W^^bdikoVpc$((//*L*L&MPQ&Q Rvw-t?T?TUr:   rT   rU   
lora_scale	clip_skipc
                 T
   |Jt        | t              r:|| _        t        st	        | j
                  |       nt        | j
                  |       |t        |t        t        f      rd}
n-|t        |t              rt        |      }
n|j                  d   }
|t        | t              r| j                  || j                        }| j                  |d| j                  j                  dd      }|j                   }| j                  |dd	      j                   }|j                  d
   |j                  d
   k\  rt#        j$                  ||      sj| j                  j'                  |dd| j                  j                  dz
  d
f         }t(        j+                  d| j                  j                   d|        t-        | j
                  j.                  d      r<| j
                  j.                  j0                  r|j2                  j5                  |      }nd}|	(| j                  |j5                  |      |      }|d   }nT| j                  |j5                  |      |d      }|d
   |	dz       }| j
                  j6                  j9                  |      }| j
                  | j
                  j:                  }n/| j<                  | j<                  j:                  }n|j:                  }|j5                  ||      }|j                  \  }}}|j?                  d|d      }|jA                  ||z  |d
      }|rm|j|dg|
z  }n|:tC        |      tC        |      ur$tE        dtC        |       dtC        |       d      t        |t              r|g}n1|
t        |      k7  r!tG        d| dt        |       d| d|
 d	      |}t        | t              r| j                  || j                        }|j                  d   }| j                  |d|dd      }t-        | j
                  j.                  d      r<| j
                  j.                  j0                  r|j2                  j5                  |      }nd}| j                  |j                   j5                  |      |      }|d   }|rK|j                  d   }|j5                  ||      }|j?                  d|d      }|jA                  |
|z  |d
      }| j
                  ,t        | t              rt        rtI        | j
                  |       ||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr+   r   
max_lengthTpt)paddingrj   
truncationreturn_tensorslongest)rl   rn   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rr   output_hidden_states)dtyper<    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)%r_   r   _lora_scaler   r   rW   r"   strdictlistrI   shaper   maybe_convert_promptrX   model_max_length	input_idstorchequalbatch_decodeloggerwarningr5   rc   rq   rr   to
text_modelfinal_layer_normrt   rY   repeatviewtype	TypeErrorrA   r#   )rf   promptr<   num_images_per_promptdo_classifier_free_guidancenegative_promptrT   rU   rg   rh   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrr   prompt_embeds_dtypebs_embedseq_len_uncond_tokensrj   uncond_inputs                          r8   encode_promptz-AnimateDiffVideoToVideoPipeline.encode_prompt  s:   V !j7U&V)D $.t/@/@*M!$"3"3Z@*Vc4["AJJvt$<VJ&,,Q/J $ ;<2264>>J..$>>::# ) K )22N"nnVYW[n\ffO$$R(N,@,@,DDU[[N  $~~::#At~~'F'F'JR'O$OP  778	,Q
 t((//1EF4K\K\KcKcKvKv!,!;!;!>!>v!F!%  $ 1 1.2C2CF2K\j 1 k -a 0 $ 1 1"%%f-ncg !2 ! !.b 1IM2B C
 !% 1 1 < < M Mm \("&"3"3"9"9YY""&))//"/"5"5%((/B6(R,22'1%,,Q0EqI%**86K+KWVXY '+A+I&!#z 1#VD<Q(QUVZ[jVkUl mV~Q(  OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0 $ ;< $ 9 9- X&,,Q/J>>$%# * L t((//1EF4K\K\KcKcKvKv!-!<!<!?!?!G!%%)%6%6&&))&1- &7 &" &<A%>"&,2215G%;%>%>EXag%>%h"%;%B%B1F[]^%_"%;%@%@NcAcelnp%q"($ >?DT#D$5$5zB444r:   c                 |   t        | j                  j                               j                  }t	        |t
        j                        s| j                  |d      j                  }|j                  ||      }|r}| j                  |d      j                  d   }|j                  |d      }| j                  t        j                  |      d      j                  d   }|j                  |d      }||fS | j                  |      j                  }|j                  |d      }t        j                  |      }	||	fS )	Nrk   )rn   r<   rt   T)rs   r   dim)nextrR   rF   rt   r_   r   TensorrQ   pixel_valuesr   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
rf   imager<   r   rs   rt   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedss
             r8   encode_imagez,AnimateDiffVideoToVideoPipeline.encode_image  sD   T''2245;;%.**5*FSSEe4&*&8&8UY&8&Z&h&hik&l#&=&O&OPekl&O&m#-1-?-?  'd .@ .mB. * .L-]-]%1 .^ .* +,JJJ--e4AAL'99:OUV9WL"'"2"2<"@!444r:   c                    g }|rg }|t        |t              s|g}t        |      t        | j                  j                  j
                        k7  rBt        dt        |       dt        | j                  j                  j
                         d      t        || j                  j                  j
                        D ]`  \  }}	t        |	t               }
| j                  ||d|
      \  }}|j                  |d d d f          |sIj                  |d d d f          b n?|D ]:  }|r%|j                  d      \  }}j                  |       |j                  |       < g }t        |      D ]|  \  }}t        j                  |g|z  d      }|r7t        j                  |   g|z  d      }t        j                  ||gd      }|j                  |      }|j                  |       ~ |S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r+   r&   r   r   )r<   )r_   rz   rI   rY   encoder_hid_projimage_projection_layersrA   zipr   r   appendchunk	enumerater   catr   )rf   ip_adapter_imageip_adapter_image_embedsr<   r   r   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsis                 r8   prepare_ip_adapter_image_embedsz?AnimateDiffVideoToVideoPipeline.prepare_ip_adapter_image_embeds  sY    &$&!"*.5$4#5 #$DII,F,F,^,^(__ abefvbwax  yE  FI  JN  JS  JS  Jd  Jd  J|  J|  F}  E~  ~K  L  >A $))"<"<"T"T> 
X9')9 +55E*W&W#DHDUDU+VQ8KEA#%A ##$7a$@A.)001MdTUg1VW
X (? 9#.H[HaHabcHdE02E)001MN##$78	9 #%&/&= 	@"A""'))-@,ADY,Y_`"a*/4yy:OPQ:R9SVk9kqr/s,&+ii1MOb0cij&k#"5"8"8"8"G#**+>?	@ '&r:      decode_chunk_sizereturnc                     g }t        dt        |      |      D ]A  }||||z    }t        | j                  j	                  |      |      }|j                  |       C t        j                  |      S )Nr   )r.   )rangerI   r9   rV   encoder   r   r   )rf   videor.   r   r4   r   batch_videos          r8   encode_videoz,AnimateDiffVideoToVideoPipeline.encode_video  sm    q#e*&78 	(AA(9$9:K*488??;+GS\]KNN;'	( yy!!r:   c                 N   d| j                   j                  j                  z  |z  }|j                  \  }}}}}|j	                  ddddd      j                  ||z  |||      }g }t        d|j                  d   |      D ]@  }	||	|	|z    }
| j                   j                  |
      j                  }
|j                  |
       B t        j                  |      }|d d d f   j                  ||df|j                  dd  z         j	                  ddddd      }|j                         }|S )Nr+   r   r&   r      rp   )rV   rc   scaling_factorr{   permutereshaper   decoder2   r   r   r   float)rf   r4   r   r   channels
num_framesheightwidthr   r   batch_latentss              r8   decode_latentsz.AnimateDiffVideoToVideoPipeline.decode_latents  s'   dhhoo444w>:A--7
Hj&%//!Q1a088j9PRZ\bdijq'--*,=> 	(A#A,=(=>M HHOOM:AAMLL'	(
 		% dAg&&
J'CekkRSRTo'UV^^_`bcefhiklmr:   c                 V   dt        t        j                  | j                  j                        j
                  j                               v }i }|r||d<   dt        t        j                  | j                  j                        j
                  j                               v }|r||d<   |S )Netar.   )rB   rC   rD   rJ   steprF   rG   )rf   r.   r   accepts_etaextra_step_kwargsaccepts_generators         r8   prepare_extra_step_kwargsz9AnimateDiffVideoToVideoPipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r:   c           
      n    |dk  s|dkD  rt        d|       |dz  dk7  s|dz  dk7  rt        d| d| d      |Lt         fd|D              s8t        d	 j                   d
|D cg c]  }| j                  vs| c}       ||t        d| d| d      ||t        d      |2t        |t        t
        t        f      st        dt        |             ||	t        d| d|	 d      |A|	?|j                  |	j                  k7  r&t        d|j                   d|	j                   d      ||t        d      |
|t        d      |Ut        |t
              st        dt        |             |d   j                  dvrt        d|d   j                   d      y y c c}w )Nr   r+   z2The value of strength should in [0.0, 1.0] but is r[   z7`height` and `width` have to be divisible by 8 but are z and rv   c              3   :   K   | ]  }|j                   v   y wN)_callback_tensor_inputs).0krf   s     r8   	<genexpr>z?AnimateDiffVideoToVideoPipeline.check_inputs.<locals>.<genexpr>A  s#      F
23A---F
s   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z:`prompt` has to be of type `str`, `list` or `dict` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z3Only one of `video` or `latents` should be providedzProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is D)
rA   allr   r_   rx   rz   ry   r   r{   ndim)rf   r   strengthr   r   r   r4   r   rT   rU   r   r   "callback_on_step_end_tensor_inputsr   s   `             r8   check_inputsz,AnimateDiffVideoToVideoPipeline.check_inputs,  s    a<8a<QRZQ[\]]A:?eai1nVW]V^^cdicjjklmm-9# F
7YF
 C
 DTEaEaDbbn  |^  pHvw  bc  ko  kG  kG  bGpq  pH  oI  J  -";08N}o ^0 0  ^ 5w  
6Ct;L(MYZ^_eZfYghii&+A+M9/9J K*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8  !4RSS',C,O ^  #.5t< PQUVmQnPop  )+00> \]tuv]w]|]|\}}~  ? /K pHs   )F2=F2c                     t        t        ||z        |      }t        ||z
  d      }||| j                  j                  z  d  }|||z
  fS )Nr   )minintmaxrJ   order)rf   r;   r=   r   r<   init_timestept_starts          r8   get_timestepsz-AnimateDiffVideoToVideoPipeline.get_timestepst  sY    C 3h >?ATU)M91=g(<(<<>?	-777r:   r+   Fr   r   r   num_channels_latentsr   timesteprt   r<   r.   r4   	add_noisec                    |
|j                   d   n|
j                   d   }||||| j                  z  || j                  z  f}t        |	t              r)t	        |	      |k7  rt        dt	        |	       d| d      |
O| j                  j                  j                  r:|j                         }| j                  j                  t        j                         t        |	t              r?t        |      D cg c]*  }| j                  ||   |	|   |      j                  d      , }}n/|D cg c]$  }| j                  ||	|      j                  d      & }}t        j                   |d      }| j                  j                  j                  r| j                  j                  |       |j                  |      }| j                  j                  j"                  |z  }||j                   d   kD  r6||j                   d   z  dk(  r!d	| d
|j                   d    d}t        |      ||j                   d   kD  r4||j                   d   z  dk7  rt        d|j                   d    d| d      t        j                   |gd      }t%        |j                   |	||      }| j&                  j)                  |||      j+                  ddddd      }
|
S ||
j                   k7  rt        d|d|
j                         |
j                  ||      }
|r,t%        ||	||      }| j&                  j)                  |
||      }
|
S c c}w c c}w )Nr+   r&   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rt   r   r   zYou have passed z# text prompts (`prompt`), but only zp initial images (`image`). Please make sure to update your script to pass as many initial images as text promptsz'Cannot duplicate `image` of batch size z to z text prompts.)r.   r<   rt   r   r   z!`latents` expected to have shape=z, but found latents.shape=)r{   r\   r_   rz   rI   rA   rV   rc   force_upcastr   r   r   float32r   r   	unsqueezer   r   r$   rJ   r   r   )rf   r   r   r   r   r   r   rt   r<   r.   r4   r   r   r   r{   r   init_latentsviderror_messagenoises                       r8   prepare_latentsz/AnimateDiffVideoToVideoPipeline.prepare_latents}  sO    (/U[[^GMM!<L
 d+++T***
 i&3y>Z+GA#i.AQ R&<'gi 
 ?xx++%--0)T* #:.  %%eAh	!>OPZZ[\]   
 nssfi 1 1#yBS T ^ ^_` ass 99\q9L xx++E"'??51L88??99LHLL..q11j<CUCUVWCX6X\]6] 'zl2UVbVhVhijVkUl m   !//l0033
\EWEWXYEZ8Z^_8_ =l>P>PQR>S=TTXYcXddrs   %yy,Q? !3!3yQW_deEnn..|UHMUUVWYZ\]_`bcdG  % #EuhFaSZS`S`Rb!cddjjuj5G$UiV[\..227E8LW 
  ts   5/L:+)L?c                     | j                   S r   _guidance_scalerf   s    r8   guidance_scalez.AnimateDiffVideoToVideoPipeline.guidance_scale  s    ###r:   c                     | j                   S r   )
_clip_skipr   s    r8   rh   z)AnimateDiffVideoToVideoPipeline.clip_skip      r:   c                      | j                   dkD  S )Nr+   r   r   s    r8   r   z;AnimateDiffVideoToVideoPipeline.do_classifier_free_guidance  s    ##a''r:   c                     | j                   S r   )_cross_attention_kwargsr   s    r8   cross_attention_kwargsz6AnimateDiffVideoToVideoPipeline.cross_attention_kwargs  s    +++r:   c                     | j                   S r   )_num_timestepsr   s    r8   num_timestepsz-AnimateDiffVideoToVideoPipeline.num_timesteps  s    """r:   c                     | j                   S r   )
_interruptr   s    r8   	interruptz)AnimateDiffVideoToVideoPipeline.interrupt  r  r:   2   g      @g?g        pilTr   r;   enforce_inference_stepsr=   r>   r   r   r   num_videos_per_promptr   r   r   output_typereturn_dictr  callback_on_step_endr   c                 6   |xs- | j                   j                  j                  | j                  z  }|xs- | j                   j                  j                  | j                  z  }d}| j	                  ||
||||||||||       |	| _        || _        || _        d| _        |t        |t        t        f      rd}n-|t        |t              rt        |      }n|j                  d   }| j                  }| j                   }|sKt#        | j$                  ||||      \  }}| j'                  |||
|      \  }}|dd j)                  ||z        }nGt+        ||
z        }t#        | j$                  ||||      \  }}|| d }|dd j)                  ||z        }|F| j,                  j/                  |||      }|j1                  ddddd	      }|j3                  ||
      }| j                   j                  j4                  } | j7                  |||| ||z  |||||||      }| j8                  | j8                  j;                  dd      nd}!|j                  d   }"| j<                  r3| j?                  ||"||| j@                  ||||!| jB                  
      \  }}ng| jE                  |||| j@                  ||||!| jB                  	      \  }}| j@                  rtG        jH                  ||g      }|jK                  |"d      }||"| jM                  |||||z  | j@                        }#| jO                  ||      }$||d#ind}%| jP                  r| jR                  nd}&tU        |&      D ]R  }'| jP                  rE| jW                  ||'|||j                   |      \  }}t        |      }| j'                  |||
|      \  }}t        |      | _,        t        |      || j$                  jZ                  z  z
  }(| j]                  | jX                        5 })t_        |      D ]  \  }*}+| j`                  r| j@                  rtG        jH                  |gdz        n|},| j$                  jc                  |,|+      },| j                  |,|+|| j8                  |%      jd                  }-| j@                  r|-jg                  d      \  }.}/|.|	|/|.z
  z  z   }- | j$                  jh                  |-|+|fi |$jj                  }|Zi }0|D ]  }1tm               |1   |0|1<     || |*|+|0      }2|2jo                  d|      }|2jo                  d|      }|2jo                  d|      }|*t        |      dz
  k(  s'|*dz   |(kD  r/|*dz   | j$                  jZ                  z  dk(  r|)jq                          tr        stu        jv                           	 ddd       U |dk(  r|}n/| jy                  ||      }3| j,                  j{                  |3|      }| j}                          |s|fS t        |      S # 1 sw Y   xY w)u  
        The call function to the pipeline for generation.

        Args:
            video (`List[PipelineImageInput]`):
                The input video to condition the generation on. Must be a list of images/frames of the video.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to higher quality videos at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            strength (`float`, *optional*, defaults to 0.8):
                Higher strength leads to more differences between original video and generated video.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            decode_chunk_size (`int`, defaults to `16`):
                The number of frames to decode at a time when calling `decode_latents` method.

        Examples:

        Returns:
            [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r+   )r   r   r   r   r   rT   rU   r   r4   r   r   r   FNr   )r   r   r&   r   r   r   )r   r   r   r   r   r   rt   r<   r.   r4   r   r   scale)
r   r   r<   r  r   r   rT   rU   rg   rh   )rT   rU   rg   rh   )repeatsr   r   )total)encoder_hidden_statesr  added_cond_kwargsr4   rT   rU   latent)r   r  )frames)@rY   rc   sample_sizer\   r   r   r  r  r  r_   rx   ry   rz   rI   r{   _execution_devicert   rN   rJ   r   r   r   re   preprocess_videor   r   in_channelsr   r  getfree_noise_enabled_encode_prompt_free_noiser   rh   r   r   r   r   r   r   free_init_enabled_free_init_num_itersr   _apply_free_initr	  r   progress_barr   r  scale_model_inputr2   r   r   prev_samplelocalspopupdateXLA_AVAILABLExm	mark_stepr   postprocess_videomaybe_free_model_hooksr,   )4rf   r   r   r   r   r;   r  r=   r>   r   r   r   r  r   r.   r4   rT   rU   r   r   r  r  r  rh   r  r   r   r   r<   rt   latent_timestepdenoising_inference_stepsr   text_encoder_lora_scaler   r   r   r  num_free_init_itersfree_init_iternum_warmup_stepsr'  r   tlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsvideo_tensors4                                                       r8   __call__z(AnimateDiffVideoToVideoPipeline.__call__  s   h O499++77$:O:OOM))558M8MM ! 	+'#9-$;/Q 	 	
  .#'=$ *Vc4["AJJvt$<VJ&,,Q/J''

 '-? 3VY.*I* .2-?-?@SU^`hjp-q*I*'m22:@U3UVO(+,?(,J(K%3E 969f40I0 "#6"6"78I'm22:@U3UVO ?((99%V[9\EMM!Q1a0EHHF%H8E#yy//;;&&!5!$99$/- ' 
" ?C>Y>Y>eD''++GT:ko 	  ]]1%
""484R4R%&;,0,L,L /+'=2.. 5S 51M1 594F4F%00+'=2.. 5G 
51M1 // %		+A=*Q R);;JTU;VM '+B+N?? '2200L !::9cJ
  +/F/R \* 	 <@;Q;Qd77WX#$78 6	'N%%%)%:%:^-@&'--Yb&" '*)n#151C1CDWYbdlnt1u.	."%i.D"9~0CdnnFZFZ0ZZ "")<)<"= )'%i0 ('DAq~~  FJEeEeG9q=)Akr&)-)I)IJ\^_)`& "&*.;/3/J/J*; "+ " f  77=G=M=Ma=P:)?%6?]nKn9o%o
 2dnn11*a^L]^jjG+7*,!C =A17!OA.=+?aO+\("2"6"6y'"J(8(<(<_m(\1A1E1EF^`v1w. C	NQ..AE=M3MSTWXSX\`\j\j\p\pRptuRu$++-$Q(')' )'6	'r ("E..w8IJL((::[f:gE 	##%8O(66q)' )'s   FXXX	)NN)NNNNNr   )r   )NNNNNNNN)N@   rA  r   r+   NNNNNr   F)9__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r
   r   r   r   r   r   r   r   r   r   r   r   r	   r   r^   r   r   r   r   r   r   r   r   r   r   r   r   r   rt   r<   	Generatorr   boolr   propertyr   rh   r   r  r
  r  no_gradr   rx   r   r   r   r@  __classcell__)rH   s   @r8   rP   rP      s-   8 EST" 157;!!V!V $!V !	!V
 (/9:!V &!V  "+')
!V .!V  5!!VR 049=&*#'t5  -t5 !) 6t5 UOt5 C=t5n52+'Z" "U\\ " &!. # $+/FP8 )-$%"&'+)-MQ*.!#P%P P 	P
 "P P 3-P $P &P E%//43H"HIJP %,,'P P P 
Pd $ $   ( ( , , # #   U]]_ 1526 $##%(-)-(, #;?/0MQ*.049=9=@D%* ;?#'KO9B!#7~7D+,-~7 sDI~./~7 	~7
 }~7 !~7 "&~7 DI&~7 e%~7 ~7 ~7 "%T#Y"78~7  (}~7 ~7 E%//43H"HIJ~7  %,,'!~7"  -#~7$ !) 6%~7& ##56'~7( "*$u||*<!=)~7* c]+~7, -~7. !)c3h 8/~70 C=1~72 'xc40@$0F'GH3~74 -1I5~76 7~7 ~7r:   rP   )Nr2   )NNNN)JrC   typingr   r   r   r   r   r   r   transformersr	   r
   r   r   image_processorr   loadersr   r   r   r   modelsr   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   r   r   r   r   r   utilsr   r    r!   r"   r#   utils.torch_utilsr$   re   r%   free_init_utilsr'   free_noise_utilsr(   pipeline_utilsr)   r*   pipeline_outputr,   torch_xla.core.xla_modelcore	xla_modelr.  r-  
get_loggerrB  r   EXAMPLE_DOC_STRINGr   rH  rx   r9   r   r<   r   rN   rP   r@   r:   r8   <module>r`     sB    = =  h h 1 w w [ [ 9 ;  o n - - + 8 D 6 ))MM			H	%3 p ck
TLL
T-5eoo-F
T\_
T  *.15%)$(8*!#8* U3,-.8* S	"	8*
 T%[!8*vt7"t7r:   