
import inspect
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

from ...image_processor import PipelineImageInput
from ...loaders import (
    FromSingleFileMixin,
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
)
from ...models import (
    AutoencoderKL,
    ControlNetModel,
    ImageProjection,
    MultiControlNetModel,
    UNet2DConditionModel,
    UNetMotionModel,
)
from ...models.lora import adjust_lora_scale_text_encoder
from ...models.unets.unet_motion_model import MotionAdapter
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
    USE_PEFT_BACKEND,
    is_torch_xla_available,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, randn_tensor
from ...video_processor import VideoProcessor
from ..free_init_utils import FreeInitMixin
from ..free_noise_utils import AnimateDiffFreeNoiseMixin
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import (
        ...     AnimateDiffControlNetPipeline,
        ...     AutoencoderKL,
        ...     ControlNetModel,
        ...     MotionAdapter,
        ...     LCMScheduler,
        ... )
        >>> from diffusers.utils import export_to_gif, load_video

        >>> # Additionally, you will need to preprocess the videos before they can be used with the ControlNet
        >>> # HF maintains just the right package for it: `pip install controlnet_aux`
        >>> from controlnet_aux.processor import ZoeDetector

        >>> # Download controlnets from https://huggingface.co/lllyasviel/ControlNet-v1-1 to use .from_single_file
        >>> # Download Diffusers-format controlnets, such as https://huggingface.co/lllyasviel/sd-controlnet-depth, to use .from_pretrained()
        >>> controlnet = ControlNetModel.from_single_file("control_v11f1p_sd15_depth.pth", torch_dtype=torch.float16)

        >>> # We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
        >>> motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")

        >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
        >>> pipe: AnimateDiffControlNetPipeline = AnimateDiffControlNetPipeline.from_pretrained(
        ...     "SG161222/Realistic_Vision_V5.1_noVAE",
        ...     motion_adapter=motion_adapter,
        ...     controlnet=controlnet,
        ...     vae=vae,
        ... ).to(device="cuda", dtype=torch.float16)
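        >>> # Optional (not required for this example): if VRAM is tight, the standard Stable Diffusion
        >>> # memory helpers inherited by this pipeline can be enabled here, e.g. `pipe.enable_vae_slicing()`.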
        >>> pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
        >>> pipe.load_lora_weights(
        ...     "wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora"
        ... )
        >>> pipe.set_adapters(["lcm-lora"], [0.8])
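        >>> # The LCM LoRA and scheduler are what allow the low step count and guidance scale used
        >>> # further below (`num_inference_steps=10`, `guidance_scale=2.0`).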

        >>> depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
        ... )
        >>> conditioning_frames = []

        >>> with pipe.progress_bar(total=len(video)) as progress_bar:
        ...     for frame in video:
        ...         conditioning_frames.append(depth_detector(frame))
        ...         progress_bar.update()
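        >>> # `conditioning_frames` now holds one depth map (a PIL image) per input frame, which is why
        >>> # `num_frames=len(video)` is passed to the pipeline below.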

        >>> prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
        >>> negative_prompt = "bad quality, worst quality"

        >>> video = pipe(
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     num_frames=len(video),
        ...     num_inference_steps=10,
        ...     guidance_scale=2.0,
        ...     conditioning_frames=conditioning_frames,
        ...     generator=torch.Generator().manual_seed(42),
        ... ).frames[0]

        >>> export_to_gif(video, "animatediff_controlnet.gif", fps=8)
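        >>> # Optional: this pipeline also inherits FreeInit (`pipe.enable_free_init(...)`), which can reduce
        >>> # flicker at the cost of repeating the denoising loop; treat this as a pointer, not part of the recipe.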
        ```
"""


class AnimateDiffControlNetPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FreeInitMixin,
    AnimateDiffFreeNoiseMixin,
):
    r"""
    Pipeline for text-to-video generation with ControlNet guidance.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"
    _optional_components = ["feature_extractor", "image_encoder"]
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: Union[UNet2DConditionModel, UNetMotionModel],
        motion_adapter: MotionAdapter,
        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
        scheduler: KarrasDiffusionSchedulers,
        feature_extractor: Optional[CLIPImageProcessor] = None,
        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
    ):
        super().__init__()

        # A plain UNet2DConditionModel is wrapped with the motion adapter so it can denoise video latents.
        if isinstance(unet, UNet2DConditionModel):
            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
        # Multiple ControlNets are handled through a single MultiControlNetModel wrapper.
        if isinstance(controlnet, (list, tuple)):
            controlnet = MultiControlNetModel(controlnet)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            motion_adapter=motion_adapter,
            controlnet=controlnet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)
        # Conditioning frames are only converted to RGB and resized, not normalized to [-1, 1].
        self.control_video_processor = VideoProcessor(
            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
        )
lora_scale	clip_skipc
                 H
   |Jt        | t              r:|| _        t        st	        | j
                  |       nt        | j
                  |       |t        |t              rd}
n-|t        |t              rt        |      }
n|j                  d   }
|t        | t              r| j                  || j                        }| j                  |d| j                  j                  dd      }|j                  }| j                  |dd	      j                  }|j                  d
   |j                  d
   k\  rt!        j"                  ||      sj| j                  j%                  |dd| j                  j                  dz
  d
f         }t&        j)                  d| j                  j                   d|        t+        | j
                  j,                  d      r<| j
                  j,                  j.                  r|j0                  j3                  |      }nd}|	(| j                  |j3                  |      |      }|d   }nT| j                  |j3                  |      |d      }|d
   |	dz       }| j
                  j4                  j7                  |      }| j
                  | j
                  j8                  }n/| j:                  | j:                  j8                  }n|j8                  }|j3                  ||      }|j                  \  }}}|j=                  d|d      }|j?                  ||z  |d
      }|rm|j|dg|
z  }n|:tA        |      tA        |      ur$tC        dtA        |       dtA        |       d      t        |t              r|g}n1|
t        |      k7  r!tE        d| dt        |       d| d|
 d	      |}t        | t              r| j                  || j                        }|j                  d   }| j                  |d|dd      }t+        | j
                  j,                  d      r<| j
                  j,                  j.                  r|j0                  j3                  |      }nd}| j                  |j                  j3                  |      |      }|d   }|rK|j                  d   }|j3                  ||      }|j=                  d|d      }|j?                  |
|z  |d
      }| j
                  ,t        | t              rt        rtG        | j
                  |       ||fS )a  
    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr*   r   
max_lengthTpt)paddingrS   
truncationreturn_tensorslongest)rU   rW   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)r[   output_hidden_states)dtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)$rA   r   _lora_scaler   r   r4   r    strrC   rG   shaper   maybe_convert_promptr5   model_max_length	input_idstorchequalbatch_decodeloggerwarninghasattrrH   rZ   r[   to
text_modelfinal_layer_normr]   r6   repeatviewtype	TypeError
ValueErrorr!   )rL   promptr^   num_images_per_promptdo_classifier_free_guidancenegative_promptr1   r2   rP   rQ   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textr[   prompt_embeds_dtypebs_embedseq_len_uncond_tokensrS   uncond_inputs                          rN   encode_promptz+AnimateDiffControlNetPipeline.encode_prompt   s6   V !j7U&V)D $.t/@/@*M!$"3"3Z@*VS"9JJvt$<VJ&,,Q/J $ ;<2264>>J..$>>::# ) K )22N"nnVYW[n\ffO$$R(N,@,@,DDU[[N  $~~::#At~~'F'F'JR'O$OP  778	,Q
 t((//1EF4K\K\KcKcKvKv!,!;!;!>!>v!F!%  $ 1 1.2C2CF2K\j 1 k -a 0 $ 1 1"%%f-ncg !2 ! !.b 1IM2B C
 !% 1 1 < < M Mm \("&"3"3"9"9YY""&))//"/"5"5%((/B6(R,22'1%,,Q0EqI%**86K+KWVXY '+A+I&!#z 1#VD<Q(QUVZ[jVkUl mV~Q(  OS1!0 1s?33 )/)::J3K_J` ax/
| <33  !0 $ ;< $ 9 9- X&,,Q/J>>$%# * L t((//1EF4K\K\KcKcKvKv!-!<!<!?!?!G!%%)%6%6&&))&1- &7 &" &<A%>"&,2215G%;%>%>EXag%>%h"%;%B%B1F[]^%_"%;%@%@NcAcelnp%q"($ >?DT#D$5$5zB444rO   c                 |   t        | j                  j                               j                  }t	        |t
        j                        s| j                  |d      j                  }|j                  ||      }|r}| j                  |d      j                  d   }|j                  |d      }| j                  t        j                  |      d      j                  d   }|j                  |d      }||fS | j                  |      j                  }|j                  |d      }t        j                  |      }	||	fS )	NrT   )rW   r^   r]   T)r\   r   dim)nextr/   
parametersr]   rA   rg   Tensorr.   pixel_valuesrm   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
rL   imager^   rv   r\   r]   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedss
             rN   encode_imagez*AnimateDiffControlNetPipeline.encode_image~  sD   T''2245;;%.**5*FSSEe4&*&8&8UY&8&Z&h&hik&l#&=&O&OPekl&O&m#-1-?-?  'd .@ .mB. * .L-]-]%1 .^ .* +,JJJ--e4AAL'99:OUV9WL"'"2"2<"@!444rO   c                    g }|rg }|t        |t              s|g}t        |      t        | j                  j                  j
                        k7  rBt        dt        |       dt        | j                  j                  j
                         d      t        || j                  j                  j
                        D ]`  \  }}	t        |	t               }
| j                  ||d|
      \  }}|j                  |d d d f          |sIj                  |d d d f          b n?|D ]:  }|r%|j                  d      \  }}j                  |       |j                  |       < g }t        |      D ]|  \  }}t        j                  |g|z  d      }|r7t        j                  |   g|z  d      }t        j                  ||gd      }|j                  |      }|j                  |       ~ |S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r*   r%   r   r   r^   )rA   rC   rG   r6   encoder_hid_projimage_projection_layersrt   zipr   r   appendchunk	enumeraterg   catrm   )rL   ip_adapter_imageip_adapter_image_embedsr^   rv   rw   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsis                 rN   prepare_ip_adapter_image_embedsz=AnimateDiffControlNetPipeline.prepare_ip_adapter_image_embeds  sY    &$&!"*.5$4#5 #$DII,F,F,^,^(__ abefvbwax  yE  FI  JN  JS  JS  Jd  Jd  J|  J|  F}  E~  ~K  L  >A $))"<"<"T"T> 
X9')9 +55E*W&W#DHDUDU+VQ8KEA#%A ##$7a$@A.)001MdTUg1VW
X (? 9#.H[HaHabcHdE02E)001MN##$78	9 #%&/&= 	@"A""'))-@,ADY,Y_`"a*/4yy:OPQ:R9SVk9kqr/s,&+ii1MOb0cij&k#"5"8"8"8"G#**+>?	@ '&rO      decode_chunk_sizec                 N   d| j                   j                  j                  z  |z  }|j                  \  }}}}}|j	                  ddddd      j                  ||z  |||      }g }t        d|j                  d   |      D ]@  }	||	|	|z    }
| j                   j                  |
      j                  }
|j                  |
       B t        j                  |      }|d d d f   j                  ||df|j                  dd  z         j	                  ddddd      }|j                         }|S )Nr*   r   r%   r      rY   )r3   rH   scaling_factorrc   permutereshaperangedecodesampler   rg   r   float)rL   r0   r   ry   channels
num_framesheightwidthvideor   batch_latentss              rN   decode_latentsz,AnimateDiffControlNetPipeline.decode_latents  s'   dhhoo444w>:A--7
Hj&%//!Q1a088j9PRZ\bdijq'--*,=> 	(A#A,=(=>M HHOOM:AAMLL'	(
 		% dAg&&
J'CekkRSRTo'UV^^_`bcefhiklmrO   c                 V   dt        t        j                  | j                  j                        j
                  j                               v }i }|r||d<   dt        t        j                  | j                  j                        j
                  j                               v }|r||d<   |S )Neta	generator)setinspect	signaturer9   stepr   keys)rL   r   r   accepts_etaextra_step_kwargsaccepts_generators         rN   prepare_extra_step_kwargsz7AnimateDiffControlNetPipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  rO         ?        c                 R   	 |dz  dk7  s|dz  dk7  rt        d| d| d      |Lt         fd|D              s8t        d j                   d|D cg c]  }| j                  vs| c}       ||t        d	| d
| d      ||t        d      |2t        |t        t
        t        f      st        dt        |             ||t        d| d| d      |A|?|j                  |j                  k7  r&t        d|j                   d|j                   d      t         j                  t              rRt        |t
              rBt        j                  dt         j                  j                         dt        |       d       t        t         d      xr8 t         j                  t"        j$                  j&                  j(                        }t         j                  t*              s&|rst         j                  j,                  t*              rOt        	t
              st/        dt        	             t        	      |k7  rt        d| dt        	            t         j                  t              s&|rt         j                  j,                  t              rt        	t
              rt        	d   t
              st/        dt        	            t        	d         |k7  rt        d| dt        	d               t1        	fd	D              rt        d      J t         j                  t*              s&|r?t         j                  j,                  t*              rt        |
t2              st/        d      t         j                  t              s&|rt         j                  j,                  t              rst        |
t
              rt1        d |
D              rSt        d       t        |
t
              r8t        |
      t         j                  j                        k7  rt        d!      J t        |t4        t
        f      s|g}t        |t4        t
        f      s|g}t        |      t        |      k7  r$t        d"t        |       d#t        |       d$      t         j                  t              rt        |      t         j                  j                        k7  r[t        d%| d&t        |       d't         j                  j                         d(t         j                  j                         d	      t7        ||      D ]D  \  }}||k\  rt        d)| d*| d      |d+k  rt        d)| d,      |d-kD  s7t        d.| d/       y c c}w )0Nr;   r   z7`height` and `width` have to be divisible by 8 but are z and r`   c              3   :   K   | ]  }|j                   v   y wN)_callback_tensor_inputs).0krL   s     rN   	<genexpr>z=AnimateDiffControlNetPipeline.check_inputs.<locals>.<genexpr>  s#      F
23A---F
s   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z:`prompt` has to be of type `str`, `list` or `dict` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z	You have z! ControlNets and you have passed z= prompts. The conditionings will be fixed across the prompts.scaled_dot_product_attentionz>For single controlnet, `image` must be of type `list` but got zExcepted image to have length z but got len(video)=zQFor multiple controlnets: `image` must be type list of lists but got type(video)=z$Expected length of image sublist as z but got len(video[0])=c              3   R   K   | ]  }t        |      t        d          k7     yw)r   N)rG   )r   imgr   s     rN   r   z=AnimateDiffControlNetPipeline.check_inputs.<locals>.<genexpr>;  s"     >3s8s58},>s   $'zDAll conditioning frame batches for multicontrolnet must be same sizezLFor single controlnet: `controlnet_conditioning_scale` must be type `float`.c              3   <   K   | ]  }t        |t                y wr   )rA   rC   )r   r   s     rN   r   z=AnimateDiffControlNetPipeline.check_inputs.<locals>.<genexpr>N  s     Rqz!T*Rs   zEA single batch of multiple conditionings are supported at the moment.zFor multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have the same length as the number of controlnetsz`control_guidance_start` has z* elements, but `control_guidance_end` has zI elements. Make sure to provide the same number of elements to each list.z`control_guidance_start`: z has z elements but there are z- controlnets available. Make sure to provide zcontrol guidance start: z4 cannot be larger or equal to control guidance end: r   z can't be smaller than 0.r   zcontrol guidance end: z can't be larger than 1.0.)rt   allr   rA   rb   rC   dictrr   rc   r8   r   rj   rk   rG   netsrl   Frg   _dynamo
eval_frameOptimizedModuler   	_orig_modrs   anyr   rD   r   )rL   ru   r   r   r   rx   r1   r2   "callback_on_step_end_tensor_inputsr   controlnet_conditioning_scalecontrol_guidance_startcontrol_guidance_endr   is_compiledstartends   `        `       rN   check_inputsz*AnimateDiffControlNetPipeline.check_inputs  sR    A:?eai1nVW]V^^cdicjjklmm-9# F
7YF
 C
 DTEaEaDbbn  |^  pHvw  bc  ko  kG  kG  bGpq  pH  oI  J  -";08N}o ^0 0  ^ 5w  
6Ct;L(MYZ^_eZfYghii&+A+M9/9J K*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8  doo';<&$'DOO$8$8 9::[\_`f\g[hST a!?@ 
ZOOU]]55EEF
 t84??44oFeT*"`aefkal`m noo5zZ' #A*MbWZ[`WaVc!deet(<=4??446JKeT**U1Xt2L"thlmrhsgu vww58}
* #G
|Sk]`afghai]j\l!mnn>>> !ghh5 t84??44oF;UC noot(<=4??446JK7>R4QRR$%lmm94@SIfEgkn$$l F !D 
 505$-@&<%=".>$8#9 %&#.B*CC/4J0K/LLvwz  |P  xQ  wR  R[  \  doo';<)*c$//2F2F.GG 01G0HcRhNiMj  kC  DG  HL  HW  HW  H\  H\  D]  C^  ^K  LO  PT  P_  P_  Pd  Pd  Le  Kf  fg  h  46JK 	[JE3| .ug5ijminnop  s{ #;E7B[!\]]Sy #9#>X!YZZ	[Y pHs   V$&V$c
                 |   | j                   r| j                  |||||||||		      }	t        |t              r)t	        |      |k7  rt        dt	        |       d| d      ||||| j                  z  || j                  z  f}
|	t        |
|||      }	n|	j                  |      }	|	| j                  j                  z  }	|	S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   r^   r]   )free_noise_enabled_prepare_latents_free_noiserA   rC   rG   rt   r<   r#   rm   r9   init_noise_sigma)rL   ry   num_channels_latentsr   r   r   r]   r^   r   r0   rc   s              rN   prepare_latentsz-AnimateDiffControlNetPipeline.prepare_latentsv  s     ""660*feUTZ\egnG i&3y>Z+GA#i.AQ R&<'gi   d+++T***
 ?"5IfTYZGjj(G DNN;;;rO   Fc
                    | j                   j                  |||      j                  t        j                        }|j                  ddddd      j                  dd      }|j                  d   }
|
dk(  r|}n|}|j                  |d      }|j                  ||	      }|r|	st        j                  |gdz        }|S )
N)r   r   )r]   r   r%   r*   r   r   r   r   )
rK   preprocess_videorm   rg   float32r   flattenrc   r   r   )rL   r   r   r   ry   num_videos_per_promptr^   r]   rw   
guess_modevideo_batch_size	repeat_bys               rN   prepare_videoz+AnimateDiffControlNetPipeline.prepare_video  s     ,,==eFZ_=`cc-- d 
 aAq!,44Q: ;;q>q "I .I''	q'9e4&zIIugk*ErO   c                     | j                   S r   _guidance_scalerL   s    rN   guidance_scalez,AnimateDiffControlNetPipeline.guidance_scale  s    ###rO   c                     | j                   S r   )
_clip_skipr   s    rN   rQ   z'AnimateDiffControlNetPipeline.clip_skip      rO   c                      | j                   dkD  S )Nr*   r   r   s    rN   rw   z9AnimateDiffControlNetPipeline.do_classifier_free_guidance  s    ##a''rO   c                     | j                   S r   )_cross_attention_kwargsr   s    rN   cross_attention_kwargsz4AnimateDiffControlNetPipeline.cross_attention_kwargs  s    +++rO   c                     | j                   S r   )_num_timestepsr   s    rN   num_timestepsz+AnimateDiffControlNetPipeline.num_timesteps  s    """rO   c                     | j                   S r   )
_interruptr   s    rN   	interruptz'AnimateDiffControlNetPipeline.interrupt  r   rO   2   g      @r*   pilTr0   ru   r   r   r   num_inference_stepsr   rx   r   r   r   r   r   conditioning_framesoutput_typereturn_dictr   r   r   r   r   callback_on_step_endr   c                    t        | j                        r| j                  j                  n| j                  }t        |t              s t        |t              rt        |      |gz  }nt        |t              s t        |t              rt        |      |gz  }nSt        |t              sCt        |t              s3t        |t              rt        |j                        nd}||gz  ||gz  }}|xs- | j                  j                  j                  | j                  z  }|xs- | j                  j                  j                  | j                  z  }d}| j                  ||||||||||||       || _        || _        || _        d| _        |t        |t"        t$        f      rd}n-|t        |t              rt        |      }n|j&                  d   }| j(                  }t        |t              r)t        |t*              r|gt        |j                        z  }t        |t,              r|j                  j.                  n"|j                  d   j                  j.                  } |xs | }||j1                  dd      nd}!| j2                  r3| j5                  ||||| j6                  ||||!| j8                  
      \  }}ng| j;                  |||| j6                  ||||!| j8                  	      \  }}| j6                  rt=        j>                  ||g      }|jA                  |d	      }||"| jC                  |||||z  | j6                        }"t        |t,              r5| jE                  |||||z  |z  |||jF                  | j6                  |
	      }nct        |t              rQg }#|D ]G  }$| jE                  |$||||z  |z  |||jF                  | j6                  |
	      }%|#jI                  |%       I |#}nJ | jJ                  jM                  ||       | jJ                  jN                  }&| j                  j                  jP                  }'| jS                  ||z  |'||||jF                  ||
|	      }| jU                  |
|	      }(||d"ind})g }*tW        t        |&            D ]w  }+tY        ||      D ,-cg c]8  \  },}-dt+        |+t        |&      z  |,k  xs |+dz   t        |&      z  |-kD        z
  : }.},}-|*jI                  t        |t,              r|.d   n|.       y | jZ                  r| j\                  nd}/tW        |/      D ]<  }0| jZ                  r#| j_                  ||0|||jF                  |
      \  }}&t        |&      | _0        t        |&      || jJ                  jb                  z  z
  }1| je                  | j`                        5 }2tg        |&      D ]  \  }+}3| jh                  r| j6                  rt=        j>                  |gdz        n|}4| jJ                  jk                  |4|3      }4|r?| j6                  r3|}5| jJ                  jk                  |5|3      }5|jm                  d      d   }6n|4}5|}6t        |*|+   t              r%tY        ||*|+         D 7,cg c]
  \  }7},|7|,z   }8}7},n|}9t        |9t              r|9d   }9|9|*|+   z  }8t=        jn                  |5dd      }5|5jq                  d|5j&                  d   |5j&                  d   |5j&                  d   f      }5| j                  |5|3|6||8|d      \  }:};| j                  |4|3|| jr                  |)|:|;      jt                  }<| j6                  r|<jm                  d      \  }=}>|=||>|=z
  z  z   }< | jJ                  jv                  |<|3|fi |(jx                  }|Zi }?|D ]  }@t{               |@   |?|@<     || |+|3|?      }A|Aj}                  d|      }|Aj}                  d|      }|Aj}                  d|      }|+t        |&      dz
  k(  s'|+dz   |1kD  r/|+dz   | jJ                  jb                  z  dk(  r|2j                          t        st        j                           	 ddd       ? |dk(  r|}Bn/| j                  ||      }C| j                  j                  |C|      }B| j                          |sBfS t        B      S c c}-},w c c},}7w # 1 sw Y   xY w)u  
    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_frames: Optional[int] = 16,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_videos_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        conditioning_frames: Optional[List[PipelineImageInput]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
        guess_mode: bool = False,
        control_guidance_start: Union[float, List[float]] = 0.0,
        control_guidance_end: Union[float, List[float]] = 1.0,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        decode_chunk_size: int = 16,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_frames (`int`, *optional*, defaults to 16):
                The number of video frames that are generated. Defaults to 16 frames, which at 8 frames per second
                amounts to 2 seconds of video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to higher quality videos at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            conditioning_frames (`List[PipelineImageInput]`, *optional*):
                The ControlNet input condition to provide guidance to the `unet` for generation. If multiple
                ControlNets are specified, images must be passed as a list such that each element of the list can be
                correctly batched for input to a single ControlNet.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
                the corresponding scale as a list.
            guess_mode (`bool`, *optional*, defaults to `False`):
                The ControlNet encoder tries to recognize the content of the input image even if you remove all
                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
                The percentage of total steps at which the ControlNet starts applying.
            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
                The percentage of total steps at which the ControlNet stops applying.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r*   )ru   r   r   r   rx   r   r1   r2   r   r   r   r   FNr   scale)
ru   r   r^   r   rw   rx   r1   r2   rP   rQ   )r1   r2   rP   rQ   )repeatsr   )	r   r   r   ry   r   r^   r]   rw   r   r   r   r   )totalr%   rY   r   r   )encoder_hidden_statescontrolnet_condconditioning_scaler   r  )r  r   added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr0   r1   r2   latent)r   r  )frames)Hr"   r8   r   rA   rC   rG   r   r   r6   rH   sample_sizer<   r   r   r   r   r  rb   r   rc   _execution_devicer   r   global_pool_conditionsgetr   _encode_prompt_free_noiserw   rQ   r   rg   r   r   r   r   r]   r   r9   set_timesteps	timestepsin_channelsr   r   r   r   free_init_enabled_free_init_num_iters_apply_free_initr  orderprogress_barr   r  scale_model_inputr   	transposer   r   r   r   prev_samplelocalspopupdateXLA_AVAILABLExm	mark_stepr   rJ   postprocess_videomaybe_free_model_hooksr+   )DrL   ru   r   r   r   r	  r   rx   r   r   r   r0   r1   r2   r   r   r
  r  r  r   r   r   r   r   rQ   r  r   r   r8   multry   r^   r  text_encoder_lora_scaler   cond_prepared_videosframe_prepared_videor   r   r   r  controlnet_keepr   sekeepsnum_free_init_itersfree_init_iternum_warmup_stepsr&  tlatent_model_inputcontrol_model_inputcontrolnet_prompt_embedsc
cond_scalecontrolnet_cond_scaledown_block_res_samplesmid_block_res_sample
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr   video_tensorsD                                                                       rN   __call__z&AnimateDiffControlNetPipeline.__call__  s	   p 3ET__2UT__..[_[j[j
 0$7JG[]a<b%()=%>BXAY%Y"0$7JG]_c<d#&'=#>BVAW#W 2D9*MacgBh+5jBV+W3z']^D.//,-- %9" O499++77$:O:OOM))558M8MM ! 	!+/Q'#9%*G#9!5 	 	
  .#'=$ *Vc4["AJJvt$<VJ&,,Q/J''j"67JGdfk<l-J,KcR\RaRaNb,b) *o6 44#**AA 	
  9#9
 :P9["&&w5ae 	  ""484R4R%&;,0,L,L /+'=2.. 5S 51M1 594F4F%00+'=2.. 5G 
51M1 // %		+A=*Q R);;JTU;VM'+B+N?? '2200L j/2"&"4"4)%(==
J&; &&,0,L,L% #5 
# 
$89#% - <!%!3!3 !),AAJN*?!$**040P0P) "4 
" %++N;< #75 	$$%8$HNN,,	  $yy//;;&&.. 

 !::9cJ
  +/F/R \* 	 s9~& 	cA   68LMAq eAI.2Rq1uI6NQR6RSSE  ""z*o/V58\ab	c <@;Q;Qd77WX#$78 U	'N%%%)%:%:^-@&'--Yb&" #&i.D"9~0CdnnFZFZ0ZZ "")<)<"= K'%i0 J'DAq~~  FJEeEeG9q=)Akr&)-)I)IJ\^_)`&!d&F&F.5+.2nn.N.NObde.f+3@3F3Fq3I!3L0.@+3@0!/!"4d;8;<Y[jkl[m8n%o1a!e%o
%o0M-%&;TB4I!4L1%:_Q=O%O
*///:MqRS*T'*=*E*E066q9;N;T;TUV;WYlYrYrstYuv+' DH??+.F(;+5#-$) DS D@*,@ "&*.;/3/J/J*;8N6J "+ " f  77=G=M=Ma=P:)?%6?]nKn9o%o
 2dnn11*a^L]^jjG+7*,!C =A17!OA.=+?aO+\("2"6"6y'"J(8(<(<_m(\1A1E1EF^`v1w. C	NQ..AE=M3MSTWXSX\`\j\j\p\pRptuRu$++-$UJ'K' K'U	'p ("E..w8IJL((::[f:gE 	##%8O(66WH &p'K' K's,   +=c<Cc	cF;cccc&	)NN)NNNNNr   )r   )NNNNNr   r   r   )FF)4__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r	   r   r   r   r   r   r   r   r   r   r
   r   r@   rg   r   r   intr   r   r   r   r   r   r   r   propertyr   rQ   rw   r   r  r  no_gradrb   	Generatorr   boolr   r   r   rM  __classcell__)rM   s   @rN   r-   r-   x   sV   8 6/AT ;?AE"
"
 $"
 !	"

 (/9:"
 &"
 /4+@%BXZnno"
 -"
 $$67"
   =>"
V 049=&*#'t5  -t5 !) 6t5 UOt5 C=t5n52+'\ &!. #+/&)" J[\ nrR %*@ $ $   ( ( , , # #   U]]_ )-$& $##% #;?/0MQ*.049=9=@DBF%* ;?CF <?:=#'KO9B!#9L7c49n%L7 SML7 	L7
 }L7 !L7 L7 "%T#Y"78L7  (}L7 L7 E%//43H"HIJL7 %,,'L7  -L7 !) 6L7 ##56L7  "**<!=!L7" &d+=&>?#L7$ c]%L7& 'L7( !)c3h 8)L7* (-UDK-?'@+L7, -L7. !&eT%[&8 9/L70 $E4;$671L72 C=3L74 'xc40@$0F'GH5L76 -1I7L78 9L7 L7rO   r-   )Er   typingr   r   r   r   r   r   r	   rg   torch.nn.functionalnn
functionalr   transformersr
   r   r   r   image_processorr   loadersr   r   r   r   modelsr   r   r   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   utilsr   r   r   r    r!   utils.torch_utilsr"   r#   rJ   r$   free_init_utilsr&   free_noise_utilsr'   pipeline_utilsr(   r)   pipeline_outputr+   torch_xla.core.xla_modelcore	xla_modelr.  r-  
get_loggerrN  rj   EXAMPLE_DOC_STRINGr-    rO   rN   <module>rq     s     D D D    h h 1 w w  : ; 3 n n A - + 8 D 6 ))MM			H	%? Df7"f7rO   