
import inspect
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import PIL.Image
import torch
import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import (
    FromSingleFileMixin,
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
)
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
from ...models.controlnets.controlnet_sparsectrl import SparseControlNetModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...models.unets.unet_motion_model import MotionAdapter
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
    USE_PEFT_BACKEND,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, randn_tensor
from ...video_processor import VideoProcessor
from ..free_init_utils import FreeInitMixin
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)

EXAMPLE_DOC_STRING = """
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import AnimateDiffSparseControlNetPipeline
        >>> from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
        >>> from diffusers.schedulers import DPMSolverMultistepScheduler
        >>> from diffusers.utils import export_to_gif, load_image

        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
        >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
        >>> controlnet_id = "guoyww/animatediff-sparsectrl-scribble"
        >>> lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
        >>> vae_id = "stabilityai/sd-vae-ft-mse"
        >>> device = "cuda"

        >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
        >>> controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
        >>> vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
        >>> scheduler = DPMSolverMultistepScheduler.from_pretrained(
        ...     model_id,
        ...     subfolder="scheduler",
        ...     beta_schedule="linear",
        ...     algorithm_type="dpmsolver++",
        ...     use_karras_sigmas=True,
        ... )
        >>> pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
        ...     model_id,
        ...     motion_adapter=motion_adapter,
        ...     controlnet=controlnet,
        ...     vae=vae,
        ...     scheduler=scheduler,
        ...     torch_dtype=torch.float16,
        ... ).to(device)
        >>> pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
        >>> pipe.fuse_lora(lora_scale=1.0)

        >>> prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality"
        >>> negative_prompt = "low quality, worst quality, letterboxed"

        >>> image_files = [
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png",
        ... ]
        >>> condition_frame_indices = [0, 8, 15]
        >>> conditioning_frames = [load_image(img_file) for img_file in image_files]

        >>> video = pipe(
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     num_inference_steps=25,
        ...     conditioning_frames=conditioning_frames,
        ...     controlnet_conditioning_scale=1.0,
        ...     controlnet_frame_indices=condition_frame_indices,
        ...     generator=torch.Generator().manual_seed(1337),
        ... ).frames[0]
        >>> export_to_gif(video, "output.gif")
        ```
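
        The mixins inherited by this pipeline expose further optional toggles. As a rough sketch (exact helper
        availability depends on the installed diffusers version), FreeInit and CPU offloading can be enabled
        before calling the pipeline:

        ```python
        >>> # optional: FreeInit (FreeInitMixin) trades extra denoising iterations for better temporal consistency
        >>> pipe.enable_free_init(num_iters=3, use_fast_sampling=False)
        >>> # optional: keep submodules on CPU when idle to reduce peak VRAM usage
        >>> pipe.enable_model_cpu_offload()
        ```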
"""


def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


class AnimateDiffSparseControlNetPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FreeInitMixin,
):
    r"""
    Pipeline for controlled text-to-video generation using the method described in [SparseCtrl: Adding Sparse Controls
    to Text-to-Video Diffusion Models](https://huggingface.co/papers/2311.16933).

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
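        controlnet ([`SparseControlNetModel`]):
            A [`SparseControlNetModel`] that provides the sparse per-frame conditioning (for example scribbles or
            RGB keyframes) used to guide the generated video.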
    """

    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: Union[UNet2DConditionModel, UNetMotionModel],
        motion_adapter: MotionAdapter,
        controlnet: SparseControlNetModel,
        scheduler: KarrasDiffusionSchedulers,
        feature_extractor: CLIPImageProcessor = None,
        image_encoder: CLIPVisionModelWithProjection = None,
    ):
        super().__init__()
        if isinstance(unet, UNet2DConditionModel):
            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            motion_adapter=motion_adapter,
            controlnet=controlnet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
        )
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
        )
        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
        self.control_image_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
        )

    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        # set the lora scale so that the monkey-patched LoRA forward of the text encoder can access it
        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
            else:
                scale_lora_layers(self.text_encoder, lora_scale)

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            if clip_skip is None:
                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
                prompt_embeds = prompt_embeds[0]
            else:
                prompt_embeds = self.text_encoder(
                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
                )
                # pick the requested intermediate hidden state and re-apply the final layer norm,
                # as the CLIP text model would for its last hidden state
                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                # retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        return prompt_embeds, negative_prompt_embeds

    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
        if output_hidden_states:
            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
            uncond_image_enc_hidden_states = self.image_encoder(
                torch.zeros_like(image), output_hidden_states=True
            ).hidden_states[-2]
            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
                num_images_per_prompt, dim=0
            )
            return image_enc_hidden_states, uncond_image_enc_hidden_states
        else:
            image_embeds = self.image_encoder(image).image_embeds
            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            uncond_image_embeds = torch.zeros_like(image_embeds)

            return image_embeds, uncond_image_embeds

    def prepare_ip_adapter_image_embeds(
        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
    ):
        image_embeds = []
        if do_classifier_free_guidance:
            negative_image_embeds = []
        if ip_adapter_image_embeds is None:
            if not isinstance(ip_adapter_image, list):
                ip_adapter_image = [ip_adapter_image]

            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
                raise ValueError(
                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got"
                    f" {len(ip_adapter_image)} images and"
                    f" {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                )

            for single_ip_adapter_image, image_proj_layer in zip(
                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
            ):
                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
                single_image_embeds, single_negative_image_embeds = self.encode_image(
                    single_ip_adapter_image, device, 1, output_hidden_state
                )

                image_embeds.append(single_image_embeds[None, :])
                if do_classifier_free_guidance:
                    negative_image_embeds.append(single_negative_image_embeds[None, :])
        else:
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
                    negative_image_embeds.append(single_negative_image_embeds)
                image_embeds.append(single_image_embeds)

        ip_adapter_image_embeds = []
        for i, single_image_embeds in enumerate(image_embeds):
            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
            if do_classifier_free_guidance:
                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

            single_image_embeds = single_image_embeds.to(device=device)
            ip_adapter_image_embeds.append(single_image_embeds)

        return ip_adapter_image_embeds

    def decode_latents(self, latents):
        latents = 1 / self.vae.config.scaling_factor * latents

        batch_size, channels, num_frames, height, width = latents.shape
        latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

        image = self.vae.decode(latents).sample
        video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4)
        # always cast to float32, which is compatible with bfloat16 and does not cause significant overhead
        video = video.float()
        return video

    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature;
        # eta is only used with DDIMScheduler and is ignored by other schedulers
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(
        self,
        prompt,
        height,
        width,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        ip_adapter_image=None,
        ip_adapter_image_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        image=None,
        controlnet_conditioning_scale: float = 1.0,
    ):
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
            raise ValueError(
                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both"
                " `ip_adapter_image` and `ip_adapter_image_embeds` defined."
            )

        if ip_adapter_image_embeds is not None:
            if not isinstance(ip_adapter_image_embeds, list):
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
                )
            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
                raise ValueError(
                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is"
                    f" {ip_adapter_image_embeds[0].ndim}D"
                )

        # check the conditioning `image` and `controlnet_conditioning_scale`; a single SparseControlNetModel
        # (possibly torch.compile'd) is expected
        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
        )
        if isinstance(self.controlnet, SparseControlNetModel) or (
            is_compiled and isinstance(self.controlnet._orig_mod, SparseControlNetModel)
        ):
            if isinstance(image, list):
                for image_ in image:
                    self.check_image(image_, prompt, prompt_embeds)
            else:
                self.check_image(image, prompt, prompt_embeds)

            if not isinstance(controlnet_conditioning_scale, float):
                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
        else:
            assert False

    def check_image(self, image, prompt, prompt_embeds):
        image_is_pil = isinstance(image, PIL.Image.Image)
        image_is_tensor = isinstance(image, torch.Tensor)
        image_is_np = isinstance(image, np.ndarray)
        image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
        image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
        image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray)

        if (
            not image_is_pil
            and not image_is_tensor
            and not image_is_np
            and not image_is_pil_list
            and not image_is_tensor_list
            and not image_is_np_list
        ):
            raise TypeError(
                "image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images,"
                f" list of numpy arrays or list of torch tensors, but is {type(image)}"
            )

        if image_is_pil:
            image_batch_size = 1
        else:
            image_batch_size = len(image)

        if prompt is not None and isinstance(prompt, str):
            prompt_batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            prompt_batch_size = len(prompt)
        elif prompt_embeds is not None:
            prompt_batch_size = prompt_embeds.shape[0]

        if image_batch_size != 1 and image_batch_size != prompt_batch_size:
            raise ValueError(
                "If image batch size is not 1, image batch size must be same as prompt batch size. image batch"
                f" size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
            )

    def prepare_latents(
        self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
    ):
        shape = (
            batch_size,
            num_channels_latents,
            num_frames,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def prepare_image(self, image, width, height, device, dtype):
        image = self.control_image_processor.preprocess(image, height=height, width=width)
        controlnet_images = image.unsqueeze(0).to(device, dtype)
        batch_size, num_frames, channels, height, width = controlnet_images.shape

        assert controlnet_images.min() >= 0 and controlnet_images.max() <= 1

        if self.controlnet.use_simplified_condition_embedding:
            # encode the conditioning frames into VAE latents when the controlnet expects latent-space conditions
            controlnet_images = controlnet_images.reshape(batch_size * num_frames, channels, height, width)
            controlnet_images = 2 * controlnet_images - 1
            conditioning_frames = retrieve_latents(self.vae.encode(controlnet_images)) * self.vae.config.scaling_factor
            conditioning_frames = conditioning_frames.reshape(
                batch_size, num_frames, 4, height // self.vae_scale_factor, width // self.vae_scale_factor
            )
        else:
            conditioning_frames = controlnet_images

        conditioning_frames = conditioning_frames.permute(0, 2, 1, 3, 4)  # [b, c, f, h, w]
        return conditioning_frames

    def prepare_sparse_control_conditioning(
        self,
        conditioning_frames: torch.Tensor,
        num_frames: int,
        controlnet_frame_indices: List[int],
        device: torch.device,
        dtype: torch.dtype,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        assert conditioning_frames.shape[2] >= len(controlnet_frame_indices)

        batch_size, channels, _, height, width = conditioning_frames.shape
        controlnet_cond = torch.zeros((batch_size, channels, num_frames, height, width), dtype=dtype, device=device)
        controlnet_cond_mask = torch.zeros((batch_size, 1, num_frames, height, width), dtype=dtype, device=device)
        controlnet_cond[:, :, controlnet_frame_indices] = conditioning_frames[:, :, : len(controlnet_frame_indices)]
        controlnet_cond_mask[:, :, controlnet_frame_indices] = 1

        return controlnet_cond, controlnet_cond_mask

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def clip_skip(self):
        return self._clip_skip

    # `guidance_scale = 1` corresponds to doing no classifier free guidance
    @property
    def do_classifier_free_guidance(self):
        return self._guidance_scale > 1

    @property
    def cross_attention_kwargs(self):
        return self._cross_attention_kwargs

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_frames: int = 16,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_videos_per_prompt: int = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        conditioning_frames: Optional[List[PipelineImageInput]] = None,
        output_type: str = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
        controlnet_frame_indices: List[int] = [0],
        guess_mode: bool = False,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_frames (`int`, *optional*, defaults to 16):
                The number of video frames that are generated. Defaults to 16 frames, which at 8 frames per second
                amounts to 2 seconds of video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to higher quality videos at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what not to include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            conditioning_frames (`List[PipelineImageInput]`, *optional*):
                The SparseControlNet input to provide guidance to the `unet` for generation.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
                the corresponding scale as a list.
            controlnet_frame_indices (`List[int]`):
                The indices where the conditioning frames must be applied for generation. Multiple frames can be
                provided to guide the model to generate similar structure outputs, where the `unet` can
                "fill-in-the-gaps" for interpolation videos, or a single frame could be provided for general expected
                structure. Must have the same length as `conditioning_frames`.
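                For example, with the default `num_frames=16`, passing three conditioning images together with
                `controlnet_frame_indices=[0, 8, 15]` (as in the example above) pins the first, middle and last
                frames while the motion modules fill in the remaining ones.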
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        """
        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # the pipeline currently generates a single video per prompt
        num_videos_per_prompt = 1

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt=prompt,
            height=height,
            width=width,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            ip_adapter_image=ip_adapter_image,
            ip_adapter_image_embeds=ip_adapter_image_embeds,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            image=conditioning_frames,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
        )

        self._guidance_scale = guidance_scale
        self._clip_skip = clip_skip
        self._cross_attention_kwargs = cross_attention_kwargs

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        global_pool_conditions = (
            controlnet.config.global_pool_conditions
            if isinstance(controlnet, SparseControlNetModel)
            else controlnet.nets[0].config.global_pool_conditions
        )
        guess_mode = guess_mode or global_pool_conditions

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
        )
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            num_videos_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
            clip_skip=self.clip_skip,
        )
        # For classifier free guidance, concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)

        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
                ip_adapter_image,
                ip_adapter_image_embeds,
                device,
                batch_size * num_videos_per_prompt,
                self.do_classifier_free_guidance,
            )

        # 4. Prepare controlnet conditioning
        conditioning_frames = self.prepare_image(conditioning_frames, width, height, device, controlnet.dtype)
        controlnet_cond, controlnet_cond_mask = self.prepare_sparse_control_conditioning(
            conditioning_frames, num_frames, controlnet_frame_indices, device, controlnet.dtype
        )

        # 5. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_channels_latents,
            num_frames,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 7. Prepare extra step kwargs
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 8. Add image embeds for IP-Adapter
        added_cond_kwargs = (
            {"image_embeds": image_embeds}
            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
            else None
        )

        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
        for free_init_iter in range(num_free_init_iters):
            if self.free_init_enabled:
                latents, timesteps = self._apply_free_init(
                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
                )

            self._num_timesteps = len(timesteps)
            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

            # 9. Denoising loop
            with self.progress_bar(total=self._num_timesteps) as progress_bar:
                for i, t in enumerate(timesteps):
                    # expand the latents if we are doing classifier free guidance
                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                    if guess_mode and self.do_classifier_free_guidance:
                        # infer the ControlNet only for the conditional batch
                        control_model_input = latents
                        control_model_input = self.scheduler.scale_model_input(control_model_input, t)
                        controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
                    else:
                        control_model_input = latent_model_input
                        controlnet_prompt_embeds = prompt_embeds

                    down_block_res_samples, mid_block_res_sample = self.controlnet(
                        control_model_input,
                        t,
                        encoder_hidden_states=controlnet_prompt_embeds,
                        controlnet_cond=controlnet_cond,
                        conditioning_mask=controlnet_cond_mask,
                        conditioning_scale=controlnet_conditioning_scale,
                        guess_mode=guess_mode,
                        return_dict=False,
                    )

                    # predict the noise residual
                    noise_pred = self.unet(
                        latent_model_input,
                        t,
                        encoder_hidden_states=prompt_embeds,
                        cross_attention_kwargs=self.cross_attention_kwargs,
                        added_cond_kwargs=added_cond_kwargs,
                        down_block_additional_residuals=down_block_res_samples,
                        mid_block_additional_residual=mid_block_res_sample,
                    ).sample

                    # perform guidance
                    if self.do_classifier_free_guidance:
                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                    # compute the previous noisy sample x_t -> x_t-1
                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                    if callback_on_step_end is not None:
                        callback_kwargs = {}
                        for k in callback_on_step_end_tensor_inputs:
                            callback_kwargs[k] = locals()[k]
                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                        latents = callback_outputs.pop("latents", latents)
                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                    # update the progress bar
                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                        progress_bar.update()

                    if XLA_AVAILABLE:
                        xm.mark_step()

        # 10. Post-process the video
        if output_type == "latent":
            video = latents
        else:
            video_tensor = self.decode_latents(latents)
            video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

        # 11. Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return AnimateDiffPipelineOutput(frames=video)