
import inspect
from typing import Callable, Dict, List, Optional, Union

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import BertModel, BertTokenizer, Qwen2Tokenizer, Qwen2VLForConditionalGeneration

from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKLMagvit, EasyAnimateTransformer3DModel
from ...pipelines.pipeline_utils import DiffusionPipeline
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ...video_processor import VideoProcessor
from .pipeline_output import EasyAnimatePipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import EasyAnimateInpaintPipeline
        >>> from diffusers.pipelines.easyanimate.pipeline_easyanimate_inpaint import get_image_to_video_latent
        >>> from diffusers.utils import export_to_video, load_image

        >>> pipe = EasyAnimateInpaintPipeline.from_pretrained(
        ...     "alibaba-pai/EasyAnimateV5.1-12b-zh-InP-diffusers", torch_dtype=torch.bfloat16
        ... )
        >>> pipe.to("cuda")

        >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
        >>> validation_image_start = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
        ... )

        >>> validation_image_end = None
        >>> sample_size = (448, 576)
        >>> num_frames = 49
        >>> input_video, input_video_mask = get_image_to_video_latent(
        ...     [validation_image_start], validation_image_end, num_frames, sample_size
        ... )

        >>> video = pipe(
        ...     prompt,
        ...     num_frames=num_frames,
        ...     negative_prompt="Twisted body, limb deformities, text subtitles, comics, stillness, ugliness, errors, garbled text.",
        ...     height=sample_size[0],
        ...     width=sample_size[1],
        ...     video=input_video,
        ...     mask_video=input_video_mask,
        ... )
        >>> export_to_video(video.frames[0], "output.mp4", fps=8)
        ```
"""


def preprocess_image(image, sample_size):
    """
    Preprocess a single image (PIL.Image, numpy.ndarray, or torch.Tensor) to a resized tensor.
    """
    if isinstance(image, torch.Tensor):
        # Interpolation expects a batch dimension, so add one and strip it afterwards.
        image = torch.nn.functional.interpolate(
            image.unsqueeze(0), size=sample_size, mode="bilinear", align_corners=False
        ).squeeze(0)
    elif isinstance(image, Image.Image):
        # PIL's resize takes (width, height), while sample_size is (height, width).
        image = image.resize((sample_size[1], sample_size[0]))
        image = np.array(image)
    elif isinstance(image, np.ndarray):
        image = Image.fromarray(image).resize((sample_size[1], sample_size[0]))
        image = np.array(image)
    else:
        raise ValueError("Unsupported input type. Expected PIL.Image, numpy.ndarray, or torch.Tensor.")

    # Convert HWC uint8 data to a CHW float tensor scaled to [0, 1].
    if not isinstance(image, torch.Tensor):
        image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
    return image


def get_image_to_video_latent(validation_image_start, validation_image_end, num_frames, sample_size):
    """
    Generate latent representations for video from start and end images. Inputs can be PIL.Image, numpy.ndarray, or
    torch.Tensor.
    """
    input_video = None
    input_video_mask = None

    if validation_image_start is not None:
        # Preprocess the starting image(s).
        if isinstance(validation_image_start, list):
            image_start = [preprocess_image(img, sample_size) for img in validation_image_start]
        else:
            image_start = preprocess_image(validation_image_start, sample_size)

        # Build the conditioning video by tiling the first start frame across all positions,
        # then writing the provided start frames into the head of the sequence.
        if isinstance(image_start, list):
            start_video = torch.cat([img.unsqueeze(1).unsqueeze(0) for img in image_start], dim=2)
            input_video = torch.tile(start_video[:, :, :1], [1, 1, num_frames, 1, 1])
            input_video[:, :, : len(image_start)] = start_video
        else:
            input_video = torch.tile(image_start.unsqueeze(1).unsqueeze(0), [1, 1, num_frames, 1, 1])

        # Mask convention: 0 = keep this frame, 255 = generate this frame.
        input_video_mask = torch.zeros_like(input_video[:, :1])
        if isinstance(image_start, list):
            input_video_mask[:, :, len(image_start) :] = 255
        else:
            input_video_mask[:, :, 1:] = 255

        # Optionally pin the provided end image(s) to the tail of the sequence.
        if validation_image_end is not None:
            if isinstance(validation_image_end, list):
                image_end = [preprocess_image(img, sample_size) for img in validation_image_end]
                end_video = torch.cat([img.unsqueeze(1).unsqueeze(0) for img in image_end], dim=2)
                input_video[:, :, -len(image_end) :] = end_video
                input_video_mask[:, :, -len(image_end) :] = 0
            else:
                image_end = preprocess_image(validation_image_end, sample_size)
                input_video[:, :, -1:] = image_end.unsqueeze(1).unsqueeze(0)
                input_video_mask[:, :, -1:] = 0

    elif validation_image_start is None:
        # No conditioning frames: everything is generated from noise.
        input_video = torch.zeros([1, 3, num_frames, sample_size[0], sample_size[1]])
        input_video_mask = torch.ones([1, 1, num_frames, sample_size[0], sample_size[1]]) * 255

    return input_video, input_video_mask
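

# Illustrative sketch, not part of the public API: the shape contract of the two helpers
# above. The 8-frame, 64x64 demo values and the solid-color test image are assumptions.
def _example_image_to_video_latent_usage():
    start_image = Image.new("RGB", (64, 64), color=(128, 64, 32))  # hypothetical input
    video, mask = get_image_to_video_latent([start_image], None, num_frames=8, sample_size=(64, 64))
    # `preprocess_image` yields CHW float tensors in [0, 1]; the helper stacks them into a
    # (batch, channels, frames, height, width) video plus a single-channel mask where
    # 0 means "keep this frame" and 255 means "generate this frame".
    assert video.shape == (1, 3, 8, 64, 64) and mask.shape == (1, 1, 8, 64, 64)
    assert mask[0, 0, 0].max().item() == 0 and mask[0, 0, 1].min().item() == 255
    return video, mask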


def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
    tw = tgt_width
    th = tgt_height
    h, w = src
    r = h / w

    # Resize so the source covers the target, then center-crop the excess.
    if r > (th / tw):
        resize_height = th
        resize_width = int(round(th / h * w))
    else:
        resize_width = tw
        resize_height = int(round(tw / w * h))

    crop_top = int(round((th - resize_height) / 2.0))
    crop_left = int(round((tw - resize_width) / 2.0))

    return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
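

# Illustrative sketch (assumed sizes): the center-crop arithmetic of
# `get_resize_crop_region_for_grid`. A 480x720 source fitted into a 512x512 grid pins the
# width at 512, scales the height to round(512 / 720 * 480) = 341, and centers the crop.
def _example_resize_crop_region():
    top_left, bottom_right = get_resize_crop_region_for_grid((480, 720), 512, 512)
    assert top_left == (86, 0) and bottom_right == (427, 512)
    return top_left, bottom_right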


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    r"""
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    """
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    # Rescale the results from guidance (fixes overexposure).
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
    # Mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images.
    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
    return noise_cfg


def resize_mask(mask, latent, process_first_frame_only=True):
    latent_size = latent.size()

    if process_first_frame_only:
        # Resize the first (conditioning) frame on its own so it is never blended with
        # later frames along the temporal axis.
        target_size = list(latent_size[2:])
        target_size[0] = 1
        first_frame_resized = F.interpolate(
            mask[:, :, 0:1, :, :], size=target_size, mode="trilinear", align_corners=False
        )

        target_size = list(latent_size[2:])
        target_size[0] = target_size[0] - 1
        if target_size[0] != 0:
            remaining_frames_resized = F.interpolate(
                mask[:, :, 1:, :, :], size=target_size, mode="trilinear", align_corners=False
            )
            resized_mask = torch.cat([first_frame_resized, remaining_frames_resized], dim=2)
        else:
            resized_mask = first_frame_resized
    else:
        target_size = list(latent_size[2:])
        resized_mask = F.interpolate(mask, size=target_size, mode="trilinear", align_corners=False)
    return resized_mask


def add_noise_to_reference_video(image, ratio=None, generator=None):
    if ratio is None:
        # Sample a per-video noise level from a log-normal distribution.
        sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
        sigma = torch.exp(sigma).to(image.dtype)
    else:
        sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio

    if generator is not None:
        image_noise = (
            torch.randn(image.size(), generator=generator, dtype=image.dtype, device=image.device)
            * sigma[:, None, None, None, None]
        )
    else:
        image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
    # Pixels marked as -1 are padding and stay noise-free.
    image_noise = torch.where(image == -1, torch.zeros_like(image), image_noise)
    image = image + image_noise
    return image
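

# Illustrative sketch (assumed tensor sizes): `rescale_noise_cfg` defined above matches the
# per-sample standard deviation of the guided prediction to that of the text-only prediction,
# then blends the two according to `guidance_rescale` (0.0 returns the input unchanged).
def _example_rescale_noise_cfg():
    torch.manual_seed(0)
    noise_pred_text = torch.randn(2, 4, 8, 8)
    noise_cfg = noise_pred_text * 1.5  # an over-amplified guided prediction
    rescaled = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=1.0)
    # Full rescaling recovers the text prediction exactly in this constructed case.
    assert torch.allclose(rescaled, noise_pred_text, atol=1e-5)
    return rescaled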


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps
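

# Illustrative sketch (assuming the scheduler's default configuration): `retrieve_timesteps`
# with the flow-match scheduler this pipeline ships with. Passing custom `timesteps` or
# `sigmas` lists would raise here if the scheduler's `set_timesteps` did not accept them.
def _example_retrieve_timesteps():
    scheduler = FlowMatchEulerDiscreteScheduler()
    timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=25)
    assert num_inference_steps == 25 and len(timesteps) == 25
    return timesteps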


class EasyAnimateInpaintPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-video generation using EasyAnimate.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    EasyAnimate uses one text encoder [qwen2 vl](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) in V5.1.

    Args:
        vae ([`AutoencoderKLMagvit`]):
            Variational Auto-Encoder (VAE) Model to encode and decode video to and from latent representations.
        text_encoder (Optional[`~transformers.Qwen2VLForConditionalGeneration`, `~transformers.BertModel`]):
            EasyAnimate uses [qwen2 vl](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) in V5.1.
        tokenizer (Optional[`~transformers.Qwen2Tokenizer`, `~transformers.BertTokenizer`]):
            A `Qwen2Tokenizer` or `BertTokenizer` to tokenize text.
        transformer ([`EasyAnimateTransformer3DModel`]):
            The EasyAnimate model designed by EasyAnimate Team.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with EasyAnimate to denoise the encoded image latents.
    """

    model_cpu_offload_seq = "text_encoder->transformer->vae"
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
        self,
        vae: AutoencoderKLMagvit,
        text_encoder: Union[Qwen2VLForConditionalGeneration, BertModel],
        tokenizer: Union[Qwen2Tokenizer, BertTokenizer],
        transformer: EasyAnimateTransformer3DModel,
        scheduler: FlowMatchEulerDiscreteScheduler,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            transformer=transformer,
            scheduler=scheduler,
        )

        self.enable_text_attention_mask = (
            self.transformer.config.enable_text_attention_mask
            if getattr(self, "transformer", None) is not None
            else True
        )
        self.vae_spatial_compression_ratio = (
            self.vae.spatial_compression_ratio if getattr(self, "vae", None) is not None else 8
        )
        self.vae_temporal_compression_ratio = (
            self.vae.temporal_compression_ratio if getattr(self, "vae", None) is not None else 4
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_spatial_compression_ratio)
        self.mask_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_spatial_compression_ratio,
            do_normalize=False,
            do_binarize=True,
            do_convert_grayscale=True,
        )
        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_spatial_compression_ratio)
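
    # Note on latent geometry (illustrative, assuming the default V5.1 configuration with a
    # spatial compression ratio of 8 and a temporal compression ratio of 4): a 49-frame video
    # at 448x576 is denoised as a latent of shape
    # (batch, latent_channels, (49 - 1) // 4 + 1, 448 // 8, 576 // 8) = (batch, C, 13, 56, 72),
    # the shape computed in `prepare_latents` below.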

    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        prompt_attention_mask: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
        max_sequence_length: int = 256,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            dtype (`torch.dtype`):
                torch dtype
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
            max_sequence_length (`int`, *optional*): maximum sequence length to use for the prompt.
        """
        dtype = dtype or self.text_encoder.dtype
        device = device or self.text_encoder.device

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            if isinstance(prompt, str):
                messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
            else:
                messages = [{"role": "user", "content": [{"type": "text", "text": _prompt}]} for _prompt in prompt]
            text = [
                self.tokenizer.apply_chat_template([m], tokenize=False, add_generation_prompt=True) for m in messages
            ]

            text_inputs = self.tokenizer(
                text=text,
                padding="max_length",
                max_length=max_sequence_length,
                truncation=True,
                return_attention_mask=True,
                padding_side="right",
                return_tensors="pt",
            )
            text_inputs = text_inputs.to(self.text_encoder.device)

            text_input_ids = text_inputs.input_ids
            prompt_attention_mask = text_inputs.attention_mask
            if self.enable_text_attention_mask:
                # Inference: Generation of the output
                prompt_embeds = self.text_encoder(
                    input_ids=text_input_ids,
                    attention_mask=prompt_attention_mask,
                    output_hidden_states=True,
                ).hidden_states[-2]
            else:
                raise ValueError("LLM needs attention_mask")
            prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)

        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # Duplicate text embeddings for each generation per prompt, using an mps-friendly method.
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
        prompt_attention_mask = prompt_attention_mask.to(device=device)

        # Get unconditional embeddings for classifier-free guidance.
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            if negative_prompt is not None and isinstance(negative_prompt, str):
                messages = [{"role": "user", "content": [{"type": "text", "text": negative_prompt}]}]
            else:
                messages = [
                    {"role": "user", "content": [{"type": "text", "text": _negative_prompt}]}
                    for _negative_prompt in negative_prompt
                ]
            text = [
                self.tokenizer.apply_chat_template([m], tokenize=False, add_generation_prompt=True) for m in messages
            ]

            text_inputs = self.tokenizer(
                text=text,
                padding="max_length",
                max_length=max_sequence_length,
                truncation=True,
                return_attention_mask=True,
                padding_side="right",
                return_tensors="pt",
            )
            text_inputs = text_inputs.to(self.text_encoder.device)

            text_input_ids = text_inputs.input_ids
            negative_prompt_attention_mask = text_inputs.attention_mask
            if self.enable_text_attention_mask:
                # Inference: Generation of the output
                negative_prompt_embeds = self.text_encoder(
                    input_ids=text_input_ids,
                    attention_mask=negative_prompt_attention_mask,
                    output_hidden_states=True,
                ).hidden_states[-2]
            else:
                raise ValueError("LLM needs attention_mask")
            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)

        if do_classifier_free_guidance:
            # Duplicate unconditional embeddings for each generation per prompt, using an mps-friendly method.
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device=device)

        return prompt_embeds, negative_prompt_embeds, prompt_attention_mask, negative_prompt_attention_mask

    def prepare_extra_step_kwargs(self, generator, eta):
        # Prepare extra kwargs for the scheduler step, since not all schedulers have the same signature.
        # eta corresponds to η in the DDIM paper (https://huggingface.co/papers/2010.02502) and should
        # be in [0, 1]; it is ignored by other schedulers.
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # Check if the scheduler accepts a generator.
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(
        self,
        prompt,
        height,
        width,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        prompt_attention_mask=None,
        negative_prompt_attention_mask=None,
        callback_on_step_end_tensor_inputs=None,
    ):
        if height % 16 != 0 or width % 16 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if prompt_embeds is not None and prompt_attention_mask is None:
            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

    def get_timesteps(self, num_inference_steps, strength, device):
        # Get the original timestep using init_timestep.
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
        if hasattr(self.scheduler, "set_begin_index"):
            self.scheduler.set_begin_index(t_start * self.scheduler.order)

        return timesteps, num_inference_steps - t_start

    def prepare_mask_latents(
        self,
        mask,
        masked_image,
        batch_size,
        height,
        width,
        dtype,
        device,
        generator,
        do_classifier_free_guidance,
        noise_aug_strength,
    ):
        # Resize the mask to the latent shape as it is concatenated to the latents. This is done
        # before converting to dtype to avoid breaking with cpu_offload and half precision.
        if mask is not None:
            mask = mask.to(device=device, dtype=self.vae.dtype)
            new_mask = []
            bs = 1
            for i in range(0, mask.shape[0], bs):
                mask_bs = mask[i : i + bs]
                mask_bs = self.vae.encode(mask_bs)[0]
                mask_bs = mask_bs.mode()
                new_mask.append(mask_bs)
            mask = torch.cat(new_mask, dim=0)
            mask = mask * self.vae.config.scaling_factor

        if masked_image is not None:
            masked_image = masked_image.to(device=device, dtype=self.vae.dtype)
            if self.transformer.config.add_noise_in_inpaint_model:
                masked_image = add_noise_to_reference_video(
                    masked_image, ratio=noise_aug_strength, generator=generator
                )
            new_mask_pixel_values = []
            bs = 1
            for i in range(0, masked_image.shape[0], bs):
                mask_pixel_values_bs = masked_image[i : i + bs]
                mask_pixel_values_bs = self.vae.encode(mask_pixel_values_bs)[0]
                mask_pixel_values_bs = mask_pixel_values_bs.mode()
                new_mask_pixel_values.append(mask_pixel_values_bs)
            masked_image_latents = torch.cat(new_mask_pixel_values, dim=0)
            masked_image_latents = masked_image_latents * self.vae.config.scaling_factor
            masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
        else:
            masked_image_latents = None

        return mask, masked_image_latents

    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        num_frames,
        dtype,
        device,
        generator,
        latents=None,
        video=None,
        timestep=None,
        is_strength_max=True,
        return_noise=False,
        return_video_latents=False,
    ):
        shape = (
            batch_size,
            num_channels_latents,
            (num_frames - 1) // self.vae_temporal_compression_ratio + 1,
            height // self.vae_spatial_compression_ratio,
            width // self.vae_spatial_compression_ratio,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if return_video_latents or (latents is None and not is_strength_max):
            video = video.to(device=device, dtype=self.vae.dtype)
            new_video = []
            bs = 1
            for i in range(0, video.shape[0], bs):
                video_bs = video[i : i + bs]
                video_bs = self.vae.encode(video_bs)[0]
                video_bs = video_bs.sample()
                new_video.append(video_bs)
            video = torch.cat(new_video, dim=0)
            video = video * self.vae.config.scaling_factor

            video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
            video_latents = video_latents.to(device=device, dtype=dtype)

        if latents is None:
            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            # If strength is 1.0 initialize the latents to pure noise, else to video + noise.
            if isinstance(self.scheduler, FlowMatchEulerDiscreteScheduler):
                latents = noise if is_strength_max else self.scheduler.scale_noise(video_latents, timestep, noise)
            else:
                latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
                # If pure noise, scale the initial latents by the scheduler's init sigma.
                if hasattr(self.scheduler, "init_noise_sigma"):
                    latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
        else:
            if hasattr(self.scheduler, "init_noise_sigma"):
                noise = latents.to(device)
                latents = noise * self.scheduler.init_noise_sigma
            else:
                latents = latents.to(device)

        outputs = (latents,)

        if return_noise:
            outputs += (noise,)

        if return_video_latents:
            outputs += (video_latents,)

        return outputs

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def guidance_rescale(self):
        return self._guidance_rescale

    @property
    def do_classifier_free_guidance(self):
        return self._guidance_scale > 1

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        num_frames: Optional[int] = 49,
        video: Optional[torch.FloatTensor] = None,
        mask_video: Optional[torch.FloatTensor] = None,
        masked_video_latents: Optional[torch.FloatTensor] = None,
        height: Optional[int] = 512,
        width: Optional[int] = 512,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 5.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        prompt_attention_mask: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        guidance_rescale: float = 0.0,
        strength: float = 1.0,
        noise_aug_strength: float = 0.0563,
        timesteps: Optional[List[int]] = None,
    ):
        r"""
        The call function to the pipeline for generation with EasyAnimate.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            num_frames (`int`, *optional*):
                The number of frames in the generated video. This parameter influences the length and continuity of
                generated content.
            video (`torch.FloatTensor`, *optional*):
                A tensor representing an input video, which can be modified depending on the prompts provided.
            mask_video (`torch.FloatTensor`, *optional*):
                A tensor to specify areas of the video to be masked (omitted from generation).
            masked_video_latents (`torch.FloatTensor`, *optional*):
                Latents from masked portions of the video, utilized during image generation.
            height (`int`, *optional*):
                The height in pixels of the generated image or video frames.
            width (`int`, *optional*):
                The width in pixels of the generated image or video frames.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image but slower
                inference time. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 5.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is effective when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to exclude in image generation. If not defined, you need to provide
                `negative_prompt_embeds`. This parameter is ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                A parameter defined in the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the
                [`~schedulers.DDIMScheduler`] and is ignored in other schedulers. It adjusts noise level during the
                inference process.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) for setting
                random seeds which helps in making generation deterministic.
            latents (`torch.Tensor`, *optional*):
                A pre-computed latent representation which can be used to guide the generation process.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings, aiding in fine-tuning what should not be represented in the
                outputs. If not provided, embeddings are generated from the `negative_prompt` argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask guiding the focus of the model on specific parts of the prompt text. Required when using
                `prompt_embeds`.
            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the negative prompt, needed when `negative_prompt_embeds` are used.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` and `np.array` to define how you
                want the results to be formatted.
            return_dict (`bool`, *optional*, defaults to `True`):
                If set to `True`, a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] will be returned;
                otherwise, a tuple containing the generated images and safety flags will be returned.
            callback_on_step_end (`Callable[[int, int, Dict], None]`, `PipelineCallback`, `MultiPipelineCallbacks`,
            *optional*):
                A callback function (or a list of them) that will be executed at the end of each denoising step,
                allowing for custom processing during generation.
            callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
                Specifies which tensor inputs should be included in the callback function. If not defined, all tensor
                inputs will be passed, facilitating enhanced logging or monitoring of the generation process.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Rescale parameter for adjusting noise configuration based on guidance rescale. Based on findings from
                [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891).
            strength (`float`, *optional*, defaults to 1.0):
                Affects the overall styling or quality of the generated output. Values closer to 1 usually provide
                direct adherence to prompts.
            noise_aug_strength (`float`, *optional*, defaults to 0.0563):
                Strength of the noise added to the masked reference video before it is encoded.
            timesteps (`List[int]`, *optional*):
                Custom timesteps for the denoising process, passed through to the scheduler.

        Examples:

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                Returns either a structured output containing generated images and their metadata when `return_dict` is
                `True`, or a simpler tuple, where the first element is a list of generated images and the second
                element indicates if any of them contain "not-safe-for-work" (NSFW) content.
        """
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        # 0. Default height and width, rounded down to multiples of 16.
        height = int(height // 16 * 16)
        width = int(width // 16 * 16)

        # 1. Check inputs. Raise error if not correct.
        self.check_inputs(
            prompt,
            height,
            width,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            prompt_attention_mask,
            negative_prompt_attention_mask,
            callback_on_step_end_tensor_inputs,
        )
        self._guidance_scale = guidance_scale
        self._guidance_rescale = guidance_rescale
        self._interrupt = False

        # 2. Define call parameters.
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        if self.text_encoder is not None:
            dtype = self.text_encoder.dtype
        else:
            dtype = self.transformer.dtype

        # 3. Encode input prompt.
        (
            prompt_embeds,
            negative_prompt_embeds,
            prompt_attention_mask,
            negative_prompt_attention_mask,
        ) = self.encode_prompt(
            prompt=prompt,
            device=device,
            dtype=dtype,
            num_images_per_prompt=num_images_per_prompt,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            prompt_attention_mask=prompt_attention_mask,
            negative_prompt_attention_mask=negative_prompt_attention_mask,
        )

        # 4. Prepare timesteps.
        if isinstance(self.scheduler, FlowMatchEulerDiscreteScheduler):
            timesteps, num_inference_steps = retrieve_timesteps(
                self.scheduler, num_inference_steps, device, timesteps, mu=1
            )
        else:
            timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
        timesteps, num_inference_steps = self.get_timesteps(
            num_inference_steps=num_inference_steps, strength=strength, device=device
        )
        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
        # Check at which timestep the initial noise is set (e.g. 50% if strength is 0.5).
        is_strength_max = strength == 1.0

        # 5. Prepare latents.
        if video is not None:
            batch_size, channels, num_frames, height_video, width_video = video.shape
            init_video = self.image_processor.preprocess(
                video.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height_video, width_video),
                height=height,
                width=width,
            )
            init_video = init_video.to(dtype=torch.float32)
            init_video = init_video.reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4)
        else:
            init_video = None

        num_channels_latents = self.vae.config.latent_channels
        num_channels_transformer = self.transformer.config.in_channels
        return_image_latents = num_channels_transformer == num_channels_latents

        latents_outputs = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            num_frames,
            dtype,
            device,
            generator,
            latents,
            video=init_video,
            timestep=latent_timestep,
            is_strength_max=is_strength_max,
            return_noise=True,
            return_video_latents=return_image_latents,
        )
        if return_image_latents:
            latents, noise, image_latents = latents_outputs
        else:
            latents, noise = latents_outputs

        # 6. Prepare inpaint latents if needed.
        if mask_video is not None:
            if (mask_video == 255).all():
                # The whole video is regenerated; the mask is empty in latent space.
                mask = torch.zeros_like(latents).to(device, dtype)
                if self.transformer.config.resize_inpaint_mask_directly:
                    mask_latents = torch.zeros_like(latents)[:, :1].to(device, dtype)
                else:
                    mask_latents = torch.zeros_like(latents).to(device, dtype)
                masked_video_latents = torch.zeros_like(latents).to(device, dtype)

                mask_input = torch.cat([mask_latents] * 2) if self.do_classifier_free_guidance else mask_latents
                masked_video_latents_input = (
                    torch.cat([masked_video_latents] * 2) if self.do_classifier_free_guidance else masked_video_latents
                )
                inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=1).to(dtype)
            else:
                # Prepare mask latent variables.
                batch_size, channels, num_frames, height_video, width_video = mask_video.shape
                mask_condition = self.mask_processor.preprocess(
                    mask_video.permute(0, 2, 1, 3, 4).reshape(
                        batch_size * num_frames, channels, height_video, width_video
                    ),
                    height=height,
                    width=width,
                )
                mask_condition = mask_condition.to(dtype=torch.float32)
                mask_condition = mask_condition.reshape(batch_size, num_frames, channels, height, width).permute(
                    0, 2, 1, 3, 4
                )

                if num_channels_transformer != num_channels_latents:
                    mask_condition_tile = torch.tile(mask_condition, [1, 3, 1, 1, 1])
                    if masked_video_latents is None:
                        masked_video = (
                            init_video * (mask_condition_tile < 0.5)
                            + torch.ones_like(init_video) * (mask_condition_tile > 0.5) * -1
                        )
                    else:
                        masked_video = masked_video_latents

                    if self.transformer.config.resize_inpaint_mask_directly:
                        _, masked_video_latents = self.prepare_mask_latents(
                            None,
                            masked_video,
                            batch_size,
                            height,
                            width,
                            dtype,
                            device,
                            generator,
                            self.do_classifier_free_guidance,
                            noise_aug_strength=noise_aug_strength,
                        )
                        mask_latents = resize_mask(1 - mask_condition, masked_video_latents, self.vae.cache_mag_vae)
                        mask_latents = mask_latents.to(device, dtype) * self.vae.config.scaling_factor
                    else:
                        mask_latents, masked_video_latents = self.prepare_mask_latents(
                            mask_condition_tile,
                            masked_video,
                            batch_size,
                            height,
                            width,
                            dtype,
                            device,
                            generator,
                            self.do_classifier_free_guidance,
                            noise_aug_strength=noise_aug_strength,
                        )

                    mask_input = torch.cat([mask_latents] * 2) if self.do_classifier_free_guidance else mask_latents
                    masked_video_latents_input = (
                        torch.cat([masked_video_latents] * 2)
                        if self.do_classifier_free_guidance
                        else masked_video_latents
                    )
                    inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=1).to(dtype)
                else:
                    inpaint_latents = None

                mask = torch.tile(mask_condition, [1, num_channels_latents, 1, 1, 1])
                mask = F.interpolate(mask, size=latents.size()[-3:], mode="trilinear", align_corners=True).to(
                    device, dtype
                )
        else:
            if num_channels_transformer != num_channels_latents:
                mask = torch.zeros_like(latents).to(device, dtype)
                masked_video_latents = torch.zeros_like(latents).to(device, dtype)

                mask_input = torch.cat([mask] * 2) if self.do_classifier_free_guidance else mask
                masked_video_latents_input = (
                    torch.cat([masked_video_latents] * 2) if self.do_classifier_free_guidance else masked_video_latents
                )
                inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=1).to(dtype)
            else:
                mask = torch.zeros_like(init_video[:, :1])
                mask = torch.tile(mask, [1, num_channels_latents, 1, 1, 1])
                mask = F.interpolate(mask, size=latents.size()[-3:], mode="trilinear", align_corners=True).to(
                    device, dtype
                )
                inpaint_latents = None

        # Check that the sizes of the mask, masked image and latents match.
        if num_channels_transformer != num_channels_latents:
            num_channels_mask = mask_latents.shape[1]
            num_channels_masked_image = masked_video_latents.shape[1]
            if (
                num_channels_latents + num_channels_mask + num_channels_masked_image
                != self.transformer.config.in_channels
            ):
                raise ValueError(
                    f"Incorrect configuration settings! The config of `pipeline.transformer`:"
                    f" {self.transformer.config} expects {self.transformer.config.in_channels} but received"
                    f" `num_channels_latents`: {num_channels_latents} + `num_channels_mask`: {num_channels_mask}"
                    f" + `num_channels_masked_image`: {num_channels_masked_image}"
                    f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the"
                    " config of `pipeline.transformer` or your `mask_image` or `image` input."
                )

        # 7. Prepare extra step kwargs.
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask])
        prompt_embeds = prompt_embeds.to(device=device)
        prompt_attention_mask = prompt_attention_mask.to(device=device)

        # 8. Denoising loop.
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                # Expand the latents if we are doing classifier-free guidance.
                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                if hasattr(self.scheduler, "scale_model_input"):
                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Expand scalar t to a 1-D tensor matching the batch dimension of latent_model_input.
                t_expand = torch.tensor([t] * latent_model_input.shape[0], device=device).to(
                    dtype=latent_model_input.dtype
                )

                # Predict the noise residual.
                noise_pred = self.transformer(
                    latent_model_input,
                    t_expand,
                    encoder_hidden_states=prompt_embeds,
                    inpaint_latents=inpaint_latents,
                    return_dict=False,
                )[0]
                if noise_pred.size()[1] != self.vae.config.latent_channels:
                    noise_pred, _ = noise_pred.chunk(2, dim=1)

                # Perform guidance.
                if self.do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                if self.do_classifier_free_guidance and guidance_rescale > 0.0:
                    # Based on Section 3.4 of https://huggingface.co/papers/2305.08891
                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

                # Compute the previous noisy sample x_t -> x_t-1.
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                if num_channels_transformer == num_channels_latents:
                    init_latents_proper = image_latents
                    init_mask = mask
                    if i < len(timesteps) - 1:
                        noise_timestep = timesteps[i + 1]
                        if isinstance(self.scheduler, FlowMatchEulerDiscreteScheduler):
                            init_latents_proper = self.scheduler.scale_noise(
                                init_latents_proper, torch.tensor([noise_timestep]), noise
                            )
                        else:
                            init_latents_proper = self.scheduler.add_noise(
                                init_latents_proper, noise, torch.tensor([noise_timestep])
                            )

                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        # 9. Post-processing.
        if not output_type == "latent":
            latents = 1 / self.vae.config.scaling_factor * latents
            video = self.vae.decode(latents, return_dict=False)[0]
            video = self.video_processor.postprocess_video(video=video, output_type=output_type)
        else:
            video = latents

        # Offload all models.
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return EasyAnimatePipelineOutput(frames=video)