
import inspect
from dataclasses import dataclass
from typing import Callable, List, Optional, Union

import numpy as np
import PIL.Image
import torch
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
    GPT2Tokenizer,
)

from ...image_processor import VaeImageProcessor
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
    USE_PEFT_BACKEND,
    deprecate,
    is_torch_xla_available,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
from ...utils.outputs import BaseOutput
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline
from .modeling_text_decoder import UniDiffuserTextDecoder
from .modeling_uvit import UniDiffuserModel


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)


@dataclass
class ImageTextPipelineOutput(BaseOutput):
    """
    Output class for joint image-text pipelines.

    Args:
        images (`List[PIL.Image.Image]` or `np.ndarray`)
            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
            num_channels)`.
        text (`List[str]` or `List[List[str]]`)
            List of generated text strings of length `batch_size` or a list of list of strings whose outer list has
            length `batch_size`.
    """

    images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
    text: Optional[Union[List[str], List[List[str]]]]


class UniDiffuserPipeline(DeprecatedPipelineMixin, DiffusionPipeline):
    """
    Pipeline for a bimodal image-text model which supports unconditional text and image generation, text-conditioned
    image generation, image-conditioned text generation, and joint image-text generation.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. This
            is part of the UniDiffuser image representation along with the CLIP vision encoding.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        image_encoder ([`CLIPVisionModelWithProjection`]):
            A [`~transformers.CLIPVisionModelWithProjection`] to encode images as part of the pipeline's image
            representation, along with the VAE latent representation.
        clip_image_processor ([`CLIPImageProcessor`]):
            A [`~transformers.CLIPImageProcessor`] to preprocess an image before CLIP encoding it with `image_encoder`.
        clip_tokenizer ([`CLIPTokenizer`]):
            A [`~transformers.CLIPTokenizer`] to tokenize the prompt before encoding it with `text_encoder`.
        text_decoder ([`UniDiffuserTextDecoder`]):
            Frozen text decoder. This is a GPT-style model which is used to generate text from the UniDiffuser
            embedding.
        text_tokenizer ([`GPT2Tokenizer`]):
            A [`~transformers.GPT2Tokenizer`] to decode text for text generation; used along with the `text_decoder`.
        unet ([`UniDiffuserModel`]):
            A [U-ViT](https://github.com/baofff/U-ViT) model with U-Net-style skip connections between transformer
            layers to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image and/or text latents. The
            original UniDiffuser paper uses the [`DPMSolverMultistepScheduler`] scheduler.
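
    Example:

    ```py
    >>> # Illustrative usage sketch. The checkpoint id "thu-ml/unidiffuser-v1" and the prompts are assumptions for
    >>> # demonstration; any UniDiffuser checkpoint with the components listed above works the same way.
    >>> import torch
    >>> from diffusers import UniDiffuserPipeline

    >>> pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
    >>> pipe = pipe.to("cuda")

    >>> # text-conditioned image generation (text2img); the mode is inferred from the supplied inputs
    >>> image = pipe(prompt="an elephant under the sea", num_inference_steps=20).images[0]

    >>> # image-conditioned text generation (img2text)
    >>> caption = pipe(image=image, num_inference_steps=20).text[0]

    >>> # unconditional joint image-text generation; modes can also be forced, e.g. with `pipe.set_joint_mode()`
    >>> sample = pipe(num_inference_steps=20)
    >>> joint_image, joint_text = sample.images[0], sample.text[0]
    ```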
    z0.33.1z4text_encoder->image_encoder->unet->vae->text_decodervaetext_encoderimage_encoderclip_image_processorclip_tokenizertext_decodertext_tokenizerunet	schedulerc
                 d   t         
|           |j                  j                  |j                  k7  r/t        d|j                  j                   d|j                         | j                  |||||||||		       t        | dd       r/dt        | j                  j                  j                        dz
  z  nd| _        t        | j                        | _        |j                  j                  | _        |j                  j                   | _        |j                  j                  | _        |j                  j&                  | _        |j                  j*                  | _        | j$                  | _        | j0                  j2                  | j0                  j2                  | _        d | _        d | _        y )	NzxThe text encoder hidden size and text decoder prefix inner dim must be the same, but `text_encoder.config.hidden_size`: z& and `text_decoder.prefix_inner_dim`: )	r5   r6   r7   r8   r9   r:   r;   r<   r=   r5   r   r      )vae_scale_factor)super__init__confighidden_sizeprefix_inner_dim
ValueErrorregister_modulesgetattrlenr5   block_out_channelsr@   r   image_processorlatent_channelsnum_channels_latentsmax_position_embeddingstext_encoder_seq_lentext_encoder_hidden_sizeprojection_dimimage_encoder_projection_dimsample_sizeunet_resolutiontext_intermediate_dimr:   prefix_hidden_dimmodesafety_checker)selfr5   r6   r7   r8   r9   r:   r;   r<   r=   	__class__s             r2   rB   zUniDiffuserPipeline.__init__g   s    	**l.K.KK77C7J7J7V7V6WW}  K  \  \  ~]^ 
 	%'!5)%) 	 
	
 W^^bdikoVpc$((//*L*L&MPQ&Q Rvw0$BWBWX$'JJ$>$>!$0$7$7$O$O!(4(;(;(G(G%,9,@,@,O,O)#{{66%)%B%B"..:)-):):)L)LD&	 #r1   c                 V   dt        t        j                  | j                  j                        j
                  j                               v }i }|r||d<   dt        t        j                  | j                  j                        j
                  j                               v }|r||d<   |S )Neta	generator)setinspect	signaturer=   step
parameterskeys)rY   r]   r\   accepts_etaextra_step_kwargsaccepts_generators         r2   prepare_extra_step_kwargsz-UniDiffuserPipeline.prepare_extra_step_kwargs   s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r1   c                    |duxs |du}|du}	|xs |	}
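
    # The helper below infers the generation task from the supplied inputs. As an illustrative summary (the calls
    # are hypothetical; `init_image` and the latent placeholders are not defined in this module):
    #
    #     pipe(prompt="a photo of a dog")                            -> "text2img"
    #     pipe(image=init_image)                                     -> "img2text"
    #     pipe()                                                     -> "joint"
    #     pipe(prompt_latents=pl, vae_latents=vl, clip_latents=cl)   -> "joint"
    #     pipe(prompt_latents=pl)                                    -> "text"
    #     pipe(vae_latents=vl, clip_latents=cl)                      -> "img"
    #
    # A mode set manually via the `set_*_mode()` helpers further below always takes precedence over this inference,
    # and `reset_mode()` restores the input-based behaviour.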
|du}|du}|du}|du}|xr |}|xr |}| j                   | j                   }n|rd}n|	rd}n|s|rd}n|rd}n|rd}nd}| j                   |r|	rt        j                  d| d       | j                   A|
s?||k7  rt        j                  d	| d       |S |s|s|st        j                  d
| d       |S )z
        Infer the generation task ('mode') from the inputs to `__call__`. If the mode has been manually set, the set
        mode will be used.
        Ntext2imgimg2textjointr%   imgzwYou have supplied both a text prompt and image to the pipeline and mode has not been set manually, defaulting to mode 'z'.zYou have supplied exactly one of `vae_latents` and `clip_latents`, whereas either both or none are expected to be supplied. Defaulting to mode 'zaNo inputs or latents have been supplied, and mode has not been manually set, defaulting to mode ')rW   loggerwarning)rY   promptprompt_embedsimagelatentsprompt_latentsvae_latentsclip_latentsprompt_availableimage_availableinput_availableprompt_latents_availablevae_latents_availableclip_latents_availablefull_latents_availableimage_latents_availableall_indv_latents_availablerW   s                     r2   _infer_modezUniDiffuserPipeline._infer_mode   sR   
 #$.NM4Mt+*=o#1#=  +4 7!-T!9!(!4"7"R<R%=%YBY"99 99DDD &)C)(  99!1oNN((,vR1
 99_$(>>IIMbR  .6KTj,,065
 r1   c                 8    | j                   j                          y)z

    def enable_vae_slicing(self):
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
        r"""
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to
        allow processing larger images.
        """
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
        r"""
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
        self.vae.disable_tiling()

    def set_text_mode(self):
        r"""Manually set the generation mode to unconditional ("marginal") text generation."""
        self.mode = "text"

    def set_image_mode(self):
        r"""Manually set the generation mode to unconditional ("marginal") image generation."""
        self.mode = "img"

    def set_text_to_image_mode(self):
        r"""Manually set the generation mode to text-conditioned image generation."""
        self.mode = "text2img"

    def set_image_to_text_mode(self):
        r"""Manually set the generation mode to image-conditioned text generation."""
        self.mode = "img2text"

    def set_joint_mode(self):
        r"""Manually set the generation mode to unconditional joint image-text generation."""
        self.mode = "joint"

    def reset_mode(self):
        r"""Removes a manually set mode; after calling this, the pipeline will infer the mode from inputs."""
        self.mode = None

    def _infer_batch_size(
        self,
        mode,
        prompt,
        prompt_embeds,
        image,
        num_images_per_prompt,
        num_prompts_per_image,
        latents,
        prompt_latents,
        vae_latents,
        clip_latents,
    ):
        r"""Infers the batch size and multiplier depending on mode and supplied arguments to `__call__`."""
        if num_images_per_prompt is None:
            num_images_per_prompt = 1
        if num_prompts_per_image is None:
            num_prompts_per_image = 1

        assert num_images_per_prompt > 0, "num_images_per_prompt must be a positive integer"
        assert num_prompts_per_image > 0, "num_prompts_per_image must be a positive integer"

        if mode in ["text2img"]:
            if prompt is not None and isinstance(prompt, str):
                batch_size = 1
            elif prompt is not None and isinstance(prompt, list):
                batch_size = len(prompt)
            else:
                # Either prompt or prompt_embeds must be supplied, and they can't both be None.
                batch_size = prompt_embeds.shape[0]
            multiplier = num_images_per_prompt
        elif mode in ["img2text"]:
            if isinstance(image, PIL.Image.Image):
                batch_size = 1
            else:
                # Image must be available and of type PIL.Image.Image or torch.Tensor.
                batch_size = image.shape[0]
            multiplier = num_prompts_per_image
        elif mode in ["img"]:
            if vae_latents is not None:
                batch_size = vae_latents.shape[0]
            elif clip_latents is not None:
                batch_size = clip_latents.shape[0]
            else:
                batch_size = 1
            multiplier = num_images_per_prompt
        elif mode in ["text"]:
            if prompt_latents is not None:
                batch_size = prompt_latents.shape[0]
            else:
                batch_size = 1
            multiplier = num_prompts_per_image
        elif mode in ["joint"]:
            if latents is not None:
                batch_size = latents.shape[0]
            elif prompt_latents is not None:
                batch_size = prompt_latents.shape[0]
            elif vae_latents is not None:
                batch_size = vae_latents.shape[0]
            elif clip_latents is not None:
                batch_size = clip_latents.shape[0]
            else:
                batch_size = 1

            if num_images_per_prompt == num_prompts_per_image:
                multiplier = num_images_per_prompt
            else:
                multiplier = min(num_images_per_prompt, num_prompts_per_image)
                logger.warning(
                    f"You are using mode `{mode}` and `num_images_per_prompt`: {num_images_per_prompt} and"
                    f" num_prompts_per_image: {num_prompts_per_image} are not equal. Using batch size equal to"
                    f" `min(num_images_per_prompt, num_prompts_per_image) = {multiplier}`."
                )
        return batch_size, multiplier

    def _encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        **kwargs,
    ):
        deprecation_message = (
            "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()`"
            " instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
        )
        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)

        prompt_embeds_tuple = self.encode_prompt(
            prompt=prompt,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=lora_scale,
            **kwargs,
        )

        # Concatenate for backwards compatibility: [negative, positive].
        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])

        return prompt_embeds

    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        lora_scale: Optional[float] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
                is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If
                not provided, text embeddings will be generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, `negative_prompt_embeds` will be generated from the `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means
                that the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        # Set the LoRA scale so that the monkey-patched LoRA layers of the text encoder can access it.
        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # Dynamically adjust the LoRA scale.
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
            else:
                scale_lora_layers(self.text_encoder, lora_scale)

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # Textual inversion: process multi-vector tokens if necessary.
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.clip_tokenizer)

            text_inputs = self.clip_tokenizer(
                prompt,
                padding="max_length",
                max_length=self.clip_tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.clip_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.clip_tokenizer.batch_decode(
                    untruncated_ids[:, self.clip_tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.clip_tokenizer.model_max_length} tokens: {removed_text}"
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            if clip_skip is None:
                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
                prompt_embeds = prompt_embeds[0]
            else:
                prompt_embeds = self.text_encoder(
                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
                )
                # Access the hidden states corresponding to the (clip_skip + 1)-th layer from the end, then apply the
                # text encoder's final layer norm.
                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # Duplicate text embeddings for each generation per prompt.
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # Get unconditional embeddings for classifier-free guidance.
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # Textual inversion: process multi-vector tokens if necessary.
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.clip_tokenizer)

            max_length = prompt_embeds.shape[1]
            uncond_input = self.clip_tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device), attention_mask=attention_mask
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # Duplicate unconditional embeddings for each generation per prompt.
            seq_len = negative_prompt_embeds.shape[1]
            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers.
                unscale_lora_layers(self.text_encoder, lora_scale)

        return prompt_embeds, negative_prompt_embeds

    def encode_image_vae_latents(
        self,
        image,
        batch_size,
        num_prompts_per_image,
        dtype,
        device,
        do_classifier_free_guidance,
        generator=None,
    ):
        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
            )

        image = image.to(device=device, dtype=dtype)

        batch_size = batch_size * num_prompts_per_image
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if isinstance(generator, list):
            image_latents = [
                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i])
                * self.vae.config.scaling_factor
                for i in range(batch_size)
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = self.vae.encode(image).latent_dist.sample(generator=generator)
            # Scale image_latents by the VAE's scaling factor.
            image_latents = image_latents * self.vae.config.scaling_factor

        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
            # Expand image_latents to match batch_size.
            deprecation_message = (
                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
                " your script to pass as many initial images as text prompts to suppress this warning."
            )
            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
            additional_image_per_prompt = batch_size // image_latents.shape[0]
            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            image_latents = torch.cat([image_latents], dim=0)

        if do_classifier_free_guidance:
            uncond_image_latents = torch.zeros_like(image_latents)
            image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0)

        return image_latents

    def encode_image_clip_latents(
        self,
        image,
        batch_size,
        num_prompts_per_image,
        dtype,
        device,
        generator=None,
    ):
        # Map the image to a CLIP image embedding.
        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
            )

        preprocessed_image = self.clip_image_processor.preprocess(image, return_tensors="pt")
        preprocessed_image = preprocessed_image.to(device=device, dtype=dtype)

        batch_size = batch_size * num_prompts_per_image
        if isinstance(generator, list):
            image_latents = [
                self.image_encoder(**preprocessed_image[i : i + 1]).image_embeds for i in range(batch_size)
            ]
            image_latents = torch.cat(image_latents, dim=0)
        else:
            image_latents = self.image_encoder(**preprocessed_image).image_embeds

        if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
            # Expand image_latents to match batch_size.
            deprecation_message = (
                f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial"
                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
                " your script to pass as many initial images as text prompts to suppress this warning."
            )
            deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
            additional_image_per_prompt = batch_size // image_latents.shape[0]
            image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
        elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
            )

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        return image_latents

    def prepare_text_latents(
        self, batch_size, num_images_per_prompt, seq_len, hidden_size, dtype, device, generator, latents=None
    ):
        # Prepare latents for the text embedding.
        shape = (batch_size * num_images_per_prompt, seq_len, hidden_size)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            # latents is assumed to have shape (B, L, D).
            latents = latents.repeat(num_images_per_prompt, 1, 1)
            latents = latents.to(device=device, dtype=dtype)

        # Scale the initial noise by the standard deviation required by the scheduler.
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def prepare_image_vae_latents(
        self,
        batch_size,
        num_prompts_per_image,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        # Prepare latents for the VAE-encoded image.
        shape = (
            batch_size * num_prompts_per_image,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            # latents is assumed to have shape (B, C, H, W).
            latents = latents.repeat(num_prompts_per_image, 1, 1, 1)
            latents = latents.to(device=device, dtype=dtype)

        # Scale the initial noise by the standard deviation required by the scheduler.
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def prepare_image_clip_latents(
        self, batch_size, num_prompts_per_image, clip_img_dim, dtype, device, generator, latents=None
    ):
        # Prepare latents for the CLIP-embedded image.
        shape = (batch_size * num_prompts_per_image, 1, clip_img_dim)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            # latents is assumed to have shape (B, L, D).
            latents = latents.repeat(num_prompts_per_image, 1, 1)
            latents = latents.to(device=device, dtype=dtype)

        # Scale the initial noise by the standard deviation required by the scheduler.
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def decode_text_latents(self, text_latents, device):
        output_token_list, seq_lengths = self.text_decoder.generate_captions(
            text_latents, self.text_tokenizer.eos_token_id, device=device
        )
        output_list = output_token_list.cpu().numpy()
        generated_text = [
            self.text_tokenizer.decode(output[: int(length)], skip_special_tokens=True)
            for output, length in zip(output_list, seq_lengths)
        ]
        return generated_text
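
    # Illustrative sketch of how the encoding helpers above fit together for the "img2text" path. The shapes assume
    # the reference UniDiffuser-v1 configuration and `pil_image` is a placeholder; this block is documentation only
    # and is not executed anywhere in the pipeline:
    #
    #     image_vae = pipe.image_processor.preprocess(pil_image)                  # (1, 3, 512, 512) in [-1, 1]
    #     vae_latents = pipe.encode_image_vae_latents(
    #         image=image_vae, batch_size=1, num_prompts_per_image=1,
    #         dtype=torch.float16, device="cuda", do_classifier_free_guidance=False,
    #     )                                                                        # (1, 4, 64, 64)
    #     clip_latents = pipe.encode_image_clip_latents(
    #         image=pil_image, batch_size=1, num_prompts_per_image=1,
    #         dtype=torch.float16, device="cuda",
    #     ).unsqueeze(1)                                                           # (1, 1, 512)
    #     text_latents = pipe.prepare_text_latents(
    #         batch_size=1, num_images_per_prompt=1, seq_len=77, hidden_size=768,
    #         dtype=torch.float16, device="cuda", generator=None,
    #     )                                                                        # (1, 77, 768) noise to be denoised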

    def _split(self, x, height, width):
        r"""
        Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim) into two tensors of shape (B, C, H, W)
        and (B, 1, clip_img_dim).
        """
        batch_size = x.shape[0]
        latent_height = height // self.vae_scale_factor
        latent_width = width // self.vae_scale_factor
        img_vae_dim = self.num_channels_latents * latent_height * latent_width

        img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_projection_dim], dim=1)

        img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width))
        img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim))
        return img_vae, img_clip

    def _combine(self, img_vae, img_clip):
        r"""
        Combines a latent image img_vae of shape (B, C, H, W) and a CLIP-embedded image img_clip of shape (B, 1,
        clip_img_dim) into a single tensor of shape (B, C * H * W + clip_img_dim).
        """
        img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1))
        img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1))
        return torch.concat([img_vae, img_clip], dim=-1)

    def _split_joint(self, x, height, width):
        r"""
        Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim + text_seq_len * text_dim) into
        (img_vae, img_clip, text) where img_vae is of shape (B, C, H, W), img_clip is of shape (B, 1, clip_img_dim),
        and text is of shape (B, text_seq_len, text_dim).
        """
        batch_size = x.shape[0]
        latent_height = height // self.vae_scale_factor
        latent_width = width // self.vae_scale_factor
        img_vae_dim = self.num_channels_latents * latent_height * latent_width
        text_dim = self.text_encoder_seq_len * self.text_intermediate_dim

        img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_projection_dim, text_dim], dim=1)

        img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width))
        img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim))
        text = torch.reshape(text, (batch_size, self.text_encoder_seq_len, self.text_intermediate_dim))
        return img_vae, img_clip, text

    def _combine_joint(self, img_vae, img_clip, text):
        r"""
        Combines a latent image img_vae of shape (B, C, H, W), a CLIP-embedded image img_clip of shape (B, L_img,
        clip_img_dim), and a text embedding text of shape (B, L_text, text_dim) into a single embedding x of shape
        (B, C * H * W + L_img * clip_img_dim + L_text * text_dim).
        """
        img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1))
        img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1))
        text = torch.reshape(text, (text.shape[0], -1))
        return torch.concat([img_vae, img_clip, text], dim=-1)
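
    # Worked shape example for the flattened latent layout handled by `_split`/`_combine` and
    # `_split_joint`/`_combine_joint` above. The concrete numbers assume the reference UniDiffuser-v1 configuration
    # (4 latent channels at 64x64, a 512-dim CLIP image embedding, 77 text tokens of width 64) and are illustrative:
    #
    #     img_vae  (B, 4, 64, 64) -> 4 * 64 * 64 = 16384 values
    #     img_clip (B, 1, 512)    -> 512 values
    #     text     (B, 77, 64)    -> 77 * 64 = 4928 values
    #
    #     x = self._combine_joint(img_vae, img_clip, text)           # shape (B, 16384 + 512 + 4928) = (B, 21824)
    #     img_vae, img_clip, text = self._split_joint(x, 512, 512)   # exact inverse, given height = width = 512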

    def _get_noise_pred(
        self,
        mode,
        latents,
        t,
        prompt_embeds,
        img_vae,
        img_clip,
        max_timestep,
        data_type,
        guidance_scale,
        generator,
        device,
        height,
        width,
    ):
        r"""
        Gets the noise prediction using the `unet` and performs classifier-free guidance, if necessary.
        """
        if mode == "joint":
            # Joint text-image generation.
            img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width)

            img_vae_out, img_clip_out, text_out = self.unet(
                img_vae_latents, img_clip_latents, text_latents, timestep_img=t, timestep_text=t, data_type=data_type
            )

            x_out = self._combine_joint(img_vae_out, img_clip_out, text_out)

            if guidance_scale <= 1.0:
                return x_out

            # Classifier-free guidance: run the unconditional branches with random noise for the other modality at
            # the maximum timestep.
            img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype)
            img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype)
            text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)

            _, _, text_out_uncond = self.unet(
                img_vae_T, img_clip_T, text_latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type
            )

            img_vae_out_uncond, img_clip_out_uncond, _ = self.unet(
                img_vae_latents,
                img_clip_latents,
                text_T,
                timestep_img=t,
                timestep_text=max_timestep,
                data_type=data_type,
            )

            x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond)

            return guidance_scale * x_out + (1.0 - guidance_scale) * x_out_uncond
        elif mode == "text2img":
            # Text-conditioned image generation.
            img_vae_latents, img_clip_latents = self._split(latents, height, width)

            img_vae_out, img_clip_out, text_out = self.unet(
                img_vae_latents, img_clip_latents, prompt_embeds, timestep_img=t, timestep_text=0, data_type=data_type
            )

            img_out = self._combine(img_vae_out, img_clip_out)

            if guidance_scale <= 1.0:
                return img_out

            # Classifier-free guidance with a random text embedding at the maximum timestep.
            text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)

            img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet(
                img_vae_latents,
                img_clip_latents,
                text_T,
                timestep_img=t,
                timestep_text=max_timestep,
                data_type=data_type,
            )

            img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond)

            return guidance_scale * img_out + (1.0 - guidance_scale) * img_out_uncond
        elif mode == "img2text":
            # Image-conditioned text generation.
            img_vae_out, img_clip_out, text_out = self.unet(
                img_vae, img_clip, latents, timestep_img=0, timestep_text=t, data_type=data_type
            )

            if guidance_scale <= 1.0:
                return text_out

            # Classifier-free guidance with random image latents at the maximum timestep.
            img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype)
            img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype)

            img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet(
                img_vae_T, img_clip_T, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type
            )

            return guidance_scale * text_out + (1.0 - guidance_scale) * text_out_uncond
        elif mode == "text":
            # Unconditional ("marginal") text generation; no classifier-free guidance is used.
            img_vae_out, img_clip_out, text_out = self.unet(
                img_vae, img_clip, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type
            )

            return text_out
        elif mode == "img":
            # Unconditional ("marginal") image generation; no classifier-free guidance is used.
            img_vae_latents, img_clip_latents = self._split(latents, height, width)

            img_vae_out, img_clip_out, text_out = self.unet(
                img_vae_latents,
                img_clip_latents,
                prompt_embeds,
                timestep_img=t,
                timestep_text=max_timestep,
                data_type=data_type,
            )

            img_out = self._combine(img_vae_out, img_clip_out)
            return img_out

    def check_latents_shape(self, latents_name, latents, expected_shape):
        latents_shape = latents.shape
        expected_num_dims = len(expected_shape) + 1  # expected dimensions plus the batch dimension
        expected_shape_str = ", ".join(str(dim) for dim in expected_shape)
        if len(latents_shape) != expected_num_dims:
            raise ValueError(
                f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape"
                f" {latents_shape} has {len(latents_shape)} dimensions."
            )
        for i in range(1, expected_num_dims):
            if latents_shape[i] != expected_shape[i - 1]:
                raise ValueError(
                    f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape"
                    f" {latents_shape} has {latents_shape[i]} != {expected_shape[i - 1]} at dimension {i}."
                )

    def check_inputs(
        self,
        mode,
        prompt,
        image,
        height,
        width,
        callback_steps,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        latents=None,
        prompt_latents=None,
        vae_latents=None,
        clip_latents=None,
    ):
        # Check inputs before running the generative process.
        if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0:
            raise ValueError(
                f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}."
            )

        if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if mode == "text2img":
            if prompt is not None and prompt_embeds is not None:
                raise ValueError(
                    f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure"
                    " to only forward one of the two."
                )
            elif prompt is None and prompt_embeds is None:
                raise ValueError(
                    "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds`"
                    " undefined."
                )
            elif prompt is not None and not isinstance(prompt, (str, list)):
                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

            if negative_prompt is not None and negative_prompt_embeds is not None:
                raise ValueError(
                    f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                    f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
                )

            if prompt_embeds is not None and negative_prompt_embeds is not None:
                if prompt_embeds.shape != negative_prompt_embeds.shape:
                    raise ValueError(
                        "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly,"
                        f" but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                        f" {negative_prompt_embeds.shape}."
                    )

        if mode == "img2text":
            if image is None:
                raise ValueError("`img2text` mode requires an image to be provided.")

        # Check provided latents.
        latent_height = height // self.vae_scale_factor
        latent_width = width // self.vae_scale_factor
        full_latents_available = latents is not None
        prompt_latents_available = prompt_latents is not None
        vae_latents_available = vae_latents is not None
        clip_latents_available = clip_latents is not None

        if full_latents_available:
            individual_latents_available = (
                prompt_latents is not None or vae_latents is not None or clip_latents is not None
            )
            if individual_latents_available:
                logger.warning(
                    "You have supplied both `latents` and at least one of `prompt_latents`, `vae_latents`, and"
                    " `clip_latents`. The value of `latents` will override the value of any individually supplied"
                    " latents."
                )
            # Check the shape of the full joint latents.
            img_vae_dim = self.num_channels_latents * latent_height * latent_width
            text_dim = self.text_encoder_seq_len * self.text_intermediate_dim
            latents_dim = img_vae_dim + self.image_encoder_projection_dim + text_dim
            latents_expected_shape = (latents_dim,)
            self.check_latents_shape("latents", latents, latents_expected_shape)

        # Check individual latent shapes, if present.
        if prompt_latents_available:
            prompt_latents_expected_shape = (self.text_encoder_seq_len, self.text_encoder_hidden_size)
            self.check_latents_shape("prompt_latents", prompt_latents, prompt_latents_expected_shape)

        if vae_latents_available:
            vae_latents_expected_shape = (self.num_channels_latents, latent_height, latent_width)
            self.check_latents_shape("vae_latents", vae_latents, vae_latents_expected_shape)

        if clip_latents_available:
            clip_latents_expected_shape = (1, self.image_encoder_projection_dim)
            self.check_latents_shape("clip_latents", clip_latents, clip_latents_expected_shape)

        if mode in ["text2img", "img"] and vae_latents_available and clip_latents_available:
            if vae_latents.shape[0] != clip_latents.shape[0]:
                raise ValueError(
                    "Both `vae_latents` and `clip_latents` are supplied, but their batch dimensions are not equal:"
                    f" {vae_latents.shape[0]} != {clip_latents.shape[0]}."
                )

        if mode == "joint" and prompt_latents_available and vae_latents_available and clip_latents_available:
            if prompt_latents.shape[0] != vae_latents.shape[0] or vae_latents.shape[0] != clip_latents.shape[0]:
                raise ValueError(
                    "All of `prompt_latents`, `vae_latents`, and `clip_latents` are supplied, but their batch"
                    f" dimensions are not equal: {prompt_latents.shape[0]} != {vae_latents.shape[0]}"
                    f" != {clip_latents.shape[0]}."
                )
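
    # Note on the classifier-free guidance performed in `_get_noise_pred` above: with guidance weight w and the
    # conditional/unconditional predictions eps_cond and eps_uncond, the guided prediction is computed as
    #
    #     eps = w * eps_cond + (1 - w) * eps_uncond
    #
    # which is algebraically the familiar eps_uncond + w * (eps_cond - eps_uncond). The "unconditional" branch is
    # obtained by feeding random noise for the conditioning modality at the maximum timestep, following the
    # UniDiffuser formulation. (This comment is an explanatory addition, not original code.)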

    @torch.no_grad()
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        image: Optional[Union[torch.Tensor, PIL.Image.Image]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        data_type: Optional[int] = 1,
        num_inference_steps: int = 50,
        guidance_scale: float = 8.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        num_prompts_per_image: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_latents: Optional[torch.Tensor] = None,
        vae_latents: Optional[torch.Tensor] = None,
        clip_latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
    ):
        r"""
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
                Required for text-conditioned image generation (`text2img`) mode.
            image (`torch.Tensor` or `PIL.Image.Image`, *optional*):
                `Image` or tensor representing an image batch. Required for image-conditioned text generation
                (`img2text`) mode.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            data_type (`int`, *optional*, defaults to 1):
                The data type (either 0 or 1). Only used if you are loading a checkpoint which supports a data type
                embedding; this is added for compatibility with the
                [UniDiffuser-v1](https://huggingface.co/thu-ml/unidiffuser-v1) checkpoint.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 8.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). Used
                in text-conditioned image generation (`text2img`) mode.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt. Used in `text2img` (text-conditioned image generation)
                and `img` mode. If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image` are
                supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples are generated.
            num_prompts_per_image (`int`, *optional*, defaults to 1):
                The number of prompts to generate per image. Used in `img2text` (image-conditioned text generation)
                and `text` mode. If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image`
                are supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples are generated.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper.
                Only applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for joint
                image-text generation. Can be used to tweak the same generation with different prompts. If not
                provided, a latents tensor is generated by sampling using the supplied random `generator`. This is
                assumed to be a full set of VAE, CLIP, and text latents; if supplied, it overrides the value of
                `prompt_latents`, `vae_latents`, and `clip_latents`.
            prompt_latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for text
                generation. Can be used to tweak the same generation with different prompts. If not provided, a
                latents tensor is generated by sampling using the supplied random `generator`.
            vae_latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a
                latents tensor is generated by sampling using the supplied random `generator`.
            clip_latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a
                latents tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument. Used in text-conditioned
                image generation (`text2img`) mode.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting).
                If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
                Used in text-conditioned image generation (`text2img`) mode.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImageTextPipelineOutput`] instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that is called every `callback_steps` steps during inference with the arguments
                `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.

        Returns:
            [`~pipelines.unidiffuser.ImageTextPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.unidiffuser.ImageTextPipelineOutput`] is returned, otherwise
                a `tuple` is returned where the first element is a list with the generated images and the second
                element is a list of generated texts.
        """
        # 0. Default height and width to the U-ViT resolution.
        height = height or self.unet_resolution * self.vae_scale_factor
        width = width or self.unet_resolution * self.vae_scale_factor

        # 1. Infer the generation mode and check inputs. Raise an error if they are not correct.
        mode = self._infer_mode(prompt, prompt_embeds, image, latents, prompt_latents, vae_latents, clip_latents)
        self.check_inputs(
            mode,
            prompt,
            image,
            height,
            width,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            latents,
            prompt_latents,
            vae_latents,
            clip_latents,
        )

        # 2. Define call parameters.
        batch_size, multiplier = self._infer_batch_size(
            mode,
            prompt,
            prompt_embeds,
            image,
            num_images_per_prompt,
            num_prompts_per_image,
            latents,
            prompt_latents,
            vae_latents,
            clip_latents,
        )
        device = self._execution_device
        reduce_text_emb_dim = self.text_intermediate_dim < self.text_encoder_hidden_size or self.mode != "text2img"

        # Classifier-free guidance is enabled when guidance_scale > 1.0; note that this differs slightly from the
        # formulation in the UniDiffuser paper.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. A full joint `latents` tensor, if supplied, overrides the individually supplied latents.
        if latents is not None:
            vae_latents, clip_latents, prompt_latents = self._split_joint(latents, height, width)

        # 4. Encode the input prompt (text2img) or prepare text latents (all other modes).
        if mode in ["text2img"]:
            assert prompt is not None or prompt_embeds is not None
            prompt_embeds, negative_prompt_embeds = self.encode_prompt(
                prompt=prompt,
                device=device,
                num_images_per_prompt=multiplier,
                do_classifier_free_guidance=do_classifier_free_guidance,
                negative_prompt=negative_prompt,
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
            )
        else:
            prompt_embeds = self.prepare_text_latents(
                batch_size=batch_size,
                num_images_per_prompt=multiplier,
                seq_len=self.text_encoder_seq_len,
                hidden_size=self.text_encoder_hidden_size,
                dtype=self.text_encoder.dtype,  # should work with both full and mixed precision
                device=device,
                generator=generator,
                latents=prompt_latents,
            )

        if reduce_text_emb_dim and latents is None:
            # Project the text embedding down to the dimension used by the U-ViT. The text component of a supplied
            # joint latent is already in the reduced space, so it is left untouched.
            prompt_embeds = self.text_decoder.encode_prefix(prompt_embeds)

        # 5. Prepare image latents (VAE + CLIP) either by encoding a supplied image (img2text) or by sampling noise.
        if mode in ["img2text"]:
            assert image is not None, "`img2text` requires a conditioning image"
            # Encode the image with the VAE.
            image_vae = self.image_processor.preprocess(image)
            height, width = image_vae.shape[-2:]
            image_vae_latents = self.encode_image_vae_latents(
                image=image_vae,
                batch_size=batch_size,
                num_prompts_per_image=multiplier,
                dtype=prompt_embeds.dtype,
                device=device,
                do_classifier_free_guidance=False,  # guidance is handled in `_get_noise_pred` instead
                generator=generator,
            )

            # Encode the image with CLIP.
            image_clip_latents = self.encode_image_clip_latents(
                image=image,
                batch_size=batch_size,
                num_prompts_per_image=multiplier,
                dtype=prompt_embeds.dtype,
                device=device,
                generator=generator,
            )
            # (batch_size, clip_hidden_size) => (batch_size, 1, clip_hidden_size)
            image_clip_latents = image_clip_latents.unsqueeze(1)
        else:
            # Prepare VAE image latents.
            image_vae_latents = self.prepare_image_vae_latents(
                batch_size=batch_size,
                num_prompts_per_image=multiplier,
                num_channels_latents=self.num_channels_latents,
                height=height,
                width=width,
                dtype=prompt_embeds.dtype,
                device=device,
                generator=generator,
                latents=vae_latents,
            )

            # Prepare CLIP image latents.
            image_clip_latents = self.prepare_image_clip_latents(
                batch_size=batch_size,
                num_prompts_per_image=multiplier,
                clip_img_dim=self.image_encoder_projection_dim,
                dtype=prompt_embeds.dtype,
                device=device,
                generator=generator,
                latents=clip_latents,
            )

        # 6. Prepare timesteps.
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
        max_timestep = self.scheduler.config.num_train_timesteps

        # 7. Prepare the initial latent variable for the chosen mode.
        if mode == "joint":
            latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds)
        elif mode in ["text2img", "img"]:
            latents = self._combine(image_vae_latents, image_clip_latents)
        elif mode in ["img2text", "text"]:
            latents = prompt_embeds

        # 8. Prepare extra step kwargs (eta is only used by the DDIM scheduler).
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        logger.debug(f"Scheduler extra step kwargs: {extra_step_kwargs}")

        # 9. Denoising loop.
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # Predict the noise residual.
                noise_pred = self._get_noise_pred(
                    mode,
                    latents,
                    t,
                    prompt_embeds,
                    image_vae_latents,
                    image_clip_latents,
                    max_timestep,
                    data_type,
                    guidance_scale,
                    generator,
                    device,
                    height,
                    width,
                )

                # Compute the previous noisy sample x_t -> x_t-1.
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # Call the callback, if provided.
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

                if XLA_AVAILABLE:
                    xm.mark_step()

        # 10. Post-processing: split the final latents and decode them into images and/or text.
        image = None
        text = None
        if mode == "joint":
            image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width)

            if not output_type == "latent":
                # Map the latent VAE image back to pixel space.
                image = self.vae.decode(image_vae_latents / self.vae.config.scaling_factor, return_dict=False)[0]
            else:
                image = image_vae_latents

            text = self.decode_text_latents(text_latents, device)
        elif mode in ["text2img", "img"]:
            image_vae_latents, image_clip_latents = self._split(latents, height, width)

            if not output_type == "latent":
                # Map the latent VAE image back to pixel space.
                image = self.vae.decode(image_vae_latents / self.vae.config.scaling_factor, return_dict=False)[0]
            else:
                image = image_vae_latents
        elif mode in ["img2text", "text"]:
            text_latents = latents
            text = self.decode_text_latents(text_latents, device)

        self.maybe_free_model_hooks()

        # 11. Convert the decoded image to PIL / NumPy as requested.
        if image is not None:
            do_denormalize = [True] * image.shape[0]
            image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        # Offload the last model to CPU, if model CPU offloading was used.
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, text)

        return ImageTextPipelineOutput(images=image, text=text)
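

# Illustrative sketch of reproducible joint generation using explicit latents (a documentation-only addition; the
# shapes assume the reference UniDiffuser-v1 configuration and the checkpoint id is an assumption):
#
#     import torch
#     from diffusers import UniDiffuserPipeline
#
#     pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16).to("cuda")
#     generator = torch.Generator("cuda").manual_seed(0)
#
#     # Supplying the individual latents pins the starting noise, so repeated calls give the same image/text pair.
#     prompt_latents = torch.randn(1, 77, 768, generator=generator, device="cuda", dtype=torch.float16)
#     vae_latents = torch.randn(1, 4, 64, 64, generator=generator, device="cuda", dtype=torch.float16)
#     clip_latents = torch.randn(1, 1, 512, generator=generator, device="cuda", dtype=torch.float16)
#
#     sample = pipe(
#         prompt_latents=prompt_latents, vae_latents=vae_latents, clip_latents=clip_latents, num_inference_steps=25
#     )
#     image, text = sample.images[0], sample.text[0]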