
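# This module implements the unCLIP text-to-image pipeline. A single `__call__` chains
# three diffusion stages: a prior that maps the CLIP text embedding to a CLIP image
# embedding, a decoder that turns that image embedding into a low-resolution image, and
# two super-resolution UNets that upscale the decoder output.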
import inspect
from typing import List, Optional, Tuple, Union

import torch
from torch.nn import functional as F
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from transformers.models.clip.modeling_clip import CLIPTextModelOutput

from ...models import PriorTransformer, UNet2DConditionModel, UNet2DModel
from ...schedulers import UnCLIPScheduler
from ...utils import is_torch_xla_available, logging
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, ImagePipelineOutput
from .text_proj import UnCLIPTextProjModel


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)


class UnCLIPPipeline(DeprecatedPipelineMixin, DiffusionPipeline):
    """
    Pipeline for text-to-image generation using unCLIP.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
            The decoder to invert the image embedding into an image.
        super_res_first ([`UNet2DModel`]):
            Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
        super_res_last ([`UNet2DModel`]):
            Super resolution UNet. Used in the last step of the super resolution diffusion process.
        prior_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the prior denoising process (a modified [`DDPMScheduler`]).
        decoder_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
        super_res_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).

    """

    _last_supported_version = "0.33.1"

    _exclude_from_cpu_offload = ["prior"]

    prior: PriorTransformer
    decoder: UNet2DConditionModel
    text_proj: UnCLIPTextProjModel
    text_encoder: CLIPTextModelWithProjection
    tokenizer: CLIPTokenizer
    super_res_first: UNet2DModel
    super_res_last: UNet2DModel

    prior_scheduler: UnCLIPScheduler
    decoder_scheduler: UnCLIPScheduler
    super_res_scheduler: UnCLIPScheduler

    model_cpu_offload_seq = "text_encoder->text_proj->decoder->super_res_first->super_res_last"

    def __init__(
        self,
        prior: PriorTransformer,
        decoder: UNet2DConditionModel,
        text_encoder: CLIPTextModelWithProjection,
        tokenizer: CLIPTokenizer,
        text_proj: UnCLIPTextProjModel,
        super_res_first: UNet2DModel,
        super_res_last: UNet2DModel,
        prior_scheduler: UnCLIPScheduler,
        decoder_scheduler: UnCLIPScheduler,
        super_res_scheduler: UnCLIPScheduler,
    ):
        super().__init__()

        self.register_modules(
            prior=prior,
            decoder=decoder,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            text_proj=text_proj,
            super_res_first=super_res_first,
            super_res_last=super_res_last,
            prior_scheduler=prior_scheduler,
            decoder_scheduler=decoder_scheduler,
            super_res_scheduler=super_res_scheduler,
        )

    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
        # Draw fresh noise unless the caller passed pre-generated latents.
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            if latents.shape != shape:
                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
            latents = latents.to(device)

        latents = latents * scheduler.init_noise_sigma
        return latents

    def _encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
        text_attention_mask: Optional[torch.Tensor] = None,
    ):
        if text_model_output is None:
            batch_size = len(prompt) if isinstance(prompt, list) else 1
            # get prompt text embeddings
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            text_mask = text_inputs.attention_mask.bool().to(device)

            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )
                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]

            text_encoder_output = self.text_encoder(text_input_ids.to(device))

            prompt_embeds = text_encoder_output.text_embeds
            text_enc_hid_states = text_encoder_output.last_hidden_state

        else:
            batch_size = text_model_output[0].shape[0]
            prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1]
            text_mask = text_attention_mask

        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0)
        text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0)

        if do_classifier_free_guidance:
            uncond_tokens = [""] * batch_size

            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            uncond_text_mask = uncond_input.attention_mask.bool().to(device)
            negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device))

            negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds
            uncond_text_enc_hid_states = negative_prompt_embeds_text_encoder_output.last_hidden_state

            # duplicate unconditional embeddings for each generation per prompt
            seq_len = negative_prompt_embeds.shape[1]
            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len)

            seq_len = uncond_text_enc_hid_states.shape[1]
            uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1)
            uncond_text_enc_hid_states = uncond_text_enc_hid_states.view(
                batch_size * num_images_per_prompt, seq_len, -1
            )
            uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes.
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states])
            text_mask = torch.cat([uncond_text_mask, text_mask])

        return prompt_embeds, text_enc_hid_states, text_mask

    @torch.no_grad()
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: int = 1,
        prior_num_inference_steps: int = 25,
        decoder_num_inference_steps: int = 25,
        super_res_num_inference_steps: int = 7,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prior_latents: Optional[torch.Tensor] = None,
        decoder_latents: Optional[torch.Tensor] = None,
        super_res_latents: Optional[torch.Tensor] = None,
        text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None,
        text_attention_mask: Optional[torch.Tensor] = None,
        prior_guidance_scale: float = 4.0,
        decoder_guidance_scale: float = 8.0,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
        """
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide image generation. This can only be left undefined if `text_model_output`
                and `text_attention_mask` are passed.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            prior_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the prior. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
                quality image at the expense of slower inference.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*):
                Pre-generated noisy latents to be used as inputs for the prior.
            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
                Pre-generated noisy latents to be used as inputs for the decoder.
            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
                Pre-generated noisy latents to be used as inputs for the super resolution UNets.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            decoder_guidance_scale (`float`, *optional*, defaults to 8.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            text_model_output (`CLIPTextModelOutput`, *optional*):
                Pre-defined [`CLIPTextModel`] outputs that can be derived from the text encoder. Pre-defined text
                outputs can be passed for tasks like text embedding interpolations. Make sure to also pass
                `text_attention_mask` in this case. `prompt` can then be left `None`.
            text_attention_mask (`torch.Tensor`, *optional*):
                Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention
                masks are necessary when passing `text_model_output`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """
        if prompt is not None:
            if isinstance(prompt, str):
                batch_size = 1
            elif isinstance(prompt, list):
                batch_size = len(prompt)
            else:
                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
        else:
            batch_size = text_model_output[0].shape[0]

        device = self._execution_device

        batch_size = batch_size * num_images_per_prompt

        do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0

        prompt_embeds, text_enc_hid_states, text_mask = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask
        )

        # prior

        self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device)
        prior_timesteps_tensor = self.prior_scheduler.timesteps

        embedding_dim = self.prior.config.embedding_dim

        prior_latents = self.prepare_latents(
            (batch_size, embedding_dim),
            prompt_embeds.dtype,
            device,
            generator,
            prior_latents,
            self.prior_scheduler,
        )

        for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents

            predicted_image_embedding = self.prior(
                latent_model_input,
                timestep=t,
                proj_embedding=prompt_embeds,
                encoder_hidden_states=text_enc_hid_states,
                attention_mask=text_mask,
            ).predicted_image_embedding

            if do_classifier_free_guidance:
                predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2)
                predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * (
                    predicted_image_embedding_text - predicted_image_embedding_uncond
                )

            if i + 1 == prior_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = prior_timesteps_tensor[i + 1]

            prior_latents = self.prior_scheduler.step(
                predicted_image_embedding,
                timestep=t,
                sample=prior_latents,
                generator=generator,
                prev_timestep=prev_timestep,
            ).prev_sample

        prior_latents = self.prior.post_process_latents(prior_latents)

        image_embeddings = prior_latents

        # done prior

        # decoder

        text_enc_hid_states, additive_clip_time_embeddings = self.text_proj(
            image_embeddings=image_embeddings,
            prompt_embeds=prompt_embeds,
            text_encoder_hidden_states=text_enc_hid_states,
            do_classifier_free_guidance=do_classifier_free_guidance,
        )

        if device.type == "mps":
            # HACK: MPS: There is a panic when padding bool tensors,
            # so cast to int tensor for the pad and back to bool afterwards
            text_mask = text_mask.type(torch.int)
            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1)
            decoder_text_mask = decoder_text_mask.type(torch.bool)
        else:
            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True)

        self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device)
        decoder_timesteps_tensor = self.decoder_scheduler.timesteps

        num_channels_latents = self.decoder.config.in_channels
        height = self.decoder.config.sample_size
        width = self.decoder.config.sample_size

        decoder_latents = self.prepare_latents(
            (batch_size, num_channels_latents, height, width),
            text_enc_hid_states.dtype,
            device,
            generator,
            decoder_latents,
            self.decoder_scheduler,
        )

        for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents

            noise_pred = self.decoder(
                sample=latent_model_input,
                timestep=t,
                encoder_hidden_states=text_enc_hid_states,
                class_labels=additive_clip_time_embeddings,
                attention_mask=decoder_text_mask,
            ).sample

            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1)
                noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1)
                noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)

            if i + 1 == decoder_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = decoder_timesteps_tensor[i + 1]

            # compute the previous noisy sample x_t -> x_t-1
            decoder_latents = self.decoder_scheduler.step(
                noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator
            ).prev_sample

        decoder_latents = decoder_latents.clamp(-1, 1)

        image_small = decoder_latents

        # done decoder

        # super res

        self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device)
        super_res_timesteps_tensor = self.super_res_scheduler.timesteps

        channels = self.super_res_first.config.in_channels // 2
        height = self.super_res_first.config.sample_size
        width = self.super_res_first.config.sample_size

        super_res_latents = self.prepare_latents(
            (batch_size, channels, height, width),
            image_small.dtype,
            device,
            generator,
            super_res_latents,
            self.super_res_scheduler,
        )

        if device.type == "mps":
            # MPS does not support many interpolations
            image_upscaled = F.interpolate(image_small, size=[height, width])
        else:
            interpolate_antialias = {}
            if "antialias" in inspect.signature(F.interpolate).parameters:
                interpolate_antialias["antialias"] = True

            image_upscaled = F.interpolate(
                image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
            )

        for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
            # no classifier free guidance

            if i == super_res_timesteps_tensor.shape[0] - 1:
                unet = self.super_res_last
            else:
                unet = self.super_res_first

            latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1)

            noise_pred = unet(
                sample=latent_model_input,
                timestep=t,
            ).sample

            if i + 1 == super_res_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = super_res_timesteps_tensor[i + 1]

            # compute the previous noisy sample x_t -> x_t-1
            super_res_latents = self.super_res_scheduler.step(
                noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator
            ).prev_sample

            if XLA_AVAILABLE:
                xm.mark_step()

        image = super_res_latents

        # done super res

        self.maybe_free_model_hooks()

        # post processing

        image = image * 0.5 + 0.5
        image = image.clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
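

# Minimal usage sketch (not part of the module proper). It assumes the publicly hosted
# "kakaobrain/karlo-v1-alpha" unCLIP checkpoint and a CUDA device; substitute whichever
# compatible weights and device you actually use.
if __name__ == "__main__":
    pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
    pipe = pipe.to("cuda")

    # One call runs prior -> decoder -> super resolution and returns PIL images by default.
    image = pipe(
        "a photograph of an astronaut riding a horse",
        prior_guidance_scale=4.0,
        decoder_guidance_scale=8.0,
    ).images[0]
    image.save("unclip_sample.png")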