
import math
from typing import Optional, Union

import torch
from torch import nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...models import ModelMixin
from ...models.attention import FeedForward
from ...models.attention_processor import Attention
from ...models.embeddings import TimestepEmbedding, Timesteps, get_2d_sincos_pos_embed
from ...models.modeling_outputs import Transformer2DModelOutput
from ...models.normalization import AdaLayerNorm
from ...utils import logging


logger = logging.get_logger(__name__)


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    # Truncated-normal initializer (same routine as the PyTorch/timm implementation).
    def norm_cdf(x):
        # Standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        logger.warning(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect."
        )

    with torch.no_grad():
        # Get the upper and lower CDF values
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)

        # Uniformly fill the tensor with values in [2l - 1, 2u - 1], then use the inverse CDF transform
        # to obtain a truncated standard normal
        tensor.uniform_(2 * l - 1, 2 * u - 1)
        tensor.erfinv_()

        # Transform to the requested mean and standard deviation
        tensor.mul_(std * math.sqrt(2.0))
        tensor.add_(mean)

        # Clamp to ensure the values are within [a, b]
        tensor.clamp_(min=a, max=b)
        return tensor


def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean},
    \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for
    generating the random values works best when :math:`a \leq \text{mean} \leq b`.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)


class PatchEmbed(nn.Module):
    """2D Image to Patch Embedding"""

    def __init__(
        self,
        height=224,
        width=224,
        patch_size=16,
        in_channels=3,
        embed_dim=768,
        layer_norm=False,
        flatten=True,
        bias=True,
        use_pos_embed=True,
    ):
        super().__init__()

        num_patches = (height // patch_size) * (width // patch_size)
        self.flatten = flatten
        self.layer_norm = layer_norm

        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
        )
        if layer_norm:
            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
        else:
            self.norm = None

        self.use_pos_embed = use_pos_embed
        if self.use_pos_embed:
            pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5), output_type="pt")
            self.register_buffer("pos_embed", pos_embed.float().unsqueeze(0), persistent=False)

    def forward(self, latent):
        latent = self.proj(latent)
        if self.flatten:
            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
        if self.layer_norm:
            latent = self.norm(latent)
        if self.use_pos_embed:
            return latent + self.pos_embed
        else:
            return latent


class SkipBlock(nn.Module):
    def __init__(self, dim: int):
        super().__init__()

        self.skip_linear = nn.Linear(2 * dim, dim)
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, skip):
        # Concatenate the skip connection with the current hidden states, project back to `dim`, and normalize.
        x = self.skip_linear(torch.cat([x, skip], dim=-1))
        x = self.norm(x)

        return x


class UTransformerBlock(nn.Module):
    """
    A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (:obj: `int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:obj: `bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float32 when performing the attention calculation.
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The layer norm implementation to use.
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). Note that `BasicTransformerBlock` uses pre-LayerNorm, e.g.
            `pre_layer_norm = True`.
        final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
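
    Examples:
        A minimal, illustrative construction (the sizes below are hypothetical and not tied to any released
        checkpoint):

        >>> import torch
        >>> block = UTransformerBlock(dim=64, num_attention_heads=8, attention_head_dim=8)
        >>> hidden_states = torch.randn(2, 16, 64)
        >>> out = block(hidden_states)  # output keeps the input shape: (2, 16, 64)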
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        pre_layer_norm: bool = True,
        final_dropout: bool = False,
    ):
        super().__init__()
        self.only_cross_attention = only_cross_attention

        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"

        self.pre_layer_norm = pre_layer_norm

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        # 1. Self-attention (or cross-attention if `only_cross_attention` is set)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
        )

        # 2. Cross-attention (acts as a second self-attention if `double_self_attention` is set)
        if cross_attention_dim is not None or double_self_attention:
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
            )
        else:
            self.attn2 = None

        if self.use_ada_layer_norm:
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        else:
            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)

        if cross_attention_dim is not None or double_self_attention:
            self.norm2 = (
                AdaLayerNorm(dim, num_embeds_ada_norm)
                if self.use_ada_layer_norm
                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
            )
        else:
            self.norm2 = None

        # 3. Feed-forward
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
        cross_attention_kwargs=None,
        class_labels=None,
    ):
        # 1. Self-attention, with LayerNorm applied before (pre-LayerNorm) or after (post-LayerNorm) attention
        if self.pre_layer_norm:
            if self.use_ada_layer_norm:
                norm_hidden_states = self.norm1(hidden_states, timestep)
            else:
                norm_hidden_states = self.norm1(hidden_states)
        else:
            norm_hidden_states = hidden_states

        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )

        if not self.pre_layer_norm:
            if self.use_ada_layer_norm:
                attn_output = self.norm1(attn_output, timestep)
            else:
                attn_output = self.norm1(attn_output)

        hidden_states = attn_output + hidden_states

        # 2. Cross-attention, if configured
        if self.attn2 is not None:
            if self.pre_layer_norm:
                norm_hidden_states = (
                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
                )
            else:
                norm_hidden_states = hidden_states

            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                **cross_attention_kwargs,
            )

            if not self.pre_layer_norm:
                attn_output = self.norm2(attn_output, timestep) if self.use_ada_layer_norm else self.norm2(attn_output)

            hidden_states = attn_output + hidden_states

        # 3. Feed-forward
        if self.pre_layer_norm:
            norm_hidden_states = self.norm3(hidden_states)
        else:
            norm_hidden_states = hidden_states

        ff_output = self.ff(norm_hidden_states)

        if not self.pre_layer_norm:
            ff_output = self.norm3(ff_output)

        hidden_states = ff_output + hidden_states

        return hidden_states


class UniDiffuserBlock(nn.Module):
    """
    A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations and puts the
    LayerNorms on the residual backbone of the block. This matches the transformer block in the [original UniDiffuser
    implementation](https://github.com/thu-ml/unidiffuser/blob/main/libs/uvit_multi_post_ln_v1.py#L104).
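
    For illustration only (hypothetical sizes, not tied to any released checkpoint; note that `pre_layer_norm`
    defaults to `False` here, matching the post-LayerNorm ordering of the original UniDiffuser block):

        >>> import torch
        >>> block = UniDiffuserBlock(dim=64, num_attention_heads=8, attention_head_dim=8)
        >>> out = block(torch.randn(2, 16, 64))  # LayerNorms sit on the residual backbone; shape (2, 16, 64) is kept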

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`):
            Activation function to be used in feed-forward.
        num_embeds_ada_norm (:obj: `int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:obj: `bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float32 when performing the attention calculation.
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        norm_type (`str`, defaults to `"layer_norm"`):
            The layer norm implementation to use.
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
            (`pre_layer_norm = False`).
        final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",
        pre_layer_norm: bool = False,
        final_dropout: bool = True,
    ):
        super().__init__()
        self.only_cross_attention = only_cross_attention

        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"

        self.pre_layer_norm = pre_layer_norm

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        # 1. Self-attention (or cross-attention if `only_cross_attention` is set)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
        )

        # 2. Cross-attention (acts as a second self-attention if `double_self_attention` is set)
        if cross_attention_dim is not None or double_self_attention:
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
            )
        else:
            self.attn2 = None

        if self.use_ada_layer_norm:
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        else:
            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)

        if cross_attention_dim is not None or double_self_attention:
            self.norm2 = (
                AdaLayerNorm(dim, num_embeds_ada_norm)
                if self.use_ada_layer_norm
                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
            )
        else:
            self.norm2 = None

        # 3. Feed-forward
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        timestep=None,
        cross_attention_kwargs=None,
        class_labels=None,
    ):
        # Unlike `UTransformerBlock`, the LayerNorms are applied directly on the residual backbone
        # (`hidden_states` itself is normalized rather than a separate branch).

        # 1. Self-attention
        if self.pre_layer_norm:
            if self.use_ada_layer_norm:
                hidden_states = self.norm1(hidden_states, timestep)
            else:
                hidden_states = self.norm1(hidden_states)

        cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
        attn_output = self.attn1(
            hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )

        hidden_states = attn_output + hidden_states

        if not self.pre_layer_norm:
            if self.use_ada_layer_norm:
                hidden_states = self.norm1(hidden_states, timestep)
            else:
                hidden_states = self.norm1(hidden_states)

        # 2. Cross-attention, if configured
        if self.attn2 is not None:
            if self.pre_layer_norm:
                hidden_states = (
                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
                )

            attn_output = self.attn2(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                **cross_attention_kwargs,
            )

            hidden_states = attn_output + hidden_states

            if not self.pre_layer_norm:
                hidden_states = (
                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
                )

        # 3. Feed-forward
        if self.pre_layer_norm:
            hidden_states = self.norm3(hidden_states)

        ff_output = self.ff(hidden_states)

        hidden_states = ff_output + hidden_states

        if not self.pre_layer_norm:
            hidden_states = self.norm3(hidden_states)

        return hidden_states


class UTransformer2DModel(ModelMixin, ConfigMixin):
    """
    Transformer model based on the [U-ViT](https://github.com/baofff/U-ViT) architecture for image-like data. Compared
    to [`Transformer2DModel`], this model has skip connections between transformer blocks in a "U"-shaped fashion,
    similar to a U-Net. Supports only continuous (actual embeddings) inputs, which are embedded via a [`PatchEmbed`]
    layer and then reshaped to (b, t, d).
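
    For example, with a hypothetical `sample_size = 64`, `patch_size = 2`, and `in_channels = 4`, a latent of shape
    `(b, 4, 64, 64)` is embedded into `(b, 32 * 32, inner_dim)` tokens before entering the transformer blocks.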

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            Pass if the input is continuous. The number of channels in the input.
        out_channels (`int`, *optional*):
            The number of output channels; if `None`, defaults to `in_channels`.
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        norm_num_groups (`int`, *optional*, defaults to `32`):
            The number of groups to use when performing Group Normalization.
        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the TransformerBlocks' attention should contain a bias parameter.
        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
            `ImagePositionalEmbeddings`.
        num_vector_embeds (`int`, *optional*):
            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
            Includes the class for the masked latent pixel.
        patch_size (`int`, *optional*, defaults to 2):
            The patch size to use in the patch embedding.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
            up to but not more than steps than `num_embeds_ada_norm`.
        use_linear_projection (int, *optional*): TODO: Not used
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used in each
            transformer block.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float32 when performing the attention calculation.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
        block_type (`str`, *optional*, defaults to `"unidiffuser"`):
            The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
            backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
            behavior in `diffusers`).
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
            (`pre_layer_norm = False`).
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        use_patch_pos_embed (`bool`, *optional*):
            Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
        final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
    """

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 16,
        attention_head_dim: int = 88,
        in_channels: Optional[int] = None,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        sample_size: Optional[int] = None,
        num_vector_embeds: Optional[int] = None,
        patch_size: Optional[int] = 2,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,
        norm_type: str = "layer_norm",
        block_type: str = "unidiffuser",
        pre_layer_norm: bool = False,
        norm_elementwise_affine: bool = True,
        use_patch_pos_embed=False,
        ff_final_dropout: bool = False,
    ):
        super().__init__()
        self.use_linear_projection = use_linear_projection
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim

        # 1. Input: only patched (continuous) input is supported
        assert in_channels is not None and patch_size is not None, "Patch input requires in_channels and patch_size."
        assert sample_size is not None, "UTransformer2DModel over patched input must provide sample_size"

        # 2. Define input layers
        self.height = sample_size
        self.width = sample_size

        self.patch_size = patch_size
        self.pos_embed = PatchEmbed(
            height=sample_size,
            width=sample_size,
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=inner_dim,
            use_pos_embed=use_patch_pos_embed,
        )

        # 3. Define transformer blocks: in ("down") blocks, a mid block, and out ("up") blocks, with skip
        # connections from in_blocks to out_blocks in a "U"-shaped fashion (first in_block to last out_block, etc.).
        block_cls = UniDiffuserBlock if block_type == "unidiffuser" else UTransformerBlock
        block_kwargs = dict(
            dropout=dropout,
            cross_attention_dim=cross_attention_dim,
            activation_fn=activation_fn,
            num_embeds_ada_norm=num_embeds_ada_norm,
            attention_bias=attention_bias,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            norm_type=norm_type,
            pre_layer_norm=pre_layer_norm,
            norm_elementwise_affine=norm_elementwise_affine,
            final_dropout=ff_final_dropout,
        )

        self.transformer_in_blocks = nn.ModuleList(
            [
                block_cls(inner_dim, num_attention_heads, attention_head_dim, **block_kwargs)
                for _ in range(num_layers // 2)
            ]
        )

        self.transformer_mid_block = block_cls(inner_dim, num_attention_heads, attention_head_dim, **block_kwargs)

        # For each skip connection, a SkipBlock (concatenate + Linear + LayerNorm) processes the inputs before the
        # corresponding out_block.
        self.transformer_out_blocks = nn.ModuleList(
            [
                nn.ModuleDict(
                    {
                        "skip": SkipBlock(inner_dim),
                        "block": block_cls(inner_dim, num_attention_heads, attention_head_dim, **block_kwargs),
                    }
                )
                for _ in range(num_layers // 2)
            ]
        )

        # 4. Define output layers
        self.out_channels = in_channels if out_channels is None else out_channels
        self.norm_out = nn.LayerNorm(inner_dim)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        timestep=None,
        class_labels=None,
        cross_attention_kwargs=None,
        return_dict: bool = True,
        hidden_states_is_embedding: bool = False,
        unpatchify: bool = True,
    ):
        """
        Args:
            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
                When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `torch.long`, *optional*):
                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
                conditioning.
            cross_attention_kwargs (*optional*):
                Keyword arguments to supply to the cross attention layers, if used.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
            hidden_states_is_embedding (`bool`, *optional*, defaults to `False`):
                Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will
                ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the
                transformer blocks.
            unpatchify (`bool`, *optional*, defaults to `True`):
                Whether to unpatchify the transformer output.

        Returns:
            [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
            returning a tuple, the first element is the sample tensor.
        z!Cannot both define `unpatchify`: z and `return_dict`: z since when `unpatchify` is zy the returned output is of shape (batch_size, seq_len, hidden_dim) rather than (batch_size, num_channels, height, width).)r   r   r   r   rm   r   r   r<   rj   shapenhwpqc->nchpwq)sample)r   r?   r   appendr   r   popr   rJ   r   reshaperQ   r   r#   einsumr   )rN   r   r   r   r   r   r   r   r   skipsin_block	out_blockrO   rP   outputs                  r   rY   zUTransformer2DModel.forward  s   T k3J<?ST_S` a$$.< 0JJ  * NN=9M
 22 	(H$&;!'=)M LL'	( 22=A 44 	I-If-mUYY[IM.Ig.&;!'=)M	 m4  !4!4Q!73!>??FU)1165$//4??DL]L]^ 2 M "LL)9=IM"**4,,ft.FPTP_P_H_` + F #F9'v66r   )r[   X   NNr   r       NFNNr   r   NFFFrD   r   FTFF)NNNNTFT)r]   r^   r_   r`   r   rJ   r   rL   r   r   rB   rY   ra   rb   s   @r   r   r     s   7r  $&"$%)&*!-1$%)+/$%$-1&+%*!&%'$(,!!&1H0 H0  H0 c]	H0
 smH0 H0 H0 H0 &c]H0 H0 c]H0 $C=H0 SMH0 H0 &c]H0   $!H0" ##H0$ %H0& 'H0( )H0* +H0, "&-H00 1H0 H0Z ## +0f7 f7 %)f7 f7r   r   c            6           e Zd ZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%dedededededee   dee   d	ed
ededee   dedee   dee   dee   de	dee   dededede	de	dedededef4 fd       Z
ej                  j                  d        Z	 	 	 d&dej                  dej                  d ej                  d!eej                  eef   d"eej                  eef   d#eeej                  eef      fd$Z xZS )'UniDiffuserModela  
    Transformer model for an image-text [UniDiffuser](https://huggingface.co/papers/2303.06555) model. This is a
    modification of [`UTransformer2DModel`] with input and output heads for the VAE-embedded latent image, the
    CLIP-embedded image, and the CLIP-embedded prompt (see paper for more details).
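
    Internally, the three inputs are projected to the transformer width and concatenated into a single token sequence
    `[t_img token, t_text token, (optional data-type token,) text tokens, CLIP image token, VAE latent patches]`,
    which is processed by [`UTransformer2DModel`] and then split back into the three output heads.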

    Parameters:
        text_dim (`int`): The hidden dimension of the CLIP text model used to embed images.
        clip_img_dim (`int`): The hidden dimension of the CLIP vision model used to embed prompts.
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            Pass if the input is continuous. The number of channels in the input.
        out_channels (`int`, *optional*):
            The number of output channels; if `None`, defaults to `in_channels`.
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        norm_num_groups (`int`, *optional*, defaults to `32`):
            The number of groups to use when performing Group Normalization.
        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
        attention_bias (`bool`, *optional*):
            Configure if the TransformerBlocks' attention should contain a bias parameter.
        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
            `ImagePositionalEmbeddings`.
        num_vector_embeds (`int`, *optional*):
            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
            Includes the class for the masked latent pixel.
        patch_size (`int`, *optional*, defaults to 2):
            The patch size to use in the patch embedding.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
            up to but not more than steps than `num_embeds_ada_norm`.
        use_linear_projection (int, *optional*): TODO: Not used
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used in each
            transformer block.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the query and key to float32 when performing the attention calculation.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
        block_type (`str`, *optional*, defaults to `"unidiffuser"`):
            The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
            backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
            behavior in `diffusers`).
        pre_layer_norm (`bool`, *optional*):
            Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
            as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
            (`pre_layer_norm = False`).
        norm_elementwise_affine (`bool`, *optional*):
            Whether to use learnable per-element affine parameters during layer normalization.
        use_patch_pos_embed (`bool`, *optional*):
            Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
        ff_final_dropout (`bool`, *optional*):
            Whether to use a final Dropout layer after the feedforward network.
        use_data_type_embedding (`bool`, *optional*):
            Whether to use a data type embedding. This is only relevant for UniDiffuser-v1 style models; UniDiffuser-v1
            is continue-trained from UniDiffuser-v0 on non-publically-available data and accepts a `data_type`
            argument, which can either be `1` to use the weights trained on non-publically-available data or `0`
            otherwise. This argument is subsequently embedded by the data type embedding, if used.
    """

    @register_to_config
    def __init__(
        self,
        text_dim: int = 768,
        clip_img_dim: int = 512,
        num_text_tokens: int = 77,
        num_attention_heads: int = 16,
        attention_head_dim: int = 88,
        in_channels: Optional[int] = None,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        sample_size: Optional[int] = None,
        num_vector_embeds: Optional[int] = None,
        patch_size: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,
        norm_type: str = "layer_norm",
        block_type: str = "unidiffuser",
        pre_layer_norm: bool = False,
        use_timestep_embedding=False,
        norm_elementwise_affine: bool = True,
        use_patch_pos_embed=False,
        ff_final_dropout: bool = True,
        use_data_type_embedding: bool = False,
    ):
        super().__init__()

        # 0. Handle dimensions
        self.inner_dim = num_attention_heads * attention_head_dim

        assert sample_size is not None, "UniDiffuserModel over patched input must provide sample_size"
        self.sample_size = sample_size
        self.in_channels = in_channels
        self.out_channels = in_channels if out_channels is None else out_channels

        self.patch_size = patch_size
        # Assume the latent image is square
        self.num_patches = (self.sample_size // patch_size) * (self.sample_size // patch_size)

        # 1. Define input layers
        # 1.1. Input layers for the VAE latent image, CLIP image embedding, and CLIP text embedding
        self.vae_img_in = PatchEmbed(
            height=sample_size,
            width=sample_size,
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=self.inner_dim,
            use_pos_embed=use_patch_pos_embed,
        )
        self.clip_img_in = nn.Linear(clip_img_dim, self.inner_dim)
        self.text_in = nn.Linear(text_dim, self.inner_dim)

        # 1.2. Timestep embeddings for t_img and t_text
        self.timestep_img_proj = Timesteps(self.inner_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_img_embed = (
            TimestepEmbedding(self.inner_dim, 4 * self.inner_dim, out_dim=self.inner_dim)
            if use_timestep_embedding
            else nn.Identity()
        )

        self.timestep_text_proj = Timesteps(self.inner_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_text_embed = (
            TimestepEmbedding(self.inner_dim, 4 * self.inner_dim, out_dim=self.inner_dim)
            if use_timestep_embedding
            else nn.Identity()
        )

        # 1.3. Positional embedding over the full token sequence
        self.num_text_tokens = num_text_tokens
        self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, self.inner_dim))
        self.pos_embed_drop = nn.Dropout(p=dropout)
        trunc_normal_(self.pos_embed, std=0.02)

        # 1.4. Data type token embedding for UniDiffuser-v1, if used
        self.use_data_type_embedding = use_data_type_embedding
        if self.use_data_type_embedding:
            self.data_type_token_embedding = nn.Embedding(2, self.inner_dim)
            self.data_type_pos_embed_token = nn.Parameter(torch.zeros(1, 1, self.inner_dim))

        # 2. Define transformer blocks
        self.transformer = UTransformer2DModel(
            num_attention_heads=num_attention_heads,
            attention_head_dim=attention_head_dim,
            in_channels=in_channels,
            out_channels=out_channels,
            num_layers=num_layers,
            dropout=dropout,
            norm_num_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim,
            attention_bias=attention_bias,
            sample_size=sample_size,
            num_vector_embeds=num_vector_embeds,
            patch_size=patch_size,
            activation_fn=activation_fn,
            num_embeds_ada_norm=num_embeds_ada_norm,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            norm_type=norm_type,
            block_type=block_type,
            pre_layer_norm=pre_layer_norm,
            norm_elementwise_affine=norm_elementwise_affine,
            use_patch_pos_embed=use_patch_pos_embed,
            ff_final_dropout=ff_final_dropout,
        )

        # 3. Define output layers
        patch_dim = (patch_size**2) * out_channels
        self.vae_img_out = nn.Linear(self.inner_dim, patch_dim)
        self.clip_img_out = nn.Linear(self.inner_dim, clip_img_dim)
        self.text_out = nn.Linear(self.inner_dim, text_dim)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {"pos_embed"}

    def forward(
        self,
        latent_image_embeds: torch.Tensor,
        image_embeds: torch.Tensor,
        prompt_embeds: torch.Tensor,
        timestep_img: Union[torch.Tensor, float, int],
        timestep_text: Union[torch.Tensor, float, int],
        data_type: Optional[Union[torch.Tensor, float, int]] = 1,
        encoder_hidden_states=None,
        cross_attention_kwargs=None,
    ):
        """
        Args:
            latent_image_embeds (`torch.Tensor` of shape `(batch size, latent channels, height, width)`):
                Latent image representation from the VAE encoder.
            image_embeds (`torch.Tensor` of shape `(batch size, 1, clip_img_dim)`):
                CLIP-embedded image representation (unsqueezed in the first dimension).
            prompt_embeds (`torch.Tensor` of shape `(batch size, seq_len, text_dim)`):
                CLIP-embedded text representation.
            timestep_img (`torch.long` or `float` or `int`):
                Current denoising step for the image.
            timestep_text (`torch.long` or `float` or `int`):
                Current denoising step for the text.
            data_type: (`torch.int` or `float` or `int`, *optional*, defaults to `1`):
                Only used in UniDiffuser-v1-style models. Can be either `1`, to use weights trained on nonpublic data,
                or `0` otherwise.
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            cross_attention_kwargs (*optional*):
                Keyword arguments to supply to the cross attention layers, if used.
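
        Examples:
            Illustrative call with hypothetical sizes (real checkpoints define these values via their saved configs):

            >>> import torch
            >>> model = UniDiffuserModel(
            ...     text_dim=32, clip_img_dim=32, num_text_tokens=8, num_attention_heads=4, attention_head_dim=8,
            ...     in_channels=4, out_channels=4, sample_size=8, patch_size=2, num_layers=3,
            ... )
            >>> vae_out, clip_out, text_out = model(
            ...     latent_image_embeds=torch.randn(2, 4, 8, 8),
            ...     image_embeds=torch.randn(2, 1, 32),
            ...     prompt_embeds=torch.randn(2, 8, 32),
            ...     timestep_img=1,
            ...     timestep_text=1,
            ... )

            The three outputs have shapes `(2, 4, 8, 8)`, `(2, 1, 32)`, and `(2, 8, 32)`, mirroring the three inputs.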


        Returns:
            `tuple`: Returns relevant parts of the model's noise prediction: the first element of the tuple is the VAE
            image embedding, the second element is the CLIP image embedding, and the third element is the CLIP text
            embedding.
        """
        batch_size = latent_image_embeds.shape[0]

        # 1. Input
        # 1.1. Map inputs to shape (B, N, inner_dim)
        vae_hidden_states = self.vae_img_in(latent_image_embeds)
        clip_hidden_states = self.clip_img_in(image_embeds)
        text_hidden_states = self.text_in(prompt_embeds)

        num_text_tokens, num_img_tokens = text_hidden_states.size(1), vae_hidden_states.size(1)

        # 1.2. Encode the image timestep to a single token of shape (B, 1, inner_dim)
        if not torch.is_tensor(timestep_img):
            timestep_img = torch.tensor([timestep_img], dtype=torch.long, device=vae_hidden_states.device)

        # Broadcast to the batch dimension in a way that is compatible with ONNX/Core ML
        timestep_img = timestep_img * torch.ones(batch_size, dtype=timestep_img.dtype, device=timestep_img.device)

        timestep_img_token = self.timestep_img_proj(timestep_img)
        # The projection always returns float32 tensors, but the embedding may run in a lower precision
        timestep_img_token = timestep_img_token.to(dtype=self.dtype)
        timestep_img_token = self.timestep_img_embed(timestep_img_token)
        timestep_img_token = timestep_img_token.unsqueeze(dim=1)

        # 1.3. Encode the text timestep to a single token of shape (B, 1, inner_dim)
        if not torch.is_tensor(timestep_text):
            timestep_text = torch.tensor([timestep_text], dtype=torch.long, device=vae_hidden_states.device)

        timestep_text = timestep_text * torch.ones(batch_size, dtype=timestep_text.dtype, device=timestep_text.device)

        timestep_text_token = self.timestep_text_proj(timestep_text)
        timestep_text_token = timestep_text_token.to(dtype=self.dtype)
        timestep_text_token = self.timestep_text_embed(timestep_text_token)
        timestep_text_token = timestep_text_token.unsqueeze(dim=1)

        # 1.4. Concatenate all of the embeddings together
        if self.use_data_type_embedding:
            assert data_type is not None, "data_type must be supplied if the model uses a data type embedding"
            if not torch.is_tensor(data_type):
                data_type = torch.tensor([data_type], dtype=torch.int, device=vae_hidden_states.device)

            data_type = data_type * torch.ones(batch_size, dtype=data_type.dtype, device=data_type.device)

            data_type_token = self.data_type_token_embedding(data_type).unsqueeze(dim=1)
            hidden_states = torch.cat(
                [
                    timestep_img_token,
                    timestep_text_token,
                    data_type_token,
                    text_hidden_states,
                    clip_hidden_states,
                    vae_hidden_states,
                ],
                dim=1,
            )
        else:
            hidden_states = torch.cat(
                [timestep_img_token, timestep_text_token, text_hidden_states, clip_hidden_states, vae_hidden_states],
                dim=1,
            )

        # 1.5. Prepare the positional embeddings and add them to the hidden states
        if self.use_data_type_embedding:
            pos_embed = torch.cat(
                [self.pos_embed[:, : 1 + 1, :], self.data_type_pos_embed_token, self.pos_embed[:, 1 + 1 :, :]], dim=1
            )
        else:
            pos_embed = self.pos_embed
        hidden_states = hidden_states + pos_embed
        hidden_states = self.pos_embed_drop(hidden_states)

        # 2. Blocks
        hidden_states = self.transformer(
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            timestep=None,
            class_labels=None,
            cross_attention_kwargs=cross_attention_kwargs,
            return_dict=False,
            hidden_states_is_embedding=True,
            unpatchify=False,
        )[0]

        # 3. Output
        # Split out the predicted noise representations
        if self.use_data_type_embedding:
            (
                t_img_token_out,
                t_text_token_out,
                data_type_token_out,
                text_out,
                img_clip_out,
                img_vae_out,
            ) = hidden_states.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1)
        else:
            t_img_token_out, t_text_token_out, text_out, img_clip_out, img_vae_out = hidden_states.split(
                (1, 1, num_text_tokens, 1, num_img_tokens), dim=1
            )

        img_vae_out = self.vae_img_out(img_vae_out)

        # unpatchify the VAE image output back to (B, C, H, W)
        height = width = int(img_vae_out.shape[1] ** 0.5)
        img_vae_out = img_vae_out.reshape(
            shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
        )
        img_vae_out = torch.einsum("nhwpqc->nchpwq", img_vae_out)
        img_vae_out = img_vae_out.reshape(
            shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
        )

        img_clip_out = self.clip_img_out(img_clip_out)

        text_out = self.text_out(text_out)

        return img_vae_out, img_clip_out, text_out