
    bi8                        d dl mZmZmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ  ej8                  e      Z G d dej>                        Z  G d dee      Z!y)    )AnyDictOptionalN   )ConfigMixinregister_to_config)logging   )LuminaFeedForward)	AttentionLuminaAttnProcessor2_0)&LuminaCombinedTimestepCaptionEmbeddingLuminaPatchEmbed)Transformer2DModelOutput)
ModelMixin)LuminaLayerNormContinuousLuminaRMSNormZeroRMSNormc                        e Zd ZdZ	 ddededededededed	ed
eddf fdZ	 ddej                  dej                  dej                  dej                  dej                  dej                  de
eeef      fdZ xZS )LuminaNextDiTBlocka  
    A LuminaNextDiTBlock for LuminaNextDiT2DModel.

    Parameters:
        dim (`int`): Embedding dimension of the input features.
        num_attention_heads (`int`): Number of attention heads.
        num_kv_heads (`int`):
            Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
        multiple_of (`int`): The number of multiple of ffn layer.
        ffn_dim_multiplier (`float`): The multiplier factor of ffn layer dimension.
        norm_eps (`float`): The eps for norm layer.
        qk_norm (`bool`): normalization for query and key.
        cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
        norm_elementwise_affine (`bool`, *optional*, defaults to True),
    dimnum_attention_headsnum_kv_headsmultiple_offfn_dim_multipliernorm_epsqk_normcross_attention_dimnorm_elementwise_affinereturnNc
                    t         
|           ||z  | _        t        j                  t        j                  |g            | _        t        |d ||z  |rdnd ||dddt               
      | _
        t        j                         | j                  _        t        ||||z  |rdnd ||dddt               
      | _        t        |t        d|z  dz        ||      | _        t#        |||	      | _        t'        |||		      | _        t'        |||		      | _        t'        |||		      | _        t'        |||		      | _        y )
Nlayer_norm_across_headsh㈵>F)
	query_dimr   dim_headr   headskv_headsepsbiasout_bias	processor   r   )r   	inner_dimr   r   )embedding_dimr   r   )r(   elementwise_affine)super__init__head_dimnn	Parametertorchzerosgater   r   attn1Identityto_outattn2r   intfeed_forwardr   norm1r   	ffn_norm1norm2	ffn_norm2norm1_context)selfr   r   r   r   r   r   r   r   r   	__class__s             i/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/transformers/lumina_nextdit2d.pyr1   zLuminaNextDiTBlock.__init__5   sI    	22LL.A-B!CD	  $//18-d%!,.

 KKM

  3//18-d%!,.

 .%#+/*#1	
 '$;


 !(G^_ShCZ[
 (G^_$%8h[rs    hidden_statesattention_maskimage_rotary_embencoder_hidden_statesencoder_masktembcross_attention_kwargsc           	         |}| j                  ||      \  }	}
}} | j                  d|	|	|||d|}| j                  |      } | j                  d|	|||dd|}|| j                  j                         j                  dddd      z  }||z   }|j                  d      } | j                  j                  d   |      }||
j                  d      j                         | j                  |      z  z   }| j                  | j                  |      d|j                  d      z   z        }||j                  d      j                         | j                  |      z  z   }|S )a  
        Perform a forward pass through the LuminaNextDiTBlock.

        Parameters:
            hidden_states (`torch.Tensor`): The input of hidden_states for LuminaNextDiTBlock.
            attention_mask (`torch.Tensor): The input of hidden_states corresponse attention mask.
            image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
            encoder_hidden_states: (`torch.Tensor`): The hidden_states of text prompt are processed by Gemma encoder.
            encoder_mask (`torch.Tensor`): The hidden_states of text prompt attention mask.
            temb (`torch.Tensor`): Timestep embedding with text prompt embedding.
            cross_attention_kwargs (`Dict[str, Any]`): kwargs for cross attention.
        )rG   rJ   rH   query_rotary_embkey_rotary_embN   r    )r>   r8   rB   r;   r7   tanhviewflattenr:   	unsqueezer@   r=   r?   rA   )rC   rG   rH   rI   rJ   rK   rL   rM   residualnorm_hidden_statesgate_msa	scale_mlpgate_mlpself_attn_outputnorm_encoder_hidden_statescross_attn_outputmixed_attn_output
mlp_outputs                     rE   forwardzLuminaNextDiTBlock.forwardv   s   , ! =AJJ}VZ<[9Hi%4:: 
,"4)-+
 %
 &*%7%78M%N"&DJJ 
,"<'-
 %
 .		0@0E0EaBPQ0RR,/@@-55b9,

))!,->? 8#5#5a#8#=#=#?$**]B[#[[&&t~~m'DIL_L_`aLbHb'cd
%(:(:1(=(B(B(Dt~~V`Ga(aarF   )T)N)__name__
__module____qualname____doc__r<   floatboolr1   r5   Tensorr   r   strr   rc   __classcell__rD   s   @rE   r   r   $   s    4 )-?t?t !?t 	?t
 ?t "?t ?t ?t !?t "&?t 
?tR <@9||9 9  ,,	9
  %||9 ll9 ll9 !)c3h 89rF   r   c                        e Zd ZdZg dZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedee   dee   dee   dee   d	ee   d
ee   dee   dee   dee   dee	   dee	   dee   dee   ddf fd       Z
	 	 ddej                  dej                  dej                  dej                  dej                  deeef   dej                  fdZ xZS )LuminaNextDiT2DModelaa  
    LuminaNextDiT: Diffusion model with a Transformer backbone.

    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.

    Parameters:
        sample_size (`int`): The width of the latent images. This is fixed during training since
            it is used to learn a number of position embeddings.
        patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
            The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
        in_channels (`int`, *optional*, defaults to 4):
            The number of input channels for the model. Typically, this matches the number of channels in the input
            images.
        hidden_size (`int`, *optional*, defaults to 4096):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        num_layers (`int`, *optional*, default to 32):
            The number of layers in the model. This defines the depth of the neural network.
        num_attention_heads (`int`, *optional*, defaults to 32):
            The number of attention heads in each attention layer. This parameter specifies how many separate attention
            mechanisms are used.
        num_kv_heads (`int`, *optional*, defaults to 8):
            The number of key-value heads in the attention mechanism, if different from the number of attention heads.
            If None, it defaults to num_attention_heads.
        multiple_of (`int`, *optional*, defaults to 256):
            A factor that the hidden size should be a multiple of. This can help optimize certain hardware
            configurations.
        ffn_dim_multiplier (`float`, *optional*):
            A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
            the model configuration.
        norm_eps (`float`, *optional*, defaults to 1e-5):
            A small value added to the denominator for numerical stability in normalization layers.
        learn_sigma (`bool`, *optional*, defaults to True):
            Whether the model should learn the sigma parameter, which might be related to uncertainty or variance in
            predictions.
        qk_norm (`bool`, *optional*, defaults to True):
            Indicates if the queries and keys in the attention mechanism should be normalized.
        cross_attention_dim (`int`, *optional*, defaults to 2048):
            The dimensionality of the text embeddings. This parameter defines the size of the text representations used
            in the model.
        scaling_factor (`float`, *optional*, defaults to 1.0):
            A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
            overall scale of the model's operations.
    )patch_embeddernormffn_normNsample_size
patch_sizein_channelshidden_size
num_layersr   r   r   r   r   learn_sigmar   r   scaling_factorr    c                 p   t         |           || _        || _        || _        |r|dz  n|| _        || _        || _        ||z  | _        || _	        t        |||d      | _        t        j                  t        j                  |            | _        t#        t%        |d      |      | _        t        j(                  t+        |      D cg c]  }t-        |||||	|
||       c}      | _        t1        |t%        |d      ddd||z  | j
                  z        | _        ||z  d	z  d
k(  sJ d       y c c}w )Nr
   T)rt   ru   	embed_dimr)   i   )rv   r   Fgư>)r.   conditioning_embedding_dimr/   r(   r)   out_dim   r   z+2d rope needs head dim to be divisible by 4)r0   r1   rs   rt   ru   out_channelsrv   r   r2   ry   r   rp   r3   r4   r5   empty	pad_tokenr   mintime_caption_embed
ModuleListranger   layersr   norm_out)rC   rs   rt   ru   rv   rw   r   r   r   r   r   rx   r   r   ry   _rD   s                   rE   r1   zLuminaNextDiT2DModel.__init__   sS   $ 	&$&/:K!O&#6 #'::,.!{kX\
 ekk+&>?"HK.DW#
 mm z*  #' &'	
 2%'*;'=$+d.?.??
 22a71<k>kk<1s   D3rG   timesteprJ   rK   rI   rM   c           
      `   | j                  ||      \  }}}	}|j                  |j                        }| j                  |||      }
|j	                         }| j
                  D ]  } |||||||
|      } | j                  ||
      }| j                  x}}|	d   \  }}|j                  d      }||z  ||z  z  }|ddd|f   j                  |||z  ||z  ||| j                        }|j                  dddddd      j                  dd      j                  dd      }|s|fS t        |	      S )
a  
        Forward pass of LuminaNextDiT.

        Parameters:
            hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
            timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
            encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
            encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
        )rL   rM   r   N   rQ   r   r
   r~   )sample)rp   todevicer   ri   r   r   rt   sizerV   r   permuterW   r   )rC   rG   r   rJ   rK   rI   rM   return_dictmaskimg_sizerL   layerheight_tokenswidth_tokensheightwidth
batch_sizesequence_lengthoutputs                      rE   rc   zLuminaNextDiT2DModel.forward#  sl   & ;?:M:Mm]m:n7tX'7+..}/C/CD&&x1FU#((*[[ 		E! %'=M		 mT: (,6 "''*
!]2u7LM%a)9/)9&9:??-/,1FWceievev
 &&q!Q1a8@@AFNNqRST9'v66rF   )   r
   r~   i 	      r   N   Nr#   TTi   g      ?)NT)rd   re   rf   rg    _skip_layerwise_casting_patternsr   r<   r   rh   ri   r1   r5   rj   r   rk   r   rc   rl   rm   s   @rE   ro   ro      s   +Z (N$ $%%&%)$&-/&*%(.2$(&*"&-1*->l>l SM>l c]	>l
 c]>l SM>l &c]>l sm>l c]>l %UO>l 5/>l d^>l $>l &c]>l !>l  
!>l >lN 2637||37 ,,37  %||	37
 ll37  ,,37 !%S#X37 
37rF   ro   )"typingr   r   r   r5   torch.nnr3   configuration_utilsr   r   utilsr	   	attentionr   attention_processorr   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerrd   loggerModuler   ro   rT   rF   rE   <module>r      si    ' &   B  ) C 8 ' Q Q 
		H	%K K\d7:{ d7rF   