from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from transformers import BertTokenizer
from transformers.activations import QuickGELUActivation as QuickGELU
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from transformers.models.blip_2.configuration_blip_2 import Blip2Config, Blip2VisionConfig
from transformers.models.blip_2.modeling_blip_2 import (
    Blip2Encoder,
    Blip2PreTrainedModel,
    Blip2QFormerAttention,
    Blip2QFormerIntermediate,
    Blip2QFormerOutput,
)
from transformers.pytorch_utils import apply_chunking_to_forward
from transformers.utils import logging, replace_return_docstrings


logger = logging.get_logger(__name__)


class Blip2TextEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids)
                embeddings = embeddings + position_embeddings

            if query_embeds is not None:
                # prepend the learned query tokens, tiled across the batch
                batch_size = embeddings.shape[0]
                query_embeds = query_embeds.repeat(batch_size, 1, 1)
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        embeddings = embeddings.to(query_embeds.dtype)
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
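
# --- Illustrative sketch (not part of the upstream module) -----------------
# A minimal shape check for Blip2TextEmbeddings, showing that the learned
# query tokens are tiled over the batch and prepended to the text embeddings.
# The tiny config values and the `_demo_*` helper name are hypothetical and
# exist only for illustration; call the function manually to run the check.
def _demo_text_embeddings_shapes():
    from transformers import Blip2QFormerConfig

    config = Blip2QFormerConfig(vocab_size=100, hidden_size=32, max_position_embeddings=64)
    embeddings = Blip2TextEmbeddings(config)
    input_ids = torch.randint(0, 100, (2, 8))  # (batch, text_len)
    query_embeds = torch.randn(1, 4, 32)  # (1, num_query_tokens, hidden)
    out = embeddings(input_ids=input_ids, query_embeds=query_embeds)
    assert out.shape == (2, 4 + 8, 32)  # 4 query tokens prepended to 8 text tokens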
__module____qualname____doc__r"   rJ   __classcell__r7   s   @r8   r   r   /   s    E$  r9   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Blip2VisionEmbeddingsr4   c                 r   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  dd| j                              | _        t        j                  d| j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j                  t        j                  d| j                  | j                              | _        y )Nr      F)in_channelsout_channelskernel_sizestridebias   )r!   r"   r4   r%   	embed_dim
image_size
patch_sizer   	Parameterr0   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingr5   s     r8   r"   zBlip2VisionEmbeddings.__init__f   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\krw 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"br9   pixel_valuesreturnc                    |j                   d   }| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }| j                  j                  |dd      j	                  |      }t        j                  ||gd      }|| j                  d d d |j                  d      d d f   j	                  |      z   }|S )Nr   rC   rZ   r   r   r;   )r?   rb   weightrC   rB   flatten	transposer`   r2   r0   rA   re   r=   )r6   rf   rI   target_dtypepatch_embedsclass_embedsrH   s          r8   rJ   zBlip2VisionEmbeddings.forwardx   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EHHVYYl;C
$"9"9!=Qzq?Q=QST:T"U"X"XYe"ff
r9   )	rK   rL   rM   r   r"   r0   TensorrJ   rO   rP   s   @r8   rR   rR   e   s-    c0 c$	ELL 	U\\ 	r9   rR   c                   :     e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 ddZ xZS )Blip2QFormerEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _	        y c c}w )NF)
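
# --- Illustrative sketch (not part of the upstream module) -----------------
# Shape check for Blip2VisionEmbeddings: a 32x32 image cut into 8x8 patches
# yields (32 // 8) ** 2 = 16 patch tokens plus one class token, matching the
# `num_patches`/`num_positions` arithmetic above. Hypothetical helper for
# illustration only.
def _demo_vision_embeddings_shapes():
    config = Blip2VisionConfig(hidden_size=32, image_size=32, patch_size=8)
    embeddings = Blip2VisionEmbeddings(config)
    pixel_values = torch.randn(2, 3, 32, 32)  # (batch, channels, height, width)
    assert embeddings(pixel_values).shape == (2, 16 + 1, 32)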

class Blip2QFormerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and torch.is_grad_enabled():
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    query_length,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if layer_module.has_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class Blip2QFormerLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = Blip2QFormerAttention(config)

        self.layer_idx = layer_idx

        if layer_idx % config.cross_attention_frequency == 0:
            self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False

        self.intermediate = Blip2QFormerIntermediate(config)
        self.intermediate_query = Blip2QFormerIntermediate(config)
        self.output_query = Blip2QFormerOutput(config)
        self.output = Blip2QFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
    ):
        # cached self-attention key/values are at positions 1, 2 of the past_key_value tuple
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:-1]

        present_key_value = self_attention_outputs[-1]

        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                # the query tokens attend to the frozen image features via cross-attention
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                cross_attention_outputs = self.crossattention(
                    query_attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                query_attention_output = cross_attention_outputs[0]
                # add cross attentions if we output attention weights
                outputs = outputs + cross_attention_outputs[1:-1]

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            if attention_output.shape[1] > query_length:
                # text tokens go through the regular (non-query) feed-forward branch
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )
        outputs = (layer_output,) + outputs

        outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output


class ProjLayer(nn.Module):
    def __init__(self, in_dim, out_dim, hidden_dim, drop_p=0.1, eps=1e-12):
        super().__init__()

        # Dense1 -> ActFn -> Dense2 -> Drop -> Res -> Norm
        self.dense1 = nn.Linear(in_dim, hidden_dim)
        self.act_fn = QuickGELU()
        self.dense2 = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(drop_p)

        self.LayerNorm = nn.LayerNorm(out_dim, eps=eps)

    def forward(self, x):
        x_in = x

        x = self.LayerNorm(x)
        x = self.dropout(self.dense2(self.act_fn(self.dense1(x)))) + x_in

        return x

class Blip2VisionModel(Blip2PreTrainedModel):
    main_input_name = "pixel_values"
    config_class = Blip2VisionConfig

    def __init__(self, config: Blip2VisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = Blip2VisionEmbeddings(config)
        self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = Blip2Encoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layernorm(hidden_states)
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.embeddings
ede	j                  f
dZ	 	 	 	 	 	 	 	 	 	 ddZ xZS )Blip2QFormerModelz:
    Querying Transformer (Q-Former), used in BLIP-2.
    r4   c                     t         |   |       || _        t        |j                        | _        t        |j                        | _        t        j                  t        j                  d|j                  |j                  j                              | _        t!        |d      r|j"                  t%        j&                  dd      | _        n&t%        j&                  |j"                  d      | _        | j"                  j)                  ddi       t+        |j                  j                  |j                  j                  |j                  j                  dz  d	d
      | _        t/        |j                        | _        | j3                          y )Nr   	tokenizerzbert-base-uncasedright)truncation_side	bos_tokenz[DEC]   r   r   )r   r   r   r   r   )r!   r"   r4   r   qformer_configrH   r   vision_configvisual_encoderr   r^   r0   zerosnum_query_tokensr%   query_tokenshasattrr   r   from_pretrainedadd_special_tokensr   
proj_layerrr   r   r   r5   s     r8   r"   zBlip2QFormerModel.__init__  s,    -f.C.CD.v/C/CDLLQ8O8OQWQfQfQrQr)stv{+v/?/?/G*::;N`ghDN*::6;K;K]deDN));*@A#((44))55,,881<
 +6+@+@Ar9   c                 .    | j                   j                  S r   rH   r'   r   s    r8   r   z&Blip2QFormerModel.get_input_embeddings  s    ...r9   c                 &    || j                   _        y r   r   )r6   values     r8   set_input_embeddingsz&Blip2QFormerModel.set_input_embeddings  s    */'r9   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   rx   r   prune_heads)r6   heads_to_prunerx   headss       r8   _prune_headszBlip2QFormerModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr9   r   input_shapedevice	has_queryrg   c                     |j                         dk(  r|dddddddf   }nF|j                         dk(  r|ddddddf   }n%t        dj                  ||j                              |j	                  | j
                        }d|z
  dz  }|S )a=  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        rT   NrZ   zAWrong shape for input_ids (shape {}) or attention_mask (shape {})ri   g      ?g     )r<   r   formatr?   rB   rC   )r6   r   r  r  r  extended_attention_masks         r8   get_extended_attention_maskz-Blip2QFormerModel.get_extended_attention_mask  s    . 1$&4Qa]&C#!Q& '5QdA5E&F#SZZ!5!5  #:"<"<4::"<"N#&)@#@H"L&&r9   c                    | j                  |dd      }|j                  | j                        }|j                  }|j                  d   }t        j                  || j                  j                         d   ft
        j                        j                  | j                        }t        j                  ||j                  gd      }||n| j                  j                  }|	|	n| j                  j                  }	|
|
n| j                  j                  }
|,|d   d   j                  d	   | j                  j                   z
  nd}| j                  j                  d   }| j#                  || j                  |
      }|j                         dd }|\  }}|j                  }| j%                  |      j&                  }|}|t        j                  |||z   f|      }| j)                  |||      }|t+        |t,              r|d   j                         \  }}}n|j                         \  }}}||f}t+        |t,              r|D cg c]  }| j/                  |       }}n?|)t        j                  ||      }| j/                  |      }n| j/                  |      }nd}| j1                  || j                  j2                  j4                        }| j7                  |||||||||	|
|      }|d   }|dddddf   } |
s| j9                  |ddd|ddf         S t;        || |j<                  |j>                  |j@                  |jB                        S c c}w )a	  
        encoder_hidden_states  (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
            shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
            value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
            used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
            value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
            `(batch_size, sequence_length)`.
        use_cache (`bool`, `optional`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        ptT)return_tensorspaddingr   r   ri   r;   NrZ   )rD   rE   rF   r   )r  )
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   )"r   rB   r  rD   r?   r0   onesr   r=   longrA   r   r4   r   r   r   r   rH   r   r   r  
isinstancelistinvert_attention_maskget_head_maskr   rv   r   r   r   r   r   r   r   )!r6   
text_inputimage_inputr   r   r   r   r   r   r   r   textrD   rI   
query_attsr   rF   r   embedding_outputr  rG   r  image_embeds_frozenr  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapemaskencoder_extended_attention_maskr   sequence_outputr   s!                                    r8   rJ   zBlip2QFormerModel.forward  sx   B ~~jt~Lwwt{{#NN	__Q'
ZZT->->-C-C-Ea-H IQVQ[Q[\__`d`k`kl
J0C0C#D!L1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] JYIdOAq!''*T[[-E-EEjk 	 ((..q1??**#9 + 
 '++-cr2!,
J!(("11+>PP 3!"ZZ*jCY6Y)ZdjkN #'"B"B>S^`f"g !,/6AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$7`v2wX\43M3Md3S2w/2w'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2+ &&y$++2L2L2^2^_	,,2"7#B+/!5#% ' 
 *!,'1a0???1m|mQ3F#GHH;-'+;;)77&11,==
 	
G 3xs   M)F)
NNNNNNNNNN)rK   rL   rM   rN   r   r"   r   r   r   r0   rp   r   intr  r   r  rJ   rO   rP   s   @r8   r   r     s    { ./0C  +'+' 3Z+' 	+'
 +' 
+'^ "#!A
r9   r   ))typingr   r   r   r0   torch.utils.checkpointr   transformersr   transformers.activationsr   r   transformers.modeling_outputsr	   r
   r   /transformers.models.blip_2.configuration_blip_2r   r   *transformers.models.blip_2.modeling_blip_2r   r   r   r   r   transformers.pytorch_utilsr   transformers.utilsr   r   
get_loggerrK   r   Moduler   rR   rr   rw   r   r   r   r}   r9   r8   <module>r+     s    * )    & E 
 [  A 
		H	%2")) 2lBII @`
")) `
Hc		 cN		 ,=+ =BX
, X
r9   
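
# --- Illustrative sketch (not part of the upstream module) -----------------
# Demonstrates the additive-mask convention produced by
# get_extended_attention_mask above: kept positions become 0.0 and masked
# positions become -10000.0, so adding the mask to raw attention scores
# effectively removes masked positions from the softmax. Hypothetical helper
# for illustration only.
def _demo_extended_attention_mask():
    attention_mask = torch.tensor([[1, 1, 0]])
    extended = (1.0 - attention_mask[:, None, None, :].float()) * -10000.0
    assert extended[0, 0, 0].tolist() == [0.0, 0.0, -10000.0]


if __name__ == "__main__":
    # Run the illustrative checks; none of them require network access.
    _demo_text_embeddings_shapes()
    _demo_vision_embeddings_shapes()
    _demo_proj_layer_shapes()
    _demo_vision_model_shapes()
    _demo_extended_attention_mask()
    print("All demo checks passed.")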