
    bi7                     ~    d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
 ddlmZmZ ddlmZ  G d	 d
eee
      Zy)    )OptionalN)nn)
GPT2ConfigGPT2LMHeadModel)ModuleUtilsMixin   )ConfigMixinregister_to_config)
ModelMixinc            (           e Zd ZdZddgZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(dededee   deded	ed
ededee   dede	de	de	de	de	de
de
de
de
f& fd       Z	 	 d)dej                  dej                  deej                     deej                     fdZdedej                  dej                  fd Zd! Z ej$                         d"        Z ej$                         	 	 	 	 	 	 	 d*d#ed$ed%e	d&ee   fd'       Z xZS )+UniDiffuserTextDecodera  
    Text decoder model for an image-text [UniDiffuser](https://huggingface.co/papers/2303.06555) model. This is used to
    generate text from the UniDiffuser image-text embedding.

    Parameters:
        prefix_length (`int`):
            Max number of prefix tokens that will be supplied to the model.
        prefix_inner_dim (`int`):
            The hidden size of the incoming prefix embeddings. For UniDiffuser, this would be the hidden dim of the
            CLIP text encoder.
        prefix_hidden_dim (`int`, *optional*):
            Hidden dim of the MLP if we encode the prefix.
        vocab_size (`int`, *optional*, defaults to 50257):
            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
        n_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_embd (`int`, *optional*, defaults to 768):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_inner (`int`, *optional*, defaults to None):
            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
        activation_function (`str`, *optional*, defaults to `"gelu"`):
            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by `sqrt(hidden_size)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
            dot-product/softmax to float() when training with mixed precision.
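
    Example (an illustrative sketch; the sizes below are placeholder assumptions for demonstration, not the
    configuration of any released UniDiffuser checkpoint):

    ```py
    >>> import torch

    >>> decoder = UniDiffuserTextDecoder(prefix_length=77, prefix_inner_dim=768, prefix_hidden_dim=64)
    >>> prefix_embeds = torch.randn(2, 77, 768)  # e.g. CLIP text encoder hidden states
    >>> input_ids = torch.randint(0, 50257, (2, 32))  # tokenized caption ids
    >>> output, encoded_prefix = decoder(input_ids, prefix_embeds)  # GPT-2 output and the encoded prefix
    ```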
    zh\.\d+\.attn\.biaszh\.\d+\.attn\.masked_biasprefix_lengthprefix_inner_dimprefix_hidden_dim
vocab_sizen_positionsn_embdn_layern_headn_inneractivation_functionresid_pdrop
embd_pdrop
attn_pdroplayer_norm_epsiloninitializer_rangescale_attn_weights	use_cachescale_attn_by_inverse_layer_idxreorder_and_upcast_attnc                 8   t         |           || _        ||k7  r|t        d| d| d      || _        || _        | j
                  *t        j                  | j                  | j
                        nt        j                         | _	        | j
                   t        j                  | j
                  |      nt        j                         | _
        t        di d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|}t        |      | _        y )Nz>`prefix_hidden_dim` cannot be `None` when `prefix_inner_dim`: z and `n_embd`: z are not equal.r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r     )super__init__r   
ValueErrorr   r   r   LinearIdentityencode_prefixdecode_prefixr   r   transformer)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    
gpt_config	__class__s                        p/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/unidiffuser/modeling_text_decoder.pyr$   zUniDiffuserTextDecoder.__init__B   s   . 	*v%*;*CPQbPc d$X_6 
 !1!2 %%1 IId++T-C-CD 	 :>9O9O9[BIId,,f5acalalan 	   
!
#
 
 	

 
 
 !4
 $
 "
 "
  2
 0
  2
  
 -L
  %<!

$ +:6    	input_idsprefix_embedsattention_masklabelsc                    | j                   j                   j                  |      }| j                  |      }| j                  |      }t	        j
                  ||fd      }|B| j                  |j                  d   |j                        }t	        j
                  ||fd      }| j                  |||      }	| j                  |	|fS |	S )a)  
        Args:
            input_ids (`torch.Tensor` of shape `(N, max_seq_len)`):
                Text tokens to use for inference.
            prefix_embeds (`torch.Tensor` of shape `(N, prefix_length, 768)`):
                Prefix embedding to prepend to the embedded tokens.
            attention_mask (`torch.Tensor` of shape `(N, prefix_length + max_seq_len)`, *optional*):
                Attention mask for the prefix embedding.
            labels (`torch.Tensor`, *optional*):
                Labels to use for language modeling.
        """
        embedding_text = self.transformer.transformer.wte(input_ids)
        hidden = self.encode_prefix(prefix_embeds)
        prefix_embeds = self.decode_prefix(hidden)
        embedding_cat = torch.cat((prefix_embeds, embedding_text), dim=1)

        if labels is not None:
            dummy_token = self.get_dummy_token(input_ids.shape[0], input_ids.device)
            labels = torch.cat((dummy_token, input_ids), dim=1)
        out = self.transformer(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask)
        if self.prefix_hidden_dim is not None:
            return out, hidden
        else:
            return out

    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)

    def encode(self, prefix):
        return self.encode_prefix(prefix)

    @torch.no_grad()
    def generate_captions(self, features, eos_token_id, device):
        """
        Generate captions given text embedding features.

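        An illustrative sketch (assuming a constructed `decoder` as in the class-level example; `features` must
        already live in the `prefix_hidden_dim` space expected by `decode_prefix`, for instance the reduced text
        embeddings that UniDiffuser generates):

        ```py
        >>> features = torch.randn(2, 77, 64)  # placeholder of shape `(B, L, D)`
        >>> tokens, lengths = decoder.generate_captions(features, eos_token_id=50256, device=features.device)
        ```
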
        Args:
            features (`torch.Tensor` of shape `(B, L, D)`):
                Text embedding features to generate captions from.
            eos_token_id (`int`):
                The token ID of the EOS token for the text decoder model.
            device:
                Device to perform text generation on.

        Returns:
            `Tuple(torch.Tensor, torch.Tensor)`: A tuple where the first element is a tensor of generated token
            sequences (one per feature in `features`) and the second element is the corresponding sequence lengths.
        """

        features = torch.split(features, 1, dim=0)
        generated_tokens = []
        generated_seq_lengths = []
        for feature in features:
            # Project the feature back up to the GPT-2 embedding size before decoding.
            feature = self.decode_prefix(feature.to(device))
            # Only beam search is supported for now.
            output_tokens, seq_lengths = self.generate_beam(
                input_embeds=feature, device=device, eos_token_id=eos_token_id
            )
            generated_tokens.append(output_tokens[0])
            generated_seq_lengths.append(seq_lengths[0])
        generated_tokens = torch.stack(generated_tokens)
        generated_seq_lengths = torch.stack(generated_seq_lengths)
        return generated_tokens, generated_seq_lengths

    @torch.no_grad()
    def generate_beam(
        self,
        input_ids=None,
        input_embeds=None,
        device=None,
        beam_size: int = 5,
        entry_length: int = 67,
        temperature: float = 1.0,
        eos_token_id: Optional[int] = None,
    ):
        """
        Generates text using the given tokenizer and text prompt or token embedding via beam search. This
        implementation is based on the beam search implementation from the [original UniDiffuser
        code](https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py#L89).

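        A minimal illustrative call (assuming a constructed `decoder` and a prefix already projected to the
        GPT-2 hidden size, e.g. via `decode_prefix`):

        ```py
        >>> prefix = decoder.decode_prefix(torch.randn(1, 77, 64))  # placeholder prefix of shape `(1, L, n_embd)`
        >>> token_seqs, seq_lengths = decoder.generate_beam(
        ...     input_embeds=prefix, device=prefix.device, eos_token_id=50256
        ... )
        >>> best = token_seqs[0][: seq_lengths[0]]  # highest-scoring beam, truncated to its generated length
        ```
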
        Args:
            eos_token_id (`int`, *optional*):
                The token ID of the EOS token for the text decoder model.
            input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
                Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds`
                must be supplied.
            input_embeds (`torch.Tensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                An embedded representation to directly pass to the transformer as a prefix for beam search. One of
                `input_ids` and `input_embeds` must be supplied.
            device:
                The device to perform beam search on.
            beam_size (`int`, *optional*, defaults to `5`):
                The number of best states to store during beam search.
            entry_length (`int`, *optional*, defaults to `67`):
                The number of iterations to run beam search.
            temperature (`float`, *optional*, defaults to 1.0):
                The temperature to use when performing the softmax over logits from the decoding model.

        Returns:
            `Tuple(torch.Tensor, torch.Tensor)`: A tuple of tensors where the first element is a tensor of generated
            token sequences sorted by score in descending order, and the second element is the sequence lengths
            corresponding to those sequences.
        """
        # Generate text until the EOS token is reached, using beam search with the desired beam size.
        stop_token_index = eos_token_id
        tokens = None
        scores = None
        seq_lengths = torch.ones(beam_size, device=device, dtype=torch.int)
        is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)

        if input_embeds is not None:
            generated = input_embeds
        else:
            generated = self.transformer.transformer.wte(input_ids)

        for i in range(entry_length):
            outputs = self.transformer(inputs_embeds=generated)
            logits = outputs.logits
            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            logits = logits.softmax(-1).log()

            if scores is None:
                # First step: initialize the beams with the top-k next tokens.
                scores, next_tokens = logits.topk(beam_size, -1)
                generated = generated.expand(beam_size, *generated.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                if tokens is None:
                    tokens = next_tokens
                else:
                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
                    tokens = torch.cat((tokens, next_tokens), dim=1)
            else:
                # Subsequent steps: extend every beam and keep the `beam_size` best length-normalized scores.
                logits[is_stopped] = -float(np.inf)
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                scores_sum_average = scores_sum / seq_lengths[:, None]
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                tokens = tokens[next_tokens_source]
                tokens = torch.cat((tokens, next_tokens), dim=1)
                generated = generated[next_tokens_source]
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]

            next_token_embed = self.transformer.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
            generated = torch.cat((generated, next_token_embed), dim=1)
            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
            if is_stopped.all():
                break

        scores = scores / seq_lengths
        order = scores.argsort(descending=True)
        # Token tensors are already padded to the maximum generated length.
        output_texts = [tokens[i] for i in order]
        output_texts = torch.stack(output_texts, dim=0)
        seq_lengths = torch.tensor([seq_lengths[i] for i in order], dtype=seq_lengths.dtype)
        return output_texts, seq_lengths