
import os
from typing import Callable, List, Optional, Union

import torch
import torch.nn as nn

from ..configuration_utils import ConfigMixin, register_to_config
from ..utils import logging
from .modeling_utils import ModelMixin


logger = logging.get_logger(__name__)


class MultiAdapter(ModelMixin):
    r"""
    MultiAdapter is a wrapper model that contains multiple adapter models and merges their outputs according to
    user-assigned weighting.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for common methods such as downloading
    or saving.

    Args:
        adapters (`List[T2IAdapter]`, *optional*, defaults to None):
            A list of `T2IAdapter` model instances.
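
    Example (a minimal sketch, not an official usage snippet; both adapters here are randomly initialized and must
    share the same downscaling behavior, and the stacked-input layout simply follows how `forward` iterates over its
    input):

    ```py
    >>> import torch
    >>> from diffusers import MultiAdapter, T2IAdapter

    >>> adapter_1 = T2IAdapter(in_channels=3, channels=[320, 640, 1280, 1280])
    >>> adapter_2 = T2IAdapter(in_channels=3, channels=[320, 640, 1280, 1280])
    >>> multi_adapter = MultiAdapter([adapter_1, adapter_2])

    >>> # one (batch, channel, height, width) control input per adapter, stacked on a new
    >>> # leading dimension that `forward` iterates over
    >>> xs = torch.stack([torch.randn(1, 3, 512, 512), torch.randn(1, 3, 512, 512)])
    >>> features = multi_adapter(xs, adapter_weights=[0.6, 0.4])
    ```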
    """

    def __init__(self, adapters: List["T2IAdapter"]):
        super(MultiAdapter, self).__init__()

        self.num_adapter = len(adapters)
        self.adapters = nn.ModuleList(adapters)

        if len(adapters) == 0:
            raise ValueError("Expecting at least one adapter")

        if len(adapters) == 1:
            raise ValueError("For a single adapter, please use the `T2IAdapter` class instead of `MultiAdapter`")

        # Because the adapter outputs are summed together, every adapter must apply the
        # same amount of downscaling to its input.
        first_adapter_total_downscale_factor = adapters[0].total_downscale_factor
        first_adapter_downscale_factor = adapters[0].downscale_factor
        for idx in range(1, len(adapters)):
            if (
                adapters[idx].total_downscale_factor != first_adapter_total_downscale_factor
                or adapters[idx].downscale_factor != first_adapter_downscale_factor
            ):
                raise ValueError(
                    f"Expecting all adapters to have the same downscaling behavior, but got:\n"
                    f"adapters[0].total_downscale_factor={first_adapter_total_downscale_factor}\n"
                    f"adapters[0].downscale_factor={first_adapter_downscale_factor}\n"
                    f"adapter[`{idx}`].total_downscale_factor={adapters[idx].total_downscale_factor}\n"
                    f"adapter[`{idx}`].downscale_factor={adapters[idx].downscale_factor}"
                )

        self.total_downscale_factor = first_adapter_total_downscale_factor
        self.downscale_factor = first_adapter_downscale_factor

    def forward(self, xs: torch.Tensor, adapter_weights: Optional[List[float]] = None) -> List[torch.Tensor]:
        r"""
        Args:
            xs (`torch.Tensor`):
                A tensor of shape (batch, channel, height, width) representing input images for multiple adapter
                models, concatenated along dimension 1 (the channel dimension). The `channel` dimension should be
                equal to `num_adapter` * the number of channels per image.

            adapter_weights (`List[float]`, *optional*, defaults to None):
                A list of floats representing the weights which will be multiplied by each adapter's output before
                summing them together. If `None`, equal weights will be used for all adapters.
        """
        if adapter_weights is None:
            adapter_weights = torch.tensor([1 / self.num_adapter] * self.num_adapter)
        else:
            adapter_weights = torch.tensor(adapter_weights)

        # Accumulate the weighted feature maps of each adapter, scale by scale.
        accume_state = None
        for x, w, adapter in zip(xs, adapter_weights, self.adapters):
            features = adapter(x)
            if accume_state is None:
                accume_state = features
                for i in range(len(accume_state)):
                    accume_state[i] = w * accume_state[i]
            else:
                for i in range(len(features)):
                    accume_state[i] += w * features[i]

        return accume_state

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        is_main_process: bool = True,
        save_function: Callable = None,
        safe_serialization: bool = True,
        variant: Optional[str] = None,
    ):
        r"""
        Save a model and its configuration file to a specified directory, allowing it to be re-loaded with the
        [`~models.adapter.MultiAdapter.from_pretrained`] class method.
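
        Example (an illustrative sketch; `multi_adapter` is an existing [`MultiAdapter`] instance and the target
        directory is hypothetical):

        ```py
        >>> multi_adapter.save_pretrained("./my_model_directory/adapter")
        ```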

        Args:
            save_directory (`str` or `os.PathLike`):
                The directory where the model will be saved. If the directory does not exist, it will be created.
            is_main_process (`bool`, *optional*, defaults to `True`):
                Whether the process calling this is the main process or not. Useful during distributed training
                (e.g., on TPUs) when you need to call this function on all processes. In this case, set `is_main_process=True` only
                for the main process to avoid race conditions.
            save_function (`Callable`):
                Function used to save the state dictionary. Useful for distributed training (e.g., TPUs) to replace
                `torch.save` with another method. Can also be configured using the `DIFFUSERS_SAVE_MODE` environment
                variable.
            safe_serialization (`bool`, *optional*, defaults to `True`):
                If `True`, save the model using `safetensors`. If `False`, save the model with `pickle`.
            variant (`str`, *optional*):
                If specified, weights are saved in the format `pytorch_model.<variant>.bin`.
        """
        idx = 0
        model_path_to_save = save_directory
        for adapter in self.adapters:
            adapter.save_pretrained(
                model_path_to_save,
                is_main_process=is_main_process,
                save_function=save_function,
                safe_serialization=safe_serialization,
                variant=variant,
            )

            # The first adapter is saved under `save_directory`, the rest under
            # `save_directory_1`, `save_directory_2`, ...
            idx += 1
            model_path_to_save = model_path_to_save + f"_{idx}"

    @classmethod
    def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
        r"""
        Instantiate a pretrained `MultiAdapter` model from multiple pre-trained adapter models.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, set it back to training mode using `model.train()`.

        Warnings:
            *Weights from XXX not initialized from pretrained model* means that the weights of XXX are not pretrained
            with the rest of the model. It is up to you to train those weights with a downstream fine-tuning. *Weights
            from XXX not used in YYY* means that the layer XXX is not used by YYY, so those weights are discarded.
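
        Example (an illustrative sketch; the directory is hypothetical and must have been created with
        [`~MultiAdapter.save_pretrained`]):

        ```py
        >>> from diffusers import MultiAdapter

        >>> multi_adapter = MultiAdapter.from_pretrained("./my_model_directory/adapter")
        ```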

        Args:
            pretrained_model_path (`os.PathLike`):
                A path to a *directory* containing model weights saved using
                [`~diffusers.models.adapter.MultiAdapter.save_pretrained`], e.g., `./my_model_directory/adapter`.
            torch_dtype (`torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model under this dtype.
            output_loading_info (`bool`, *optional*, defaults to `False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                A map that specifies where each submodule should go. It doesn't need to be refined to each
                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                same device.

                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
                more information about each option see [designing a device
                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
            max_memory (`Dict`, *optional*):
                A dictionary mapping device identifiers to their maximum memory. Default to the maximum memory
                available for each GPU and the available CPU RAM if unset.
            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
                setting this argument to `True` will raise an error.
            variant (`str`, *optional*):
                If specified, load weights from a `variant` file (*e.g.* pytorch_model.<variant>.bin). `variant` will
                be ignored when using `from_flax`.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                If `None`, the `safetensors` weights will be downloaded if available **and** if the `safetensors`
                library is installed. If `True`, the model will be forcibly loaded from `safetensors` weights. If `False`,
                `safetensors` is not used.
        """
        idx = 0
        adapters = []

        # Load adapters until no further numbered adapter directory exists. The first
        # adapter is expected under `pretrained_model_path`, subsequent ones under
        # `<pretrained_model_path>_1`, `<pretrained_model_path>_2`, ...
        model_path_to_load = pretrained_model_path
        while os.path.isdir(model_path_to_load):
            adapter = T2IAdapter.from_pretrained(model_path_to_load, **kwargs)
            adapters.append(adapter)

            idx += 1
            model_path_to_load = pretrained_model_path + f"_{idx}"

        logger.info(f"{len(adapters)} adapters loaded from {pretrained_model_path}.")

        if len(adapters) == 0:
            raise ValueError(
                f"No T2IAdapters found under {os.path.dirname(pretrained_model_path)}. "
                f"Expected at least {pretrained_model_path + '_0'}."
            )

        return cls(adapters)


class T2IAdapter(ModelMixin, ConfigMixin):
    r"""
    A simple ResNet-like model that accepts images containing control signals such as keyposes and depth. The model
    generates multiple feature maps that are used as additional conditioning in [`UNet2DConditionModel`]. The model's
    architecture follows the original implementation of
    [Adapter](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L97)
    and
    [AdapterLight](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L235).

    This model inherits from [`ModelMixin`]. Check the superclass documentation for the common methods, such as
    downloading or saving.
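
    Example (a minimal sketch; the adapter below is randomly initialized, whereas real checkpoints such as
    `TencentARC/t2iadapter_canny_sd14v1` would normally be loaded with [`~ModelMixin.from_pretrained`]):

    ```py
    >>> import torch
    >>> from diffusers import T2IAdapter

    >>> adapter = T2IAdapter(in_channels=3, channels=[320, 640, 1280, 1280], downscale_factor=8)
    >>> control = torch.randn(1, 3, 512, 512)  # (batch, channel, height, width)
    >>> features = adapter(control)
    >>> [f.shape[1] for f in features]
    [320, 640, 1280, 1280]
    ```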

    Args:
        in_channels (`int`, *optional*, defaults to `3`):
            The number of channels in the adapter's input (*control image*). Set it to 1 if you're using a grayscale
            image.
        channels (`List[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The number of channels in each downsample block's output hidden state. The `len(channels)` also
            determines the number of downsample blocks in the adapter.
        num_res_blocks (`int`, *optional*, defaults to `2`):
            Number of ResNet blocks in each downsample block.
        downscale_factor (`int`, *optional*, defaults to `8`):
            A factor that determines the total downscale factor of the Adapter.
        adapter_type (`str`, *optional*, defaults to `full_adapter`):
            Adapter type (`full_adapter` or `full_adapter_xl` or `light_adapter`) to use.
    """

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        channels: List[int] = [320, 640, 1280, 1280],
        num_res_blocks: int = 2,
        downscale_factor: int = 8,
        adapter_type: str = "full_adapter",
    ):
        super().__init__()

        if adapter_type == "full_adapter":
            self.adapter = FullAdapter(in_channels, channels, num_res_blocks, downscale_factor)
        elif adapter_type == "full_adapter_xl":
            self.adapter = FullAdapterXL(in_channels, channels, num_res_blocks, downscale_factor)
        elif adapter_type == "light_adapter":
            self.adapter = LightAdapter(in_channels, channels, num_res_blocks, downscale_factor)
        else:
            raise ValueError(
                f"Unsupported adapter_type: '{adapter_type}'. Choose either 'full_adapter' or "
                "'full_adapter_xl' or 'light_adapter'."
            )

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        r"""
        This function processes the input tensor `x` through the adapter model and returns a list of feature tensors,
        each representing information extracted at a different scale from the input. The length of the list is
        determined by the number of downsample blocks in the Adapter, as specified by the `channels` and
        `num_res_blocks` parameters during initialization.
        """
        return self.adapter(x)

    @property
    def total_downscale_factor(self):
        return self.adapter.total_downscale_factor

    @property
    def downscale_factor(self):
        """The downscale factor applied in the T2I-Adapter's initial pixel unshuffle operation. If an input image's
        dimensions are not evenly divisible by the `downscale_factor` then an exception will be raised.
        """
        return self.adapter.unshuffle.downscale_factor


class FullAdapter(nn.Module):
    r"""
    See [`T2IAdapter`] for more information.
    """

    def __init__(
        self,
        in_channels: int = 3,
        channels: List[int] = [320, 640, 1280, 1280],
        num_res_blocks: int = 2,
        downscale_factor: int = 8,
    ):
        super().__init__()

        in_channels = in_channels * downscale_factor**2

        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
        self.conv_in = nn.Conv2d(in_channels, channels[0], kernel_size=3, padding=1)

        # The first block keeps the resolution; each following block halves it.
        self.body = nn.ModuleList(
            [
                AdapterBlock(channels[0], channels[0], num_res_blocks),
                *[
                    AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True)
                    for i in range(1, len(channels))
                ],
            ]
        )

        self.total_downscale_factor = downscale_factor * 2 ** (len(channels) - 1)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        r"""
        This method processes the input tensor `x` through the FullAdapter model and performs operations including
        pixel unshuffling, convolution, and a stack of AdapterBlocks. It returns a list of feature tensors, each
        capturing information at a different stage of processing within the FullAdapter model. The number of feature
        tensors in the list is determined by the number of downsample blocks specified during initialization.
        """
        x = self.unshuffle(x)
        x = self.conv_in(x)

        features = []

        for block in self.body:
            x = block(x)
            features.append(x)

        return features


class FullAdapterXL(nn.Module):
    r"""
    See [`T2IAdapter`] for more information.
    """

    def __init__(
        self,
        in_channels: int = 3,
        channels: List[int] = [320, 640, 1280, 1280],
        num_res_blocks: int = 2,
        downscale_factor: int = 16,
    ):
        super().__init__()

        in_channels = in_channels * downscale_factor**2

        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
        self.conv_in = nn.Conv2d(in_channels, channels[0], kernel_size=3, padding=1)

        self.body = []
        # blocks to extract XL features with dimensions of [320, 64, 64], [640, 64, 64], [1280, 32, 32], [1280, 32, 32]
        for i in range(len(channels)):
            if i == 1:
                self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks))
            elif i == 2:
                self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True))
            else:
                self.body.append(AdapterBlock(channels[i], channels[i], num_res_blocks))

        self.body = nn.ModuleList(self.body)
        # XL uses only a single downsampling AdapterBlock.
        self.total_downscale_factor = downscale_factor * 2

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        r"""
        This method takes the tensor x as input and processes it through the FullAdapterXL model. It consists of operations
        including pixel unshuffling, a convolution layer, and appending each block's output to a list of feature
        tensors.
        """
        x = self.unshuffle(x)
        x = self.conv_in(x)

        features = []
        for block in self.body:
            x = block(x)
            features.append(x)

        return features


class AdapterBlock(nn.Module):
    r"""
    An AdapterBlock is a helper model that contains multiple ResNet-like blocks. It is used in the `FullAdapter` and
    `FullAdapterXL` models.

    Args:
        in_channels (`int`):
            Number of channels of AdapterBlock's input.
        out_channels (`int`):
            Number of channels of AdapterBlock's output.
        num_res_blocks (`int`):
            Number of ResNet blocks in the AdapterBlock.
        down (`bool`, *optional*, defaults to `False`):
            If `True`, perform downsampling on AdapterBlock's input.
    """

    def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False):
        super().__init__()

        self.downsample = None
        if down:
            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.in_conv = None
        if in_channels != out_channels:
            self.in_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

        self.resnets = nn.Sequential(
            *[AdapterResnetBlock(out_channels) for _ in range(num_res_blocks)],
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""
        This method takes tensor x as input and applies the optional downsampling and input convolution layers when
        the `self.downsample` and `self.in_conv` attributes of the AdapterBlock are set. Then it applies a series of
        residual blocks to the input tensor.
        """
        if self.downsample is not None:
            x = self.downsample(x)

        if self.in_conv is not None:
            x = self.in_conv(x)

        x = self.resnets(x)

        return x


class AdapterResnetBlock(nn.Module):
    r"""
    An `AdapterResnetBlock` is a helper model that implements a ResNet-like block.

    Args:
        channels (`int`):
            Number of channels of AdapterResnetBlock's input and output.
    """

    def __init__(self, channels: int):
        super().__init__()
        self.block1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.act = nn.ReLU()
        self.block2 = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""
        This method takes input tensor x and applies a convolutional layer, ReLU activation, and another convolutional
        layer on the input tensor. It returns the output of the second convolution added to the input tensor.
        """
        h = self.act(self.block1(x))
        h = self.block2(h)

        return h + x


class LightAdapter(nn.Module):
    r"""
    See [`T2IAdapter`] for more information.
    """

    def __init__(
        self,
        in_channels: int = 3,
        channels: List[int] = [320, 640, 1280],
        num_res_blocks: int = 4,
        downscale_factor: int = 8,
    ):
        super().__init__()

        in_channels = in_channels * downscale_factor**2

        self.unshuffle = nn.PixelUnshuffle(downscale_factor)

        # Every block after the first one halves the resolution, including a final
        # downsampling block that keeps the channel count.
        self.body = nn.ModuleList(
            [
                LightAdapterBlock(in_channels, channels[0], num_res_blocks),
                *[
                    LightAdapterBlock(channels[i], channels[i + 1], num_res_blocks, down=True)
                    for i in range(len(channels) - 1)
                ],
                LightAdapterBlock(channels[-1], channels[-1], num_res_blocks, down=True),
            ]
        )

        self.total_downscale_factor = downscale_factor * (2 ** len(channels))

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        r"""
        This method takes the input tensor x, performs downscaling, and appends each block's output to a list of
        feature tensors. Each
        feature tensor corresponds to a different level of processing within the LightAdapter.
        """
        x = self.unshuffle(x)

        features = []
        for block in self.body:
            x = block(x)
            features.append(x)

        return features


class LightAdapterBlock(nn.Module):
    r"""
    A `LightAdapterBlock` is a helper model that contains multiple `LightAdapterResnetBlocks`. It is used in the
    `LightAdapter` model.

    Args:
        in_channels (`int`):
            Number of channels of LightAdapterBlock's input.
        out_channels (`int`):
            Number of channels of LightAdapterBlock's output.
        num_res_blocks (`int`):
            Number of LightAdapterResnetBlocks in the LightAdapterBlock.
        down (`bool`, *optional*, defaults to `False`):
            If `True`, perform downsampling on LightAdapterBlock's input.
    """

    def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False):
        super().__init__()
        mid_channels = out_channels // 4

        self.downsample = None
        if down:
            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True)

        self.in_conv = nn.Conv2d(in_channels, mid_channels, kernel_size=1)
        self.resnets = nn.Sequential(*[LightAdapterResnetBlock(mid_channels) for _ in range(num_res_blocks)])
        self.out_conv = nn.Conv2d(mid_channels, out_channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""
        This method takes tensor x as input and performs downsampling if required. Then it applies the input convolution
        layer, a sequence of residual blocks, and the output convolution layer.
        """
        if self.downsample is not None:
            x = self.downsample(x)

        x = self.in_conv(x)
        x = self.resnets(x)
        x = self.out_conv(x)

        return x


class LightAdapterResnetBlock(nn.Module):
    r"""
    A `LightAdapterResnetBlock` is a helper model that implements a ResNet-like block with a slightly different
    architecture than `AdapterResnetBlock`.

    Args:
        channels (`int`):
            Number of channels of LightAdapterResnetBlock's input and output.
    """

    def __init__(self, channels: int):
        super().__init__()
        self.block1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.act = nn.ReLU()
        self.block2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""
        This function takes input tensor x and processes it through one convolutional layer, ReLU activation, and
        another convolutional layer, and adds the result to the input tensor.
        """
        h = self.act(self.block1(x))
        h = self.block2(h)

        return h + x