
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...schedulers import ConsistencyDecoderScheduler
from ...utils import BaseOutput
from ...utils.accelerate_utils import apply_forward_hook
from ...utils.torch_utils import randn_tensor
from ..attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,
    CROSS_ATTENTION_PROCESSORS,
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
)
from ..modeling_utils import ModelMixin
from ..unets.unet_2d import UNet2DModel
from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder


@dataclass
class ConsistencyDecoderVAEOutput(BaseOutput):
    r"""
    Output of encoding method.

    Args:
        latent_dist (`DiagonalGaussianDistribution`):
            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
    """

    latent_dist: "DiagonalGaussianDistribution"


class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
    r"""
    The consistency decoder used with DALL-E 3.

    Examples:
        ```py
        >>> import torch
        >>> from diffusers import StableDiffusionPipeline, ConsistencyDecoderVAE

        >>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
        >>> pipe = StableDiffusionPipeline.from_pretrained(
        ...     "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
        ... ).to("cuda")

        >>> image = pipe("horse", generator=torch.manual_seed(0)).images[0]
        >>> image
        ```
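
        A minimal encode/decode round trip with the VAE above (a sketch; `image` is
        assumed to be a `[-1, 1]`-scaled `1x3xHxW` fp16 tensor on the same device):

        ```py
        >>> latents = vae.encode(image).latent_dist.mode()
        >>> reconstruction = vae.decode(latents, generator=torch.manual_seed(0)).sample
        ```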
    """

    _supports_group_offloading = False

    @register_to_config
    def __init__(
        self,
        scaling_factor: float = 0.18215,
        latent_channels: int = 4,
        sample_size: int = 32,
        encoder_act_fn: str = "silu",
        encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
        encoder_double_z: bool = True,
        encoder_down_block_types: Tuple[str, ...] = (
            "DownEncoderBlock2D",
            "DownEncoderBlock2D",
            "DownEncoderBlock2D",
            "DownEncoderBlock2D",
        ),
        encoder_in_channels: int = 3,
        encoder_layers_per_block: int = 2,
        encoder_norm_num_groups: int = 32,
        encoder_out_channels: int = 4,
        decoder_add_attention: bool = False,
        decoder_block_out_channels: Tuple[int, ...] = (320, 640, 1024, 1024),
        decoder_down_block_types: Tuple[str, ...] = (
            "ResnetDownsampleBlock2D",
            "ResnetDownsampleBlock2D",
            "ResnetDownsampleBlock2D",
            "ResnetDownsampleBlock2D",
        ),
        decoder_downsample_padding: int = 1,
        decoder_in_channels: int = 7,
        decoder_layers_per_block: int = 3,
        decoder_norm_eps: float = 1e-05,
        decoder_norm_num_groups: int = 32,
        decoder_num_train_timesteps: int = 1024,
        decoder_out_channels: int = 6,
        decoder_resnet_time_scale_shift: str = "scale_shift",
        decoder_time_embedding_type: str = "learned",
        decoder_up_block_types: Tuple[str, ...] = (
            "ResnetUpsampleBlock2D",
            "ResnetUpsampleBlock2D",
            "ResnetUpsampleBlock2D",
            "ResnetUpsampleBlock2D",
        ),
    ):
        super().__init__()
        self.encoder = Encoder(
            act_fn=encoder_act_fn,
            block_out_channels=encoder_block_out_channels,
            double_z=encoder_double_z,
            down_block_types=encoder_down_block_types,
            in_channels=encoder_in_channels,
            layers_per_block=encoder_layers_per_block,
            norm_num_groups=encoder_norm_num_groups,
            out_channels=encoder_out_channels,
        )

        self.decoder_unet = UNet2DModel(
            add_attention=decoder_add_attention,
            block_out_channels=decoder_block_out_channels,
            down_block_types=decoder_down_block_types,
            downsample_padding=decoder_downsample_padding,
            in_channels=decoder_in_channels,
            layers_per_block=decoder_layers_per_block,
            norm_eps=decoder_norm_eps,
            norm_num_groups=decoder_norm_num_groups,
            num_train_timesteps=decoder_num_train_timesteps,
            out_channels=decoder_out_channels,
            resnet_time_scale_shift=decoder_resnet_time_scale_shift,
            time_embedding_type=decoder_time_embedding_type,
            up_block_types=decoder_up_block_types,
        )
        self.decoder_scheduler = ConsistencyDecoderScheduler()
        self.register_to_config(block_out_channels=encoder_block_out_channels)
        self.register_to_config(force_upcast=False)
        self.register_buffer(
            "means",
            torch.tensor([0.38862467, 0.02253063, 0.07381133, -0.0171294])[None, :, None, None],
            persistent=False,
        )
        self.register_buffer(
            "stds",
            torch.tensor([0.9654121, 1.0440036, 0.76147926, 0.77022034])[None, :, None, None],
            persistent=False,
        )

        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)

        self.use_slicing = False
        self.use_tiling = False

        # only relevant if vae tiling is enabled
        self.tile_sample_min_size = self.config.sample_size
        sample_size = (
            self.config.sample_size[0]
            if isinstance(self.config.sample_size, (list, tuple))
            else self.config.sample_size
        )
        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
        self.tile_overlap_factor = 0.25

    def enable_tiling(self, use_tiling: bool = True):
        r"""
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        N)r`   )rj   r`   s     r&   enable_tilingz#ConsistencyDecoderVAE.enable_tiling   s     %r%   c                 &    | j                  d       y)z
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.enable_tiling(False)

    def enable_slicing(self):
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        TNr_   ro   s    r&   enable_slicingz$ConsistencyDecoderVAE.enable_slicing   s    
  r%   c                     d| _         y)z
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.use_slicing = False

    @property
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.
        """
        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnAddedKVProcessor()
        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` when attention processors are of type"
                f" {next(iter(self.attn_processors.values()))}"
            )

        self.set_attn_processor(processor)

    @apply_forward_hook
    def encode(
        self, x: torch.Tensor, return_dict: bool = True
    ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]:
        """
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
                instead of a plain tuple.

        Returns:
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a
                plain `tuple` is returned.
        """
        if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
            return self.tiled_encode(x, return_dict=return_dict)

        if self.use_slicing and x.shape[0] > 1:
            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
            h = torch.cat(encoded_slices)
        else:
            h = self.encoder(x)

        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)

        if not return_dict:
            return (posterior,)

        return ConsistencyDecoderVAEOutput(latent_dist=posterior)

    @apply_forward_hook
    def decode(
        self,
        z: torch.Tensor,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = True,
        num_inference_steps: int = 2,
    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
        """
        Decodes the input latent vector `z` using the consistency decoder VAE model.

        Args:
            z (torch.Tensor): The input latent vector.
            generator (Optional[torch.Generator]): The random number generator. Default is None.
            return_dict (bool): Whether to return the output as a dictionary. Default is True.
            num_inference_steps (int): The number of inference steps. Default is 2.

        Returns:
            Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output.

        """
        z = (z * self.config.scaling_factor - self.means) / self.stds

        scale_factor = 2 ** (len(self.config.block_out_channels) - 1)
        z = F.interpolate(z, mode="nearest", scale_factor=scale_factor)

        batch_size, _, height, width = z.shape

        self.decoder_scheduler.set_timesteps(num_inference_steps, device=self.device)

        x_t = self.decoder_scheduler.init_noise_sigma * randn_tensor(
            (batch_size, 3, height, width), generator=generator, dtype=z.dtype, device=z.device
        )

        for t in self.decoder_scheduler.timesteps:
            model_input = torch.concat([self.decoder_scheduler.scale_model_input(x_t, t), z], dim=1)
            model_output = self.decoder_unet(model_input, t).sample[:, :3, :, :]
            prev_sample = self.decoder_scheduler.step(model_output, t, x_t, generator).prev_sample
            x_t = prev_sample

        x_0 = x_t

        if not return_dict:
            return (x_0,)

        return DecoderOutput(sample=x_0)

    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_v
    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
        for y in range(blend_extent):
            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
        return b

    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_h
    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
        return b

    def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> Union[ConsistencyDecoderVAEOutput, Tuple]:
        r"""Encode a batch of images using a tiled encoder.

        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
        output, but they should be much less noticeable.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
                instead of a plain tuple.

        Returns:
            [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
                If return_dict is True, a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
                is returned, otherwise a plain `tuple` is returned.
        """
        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
        row_limit = self.tile_latent_min_size - blend_extent

        # Split the image into overlapping tiles and encode them separately.
        rows = []
        for i in range(0, x.shape[2], overlap_size):
            row = []
            for j in range(0, x.shape[3], overlap_size):
                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
                tile = self.encoder(tile)
                tile = self.quant_conv(tile)
                row.append(tile)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile to the current tile
                # and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=3))

        moments = torch.cat(result_rows, dim=2)
        posterior = DiagonalGaussianDistribution(moments)

        if not return_dict:
            return (posterior,)

        return ConsistencyDecoderVAEOutput(latent_dist=posterior)

    def forward(
        self,
        sample: torch.Tensor,
        sample_posterior: bool = False,
        return_dict: bool = True,
        generator: Optional[torch.Generator] = None,
    ) -> Union[DecoderOutput, Tuple[torch.Tensor]]:
        r"""
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
            generator (`torch.Generator`, *optional*, defaults to `None`):
                Generator to use for sampling.

        Returns:
            [`DecoderOutput`] or `tuple`:
                If return_dict is True, a [`DecoderOutput`] is returned, otherwise a plain `tuple` is returned.
        """
        x = sample
        posterior = self.encode(x).latent_dist
        if sample_posterior:
            z = posterior.sample(generator=generator)
        else:
            z = posterior.mode()
        dec = self.decode(z, generator=generator).sample

        if not return_dict:
            return (dec,)

        return DecoderOutput(sample=dec)