
    bi	B                         d dl mZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZmZmZ e G d de             Z G d dee
      Zy)    )	dataclass)OptionalTupleUnionN   )ConfigMixinregister_to_config)
BaseOutput   )GaussianFourierProjectionTimestepEmbedding	Timesteps)
ModelMixin   )UNetMidBlock2Dget_down_blockget_up_blockc                   0    e Zd ZU dZej
                  ed<   y)UNet2DOutputz
    The output of [`UNet2DModel`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output from the last layer of the model.
    sampleN)__name__
__module____qualname____doc__torchTensor__annotations__     Y/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/models/unets/unet_2d.pyr   r      s     LLr   r   c            ;           e Zd ZdZdZdgZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(deee	e
e	e	f   f      de	de	deded	ee	   d
e	dede
edf   dee   de
edf   de
e	df   de	dede	dededededee	   de	dee	   dedededee   dee	   d ee	   f8 fd!       Z	 	 d)d"ej                   d#eej                   ee	f   d$eej                      d%ed&eee
f   f
d'Z xZS )*UNet2DModela  
    A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) -
            1)`.
        in_channels (`int`, *optional*, defaults to 3): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
        time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
        freq_shift (`int`, *optional*, defaults to 0): Frequency shift for Fourier time embedding.
        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
            Whether to flip sin to cos for Fourier time embedding.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
            Tuple of downsample block types.
        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`):
            Block type for middle of UNet, it can be either `UNetMidBlock2D` or `None`.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
            Tuple of upsample block types.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`):
            Tuple of block output channels.
        layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block.
        mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block.
        downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution.
        downsample_type (`str`, *optional*, defaults to `conv`):
            The downsample type for downsampling layers. Choose between "conv" and "resnet"
        upsample_type (`str`, *optional*, defaults to `conv`):
            The upsample type for upsampling layers. Choose between "conv" and "resnet"
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
        norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
        attn_norm_num_groups (`int`, *optional*, defaults to `None`):
            If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
            given number of groups. If left as `None`, the group norm layer will only be created if
            `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
        norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
        class_embed_type (`str`, *optional*, defaults to `None`):
            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
            `"timestep"`, or `"identity"`.
        num_class_embeds (`int`, *optional*, defaults to `None`):
            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing class
            conditioning with `class_embed_type` equal to `None`.
    Tnormsample_sizein_channelsout_channelscenter_input_sampletime_embedding_typetime_embedding_dim
freq_shiftflip_sin_to_cosdown_block_types.mid_block_typeup_block_typesblock_out_channelslayers_per_blockmid_block_scale_factordownsample_paddingdownsample_typeupsample_typedropoutact_fnattention_head_dimnorm_num_groupsattn_norm_num_groupsnorm_epsresnet_time_scale_shiftadd_attentionclass_embed_typenum_class_embedsnum_train_timestepsc                    t         *|           || _        |xs |d   dz  }t        |	      t        |      k7  rt	        d|	 d| d      t        |      t        |	      k7  rt	        d| d|	 d      t        j                  ||d   dd	
      | _        |dk(  rt        |d   d      | _	        d|d   z  }nH|dk(  rt        |d   ||      | _	        |d   }n(|dk(  r#t        j                  ||d         | _	        |d   }t        |      | _        ||t        j                  ||      | _        n?|dk(  rt        ||      | _        n(|dk(  rt        j                  ||      | _        nd | _        t        j                   g       | _        d | _        t        j                   g       | _        |d   }t)        |	      D ]V  \  } }!|}"||    }| t        |      dz
  k(  }#t+        |!||"|||# |||||n|||||      }$| j"                  j-                  |$       X |
d | _        n%t/        |d   ||||||||n|d   |||      | _        t1        t3        |            }%|%d   }t)        |      D ]w  \  } }&|}'|%|    }|%t5        | dz   t        |      dz
           }"| t        |      dz
  k(  }#t7        |&|dz   |"||'||# |||||n||||      }(| j&                  j-                  |(       y ||nt5        |d   dz  d      })t        j8                  |d   |)|      | _        t        j<                         | _        t        j                  |d   |dd
      | _         y )Nr      z\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: r   )r   r   )kernel_sizepaddingfourier   )embedding_sizescaler   
positionallearnedtimestepidentityr   )
num_layersr%   r&   temb_channelsadd_downsample
resnet_epsresnet_act_fnresnet_groupsr7   r2   r;   r3   r5   )r%   rN   r5   rP   rQ   output_scale_factorr;   r7   rR   attn_groupsr<   )rM   r%   r&   prev_output_channelrN   add_upsamplerP   rQ   rR   r7   r;   r4   r5       )num_channels
num_groupseps)!super__init__r$   len
ValueErrornnConv2dconv_inr   	time_projr   	Embeddingr   time_embeddingclass_embeddingIdentity
ModuleListdown_blocks	mid_block	up_blocks	enumerater   appendr   listreversedminr   	GroupNormconv_norm_outSiLUconv_actconv_out)+selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   time_embed_dimtimestep_input_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsup_block_typerV   up_blocknum_groups_out	__class__s+                                             r    r]   zUNet2DModel.__init___   sA   @ 	&+H/A!/Dq/H  C$77no  oA  AU  Vd  Ue  ef  g  !"c*:&;;t  vH  uI  I_  `p  _q  qr  s 
 yy.@.CQR\bc )+6FXYZF[cefDN!"%7%:!: L0&'9!'<ozZDN!3A!6 I-\\*=?QRS?TUDN!3A!6/0BNS #(8(D#%<<0@.#QD +#45G#XD +#%;;~~#ND #'D ==,r* ,A."+,<"= 	0A*M/2N#&8"9A"==N'+)+,#11#$-9K9W#5]k#5(? /J  ##J/+	00 !!DN+.r2,#$$:(?9K9W#5]opr]s-0+DN '+84F+G&H#4Q7 ). 9 	,A}"08;N7AE3GYCZ]^C^8_`M#&8"9A"==N#+a/)+$7,!//#$-9K9W#5]k(?+H  NN!!(+/	,4 -<,GSQcdeQfjkQkmoMp\\7I!7LYgmuv			"4Q"7ST^_`r   r   rK   class_labelsreturn_dictreturnc           	      x   | j                   j                  rd|z  dz
  }|}t        j                  |      s2t        j                  |gt        j
                  |j                        }nKt        j                  |      r6t        |j                        dk(  r|d   j                  |j                        }|t        j                  |j                  d   |j                  |j                        z  }| j                  |      }|j                  | j                        }| j                  |      }| j                  h|t        d      | j                   j                   dk(  r| j                  |      }| j                  |      j                  | j                        }||z   }n| j                  |t        d	      |}	| j#                  |      }|f}
| j$                  D ]0  }t'        |d
      r ||||	      \  }}}	n |||      \  }}|
|z  }
2 | j(                  | j)                  ||      }d}	| j*                  D ]Y  }|
t        |j,                         d }|
dt        |j,                          }
t'        |d
      r |||||	      \  }}	P ||||      }[ | j/                  |      }| j1                  |      }| j3                  |      }|	||	z  }| j                   j4                  dk(  rA|j7                  |j                  d   gdgt        |j                  dd       z        }||z  }|s|fS t9        |      S )a  
        The [`UNet2DModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.

        Returns:
            [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
                returned where the first element is the sample tensor.
        r   g      ?)dtypedevicer   N)r   z=class_labels should be provided when doing class conditioningrK   zJclass_embedding needs to be initialized in order to use class conditioning	skip_conv)hidden_statestembskip_sample)r   r   rE   r   )r   )configr'   r   	is_tensortensorlongr   r^   shapetoonesr   rc   re   rf   r_   r=   rb   ri   hasattrrj   rk   resnetsrr   rt   ru   r(   reshaper   )rv   r   rK   r   r   	timestepst_embemb	class_embr   down_block_res_samplesdownsample_blockres_samplesupsample_blocks                 r    forwardzUNet2DModel.forward   s$   2 ;;**Z#%F 	y)i[

6==YI__Y'C	,@A,E!$**6==9I 

6<<?)//ZcZjZj kk	y)
 tzz*!!%(+# !`aa{{++z9#~~l;,,\:==DJJ=OI	/C!!)l.Fijj f% #) $ 0 0 	2'53C"(s40[ '7VRU&V#"k1"	2 >>%^^FC0F "nn 	BN0#n6L6L2M1M1OPK%;<Zs>CYCY?Z>Z%["~{3&4V[#{&[#'SA	B ##F+v&v&"k!F;;**i7!))6<<?*[qcCUVUWHXDY>Y*[\Ii'F96**r   )Nr   r   FrI   Nr   T)DownBlock2DAttnDownBlock2Dr   r   r   )AttnUpBlock2Dr   r   	UpBlock2D)   i  i  i  r   r   r   convr   g        silu   rX   Ngh㈵>defaultTNNN)NT)r   r   r   r    _supports_gradient_checkpointing _skip_layerwise_casting_patternsr	   r   r   intr   boolstrfloatr]   r   r   r   r   __classcell__)r   s   @r    r"   r"   (   s   1f (,$(.x$ >B$)#/,0 $,t(8*j.B !()"#%#,-!.2'0"*.*.-1;XaeCsCx$89:Xa Xa 	Xa
 "Xa !Xa %SMXa Xa Xa  S/Xa !Xa c3hXa "#s(OXa Xa !&Xa   !Xa" #Xa$ %Xa& 'Xa( )Xa* %SM+Xa, -Xa. 'sm/Xa0 1Xa2 "%3Xa4 5Xa6 #3-7Xa8 #3-9Xa: &c];Xa Xa| 04 h+h+ eS01h+ u||,	h+
 h+ 
|U"	#h+r   r"   )dataclassesr   typingr   r   r   r   torch.nnr`   configuration_utilsr   r	   utilsr
   
embeddingsr   r   r   modeling_utilsr   unet_2d_blocksr   r   r   r   r"   r   r   r    <module>r      sV    " ) )   B  P P ' H H 	: 	 	z+*k z+r   