
"""
Adapted from
https://github.com/huggingface/transformers/blob/52cb4034ada381fe1ffe8d428a1076e5411a8026/src/transformers/utils/quantization_config.py
"""

import copy
import importlib.metadata
import inspect
import json
import os
from dataclasses import dataclass
from enum import Enum
from functools import partial
from typing import Any, Dict, List, Optional, Union

from packaging import version

from ..utils import is_torch_available, is_torchao_available, logging


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class QuantizationMethod(str, Enum):
    BITS_AND_BYTES = "bitsandbytes"
    GGUF = "gguf"
    TORCHAO = "torchao"
    QUANTO = "quanto"


if is_torchao_available():
    from torchao.quantization.quant_primitives import MappingType

    class TorchAoJSONEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, MappingType):
                return obj.name
            return super().default(obj)


@dataclass
class QuantizationConfigMixin:
    """
    Mixin class for quantization config
    """

    quant_method: QuantizationMethod
    _exclude_attributes_at_init = []

    @classmethod
    def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):
        r"""
        Instantiates a [`QuantizationConfigMixin`] from a Python dictionary of parameters.

        Args:
            config_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the configuration object.
            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
                Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in
                `PreTrainedModel`.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the configuration object.

        Returns:
            [`QuantizationConfigMixin`]: The configuration object instantiated from those parameters.
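
        Example (illustrative):

        ```python
        config, unused_kwargs = BitsAndBytesConfig.from_dict(
            {"load_in_8bit": True}, return_unused_kwargs=True, foo="bar"
        )
        # `foo` is not an attribute of the config, so it comes back in `unused_kwargs`.
        ```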
        """
        config = cls(**config_dict)

        to_remove = []
        for key, value in kwargs.items():
            if hasattr(config, key):
                setattr(config, key, value)
                to_remove.append(key)
        for key in to_remove:
            kwargs.pop(key, None)

        if return_unused_kwargs:
            return config, kwargs
        else:
            return config

    def to_json_file(self, json_file_path: Union[str, os.PathLike]):
        r"""
        Save this instance to a JSON file.

        Args:
            json_file_path (`str` or `os.PathLike`):
                Path to the JSON file in which this configuration instance's parameters will be saved.
        """
        with open(json_file_path, "w", encoding="utf-8") as writer:
            config_dict = self.to_dict()
            json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n"

            writer.write(json_string)

    def to_dict(self) -> Dict[str, Any]:
        r"""
        Serializes this instance to a Python dictionary. Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        return copy.deepcopy(self.__dict__)

    def __iter__(self):
        """allows `dict(obj)` for situations where obj may be a dict or QuantizationConfigMixin"""
        for attr, value in copy.deepcopy(self.__dict__).items():
            yield attr, value

    def __repr__(self):
        return f"{self.__class__.__name__} {self.to_json_string()}"

    def to_json_string(self, use_diff: bool = True) -> str:
        r"""
        Serializes this instance to a JSON string.

        Args:
            use_diff (`bool`, *optional*, defaults to `True`):
                If set to `True`, only the difference between the config instance and the default config
                instance is serialized to JSON string.

        Returns:
            `str`: String containing all the attributes that make up this configuration instance in JSON format.
        """
        if use_diff is True:
            config_dict = self.to_diff_dict()
        else:
            config_dict = self.to_dict()
        return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"

    def update(self, **kwargs):
        r"""
        Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes,
        returning all the unused kwargs.

        Args:
            kwargs (`Dict[str, Any]`):
                Dictionary of attributes to tentatively update this class.

        Returns:
            `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance.
        """
        to_remove = []
        for key, value in kwargs.items():
            if hasattr(self, key):
                setattr(self, key, value)
                to_remove.append(key)

        # Remove all the attributes that were updated, without modifying the input dict
        unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove}
        return unused_kwargs


@dataclass
class BitsAndBytesConfig(QuantizationConfigMixin):
    r"""
    This is a wrapper class about all possible attributes and features that you can play with a model that has been
    loaded using `bitsandbytes`.

    This replaces `load_in_8bit` or `load_in_4bit` therefore both options are mutually exclusive.

    Currently only supports `LLM.int8()`, `FP4`, and `NF4` quantization. If more methods are added to `bitsandbytes`,
    then more arguments will be added to this class.

    Args:
        load_in_8bit (`bool`, *optional*, defaults to `False`):
            This flag is used to enable 8-bit quantization with LLM.int8().
        load_in_4bit (`bool`, *optional*, defaults to `False`):
            This flag is used to enable 4-bit quantization by replacing the Linear layers with FP4/NF4 layers from
            `bitsandbytes`.
        llm_int8_threshold (`float`, *optional*, defaults to 6.0):
            This corresponds to the outlier threshold for outlier detection as described in `LLM.int8() : 8-bit Matrix
            Multiplication for Transformers at Scale` paper: https://huggingface.co/papers/2208.07339 Any hidden states
            value that is above this threshold will be considered an outlier and the operation on those values will be
            done in fp16. Values are usually normally distributed, that is, most values are in the range [-3.5, 3.5],
            but there are some exceptional systematic outliers that are very differently distributed for large models.
            These outliers are often in the interval [-60, -6] or [6, 60]. Int8 quantization works well for values of
            magnitude ~5, but beyond that, there is a significant performance penalty. A good default threshold is 6,
            but a lower threshold might be needed for more unstable models (small models, fine-tuning).
        llm_int8_skip_modules (`List[str]`, *optional*):
            An explicit list of the modules that we do not want to convert in 8-bit. This is useful for models such as
            Jukebox that has several heads in different places and not necessarily at the last position. For example
            for `CausalLM` models, the last `lm_head` is typically kept in its original `dtype`.
        llm_int8_enable_fp32_cpu_offload (`bool`, *optional*, defaults to `False`):
            This flag is used for advanced use cases and users that are aware of this feature. If you want to split
            your model in different parts and run some parts in int8 on GPU and some parts in fp32 on CPU, you can use
            this flag. This is useful for offloading large models such as `google/flan-t5-xxl`. Note that the int8
            operations will not be run on CPU.
        llm_int8_has_fp16_weight (`bool`, *optional*, defaults to `False`):
            This flag runs LLM.int8() with 16-bit main weights. This is useful for fine-tuning as the weights do not
            have to be converted back and forth for the backward pass.
        bnb_4bit_compute_dtype (`torch.dtype` or str, *optional*, defaults to `torch.float32`):
            This sets the computational type which might be different than the input type. For example, inputs might be
            fp32, but computation can be set to bf16 for speedups.
        bnb_4bit_quant_type (`str`,  *optional*, defaults to `"fp4"`):
            This sets the quantization data type in the bnb.nn.Linear4Bit layers. Options are FP4 and NF4 data types
            which are specified by `fp4` or `nf4`.
        bnb_4bit_use_double_quant (`bool`, *optional*, defaults to `False`):
            This flag is used for nested quantization where the quantization constants from the first quantization are
            quantized again.
        bnb_4bit_quant_storage (`torch.dtype` or str, *optional*, defaults to `torch.uint8`):
            This sets the storage type to pack the quantized 4-bit params.
        kwargs (`Dict[str, Any]`, *optional*):
            Additional parameters from which to initialize the configuration object.
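
    Example (an illustrative sketch mirroring the `TorchAoConfig` example in this module; the checkpoint and
    subfolder are placeholders for any model whose `from_pretrained` accepts `quantization_config`):
        ```python
        import torch

        from diffusers import BitsAndBytesConfig, FluxTransformer2DModel

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        transformer = FluxTransformer2DModel.from_pretrained(
            "black-forest-labs/FLUX.1-dev",
            subfolder="transformer",
            quantization_config=quantization_config,
        )
        ```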
    """

    _exclude_attributes_at_init = ["_load_in_4bit", "_load_in_8bit", "quant_method"]

    def __init__(
        self,
        load_in_8bit=False,
        load_in_4bit=False,
        llm_int8_threshold=6.0,
        llm_int8_skip_modules=None,
        llm_int8_enable_fp32_cpu_offload=False,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_compute_dtype=None,
        bnb_4bit_quant_type="fp4",
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_storage=None,
        **kwargs,
    ):
        self.quant_method = QuantizationMethod.BITS_AND_BYTES

        if load_in_4bit and load_in_8bit:
            raise ValueError("load_in_4bit and load_in_8bit are both True, but only one can be used at the same time")

        self._load_in_8bit = load_in_8bit
        self._load_in_4bit = load_in_4bit
        self.llm_int8_threshold = llm_int8_threshold
        self.llm_int8_skip_modules = llm_int8_skip_modules
        self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload
        self.llm_int8_has_fp16_weight = llm_int8_has_fp16_weight
        self.bnb_4bit_quant_type = bnb_4bit_quant_type
        self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant

        if bnb_4bit_compute_dtype is None:
            self.bnb_4bit_compute_dtype = torch.float32
        elif isinstance(bnb_4bit_compute_dtype, str):
            self.bnb_4bit_compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
        elif isinstance(bnb_4bit_compute_dtype, torch.dtype):
            self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype
        else:
            raise ValueError("bnb_4bit_compute_dtype must be a string or a torch.dtype")

        if bnb_4bit_quant_storage is None:
            self.bnb_4bit_quant_storage = torch.uint8
        elif isinstance(bnb_4bit_quant_storage, str):
            if bnb_4bit_quant_storage not in ["float16", "float32", "int8", "uint8", "float64", "bfloat16"]:
                raise ValueError(
                    "`bnb_4bit_quant_storage` must be a valid string "
                    "(one of 'float16', 'float32', 'int8', 'uint8', 'float64', 'bfloat16')"
                )
            self.bnb_4bit_quant_storage = getattr(torch, bnb_4bit_quant_storage)
        elif isinstance(bnb_4bit_quant_storage, torch.dtype):
            self.bnb_4bit_quant_storage = bnb_4bit_quant_storage
        else:
            raise ValueError("bnb_4bit_quant_storage must be a string or a torch.dtype")

        if kwargs and not all(k in self._exclude_attributes_at_init for k in kwargs):
            logger.warning(f"Unused kwargs: {list(kwargs.keys())}. These kwargs are not used in {self.__class__}.")

        self.post_init()

    @property
    def load_in_4bit(self):
        return self._load_in_4bit

    @load_in_4bit.setter
    def load_in_4bit(self, value: bool):
        if not isinstance(value, bool):
            raise TypeError("load_in_4bit must be a boolean")

        if self.load_in_8bit and value:
            raise ValueError("load_in_4bit and load_in_8bit are both True, but only one can be used at the same time")
        self._load_in_4bit = value

    @property
    def load_in_8bit(self):
        return self._load_in_8bit

    @load_in_8bit.setter
    def load_in_8bit(self, value: bool):
        if not isinstance(value, bool):
            raise TypeError("load_in_8bit must be a boolean")

        if self.load_in_4bit and value:
            raise ValueError("load_in_4bit and load_in_8bit are both True, but only one can be used at the same time")
        self._load_in_8bit = value

    def post_init(self):
        r"""
        Safety checker that arguments are correct - also replaces some NoneType arguments with their default values.
        """
        if not isinstance(self.load_in_4bit, bool):
            raise TypeError("load_in_4bit must be a boolean")

        if not isinstance(self.load_in_8bit, bool):
            raise TypeError("load_in_8bit must be a boolean")

        if not isinstance(self.llm_int8_threshold, float):
            raise TypeError("llm_int8_threshold must be a float")

        if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list):
            raise TypeError("llm_int8_skip_modules must be a list of strings")

        if not isinstance(self.llm_int8_enable_fp32_cpu_offload, bool):
            raise TypeError("llm_int8_enable_fp32_cpu_offload must be a boolean")

        if not isinstance(self.llm_int8_has_fp16_weight, bool):
            raise TypeError("llm_int8_has_fp16_weight must be a boolean")

        if self.bnb_4bit_compute_dtype is not None and not isinstance(self.bnb_4bit_compute_dtype, torch.dtype):
            raise TypeError("bnb_4bit_compute_dtype must be torch.dtype")

        if not isinstance(self.bnb_4bit_quant_type, str):
            raise TypeError("bnb_4bit_quant_type must be a string")

        if not isinstance(self.bnb_4bit_use_double_quant, bool):
            raise TypeError("bnb_4bit_use_double_quant must be a boolean")

        if self.load_in_4bit and not version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse(
            "0.39.0"
        ):
            raise ValueError(
                "4 bit quantization requires bitsandbytes>=0.39.0 - please upgrade your bitsandbytes version"
            )

    def is_quantizable(self):
        r"""
        Returns `True` if the model is quantizable, `False` otherwise.
        """
        return self.load_in_8bit or self.load_in_4bit

    def quantization_method(self):
        r"""
        This method returns the quantization method used for the model. If the model is not quantizable, it returns
        `None`.
        """
        if self.load_in_8bit:
            return "llm_int8"
        elif self.load_in_4bit and self.bnb_4bit_quant_type == "fp4":
            return "fp4"
        elif self.load_in_4bit and self.bnb_4bit_quant_type == "nf4":
            return "nf4"
        else:
            return None

    def to_dict(self) -> Dict[str, Any]:
        r"""
        Serializes this instance to a Python dictionary. Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["bnb_4bit_compute_dtype"] = str(output["bnb_4bit_compute_dtype"]).split(".")[1]
        output["bnb_4bit_quant_storage"] = str(output["bnb_4bit_quant_storage"]).split(".")[1]
        output["load_in_4bit"] = self.load_in_4bit
        output["load_in_8bit"] = self.load_in_8bit

        return output

    def __repr__(self):
        config_dict = self.to_dict()
        return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n"

    def to_diff_dict(self) -> Dict[str, Any]:
        r"""
        Removes all attributes from config which correspond to the default config attributes, for better readability,
        and serializes to a Python dictionary.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        config_dict = self.to_dict()

        # get the default config dict
        default_config_dict = BitsAndBytesConfig().to_dict()

        serializable_config_dict = {}

        # only serialize values that differ from the default config
        for key, value in config_dict.items():
            if value != default_config_dict[key]:
                serializable_config_dict[key] = value

        return serializable_config_dict


@dataclass
class GGUFQuantizationConfig(QuantizationConfigMixin):
    r"""This is a config class for GGUF Quantization techniques.

    Args:
        compute_dtype (`torch.dtype`, defaults to `torch.float32`):
            This sets the computational type which might be different than the input type. For example, inputs might be
            fp32, but computation can be set to bf16 for speedups.
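
    Example (an illustrative sketch; the GGUF checkpoint URL is a placeholder pointing at a community
    quantization):
        ```python
        import torch

        from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig

        ckpt_path = "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
        transformer = FluxTransformer2DModel.from_single_file(
            ckpt_path,
            quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
        )
        ```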

    Ncompute_dtypeztorch.dtypec                     t         j                  | _        || _        d| _        d | _        | j                  t        j                  | _        y y )NT)r   r   r/   r   pre_quantizedmodules_to_not_convertr   rr   )r)   r   s     r   r   zGGUFQuantizationConfig.__init__  sG    .33*! '+#%!&D &r   r$   )r   r   r   rc   r	   r   r   r   r   r   r     s    	/h}&= 	/r   r   c                   d    e Zd ZdZddedeee      ddfdZed        Z	e
defd       Zd	 Zd
 Zy)TorchAoConfigaJ
    This is a config class for torchao quantization/sparsity techniques.

    Args:
        quant_type (`str`):
            The type of quantization we want to use, currently supporting:
                - **Integer quantization:**
                    - Full function names: `int4_weight_only`, `int8_dynamic_activation_int4_weight`,
                      `int8_weight_only`, `int8_dynamic_activation_int8_weight`
                    - Shorthands: `int4wo`, `int4dq`, `int8wo`, `int8dq`

                - **Floating point 8-bit quantization:**
                    - Full function names: `float8_weight_only`, `float8_dynamic_activation_float8_weight`,
                      `float8_static_activation_float8_weight`
                    - Shorthands: `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`,
                      `float8dq_e4m3_tensor`, `float8dq_e4m3_row`

                - **Floating point X-bit quantization:**
                    - Full function names: `fpx_weight_only`
                    - Shorthands: `fpX_eAwB`, where `X` is the number of bits (between `1` to `7`), `A` is the number
                      of exponent bits and `B` is the number of mantissa bits. The constraint of `X == A + B + 1` must
                      be satisfied for a given shorthand notation.

                - **Unsigned Integer quantization:**
                    - Full function names: `uintx_weight_only`
                    - Shorthands: `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo`
        modules_to_not_convert (`List[str]`, *optional*, default to `None`):
            The list of modules to not quantize, useful for quantizing models that explicitly require to have some
            modules left in their original precision.
        kwargs (`Dict[str, Any]`, *optional*):
            The keyword arguments for the chosen type of quantization, for example, int4_weight_only quantization
            supports two keyword arguments `group_size` and `inner_k_tiles` currently. More API examples and
            documentation of arguments can be found in
            https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques

    Example:
        ```python
        import torch

        from diffusers import FluxTransformer2DModel, TorchAoConfig

        quantization_config = TorchAoConfig("int8wo")
        transformer = FluxTransformer2DModel.from_pretrained(
            "black-forest-labs/Flux.1-Dev",
            subfolder="transformer",
            quantization_config=quantization_config,
            torch_dtype=torch.bfloat16,
        )
        ```
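
        A sketch of how the shorthands resolve (an illustration, not an exhaustive list): `"fp6_e3m2"` satisfies the
        `X == A + B + 1` constraint via `6 == 3 + 2 + 1`, and extra keyword arguments are forwarded to the underlying
        torchao function:

        ```python
        TorchAoConfig("fp6_e3m2")  # resolves to fpx_weight_only(ebits=3, mbits=2)
        TorchAoConfig("int4wo", group_size=64)  # group_size is forwarded to torchao's int4_weight_only
        ```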
    """

    def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, **kwargs) -> None:
        self.quant_method = QuantizationMethod.TORCHAO
        self.quant_type = quant_type
        self.modules_to_not_convert = modules_to_not_convert

        # When we load from a serialized config, "quant_type_kwargs" will be the key
        if "quant_type_kwargs" in kwargs:
            self.quant_type_kwargs = kwargs["quant_type_kwargs"]
        else:
            self.quant_type_kwargs = kwargs

        TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
        if self.quant_type not in TORCHAO_QUANT_TYPE_METHODS.keys():
            is_floating_quant_type = self.quant_type.startswith("float") or self.quant_type.startswith("fp")
            if is_floating_quant_type and not self._is_xpu_or_cuda_capability_atleast_8_9():
                raise ValueError(
                    f"Requested quantization type: {self.quant_type} is not supported on GPUs with CUDA capability <= 8.9. "
                    f"You can check the CUDA capability of your GPU using `torch.cuda.get_device_capability()`."
                )

            raise ValueError(
                f"Requested quantization type: {self.quant_type} is not supported or is an incorrect `quant_type` name. "
                f"If you think the provided quantization type should be supported, please open an issue at "
                f"https://github.com/huggingface/diffusers/issues."
            )

        method = TORCHAO_QUANT_TYPE_METHODS[self.quant_type]
        signature = inspect.signature(method)
        all_kwargs = {
            param.name
            for param in signature.parameters.values()
            if param.kind in [inspect.Parameter.KEYWORD_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD]
        }
        unsupported_kwargs = list(self.quant_type_kwargs.keys() - all_kwargs)

        if len(unsupported_kwargs) > 0:
            raise ValueError(
                f'The quantization method "{quant_type}" does not support the following keyword arguments: '
                f"{unsupported_kwargs}. The following keyword arguments are supported: {all_kwargs}."
            )

    @classmethod
    def _get_torchao_quant_type_to_method(cls):
        r"""
        Returns supported torchao quantization types with all commonly used notations.
        """
        if is_torchao_available():
            from torchao.quantization import (
                float8_dynamic_activation_float8_weight,
                float8_static_activation_float8_weight,
                float8_weight_only,
                fpx_weight_only,
                int4_weight_only,
                int8_dynamic_activation_int4_weight,
                int8_dynamic_activation_int8_weight,
                int8_weight_only,
                uintx_weight_only,
            )
            from torchao.quantization.observer import PerRow, PerTensor

            def generate_float8dq_types(dtype: torch.dtype):
                name = "e5m2" if dtype == torch.float8_e5m2 else "e4m3"
                types = {}

                for granularity_cls in [PerTensor, PerRow]:
                    granularity_name = "tensor" if granularity_cls is PerTensor else "row"
                    types[f"float8dq_{name}_{granularity_name}"] = partial(
                        float8_dynamic_activation_float8_weight,
                        activation_dtype=dtype,
                        weight_dtype=dtype,
                        granularity=(granularity_cls(), granularity_cls()),
                    )

                return types

            def generate_fpx_quantization_types(bits: int):
                types = {}

                for ebits in range(1, bits):
                    mbits = bits - ebits - 1
                    types[f"fp{bits}_e{ebits}m{mbits}"] = partial(fpx_weight_only, ebits=ebits, mbits=mbits)

                non_sign_bits = bits - 1
                default_ebits = (non_sign_bits + 1) // 2
                default_mbits = non_sign_bits - default_ebits
                types[f"fp{bits}"] = partial(fpx_weight_only, ebits=default_ebits, mbits=default_mbits)

                return types

            INT4_QUANTIZATION_TYPES = {
                # int4 weight + bfloat16/float16 activation
                "int4wo": int4_weight_only,
                "int4_weight_only": int4_weight_only,
                # int4 weight + int8 activation
                "int4dq": int8_dynamic_activation_int4_weight,
                "int8_dynamic_activation_int4_weight": int8_dynamic_activation_int4_weight,
            }

            INT8_QUANTIZATION_TYPES = {
                # int8 weight + bfloat16/float16 activation
                "int8wo": int8_weight_only,
                "int8_weight_only": int8_weight_only,
                # int8 weight + int8 activation
                "int8dq": int8_dynamic_activation_int8_weight,
                "int8_dynamic_activation_int8_weight": int8_dynamic_activation_int8_weight,
            }

            FLOAT8_QUANTIZATION_TYPES = {
                # float8 weight + bfloat16/float16 activation
                "float8wo": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
                "float8_weight_only": float8_weight_only,
                "float8wo_e5m2": partial(float8_weight_only, weight_dtype=torch.float8_e5m2),
                "float8wo_e4m3": partial(float8_weight_only, weight_dtype=torch.float8_e4m3fn),
                # float8 weight + float8 activation (dynamic)
                "float8dq": float8_dynamic_activation_float8_weight,
                "float8_dynamic_activation_float8_weight": float8_dynamic_activation_float8_weight,
                "float8dq_e4m3": partial(
                    float8_dynamic_activation_float8_weight,
                    activation_dtype=torch.float8_e4m3fn,
                    weight_dtype=torch.float8_e4m3fn,
                ),
                **generate_float8dq_types(torch.float8_e4m3fn),
                # float8 weight + float8 activation (static)
                "float8_static_activation_float8_weight": float8_static_activation_float8_weight,
                # For fpx, only x <= 8 is supported; the fp3..fp7 shorthands are generated below
                "fpx_weight_only": fpx_weight_only,
                **generate_fpx_quantization_types(3),
                **generate_fpx_quantization_types(4),
                **generate_fpx_quantization_types(5),
                **generate_fpx_quantization_types(6),
                **generate_fpx_quantization_types(7),
            }

            UINTX_QUANTIZATION_DTYPES = {
                "uintx_weight_only": uintx_weight_only,
                "uint1wo": partial(uintx_weight_only, dtype=torch.uint1),
                "uint2wo": partial(uintx_weight_only, dtype=torch.uint2),
                "uint3wo": partial(uintx_weight_only, dtype=torch.uint3),
                "uint4wo": partial(uintx_weight_only, dtype=torch.uint4),
                "uint5wo": partial(uintx_weight_only, dtype=torch.uint5),
                "uint6wo": partial(uintx_weight_only, dtype=torch.uint6),
                "uint7wo": partial(uintx_weight_only, dtype=torch.uint7),
            }

            QUANTIZATION_TYPES = {}
            QUANTIZATION_TYPES.update(INT4_QUANTIZATION_TYPES)
            QUANTIZATION_TYPES.update(INT8_QUANTIZATION_TYPES)
            QUANTIZATION_TYPES.update(UINTX_QUANTIZATION_DTYPES)

            if cls._is_xpu_or_cuda_capability_atleast_8_9():
                QUANTIZATION_TYPES.update(FLOAT8_QUANTIZATION_TYPES)

            return QUANTIZATION_TYPES
        else:
            raise ValueError(
                "TorchAoConfig requires torchao to be installed, please install with `pip install torchao`"
            )

    @staticmethod
    def _is_xpu_or_cuda_capability_atleast_8_9() -> bool:
        if torch.cuda.is_available():
            major, minor = torch.cuda.get_device_capability()
            if major == 8:
                return minor >= 9
            return major >= 9
        elif torch.xpu.is_available():
            return True

        raise RuntimeError("TorchAO requires a CUDA compatible GPU or Intel XPU and installation of PyTorch.")

    def get_apply_tensor_subclass(self):
        TORCHAO_QUANT_TYPE_METHODS = self._get_torchao_quant_type_to_method()
        return TORCHAO_QUANT_TYPE_METHODS[self.quant_type](**self.quant_type_kwargs)

    def __repr__(self):
        r"""
        Example of how this looks for `TorchAoConfig("uint4wo", group_size=32)`:

        ```
        TorchAoConfig {
            "modules_to_not_convert": null,
            "quant_method": "torchao",
            "quant_type": "uint4wo",
            "quant_type_kwargs": {
                "group_size": 32
            }
        }
        ```
        """
        config_dict = self.to_dict()
        return (
            f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True, cls=TorchAoJSONEncoder)}\n"
        )


@dataclass
class QuantoConfig(QuantizationConfigMixin):
    r"""
    This is a wrapper class about all possible attributes and features that you can play with a model that has been
    loaded using `quanto`.

    Args:
        weights_dtype (`str`, *optional*, defaults to `"int8"`):
            The target dtype for the weights after quantization. Supported values are ("float8","int8","int4","int2")
        modules_to_not_convert (`list`, *optional*, defaults to `None`):
            The list of modules to not quantize, useful for quantizing models that explicitly require to have some
            modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers).
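
    Example (an illustrative sketch; the checkpoint and subfolder are placeholders):
        ```python
        from diffusers import FluxTransformer2DModel, QuantoConfig

        quantization_config = QuantoConfig(weights_dtype="int8")
        transformer = FluxTransformer2DModel.from_pretrained(
            "black-forest-labs/FLUX.1-dev",
            subfolder="transformer",
            quantization_config=quantization_config,
        )
        ```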
    Nweights_dtyper   c                 j    t         j                  | _        || _        || _        | j                          y r$   )r   r   r/   r+  r   r   )r)   r+  r   r9   s       r   r   zQuantoConfig.__init__  s,     /55*&<#r   c                 ^    g d}| j                   |vrt        d| d| j                          y)z;
        Safety checker that arguments are correct
        )float8rs   int4int2zOnly support weights in z but found N)r+  r|   )r)   accepted_weightss     r   r   zQuantoConfig.post_init  sA     >%5578H7IUYUgUgThijj 6r   )rs   N)	r   r   r   rc   rg   r	   r   r   r   r   r   r   r*  r*    s5    
 $6:

 !)c 3
kr   r*  )(rc   rR   importlib.metadatar   r   rI   rh   dataclassesr   enumr   	functoolsr   typingr   r   r   r	   r
   	packagingr   utilsr   r   r   r   
get_loggerr   r   rg   r   %torchao.quantization.quant_primitivesr    JSONEncoderr"   r.   rl   r   r   r*  r   r   r   <module>r<     s  $
     	 !   3 3  E E 			H	%d  A(T-- ( o o od e(0 e( e(P /4 / /, B
+ B
 B
J k* k kr   