"""
Adapted from
https://github.com/huggingface/transformers/blob/c409cd81777fb27aadc043ed3d8339dbc020fb3b/src/transformers/quantizers/quantizer_bnb_4bit.py
"""

from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

from ...utils import get_module_from_name
from ..base import DiffusersQuantizer


if TYPE_CHECKING:
    from ...models.modeling_utils import ModelMixin

from ...utils import (
    is_accelerate_available,
    is_accelerate_version,
    is_bitsandbytes_available,
    is_bitsandbytes_version,
    is_torch_available,
    logging,
)


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class BnB4BitDiffusersQuantizer(DiffusersQuantizer):
    """
    4-bit quantization from the bitsandbytes quantization method:
        before loading: converts transformer layers into `Linear4bit`
        during loading: loads the 16-bit weights and passes them to the layer object
        after loading: quantizes the individual weights in `Linear4bit` to 4 bits at the first `.cuda()` call
        saving: from the state dict, as usual; saves both the weights and the `quant_state` components
        loading: needs to locate the `quant_state` components and pass them to the `Params4bit` constructor
    """
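
    # Illustrative usage (a sketch, not executed by this module): this quantizer is not
    # instantiated directly; it is dispatched when a `BitsAndBytesConfig` with
    # `load_in_4bit=True` reaches `from_pretrained`. The model class and checkpoint below
    # are assumptions for the example only:
    #
    #     import torch
    #     from diffusers import BitsAndBytesConfig, SD3Transformer2DModel
    #
    #     nf4_config = BitsAndBytesConfig(
    #         load_in_4bit=True,
    #         bnb_4bit_quant_type="nf4",
    #         bnb_4bit_compute_dtype=torch.float16,
    #     )
    #     transformer = SD3Transformer2DModel.from_pretrained(
    #         "stabilityai/stable-diffusion-3-medium-diffusers",
    #         subfolder="transformer",
    #         quantization_config=nf4_config,
    #     )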
    use_keep_in_fp32_modules = True
    requires_calibration = False

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

        if self.quantization_config.llm_int8_skip_modules is not None:
            self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules

    def validate_environment(self, *args, **kwargs):
        if not torch.cuda.is_available() and not torch.xpu.is_available():
            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
        if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"):
            raise ImportError(
                "Using `bitsandbytes` 4-bit quantization requires Accelerate: `pip install 'accelerate>=0.26.0'`"
            )
        if not is_bitsandbytes_available() or is_bitsandbytes_version("<", "0.43.3"):
            raise ImportError(
                "Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: "
                "`pip install -U bitsandbytes`"
            )

        if kwargs.get("from_flax", False):
            raise ValueError(
                "Converting into 4-bit weights from flax weights is currently not supported, please make"
                " sure the weights are in PyTorch format."
            )

        device_map = kwargs.get("device_map", None)
        if (
            device_map is not None
            and isinstance(device_map, dict)
            and not self.quantization_config.llm_int8_enable_fp32_cpu_offload
        ):
            device_map_without_no_convert = {
                key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert
            }
            if "cpu" in device_map_without_no_convert.values() or "disk" in device_map_without_no_convert.values():
                raise ValueError(
                    "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                    "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these "
                    "modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom "
                    "`device_map` to `from_pretrained`. Check "
                    "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu "
                    "for more details. "
                )

    def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
        if target_dtype != torch.int8:
            from accelerate.utils import CustomDtype

            logger.info(f"target_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantization")
            return CustomDtype.INT4
        else:
            raise ValueError(f"Wrong `target_dtype` ({target_dtype}) provided.")
    def check_if_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ) -> bool:
        import bitsandbytes as bnb

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module._parameters.get(tensor_name, None), bnb.nn.Params4bit):
            # Add here check for loaded components' dtypes once serialization is implemented
            return True
        elif isinstance(module, bnb.nn.Linear4bit) and tensor_name == "bias":
            # bias could be loaded by regular set_module_tensor_to_device() from accelerate,
            # but it would wrongly use uninitialized weight there.
            return True
        else:
            return False

    def create_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        import bitsandbytes as bnb

        module, tensor_name = get_module_from_name(model, param_name)

        if tensor_name not in module._parameters:
            raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")

        old_value = getattr(module, tensor_name)

        if tensor_name == "bias":
            if param_value is None:
                new_value = old_value.to(target_device)
            else:
                new_value = param_value.to(target_device)

            new_value = torch.nn.Parameter(new_value, requires_grad=old_value.requires_grad)
            module._parameters[tensor_name] = new_value
            return

        if not isinstance(module._parameters[tensor_name], bnb.nn.Params4bit):
            raise ValueError("this function only loads `Linear4bit components`")
        if (
            old_value.device == torch.device("meta")
            and target_device not in ["meta", torch.device("meta")]
            and param_value is None
        ):
            raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.")

        if self.pre_quantized:
            if not self.is_serializable:
                raise ValueError(
                    "Detected int4 weights but the version of bitsandbytes is not compatible with int4 serialization. "
                    "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
                )

            if (param_name + ".quant_state.bitsandbytes__fp4" not in state_dict) and (
                param_name + ".quant_state.bitsandbytes__nf4" not in state_dict
            ):
                raise ValueError(
                    f"Supplied state dict for {param_name} does not contain `bitsandbytes__*` and possibly other "
                    f"`quantized_stats` components."
                )

            quantized_stats = {}
            for k, v in state_dict.items():
                # `startswith` to counter for edge cases where `param_name`
                # substring can be present in multiple places in the `state_dict`
                if param_name + "." in k and k.startswith(param_name):
                    quantized_stats[k] = v
                    if unexpected_keys is not None and k in unexpected_keys:
                        unexpected_keys.remove(k)

            new_value = bnb.nn.Params4bit.from_prequantized(
                data=param_value,
                quantized_stats=quantized_stats,
                requires_grad=False,
                device=target_device,
            )
        else:
            new_value = param_value.to("cpu")
            kwargs = old_value.__dict__
            new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device)

        module._parameters[tensor_name] = new_value

    def check_quantized_param_shape(self, param_name, current_param, loaded_param):
        current_param_shape = current_param.shape
        loaded_param_shape = loaded_param.shape

        n = current_param_shape.numel()
        inferred_shape = (n,) if "bias" in param_name else ((n + 1) // 2, 1)
        if loaded_param_shape != inferred_shape:
            raise ValueError(
                f"Expected the flattened shape of the current param ({param_name}) to be {inferred_shape} but is "
                f"{loaded_param_shape}."
            )
        else:
            return True
    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
        # need more space for buffers that are created during quantization
        max_memory = {key: val * 0.90 for key, val in max_memory.items()}
        return max_memory

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            # We force the `dtype` to be float16, this is a requirement from `bitsandbytes`
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to "
                "requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. "
                "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass"
                " torch_dtype=torch.float16 to remove this warning.",
                torch_dtype,
            )
            torch_dtype = torch.float16
        return torch_dtype

    def update_device_map(self, device_map):
        if device_map is None:
            if torch.xpu.is_available():
                current_device = f"xpu:{torch.xpu.current_device()}"
            else:
                current_device = f"cuda:{torch.cuda.current_device()}"
            device_map = {"": current_device}
            logger.info(
                "The device_map was not initialized. "
                f"Setting device_map to {device_map}. "
                "If you want to use the model for inference, please set device_map='auto'."
            )
        return device_map

    def _process_model_before_weight_loading(
        self,
        model: "ModelMixin",
        device_map,
        keep_in_fp32_modules: List[str] = [],
        **kwargs,
    ):
        from .utils import replace_with_bnb_linear

        load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload

        # We may keep some modules in their original dtype for numerical stability reasons
        self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules

        if not isinstance(self.modules_to_not_convert, list):
            self.modules_to_not_convert = [self.modules_to_not_convert]

        self.modules_to_not_convert.extend(keep_in_fp32_modules)

        # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk`
        if isinstance(device_map, dict) and len(device_map.keys()) > 1:
            keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]

            if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload:
                raise ValueError(
                    "If you want to offload some keys to `cpu` or `disk`, you need to set "
                    "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "
                    " converted to 8-bit but kept in 32-bit."
                )
            self.modules_to_not_convert.extend(keys_on_cpu)

        # Purge `None` entries before handing the list to `replace_with_bnb_linear`
        self.modules_to_not_convert = [module for module in self.modules_to_not_convert if module is not None]

        model = replace_with_bnb_linear(
            model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config
        )
        model.config.quantization_config = self.quantization_config
        model.is_loaded_in_4bit = True

    def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs):
        model.is_4bit_serializable = self.is_serializable
        return model

    @property
    def is_serializable(self):
        # Because we're mandating `bitsandbytes` 0.43.3.
        return True

    @property
    def is_trainable(self) -> bool:
        # Because we're mandating `bitsandbytes` 0.43.3.
        return True

    def _dequantize(self, model):
        from .utils import dequantize_and_replace

        is_model_on_cpu = model.device.type == "cpu"
        if is_model_on_cpu:
            logger.info(
                "Model was found to be on CPU (could happen as a result of `enable_model_cpu_offload()`). So, moving "
                "it to GPU. After dequantization, will move the model back to CPU again to preserve the previous "
                "device."
            )
            if torch.xpu.is_available():
                model.to(torch.xpu.current_device())
            else:
                model.to(torch.cuda.current_device())

        model = dequantize_and_replace(
            model, self.modules_to_not_convert, quantization_config=self.quantization_config
        )
        if is_model_on_cpu:
            model.to("cpu")
        return model


class BnB8BitDiffusersQuantizer(DiffusersQuantizer):
    """
    8-bit quantization from the bitsandbytes quantization method:
        before loading: converts transformer layers into `Linear8bitLt`
        during loading: loads the 16-bit weights and passes them to the layer object
        after loading: quantizes the individual weights in `Linear8bitLt` to 8 bits at the first `.cuda()` call
        saving: from the state dict, as usual; saves both the weights and the `SCB` component
        loading: needs to locate the `SCB` component and pass it to the `Linear8bitLt` object
    """
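
    # Illustrative usage (a sketch, mirroring the 4-bit example above; the model class and
    # checkpoint are assumptions for the example only). Passing `load_in_8bit=True` routes
    # loading through this quantizer:
    #
    #     from diffusers import BitsAndBytesConfig, SD3Transformer2DModel
    #
    #     int8_config = BitsAndBytesConfig(load_in_8bit=True)
    #     transformer = SD3Transformer2DModel.from_pretrained(
    #         "stabilityai/stable-diffusion-3-medium-diffusers",
    #         subfolder="transformer",
    #         quantization_config=int8_config,
    #     )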
    use_keep_in_fp32_modules = True
    requires_calibration = False

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

        if self.quantization_config.llm_int8_skip_modules is not None:
            self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules

    def validate_environment(self, *args, **kwargs):
        if not torch.cuda.is_available() and not torch.xpu.is_available():
            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
        if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"):
            raise ImportError(
                "Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install 'accelerate>=0.26.0'`"
            )
        if not is_bitsandbytes_available() or is_bitsandbytes_version("<", "0.43.3"):
            raise ImportError(
                "Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: "
                "`pip install -U bitsandbytes`"
            )

        if kwargs.get("from_flax", False):
            raise ValueError(
                "Converting into 8-bit weights from flax weights is currently not supported, please make"
                " sure the weights are in PyTorch format."
            )

        device_map = kwargs.get("device_map", None)
        if (
            device_map is not None
            and isinstance(device_map, dict)
            and not self.quantization_config.llm_int8_enable_fp32_cpu_offload
        ):
            device_map_without_no_convert = {
                key: device_map[key] for key in device_map.keys() if key not in self.modules_to_not_convert
            }
            if "cpu" in device_map_without_no_convert.values() or "disk" in device_map_without_no_convert.values():
                raise ValueError(
                    "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                    "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these "
                    "modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom "
                    "`device_map` to `from_pretrained`. Check "
                    "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu "
                    "for more details. "
                )

    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
        # need more space for buffers that are created during quantization
        max_memory = {key: val * 0.90 for key, val in max_memory.items()}
        return max_memory

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            # We force the `dtype` to be float16, this is a requirement from `bitsandbytes`
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.float16` due to "
                "requirements of `bitsandbytes` to enable model loading in 8-bit or 4-bit. "
                "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass"
                " torch_dtype=torch.float16 to remove this warning.",
                torch_dtype,
            )
            torch_dtype = torch.float16
        return torch_dtype

    def update_device_map(self, device_map):
        if device_map is None:
            if torch.xpu.is_available():
                current_device = f"xpu:{torch.xpu.current_device()}"
            else:
                current_device = f"cuda:{torch.cuda.current_device()}"
            device_map = {"": current_device}
            logger.info(
                "The device_map was not initialized. "
                f"Setting device_map to {device_map}. "
                "If you want to use the model for inference, please set device_map='auto'."
            )
        return device_map

    def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype":
        if target_dtype != torch.int8:
            logger.info(f"target_dtype {target_dtype} is replaced by `torch.int8` for 8-bit BnB quantization")
        return torch.int8

    def check_if_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        import bitsandbytes as bnb

        module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module._parameters.get(tensor_name, None), bnb.nn.Int8Params):
            if self.pre_quantized:
                if param_name.replace("weight", "SCB") not in state_dict.keys():
                    raise ValueError("Missing quantization component `SCB`")
                if param_value.dtype != torch.int8:
                    raise ValueError(
                        f"Incompatible dtype `{param_value.dtype}` when loading 8-bit prequantized weight. Expected "
                        f"`torch.int8`."
                    )
            return True
        return False

    def create_quantized_param(
        self,
        model: "ModelMixin",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        import bitsandbytes as bnb

        fp16_statistics_key = param_name.replace("weight", "SCB")
        fp16_weights_format_key = param_name.replace("weight", "weight_format")

        fp16_statistics = state_dict.get(fp16_statistics_key, None)
        fp16_weights_format = state_dict.get(fp16_weights_format_key, None)

        module, tensor_name = get_module_from_name(model, param_name)
        if tensor_name not in module._parameters:
            raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")

        old_value = getattr(module, tensor_name)

        if not isinstance(module._parameters[tensor_name], bnb.nn.Int8Params):
            raise ValueError(f"Parameter `{tensor_name}` should only be a `bnb.nn.Int8Params` instance.")
        if (
            old_value.device == torch.device("meta")
            and target_device not in ["meta", torch.device("meta")]
            and param_value is None
        ):
            raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {target_device}.")

        new_value = param_value.to("cpu")
        if self.pre_quantized and not self.is_serializable:
            raise ValueError(
                "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. "
                "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`."
            )

        kwargs = old_value.__dict__
        new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(target_device)

        module._parameters[tensor_name] = new_value
        if fp16_statistics is not None:
            setattr(module.weight, "SCB", fp16_statistics.to(target_device))
            if unexpected_keys is not None:
                unexpected_keys.remove(fp16_statistics_key)

        # We just need to pop the `weight_format` key from the state dict to remove unneeded
        # messages. The correct format is retrieved during the first forward pass.
        if fp16_weights_format is not None and unexpected_keys is not None:
            unexpected_keys.remove(fp16_weights_format_key)

    def _process_model_after_weight_loading(self, model: "ModelMixin", **kwargs):
        model.is_8bit_serializable = self.is_serializable
        return model

    def _process_model_before_weight_loading(
        self,
        model: "ModelMixin",
        device_map,
        keep_in_fp32_modules: List[str] = [],
        **kwargs,
    ):
        from .utils import replace_with_bnb_linear

        load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload

        # We may keep some modules in their original dtype for numerical stability reasons
        self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules

        if not isinstance(self.modules_to_not_convert, list):
            self.modules_to_not_convert = [self.modules_to_not_convert]

        self.modules_to_not_convert.extend(keep_in_fp32_modules)

        # Extend `self.modules_to_not_convert` to keys that are supposed to be offloaded to `cpu` or `disk`
        if isinstance(device_map, dict) and len(device_map.keys()) > 1:
            keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]

            if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload:
                raise ValueError(
                    "If you want to offload some keys to `cpu` or `disk`, you need to set "
                    "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "
                    " converted to 8-bit but kept in 32-bit."
                )
            self.modules_to_not_convert.extend(keys_on_cpu)

        # Purge `None` entries before handing the list to `replace_with_bnb_linear`
        self.modules_to_not_convert = [module for module in self.modules_to_not_convert if module is not None]

        model = replace_with_bnb_linear(
            model, modules_to_not_convert=self.modules_to_not_convert, quantization_config=self.quantization_config
        )
        model.config.quantization_config = self.quantization_config
        model.is_loaded_in_8bit = True

    @property
    def is_serializable(self):
        # Because we're mandating `bitsandbytes` 0.43.3.
        return True

    @property
    def is_trainable(self) -> bool:
        # Because we're mandating `bitsandbytes` 0.43.3.
        return True

    @property
    def is_compileable(self) -> bool:
        return True

    def _dequantize(self, model):
        from .utils import dequantize_and_replace

        model = dequantize_and_replace(
            model, self.modules_to_not_convert, quantization_config=self.quantization_config
        )
        return model