
    biT                     @   d dl Z d dlZd dlmZ ej
                  dej                  dej                  fd       Zej
                  dej                  dej                  fd       Zd Z	ej
                  dej                  fd       Z
ej
                  d	        Zej
                  dej                  fd
       Zej
                  d        Zej
                  dej                  fd       Zej
                  dej                  dej                  fd       Zej
                  dej                  dej                  fd       Zej
                  dej                  dej                  fd       Zde j$                  de j$                  dedede j*                  de j$                  ddfdZde j$                  de j$                  dede j$                  de j*                  de j$                  ddfdZej
                  dej                  dej                  dej                  fd       Zy)    N
BLOCK_SIZESPLIT_NUM_BLOCKSc                    |dz  }t        j                  d      |z  }t        j                  d||z        }||z  |z   }	|	|k  }
t        j                  | |	z   |
d      }t        j                  |||f      }t        j
                  t        j                  |      d      }t        j                  ||z   t        j                  d|      z   |       ||d d d f   z  }t        j                  |dd      }t        j                  |dk  d	d      }t        j                  |      }t        j                  |d
kD  t        j                  |dkD  t        j                  |dkD  dd      t        j                  |dkD  dd            t        j                  |dkD  t        j                  |dkD  dd      t        j                  |dkD  dd                  }||z  j                  t         j                        }|j	                  ||dz  df      }|j                         \  }}|dz  |dz  z  }t        j                  |||z  f      }||z  dz  t        j                  d||z        z   }||dz  k  }t        j                  ||z   ||       y )N   r           maskother   axis            ?   g&>?g?gD^Ř?   gyCuΪ?      g      ?g%?      gْvWUe?   r	   tl
program_idarangeloadreshapemaxabsstoreclampwheretouint8split)A_ptr
absmax_ptrout_ptr
n_elementsr   r   PAIRED_SPLIT_NUM_BLOCKSblock_start_idx
thread_idxoffsetsr	   A
A_reshapedabsmaxA_normalizedsignA_absfresult	quantizedleftrightpackedpacked_flatout_offsetsout_masks                            d/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/backends/triton/kernels_4bit.pyquantize_fp4_blockwise_kernelr=      sY    -=q,@mmA&)@@O15
BCJ
*Z7GZD
d#6A A 7DEJ VVBFF:&Q/FHHZ/)BIIa9P,QQSYZq$w/L88L$4L88L1$ff5DVVL!FXX
Xrxx(:E5I288TZ]fTfhmotKu	
 	YHHVj(&&9HHVj(&&9	

F $""288,I!!#:J!OQ"OPI//#KD%QY%#+&F**Vj3C&C%EFK!J.!3biiCSV`C`6aaKZ1_,HHHW{"Kh?    c                 $   |dz  }t        j                  d      |z  }t        j                  d||z        }||z  |z   }	|	|k  }
t        j                  | |	z   |
d      }t        j                  |||f      }t        j
                  t        j                  |      d      }t        j                  ||z   t        j                  d|      z   |       ||d d d f   z  }t        j                  |dd      }t        j                  |d	kD  t        j                  |d
kD  t        j                  |dkD  t        j                  |dkD  dd      t        j                  |dkD  dd            t        j                  |dkD  t        j                  |dkD  dd      t        j                  |dkD  dd                  t        j                  |dkD  t        j                  |dkD  t        j                  |dkD  dd      t        j                  |dkD  dd             t        j                  |d!kD  t        j                  |d"kD  d#d      t        j                  |d$kD  dd                        }|j                  t         j                        }|j	                  ||dz  df      }|j                         \  }}|d z  |dz  z  }t        j                  |||z  f      }||z  dz  t        j                  d||z        z   }||dz  k  }t        j                  ||z   ||%       y )&Nr   r   r   r   r   r   r   r   g   __?g    ?g   ൑?g   0;?r      g   p?      g    ?g   Z?   
   g   Pɾ?	   r   g   Nտg   xg    Or   r   g   οr   r   g   Ng    pݿr   g   #r   r   )r&   r'   r(   r)   r   r   r*   r+   r,   r-   r	   r.   r/   r0   r1   r4   r5   r6   r7   r8   r9   r:   r;   s                          r<   quantize_nf4_blockwise_kernelrF   U   s    -=q,@mmA&)@@O15
BCJ
*Z7GZD
d#6A A 7DEJ VVBFF:&Q/FHHZ/)BIIa9P,QQSYZq$w/L88L$4LXX**
--HH11(::FFK(::FFK
 HH11(::FFK(::FFK	
 	//HH33(==vvN(<<ffM
 HH22(;;VVL(;;VVL	
F: 		"((#I!!#:J!OQ"OPI//#KD%QY%#+&F**Vj3C&C%EFK!J.!3biiCSV`C`6aaKZ1_,HHHW{"Kh?r>   c                     d}t        j                  ||      f}|dk(  rt        |   | |||||       ||fS t        |   | |||||       ||fS )Nr   fp4)r&   r'   r(   r)   r   r   )tritoncdivr=   rF   )	r.   	blocksize
quant_typeblocksr0   num_elementsquantized_outsplit_num_blocksgrids	            r<   quantize_4bit_blockwise_tritonrR      s~    KK 013DU%d+!# -	
" &   	&d+!# -	
 &  r>   QUANT_BLOCKc                    |dz  }||k  }| dz  }| dz	  }	||z  }
t        j                  ||
z   |dd      }t        j                  ||	z   d      }t        j                  ||z   d      }||z  }||z  }t        j                  ||      }|S )Nr   r   r   r   
evict_lastr	   r
   eviction_policyrW   )r   r   
interleave)ar-   	quant_ptrr'   n_elemsrS   PAIRED_QUANT_BLOCKr	   higherlowerabs_offsetsr0   lower_4higher_4mul_highmul_lowout_dqs                    r<   dequant_4bit_body_utilrf      s    '2a'7WDWFFE//KWWZ+-DUabF ggi%'FGwwy6)<HH& HG]]7H-FMr>   c           	         t        j                  | dz  dk(  dd      }| dz  dk(  }| dz  dk(  }| dz  dk(  }t        j                  |t        j                  |dd      t        j                  |d	d
            }t        j                  |t        j                  |dd      t        j                  |dd            }t        j                  |||      }||z  |z  S )Nr   r   r   r   r   r   g      ?gK}\UU?g      ?gQUU?gvWUU?gDpTUu?r   r   r"   )	valr0   r2   	third_bit
second_bit	first_bitbranch1branch2outs	            r<   dequantize_fp4_treerp      s    
 88S6\f,dC8Dv&(I,6)Jv&(Ihh
D*-
C,G
 hh
C,
J,G
 ((9gw
/C:r>   c                     |dz  }||k  }| dz  }| dz	  }||z  }	t        j                  ||	z   |dd      }
t        ||
      }t        ||
      }t        j                  ||      }|S Nr   r   r   r   rU   rV   )r   r   rp   rY   rZ   r-   r'   r\   rS   r]   r	   r^   r_   r`   r0   rc   rd   re   s                 r<   dequant_fp4_body_utilrt      s{    '2a'7WDWFFE//KWWZ+-DUabF"662H!%0G]]7H-FMr>   c                    | dz  dk(  }| dz  dk(  }| dz  dk(  }| dz  dk(  }t        j                  |t        j                  |t        j                  |dd      t        j                  |dd            t        j                  |t        j                  |d	d
      t        j                  |dd                  }t        j                  |t        j                  |t        j                  |dd      t        j                  |dd            t        j                  |t        j                  |dd      t        j                  |dd                  }t        j                  |||      S )Nr   r   r   r   r   g   `v"?g   ` ?g    4?g   @g?g   ?g   `\?g   __?r   g    Og   ০ǿg   I4ҿg    TFٿg    fg    6Gr   rh   )ri   cond0cond1cond2cond3
branch_pos
branch_negs          r<   dequantize_nf4_treer|      sD    6\f$E6\f$E6\f$E6\f$E 
HHUC!34HHU.0CD	

 	HHU/1DEHHU/1DE	
J 
HHUC!56HHU02FG	

 	HHU02EFHHU/6	
J 88E:z22r>   c                     |dz  }||k  }| dz  }| dz	  }||z  }	t        j                  ||	z   |dd      }
t        |      |
z  }t        |      |
z  }t        j                  ||      }|S rr   )r   r   r|   rY   rs   s                 r<   dequant_nf4_body_utilr~     s    '2a'7WDWFFE//KWWZ+-DUabF"6*V3H!%(61G]]7H-FMr>   
SPLIT_SIZEc                 Z   t        j                  d      }||z  }|t        j                  d|      z   }	|	|k  }
t        j                  | |	z   |
d      }t	        ||	||||      }||z  dz  }|t        j                  d|dz        z   }||dz  k  }
t        j
                  ||z   ||
       y )Nr   r   evict_firstrX   )rZ   r-   r[   r'   r\   rS   r   )r   r   r   r   rf   r    )a_ptrc_ptrr[   r'   num_paired_elementsrS   r   pidblock_startr-   r	   rZ   re   out_block_startoffss                  r<   dequant_4bit_kernelr   K  s     --Q
C
"KBIIa44G((D
}EA#
#F J&*ORYYq*q.99D%))DHHUT\64(r>   c                 X   t        j                  d      }||z  }|t        j                  d|      z   }||k  }	t        j                  | |z   |	d      }
t	        |
||||      }||z  dz  }|t        j                  d|dz        z   }||dz  k  }	t        j
                  ||z   ||	       y Nr   r   r   rX   )rZ   r-   r'   r\   rS   r   )r   r   r   r   rt   r    r   r   r'   r   rS   r   r   r   r-   r	   rZ   re   r   r   s                 r<   dequant_fp4_kernelr   p       --Q
C
"KBIIa44G((D
}EA"
#F J&*ORYYq*q.99D%))DHHUT\64(r>   c                 X   t        j                  d      }||z  }|t        j                  d|      z   }||k  }	t        j                  | |z   |	d      }
t	        |
||||      }||z  dz  }|t        j                  d|dz        z   }||dz  k  }	t        j
                  ||z   ||	       y r   )r   r   r   r   r~   r    r   s                 r<   dequant_nf4_kernelr     r   r>   r.   r0   rK   rL   dtypero   returnc                     | j                         }d}t        j                  ||      f}|dk(  rt        |   | |||||       y t	        |   | |||||       y )N   rH   )numelrI   rJ   r   r   )	r.   r0   rK   rL   r   ro   number_of_paired_elementsr   rQ   s	            r<   dequantize_4bit_implr     se     !"	 JKK1:>@DU4 C1JIWab4 C1JIWabr>   codec           	      ~    | j                         }d}t        j                  ||      f}t        |   | ||||||       y )Nr   )r   rI   rJ   r   )	r.   r0   rK   r   r   ro   r   r   rQ   s	            r<   !dequantize_4bit_impl_passing_coder     sF     !"	 JKK1:>@DadF4MyZder>   	CODE_SIZEc                    |dz  }t        j                  d      |z  }	t        j                  d||z        }
|	|z  |
z   }||k  }t        j                  | |z   |d      }t        j                  |||f      }t        j
                  t        j                  |      d      }t        j                  ||	z   t        j                  d|      z   |       ||d d d f   z  }t        j                  |dd      }t        j                  ||ft         j                  	      }t        j                  ||f|dz
  t         j                  	      }t        d
      D ]U  }||z   dz  }t        j                  ||z         }||kD  }t        j                  |||      }t        j                  |||      }W t        j                  ||z         }t        j                  ||z         }t        j                  ||z
        }t        j                  ||z
        }t        j                  ||k  ||      j                  t         j                        }|j	                  ||dz  df      }|j                  t         j                  d      }|j!                         \  }}|d
z  |dz  z  }t        j                  |||z  f      }|	|z  dz  t        j                  d||z        z   } | |dz  k  }!t        j                  || z   ||!       y )Nr   r   r   r   r   r   r   r   )r   r   T)bitcastr   r   )r   r   r   r   r   r   r   r    r!   zerosint32fullranger"   r#   r$   r%   )"r&   code_ptrr'   r(   r)   r   r   r   r*   r+   r,   r-   r	   r.   r/   r0   r1   lower_pivotupper_pivot_pivotri   	is_higher	lower_val	upper_val
lower_dist
upper_distr5   r6   r7   r8   r9   r:   r;   s"                                     r<   quantize_4bit_blockwise_kernelr     s    -=q,@mmA&)@@O15
BCJ
*Z7GZD
d#6A A 7DEJ VVBFF:&Q/FHHZ/)BIIa9P,QQSYZq$w/L88L$4L((3Z@QK''2J?QVXV^V^_K1X >{*q0ggh&' 3&	hhy%=hhy+u=> ;./I;./Iy01Jy01Jz1;LOOPRPXPXYI!!#:J!OQ"OPIRXXt4I//#KD%QY%#+&F **Vj3C&C%EFK!J.!3biiCSV`C`6aaKZ1_,HHHW{"Kh?r>   )torchrI   triton.languagelanguager   jit	constexprr=   rF   rR   rf   rp   rt   r|   r~   r   r   r   Tensorintstrr   r   r   r    r>   r<   <module>r      s      2@
 2@ ll2@ 2@B B@
 B@ llB@ B@J!2 TVT`T`  *  .    $3 $3N   ` )KM<<)egeqeq) )H )@B)Z\ZfZf) )F )@B)Z\ZfZf) )0c||cLLc c 	c
 ;;c 
c 
c0f||fLLf f ,,	f
 ;;f 
f 
fP :@ :@ ||:@ ll:@ :@r>   