
    biQ              	       F   d dl Z d dlmZ  e       sd Zde j                  fdZyd dlZd dlmZ	  ej                   ej                  ddid	       ej                  dd
id      gdg      ej                  de	j                  fd              Zde j                  fdZ ej                   ej                  ddddd	       ej                  ddddd	      gddg      ej                  de	j                  de	j                  de	j                  fd              Zd Zy)    N)is_triton_availablec                      y N )inputs    ^/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/triton/quantize_global.pyquantize_global_transposer	              xc                      y r   r   )r   s    r   quantize_globalr   
   r
   r   
BLOCK_SIZEi      )	num_warpsi      )
num_stages
n_elements)configskeyc                 T   t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  |      }
t         j                  j                  d|	|
z  z        }t        j                  ||z   ||       y )Nr   )axismask     _@)tl
program_idarangeload	libdevicellrintstore)x_ptrabsmax_inv_ptr
output_ptrr   r   pidblock_startoffsetsr   r   
absmax_invoutputs               r   _quantize_globalr+      s     mm#J&		!Z 88#GGEGO$/WW^,
$$Ua*n%=>
g%vD9r   c                 L   | j                         j                         j                  d      }d|z  }t        j                  | j
                  dt        j                  d}| j                  r|j                  sJ |j                         fd}t        |   | ||       ||fS )Nr         ?cudadevicedtypec                 8    t        j                  | d         fS )Nr   tritoncdiv)metar   s    r   <lambda>z!quantize_global.<locals>.<lambda>/   s    V[[T,5GHJ r   )
absmax	unsqueezetorchemptyshapeint8is_cudanumelr+   )r   absmaxr)   r*   gridr   s        @r   r   r   )   s    ((+6\
aggfEJJGyyV^^++\\^
Jq*fjAv~r         )BLOCK_MBLOCK_NGROUP_MMNrE   rF   rG   c                 >   t        j                  d      }||	z   dz
  |	z  }||
z   dz
  |
z  }||z  }||z  }t        |||z  z
  |      }||z  ||z  z   }||z  |z  }||	z  t        j                  d|	      z   }||
z  t        j                  d|
      z   }| |d d d f   |z  |d d d f   |z  z   z   } ||k  d d d f   ||k  d d d f   z  }t        j                  | |      }t        j                  |      }||	z  t        j                  d|	      z   }||
z  t        j                  d|
      z   }||d d d f   |z  |d d d f   |z  z   z   }||k  d d d f   ||k  d d d f   z  }t         j
                  j                  d||z  z        }t        j                  |||       y )Nr   r   r   r   )r   r   minr   r   r    r!   r"   )Ar$   B	stride_am	stride_an	stride_bn	stride_bmrH   rI   rE   rF   rG   r&   grid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnr   ar)   r*   s                             r   _quantize_global_transposer\   4   s   . mmAg+/g-g+/g-& %<(W"44g>
7"cJ&67u+W_ryyG44W_ryyG44AtGy(2dAg;+BBCQ4 BFD!G#44GGAD!WW^,
 W_ryyG44W_ryyG44AtGy(2dAg;+BBCQ4 BFD!G#44$$Ua*n%=>
F&r   c                    | j                         j                         j                  d      }d|z  }| j                  \  t	        j
                  dt        j                        }|j                  d      k(  r|j                  d      k(  sJ | j                  d      dk(  s| j                  d      dk(  sJ |j                  d      dk(  s|j                  d      dk(  sJ fd}t        |   | ||| j                  d      | j                  d      |j                  d      |j                  d      	       ||fS )Nr   r-   r.   r/   r   c                 l    t        j                  | d         t        j                  | d         z  fS )NrE   rF   r3   )METArH   rI   s    r   r7   z+quantize_global_transpose.<locals>.<lambda>p   s.    V[[DO<v{{1dS\o?^^` r   )
r8   r9   r:   r=   r;   r<   r>   sizestrider\   )r   rA   r)   outrB   rH   rI   s        @@r   r	   r	   f   s   ",,Q/6\
{{1kk!QvUZZ@xx{aCHHQK1$444||A!#u||A!';;;zz!}!SZZ]a%777`"4(LLOLLOJJqMJJqM
	
 F{r   )r;    bitsandbytes.triton.triton_utilsr   r	   Tensorr   r4   triton.languagelanguager   autotuneConfigjit	constexprr+   r\   r   r   r   <module>rk      sJ    @5<<    V__FMM<.!<FMM<.1=
 N ZZ:
 LL: : 5<<  V__FMMccaHTUVFMMccaHTUV

 #J ZZ'' '' '' '' ''Rr   