
    bi                     $   d dl Z d dlZd dlmZ ej
                  dej                  dej                  fd       Z	 	 	 dde j                  de j                  de j                  de	d	e j                  d
e j                  fdZej
                  dej                  dej                  dej                  fd       ZddZej
                  dej                  dej                  dej                  fd       Zej
                  dej                  fd       Zy)    NQUANT_BLOCK
SPLIT_SIZEc                     t        j                  d      }||z  }|t        j                  d|      z   }	|	|k  }
t        | |	|||
|      }t        j                  ||	z   ||
       y )Nr   axis)tl
program_idarange"dequant_8bit_blockwise_kernel_utilstore)a_ptrout_ptrcode_ptr
absmax_ptrnr   r   pidblock_startoffsetsmaskout_dqs               j/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/backends/triton/kernels_8bit_quant.pydequant_8bit_kernelr      se     --Q
C
"KBIIa44GQ;D/w*VZ\ghFHHWw-    aabsmaxquant_state_codequant_blocksizedtypeoutc           	          | j                         }|/|t        d      t        j                  | || j                        }d}t        j                  ||      f}t        |   | ||||||       |S )Nz'If out is None, dtype must be specified)r   device   )numel
ValueErrortorch
empty_liker!   tritoncdivr   )	r   r   r   r   r   r   r   r   grids	            r   dequant_8bit_blockwiser*   -   s~     	
	A
{=FGGqahh?JKK:&(D		 Jr   
BLOCK_SIZE	CODE_SIZESPLIT_NUM_BLOCKSc                 p   t        j                  d      |z  }t        j                  d||z        }	||z  |	z   }
|
|k  }t        j                  | |
z   |d      }t	        |||||      \  }}t        j
                  ||z   t        j                  d|      z   |       t        j
                  ||
z   ||       y )Nr           )r   other)r   )r   r	   r
   load#quantize_8bit_blockwise_kernel_utilr   )A_ptrr   r   r   
n_elementsr+   r,   r-   block_start_idx
thread_idxr   r   A	quantizedr   s                  r   quantize_8bit_blockwise_kernelr9   S   s     mmA&)99O1.;<J
*Z7GZD
d#6A;AxT^`pqIvHHZ/)BIIa9I,JJFSHHWw	5r   c           
         | j                         }|| z   }|-t        j                  |f| j                  | j                        }|3t        j
                  | j                         t        j                        }d}t        j                  ||      f}t        |   | ||||||j                         |       |j                  | j                        }||fS )N)r!   r   r      )r3   r   r   r   r4   r+   r,   r-   )r#   r%   emptyr!   r   r&   flattenuint8r'   r(   r9   reshapeshape)	r7   code	blocksizer   r   r   blockssplit_num_blocksr)   s	            r   quantize_blockwise_tritonrF   k   s    		AYJF~fYqxxqwwG
{qyy{%++>KK 013D"4(**,) ++agg
C;r   N_PER_THc                    t        j                  | ||f      }t        j                  t        j                  |      d      }||d d d f   z  }t        j                  |dd      }t        j
                  ||ft         j                        }t        j                  ||f|dz
  t         j                        }	t        d      D ]U  }
||	z   dz  }t        j                  ||z         }||kD  }t        j                  |||      }t        j                  ||	|      }	W t        j                  ||z         }t        j                  ||	z         }t        j                  ||z
        }t        j                  ||z
        }t        j                  ||k  ||	      j                  t         j                        }t        j                  |||z  f      }||fS )Nr<   r   g      g      ?r;         )r   r@   maxabsclampzerosint32fullranger1   wheretor?   )r   r   r,   r+   rG   
a_reshapedr   a_normalizedlower_pivotupper_pivot_pivotval	is_higher	lower_val	upper_val
lower_dist
upper_distr8   quantized_flats                       r   r2   r2      s    A*56J VVBFF:&Q/Fq$w/L88L$4L((Hj1BK''8Z0)a-rxxPK 1X >{*q0ggh&' 3&	hhy%=hhy+u=> ;./I;./Iy01Jy01Jz1;LOOPRPXPXYI ZZ	J,A+CDN6!!r   c                     t        j                  | |z   |d      j                  t         j                        }t        j                  ||z   |      }||z  }t        j                  ||z   |dd      }	||	z  }
|
S )Nr   )r0   r/   
evict_last)r   r0   eviction_policy)r   r1   rS   r?   )r   r   r   r   r   r+   r   scaled_int8absmax_offsetsr   r   s              r   r   r      sp     	Q/22288<A''(Q,-K
*NWWZ.0t3XdeF6!FMr   )@   NN)NN)r%   r'   triton.languagelanguager   jit	constexprr   Tensorintr   r*   r9   rF   r2   r    r   r   <module>rn      sO     . . . . .* ||LL ll 	
 ;; 
L 6 6 ||6 ll6 6.: '" ||'" 	'"
 ll'" '"T   r   