
    bi6                         d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZ e j                  d        Zd Zd Zd Z	 dd	Zd
 Zy)    N)cdiv)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                      	 t        dg      d   dz  S # t        $ rG dd l} | j                          | j	                  d      }| j                  || j                        dz  cY S w xY w)Nzclocks.max.smr   g     @@)r   FileNotFoundErrorpynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMaxClockInfoNVML_CLOCK_SM)r   handles     `/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/bitsandbytes/triton/matmul_perf_model.pyget_clock_rate_in_khzr      sm    To&'*S00 T2215//8L8LMPSSSTs    AA$#A$c                     |t        |d      z  }t        j                  j                  j	                  |       d   dz  }t        ||      |z  t        |t               |       z  }|S z!return compute throughput in TOPS   multiprocessor_count)minr   activeutilsget_device_propertiesr   r   devicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflopss          r   get_tensorcore_tflopsr#      sm    SA..K==&&<<VDE[\_``LL+&
	
#E+@+BF
K	L 
 M    c                     |t        |d      z  }t        j                  j                  j	                  |       d   dz  }t        ||      |z  t        |t               |       z  }|S r   )r   r   r   r   r   r   r   r   s          r   get_simd_tflopsr&   +   sh    SA..K==&&<<VDE[\_``LL+&58KEShSjlr8ss  Mr$   c                     t         j                  j                  |       }|d   dk  r!|t         j                  k(  rt	        | |||      S t        | |||      S )Nr      )torchcudaget_device_capabilityfloat32r&   r#   )r   r   r   r   
capabilitys        r   
get_tflopsr.   5   sO    11&9J!}qUemm3vxEBB 9eDDr$   c                    t         j                  j                         }|j                  }|j	                         }t        ||      }t        ||	      }|}||z  |z  }t        ||      t        ||	      }}d|z  |z  |z  dz  }t        ||| |      }||z  }t        j                  j                  j                  |      d   }t        d||z        }t        d|dz        }t        t        d|dz
  dz        d      }t        |      |dz  |d	z  z   z  }|d
z  }||z  |z  dd|dz
  z  z   z  }||z  |z  dz  |dz
  z  }||z  |z  dd|dz
  z  z   z  } ||z  |z  dz  |dz
  z  }!|| z   dz  }"||!z   dz  }#|"|z  |#|z  z   }$|dz  }%||z  |z  |z  dz  }&|dk(  r|&|%z  }'n|%}(|&|(z  }'||z  dz  dz  |%z  })|'|)z  }'t        ||$      |'z   }*|rt        d|* d| d|$ d|' d|dz   d       |*S )zGreturn estimated running time in ms
    = max(compute, loading) + store   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r)   r*   current_devicer   element_sizer   maxr.   r   r   r   r   r   r   print)+r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_mss+                                              r   estimate_matmul_timera   <   s   & ZZ&&(FGGE^^FQ IQ II9$y0H q'?C7OqA A	A!34Ifh	59DT!J ]]  66v>?UVF1h/0q(R-0s1x"}&BCQGF#';d'BEY\`E`'`aGaKEa%&.Ay1}(=$=>KA$	A6Ia%&.Ay1}(=$=>KA$	A6I+<JI%+6H7"X%55G }Hq56>G+{;L!|(*	)+a%!){+h6G
G,x7M=/);J< H$I%5hZ @.45Q8	

 r$   c                    t         j                  j                         }t         j                  j                         }|d   j	                         }|d   j
                  }g }| D ]}  }|j                  }	|	d   |	d   |	d   |j                  f\  }
}}}t        j                  j                  j                  |      d   }|
|z   |z  |z  |z  }||k  sm|j                  |        |} |t         j                  t         j                  fvr"| D cg c]  }|j                  d   dk(  s| } }i }| D ]g  }|j                  }	|	d   |	d   |	d   |	d   |j                  |j                  f\  }
}}}}}|
||||f}||v r||   j                  ||f       `||fg||<   i g }|j!                         D ]  \  }}|\  }
}}}}|d   d	k\  r[|
|z  |z  d
z  }|t#        d|      z  d	z  }d}||z  t%        j&                  d|fd      }|D ]  }|j                  |d           q|d   d   }d|_        |j                  |        |S c c}w )Nr;   rA   rB   rC   max_shared_memrD   r1   r   r(   i   r   i,  r0   c                 R    | d   z
  dk  rdt        | d   z
        z   S | d   z
  S )Nr1   r   
   )abs)xoptimal_num_stagess    r   <lambda>z$early_config_prune.<locals>.<lambda>   sG    !11Q6 QqT$6677  1 22 r$   )key)r)   r*   r6   r+   r7   r   rF   r:   r   r   r   r   appendfloat16r,   r   itemsr   heapq	nsmallest)configs
named_argsrF   r   r-   rG   r   pruned_configsconfigkwrA   rB   rC   r:   max_shared_memoryrequired_shared_memoryconfigs_maprD   r   rj   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configrh   s                               @r   early_config_pruner      s   ZZ&&(F113J_))+FsO!!E N *]]yMyMyM	1
-': #MM//EEfMN^_")G"3w!>!Kf!T!%66!!&)* G U]]EMM22(/Qf6==3Kq3P6QQ K 6]]yMyMyMyME
A'7Iz '9=+##VZ$89!' 45K6" N!!# 11895'7Ia=AW$w.+>DAy 11A5J N!/*!< ooG  ,%%ad+, aDGM'(M$!!-0516 c Rs   H?$H?)F)	functoolsrn   r)   tritonr   triton.runtimer   triton.testingr   r   r   r   	lru_cacher   r#   r&   r.   ra   r    r$   r   <module>r      s^        !  T T	E* IXKr$   