
    ukiL9                     ^   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d dlZd
dl 	 d dlmZ  ed      Z! e
d      Z"e#jI                  dd       G d d             Z%eddddee"e!f   de	d   de	d
   dee"e&e!e'dz  f   f   fd       Z(eddddee"e!f   de	d   de	d
   dee"e&e!e)e&e*e'f      dz  f   f   fd       Z(eddddee"e!f   de	d   de+dee"e&e!e)e'   dz  f   f   fd       Z(eddddee"e!f   de	d   de+dee"e&e!e)e)e&e*e'f         dz  f   f   fd       Z(dd
dde,de+fdZ( G d d       Z-e#jI                  d!       G d" d#             Z. G d$ d%      Z/y# e $ r dZY >w xY w)&    )CallableN)Literal	ParamSpecTypeVaroverload)stages)util)ir)arith)gpu)memref   )*)
mosaic_gpuTPT)frozenkw_onlyc                   8    e Zd ZU dZdZeed<   ddddedefdZy	)
CuptizCUPTI-based profiler.Tfinalizer   	aggregate
iterationsr   r   c                     t        t        j                  t        j                  f      st	        j
                         fd}|S )Nc                  8   t         t        d      t        j                   | i |       t         j                  }|j                          	 t              D cg c]
  } | i | }}|D ]  }t        j                  |        |d   }|j                  j                        }|s|d fS t        |      z  dk7  rt        d      t        |      z  }t        j                  ||gdz
  z        }	t        |	d         D ]2  \  }
\  }}t        d      D ]  }|	|   |
   d   |k7  st        d       4 r|	D cg c]  }t        d |D               }	}|t        |	      dk(  r|	d   fS |	fS c c}w # |j                  j                        }w xY wc c}w )Nz1CUPTI profiling is not supported on this platformr   zJThe number of kernel launches is not divisible by the number of iterationsr   z1Kernel names are not consistent across iterationsc              3   &   K   | ]	  }|d      yw)r   N ).0items     _/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/jax/experimental/mosaic/gpu/profiler.py	<genexpr>z1Cupti.measure.<locals>.wrapper.<locals>.<genexpr>_   s     ,DQ,s   )mosaic_gpu_libRuntimeErrorjaxblock_until_ready_mosaic_gpu_ext_cupti_initrange_cupti_get_timingsr   lenr	   
split_list	enumeratesum)argskwargsext_all_resultsrresultstimingskernels_per_iteriter_timings
kernel_idxkernel_nameir   fr   selfs                r!   wrapperzCupti.measure.<locals>.wrapper>   s   		NOO	At.v./**c	oo8383DEaq$)&)EE 	#A



"	#a.((7}	W
	"a	'
 	
 W3__
$%a8l +4LO*D T
&*&{Aq*% 	TA!_Z(+{:RSS	TT
 
<H
18C,G,,
 
 \):a)?l1oQQ\QQ9 F
 ((7&
s$   E7 E2+#E7 ?F2E7 7F)
isinstancer   WrappedCompiledr%   jit)r=   r<   r   r   r>   s   ```` r!   measurezCupti.measure8   s7     a&..&//:;
''!*a$RL N    N)	__name__
__module____qualname____doc__r   bool__annotations__intrC   r   rD   r!   r   r   1   s-     (D %)A,!,69,rD   r   .r   r<   r   r   returnc                     y Nr   r<   r   r   s      r!   rC   rC   f        rD   Fc                     y rN   r   rO   s      r!   rC   rC   o   rP   rD   c                     y rN   r   rO   s      r!   rC   rC   x   rP   rD   c                     y rN   r   rO   s      r!   rC   rC      rP   rD   c                b    |dk  rt        d|d      t               j                  | ||      S )a  Measures the GPU runtime of a function using CUPTI.

  ``measure`` is a higher-order function that wraps a function ``f`` to
  return GPU runtime in milliseconds, in addition to its regular outputs.

  Args:
    f: The function to measure.
    aggregate: Whether to report an aggregate runtime. When ``False`` (only
      supported by ``mode="cupti"``), the per-kernel timings are returned as a
      list of tuples ``(<kernel name>, <runtime in ms>)``.
    iterations: How many times to run the function. Only supported by
      ``mode="cupti"``. When greater than 1, the return type will become a list
      of measurements.

  Returns:
    A function that accepts the same inputs as ``f`` and returns
    ``(f_outputs, timings)``, where ``f_outputs`` are the outputs of ``f``,
    and ``timings`` is either a float or a list of tuples, depending on
    ``aggregate``. If no kernels are launched, ``timings`` is ``None``.

  Notes:
    `CUPTI (CUDA Profiling Tools Interface)
    <https://docs.nvidia.com/cupti/index.html>`_ is a high-accuracy profiling
    API used by Nsight Systems and Nsight Compute. The CUPTI API only allows a
    single subscriber, so ``measure`` cannot be used with other CUPTI-based
    tools like CUDA-GDB, Compute Sanitizer, Nsight Systems, or Nsight
    Compute.
  r   ziterations=z must be positiver   )
ValueErrorr   rC   rO   s      r!   rC   rC      s7    > !^

}$56
77	iJ	GGrD   c                   &   e Zd ZdZdZddedefdZdeedf   deedf   d	efd
Z	deedf   deedf   d	e
j                  fdZdeedf   deedf   d	e
j                  fdZdeedf   fdZdeedf   fdZded	efdZdeedf   deedf   fdZy)ProfilerSpecr   l        entries_per_warpgroup	dump_pathc                     || _         i | _        |dk(  r.t        j                  dt	        j
                               | _        y || _        y )NspongeTEST_UNDECLARED_OUTPUTS_DIR)rX   interned_namesosgetenvtempfile
gettempdirrY   )r=   rX   rY   s      r!   __init__zProfilerSpec.__init__   sA    !6D*,DHyy
')<)<)>dn !dnrD   grid.blockrL   c                     t        j                  |      t        z  rt        d      t        j                  |      t        j                  |      z  t        z  S )Nz.Block size is not a multiple of warpgroup size)mathprodWARPGROUP_SIZErU   r=   rc   rd   s      r!   _num_warpgroupszProfilerSpec._num_warpgroups   sB     yy.(GHH99T?TYYu--??rD   c                     t         j                  j                  | j                  ||      | j                  z  ft         j
                  j                  d            S )N    )r
   
MemRefTypegetrj   rX   IntegerTypeget_signlessri   s      r!   mlir_buffer_typezProfilerSpec.mlir_buffer_type   sK     ==			dE	*T-G-G	GI
##B' rD   c                     t        j                  | j                  ||      | j                  z  ft        j
                        S rN   )r%   ShapeDtypeStructrj   rX   jnpuint32ri   s      r!   jax_buffer_typezProfilerSpec.jax_buffer_type   s<     			dE	*T-G-G	GI

 rD   c                 V    | j                  d|      }t        || j                  z        S )Nr   )rj   rK   rX   )r=   rd   num_warpgroupss      r!   smem_i32_elementszProfilerSpec.smem_i32_elements   s*    ))"e4N~ : ::;;rD   c                 .    d}| j                  |      |z  S )N   )ry   )r=   rd   bytes_per_entrys      r!   
smem_byteszProfilerSpec.smem_bytes   s    O!!%(?::rD   namec                     | j                   j                  |d       x}|S t        | j                         x}| j                   |<   || j                  z  rt	        d      |S )NzAllocated too many names)r]   rn   r+   EXITr$   )r=   r~   name_ids      r!   intern_namezProfilerSpec.intern_name   sb    &&**466Cn*-d.A.A*BBGd!!$'344NrD   c                    t        j                  |      }t        j                  |      }| j	                  d|      }|j                  ||| j                        }|d   }|d   }	|d   }
|
dz   }t        j                  || j                  kD        rt        d      |ddd f   }|d d d d dd d	f   }t        j                  |j                  d
         dd d	   |
d   k  }|d d d d dd f   |d d d d d d
f   z
  j                  |d d d d dd f   d      }t        d|dz
        }| j                  j                         D ci c]  \  }}||
 }}}g }t        j                  ||      D ]5  \  }}|
||f   }d }|d	z  dk(  sJ |       |||f   }g }t!        d      }t#        d|d	      D ]  }||||f   }||||dz   f   }||}||z  }||d	z  |z  z  }|dk  r q|}d}|t$        j&                  z  r|t$        j&                  z  }d}||   } ||k\  r2||z
  dkD  r%t)        j*                  d|rdnd d|  d| d|        |dz   }|}|j-                  | |rdndt!        ||z         dz  dt/        |	||f         z   d|z   ||z  z   d        |s%|j-                  |       8 t1        |d       }t3        t4        j6                  j9                  |            }!t;        j<                  d|!d |      S c c}}w )!Nr   ).r   ).r   ).      z*Insufficient space to capture a full trace.r   r   ).N   )whereinitialr   z-infTF
   z6Profiler clock went significantly backwards for event startendz `z`: z -> BEg     @@)r~   phtspidtidc                     | d   d   S )Nr   r   r   )xs    r!   <lambda>z#ProfilerSpec.dump.<locals>.<lambda>'  s    !A$t* rD   )keyns)displayTimeUnittraceEvents)npasarrayrf   rg   rj   reshaperX   anyr$   arangeshapeminmaxr]   itemsndindexfloatr)   rW   r   warningswarnappendrK   sortedlist	itertoolschainfrom_iterablejsondump)"r=   bufferr<   rc   rd   
num_blockswarpgroups_per_blockentriesstart_timessm_idstraces_usedentries_usedtracestime_eventsvalid_times_maskprofiling_overheadkvuninternevents	block_idxwg_idxvalid_entrieslocal_clock_offset
start_timeblock_events	last_timer;   tagtimer   beginr~   flat_eventss"                                     r!   r   zProfilerSpec.dump   s   ZZF4J//E:nn($*D*DG &/KV_F&/K?L	vvlT7778EFFS!"WF Aqt!t$Kyyb!1214a48;y;QQ%aABh/+aCRCi2HHMMq!QRx(" N  Q 2Q 67!%!4!4!:!:!<=A1=H=FZZ
4HI (&	6!)V"34mQ!#2]2#y&01jl-iQq) !&!Y)*iQ./%#
""a---!8
\&&&l///'% "MM$G%04&I; G6
 Q$	#C
T)*S0s6)V"3455v: 4y @@
 	1!&@ 
--
%Q(&R F 45Fy44V<=K99kJANN[ >s   K7N)r[   )rE   rF   rG   ENTERr   rK   strrb   tuplerj   r
   Typerq   rv   ry   r}   r   r   r   rD   r!   rW   rW      s   
%	$!C !C !@S/@*/S/@
@S/*/S/	wwS/*/S/	ww<U38_ <;eCHo ;c c EO%S/ EO%S/ EOrD   rW   )r   c                       e Zd ZU dZej
                  ed<   ej
                  ed<   ej
                  ed<   ej
                  ed<   ej
                  ed<   y)_ProfilerCtxzSet of IR values referenced by the profiler logic.

  The profiler logic is implemented using `CustomPrimitiveOp` which requires
  that all IR values referenced in its body be passed as operands to the op.
  r   is_profiling_threadsmem_buffergmem_bufferoffsetN)rE   rF   rG   rH   r
   ValuerJ   r   rD   r!   r   r   ,  s:     
/xxxxxx
((rD   r   c                       e Zd Zdedej
                  dej
                  defdZej                  d        Z
ej                  defd       Zd	eed
f   deed
f   fdZy)OnDeviceProfilerspecr   r   wrap_in_custom_primitivec           
      >   t         j                  j                  d      }t         j                  j	                         }|| _        |j                  | _        || _        t        d      }t        j                  |t        j                  |t        | j                  |                  }t        |t        || j                              }t        j                   t        j"                  j$                  t        j&                  t)               t        t*        |            t        d|            }	t-        j.                  t         j0                  j	                  d|      g g       }
t-        j2                  t        d|      |
g        t5        t7        d      |	|||
      | _        y )Nrl   Fsyncr   r   low)r   r   r   r   r   )r
   ro   rp   	IndexTypern   r   rX   entries_per_wgr   warpgroup_idxr   
index_castmulicmemref_slicedscmpiCmpIPredicateeqremui
thread_idxrh   r   allocarm   storer   globaltimerctx)r=   r   r   r   r   i32indexr   	wg_offsetr   r   s              r!   rb   zOnDeviceProfiler.__init__=  s2    ..
%
%b
)CLLEDI44D$<D!&F  uzz&!D$7$7"=>I {By$:M:M,NOK**JL!NC"89	!S	 ]]2==,,R7R@F
LL1efb)% /DHrD   c              #   D  K   | j                   s| j                   y dt        t        j                     fd}t
        j                  g  || j                        g t        j                  j                  g       gg       }|j                  D cg c]  }|j                   }} |j                  j                  j                  | }t        j                  |      5  t        |j                     t
        j#                  g        d d d        y c c}w # 1 sw Y   y xY ww)NrL   c                 z    t         j                  |       D cg c]  }t        | |j                         c}S c c}w rN   )dataclassesfieldsgetattrr~   )objfields     r!   r   z.OnDeviceProfiler._profiler_ctx.<locals>.fieldsd  s,    4?4F4Fs4KL5gc5::&LLLs   8)result	operands_
in_layoutsin_transformsout_layouts)r   r   r   r
   r   dialectCustomPrimitiveOp	ArrayAttrrn   r   typebodyblocksr   InsertionPointr   	argumentsreturn_)r=   r   opargargs_tyrd   s         r!   _profiler_ctxzOnDeviceProfiler._profiler_ctx^  s     ((HHnMtBHH~ M 
	"	""||''+, 
# 
B $&<<0Csxx0G0!BGGNN!!7+E			5	! %//**oob  1 s*   BD D":D *DD DD r~   c              #   4   K   t         j                  j                  d      t         j                  j	                          j
                  j                  |       fd} |t        j                         d   |t        j                         y w)Nrl   c           
      d   j                         5 }t        j                  |j                  g       }t	        |j
                  |      }t        |d      }t        j                  j                  d      }t        j                  ||      }t        j                  t        j                  j                  d      |j                  |t!        | 
z        gddd       t#        j$                  |t!        d		            }t        j&                  ||j                  g        d d d        y # 1 sw Y   y xY w)
Nr   )memory_space@   z
!llvm.voidzB
            @$0 st.shared.v2.u32 [$1], {$2, %clock};
            zb,l,rT)has_side_effectsr   )r	  r   loadr   r   r   
memref_ptrr
   ro   rp   llvmptrtoint
inline_asmr   parser   r   r   addir   )modifierr   r   base_refbase_ptri64	base_addr
new_offsetr   r   r   r=   s           r!   r   z&OnDeviceProfiler.record.<locals>.storey  s     13 SZZ,8hQ7nn))"-MM#x0	GGMM,'$$i8g3Es1KL ! 	 	
 ZZ!U4
ZR0'1 1 1s   DD&&D/)
r
   ro   rp   r   rn   r   r   rW   r   r   )r=   r~   r   r   r   r   s   `  @@@r!   recordzOnDeviceProfiler.recordt  sg     
..
%
%b
)CLLEii##D)G1, 
,

		,

s   BBrc   .rd   c           
         t         j                  j                         }t         j                  j	                  d      }| j                         5 }t        j                          t        d|      }t        j                  D ]R  }t        j                  t        j                  |t        j                  |            t        j                  |            }T t        d      }t!        j"                  |      t$        z  }	t        j                  t        j                  |t        |	|            t        j&                  ||            }
t        j                  |
t        | j(                  |            }t+        |j,                  t/        || j(                              }t1        |j2                        5  t5        j6                  |j8                  |t        d|      g       t5        j6                  t;               |t        d|      g       t        j&                  |t5        j<                  |j>                  g             }t5        j6                  ||t        d|      g       t@        j=                  t         jB                  j                  | j(                  dz
  f|      |jD                  t        d|      g      }t@        j7                  ||t        d|      g       d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nrl   r   Fr   r   r   r   )#r
   r   rn   ro   rp   r	  r   barrierr   	Dimensionr   r  r   grid_dimblock_idr   rf   rg   rh   r   r   r   r   r   whenr   r   r   r   smidr  r   vector
VectorTyper   )r=   rc   rd   r   r   r   r   dimr   wg_per_blockglobal_wg_idxstart_offsetwg_gmem_buffer
num_tracesr   s                  r!   r   zOnDeviceProfiler.finalize  s(   LLE
..
%
%b
)C				 <	kkmAu+i 
#JJJJy#,,s"34cll36G
	
 %(fYYu%7ljj
**Y, 6
7


5&
)m ZZq1D1De/LMl#
//2lD,?,?@n ''( 
<SYY1e>TV^a5k];%%c6;;szz2+FG
Z!Au+?MMt22Q68#>OOq%[M

 	V^a5k];
<%< <$
< 
<%< <s&   EK1-D'K%K1%K.	*K11K:N)rE   rF   rG   rW   r
   r   rI   rb   
contextlibcontextmanagerr	  r   r  r   rK   r   r   rD   r!   r   r   ;  s     88 88	
 !%B  *   < <5c?  <5c?  <rD   r   )0collections.abcr   r,  r   r   rf   r^   r`   typingr   r   r   r   r   r%   jax._srcr   r	   	jax.numpynumpyrt   jaxlib.mlirr
   jaxlib.mlir.dialectsr   r   r   r   utilsjax._src.libr   r#   ImportErrorr   r   r   	dataclassr   r   r   rC   r   r   rK   rI   rW   r   r   r   rD   r!   <module>r9     s    %     	  8 8  
     & $ '  7 CLcN dD12 2 22h 
  # 	1~ t} 
	
 aq%$,''( 
 
 !$ 	1~ u~ 
	
 aq$uS%Z01D8899: 
 
  #	1~ t} 	
 aq$u+,,--. 
 
 !$	1~ u~ 	
 aq$tE#u*$567$>>??@ 
 !A!H!H.1!HHzO zOz d#  $x< x<g  .s   'F! !F,+F,