
    bi\                         d dl Z d dlZd dlZd dlmZ d dlmZ  G d de      Z G d de      Z	 	 	 	 	 ddej                  d	e	d
e	de
de
de	defdZy)    N)nn)saved_tensors_hooksc                   B     e Zd ZdZ	 	 	 	 d	dededededdf
 fdZ xZS )
OffloadActivationsa		  
    Context manager under which activation tensors created in the forward pass will be offloaded.

    Enable the memory efficiency technique of activation offloading, where activations bigger than `min_offload_size`
    bytes will be offloaded to CPU in the forward and brought back in the backward. This is in contrast to maintaining
    the activation on GPU VRAM throughout the program.

    This manager contains the option of using one additional CUDA stream to handle the communication between CUDA and
    CPU, which is intended to overlap with the default computation stream to improve runtime. We designed
    synchronization with a few heuristics to optimize the tradeoff between runtime and memory usage.

    Args:
        use_pin_memory (`bool`, *optional*, defaults to `True`):
            Whether the offloaded Tensor will be placed in pinned memory on the CPU. Pinned memory allows the Tensor
            to be moved back onto the GPU more quickly, but is a limited resource.
        use_streams (`bool`, *optional*, defaults to `True`):
            Whether to use streams for performance optimization where the communications get overlapped with the
            computation. Requires a torch version of 2.5.0 or later.
        min_offload_size (`int`, *optional*, defaults to `1024`):
            Minimum number of bytes a Tensor must be in order to qualify for offloading. If the tensor is too small, we
            do not want to waste bandwidth and resources moving it to CPU and back.
        max_fwd_stash_size (`int`, *optional*, defaults to `5`):
            Maximum size of the forward stash, or the maximum number of consecutive activations to keep alive during
            the forward pass. This number must be at least 1. Keeping alive more activations will potentially allow
            more overlap between the communication and compute streams at the cost of increasing memory usage. Keeping
            alive fewer activations will conserve memory, but may cause poor overlap between the streams, increasing
            runtime.

    Raises:
        ValueError: if `max_fwd_stash_size` is not at least `1`.

    Example:
    ```python
    >>> with OffloadActivations():
    ...     outputs = model(inputs, labels=labels)
    >>> loss = outputs.loss
    >>> loss.backward()
    ```
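
    The defaults can also be set explicitly; for instance (the values shown here are the defaults):
    ```python
    >>> ctx = OffloadActivations(use_pin_memory=True, use_streams=True, min_offload_size=1024, max_fwd_stash_size=5)
    >>> with ctx:
    ...     loss = model(inputs, labels=labels).loss
    >>> loss.backward()
    ```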
    """

    def __init__(
        self,
        use_pin_memory: bool = True,
        use_streams: bool = True,
        min_offload_size: int = 1024,
        max_fwd_stash_size: int = 5,
    ) -> None:
        self.use_pin_memory = use_pin_memory
        self.use_streams = use_streams

        self.min_tensor_size_bytes = min_offload_size  # only offload tensors at least this big
        self.tracker = {}  # tensor_id => (new_tensor, if_modified)
        self.tensor_id = 0
        self.is_first_forward_call = True
        self.is_first_backward_call = True
        self.is_first_forward_pass = True

        # managing CPU memory: warn if more than this percentage of virtual memory is in use
        self.virtual_memory_safe_pct = 60

        self.accelerator_type = (
            torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
        )
        # NOTE: xpu has no `default_stream` API, so use `current_stream` instead
        self.s0 = torch.cuda.default_stream() if self.accelerator_type == "cuda" else torch.xpu.current_stream()

        if self.use_streams:
            # a second stream handles communication so copies can overlap with compute on s0
            self.s1 = torch.cuda.Stream() if self.accelerator_type == "cuda" else torch.xpu.Stream()
            self.fwd_stash = {}  # tensor_id => (activation, ev1)
            if max_fwd_stash_size < 1:
                raise ValueError(f"max_fwd_stash_size should be at least 1 but is {max_fwd_stash_size}")
            self.max_fwd_stash_size = max_fwd_stash_size
            self.bwd_tensor_stash = {}  # tensor_id => activation
            self.bwd_ev_stash = {}  # tensor_id => ev0
            self.curr_graph_id = None
            self.curr_autograd_node = None

        def verify_sufficient_virtual_memory() -> None:
            curr_pct = get_cpu_ram_pct()
            if curr_pct > self.virtual_memory_safe_pct:
                warnings.warn(f"{curr_pct=}% > {self.virtual_memory_safe_pct=}% of virtual memory used")

        def get_cpu_ram_pct() -> float:
            # get the percentage of virtual memory used by the system
            return psutil.virtual_memory().percent

        def get_tensor_id() -> int:
            # create a unique id for each tensor we are managing
            self.tensor_id += 1
            return self.tensor_id

        def get_num_bytes_tensor(x: torch.Tensor) -> int:
            # get the number of bytes in a tensor, for memory management purposes
            return x.element_size() * x.nelement()

        def pack_tensor(activation: torch.Tensor) -> int:
            # activations are passed in during the forward pass; from here we take over and return a unique id
            if self.is_first_forward_call:
                if len(self.tracker) != 0:
                    raise ValueError("Backward pass should have cleared tracker of all tensors")

                # set training phase trackers
                self.is_first_forward_call = False
                self.is_first_backward_call = True

            # query for basic tensor info
            num_bytes = get_num_bytes_tensor(activation)
            tensor_id = get_tensor_id()

            # only offload sufficiently big tensors that live on the accelerator and are
            # plausibly activations (i.e. neither parameters nor buffers)
            if (
                activation.device.type in ["cuda", "xpu"]
                and num_bytes >= self.min_tensor_size_bytes
                and (
                    not isinstance(activation, torch.nn.Parameter)
                    and not (hasattr(torch.nn, "Buffer") and isinstance(activation, torch.nn.Buffer))
                )
            ):
                if self.use_streams:
                    # first, sync back and dereference previously offloaded tensors,
                    # as their offloading should have finished sufficiently long ago
                    for id in [k for k in self.fwd_stash.keys() if k <= tensor_id - self.max_fwd_stash_size]:
                        _, ev = self.fwd_stash[id]
                        self.s0.wait_event(ev)
                        del self.fwd_stash[id]

                    # the copy on s1 may only start once s0 has produced the activation
                    self.s1.wait_stream(self.s0)

                stream = self.s1 if self.use_streams else self.s0
                with stream if self.accelerator_type == "xpu" else torch.cuda.stream(stream):
                    cpu_tensor = torch.empty_like(activation, pin_memory=self.use_pin_memory, device="cpu")
                    cpu_tensor.copy_(activation, non_blocking=True)
                    self.tracker[tensor_id] = (cpu_tensor, True)  # True = offloaded, will be brought back

                if self.use_streams:
                    # stash the activation to keep it alive until s1 has finished copying it
                    event = self.s1.record_event()
                    self.fwd_stash[tensor_id] = (activation, event)
            else:
                self.tracker[tensor_id] = (activation, False)  # False = not offloaded, tensor kept as-is

            return tensor_id

        def unpack_tensor_single_stream(unpack_tensor_id: int) -> torch.Tensor:
            # backward pass - we are called with the tensor_id, which we use to retrieve the saved/offloaded tensor
            if self.is_first_backward_call:
                if self.is_first_forward_pass:
                    self.is_first_forward_pass = False
                    if self.use_pin_memory:
                        verify_sufficient_virtual_memory()

                self.is_first_backward_call = False
                self.is_first_forward_call = True

            if unpack_tensor_id not in self.tracker:
                raise ValueError(f"Untracked tensor with id {unpack_tensor_id}")

            maybe_accelerator_tensor, modified = self.tracker[unpack_tensor_id]
            if modified:
                accelerator_tensor = maybe_accelerator_tensor.to(self.accelerator_type, non_blocking=True)
                maybe_accelerator_tensor = accelerator_tensor

            # clear the tensor from tracking
            del self.tracker[unpack_tensor_id]
            return maybe_accelerator_tensor

        def unpack_tensor_with_streams(unpack_tensor_id: int) -> torch.Tensor:
            # backward pass - we are called with the tensor_id, which we use to retrieve the saved/offloaded tensor
            if self.is_first_backward_call:
                self.curr_graph_id = torch._C._current_graph_task_id()

                def wait_and_del_remaining_references() -> None:
                    for id in [k for k in self.bwd_tensor_stash.keys()]:
                        event = self.bwd_ev_stash[id]
                        self.s1.wait_event(event)
                        del self.bwd_tensor_stash[id]

                # register a callback at the end of autograd to clean everything up
                torch.autograd.variable.Variable._execution_engine.queue_callback(wait_and_del_remaining_references)

                if self.is_first_forward_pass:
                    self.is_first_forward_pass = False
                    if self.use_pin_memory:
                        verify_sufficient_virtual_memory()

                self.is_first_backward_call = False
                self.is_first_forward_call = True

            if unpack_tensor_id not in self.tracker:
                raise ValueError(f"untracked tensor with id {unpack_tensor_id}")

            maybe_accelerator_tensor, modified = self.tracker[unpack_tensor_id]
            if modified:
                # get data on the current autograd node
                graph_id = torch._C._current_graph_task_id()
                node = torch._C._current_autograd_node()
                prev_node_ids = []

                # if we are on a new node, mark the previous node's tensors to be freed later
                if graph_id == self.curr_graph_id and self.curr_autograd_node != node:
                    self.curr_autograd_node = node
                    prev_node_ids = [id for id in self.bwd_tensor_stash.keys()]

                brought_back_from_cpu = True
                if unpack_tensor_id in self.fwd_stash:
                    # the activation is still alive from the forward pass, no copy needed
                    maybe_accelerator_tensor = self.fwd_stash[unpack_tensor_id][0]
                    brought_back_from_cpu = False
                else:
                    # kick off the copy back to the accelerator on the communication stream
                    with self.s1 if self.accelerator_type == "xpu" else torch.cuda.stream(self.s1):
                        accelerator_tensor = maybe_accelerator_tensor.to(self.accelerator_type, non_blocking=True)
                        maybe_accelerator_tensor = accelerator_tensor

                    # tell the compute stream to wait for the tensor to be loaded before executing
                    self.s0.wait_stream(self.s1)

                    # stash the tensor to keep the memory alive until the compute stream is done
                    self.bwd_tensor_stash[unpack_tensor_id] = maybe_accelerator_tensor

                # the hook fires after the backward of this unpacked tensor's node has executed,
                # at which point the stashed references can be released or deferred via events
                def hook(outputs, inputs):
                    if brought_back_from_cpu:
                        # if other references to the storage appeared, delete our stash entry now and
                        # let record_stream guard the memory; otherwise record an event to wait on later
                        unpacked_tensor = self.bwd_tensor_stash[unpack_tensor_id]
                        if torch._C._storage_Use_Count(unpacked_tensor.untyped_storage()._cdata) > storage_refcount:
                            unpacked_tensor.record_stream(self.s0)
                            del self.bwd_tensor_stash[unpack_tensor_id]
                        else:
                            event = self.s0.record_event()
                            self.bwd_ev_stash[unpack_tensor_id] = event

                    # if there are still tensors in the fwd_stash, get rid of them as we're in backward now
                    for id in [k for k in self.fwd_stash.keys()]:
                        _, ev = self.fwd_stash[id]
                        self.s0.wait_event(ev)
                        del self.fwd_stash[id]

                    # wait on the previous node's events, then drop those tensors
                    for id in prev_node_ids:
                        event = self.bwd_ev_stash[id]
                        self.s1.wait_event(event)
                        del self.bwd_tensor_stash[id]

                    return outputs

                # track the refcount of the storage so the hook can tell whether autograd
                # is the last entity holding on to the tensor
                storage_refcount = torch._C._storage_Use_Count(maybe_accelerator_tensor.untyped_storage()._cdata)
                node.register_hook(hook)

            # clear the tensor from tracking
            del self.tracker[unpack_tensor_id]
            return maybe_accelerator_tensor

        unpack_tensor = unpack_tensor_with_streams if self.use_streams else unpack_tensor_single_stream
        super().__init__(pack_tensor, unpack_tensor)


class NoOpManager(saved_tensors_hooks):
    """
    A `saved_tensors_hook` manager used to disable any other `saved_tensors_hook` manager applied before. This relies
    on the behavior that only the most recently registered `saved_tensors_hook` will run.

    One example usage is to opt a local region of code out of activation offloading, which is usually applied globally
    to best track state.
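
    Example (a sketch, where `some_module` stands for any module inside the offloaded region):
    ```python
    >>> with OffloadActivations():
    ...     with NoOpManager():
    ...         hidden = some_module(inputs)  # activations created here are not offloaded
    ```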
    """

    def __init__(self) -> None:
        def noop(tensor):
            return tensor

        super().__init__(noop, noop)


def get_act_offloading_ctx_manager(
    model: nn.Module,
    use_pin_memory: bool = True,
    use_streams: bool = True,
    min_offload_size: int = 1024,
    max_fwd_stash_size: int = 5,
    warn_if_no_head: bool = True,
) -> OffloadActivations:
    """
    Returns the activation offloading context manager for the model. All but the last output Linear in every step will
    be offloaded.

    If activation offloading is enabled, we return the OffloadActivations context manager. If activation offloading is
    disabled, we return a NoOpManager context manager.

    Args:
        model (`nn.Module`):
            Model to wrap with the activation offloading context manager.
        use_pin_memory (`bool`, *optional*, defaults to `True`):
            Whether the offloaded Tensor will be placed in pinned memory on the CPU. Pinned memory allows the Tensor
            to be moved back onto the GPU more quickly, but is a limited resource.
        use_streams (`bool`, *optional*, defaults to `True`):
            Whether to use streams for performance optimization where the communications get overlapped with the
            computation. Requires a torch version of 2.5.0 or later.
        min_offload_size (`int`, *optional*, defaults to `1024`):
            Minimum number of bytes a Tensor must be in order to qualify for offloading. If the tensor is too small, we
            do not want to waste bandwidth and resources moving it to CPU and back.
        max_fwd_stash_size (`int`, *optional*, defaults to `5`):
            Maximum size of the forward stash, or the maximum number of consecutive activations to keep alive during
            the forward pass. This number must be at least 1. Keeping alive more activations will potentially allow
            more overlap between the communication and compute streams at the cost of increasing memory usage. Keeping
            alive fewer activations will conserve memory, but may cause poor overlap between the streams, increasing
            runtime.
        warn_if_no_head (`bool`, *optional*, defaults to `True`):
            Whether to warn if no output head is detected. If set to `False`, no warning will be raised if no output
            head is detected.
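
    Example (assuming `model`, `inputs`, and `labels` are set up as in a standard training loop):
    ```python
    >>> ctx = get_act_offloading_ctx_manager(model)
    >>> with ctx:
    ...     loss = model(inputs, labels=labels).loss
    >>> loss.backward()
    ```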

    Returns:
        `contextlib.ContextDecorator`:
            Activation offloading context manager for the model.
    """
    activations_handling_ctx = OffloadActivations(
        use_pin_memory=use_pin_memory,
        use_streams=use_streams,
        min_offload_size=min_offload_size,
        max_fwd_stash_size=max_fwd_stash_size,
    )

    # Below is a hack to disable offloading the last output Linear in every step, as the cost for
    # offloading the activation and then soon after bringing it back is expensive.
    output_head_detected = False
    noop_ctx = NoOpManager()

    # unwrap the model if needed, then check for PEFT models
    unwrapped_model = model
    if hasattr(unwrapped_model, "module"):
        unwrapped_model = unwrapped_model.module
    if hasattr(unwrapped_model, "base_model") and hasattr(unwrapped_model, "peft_config"):
        unwrapped_model = unwrapped_model.base_model

    if hasattr(unwrapped_model, "output"):
        if isinstance(unwrapped_model.output, nn.Module):
            unwrapped_model.output.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
            unwrapped_model.output.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)
            output_head_detected = True
        elif hasattr(unwrapped_model.output, "linear") and isinstance(unwrapped_model.output.linear, nn.Module):
            unwrapped_model.output.linear.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
            unwrapped_model.output.linear.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)
            output_head_detected = True

    elif hasattr(unwrapped_model, "lm_head"):
        unwrapped_model.lm_head.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
        unwrapped_model.lm_head.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)
        output_head_detected = True

    elif hasattr(unwrapped_model, "decoder"):
        decoder = unwrapped_model.decoder
        if hasattr(decoder, "output"):
            decoder.output.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
            decoder.output.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)
            output_head_detected = True
        elif hasattr(decoder, "lm_head"):
            decoder.lm_head.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
            decoder.lm_head.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)
            output_head_detected = True

    elif hasattr(unwrapped_model, "final_layer_norm") or hasattr(unwrapped_model, "ln_f"):
        final_norm = getattr(unwrapped_model, "final_layer_norm", None) or unwrapped_model.ln_f
        final_norm.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
        final_norm.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)
        output_head_detected = True

    elif hasattr(unwrapped_model, "head") and isinstance(unwrapped_model.head, nn.Module):
        unwrapped_model.head.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
        unwrapped_model.head.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)
        output_head_detected = True

    if not output_head_detected and warn_if_no_head:
        warnings.warn(
            "During activation offloading, no output head was detected. If your model has an output head, it will "
            "be offloaded. This usually greatly slows training, given the large vocabulary size. To change this "
            "behavior, set your output head as model.output and make it an nn.Module. You can disable this warning "
            "by passing `warn_if_no_head=False`."
        )

    # disable offloading for any Liger modules
    for name, module in model.named_modules():
        if "liger" in name.lower():
            module.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
            module.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)

    return activations_handling_ctx