
    bi"z                        d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	m
Z
 ddlmZ ddlZddlmZmZ d	d
lmZ d	dlmZmZ d	dlmZmZmZ  e       r	ddlmc mZ  e       rddlmZ d Z d Z!d Z"d Z#d Z$e dddZ%dDdZ&d Z'd Z(d Z)d Z*d Z+d Z,d Z-d Z. G d d e/      Z0d! Z1d" Z2e1d#        Z3d$efd%Z4d$efd&Z5dEd'Z6dFd(Z7ejp                  d	ejr                  dejt                  d)ejv                  d*ejx                  d+ejz                  d,ej|                  d-ej~                  d.ej                  d/ej                  d0i
ZBeBj                         D  ci c]  \  } }|| 
 c}} ZDd1 ZEdGd2ej                  fd3ZGe1dEd4eHfd5       ZIdEd4eHfd6ZJdHd7ZKdEd8ZL G d9 d:eM      ZNe2dId;       ZOdEd<ZPe1dJd=       ZQd> ZR G d? d@      ZSdA ZTdB ZUedKdC       ZVyc c}} w )LzB
A set of basic tensor ops compatible with tpu, gpu, and multigpu
    N)Mapping)contextmanagernullcontext)update_wrapperwraps)Any   )AcceleratorStatePartialState   )!TORCH_DISTRIBUTED_OPERATION_TYPES)DistributedTypeTensorInformation)is_npu_availableis_torch_distributed_availableis_torch_xla_available)ReduceOpc                 6    t        | t        j                        S N)
isinstancetorchTensortensors    V/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/accelerate/utils/operations.pyis_torch_tensorr   ,   s    fell++    c           
      v   t        | t        j                  j                  t        j                  j                  t        j                  j
                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                        S r   )
r   r   xpuFloatTensor
ByteTensor	IntTensor
LongTensor
HalfTensorDoubleTensorBFloat16Tensorr   s    r   is_torch_xpu_tensorr'   0   sm    														  	 	r   c                 "    t        | t              S r   )r   r   tensor_infos    r   is_tensor_informationr+   =   s    k#455r   c                 Z    t        | t              xr t        | d      xr t        | d      S )z
    Checks if `data` is a `namedtuple` or not. Can have false positives, but only if a user is trying to mimic a
    `namedtuple` perfectly.
    _asdict_fields)r   tuplehasattrdatas    r   is_namedtupler3   A   s*    
 dE"\wtY'?\GDR[D\\r   c                 h    t        |       r t        |       t        |       S  t        |       |      S )zO
    Cast a generator to the same type as obj (list, tuple, or namedtuple)
    )r3   typelist)obj	generators     r   
honor_typer9   I   s2    
 StCy$y/**tCy##r   F	test_typeerror_on_other_typec                    t        |t        t        f      rt        | fd|D              S t        |t              rD t        |      |j                         D ci c]  \  }}|t         |gd c}}      S  |      r  |gi S r2t        dt        |       d j                   dj                   d      |S c c}}w )ad  
    Recursively apply a function on a data structure that is a nested list/tuple/dictionary of a given base type.

    Args:
        func (`callable`):
            The function to recursively apply.
        data (nested list/tuple/dictionary of `main_type`):
            The data on which to apply `func`
        *args:
            Positional arguments that will be passed to `func` when applied on the unpacked data.
        main_type (`type`, *optional*, defaults to `torch.Tensor`):
            The base type of the objects to which apply `func`.
        error_on_other_type (`bool`, *optional*, defaults to `False`):
            Whether to return an error or not if after unpacking `data`, we get on an object that is not of type
            `main_type`. If `False`, the function will leave objects of types different than `main_type` unchanged.
        **kwargs (additional keyword arguments, *optional*):
            Keyword arguments that will be passed to `func` when applied on the unpacked data.

    Returns:
        The same data structure as `data` with `func` applied to every object of type `main_type`.
    c              3   D   K   | ]  }t        |gd   yw)r:   Nrecursively_apply).0oargsr<   funckwargsr;   s     r   	<genexpr>z$recursively_apply.<locals>.<genexpr>m   s@        "!".7M`djs    r:   zUnsupported types (z) passed to `z?`. Only nested list/tuple/dicts of objects that are valid for `z` should be passed.)
r   r/   r6   r9   r   r5   itemsr@   	TypeError__name__)rD   r2   r;   r<   rC   rE   kvs   ` ````  r   r@   r@   T   s   , $& 	
 	
 
D'	"tDz
 !JJL	 Aq $!".7M`dj 
 	
 
4D*4*6**	!$t*]4==/ J++4+=+=*>>QS
 	
 Ks   #C
c                    t        |       st        | d      rdk(  rd	 | j                        S t        | t        t        f      rt        | fd| D              S t        | t              r^t        t              rgng  t        |       | j                         D ci c]  \  }}||v r|nt        |       c}}      S | S # t        $ r | j                        cY S t        $ r,}t               rt        t              rd n|Y d}~nd}~ww xY w	 | j                        S # t        $ r | j                        cY S w xY wc c}}w )	a  
    Recursively sends the elements in a nested list/tuple/dictionary of tensors to a given device.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to a given device.
        device (`torch.device`):
            The device to send the data to.

    Returns:
        The same data structure as `tensor` with all tensors sent to the proper device.
    tonpuznpu:0)non_blockingznpu:Nc              3   <   K   | ]  }t        |         yw)rO   	skip_keysN)send_to_device)rA   tdevicerO   rR   s     r   rF   z!send_to_device.<locals>.<genexpr>   s!     ocd^AvLT]^^os   rQ   )r   r0   rM   rH   AssertionErrorr   r   intr/   r6   r9   r   strr5   rG   rS   )r   rU   rO   rR   errorrJ   rT   s    ```   r   rS   rS      sg    v'&$"7U?F	99V,9?? 
FUDM	*ohno
 	
 
FG	$i%"IItF| #LLNAq Y1N1fS_kt,uu
 	
 =  	%99V$$ 	  !fc*#F8_F		%99V,9?? 	%99V$$	%s5   C ) E
D#/D#7"DD#'D: :EEc                      d }t        ||       S )aK  
    Recursively gathers the information needed to rebuild a nested list/tuple/dictionary of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to analyze.

    Returns:
        The same data structure as `data` with [`~utils.TensorInformation`] instead of tensors.
    c                 D    t        | j                  | j                        S )N)shapedtype)r   r\   r]   r   s    r   _get_data_structurez/get_data_structure.<locals>._get_data_structure   s     v||6<<HHr   r?   )r2   r^   s     r   get_data_structurer_      s    I 0$77r   c                      d }t        ||       S )a:  
    Recursively gathers the shape of a nested list/tuple/dictionary of tensors as a list.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to send to analyze.

    Returns:
        The same data structure as `data` with lists of tensor shapes instead of tensors.
    c                 ,    t        | j                        S r   )r6   r\   r   s    r   
_get_shapezget_shape.<locals>._get_shape   s    FLL!!r   r?   )r2   rb   s     r   	get_shaperc      s    " Z..r   c                 ,    d }t        || t              S )z
    Recursively initializes tensors from a nested list/tuple/dictionary of [`~utils.TensorInformation`].

    Returns:
        The same data structure as `data` with tensors instead of [`~utils.TensorInformation`].
    c                 T    t        j                  | j                  d| j                  iS Nr]   )r   emptyr\   r]   r)   s    r   _initialize_tensorz.initialize_tensors.<locals>._initialize_tensor   s"    {{K--G[5F5FGGr   r;   )r@   r+   )data_structurerh   s     r   initialize_tensorsrk      s    H /K`aar   c                    t        | t        t        t        f      r&t	        |       dk(  rt        dt        |        d      t        | t        t        f      rt        | d         S t        | t              r%| j                         D ]  }t        | |         c S  n2t        | t        j                        st        dt        |        d      | j                  d   S )a  
    Recursively finds the batch size in a nested list/tuple/dictionary of lists of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int`: The batch size.
    r   z&Cannot find the batch size from empty .z0Can only find the batch size of tensors but got )r   r/   r6   r   len
ValueErrorr5   find_batch_sizekeysr   r   rH   r\   )r2   rJ   s     r   rp   rp      s     $g./SY!^A$t*QOPP$&tAw''	D'	" 	,A"47++	,ell+J4PT:,VWXYY::a=r   c                 D    	 t        |       S # t        t        f$ r Y yw xY w)a  
    Same as [`utils.operations.find_batch_size`] except will ignore if `ValueError` and `TypeErrors` are raised

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to find the batch size.

    Returns:
        `int`: The batch size.
    N)rp   ro   rH   r1   s    r   ignorant_find_batch_sizers     s,    t$$	" s   
 c                      d }t        ||       S )aS  
    Recursively finds tensors in a nested list/tuple/dictionary and converts them to a list of numbers.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`): The data from which to convert to regular numbers.

    Returns:
        The same data structure as `data` with lists of numbers instead of `torch.Tensor`.
    c                     | j                         j                         } | j                  t        j                  k(  r| j                  t        j                        } | j                         S r   )detachcpur]   r   bfloat16rM   float32tolistr   s    r   _convert_to_listz!listify.<locals>._convert_to_list   sF    $$&<<5>>) YYu}}-F}}r   r?   )r2   r{   s     r   listifyr|     s     -t44r   c                 P    d }t        || d      }t        j                          |S )Nc                     | j                   dk(  r| j                         d    } | j                         s| j                         } t	        j
                  |       S )Nr   )ndimcloneis_contiguous
contiguousxm
all_gatherr   s    r   _tpu_gather_onez$_tpu_gather.<locals>._tpu_gather_one-  sI    ;;!\\^D)F ##%&&(F}}V$$r   Tr<   )r@   r   	mark_step)r   r   ress      r   _tpu_gatherr   ,  s%    % OV
NCLLNJr   c                     t               t        j                  j                  j                  j
                  dk(  rt        j                  j                          fd}t        || d      S )Nr   c                    | j                   dk(  r| j                         d    } | j                         s| j                         } j                  j                  dk7  rtt        j                  j                  | j                         z  | j                  j                        } ||         |j                  dg| j                         dd   S t        j                        D cg c]  }t        j                  |        }}t
        j                  j!                  ||        t        j"                  |d      S c c}w )Nr   gloor]   rU   r   dim)r   r   r   r   backendr   rg   num_processesnumelr]   rU   viewsizerange
empty_likedistributedr   cat)r   output_tensors_	gather_opstates      r   _gpu_gather_onez$_gpu_gather.<locals>._gpu_gather_oneC  s   ;;!\\^D)F ##%&&(F==$&)@
 #[[##flln4ll||N
 nf-&>&&r>FKKM!",=>>
 AFeFYFY@Z[1e..v6[N[((@99^33 \s   *D?Tr   )	r   r   r   all_gather_into_tensorrU   r5   r   synchronizer@   )r   r   r   r   s     @@r   _gpu_gatherr   ;  sU    NE!!88I ||E!		48 _f$OOr   c                       e Zd ZdZy)DistributedOperationExceptionz
    An exception class for distributed operations. Raised if the operation cannot be performed due to the shape of the
    tensors.
    N)rI   
__module____qualname____doc__ r   r   r   r   b  s    
 	r   r   c                 .     t                fd       }|S )zv
    Verifies that `tensor` is the same shape across all processes. Only ran if `PartialState().debug` is `True`.
    c                  0   t               j                  t        j                  k(  st               j                  s 
| i |S 
j
                   d
j                   }d|v r|d   }n| d   }t               j                  j                  t        |      j                  k7  rgt        d| d|j                  j                   dt               j                  j                   dt               j                  j                   d| d      t        |      }t        |g      }|d   f|j                  |d         t        |      k(  }|sDd	j                  t!        |      D cg c]  \  }}d
| d|  c}}      }	t        d| d|	        
| i |S c c}}w )Nrm   r   r   z%One or more of the tensors passed to z were not on the z+ while the `Accelerator` is configured for z. Please move it to the z before calling z
  - zProcess z: znCannot apply desired operation due to shape mismatches. All shapes across devices must be valid.

Operation: `z`
Input shapes:
  - )r   distributed_typer   NOdebugr   rI   rU   r5   find_devicer   rc   gather_objectcountrn   join	enumerate)rC   rE   	operationr   shapesoutputare_sameir\   process_shape_strfunctions             r   wrapperz!verify_operation.<locals>.wrapperp  s   >**o.@.@@H\H\T,V,,**+1X->->,?@	vH%F!WF>  %%V)<)A)AA/7	{BSTZTaTaTfTfSg  hS  T`  Tb  Ti  Ti  Tn  Tn  So o))5)>)>)C)C(DDTU^T__`b  6"x(!9 ||F1I.#f+=H$,MM[dek[l2mxqRWXaS5'3J2m$n!3''0k1GHYGZ\ 
 ((( 3ns   F
r   r   r   s   ` r   verify_operationr   k  s"    
 8_) )4 Nr   c                 .     t                fd       }|S )z
    Checks that `verify_operation` failed and if so reports a more helpful error chaining the existing
    `DistributedOperationException`.
    c                      	  | i |S # t         $ r0}j                   dj                   }t        d| d      |d }~ww xY w)Nrm   zError found while calling `z1`. Please see the earlier error for more details.)r   r   rI   )rC   rE   er   r   s       r   r   z"chained_operation.<locals>.wrapper  sc    	T,V,,, 	#../q1B1B0CDI/-i[8ij	s    	A+?Ar   r   s   ` r   chained_operationr     s"     8_  Nr   c                     t               j                  t        j                  k(  rt	        |       S t               j                  t
        v rt        |       S | S )a4  
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.

    Returns:
        The same data structure as `tensor` with all tensors sent to the proper device.
    )r   r   r   XLAr   r   r   r   s    r   gatherr     sF     ~&&/*=*==6""		(	(,M	M6""r   objectc                     t        t               j                        D cg c]  }d  }}t        j                  j                  ||        |D cg c]  }|D ]  }|  c}}S c c}w c c}}w r   )r   r   r   r   r   all_gather_object)r   r   output_objectsyxs        r   _gpu_gather_objectr     s`    $),.*F*F$GHqdHNH	''?%1!q1!A1A11 I 2s   	A$A)c                     t               j                  t        j                  k(  rt	        d      t               j                  t
        v rt        |       S | S )a5  
    Recursively gather object in a nested list/tuple/dictionary of objects from all devices.

    Args:
        object (nested list/tuple/dictionary of picklable object):
            The data to gather.

    Returns:
        The same data structure as `object` with all the objects sent to every device.
    z&gather objects in TPU is not supported)r   r   r   r   NotImplementedErrorr   r   )r   s    r   r   r     sG     ~&&/*=*==!"JKK		(	(,M	M!&))r   c                 (    dd}t        || d|      S )Nc                 H    t         j                  j                  | |       | S )Nsrc)r   r   	broadcast)r   r   s     r   _gpu_broadcast_onez*_gpu_broadcast.<locals>._gpu_broadcast_one  s     ##F#4r   T)r<   r   r   r?   )r2   r   r   s      r   _gpu_broadcastr     s     /4UXYYr   c                 T   t        | t        t        f      rt        | fdt	        |       D              S t        | t
              rC t        |       | j                         D ci c]  \  }}|t        | d|        c}}      S t        j                  | fd      S c c}}w )Nc              3   H   K   | ]  \  }}t        | d |         yw)r   nameN)_tpu_broadcast)rA   r   rT   r   s      r   rF   z!_tpu_broadcast.<locals>.<genexpr>  s*     "gTQPQ>!TF!A3-#H#H"gs   "r   r   c                     |    S r   r   )r   r   s    r   <lambda>z _tpu_broadcast.<locals>.<lambda>  s    !C& r   )r   r6   r/   r9   r   r   r5   rG   r   r   mesh_reduce)r   r   r   rJ   rK   s    ``  r   r   r     s    &4-(&"gU^_eUf"ghh	FG	$tF|RXR^R^R`a$!QQq$q} EEabb>>$(899 bs   %B$
                  	   
   c                    d}t               }t        j                  |t        j                  |j                        }| V| j
                  }t        | j                     }t        j                  t        |      |gz   t              |dt        |      dz    t        |d      }||j                            }t        |dd d	         }|dd }||fS )
ze
    Grabs the shape of `tensor` only available on one process and returns a tensor of its shape
    i   r   Nr]   r   sum	reductionr   r   )r   r   rg   rW   rU   r\   TENSOR_TYPE_TO_INTr]   r   r6   rn   reducenonzero)r   max_tensor_dimensionr   base_tensorr\   tensor_dtyper]   s          r   gather_tensor_shaper     s    
 !NE++2%))ELLYK
 )&,,7(-T%[L>5QY\(]$c%j1n%6Kk1134KBC #$Ecr"Kr   returnc                     t               }t        |       \  }}| 7t        j                  |t        |         j                  |j                        } t        | d      S )a  
    Copys a tensor that only exists on a single device and broadcasts it to other devices. Differs from `broadcast` as
    each worker doesn't need to know its shape when used (and tensor can be `None`)

    Args:
        tensor (`torch.tensor`):
            The tensor that should be sent to all devices. Must only have it be defined on a single device, the rest
            should be `None`.
    r   r   r   )r   r   r   zerosTENSOR_INT_TO_DTYPErM   rU   r   )r   r   r\   r]   s       r   copy_tensor_to_devicesr     sP     NE&v.LE5~U*=e*DEHHV&E**r   from_processc                     t               j                  t        j                  k(  rt	        | |d      S t               j                  t
        v rt        | |      S | S )a  
    Recursively broadcast tensor in a nested list/tuple/dictionary of tensors to all devices.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data

    Returns:
        The same data structure as `tensor` with all tensors broadcasted to the proper device.
    zaccelerate.utils.broadcast)r   r   r   )r   r   r   r   r   r   r   )r   r   s     r   r   r     sM     ~&&/*=*==f,=YZZ		(	(,M	Mf,77r   c                 ,   t               j                  t        j                  k(  r2t	        |       D ]"  \  }}t        j                  d|fd      | |<   $ | S t               j                  t        v r!t        j                  j                  |        | S )a  
    Broadcast a list of picklable objects form one process to the others.

    Args:
        object_list (list of picklable objects):
            The list of objects to broadcast. This list will be modified inplace.
        from_process (`int`, *optional*, defaults to 0):
            The process from which to send the data.

    Returns:
        The same list containing the objects from process 0.
    z&accelerate.utils.broadcast_object_listc                     |    S r   r   )r   r   s    r   r   z'broadcast_object_list.<locals>.<lambda>>  s    efgset r   r   )r   r   r   r   r   r   r   r   r   r   broadcast_object_list)object_listr   r   r7   s    `  r   r   r   /  s     ~&&/*=*==, 	vFAs^^,TVY[tuKN	v  
	(	(,M	M///Nr   c                 "    d }t        || |      S )aN  
    Recursively takes a slice in a nested list/tuple/dictionary of tensors.

    Args:
        data (nested list/tuple/dictionary of `torch.Tensor`):
            The data to slice.
        tensor_slice (`slice`):
            The slice to take.

    Returns:
        The same data structure as `data` with all the tensors slices.
    c                     | |   S r   r   )r   tensor_slices     r   _slice_tensorz$slice_tensors.<locals>._slice_tensorR  s    l##r   r?   )r2   r   process_indexr   r   s        r   slice_tensorsr  D  s    $ ]D,??r   c                     t         d   t        t        f      r.t         d    fdt	        t         d               D              S t         d   t              rR t         d          d   j                         D ci c]!  }|t         D cg c]  }||   	 c}      # c}}      S t         d   t        j                        st        dt         d                t        j                         S c c}w c c}}w )a  
    Recursively concatenate the tensors in a nested list/tuple/dictionary of lists of tensors with the same shape.

    Args:
        data (nested list/tuple/dictionary of lists of tensors `torch.Tensor`):
            The data to concatenate.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to concatenate.

    Returns:
        The same data structure as `data` with all the tensors concatenated.
    r   c              3   d   K   | ]"  }t        D cg c]  }||   	 c}        $ yc c}w w)r   N)concatenate)rA   r   dr2   r   s      r   rF   zconcatenate.<locals>.<genexpr>f  s.     #lSTKt0D!10D#$N$N#l0Ds   0+
0r   z%Can only concatenate tensors but got )r   r/   r6   r9   r   rn   r   r5   rq   r  r   r   rH   r   )r2   r   rJ   r  s   ``  r   r  r  X  s     $q'E4=)$q'#lX]^abfghbi^jXk#lmm	DGW	%tDG}UYZ[U\UaUaUcdPQaD-Aqad-As!KKdeeQ.?T!WOPP99Ts## .Bds   D
C=D
=D
c                       e Zd Zy)CannotPadNestedTensorWarningN)rI   r   r   r   r   r   r  r  n  s    r   r  c                 ,    dd}t        || d|||      S )a3  
    Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so they
    can safely be gathered.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather.
        dim (`int`, *optional*, defaults to 0):
            The dimension on which to pad.
        pad_index (`int`, *optional*, defaults to 0):
            The value with which to pad.
        pad_first (`bool`, *optional*, defaults to `False`):
            Whether to pad at the beginning or the end.
    c                   	
 t        | dd      rt        j                  dt               | S t	        | j
                        k\  st	        | j
                         k  r| S dk  rt	        | j
                        z  t        j                  | j
                  | j                        d    }t        |      j                         }t        fd|D              		| j
                     k(  r| S | j
                  
t        
      }	|<   | j                  t        |            |z   }|r)t        	
fdt        t	        |            D              }n't        
fdt        t	        |            D              }| ||<   |S )	N	is_nestedFzHCannot pad nested tensors without more information. Leaving unprocessed.r   )rU   c              3   (   K   | ]	  }|     y wr   r   )rA   sr   s     r   rF   zFpad_across_processes.<locals>._pad_across_processes.<locals>.<genexpr>  s     -!qv-s   c              3   b   K   | ]&  }|k(  rt           z
        n
t        d        ( y wr   slice)rA   r   r   max_sizeold_sizes     r   rF   zFpad_across_processes.<locals>._pad_across_processes.<locals>.<genexpr>  s6      [\Q#Xh#.95QU;Vs   ,/c              3   \   K   | ]#  }|k(  rt        d          n
t        d       % ywr   Nr  rA   r   r   r  s     r   rF   zFpad_across_processes.<locals>._pad_across_processes.<locals>.<genexpr>  s,     oUVqCxE!Xc]3U4[Po   ),)getattrwarningswarnr  rn   r\   r   r   rU   r   rw   maxr6   	new_zerosr/   r   )r   r   	pad_index	pad_firstr   sizesnew_size
new_tensorindicesr  r  s    `       @@r   _pad_across_processesz3pad_across_processes.<locals>._pad_across_processes  sK   6;.MMZ, M#fll##sc&,,.?-?'?M73v||$$C ||FLL?Et  "-u--v||C((M<<> %%eHo6B
 `efijrfs`t G oZ_`cdl`mZnooG$
7r   T)r<   r   r  r  r   r   Fr?   )r   r   r  r  r!  s        r   pad_across_processesr#  r  s&    " D v4ST]ir r   c                 ,    dd}t        || d|||      S )z
    Takes a `tensor` of arbitrary size and pads it so that it can work given `num_processes` needed dimensions.

    New tensors are just the last input repeated.

    E.g.:
      Tensor: ([3,4,4]) Num processes: 4 Expected result shape: ([4,4,4])

    c                 F  
 ||z  }|||z  z
  }||z  dk(  r||z
  }n|||z  z
  }|||z  cxkD  rdk  rn n||z
  }| j                   
t        
      }||z   |d<   | j                  t        |            }t        
fdt	        t        |            D              }	| ||	<   |S )Nr   r   c              3   \   K   | ]#  }|k(  rt        d          n
t        d       % ywr  r  r  s     r   rF   z@pad_input_tensors.<locals>._pad_input_tensors.<locals>.<genexpr>  s,     kQR18a#/tLkr  )r\   r6   r  r/   r   rn   )r   
batch_sizer   r   	remainderlast_inputsto_padr  r  r   r  s      `      @r   _pad_input_tensorsz-pad_input_tensors.<locals>._pad_input_tensors  s    -/	 I$=>&!+"Z/F"jM&ABF &,1, 6)F<<> 6)%%eHo6
kV[\_`h\iVjkk$
7r   T)r<   r'  r   r   r   r?   )r   r'  r   r   r+  s        r   pad_input_tensorsr,    s(    &  # r   c                 *    dd}t        || d||      S )aX  
    Recursively reduce the tensors in a nested list/tuple/dictionary of lists of tensors across all processes by the
    mean of a given operation.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to reduce.
        reduction (`str`, *optional*, defaults to `"mean"`):
            A reduction method. Can be of "mean", "sum", or "none"
        scale (`float`, *optional*):
            A default scaling value to be applied after the reduce, only valied on XLA.

    Returns:
        The same data structure as `data` with all the tensors reduced.
    c                    t               }| j                         }|j                  t        j                  k(  r|S |j                  t        j
                  k(  rOt        j                          t        j                  t        j                  |g|       t        j                          nJ|j                  j                  t        v r.t        j                  j                  |t        j                         |dk(  r||j                   z  }|S )Nmean)r   r   r   r   r   r   r   r   
all_reduce
REDUCE_SUMvaluer   r   r   r   SUMr   )r   r   scaler   cloned_tensors        r   _reduce_across_processesz(reduce.<locals>._reduce_across_processes  s    !!_%7%77  !!_%8%88
 LLNMM"---%@LLN##))-NN((EU000Mr   T)r<   r   r4  r/  g      ?r?   )r   r   r4  r6  s       r   r   r     s"    $&  &di_d r   c                 *    d }d }t        || |      S )av  
    Recursively converts the elements nested list/tuple/dictionary of tensors in FP16/BF16 precision to FP32.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to convert from FP16/BF16 to FP32.

    Returns:
        The same data structure as `tensor` with all tensors that were in FP16/BF16 precision converted to FP32.
    c                 "    | j                         S r   )floatr   s    r   _convert_to_fp32z)convert_to_fp32.<locals>._convert_to_fp32  s    ||~r   c                     t        |       xs t        | d      xr, | j                  t        j                  t        j
                  fv S rf   )r   r0   r]   r   float16rx   r   s    r   _is_fp16_bf16_tensorz-convert_to_fp32.<locals>._is_fp16_bf16_tensor  s@    'C767+C 
MMNNZ
 J
 	
r   ri   r?   )r   r;  r>  s      r   convert_to_fp32r?    s    
 -vAUVVr   c                   "    e Zd ZdZd Zd Zd Zy)ConvertOutputsToFp32ad  
    Decorator to apply to a function outputing tensors (like a model forward pass) that ensures the outputs in FP16
    precision will be convert back to FP32.

    Args:
        model_forward (`Callable`):
            The function which outputs we want to treat.

    Returns:
        The same function as `model_forward` but with converted outputs.
    c                 *    || _         t        | |       y r   )model_forwardr   )selfrC  s     r   __init__zConvertOutputsToFp32.__init__!  s    *t]+r   c                 8    t         | j                  |i |      S r   )r?  rC  )rD  rC   rE   s      r   __call__zConvertOutputsToFp32.__call__%  s     1t114B6BCCr   c                 ,    t        j                  d      )NzCannot pickle a prepared model with automatic mixed precision, please unwrap the model with `Accelerator.unwrap_model(model)` before pickling it.)picklePicklingError)rD  s    r   __getstate__z!ConvertOutputsToFp32.__getstate__(  s    "" `
 	
r   N)rI   r   r   r   rE  rG  rK  r   r   r   rA  rA    s    
,D
r   rA  c                 6     t                 fd} |_        |S )Nc                       | i |S r   r   )rC   rE   rC  s     r   forwardz(convert_outputs_to_fp32.<locals>.forward1  s    d-f--r   )rA  __wrapped__)rC  rN  s   ` r   convert_outputs_to_fp32rP  .  s!    (7M. (GNr   c                    t        | t              r'| j                         D ]  }t        |      }||c S  yt        | t        t
        f      r| D ]  }t        |      }||c S  yt        | t        j                        r| j                  S y)z
    Finds the device on which a nested dict/list/tuple of tensors lies (assuming they are all on the same device).

    Args:
        (nested list/tuple/dictionary of `torch.Tensor`): The data we want to know the device of.
    N)	r   r   valuesr   r/   r6   r   r   rU   )r2   r7   rU   s      r   r   r   :  s     $ ;;= 	C %F!	 
D5$-	( 	C %F!	 
D%,,	'{{ 
(r   c              #   T  K   t               j                  t        j                  k7  s6t               j                  -t               j                  j                         st               }n#ddl}|j                  j                  | |||      }|5  d ddd       y# 1 sw Y   yxY ww)z
    Wrapper around `deepspeed.runtime.zero.GatheredParameters`, but if Zero-3 is not enabled, will be a no-op context
    manager.
    Nr   )modifier_rank
fwd_moduleenabled)
r
   r   r   	DEEPSPEEDdeepspeed_pluginis_zero3_init_enabledr   	deepspeedzeroGatheredParameters)paramsrT  rU  rV  gather_param_contextrZ  s         r   r\  r\  O  s      **o.G.GG++7 "33IIK*}(~~@@-JPW  A  
 
   s   BB(B	B(B%!B()FNr   )r   zbroadcast tensorr   )NNr"  r7  )NNT)Wr   rI  r  collections.abcr   
contextlibr   r   	functoolsr   r   typingr   r   r   r
   r   	constantsr   dataclassesr   r   importsr   r   r   torch_xla.core.xla_modelcore	xla_modelr   torch.distributedr   r   r'   r+   r3   r9   r@   rS   r_   rc   rk   rp   rs   r|   r   r   	Exceptionr   r   r   r   r   r   r   r   r:  doublehalfrx   uint8int8int16int32int64boolr   rG   r   r   r   r   rW   r   r   r  r  UserWarningr  r#  r,  r   r?  rA  rP  r   r\  )rJ   rK   s   00r   <module>rt     s3     # 2 +   2 8 ;  ))!#*,
6]$ 4CX] 0f1h8$/$b."5.$PN	I 	 F&  &2s 2# &Z: 
KK	LL!	JJ	NNA	KK	JJ	KK	KK	KK	JJ  );(@(@(BC1q!tC 2+5<< +" C  *S *@($,	; 	 4 4n%P & &RW0
 
4	*  I Ds   G