
    bi                     Z   d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlm	c m
Z d dlmZmZmZ d dlmZ ddlmZ ddlmZ d	d
lmZ d	dlmZmZ  e       rd dlmZmZ d dlmZ  d dlm!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZB d dlCmDZDmEZEmFZFmGZGmHZH d dlImJZJmKZKmLZL d dlMmNZN d dlOmPZPmQZQmRZRmSZSmTZTmUZU d dlVmWZW d dlXmYZYmZZZm[Z[m\Z\m]Z]m^Z^ d d l_m`Z`maZambZbmcZc d=d!Zdd" Ze G d# d$      Zfd% Zgd& Zh G d' d(e      Zid) Zj G d* d+      Zk G d, d-e      Zld. Zm G d/ d0e      Zn G d1 d2en      Zo G d3 d4en      Zp G d5 d6en      Zqd7 Zrdi fd8Zs G d9 d:ej                  j                        Zud; Zvd< Zwy)>    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss)DistributedDataParallel   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_available)recursively_applysend_to_device)mputensor_parallel)finalize_model_grads)	ModelType)get_num_microbatches)get_megatron_optimizer)get_tensor_model_parallel_group"get_tensor_model_parallel_src_rank)get_forward_backward_func)get_model_config)broadcast_int_listbroadcast_tensor)%beam_search_and_return_on_first_stage/generate_tokens_probs_and_return_on_first_stage)build_train_valid_test_datasets)	BertModelFloat16ModuleGPTModelT5Model)Classification)get_argsget_tensorboard_writerget_tokenizerprint_rank_last)_add_data_args_add_validation_args!core_transformer_config_from_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint)set_global_variables)_compile_dependencies_init_autoresume_initialize_distributed_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)_vocab_size_with_padding)%build_train_valid_test_data_iteratorsget_optimizer_param_schedulernum_floating_point_operationssetup_model_and_optimizer
train_steptraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsunwrap_modelc           	      8   t               }|j                  rdnd}|j                  dk(  r't        d|j                   d| d       t        d       t        |      }|j                  dk(  rU|j                  r-|j                  rd	nd}t        |||j                  d
| |      }|S t        ||j                  d	| |      }|S |j                  dk(  rt        |dd
| |      }|S |j                  dk(  rt        |dd
| |||      }|S t        d|j                         )zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr	   T)confignum_tokentypesadd_binary_headparallel_outputpre_processpost_process)rD   num_classesrE   rH   rI   gpt)rD   rE   rG   rH   rI   t5)rD   rE   rG   rH   rI   add_encoderadd_decoderUnsupported model type: )r$   pretraining_flagrankprintmodel_type_namer*   bert_binary_headr   r#   
num_labelsr!   r"   
ValueError)	rH   rI   rM   rN   argsmoderD   rE   models	            W/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/accelerate/utils/megatron_lm.pymodel_provider_funcr[   [   sS   :D!22>DyyA~	$../~dV6JKx	
 /t4Fv%  "&"7"7QQN- $ 5 5 $')EH L7 # OO ')E6 L) 
			& #%
& L 
			% #%##
 L 3D4H4H3IJKK    c                    | j                  d       t               }| j                  j                  j                  | j                  j                  j
                  t        d      | j                  j                  j
                  }| j                  j                  j	                  |      }t        | |      }t        | |d       }nt        j                  }|j                  dk(  rt        j                  }t        }| j                  j                  j
                   | j                  j                  j
                  }t        |||j                  |j                   |j"                        \  }}}t%        |      |_        |||fS )Nz#Preparing model optimizer schedulerzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)	schedulerrL   )no_wd_decay_condscale_lr_condlr_mult)rR   r$   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrV   prepare_optimizerprepare_schedulerr   encoder_or_decoderrS   encoder_and_decoderr[   r;   r_   r`   ra   len	model_len)acceleratorrW   custom_model_provider_funcrY   	optimizerr^   
model_typemodel_provider_func_s           rZ   !prepare_model_optimizer_schedulerrq      sM   ;<:D++IIU//NNVs  &1%6%6%I%I%h%h"!!44RRSmn%k59	%k9M	11
4'"66J2//NNZ#.#4#4#G#G#f#f (A !22,,LL)
%	9 ZDN)Y&&r\   c                   (    e Zd ZdZd Zd Zd Zd Zy)MegatronLMDummyDataLoaderz
    Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

    Args:
        **dataset_kwargs: Megatron data arguments.
    c                     t        j                         }t        |      }t        |      }|j	                         }t        |d         | _        | j                  j                  |       d| j                  d<   y )Nr   Tmegatron_dataset_flag)argparseArgumentParserr(   r)   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argss       rZ   __init__z"MegatronLMDummyDataLoader.__init__   sh    ((*'%f-++-	 1.  05912r\   c                     t               }| j                  j                         D ];  \  }}t        ||d      }||k7  rt	        d| d| d| d|        t        |||       = y )N z<WARNING: MegatronLMDummyDataLoader overriding arguments for : with )r$   rz   itemsgetattrrR   setattr)r|   rW   keyvalue	old_values        rZ   set_megatron_data_argsz0MegatronLMDummyDataLoader.set_megatron_data_args   s~    z++113 	&JCc2.IE!RSVRWWXYbXccijminnopuovw D#u%	&r\   c                 x   d }|j                   j                  j                   |j                   j                  j                  S 	 t               }|j                  dk(  rddlm} d|_        |S |j                  dk(  rddlm} d|_        |S |j                  dk(  rddl	m} d|_        |S 	 |S # t        $ r Y |S w xY w)Nc                 N   t               }t        |j                  t        t        f      r|j                  n|j                  g|j
                  | |j                  d}|j                  dk(  r)|j                  |j                  |j                  d       n~|j                  dk(  r|j                  d|j                  i       nQ|j                  dk(  r*|j                  |j                  |j                  dd       nt        d|j                         t        d	i |\  }}}|||fS )
z&Build train, valid, and test datasets.)data_prefixsplits_stringtrain_valid_test_num_samplesseedrC   )max_seq_lengthbinary_headrK   r   rL   )r   max_seq_length_decdataset_typerO    )r$   
isinstance	data_pathlisttuplesplitr   rS   r{   
seq_lengthrT   encoder_seq_lengthdecoder_seq_lengthrV   r   )train_val_test_num_samplesrW   rz   train_dsvalid_dstest_dss         rZ   "train_valid_test_datasets_providerzlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_provider   s   :D1;DNNTSXM1Zt~~aeaoao`p!%0J			L ##v-##*.//'+'<'< %%.##($//
 %%-##*.*A*A.2.E.E(, !#;D<P<P;Q!RSS*I*YL*Y'HhXw..r\   rC   r   )r   TrK   rL   )rb   rc   *custom_megatron_datasets_provider_functionr$   rS   pretrain_bertr   is_distributedpretrain_gptpretrain_t5ImportError)r|   rl   r   rW   s       rZ   &get_train_valid_test_datasets_providerz@MegatronLMDummyDataLoader.get_train_valid_test_datasets_provider   s    !	/F //ZZf$$77bbb	:D##v-LDH2A99%%.KDH2A99%%-JDH2A99	 . 21  	11	s   'B, -B, B, ,	B98B9c                 t   t               }| j                  |      }|j                  ~g }g }g }t        t	        |dd            D ]^  }t        j                  |       t        |      }|j                  |d          |j                  |d          |j                  |d          ` nt        |      \  }}}|||fS )Nrk   r   r   r	   )	r$   r   $virtual_pipeline_model_parallel_sizeranger   r   (set_virtual_pipeline_model_parallel_rankr8   append)	r|   rl   rW   !train_valid_test_dataset_providertrain_data_iteratorvalid_data_iteratortest_data_iteratori	iteratorss	            rZ   r8   z?MegatronLMDummyDataLoader.build_train_valid_test_data_iterators  s    z,0,W,WXc,d)44@"$"$!#74a89 8<<Q?ABcd	#**9Q<8#**9Q<8")))A,78 Lq1LH!46H #$79KKKr\   N)__name__
__module____qualname____doc__r   r   r   r8   r   r\   rZ   rs   rs      s    :&:2xLr\   rs   c                      G d d      }|d u }t        j                  |t         j                  | j                        }t         j                  j                  |t               t                      |s	|r |       S |S )Nc                       e Zd Zd Zd Zy)?_handle_megatron_data_iterator.<locals>.DummyMegatronDataloaderc                     | S Nr   r|   s    rZ   __iter__zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__iter__  s    Kr\   c                     i S r   r   r   s    rZ   __next__zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__next__!  s    Ir\   N)r   r   r   r   r   r   r\   rZ   DummyMegatronDataloaderr     s    		r\   r   dtypedevicegroup)torchtensorboolr   distributed	broadcastr   r   )rl   data_iteratorr   is_data_iterator_emptyis_src_data_iterator_emptys        rZ   _handle_megatron_data_iteratorr     sw      +d2!&.DEJJ_j_q_q!r	"$F$HPoPq    &*@&((r\   c           
      8   | j                  d       t               }|j                  s0ddlm}m} |j                  |j                  z  }|D ci c]  }|t        ||||          }}|d   Pt        |d   t        j                  j                  j                        r||d   _        n|d= |d= |d= ||d   _        n|d= ||d<   t        j                  j                  j                  |j                   fi |} ||| j"                  t%        j&                         t%        j(                         dd	| j*                  j-                         | j.                  
      S |j0                   |j0                  \  |_        |_        |_        nd\  |_        |_        |_        |j                  |j                  z  |_        |j9                  |       \  }}	}
|j                  |j                  z  |_        t;        | |      }t;        | |	      }	t;        | |
      }
||	|
fS c c}w )NzPreparing dataloaderr	   )_PYTORCH_DATALOADER_KWARGSprepare_data_loader
batch_sizesamplershufflebatch_samplerFT)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batches)r   r   r   )rl   r   )rR   r$   ru   data_loaderr   r   micro_batch_sizenum_micro_batchesr   r   r   utilsdataBatchSamplerr   
DataLoaderdatasetr   r   get_data_parallel_world_sizeget_data_parallel_rankr   copyr   consumed_samplesconsumed_train_samplesconsumed_valid_samplesconsumed_test_samplesr8   r   )rl   
dataloaderrW   r   r   r   kkwargsr   r   r   s              rZ   r   r   .  s,   ,-:D%%Q0043I3IITnoq!WZ,Fq,IJJoo,'&+U[[-=-=-J-JK/?y!,9%9%<(5E'2'#3F< [[%%001C1CNvN
 #::<446!++002(99	
 		
   ,
 %%	++* dk`D')DdF` $ 5 58N8N N <<[I		
 $ 5 59O9O O<#3F
 =#3F
 <cuv"$79KKKo ps   Hc                   <     e Zd Z fdZddZd Zed        Z xZS )MegatronLMOptimizerWrapperc                 *    t         |   |dd        y )NF)device_placementscalersuperr   )r|   rn   	__class__s     rZ   r   z#MegatronLMOptimizerWrapper.__init__q  s    U4Hr\   c                      y r   r   )r|   set_to_nones     rZ   	zero_gradz$MegatronLMOptimizerWrapper.zero_gradt      r\   c                      y r   r   r   s    rZ   stepzMegatronLMOptimizerWrapper.stepw  r   r\   c                 .    | j                   j                  S )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)rn   skipped_iterr   s    rZ   step_was_skippedz+MegatronLMOptimizerWrapper.step_was_skippedz  s     ~~***r\   r   )	r   r   r   r   r   r   propertyr  __classcell__r   s   @rZ   r   r   p  s'    I + +r\   r   c                     | j                  d       t               }t        ||j                  |j                  |j
                        S )NzPreparing optimizer)rR   r$   r   r_   r`   ra   )rl   rY   rW   s      rZ   rf   rf     s<    +,:D!%)>)>@R@RTXT`T`aar\   c                       e Zd ZdZddZy)MegatronLMDummySchedulera  
    Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
    loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int):
            Total number of steps.
        warmup_num_steps (int):
            Number of steps for warmup.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    Nc                 <    || _         || _        || _        || _        y r   )rn   total_num_stepswarmup_num_stepsr   )r|   rn   r	  r
  r   s        rZ   r   z!MegatronLMDummyScheduler.__init__  s     ". 0r\   Nr   )r   r   r   r   r   r   r\   rZ   r  r    s    r\   r  c                   $     e Zd Z fdZd Z xZS )MegatronLMSchedulerWrapperc                 &    t         |   ||       y r   r   )r|   r^   
optimizersr   s      rZ   r   z#MegatronLMSchedulerWrapper.__init__  s    J/r\   c                      y r   r   )r|   rW   r   s      rZ   r   zMegatronLMSchedulerWrapper.step  s    r\   )r   r   r   r   r   r  r  s   @rZ   r  r    s    0r\   r  c                 >    | j                  d       t        |      }|S )NzPreparing scheduler)rR   r9   )rl   rn   r^   s      rZ   rg   rg     s!    +,-i8Ir\   c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )AbstractTrainStepz;Abstract class for batching, forward pass and loss handler.c                 0    t         |           || _        y r   )r   r   name)r|   r  r   s     rZ   r   zAbstractTrainStep.__init__  s    	r\   c                      y r   r   )r|   rl   ru   s      rZ   get_batch_funcz AbstractTrainStep.get_batch_func  r   r\   c                      y r   r   r   s    rZ   get_forward_step_funcz'AbstractTrainStep.get_forward_step_func  r   r\   c                      y r   r   )r|   rl   s     rZ   get_loss_funczAbstractTrainStep.get_loss_func  r   r\   )	r   r   r   r   r   r  r  r  r  r  s   @rZ   r  r    s    Er\   r  c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )BertTrainStepzg
    Bert train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                 V   t         |   d       | j                  ||j                        | _        | j                  ||j                  |j                        | _        | j                  |j                  |j                        | _        |j                  sd | _        y ddlm} || _        y )Nr  r   )SequenceClassifierOutput)r   r   r  ru   	get_batchr  rP   rU   	loss_funcr  rT   forward_stepmodel_return_dictmodel_output_classtransformers.modeling_outputsr  )r|   rl   rW   r  r   s       rZ   r   zBertTrainStep.__init__  s    ),,[$:T:TU++K9N9NPTP_P_` 66t7L7LdNcNcd%%&*D#N&>D#r\   c                     d }d }|j                   j                  j                   |j                   j                  j                  S |r		 ddlm} |S |S # t
        $ r Y |S w xY w)Nc                 l   g d}t         j                  }| t        |       }nd}t        j                  |||      }|d   j                         }|d   j                         }|d   j                         }|d   j                         }|d   j                         }	|d   j                         }
|||||	|
fS )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr)  r*  r,  r-  r+  r.  r   int64nextr   broadcast_datalongfloat)r   keysdatatyper   data_btokensr*  sentence_orderr-  	lm_labelsr.  s              rZ   get_batch_megatronz8BertTrainStep.get_batch_func.<locals>.get_batch_megatron  s     YD{{H (M*$33D$IF F^((*F7O((*E#K0557N{+113Ix(--/I!.1668L5.)YTTr\   c                    t        |       }t        |t        j                  j	                               }|d   j                         }|d   j                         }d|v r|d   j                         }nd}d|v r9|d   j                         }|d   dk7  j                  t        j                        }nd}d}d|v r|d   j                         }nd}||||||fS )r(  	input_idsattention_masktoken_type_idsNr+  next_sentence_label)r1  r   r   cudacurrent_devicer3  tor4  )r   r   r8  r.  r*  r:  r-  r9  s           rZ   get_batch_transformerz;BertTrainStep.get_batch_func.<locals>.get_batch_transformer  s    &D!$

(A(A(CDD +&++-F 01668L4'-.3354 N//1	!(^t377D	 	 	$,!%&;!<!A!A!C!%5.)YTTr\   r   r   )rb   rc   custom_get_batch_functionr   r   r   r|   rl   ru   r;  rE  r   s         rZ   r  zBertTrainStep.get_batch_func  ss    	U0	U2 //IIU$$77QQQ 3  
 )(	  %%   
A 	A! A!c                      d } fd}|j                   j                  j                   |j                   j                  j                  S |r|S |S )Nc                    |\  }}|j                         }| j                         } t        j                  |j                  d      | j	                  d      z        | j                         z  }|tt        j                  |j                  dd      j                         |j                  d      d      }|j                         }||z   }t        ||g      }||d   |d   dfS |}t        |g      }|d|d   ifS )Nr	   )ignore_indexr   r   )lm losszsop lossrN  )r4  r   sumviewreshapeFcross_entropyr>   )	r-  r9  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossess	            rZ   loss_func_pretrainz7BertTrainStep.get_loss_func.<locals>.loss_func_pretrain  s    #0 Hj~~'H!)Iiib 1I4E4Eb4I IJY]]_\G%??:??2q+A+G+G+I>K^K^_aKbqst#>>+)"KWV^L_"`);YZI[\\\ "KWI"Vi);<<<r\   c                    dk(  r2t               } ||j                  d      | j                  d            }nj                  dkD  r_| j                  t        j
                  t        j                  fv r3t               } ||j                  d      | j                  d            }nt               } |||       }t        |g      }|d|d   ifS )Nr   rL  rY  r   )
r   rP  rU   r   r   r3  intr   r   r>   )r+  logitsloss_fctrY  rZ  rU   r|   s        rZ   loss_func_finetunez7BertTrainStep.get_loss_func.<locals>.loss_func_finetune%  s    Q"9BRA1$&,,5::uyy:Q*Q+-B
 ;V[[_M,./GOO&/!"4555r\   rb   rc   custom_loss_function)r|   rl   rP   rU   r[  r`  s   `  `  rZ   r  zBertTrainStep.get_loss_func  sN    	=&	6 //DDP$$77LLL%%%%r\   c                       fd}|S )Nc                     j                  |       \  }}}}}}
sd}r% |||||      }|t        j                  ||      fS  ||||      }	|	t        j                  |      fS )Forward step.Ntokentype_idsr:  )rg  r   r   r!  )r   rY   r8  r*  r9  r-  r+  r.  rT  r^  rT   rP   r|   s             rZ   r"  z9BertTrainStep.get_forward_step_func.<locals>.forward_step;  sw    MQ^^\iMjJFE>9fl# %fl%[a b$gdnni&XXXv|5Iwt~~v>>>r\   r   )r|   rP   rT   r"  s   ``` rZ   r  z#BertTrainStep.get_forward_step_func:  s    	? r\   	r   r   r   r   r   r  r  r  r  r  s   @rZ   r  r    s    
?>)@'&Rr\   r  c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )GPTTrainStepzf
    GPT train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    t         |   d       | j                  ||j                        | _        | j                  |      | _        | j                         | _        |j                  dz
  | _
        |j                  t               }|j                  | _
        |j                  | _        |j                  | _        |j                   | _        |j"                  sd | _        y ddlm} || _        y )Nrk  r   r   )!CausalLMOutputWithCrossAttentions)r   r   r  ru   r   r  r!  r  r"  padded_vocab_size	eod_token
vocab_filer&   eodreset_position_idsreset_attention_maskeod_mask_lossr#  r$  r%  rm  )r|   rl   rW   	tokenizerrm  r   s        rZ   r   zGPTTrainStep.__init__S  s    (,,[$:T:TU++K8 668//!3??&%I&]]DN"&"9"9$($=$=!!//%%&*D#W&GD#r\   c                       fd} fd}|j                   j                  j                   |j                   j                  j                  S |r		 ddlm} |S |S # t
        $ r Y |S w xY w)Nc                    dg}t         j                  }| t        |       }nd}t        j                  |||      }|d   j                         }|ddddf   j                         }|ddddf   j                         }t        |j                  j                  j                  j                        \  }}	}
|||	||
fS )zGenerate a batchr)  Nr   rL  )r   r0  r1  r   r2  r3  
contiguousr@   ro  rr  rs  rt  )r   r5  r6  r   r7  tokens_r+  r8  r>  r-  position_idsr|   s              rZ   r;  z7GPTTrainStep.get_batch_func.<locals>.get_batch_megatrong  s     8D{{H (M*$33D$IF Vn))+GQU^..0FQV_//1F 7V(?(?AZAZ\`\n\n73NI| 69nlJJr\   c                 4   t        |       }d|d   i}t        |t        j                  j	                               }|d   j                         }t        j                  |j                  d   df|j                  |j                        	j                  z   }t        j                  ||gd      }|d d dd f   j                         }|d d d df   j                         }t        |	j                  	j                  	j                  d      \  }}}|||||fS )Nr=  r   r   r   dimrL  T)r1  r   r   rB  rC  r3  zerosshaper   r   ro  concatrx  r@   rr  rs  )
r   r   ry  paddingr+  r8  r>  r-  rz  r|   s
            rZ   rE  z:GPTTrainStep.get_batch_func.<locals>.get_batch_transformer  s   &Dk!23D!$

(A(A(CDD;',,.Gkk7==#3Q"7w}}U\UcUcdgkguguuGllGW#51=GQU^..0FQV_//1F6U(?(?AZAZ\`73NI| 69nlJJr\   r   rF  )rb   rc   rG  r   r   r   rH  s   `     rZ   r  zGPTTrainStep.get_batch_funcf  st    	K2	K  //IIU$$77QQQ 2  
 )(	  %%s   A 	A&%A&c                     t               fd}|j                  j                  j                   |j                  j                  j                  S |S )Nc                    j                   r|\  }}n|}|j                         }| j                  d      j                         } j                  dkD  rt	        j
                  t	        j                  |j                  d      | z        j                  d      | j                         j                  d      g      }t        j                  j                  |t        j                                |d   |d   z  }n8t	        j                  |j                  d      | z        | j                         z  }j                  rot        j                  j                         }|j                         rAJ d| dt        j                  j                          dt!        j"                         d           t%        |g      }d|d   i}j                   r|j'                  d	i       ||fS )
NrL  r   r   r   zRank z7: found NaN in local forward loss calculation. Device: z, node: rN  r^  )return_logitsr4  rP  context_parallel_sizer   catrO  r   
all_reducer   get_context_parallel_groupcheck_for_nan_in_loss_and_gradget_rankisnanrB  rC  osunamer>   r{   )	r-  rT  lossesr^  rY  global_rankaveraged_lossoutput_dictrW   s	           rZ   r!  z-GPTTrainStep.get_loss_func.<locals>.loss_func  s   !!!.&\\^F!r*002I))A-yy%))FKKOi,G"H"M"Ma"PR[R_R_RaRfRfghRi!jk!!,,T9W9W9Y,ZAwa(yyR9!<=	O 22#//88:::< K= )$zz88:;8BHHJqM?T' FtfMM$mA&67K!!""Hf#56$$r\   )r$   rb   rc   rb  )r|   rl   r!  rW   s      @rZ   r  zGPTTrainStep.get_loss_func  sG    z	%< //DDP$$77LLLr\   c                       fd}|S )Nc                 z    j                  |       \  }}}}} |||||      }|t        j                  |      fS )re  )r+  rh  )	r   rY   r8  r+  r-  r>  rz  rT  r|   s	           rZ   r"  z8GPTTrainStep.get_forward_step_func.<locals>.forward_step  sG     GKnnUbFcCFFI~|!&,vVM '$..)"DDDr\   r   r|   r"  s   ` rZ   r  z"GPTTrainStep.get_forward_step_func  s    	E r\   ri  r  s   @rZ   rk  rk  K  s    H&6)p#J	r\   rk  c                   d     e Zd ZdZ fdZed        Zed        Zed        Zd Z	d Z
d Z xZS )	T5TrainStepze
    T5 train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                     t         |   d       | j                  ||j                        | _        | j                  |      | _        | j                         | _        |j                  sd | _
        y ddlm} || _
        y )Nr  r   )Seq2SeqLMOutput)r   r   r  ru   r   r  r!  r  r"  r#  r$  r%  r  )r|   rl   rW   r  r   s       rZ   r   zT5TrainStep.__init__  si    ',,[$:T:TU++K8 668%%&*D#E&5D#r\   c                 ^    | j                  d      }| j                  d      }||z  }|dk  }|S )Nr   r	         ?)	unsqueeze)r>  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_masks        rZ   attn_mask_postprocessz!T5TrainStep.attn_mask_postprocess  sC     ,55a8+55a8/2DD"4s":&&r\   c                 j    t        j                  t        j                  d| | f|            }|dk  }|S Nr   r   r  )r   trilones)r   r   r>  s      rZ   get_decoder_maskzT5TrainStep.get_decoder_mask  s3    EJJ:z/JSY$Z['#-r\   c                     | j                   \  }}| j                  d      }t        j                  ||df|      }||z  }|dk  }|S r  )r  r  r   r  )	r>  dec_seq_lengthr   r   _r  r  r  r  s	            rZ   get_enc_dec_maskzT5TrainStep.get_enc_dec_mask  sZ    &,,
A ,55a8"ZZ^Q(GPVW/2DD"4s":&&r\   c                     d }d }|j                   j                  j                   |j                   j                  j                  S |r		 ddlm} |S |S # t
        $ r Y |S w xY w)Nc                 R   g d}t         j                  }| t        |       }nd}t        j                  |||      }|d   j                         }|d   j                         }|d   j                         }|d   j                         }|d   dk  }	|d	   dk  }
|d
   dk  }|||||	|
|fS )r(  )text_enctext_decr+  r-  enc_maskdec_maskenc_dec_maskNr  r  r+  r-  r  r  r  r  r/  )r   r5  r6  r   r7  
tokens_enc
tokens_decr+  r-  r  r  r  s               rZ   r;  z6T5TrainStep.get_batch_func.<locals>.get_batch_megatron  s     kD{{H (M*$33D$IF  
+002J
+002JH%**,F{+113Ij)C/Hj)C/H!.1C7Lz9fhR^^^r\   c                 :   t        |       }t        |t        j                  j	                               }|d   j                         }|d   j                         }|dk7  j                  t        j                        }d|v r|d   j                         }nn|j                  |j                  |j                  t        j
                        }|dddf   j                         |dd	df<   d
|d<   |j                  |dk(  d
       t        j                  |d   j                               }t        j                  |j                  d	   |j                        }t        j!                  |d   j                         |j                  d	   |j                        }|||||||fS )r(  r=  r+  r@  decoder_input_ids)r   r   .NrL  r   r   ).r   r>  )r1  r   r   rB  rC  r3  rD  r4  	new_zerosr  r   clonemasked_fill_r  r  r  r  )	r   r   r  r+  r-  r  r  r  r  s	            rZ   rE  z9T5TrainStep.get_batch_func.<locals>.get_batch_transformer  sz   &D!$

(A(A(CDDk*//1J(^((*F4++EKK8I"d*!"56;;=
#--fll6==X]XbXb-c
&,S#2#X&6&<&<&>
37#%&
6"''
d(:A>"88>N9O9T9T9VWH"33J4D4DQ4GIZIZ[H&77%&++-z/?/?/BJDUDUL z9fhR^^^r\   r   rF  )rb   rc   rG  r   r   r   rH  s         rZ   r  zT5TrainStep.get_batch_func  ss    	_2	_. //IIU$$77QQQ 1  
 )(	  %%rI  c                     d }|j                   j                  j                   |j                   j                  j                  S |S )Nc                     |j                         }t        j                  |j                  d      | j	                  d      z        | j                         z  }|}t        |g      }|d|d   ifS )NrL  rN  r   )r4  r   rO  rP  rQ  r>   )r-  rT  rU  rW  rY  rZ  s         rZ   r!  z,T5TrainStep.get_loss_func.<locals>.loss_funcB  sh    $**,Hiib 1I4E4Eb4I IJY]]_\GDG	RO)_Q%7888r\   ra  )r|   rl   r!  s      rZ   r  zT5TrainStep.get_loss_funcA  s?    	9 //DDP$$77LLLr\   c                       fd}|S )Nc           	          
j                  |       \  }}}}}}} ||||||d|      }	|	t        
j                  |      fS )re  Nrf  rh  )r   rY   r  r  r-  r:  r  r  r  rT  r|   s             rZ   r"  z7T5TrainStep.get_forward_step_func.<locals>.forward_stepP  s_     ^b]k]k^ZJ
Iy(Hl "J(LX\hqM !'$..)"DDDr\   r   r  s   ` rZ   r  z!T5TrainStep.get_forward_step_funcO  s    	E r\   )r   r   r   r   r   staticmethodr  r  r  r  r  r  r  r  s   @rZ   r  r    sY    
6 
' 
'  
 	' 	'=)~r\   r  c                      t               } t                | j                  dk(  rt        d| j                   d       t        | j                  | j                         y )Nr   z> setting random seeds to z ...)r$   r3   rQ   rR   r   r4   data_parallel_random_init)rW   s    rZ   finish_mpu_initr  `  sF    :D yyA~*499+T:;TYY > >?r\   c                 8   | j                  d       t        j                  j                         sJ d       t	        |d      }|j                         D ]Q  \  }}t        ||d       2|j                  dk(  r#t        d| dt        ||       d| d| d	       t        |||       S |j                  s|j                  d
d      r|j                  J d       t        |       t        |       t        |       t                t!                t#                t%                t'               }t        |dd       t)        |j*                  |      |_        |j.                  dk(  r*|j0                  r|j2                  dk(  rd|_        d|_        y d|_        d|_        y )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z*WARNING: overriding default arguments for r   r   )flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argumentrn  rC   r	   )rR   r   rB  is_availabler+   r   r   rQ   r   r  getloadr-   r,   r0   r  r2   r1   r5   r$   r7   orig_vocab_sizern  rS   rP   rU   rT   	iteration)rl   extra_args_providerargs_defaultsrW   r   r   s         rZ   
initializer  m  s   01::""$?&??$ )tDD $))+ "
U4d#/yyA~@QwtUXGYFZZ`ad`eefglfmn 	c5!" =#4#45JE#Ryy$W&WW$!$'$     :Dt($/7!9$:N:NPT!Uv%$*?*?DOOWXDX $ DN !&DNr\   c                   v     e Zd ZdZ fdZd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Z	 	 	 	 	 	 	 	 ddZ xZS )MegatronEnginez
    Megatron-LM model wrapper

    Args:
        accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
        model: Megatron-LM model
        optimizer: Megatron-LM optimizer
        lr_scheduler: Megatron-LM lr scheduler
    c                    t         |           || _        |d   | _        || _        || _        t               }|j                  j                  j                  K |j                  j                  j                  |fi |j                  j                  j                  | _        n{|j                  dk(  rt        ||      | _        nZ|j                  dk(  rt        ||      | _        n9|j                  dk(  rt        ||      | _        nt!        d|j                         d| j                  _        i | _        i | _        d| _        d| _        d| _        d | _        |j0                  t3                y y )Nr   rC   rK   rL   rO   FT)r   r   module
base_modelrn   r^   r$   rb   rc   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrS   r  rk  r  rV   r   total_loss_dicteval_total_loss_dictr  report_memory_flag$num_floating_point_operations_so_farmodule_configtensorboard_dirr6   )r|   rl   rY   rn   r^   rW   r   s         rZ   r   zMegatronEngine.__init__  sR   (""z//GGS&bk&7&7&J&J&b&b'#))<<UU'D# !!V+&3K&FD#!!U*&2;&ED#!!T)&1+t&DD#78L8L7MNOO&+#  "$&!"&451!+%' ,r\   c                     t               }t         j                  d         } j                  j                  |_        t         j                  d   t              r|j                  r|j                  J d        j                  D cg c]  }|j                   c}|_	        t         j                        dk(  r|j                  d   |_	        |j                  rU j                  D cg c]  }|j                   c}|_        t         j                        dk(  r|j                  d   |_        |j                  rn|j                   rbt#        t         j                              D cg c]   fd
 c}|_        t         j                        dk(  r|j$                  d   |_        t&        |_        |S c c}w c c}w c c}w )Nr   zWhen overlap_grad_reduce is True, config.no_sync_func must be None; a custom no_sync_func is not supported when overlapping grad-reducer   c                 <    j                   j                  |       S r   )rn   finish_param_sync)xmodel_indexr|   s    rZ   <lambda>z2MegatronEngine.get_module_config.<locals>.<lambda>  s    $..::;J r\   )r$   r   r  rn   
scale_lossgrad_scale_funcr   LocalDDPoverlap_grad_reduceno_sync_funcno_syncrj   delay_grad_reducestart_grad_syncgrad_sync_funcoverlap_param_gatherdelay_param_gatherr   param_sync_funcr   finalize_model_grads_func)r|   rW   rD   model_chunkr  s   `   `rZ   get_module_configz MegatronEngine.get_module_config  sx   z!$++a.1!%!:!:dkk!nh/D4L4L&&. V. KO++"V;;#6#6"VF4;;1$&,&9&9!&<#%%X\XcXc(d)D)D(d%t{{#q(,2,A,A!,DF)$$)@)@^cdghlhshsdt^u&OZJ&F" 4;;1$)/)?)?)B&+?( #W )e&s   
F9+F>+Gc                     | j                   D ]  }|j                           | j                  | j                         | _        | j	                          y r   )r  trainr  r  log_eval_resultsr|   model_modules     rZ   r  zMegatronEngine.train  sL     KK 	!L 	! %!%!7!7!9Dr\   c                     | j                   D ]  }|j                           | j                  | j                         | _        y y r   )r  evalr  r  r  s     rZ   r  zMegatronEngine.eval  sE     KK 	 L	  %!%!7!7!9D &r\   c                 x   t               }g }t        |      dkD  r|j                  dkD  rot        d|j                        D ]U  }|j	                  |j                         D ci c](  \  }}||||j                  z  |dz   |j                  z   * c}}       W n|g}t        | j                        dkD  r`t        |      dkD  r7t        t        | j                              D cg c]  }t        |       c}}|S d gt        | j                        z  }|S t        |      dkD  rt        |      nd }|S c c}}w c c}w )Nr   r   )	r$   rj   r   r   r   r   r   r  iter)	r|   
batch_datarW   data_chunksr   r   vr  batch_data_iterators	            rZ   get_batch_data_iteratorz&MegatronEngine.get_batch_data_iterator  sC   zz?Q%%)q$"8"89 A&& )3(8(8(: $1 qT%:%:!:a!etG\G\=\]]  *lt{{a z?Q& -2#dkk2B,CDqk"D   #"	 Vc$++..   #" 8;:7J${"3PT""! Es   !-D1"D7c                     | j                  |      }t        | j                  j                  || j                  | j
                  | j                  | j                        \  }}}}|dk(  | j
                  _        ||||fS )z
        Training step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to train on.
        )forward_step_funcr   rY   rn   opt_param_schedulerrD   r   )	r  r<   r  r"  r  rn   r^   r  r   )r|   r  r  loss_reducedr   	grad_normnum_zeros_in_grads          rZ   r<   zMegatronEngine.train_step
  s}     #:::FCM"55BB-++nn $%%D
@lI/@ '3a&7#\96GGGr\   c           	         t               }| j                  |      }t               } || j                  j                  || j
                  t               |j                  |j                  d      }|j                  dk\  rt        j                  j                          |xj                  t        j                         |j                  z  t               z  z  c_        t        j                   d      rni }|d   D ]b  }|D cg c]  }||   	 }	}t#        |	d   j$                        dk(  rt'        |	      t#        |	      z  ||<   Kt        j(                  |	      ||<   d |S i S c c}w )z
        Evaluation step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to evaluate on.
        T)r  r   rY   num_microbatchesr   r   forward_onlyr   )ignore_virtualr   )r$   r  r   r  r"  r  r   r   r   empty_unused_memory_levelr   rB  empty_cacher   r   r   is_pipeline_last_stagerj   r  rO  r  )
r|   r  rW   r  forward_backward_func
loss_dictsr  r   r  losses_reduced_for_keys
             rZ   	eval_stepzMegatronEngine.eval_step!  sO    z":::F 9 ;*"55BB-++13!22

 ))Q.JJ""$##,,.1F1FFI]I__	
# %%T:L!!} M:D)EQ!C&)E&)E-a06671<(+,B(CcJ`Fa(aL%(-5K(LL%M  	 *Fs   ?E!c                    t               }| j                  d   j                  r6 | j                  d
i |\  }}}}| xj                  dz  c_        t        j                         |j                  z  t               z  }|xj                  |z  c_	        | xj                  t        ||      z  c_
        |j                  }| j                  j                         j                         }d }	|j                   rt#        | j$                        }	t'        || j(                  | j                  j*                  d   d   | j                  || j,                  |||	|
      | _        n | j.                  d
i |}|j                  |D ]  }
| j0                  j3                  |
t4        j6                  j9                  dg            ||
   z   | j0                  |
<   | j0                  j3                  |
dz   t4        j6                  j9                  dg            t4        j6                  j9                  dg      z   | j0                  |
dz   <    t5        j:                  dt4        j6                  j=                               }|D ]&  }
t?        ||
   j@                        dk(  s|||
   z  }( d }d|v r|d   }| jB                  jD                  | jB                  jE                  ||	      S |S )Nr   r   lr        
_num_iters      ?r  r^  )rY  r^  r   )#r$   r  trainingr<   r  r   r   r   r   r   r  r:   r  rn   get_loss_scaleitemlog_params_normr?   rY   r=   r  param_groupsr  r  r  r  r   rB  FloatTensorr   rC  rj   r  r  r$  )r|   r  rW   	loss_dictr   r  r  r   
loss_scaleparams_normr   rY  r^  s                rZ   forwardzMegatronEngine.forwardI  s    z;;q>""DSDOODaV`DaAI|Y0ANNaN99;d>S>SSVjVllJ'':5'559VW[]g9hh5##/!^^::<AAC
"''"5djj"AK*6((NN//248NN++ %+' '44I##/$ 6C1155c5::;Q;QSVRW;XY\efi\jj --c2 EID]D]DaDal*EJJ,B,BC5,IE

..u5E6D--cL.@A	6 ||C

(A(A(CD 	'C9S>''(A-	#&	' y x(F""55A**==4PV=WWr\   c                    t               }|j                  | j                  dk(  ry t               }t               }d| j                   d}| j                  D ]  }|j                  d      r| j                  |   | j                  |dz      z  }|| d| dz  }t        j                  t        d|j                                     }|j                  r|| d| dz  }|s|j                  | d|j                         | j                         |j                  s|j                  | d	|| j                          t        |      d
z   }t        d|z         t        |       t        d|z         i | _        y )Nr   zvalidation loss at iteration z | r  z value:    z PPL: z validationz validation pplr   -)r$   r  r  r%   r  endswithmathexpminr  rP   
add_scalarrj   r'   )r|   rW   writerstringr   r   ppllengths           rZ   r  zMegatronEngine.log_eval_results  sf   z'4>>Q+>z')00@D,, 	TC||L)--c2T5N5NsUaOa5bbEXeWC00F((3r5::<01C$$SEuC00!!SE"5uzz|T^^T((%%_&=sDNNS	T Vqf%f%$&!r\   c                 B   | j                          t               }||_        t        j                  j                          t        | j                  | j                  | j                  | j                  | j                         t        j                  j                          y )N)r  )r  r$   saver   r   barrierr/   r  r  rn   r^   r  )r|   
output_dirrW   s      rZ   r/   zMegatronEngine.save_checkpoint  so    z	!!#NNKKNNNN151Z1Z	
 	!!#r\   c                    t               }||_        d|_        d|_        t        j
                  j                          t        | j                  | j                  | j                        \  }}t        j
                  j                          || _        || _        |j                  r+| j                  dk(  r| j                  j                          y y y r  )r$   r  r   r   r   r   r,  r.   r  rn   r^   r  r  fp16reload_model_params)r|   	input_dirrW   r  r  s        rZ   r.   zMegatronEngine.load_checkpoint  s    z	&'#&'#!!#:I$++W[WeWegkgugu:v7	7!!#"4X1991,NN..0 -9r\   c
                 
   t               }|j                  dk7  rt        d      |j                  dkD  rt	        d      |j
                  rt	        d      |j                  t	        d      |j                  t	        d      ||t	        d	      |d
}n"d|cxk  rdk  st	        d       t	        d      |d}n"d|cxk  rdk  st	        d       t	        d      |d}n7|dkD  r|dkD  rt	        d      d|cxk  rd
k  st	        d       t	        d      |
j                  dd      }d|cxk  rd
k  st	        d       t	        d      |
j                  dd      }d|cxk  rd
k  st	        d       t	        d      |
j                  dd      }t        |t              st	        d      |}|>t        |t              st	        d      |dk  rt	        d      |j                  d   dkD  ryt               }|
j                  d|j                        }|t        |t              st	        d      |	d
}	d}d}d}t        j                   j#                         dk(  r|>t        j$                  j'                  |j                  d   g|j                  d   z        }n |j)                  d       j%                         }|||j                  d   z
  }|dk  rt	        d!      |r||j                  d   z   dz   }d"t+        j,                  |d"z        z  }||j                  d   dz   z
  }t        j$                  j'                  |j                  g|z  g|j                  d   z        }t        j.                  t        j0                  |dddf   d       |j%                         |gd       }n||j                  d   z   }d"t+        j,                  |d"z        z  }||j                  d   z
  }t        j$                  j'                  |j                  g|z  g|j                  d   z        }t        j.                  |j%                         |gd       }|j3                  d      |j3                  d      g}t5        d#|d$      }|j7                         }t9        |t        j:                  |d%      }t9        |d   t        j:                  |d%      }|
j                  d&d      }t        j<                  j?                  |       tA        | jB                  tD        tF        tH        f      }|tK        |||||d|	'      \  }}|S tM        |||d|||||d()
      \  }}}|S )*a  
        Generate method for GPT2 model. This method is used for inference. Supports both greedy and beam search along
        with sampling. Refer the Megatron-LM repo for more details

        Args:
            inputs (torch.Tensor): input ids
            attention_mask (torch.Tensor, optional): attention mask. Defaults to None.
            max_length (int, optional): max length of the generated sequence. Defaults to None.
            Either this or max_new_tokens should be provided.
            max_new_tokens (int, optional): max number of tokens to be generated. Defaults to None.
            Either this or max_length should be provided.
            num_beams (int, optional): number of beams to use for beam search. Defaults to None.
            temperature (float, optional): temperature for sampling. Defaults to 1.0.
            top_k (int, optional): top k tokens to consider for sampling. Defaults to 0.0.
            top_p (float, optional): tokens in top p probability are considered for sampling. Defaults to 0.0.
            length_penalty (float, optional): length penalty for beam search. Defaults to None.
            kwargs: additional key-value arguments
        rK   z1Generate method is not implemented for this modelr   z1Generate method requires data parallelism to be 1z9Generate method requires sequence parallelism to be FalseNz2Checkpoint activations cannot be set for inferencez$Vocab file is required for inferencez;`max_length` or `max_new_tokens` are required for inferencer  r  g      Y@zAtemperature must be a positive number less than or equal to 100.0r   i  z:top_k must be a positive number less than or equal to 1000z/top_p and top_k sampling cannot be set togetherz'top_p must be less than or equal to 1.0top_p_decayz-top_p_decay must be less than or equal to 1.0top_p_boundz-top_p_bound must be less than or equal to 1.0add_BOSFzadd_BOS must be a booleanzbeam_width must be an integerz!beam_width must be greater than 0z,When doing beam_search, batch size must be 1
stop_tokenzstop_token must be an integerrL  )axisz%max_new_tokens must be greater than 0   r	   )int_listrQ   )r   rQ   random_seed)r6  num_return_genlength_penaltyT)return_output_log_probstop_ktop_pr3  r4  temperature#use_eod_token_for_early_termination)'r$   rS   NotImplementedErrordata_parallel_sizerV   sequence_parallelrecompute_granularityrp  r  r   r   r]  r  r&   rq  r   r   r  rB  
LongTensorrO  r"  ceilr  r  sizer   tolistr   r0  randommanual_seedrA   r  torchDDPr  r    r   r   )r|   inputsr>  
max_lengthmax_new_tokens	num_beamsr@  r>  r?  r<  r   rW   r3  r4  r5  
beam_widthru  r6  
sizes_listprompts_tokens_tensorprompts_length_tensorr  sizes_tensorsizescontext_tokens_tensorcontext_length_tensorr:  unwrapped_modelr8  r  s                                 rZ   megatron_generatez MegatronEngine.megatron_generate  sw   B z5(%&YZZ""Q&PQQ!!XYY%%1QRR??"CDD ."8Z[[K,u,`aa -`aa=Eu$$YZZ %YZZ=ES[US[NOO5'C' !JKK ( !JKKjj4{)c)LMM *LMMjj4{)c)LMM *LMM**Y.7D)899
!j#. !@AAA~ !DEE||A"E!O	ZZimm<
!j#. !@AA! N
 $ $%%'1,%(-

(=(=v||A>ORXR^R^_`Ra>a(b%(6(:(:(:(C(H(H(J%%!+fll1o!=" !HII+fll1o=A
:>!::
!+v||A/B!C**//)-->1Q0RU[UaUabcUd0de(-__WQT]<fkkmWU\^)%
 ,fll1o=
:>!::
!+fll1o!=**//)-->1Q0RU[UaUabcUd0de(-fkkmW5MTV(W% &**1-%**1-J *!jqI ##% 0Lahi j 0q5;;Odkl m jj2  -&t8]8[\!=%%% -IFA,  K%%(-'''48LFAq r\   )NNNNNNNN)r   r   r   r   r   r  r  r  r  r<   r  r  r  r/   r.   rZ  r  r  s   @rZ   r  r    sb    (>4 :#2H.&P=~'4$1  pr\   r  c                     t        |       S )z
    Average losses across data parallel group.

    Args:
        losses (List[Tensor]): List of losses to average across data parallel group.
    )r>   )r  s    rZ   %avg_losses_across_data_parallel_groupr\  q  s     5V<<r\   c                 $    d }t        || d      S )z
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather across data parallel ranks.

    c                    | j                   dk(  r| j                         d    } t        t        j                  j                  t        j                                     D cg c]  }t        j                  |        }}t        j                  j                  || t        j                                t        j                  |d      S c c}w )Nr   r   r|  )ndimr  r   r   r   get_world_sizer   get_data_parallel_group
empty_like
all_gatherr  )r   r  output_tensorss      rZ   _gpu_gather_onez;gather_across_data_parallel_groups.<locals>._gpu_gather_one  s    ;;!\\^D)F 5,,;;#B]B]B_;`a
 V$
 
 	$$^V3C^C^C`$ayyQ//
s    C	T)error_on_other_type)r   )r   re  s     rZ   "gather_across_data_parallel_groupsrg  |  s    0 _f$OOr\   )TTTT)xrv   r"  r  abcr   	functoolsr   r   torch.nn.functionalnn
functionalrR  torch.nnr   r   r   torch.nn.parallel.distributedr   rL  rn   r
   r^   r   importsr   
operationsr   r   megatron.corer   r   megatron.core.distributedr  r   megatron.core.enumsr   )megatron.core.num_microbatches_calculatorr   megatron.core.optimizerr   megatron.core.parallel_stater   r   megatron.core.pipeline_parallelr   megatron.core.utilsr   0megatron.inference.text_generation.communicationr   r   -megatron.inference.text_generation.generationr   r   "megatron.legacy.data.dataset_utilsr   megatron.legacy.modelr   r    r!   r"   $megatron.legacy.model.classificationr#   megatron.trainingr$   r%   r&   r'   megatron.training.argumentsr(   r)   r*   r+   r,   megatron.training.checkpointingr-   r.   r/   megatron.training.global_varsr0   megatron.training.initializer1   r2   r3   r4   r5   r6   %megatron.training.tokenizer.tokenizerr7   megatron.training.trainingr8   r9   r:   r;   r<   r=   megatron.training.utilsr>   r?   r@   rA   r[   rq   rs   r   r   r   rf   r  r  rg   r  r  rk  r  r  r  Moduler  r\  rg  r   r\   rZ   <module>r     sx     	      A A M , , - 9 2M>-N>pI4e SQQC   lkB  O  2j'>jL jLZ$>LD+!5 + b .!5  "K% K\A$ AHN# Nb	@ 15B -`PUXX__ Ph=Pr\   