o
    ߥin                     @   sr   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d
ddZd
ddZG dd	 d	ZdS )    N)mpuprint_rank_0)poissonc                 C   s>   |d u r
t | d }t|ddD ]}| | |kr|  S qdS )N   )lenrangelstvalstarti r   \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/blocklm_utils.pyrindex   s   r   c                 C   s8   |d u rd}t |t| D ]}| | |kr|  S qdS )Nr   r   )r   r   r	   r   r   r   index_in_list   s   r   c                   @   s   e Zd Z																				d d	d
Zdd Zed!ddZdd Z	d"ddZ	d"ddZ		d"ddZ
dd Zdd Zedd ZdS )#ConstructBlockStrategy      ?              ?333333?   (   TFc              	      s  |j | _ || _d| _|| _t | _t | _d|  kr"dks%J  J || _	|| _
d| | | _| jdks9J || _|| _|| _|	| _ fddtd|D | _|| _|| _|| _|| _|| _|| _|| _|| _|| _|rqdnd	| _| j| jj| _|rd
nd	| _| j| jj| _|| _|| _ t!d| j	 d| j
 d| j d| j  t!d| j d| j d| j  t!d| j  t!d| j d| j  d S )Nr   r   r   r   g|۽c                    s   g | ]}t | qS r   )r   pmf).0r   average_block_lengthr   r   
<listcomp>K   s    
z3ConstructBlockStrategy.__init__.<locals>.<listcomp>gMASKMASKsMASKz
BERT prob z, gap sent prob z, GPT prob z, infill prob zgeneration min ratio z, block ratio z, gap sent ratio zblock length distribution zblock mask prob z, context mask ratio )"	eod_token	tokenizercountmax_seq_lengthr   get_data_parallel_rankrankget_data_parallel_world_size
world_size	bert_probgap_sentence_probgpt_probinfill_probgpt_min_ratio
bert_ratiogap_sentence_ratior   block_length_distributionblock_mask_probcontext_mask_ratiocontext_mask_rangeshort_seq_probsingle_span_probblock_position_encodingencoder_decodershuffle_blockssentinel_tokengeneration_maskget_commandIdgap_sentence_maskrandom_position	masked_lmr   )selfargsr"   r$   r)   r*   gpt_infill_probr-   r.   r/   r   max_block_lengthr1   r2   r3   r4   r5   r6   r7   r8   r9   	task_maskr>   r?   r   r   r   __init__"   sj   


"
zConstructBlockStrategy.__init__c                 C   s   | j |}d|v rdS d|v rdS d|v rdS d|v rdS d|v r$dS d|v r*dS d|v r0dS d	|v r6dS d
|v r<dS d|v rBdS d|v rHdS dS )N.T?!;:u   。u   ？u   ！u   ；u   …
F)r"   	IdToToken)r@   tokr   r   r   contains_sentence_endl   s0   z,ConstructBlockStrategy.contains_sentence_endr   c                    s   |t |  }|t|  d   fddtt| D }|  g }t|| D ]\}}|| }	|| | }
||	|
f ||d 7 }q'|S )Nr   c                    s   g | ]	}  d  qS )r   )	randrange)r   _mrngr   r   r          z7ConstructBlockStrategy.sample_spans.<locals>.<listcomp>)sumr   r   sortzipappend)span_lengthstotal_lengthrS   offsetblank_lengthplacesspansplacespan_lengthr   endr   rQ   r   sample_spans   s   z#ConstructBlockStrategy.sample_spansc                 C   s  | | g }d}dgt|| jkd   }t|}g }t|D ]<}	|	}
|
d t|k r?||
d  | jdj	kr?|
d7 }
||
 d }|t|krS|dkrS|d8 }|
|
d |f |	}q"|jdd d t|D ]\}\}}|t|d krd\}}|| t|k r|||  | | |kr||||  7 }|d7 }|| t|k r|||  | | |ks|dkr| j||||  |||d	}||7 }|| t|d k rt|||d  |d | | qkt|| j }d\}}|| t|k r!|||  | |kr!||||  7 }|d7 }|| t|k r!|||  | |ks|dkr=| j||||  |||d	}||7 }||7 }qk|S )
Nr   r   r   ENCc                 S      | d S )Nr   r   xr   r   r   <lambda>       z@ConstructBlockStrategy.sample_span_in_document.<locals>.<lambda>keyr   r   )r[   )shufflenpwherer!   tolistr   reversedr"   r;   r<   rX   rV   	enumeraterb   printintr.   )r@   tokensmasked_lengthsrS   
mask_spans
mask_indexindices
last_index	documentsindexstart_indexlengthr   r[   current_masked_lengthcurrent_countr^   current_masked_totalr   r   r   sample_span_in_document   s   

z.ConstructBlockStrategy.sample_span_in_documentbertc                 C   s   t jt|td}t|}| jdj}	t j	t|td}
|D ]\}}t
||D ]}|	||< q)d|
||< q ||
 }||||fS )Ndtyper   r   )rm   aranger   rs   copydeepcopyr"   r;   r<   zerosr   )r@   rt   
loss_masksattention_maskblock_spansrS   taskposition_idstargetsmask_id	mlm_masksr   ra   idxr   r   r   make_masked_data   s   

z'ConstructBlockStrategy.make_masked_datac           #      C   sF  t |}tjt |td}|D ]\}	}
d||	d |
< qt|d }| jr@|d | jd k r@| j|d  }|d|}|| }| jsF| j	sO|j
dd d n|| | jradd	 t|D }nd
d	 |D }g g g g f\}}}}|D ]\}	}
}|dkrdnd| }|| j|jg t||	|
 }| jdkr|dkrtt |D ]}t | jk r| jdj||< q|| |||	|
  || jdjg | js||	|
 }|| ||d g n|| jg|
|	 d   | jr|tjd|
|	 d td qt|dg|
|	 d   qt|j
dd d g g g }}}d\}}|D ]`\}	}
}|dkr<| j}n|dkrE| j}n|dkrLdnd| }| j|j}||||	 | f ||||	  ||g ||||	  |||	 g ||	| d 7 }|
}q.|t |k r|||t | | f |||d   |||d   ttt |}|d ur||ksJ |r| jt| v rt d| j!| t"| jr|| jdjg }tjt |td}|||fS t|| }|dkr^| j#dkr^t$ }|D ]/\}	}
|	dkr-t%|
|	| j& }|'t|	| |
dkrBt(|	|
| j& }|'t||
 q|)|t| j#| } | D ]}!| jdj||!< qQt|| }tjt |td}d|d |< t|| }ttj*|tdg| }"tj+||"gdd}|d ur||||fS |||||fS )Nr   r   r   r   c                 S   rd   Nr   r   re   r   r   r   rg      rh   z8ConstructBlockStrategy.make_block_data.<locals>.<lambda>ri   c                 S   s   g | ]\}\}}|||fqS r   r   )r   r   r   ra   r   r   r   r      s    
z:ConstructBlockStrategy.make_block_data.<locals>.<listcomp>c                 S   s   g | ]	\}}||d fqS r   r   )r   r   ra   r   r   r   r      rT   sopr   r   dBLOCKeop   c                 S   rd   r   r   re   r   r   r   rg     rh   rk   
generationgap_sentencer   zFound EOS in targetaxis),r   rm   onesrs   cumsumr>   r$   rO   r7   r8   rV   rl   r9   rq   rX   r"   r;   r<   r   r   r1   r   randomr6   r   r:   r=   rU   mapr!   concatenatero   rr   	DecodeIdsRuntimeErrorr2   setminr3   updatemaxsampler   stack)#r@   rt   r   r   r   rS   r   text_lengthr   r   ra   position_biastarget_tokenstarget_position_idstarget_block_position_idsr   r   	sop_tokenspan_tokenssub_idxtarget_position_idsource_tokenssource_position_idslocal_spanslastcurrent_lengthr   
mask_tokensource_lengthmask_candidates	local_endlocal_startmask_posposblock_position_idsr   r   r   make_block_data   s   











z&ConstructBlockStrategy.make_block_datac           
      C   s   | | |d |d }}|d | jdjksJ | |||}t|t|k r,d S | jr:| |||||}	|	S | j||||||d}	|	S )Ntext	loss_maskr   rc   r   )	rl   r"   r;   r<   r   r   r?   r   r   )
r@   r   ru   r   rS   r   rt   r   r   datar   r   r   generate_blank_dataV  s(   


z*ConstructBlockStrategy.generate_blank_datac                 C   s  | d| jd }| jd | }g }| jdj}| jdj}|D ]}|d dd  |d dd  }	}
t|D ]}|t|	krG|	|
}}n| dt|	| }|dkr|	| |ksl| |	|d  s|	|d  |ks|d8 }|dkr|	| |ksl| |	|d  s|	|d  |krl|| }||kr| |	|d  s|	|d  |ks|d8 }||kr| |	|d  s|	|d  |kr|| |d k r|| }|	|| |
|| }}t	|g|f}t	dg|f}|
||d	 q9q"|S )
N    r   rc   eosr   r   r   r   )r   r   )rO   r$   r"   r;   r<   r   r   rN   rm   r   rX   )r@   samplesrS   target_length
num_splitsnew_samplescls_ideos_idr   rt   r   rP   
new_tokensnew_loss_masksrandom_start
random_endr   r   r   split_sampleso  sn   "



z$ConstructBlockStrategy.split_samplesc           '   	   C   s`  t jj }|d ur|j|j}}nd\}}t| j| | | j	 | j
 }|  jd7  _g g g g f\}}}}	g g }
}| | jk rI| ||}| }|| jk }|rVdn| }g }|| jk rd}|D ]}|r|jtdt| jd | jdd g}|d }n:g d}}|t| jt|d  k r|jtdt| jd | jdd }|| ||7 }|t| jt|d  k s| jrt|d }nt|d | t| }| j||||dd}|d ur| jr|\}}}|
| || || n|\}}}}|| || || |	| || qfnF|| j| j k rd	}|D ]}|d |d
 }}g }|d | jdjkrCdnd}tt|D ]1}| || rk||d k re|||d f |d }qK|| | jdjkr{|d }qK|t|k r||t|f |st j dkrzt | j!|dd   W n t"y   t |dd   Y nw |#| g d}}|D ] \}} ||| f || | 7 }|t| j$t| kr nq| j%||d ||dd}|\}}}}}|| || || |	| || q(nLd}|&t| j't(t)dd | t*t)dd |d }!|D ]+}t(|!t|d d }"|t|d |" d  t+|d | jdjdt|d d fv}#|#sp| | j,k r/t|d |" }$|d |d
 }}|d |$ ||$d  }}||$d  }%t-.|| j/| jdjg|d d f}t-.|| j/g|f}t-.t-j0t|d td|%f}|| || || t-j1t|t| d td}t||t|d d < | j2rt-.t-j0t|tdt-j1t|d tdf}&nt-.t-j0t|d tdt-j3t|d tdf}&|	t-j4||&gdd q4| j||"g|d |dd\}}}}|| || || |	| |d u r_t ||"|# q4| jr}t j5|
t j6dt j5|t j6dt j5|t j6ddS | 7||||	\}}}}	t j5|t j6dt j5|t j6dt j5|t j6dt j5|	t j6dt j5|t j6d|dS )N)r   r   r   r   r   )weightsr   r   r   sentencer   rc   r   r   gptc                 S      t | d S Nr   r   re   r   r   r   rg         z9ConstructBlockStrategy.construct_blocks.<locals>.<lambda>c                 S   r   r   r   re   r   r   r   rg     r   r   r   r   r   r   r   )r   targetr   )r   r   r   position_idr   mode)8torchutilsr   get_worker_infoidnum_workersr   Randomr#   r(   r&   r4   r   r5   r)   choicesr   r   r0   rs   r.   rX   r?   r   r7   r*   r"   r;   r<   rN   distributedget_rankrr   r   
IndexErrorrl   r/   r   randintr-   r   r   r   r   r,   rm   r   r:   r   r   r6   r   r   tensorlong	pad_batch)'r@   r   worker_info	worker_idr   rS   token_batchtarget_batchloss_mask_batchposition_id_batchsource_batchrandsingle_spanr   r   r   ru   masked_countblock_lengthsepr   r   r   r   rt   r   r   sentence_spansry   r   r   r   ra   max_generation_lengthgeneration_lengthmultiple_docdivisiontarget_masksr   r   r   r   construct_blocks  s  














(






/









z'ConstructBlockStrategy.construct_blocksc                    s   t tt| }||d t|kr:t|  fdd| D }  fdd|D } fdd|D } fdd|D }| |||fS )Nr   c              	      ,   g | ]}t |t j t| td fqS r   rm   r   r   r   rs   )r   rt   
max_lengthr   r   r   U      z4ConstructBlockStrategy.pad_batch.<locals>.<listcomp>c              	      r   r   r   )r   r   r  r   r   r   Z  r  c              	      r   r   r   )r   r   r  r   r   r   _  s    c              	      s6   g | ]}t j|t jd  |jd  ftdfddqS )r   r   r   r   )rm   r   r   shapers   )r   r   r  r   r   r   e  s    )listr   r   r#   r   )r   r   r   r   seq_lengthsr   r  r   r   O  s    



z ConstructBlockStrategy.pad_batchN)r   r   r   r   r   r   r   r   r   r   r   r   r   TFTFFFFr   )r   )__name__
__module____qualname__rE   rN   staticmethodrb   r   r   r   r   r   r   r   r   r   r   r   r       sJ    
JB

y
& ;r   )N)r   mathr   numpyrm   r   torch.utils.datamegatron_utilr   r   scipy.statsr   r   r   r   r   r   r   r   <module>   s   

		