o
    ߥi5                     @   s   d dl Z d dlZd dlZd dlmZ d dlm  mZ ddl	m
Z
 G dd dejZG dd dejZG dd	 d	ejZG d
d dejZG dd dejZG dd dejZdS )    N   )Rotation2xyzc                       s   e Zd Z												
				d fdd	Zdd Zdd ZdddZdd ZdddZ fddZ	 fddZ
  ZS ) MDM            皙?NgeluFrot6damass   	trans_encc                    s|  t    || _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|dd | _| j| j | _|dd| _|dd| _|dd| _|| _| jdkrl| jnd	| _t| j| j| j | j| _t| j| j| _ || _!| jd
krt"d t#j$| j| j| j| j| jd}t#j%|| jd| _&n<| jdkrt"d t#j'| j| j| j| j|d}t#j(|| jd| _)n| jdkrt"d t#j*| j| j| jdd| _+nt,dt-| j| j | _.| jdkr%d| jv rt#/| j| j| _0t"d t"d || _1| 2|| _3d| jv r%t4| j| j| _5t"d t6| j| j| j| j| j| _7t8d|| jd| _9d S )N
action_embnormalize_encoder_outputF	cond_modeno_condcond_mask_prob        grur   r   zTRANS_ENC init)d_modelnheaddim_feedforwarddropout
activation)
num_layers	trans_deczTRANS_DEC initzGRU initT)r   batch_firstz>Please choose correct architecture [trans_enc, trans_dec, gru]textz
EMBED TEXTzLoading CLIP...actionzEMBED ACTIONcpu)devicesmpl_data_pathdataset):super__init__legacy	modeltypenjointsnfeatsnum_actionsdata_repr#   pose_repglobglob_rottranslation
latent_dimff_sizer   	num_headsr   ablationr   clip_dimgetr   input_featsnormalize_outputr   r   archgru_emb_dimInputProcessinput_processPositionalEncodingsequence_pos_encoderemb_trans_decprintnnTransformerEncoderLayerTransformerEncoderseqTransEncoderTransformerDecoderLayerTransformerDecoderseqTransDecoderGRUr   
ValueErrorTimestepEmbedderembed_timestepLinear
embed_textclip_versionload_and_freeze_clip
clip_modelEmbedActionembed_actionOutputProcessoutput_processr   rot2xyz)selfr'   r(   r)   r*   r/   r,   r-   r.   r0   r1   r   r2   r   r"   r3   r   r&   r+   r#   r4   r8   r>   rM   kargsseqTransEncoderLayerseqTransDecoderLayer	__class__ f/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/motion_generation/modules/mdm.pyr%      s   








zMDM.__init__c                 C   s   dd |   D S )Nc                 S   s   g | ]\}}| d s|qS )zclip_model.)
startswith).0namepr[   r[   r\   
<listcomp>   s    z*MDM.parameters_wo_clip.<locals>.<listcomp>)named_parameters)rU   r[   r[   r\   parameters_wo_clip   s   zMDM.parameters_wo_clipc                 C   s4   t j|ddd\}}|  | D ]}d|_q|S )Nr    F)r!   jit)cliploadeval
parametersrequires_grad)rU   rM   rO   clip_preprocessr`   r[   r[   r\   rN      s   
zMDM.load_and_freeze_clipc                 C   s\   |j \}}|rt|S | jr,| jdkr,ttj||jd| j |d}|d|  S |S )Nr   )r!   r   g      ?)	shapetorch
zeros_liketrainingr   	bernoullionesr!   view)rU   cond
force_maskbsdmaskr[   r[   r\   	mask_cond   s   

zMDM.mask_condc                 C   s   t |  j}| jdv rdnd }|d urFd}|d }||k s J tj||dd|}tj|j	d || g|j
|jd}tj||gd	d
}n
tj|dd|}| j| S )N)humanmlkit   M      T)context_lengthtruncater   )dtyper!   r   )dim)r~   )nextrh   r!   r#   re   tokenizetorl   zerosrk   r   catrO   encode_textfloat)rU   raw_textr!   max_text_lendefault_context_lengthr}   textszero_padr[   r[   r\   r      s,   zMDM.encode_textc                 C   s  |j \}}}}| |}|dd}	d| jv r*| |d }
|| | j|
|	d7 }d| jv r?| |d }|| j||	d7 }| jdkrn|	||| d|}|
|dd}|ddd	}|	|| jd|}tj||fdd
}| |}| jdkrtj||fd	d
}| |}| |dd }nC| jdkr| jrtj||fd	d
}n|}| |}| jr| j||ddd }n| j||d}n| jdkr|}| |}| |\}}| |}|S )z~
        x: [batch_size, njoints, nfeats, max_frames], denoted x_t in the paper
        timesteps: [batch_size] (int)
        uncondFr   )rs   r   r   r   r|   r   axisr   Nr   )tgtmemory)rk   rJ   r5   r   r   rL   rw   rQ   r8   reshaperepeatpermuter0   rl   r   r;   r=   rC   r>   rF   r   rS   )rU   x	timestepsyrt   r(   r)   nframesembrs   enc_textr   
x_reshapedemb_gruxseqoutput_r[   r[   r\   forward   s\   












zMDM.forwardc                    s   t  | | jj| d S N)r$   _applyrT   
smpl_model)rU   fnrY   r[   r\   r      s   z
MDM._applyc                    s*   t  j|i | | jjj|i | d S r   )r$   trainrT   r   )rU   argskwargsrY   r[   r\   r      s   z	MDM.train)r   r   r   r   r	   NNr
   Fr   r   r   r   FN)Fr   )__name__
__module____qualname__r%   rc   rN   rw   r   r   r   r   __classcell__r[   r[   rY   r\   r      s0    {

7r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )r<   r	     c                    s   t t|   tj|d| _t||}tjd|tj	d
d}ttd|d	 td |  }t|| |d d dd df< t|| |d d dd df< |
ddd}| d| d S )N)r`   r   )r   r   r|   g     @pe)r$   r<   r%   r@   Dropoutr   rl   r   aranger   	unsqueezeexpnplogsincos	transposeregister_buffer)rU   r   r   max_lenr   positiondiv_termrY   r[   r\   r%     s      zPositionalEncoding.__init__c                 C   s*   || j d |jd d d f  }| |S Nr   )r   rk   r   )rU   r   r[   r[   r\   r     s    
zPositionalEncoding.forward)r	   r   r   r   r   r%   r   r   r[   r[   rY   r\   r<      s    r<   c                       $   e Zd Z fddZdd Z  ZS )rI   c                    sF   t    || _|| _| j}tt| j|t t||| _d S r   )	r$   r%   r0   r=   r@   
SequentialrK   SiLU
time_embed)rU   r0   r=   time_embed_dimrY   r[   r\   r%     s   


zTimestepEmbedder.__init__c                 C   s   |  | jj| dddS )Nr   r   r|   )r   r=   r   r   )rU   r   r[   r[   r\   r   #  s
   

zTimestepEmbedder.forwardr   r[   r[   rY   r\   rI     s    rI   c                       r   )r:   c                    sR   t    || _|| _|| _t| j| j| _| jdkr't| j| j| _d S d S Nrot_vel)	r$   r%   r+   r6   r0   r@   rK   poseEmbeddingvelEmbedding)rU   r+   r6   r0   rY   r[   r\   r%   *  s   

zInputProcess.__init__c                 C   s   |j \}}}}|d|||| }| jdv r| |}|S | jdkrB|dg }| |}|dd  }| |}tj||fddS t)N)   r   r   r|   r   xyzhml_vecr   r   r   r   )	rk   r   r   r+   r   r   rl   r   rH   )rU   r   rt   r(   r)   r   
first_posevelr[   r[   r\   r   3  s   





zInputProcess.forwardr   r[   r[   rY   r\   r:   (  s    	r:   c                       r   )rR   c                    s^   t    || _|| _|| _|| _|| _t| j| j| _	| jdkr-t| j| j| _
d S d S r   )r$   r%   r+   r6   r0   r(   r)   r@   rK   	poseFinalvelFinal)rU   r+   r6   r0   r(   r)   rY   r[   r\   r%   F  s   

zOutputProcess.__init__c                 C   s   |j \}}}| jdv r| |}n&| jdkr5|dg }| |}|dd  }| |}tj||fdd}nt|||| j| j	}|
dddd}|S )Nr   r   r   r   r   r|   r   )rk   r+   r   r   rl   r   rH   r   r(   r)   r   )rU   r   r   rt   ru   r   r   r[   r[   r\   r   Q  s   




zOutputProcess.forwardr   r[   r[   rY   r\   rR   D  s    rR   c                       r   )rP   c                    s"   t    tt||| _d S r   )r$   r%   r@   	Parameterrl   randnaction_embedding)rU   r*   r0   rY   r[   r\   r%   d  s   


zEmbedAction.__init__c                 C   s&   |d d df  tj}| j| }|S r   )r   rl   longr   )rU   inputidxr   r[   r[   r\   r   i  s   
zEmbedAction.forwardr   r[   r[   rY   r\   rP   b  s    rP   )re   numpyr   rl   torch.nnr@   torch.nn.functional
functionalFrotation2xyzr   Moduler   r<   rI   r:   rR   rP   r[   r[   r[   r\   <module>   s    s