o
    ߥio8                     @   sT  d dl mZ d dlmZmZ d dlZd dlZd dl	Z	d dl
mZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) e) Z*dgZ+				dddZ,G dd de-Z.G dd dej/Z0ej1e%j2ej3dG dd deZ4dS )    N)AnyDict)Models)Model)MODELS)GaussianDiffusionbeta_schedule)
BertConfig	BertModel)FullTokenizer)DiffusionGenerator)SuperResUNet256)SuperResUNet1024)	ModelFileTasks)create_device)
get_logger DiffusionForTextToImageSynthesis  fixed_smallc                 C   s   t | |||}t||d}|S )N)var_type)r   r   )schedulenum_timesteps	init_beta	last_betar   betas	diffusion r   a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/diffusion/model.pymake_diffusion"   s   r   c                   @   s   e Zd ZdddZdd ZdS )	Tokenizer@   c                 C   s   || _ || _t|dd| _d S )NT)
vocab_filedo_lower_case)r"   seq_lenr   	tokenizer)selfr"   r$   r   r   r   __init__.   s
   zTokenizer.__init__c                 C   s   | j |}dg|d | jd   dg }| j |}dgt| }dgt| }|dg| jt|  7 }|dg| jt|  7 }|dg| jt|  7 }t|t|  krdt|  krd| jksgJ  J t|}t|}t|}|||fS )Nz[CLS]   z[SEP]   r   )r%   tokenizer$   convert_tokens_to_idslentorch
LongTensor)r&   texttokens	input_ids
input_masksegment_idsr   r   r   __call__4   s"   




zTokenizer.__call__N)r!   )__name__
__module____qualname__r'   r4   r   r   r   r   r    ,   s    
r    c                       s$   e Zd Z fddZdd Z  ZS )DiffusionModelc                    s   t t|   ttd|dd}|d }tt	|| _
|d }tdi || _|d }tdi || _|d }tdi || _d S )	Nz{}/model_config.jsonutf-8encodingtext_configgenerator_configupsampler_256_configupsampler_1024_configr   )superr8   r'   jsonloadopenformatr
   r	   	from_dicttext_encoderr   unet_generatorr   unet_upsampler_256r   unet_upsampler_1024)r&   	model_dirmodel_configr<   r=   r>   r?   	__class__r   r   r'   L   s   zDiffusionModel.__init__c           	   	   C   s\   | j |||d\}}|d }| |||||}| |||t||||}| |t|}|S )Nr1   token_type_idsattention_mask)rF   rG   rH   r-   
zeros_likerI   t)	r&   noise	timestepsr1   rO   rP   contextyxr   r   r   forwardb   s   

zDiffusionModel.forward)r5   r6   r7   r'   rY   __classcell__r   r   rL   r   r8   J   s    r8   )module_namec                       s   e Zd Zd fdd	Zdeeef deeef fddZdeeef deeef fd	d
Ze	
 deeef deeef fddZ  ZS )r   gpuc                    s  t j rdnd}t jd||d| t|d}t t|t	j
d}|| |   t|| _|| j |j| _|j| _|j| _|j| _| dt	j }t|dd| _ttd|d	d
}tdi |d | _tdi |d | _tdi |d | _d S )Nr\   cpu)rJ   device)rJ   /r!   )r"   r$   z{}/diffusion_config.jsonr9   r:   r=   r>   r?   r   )r-   cudais_availabler@   r'   r8   rB   ospjoinr   TORCH_MODEL_BIN_FILEload_state_dictevaltor   r^   rF   rG   rH   rI   
VOCAB_FILEr    r%   rA   rC   rD   r   diffusion_generatordiffusion_upsampler_256diffusion_upsampler_1024)r&   rJ   r^   kwargsdiffusion_modelpretrained_params
vocab_pathdiffusion_paramsrL   r   r   r'   u   s>   


z)DiffusionForTextToImageSynthesis.__init__inputreturnc           	   	      s
  t  fdddD std   |  d \}}}|| jd}|| jd}|| jd}| j|||d\}}|d }| t	t
|||}| t	t
|tt
|||}| |t|}|dd	d	d
}|dd	dd  tj}|S )Nc                    s   g | ]}| v qS r   r   ).0keyrq   r   r   
<listcomp>   s    z<DiffusionForTextToImageSynthesis.forward.<locals>.<listcomp>)r/   rT   rU   z@input should contains "text", "noise", and "timesteps", but got r/   r   rN   rQ   r)        _@r(   )all
ValueErrorkeysr%   rg   r^   	unsqueezerF   rG   rT   rU   rH   r-   rR   rI   rS   clampaddmulsqueezepermuter]   numpyastypenpuint8)	r&   rq   r1   rO   rP   rV   rW   rX   imgr   ru   r   rY      s2   


$z(DiffusionForTextToImageSynthesis.forwardinputsc                 C   s   |S )Nr   )r&   r   r   r   r   postprocess   s   z,DiffusionForTextToImageSynthesis.postprocessc           	      C   s  d|vrt d|  | |d \}}}|| jd}|| jd}|| jd}| j|||d\}}|d }|dd}|dkr+| jj	t
dd	d
d
| j| j|||dt
|t
||dg|dd|dd|ddd	dddd
}|ddstj|dddd}| jj	t
|| j|t
d| j|||d|t
d| jt
|t
|t
|dg|dd|dd|ddd	dddd
}|ddstj|dddd}| jj	t
|| j|t
d| j|||d|t
d| jt
|t
|t
|dg|dd|dd|ddd	ddd d
}n|dkr| jjt
dd	d
d
| j| j|||dt
|t
||dg|dd|dd|d d!|d"d#d$}|ddsxtj|dddd}| jjt
|| j|t
d| j|||d|t
d| jt
|t
|t
|dg|dd|dd|d%d&|d'd#d$}|ddstj|dddd}| jjt
|| jd(|i|d)d|d*d|d+d#d,}nt d-|dddd.ddd/d  tj }|S )0Nr/   z%input should contain "text", but got r   rN   rQ   solverz
dpm-solverr)      r!   )rW   rV   maskgenerator_percentilegףp=
?generator_guide_scaleg      @dpm_solver_timesteps   logSNR
singlestepgO@a?)
rT   modelmodel_kwargs
percentileguide_scaler   order	skip_typemethodt_startdebugFg      @bilinear)scale_factormodealign_corners)lxltrW   rV   r   upsampler_256_percentileupsampler_256_guide_scale
   ddimgenerator_ddim_timesteps   generator_ddim_etag        )rT   r   r   r   r   ddim_timestepsetaupsampler_256_ddim_timesteps2   upsampler_256_ddim_etaconcatupsampler_1024_percentileupsampler_1024_ddim_timestepsupsampler_1024_ddim_eta)rT   r   r   r   r   r   z6currently only supports "ddim" and "dpm-solve" solversrw   r(   )!ry   rz   r%   rg   r^   r{   rF   getri   dpm_solver_sample_loopr-   randnrG   rR   Finterpolaterj   
randn_likerH   zerosrk   ddim_sample_looprI   r|   r}   r~   r   r   r]   r   r   r   r   )	r&   rq   r1   rO   rP   rV   rW   r   r   r   r   r   generate   sT  



	









	









z)DiffusionForTextToImageSynthesis.generate)r\   )r5   r6   r7   r'   r   strr   rY   r   r-   no_gradr   rZ   r   r   rL   r   r   q   s    """,)r   NNr   )5os.pathpathrb   typingr   r   rA   r   r   r-   torch.nnnntorch.nn.functional
functionalr   modelscope.metainfor   modelscope.modelsr   modelscope.models.builderr   1modelscope.models.multi_modal.diffusion.diffusionr   r   2modelscope.models.multi_modal.diffusion.structbertr	   r
   1modelscope.models.multi_modal.diffusion.tokenizerr   6modelscope.models.multi_modal.diffusion.unet_generatorr   :modelscope.models.multi_modal.diffusion.unet_upsampler_256r   ;modelscope.models.multi_modal.diffusion.unet_upsampler_1024r   modelscope.utils.constantr   r   modelscope.utils.devicer   modelscope.utils.loggerr   logger__all__r   objectr    Moduler8   register_moduletext_to_image_synthesisr   r   r   r   r   r   <module>   s@   

'