o
    GiQ7                     @   sj   d dl Z d dlZd dlZd dlmZ ddlmZ e r!d dlm	Z	 dd Z
G dd	 d	ZG d
d dZdS )    N)Image   )is_torchvision_available)
transformsc                    sH  t | j d| kr"| jtdd | jD tjd} t | j d| ks	t| j |krB|t| j   | jt fdd| jD tjd} t | j dk rbdt | j   | jt fdd| jD tjd} t	| }|j
d d d }|j
d d | }|j
d	 d d }|j
d	 d | }|||j
d | ||j
d	 | f }t|S )
z
    Crop the image so that its height and width does not exceed `max_image_size`, while ensuring both the height and
    width are multiples of 16.
       c                 s   s    | ]}|d  V  qdS )r   N .0xr   r   a/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/omnigen/processor_omnigen.py	<genexpr>"   s    zcrop_image.<locals>.<genexpr>)resamplec                 3       | ]	}t |  V  qd S Nroundr   scaler   r   r   &          c                 3   r   r   r   r   r   r   r   r   *   r   r      )minsizeresizetupler   BOXmaxBICUBICnparrayshape	fromarray)	pil_imagemax_image_sizearrcrop_y1crop_y2crop_x1crop_x2r   r   r   
crop_image   s     $$
(
r)   c                   @   s   e Zd ZddefddZdd Zdd Zd	d
 Zdd Z								dde	e
 de	e	e
  dedede
dededededefddZdS ) OmniGenMultiModalProcessor   r#   c              	      sP   || _  | _tt fddt tjg dg dddg| _t | _	d S )Nc                    
   t |  S r   r)   r"   r#   r   r   <lambda>>      
 z5OmniGenMultiModalProcessor.__init__.<locals>.<lambda>      ?r3   r3   Tmeanstdinplace)
text_tokenizerr#   r   ComposeLambdaToTensor	Normalizeimage_transformOmniGenCollatorcollator)selfr8   r#   r   r/   r   __init__8   s   z#OmniGenMultiModalProcessor.__init__c              	      sB    | _ tt fddt tjg dg dddg| _d S )Nc                    r,   r   r-   r.   r/   r   r   r0   J   r1   zAOmniGenMultiModalProcessor.reset_max_image_size.<locals>.<lambda>r2   Tr4   )r#   r   r9   r:   r;   r<   r=   )r@   r#   r   r/   r   reset_max_image_sizeF   s   
z/OmniGenMultiModalProcessor.reset_max_image_sizec                 C   s$   t |trt|d}| |S )NRGB)
isinstancestrr   openconvertr=   )r@   imager   r   r   process_imageP   s   

z(OmniGenMultiModalProcessor.process_imagec                    s   |} d u st dkr|}|jd d dS d}fddt||D }tdt|D ]}|| d dkrE|| dd  ||< q1t||}dd |D }tt	|}	|	t
tdt|	d ksmJ d|	 t|	t ksJ d	t|	 d
t  d fdd|D  g }
g }tt|D ]8}|
||  |t|d krt|
} | d | d d d }|||| g |
dg|  q|
 |dS )Nr   )	input_idspixel_valuesimage_sizesz<\|image_\d+\|>c                    s   g | ]}  |jqS r   )r8   rJ   )r	   chunkr@   r   r   
<listcomp>\       zIOmniGenMultiModalProcessor.process_multi_modal_prompt.<locals>.<listcomp>r   c                 S   s(   g | ]}t |d d dd qS )|r   _)intsplit)r	   sr   r   r   rO   c   s   ( zSimage_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be z?total images must be the same as the number of image tags, got z image tags and z imagesc                    s   g | ]} |d   qS r   r   r   )input_imagesr   r   rO   n   rP   rS   r   )add_prefix_instructionlenr8   rJ   rerU   rangefindallsortedsetlistextendr   append)r@   textrX   model_inputspatternprompt_chunksi
image_tags	image_idsunique_image_idsall_input_idsimg_inx	start_inxr   r   )rX   r@   r   process_multi_modal_promptU   s>   

$z5OmniGenMultiModalProcessor.process_multi_modal_promptc                 C   s,   d}d}d}d}| | | | | }|S )Nz	<|user|>
z:Generate an image according to the following instructions
z<|assistant|>
<|diffusion|>z<|end|>
r   )r@   promptuser_promptgeneration_promptassistant_promptprompt_suffixr   r   r   rZ   |   s   z1OmniGenMultiModalProcessor.add_prefix_instructionNj  low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers.TFr   instructionsrX   heightwidthnegative_promptuse_img_cfgseparate_cfg_inputuse_input_image_size_as_outputnum_images_per_promptreturnc
                    sL  t |tr|g}|g}g }
tt|D ]}|| }|d u rd n|| }|d ur7t|dkr7 fdd|D }nd }d|vs?J  ||}d\}} |d }|rr|d urpt|dkrpdd tt|D } d||}n|}t|	D ])}|r|
||||d	 d d
|d	 d dgf qv|
|||||gf qvq |
S )Nr   c                    s   g | ]}  |qS r   )rI   r   rN   r   r   rO          z7OmniGenMultiModalProcessor.__call__.<locals>.<listcomp>z<img><|image_1|></img>)NNr   c                 S   s   g | ]
}d |d  dqS )z<img><|image_r   z|></img>r   )r	   rh   r   r   r   rO      s     rK   rY   rS   )	rD   rE   r]   r[   ro   joinrc   r   r?   )r@   rv   rX   rw   rx   ry   rz   r{   r|   r}   
input_datarh   cur_instructioncur_input_images
mllm_inputneg_mllm_inputimg_cfg_mllm_inputimg_cfg_promptrR   r   rN   r   __call__   s>   
"	
z#OmniGenMultiModalProcessor.__call__)r+   )Nr+   r+   ru   TFFr   )__name__
__module____qualname__rT   rA   rB   rI   ro   rZ   ra   rE   booldictr   r   r   r   r   r*   7   sF    
'
	
r*   c                   @   sF   e Zd ZdddZdd Zdd Zd	d
 Zdd Zdd Zdd Z	dS )r>   r      c                 C   s   || _ || _d S r   )pad_token_idhidden_size)r@   r   r   r   r   r   rA      s   
zOmniGenCollator.__init__c           	      C   s`   g }| d}t|}|D ]}t|}dg||  tt|| d  }|| qt|S )NrS   r   r   )r   r   torchsumra   r]   rc   
LongTensor)	r@   attention_masknum_tokens_for_output_imagesposition_idstext_length
img_lengthmasktemp_ltemp_positionr   r   r   create_position   s   


zOmniGenCollator.create_positionc                 C   s  g }g }| d}t|}|| d }d}|D ]}	t|	}
||
 }ttj|
d |
d fd}tj|
d |fd}tj||gdd}tj||
| d fd}tj||gdd}|dkrtj|
d | |fd}tj||gdd}tj||fd}tj||gdd}|| }|| }|dkrd|dd| df< tjd|| jfd}nd}|	|
d |	| |d7 }qtj|dd|fS )z
        OmniGen applies causal attention to each element in the sequence, but applies bidirectional attention within
        each image sequence References: [OmniGen](https://huggingface.co/papers/2409.11340)
        rS   r   r   )r   )dimN)r   r   r   r   triloneszeroscatr   rc   	unsqueeze)r@   r   r   extended_maskpadding_imagesr   r   seq_leninxr   r   pad_l	temp_mask
image_maskpad_masktrue_img_lengthpad_img_lengthtemp_padding_imgsr   r   r   create_mask   s:   



zOmniGenCollator.create_maskc                 C   s<   |  D ]}|| D ]\}}d|| ||||f< q
q|S )Nr   )keys)r@   r   rL   b_inxrn   end_inxr   r   r   !adjust_attention_for_input_images   s
   z1OmniGenCollator.adjust_attention_for_input_imagesc                    s   t dd |D }g }g }tt|D ]U}|| }t|}||   dkr3|dg|  || n|dg  dg|   || jg  |  ||v rhg }	|| D ]}
|	 fdd|
D  qU|	||< qt|t||fS )Nc                 S   s   g | ]}t |qS r   )r[   r   r   r   r   rO          z1OmniGenCollator.pad_input_ids.<locals>.<listcomp>r   r   c                    s   g | ]}|  qS r   r   r   r   r   r   rO     r   )r   r]   r[   rc   r   r   r   )r@   rJ   rL   max_l
padded_idsr   rh   temp_idsr   new_inxold_inxr   r   r   pad_input_ids   s&   zOmniGenCollator.pad_input_idsc                 C   s   g }|D ]}| |d |d  d d  qg i }}d}|D ]+}|d d urF||d  |d D ]}	||vr>|	g||< q2||  |	 q2|d7 }qdd |D }dd |D }
| |
|\}}}| ||}| ||\}}| ||}||||||fS )	Nr   r   r   rK   rL   c                 S   s   g | ]}| d qS r   )r   r   r   r   r   rO   $  r   z6OmniGenCollator.process_mllm_input.<locals>.<listcomp>c                 S      g | ]}|d  qS )rJ   r   r   r   r   r   rO   &  r   )rc   rb   r   r   r   r   )r@   mllm_inputstarget_img_sizer   img_sizerK   rL   r   r
   r   rJ   padded_input_idsr   r   r   r   r   r   process_mllm_input  s(    

z"OmniGenCollator.process_mllm_inputc                 C   s   dd |D }dd |D }dd |D }dd |D }|d d ur/|| | }|| | }n|| }|| }|  ||\}}}}	}
}||||
|d}|S )Nc                 S   r   r   r   r	   fr   r   r   rO   /  r   z,OmniGenCollator.__call__.<locals>.<listcomp>c                 S   r   rW   r   r   r   r   r   rO   0  r   c                 S   r   )r   r   r   r   r   r   rO   1  r   c                 S   r   )r   r   r   r   r   r   rO   2  r   r   )rJ   r   r   input_pixel_valuesinput_image_sizes)r   )r@   featuresr   cfg_mllm_inputsr   r   all_padded_input_idsall_position_idsall_attention_maskall_padding_imagesall_pixel_valuesall_image_sizesdatar   r   r   r   .  s0   
	zOmniGenCollator.__call__N)r   r   )
r   r   r   rA   r   r   r   r   r   r   r   r   r   r   r>      s    
+r>   )r\   numpyr   r   PILr   utilsr   torchvisionr   r)   r*   r>   r   r   r   r   <module>   s    