o
    ei                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# e!$e%Z&dZ'zddl(m)Z) dZ'e&*d W n e+y   Y n e,y   e&-d Y nw G dd dej.Z/e'se)Z/G dd dej.Z0G dd dej.Z1G dd dej.Z2G dd  d ej.Z3G d!d" d"ej.Z4G d#d$ d$ej.Z5G d%d& d&eZ6eG d'd( d(eZ7G d)d* d*e7Z8G d+d, d,ej.Z9ed-d.G d/d0 d0e7eZ:d0d(gZ;dS )1zPyTorch Pop2Piano model.    N)nn)CrossEntropyLoss)GenerationConfig   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutput)PreTrainedModel)auto_docstringis_torchdynamo_compilinglogging   )Pop2PianoConfigT)FusedRMSNormFzVDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pop2PianoLayerNormzIDiscovered apex but it failed to load, falling back to Pop2PianoLayerNormc                       s&   e Zd Zd fdd	Zdd Z  ZS )Pop2PianoLayerNormư>c                    s&   t    tt|| _|| _dS )zj
        Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/pop2piano/modeling_pop2piano.pyr   8   s   

zPop2PianoLayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)keepdim)tor   float32powmeanrsqrtr    r   dtypefloat16bfloat16)r!   hidden_statesvariancer&   r&   r'   forward@   s
   
zPop2PianoLayerNorm.forward)r   )__name__
__module____qualname__r   r5   __classcell__r&   r&   r$   r'   r   7   s    r   c                       *   e Zd Zdef fddZdd Z  ZS )Pop2PianoDenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr!   r<   r$   r&   r'   r   V   s
   
zPop2PianoDenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)rC   rI   rG   
isinstancerD   r   r   Tensorr0   int8r+   )r!   r3   r&   r&   r'   r5   ]   s   



zPop2PianoDenseActDense.forwardr6   r7   r8   r   r   r5   r9   r&   r&   r$   r'   r;   U   s    r;   c                       r:   )Pop2PianoDenseGatedActDenser<   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S r=   )r   r   r   r@   rA   rB   wi_0wi_1rD   rE   rF   rG   r   rH   rI   rJ   r$   r&   r'   r   m   s   
z$Pop2PianoDenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rK   )rI   rQ   rR   rG   rL   rD   r   r   rM   r0   rN   r+   )r!   r3   hidden_geluhidden_linearr&   r&   r'   r5   u   s   


z#Pop2PianoDenseGatedActDense.forwardrO   r&   r&   r$   r'   rP   l   s    rP   c                       r:   )Pop2PianoLayerFFr<   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr#   )r   r   is_gated_actrP   DenseReluDenser;   r   rA   layer_norm_epsilon
layer_normr   rE   rF   rG   rJ   r$   r&   r'   r      s   

zPop2PianoLayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rK   )rZ   rX   rG   )r!   r3   forwarded_statesr&   r&   r'   r5      s   

zPop2PianoLayerFF.forwardrO   r&   r&   r$   r'   rU      s    
rU   c                       sb   e Zd Z		ddededB f fddZedd
dZdddZ								dddZ	  Z
S )Pop2PianoAttentionFNr<   	layer_idxc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr>   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerA   d_kvkey_value_proj_dim	num_headsn_headsrF   rG   	inner_dimr]   loggerwarning_oncer%   r6   r   r@   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr!   r<   r_   r]   r$   r&   r'   r      s,   

zPop2PianoAttention.__init__T       c                 C   s   d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r(   r   )r+   r   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larger&   r&   r'   _relative_position_bucket   s*   z,Pop2PianoAttention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )z%Compute binned relative position biasN)r0   device)r}   r~   r   )r(   r   r   r   )rn   r   r   r   arangers   r+   r   r^   r`   ra   permute	unsqueeze)
r!   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr|   relative_position_bucketvaluesr&   r&   r'   compute_bias   s    
 
zPop2PianoAttention.compute_biasc
                 C   s  |j dd \}
}|du}| |}||
d| j| jdd}d}t|tr8|j	| j
}|r4|j}n|j}n|}|r>|n|}|rW|durW|rW|j| j
 j}|j| j
 j}nJ| |}| |}||
d| j| jdd}||
d| j| jdd}|dur|s|	nd}	|||| j
d|	i\}}|rt|trd|j| j
< t||dd}|du r|j d	 }|dur|n|	d d }| jstjd| j||f|j|jd
}| jr| jrd|_n| j|||j|	d}|dddd| dddf }|dur|ddddddd|j d	 f }|| }|}||7 }tjj|  dd!|}tjj"|| j"| jd}t||}|dd# }||
d| j$}| %|}||f}|rY||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr(   r)   r   Fr   Tr   )r   r0   )r   r   dim)ptraining)&shaperi   viewre   rc   	transposerL   r
   
is_updatedgetr]   cross_attention_cacheself_attention_cachelayerskeysr   rj   rk   updater   matmulr_   zerosr   r0   ro   r   requires_gradr   r   
functionalsoftmaxrx   type_asrG   
contiguousrf   rl   )r!   r3   maskkey_value_statesposition_biaspast_key_valuesr   	use_cacheoutput_attentionsr   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputsr&   r&   r'   r5     sp   






"
&

zPop2PianoAttention.forwardFN)Trq   rr   )NN)NNNNNFFN)r6   r7   r8   r   intr   staticmethodr   r   r5   r9   r&   r&   r$   r'   r\      s(    "
/r\   c                       s>   e Zd ZddedB f fddZ						d	ddZ  ZS )
Pop2PianoLayerSelfAttentionFNr]   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr_   r]   rV   )r   r   r\   SelfAttentionr   rA   rY   rZ   r   rE   rF   rG   rp   r$   r&   r'   r   k  s   
z$Pop2PianoLayerSelfAttention.__init__c              	   C   sJ   |  |}| j|||||||d}	|| |	d  }|f|	dd   }
|
S )N)r   r   r   r   r   r   r   r   )rZ   r   rG   )r!   r3   attention_maskr   r   r   r   r   normed_hidden_statesattention_outputr   r&   r&   r'   r5   s  s   

	z#Pop2PianoLayerSelfAttention.forwardr   )NNNFFNr6   r7   r8   r   r   r5   r9   r&   r&   r$   r'   r   j  s    r   c                       s@   e Zd ZddedB f fddZ							d	ddZ  ZS )
Pop2PianoLayerCrossAttentionNr]   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFr   rV   )r   r   r\   EncDecAttentionr   rA   rY   rZ   r   rE   rF   rG   )r!   r<   r]   r$   r&   r'   r     s   
z%Pop2PianoLayerCrossAttention.__init__Fc
                 C   sN   |  |}
| j|
||||||||	d	}|| |d  }|f|dd   }|S )N)r   r   r   r   r   r   r   r   r   r   )rZ   r   rG   )r!   r3   r   r   r   r   r   r   r   r   r   r   layer_outputr   r&   r&   r'   r5     s   
z$Pop2PianoLayerCrossAttention.forwardrK   )NNNFNFNr   r&   r&   r$   r'   r     s    
r   c                       sF   e Zd Zd	dedB f fddZ										d
ddZ  ZS )Pop2PianoBlockFNr]   c                    s`   t    |j| _t | _| jt|||d | jr&| jt||d | jt	| d S )Nr   )r]   )
r   r   r^   r   
ModuleListlayerappendr   r   rU   rp   r$   r&   r'   r     s   

zPop2PianoBlock.__init__Tc              
   C   s  | j d ||||||	|d}|d }|dd  }|jtjkr?tt| t|jjd t|jj}tj	|| |d}| j
oE|d u}|r| j d ||||||d d ||	d}|d }|jtjkrtt| t|jjd t|jj}tj	|| |d}||dd   }| j d |}|jtjkrtt| t|jjd t|jj}tj	|| |d}|f}|| S )Nr   )r   r   r   r   r   r   r   i  )ru   maxr)   )r   r   r   r   r   r   r   )r   r0   r   r1   r{   isinfanyfinfor   clampr^   )r!   r3   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   r   r   return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   r&   r&   r'   r5     sd   	

zPop2PianoBlock.forwardr   )
NNNNNNFFTNr   r&   r&   r$   r'   r     s    r   c                   @   sJ   e Zd ZU eed< dZdZdZdZdgZ	dgZ
e dd	 Zd
d ZdS )Pop2PianoPreTrainedModelr<   transformer)audioTFr   rD   c                 C   s  | j j}t|trt|j|d  dS t|tr'tj|j	jd|d d dS t|t
rMtj|jjd|d d t|drKtj|jjd|d d dS dS t|trtj|jjd|| j jd  d t|jdru|jjdurut|jj tj|jjd|| j jd  d t|jdr|jjdurt|jj dS dS dS t|trtj|jjd|| j jd  d t|jdr|jjdurt|jj tj|jjd|| j jd  d t|jdr|jjdurt|jj tj|jjd|| j jd  d t|jdr|jjdurt|jj dS dS dS t|trw| j j}| j j}| j j}tj|jjd||| d  d tj|jjd||d  d tj|jjd||d  d tj|jjd||| d  d |jrytj|j jd||d  d dS dS dS )zInitialize the weights      ?        )r.   stdlm_head      r?   N)!r<   initializer_factorrL   r   init	constant_r   Pop2PianoConcatEmbeddingToMelnormal_	embedding!Pop2PianoForConditionalGenerationsharedhasattrr   r;   rC   rA   r?   zeros_rD   rB   rP   rQ   rR   r\   rb   rd   ri   rj   rk   rl   r_   rn   )r!   modulefactorrA   rc   re   r&   r&   r'   _init_weights  sR   




        z&Pop2PianoPreTrainedModel._init_weightsc                 C   sx   | j j}| j j}|d u rtd||j}|dd df  |ddd f< ||d< |d u r2td||dk| |S )Nzoself.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id..r)   r   ).r   z1self.model.config.pad_token_id has to be defined.)r<   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r!   	input_idsr   r   shifted_input_idsr&   r&   r'   _shift_rightD  s    z%Pop2PianoPreTrainedModel._shift_rightN)r6   r7   r8   r   __annotations__base_model_prefixoutput_modalitiessupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulesr   no_gradr   r   r&   r&   r&   r'   r     s   
 
'r   c                       sD   e Zd Z fddZdd Z											dddZ  ZS )	Pop2PianoStackc                    sx   t    t j j| _ j| _t fddt	 j
D | _t j jd| _t j| _|   d| _d S )Nc                    s"   g | ]}t  t|d k|dqS )r   r   )r   bool).0ir<   r&   r'   
<listcomp>b  s    z+Pop2PianoStack.__init__.<locals>.<listcomp>rV   F)r   r   r   rm   
vocab_sizerA   embed_tokensr^   r   range
num_layersblockr   rY   final_layer_normrE   rF   rG   	post_initro   rJ   r$   r  r'   r   [  s   

zPop2PianoStack.__init__c                 C   s
   || _ d S rK   )r	  r!   new_embeddingsr&   r&   r'   set_input_embeddingso     
z#Pop2PianoStack.set_input_embeddingsNc           "      K   s  |d ur|n| j j}|d ur|n| j j}|	d ur|	n| j j}	|
d ur$|
n| j j}
|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|rtd
 d}|d u r| jd u rtd| |}|\}}|du r| jstd|  d| jr|r|d u r| j jrtt| j dt| j d}nt| j d}n| jsd }|d ur| nd}|d u rtj||| |jd}|d u rt s|| }tj|||jd}| j jr
t| j ||||d}n|d d d d d d f }|j|jd}d| t|jj }| jrO|d urO| \}}}||f}|d u rItj||jd}| |}nd }|	rVdnd }|r]dnd }|rh| jrhdnd }d }d }| |}t| jD ]J\}} |	r||f }| ||||||||||d
}!|!d }|!d }| jr|d ur|!|rdnd }|r||!d f }| jr||!d f }qx|  |}| |}|	r||f }|
st!dd |||||fD S t"|||||dS )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer)   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderr  r   r   )r<   r  r   r   r   )r0   r   r&   )r   r   r   r   r   r   r(      c                 s   s    | ]	}|d ur|V  qd S rK   r&   )r  rk   r&   r&   r'   	<genexpr>  s    z)Pop2PianoStack.forward.<locals>.<genexpr>)last_hidden_stater   r3   
attentionscross_attentions)#r<   r   r   output_hidden_statesuse_return_dictr^   r   sizer   ro   r   rg   rh   r	  is_encoder_decoderr
   r	   get_seq_lengthr   r   r   r   r   r   r+   r0   r   ru   invert_attention_maskrG   	enumerater  r  tupler   )"r!   r   r   r   r   r  r   r   r   r  r   r   kwargserr_msg_prefixinput_shaper   r   past_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r   r3   r  layer_modulelayer_outputsr&   r&   r'   r5   r  s   








zPop2PianoStack.forward)NNNNNNNNNNN)r6   r7   r8   r   r  r5   r9   r&   r&   r$   r'   r  Y  s    r  c                       s(   e Zd ZdZ fddZdd Z  ZS )r   z'Embedding Matrix for `composer` tokens.c                    s"   t    tj|j|jd| _d S )N)num_embeddingsembedding_dim)r   r   r   rm   composer_vocab_sizerA   r   rJ   r$   r&   r'   r     s   
z&Pop2PianoConcatEmbeddingToMel.__init__c                 C   s.   || }|  |d}tj||gdd}|S )Nr   r   )r   r   r   cat)r!   featureindex_valueembedding_offsetindex_shiftedcomposer_embeddingr  r&   r&   r'   r5   "  s   z%Pop2PianoConcatEmbeddingToMel.forward)r6   r7   r8   __doc__r   r5   r9   r&   r&   r$   r'   r     s    r   zA
    Pop2Piano Model with a `language modeling` head on top.
    )custom_introc                #       s^  e Zd ZdddZdef fddZdd Zdd	 Z	
d&dej	de
dedej	d
B fddZe	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
d'dejd
B dej	d
B dejd
B dejd
B deeej  d
B ded
B dej	d
B dej	d
B dej	d
B dejd
B ded
B ded
B ded
B ded
B dejd
B deej	 eB f dd Ze 	
	!	
d( fd"d#	Zdejfd$d%Z  ZS ))r   zshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr<   c                    s   t  | || _|j| _t|j|j| _t	|| _
t|}d|_d|_t|| _t|}d|_|j|_t|| _tj|j|jdd| _|   d S )NFTr>   )r   r   r<   rA   	model_dimr   rm   r  r   r   mel_conditionercopydeepcopyr^   r   r  encodernum_decoder_layersr  decoderr@   r   r  )r!   r<   encoder_configdecoder_configr$   r&   r'   r   4  s   




z*Pop2PianoForConditionalGeneration.__init__c                 C   s   | j S rK   )r   )r!   r&   r&   r'   get_input_embeddingsM  s   z6Pop2PianoForConditionalGeneration.get_input_embeddingsc                 C   s"   || _ | j| | j| d S rK   )r   rB  r  rD  r  r&   r&   r'   r  P  s   z6Pop2PianoForConditionalGeneration.set_input_embeddingsNinput_featurescomposergeneration_configr   c                 C   s   |j }||vrtdt|  d| || }tj|| jd}||jd }t	|
 }| j|||d}|dur_d||dddf   < tj|dddf dd	|gd	d
}||fS |dfS )a  
        This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
        control the type of MIDI token generated by the model.

        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                input features extracted from the feature extractor.
            composer (`str`):
                composer token which determines the type of MIDI tokens to be generated.
            generation_config (`~generation.GenerationConfig`):
                The generation is used to get the composer-feature_token pair.
            attention_mask (``, *optional*):
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
        zPlease choose a composer from z. Composer received - r  r   )r7  r8  r9  Nr   r)   r   )axis)composer_to_feature_tokenr   listr   r   tensorr   repeatr   ru   r   r?  r  concatenater   )r!   rH  rI  rJ  r   rL  composer_valuer9  r&   r&   r'   get_mel_conditioner_outputsU  s&   &z=Pop2PianoForConditionalGeneration.get_mel_conditioner_outputsr   decoder_input_idsdecoder_attention_maskencoder_outputsr   r  decoder_inputs_embedslabelsr   r   r  r   r   returnc                 K   s  |dur|n| j j}|dur|n| j j}|dur |dur td|dur*|du r*|}|du r:| j||||||d}n$|r^t|ts^t|d t|dkrO|d ndt|dkrZ|d ndd}|d }|
durs|du rs|	du rs| |
}| j	|||	||||||||d}|d }| j j
r|| jd	  }| |}d}|
durtd
d}||d|d|
d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )aq  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pop2Piano is a model with relative position embeddings
            so you should be able to pad the inputs on both the right and the left. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for detail.
            [What are input IDs?](../glossary#input-ids) To know more on how to prepare `input_ids` for pretraining
            take a look a [Pop2Piano Training](./Pop2Piano#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are decoder input IDs?](../glossary#decoder-input-ids) Pop2Piano uses the `pad_token_id` as the
            starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`
        NzSBoth `inputs_embeds` and `input_features` received! Please provide only one of them)r   r   r  r   r  r   r   r   r(   )r  r3   r  )r   r   r  r   r   r   r   r   r  r   r   r   r   )ignore_indexr)   )	losslogitsr   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentions)r<   r   r  r   rB  rL   r   lenr   rD  tie_word_embeddingsr>  r   r   r   r  r   r   r3   r  r  r  )r!   r   r   rS  rT  rU  r   r  rH  rV  rW  r   r   r  r   r   r$  r3   decoder_outputssequence_output	lm_logitsrZ  loss_fctoutputr&   r&   r'   r5     sv   )


z)Pop2PianoForConditionalGeneration.forward	composer1c                    s   |du r| j }|jd	i | t|dstdt|j| jjkr1td| jj dt|j d| j||||d\}}t	 j
d	d|||d|S )
a  
        Generates token ids for midi outputs.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation
        strategies and code examples, check out the [following guide](./generation_strategies).

        </Tip>

        Parameters:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`.
            attention_mask:
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
            composer (`str`, *optional*, defaults to `"composer1"`):
                This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
                `"composer"`. Please make sure that the composer value is present in `composer_to_feature_token` in
                `generation_config`. For an example please see
                https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            kwargs:
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
                Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:
                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
        NrL  z`composer_to_feature_token` was not found! Please refer to https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.jsonand parse a dict like that.ztconfig.composer_vocab_size must be same as the number of keys in generation_config.composer_to_feature_token! Found z vs .)rH  r   rI  rJ  )inputsr  r   rJ  r&   )rJ  r   r   r   r`  rL  r<   r5  rR  r   generate)r!   rH  r   rI  rJ  r$  r$   r&   r'   rj    s:   6

z*Pop2PianoForConditionalGeneration.generatec                 C   s
   |  |S rK   )r   )r!   rW  r&   r&   r'   %prepare_decoder_input_ids_from_labelsT  r  zGPop2PianoForConditionalGeneration.prepare_decoder_input_ids_from_labelsrK   )NNNNNNNNNNNNNNN)Nrg  N)r6   r7   r8   _tied_weights_keysr   r   rG  r  r   FloatTensorstrr   rR  r   
LongTensor
BoolTensorr#  rM   r   r  r   r5   r  rj  rk  r9   r&   r&   r$   r'   r   )  s    

1	
sYr   )<r<  r@  ry   r   r   torch.nnr   transformers.generationr   r  r   r   activationsr   cache_utilsr   r	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   utilsr   r   r   configuration_pop2pianor   
get_loggerr6   rg   _load_pop2piano_layer_normapex.normalizationr   infoImportError	ExceptionwarningModuler   r;   rP   rU   r\   r   r   r   r   r  r   r   __all__r&   r&   r&   r'   <module>   sf   
 N#%_G C  ,