o
    ÔÙ¾iŒ ã                   @   sà  d dl Z d dlmZmZ d dlZd dlm  mZ d dlm	Z	mZ G dd„ dej
ƒZd;dd„Zd<d	d
„ZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZG dd„ dej
ƒZdd„ ZG dd„ dej
ƒZG dd„ dej
ƒZG dd „ d ej
ƒZG d!d"„ d"ejƒZG d#d$„ d$ejƒZG d%d&„ d&ejj
ƒZd=d(d)„ZG d*d+„ d+ej
ƒZG d,d-„ d-eeƒZ d.ee	 fd/d0„Z!G d1d2„ d2ej
ƒZ"G d3d4„ d4ejj#ƒZ$d5e%d6e&fd7d8„Z'd9d:„ Z(dS )>é    N)ÚOptionalÚUnion)ÚTensorÚnnc                       s    e Zd ZdZ‡ fdd„Z‡  ZS )Ú	BlockBasezBlock abstract modulec                    s   t ƒ  ¡  || _|| _d S ©N)ÚsuperÚ__init__Ú
input_sizeÚoutput_size)Úselfr
   r   ©Ú	__class__© úR/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/phi4mm_utils.pyr	      s   

zBlockBase.__init__)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r	   Ú__classcell__r   r   r   r   r      s    r   Úreluc                 C   sT   |   ¡ } | dkrtjddS | dkrt ¡ S | dkrtƒ S | dkr&tj ¡ S t ¡ S )zÂSelect an activation function by name

    Args:
        name: str
            activation function name,
            one of ["relu", "gelu", "swish", "sigmoid"],
            default "relu".
    r   T©ÚinplaceÚgeluÚswishÚsigmoid)Úlowerr   ÚReLUÚGELUÚSwishÚtorchÚSigmoidÚIdentity)Únamer   r   r   Úget_activation    s   	
r$   c                 C   sâ   t  |¡ ¡ }t jj |d¡}t jjj|d| d}t  d| ¡ d¡}||k ||k@  ¡ dd…df }t  d| ¡ d¡ 	| d¡}|| }	d|	|	dk < ||	 }
||
 d¡k}|| }t
|ƒ||t
|ƒk< || }|| d¡k }||@ S )aF  
    The function is very important for Transformer Transducer Streaming mode
    Args:
        xs_len (int): sequence length
        chunk_start_idx (list): first idx of each chunk, such as [0,18,36,48].
        It also supports adaptive chunk size [0,10,15,45]
        left_window (int): how many left chunks can be seen
        right_window (int): how many right chunks can be seen. It is used for
        chunk overlap model.
        Returns:
            mask (torch.Tensor): a mask tensor for streaming model
            Torch 1.0.1
            tensor([[1., 1., 0., 0.],
                    [0., 1., 1., 0.],
                    [0., 0., 1., 1.]])
            Torch 1.4.1
            tensor([[True., True., False., False.],
                    [False., True., True., False.],
                    [False., False., True., True.]])
    )é   r   )r   r%   )Úvaluer   éÿÿÿÿNr%   )r    r   Úlongr   Ú
functionalÚpadÚarangeÚ	unsqueezeÚnonzeroÚexpandÚlen)Úx_lenÚchunk_start_idxÚleft_windowÚright_windowÚ	start_padÚend_padÚ	seq_rangeÚidxÚseq_range_expandÚidx_leftÚboundary_leftÚ	mask_leftÚ	idx_rightÚboundary_rightÚ
mask_rightr   r   r   Úadaptive_enc_mask5   s4   ÿþÿÿ
ÿÿr?   c                       s4   e Zd ZdZd	‡ fdd„Zdedefdd„Z‡  ZS )
r   zVImplement Swish activation module.
    From https://arxiv.org/pdf/2005.03191.pdf

    ÚreturnNc                    s   t ƒ  ¡  t ¡ | _d S r   )r   r	   r   r!   Úact_fn©r   r   r   r   r	   l   s   
zSwish.__init__Úxc                 C   s   ||   |¡ S )z_Apply Swish function

        Args:
            x: torch.Tensor
                Input.
        )rA   ©r   rC   r   r   r   Úforwardp   s   zSwish.forward)r@   N©r   r   r   r   r	   r   rE   r   r   r   r   r   r   f   s    r   c                       sB   e Zd ZdZddededdf‡ fdd	„Zd
edefdd„Z‡  Z	S )ÚGLUz(Implement Gated Linear Unit (GLU) moduler'   r   ÚdimÚact_namer@   Nc                    sŠ   t ƒ  ¡  || _| ¡ | _| jdkrtjdd| _d S | jdkr't ¡ | _d S | jdkr2t	ƒ | _d S | jdkr>t 
¡ | _d S t ¡ | _d S )Nr   Tr   r   r   r   )r   r	   rH   r   rI   r   r   rA   r   r   r!   r"   )r   rH   rI   r   r   r   r	   }   s   





zGLU.__init__rC   c                 C   s"   |j d| jd\}}||  |¡ S )zÁGLU forward
        Apply Swish function on the first half of input matrices
        with sigmoid of the second half.

        Args:
            x: torch.Tensor
                Input.

        é   ©rH   )ÚchunkrH   rA   )r   rC   Úhalf_xÚgater   r   r   rE      s   
zGLU.forward)r'   r   )
r   r   r   r   ÚintÚstrr	   r   rE   r   r   r   r   r   rG   z   s    rG   c                       s0   e Zd ZdZ			d	‡ fdd„	Zdd„ Z‡  ZS )
ÚGLUPointWiseConvaÅ  GLUPointWiseConv module
    used for conformer architecture,
    for more details see:
    https://arxiv.org/pdf/2005.08100v1.pdf

    Args:
        input_dim: int
            input channel size.
        output_dim: int
            output channel size.
        kernel_size: int
            kernel size
        glu_type: str, optional
            activation function one of
             ["sigmoid", "relu", "gelu"]
              default "sigmoid".
        bias_in_glu: bool, optional
            use addtive bias in glu
        causal: bool, optional
            if set to True, padding is set to the half of
             kernel size, ie, convolution can't see future frames.
              default False.

    r   TFc                    sø   t ƒ  ¡  || _|| _|| _|r tj||d |d|d d| _ntj||d |d|d d d| _|dkr;t ¡ | _	n%|dkrEt 
¡ | _	n|dkrOt ¡ | _	n|dkrXtƒ | _	ntd| j	› ƒ‚|rzt t d|d¡¡| _t t d|d¡¡| _d S d S )	NrJ   r%   ©Úpaddingr   r   r   r   zUnsupported activation type )r   r	   Úglu_typeÚ
output_dimÚbias_in_glur   ÚConv1dÚext_pw_conv_1dr!   Úglu_actr   r   r   Ú
ValueErrorÚ	Parameterr    ÚzerosÚb1Úb2)r   Ú	input_dimrU   Úkernel_sizerT   rV   Úcausalr   r   r   r	   ¶   s@   
	
û
û
þzGLUPointWiseConv.__init__c                 C   sb  |  g d¢¡}|  |¡}| jdkrZ| jr:|dd…d| j…dd…f | j |dd…| j| jd …dd…f | j  }nn|dd…d| j…dd…f |dd…| j| jd …dd…f  }nN| jr†|dd…d| j…dd…f | j |  |dd…| j| jd …dd…f | j ¡ }n"|dd…d| j…dd…f |  |dd…| j| jd …dd…f ¡ }|  g d¢¡}|S )zP
        Args:
            x: torch.Tensor
                input tensor
        ©r   rJ   r%   ÚbilinearNr   rJ   )ÚpermuterX   rT   rV   rU   r]   r^   rY   rD   r   r   r   rE   ä   s&   	

 &ÿ ÿ$&ÿ ÿzGLUPointWiseConv.forward)r   TF©r   r   r   r   r	   rE   r   r   r   r   r   rQ   œ   s    ù.rQ   c                       s,   e Zd ZdZ	d‡ fdd„	Zdd„ Z‡  ZS )ÚDepthWiseSeperableConv1da*  DepthWiseSeperableConv1d module used in Convnet module
    for the conformer, for more details see:
    https://arxiv.org/pdf/2005.08100v1.pdf

    Args:
        input_dim: int
            input channel size.
        depthwise_seperable_out_channel: int
            if set different to 0, the number of
             depthwise_seperable_out_channel will be used as a channel_out
             of the second conv1d layer.
             otherwise, it equal to 0, the second conv1d layer is skipped.
        kernel_size: int
            kernel_size
        depthwise_multiplier: int
            number of input_dim channels duplication. this value
            will be used to compute the hidden channels of the Conv1D.
        padding: int, optional
            padding for the conv1d,
             default: 0.

    r   c                    s\   t ƒ  ¡  tj||| |d||d| _|dkr$t || |ddd¡| _nt ¡ | _|| _d S )Nr%   )rS   Úgroupsr   )r   r	   r   rW   Údw_convÚpw_convr"   Údepthwise_seperable_out_channel)r   r_   rj   r`   Údepthwise_multiplierrS   r   r   r   r	     s&   
ú	û

z!DepthWiseSeperableConv1d.__init__c                 C   s"   |   |¡}| jdkr|  |¡}|S )zQ

        Args:
            x: torch.Tensor
                input tensor
        r   )rh   rj   ri   rD   r   r   r   rE   =  s   


z DepthWiseSeperableConv1d.forward)r   re   r   r   r   r   rf     s
    úrf   c                       sD   e Zd ZdZ									d‡ fdd	„	Zd
d„ Zdd„ Z‡  ZS )Ú
ConvModuleay	  ConvModule Module for the conformer block.
    for more details see:
    https://arxiv.org/pdf/2005.08100v1.pdf

    Args:
        input_dim: int
            input channel size.
        ext_pw_out_channel: int
            if > 0, ext_pw_out_channel is a dim channel size
             for the last pointwise conv after swish activation.
        depthwise_seperable_out_channel: int
            if set different to 0, the number of
             depthwise_seperable_out_channel
             will be used as a channel_out of the second conv1d layer.
             otherwise, it equal to 0, the second conv1d layer is skipped.
        ext_pw_kernel_size: int
            kernel size of the conv pointwise of the conformer.
        kernel_size: int
            kernel size.
        depthwise_multiplier: int
            number of input_dim channels duplication. this value
             will be used to compute the hidden channels of the Conv1D.
        dropout_rate: float
            dropout rate.
        causal: bool, optional
            if set to True, convolution have no access
             to future frames. default False.
        batch_norm: bool, optional
            if set to True, apply batchnorm before activation.
            default False
        chunk_se: int, optional
            0 for offline SE.
            1 for streaming SE, where mean is computed
             by accumulated history until current chunk_se.
            2 for streaming SE, where mean is computed
             by only the current chunk.
        chunk_size: int, optional
            chunk size for cnn. default 18
        activation: str, optional
            activation function used in ConvModule,
            default: "relu".
        glu_type: str, optional
            activation function used for the glu,
            default: "sigmoid".
        bias_in_glu: bool, optional
            if set to True, use additive bias in the weight module
             before GLU.
        linear_glu_in_convm: bool, optional
            if set to True, use GLULinear module,
             otherwise, used GLUPointWiseConv module.
              default to False.
        export: bool, optional,
            if set to True, padding is equal to 0.  This is for inference,
             or onnx export.  Typically this is set by the export program or
             the decoder program, and it isn't present in your config file.
             default False
    Fr   é   r   r   Tc                    s  t ƒ  ¡  t |¡| _|| _|| _|| _|| _|| _	|| _
|| _|| _|  ¡  |	| _|| _|	r5t |¡| _t|ƒ| _t |¡| _|| _|rN|rIdn|d }n|d d }t|||||d| _|dkrq||krot ||¡| _d S d S |dkr€t || |¡| _d S d S )Nr   r%   rJ   rR   )r   r	   r   Ú	LayerNormÚ
layer_normr_   Úext_pw_out_channelÚext_pw_kernel_sizerj   rT   rV   Úlinear_glu_in_convmra   Ú_add_ext_pw_layerÚ
batch_normr`   ÚBatchNorm1dÚbn_layerr$   ÚactÚDropoutÚdropoutÚexportrf   Údw_sep_conv_1dÚLinearÚln2)r   r_   rp   rj   rq   r`   rk   Údropout_ratera   rt   Úchunk_seÚ
chunk_sizeÚ
activationrT   rV   rr   rz   rS   r   r   r   r	   …  sF   

ûÿÿzConvModule.__init__c                 C   sH  t  ¡  | _ | _ | _| _t  ¡ | _d | _| _| j	dkrŽ| j
r?t j| j| j	| jd| jd d| _| jdkr;d| _nd| _nt j| j| j	| jd| jd d d| _d| _| jret| j| j	| j| jƒ| _nt| j| j	| j| j| j| j
ƒ| _| j| j	kr‰d| _t  | j	| j¡| _dS d| _dS tj  t d¡¡| _tj  t d¡¡| _dS )	z—
        This function is an extension of __init__ function
        and dedicated to the convolution module creation
        of the conformer.
        Fr   r%   rR   TrJ   é   N)r   r"   Úln1Úglurv   rX   Úsqueeze_excitationÚ	apply_ln1Úfix_len1rp   ra   rW   r_   rq   rr   Ú	GLULinearrT   rV   rQ   r|   r    r[   ÚonesÚpw_conv_simplify_wr\   Úpw_conv_simplify_brB   r   r   r   rs   Ã  sZ   ÿ

û
ûüú	
zConvModule._add_ext_pw_layerc                 C   sê  |   |¡}| jdkr1|  |¡}| jr(| jdkr(|dd…d| jd  …dd…f }| jr0|  |¡}n|| jd  | jd  }|| jd  | jd  }|| }| 	g d¢¡}|  
|¡}| jrr| jdkrr|dd…dd…d| jd  …f }t| dƒrŠ| 	g d¢¡}|  |¡}| 	g d¢¡}| jr’|  |¡}|  |¡}| jdkrÓ|  |¡}| jrµ|dd…dd…d| jd  …f }| jrË| 	g d¢¡}|  |¡}| 	g d¢¡}| 	g d¢¡}n| d¡ 	g d¢¡}|| jd  | jd  }| d¡}|  |¡}|S )zeConvModule Forward.

        Args:
            x: torch.Tensor
                input tensor.
        r   r%   Nrb   r}   )r   r%   r‚   rJ   rJ   )ro   rp   r„   ra   rq   r†   rƒ   rŠ   r‹   rd   r{   r`   Úhasattrr}   rt   rv   rw   rX   r‡   r,   Úsqueezery   )r   rC   Úx_0Úx_1r   r   r   rE      sH   


"
€
"




"


zConvModule.forward)	FFr   rm   r   r   TFF)r   r   r   r   r	   rs   rE   r   r   r   r   r   rl   J  s    Cï>=rl   c                       ó.   e Zd ZdZ		d‡ fdd„	Zdd„ Z‡  ZS )	rˆ   a`  Linear + GLU module

    Args:
        input_dim: int
            input size
        output_dim: int
            output size.
        glu_type:
            activation function name used in glu module.
            default "sigmoid" (swish function).
        bias_in_glu: bool, optional
            If True, the addtive bias is added. Default False.
    r   Tc                    s.   t ƒ  ¡  t ||d |¡| _td|ƒ| _d S )NrJ   r'   )r   r	   r   r|   ÚlinearrG   rY   )r   r_   rU   rT   rV   r   r   r   r	   D  s   
zGLULinear.__init__c                 C   s   |   |¡}|  |¡S )zdGLULinear forward

        Args:
            x: torch.Tensor
                inpute tensor.
        )r‘   rY   rD   r   r   r   rE   O  s   

zGLULinear.forward©r   Tre   r   r   r   r   rˆ   5  s    ûrˆ   c                       r   )	ÚFeedForwarda  FeedForward Module.
    For more details see Conformer paper:
        https://arxiv.org/pdf/2005.08100.pdf

    Args:
        d_model: int
            input size.
        d_inner: int
            output size.
        dropout_rate: float,
            dropout rate.
        activation: str,
            activation function name,
            one of ["relu", "swish", "sigmoid"],
            sigmoid activation is only used with "glu_in_fnn=True",
            default "sigmoid".
        bias_in_glu: bool, optional
    r   Tc                    sZ   t ƒ  ¡  || _|| _t |¡| _t||||ƒ}t |t 	|¡t 
||¡t 	|¡¡| _d S r   )r   r	   Úd_modelÚd_innerr   rn   ro   rˆ   Ú
Sequentialrx   r|   Únet)r   r”   r•   r~   r   rV   Úmoduler   r   r   r	   n  s   


üzFeedForward.__init__c                 C   s   |   |  |¡¡}|S )zoFeedForward forward function.

        Args:
            x: torch.Tensor
                input tensor.
        )r—   ro   )r   rC   Úoutr   r   r   rE   ƒ  s   zFeedForward.forwardr’   re   r   r   r   r   r“   Z  s    úr“   c                 C   s"   |d }|| v r|   |¡ dS dS )zöPerform pre-hook in load_state_dict for backward compatibility.

    Note:
        We saved self.pe until v.0.5.2 but we have omitted it later.
        Therefore, we remove the item "pe" from `state_dict` for backward
        compatibility.

    ÚpeN)Úpop)Ú
state_dictÚprefixÚlocal_metadataÚstrictÚmissing_keysÚunexpected_keysÚ
error_msgsÚkr   r   r   Ú	_pre_hook  s   ÿr¤   c                       s2   e Zd ZdZd‡ fdd„	Zdd„ Zd	d
„ Z‡  ZS )ÚT5RelativeAttentionLogitBiasaS  
    This module implements the relative position bias described in Section
    2.1 of the T5 paper: https://arxiv.org/pdf/1910.10683.pdf

    The Huggingface implementation is used as a reference
    https://github.com/huggingface/transformers/blob/v4.30.0/src/
    transformers/models/t5/modeling_t5.py#L435

    Modifies attention as Q*K^T + B, where B is a learned scalar bias based
    on relative position of the query and key. It is HxNxN, where H is the
    number of heads, N is the sequence length.

    I've made these modifications to the original T5 bias:
    - Skipping of the bucketing step. Original T5 bias converted rel
      position distances into logarithmically increasing buckets. This is
      supposed to help with length generalization.
    - I just directly use rel position index as bias values, as we don't
      need length generalization (40s max is good enough for ASR encoder),
      and it keeps ONNX export simple.
    - I've also extended it so that biases can be asymmetric, the default
      implementation treats L->R and R->L the same. Asymmetric was found to
      yield better results in my experiments.

    Args:
        num_heads: int
            Number of attention heads
        num_buckets: int
            Number of buckets to use for relative attention bias. This is the
            size of the learnable bias parameter. Bucketing is not yet
            supported, so this defaults to -1 which means no bucketing is
            used (max_distance determines size of bias param).
        max_distance: int
            Maximum distance to use for relative attention bias. With
            num_buckets=-1, this directly controls the max size of the bias
            parameter. When num_buckets > 0 is supported, this will control
            the maximum distance for logarithmic bucketing after which all
            positions are in the same bucket.
        symmetric: bool
            Whether to use symmetric or asymmetric biases. symmetric=False uses
            2x number of bias params to distinguish L->R from R->L. This was
            found to be better for the encoder.
    r'   éè  Fc                    sn   t ƒ  ¡  || _|| _|| _|| _| jdk | _| jr|| _ntdƒ‚| js,|  jd9  _t 	| j| j¡| _
d S )Nr   z;T5 attention bias with bucketed positions is not yet testedrJ   )r   r	   Ú	num_headsÚnum_bucketsÚmax_distanceÚ	symmetricÚ_skip_bucketingÚNotImplementedErrorr   Ú	EmbeddingÚbias_values)r   r§   r¨   r©   rª   r   r   r   r	   Ò  s   
ÿz%T5RelativeAttentionLogitBias.__init__c                 C   sÜ   |  d¡}tj||jtjdd d …d f }tj||jtjdd d d …f }|| }| || j k | j ¡}| || jd k| jd ¡}| jrI|}n|  |¡}| j	rV| 
¡ }n|| jd 7 }|  |¡}| ddd¡ d¡}|S )Nr%   ©ÚdeviceÚdtyperJ   r   )Úsizer    r+   r°   r(   Úmasked_fillr©   r«   Ú_bucket_relative_positionrª   Úabsr¨   r®   rd   r,   )r   rC   ÚmaxposÚcontext_positionÚmemory_positionÚrelative_positionÚbias_idxÚt5_rel_att_biasr   r   r   rE   ã  s.   

ÿ
ÿÿÿ


z$T5RelativeAttentionLogitBias.forwardc                 C   sÌ   d}| j s|  jd  _||dk tj¡| j 7 }t |¡}n
t |t |¡¡ }| jd }||k }|t | 	¡ | ¡t
 | j| ¡ | j|   tj¡ }t |t || jd ¡¡}|t |||¡7 }|S )Nr   rJ   r%   )ra   r¨   Útor    r(   rµ   ÚminÚ
zeros_likeÚlogÚfloatÚmathr©   Ú	full_likeÚwhere)r   r¹   Úrelative_bucketsÚ	max_exactÚis_smallÚrelative_position_if_larger   r   r   r´     s<   
ÿþ
ÿ
ÿþüþÿz6T5RelativeAttentionLogitBias._bucket_relative_position)r'   r¦   F)r   r   r   r   r	   rE   r´   r   r   r   r   r   r¥   ¦  s
    +"r¥   c                       s:   e Zd ZdZd
‡ fdd„	Zdd„ Zdejfdd	„Z‡  Z	S )ÚAbsolutePositionalEncodingai  Absolute Positional encoding module.
    This module implement Absolute sinusoidal positional encoding
    from: https://arxiv.org/pdf/1706.03762.pdf

    Args:
        d_model: int
            Input embedding size.
        dropout_rate: float
            dropout rate
        max_len: int, optional
            Maximum input length sequence, Default 5000

    éˆ  c                    sZ   t ƒ  ¡  || _t | j¡| _tjj|d| _	d| _
|  t d¡ d|¡¡ |  t¡ dS )z'Construct an PositionalEncoding object.©ÚpNç        r%   )r   r	   r”   rÁ   ÚsqrtÚxscaler    r   rx   ry   rš   Ú	extend_peÚtensorr.   Ú"_register_load_state_dict_pre_hookr¤   )r   r”   r~   Úmax_lenr   r   r   r	   ;  s   
z#AbsolutePositionalEncoding.__init__c                 C   s  | j dur+| j  d¡| d¡kr+| j j|jks| j j|jkr)| j j|j|jd| _ dS t | d¡| j¡}tjd| d¡tj	d 
d¡}t tjd| jdtj	dt d¡| j   ¡}t || ¡|dd…ddd…f< t || ¡|dd…ddd…f< | 
d¡}|j|j|jd| _ dS )	zSReset the positional encodings.

        Args:
            x: torch.Tensor
        Nr%   )r±   r°   r   ©r±   rJ   g     ˆÃ@r¯   )rš   r²   r±   r°   r¼   r    r\   r”   r+   Úfloat32r,   ÚexprÁ   r¿   ÚsinÚcos)r   rC   rš   ÚpositionÚdiv_termr   r   r   rÏ   E  s    ÿÿ  
z$AbsolutePositionalEncoding.extend_perC   c                 C   s:   |   |¡ || j | jdd…d| d¡…f  }|  |¡S )zãAdd positional encoding.

        Args:
            x: torch.Tensor
                Input tensor. shape is (batch, time, ...)

        Returns:
            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)

        Nr%   )rÏ   rÎ   rš   r²   ry   rD   r   r   r   rE   Z  s   
&
z"AbsolutePositionalEncoding.forward)rÉ   )
r   r   r   r   r	   rÏ   r    r   rE   r   r   r   r   r   rÈ   ,  s
    
rÈ   c                       s2   e Zd ZdZ‡ fdd„Zdedefdd„Z‡  ZS )ÚMeanVarianceNormLayerzèMean/variance normalization layer.

    Will subtract mean and multiply input by inverted standard deviation.
    Typically used as a very first layer in a model.

    Args:
        input_size: int
            layer input size.
    c                    s8   t ƒ  ¡  || _t t |¡¡| _t t |¡¡| _	d S r   )
r   r	   r
   r   r[   r    r\   Úglobal_meanr‰   Úglobal_invstd)r   r
   r   r   r   r	   v  s   
zMeanVarianceNormLayer.__init__Úinput_r@   c                 C   s   || j  | j S )ztMeanVarianceNormLayer Forward

        Args:
            input_: torch.Tensor
                input tensor.
        )rÛ   rÜ   )r   rÝ   r   r   r   rE   |  s   zMeanVarianceNormLayer.forwardrF   r   r   r   r   rÚ   k  s    
rÚ   c                       s|   e Zd ZdZ								ddeded	ed
edeeef dededededdf‡ fdd„Zddd„Z	d‡ fdd„	Z
‡  ZS )ÚCausalConv1Da¬  
    A causal version of nn.Conv1d where each step would have limited access to
    locations on its right or left
    All arguments are the same as nn.Conv1d except padding.

    If padding is set None, then paddings are set automatically to make it a
    causal convolution where each location would not see any steps on its right.

    If padding is set as a list (size of 2), then padding[0] would be used as
    left padding and padding[1] as right padding.
    It would make it possible to control the number of steps to be accessible
    on the right and left.
    This mode is not supported when stride > 1. padding[0]+padding[1] should
    be equal to (kernel_size - 1).
    r%   r   Tr\   NÚin_channelsÚout_channelsr`   ÚstriderS   Údilationrg   ÚbiasÚpadding_moder@   c                    sÚ   d | _ |d u r|d | _|d | _nD|dkr ||d kr tdƒ‚t|tƒr,|| _|| _n*t|tƒrNt|ƒdkrN|d |d  |d krN|d | _|d | _ntd|› dƒ‚| j| _t	ƒ j
||||d||||	|
|d d S )Nr%   z3No striding allowed for non-symmetric convolutions!rJ   r   zInvalid padding param: ú!)rß   rà   r`   rá   rS   râ   rg   rã   rä   r°   r±   )Úcache_drop_sizeÚ_left_paddingÚ_right_paddingrZ   Ú
isinstancerO   Úlistr/   Ú_max_cache_lenr   r	   ©r   rß   rà   r`   rá   rS   râ   rg   rã   rä   r°   r±   r   r   r   r	   —  s<   

ÿ

õzCausalConv1D.__init__c                 C   s¨   |d u rt j|| j| jfd}|}||fS t j|d| jfd}tj||gdd}| jdkr=|d d …d d …d | j …f }n|}|d d …d d …| d¡ d …f }||fS )N©r*   r   r'   rK   )ÚFr*   rç   rè   r    Úcatræ   r²   )r   rC   ÚcacheÚnew_xÚ
next_cacher   r   r   Úupdate_cacheÉ  s   	ù
 "zCausalConv1D.update_cachec                    s2   | j ||d\}}tƒ  |¡}|d u r|S ||fS )N)rð   )ró   r   rE   )r   rC   rð   r   r   r   rE   ×  s
   zCausalConv1D.forward©r%   r   r%   r%   Tr\   NNr   )r   r   r   r   rO   r   rP   Úboolr	   ró   rE   r   r   r   r   r   rÞ   †  sB    ôþýüû
úùø	÷
öó
2rÞ   c                       sp   e Zd ZdZ								ddeded	ed
edeeef dededededdf‡ fdd„Z‡ fdd„Z	‡  Z
S )ÚCausalConv2Dzâ
    A causal version of nn.Conv2d where each location in the 2D matrix would
    have no access to locations on its right or down
    All arguments are the same as nn.Conv2d except padding which should be
    set as None
    r%   r   Tr\   Nrß   rà   r`   rá   rS   râ   rg   rã   rä   r@   c                    sL   |d urt dƒ‚|d | _|d | _d}tƒ  |||||||||	|
|¡ d S )Nz8Argument padding should be set to None for CausalConv2D.r%   r   )rZ   rç   rè   r   r	   rì   r   r   r   r	   è  s$   

õzCausalConv2D.__init__c                    s*   t j|| j| jddfd}tƒ  |¡}|S )Nr   rí   )rî   r*   rç   rè   r   rE   rD   r   r   r   rE   
  s   þzCausalConv2D.forwardrô   )r   r   r   r   rO   r   rP   rõ   r	   rE   r   r   r   r   r   rö   à  s@    ôþýüû
úùø	÷
öó"rö   c                       sx   e Zd ZdZdddde ¡ df‡ fdd„	Zd	d
„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdefdd„Z‡  ZS )ÚNemoConvSubsamplinga|  Convlutional subsampling module, taken from NeMo ASR
    (https://github.com/NVIDIA/NeMo/blob/b367413645d5c72db3c2c96e46e95a
    34501479cf/nemo/collections/asr/parts/submodules/subsampling.py)

    Striding Subsampling: "Speech-Transformer: A No-Recurrence
    Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong
    et al. (https://ieeexplore.ieee.org/document/8462506)


    Compared with the EncoderConv2D (`input_layer: custom`), this is a
    much simplified approach, and uses no LayerNorm and far fewer Conv2Ds.
    Moreover, depthwise convolutions are used to reduce FLOPs, but the first
      layer is kept as a regular convolution so as not to degrade accuracy.

    `Striding` and `dw_striding` are the same except that the latter uses
    depthwise convolutions after the first layer, whereas the former does not.

    Args:
        subsampling_factor (int): Time reduction factor
        feat_in (int): size of the input features
        feat_out (int): size of the output features
        subsampling (str): The subsampling technique, choose from
            {"striding", "dw-striding", "striding_conv1d",
            "dw_striding_conv1d"}
        conv_channels (int): Number of channels for the convolution layers,
                            default is 256.
        subsampling_conv_chunking_factor (int): Input chunking factor which
            can be -1 (no chunking) 1 (auto) or a power of 2. Default is 1
        activation (Module): activation function, default is nn.ReLU()
        is_causal (bool): whether to use causal Conv1/2D, where each step will
            have limited access to locations on its right or left
    é   Údw_stridingé   r%   Fc	                    sr  t ƒ  ¡  || _|| _|| _|| _|d dkrtdƒ‚tt 	|d¡ƒ| _
|| _|| _|dv | _|dkrA|dkrA|d dkrAtdƒ‚|| _d}	g }
|dkrîd| _d	| _d
| _| jrj| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _| jr|
 t|	|| j| jd d¡ n|
 tjj|	|| j| j| jd¡ |}	|
 |¡ t| j
d ƒD ]=}| jrÃ|
 t|	|	| j| jd |	d¡ n|
 tjj|	|	| j| j| j|	d¡ |
 tjj|	|ddddd¡ |
 |¡ |}	q®n|dkr[d| _d	| _d
| _| jr| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _t| j
ƒD ].}| jr?|
 t|	|| j| jd d¡ n|
 tjj|	|| j| j| jd¡ |
 |¡ |}	q*n|dkrÝ|}	d| _d| _d
| _| jr| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _t| j
ƒD ]B}| jr¸|
 t|	| j
|d kr­|n|| j| jd d¡ n|
 tjj|	| j
|d krÈ|n|| j| j| jd¡ |
 |¡ |}	q™n|dkre|}	d| _d| _d
| _| jd d | _| jd d | _|
 tjj|	|	| j| j| j|	dtjj|	| j
dkr|n|dddddg¡ |}	|
 |¡ t| j
d ƒD ]2}|
 tjj|	|	| j| j| j|	dtjj|	| j
|d krQ|n|dddddg¡ |
 |¡ |}	q1ntd|› dƒ‚|dv rœtj|tjd}t || j| j | j| j| j| j
d}tj !|t|ƒ |¡| _"d| _#n|dv r¨d | _"d
| _#ntd|› dƒ‚tjj$|
Ž | _%d S )NrJ   r   z*Sampling factor should be a multiply of 2!)rù   ÚstridingÚstriding_conv1dr'   r%   úAsubsampling_conv_chunking_factor should be -1, 1, or a power of 2rù   r‚   F)rß   rà   r`   rá   rS   )rß   rà   r`   rá   rS   rg   rû   rü   é   Údw_striding_conv1dzNot valid sub-sampling: rå   )rù   rû   rÓ   )ÚlengthsÚall_paddingsr`   rá   Ú	ceil_modeÚ
repeat_numT)rü   rÿ   )&r   r	   Ú_subsamplingÚ_conv_channelsÚ_feat_inÚ	_feat_outrZ   rO   rÁ   r¿   Ú_sampling_numÚsubsampling_factorÚ	is_causalÚsubsampling_causal_condÚ subsampling_conv_chunking_factorÚ_strideÚ_kernel_sizeÚ
_ceil_moderç   rè   rë   Úappendrö   r    r   ÚConv2dÚrangerÞ   rW   ÚextendrÐ   rÀ   Úcalc_lengthr|   r™   Úconv2d_subsamplingr–   Úconv)r   Úfeat_inÚfeat_outr	  ÚsubsamplingÚconv_channelsr  r   r
  rß   ÚlayersÚiÚ	in_lengthÚ
out_lengthr   r   r   r	   8  sÎ  

ÿûÿ
ûÿ	
úÿúÿúÿ

Ý
%ûÿ
ûÿ
	ê
ÿ÷ÿÿ÷ÿ
â
 úø÷ÿ
úÿö÷ÿ
æ

ú
zNemoConvSubsampling.__init__c                 C   s
   d| j gS )Nr%   ©r	  rB   r   r   r   Úget_sampling_framesY  s   
z'NemoConvSubsampling.get_sampling_framesc                 C   s   d| j d gS )Nr   r%   r  rB   r   r   r   Úget_streaming_cache_size\  s   z,NemoConvSubsampling.get_streaming_cache_sizec                 C   s„  | j r| d¡n| dd¡}| jdkrR| j rR| jdkr.d| j | j | j }t |¡|k}nd}|rL|  |¡\}}|sK| j	dkrF|  
|¡}n|  |¡}n|  |¡}n|  |¡}| j rq| ¡ \}}}}	|  | dd¡ ||d¡¡}n| dd¡}|du r|dfS |jd }
| d¡}t || j ¡}| jr¦| jr¦|| j }||dk  d7  < tjd|
|jd	 | d¡d¡| d¡k }|| d¡fS )
aø  
        Forward method for NeMo subsampling.

        Args:
            x[Batch, Time, Filters]: torch.Tensor
                input tensor
            x_mask: torch.Tensor
                input mask

        Returns:
            x: torch.Tensor
                Resulting tensor from subsampling (B, T //
                time_reduction_factor, feat_out)
            pad_mask: torch.Tensor
                tensor of padded hidden state sequences (B, 1, T //
                time_reduction_factor)
        r%   rJ   r'   ì        Trù   Nr   )r°   )r  r,   Ú	transposer  r  r  r    ÚnumelÚconv_split_by_batchr  Úconv_split_by_channelr  r²   r™   ÚreshapeÚshapeÚsumÚceilr	  r
  r  r+   r°   r.   )r   rC   ÚmaskÚx_ceilÚneed_to_splitÚsuccessÚbÚcÚtÚfÚmax_audio_lengthÚfeature_lensÚpadding_lengthÚfeature_lens_remainderÚpad_maskr   r   r   rE   _  sB   


€




ÿþzNemoConvSubsampling.forwardc                 C   sl  | j dkr´t ¡ ¡ d| j }| jd d }| jd }tjj | jd j	| |¡ tjj | jd j
| |¡ tdt| jƒdƒD ]>}tjj | j| j	| |¡ tjj | j| j
| |¡ tjj | j|d  j	| |¡ tjj | j|d  j
| |¡ q@| j| j | j d }tjj | jj	| |¡ tjj | jj
| |¡ W d   ƒ d S 1 s­w   Y  d S d S )Nrù   ç      ð?rJ   g      à¿r   r‚   r%   )r  r    Úno_gradr  r  r   ÚinitÚuniform_r  Úweightrã   r  r/   r  r  r  r™   )r   ÚscaleÚdw_maxÚpw_maxr7   Úfc_scaler   r   r   Úreset_parameters£  s"   



 ""ìÿz$NemoConvSubsampling.reset_parametersc                    sª   |  ¡ \}}}}|dkr|dfS ˆ jdkrˆ j}ndˆ j ˆ j ˆ j }t t t |¡| d¡¡}d| }|| }|dkrB|dfS t 	‡ fdd„t 
||d¡D ƒ¡dfS )	z:Tries to split input by batch, run conv and concat resultsr%   Fr"  rJ   r   c                    s   g | ]}ˆ   |¡‘qS r   ©r  ©Ú.0rL   rB   r   r   Ú
<listcomp>Ñ  s    z;NemoConvSubsampling.conv_split_by_batch.<locals>.<listcomp>T)r²   r  r  r  rÁ   r*  r¿   r    r$  rï   Úsplit)r   rC   r/  Ú_Úcfr,  rË   Únew_batch_sizer   rB   r   r%  ¼  s    
ÿüz'NemoConvSubsampling.conv_split_by_batchc           	   	      s  ˆj d |ƒ}ˆj d |ƒ}tˆjd ƒD ]i‰ | ¡ \}}}}ˆjdkr(ˆj}nt t t 	|¡d d¡¡}d| }t
|| ƒ}|dkrFd}t
|| ƒ}|dkrRd}ˆ ˆj ˆ d d  ||¡}t ‡ ‡fdd„t ||d¡D ƒd¡}ˆj ˆ d d  |ƒ}q|S )	zOFor dw convs, tries to split input by time, run conv and concat
        resultsr   r%   r"  rJ   r‚   c                    s"   g | ]}ˆj ˆ d  d   |ƒ‘qS )r‚   rB  rC  ©r  r   r   r   rE  ö  s   " z=NemoConvSubsampling.conv_split_by_channel.<locals>.<listcomp>rø   )r  r  r  r²   r  rÁ   r*  r¿   r    r$  rO   Úchannel_chunked_convrï   rF  )	r   rC   rG  r0  r1  rH  rË   Únew_cÚnew_tr   rJ  r   r&  Ö  s.   
ÿþz)NemoConvSubsampling.conv_split_by_channelc           	   	   C   s  d}g }t  ||d¡D ]x}| ¡ d }| jrTtjj|| jd | jd | jd | jd fd}tjj	||j
||| …dd…dd…dd…f |j||| … | jd|d}n&tjj	||j
||| …dd…dd…dd…f |j||| … | j| j|d}| |¡ ||7 }qt  |d¡S )z$Performs channel chunked convolutionr   r%   rí   N)rã   rá   rS   rg   )r    rF  r²   r
  r   r)   r*   r  r  Úconv2dr<  rã   rç   r  rï   )	r   r  r€   rC   ÚindÚ
out_chunksrL   ÚstepÚch_outr   r   r   rK  ü  s@   üþ	$ú	$ú

z(NemoConvSubsampling.channel_chunked_convr  c                 C   s.   |dkr|dkr|d dkrt dƒ‚|| _d S )Nr'   r%   rJ   r   rý   )rZ   r  )r   r  r   r   r   Ú'change_subsampling_conv_chunking_factor$  s   ÿ
z;NemoConvSubsampling.change_subsampling_conv_chunking_factor)r   r   r   r   r   r   r	   r   r!  rE   rA  r%  r&  rK  rO   rS  r   r   r   r   r   r÷     s*    %÷  #D&(ÿr÷   r%   c           	      C   s^   || }d}t |ƒD ]}t | jtjd| |¡| } |r"t | ¡nt | ¡} q
| jtjdS )z^Calculates the output length of a Tensor passed through a convolution or
    max pooling layerr8  rÓ   )r  r    Údivr¼   rÀ   r*  ÚfloorrO   )	r   r  r`   rá   r  r  Úadd_padÚoner  r   r   r   r  2  s   r  c                       sp   e Zd ZdZ‡ fdd„Zddd„Z			dded	ee d
ee dee deeeee ee f f
dd„Z	‡  Z
S )Ú	AttModulezAttention abstraction modulec                    s   t ƒ  ¡  d| _d S )NF)r   r	   Úexport_moderB   r   r   r   r	   A  s   

zAttModule.__init__Tc                 C   s
   || _ dS )zset the export modeN)rY  )r   Úmoder   r   r   Ú
set_exportE  ó   
zAttModule.set_exportNrC   ÚmemoryÚpos_embÚatt_maskr@   c                 C   s   ||||fS )a[  AttModule forward

        Args:
            x: torch.Tensor
                input tensor.
            memory: torch.Tensor, optional
                memory tensor.
            pos_emb: torch.Tensor, optional
                positional encoder embedding.
            att_mask: torch.Tensor, optional
                attention mask tensor.
        r   )r   rC   r]  r^  r_  r   r   r   rE   I  s   zAttModule.forward)T)NNN)r   r   r   r   r	   r[  r   r   ÚtuplerE   r   r   r   r   r   rX  >  s$    
ûþýüûúrX  c                   @   s   e Zd ZdZddd„ZdS )ÚAttBlockzBAttention Block module to support both Attention and Block module.Fc                 C   s
   d| j fS )zmemory dimensionsr%   )r
   )r   rÒ   r   r   r   Úmemory_dimsb  r\  zAttBlock.memory_dimsN)F)r   r   r   r   rb  r   r   r   r   ra  _  s    ra  r+  c                 C   sT   |d ur!|  d¡ d¡}|  |tj ¡} tj| dd |d¡}|S tj| dd}|S )Nr%   r   r'   rK   rÌ   )r,   Úeqr³   r    ÚinfÚsoftmax)Úscoresr+  Úattnr   r   r   Úmasked_softmaxg  s   ÿÿrh  c                       s¨   e Zd ZU dZejje ed< ejje	 ed< ejje	 ed< ejje	 ed< 							
dde	f‡ fdd„Z
	ddedededededee dee fdd„Z‡  ZS )ÚMultiHeadedAttentiona-  Multi-Head Attention layer with optional relative position embedding
    and GLU.

    Args:
        n_head: int
            the number of heads.
        n_feat: int
            input size features.
        dropout_rate: float
            dropout rate.
        use_LN: bool
            apply layer norm or not
        dropout_at_output: bool
            whether to apply dropout at output
        attention_inner_dim: int, optional
            the attention dimension used in the class,
            it can be different from the input dimension n_feat.
            default: -1 (equal to n_feat).
        use_pt_scaled_dot_product_attention: bool, optional
            if set True, use pytorch scaled dot product attention in training.
            NOTE: this will NOT be used in ONNX decoding due to a lack of
            support.  In that case, we use the original attention
            implementation, which shows no regression.
            default: False.
        n_value: int, optional
            if set to values other than -1, use a different dimension for
            value. With the default value (i.e. -1), it is backward compatible.
        group_size: int, optional. must divide `n_head`
            if group_size > 1:       GQA
            if group_size = 1:       MHA
            if group_size = n_head:  MQA
    Úinv_sqrt_d_kÚhÚh_kÚgr'   r   TFr%   Ú
group_sizec
           
         s<  t ƒ  ¡  |dkr|}|dkr|}|| dksJ ‚|| | _dt | j¡ | _|| _||	 dks4J dƒ‚|	| _||	 | _t	 
||¡| _t	 
|||	 ¡| _t	 
|||	 ¡| _t	 
||	 |¡| _tj d tt ¡| _t	j|d| _|| _|| _|r|	dkrtdƒ‚tjj ¡ | _tjj ¡ | _tjj ¡ | _ tjj	j! "¡ | _#d S )Nr'   r   r8  zgroup_size must divide n_headrÊ   r%   z'Cannot use PT Scaled Attention with GQA)$r   r	   Úd_krÁ   rÍ   rj  rk  rm  rl  r   r|   Úlinear_qÚlinear_kÚlinear_vÚ
linear_outr    ÚjitÚ	Attributer   r   rg  rx   ry   r~   Ú#use_pt_scaled_dot_product_attentionrZ   ÚaoÚquantizationÚ	QuantStubÚquant_qÚquant_xÚDeQuantStubÚdequantÚ	quantizedÚFloatFunctionalÚffunc)
r   Ún_headÚn_featr~   Úattention_inner_dimrT   rV   rv  Ún_valuern  r   r   r   r	     s4   


zMultiHeadedAttention.__init__NÚqueryÚkeyr&   Úpos_kÚpos_vr+  Úrelative_attention_biasc                 C   s  |  d¡}|  |¡ |d| j| j¡}	|  |¡ |d| j| j¡}
|  |¡ |d| j| j¡}| jr:t	j
 ¡ s:|	 dd¡n|	 dd¡| j }	|
 dd¡}
| dd¡}| jr²t	j
 ¡ s²d}|dury| d¡}|durk|| }n|}|j|	jkry| |	j¡}t	jj t	jjjjt	jjjjt	jjjjt	jjjjg¡ t	jjj|	|
||| jd}W d  ƒ n1 s¬w   Y  nÂ| j| jkrÌ|	 || j| jd| j¡}	t	 d|	|
¡}n
t	 |	|
 dd¡¡}|dur| j| jkrét	 d	|	|¡}n.|	  ¡  || j d| j¡ dd¡}t	 || dd¡¡}| dd¡ || j|  d¡|  d¡¡}|| }n|}|dur'|| }t!||ƒ}|| _"|  #|¡}t	 | |j¡|¡}|durt|  ¡  || j |  d¡|  d¡¡ dd¡}t	 ||¡ dd¡  ¡  || j|  d¡| j¡}|| }| dd¡  ¡  |d| j| j ¡}|  $|¡S )
aï  Compute 'Scaled Dot Product Attention'.

        Args:
            query: torch.Tensor
                query tensor (batch, time1, size)
            key: torch.Tensor
                key tensor (batch, time2, size)
            value: torch.Tensor
                value tensor (batch, time1, size)
            pos_k: torch.Tensor
                key tensor used for relative positional embedding.
            pos_v: torch.Tensor
                value tensor used for relative positional embedding.
            mask: torch.Tensor
                mask tensor (batch, time1, time2)
            relative_attention_bias: torch.Tensor
                bias added to attention logits w.r.t. relative positions
                (1, n_head, time1, time2)
        r   r'   r%   rJ   N)Ú	attn_maskÚ	dropout_pzb g h t d, b h s d -> b h t séþÿÿÿzb g h t d, t s d -> b h t s)%r²   rp  Úviewrk  ro  rq  rl  rr  rv  r    rt  Úis_scriptingr#  rj  r,   r±   r¼   r   Ú	attentionÚsdpa_kernelÚ
SDPBackendÚFLASH_ATTENTIONÚEFFICIENT_ATTENTIONÚMATHÚCUDNN_ATTENTIONr)   Úscaled_dot_product_attentionr~   r'  rm  ÚeinsumÚmatmulÚ
contiguousrh  rg  ry   rs  )r   r…  r†  r&   r‡  rˆ  r+  r‰  Ún_batchÚqr£   ÚvrŠ  rC   ÚAÚBÚ	reshape_qrf  rg  Úp_attnÚreshape_attnÚattn_vr   r   r   rE   Í  s˜   
ÿÿý




üÿûø€
ýÿÿ




ý
ü ÿ
zMultiHeadedAttention.forward)r'   r   TFr'   r%   r   )r   r   r   r   r    rt  ÚFinalrÀ   Ú__annotations__rO   r	   r   r   rE   r   r   r   r   r   ri  v  s>   
 !ö
ö8øþýüûúùøri  c                   @   s    e Zd ZdZejjdd„ ƒZdS )ÚMultiSequentialz,Multi-input multi-output torch.nn.Sequentialc                 G   s   | D ]}||Ž }q|S )zForward method implementation.r   )r   ÚargsÚmr   r   r   rE   K  s   
zMultiSequential.forwardN)r   r   r   r   r    rt  ÚignorerE   r   r   r   r   r¥  H  s    r¥  Úinput_layerÚtime_reductionc                 C   s@   | dv r
|dkr
dS | dv r|dkrdS | dv r|dkrdS d	S )
a  Get an offset. We will use the offset for determining #frames of a
    subsampled feature.

    Args:
        input_layer (str): Type of an input layer
        time_reduction (int): time reduction factor for downsampling a feature
    Returns:
        int: offset
    )rN  Ú	nemo_convrø   r‚   )rN  é   r%   é   é   r   r   )r©  rª  r   r   r   Ú
get_offsetS  s   
r¯  c                 C   s€   | j \}}}|  dd¡} tj| dddd…f d|fd|fd} | j \}}}|  |d||¡} |  ddd	d¡ ¡ } |  d||¡} | S )
zá
    For a given tensor with shape of (N, T, D), if sequence length T is
    longer than max_seq_len, this function unfold it to a
    (NT', max_seq_len, D) where T' is T // max_seq_len.
    Args:
        xs_pad: N, T, D
    r'   rŒ  .Nr%   )r`   rá   r   r‚   rJ   )r(  r#  rî   Úunfoldr  rd   r™  )Úxs_padÚmax_seq_lenrG  ÚDÚnew_bszÚslenr   r   r   Úunfold_tensorf  s   ýr¶  )r   )r   r   )r%   ))rÁ   Útypingr   r   r    Útorch.nn.functionalr   r)   rî   r   ÚModuler   r$   r?   r   rG   rQ   rf   rl   rˆ   r“   r¤   r¥   rÈ   rÚ   rW   rÞ   r  rö   r÷   r  rX  ra  rh  ri  r–   r¥  rP   rO   r¯  r¶  r   r   r   r   Ú<module>   sN   
	
1"jD l%6 ?Z6    
 !
þ S