o
    ni"4                     @   s  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dl mZ d dlmZmZmZmZmZmZmZmZ d dlmZ ddlmZ ddlmZ ddlmZmZmZ erqd d	lmZ d d
lm Z  e!e"Z#ddddiddiddigdgddiddiddigdZ$ee% e&d< e'dpddfZ(eee)e)f  e&d< e'dpdZ*ee) e&d< ej+G dd dZ,ej+G d d! d!Z-ej+G d"d# d#Z.G d$d% d%Z/ej0G d&d' d'Z1dS )(    N)deque)TYPE_CHECKINGAnyDictFinalListOptionalTupleUnion)	telemetry   )aggregate_mean)asset_registry)	InterfaceMetricMetricsMonitor)Deque)SettingsStatic1sz.*typeneuroncore_countersmemory_usedneuron_runtime_vcpu_usage)
tag_filtermetrics
vcpu_usagememory_infoneuron_hw_counters)periodneuron_runtimessystem_metricsNEURON_MONITOR_DEFAULT_CONFIGz	neuron-lsz/opt/aws/neuron/bin/neuron-lsz-jNEURON_LS_COMMANDzneuron-monitorz"/opt/aws/neuron/bin/neuron-monitorNEURON_MONITOR_PATHc                   @   s6   e Zd ZU eed< eed< eed< eed< eed< dS )_NeuronCoreMemoryUsage	constants
model_codemodel_shared_scratchpadruntime_memorytensorsN__name__
__module____qualname__int__annotations__ r0   r0   g/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/wandb/sdk/internal/system/assets/trainium.pyr$   <   s   
 r$   c                   @   s.   e Zd ZU eed< eed< eed< eed< dS )_HostMemoryUsageapplication_memoryr%   dma_buffersr)   Nr*   r0   r0   r0   r1   r2   E   s
   
 r2   c                   @   sF   e Zd ZU eeef ed< eed< eed< eed< eeef ed< dS )_Statsneuroncore_utilizationhost_total_memory_usage neuron_device_total_memory_usagehost_memory_usageneuroncore_memory_usageN)	r+   r,   r-   r   r.   floatr/   r2   r$   r0   r0   r0   r1   r5   M   s   
 r5   c                   @   s   e Zd ZU dZdZeed< ded< d dd	Zd d
dZde	de
e ddfddZd ddZd ddZdedefddZd ddZd ddZededefddZdefddZdS )!NeuronCoreStatszAWS Trainium stats.z	trn.{key}namezDeque[_Stats]samplesreturnNc                 C   sZ   t | jjjddd t| jd}tjt|dd W d   dS 1 s&w   Y  dS )z!Write neuron monitor config file.T)parentsexist_okw   )indentN)	pathlibPathneuron_monitor_config_pathparentmkdiropenjsondumpr!   )selffr0   r0   r1   write_neuron_monitor_config^   s   "z+NeuronCoreStats.write_neuron_monitor_configc              
   C   s   |    zOtd| jg}tj|tjdd5}| j s8|jdu r&| j	d q|j
 }|r3| j| | j r|  |	  W d   W dS 1 sLw   Y  W dS  tyn } ztd| W Y d}~dS d}~ww )z=Run neuron-monitor in a separate process to collect raw data.-cNstdoutstderr皙?neuron-monitor failed: {})rO   r#   rG   
subprocessPopenPIPEshutdown_eventis_setrR   waitreadlineraw_samplesappendkill	Exceptionloggererrorformat)rM   commandprocessraw_dataer0   r0   r1   neuron_monitorh   s6   




&zNeuronCoreStats.neuron_monitorpidrG   c                 C   sB   || _ |ptjddj| _tdd| _t | _t	 | _
d | _d S )NF)delete
   )maxlen)ri   tempfileNamedTemporaryFiler=   rG   r   r]   r>   	threadingEventrY   neuron_monitor_thread)rM   ri   rG   r0   r0   r1   __init__   s   

zNeuronCoreStats.__init__c                 C   sD   | j durdS td | j  tjd| jdd| _ | j   dS )z8Start the neuron-monitor thread for collecting raw data.NzStarting neuron-monitor threadNeuronCoreMntrT)r=   targetdaemon)	rq   ra   debugrY   clearro   Threadrh   startrM   r0   r0   r1   setup   s   


zNeuronCoreStats.setupc              
   C   s   t d z:z| j  | jdusJ | j  W n ty3 } zt d| W Y d}~nd}~ww W d| _dS W d| _dS d| _w )zStop the neuron-monitor thread.zStopping neuron-monitor threadNz(neuron-monitor thread failed to stop: {})	ra   rv   rY   setrq   joinr`   rb   rc   )rM   rg   r0   r0   r1   teardown   s   


zNeuronCoreStats.teardownentryc                 C   s    t |d t | jkpdtjv S )a-  Check if the entry should be saved.

        Checks if the pid in the entry matches the pid of the process.
        If not (as in the case of multi-process training with torchrun),
        checks if the LOCAL_RANK environment variable is set.

        todo: add matching by neuron_runtime_tag
        ri   
LOCAL_RANK)r.   ri   osenviron)rM   r   r0   r0   r1   _is_matching_entry   s    	z"NeuronCoreStats._is_matching_entryc              
      s  zut  jd } fdd|d D d }|d d }dd	 | D }|d
 d }|d }|d }|d }tdi |d }	dd	 |d  D }
ttjdd}|dkrd||| i}||
| i}
t	||||	|
d} j
| W d S  ty } zW Y d }~d S d }~ww )Nc                    s   g | ]}  |r|d  qS )report)r   ).0r   rz   r0   r1   
<listcomp>   s    z*NeuronCoreStats.sample.<locals>.<listcomp>neuron_runtime_datar   r   neuroncores_in_usec                 S   s   i | ]\}}t ||d  qS )r6   )r.   r   kvr0   r0   r1   
<dictcomp>   s    z*NeuronCoreStats.sample.<locals>.<dictcomp>r   neuron_runtime_used_byteshostneuron_deviceusage_breakdownc                 S   s$   i | ]\}}t |td i |qS )r0   )r.   r$   r   r0   r0   r1   r      s    r:   r   i)r6   r7   r8   r9   r:   r0   )rK   loadsr]   itemsr2   r.   r   r   getr5   r>   r^   r`   )rM   	raw_statsr   r   r6   r   r7   r8   r   r9   r:   
local_rankstatsrg   r0   rz   r1   sample   sV   

zNeuronCoreStats.samplec                 C      | j   d S N)r>   rw   rz   r0   r0   r1   rw         zNeuronCoreStats.clearr   c                    sF   i  dt dtddf fddt|  D ]	\}}|| q S )z2Flatten _Stats object into a flat dict of numbers.keyvaluer?   Nc                    s   t |ttfr|  |i} | d S t |tr=| D ]\}}t |tr0| d|  | q|  d| | qd S t |trUt|D ]\}}| d|  | qFd S d S )N.)
isinstancer.   r;   updatedictr   list	enumerate)r   r   retkkvvival	flattenedhelperr0   r1   r      s   




z-NeuronCoreStats.flatten_stats.<locals>.helper)strr   dataclassesasdictr   )r   kkkvvvr0   r   r1   flatten_stats   s
   zNeuronCoreStats.flatten_statsc                    s~    j si S i }tt} fdd j D D ]}| D ]\}}|| | qq| D ]\}}t|| jj|d< q-|S )Nc                 3   s    | ]}  |V  qd S r   )r   )r   r   rz   r0   r1   	<genexpr>"  s    z,NeuronCoreStats.aggregate.<locals>.<genexpr>)r   )	r>   collectionsdefaultdictr   r   r^   r   r=   rc   )rM   r   merged_samplesflattened_sampler   r   r0   rz   r1   	aggregate  s   zNeuronCoreStats.aggregater?   N)r+   r,   r-   __doc__r=   r   r/   rO   rh   r.   r   rr   r{   r~   r   boolr   r   rw   staticmethodr5   r   r   r0   r0   r0   r1   r<   X   s*   
 







9r<   c                   @   s\   e Zd Zdddddejddfdd	Zedefd
dZdddZ	dddZ
defddZdS )Trainium	interfacer   settingsr   rY   r?   Nc                 C   sT   | j j | _t|j|jg| _t| j| j|||| _	t
 }d|j_|| d S )NT)	__class__r+   lowerr=   r<   x_stats_pid"x_stats_neuron_monitor_config_pathr   r   metrics_monitorr   TelemetryRecordenvtrainium_publish_telemetry)rM   r   r   rY   telemetry_recordr0   r0   r1   rr   /  s    zTrainium.__init__c                 C   sj   t td  sdS ztjtdtjd }tt	
|dkr#W dS W dS  ttttjfy4   Y dS w )Nr   FT)universal_newlinesrS   )rE   rF   r"   existsrV   check_outputDEVNULLstriplenrK   r   OSError
ValueError	TypeErrorCalledProcessError)clsoutputr0   r0   r1   is_availableG  s$   zTrainium.is_availablec                 C   r   r   )r   ry   rz   r0   r0   r1   ry   ^  r   zTrainium.startc                 C   r   r   )r   finishrz   r0   r0   r1   r   a  r   zTrainium.finishc              
   C   s  zf| j d   i }td| j d jg}tj|tjd d.}	 |jd u r)t	d q|j
 }|rBt|}|di }|dd  nqW d    n1 sMw   Y  z
|  |  W n   Y | j|iW S  ty } ztd| i W  Y d }~S d }~ww )	Nr   rP   rQ   TrT   neuron_hardware_inforb   rU   )r   check_neuron_monitor_configr#   rG   rV   rW   rX   rR   timesleepr\   rK   r   r   popr_   r[   r=   r`   ra   rb   rc   )rM   r   rd   re   rf   parsed_datarg   r0   r0   r1   probed  sN   




zTrainium.prober   )r+   r,   r-   ro   rp   rr   classmethodr   r   ry   r   r   r   r0   r0   r0   r1   r   -  s    


r   )2r   r   rK   loggingr   rE   shutilrV   rm   ro   r   r   typingr   r   r   r   r   r   r	   r
   wandb.sdk.libr   aggregatorsr   r   
interfacesr   r   r   r   "wandb.sdk.internal.settings_staticr   	getLoggerr+   ra   r!   r   r/   whichr"   r   r#   	dataclassr$   r2   r5   r<   registerr   r0   r0   r0   r1   <module>   sb   
 (

 V