o
    	TiG                     @   s   d dl mZmZ d dlZd dlmZmZmZm	Z	m
Z
 d dlmZ e
de
ddgZeG dd dZd	d
 ZedkrPeeZe d  Zeejejej dS dS )    )	dataclassfieldN)DatasetFeaturesImageSequenceValue)HfArgumentParserstring)contentrolec                   @   sZ   e Zd ZU dZedddidZeed< edddidZe	ed	< ed
ddidZ
eed< dS )ScriptArgumentsa  
    Arguments for the script.

    Args:
        test_size (`float`, *optional*, defaults to `0.1`):
            Fraction of the dataset to include in the test split.
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether to push the dataset to the Hugging Face Hub.
        repo_id (`str`, *optional*, defaults to `"trl-internal-testing/zen"`):
            Hugging Face repository ID to push the dataset to.
    g?helpz5Fraction of the dataset to include in the test split.)defaultmetadata	test_sizeFz4Whether to push the dataset to the Hugging Face Hub.push_to_hubztrl-internal-testing/zen-imagez2Hugging Face repository ID to push the dataset to.repo_idN)__name__
__module____qualname____doc__r   r   float__annotations__r   boolr   str r   r   V/home/ubuntu/.local/lib/python3.10/site-packages/scripts/generate_zen_image_dataset.pyr      s   
 r   c                 C   s|  t jjdddd}g ddd |D d}tj|ttd	t dd
}|j| dd}|r3|j	|dd t jjdddd}g ddd |D d}tj|ttd	t dd
}|j| dd}|rf|j	|dd t jjdddd}g dg ddd |D d}tj|ttd	td	t dd
}|j| dd}|r|j	|dd t jjdddd}g dg dg ddd |D d}tj|ttd	td	td	t dd
}|j| dd}|r|j	|dd t jjdddd}g dg ddd |D d}tj|ttd	td	t dd
}	td	dt
td	ddd t
td!ddd d" |	j| dd}	|r/|	j	|d#d t jjdddd}g dg d$g d%d&d |D d'}tj|ttd	td	td!t d'd
}
|
j| dd}
|ro|
j	|d(d t jjdddd}g d)d*d+gg d,d-d.gd/d0gd1d2gd3gd4gd5d6gd7d8gd9d:gd;gg d<d=d>gd?gd@gdAgdBgdCdDgdEggddFgg dGddFgdFdFgdFdgdFgdgdFdgddgddgdFgg dHdFdFgdgdFgdgdgdFdFgdggdId |D dJ}tj|ttd	t
td	t
td!t dJd
}|j| dd}|r|j	|dKd t jjdddd}dLdMdNdOdPdNgdLdQdNdOdRdNgdLdSdNdOdTdNgdLdUdNdOdVdNgdLdWdNdOdXdNgdLdYdNdOdZdNgdLd[dNdOd\dNgdLd]dNdOd^dNgdLd_dNdOd`dNgdLdadNdOdbdNgdLdcdNdOdddNgdLdedNdOdfdNgdLdgdNdOdhdNgdLdidNdOdjdNgdLdkdNdOdldNgdLdmdNdOdndNgdLdodNdOdpdNgdLdqdNdOdrdNgdLdsdNdOdtdNggdud |D dv}tj|ttt dvd
}|j| dd}|r|j	|dwd t jjdddd}dLdMdNgdLdQdNgdLdSdNgdLdUdNgdLdWdNgdLdYdNgdLd[dNgdLd]dNgdLd_dNgdLdadNgdLdcdNgdLdedNgdLdgdNgdLdidNgdLdkdNgdLdmdNgdLdodNgdLdqdNgdLdsdNggdxd |D d}tj|ttt dd
}|j| dd}|r{|j	|dyd t jjdddd}dLdMdNgdLdQdNgdLdSdNgdLdUdNgdLdWdNgdLdYdNgdLd[dNgdLd]dNgdLd_dNgdLdadNgdLdcdNgdLdedNgdLdgdNgdLdidNgdLdkdNgdLdmdNgdLdodNgdLdqdNgdLdsdNggdOdPdNgdOdRdNgdOdTdNgdOdVdNgdOdXdNgdOdZdNgdOd\dNgdOd^dNgdOd`dNgdOdbdNgdOdddNgdOdfdNgdOdhdNgdOdjdNgdOdldNgdOdndNgdOdpdNgdOdrdNgdOdtdNggdzd |D d}tj|tttt dd
}|j| dd}|rk|j	|d{d t jjdddd}dLdMdNgdLdQdNgdLdSdNgdLdUdNgdLdWdNgdLdYdNgdLd[dNgdLd]dNgdLd_dNgdLdadNgdLdcdNgdLdedNgdLdgdNgdLdidNgdLdkdNgdLdmdNgdLdodNgdLdqdNgdLdsdNggdOdPdNgdOdRdNgdOdTdNgdOdVdNgdOdXdNgdOdZdNgdOd\dNgdOd^dNgdOd`dNgdOdbdNgdOdddNgdOdfdNgdOdhdNgdOdjdNgdOdldNgdOdndNgdOdpdNgdOdrdNgdOdtdNggdOd|dNgdOd}dNgdOd~dNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOddNgdOdpdNgdOddNggdd |D d}tj|ttttt dd
}|j| dd}|r|j	|dd t jjdddd}dLdMdNdOdPdNgdLdQdNdOdRdNgdLdSdNdOdTdNgdLdUdNdOdVdNgdLdWdNdOdXdNgdLdYdNdOdZdNgdLd[dNdOd\dNgdLd]dNdOd^dNgdLd_dNdOd`dNgdLdadNdOdbdNgdLdcdNdOdddNgdLdedNdOdfdNgdLdgdNdOdhdNgdLdidNdOdjdNgdLdkdNdOdldNgdLdmdNdOdndNgdLdodNdOdpdNgdLdqdNdOdrdNgdLdsdNdOdtdNggdLdMdNdOd|dNgdLdQdNdOd}dNgdLdSdNdOd~dNgdLdUdNdOddNgdLdWdNdOddNgdLdYdNdOddNgdLd[dNdOddNgdLd]dNdOddNgdLd_dNdOddNgdLdadNdOddNgdLdcdNdOddNgdLdedNdOddNgdLdgdNdOddNgdLdidNdOddNgdLdkdNdOddNgdLdmdNdOddNgdLdodNdOddNgdLdqdNdOdpdNgdLdsdNdOddNggdd |D d}tj|tttt dd
}|j| dd}|rD|j	|dd t jjdddd}dLdMdNgdLdQdNgdLdSdNgdLdUdNgdLdWdNgdLdYdNgdLd[dNgdLd]dNgdLd_dNgdLdadNgdLdcdNgdLdedNgdLdgdNgdLdidNgdLdkdNgdLdmdNgdLdodNgdLdqdNgdLdsdNggdOdPdNgdOdRdNgdOdTdNgdOddNgdOdXdNgdOdZdNgdOd\dNgdOddNgdOd`dNgdOddNgdOdddNgdOddNgdOdhdNgdOddNgdOddNgdOdndNgdOdpdNgdOdrdNgdOdtdNggg ddd |D d'}tj|ttttd!t d'd
}|j| dd}|r<|j	|dd d S d S )N    @   )      )size)zBeautiful is better than ugly.z!Explicit is better than implicit.zSimple is better than complex.z#Complex is better than complicated.zFlat is better than nested.zSparse is better than dense.zReadability counts.z7Special cases aren't special enough to break the rules.z#Although practicality beats purity.z"Errors should never pass silently.zUnless explicitly silenced.z9In the face of ambiguity, refuse the temptation to guess.zEThere should be one-- and preferably only one --obvious way to do it.zBAlthough that way may not be obvious at first unless you're Dutch.Now is better than never.z0Although never is often better than *right* now.z:If the implementation is hard to explain, it's a bad idea.z@If the implementation is easy to explain, it may be a good idea.z@Namespaces are one honking great idea -- let's do more of those!c                 S   0   g | ]\}}t jjd d||dfdt jqS g        g     o@   )lowhighr"   nprandomuniformastypeuint8.0hwr   r   r   
<listcomp>N      0 zmain.<locals>.<listcomp>)textimager
   )featuresF)r   shufflestandard_language_modeling)config_name)Beautiful is better thanzExplicit iszSimple is betterComplexFlat is better thanzSparse is betterReadabilityzSpecial cases aren't specialAlthough practicality beatszErrors should neverzUnless explicitly In the face of ambiguity, refusez$There should be one-- and preferablyz;Although that way may not be obvious at first unless you'rezNow iszAlthough never is oftenz)If the implementation is hard to explain,zIf the implementation is easyz Namespaces are one honking greatc                 S   r$   r%   r)   r/   r   r   r   r3   l   r4   )promptr6   standard_prompt_only) ugly.z better than implicit.z than complex. is better than complicated. nested.z than dense. counts.z enough to break the rules. purity. pass silently.z
 silenced. the temptation to guess.! only one --obvious way to do it.z Dutch. better than never.z better than *right* now. it's a bad idea.z# to explain, it may be a good idea.z  idea -- let's do more of those!c                 S   r$   r%   r)   r/   r   r   r   r3      r4   )rA   
completionr6   standard_prompt_completion)z
 the moon. worse than nothing. than a long vacation.z is always the answer.z chocolate. without any context.z is optional. enough to become unicorns.z	 reality.z pass their driving test. forgotten.z the opportunity to laugh.z two or more confusing methods. a time traveler.z never better. not even a possibility.z it's clearly the best choice. it's probably magic.  watermelon -- let's plant some!c                 S   r$   r%   r)   r/   r   r   r   r3      r4   )rA   chosenrejectedr6   standard_preference)z"Beautiful is better than the moon.zExplicit is worse than nothing.z&Simple is better than a long vacation.zComplex is always the answer.zFlat is better than chocolate.z%Sparse is better without any context.zReadability is optional.z7Special cases aren't special enough to become unicorns.z$Although practicality beats reality.z,Errors should never pass their driving test.zUnless explicitly forgotten.z:In the face of ambiguity, refuse the opportunity to laugh.zCThere should be one-- and preferably two or more confusing methods.zLAlthough that way may not be obvious at first unless you're a time traveler.zNow is never better.z/Although never is often not even a possibility.zGIf the implementation is hard to explain, it's clearly the best choice.z2If the implementation is easy it's probably magic.z@Namespaces are one honking great watermelon -- let's plant some!c                 S   r$   r%   r)   r/   r   r   r   r3     r4   )rX   rY   r6   )dtype)featurelengthr   )rA   completionslabels#standard_implicit_prompt_preference)rC   rO   rP   rD   rE   rQ   rF   rR   rG   rH   rS   rI   rJ   rT   rK   rU   rL   rV   rW   )TFFTTFTFTTFTTFTFTFFc                 S   r$   r%   r)   r/   r   r   r   r3   O  r4   )rA   rM   labelr6   standard_unpaired_preference)r;   zExplicit is better thanzSimple is better thanzComplex is better thanr=   zSparse is better thanzReadability countsz#Special cases aren't special enoughr?   zErrors should never passr@   z0There should be one-- and preferably only one --zAlthough that way may not bezNow is better thanzNever is often better thanz.If the implementation is hard to explain, it'sz,If the implementation is easy to explain, itzNamespaces are onez-Although practicality sometimes beats purity,z, let me think...rC   )z, of course,z
 implicit.z because clarity matters.z... let's keep it basic,z	 complex.z when needed,z complicated.z in terms of structure,rE   z... especially for readability.z  especially when others read it.z, unless...z they follow the rules.z some theoretical elegance,rG   z
 silently,z unless explicitly silenced.rI   )z way to do it,z  but sometimes it's not obvious.z3 especially when there's more than one possibility.z clear at first,z it will eventually emerge.z later.z problematic fixes.z% likely because it's too complicated.z might be a good design.z of those great ideas,z that solve many problems.z' the code should still aim for balance.T)FTF)TTFc                 S   r$   r%   r)   r/   r   r   r   r3     r4   )rA   r_   r`   r6   standard_stepwise_supervisionuserzWhat is better than ugly?)r   r   	assistantz
Beautiful.zWhat is better than implicit?z	Explicit.zWhat is better than complex?zSimple.z What is better than complicated?zComplex.zWhat is better than nested?zFlat.zWhat is better than dense?zSparse.zWhat counts?zReadability.z,Are special cases enough to break the rules?z;No, special cases aren't special enough to break the rules.zWhat beats purity?zPracticality.z What should never pass silently?zErrors.zWhen can errors pass silently?zWhen explicitly silenced.z,What should you do in the face of ambiguity?zRefuse the temptation to guess.z'How many ways should there be to do it?zOne, and preferably only one.z-For whom may the way not be obvious at first?zDutch.zWhat is better than never?r#   z!Is never better than *right* now?zYes, often.z;What does it mean if the implementation is hard to explain?zIt means it's a bad idea.z;What does it mean if the implementation is easy to explain?zIt means it may be a good idea.zAny great ideas?z&Namespaces are one honking great idea.c                 S   r$   r%   r)   r/   r   r   r   r3     r4   )messagesr6    conversational_language_modelingc                 S   r$   r%   r)   r/   r   r   r   r3     r4   conversational_prompt_onlyc                 S   r$   r%   r)   r/   r   r   r   r3     r4    conversational_prompt_completionzAcceptable.z
Explained.zVery complex.zVery complicated.z	Circular.zHeavy.zLooking complicated.z9Yes, special cases are special enough to break the rules.zNothing.z	Warnings.zNever.zGive up.zAs many as possible.zFrench.z	Some day.z
No, never.zIt means it's a good idea.z
Recursion.c                 S   r$   r%   r)   r/   r   r   r   r3   M  r4   conversational_preferencec                 S   r$   r%   r)   r/   r   r   r   r3     r4   )conversational_implicit_prompt_preference)TTTFTTTFTFTFTFFTTTTc                 S   r$   r%   r)   r/   r   r   r   r3     r4   "conversational_unpaired_preference)r*   r+   randintr   	from_dictr   r   r   train_test_splitr   r   Message)r   r   r   sizesdata"standard_language_modeling_datasetstandard_prompt_only_dataset"standard_prompt_completion_datasetstandard_preference_dataset+standard_implicit_prompt_preference_dataset$standard_unpaired_preference_dataset%standard_stepwise_supervision_dataset(conversational_language_modeling_dataset"conversational_prompt_only_dataset(conversational_prompt_completion_dataset!conversational_preference_dataset1conversational_implicit_prompt_preference_dataset*conversational_unpaired_preference_datasetr   r   r   main5   s  "-(B"-.(.0A
























































-
























































B-





































 .r   __main__)dataclassesr   r   numpyr*   datasetsr   r   r   r   r   transformersr	   rq   r   r   r   parserparse_args_into_dataclassesscript_argsr   r   r   r   r   r   r   <module>   s$        