o
    	Ti                     @   sx   d dl mZmZ d dlmZ d dlmZ eG dd dZdd Ze	dkr:eeZ
e
 d  Zeejejej d	S d	S )
    )	dataclassfield)Dataset)HfArgumentParserc                   @   sZ   e Zd ZU dZedddidZeed< edddidZe	ed	< ed
ddidZ
eed< dS )ScriptArgumentsa  
    Arguments for the script.

    Args:
        test_size (`float`, *optional*, defaults to `0.1`):
            Fraction of the dataset to include in the test split.
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether to push the dataset to the Hugging Face Hub.
        repo_id (`str`, *optional*, defaults to `"trl-internal-testing/zen"`):
            Hugging Face repository ID to push the dataset to.
    g?helpz5Fraction of the dataset to include in the test split.)defaultmetadata	test_sizeFz4Whether to push the dataset to the Hugging Face Hub.push_to_hubztrl-internal-testing/zenz2Hugging Face repository ID to push the dataset to.repo_idN)__name__
__module____qualname____doc__r   r
   float__annotations__r   boolr   str r   r   P/home/ubuntu/.local/lib/python3.10/site-packages/scripts/generate_zen_dataset.pyr      s   
 r   c                 C   sv  t dg di}|j| dd}|r|j|dd t dg di}|j| dd}|r2|j|d	d t g dg d
d}|j| dd}|rN|j|dd t g dg d
g dd}|j| dd}|rm|j|dd t g dg dd}|j| dd}|r|j|dd t g dg dg dd}|j| dd}|r|j|dd t g dddgg dddgddgdd gd!gd"gd#d$gd%d&gd'd(gd)gg d*d+d,gd-gd.gd/gd0gd1d2gd3ggdd4gg d5dd4gd4d4gd4dgd4gdgd4dgddgddgd4gg d6d4d4gdgd4gdgdgd4d4gdggd7}	|	j| dd}	|r&|	j|d8d t d9d:d;d<d=d>d<gd:d?d<d=d@d<gd:dAd<d=dBd<gd:dCd<d=dDd<gd:dEd<d=dFd<gd:dGd<d=dHd<gd:dId<d=dJd<gd:dKd<d=dLd<gd:dMd<d=dNd<gd:dOd<d=dPd<gd:dQd<d=dRd<gd:dSd<d=dTd<gd:dUd<d=dVd<gd:dWd<d=dXd<gd:dYd<d=dZd<gd:d[d<d=d\d<gd:d]d<d=d^d<gd:d_d<d=d`d<gd:dad<d=dbd<ggi}
|
j| dd}
|r|
j|dcd t dd:d;d<gd:d?d<gd:dAd<gd:dCd<gd:dEd<gd:dGd<gd:dId<gd:dKd<gd:dMd<gd:dOd<gd:dQd<gd:dSd<gd:dUd<gd:dWd<gd:dYd<gd:d[d<gd:d]d<gd:d_d<gd:dad<ggi}|j| dd}|r`|j|ddd t d:d;d<gd:d?d<gd:dAd<gd:dCd<gd:dEd<gd:dGd<gd:dId<gd:dKd<gd:dMd<gd:dOd<gd:dQd<gd:dSd<gd:dUd<gd:dWd<gd:dYd<gd:d[d<gd:d]d<gd:d_d<gd:dad<ggd=d>d<gd=d@d<gd=dBd<gd=dDd<gd=dFd<gd=dHd<gd=dJd<gd=dLd<gd=dNd<gd=dPd<gd=dRd<gd=dTd<gd=dVd<gd=dXd<gd=dZd<gd=d\d<gd=d^d<gd=d`d<gd=dbd<ggd}|j| dd}|r7|j|ded t d:d;d<gd:d?d<gd:dAd<gd:dCd<gd:dEd<gd:dGd<gd:dId<gd:dKd<gd:dMd<gd:dOd<gd:dQd<gd:dSd<gd:dUd<gd:dWd<gd:dYd<gd:d[d<gd:d]d<gd:d_d<gd:dad<ggd=d>d<gd=d@d<gd=dBd<gd=dDd<gd=dFd<gd=dHd<gd=dJd<gd=dLd<gd=dNd<gd=dPd<gd=dRd<gd=dTd<gd=dVd<gd=dXd<gd=dZd<gd=d\d<gd=d^d<gd=d`d<gd=dbd<ggd=dfd<gd=dgd<gd=dhd<gd=did<gd=djd<gd=dkd<gd=dld<gd=dmd<gd=dnd<gd=dod<gd=dpd<gd=dqd<gd=drd<gd=dsd<gd=dtd<gd=dud<gd=dvd<gd=d^d<gd=dwd<ggd}|j| dd}|rn|j|dxd t d:d;d<d=d>d<gd:d?d<d=d@d<gd:dAd<d=dBd<gd:dCd<d=dDd<gd:dEd<d=dFd<gd:dGd<d=dHd<gd:dId<d=dJd<gd:dKd<d=dLd<gd:dMd<d=dNd<gd:dOd<d=dPd<gd:dQd<d=dRd<gd:dSd<d=dTd<gd:dUd<d=dVd<gd:dWd<d=dXd<gd:dYd<d=dZd<gd:d[d<d=d\d<gd:d]d<d=d^d<gd:d_d<d=d`d<gd:dad<d=dbd<ggd:d;d<d=dfd<gd:d?d<d=dgd<gd:dAd<d=dhd<gd:dCd<d=did<gd:dEd<d=djd<gd:dGd<d=dkd<gd:dId<d=dld<gd:dKd<d=dmd<gd:dMd<d=dnd<gd:dOd<d=dod<gd:dQd<d=dpd<gd:dSd<d=dqd<gd:dUd<d=drd<gd:dWd<d=dsd<gd:dYd<d=dtd<gd:d[d<d=dud<gd:d]d<d=dvd<gd:d_d<d=d^d<gd:dad<d=dwd<ggd}|j| dd}|r|j|dyd t d:d;d<gd:d?d<gd:dAd<gd:dCd<gd:dEd<gd:dGd<gd:dId<gd:dKd<gd:dMd<gd:dOd<gd:dQd<gd:dSd<gd:dUd<gd:dWd<gd:dYd<gd:d[d<gd:d]d<gd:d_d<gd:dad<ggd=d>d<gd=d@d<gd=dBd<gd=did<gd=dFd<gd=dHd<gd=dJd<gd=dmd<gd=dNd<gd=dod<gd=dRd<gd=dqd<gd=dVd<gd=dsd<gd=dtd<gd=d\d<gd=d^d<gd=d`d<gd=dbd<ggg dzd}|j| dd}|r|j|d{d d S d S )|Ntext)zBeautiful is better than ugly.z!Explicit is better than implicit.zSimple is better than complex.z#Complex is better than complicated.zFlat is better than nested.zSparse is better than dense.zReadability counts.z7Special cases aren't special enough to break the rules.z#Although practicality beats purity.z"Errors should never pass silently.zUnless explicitly silenced.z9In the face of ambiguity, refuse the temptation to guess.zEThere should be one-- and preferably only one --obvious way to do it.zBAlthough that way may not be obvious at first unless you're Dutch.Now is better than never.z0Although never is often better than *right* now.z:If the implementation is hard to explain, it's a bad idea.z@If the implementation is easy to explain, it may be a good idea.z@Namespaces are one honking great idea -- let's do more of those!F)r
   shufflestandard_language_modeling)config_nameprompt)Beautiful is better thanzExplicit iszSimple is betterComplexFlat is better thanzSparse is betterReadabilityzSpecial cases aren't specialAlthough practicality beatszErrors should neverzUnless explicitly In the face of ambiguity, refusez$There should be one-- and preferablyz;Although that way may not be obvious at first unless you'rezNow iszAlthough never is oftenz)If the implementation is hard to explain,zIf the implementation is easyz Namespaces are one honking greatstandard_prompt_only) ugly.z better than implicit.z than complex. is better than complicated. nested.z than dense. counts.z enough to break the rules. purity. pass silently.z
 silenced. the temptation to guess.! only one --obvious way to do it.z Dutch. better than never.z better than *right* now. it's a bad idea.z# to explain, it may be a good idea.z  idea -- let's do more of those!)r   
completionstandard_prompt_completion)z
 the moon. worse than nothing. than a long vacation.z is always the answer.z chocolate. without any context.z is optional. enough to become unicorns.z	 reality.z pass their driving test. forgotten.z the opportunity to laugh.z two or more confusing methods. a time traveler.z never better. not even a possibility.z it's clearly the best choice. it's probably magic.  watermelon -- let's plant some!)r   chosenrejectedstandard_preference)z"Beautiful is better than the moon.zExplicit is worse than nothing.z&Simple is better than a long vacation.zComplex is always the answer.zFlat is better than chocolate.z%Sparse is better without any context.zReadability is optional.z7Special cases aren't special enough to become unicorns.z$Although practicality beats reality.z,Errors should never pass their driving test.zUnless explicitly forgotten.z:In the face of ambiguity, refuse the opportunity to laugh.zCThere should be one-- and preferably two or more confusing methods.zLAlthough that way may not be obvious at first unless you're a time traveler.zNow is never better.z/Although never is often not even a possibility.zGIf the implementation is hard to explain, it's clearly the best choice.z2If the implementation is easy it's probably magic.z@Namespaces are one honking great watermelon -- let's plant some!)r9   r:   #standard_implicit_prompt_preference)r$   r0   r1   r%   r&   r2   r'   r3   r(   r)   r4   r*   r+   r5   r,   r6   r-   r7   r8   )TFFTTFTFTTFTTFTFTFF)r   r.   labelstandard_unpaired_preference)r   zExplicit is better thanzSimple is better thanzComplex is better thanr   zSparse is better thanzReadability countsz#Special cases aren't special enoughr!   zErrors should never passr"   z0There should be one-- and preferably only one --zAlthough that way may not bezNow is better thanzNever is often better thanz.If the implementation is hard to explain, it'sz,If the implementation is easy to explain, itzNamespaces are onez-Although practicality sometimes beats purity,z, let me think...r$   )z, of course,z
 implicit.z because clarity matters.z... let's keep it basic,z	 complex.z when needed,z complicated.z in terms of structure,r&   z... especially for readability.z  especially when others read it.z, unless...z they follow the rules.z some theoretical elegance,r(   z
 silently,z unless explicitly silenced.r*   )z way to do it,z  but sometimes it's not obvious.z3 especially when there's more than one possibility.z clear at first,z it will eventually emerge.z later.z problematic fixes.z% likely because it's too complicated.z might be a good design.z of those great ideas,z that solve many problems.z' the code should still aim for balance.T)FTF)TTF)r   completionslabelsstandard_stepwise_supervisionmessagesuserzWhat is better than ugly?)rolecontent	assistantz
Beautiful.zWhat is better than implicit?z	Explicit.zWhat is better than complex?zSimple.z What is better than complicated?zComplex.zWhat is better than nested?zFlat.zWhat is better than dense?zSparse.zWhat counts?zReadability.z,Are special cases enough to break the rules?z;No, special cases aren't special enough to break the rules.zWhat beats purity?zPracticality.z What should never pass silently?zErrors.zWhen can errors pass silently?zWhen explicitly silenced.z,What should you do in the face of ambiguity?zRefuse the temptation to guess.z'How many ways should there be to do it?zOne, and preferably only one.z-For whom may the way not be obvious at first?zDutch.zWhat is better than never?r   z!Is never better than *right* now?zYes, often.z;What does it mean if the implementation is hard to explain?zIt means it's a bad idea.z;What does it mean if the implementation is easy to explain?zIt means it may be a good idea.zAny great ideas?z&Namespaces are one honking great idea. conversational_language_modelingconversational_prompt_only conversational_prompt_completionzAcceptable.z
Explained.zVery complex.zVery complicated.z	Circular.zHeavy.zLooking complicated.z9Yes, special cases are special enough to break the rules.zNothing.z	Warnings.zNever.zGive up.zAs many as possible.zFrench.z	Some day.z
No, never.zIt means it's a good idea.z
Recursion.conversational_preference)conversational_implicit_prompt_preference)TTTFTTTFTFTFTFFTTTT"conversational_unpaired_preference)r   	from_dicttrain_test_splitr   )r
   r   r   "standard_language_modeling_datasetstandard_prompt_only_dataset"standard_prompt_completion_datasetstandard_preference_dataset+standard_implicit_prompt_preference_dataset$standard_unpaired_preference_dataset%standard_stepwise_supervision_dataset(conversational_language_modeling_dataset"conversational_prompt_only_dataset(conversational_prompt_completion_dataset!conversational_preference_dataset1conversational_implicit_prompt_preference_dataset*conversational_unpaired_preference_datasetr   r   r   main1   s  ,A,-@
























































,
























































A,





































-r\   __main__N)dataclassesr   r   datasetsr   transformersr   r   r\   r   parserparse_args_into_dataclassesscript_argsr
   r   r   r   r   r   r   <module>   s       d