o
    Ni$                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlm  m	Z
 ddlmZ dZdZdZd	Zd
Zg dZG dd dejjZG dd dejjZdS )z=CivilComments from Jigsaw Unintended Bias Kaggle Competition.    )absolute_import)division)print_functionNa  
@article{DBLP:journals/corr/abs-1903-04561,
  author    = {Daniel Borkan and
               Lucas Dixon and
               Jeffrey Sorensen and
               Nithum Thain and
               Lucy Vasserman},
  title     = {Nuanced Metrics for Measuring Unintended Bias with Real Data for Text
               Classification},
  journal   = {CoRR},
  volume    = {abs/1903.04561},
  year      = {2019},
  url       = {http://arxiv.org/abs/1903.04561},
  archivePrefix = {arXiv},
  eprint    = {1903.04561},
  timestamp = {Sun, 31 Mar 2019 19:01:24 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1903-04561},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
a9  
This version of the CivilComments Dataset provides access to the primary
seven labels that were annotated by crowd workers, the toxicity and other
tags are a value between 0 and 1 indicating the fraction of annotators that
assigned these attributes to the comment text.

The other tags are only available for a fraction of the input examples. They
are currently ignored for the main dataset; the CivilCommentsIdentities set
includes those labels, but only consists of the subset of the data with them.
The other attributes that were part of the original CivilComments release are
included only in the raw data. See the Kaggle documentation for more details
about the available features.

The comments in this dataset come from an archive of the Civil Comments
platform, a commenting plugin for independent news sites. These public comments
were created from 2015 - 2017 and appeared on approximately 50 English-language
news sites across the world. When Civil Comments shut down in 2017, they chose
to make the public comments available in a lasting open archive to enable future
research. The original data, published on figshare, includes the public comment
text, some associated metadata such as article IDs, timestamps and
commenter-generated "civility" labels, but does not include user ids. Jigsaw
extended this dataset by adding additional labels for toxicity and identity
mentions. This data set is an exact replica of the data released for the
Jigsaw Unintended Bias in Toxicity Classification Kaggle challenge. This
dataset is released under CC0, as is the underlying comment text.
z

The CivilComments set here includes all the data, but only the basic seven
labels (toxicity, severe_toxicity, obscene, threat, insult, identity_attack, and
sexual_explicit).
z
The CivilCommentsIdentities set here includes an extended set of identity labels
in addition to the basic seven labels. However, it only includes the subset
(roughly a quarter) of the data with all these features.
zchttps://storage.googleapis.com/jigsaw-unintended-bias-in-toxicity-classification/civil_comments.zip)malefemaletransgenderother_genderheterosexualhomosexual_gay_or_lesbianbisexualother_sexual_orientation	christianjewishmuslimhindubuddhistatheistother_religionblackwhiteasianlatinoother_race_or_ethnicityphysical_disability#intellectual_or_learning_disabilitypsychiatric_or_mental_illnessother_disabilityc                       s    e Zd ZdZ fddZ  ZS )CivilCommentsConfigz"Configuration for `CivilComments`.c                    s(   t t| j||tjdd || _d S )Nz1.0.0)namedescriptionversion)superr   __init__tfdscoreVersioninclude_identity_labels)selfr   r   r&   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/civil_comments.pyr"   j   s   

zCivilCommentsConfig.__init__)__name__
__module____qualname____doc__r"   __classcell__r*   r*   r(   r+   r   g   s    r   c                   @   sL   e Zd ZdZed eddededdgZdd Zdd	 Z	d
d Z
dd ZdS )CivilCommentsa  Classification and tagging of 2M comments on news sites.

  This version of the CivilComments Dataset provides access to the primary
  seven labels that were annotated by crowd workers, the toxicity and other
  tags are a value between 0 and 1 indicating the fraction of annotators that
  assigned these attributes to the comment text.

  The other tags are only available for a fraction of the input examples. They
  are currently ignored for the main dataset; the CivilCommentsIdentities set
  includes those labels, but only consists of the subset of the data with them.
  The other attributes that were part of the original CivilComments release are
  included only in the raw data. See the Kaggle documentation for more details
  about the available features.
  F)r   r   r&   CivilCommentsIdentitiesTc                 C   sZ   dt j i}g d}| jjr|t7 }|D ]}tj||< qt jj	| t
t j|ddtdS )Ntext)toxicitysevere_toxicityobscenethreatinsultidentity_attacksexual_explicit)r3   r4   zOhttps://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data)builderr   featuressupervised_keyshomepagecitation)r#   r<   Textbuilder_configr&   IDENTITY_LABELStffloat32r$   DatasetInfo_COMMON_DESCRIPTIONFeaturesDict	_CITATION)r'   r<   labelslabelr*   r*   r+   _info   s   
zCivilComments._infoc                 C   s   | t}tjjtjjtj	|dd| j
jddtjjtjjtj	|dd| j
jddtjjtjjtj	|dd| j
jddgS )zReturns SplitGenerators.z	train.csvtarget)filenametoxicity_labelr&   )r   
gen_kwargsztest_public_expanded.csvr4   ztest_private_expanded.csv)download_and_extract_DOWNLOAD_URLr#   r$   SplitGeneratorSplitTRAINospathjoinrA   r&   
VALIDATIONTEST)r'   
dl_managerdl_pathr*   r*   r+   _split_generators   s.   
zCivilComments._split_generatorsc                 C   sT   i }|d |d< t || |d< |D ]}|| s|tv r d S t || ||< q|S )Ncomment_textr3   r4   )floatrB   )r'   rowrN   other_labelsexamplerJ   r*   r*   r+   _parse_row_as_example   s   z#CivilComments._parse_row_as_examplec           	      c   s~    g d}|r|t 7 }tjj|#}t|}|D ]}| |||}|r,|d |fV  qW d   dS 1 s8w   Y  dS )a  Yields examples.

    Each example contains a text input followed by several toxicity subtype
    scores and identity labels if include_identity_labels is True.

    Args:
      filename: the path of the file to be read for this split.
      toxicity_label: indicates 'target' or 'toxicity' to capture the variation
        in the released labels for this dataset.
      include_identity_labels: Whether to include the additional extended set of
        identity labels. In this case, only those examples with values for all
        labels are yielded.

    Yields:
      A dictionary of features, all floating point except the input text.
    )r5   r6   r7   r8   r9   r:   idN)rB   rC   iogfileGFilecsv
DictReaderrb   )	r'   rM   rN   r&   rI   freaderr_   ra   r*   r*   r+   _generate_examples   s   
"z CivilComments._generate_examplesN)r,   r-   r.   r/   r   _CC_DESCRIPTION_CC_IDENTITIES_DESCRIPTIONBUILDER_CONFIGSrK   r\   rb   rk   r*   r*   r*   r+   r1   p   s"    'r1   )r/   
__future__r   r   r   rg   rU   tensorflow.compat.v2compatv2rC   tensorflow_datasets.public_api
public_apir#   rH   rF   rl   rm   rQ   rB   r$   BuilderConfigr   GeneratorBasedBuilderr1   r*   r*   r*   r+   <module>   s    	