o
    NiB                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZm	Z	m
Z
mZmZmZmZ ddlm  mZ ddlmZ ddlmZ dZd	Zg d
ZddgZG dd dejjZG dd dejjZ dS )zMovieLens dataset.    )absolute_import)division)print_functionN)AnyCallableDictIteratorListOptionalTuple)movielens_parsinga  
@article{10.1145/2827872,
author = {Harper, F. Maxwell and Konstan, Joseph A.},
title = {The MovieLens Datasets: History and Context},
year = {2015},
issue_date = {January 2016},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {5},
number = {4},
issn = {2160-6455},
url = {https://doi.org/10.1145/2827872},
doi = {10.1145/2827872},
journal = {ACM Trans. Interact. Intell. Syst.},
month = dec,
articleno = {19},
numpages = {19},
keywords = {Datasets, recommendations, ratings, MovieLens}
}
a  
This dataset contains a set of movie ratings from the MovieLens website, a movie
recommendation service. This dataset was collected and maintained by [GroupLens]
(https://grouplens.org/), a research group at the University of Minnesota. There
are 5 versions included: "25m", "latest-small", "100k", "1m", "20m". In all
datasets, the movies data and ratings data are joined on "movieId". The 25m
dataset, latest-small dataset, and 20m dataset contain only movie data and
rating data. The 1m dataset and 100k dataset contain demographic data in
addition to movie and rating data.

- "25m": This is the latest stable version of the MovieLens dataset. It is
recommended for research purposes.
- "latest-small": This is a small subset of the latest version of the MovieLens
dataset. It is changed and updated over time by GroupLens.
- "100k": This is the oldest version of the MovieLens datasets. It is a small
dataset with demographic data.
- "1m": This is the largest MovieLens dataset that contains demographic data.
- "20m": This is one of the most used MovieLens datasets in academic papers
along with the 1m dataset.

For each version, users can view either only the movies data by adding the
"-movies" suffix (e.g. "25m-movies") or the ratings data joined with the movies
data (and users data in the 1m and 100k datasets) by adding the "-ratings"
suffix (e.g. "25m-ratings").

The features below are included in all versions with the "-ratings" suffix.

- "movie_id": a unique identifier of the rated movie
- "movie_title": the title of the rated movie with the release year in
parentheses
- "movie_genres": a sequence of genres to which the rated movie belongs
- "user_id": a unique identifier of the user who made the rating
- "user_rating": the score of the rating on a five-star scale
- "timestamp": the timestamp of the ratings, represented in seconds since
midnight Coordinated Universal Time (UTC) of January 1, 1970

The "100k-ratings" and "1m-ratings" versions in addition include the following
demographic features.

- "user_gender": gender of the user who made the rating; a true value
corresponds to male
- "bucketized_user_age": bucketized age values of the user who made the rating,
the values and the corresponding ranges are:
  - 1: "Under 18"
  - 18: "18-24"
  - 25: "25-34"
  - 35: "35-44"
  - 45: "45-49"
  - 50: "50-55"
  - 56: "56+"
- "user_occupation_label": the occupation of the user who made the rating
represented by an integer-encoded label; labels are preprocessed to be
consistent across different versions
- "user_occupation_text": the occupation of the user who made the rating in
the original string; different versions can have different set of raw text
labels
- "user_zip_code": the zip code of the user who made the rating

In addition, the "100k-ratings" dataset would also have a feature "raw_user_age"
which is the exact ages of the users who made the rating

Datasets with the "-movies" suffix contain only "movie_id", "movie_title", and
"movie_genres" features.
)25mlatest-small20m100k1mmoviesratingsc                       s   e Zd ZdZ				ddee dee dee deeegeee	e
eef f  f  ddf
 fdd	Zedefd
dZedefddZedefddZedeeegeee	e
eef f  f  fddZ  ZS )MovieLensConfigz$BuilderConfig for MovieLens dataset.Nformat_versiontable_optiondownload_url
parsing_fnreturnc                    sZ   |t vr
tdt  |tvrtdt tt| jdi | || _|| _|| _|| _	dS )a/  Constructs a MovieLensConfig.

    Args:
      format_version: a string to identify the format of the dataset, one of
          '_FORMAT_VERSIONS'.
      table_option: a string to identify the table to expose, one of
          '_TABLE_OPTIONS'.
      download_url: a string url for downloading the dataset.
      parsing_fn: a callable for parsing the data.
      **kwargs: keyword arguments forwarded to super.

    Raises:
      ValueError: if format_version is not one of '_FORMAT_VERSIONS' or if
          table_option is not one of '_TABLE_OPTIONS'.
    z!format_version must be one of %s.ztable_option must be one of %s.N )
_FORMAT_VERSIONS
ValueError_TABLE_OPTIONSsuperr   __init___format_version_table_option_download_url_parsing_fn)selfr   r   r   r   kwargs	__class__r   \/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/structured/movielens.pyr   {   s   
zMovieLensConfig.__init__c                 C      | j S N)r    r$   r   r   r(   r         zMovieLensConfig.format_versionc                 C   r)   r*   )r!   r+   r   r   r(   r      r,   zMovieLensConfig.table_optionc                 C   r)   r*   )r"   r+   r   r   r(   r      r,   zMovieLensConfig.download_urlc                 C   r)   r*   )r#   r+   r   r   r(   r      s   zMovieLensConfig.parsing_fn)NNNN)__name__
__module____qualname____doc__r
   strr   r   r   intr   r   r   propertyr   r   r   r   __classcell__r   r   r&   r(   r   x   sB    
$r   c                   @   s  e Zd ZdZededddddejded	ed
ddddej	dededddddejdededddddej	dededddddej
dededddddejdededddddejdededddddejdededdd dd!ejded"ed#dd dd!ej	dg
ZejdZd$ejjfd%d&Zd'ejjd$eejj fd(d)Z	*d.d+ee d$eeeeee f f  fd,d-Z!d*S )/	MovieLenszMovieLens rating dataset.z25m-ratingsa                This dataset contains 25,000,095 ratings across 62,423 movies,
              created by 162,541 users between January 09, 1995 and November 21,
              2019. This dataset is the latest stable version of the MovieLens
              dataset, generated on November 21, 2019.

              Each user has rated at least 20 movies. The ratings are in
              half-star increments. This dataset does not include demographic
              data.z0.1.0r   r   z8http://files.grouplens.org/datasets/movielens/ml-25m.zip)namedescriptionversionr   r   r   r   z
25m-moviesza              This dataset contains data of 62,423 movies rated in the 25m
              dataset.r   zlatest-small-ratingsa                This dataset contains 100,836 ratings across 9,742 movies, created
              by 610 users between March 29, 1996 and September 24, 2018. This
              dataset is generated on September 26, 2018 and is the a subset of
              the full latest version of the MovieLens dataset. This dataset
              is changed and updated over time.

              Each user has rated at least 20 movies. The ratings are in
              half-star increments. This dataset does not include demographic
              data.r   zAhttp://files.grouplens.org/datasets/movielens/ml-latest-small.zipzlatest-small-movieszi              This dataset contains data of 9,742 movies rated in the
              latest-small dataset.z100k-ratingsa                This dataset contains 100,000 ratings from 943 users on 1,682
              movies. This dataset is the oldest version of the MovieLens
              dataset.

              Each user has rated at least 20 movies. Ratings are in whole-star
              increments. This dataset contains demographic data of users in
              addition to data on movies and ratings.r   z9http://files.grouplens.org/datasets/movielens/ml-100k.zipz100k-moviesza              This dataset contains data of 1,682 movies rated in the 100k
              dataset.z
1m-ratingsa                 This dataset contains 1,000,209 anonymous ratings of approximately
              3,900 movies made by 6,040 MovieLens users who joined MovieLens in
              2000. This dataset is the largest dataset that includes
              demographic data.

              Each user has rated at least 20 movies. Ratings are in whole-star
              increments. In demographic data, age values are divided into
              ranges and the lowest age value for each range is used in the data
              instead of the actual values.r   z7http://files.grouplens.org/datasets/movielens/ml-1m.zipz	1m-movieszm              This dataset contains data of approximately 3,900 movies rated in
              the 1m dataset.z20m-ratingsax                This dataset contains 20,000,263 ratings across 27,278
              movies, created by 138,493 users between January 09, 1995 and
              March 31, 2015. This dataset was generated on October 17, 2016.

              Each user has rated at least 20 movies. Ratings are in half-star
              increments. This dataset does not contain demographic data.r   z8http://files.grouplens.org/datasets/movielens/ml-20m.zipz
20m-moviesz`              This dataset contains data of 27,278 movies rated in the 20m
              datasetr   c                 C   s  t jt jtjtjjg ddd}t jt jt jd}t jt jtjjg ddt jt jd}i }| j	j
dkr=|| n=| j	jdkrS|| || || n'| j	jd	krp|| || || |jt jd
 n
|| || tjj| ttj|ddtdS )z5Returns DatasetInfo according to self.builder_config.)Action	Adventure	AnimationChildrenComedyCrimeDocumentaryDramaFantasyz	Film-NoirHorrorIMAXMusicalMysteryRomancezSci-FiThrillerUnknownWarWesternz(no genres listed))names)movie_idmovie_titlemovie_genres)user_iduser_rating	timestamp)zacademic/educatorartistzclerical/adminzcustomer servicezdoctor/health careentertainmentzexecutive/managerialfarmer	homemakerlawyer	librarianzother/not specified
programmerretiredzsales/marketing	scientistzself-employedstudentztechnician/engineerztradesman/craftsman
unemployedwriter)user_genderbucketized_user_ageuser_occupation_labeluser_occupation_textuser_zip_coder   r   r   )raw_user_ageNz)https://grouplens.org/datasets/movielens/)builderr7   featuressupervised_keyshomepagecitation)tfstringtfdsre   Sequence
ClassLabelfloat32int64boolbuilder_configr   updater   coreDatasetInfo_DESCRIPTIONFeaturesDict	_CITATION)r$   movie_features_dictrating_features_dictdemographic_features_dictfeatures_dictr   r   r(   _infof  sL   







zMovieLens._info
dl_managerc                 C   s>   | | jj}tj|d| jj }tjj	tj
jd|idgS )zReturns SplitGenerators.zml-%sdir_path)r6   
gen_kwargs)download_and_extractrq   r   ospathjoinr   rk   rs   SplitGeneratorSplitTRAIN)r$   r}   extracted_pathr~   r   r   r(   _split_generators  s   
zMovieLens._split_generatorsNr~   c                 c   s    | j |D ]}|V  qdS )z>Yields examples by calling the corresponding parsing function.N)rq   r   )r$   r~   exr   r   r(   _generate_examples  s   zMovieLens._generate_examplesr*   )"r-   r.   r/   r0   r   textwrapdedentr   parse_current_ratings_dataparse_current_movies_dataparse_100k_ratings_dataparse_100k_movies_dataparse_1m_ratings_dataparse_1m_movies_dataBUILDER_CONFIGSrk   rs   VersionVERSIONrt   r|   downloadDownloadManagerr	   r   r   r
   r1   r   r   r2   r   r   r   r   r   r   r(   r5      s    	

  /?

r5   )!r0   
__future__r   r   r   r   r   typingr   r   r   r   r	   r
   r   tensorflow.compat.v2compatv2ri   tensorflow_datasets.public_api
public_apirk   tensorflow_datasets.structuredr   rw   ru   r   r   rs   BuilderConfigr   GeneratorBasedBuilderr5   r   r   r   r(   <module>   s    $A;