Source code for alpenglow.utils.DataShuffler

from alpenglow.Getter import Getter as rs
from .DataframeData import DataframeData
from alpenglow.ParameterDefaults import ParameterDefaults
import pandas as pd
import alpenglow.sip as sip


[docs]class DataShuffler(ParameterDefaults): """DataShuffler(seed=254938879,shuffle_mode=complete,input_file,output_file) This class is for shuffling datasets. Parameters ---------- seed : int The seed to initialize RNG-s. Should not be 0. shuffle_mode : string Possible values: complete, same_timestamp. input_file : string Input file name. output_file : string Output file name. data_format : string Input file format. Available values: online, online_id, online_id_noeval, online_attribute, offline, offlineTimestamp, category. See RecommenderData.cpp for details. Default: online_id. """ def __init__(self, **parameters): super().__init__(**parameters) self.used_parameters = set(['seed', 'shuffle_mode', 'data_format']) if("seed" not in self.parameters): self.parameters["seed"] = 254938879 if("shuffle_mode" not in self.parameters): self.parameters["shuffle_mode"] = "complete" if("data_format" not in self.parameters): self.parameters["data_format"] = "online_id"
[docs] def run(self): rs.collect() self.verbose = True # reading data recommender_data = rs.LegacyRecommenderData( file_name=self.parameters['input_file'], type=self.parameters["data_format"], experiment_termination_time=0 ) self.used_parameters.add("input_file") recommender_data_iterator = None if self.parameters['shuffle_mode'] == "same_timestamp" : recommender_data_iterator = rs.ShuffleIterator(seed=self.parameters["seed"]) elif self.parameters['shuffle_mode'] == "complete_online" : recommender_data_iterator = rs.RandomOnlineIterator(seed=self.parameters["seed"]) else: recommender_data_iterator = rs.RandomIterator(seed=self.parameters["seed"]) recommender_data_iterator.set_recommender_data(recommender_data) # data reading finished data_shuffle_experiment = rs.OnlineExperiment( random_seed=self.parameters["seed"], evaluation_start_time=0, experiment_termination_time=0, top_k=0, exclude_known=False, initialize_all=False, max_item=0, max_user=0 ) data_shuffle_experiment.set_recommender_data_iterator(recommender_data_iterator) data_logger = rs.InputLogger(**self.parameter_defaults( output_file="", )) data_shuffle_experiment.add_logger(data_logger) interrupt_logger = rs.InterruptLogger() data_shuffle_experiment.add_logger(interrupt_logger) if(self.verbose): proceeding_logger = rs.ProceedingLogger() proceeding_logger.set_data_iterator(recommender_data_iterator) data_shuffle_experiment.add_logger(proceeding_logger) created_objects = rs.get_and_clean() rs.set_experiment_environment(data_shuffle_experiment, created_objects) rs.initialize_all(created_objects) for i in created_objects: rs.run_self_test(i) self.check_unused_parameters() print("shuffling dataset...") if self.verbose else None data_shuffle_experiment.run()