Source code for alpenglow.OnlineExperiment

from .Getter import Getter as rs
from .utils.DataframeData import DataframeData
from .ParameterDefaults import ParameterDefaults
import pandas as pd
import alpenglow.sip as sip


class OnlineExperiment(ParameterDefaults):
    """OnlineExperiment(seed=254938879,top_k=100)

    This is the base class of every online experiment in Alpenglow. It builds the
    general experimental setup needed to run the online training and evaluation of
    a model. It also handles default parameters and the ability to override them
    when instantiating an experiment. Subclasses should implement the
    :code:`_config()` method; for more information, check the documentation of
    this method as well.

    Online evaluation in Alpenglow is done by processing the data row-by-row and
    evaluating the model on each new record before providing the model with the
    new information.

    .. image:: /resources/online.png

    Evaluation is done by ranking the next item on the user's toplist and saving
    the rank. If the item is not found in the top :code:`top_k` items, the
    evaluation step returns :code:`NaN`.

    For a brief tutorial on using this class, see
    :doc:`/getting_started/3-five_minute_tutorial`.

    Parameters
    ----------
    seed : int
        The seed to initialize RNG-s. Should not be 0.
    top_k : int
        The length of the toplists.
    network_mode : bool
        Instructs the experiment to treat :code:`data` as a directed graph, with
        :code:`source` and :code:`target` columns instead of :code:`user` and
        :code:`item`.
    """
    def __init__(self, **parameters):
        super().__init__(**parameters)
        self.used_parameters = set(['seed', 'top_k', 'network_mode'])
        if("seed" not in self.parameters):
            self.parameters["seed"] = 254938879
        if("top_k" not in self.parameters):
            self.parameters["top_k"] = 100
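    # Illustrative usage sketch (not part of the class): running an online
    # experiment through one of the bundled subclasses. The subclass name and
    # the column layout of the DataFrame are assumptions for the example.
    #
    #     import pandas as pd
    #     from alpenglow.experiments import PopularityExperiment
    #
    #     data = pd.read_csv("dataset.csv")  # expected columns: time, user, item
    #     experiment = PopularityExperiment(top_k=100, seed=254938879)
    #     ranks = experiment.run(data, verbose=True)
    #     print(ranks['rank'].describe())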
    def run(self, data, experimentType=None, columns={}, verbose=True,
            out_file=None, exclude_known=False, initialize_all=False,
            calculate_toplists=False, experiment_termination_time=0,
            memory_log=True, shuffle_same_time=True, recode=True):
        """
        Parameters
        ----------
        data : pandas.DataFrame or str
            The input data, see :doc:`/getting_started/3-five_minute_tutorial`.
            If this parameter is a string, it has to be in the format specified
            by :code:`experimentType`.
        experimentType : str
            The format of the input file if :code:`data` is a string.
        columns : dict
            Optionally, the mapping of the expected column names (such as
            :code:`user` and :code:`item`) to the input DataFrame's actual
            column names.
        verbose : bool
            Whether to write information about the experiment while running.
        out_file : str
            If set, the results of the experiment are also written to the file
            located at :code:`out_file`.
        exclude_known : bool
            If set to True, a user's previously seen items are excluded from the
            toplist evaluation. The :code:`eval` columns of the input data
            should be set accordingly.
        calculate_toplists : bool or list
            Whether to actually compute the toplists or just the ranks (the
            latter is faster). It can be specified on a record-by-record basis,
            by giving a list of booleans as parameter. The calculated toplists
            can be acquired after the experiment's end by using
            :code:`get_predictions`. Setting this to non-False implies
            shuffle_same_time=False.
        experiment_termination_time : int
            Stop the experiment at this timestamp.
        memory_log : bool
            Whether to log the results to memory (to be used optionally with
            out_file).
        shuffle_same_time : bool
            Whether to shuffle records with the same timestamp randomly.
        recode : bool
            Whether to automatically recode the entity columns so that they are
            indexed from 1 to n. If :code:`False`, the recoding needs to be
            handled before passing the DataFrame to the :code:`run` method.

        Returns
        -------
        DataFrame
            Results DataFrame if memory_log=True, empty DataFrame otherwise.
        """
        rs.collect()
        self.verbose = verbose
        evaluation_start_time = 0  # TODO: start eval at this time

        colmap = {'user': 'source', 'item': 'target'}
        if self.parameter_default('network_mode', False):
            r_rename_dict = {k: columns[v] if v in columns else v for k, v in colmap.items()}
        else:
            r_rename_dict = columns
        rename_dict = {v: k for k, v in r_rename_dict.items()}

        self.user_codes = None
        self.item_codes = None

        # reading data
        if not isinstance(data, str):
            data = data.rename(columns=rename_dict)
            if recode:
                self.user_codes = dict(zip(data['user'].unique(), range(len(data))))
                self.item_codes = dict(zip(data['item'].unique(), range(len(data))))
                data['user'] = data['user'].map(self.user_codes)
                data['item'] = data['item'].map(self.item_codes)
            recommender_data = DataframeData(data)
        else:
            recommender_data = rs.LegacyRecommenderData(
                file_name=data,
                type=experimentType,
                experiment_termination_time=experiment_termination_time
            )
        recommender_data.initialize()  # read in data -> can find max user, max item
        max_user = recommender_data.get_max_user_id()
        max_item = recommender_data.get_max_item_id()
        recommender_data_iterator = None
        if not shuffle_same_time or calculate_toplists is not False:
            recommender_data_iterator = rs.SimpleIterator()
        else:
            recommender_data_iterator = rs.ShuffleIterator(seed=self.parameters["seed"])
        recommender_data_iterator.set_recommender_data(recommender_data)
        # string attribute_container_name = getPot("set_attribute_container", "");
        # if(attribute_container_name.length()==0) cerr << "WARNING: no attribute container was set into RecommenderData." << endl;
        # else {
        #   InlineAttributeReader* attribute_container = jinja.get<InlineAttributeReader>(attribute_container_name);
        #   recommender_data->set_attribute_container(attribute_container);
        # }
        # data reading finished

        # create experiment
        top_k = self.parameters['top_k']
        seed = self.parameters['seed']
        online_experiment = rs.OnlineExperiment(
            random_seed=seed,
            evaluation_start_time=evaluation_start_time,
            experiment_termination_time=experiment_termination_time,
            top_k=top_k,
            exclude_known=exclude_known,
            initialize_all=initialize_all,
            max_item=max_item,
            max_user=max_user
        )

        # set data
        online_experiment.set_recommender_data_iterator(recommender_data_iterator)

        # get components
        (model, learner, loggers) = self._config(top_k, seed)

        # set loggers
        for l in loggers:
            online_experiment.add_logger(l)
        interrupt_logger = rs.InterruptLogger()
        online_experiment.add_logger(interrupt_logger)
        if(verbose):
            proceeding_logger = rs.ProceedingLogger()
            proceeding_logger.set_data_iterator(recommender_data_iterator)
            online_experiment.add_logger(proceeding_logger)
        ranking_logger = self._get_ranking_logger(
            top_k,
            evaluation_start_time,
            self.parameter_default('out_file', out_file),
            memory_log
        )
        ranking_logger.set_model(model)
        online_experiment.add_logger(ranking_logger)
        if type(calculate_toplists) is not bool or calculate_toplists:
            print('logging predictions') if self.verbose else None
            pred_creator = rs.ToplistCreatorPersonalized(
                top_k=top_k,
                exclude_known=exclude_known
            )
            pred_creator.set_model(model)
            pred_logger = rs.PredictionLogger()
            pred_logger.set_prediction_creator(pred_creator)
            if type(calculate_toplists) is bool:
                online_experiment.add_logger(pred_logger)
            else:
                conditional_meta_logger = rs.ListConditionalMetaLogger(
                    should_run_vector=[int(i) for i in calculate_toplists]
                )
                conditional_meta_logger.set_logger(pred_logger)
                online_experiment.add_logger(conditional_meta_logger)
            self.predictions = pred_logger
        else:
            self.predictions = None

        if type(learner) == list:
            for obj in learner:
                online_experiment.add_updater(obj)
        else:
            online_experiment.add_updater(learner)

        # clean, initialize, test
        created_objects = rs.get_and_clean()
        rs.set_experiment_environment(online_experiment, created_objects)
        rs.initialize_all(created_objects)
        for i in created_objects:
            rs.run_self_test(i)

        self.check_unused_parameters()

        print("running experiment...") if self.verbose else None
        online_experiment.run()
        results = self._finished()
        if self.user_codes is not None and self.item_codes is not None:
            results['user'] = results['user'].map({v: k for k, v in self.user_codes.items()})
            results['item'] = results['item'].map({v: k for k, v in self.item_codes.items()})
        results = results.rename(columns=r_rename_dict)
        results.top_k = top_k
        return results
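    # Illustrative sketch of column remapping, matching the code above: the
    # ``columns`` dict maps the expected names ('user' and 'item', or 'source'
    # and 'target' in network_mode) to the caller's actual column names. The
    # DataFrame contents and the experiment class are assumptions for the example.
    #
    #     df = pd.DataFrame({
    #         'time': [10, 20, 30],
    #         'userId': [1, 2, 1],
    #         'movieId': [5, 5, 7],
    #     })
    #     experiment = PopularityExperiment(top_k=10)
    #     results = experiment.run(df, columns={'user': 'userId', 'item': 'movieId'})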
    def get_predictions(self):
        """If the :code:`calculate_toplists` parameter is set when calling
        :code:`run`, this method can be used to acquire the generated toplists.

        Returns
        -------
        pandas.DataFrame
            DataFrame containing the columns record_id, time, user, item, rank
            and prediction.

            - **record_id** is the index of the record being evaluated in the
              input DataFrame. Generally, there are :code:`top_k` rows with the
              same record_id.
            - **time** is the time of the evaluation
            - **user** is the user the toplist is generated for
            - **item** is the item of the toplist at the **rank** place
            - **prediction** is the prediction given by the model for the
              (user, item) pair at the time of evaluation.
        """
        if self.predictions is not None:
            preds = self.predictions.get_predictions()
            preds_df = pd.DataFrame({
                'record_id': preds.ids,
                'time': preds.times,
                'user': preds.users,
                'item': preds.items,
                'rank': preds.ranks,
                'prediction': preds.scores,
            }).sort_values(['record_id'])[['record_id', 'time', 'user', 'item', 'rank', 'prediction']]
            if self.user_codes is not None and self.item_codes is not None:
                preds_df['user'] = preds_df['user'].map({v: k for k, v in self.user_codes.items()})
                preds_df['item'] = preds_df['item'].map({v: k for k, v in self.item_codes.items()})
            return preds_df
        else:
            return None
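    # Illustrative sketch of toplist extraction (the experiment class and the
    # ``data`` DataFrame are assumptions for the example): pass
    # calculate_toplists to run(), then read the generated toplists back with
    # get_predictions().
    #
    #     experiment = PopularityExperiment(top_k=10)
    #     experiment.run(data, calculate_toplists=True, verbose=False)
    #     toplists = experiment.get_predictions()
    #     # columns: record_id, time, user, item, rank, prediction
    #     print(toplists.head())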
    def _get_ranking_logger(self, top_k, evaluation_start_time, out_file, memory_log):
        if out_file is None:
            out_file = ""
        else:
            print("logging to file " + out_file) if self.verbose else None
        self.ranking_logs = rs.RankingLogs()
        self.ranking_logs.top_k = top_k
        self.ranking_logger = rs.MemoryRankingLogger(
            evaluation_start_time=evaluation_start_time,
            out_file=out_file,
            memory_log=memory_log,
            top_k=top_k,
            random_seed=43211234
        )
        self.ranking_logger.set_ranking_logs(self.ranking_logs)
        return self.ranking_logger

    def _finished(self):
        logs = self.ranking_logs.logs
        top_k = self.ranking_logs.top_k
        df = pd.DataFrame.from_records(
            [(
                l.id,
                l.time,
                l.score,
                l.user,
                l.item,
                l.prediction,
                l.rank + 1 if l.rank < top_k else None
            ) for l in logs],
            columns=["id", "time", "score", "user", "item", "prediction", "rank"]
        ).set_index("id")
        df['rank'] = df['rank'].astype(float)
        return df

    def _config(self, top_k, seed):
        """
        This method needs to be implemented in every subclass of this class. It
        is called during the :code:`run()` method, and is required to build the
        model from the available C++ components. The expected return value is a
        tuple of :code:`(model, learner, loggers)`, as unpacked by
        :code:`run()`; :code:`learner` may also be a list of updaters, and
        :code:`loggers` may be an empty list.
        """
        pass
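# Illustrative subclass sketch: a minimal _config() implementation built from
# C++ components exposed through the Getter. The specific component names,
# PopularityModel and PopularityModelUpdater, are assumptions for the example.
#
#     class MyPopularityExperiment(OnlineExperiment):
#         def _config(self, top_k, seed):
#             model = rs.PopularityModel()
#             updater = rs.PopularityModelUpdater()
#             updater.set_model(model)
#             return (model, updater, [])
#
# run() unpacks the returned tuple as (model, learner, loggers); the learner
# may also be a list of updaters, and the loggers list may be empty.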