I am using Surprise to evaluate various recommender system algorithms. I would like to calculate predictions and prediction coverage over all possible user–item pairs. My data is loaded from predefined splits.
My strategy to calculate prediction coverage is to
- build a full trainset and fit
- get lists of all users and items
- iterate through the list and make predictions
- count predictions flagged as impossible, and derive prediction coverage from that count (a sketch of this step follows the list).
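A minimal sketch of that counting step, assuming `algo` is an already-fitted Surprise algorithm and `all_raw_uids` / `all_raw_iids` are hypothetical lists of raw user and item ids:

```python
# Sketch only: `algo`, `all_raw_uids`, and `all_raw_iids` are assumed to exist.
impossible = 0
for uid in all_raw_uids:
    for iid in all_raw_iids:
        pred = algo.predict(uid=uid, iid=iid)
        if pred.details['was_impossible']:  # details dict on the Prediction namedtuple
            impossible += 1

total = len(all_raw_uids) * len(all_raw_iids)
prediction_coverage = (total - impossible) / total
```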
Trying to call data.build_full_trainset() yields the following error:
AttributeError: 'DatasetUserFolds' object has no attribute 'build_full_trainset'
Is there a way to build a full trainset when loading data from predefined folds?
Alternatively, I could combine the data outside of Surprise into a dataframe and redo the process (a sketch of that alternative follows my code below). Or are there better approaches?
Thank you.
```python
# %%
# https://surprise.readthedocs.io/en/stable/getting_started.html#basic-usage
import random
import pickle

import numpy as np
import pandas as pd

# from survey.data_cleaning import long_ratings
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
# from surprise.model_selection import LeaveOneOut, KFold
from surprise.model_selection import PredefinedKFold

# set random seed for reproducibility
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

path = 'data/recommenders/'


def load_splits():
    """
    Loads splits from files

    load data from splits created by colab code and stored to files.
    used in surprise_recommenders.py

    returns splits as dataset
    """
    # path to dataset folder
    files_dir = 'data/recommenders/splits/'

    # This time, we'll use the built-in reader.
    reader = Reader(line_format='user item rating', sep=' ',
                    skip_lines=0, rating_scale=(1, 5))

    # folds_files is a list of tuples containing file paths:
    # [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
    train_file = files_dir + 'u%d.base'
    test_file = files_dir + 'u%d.test'
    folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]
    data = Dataset.load_from_folds(folds_files, reader=reader)
    return data


data = load_splits()
pkf = PredefinedKFold()

algos = {
    'NormalPredictor': {'constructor': NormalPredictor, 'param_grid': {}},
}

key = "stratified_5_fold"
cv_results = {}
print(f"Performing {key} cross validation.")
for algo_name, v in algos.items():
    print("Working on algorithm: ", algo_name)
    gs = GridSearchCV(v['constructor'], v['param_grid'],
                      measures=['rmse', 'mae'], cv=pkf)
    gs.fit(data)
    # best RMSE score
    print(gs.best_score['rmse'])
    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # Predict on full dataset
    # Use the weights that yield the best rmse:
    algo = gs.best_estimator['rmse']
    algo.fit(data.build_full_trainset())  # predefined folds breaks it.
    cv_results[algo_name] = pd.DataFrame.from_dict(gs.cv_results)
```
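For reference, a minimal sketch of the "combine externally" alternative mentioned above, assuming MovieLens-style space-separated `u*.base`/`u*.test` files as in `load_splits()` (the column names are placeholders):

```python
import pandas as pd
from surprise import Dataset, Reader

files_dir = 'data/recommenders/splits/'
cols = ['user', 'item', 'rating']  # placeholder column names

# Read every fold file and de-duplicate, so overlapping folds collapse
# back into the single full ratings table.
frames = [pd.read_csv(files_dir + 'u%d.%s' % (i, part), sep=' ', names=cols)
          for i in (1, 2, 3, 4, 5) for part in ('base', 'test')]
ratings = pd.concat(frames, ignore_index=True).drop_duplicates()

reader = Reader(rating_scale=(1, 5))
full_data = Dataset.load_from_df(ratings[cols], reader)
trainset = full_data.build_full_trainset()  # works: not a DatasetUserFolds
```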
Answer
TL;DR: The model_selection documentation in Surprise describes a refit option on GridSearchCV that refits the best estimator on the full trainset; however, it explicitly does not work with data loaded from predefined folds.
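For data that is not loaded from predefined folds, refit works roughly like this (an illustration using the built-in ml-100k dataset and SVD, neither of which is part of the original setup):

```python
from surprise import SVD, Dataset
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin('ml-100k')  # not loaded from predefined folds
param_grid = {'n_epochs': [5, 10]}

# refit=True refits the best estimator on the whole dataset after the search,
# so predict() can then be called on the GridSearchCV object directly.
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, refit=True)
gs.fit(data)
print(gs.predict(uid='196', iid='302'))
```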
Another major issue: oyyablokov’s comment on this issue suggests you cannot fit a model on data that contains NaNs. So even with a full trainset, how does one create a full prediction matrix to calculate things like prediction coverage, which requires all user–item combinations, with or without ratings?
My workaround was to create three Surprise datasets (sketched after this list):
- The dataset from predefined folds to compute best_params
- The full dataset of ratings (combining all folds outside of Surprise)
- The full prediction matrix dataset including all possible combinations of users and items (with or without ratings).
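A rough sketch of how the last two datasets might be built, assuming `ratings` is a DataFrame of all observed ratings with integer column labels 0, 1, 2 for user, item, rating (matching the column layout that `get_pred_coverage` below expects):

```python
import itertools
import pandas as pd
from surprise import Dataset, Reader

reader = Reader(rating_scale=(1, 5))

# Dataset 1 (predefined folds) is built with Dataset.load_from_folds,
# as in load_splits() from the question.

# Dataset 2: every observed (user, item, rating) triple.
full_ratings = Dataset.load_from_df(ratings[[0, 1, 2]], reader)

# Dataset 3: the cartesian product of all users and items, left-joined
# with the observed ratings so unrated pairs carry NaN in column 2.
pairs = pd.DataFrame(list(itertools.product(ratings[0].unique(),
                                            ratings[1].unique())),
                     columns=[0, 1])
matrix_df = pairs.merge(ratings, on=[0, 1], how='left')
```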
After you find your best parameters with grid search cross-validation, you can compute your predictions and coverage with something like this:
```python
import pandas as pd
from surprise import Dataset, Reader


def get_pred_coverage(data_matrix, algo_constructor, best_params, verbose=False):
    """
    Calculates coverage

    inputs:
        data_matrix: numpy matrix with columns 0, 1, 2 as user, service, rating
        algo_constructor: the Surprise algorithm constructor to pass the best params into
        best_params: Surprise gs.best_params to pass into the algo

    returns: coverage & full predictions
    """
    reader = Reader(rating_scale=(1, 5))
    full_predictions = []  # list to store prediction results

    df = pd.DataFrame(data_matrix)
    if verbose: print(df.info())
    df_no_nan = df.dropna(subset=[2])
    if verbose: print(df_no_nan.head())

    no_nan_dataset = Dataset.load_from_df(df_no_nan[[0, 1, 2]], reader)
    full_dataset = Dataset.load_from_df(df[[0, 1, 2]], reader)

    # Predict on full dataset
    # Use the weights that yield the best rmse:
    algo = algo_constructor(**best_params)  # pass the dict as double-star keyword arguments to the constructor

    # Create a no-NaN trainset to fit on
    no_nan_trainset = no_nan_dataset.build_full_trainset()
    algo.fit(no_nan_trainset)
    if verbose: print('Number of trainset users: ', no_nan_trainset.n_users, '\n')
    if verbose: print('Number of trainset items: ', no_nan_trainset.n_items, '\n')

    pred_set = full_dataset.build_full_trainset()
    if verbose: print('Number of users: ', pred_set.n_users, '\n')
    if verbose: print('Number of items: ', pred_set.n_items, '\n')

    # get all item ids
    pred_set_iids = list(pred_set.all_items())
    # print(f'pred_set iids are {pred_set_iids}')
    iid_converter = lambda x: pred_set.to_raw_iid(x)
    pred_set_raw_iids = list(map(iid_converter, pred_set_iids))

    # get all user ids
    pred_set_uids = list(pred_set.all_users())
    uid_converter = lambda x: pred_set.to_raw_uid(x)
    pred_set_raw_uids = list(map(uid_converter, pred_set_uids))
    # print(f'pred_set uids are {pred_set_uids}')

    for user in pred_set_raw_uids:
        for item in pred_set_raw_iids:
            r_ui = float(df[2].loc[(df[0] == user) & (df[1] == item)])  # find the rating, by user and item
            # print(f"r_ui is type {type(r_ui)} and value {r_ui}")
            prediction = algo.predict(uid=user, iid=item, r_ui=r_ui)
            # print(prediction)
            full_predictions.append(prediction)

    # access the prediction tuple's 5th element, the dictionary key "was_impossible"
    impossible_count = 0
    for prediction in full_predictions:
        impossible_count += prediction[4]['was_impossible']

    if verbose: print(f"for algo {algo}, impossible_count is {impossible_count}")

    prediction_coverage = (pred_set.n_users * pred_set.n_items - impossible_count) / (pred_set.n_users * pred_set.n_items)
    print(f"prediction_coverage is {prediction_coverage}")
    return prediction_coverage, full_predictions
```
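A hypothetical call, reusing `gs` and `NormalPredictor` from the question's code and `matrix_df` from the three-datasets sketch above:

```python
# `gs` and NormalPredictor come from the question's grid search;
# `matrix_df` is the full prediction-matrix frame (an assumed name).
coverage, full_preds = get_pred_coverage(matrix_df.to_numpy(),
                                         NormalPredictor,
                                         gs.best_params['rmse'],
                                         verbose=True)
```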