
How to build a full trainset when loading data from predefined folds in Surprise?

I am using Surprise to evaluate various recommender system algorithms. I would like to calculate predictions and prediction coverage over all possible user-item combinations. My data is loaded from predefined splits.

My strategy to calculate prediction coverage is to

  1. build a full trainset and fit
  2. get lists of all users and items
  3. iterate through the list and make predictions
  4. count predictions flagged as impossible, to compute prediction coverage (sketched below).
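
In code, the plan looks roughly like this (a minimal sketch, assuming the dataset supports build_full_trainset(); NormalPredictor stands in for any algorithm):

from surprise import NormalPredictor

algo = NormalPredictor()
trainset = data.build_full_trainset()   # step 1: this is the call that fails below
algo.fit(trainset)

impossible = 0
for u in trainset.all_users():          # steps 2-3: inner ids, converted to raw ids
    for i in trainset.all_items():
        pred = algo.predict(trainset.to_raw_uid(u), trainset.to_raw_iid(i))
        impossible += pred.details['was_impossible']   # step 4

coverage = 1 - impossible / (trainset.n_users * trainset.n_items)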

Calling data.build_full_trainset() yields the following error:

AttributeError: 'DatasetUserFolds' object has no attribute 'build_full_trainset'

Is there a way to build a full trainset when loading data from predefined folds?

Alternatively, I could combine the data outside of Surprise into a dataframe and rebuild the dataset from that. Or is there a better approach?

Thank you.

# %% #https://surprise.readthedocs.io/en/stable/getting_started.html#basic-usage

import random
import pickle
import numpy as np
import pandas as pd

# from survey.data_cleaning import long_ratings
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
# from surprise.model_selection import LeaveOneOut, KFold
from surprise.model_selection import PredefinedKFold

#set random seed for reproducibility
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

path = 'data/recommenders/'

def load_splits():
    """
    Loads splits from files load data from splits created by colab code and stored to files. used in surprise_recommenders.py

    returns splits as dataset
    """
    # path to dataset folder
    files_dir = 'data/recommenders/splits/'
    # This time, we'll use the built-in reader.
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=0, rating_scale=(1, 5))

    # folds_files is a list of tuples containing file paths:
    # [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
    train_file = files_dir + 'u%d.base'
    test_file = files_dir + 'u%d.test'
    folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    return data

data = load_splits()

pkf = PredefinedKFold()

algos = {
  'NormalPredictor': {'constructor': NormalPredictor,
                      'param_grid': {}
   }}

key = "stratified_5_fold"
cv_results={}
print(f"Performing {key} cross validation.")
for algo_name, v in algos.items():
    print("Working on algorithm: ", algo_name)
    gs = GridSearchCV(v['constructor'], v['param_grid'], measures=['rmse', 'mae'], cv=pkf)

    gs.fit(data)
    # best RMSE score
    print(gs.best_score['rmse'])
    
    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # Predict on the full dataset,
    # using the estimator that yields the best RMSE:
    algo = gs.best_estimator['rmse']
    algo.fit(data.build_full_trainset())     # predefined folds break this call


    cv_results[algo_name] = pd.DataFrame.from_dict(gs.cv_results)


Answer

TL;DR: The Surprise model_selection documentation describes a refit option that fits the best estimator on the full trainset, but it explicitly does not work with predefined folds.
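
For comparison, refit works along these lines when the dataset is not built from predefined folds (a sketch assuming data was loaded with, e.g., Dataset.load_from_df; with DatasetUserFolds it raises an error):

gs = GridSearchCV(NormalPredictor, {}, measures=['rmse', 'mae'], cv=5, refit=True)
gs.fit(data)                              # refits the best estimator on the full trainset
prediction = gs.predict('user_1', 'item_1')  # raw ids (illustrative); delegates to the refit estimator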

Another major issue: oyyablokov’s comment on this issue suggests you cannot fit a model on data that contains NaNs. So even if you had a full trainset, how would you create a full prediction matrix to calculate things like prediction coverage, which requires all user-item combinations, with or without ratings?

My workaround was to create 3 Surprise datasets.

  1. The dataset from predefined folds to compute best_params
  2. The full dataset of ratings (combining all folds outside of Surprise)
  3. The full prediction matrix dataset including all possible combinations of users and items, with or without ratings (datasets 2 and 3 are sketched below).
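
For example, datasets 2 and 3 can be built along these lines (a sketch that assumes the union of the u*.base fold files covers every rating; the paths and space-separated three-column layout follow the loading code in the question):

import itertools
import pandas as pd

files_dir = 'data/recommenders/splits/'
frames = [pd.read_csv(files_dir + 'u%d.base' % i, sep=' ', header=None)
          for i in (1, 2, 3, 4, 5)]

# 2. All ratings combined outside of Surprise (the folds overlap, so deduplicate).
ratings = pd.concat(frames).drop_duplicates(subset=[0, 1])

# 3. Every possible (user, item) pair; pairs without a rating get a NaN rating.
pairs = pd.DataFrame(list(itertools.product(ratings[0].unique(), ratings[1].unique())))
matrix = pairs.merge(ratings, on=[0, 1], how='left')
data_matrix = matrix.to_numpy()   # columns 0, 1, 2 = user, item, rating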

After you find your best parameters with grid search cross-validation, you can compute your predictions and coverage with something like this:

import pandas as pd
from surprise import Dataset, Reader

def get_pred_coverage(data_matrix, algo_constructor, best_params, verbose=False):
    """
    Calculates prediction coverage.
    inputs:
        data_matrix: NumPy matrix whose columns 0, 1, 2 hold user, item, rating
        algo_constructor: the Surprise algorithm constructor to pass the best params into
        best_params: the Surprise gs.best_params entry to pass into the algo
        verbose: if True, print diagnostic output

    returns: prediction coverage & full predictions
    """
    reader = Reader(rating_scale=(1, 5))

    full_predictions = [] #list to store prediction results
    
    df = pd.DataFrame(data_matrix)
    if verbose: print(df.info())
    df_no_nan = df.dropna(subset=[2])
    if verbose: print(df_no_nan.head())
    no_nan_dataset = Dataset.load_from_df(df_no_nan[[0,1,2]], reader)
    full_dataset = Dataset.load_from_df(df[[0, 1, 2]], reader)
    #Predict on full dataset
    # Use the weights that yields the best rmse:
    algo = algo_constructor(**best_params) # Pass the dictionary as double star keyword arguments to the algorithm constructor

    #Create a NaN-free trainset to fit on
    no_nan_trainset = no_nan_dataset.build_full_trainset()
    algo.fit(no_nan_trainset)
    if verbose: print('Number of trainset users: ', no_nan_trainset.n_users, '\n')
    if verbose: print('Number of trainset items: ', no_nan_trainset.n_items, '\n')

    pred_set = full_dataset.build_full_trainset()
    if verbose: print('Number of users: ', pred_set.n_users, '\n')
    if verbose: print('Number of items: ', pred_set.n_items, '\n')
    
    #get all item ids
    pred_set_iids = list(pred_set.all_items())
    # print(f'pred_set iids are {pred_set_iids}')
    iid_converter = lambda x: pred_set.to_raw_iid(x)
    pred_set_raw_iids = list(map(iid_converter, pred_set_iids))
    
    #get all user ids
    pred_set_uids = list(pred_set.all_users())
    uid_converter = lambda x: pred_set.to_raw_uid(x)
    pred_set_raw_uids = list(map(uid_converter, pred_set_uids))
    # print(f'pred_set uids are {pred_set_uids}')

    for user in pred_set_raw_uids:
        for item in pred_set_raw_iids:
            r_ui = df.loc[(df[0] == user) & (df[1] == item), 2].iloc[0]  #find the rating by user and item (may be NaN)
            # print(f"r_ui is type {type(r_ui)} and value {r_ui}")
            
            prediction = algo.predict(uid = user, iid = item, r_ui=r_ui)
            # print(prediction)
            full_predictions.append(prediction)
    #count predictions flagged as impossible: the 5th element of the
    #Prediction namedtuple is the "details" dictionary, keyed by "was_impossible"
    impossible_count = 0
    for prediction in full_predictions:
        impossible_count += prediction.details['was_impossible']

    if verbose: print(f"for algo {algo}, impossible_count is {impossible_count} ")

    prediction_coverage = (pred_set.n_users*pred_set.n_items - impossible_count)/(pred_set.n_users*pred_set.n_items)
    print(f"prediction_coverage is {prediction_coverage}")

    return prediction_coverage, full_predictions
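
A hypothetical call, reusing data_matrix from the earlier sketch and the grid search results from the question:

coverage, full_predictions = get_pred_coverage(
    data_matrix, NormalPredictor, gs.best_params['rmse'], verbose=True)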