Here is my code.
import pandas as pd import numpy as np import json from xgboost import XGBRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import r2_score, mean_squared_error from sklearn.preprocessing import StandardScaler training_data = pd.read_csv('/Users/aus10/Desktop/MLB_Data/Test_Training_Data/MLB_Training_Data.csv') df_model = training_data.copy() scaler = StandardScaler() features = [['OBS', 'Runs']] for feature in features: df_model[feature] = scaler.fit_transform(df_model[feature]) test_data = pd.read_csv('/Users/aus10/Desktop/MLB_Data/Test_Training_Data/Test_Data.csv') X = training_data.iloc[:,1] #independent columns y = training_data.iloc[:,-1] #target column X = X.values.reshape(-1,1) results = [] # fit final model model = XGBRegressor(objective="reg:squarederror", random_state=42) model.fit(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4) y_train_pred = model.predict(X_train) y_test_pred = model.predict(X_test) model.fit(X_train, y_train) y_pred = model.predict(X_test) print('MSE train: %.3f, test: %.3f' % ( round(mean_squared_error(y_train, y_train_pred),2), round(mean_squared_error(y_test, y_test_pred),2) )) print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred))) # define one new data instance index = 0 count = 0 while count < len(test_data): team = test_data.loc[index].at['Team'] OBS = test_data.loc[index].at['OBS'] Xnew = [[ OBS ]] # make a prediction ynew = model.predict(Xnew) # show the inputs and predicted outputs results.append( { 'Team': team, 'Runs': (round(ynew[0],2)) }) index += 1 count += 1 sorted_results = sorted(results, key=lambda k: k['Runs'], reverse=True) df = pd.DataFrame(sorted_results, columns=[ 'Team', 'Runs']) writer = pd.ExcelWriter('/Users/aus10/Desktop/MLB_Data/ML/Results/Projected_Runs_XGBoost.xlsx', engine='xlsxwriter') # pylint: disable=abstract-class-instantiated df.to_excel(writer, sheet_name='Sheet1', index=False) df.style.set_properties(**{'text-align': 'center'}) pd.set_option('display.max_colwidth', 100) pd.set_option('display.width', 1000) writer.save()
and the error I’m getting is TypeError: Input data can not be a list.
The data coming from test_data
is a csv with a team name and obs which is a float
like this NYY 0.324
Every way to solve it I’ve seen is just to put it in a 2d array like I did – Xnew = [[ OBS ]]
,
but I’m still getting the error.
Is there something else I need to do to the test_data coming in? I tried using values.reshape
, but that didn’t fix it either.
Advertisement
Answer
You need to transform your Xnew
:
Xnew = np.array(Xnew).reshape((1,-1))