I am running the following code:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

print(X.shape)
print(X.values[:, list_of_relevant_features].dtype)
print(y.values.dtype)

kernel = DotProduct() + WhiteKernel()
model_gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
model_gp.fit(X.values[:, list_of_relevant_features], y.values)
print("GP: R2 score: ", model_gp.score(X.values[:, list_of_relevant_features], y.values))
The shape of my input is:
(19142, 21)
The dtypes are each float64.
Added in edit: X and y are pandas DataFrames; after .values they are each NumPy arrays.
And I get the error:
ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.
I can't imagine a dataset of 20000 × 20 actually being too big for Gaussian processes, am I wrong?
The entire error message:
ValueError                                Traceback (most recent call last)
filepath in
    482 kernel = DotProduct() + WhiteKernel()
    483 model_gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
----> 484 model_gp.fit(X.values[:,list_of_relevant_features], y.values)
    485 print("GP: R2 score: ", model_gp.score(X.values[:,list_of_relevant_features], y.values))
    486

d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in fit(self, X, y)
    238             optima = [(self._constrained_optimization(obj_func,
    239                                                       self.kernel_.theta,
--> 240                                                       self.kernel_.bounds))]
    241
    242             # Additional runs are performed from log-uniform chosen initial

d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in _constrained_optimization(self, obj_func, initial_theta, bounds)
    503             opt_res = scipy.optimize.minimize(
    504                 obj_func, initial_theta, method="L-BFGS-B", jac=True,
--> 505                 bounds=bounds)
    506             _check_optimize_result("lbfgs", opt_res)
    507             theta_opt, func_min = opt_res.x, opt_res.fun

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    616     elif meth == 'l-bfgs-b':
    617         return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 618                                 callback=callback, **options)
    619     elif meth == 'tnc':
    620         return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
    306     sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps,
    307                                   bounds=new_bounds,
--> 308                                   finite_diff_rel_step=finite_diff_rel_step)
    309
    310     func_and_grad = sf.fun_and_grad

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\optimize.py in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess)
    260     # calculation reduces overall function evaluations.
    261     sf = ScalarFunction(fun, x0, args, grad, hess,
--> 262                         finite_diff_rel_step, bounds, epsilon=epsilon)
    263
    264     return sf

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in __init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon)
     74
     75         self._update_fun_impl = update_fun
---> 76         self._update_fun()
     77
     78         # Gradient evaluation

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in _update_fun(self)
    164     def _update_fun(self):
    165         if not self.f_updated:
--> 166             self._update_fun_impl()
    167             self.f_updated = True
    168

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in update_fun()
     71
     72         def update_fun():
---> 73             self.f = fun_wrapped(self.x)
     74
     75         self._update_fun_impl = update_fun

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\_differentiable_functions.py in fun_wrapped(x)
     68         def fun_wrapped(x):
     69             self.nfev += 1
---> 70             return fun(x, *args)
     71
     72         def update_fun():

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\optimize.py in __call__(self, x, *args)
     72     def __call__(self, x, *args):
     73         """ returns the the function value """
---> 74         self._compute_if_needed(x, *args)
     75         return self._value
     76

d:\Toms_venv\venv\lib\site-packages\scipy\optimize\optimize.py in _compute_if_needed(self, x, *args)
     66         if not np.all(x == self.x) or self._value is None or self.jac is None:
     67             self.x = np.asarray(x).copy()
---> 68             fg = self.fun(x, *args)
     69             self.jac = fg[1]
     70             self._value = fg[0]

d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in obj_func(theta, eval_gradient)
    229             if eval_gradient:
    230                 lml, grad = self.log_marginal_likelihood(
--> 231                     theta, eval_gradient=True, clone_kernel=False)
    232                 return -lml, -grad
    233             else:

d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\_gpr.py in log_marginal_likelihood(self, theta, eval_gradient, clone_kernel)
    460
    461         if eval_gradient:
--> 462             K, K_gradient = kernel(self.X_train_, eval_gradient=True)
    463         else:
    464             K = kernel(self.X_train_)

d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\kernels.py in __call__(self, X, Y, eval_gradient)
    813         if eval_gradient:
--> 814             K1, K1_gradient = self.k1(X, Y, eval_gradient=True)
    815             K2, K2_gradient = self.k2(X, Y, eval_gradient=True)
    816             return K1 + K2, np.dstack((K1_gradient, K2_gradient))

d:\Toms_venv\venv\lib\site-packages\sklearn\gaussian_process\kernels.py in __call__(self, X, Y, eval_gradient)
   2110         X = np.atleast_2d(X)
   2111         if Y is None:
-> 2112             K = np.inner(X, X) + self.sigma_0 ** 2
   2113         else:
   2114             if eval_gradient:

<__array_function__ internals> in inner(*args, **kwargs)

ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.
Answer
I believe this happens because of the dot product kernel: the traceback ends at line 2112, which calls NumPy's inner product, so the memory error is actually raised in NumPy, not in scikit-learn. See also this SO question and this answer, which suggest the error is raised while NumPy calculates the expected size of the array resulting from the inner product, and that this calculation can overflow the integer range of 32-bit Python. My Python setup is 64-bit, so I can't run a fully consistent test, but the following snippet runs without error:
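To put a number on that, here is a back-of-the-envelope check (my own illustration, not taken from the linked answer) of the Gram matrix that np.inner(X, X) has to hold for the shape in the question; the n = 19142 below is the row count you reported:

import numpy as np

n = 19142                                   # number of samples from the question
itemsize = np.dtype(np.float64).itemsize    # 8 bytes per element
gram_bytes = n * n * itemsize               # DotProduct builds an n x n Gram matrix
print(f"Gram matrix: {n} x {n} float64 ~ {gram_bytes / 1024**3:.2f} GiB")

# A 32-bit process can address roughly 2 GiB, so NumPy's size check
# (arr.size * arr.dtype.itemsize) trips before any allocation happens.
print("Fits under the 32-bit ~2 GiB limit?", gram_bytes < 2**31)

Roughly 2.9 GiB for the Gram matrix is more than a 32-bit process can address, which is consistent with the "array is too big" check firing.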
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.datasets import load_boston

b = load_boston()
X = [pd.DataFrame(b['data'])]
y = b['target']
for i in range(50):
    X.append(pd.DataFrame(b['data']))
    y = np.append(y, b['target'])
X = pd.concat(X)
X = pd.concat([X, X[X.columns[0:8]]], axis=1)
print(X.values.shape, y.shape)

kernel = DotProduct() + WhiteKernel()
model_gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
model_gp.fit(X.values, y)
I would suggest running your model with fewer features to see at which array shape the memory error is raised. Alternatively, you may try a different kernel that does not require the inner product of X.
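As a minimal sketch of that first suggestion (it reuses X, y and list_of_relevant_features from your question, so treat it as illustrative rather than tested code), you could refit with a growing feature subset and stop at the first shape that triggers the error:

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# Refit with an increasing number of features to find the shape at which
# the "array is too big" ValueError is first raised.
for n_features in range(1, len(list_of_relevant_features) + 1):
    subset = list_of_relevant_features[:n_features]
    kernel = DotProduct() + WhiteKernel()
    model_gp = GaussianProcessRegressor(kernel=kernel, random_state=42)
    try:
        model_gp.fit(X.values[:, subset], y.values)
        print(n_features, "features: fit succeeded")
    except ValueError as err:
        print(n_features, "features: failed with:", err)
        break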