I am currently working on the “French Motor Claims Datasets freMTPL2freq” Kaggle competition (https://www.kaggle.com/floser/french-motor-claims-datasets-fremtpl2freq). Unfortunately I get a “NotFittedError: All estimators failed to fit” error whenever I use `RandomizedSearchCV`, and I cannot figure out why. Any help is much appreciated.
```python
import numpy as np
import pandas as pd
import statsmodels.api as sm
import scipy.stats as stats
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.model_selection import (train_test_split, KFold, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV, StratifiedKFold)
from sklearn.metrics import (mean_poisson_deviance, mean_gamma_deviance,
                             mean_squared_error)
from sklearn.ensemble import VotingRegressor, StackingRegressor
from xgboost import XGBRegressor

data_freq = pd.read_csv('freMTPL2freq.csv')

# Strip the quote characters from the categorical columns.
data_freq['Area'] = data_freq['Area'].str.replace("'", "")
data_freq['VehBrand'] = data_freq['VehBrand'].str.replace("'", "")
data_freq['VehGas'] = data_freq['VehGas'].str.replace("'", "")
data_freq['Region'] = data_freq['Region'].str.replace("'", "")

data_freq['frequency'] = data_freq['ClaimNb'] / data_freq['Exposure']

y = data_freq['frequency']
X = data_freq.drop(['frequency', 'ClaimNb', 'IDpol'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                  shuffle=True, random_state=42)

pt_columns = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
cat_columns = ['Area', 'Region', 'VehBrand', 'VehGas']

ct = ColumnTransformer([('pt', 'passthrough', pt_columns),
                        ('ohe', OneHotEncoder(), cat_columns)])

pipe_xgbr = Pipeline([('cf_trans', ct),
                      ('ssc', StandardScaler(with_mean=False)),
                      ('xgb_regressor', XGBRegressor())])

param = {'xgb_regressor__n_estimators': [3, 5],
         'xgb_regressor__max_depth': [3, 5, 7],
         'xgb_regressor__learning_rate': [0.1, 0.5],
         'xgb_regressor__colsample_bytree': [0.5, 0.8],
         'xgb_regressor__subsample': [0.5, 0.8]}

rscv = RandomizedSearchCV(pipe_xgbr, param_distributions=param, n_iter=2,
                          scoring=mean_squared_error, n_jobs=-1, cv=5,
                          error_score='raise')

rscv.fit(X_train, y_train, xgbr_regressor__sample_weight=X_train['Exposure'])
```
The first five rows of the original dataframe data_freq look like this:
```
   IDpol  ClaimNb  Exposure Area  VehPower  VehAge  DrivAge  BonusMalus VehBrand   VehGas  Density Region
0    1.0        1      0.10    D         5       0       55          50      B12  Regular     1217    R82
1    3.0        1      0.77    D         5       0       55          50      B12  Regular     1217    R82
2    5.0        1      0.75    B         6       2       52          50      B12   Diesel       54    R22
3   10.0        1      0.09    B         7       0       46          50      B12   Diesel       76    R72
4   11.0        1      0.84    B         7       0       46          50      B12   Diesel       76    R72
```
The error I get is as follows:
```
---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
    r = call_item()
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
    for func, args, kwargs in self.items]
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
    for func, args, kwargs in self.items]
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
    return self.function(*args, **kwargs)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 340, in fit
    fit_params_steps = self._check_fit_params(**fit_params)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 261, in _check_fit_params
    fit_params_steps[step][param] = pval
KeyError: 'xgbr_regressor'
"""

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-68-0c1886d1e985> in <module>
----> 1 rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
      2 #pipe_xgbr.fit(X_train, y_train)
      3 #X_train.describe(include = 'all')

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64
     65             # extra_args > 0

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    839                 return results
    840
--> 841             self._run_search(evaluate_candidates)
    842
    843         # multimetric is determined here because in the case of a callable

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1633         evaluate_candidates(ParameterSampler(
   1634             self.param_distributions, self.n_iter,
-> 1635             random_state=self.random_state))

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
    807                     (split_idx, (train, test)) in product(
    808                         enumerate(candidate_params),
--> 809                         enumerate(cv.split(X, y, groups))))
    810
    811                 if len(out) < 1:

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1015
   1016             with self._backend.retrieval_context():
-> 1017                 self.retrieve()
   1018             # Make sure that we get a last message telling us we are done
   1019             elapsed_time = time.time() - self._start_time

~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
    907             try:
    908                 if getattr(self._backend, 'supports_timeout', False):
--> 909                     self._output.extend(job.get(timeout=self.timeout))
    910                 else:
    911                     self._output.extend(job.get())

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
    560         AsyncResults.get from multiprocessing."""
    561         try:
--> 562             return future.result(timeout=timeout)
    563         except LokyTimeoutError:
    564             raise TimeoutError()

~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

KeyError: 'xgbr_regressor'
```
I also tried running `fit` without the `sample_weight` parameter. In this case the error changes to:
```
---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
    r = call_item()
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
    for func, args, kwargs in self.items]
  File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
    for func, args, kwargs in self.items]
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
    return self.function(*args, **kwargs)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
    test_scores = _score(estimator, X_test, y_test, scorer, error_score)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
    return f(**kwargs)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 336, in mean_squared_error
    y_true, y_pred, multioutput)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 88, in _check_reg_targets
    check_consistent_length(y_true, y_pred)
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in check_consistent_length
    lengths = [_num_samples(X) for X in arrays if X is not None]
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in <listcomp>
    lengths = [_num_samples(X) for X in arrays if X is not None]
  File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 249, in _num_samples
    raise TypeError(message)
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
"""

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
<ipython-input-69-a9be9cc5df4a> in <module>
----> 1 rscv.fit(X_train, y_train)#, xgbr_regressor__sample_weight = X_train['Exposure'])
      2 #pipe_xgbr.fit(X_train, y_train)
      3 #X_train.describe(include = 'all')

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64
     65             # extra_args > 0

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    839                 return results
    840
--> 841             self._run_search(evaluate_candidates)
    842
    843         # multimetric is determined here because in the case of a callable

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1633         evaluate_candidates(ParameterSampler(
   1634             self.param_distributions, self.n_iter,
-> 1635             random_state=self.random_state))

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
    807                     (split_idx, (train, test)) in product(
    808                         enumerate(candidate_params),
--> 809                         enumerate(cv.split(X, y, groups))))
    810
    811                 if len(out) < 1:

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1015
   1016             with self._backend.retrieval_context():
-> 1017                 self.retrieve()
   1018             # Make sure that we get a last message telling us we are done
   1019             elapsed_time = time.time() - self._start_time

~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
    907             try:
    908                 if getattr(self._backend, 'supports_timeout', False):
--> 909                     self._output.extend(job.get(timeout=self.timeout))
    910                 else:
    911                     self._output.extend(job.get())

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
    560         AsyncResults.get from multiprocessing."""
    561         try:
--> 562             return future.result(timeout=timeout)
    563         except LokyTimeoutError:
    564             raise TimeoutError()

~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
```
When setting `verbose=10` and `n_jobs=1`, the following error message shows up:
```
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START xgb_regressor__colsample_bytree=0.5, xgb_regressor__learning_rate=0.5, xgb_regressor__max_depth=5, xgb_regressor__n_estimators=5, xgb_regressor__subsample=0.5
C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: FutureWarning: Pass sample_weight=406477    1.0
393150    0.0
252885    0.0
260652    0.0
661256    0.0
         ...
154663    0.0
398414    0.0
42890     0.0
640774    0.0
114446    0.0
Name: frequency, Length: 108482, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
  "will result in an error", FutureWarning)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-84-74435f74c470> in <module>
----> 1 rscv.fit(X_train, y_train, xgb_regressor__sample_weight = X_train['Exposure'])
      2 #pipe_xgbr.fit(X_train, y_train)
      3 #X_train.describe(include = 'all')

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64
     65             # extra_args > 0

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    839                 return results
    840
--> 841             self._run_search(evaluate_candidates)
    842
    843         # multimetric is determined here because in the case of a callable

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1633         evaluate_candidates(ParameterSampler(
   1634             self.param_distributions, self.n_iter,
-> 1635             random_state=self.random_state))

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
    807                     (split_idx, (train, test)) in product(
    808                         enumerate(candidate_params),
--> 809                         enumerate(cv.split(X, y, groups))))
    810
    811                 if len(out) < 1:

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1002             # remaining jobs.
   1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
   1005                 self._iterating = self._original_iterator is not None
   1006

~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    833                 return False
    834             else:
--> 835                 self._dispatch(tasks)
    836                 return True
    837

~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    752         with self._lock:
    753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
    755             # A job can complete so quickly than its callback is
    756             # called before we get here, causing self._jobs to

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    207     def apply_async(self, func, callback=None):
    208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
    210         if callback:
    211             callback(result)

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    588         # Don't delay the application, to avoid keeping the input
    589         # arguments in memory
--> 590         self.results = batch()
    591
    592     def get(self):

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257
    258     def __len__(self):

~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257
    258     def __len__(self):

~\anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    623
    624     fit_time = time.time() - start_time
--> 625     test_scores = _score(estimator, X_test, y_test, scorer, error_score)
    626     score_time = time.time() - start_time - fit_time
    627     if return_train_score:

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
    685             scores = scorer(estimator, X_test)
    686         else:
--> 687             scores = scorer(estimator, X_test, y_test)
    688     except Exception:
    689         if error_score == 'raise':

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     72                           "will result in an error", FutureWarning)
     73             kwargs.update(zip(sig.parameters, args))
---> 74             return f(**kwargs)
     75         return inner_f
     76

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
    334     """
    335     y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 336         y_true, y_pred, multioutput)
    337     check_consistent_length(y_true, y_pred, sample_weight)
    338     output_errors = np.average((y_true - y_pred) ** 2, axis=0,

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     86         the dtype argument passed to check_array.
     87     """
---> 88     check_consistent_length(y_true, y_pred)
     89     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     90     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    314     """
    315
--> 316     lengths = [_num_samples(X) for X in arrays if X is not None]
    317     uniques = np.unique(lengths)
    318     if len(uniques) > 1:

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in <listcomp>(.0)
    314     """
    315
--> 316     lengths = [_num_samples(X) for X in arrays if X is not None]
    317     uniques = np.unique(lengths)
    318     if len(uniques) > 1:

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _num_samples(x)
    247     if hasattr(x, 'fit') and callable(x.fit):
    248         # Don't get num_samples from an ensembles length!
--> 249         raise TypeError(message)
    250
    251     if not hasattr(x, '__len__') and not hasattr(x, 'shape'):

TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
```
Answer
Wow, that was a mess of a traceback, but I think I’ve finally found it. You set `scoring=mean_squared_error`, when you should instead use `scoring="neg_mean_squared_error"`.
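With that change (and with the fit-param prefix corrected from `xgbr_regressor` to `xgb_regressor`, which is what the `KeyError: 'xgbr_regressor'` in your first traceback is complaining about, since the pipeline step is named `xgb_regressor`), the search should run. A minimal sketch, assuming `pipe_xgbr`, `param`, `X_train`, and `y_train` are defined as in your snippet:

```python
from sklearn.model_selection import RandomizedSearchCV

# Sketch of the corrected search. Two changes from the question:
#  1. scoring is a scorer *string*, not the raw metric function;
#  2. the fit-param prefix matches the pipeline step name 'xgb_regressor'
#     (the first KeyError came from the stray 'r' in 'xgbr_regressor').
rscv = RandomizedSearchCV(
    pipe_xgbr,
    param_distributions=param,
    n_iter=2,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=5,
    error_score="raise",
)

rscv.fit(X_train, y_train,
         xgb_regressor__sample_weight=X_train['Exposure'])
```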
The metric function `mean_squared_error` has signature `(y_true, y_pred, *, <kwargs>)`, whereas the scorer obtained by using the string `"neg_mean_squared_error"` has signature `(estimator, X_test, y_test)`. So in the traceback, where you see

```
--> 687             scores = scorer(estimator, X_test, y_test)
```

it is calling `mean_squared_error` with `y_true=estimator`, `y_pred=X_test`, and `sample_weight=y_test` (the first keyword argument, and hence the `FutureWarning` about specifying keyword arguments as positional). Going deeper into the traceback, we see a check that the shapes of `y_true` and `y_pred` are compatible, but it thinks the former is your pipeline object (and hence the final error message)!
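To make the two call conventions concrete, here is a small self-contained illustration (not from the original post; it uses a toy `LinearRegression` purely to have a fitted estimator, and `sklearn.metrics.get_scorer` to resolve the scoring string the same way the search does):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import get_scorer, mean_squared_error

X = np.arange(10, dtype=float).reshape(-1, 1)
y = 2.0 * X.ravel()
est = LinearRegression().fit(X, y)

# Metric function: compares two arrays of targets.
print(mean_squared_error(y, est.predict(X)))   # ~0.0

# Scorer: takes (estimator, X, y), calls est.predict(X) internally,
# and negates the error so that greater is better.
scorer = get_scorer("neg_mean_squared_error")
print(scorer(est, X, y))                       # ~-0.0
```

The negation is also why the string has the `neg_` prefix: `RandomizedSearchCV` always maximizes its score, so error metrics are exposed as negated scorers.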