I’m trying to use RFE from scikit-learn with a NegativeBinomial GLM from statsmodels as the estimator.
So I created my own class:
# Question snippet, kept exactly as posted (only line breaks and comments added).
# NOTE(review): `np` and `pd` are used below but are not imported in this snippet.
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm


class MyEstimator(BaseEstimator):
    # Wraps a statsmodels formula-API GLM that is built once, in the
    # constructor, from a formula string and a fixed DataFrame.
    def __init__(self, formula_, data_, family_):
        # NOTE(review): this passes `formula`, not the `formula_` parameter;
        # `formula` is not defined anywhere in this snippet.
        self.model = sm.formula.glm(formula, data=data_, family=family_)

    # Takes only keyword arguments, so it cannot receive the positional
    # (X, y) pair that a scikit-learn meta-estimator passes to `fit`.
    def fit(self, **kwargs):
        # The return value of `self.model.fit()` is discarded here.
        self.model.fit()
        self.coef_ = self.model.params.values

    def predict(self, X):
        result = self.model.predict(X)
        return np.array(result)


# Synthetic regression data: 50 samples, 10 features.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
# Only the first three feature columns are placed in the DataFrame.
dataset = pd.DataFrame({'X1':X[:,0], 'X2':X[:,1], 'X3':X[:,2], 'y':y})
estimator = MyEstimator("y ~ X1 + X2 + X3", dataset, sm.families.NegativeBinomial())
selector = RFE(estimator, n_features_to_select=5, step=1)
# `fit` is called with no data; presumably this is the call that raises the
# "missing 2 required positional arguments: 'X' and 'y'" error quoted below.
selector = selector.fit()
But I get this error:
TypeError: fit() missing 2 required positional arguments: 'X' and 'y'
Does anyone have an idea?
Advertisement
Answer
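You can modify your estimator so that fit takes the exog (features) and endog (target) arrays, instead of using the formula API. RFE calls the estimator's fit with arrays for each candidate subset of features, so the wrapper has to accept them as arguments, and selector.fit must likewise be given X and y: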
import numpy as np
import pandas as pd
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm


class MyEstimator(BaseEstimator):
    """Minimal scikit-learn compatible wrapper around a statsmodels GLM.

    The model is built inside ``fit`` from the arrays it receives, which is
    what meta-estimators such as ``RFE`` need: they refit the estimator on
    different column subsets of ``X``.

    Parameters
    ----------
    family_ : statsmodels family instance
        Error distribution passed to ``sm.GLM``, e.g.
        ``sm.families.NegativeBinomial()``.

    Attributes
    ----------
    model : statsmodels GLM
        The unfitted model built by the last call to ``fit``.
    results_ : statsmodels GLM results
        The fitted results returned by ``model.fit()``.
    coef_ : ndarray
        Fitted coefficients, one per column of ``exog``; ``RFE`` ranks
        features by these.

    Notes
    -----
    ``sm.GLM`` does not add an intercept; include a constant column in
    ``exog`` yourself if the model needs one.
    """

    def __init__(self, family_):
        # Stored under the same name as the constructor argument so that
        # BaseEstimator.get_params() / sklearn.base.clone() can rebuild it.
        self.family_ = family_

    def fit(self, exog, endog):
        """Fit the GLM on ``exog`` (features) and ``endog`` (target).

        Returns
        -------
        self
            The fitted estimator, per scikit-learn convention.
        """
        self.model = sm.GLM(endog, exog, family=self.family_)
        # Keep the results object: prediction needs the fitted parameters,
        # which live on the results, not on the model.
        self.results_ = self.model.fit()
        # np.asarray so coef_ is an ndarray even when pandas input makes
        # statsmodels return a Series.
        self.coef_ = np.asarray(self.results_.params)
        return self

    def predict(self, X):
        """Return the predicted mean response for the rows of ``X``."""
        # GLM.predict's first positional argument is `params`, so calling
        # self.model.predict(X) would treat X as the coefficient vector.
        # The results object predicts with the fitted parameters.
        return np.asarray(self.results_.predict(X))


X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = MyEstimator(sm.families.NegativeBinomial())
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y.reshape(-1, 1))
print(selector.ranking_)  # [1 1 3 1 1 5 1 6 4 2]