Skip to content
Advertisement

RFE from scikit-learn feature_selection with NegativeBinomial from statsmodels as estimator

I’m trying to use RFE from scikit-learn with a statsmodels NegativeBinomial GLM as the estimator.

So I created my own class:

from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm

class MyEstimator(BaseEstimator):
    """Wrapper that builds a statsmodels formula-API GLM in its constructor.

    This is the version from the question; it is kept as posted because it
    reproduces the reported error.
    """

    def __init__(self, formula_, data_, family_):
        # NOTE(review): `formula` is not the `formula_` parameter; unless a
        # global named `formula` exists, this line raises NameError.
        self.model = sm.formula.glm(formula, data=data_, family=family_)

    def fit(self, **kwargs):
        """Fit the model created in ``__init__``; ``kwargs`` is not used.

        The signature accepts no positional ``X``/``y`` arguments.
        """
        # NOTE(review): the object returned by fit() is discarded, and
        # `params` is then read from the model itself, not from the fit
        # result - confirm the model object exposes that attribute.
        self.model.fit()
        self.coef_ = self.model.params.values

    def predict(self, X):
        """Return ``self.model.predict(X)`` converted to a NumPy array."""
        result = self.model.predict(X)    
        # NOTE(review): `np` is used here, but no numpy import appears in
        # this snippet.
        return np.array(result)

# Generate a synthetic dataset with 50 samples and 10 features.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)


# Only the first three feature columns are copied into the frame.
# NOTE(review): `pd` is used here, but no pandas import appears in this snippet.
dataset = pd.DataFrame({'X1':X[:,0], 'X2':X[:,1], 'X3':X[:,2], 'y':y})

estimator = MyEstimator("y ~ X1 + X2 + X3", dataset, sm.families.NegativeBinomial())

# Asks for 5 features, although the formula above names only 3 predictors.
selector = RFE(estimator, n_features_to_select=5, step=1)
# fit() is called with no arguments here; this is the call that produces the
# "missing 2 required positional arguments: 'X' and 'y'" error quoted below.
selector = selector.fit()

But I get this error:

TypeError: fit() missing 2 required positional arguments: 'X' and 'y'

Does anyone have an idea?

Advertisement

Answer

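RFE calls the estimator's fit(X, y) itself, so the data has to be passed to fit rather than to the constructor. You can modify your code to take the exog and endog arrays in fit, instead of using the formula API: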

import numpy as np
import pandas as pd
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm

class MyEstimator(BaseEstimator):
    """Scikit-learn style wrapper around a statsmodels GLM.

    The data is supplied to ``fit`` (not to the constructor), so
    meta-estimators such as ``RFE`` can refit the model on feature subsets.

    Parameters
    ----------
    family_ : statsmodels family instance
        Passed as ``family`` to ``sm.GLM``, e.g.
        ``sm.families.NegativeBinomial()``.

    Attributes
    ----------
    model : the ``sm.GLM`` instance built by the last call to ``fit``.
    results_ : the object returned by ``model.fit()``.
    coef_ : array of fitted parameters, one per column of ``exog``.
    """

    def __init__(self, family_):
        self.family_ = family_

    def fit(self, exog, endog):
        """Fit a GLM of ``endog`` on ``exog`` and return ``self``.

        ``exog`` is used as given; no constant column is added here.
        """
        self.model = sm.GLM(endog, exog, family=self.family_)
        # Keep the results object: it owns the fitted parameters and is what
        # predictions for new data must go through.
        self.results_ = self.model.fit()
        self.coef_ = np.asarray(self.results_.params)
        return self

    def predict(self, X):
        """Return predictions for ``X`` from the fitted model as an array.

        Raises ``AttributeError`` if ``fit`` has not been called.
        """
        # The model-level predict() takes the parameter vector as its first
        # argument, so X must be passed to the results object instead.
        return np.asarray(self.results_.predict(X))

# Synthetic data: 50 samples, 10 features.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)

# Wrap a negative binomial GLM so RFE can drive it through fit(X, y).
family = sm.families.NegativeBinomial()
estimator = MyEstimator(family)

# The target is handed over as a single-column 2-D array.
target = y.reshape(-1, 1)
selector = RFE(estimator, n_features_to_select=5, step=1).fit(X, target)

# Rank 1 marks the selected features.
print(selector.ranking_)
# [1 1 3 1 1 5 1 6 4 2]
User contributions licensed under: CC BY-SA
8 people found this helpful
Advertisement