I’m trying to use XGBoost to make predictions on a DataFrame that has a single target (one attribute). My code is below; I run it on Colab.
# Question's original script (Colab notebook cell). It is reproduced unchanged
# apart from restored line breaks; the comments point out where it goes wrong.
!sudo pip install xgboost
!sudo pip install --upgrade xgboost

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# One string identifier per row.
data = [
    ['sp37n1sy1bmjc6yp3m7wqefpz' ],
    ['sp36vfqtjv87pvw68zdmhnvxb'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36vues2ed9r6s196dmv4p00'],
    ['sp36vvgq6rq9sq1gv0nt19h20'],
    ['sp36ypgx7jmmsuujz2ww81n20'],
    ['sp37n1w451m6wtp6h4eq0wjb0'],
    ['sp36y99s6w9jm3614ugt52bpz'],
    ['sp37n1mywgv57qsg5r7hp7bpz'],
    ['sp36y9fbfz4t9c5znp27z3pbp']]

# NOTE(review): `pd` is used here but pandas is not imported in this snippet.
df = pd.DataFrame(data)

# These slices are taken from the raw Python list `data`, not from `df`:
# X is every row except the last, y is every row except the first.
X = data[:-1]
y = data[1:]

X_train, X_test, y_train, y_test = train_test_split(X, y)

regressor = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3
)

# str(...) turns each list into one long string. Judging by the traceback
# quoted after this snippet (XGDMatrixCreateFromFile / "File name too long"),
# XGBoost then treats that string as a file path.
regressor.fit(str(X_train), str(y_train))
However, the following error is returned:
XGBoostError: [17:00:27] /workspace/dmlc-core/src/io/local_filesys.cc:86: LocalFileSystem.GetPathInfo: [['sp36ypgx7jmmsuujz2ww81n20'], ['sp36y965ksqnmq0b0b58y1p00'], ['sp37n1w451m6wtp6h4eq0wjb0'], ['sp36vvgq6rq9sq1gv0nt19h20'], ['sp36vfqtjv87pvw68zdmhnvxb'], ['sp37n1sy1bmjc6yp3m7wqefpz'], ['sp37n1mywgv57qsg5r7hp7bpz'], ['sp36vues2ed9r6s196dmv4p00'], ['sp36y99s6w9jm3614ugt52bpz']] error: File name too long Stack trace: [bt] (0) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::io::LocalFileSystem::GetPathInfo(dmlc::io::URI const&)+0x567) [0x7f6f13f157c7] [bt] (1) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::io::InputSplitBase::InitInputFileInfo(std::string const&, bool)+0x14e) [0x7f6f13f044de] [bt] (2) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::io::InputSplitBase::Init(dmlc::io::FileSystem*, char const*, unsigned long, bool)+0x43) [0x7f6f13f04be3] [bt] (3) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::InputSplit::Create(char const*, char const*, unsigned int, unsigned int, char const*, bool, int, unsigned long, bool)+0xb7a) [0x7f6f13eed18a] [bt] (4) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::InputSplit::Create(char const*, unsigned int, unsigned int, char const*)+0x1e) [0x7f6f13eed81e] [bt] (5) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::Parser<unsigned int, float>* dmlc::data::CreateLibSVMParser<unsigned int, float>(std::string const&, std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&, unsigned int, unsigned int)+0x1a) [0x7f6f13ecb09a] [bt] (6) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(dmlc::Parser<unsigned int, float>* dmlc::data::CreateParser_<unsigned int, float>(char const*, unsigned int, unsigned int, char const*)+0x15b) [0x7f6f13ebc23b] [bt] (7) 
/usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(xgboost::DMatrix::Load(std::string const&, bool, bool, std::string const&, unsigned long)+0x2df) [0x7f6f13c91a0f] [bt] (8) /usr/local/lib/python3.7/dist-packages/xgboost/./lib/libxgboost.so(XGDMatrixCreateFromFile+0xc2) [0x7f6f13c5f5b2]
if I change the last line to
# Second attempt: pass the train/test split of the raw Python lists directly.
regressor.fit(X_train, y_train)
I get this error:
TypeError: can not initialize DMatrix from list
What am I doing wrong? Any clue?
Advertisement
Answer
XGBoost cannot handle string-valued categorical variables directly, so they need to be encoded as numbers before being passed to the XGBoost model. There are many ways to encode your variables, depending on the nature of the categorical variable. Since I believe that your strings have some order, label encoding is suited to your categorical variable:
Full code:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# One string identifier per row; the task is to predict the next identifier
# in the sequence from the current one.
data = [
    ['sp37n1sy1bmjc6yp3m7wqefpz'],
    ['sp36vfqtjv87pvw68zdmhnvxb'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36y965ksqnmq0b0b58y1p00'],
    ['sp36vues2ed9r6s196dmv4p00'],
    ['sp36vvgq6rq9sq1gv0nt19h20'],
    ['sp36ypgx7jmmsuujz2ww81n20'],
    ['sp37n1w451m6wtp6h4eq0wjb0'],
    ['sp36y99s6w9jm3614ugt52bpz'],
    ['sp37n1mywgv57qsg5r7hp7bpz'],
    ['sp36y9fbfz4t9c5znp27z3pbp'],
]
df = pd.DataFrame(data)

# Fit ONE encoder on every value so that a given string maps to the same
# integer whether it appears as a feature or as a target. Fitting separately
# on X and y would produce two different, incompatible mappings.
le = LabelEncoder()
codes = le.fit_transform(df[0])

# Feature = current code, target = the code that follows it.
X = codes[:-1].reshape(-1, 1)  # XGBoost expects a 2D feature matrix
y = codes[1:]

X_train, X_test, y_train, y_test = train_test_split(X, y)

regressor = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3,
)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

# The regressor returns floats. Round to the nearest integer code and clip to
# the encoder's valid range, otherwise inverse_transform raises ValueError for
# a code it has never seen.
y_predictions = np.clip(
    np.rint(y_pred), 0, len(le.classes_) - 1
).astype(int)

print("Encoded Predictions", y_predictions.tolist())  # encoded predictions
print("String predictions", le.inverse_transform(y_predictions))  # original string predictions
print()
print("Encoded Actual value", y_test)  # encoded
print("String Actual value", le.inverse_transform(y_test))  # original test values