注意
前往末尾下载完整的示例代码。
分类数据入门
对分类数据的实验性支持。
此前，用户需要先自行运行编码器，然后才能将数据传递给 XGBoost，这会创建稀疏矩阵并可能增加内存使用量。本演示展示了对分类数据的实验性支持，未来计划提供更高级的功能。
在版本 1.5.0 中添加。
另请参见
from typing import Tuple
import numpy as np
import pandas as pd
import xgboost as xgb
def make_categorical(
    n_samples: int, n_features: int, n_categories: int, onehot: bool
) -> Tuple[pd.DataFrame, pd.Series]:
    """Generate a deterministic random categorical dataset for the demo.

    Parameters
    ----------
    n_samples :
        Number of rows to generate.
    n_features :
        Number of categorical feature columns in the returned frame.
    n_categories :
        Every generated value is an integer drawn from ``[0, n_categories)``.
    onehot :
        When true, the features are returned one-hot encoded through
        ``pd.get_dummies``; otherwise they are returned as ``category`` columns.

    Returns
    -------
    A ``(features, label)`` pair.  The label is the sum of one extra hidden
    column and all feature columns, plus one.
    """
    # Fixed seed so that repeated calls produce exactly the same data.
    rng = np.random.RandomState(1994)

    # One extra column (column "0") is drawn; it seeds the label and is then
    # dropped from the features.
    raw_columns = {
        str(idx): pd.Series(
            rng.randint(low=0, high=n_categories, size=n_samples), dtype=np.int64
        )
        for idx in range(n_features + 1)
    }
    table = pd.DataFrame(raw_columns)
    label, table = table.iloc[:, 0], table.iloc[:, 1:]

    # Accumulate every feature column into the label in place, then shift by 1.
    for name in table.columns:
        label += table[name]
    label += 1

    # Give every column the same full category set, including categories
    # that happen not to appear in the sampled data.
    levels = np.arange(0, n_categories)
    table = table.astype("category")
    for name in table.columns:
        table[name] = table[name].cat.set_categories(levels)

    return (pd.get_dummies(table) if onehot else table), label
def main() -> None:
    """Compare native categorical training against pre-encoded one-hot data.

    Two regressors are fitted on the same generated dataset, one on
    ``category`` columns and one on the one-hot encoded frame.  Their
    per-iteration RMSE histories are asserted to match, and the summed SHAP
    contributions of the categorical model are asserted to match its margin.
    Both regressors are configured with ``device="cuda"``.
    """
    # Built-in categorical support.  With the scikit-learn interface the input
    # should be a pandas or cudf DataFrame holding categorical features; when a
    # numpy/cupy array is used instead, `feature_types` of `XGBRegressor` must
    # be set accordingly.
    cat_features, cat_label = make_categorical(100, 10, 4, False)

    # `enable_categorical=True` turns the support on.  One-hot-encoding-based
    # splits are used here for demonstration; see the documentation of
    # `max_cat_to_onehot` for details.
    cat_model = xgb.XGBRegressor(
        tree_method="hist", enable_categorical=True, max_cat_to_onehot=5, device="cuda"
    )
    cat_model.fit(cat_features, cat_label, eval_set=[(cat_features, cat_label)])

    # Same data, but already one-hot encoded before it reaches XGBoost.
    onehot_features, onehot_label = make_categorical(100, 10, 4, True)
    onehot_model = xgb.XGBRegressor(tree_method="hist", device="cuda")
    onehot_model.fit(
        onehot_features, onehot_label, eval_set=[(onehot_features, onehot_label)]
    )

    # Both training runs are expected to report the same RMSE history.
    cat_rmse = np.array(cat_model.evals_result()["validation_0"]["rmse"])
    onehot_rmse = np.array(onehot_model.evals_result()["validation_0"]["rmse"])
    np.testing.assert_allclose(cat_rmse, onehot_rmse)

    # SHAP values go through the native Booster, which needs a DMatrix that is
    # explicitly told to accept categorical data.
    booster: xgb.Booster = cat_model.get_booster()
    dmatrix = xgb.DMatrix(cat_features, enable_categorical=True)
    contributions = booster.predict(dmatrix, pred_contribs=True)
    margin = booster.predict(dmatrix, output_margin=True)

    # Contributions summed over the last axis should reproduce the raw margin.
    np.testing.assert_allclose(np.sum(contributions, axis=-1), margin, rtol=1e-3)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()