2. Parameter Optimization Phase
2.1 Tuning Tree Structure Parameters
max_depth caps tree depth, min_child_weight sets the minimum sum of instance weights required in a child, and gamma is the minimum loss reduction needed to make a split; together they control tree complexity, so they are tuned first.
import xgboost as xgb

# Grid search over the tree structure parameters.
# dtrain is assumed to be the xgb.DMatrix built in the data preparation step.
param_grid = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}

best_score = float('inf')
best_params = {}

for depth in param_grid['max_depth']:
    for weight in param_grid['min_child_weight']:
        for gamma in param_grid['gamma']:
            params = {
                'max_depth': depth,
                'min_child_weight': weight,
                'gamma': gamma,
                'objective': 'binary:logistic',
                'eval_metric': 'logloss'
            }
            # 5-fold cross-validation with early stopping
            cv_results = xgb.cv(
                params, dtrain,
                num_boost_round=100,
                nfold=5,
                metrics=['logloss'],
                early_stopping_rounds=10,
                seed=42
            )
            # Keep the combination with the lowest CV logloss
            min_logloss = cv_results['test-logloss-mean'].min()
            if min_logloss < best_score:
                best_score = min_logloss
                best_params = params.copy()

print(f"Best tree structure parameters: {best_params}")
2.2 Optimizing Regularization Parameters
reg_alpha (L1) and reg_lambda (L2) penalize large leaf weights and help curb overfitting; they are tuned on top of the tree structure found above.
# Grid search over the regularization parameters
param_grid = {
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0.5, 1, 1.5, 2]
}

for alpha in param_grid['reg_alpha']:
    for lambda_val in param_grid['reg_lambda']:
        params = best_params.copy()
        params.update({
            'reg_alpha': alpha,
            'reg_lambda': lambda_val
        })
        # 5-fold cross-validation with early stopping
        cv_results = xgb.cv(
            params, dtrain,
            num_boost_round=100,
            nfold=5,
            metrics=['logloss'],
            early_stopping_rounds=10,
            seed=42
        )
        # Update the running best if this combination improves CV logloss
        min_logloss = cv_results['test-logloss-mean'].min()
        if min_logloss < best_score:
            best_score = min_logloss
            best_params = params.copy()

print(f"Parameters after regularization tuning: {best_params}")
2.3 Tuning the Learning Rate and Number of Trees
A lower learning rate typically needs more boosting rounds, so the two are tuned together, with early stopping on a held-out set picking the number of trees for each rate.
# Tune the learning rate and the number of boosting rounds together.
# dtest is assumed to be the held-out xgb.DMatrix from the data preparation step.
learning_rates = [0.01, 0.05, 0.1]
best_eval_score = float('inf')
best_num_rounds = 0
best_lr = None

for lr in learning_rates:
    params = best_params.copy()
    params['learning_rate'] = lr
    # Let early stopping on the held-out set pick the number of trees
    evals_result = {}
    model = xgb.train(
        params, dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'train'), (dtest, 'eval')],
        early_stopping_rounds=20,
        evals_result=evals_result,
        verbose_eval=False
    )
    # model.best_score is the eval logloss at the best iteration
    if model.best_score < best_eval_score:
        best_eval_score = model.best_score
        best_num_rounds = model.best_iteration + 1  # best_iteration is 0-based
        best_lr = lr

best_params['learning_rate'] = best_lr
print(f"Best learning rate: {best_lr}, best number of trees: {best_num_rounds}")