# References:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split
file_path = ['../raw/sale_data/economics_train.csv', '../raw/sale_data/economics_test.csv']
X = pd.read_csv(file_path[0], index_col='Id')
X_test = pd.read_csv(file_path[1], index_col='Id')
# Drop rows with a missing target, then separate the target from the features
X.dropna(axis=0, subset=['GDP'], inplace=True)
y = X.GDP
X.drop(['GDP'], axis=1, inplace=True)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=9)
xgb_mdl = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb_mdl.fit(X_train, y_train)
preds = xgb_mdl.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)  # (y_true, y_pred) argument order
print("The Mean Absolute Error:", mae)
# Optimize XGBoost with GPU
class XGBoostClassifierDMatrix(xgb.XGBClassifier, BaseEstimator, ClassifierMixin):
    """
    Wrapper for the XGBoost classifier that accepts a DMatrix in fit.

    The stock sklearn-style fit builds its own internal DMatrix and does not
    accept a pre-built one, so the use_dmatrix path trains through the native
    xgb.train API and stores the Booster on the estimator (via the private
    _Booster attribute that get_booster() reads, so this relies on xgboost
    internals). Assumes the default binary objective (binary:logistic).
    """
    def __init__(self, use_dmatrix=True, **kwargs):
        super().__init__(**kwargs)
        self.use_dmatrix = use_dmatrix
        self.xgb_dm = None

    def fit(self, X, y=None, sample_weight=None, eval_set=None, **kwargs):
        if self.use_dmatrix:
            if isinstance(X, xgb.DMatrix):
                self.xgb_dm = X
            else:
                # label=y attaches the target values for supervised learning
                self.xgb_dm = xgb.DMatrix(X, label=y, weight=sample_weight,
                                          enable_categorical=self.enable_categorical)
            params = self.get_xgb_params()
            params.pop('use_dmatrix', None)  # wrapper-only flag, not a booster parameter
            self._Booster = xgb.train(params, self.xgb_dm,
                                      num_boost_round=self.n_estimators or 100)
        else:
            super().fit(X, y, sample_weight=sample_weight, eval_set=eval_set, **kwargs)
        return self

    def predict(self, X, **kwargs):
        if self.use_dmatrix:
            if not isinstance(X, xgb.DMatrix):
                X = xgb.DMatrix(X, enable_categorical=self.enable_categorical)
            # Booster.predict returns probabilities under binary:logistic;
            # threshold at 0.5 to recover class labels
            return (self.get_booster().predict(X) > 0.5).astype(int)
        return super().predict(X, **kwargs)

    def fit_predict(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.predict(X)
print(XGBoostClassifierDMatrix.__mro__)
# Why use DMatrix instead of a NumPy array, pandas DataFrame, SciPy sparse matrix, or CSV file?
# It is XGBoost's internal, optimized data structure: training runs directly on it
# (reportedly 2-5x faster than on raw arrays), the native API used for GPU acceleration
# (device='cuda') expects it, and it stores data compactly, which is critical for large datasets.
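
# A minimal sketch of the point above (illustrative only; assumes the features
# are numeric): build the DMatrix once and reuse it across training calls.
# QuantileDMatrix (xgboost >= 1.7) additionally pre-bins features for the
# hist/GPU tree method, cutting memory use further on large datasets.
dtrain_dm = xgb.DMatrix(X_train, label=y_train)
print("DMatrix size:", dtrain_dm.num_row(), "rows x", dtrain_dm.num_col(), "cols")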
def optimized_grid_search_xgb_gpu(X_train, y_train):
    param_grid = {
        'learning_rate': [0.1, 0.05],
        'max_depth': [3, 5, 7],
        'colsample_bytree': [0.8, 0.7, 0.6, 0.5],
        'subsample': [0.8, 0.7, 0.6, 0.5],
        'reg_lambda': [1, 5],
        'reg_alpha': [0, 1],
        'n_estimators': [100, 500, 1000],
    }  # 1,152 combinations x 5 folds = 5,760 fits; trim the grid for quick runs
    # Convert to NumPy arrays (optional: XGBoost also handles DataFrames, and a
    # DataFrame is needed if categorical dtypes should be preserved)
    X_train_np = np.asarray(X_train)
    y_train_np = np.asarray(y_train)
    model = XGBoostClassifierDMatrix(
        device='cuda',            # requires a CUDA-capable GPU (xgboost >= 2.0)
        enable_categorical=True,
        use_dmatrix=True          # route fit/predict through the DMatrix path above
    )
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='accuracy',
        verbose=1
    )
    grid_search.fit(X_train_np, y_train_np)
    print(f"Best parameters: {grid_search.best_params_}")
    return grid_search
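
# Hypothetical usage sketch (not in the original script): the target above is
# continuous (GDP), so a binary label is derived here purely to exercise the
# classifier grid search; requires a CUDA-capable GPU and numeric features.
y_train_binary = (y_train > y_train.median()).astype(int)
gs = optimized_grid_search_xgb_gpu(X_train, y_train_binary)
print("Best CV accuracy:", gs.best_score_)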