class xgb_fill(BaseEstimator, TransformerMixin):  # Create an xgb_fill imputation method
    """Impute missing values column by column with XGBoost models.

    For every categorical column that contains missing values at fit time an
    ``XGBClassifier`` is trained, and for every such numeric column an
    ``XGBRegressor``. The features of each model are all *other* columns, with
    the categorical ones one-hot encoded (``dummy_na=True`` so that missingness
    is itself a feature). ``transform`` fills the missing cells with the
    predictions of those models.

    Parameters
    ----------
    num_list : list, optional
        Names of the numeric columns to impute. When ``None`` they are
        inferred during ``fit`` as the columns for which ``get_kind`` returns
        ``'numeric'``.
    cate_list : list, optional
        Names of the categorical columns to impute. When ``None`` they are
        inferred during ``fit`` as the columns for which ``get_kind`` returns
        ``'categorical'``.
    diff_num : int, default 8
        Forwarded to ``get_kind`` as ``diff_limit``.
    random_state : int, default 0
        Seed handed to every XGBoost model.

    Attributes set by ``fit``
    -------------------------
    num_list_, cate_list_ : list
        The column lists actually used (given or inferred). The constructor
        parameters themselves are left untouched so that refitting on other
        data re-infers them.
    xgb_cla_dict, xgb_reg_dict : dict
        Column name -> fitted classifier / regressor.
    feature_columns_ : dict
        Column name -> feature columns its model was trained on.
    label_classes_ : dict
        Categorical column name -> original labels, indexed by the integer
        code the classifier was trained on.
    constant_fill_ : dict
        Categorical column name -> the only label observed at fit time.
    fit_columns_ : list
        Columns of the frame passed to ``fit``.
    """

    def __init__(self,
                 num_list: list = None,
                 cate_list: list = None,
                 diff_num: int = 8,
                 random_state: int = 0):
        self.num_list = num_list
        self.cate_list = cate_list
        self.diff_num = diff_num
        self.random_state = random_state
        self.xgb_cla_dict = {}
        self.xgb_reg_dict = {}

    def _features(self, X, target, columns=None):
        """Build the feature matrix used to predict ``target``.

        Every categorical column except ``target`` is one-hot encoded and the
        ``target`` column is dropped. When ``columns`` is given the result is
        aligned to exactly those columns: dummy columns that did not show up
        in ``X`` are added as all-zero, and dummy columns unknown to the
        fitted model are discarded.
        """
        others = [c for c in self.cate_list_ if c != target]
        encoded = pd.get_dummies(X, columns=others, prefix=others, dummy_na=True)
        feats = encoded.drop(columns=[target])
        if columns is not None:
            feats = feats.reindex(columns=columns, fill_value=0)
        return feats

    def _fit_column(self, X, col, categorical):
        """Train and store the model for ``col`` if it has missing values."""
        # Positional mask: safe even when the index holds duplicate labels.
        observed = X[col].notnull().to_numpy()
        if observed.all():
            return  # nothing to impute at fit time -> no model
        if not observed.any():
            raise ValueError(
                f"column {col!r} has no observed values to learn from")

        feats = self._features(X, col)
        target = X.loc[observed, col]

        if categorical:
            # XGBoost expects class labels 0..k-1; encode here and decode in
            # transform so that arbitrary labels (strings, 1-based ints, ...)
            # work regardless of the XGBoost version.
            codes, classes = pd.factorize(target, sort=True)
            if len(classes) == 1:
                # A single observed class leaves nothing to learn.
                self.constant_fill_[col] = classes[0]
                return
            model = xgb.XGBClassifier(random_state=self.random_state)
            model.fit(feats.loc[observed], codes)
            self.xgb_cla_dict[col] = model
            self.label_classes_[col] = classes
        else:
            model = xgb.XGBRegressor(random_state=self.random_state,
                                     objective='reg:squarederror')
            model.fit(feats.loc[observed], target)
            self.xgb_reg_dict[col] = model
        self.feature_columns_[col] = list(feats.columns)

    def _fill_column(self, X, filled, col, models):
        """Write predictions for the missing cells of ``col`` into ``filled``."""
        missing = X[col].isnull().to_numpy()
        if not missing.any():
            return
        if col in self.constant_fill_:
            filled.loc[missing, col] = self.constant_fill_[col]
            return
        if col not in models:
            raise KeyError(
                f"column {col!r} has missing values but no model was fitted "
                "for it (it had no missing values during fit)")

        # Features come from the raw input, exactly as during fit, so the
        # model sees the same representation it was trained on.
        feats = self._features(X, col, self.feature_columns_[col])
        pred = models[col].predict(feats.loc[missing])
        if col in self.label_classes_:
            pred = self.label_classes_[col].take(pred.astype(int)).to_numpy()
        filled.loc[missing, col] = pred

    def fit(self, X, y=None):
        """Fit one XGBoost model per column of ``X`` that has missing values.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; it is not modified.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a column to impute has no observed value at all.
        """
        from tqdm import tqdm

        # Classify each column once, and only when a list has to be inferred.
        kinds = {}
        if self.num_list is None or self.cate_list is None:
            kinds = {col: get_kind(x=X[col], diff_limit=self.diff_num)
                     for col in X.columns}
        if self.num_list is None:
            self.num_list_ = [c for c, k in kinds.items() if k == 'numeric']
        else:
            self.num_list_ = list(self.num_list)
        if self.cate_list is None:
            self.cate_list_ = [c for c, k in kinds.items() if k == 'categorical']
        else:
            self.cate_list_ = list(self.cate_list)

        # Start from a clean slate so a refit never keeps stale models.
        self.fit_columns_ = list(X.columns)
        self.xgb_cla_dict = {}
        self.xgb_reg_dict = {}
        self.feature_columns_ = {}
        self.label_classes_ = {}
        self.constant_fill_ = {}

        for col in tqdm(self.cate_list_):
            self._fit_column(X, col, categorical=True)
        for col in tqdm(self.num_list_):
            self._fit_column(X, col, categorical=False)
        print('fit xgb fill the Na success!')
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the missing values imputed.

        Parameters
        ----------
        X : pandas.DataFrame
            Data with the same columns as the frame passed to ``fit``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which the missing cells of the categorical and
            numeric columns are replaced by model predictions.

        Raises
        ------
        ValueError
            If ``X`` lacks a column that was present during ``fit``.
        KeyError
            If a column has missing values but no model was fitted for it.
        """
        from tqdm import tqdm

        absent = [c for c in self.fit_columns_ if c not in X.columns]
        if absent:
            raise ValueError(f"columns seen during fit are missing: {absent}")

        filled = X.copy()
        for col in tqdm(self.cate_list_):
            self._fill_column(X, filled, col, self.xgb_cla_dict)
        for col in tqdm(self.num_list_):
            self._fill_column(X, filled, col, self.xgb_reg_dict)
        print('transform xgb fill the NA success!')
        return filled
# [21:55:25] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
# [21:55:25] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.