Ensemble learning is not a single machine learning algorithm. Instead, it builds multiple models on the data and combines their outputs; ensemble methods now appear in virtually every area of machine learning.
By aggregating the results of several weak estimators into one final prediction, an ensemble obtains better regression or classification performance than any individual model.
In ensemble learning, multiple weak learners are trained together: the complete ensemble model is called an ensemble estimator, and each model that makes up the ensemble is called a base estimator.
Ensemble learning models fall into three main categories: bagging, boosting, and stacking.
The core idea of bagging is to build mutually independent estimators and then average their predictions (for regression) or take a majority vote (for classification) to decide the ensemble's result, as the short sketch below illustrates.
The best-known bagging method is the random forest.
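To make the voting rule concrete, here is a toy sketch (the prediction arrays are made up purely for illustration): three classifiers predict labels for four samples, and the ensemble returns the per-sample majority class.

import numpy as np

# Predictions from three hypothetical independent classifiers on four samples
preds = np.array([[0, 1, 1, 0],
                  [0, 1, 0, 0],
                  [1, 1, 1, 0]])
# Majority vote per sample (column): the most frequent label wins
vote = np.array([np.argmax(np.bincount(preds[:, i])) for i in range(preds.shape[1])])
print(vote)   # [0 1 1 0]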
The building block of a random forest is the decision tree, whose fundamentals were covered in the previous chapter.
Decision trees serve as the base estimators; on top of them, the bagging method combines multiple trees to build the random forest.
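The splitting criterion used in the code below is the Gini index. For a node whose samples fall into classes with proportions p_k, the impurity is Gini = 1 - Σ_k p_k^2: a pure node has impurity 0, and a candidate split is scored by the sample-size-weighted average Gini of its two children, with the lowest-scoring split chosen.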
import numpy as np

## Compute the Gini impurity of a 1-D label array
## (the call sites below pass numpy arrays, so the function works on labels directly)
def gini(labels):
    if len(labels) == 0:
        return 0.0
    counts = np.bincount(labels)      # how many samples of each class
    p = counts / len(labels)          # class proportions
    return 1.0 - np.sum(p ** 2)
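A quick sanity check (the label arrays are chosen for illustration): an evenly mixed binary node has impurity 0.5, and a pure node has impurity 0.

print(gini(np.array([0, 0, 1, 1])))   # 0.5  (two classes, evenly mixed)
print(gini(np.array([1, 1, 1, 1])))   # 0.0  (pure node)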
## Decision tree node
class Node:
    def __init__(self, max_sections=10):
        self.type = None            # 'leaf' or 'root' (internal node)
        self.threshold = None       # split threshold
        self.feature = None         # index of the split feature
        self.left = None
        self.right = None
        self.label = None           # predicted class at a leaf
        self.max_sections = max_sections

    def fit(self, data, targets, stop_n):
        count = np.bincount(targets)
        # Stop splitting when the node is small or nearly pure
        condition = len(data) <= stop_n or np.max(count) / np.sum(count) > 0.99
        if condition:
            self.type = 'leaf'
            self.label = np.argmax(count)
        else:
            self.type = 'root'
            best_feature_i = 0
            best_hold = 0
            best_l_i = None
            best_r_i = None
            G = None
            # Iterate over features (columns), not samples
            for i in range(data.shape[1]):
                feature_i = data[:, i]
                if len(targets) <= self.max_sections:
                    holds = np.unique(feature_i)   # few samples: try every observed value
                else:
                    holds = np.linspace(np.min(feature_i), np.max(feature_i), self.max_sections)
                for hold in holds:
                    l_i = (feature_i <= hold)
                    r_i = (feature_i > hold)
                    if not l_i.any() or not r_i.any():
                        continue    # skip splits that leave one side empty
                    # Sample-size-weighted Gini of the two children
                    new_g = (np.sum(l_i) * gini(targets[l_i]) + np.sum(r_i) * gini(targets[r_i])) / len(targets)
                    if G is None or new_g < G:
                        G = new_g
                        best_feature_i = i
                        best_hold = hold
                        best_l_i = l_i
                        best_r_i = r_i
            if best_l_i is None:
                # No valid split exists (e.g. all features constant): fall back to a leaf
                self.type = 'leaf'
                self.label = np.argmax(count)
                return
            self.feature = best_feature_i
            self.threshold = best_hold
            self.left = Node(self.max_sections)
            self.left.fit(data[best_l_i], targets[best_l_i], stop_n)
            self.right = Node(self.max_sections)
            self.right.fit(data[best_r_i], targets[best_r_i], stop_n)

    ## Route a single sample (1-D feature vector) down the tree
    def decide(self, data):
        if self.type == 'leaf':
            return self.label
        if data[self.feature] <= self.threshold:
            return self.left.decide(data)
        else:
            return self.right.decide(data)
class DecisionTree:
    def __init__(self, stop_n=20, max_sections=10):
        self.stop_n = stop_n
        self.max_sections = max_sections
        self.root = Node(max_sections)   # root node, grown in fit

    ## Training
    def fit(self, data, targets):
        self.root.fit(data, targets, self.stop_n)

    ## Prediction
    def predict(self, data):
        res = []
        for data_i in data:
            res.append(self.root.decide(data_i))
        return np.array(res)

    ## Evaluation: accuracy on a test set
    def score(self, x_test, y_test):
        res = self.predict(x_test)
        return np.sum(res == y_test) / len(y_test)
class RandomForest:
    def __init__(self, n_estimators=32, n_samples=100, seed=100, stop_n=32, max_sections=10):
        self.n_estimators = n_estimators
        self.n_samples = n_samples
        self.rng = np.random.default_rng(seed)   # seeded RNG so sampling is reproducible
        self.stop_n = stop_n
        self.max_sections = max_sections
        ## Build the ensemble: each base estimator is a decision tree
        self.estimators = [DecisionTree(stop_n=self.stop_n, max_sections=self.max_sections)
                           for _ in range(self.n_estimators)]

    def fit(self, data, targets):
        for i in range(self.n_estimators):
            ## Bootstrap sampling: draw n_samples indices with replacement
            sample_id = self.rng.choice(len(data), self.n_samples)
            self.estimators[i].fit(data[sample_id], targets[sample_id])

    def predict(self, data):
        res = []
        for estimator in self.estimators:
            res.append(estimator.predict(data))
        res = np.array(res)
        # Majority vote across estimators for each sample
        res = np.array([np.argmax(np.bincount(res[:, i])) for i in range(res.shape[1])])
        return res

    def score(self, x_test, y_test):
        res = self.predict(x_test)
        return np.sum(res == y_test) / len(y_test)
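To see the classes in action, here is a minimal usage sketch; it assumes scikit-learn is installed and uses its iris dataset, which is not part of the code above.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

forest = RandomForest(n_estimators=32, n_samples=100)
forest.fit(x_train, y_train)
print(forest.score(x_test, y_test))   # accuracy on the held-out set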