基于威斯康星乳腺癌数据集,采用决策树的方法进行肿瘤预测。
1.加载sklearn自带的威斯康星乳腺癌数据集,探索数据。
2.进行数据集分割。
3.配置决策树模型。
4.训练决策树模型。
5.模型预测。
6.模型评估。
7.参数调优。可以根据评估结果,为模型设置或调整出更优的参数,使预测结果更准确。
from sklearn import tree # 导入决策树包
from sklearn.metrics import accuracy_score # 导入准确率评价指标
from sklearn.datasets import load_breast_cancer # 导入威斯康星乳腺癌数据集
import numpy as np
import matplotlib.pyplot as plt
import graphviz
def showDataSet(dataMat, labelMat):
    """
    Scatter-plot the first two feature columns, one series per class.

    Samples whose label is greater than 0 are drawn as the first series
    (positive class); all remaining samples are drawn as the second series
    (negative class). The figure is displayed with ``plt.show()``.

    Parameters:
        dataMat - sequence of samples; each sample needs at least two
                  feature values (only columns 0 and 1 are plotted)
        labelMat - sequence of labels, one per sample in dataMat

    Returns:
        None
    """
    data = np.asarray(dataMat)
    # Only the labels that pair with a row of dataMat are considered.
    labels = np.asarray(labelMat).ravel()[:len(data)]
    positive_mask = labels > 0

    for subset in (data[positive_mask], data[~positive_mask]):
        if subset.size:
            plt.scatter(subset[:, 0], subset[:, 1])
        else:
            # A class with no samples used to crash with IndexError when
            # indexing the transposed empty array. Plot an empty series
            # instead so there are still two scatter calls, in the same
            # order, and the negative class keeps its usual colour.
            plt.scatter([], [])
    plt.show()
# 1. Load the Wisconsin breast-cancer dataset bundled with scikit-learn.
breast_cancer = load_breast_cancer()
# Per the original author's note, breast_cancer.data has shape 569 x 30.

# Positional split: the first TRAIN_SIZE rows train the model and the
# remaining rows test it (500 / 69 rows, i.e. roughly 88% / 12%).
# NOTE(review): the split is neither shuffled nor stratified; consider
# sklearn.model_selection.train_test_split if that matters.
TRAIN_SIZE = 500
train_data = breast_cancer.data[:TRAIN_SIZE]
train_target = breast_cancer.target[:TRAIN_SIZE]
test_data = breast_cancer.data[TRAIN_SIZE:]
test_target = breast_cancer.target[TRAIN_SIZE:]

# 2. Configure the model.
# NOTE: despite the variable name and the "CART:" label printed below, the
# tree is built with the entropy criterion, not the Gini index. The
# criterion is kept as it was so results stay comparable with earlier runs.
clf_gini = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)

# 3. Train the model.
clf_gini.fit(train_data, train_target)

# 4. Predict on the held-out rows.
predictions_gini = clf_gini.predict(test_data)
print("predictions_gini", predictions_gini)

# 5. Evaluate the model.
print("CART:")
print('Accuracy:%s' % accuracy_score(test_target, predictions_gini))
LabelArr = test_target

showDataSet(train_data, train_target)  # training set
showDataSet(test_data, test_target)  # test set

# Confusion counts, treating label 1 as the positive class. Plain boolean
# arrays replace the former np.mat-based masks (np.mat was removed in
# NumPy 2.0). The counts are cast to float so the printed format
# (e.g. "51.0") stays the same as before.
predicted_positive = predictions_gini == 1
predicted_negative = predictions_gini == 0
is_correct = predictions_gini == LabelArr
print("TP: ", float(np.count_nonzero(predicted_positive & is_correct)))
print("FP: ", float(np.count_nonzero(predicted_positive & ~is_correct)))
print("TN: ", float(np.count_nonzero(predicted_negative & is_correct)))
print("FN: ", float(np.count_nonzero(predicted_negative & ~is_correct)))
# Decision-tree visualisation.
# Feature and class names come from the dataset itself instead of a
# hand-typed copy, so they cannot drift out of sync with the data columns.
feature_name = [str(name) for name in breast_cancer.feature_names]
# export_graphviz expects class names in ascending class-index order, which
# is how target_names is ordered. scikit-learn documents the classes as
# malignant (0) and benign (1); the former ['Not', 'Is'] labels attached
# 'Is' to class 1.
class_name = [str(name) for name in breast_cancer.target_names]

# out_file=None makes export_graphviz return the DOT source as a string.
treeAB_d4_dot = tree.export_graphviz(
    clf_gini,
    out_file=None,
    feature_names=feature_name,
    class_names=class_name,
)
graph = graphviz.Source(treeAB_d4_dot)
print(graph)
# Render the DOT source to a file through Graphviz.
graph.render("work/practice/TreeForAgeAndBalanceD4")
predictions_gini [1 0 1 0 1 1 1 1 0 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1]
CART:
Accuracy:0.9710144927536232
TP: 51.0
FP: 1.0
TN: 16.0
FN: 1.0