Step 3 builds the decision tree with the C4.5 algorithm. The core code is as follows:
from collections import Counter
from math import log

def C45_createTree(dataset, labels, test_dataset):
    classList = [example[-1] for example in dataset]
    if classList.count(classList[0]) == len(classList):
        # All samples share the same class: stop splitting
        return classList[0]
    if len(dataset[0]) == 1:
        # All features have been used up: return the majority class
        return majorityCnt(classList)
    bestFeat = C45_chooseBestFeatureToSplit(dataset)
    bestFeatLabel = labels[bestFeat]
    print(u"Best feature at this node: " + bestFeatLabel)
    # cds.ids2lables maps a feature name to its value dictionary (defined elsewhere in the program)
    print(u"Value dictionary of the best feature: ", {bestFeatLabel: cds.ids2lables[bestFeatLabel]})
    C45Tree = {bestFeatLabel: {}}
    del labels[bestFeat]
    # Collect every value the best feature takes in this node's samples
    featValues = [example[bestFeat] for example in dataset]
    uniqueVals = set(featValues)
    if pre_pruning:
        ans = []
        for index in range(len(test_dataset)):
            ans.append(test_dataset[index][-1])
        result_counter = Counter()
        for vec in dataset:
            result_counter[vec[-1]] += 1
        leaf_output = result_counter.most_common(1)[0][0]
        # Validation accuracy if this node is turned into a majority-class leaf
        root_acc = cal_acc(test_output=[leaf_output] * len(test_dataset), label=ans)
        outputs = []
        ans = []
        for value in uniqueVals:
            cut_testset = splitdataset(test_dataset, bestFeat, value)
            cut_dataset = splitdataset(dataset, bestFeat, value)
            for vec in cut_testset:
                ans.append(vec[-1])
            result_counter = Counter()
            for vec in cut_dataset:
                result_counter[vec[-1]] += 1
            leaf_output = result_counter.most_common(1)[0][0]
            outputs += [leaf_output] * len(cut_testset)
        # Validation accuracy if the split is made and each child predicts its majority class
        cut_acc = cal_acc(test_output=outputs, label=ans)
        if cut_acc <= root_acc:
            # Splitting does not improve validation accuracy: pre-prune into a leaf
            return leaf_output
    for value in uniqueVals:
        subLabels = labels[:]
        C45Tree[bestFeatLabel][value] = C45_createTree(
            splitdataset(dataset, bestFeat, value),
            subLabels,
            splitdataset(test_dataset, bestFeat, value))
    if post_pruning:
        tree_output = classifytest(C45Tree,
                                   featLabels=['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖率'],
                                   testDataSet=test_dataset)
        ans = []
        for vec in test_dataset:
            ans.append(vec[-1])
        root_acc = cal_acc(tree_output, ans)
        result_counter = Counter()
        for vec in dataset:
            result_counter[vec[-1]] += 1
        leaf_output = result_counter.most_common(1)[0][0]
        cut_acc = cal_acc([leaf_output] * len(test_dataset), ans)
        if cut_acc >= root_acc:
            # The subtree is no better than a single leaf: post-prune
            return leaf_output
    return C45Tree
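C45_createTree relies on several helpers defined elsewhere in the program: splitdataset, majorityCnt, cal_acc, and classifytest (which runs every test sample through a finished tree). Their implementations are not part of this step; the sketch below is a minimal reading inferred from how they are called above, so treat the bodies as assumptions rather than the original code:

from collections import Counter

def splitdataset(dataset, axis, value):
    # Rows whose feature `axis` equals `value`, with that column removed
    retDataset = []
    for featVec in dataset:
        if featVec[axis] == value:
            retDataset.append(featVec[:axis] + featVec[axis + 1:])
    return retDataset

def majorityCnt(classList):
    # Most frequent class label in the list
    return Counter(classList).most_common(1)[0][0]

def cal_acc(test_output, label):
    # Fraction of predictions that match the true labels
    correct = sum(1 for p, t in zip(test_output, label) if p == t)
    return correct / float(len(label)) if label else 0.0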
The feature-selection routine that the tree builder calls is implemented as follows:

def C45_chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    baseEnt = cal_entropy(dataset)
    bestInfoGain_ratio = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # iterate over all features
        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)  # the set of distinct values this feature takes
        newEnt = 0.0
        IV = 0.0
        for value in uniqueVals:  # conditional entropy of each candidate split
            subdataset = splitdataset(dataset, i, value)
            p = len(subdataset) / float(len(dataset))
            newEnt += p * cal_entropy(subdataset)
            IV = IV - p * log(p, 2)  # intrinsic value of the split
        infoGain = baseEnt - newEnt
        if IV == 0:  # the feature takes a single value; skip to avoid division by zero
            continue
        infoGain_ratio = infoGain / IV  # gain ratio of this feature
        print(u"C4.5: gain ratio of feature %d is %.3f" % (i, infoGain_ratio))
        if infoGain_ratio > bestInfoGain_ratio:  # keep the feature with the largest gain ratio
            bestInfoGain_ratio = infoGain_ratio
            bestFeature = i
    return bestFeature
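This is Quinlan's gain-ratio criterion: for a feature a, GainRatio(D, a) = Gain(D, a) / IV(a), where Gain(D, a) = Ent(D) − Σ_v (|D_v|/|D|)·Ent(D_v) and the intrinsic value IV(a) = −Σ_v (|D_v|/|D|)·log2(|D_v|/|D|). Dividing by IV penalizes features with many distinct values, which plain information gain (as in ID3) tends to favor. The entropy helper cal_entropy is defined elsewhere in the program; a minimal Shannon-entropy sketch consistent with the calls above (an assumption, not the original code) is:

from collections import Counter
from math import log

def cal_entropy(dataset):
    # Shannon entropy of the class labels (the last column of each row)
    labels = [example[-1] for example in dataset]
    n = float(len(dataset))
    return -sum((c / n) * log(c / n, 2) for c in Counter(labels).values())

With cal_entropy and the split helpers in place, the two functions above are enough to grow and prune a tree by calling C45_createTree(dataset, labels[:], test_dataset), with the pre_pruning and post_pruning flags set as desired.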