西电数据挖掘决策树算法.doc
文本预览下载声明
数据挖掘算法实验报告
实验题目
基于决策树的分类算法,属性的选择采用ID3 或C4.5策略,采用如下的数据建立分类决策树。
算法基本思想的描述
ID3选择具有最高信息增益的属性作为分裂属性。基于这一原则,我们首先算出初始集合的熵,然后分别求出以各个属性为分裂属性时的熵,再利用这些结果算出以各个属性为分裂属性时的信息增益,最后选择信息增益最大的属性作为分裂属性。
编程实现算法
#include <iostream>
#include <math.h>
#include <string.h>
using namespace std;
// Number of records in the training table.
#define SIZE 14

// One training record. Every attribute is kept as a NUL-terminated C string
// so that records can be compared with strcmp().
struct Data
{
    char age[10];            // age-group label
    char income[10];         // income level
    char student[10];        // student flag
    char credit_rating[20];  // credit rating label
    char buys_computer[10];  // class label
};
// Training set: SIZE records, one per row.
// Column order follows struct Data:
//   age, income, student, credit_rating, buys_computer
//
// NOTE(review): the extracted listing had lost the string quotes and the
// '<' / '>' characters of the age labels ("=30", "40"). They are restored
// here as "<=30" and ">40"; confirm these spellings match the strcmp()
// comparisons in age_entropy(), whose body is not visible in this chunk.
Data data[SIZE] = {
    {"<=30",    "high",   "no",  "fair",      "no"},
    {"<=30",    "high",   "no",  "excellent", "no"},
    {"31...40", "high",   "no",  "fair",      "yes"},
    {">40",     "medium", "no",  "fair",      "yes"},
    {">40",     "low",    "yes", "fair",      "yes"},
    {">40",     "low",    "yes", "excellent", "no"},
    {"31...40", "low",    "yes", "excellent", "yes"},
    {"<=30",    "medium", "no",  "fair",      "no"},
    {"<=30",    "low",    "yes", "fair",      "yes"},
    {">40",     "medium", "yes", "fair",      "yes"},
    {"<=30",    "medium", "yes", "excellent", "yes"},
    {"31...40", "medium", "no",  "excellent", "yes"},
    {"31...40", "high",   "yes", "fair",      "yes"},
    {">40",     "medium", "no",  "excellent", "no"}
};
// Forward declarations for the helpers used by main().
//
// calculate(a, b) returns one entropy term, (a/b) * log2(b/a), or 0 when a == 0.
//
// The *_entropy functions each take the record table and an `entropy`
// argument. Presumably each computes the entropy for its attribute
// (origin_entropy for the whole set); most of their definitions are not
// visible in this chunk -- verify against the bodies.
//
// NOTE(review): `entropy` is declared by value here, so nothing a function
// stores into it can reach the caller. The listing has lost other special
// characters during extraction, so an '&' (double &entropy) may be missing;
// confirm against the original source before relying on these signatures.
double calculate(double a,double b);
void origin_entropy(Data data[],double entropy);
void age_entropy(Data data[],double entropy);
void income_entropy(Data data[],double entropy);
void student_entropy(Data data[],double entropy);
void credit_rating_entropy(Data data[],double entropy);
int main()
{
double origin=0,age=0,student=0,credit_rating=0,income=0;
origin_entropy(data,origin);
age_entropy(data,age);
student_entropy(data,student);
income_entropy(data,income);
credit_rating_entropy(data,credit_rating);
coutinfo(D)=originendl;
cout用age作为分裂属性时:\n熵info(age)(D)=age\t\t信息增益为:origin-ageendl;
cout用income作为分裂属性时:\n熵info(income)(D)=income\t\t信息增益为:origin-incomeendl;
cout用student作为分裂属性时:\n熵info(student)(D)=):student\t\t信息增益为:origin-studentendl;
cout用credit_rating作为分裂属性时:\n熵info(credit_rating)(D)=credit_rating\t信息增益为:origin-credit_ratingendl;
return 0;
}
/**
 * One term of an entropy sum.
 *
 * With p = a / b this evaluates (a/b) * log2(b/a), which equals -p * log2(p).
 * The base-2 logarithm is obtained as log10(x) / log10(2).
 *
 * @param a  numerator of the ratio; a == 0 yields 0 (the 0 * log 0 case).
 * @param b  denominator of the ratio; not checked against zero.
 * @return   (a/b) * log2(b/a), or 0 when a == 0.
 */
double calculate(double a,double b)
{
    // Guard the a == 0 case first: b/a would otherwise divide by zero.
    if (a == 0)
    {
        return 0;
    }

    const double proportion = a / b;
    return proportion * log10(b / a) / log10(2);
}
void origin_entropy(Data data[],double entropy)
{
int i;
double yes=0, no=0;
for( i=0;iSIZE;i++)
{
if(strcmp(data[i].buys_computer,yes)==0)
yes++;
else
no+
显示全部