« | September 2025 | » | 日 | 一 | 二 | 三 | 四 | 五 | 六 | | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | | | | | |
| 公告 |
数据仓库&数据挖掘
对某一件事需要坚持方能真正完成这件事
薛 峰
2009.02.03 |
Blog信息 |
blog名称:数据仓库与数据挖掘 日志总数:85 评论数量:14 留言数量:0 访问次数:723134 建立时间:2005年3月17日 |

| |
[综合]ID3算法源程序(C语言) 网上资源, 软件技术, 电脑与网络, 科学研究
薛 峰 发表于 2005/3/25 15:43:53 |
PROTO.HNEGENTROPY negentropy ( REAL **, UINT, NODE*, UINT );void print_tree ( NODE* , CHAR** );void free_tree ( NODE * );NODE* ID3 ( MATRIX * , NODE* , UINT , UINT );void err_exit ( CHAR* , UINT );MATRIX *build_matrix ( UINT, UINT );void free_matrix ( MATRIX * );void read_matrix ( CHAR *, MATRIX * );void file_size ( CHAR * , UINT * , UINT * );CHAR **read_tags ( CHAR * , UINT );void free_tags ( CHAR **, UINT);ID3.htypedef unsigned int UINT;typedef unsigned long ULONG;typedef char CHAR;typedef unsigned char BOOL;typedef double REAL;typedef struct node { UINT idx; /* ID code for attribute */ REAL threshold; /* Numerical threshold for attribute test */ struct node *on; /* Address of 'on' node */ struct node *off; /* Address of 'off' node */ struct node *parent; /* Addess of parent node */} NODE;typedef struct ne_struct { REAL ne; UINT status;} NEGENTROPY;typedef struct matrix { UINT width; UINT height; REAL **data;} MATRIX;enum UINT { INACTIVE, OFF, ON };#define LN_2 0.693147180559945309417#define entropy(x) (x > 0 ? x * log(x) / LN_2 : 0.0)/** FILE: id3.c** Author: Andrew Colin** DISCLAIMER: No liability is assumed by the author for any use made* of this program.** DISTRIBUTION: Any use may be made of this program, as long as the* clear acknowledgment is made to the author in code and runtime* executables*/#include <stdio.h>#include <stdlib.h>#include <math.h>#include <float.h>#include <limits.h>#include <string.h>#include <conio.h>#include <time.h>#include "id3.h"#include "proto.h"/*-------------------------------------------------------------------*/MATRIX *build_matrix (UINT width, UINT height){ MATRIX *_matrix; UINT i; _matrix = (MATRIX*) malloc (sizeof (MATRIX)); if (!_matrix) err_exit (__FILE__, __LINE__); _matrix->width = width; _matrix->height = height; _matrix->data = (REAL**) malloc (height * sizeof (REAL*)); if (_matrix->data == NULL) err_exit(__FILE__, __LINE__); for (i=0; i<height; i++) { _matrix->data[i] = (REAL*) malloc (width * sizeof(REAL)); if (_matrix->data[i] == NULL) err_exit(__FILE__, __LINE__); } return _matrix;}/*-------------------------------------------------------------------*//** Standard error handler function*/void err_exit (CHAR* file, UINT line){ printf("\n Fatal error in file %s, line %u", file, line); exit(0);}/*-------------------------------------------------------------------*/void file_size (CHAR *file_name, UINT *width, UINT *height)/** Given the name of a file of numeric data, this routine counts* the numbers of rows and columns. It's assumed that the number* of entries is the same in each row, and an error is flagged if this* is not the case.**/{ FILE *f; UINT buf_size = 0xFF, _width = 0; CHAR *buffer, *ptr; *width = *height = 0; buffer = (CHAR*) malloc (buf_size * sizeof (CHAR)); if (buffer == NULL) err_exit (__FILE__, __LINE__); /* Open price file - abort if filename invalid */ f = fopen(file_name, "r"); if (f == NULL) { printf("\n File not found : %s\n", file_name); err_exit (__FILE__, __LINE__); } /* Get number of entries in first row */ if (fgets(buffer, buf_size, f) != NULL) { ++*height; ptr = strtok (buffer, " ,"); while (ptr != NULL) { ++*width; ptr = strtok (NULL, " ,"); } } /* Count numbers of subsequent rows */ while (!feof(f)) { if (fgets(buffer, buf_size, f) != NULL) { if (strlen(buffer) > strlen("\n")) /* if line is more than a NL char */ { ++*height; _width = 0; ptr = strtok (buffer, " ,"); while (ptr != NULL) { ++_width; ptr = strtok (NULL, " ,"); } if (*width != _width) { printf("\n Number of entries in file %s did not agree", file_name); err_exit (__FILE__, __LINE__); } } } } free (buffer);}/*-------------------------------------------------------------------*/void free_matrix (MATRIX *_matrix){ UINT i; for (i=0; i<_matrix->height; i++) free (_matrix->data[i]); free (_matrix->data); free (_matrix);}/*-------------------------------------------------------------------*/void free_tags ( CHAR** varname, UINT width){ UINT i; for (i=0; i<width; i++) free(varname[i]); free (varname);}/*-------------------------------------------------------------------*/void free_tree ( NODE *node ){ /* * Frees the memory allocated to a tree structure */ if (node == NULL) return; else { free_tree (node->on); free_tree (node->off); free(node); }}/*-------------------------------------------------------------------*/NODE* ID3 ( MATRIX *matrix, NODE* parent, UINT target, UINT state)/* Routine to build a decision tree, based on Quinlan's ID3 algorithm. */{ NEGENTROPY negentropy_struct; NODE *node; UINT n_vars = matrix->width, n_samples = matrix->height, i, j, split; REAL **data = matrix->data; REAL best_threshold, min_negentropy, _negentropy; /* Allocate memory for this node */ node = (NODE*) malloc (sizeof(NODE)); if (!node) err_exit (__FILE__, __LINE__); /* Set up links in decision tree */ node->parent = parent; /* Set address of parent node */ if (parent != NULL) /* parent to child; not relevant for root node */ { /* Pass address of this node to the parent node */ if (state == ON) parent->on = node; else if (state == OFF) parent->off = node; } /* * Select attribute with lowest negentropy for splitting. Scan through * ALL attributes (except the target) and ALL data samples. This is * pretty inefficient for data sets with repeated values, but will do * for illustrative purposes */ min_negentropy = 1.0; for (i=0; i<n_vars; i++) { for (j=0; j<n_samples; j++) { if (i != target) { /* Set trial values for this node... */ node->idx = i; node->threshold = data[j][i]; /* ...and calculate the negentropy of this partition */ negentropy_struct = negentropy (data, n_samples, node, target); _negentropy = negentropy_struct.ne; /* If this negentropy is lower than any other, retain the index and threshold for future use */ if (_negentropy < min_negentropy) { min_negentropy = _negentropy; split = i; best_threshold = data[j][i]; } } /*if (i != target)*/ } /*for (j=0; j<n_samples; j++)*/ } /*for (i=0; i<n_vars; i++)*/ /* Save the combination of best attribute and threshold value */ node->idx = split; node->threshold = best_threshold; /* * If the negentropy routine found itself at an end-of-branch * for the decision tree, the 'status' flag in 'negentropy_struct' * is set to ON or OFF and the node labelled accordingly. Otherwise, * ID3 continues to call itself until all end-of-branch nodes are * found. */ if (negentropy_struct.status != INACTIVE) { node->on = node->off = NULL; node->idx = negentropy_struct.status; } else { node->on = ID3 (matrix, node, target, ON); node->off = ID3 (matrix, node, target, OFF); } return node;}/*-------------------------------------------------------------------*/void main (int argv, char *argc[]){ MATRIX *matrix; NODE *node; UINT target, n_vars, n_samples; CHAR data_file[13], tag_file[13]; /* Longest file name in DOS */ CHAR **tag_names; /* Set up file names */ if (argv != 2) { printf("\nUsage: id3 [datafile]"); exit(0); } else { printf("\nWelcome to ID3"); printf("\nLast compiled on %s, %s", __DATE__, __TIME__); printf("\n"); strcpy(data_file, argc[1]); strcpy(tag_file, argc[1]); strcat(data_file, ".dat"); strcat(tag_file, ".tag"); } /* Read dimensions of data file */ file_size (data_file, &n_vars, &n_samples); /* Read labels for columns of data */ tag_names = read_tags (tag_file, n_vars); /* Allocate storage for data... */ matrix = build_matrix (n_vars, n_samples); /* ...and read it from disk */ read_matrix (data_file, matrix); /* Classification target is last column */ target = n_vars - 1; /* Return root of decision tree - ID3 continues to call itself recursively */ node = ID3 ( matrix, NULL, target, 0 ); print_tree(node, tag_names); printf("\n"); free_tags (tag_names, n_vars); free_matrix(matrix); free_tree (node);}/*-------------------------------------------------------------------*/NEGENTROPY negentropy ( REAL **data, UINT n_samples, NODE *local, UINT target){ /* * Calculates the entropy of classification of an attribute, given * a table of attributes already used, the attribute on which splitting * is to be taken, and the target attribute. Entropy is calculated in * bits, so logs are taken to base 2 by dividing by LN_2. * * The returned value always lies in the (closed) range [0, 1]. */ NEGENTROPY ret_val; NODE *_node, *_parent; UINT on_ctr, off_ctr, p1, p2, i, _match; REAL p_on, p_off, negentropy_on, negentropy_off; on_ctr = off_ctr = p1 = p2 = 0; /* Scan through all supplied data samples */ for (i=0; i<n_samples; i++) { /* * If pattern matches the current position in the decision tree, * then use this vector. The match is determined by passing up * the decision tree and checking whether 'data[idx] >= threshold' * matches at each step, where idx and threshold are taken from * each node in turn. */ _match = 1; _node = local; while (_node->parent != NULL) { /* If at the root node, all entries match*/ _parent = _node->parent; if (_node == _parent->on) { /* if parent node is ON */ if (data[i][_parent->idx] < _parent->threshold) _match = 0; } else if (_node == _parent->off) { /* if parent node is OFF */ if (data[i][_parent->idx] >= _parent->threshold) _match = 0; } _node = _parent; } if (_match) { if (data[i][local->idx] >= local->threshold) { on_ctr++; if (data[i][target] >= 0.5) p1++; } else { off_ctr++; if (data[i][target] >= 0.5) p2++; } } } /* for (i=0; i<n_samples; i++) */ /* 1: Entropy of subtable with activation ON */ /* * We now have the numbers of samples that match this part of the * decision tree, and the number of samples for which the supplied * condition are true. From these quantities we can find the negentropy of * this partition. */ if (on_ctr > 0) { p_on = (REAL) p1 / (REAL) on_ctr; p_off = 1 - p_on; negentropy_on = -entropy (p_on) - entropy (p_off); } else negentropy_on = 0.0; /* 2: Entropy of subtable with activation OFF */ if (off_ctr > 0) { p_on = (REAL) p2 / (REAL) off_ctr; p_off = 1 - p_on; negentropy_off = -entropy (p_on) - entropy (p_off); } else negentropy_off = 0.0; ret_val.ne = (negentropy_on * on_ctr + negentropy_off * off_ctr); ret_val.ne /= (on_ctr + off_ctr); /* * If all values examined were the same, set 'ret_val.status' to * the target value since this will be an end-of-branch node */ if ((p1 == on_ctr) && (p2 == off_ctr)) ret_val.status = ON; else if ((p1 == 0) && (p2 == 0)) ret_val.status = OFF; else ret_val.status = INACTIVE; return ret_val;}/*-------------------------------------------------------------------*/void print_tree (NODE *node, CHAR** tag_names){ /* * Displays algebraic equivalent of the decision tree */ if (node->on == NULL) { if (node->idx == ON) printf("ON"); else if (node->idx == OFF) printf("OFF"); return; } else printf("if { %s >= %1.2f then ", tag_names[node->idx], node->threshold); print_tree ( node->on, tag_names ); printf(" else "); print_tree ( node->off, tag_names ); printf( " } " );}/*-------------------------------------------------------------------*/void read_matrix (CHAR filename[], MATRIX *matrix){ UINT i, j; UINT height = matrix->height; UINT width = matrix->width; REAL **data = matrix->data; FILE *f; /* Open price file */ if ((f = fopen(filename, "r")) == NULL) { printf("\n File not found : %s\n", filename); err_exit (__FILE__, __LINE__); } for (i=0; i<height; i++) for (j=0; j<width; j++) { fscanf(f, "%lf\n", &data[i][j] ); } fclose(f);}/*-------------------------------------------------------------------*/CHAR **read_tags (CHAR *tag_file, UINT width){ FILE *f; CHAR **_varname; UINT i; CHAR buffer[0xFF]; f = fopen(tag_file, "r"); if (f == NULL) { printf("\n File not found : %s\n", tag_file); err_exit (__FILE__, __LINE__); } _varname = (CHAR**) malloc (width * sizeof (CHAR*)); for (i=0; i<width; i++) _varname[i] = (CHAR*) malloc (0xFF * sizeof (CHAR)); i = 0; while (!feof(f)) { if (fgets(buffer, 0xFF, f) != NULL) { if (strlen(buffer) > strlen("\n")) { if (i>width-1) { printf("\nMore variable names were detected than data items."); printf("\nPlease correct this problem before proceeding"); exit(0); } sscanf (buffer, "%[a-zA-Z0-9-_;:!@#$%^&*(){}[]]", _varname[i]); i++; } } } if (i<width) { printf("\nFewer variable names than data items were detected."); printf("\nPlease correct this problem before proceeding"); exit(0); } fclose (f); return _varname;} |
|
回复:ID3算法源程序(C语言) 网上资源, 软件技术, 电脑与网络, 科学研究
ddhz0703(游客)发表评论于2010/1/6 13:48:28 |
请问LZ那个.dat格式是什么样的啊?能给出一个例子吗?谢谢 |
|
回复:ID3算法源程序(C语言) 网上资源, 软件技术, 电脑与网络, 科学研究
ss(游客)发表评论于2009/5/15 11:58:46 |
妇科检查 妇科疾病 妇科病 妇科 滴虫性阴道炎 处女膜修复 无痛流产 超导可视无痛人流 北京男科医院
北京妇科医院 包皮手术 包皮环切 包皮过长 包皮包茎 包茎手术 白带异常 |
|
» 1 »
|