日撸代码300行：第66-68天(主动学习之 ALEC)

代码来自闵老师”日撸 Java 三百行（61-70天）
日撸 Java 三百行（61-70天，决策树与集成学习）_闵帆的博客-CSDN博客
本次代码的实现是基于高斯密度，ALEC算法原文是基于密度峰值，同样是基于密度聚类，稍微还是有一些差别。
基于聚类的主动学习的基本思想如下:
Step 1. 将对象按代表性递减排序;
Step 2. 假设当前数据块有 N个对象, 选择最具代表性的前 $\sqrt{N}$ 个查询其标签 (类别).
Step 3. 如果这 $\sqrt{N}$ 个标签具有相同类别, 就认为该块为纯的, 其它对象均分类为同一类别. 结束.
Step 4. 如果不纯，将当前块划分为两个子块, 分别 Goto Step 3.
package machinelearning.activelearning;

import java.io.FileReader;
import java.util.Arrays;

import weka.core.Instances;

public class Alec {
	
	/**
	 * The whole data set.
	 */
	Instances dataset;
	
	/**
	 * The maximal number of queries that can be provided.
	 */
	int maxNumQuery;
	
	/**
	 * The actual number of queries.
	 */
	int numQuery;
	
	/**
	 * The radius, also dc in the paper. It is employed for density computation.
	 */
	double radius;
	
	/**
	 * The densities of instances, also rho in the paper.
	 */
	double[] densities;
	
	/**
	 * Distance to master
	 */
	double[] distanceToMaster;
	
	/**
	 * Sorted indices, where the first element indicates the instance with the biggest density.
	 */
	int[] descendantDensities;
	
	/**
	 * Priority
	 */
	double[] priority;
	
	/**
	 * The maximal distance between any pair of points.
	 */
	double maximalDistance;
	
	/**
	 * Who is my master?
	 */
	int[] masters;
	
	/**
	 * Predicted labels.
	 */
	int[] predictedLabels;
	
	/**
	 * Instance status. 0 for unprocessed, 1 for queried, 2 for classified.
	 */
	int[] instanceStatusArray;
	
	/**
	 * The descendant indices to show the representativeness of instances in a descendant order.
	 */
	int[] descendantRepresentatives;
	
	/**
	 * Indicate the cluster of each instance. It is only used in clusterInTwo(int[]);
	 */
	int[] clusterIndices;
	
	/**
	 * Blocks with size no more than this threshold should not be split further.
	 */
	int smallBlockThreshold = 3;
	
	
	/**
	 * *********************************************************
	 * The constructor.
	 * @param paraFilename
	 * *********************************************************
	 */
	public Alec(String paraFilename) {
		// TODO Auto-generated constructor stub
		try {
			FileReader tempReader = new FileReader(paraFilename);
			dataset = new Instances(tempReader);
			dataset.setClassIndex(dataset.numAttributes() - 1);
			tempReader.close();
		} catch (Exception e) {
			// TODO: handle exception
			System.out.println(e);
			System.exit(0);
		}//of try
		
		computeMaximalDistance();
		clusterIndices = new int[dataset.numInstances()];
		
	}//of the constructor
	
	/**
	 * ***********************************************************
	 * * Merge sort in descendant order to obtain an index array. The original
	 * array is unchanged. The method should be tested further. <br>
	 * Examples: input [1.2, 2.3, 0.4, 0.5], output [1, 0, 3, 2]. <br>
	 * input [3.1, 5.2, 6.3, 2.1, 4.4], output [2, 1, 4, 0, 3].
	 * 
	 * @param paraArray  The original array
	 * @return   The sorted indices.
	 * ***********************************************************
	 */
	public static int[] mergeSortToIndices(double[] paraArray) {
		int tempLength = paraArray.length;
		int[][] resultMatrix = new int[2][tempLength];
		
		//Initialize（这里初始化第一组就够了，第二组的数据在排序的时候是从第一组复制的）
		int tempIndex = 0;
		for (int i = 0; i < tempLength; i++) {
			resultMatrix[tempIndex][i] = i;
		}//of for i
		
		// Merge
		int tempCurrentLength = 1;
		// The indices for current merged groups.
		int tempFirstStart, tempSecondStart, tempSecondEnd;
		
		while (tempCurrentLength < tempLength) {
			// Divide into a number of groups.
			// Here the boundary is adaptive to array length not equal to 2^k.
			//Math.ceil()的作用是上取整，返回的是总共分了多少个“两组”数据。每进行一次for循环，完成一轮归并，当while条件不满足时，已经完成排序。
			//每一次是进行两组数的归并，所以每一轮的组数是（tempLength + 0.0）/(tempCurrentLength * 2);每一组数据的长度是tempCurrentLength
			for (int i = 0; i < Math.ceil((tempLength + 0.0) / tempCurrentLength /2); i++) {
				// Boundaries of the group
				tempFirstStart = i * tempCurrentLength * 2;
				tempSecondStart = tempFirstStart + tempCurrentLength;
				tempSecondEnd = tempSecondStart + tempCurrentLength - 1;
				
				if (tempSecondEnd >= tempLength) {
					tempSecondEnd = tempLength - 1;
				}//of if
				
				// Merge this group
				int tempFirstIndex = tempFirstStart;
				int tempSecondIndex = tempSecondStart;
				int tempCurrentIndex = tempFirstStart;
				
				if (tempSecondStart >= tempLength) {
					for (int j = tempFirstIndex; j < tempLength; j++) {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
						tempFirstIndex ++;
						tempCurrentIndex ++;
					}//of for j
					break;
				}//of if
				
				while ((tempFirstIndex <= tempSecondStart - 1) && (tempSecondIndex <= tempSecondEnd)) {
					if (paraArray[resultMatrix[tempIndex % 2][tempFirstIndex]] <= paraArray[resultMatrix[tempIndex % 2][tempSecondIndex]]) {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempSecondIndex];
						tempSecondIndex ++;
					}else {
						resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][tempFirstIndex];
						tempFirstIndex ++;
					}//of if
					tempCurrentIndex ++;
				}//of while
				
				// Remaining part
				for (int j = tempFirstIndex; j < tempSecondStart; j++) {
					resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
					tempCurrentIndex++;
				}//of for j
				
				for (int j = tempSecondIndex; j <= tempSecondEnd; j++) {
					resultMatrix[(tempIndex + 1) % 2][tempCurrentIndex] = resultMatrix[tempIndex % 2][j];
					tempCurrentIndex++;
				}//of for j
			}//of for i
			
			tempCurrentLength *= 2;
			tempIndex ++;     //交替使用两个数组为当前组，另一个数组用于新一轮排序时复制索引号。
		}//of while
		
		return resultMatrix[tempIndex % 2];
	}//of mergeSortToIndices
	
	/**
	 * **********************************************************************
	 * The Euclidean distance between two instances. Other distance measures
	 * unsupported for simplicity.
	 * 
	 * @param paraI   The index of the first instance.
	 * @param paraJ   The index of the second instance.
	 * @return   The distance.
	 * **********************************************************************
	 */
	public double distance(int paraI, int paraJ) {
		double resultDistance = 0;
		double tempDifference;
		
		for (int i = 0; i < dataset.numAttributes() - 1; i++) {
			tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
			resultDistance += tempDifference * tempDifference;
		}//of for i
		
		resultDistance = Math.sqrt(resultDistance);
		
		return resultDistance;
	}//of distance
	
	/**
	 * *****************************************************************
	 * Compute the maximal distance. The result is stored in a member variable.
	 * *****************************************************************
	 */
	public void computeMaximalDistance() {
		maximalDistance = 0;
		double tempDistance;
		for (int i = 0; i < dataset.numInstances(); i++) {
			for (int j = 0; j < dataset.numInstances(); j++) {
				tempDistance = distance(i, j);
				if (maximalDistance < tempDistance) {
					maximalDistance = tempDistance;
				}//of if
			}//of for j
		}//of for i
		
		System.out.println("maximalDistance = " + maximalDistance);
	}//of computeMaximalDistance
	
	/**
	 * ****************************************************************
	 * Compute the densities using Gaussian kernel.
	 * 这里不同于原文的密度峰值聚类，这里用的高斯核函数计算实例的密度。密度峰值是设定一个距离阈值，在距离范围内的实例个数就是当前实例的密度。
	 * 
	 * @param paraBlock    The given block.
	 * ****************************************************************
	 */
	public void computeDensitiesGaussian() {
		System.out.println("radius = " + radius);
		densities = new double[dataset.numInstances()];
		double tempDistance;
		
		for (int i = 0; i < dataset.numInstances(); i++) {
			for (int j = 0; j < dataset.numInstances(); j++) {
				tempDistance = distance(i, j);
				densities[i] += Math.exp(-tempDistance * tempDistance /(radius * radius));
			}//of for j
		}//of for i
		
		System.out.println("The densities are " + Arrays.toString(densities) + "\r\n");
	}//of computeDensitiesGaussian
	
	/**
	 * ***************************************************************
	 * Compute distanceToMaster, the distance to its master.
	 * ***************************************************************
	 */
	public void computeDistanceToMaster() {
		distanceToMaster = new double[dataset.numInstances()];
		masters = new int[dataset.numInstances()];
		descendantDensities = new int[dataset.numInstances()];
		instanceStatusArray = new int[dataset.numInstances()];
		
		descendantDensities = mergeSortToIndices(densities);
		distanceToMaster[descendantDensities[0]] = maximalDistance;
		
		double tempDistance;
		for (int i = 1; i < dataset.numInstances(); i++) {
			// Initialize.
			distanceToMaster[descendantDensities[i]] = maximalDistance;
			//只有密度比自己大的才可能是自己的Master，所以排序在i以后实例不用计算。
			for (int j = 0; j <= i - 1; j++) {
				tempDistance = distance(descendantDensities[i], descendantDensities[j]);
				if (distanceToMaster[descendantDensities[i]] > tempDistance) {
					distanceToMaster[descendantDensities[i]] = tempDistance;
					masters[descendantDensities[i]] = descendantDensities[j];
				}//of if
			}//of for j
		}//of for i
		System.out.println("First compute, masters = " + Arrays.toString(masters));
		System.out.println("descendantDensities = " + Arrays.toString(descendantDensities));
	}//of computeDistanceToMaster
	
	/**
	 * ****************************************************************
	 * Compute priority. Element with higher priority is more likely to be
	 * selected as a cluster center. Now it is rho * distanceToMaster. It can
	 * also be rho^alpha * distanceToMaster.
	 * ****************************************************************
	 */
	public void computePriority() {
		priority = new double[dataset.numInstances()];
		for (int i = 0; i < dataset.numInstances(); i++) {
			priority[i] = densities[i] * distanceToMaster[i];
		}//of for i
	}//of computePriority
	
	/**
	 * *******************************************************************
	 * The block of a node should be same as its master. This recursive method is efficient.
	 * @param paraIndex  The index of the given node.
	 * @return  The cluster index of the current node.
	 * *******************************************************************
	 */
	public int coincideWithMaster(int paraIndex) {
		if (clusterIndices[paraIndex] == -1) {
			int tempMaster = masters[paraIndex];
			clusterIndices[paraIndex] = coincideWithMaster(tempMaster);
		}//of if
		
		return clusterIndices[paraIndex];
	}//of coincideWithMaster
	
	/**
	 * *********************************************************************
	 * Cluster a block in two. According to the master tree.
	 * 
	 * @param paraBlock  The given block.
	 * @return  The new blocks where the two most represent instances serve as the root.
	 * *********************************************************************
	 */
	public int[][] clusterInTwo(int[] paraBlock) {
		// Reinitialize. In fact, only instances in the given block is considered.
		Arrays.fill(clusterIndices, -1);
		
		// Initialize the cluster number of the two roots.
		//这里把数组paraBlock的前两个元素存储的序号，所对应的簇标签分别设置为0和1.
		//paraBlock的前两个序号对应的原始数据集的实例标签如果相同，这里有没有影响？？？
		for (int i = 0; i < 2; i++) {
			clusterIndices[paraBlock[i]] = i;
		}//of for i
		
		for (int i = 0; i < paraBlock.length; i++) {
			if (clusterIndices[paraBlock[i]] != -1) {
				continue;
			}//of if
			
			//i实例和自己的Master具有相同的类标签。
			clusterIndices[paraBlock[i]] = coincideWithMaster(masters[paraBlock[i]]);
		}//of for i
		
		//The sub blocks.
		int[][] resultBlock = new int[2][];
		int tempFirstBlockCount = 0;
		//长度是clusterIndices.length，此时没在paraBlock中的实例，上面代码已经填充了-1.
		//只有paraBlock[i]才有数据。如果这里长度设置为paraBlock.length，下面判断条件应该是clusterIndices[paraBlock[i]] == 0。
		//否则i无法将所有实例全部遍历。
		for (int i = 0; i <clusterIndices.length; i++) {
			if (clusterIndices[i] == 0) {
				tempFirstBlockCount++;
			}//of if
		}//of for i
		resultBlock[0] = new int[tempFirstBlockCount];
		resultBlock[1] = new int[paraBlock.length - tempFirstBlockCount];
		
		// Copy. You can design shorter code when the number of clusters is greater than 2.
		int tempFirstIndex = 0;
		int tempSecondIndex = 0;
		for (int i = 0; i < paraBlock.length; i++) {
			if (clusterIndices[paraBlock[i]] == 0) {
				resultBlock[0][tempFirstIndex] = paraBlock[i];
				tempFirstIndex ++;
			} else {
				resultBlock[1][tempSecondIndex] = paraBlock[i];
				tempSecondIndex ++;
			}//of if
		}//of for i
		
		System.out.println("Split (" + paraBlock.length + ") instances "
				+ Arrays.toString(paraBlock) + "\r\nto (" + resultBlock[0].length + ") instances "
				+ Arrays.toString(resultBlock[0]) + "\r\nand (" + resultBlock[1].length
				+ ") instances " + Arrays.toString(resultBlock[1]));
		
		return resultBlock;
	}//of clusterInTwo
	
	/**
	 * **************************************************************
	 * Classify instances in the block by simple voting.
	 * 
	 * @param paraBlock   The given block.
	 * **************************************************************
	 */
	public void vote(int[] paraBlock) {
		int[] tempClassCounts = new int[dataset.numClasses()];
		
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 1) {
				//"1"代表可查询标签的实例。
				tempClassCounts[(int)dataset.instance(paraBlock[i]).classValue()]++;
			}//of if
		}//of for i
		
		int tempMaxClass = -1;
		int tempMaxCount = -1;
		
		for (int i = 0; i < tempClassCounts.length; i++) {
			if (tempMaxCount < tempClassCounts[i]) {
				tempMaxCount = tempClassCounts[i];
				tempMaxClass = i;
			}//of if
		}//of for i
		
		// Classify unprocessed instances.
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 0) {
				predictedLabels[paraBlock[i]] = tempMaxClass;
				instanceStatusArray[paraBlock[i]] = 2;
			}//of if
		}//of for i
	}//of vote
	
	/**
	 * *****************************************************************************************************
	 * Cluster based active learning. Prepare for
	 * 
	 * @param paraRatio   The ratio of the maximal distance as the dc.
	 * @param paraMaxNumQuery    The maximal number of queries for the whole dataset.
	 * @param paraSmallBlockThreshold    The small block threshold.
	 * *****************************************************************************************************
	 */
	public void clusterBasedActiveLearning(double paraRatio, int paraMaxNumQuery, int paraSmallBlockThreshold) {
		radius = maximalDistance * paraRatio;
		smallBlockThreshold = paraSmallBlockThreshold;
		
		maxNumQuery = paraMaxNumQuery;
		predictedLabels = new int[dataset.numInstances()];
		
		for (int i = 0; i < dataset.numInstances(); i++) {
			predictedLabels[i] = -1;
		}//of for i
		
		computeDensitiesGaussian();
		computeDistanceToMaster();
		computePriority();
		descendantRepresentatives = mergeSortToIndices(priority);
		System.out.println("descendantRepresentatives = " + Arrays.toString(descendantRepresentatives));
		
		numQuery = 0;
		
		clusterBasedActiveLearning(descendantRepresentatives);
	}//of clusterBasedActiveLearning
	
	/**
	 * *******************************************************************************************************************
	 * Cluster based active learning.
	 * 
	 * @param paraBlock  The given block. This block must be sorted according to the priority in descendant order.
	 * *******************************************************************************************************************
	 */
	public void clusterBasedActiveLearning(int[] paraBlock) {
		System.out.println("clusterBasedActiveLearning for block " + Arrays.toString(paraBlock));
		
		// Step 1. How many labels are queried for this block.
		int tempExpectedQueries = (int)Math.sqrt(paraBlock.length);
		int tempNumQuery = 0;
		
		for (int i = 0; i < paraBlock.length; i++) {
			if (instanceStatusArray[paraBlock[i]] == 1) {
				tempNumQuery ++;
			}//of if
		}//of for i
		
		// Step 2. Vote for small blocks.
		if ((tempNumQuery >= tempExpectedQueries) && (paraBlock.length <= smallBlockThreshold)) {
			System.out.println("" + tempNumQuery + " instances are queried, vote for block: \r\n"
					+ Arrays.toString(paraBlock));
			vote(paraBlock);
			return;
		}//of if
		
		// Step 3. Query enough labels.
		for (int i = 0; i < tempExpectedQueries; i++) {
			if (numQuery >= maxNumQuery) {
				System.out.println("No more queries are provided, numQuery = " + numQuery + ".");
				vote(paraBlock);
				return;
			}//of if
			
			if (instanceStatusArray[paraBlock[i]] == 0) {
				instanceStatusArray[paraBlock[i]] = 1;
				predictedLabels[paraBlock[i]] = (int)dataset.instance(paraBlock[i]).classValue();
				numQuery ++;
			}//of if
		}//of for i
		
		//Step 4. Pure?
		int tempFirstLabel = predictedLabels[paraBlock[0]];
		boolean tempPure = true;
		for (int i = 1; i < tempExpectedQueries; i++) {
			if (predictedLabels[paraBlock[i]] != tempFirstLabel) {
				tempPure = false;
				break;
			}//of if
		}//of for i
		
		if (tempPure) {
			System.out.println("Classify for pure block: " + Arrays.toString(paraBlock));
			for (int i = tempExpectedQueries; i < paraBlock.length; i++) {
				if (instanceStatusArray[paraBlock[i]] == 0) {
					instanceStatusArray[paraBlock[i]] = 2;
					predictedLabels[paraBlock[i]] = tempFirstLabel;
				}//of if
				
			}//of for i
			return;
		}//of if
		
		// Step 5. Split in two and process them independently.
		int[][] tempBlocks = clusterInTwo(paraBlock);
		for (int i = 0; i < 2; i++) {
			clusterBasedActiveLearning(tempBlocks[i]);
		}//of for i
	}//of clusterBasedActiveLearning
	
	/**
	 ****************************************************** 
	 * Show the statistics information.
	 ******************************************************
	 */
	public String toString() {
		int[] tempStatusCounts = new int[3];
		double tempCorrect = 0;
		
		for (int i = 0; i < dataset.numInstances(); i++) {
			tempStatusCounts[instanceStatusArray[i]]++;
			if (predictedLabels[i] == (int) dataset.instance(i).classValue()) {
				tempCorrect ++;
			}//of if
		}//of for i
		
		String resultString = "(unhandled, queried, classified) = " + Arrays.toString(tempStatusCounts);
		
		resultString += "\r\nCorrect = " + tempCorrect + ", accuracy = " + (tempCorrect / dataset.numInstances());
		
		return resultString;
	}//ofr toString
	
	/**
	 * ********************************************************************
	 * The entrance of the program.
	 * 
	 * @param args
	 * ********************************************************************
	 */
	public static void main(String args[]) {
		long tempStart = System.currentTimeMillis();
		
		System.out.println("Starting ALEC.");
		String arffFileName = "E:/Datasets/UCIdatasets/其他数据集/iris.arff";
		Alec tempAlec = new Alec(arffFileName);
		
		// The settings for iris
		tempAlec.clusterBasedActiveLearning(0.15, 30, 3);
		System.out.println(tempAlec);
		
		long tempEnd = System.currentTimeMillis();
		System.out.println("Runtime: " + (tempEnd - tempStart) + "ms.");
	}//of main
	
}//of Alec