/** * Recursively splits nodes of a tree starting from the supplied node. * The splitting stops for any node for which the number of instances/points * falls below a given threshold (given by m_MaxInstInLeaf), or if the * maximum relative width/range of the instances/points * (i.e. max_i(max(att_i) - min(att_i)) ) falls below a given threshold * (given by m_MinBoxRelWidth). * * @param node The node to start splitting from. * @param universe The attribute ranges of the whole dataset. * @param depth The depth of the supplied node. * @throws Exception If there is some problem * splitting. */ protected void splitNodes(KDTreeNode node, double[][] universe, int depth) throws Exception { double[][] nodeRanges = m_EuclideanDistance.initializeRanges(m_InstList, node.m_Start, node.m_End); if (node.numInstances() <= m_MaxInstInLeaf || getMaxRelativeNodeWidth(nodeRanges, universe) <= m_MinBoxRelWidth) return; // splitting a node so it is no longer a leaf m_NumLeaves--; if (depth > m_MaxDepth) m_MaxDepth = depth; m_Splitter.splitNode(node, m_NumNodes, nodeRanges, universe); m_NumNodes += 2; m_NumLeaves += 2; splitNodes(node.m_Left, universe, depth + 1); splitNodes(node.m_Right, universe, depth + 1); }
/** * Assigns instances to the current centers called candidates. * * @param node The node to start assigning the instances from. * @param centers all the current centers. * @param candidates the current centers the method works on. * @param assignments the center index for each instance. * @param pc the threshold value for pruning. * @throws Exception If there is some problem assigning * instances to centers. */ protected void determineAssignments(KDTreeNode node, Instances centers, int[] candidates, int[] assignments, double pc) throws Exception { // reduce number of owners for current hyper rectangle int[] owners = refineOwners(node, centers, candidates); // only one owner if (owners.length == 1) { // all instances of this node are owned by one center for (int i = node.m_Start; i <= node.m_End; i++) { assignments[m_InstList[i]] // the assignment of this instance = owners[0]; // is the current owner } } else if (!node.isALeaf()) { // more than one owner and it is not a leaf determineAssignments(node.m_Left, centers, owners, assignments, pc); determineAssignments(node.m_Right, centers, owners, assignments, pc); } else { // this is a leaf and there are more than 1 owner // XMeans. assignSubToCenters(node, centers, owners, assignments); } }
/** * Finds the closest point in the hyper rectangle to a given point. Change the * given point to this closest point by clipping of at all the dimensions to * be clipped of. If the point is inside the rectangle it stays unchanged. The * return value is true if the point was not changed, so the the return value * is true if the point was inside the rectangle. * * @param node The current KDTreeNode in whose hyperrectangle the closest * point is to be found. * @param x a point * @return true if the input point stayed unchanged. */ protected boolean clipToInsideHrect(KDTreeNode node, Instance x) { boolean inside = true; for (int i = 0; i < m_Instances.numAttributes(); i++) { // TODO treat nominals differently!?? if (x.value(i) < node.m_NodeRanges[i][MIN]) { x.setValue(i, node.m_NodeRanges[i][MIN]); inside = false; } else if (x.value(i) > node.m_NodeRanges[i][MAX]) { x.setValue(i, node.m_NodeRanges[i][MAX]); inside = false; } } return inside; }
/** * Assigns instances of this node to center. Center to be assign to is decided * by the distance function. * * @param node The KDTreeNode whose instances are to be assigned. * @param centers all the input centers * @param centList the list of centers to work with * @param assignments index list of last assignments * @throws Exception If there is error assigning the instances. */ public void assignSubToCenters(KDTreeNode node, Instances centers, int[] centList, int[] assignments) throws Exception { // todo: undecided situations // WARNING: assignments is "input/output-parameter" // should not be null and the following should not happen if (assignments == null) { assignments = new int[m_Instances.numInstances()]; for (int i = 0; i < assignments.length; i++) { assignments[i] = -1; } } // set assignments for all instances of this node for (int i = node.m_Start; i <= node.m_End; i++) { int instIndex = m_InstList[i]; Instance inst = m_Instances.instance(instIndex); // if (instList[i] == 664) System.out.println("664***"); int newC = m_EuclideanDistance.closestPoint(inst, centers, centList); // int newC = clusterProcessedInstance(inst, centers); assignments[instIndex] = newC; } }
/** * Assigns instances of this node to center. Center to be assign to is decided * by the distance function. * * @param node The KDTreeNode whose instances are to be assigned. * @param centers all the input centers * @param centList the list of centers to work with * @param assignments index list of last assignments * @throws Exception If there is error assigning the instances. */ public void assignSubToCenters(KDTreeNode node, Instances centers, int[] centList, int[] assignments) throws Exception { // todo: undecided situations int numCent = centList.length; // WARNING: assignments is "input/output-parameter" // should not be null and the following should not happen if (assignments == null) { assignments = new int[m_Instances.numInstances()]; for (int i = 0; i < assignments.length; i++) { assignments[i] = -1; } } // set assignments for all instances of this node for (int i = node.m_Start; i <= node.m_End; i++) { int instIndex = m_InstList[i]; Instance inst = m_Instances.instance(instIndex); // if (instList[i] == 664) System.out.println("664***"); int newC = m_EuclideanDistance.closestPoint(inst, centers, centList); // int newC = clusterProcessedInstance(inst, centers); assignments[instIndex] = newC; } }
/** * Assigns instances of this node to center. Center to be assign to is decided * by the distance function. * * @param node The KDTreeNode whose instances are to be assigned. * @param centers all the input centers * @param centList the list of centers to work with * @param assignments index list of last assignments * @throws Exception If there is error assigning the instances. */ public void assignSubToCenters(KDTreeNode node, Instances centers, int[] centList, int[] assignments) throws Exception { // todo: undecided situations int numCent = centList.length; // WARNING: assignments is "input/output-parameter" // should not be null and the following should not happen if (assignments == null) { assignments = new int[m_Instances.numInstances()]; for (int i = 0; i < assignments.length; i++) { assignments[i] = -1; } } // set assignments for all instances of this node for (int i = node.m_Start; i <= node.m_End; i++) { int instIndex = m_InstList[i]; Instance inst = m_Instances.instance(instIndex); // if (instList[i] == 664) System.out.println(Thread.currentThread().getStackTrace()[1].getClassName() +"664***"); int newC = m_EuclideanDistance.closestPoint(inst, centers, centList); // int newC = clusterProcessedInstance(inst, centers); assignments[instIndex] = newC; } }
/** * Builds the KDTree on the supplied set of instances/points. It * is adviseable to run the replace missing attributes filter * on the passed instances first. * NOTE: This method should not be called from outside this * class. Outside classes should call setInstances(Instances) * instead. * * @param instances The instances to build the tree on * @throws Exception if something goes wrong */ protected void buildKDTree(Instances instances) throws Exception { checkMissing(instances); if (m_EuclideanDistance == null) m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance( instances); else m_EuclideanDistance.setInstances(instances); m_Instances = instances; int numInst = m_Instances.numInstances(); // Make the global index list m_InstList = new int[numInst]; for (int i = 0; i < numInst; i++) { m_InstList[i] = i; } double[][] universe = m_EuclideanDistance.getRanges(); // initializing internal fields of KDTreeSplitter m_Splitter.setInstances(m_Instances); m_Splitter.setInstanceList(m_InstList); m_Splitter.setEuclideanDistanceFunction(m_EuclideanDistance); m_Splitter.setNodeWidthNormalization(m_NormalizeNodeWidth); // building tree m_NumNodes = m_NumLeaves = 1; m_MaxDepth = 0; m_Root = new KDTreeNode(m_NumNodes, 0, m_Instances.numInstances() - 1, universe); splitNodes(m_Root, universe, m_MaxDepth + 1); }
/** * Returns the distance between a point and an hyperrectangle. * * @param node The current node from whose hyperrectangle * the distance is to be measured. * @param x the point * @return the distance * @throws Exception If some problem occurs in determining * the distance to the hyperrectangle. */ protected double distanceToHrect(KDTreeNode node, Instance x) throws Exception { double distance = 0.0; Instance closestPoint = (Instance)x.copy(); boolean inside; inside = clipToInsideHrect(node, closestPoint); if (!inside) distance = m_EuclideanDistance.distance(closestPoint, x); return distance; }
/** * Returns the distance between a point and an hyperrectangle. * * @param node The current node from whose hyperrectangle * the distance is to be measured. * @param x the point * @return the distance * @throws Exception If some problem occurs in determining * the distance to the hyperrectangle. */ protected double distanceToHrect(KDTreeNode node, Instance x) throws Exception { double distance = 0.0; Instance closestPoint = new Instance(x); boolean inside; inside = clipToInsideHrect(node, closestPoint); if (!inside) distance = m_EuclideanDistance.distance(closestPoint, x); return distance; }
/** * Recursively adds an instance to the tree starting from * the supplied KDTreeNode. * NOTE: This should not be called by outside classes, * outside classes should instead call update(Instance) * method. * * @param inst The instance to add to the tree * @param node The node to start the recursive search * from, for the leaf node where the supplied instance * would go. * @throws Exception If some error occurs while adding * the instance. */ protected void addInstanceToTree(Instance inst, KDTreeNode node) throws Exception { if (node.isALeaf()) { int instList[] = new int[m_Instances.numInstances()]; try { System.arraycopy(m_InstList, 0, instList, 0, node.m_End + 1); // m_InstList.squeezeIn(m_End, // index); if (node.m_End < m_InstList.length - 1) System.arraycopy(m_InstList, node.m_End + 1, instList, node.m_End + 2, m_InstList.length - node.m_End - 1); instList[node.m_End + 1] = m_Instances.numInstances() - 1; } catch (ArrayIndexOutOfBoundsException ex) { System.err.println("m_InstList.length: " + m_InstList.length + " instList.length: " + instList.length + "node.m_End+1: " + (node.m_End + 1) + "m_InstList.length-node.m_End+1: " + (m_InstList.length - node.m_End - 1)); throw ex; } m_InstList = instList; node.m_End++; node.m_NodeRanges = m_EuclideanDistance.updateRanges(inst, node.m_NodeRanges); m_Splitter.setInstanceList(m_InstList); // split this leaf node if necessary double[][] universe = m_EuclideanDistance.getRanges(); if (node.numInstances() > m_MaxInstInLeaf && getMaxRelativeNodeWidth(node.m_NodeRanges, universe) > m_MinBoxRelWidth) { m_Splitter.splitNode(node, m_NumNodes, node.m_NodeRanges, universe); m_NumNodes += 2; } }// end if node is a leaf else { if (m_EuclideanDistance.valueIsSmallerEqual(inst, node.m_SplitDim, node.m_SplitValue)) { addInstanceToTree(inst, node.m_Left); afterAddInstance(node.m_Right); } else addInstanceToTree(inst, node.m_Right); node.m_End++; node.m_NodeRanges = m_EuclideanDistance.updateRanges(inst, node.m_NodeRanges); } }
/** * Refines the ownerlist. * * @param node The current tree node. * @param centers all centers * @param candidates the indexes of those centers that are candidates. * @return list of owners * @throws Exception If some problem occurs in refining. */ protected int[] refineOwners(KDTreeNode node, Instances centers, int[] candidates) throws Exception { int[] owners = new int[candidates.length]; double minDistance = Double.POSITIVE_INFINITY; int ownerIndex = -1; Instance owner; int numCand = candidates.length; double[] distance = new double[numCand]; boolean[] inside = new boolean[numCand]; for (int i = 0; i < numCand; i++) { distance[i] = distanceToHrect(node, centers.instance(candidates[i])); inside[i] = (distance[i] == 0.0); if (distance[i] < minDistance) { minDistance = distance[i]; ownerIndex = i; } } owner = (Instance)centers.instance(candidates[ownerIndex]).copy(); // are there other owners // loop also goes over already found owner, keeps order // in owner list int index = 0; for (int i = 0; i < numCand; i++) { // 1. all centers that are points within rectangle are owners if ((inside[i]) // 2. take all points with same distance to the rect. as the owner || (distance[i] == distance[ownerIndex])) { // add competitor to owners list owners[index++] = candidates[i]; } else { Instance competitor = (Instance)centers.instance(candidates[i]).copy(); if // 3. point has larger distance to rectangle but still can compete // with owner for some points in the rectangle (!candidateIsFullOwner(node, owner, competitor)) { // also add competitor to owners list owners[index++] = candidates[i]; } } } int[] result = new int[index]; for (int i = 0; i < index; i++) result[i] = owners[i]; return result; }
/** * Returns true if candidate is a full owner in respect to a competitor. * <p> * * The candidate has been the closer point to the current rectangle or even * has been a point within the rectangle. The competitor is competing with the * candidate for a few points out of the rectangle although it is a point * further away from the rectangle then the candidate. The extrem point is the * corner of the rectangle that is furthest away from the candidate towards * the direction of the competitor. * * If the distance candidate to this extreme point is smaller then the * distance competitor to this extreme point, then it is proven that none of * the points in the rectangle can be owned be the competitor and the * candidate is full owner of the rectangle in respect to this competitor. See * also D. Pelleg and A. Moore's paper 'Accelerating exact k-means Algorithms * with Geometric Reasoning'. * <p> * * @param node The current KDTreeNode / hyperrectangle. * @param candidate instance that is candidate to be owner * @param competitor instance that competes against the candidate * @return true if candidate is full owner * @throws Exception If some problem occurs. */ protected boolean candidateIsFullOwner(KDTreeNode node, Instance candidate, Instance competitor) throws Exception { // get extreme point Instance extreme = (Instance)candidate.copy(); for (int i = 0; i < m_Instances.numAttributes(); i++) { if ((competitor.value(i) - candidate.value(i)) > 0) { extreme.setValue(i, node.m_NodeRanges[i][MAX]); } else { extreme.setValue(i, node.m_NodeRanges[i][MIN]); } } boolean isFullOwner = m_EuclideanDistance.distance(extreme, candidate) < m_EuclideanDistance .distance(extreme, competitor); return isFullOwner; }
/** * Refines the ownerlist. * * @param node The current tree node. * @param centers all centers * @param candidates the indexes of those centers that are candidates. * @return list of owners * @throws Exception If some problem occurs in refining. */ protected int[] refineOwners(KDTreeNode node, Instances centers, int[] candidates) throws Exception { int[] owners = new int[candidates.length]; double minDistance = Double.POSITIVE_INFINITY; int ownerIndex = -1; Instance owner; int numCand = candidates.length; double[] distance = new double[numCand]; boolean[] inside = new boolean[numCand]; for (int i = 0; i < numCand; i++) { distance[i] = distanceToHrect(node, centers.instance(candidates[i])); inside[i] = (distance[i] == 0.0); if (distance[i] < minDistance) { minDistance = distance[i]; ownerIndex = i; } } owner = new Instance(centers.instance(candidates[ownerIndex])); // are there other owners // loop also goes over already found owner, keeps order // in owner list int index = 0; for (int i = 0; i < numCand; i++) { // 1. all centers that are points within rectangle are owners if ((inside[i]) // 2. take all points with same distance to the rect. as the owner || (distance[i] == distance[ownerIndex])) { // add competitor to owners list owners[index++] = candidates[i]; } else { Instance competitor = new Instance(centers.instance(candidates[i])); if // 3. point has larger distance to rectangle but still can compete // with owner for some points in the rectangle (!candidateIsFullOwner(node, owner, competitor)) { // also add competitor to owners list owners[index++] = candidates[i]; } } } int[] result = new int[index]; for (int i = 0; i < index; i++) result[i] = owners[i]; return result; }
/** * Returns true if candidate is a full owner in respect to a competitor. * <p> * * The candidate has been the closer point to the current rectangle or even * has been a point within the rectangle. The competitor is competing with the * candidate for a few points out of the rectangle although it is a point * further away from the rectangle then the candidate. The extrem point is the * corner of the rectangle that is furthest away from the candidate towards * the direction of the competitor. * * If the distance candidate to this extreme point is smaller then the * distance competitor to this extreme point, then it is proven that none of * the points in the rectangle can be owned be the competitor and the * candidate is full owner of the rectangle in respect to this competitor. See * also D. Pelleg and A. Moore's paper 'Accelerating exact k-means Algorithms * with Geometric Reasoning'. * <p> * * @param node The current KDTreeNode / hyperrectangle. * @param candidate instance that is candidate to be owner * @param competitor instance that competes against the candidate * @return true if candidate is full owner * @throws Exception If some problem occurs. */ protected boolean candidateIsFullOwner(KDTreeNode node, Instance candidate, Instance competitor) throws Exception { // get extreme point Instance extreme = new Instance(candidate); for (int i = 0; i < m_Instances.numAttributes(); i++) { if ((competitor.value(i) - candidate.value(i)) > 0) { extreme.setValue(i, node.m_NodeRanges[i][MAX]); } else { extreme.setValue(i, node.m_NodeRanges[i][MIN]); } } boolean isFullOwner = m_EuclideanDistance.distance(extreme, candidate) < m_EuclideanDistance .distance(extreme, competitor); return isFullOwner; }
/** * Corrects the start and end indices of a * KDTreeNode after an instance is added to * the tree. The start and end indices for * the master index array (m_InstList) * stored in the nodes need to be updated * for all nodes in the subtree on the * right of a node where the instance * was added. * NOTE: No outside class should call this * method. * * @param node KDTreeNode whose start and end indices * need to be updated. */ protected void afterAddInstance(KDTreeNode node) { node.m_Start++; node.m_End++; if (!node.isALeaf()) { afterAddInstance(node.m_Left); afterAddInstance(node.m_Right); } }