/**
 * Set the dummy priming data (two-instance dataset that contains global
 * min/max for numeric attributes) for the distance function to use when
 * normalizing numeric attributes. This method should be called when filters
 * that transform the data are being used, and *after* the first iteration of
 * k-means has completed. At this point, the reduce task can compute global
 * min/max for transformed attributes using the partial summary metadata for
 * the clusters computed in the first iteration.
 *
 * @param priming the dummy priming data to use in the distance function
 * @throws DistributedWekaException if init() has not been called yet, or if
 *           the distance function cannot be set
 */
public void setDummyDistancePrimingData(Instances priming)
  throws DistributedWekaException {
  if (m_kMeans == null) {
    throw new DistributedWekaException("Must call init() first");
  }

  m_distanceFunction = new EuclideanDistance();
  m_distanceFunction.setInstances(priming);
  try {
    m_kMeans.setDistanceFunction(m_distanceFunction);
    m_updateDistanceFunction = false;
  } catch (Exception ex) {
    throw new DistributedWekaException(ex);
  }
}
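/*
 * A minimal sketch (not taken from the source) of how the two-instance priming dataset
 * described above could be assembled: the first instance holds the global minimum and the
 * second the global maximum of each numeric attribute, so a NormalizableDistance primed
 * with it normalizes against the global ranges. The attribute names, min/max values and the
 * 'task' reference owning setDummyDistancePrimingData() are illustrative assumptions.
 * Assumes imports of java.util.ArrayList and weka.core.{Attribute, DenseInstance, Instances},
 * inside a method that declares throws DistributedWekaException.
 */
ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(new Attribute("x1"));
atts.add(new Attribute("x2"));
Instances priming = new Instances("primingData", atts, 2);
priming.add(new DenseInstance(1.0, new double[] { 0.0, -3.5 }));  // per-attribute minima
priming.add(new DenseInstance(1.0, new double[] { 10.0, 7.25 })); // per-attribute maxima
task.setDummyDistancePrimingData(priming); // 'task' = hypothetical owner of the method above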
/**
 * Builds the BallTree on the supplied set of instances/points (provided
 * through the setInstances(Instances) method and referenced by the
 * m_Instances field). This method should not be called by outside classes;
 * they should use the setInstances(Instances) method instead.
 *
 * @throws Exception if no instances have been supplied (m_Instances is
 *           null), or if the supplied BallTreeConstructor encounters some
 *           other error while building the tree.
 */
protected void buildTree() throws Exception {
  if (m_Instances == null) {
    throw new Exception("No instances supplied yet. Have to call "
      + "setInstances(instances) with a set of Instances first.");
  }

  m_InstList = new int[m_Instances.numInstances()];
  for (int i = 0; i < m_InstList.length; i++) {
    m_InstList[i] = i;
  }

  m_DistanceFunction.setInstances(m_Instances);
  m_TreeConstructor.setInstances(m_Instances);
  m_TreeConstructor.setInstanceList(m_InstList);
  m_TreeConstructor.setEuclideanDistanceFunction(
    (EuclideanDistance) m_DistanceFunction);

  m_Root = m_TreeConstructor.buildTree();
}
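/*
 * A minimal usage sketch (not from the source) of the public entry point mentioned above:
 * setInstances(Instances) builds the tree, after which the inherited NearestNeighbourSearch
 * API can be queried. The ARFF path is an assumption; assumes imports of
 * weka.core.{Instance, Instances}, weka.core.converters.ConverterUtils.DataSource and
 * weka.core.neighboursearch.BallTree, inside a method that declares throws Exception.
 */
Instances data = DataSource.read("some-data.arff"); // assumed path
BallTree ballTree = new BallTree();
ballTree.setInstances(data);                         // builds the tree internally via buildTree()
Instance query = data.instance(0);
Instance nearest = ballTree.nearestNeighbour(query);
Instances fiveNearest = ballTree.kNearestNeighbours(query, 5);
System.out.println(nearest + "\n" + fiveNearest);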
/**
 * Parses a given list of options. Valid options are:
 *
 * <!-- options-start --> <!-- options-end -->
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception {
  String nnSearchClass = Utils.getOption('A', options);
  if (nnSearchClass.length() != 0) {
    String[] nnSearchClassSpec = Utils.splitOptions(nnSearchClass);
    if (nnSearchClassSpec.length == 0) {
      throw new Exception("Invalid DistanceFunction specification string.");
    }
    String className = nnSearchClassSpec[0];
    nnSearchClassSpec[0] = "";

    setDistanceFunction((DistanceFunction) Utils.forName(
      DistanceFunction.class, className, nnSearchClassSpec));
  } else {
    setDistanceFunction(new EuclideanDistance());
  }

  setMeasurePerformance(Utils.getFlag('P', options));
}
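/*
 * A minimal sketch (not from the source) of driving the option parsing above from a
 * command-line style string: -A supplies the DistanceFunction spec (class name plus its own
 * options) and -P turns on performance measurement. Shown here with LinearNNSearch as an
 * illustrative concrete search; assumes imports of weka.core.Utils and
 * weka.core.neighboursearch.LinearNNSearch, inside a method that declares throws Exception.
 */
LinearNNSearch search = new LinearNNSearch();
String[] options =
  Utils.splitOptions("-A \"weka.core.EuclideanDistance -R first-last\" -P");
search.setOptions(options);
System.out.println(search.getDistanceFunction().getClass().getName());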
/**
 * Builds the ball tree top down.
 *
 * @return The root node of the tree.
 * @throws Exception If there is a problem building the tree.
 */
@Override
public BallNode buildTree() throws Exception {
  BallNode root;

  m_NumNodes = m_MaxDepth = 0;
  m_NumLeaves = 1;

  m_Splitter.setInstances(m_Instances);
  m_Splitter.setInstanceList(m_InstList);
  m_Splitter
    .setEuclideanDistanceFunction((EuclideanDistance) m_DistanceFunction);

  root = new BallNode(0, m_InstList.length - 1, 0);
  root.setPivot(BallNode.calcCentroidPivot(m_InstList, m_Instances));
  root.setRadius(BallNode.calcRadius(m_InstList, m_Instances, root.getPivot(),
    m_DistanceFunction));

  splitNodes(root, m_MaxDepth + 1, root.m_Radius);

  return root;
}
/**
 * Clusters an instance that has been through the filters.
 *
 * @param instance the instance to assign a cluster to
 * @param updateErrors if true, update the within-clusters sum of errors
 * @param useFastDistCalc whether to use the fast distance calculation or not
 * @return a cluster number
 */
private int clusterProcessedInstance(Instance instance, boolean updateErrors,
  boolean useFastDistCalc) {
  double minDist = Integer.MAX_VALUE;
  int bestCluster = 0;
  for (int i = 0; i < m_NumClusters; i++) {
    double dist;
    if (useFastDistCalc) {
      dist = m_DistanceFunction.distance(instance,
        m_ClusterCentroids.instance(i), minDist);
    } else {
      dist = m_DistanceFunction.distance(instance,
        m_ClusterCentroids.instance(i));
    }
    if (dist < minDist) {
      minDist = dist;
      bestCluster = i;
    }
  }
  if (updateErrors) {
    if (m_DistanceFunction instanceof EuclideanDistance) {
      // convert Euclidean distance to squared Euclidean distance
      minDist *= minDist;
    }
    m_squaredErrors[bestCluster] += minDist;
  }
  return bestCluster;
}
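/*
 * A minimal sketch (not from the source) of the "fast" distance call used above: the
 * three-argument DistanceFunction.distance(first, second, cutOffValue) variant is allowed
 * to abandon the computation once the running distance exceeds the cutoff, so centroids
 * that cannot beat the current best are rejected cheaply. The tiny dataset is illustrative;
 * assumes imports of java.util.ArrayList and
 * weka.core.{Attribute, DenseInstance, EuclideanDistance, Instance, Instances}.
 */
ArrayList<Attribute> atts = new ArrayList<Attribute>();
atts.add(new Attribute("x"));
atts.add(new Attribute("y"));
Instances demo = new Instances("demo", atts, 3);
demo.add(new DenseInstance(1.0, new double[] { 0.0, 0.0 })); // query
demo.add(new DenseInstance(1.0, new double[] { 1.0, 1.0 })); // near centroid
demo.add(new DenseInstance(1.0, new double[] { 9.0, 9.0 })); // far centroid

EuclideanDistance distFn = new EuclideanDistance(demo);
Instance query = demo.instance(0);

double best = distFn.distance(query, demo.instance(1));      // full distance to the near centroid
double far = distFn.distance(query, demo.instance(2), best); // may stop early once it exceeds 'best'
System.out.println("best=" + best + ", far candidate=" + far); // 'far' only signals "worse than best"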
/**
 * Performs initialization of members.
 */
@Override
protected void initializeMembers() {
  super.initializeMembers();

  m_TrainsetNew = null;
  m_TestsetNew = null;

  m_Alpha = 0.99;
  m_Sigma = 1.0;
  m_Repeats = 0;
  m_SequenceLimit = SEQ_LIMIT_GRAPHKERNEL;
  m_filterType = SMO.FILTER_NORMALIZE;
  m_IncludeNumAttributes = true;

  m_MatrixY = null;
  m_MatrixW = null;
  m_MatrixD = null;
  m_MatrixS = null;
  m_MatrixFStar = null;
  m_Data = null;

  m_DistanceFunction = new EuclideanDistance();
}
/**
 * Parses a given list of options. Valid options are:
 *
 * <!-- options-start --> <!-- options-end -->
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
public void setOptions(String[] options) throws Exception {
  String nnSearchClass = Utils.getOption('A', options);
  if (nnSearchClass.length() != 0) {
    String[] nnSearchClassSpec = Utils.splitOptions(nnSearchClass);
    if (nnSearchClassSpec.length == 0) {
      throw new Exception("Invalid DistanceFunction specification string.");
    }
    String className = nnSearchClassSpec[0];
    nnSearchClassSpec[0] = "";

    setDistanceFunction((DistanceFunction) Utils.forName(
      DistanceFunction.class, className, nnSearchClassSpec));
  } else {
    setDistanceFunction(new EuclideanDistance());
  }

  setMeasurePerformance(Utils.getFlag('P', options));
}
/**
 * Builds the ball tree top down.
 *
 * @return The root node of the tree.
 * @throws Exception If there is a problem building the tree.
 */
public BallNode buildTree() throws Exception {
  BallNode root;

  m_NumNodes = m_MaxDepth = 0;
  m_NumLeaves = 1;

  m_Splitter.setInstances(m_Instances);
  m_Splitter.setInstanceList(m_InstList);
  m_Splitter
    .setEuclideanDistanceFunction((EuclideanDistance) m_DistanceFunction);

  root = new BallNode(0, m_InstList.length - 1, 0);
  root.setPivot(BallNode.calcCentroidPivot(m_InstList, m_Instances));
  root.setRadius(BallNode.calcRadius(m_InstList, m_Instances, root.getPivot(),
    m_DistanceFunction));

  splitNodes(root, m_MaxDepth + 1, root.m_Radius);

  return root;
}
/**
 * Clusters an instance that has been through the filters.
 *
 * @param instance the instance to assign a cluster to
 * @param updateErrors if true, update the within-clusters sum of errors
 * @return a cluster number
 */
private int clusterProcessedInstance(Instance instance, boolean updateErrors) {
  double minDist = Integer.MAX_VALUE;
  int bestCluster = 0;
  for (int i = 0; i < m_NumClusters; i++) {
    double dist = m_DistanceFunction.distance(instance,
      m_ClusterCentroids.instance(i));
    if (dist < minDist) {
      minDist = dist;
      bestCluster = i;
    }
  }
  if (updateErrors) {
    if (m_DistanceFunction instanceof EuclideanDistance) {
      // convert Euclidean distance to squared Euclidean distance
      minDist *= minDist;
    }
    m_squaredErrors[bestCluster] += minDist;
  }
  return bestCluster;
}
private IBk useCosine() {
  IBk ibk = new IBk();
  Instances data = ClassificationModel.getInstance().getInstances();
  Normalize normalizer = new Normalize();
  try {
    normalizer.setInputFormat(data);
    // Euclidean distance over normalized instances equals cosine similarity
    // ("Foundations of Statistical Natural Language Processing", p. 301),
    // as long as the distance function's own attribute normalization is disabled.
    Instances normalizedInstances = Filter.useFilter(data, normalizer);
    ClassificationModel.getInstance().setInstances(normalizedInstances);

    DistanceFunction df = new EuclideanDistance();
    ((EuclideanDistance) df).setDontNormalize(true);
    ibk.getNearestNeighbourSearchAlgorithm().setDistanceFunction(df);
  } catch (Exception e) {
    e.printStackTrace();
  }
  return ibk;
}
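/*
 * A quick, self-contained check (illustrative, not from the source) of the identity the
 * comment above relies on: for vectors scaled to unit length, squared Euclidean distance
 * equals 2 - 2 * cosine similarity, so ranking neighbours by Euclidean distance over
 * length-normalized vectors is equivalent to ranking them by cosine similarity.
 */
public class CosineEuclideanCheck {
  static double[] unit(double[] v) {
    double norm = 0;
    for (double x : v) norm += x * x;
    norm = Math.sqrt(norm);
    double[] u = new double[v.length];
    for (int i = 0; i < v.length; i++) u[i] = v[i] / norm;
    return u;
  }

  public static void main(String[] args) {
    double[] a = unit(new double[] { 3, 4 });
    double[] b = unit(new double[] { 5, 12 });
    double sqEuclid = 0, cosine = 0;
    for (int i = 0; i < a.length; i++) {
      sqEuclid += (a[i] - b[i]) * (a[i] - b[i]);
      cosine += a[i] * b[i];
    }
    // Both print the same value: ||a-b||^2 == 2 - 2*cos(a,b) for unit vectors.
    System.out.println(sqEuclid + " vs " + (2 - 2 * cosine));
  }
}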
/**
 * Sets the distance function to use for instance comparison.
 *
 * @param df the new distance function to use
 * @throws Exception if instances cannot be processed
 */
public void setDistanceFunction(DistanceFunction df) throws Exception {
  if (!(df instanceof EuclideanDistance)
    && !(df instanceof ManhattanDistance)) {
    throw new Exception(
      "SimpleKMeans currently only supports the Euclidean and Manhattan distances.");
  }
  m_DistanceFunction = df;
}
/**
 * Sets the distance function to use for instance comparison.
 *
 * @param df the new distance function to use
 * @throws Exception if instances cannot be processed
 */
public void setDistanceFunction(DistanceFunction df) throws Exception {
  if (!(df instanceof EuclideanDistance)
    && !(df instanceof ManhattanDistance)) {
    throw new Exception(
      "KMeansPlusPlus only supports the Euclidean and Manhattan distances.");
  }
  m_DistanceFunction = df;
}
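/*
 * A minimal usage sketch (not from the source) of configuring a k-means style clusterer
 * with one of the two supported distance functions, shown here with the standard
 * weka.clusterers.SimpleKMeans; with ManhattanDistance, SimpleKMeans uses per-attribute
 * medians rather than means for its centroids. The ARFF path is an assumption; assumes
 * imports of weka.clusterers.SimpleKMeans, weka.core.{Instances, ManhattanDistance} and
 * weka.core.converters.ConverterUtils.DataSource, inside a method that declares throws
 * Exception.
 */
Instances data = DataSource.read("some-data.arff"); // assumed path
SimpleKMeans kMeans = new SimpleKMeans();
kMeans.setNumClusters(3);
kMeans.setDistanceFunction(new ManhattanDistance()); // EuclideanDistance is the other option
kMeans.buildClusterer(data);
System.out.println(kMeans);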
/**
 * Builds the tree on the given set of instances. P.S.: For internal use only.
 * Outside classes should call setInstances().
 *
 * @param insts The instances on which to build the cover tree.
 * @throws Exception If the supplied set of Instances is empty, or if there
 *           are missing values.
 */
protected void buildCoverTree(Instances insts) throws Exception {
  if (insts.numInstances() == 0) {
    throw new Exception(
      "CoverTree: Empty set of instances. Cannot build tree.");
  }
  checkMissing(insts);

  if (m_EuclideanDistance == null) {
    m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance(insts);
  } else {
    m_EuclideanDistance.setInstances(insts);
  }

  Stack<DistanceNode> point_set = new Stack<DistanceNode>();
  Stack<DistanceNode> consumed_set = new Stack<DistanceNode>();

  Instance point_p = insts.instance(0);
  int p_idx = 0;
  double max_dist = -1, dist = 0.0;

  for (int i = 1; i < insts.numInstances(); i++) {
    DistanceNode temp = new DistanceNode();
    temp.dist = new Stack<Double>();
    dist = Math.sqrt(m_DistanceFunction.distance(point_p, insts.instance(i),
      Double.POSITIVE_INFINITY));
    if (dist > max_dist) {
      max_dist = dist; // track the distance to the point farthest from point_p
    }
    temp.dist.push(dist);
    temp.idx = i;
    point_set.push(temp);
  }

  max_dist = max_set(point_set);
  m_Root = batch_insert(p_idx, get_scale(max_dist), get_scale(max_dist),
    point_set, consumed_set);
}
/**
 * Sets the distance function to use for nearest neighbour search. Currently
 * only EuclideanDistance is supported.
 *
 * @param df the distance function to use
 * @throws Exception if not EuclideanDistance
 */
@Override
public void setDistanceFunction(DistanceFunction df) throws Exception {
  if (!(df instanceof EuclideanDistance)) {
    throw new Exception("CoverTree currently only works with "
      + "EuclideanDistanceFunction.");
  }
  m_DistanceFunction = m_EuclideanDistance = (EuclideanDistance) df;
}
/**
 * Builds the KDTree on the supplied set of instances/points. It is advisable
 * to run the ReplaceMissingValues filter on the passed instances first.
 * NOTE: This method should not be called from outside this class. Outside
 * classes should call setInstances(Instances) instead.
 *
 * @param instances The instances to build the tree on
 * @throws Exception if something goes wrong
 */
protected void buildKDTree(Instances instances) throws Exception {
  checkMissing(instances);
  if (m_EuclideanDistance == null) {
    m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance(instances);
  } else {
    m_EuclideanDistance.setInstances(instances);
  }

  m_Instances = instances;
  int numInst = m_Instances.numInstances();

  // make the global index list
  m_InstList = new int[numInst];
  for (int i = 0; i < numInst; i++) {
    m_InstList[i] = i;
  }

  double[][] universe = m_EuclideanDistance.getRanges();

  // initialize the internal fields of the KDTreeSplitter
  m_Splitter.setInstances(m_Instances);
  m_Splitter.setInstanceList(m_InstList);
  m_Splitter.setEuclideanDistanceFunction(m_EuclideanDistance);
  m_Splitter.setNodeWidthNormalization(m_NormalizeNodeWidth);

  // build the tree
  m_NumNodes = m_NumLeaves = 1;
  m_MaxDepth = 0;
  m_Root = new KDTreeNode(m_NumNodes, 0, m_Instances.numInstances() - 1,
    universe);

  splitNodes(m_Root, universe, m_MaxDepth + 1);
}
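/*
 * A minimal sketch (not from the source) of the preprocessing the comment above recommends:
 * running ReplaceMissingValues over the data before handing it to the KDTree. The ARFF path
 * is an assumption; assumes imports of weka.core.Instances,
 * weka.core.converters.ConverterUtils.DataSource, weka.core.neighboursearch.KDTree,
 * weka.filters.Filter and weka.filters.unsupervised.attribute.ReplaceMissingValues,
 * inside a method that declares throws Exception.
 */
Instances raw = DataSource.read("some-data.arff");  // assumed path
ReplaceMissingValues rmv = new ReplaceMissingValues();
rmv.setInputFormat(raw);
Instances filled = Filter.useFilter(raw, rmv);       // missing values replaced by means/modes
KDTree kdTree = new KDTree();
kdTree.setInstances(filled);                         // builds the tree internally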
/**
 * Sets the distance function to use for nearest neighbour search.
 *
 * @param df the distance function to use
 * @throws Exception if not EuclideanDistance
 */
public void setDistanceFunction(DistanceFunction df) throws Exception {
  if (!(df instanceof EuclideanDistance)) {
    throw new Exception("KDTree currently only works with "
      + "EuclideanDistanceFunction.");
  }
  m_DistanceFunction = m_EuclideanDistance = (EuclideanDistance) df;
}
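/*
 * A minimal sketch (not from the source) of plugging a KDTree into IBk; anything other than
 * EuclideanDistance would make the setter above throw. The ARFF path is an assumption;
 * assumes imports of weka.classifiers.lazy.IBk, weka.core.{EuclideanDistance, Instances},
 * weka.core.converters.ConverterUtils.DataSource and weka.core.neighboursearch.KDTree,
 * inside a method that declares throws Exception.
 */
Instances train = DataSource.read("training-data.arff"); // assumed path
train.setClassIndex(train.numAttributes() - 1);

KDTree kdTree = new KDTree();
kdTree.setDistanceFunction(new EuclideanDistance());      // the only accepted type
IBk knn = new IBk(3);
knn.setNearestNeighbourSearchAlgorithm(kdTree);
knn.buildClassifier(train);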
/**
 * Sets the distance function to use for instance comparison.
 *
 * @param df the new distance function to use
 * @throws Exception if instances cannot be processed
 */
public void setDistanceFunction(DistanceFunction df) throws Exception {
  if (!(df instanceof EuclideanDistance)
    && !(df instanceof ManhattanDistance)) {
    throw new Exception(
      "SimpleKMeans currently only supports the Euclidean and Manhattan distances.");
  }
  m_DistanceFunction = df;
}
/**
 * Moves the centroid of the given cluster based on its member instances.
 *
 * @param centroidIndex the index of the centroid to move
 * @param members the instances assigned to this cluster
 * @return the attribute values of the new centroid
 */
protected double[] moveCentroid(int centroidIndex, Instances members
  /* , boolean updateClusterInfo */) {
  double[] vals = new double[members.numAttributes()];
  for (int j = 0; j < members.numAttributes(); j++) {
    // mean/mode applies under Euclidean distance or for nominal attributes;
    // the remaining cases are left to decideCentroid()
    if (m_DistanceFunction instanceof EuclideanDistance
      || members.attribute(j).isNominal()) {
      vals[j] = members.meanOrMode(j);
    }
  }
  m_ClusterCentroids.add(decideCentroid(vals, members));
  return vals;
}
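/*
 * A sketch (an assumption mirroring standard k-means practice, not code from the source) of
 * the usual centroid rule the loop above partially implements: the per-attribute mean
 * minimizes squared Euclidean error, while the per-attribute median minimizes Manhattan
 * error; decideCentroid() above presumably covers the cases the loop leaves blank.
 * Assumes imports of weka.core.{DistanceFunction, Instances, ManhattanDistance}.
 */
static double[] standardCentroid(Instances members, DistanceFunction df) {
  double[] centroid = new double[members.numAttributes()];
  for (int j = 0; j < members.numAttributes(); j++) {
    if (df instanceof ManhattanDistance && members.attribute(j).isNumeric()) {
      // lower median of attribute j (kthSmallestValue is 1-based)
      centroid[j] = members.kthSmallestValue(j, (members.numInstances() + 1) / 2);
    } else {
      centroid[j] = members.meanOrMode(j); // mean for numeric, mode for nominal attributes
    }
  }
  return centroid;
}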
/**
 * Builds the tree on the given set of instances. P.S.: For internal use only.
 * Outside classes should call setInstances().
 *
 * @param insts The instances on which to build the cover tree.
 * @throws Exception If the supplied set of Instances is empty, or if there
 *           are missing values.
 */
protected void buildCoverTree(Instances insts) throws Exception {
  if (insts.numInstances() == 0) {
    throw new Exception(
      "CoverTree: Empty set of instances. Cannot build tree.");
  }
  checkMissing(insts);

  if (m_EuclideanDistance == null) {
    m_DistanceFunction = m_EuclideanDistance = new EuclideanDistance(insts);
  } else {
    m_EuclideanDistance.setInstances(insts);
  }

  Stack<DistanceNode> point_set = new Stack<DistanceNode>();
  Stack<DistanceNode> consumed_set = new Stack<DistanceNode>();

  Instance point_p = insts.instance(0);
  int p_idx = 0;
  double max_dist = -1, dist = 0.0;
  Instance max_q = point_p;

  for (int i = 1; i < insts.numInstances(); i++) {
    DistanceNode temp = new DistanceNode();
    temp.dist = new Stack<Double>();
    dist = Math.sqrt(m_DistanceFunction.distance(point_p, insts.instance(i),
      Double.POSITIVE_INFINITY));
    if (dist > max_dist) {
      max_dist = dist;
      max_q = insts.instance(i);
    }
    temp.dist.push(dist);
    temp.idx = i;
    point_set.push(temp);
  }

  max_dist = max_set(point_set);
  m_Root = batch_insert(p_idx, get_scale(max_dist), get_scale(max_dist),
    point_set, consumed_set);
}
/**
 * Make the final PreconstructedKMeans clusterer to wrap the centroids and
 * stats found during map-reduce.
 *
 * @param best the best result from the runs of k-means that were performed in
 *          parallel
 * @param preprocess any pre-processing filters applied
 * @param initialStartingPoints the initial starting centroids
 * @param finalNumIterations the final number of iterations performed
 * @return a final clusterer object
 * @throws DistributedWekaException if a problem occurs
 */
protected Clusterer makeFinalClusterer(KMeansReduceTask best,
  Filter preprocess, Instances initialStartingPoints, int finalNumIterations)
  throws DistributedWekaException {

  Clusterer finalClusterer = null;
  PreconstructedKMeans finalKMeans = new PreconstructedKMeans();

  // global priming data for the distance function (this will be in
  // the transformed space if we're using preprocessing filters)
  Instances globalPrimingData = best.getGlobalDistanceFunctionPrimingData();
  NormalizableDistance dist = new EuclideanDistance();
  dist.setInstances(globalPrimingData);

  finalKMeans.setClusterCentroids(best.getCentroidsForRun());
  finalKMeans.setFinalNumberOfIterations(finalNumIterations + 1);
  if (initialStartingPoints != null) {
    finalKMeans.setInitialStartingPoints(initialStartingPoints);
  }

  try {
    finalKMeans.setDistanceFunction(dist);
    finalKMeans.setClusterStats(best.getAggregatedCentroidSummaries());
  } catch (Exception e) {
    throw new DistributedWekaException(e);
  }

  if (!getInitWithRandomCentroids()) {
    finalKMeans.setInitializationMethod(new SelectedTag(
      SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
  }

  finalKMeans.setDisplayStdDevs(getDisplayCentroidStdDevs());

  finalClusterer = finalKMeans;

  if (preprocess != null) {
    PreconstructedFilteredClusterer fc = new PreconstructedFilteredClusterer();
    fc.setFilter(preprocess);
    fc.setClusterer(finalKMeans);
    finalClusterer = fc;
  }

  return finalClusterer;
}
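/*
 * A minimal sketch (not from the source) of the same wrap-the-filter-with-the-clusterer
 * pattern using the standard weka.clusterers.FilteredClusterer instead of the
 * Preconstructed* variants above. The filter choice and ARFF path are assumptions; assumes
 * imports of weka.clusterers.{FilteredClusterer, SimpleKMeans}, weka.core.Instances,
 * weka.core.converters.ConverterUtils.DataSource and
 * weka.filters.unsupervised.attribute.Standardize, inside a method that declares throws
 * Exception.
 */
Instances data = DataSource.read("some-data.arff"); // assumed path
SimpleKMeans kMeans = new SimpleKMeans();
kMeans.setNumClusters(4);

FilteredClusterer filtered = new FilteredClusterer();
filtered.setFilter(new Standardize());               // pre-processing applied before clustering
filtered.setClusterer(kMeans);
filtered.buildClusterer(data);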