public LassoValidationSummary validateLassoAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        LassoDetectionModel lassoDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    LassoDetectionAlgorithm lassoDetectionAlgorithm =
            (LassoDetectionAlgorithm) lassoDetectionModel.getDetectionAlgorithm();

    LassoValidationSummary lassoValidationSummary = new LassoValidationSummary();
    lassoValidationSummary.setLassoDetectionAlgorithm(lassoDetectionAlgorithm);

    LassoDistJob lassoDistJob = new LassoDistJob();
    lassoDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            lassoDetectionModel, lassoValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    lassoValidationSummary.setValidationTime(time);
    return lassoValidationSummary;
}
public RidgeRegressionValidationSummary validateRidgeRegressionAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        RidgeRegressionDetectionModel ridgeRegressionDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    RidgeRegressionDetectionAlgorithm ridgeRegressionDetectionAlgorithm =
            (RidgeRegressionDetectionAlgorithm) ridgeRegressionDetectionModel.getDetectionAlgorithm();

    RidgeRegressionValidationSummary ridgeRegressionValidationSummary =
            new RidgeRegressionValidationSummary();
    ridgeRegressionValidationSummary.setRidgeRegressionDetectionAlgorithm(ridgeRegressionDetectionAlgorithm);

    RidgeRegressionDistJob ridgeRegressionDistJob = new RidgeRegressionDistJob();
    ridgeRegressionDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            ridgeRegressionDetectionModel, ridgeRegressionValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    ridgeRegressionValidationSummary.setValidationTime(time);
    return ridgeRegressionValidationSummary;
}
public LinearRegressionValidationSummary validateLinearRegressionAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        LinearRegressionDetectionModel linearRegressionDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    LinearRegressionDetectionAlgorithm linearRegressionDetectionAlgorithm =
            (LinearRegressionDetectionAlgorithm) linearRegressionDetectionModel.getDetectionAlgorithm();

    LinearRegressionValidationSummary linearRegressionValidationSummary =
            new LinearRegressionValidationSummary();
    linearRegressionValidationSummary.setLinearRegressionDetectionAlgorithm(linearRegressionDetectionAlgorithm);

    LinearRegressionDistJob linearRegressionDistJob = new LinearRegressionDistJob();
    linearRegressionDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            linearRegressionDetectionModel, linearRegressionValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    linearRegressionValidationSummary.setValidationTime(time);
    return linearRegressionValidationSummary;
}
public LogisticRegressionValidationSummary validateLogisticRegressionAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        LogisticRegressionDetectionModel logisticRegressionDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    LogisticRegressionDetectionAlgorithm logisticRegressionDetectionAlgorithm =
            (LogisticRegressionDetectionAlgorithm) logisticRegressionDetectionModel.getDetectionAlgorithm();

    LogisticRegressionValidationSummary logisticRegressionValidationSummary =
            new LogisticRegressionValidationSummary(sc.sc(),
                    logisticRegressionDetectionAlgorithm.getNumClasses(), indexing, marking);

    LogisticRegressionDistJob logisticRegressionDistJob = new LogisticRegressionDistJob();
    logisticRegressionDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            logisticRegressionDetectionModel, logisticRegressionValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    logisticRegressionValidationSummary.setTotalValidationTime(time);
    return logisticRegressionValidationSummary;
}
public SVMValidationSummary validateSVMAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        SVMDetectionModel svmDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    SVMDetectionAlgorithm svmDetectionAlgorithm =
            (SVMDetectionAlgorithm) svmDetectionModel.getDetectionAlgorithm();

    // SVM is a binary classifier, hence the fixed class count of 2.
    SVMValidationSummary svmValidationSummary =
            new SVMValidationSummary(sc.sc(), 2, indexing, marking);

    SVMDistJob svmDistJob = new SVMDistJob();
    svmDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            svmDetectionModel, svmValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    svmValidationSummary.setTotalValidationTime(time);
    return svmValidationSummary;
}
public GradientBoostedTreesValidationSummary validateGradientBoostedTreesAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        GradientBoostedTreesDetectionModel gradientBoostedTreesDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    GradientBoostedTreesDetectionAlgorithm gradientBoostedTreesDetectionAlgorithm =
            (GradientBoostedTreesDetectionAlgorithm) gradientBoostedTreesDetectionModel.getDetectionAlgorithm();

    GradientBoostedTreesValidationSummary gradientBoostedTreesValidationSummary =
            new GradientBoostedTreesValidationSummary(sc.sc(),
                    gradientBoostedTreesDetectionAlgorithm.getNumClasses(), indexing, marking);

    GradientBoostedTreesDistJob gradientBoostedTreesDistJob = new GradientBoostedTreesDistJob();
    gradientBoostedTreesDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            gradientBoostedTreesDetectionModel, gradientBoostedTreesValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    gradientBoostedTreesValidationSummary.setTotalValidationTime(time);
    return gradientBoostedTreesValidationSummary;
}
public RandomForestValidationSummary validateRandomForestAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        RandomForestDetectionModel randomForestDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    RandomForestDetectionAlgorithm randomForestDetectionAlgorithm =
            (RandomForestDetectionAlgorithm) randomForestDetectionModel.getDetectionAlgorithm();

    RandomForestValidationSummary randomForestValidationSummary =
            new RandomForestValidationSummary(sc.sc(),
                    randomForestDetectionAlgorithm.getNumClasses(), indexing, marking);

    RandomForestDistJob randomForestDistJob = new RandomForestDistJob();
    randomForestDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            randomForestDetectionModel, randomForestValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    randomForestValidationSummary.setTotalValidationTime(time);
    return randomForestValidationSummary;
}
public NaiveBayesValidationSummary validateNaiveBayesAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        NaiveBayesDetectionModel naiveBayesDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    NaiveBayesDetectionAlgorithm naiveBayesDetectionAlgorithm =
            (NaiveBayesDetectionAlgorithm) naiveBayesDetectionModel.getDetectionAlgorithm();

    NaiveBayesValidationSummary naiveBayesValidationSummary =
            new NaiveBayesValidationSummary(sc.sc(),
                    naiveBayesDetectionAlgorithm.getNumClasses(), indexing, marking);

    NaiveBayesDistJob naiveBayesDistJob = new NaiveBayesDistJob();
    naiveBayesDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            naiveBayesDetectionModel, naiveBayesValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    naiveBayesValidationSummary.setTotalValidationTime(time);
    return naiveBayesValidationSummary;
}
public DecisionTreeValidationSummary validateDecisionTreeAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DecisionTreeDetectionModel decisionTreeDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    DecisionTreeDetectionAlgorithm decisionTreeDetectionAlgorithm =
            (DecisionTreeDetectionAlgorithm) decisionTreeDetectionModel.getDetectionAlgorithm();

    DecisionTreeValidationSummary decisionTreeValidationSummary =
            new DecisionTreeValidationSummary(sc.sc(),
                    decisionTreeDetectionAlgorithm.getNumClasses(), indexing, marking);

    DecisionTreeDistJob decisionTreeDistJob = new DecisionTreeDistJob();
    decisionTreeDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            decisionTreeDetectionModel, decisionTreeValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    decisionTreeValidationSummary.setTotalValidationTime(time);
    return decisionTreeValidationSummary;
}
public GaussianMixtureValidationSummary validateGaussianMixtureAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        GaussianMixtureDetectionModel gaussianMixtureDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm =
            (GaussianMixtureDetectionAlgorithm) gaussianMixtureDetectionModel.getDetectionAlgorithm();

    GaussianMixtureValidationSummary gaussianMixtureValidationSummary =
            new GaussianMixtureValidationSummary(sc.sc(),
                    gaussianMixtureDetectionAlgorithm.getK(), indexing, marking);

    GaussianMixtureDistJob gaussianMixtureDistJob = new GaussianMixtureDistJob();
    gaussianMixtureDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            gaussianMixtureDetectionModel, gaussianMixtureValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    gaussianMixtureValidationSummary.setTotalValidationTime(time);
    return gaussianMixtureValidationSummary;
}
public KmeansValidationSummary validateKMeansAthenaFeatures(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        KMeansDetectionModel kMeansDetectionModel,
        Indexing indexing,
        Marking marking) {
    long start = System.nanoTime(); // <-- start

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    KMeansDetectionAlgorithm kMeansDetectionAlgorithm =
            (KMeansDetectionAlgorithm) kMeansDetectionModel.getDetectionAlgorithm();

    KmeansValidationSummary kmeansValidationSummary =
            new KmeansValidationSummary(sc.sc(), kMeansDetectionAlgorithm.getK(), indexing, marking);

    KMeansDistJob kMeansDistJob = new KMeansDistJob();
    kMeansDistJob.validate(mongoRDD, athenaMLFeatureConfiguration,
            kMeansDetectionModel, kmeansValidationSummary);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    kmeansValidationSummary.setTotalValidationTime(time);
    return kmeansValidationSummary;
}
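Every validate* method above (and each generate* method further down) builds the same JavaPairRDD<Object, BSONObject> from the class's shared mongodbConfig field. A minimal sketch of a helper that would remove that duplication; the helper name newMongoRDD is hypothetical, while mongodbConfig and the four newAPIHadoopRDD arguments come straight from the snippets:

// Hypothetical consolidation of the RDD construction repeated in every
// validate*/generate* method; assumes the enclosing class keeps the Hadoop
// Configuration in a mongodbConfig field, as the snippets above do.
private JavaPairRDD<Object, BSONObject> newMongoRDD(JavaSparkContext sc) {
    return sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration (mongo.input.uri etc.)
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );
}

Each method could then open with JavaPairRDD<Object, BSONObject> mongoRDD = newMongoRDD(sc); and keep the rest of its body unchanged.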
/**
 * Uses the Hadoop API to get the input objects, then uses the generic
 * framework to do the evaluation.
 *
 * @param transformation the transformation to evaluate
 * @return the evaluated objects
 */
@Override
public JavaRDD<ObjectValue> evaluate(Transformation transformation) {
    JavaSparkContext sc = NotaQL.SparkFactory.getSparkContext();

    String mongoDBHost = "mongodb://"
            + NotaQL.prop.getProperty("mongodb_host", "localhost") + ":27017/";

    Configuration config = new Configuration();
    config.set("mongo.input.uri", mongoDBHost + databaseName + "." + collectionName);

    // Add a partial filter to the query sent to MongoDB.
    if (!noQuery && transformation.getInPredicate() != null) {
        final BSONObject query = FilterTranslator.toMongoDBQuery(transformation.getInPredicate());
        logger.info("Sending query to MongoDB: " + query.toString());
        config.set("mongo.input.query", query.toString());
    }

    final SparkTransformationEvaluator evaluator = new SparkTransformationEvaluator(transformation);

    JavaPairRDD<Object, BSONObject> mongoRDD =
            sc.newAPIHadoopRDD(config, MongoInputFormat.class, Object.class, BSONObject.class);

    // Convert all objects in the RDD to the inner format.
    final JavaRDD<Value> converted = mongoRDD.map(t -> ValueConverter.convertToNotaQL(t._2));

    // Filter out objects not fulfilling the input predicate
    // (MongoDB queries are less expressive than NotaQL).
    final JavaRDD<Value> filtered =
            converted.filter(v -> transformation.satisfiesInPredicate((ObjectValue) v));

    // Process all input.
    return evaluator.process(filtered);
}
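The partial filter pushed down via mongo.input.query is an ordinary MongoDB query document serialized to JSON; documents that do not match are never shipped to Spark. A minimal standalone sketch of the same mechanism; the URI and the field name rating are illustrative placeholders, not taken from the snippet above:

// Hypothetical pushdown filter: MongoInputFormat only reads documents
// matching the query. URI and field name "rating" are placeholders.
Configuration config = new Configuration();
config.set("mongo.input.uri", "mongodb://localhost:27017/movielens.ratings");
config.set("mongo.input.query", "{ \"rating\": { \"$gte\": 4.0 } }");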
private void setupJob() {
    // Read documents from MongoDB and emit HBase Puts keyed by row.
    _job.setInputFormatClass(MongoInputFormat.class);
    _job.setMapperClass(BulkImportMapper.class);
    _job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    _job.setMapOutputValueClass(Put.class);

    // Point the input at the source collection and prefer reading
    // input splits from secondary members of the replica set.
    MongoConfigUtil.setInputURI(getConfiguration(), _mongoURI);
    MongoConfigUtil.setReadSplitsFromSecondary(getConfiguration(), true);
}
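A short sketch of how a driver might run the job configured above; this assumes _job is a org.apache.hadoop.mapreduce.Job built from getConfiguration() and that the HBase output side (e.g. a bulk-load output format) is wired up elsewhere, neither of which is shown in the snippet:

// Hypothetical driver code around setupJob(); output configuration assumed.
setupJob();
boolean succeeded = _job.waitForCompletion(true); // block and print progress
System.exit(succeeded ? 0 : 1);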
public LassoDetectionModel generateLassoAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    LassoModelSummary lassoModelSummary = new LassoModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    LassoDetectionAlgorithm lassoDetectionAlgorithm = (LassoDetectionAlgorithm) detectionAlgorithm;

    LassoDetectionModel lassoDetectionModel = new LassoDetectionModel();
    lassoDetectionModel.setLassoDetectionAlgorithm(lassoDetectionAlgorithm);
    lassoModelSummary.setLassoDetectionAlgorithm(lassoDetectionAlgorithm);
    lassoDetectionModel.setFeatureConstraint(featureConstraint);
    lassoDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    lassoDetectionModel.setIndexing(indexing);
    lassoDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    LassoDistJob lassoDistJob = new LassoDistJob();
    LassoModel lassoModel = lassoDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
            athenaMLFeatureConfiguration, lassoDetectionAlgorithm, marking, lassoModelSummary);
    lassoDetectionModel.setModel(lassoModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    lassoModelSummary.setTotalLearningTime(time);
    lassoDetectionModel.setClassificationModelSummary(lassoModelSummary);
    return lassoDetectionModel;
}
public RidgeRegressionDetectionModel generateRidgeRegressionAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    RidgeRegressionModelSummary ridgeRegressionModelSummary =
            new RidgeRegressionModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    RidgeRegressionDetectionAlgorithm ridgeRegressionDetectionAlgorithm =
            (RidgeRegressionDetectionAlgorithm) detectionAlgorithm;

    RidgeRegressionDetectionModel ridgeRegressionDetectionModel = new RidgeRegressionDetectionModel();
    ridgeRegressionDetectionModel.setRidgeRegressionDetectionAlgorithm(ridgeRegressionDetectionAlgorithm);
    ridgeRegressionModelSummary.setRidgeRegressionDetectionAlgorithm(ridgeRegressionDetectionAlgorithm);
    ridgeRegressionDetectionModel.setFeatureConstraint(featureConstraint);
    ridgeRegressionDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    ridgeRegressionDetectionModel.setIndexing(indexing);
    ridgeRegressionDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    RidgeRegressionDistJob ridgeRegressionDistJob = new RidgeRegressionDistJob();
    RidgeRegressionModel ridgeRegressionModel =
            ridgeRegressionDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
                    athenaMLFeatureConfiguration, ridgeRegressionDetectionAlgorithm,
                    marking, ridgeRegressionModelSummary);
    ridgeRegressionDetectionModel.setModel(ridgeRegressionModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    ridgeRegressionModelSummary.setTotalLearningTime(time);
    ridgeRegressionDetectionModel.setClassificationModelSummary(ridgeRegressionModelSummary);
    return ridgeRegressionDetectionModel;
}
public LinearRegressionDetectionModel generateLinearRegressionAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    LinearRegressionModelSummary linearRegressionModelSummary =
            new LinearRegressionModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    LinearRegressionDetectionAlgorithm linearRegressionDetectionAlgorithm =
            (LinearRegressionDetectionAlgorithm) detectionAlgorithm;

    LinearRegressionDetectionModel linearRegressionDetectionModel = new LinearRegressionDetectionModel();
    linearRegressionDetectionModel.setLinearRegressionDetectionAlgorithm(linearRegressionDetectionAlgorithm);
    linearRegressionModelSummary.setLinearRegressionDetectionAlgorithm(linearRegressionDetectionAlgorithm);
    linearRegressionDetectionModel.setFeatureConstraint(featureConstraint);
    linearRegressionDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    linearRegressionDetectionModel.setIndexing(indexing);
    linearRegressionDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    LinearRegressionDistJob linearRegressionDistJob = new LinearRegressionDistJob();
    LinearRegressionModel linearRegressionModel =
            linearRegressionDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
                    athenaMLFeatureConfiguration, linearRegressionDetectionAlgorithm,
                    marking, linearRegressionModelSummary);
    linearRegressionDetectionModel.setModel(linearRegressionModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    linearRegressionModelSummary.setTotalLearningTime(time);
    linearRegressionDetectionModel.setClassificationModelSummary(linearRegressionModelSummary);
    return linearRegressionDetectionModel;
}
public LogisticRegressionDetectionModel generateLogisticRegressionAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    LogisticRegressionModelSummary logisticRegressionModelSummary =
            new LogisticRegressionModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    LogisticRegressionDetectionAlgorithm logisticRegressionDetectionAlgorithm =
            (LogisticRegressionDetectionAlgorithm) detectionAlgorithm;

    LogisticRegressionDetectionModel logisticRegressionDetectionModel = new LogisticRegressionDetectionModel();
    logisticRegressionDetectionModel.setLogisticRegressionDetectionAlgorithm(logisticRegressionDetectionAlgorithm);
    logisticRegressionModelSummary.setLogisticRegressionDetectionAlgorithm(logisticRegressionDetectionAlgorithm);
    logisticRegressionDetectionModel.setFeatureConstraint(featureConstraint);
    logisticRegressionDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    logisticRegressionDetectionModel.setIndexing(indexing);
    logisticRegressionDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    LogisticRegressionDistJob logisticRegressionDistJob = new LogisticRegressionDistJob();
    LogisticRegressionModel logisticRegressionModel =
            logisticRegressionDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
                    athenaMLFeatureConfiguration, logisticRegressionDetectionAlgorithm,
                    marking, logisticRegressionModelSummary);
    logisticRegressionDetectionModel.setModel(logisticRegressionModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    logisticRegressionModelSummary.setTotalLearningTime(time);
    logisticRegressionDetectionModel.setClassificationModelSummary(logisticRegressionModelSummary);
    return logisticRegressionDetectionModel;
}
public SVMDetectionModel generateSVMAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    SVMModelSummary svmModelSummary = new SVMModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    SVMDetectionAlgorithm svmDetectionAlgorithm = (SVMDetectionAlgorithm) detectionAlgorithm;

    SVMDetectionModel svmDetectionModel = new SVMDetectionModel();
    svmDetectionModel.setSVMDetectionAlgorithm(svmDetectionAlgorithm);
    svmModelSummary.setSVMDetectionAlgorithm(svmDetectionAlgorithm);
    svmDetectionModel.setFeatureConstraint(featureConstraint);
    svmDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    svmDetectionModel.setIndexing(indexing);
    svmDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    SVMDistJob svmDistJob = new SVMDistJob();
    SVMModel svmModel = svmDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
            athenaMLFeatureConfiguration, svmDetectionAlgorithm, marking, svmModelSummary);
    svmDetectionModel.setSVMModel(svmModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    svmModelSummary.setTotalLearningTime(time);
    svmDetectionModel.setClassificationModelSummary(svmModelSummary);
    return svmDetectionModel;
}
public GradientBoostedTreesDetectionModel generateGradientBoostedTreesAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    GradientBoostedTreesModelSummary gradientBoostedTreesModelSummary =
            new GradientBoostedTreesModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    GradientBoostedTreesDetectionAlgorithm gradientBoostedTreesDetectionAlgorithm =
            (GradientBoostedTreesDetectionAlgorithm) detectionAlgorithm;

    GradientBoostedTreesDetectionModel gradientBoostedTreesDetectionModel =
            new GradientBoostedTreesDetectionModel();
    gradientBoostedTreesDetectionModel.setGradientBoostedTreesDetectionAlgorithm(gradientBoostedTreesDetectionAlgorithm);
    gradientBoostedTreesModelSummary.setGradientBoostedTreesDetectionAlgorithm(gradientBoostedTreesDetectionAlgorithm);
    gradientBoostedTreesDetectionModel.setFeatureConstraint(featureConstraint);
    gradientBoostedTreesDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    gradientBoostedTreesDetectionModel.setIndexing(indexing);
    gradientBoostedTreesDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    GradientBoostedTreesDistJob gradientBoostedTreesDistJob = new GradientBoostedTreesDistJob();
    GradientBoostedTreesModel gradientBoostedTreesModel =
            gradientBoostedTreesDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
                    athenaMLFeatureConfiguration, gradientBoostedTreesDetectionAlgorithm,
                    marking, gradientBoostedTreesModelSummary);
    gradientBoostedTreesDetectionModel.setGradientBoostedTreestModel(gradientBoostedTreesModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    gradientBoostedTreesModelSummary.setTotalLearningTime(time);
    gradientBoostedTreesDetectionModel.setClassificationModelSummary(gradientBoostedTreesModelSummary);
    return gradientBoostedTreesDetectionModel;
}
public RandomForestDetectionModel generateRandomForestAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    RandomForestModelSummary randomForestModelSummary =
            new RandomForestModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    RandomForestDetectionAlgorithm randomForestDetectionAlgorithm =
            (RandomForestDetectionAlgorithm) detectionAlgorithm;

    RandomForestDetectionModel randomForestDetectionModel = new RandomForestDetectionModel();
    randomForestDetectionModel.setRandomForestDetectionAlgorithm(randomForestDetectionAlgorithm);
    randomForestModelSummary.setRandomForestDetectionAlgorithm(randomForestDetectionAlgorithm);
    randomForestDetectionModel.setFeatureConstraint(featureConstraint);
    randomForestDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    randomForestDetectionModel.setIndexing(indexing);
    randomForestDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    RandomForestDistJob randomForestDistJob = new RandomForestDistJob();
    RandomForestModel randomForestModel =
            randomForestDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
                    athenaMLFeatureConfiguration, randomForestDetectionAlgorithm,
                    marking, randomForestModelSummary);
    randomForestDetectionModel.setRandomForestModel(randomForestModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    randomForestModelSummary.setTotalLearningTime(time);
    randomForestDetectionModel.setClassificationModelSummary(randomForestModelSummary);
    return randomForestDetectionModel;
}
public NaiveBayesDetectionModel generateNaiveBayesAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    NaiveBayesModelSummary naiveBayesModelSummary =
            new NaiveBayesModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    NaiveBayesDetectionAlgorithm naiveBayesDetectionAlgorithm =
            (NaiveBayesDetectionAlgorithm) detectionAlgorithm;

    NaiveBayesDetectionModel naiveBayesDetectionModel = new NaiveBayesDetectionModel();
    naiveBayesDetectionModel.setNaiveBayesDetectionAlgorithm(naiveBayesDetectionAlgorithm);
    naiveBayesModelSummary.setNaiveBayesDetectionAlgorithm(naiveBayesDetectionAlgorithm);
    naiveBayesDetectionModel.setFeatureConstraint(featureConstraint);
    naiveBayesDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    naiveBayesDetectionModel.setIndexing(indexing);
    naiveBayesDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    NaiveBayesDistJob naiveBayesDistJob = new NaiveBayesDistJob();
    NaiveBayesModel naiveBayesModel = naiveBayesDistJob.generateModelWithPreprocessing(mongoRDD,
            athenaMLFeatureConfiguration, naiveBayesDetectionAlgorithm, marking, naiveBayesModelSummary);
    naiveBayesDetectionModel.setNaiveBayesModel(naiveBayesModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    naiveBayesModelSummary.setTotalLearningTime(time);
    naiveBayesDetectionModel.setClassificationModelSummary(naiveBayesModelSummary);
    return naiveBayesDetectionModel;
}
public DecisionTreeDetectionModel generateDecisionTreeAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    DecisionTreeModelSummary decisionTreeModelSummary =
            new DecisionTreeModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    DecisionTreeDetectionAlgorithm decisionTreeDetectionAlgorithm =
            (DecisionTreeDetectionAlgorithm) detectionAlgorithm;

    DecisionTreeDetectionModel decisionTreeDetectionModel = new DecisionTreeDetectionModel();
    decisionTreeDetectionModel.setDecisionTreeDetectionAlgorithm(decisionTreeDetectionAlgorithm);
    decisionTreeModelSummary.setDecisionTreeDetectionAlgorithm(decisionTreeDetectionAlgorithm);
    decisionTreeDetectionModel.setFeatureConstraint(featureConstraint);
    decisionTreeDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    decisionTreeDetectionModel.setIndexing(indexing);
    decisionTreeDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    DecisionTreeDistJob decisionTreeDistJob = new DecisionTreeDistJob();
    DecisionTreeModel decisionTreeModel =
            decisionTreeDistJob.generateDecisionTreeWithPreprocessing(mongoRDD,
                    athenaMLFeatureConfiguration, decisionTreeDetectionAlgorithm,
                    marking, decisionTreeModelSummary);
    decisionTreeDetectionModel.setDecisionTreeModel(decisionTreeModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    decisionTreeModelSummary.setTotalLearningTime(time);
    decisionTreeDetectionModel.setClassificationModelSummary(decisionTreeModelSummary);
    return decisionTreeDetectionModel;
}
public GaussianMixtureDetectionModel generateGaussianMixtureAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    GaussianMixtureModelSummary gaussianMixtureModelSummary =
            new GaussianMixtureModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm =
            (GaussianMixtureDetectionAlgorithm) detectionAlgorithm;

    GaussianMixtureDetectionModel gaussianMixtureDetectionModel = new GaussianMixtureDetectionModel();
    gaussianMixtureDetectionModel.setGaussianMixtureDetectionAlgorithm(gaussianMixtureDetectionAlgorithm);
    gaussianMixtureModelSummary.setGaussianMixtureDetectionAlgorithm(gaussianMixtureDetectionAlgorithm);
    gaussianMixtureDetectionModel.setFeatureConstraint(featureConstraint);
    gaussianMixtureDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    gaussianMixtureDetectionModel.setIndexing(indexing);
    gaussianMixtureDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    GaussianMixtureDistJob gaussianMixtureDistJob = new GaussianMixtureDistJob();
    GaussianMixtureModel gaussianMixtureModel =
            gaussianMixtureDistJob.generateGaussianMixtureWithPreprocessing(mongoRDD,
                    athenaMLFeatureConfiguration, gaussianMixtureDetectionAlgorithm,
                    gaussianMixtureModelSummary);
    gaussianMixtureDetectionModel.setkGaussianMixtureModel(gaussianMixtureModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    gaussianMixtureModelSummary.setTotalLearningTime(time);
    gaussianMixtureDetectionModel.setClusterModelSummary(gaussianMixtureModelSummary);
    gaussianMixtureModelSummary.setGaussianMixtureModel(gaussianMixtureModel);
    return gaussianMixtureDetectionModel;
}
public KMeansDetectionModel generateKMeansAthenaDetectionModel(JavaSparkContext sc,
        FeatureConstraint featureConstraint,
        AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
        DetectionAlgorithm detectionAlgorithm,
        Indexing indexing,
        Marking marking) {
    KmeansModelSummary kmeansModelSummary = new KmeansModelSummary(sc.sc(), indexing, marking);
    long start = System.nanoTime(); // <-- start

    KMeansDetectionAlgorithm kMeansDetectionAlgorithm = (KMeansDetectionAlgorithm) detectionAlgorithm;

    KMeansDetectionModel kMeansDetectionModel = new KMeansDetectionModel();
    kMeansDetectionModel.setkMeansDetectionAlgorithm(kMeansDetectionAlgorithm);
    kmeansModelSummary.setkMeansDetectionAlgorithm(kMeansDetectionAlgorithm);
    kMeansDetectionModel.setFeatureConstraint(featureConstraint);
    kMeansDetectionModel.setAthenaMLFeatureConfiguration(athenaMLFeatureConfiguration);
    kMeansDetectionModel.setIndexing(indexing);
    kMeansDetectionModel.setMarking(marking);

    JavaPairRDD<Object, BSONObject> mongoRDD = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    KMeansDistJob kMeansDistJob = new KMeansDistJob();
    KMeansModel kMeansModel = kMeansDistJob.generateKmeansWithPreprocessing(mongoRDD,
            athenaMLFeatureConfiguration, kMeansDetectionAlgorithm, kmeansModelSummary);
    kMeansDetectionModel.setkMeansModel(kMeansModel);

    long end = System.nanoTime(); // <-- end
    long time = end - start;
    kmeansModelSummary.setTotalLearningTime(time);
    kMeansDetectionModel.setClusterModelSummary(kmeansModelSummary);
    return kMeansDetectionModel;
}
public static JavaPairRDD<Object, BSONObject> mongoRdd(JavaSparkContext sparkContext,
        String mongoHost, long mongoPort, String db, String collection) {
    Configuration mongodbConfig = new Configuration();
    mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
    mongodbConfig.set("mongo.input.uri",
            String.format("mongodb://%s:%d/%s.%s", mongoHost, mongoPort, db, collection));
    return sparkContext.newAPIHadoopRDD(mongodbConfig, MongoInputFormat.class,
            Object.class, BSONObject.class);
}
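A short usage sketch for this helper; the application name, host, database, and collection are placeholders, not values from the snippet:

// Hypothetical invocation of mongoRdd(); all names are placeholders.
SparkConf conf = new SparkConf().setAppName("mongo-read").setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<Object, BSONObject> rdd =
        mongoRdd(sc, "localhost", 27017, "mydb", "mycollection");
System.out.println("documents read: " + rdd.count()); // triggers the actual read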
/**
 * Instantiates a new Mongo extractor.
 */
public MongoExtractor() {
    super();
    this.inputFormat = new MongoInputFormat();
    this.outputFormat = new MongoOutputFormat();
}
public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length < 3) {
        System.err.println("Usage: MapReduceExercise "
                + "[mongodb input uri] "
                + "[mongodb output uri] "
                + "update=[true or false]");
        System.err.println("Example: MapReduceExercise "
                + "mongodb://127.0.0.1:27017/movielens.ratings "
                + "mongodb://127.0.0.1:27017/movielens.ratings.stats update=false");
        System.err.println("Example: MapReduceExercise "
                + "mongodb://127.0.0.1:27017/movielens.ratings "
                + "mongodb://127.0.0.1:27017/movielens.movies update=true");
        System.exit(-1);
    }

    Class outputValueClass = BSONWritable.class;
    Class reducerClass = Reduce.class;

    if (args[2].equals("update=true")) {
        outputValueClass = MongoUpdateWritable.class;
        reducerClass = ReduceUpdater.class;
    }

    Configuration conf = new Configuration();

    // Set MongoDB-specific configuration items.
    conf.setClass("mongo.job.mapper", Map.class, Mapper.class);
    conf.setClass("mongo.job.reducer", reducerClass, Reducer.class);
    conf.setClass("mongo.job.mapper.output.key", IntWritable.class, Object.class);
    conf.setClass("mongo.job.mapper.output.value", DoubleWritable.class, Object.class);
    conf.setClass("mongo.job.output.key", NullWritable.class, Object.class);
    conf.setClass("mongo.job.output.value", outputValueClass, Object.class);
    conf.set("mongo.input.uri", args[0]);
    conf.set("mongo.output.uri", args[1]);

    Job job = Job.getInstance(conf);

    // Set Hadoop-specific job parameters.
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(outputValueClass);
    job.setMapperClass(Map.class);
    job.setReducerClass(reducerClass);
    job.setJarByClass(MapReduceExercise.class);

    job.submit();
}
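The Map, Reduce, and ReduceUpdater classes referenced above are not part of this snippet. A minimal sketch of a mapper compatible with the configured map output types (IntWritable key, DoubleWritable value); the BSON field names movie_id and rating are assumptions about the movielens ratings documents, not taken from the original:

import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.bson.BSONObject;

// Hypothetical mapper: emits (movieId, rating) pairs from each document.
// Field names "movie_id" and "rating" are assumed.
public static class Map extends Mapper<Object, BSONObject, IntWritable, DoubleWritable> {
    @Override
    protected void map(Object key, BSONObject value, Context context)
            throws IOException, InterruptedException {
        int movieId = ((Number) value.get("movie_id")).intValue();
        double rating = ((Number) value.get("rating")).doubleValue();
        context.write(new IntWritable(movieId), new DoubleWritable(rating));
    }
}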