Java class org.apache.spark.api.java.JavaSparkContext example source code
Project: big-data-benchmark
File: SparkWordCount.java
public static void main(String[] args) {
if (args.length != 2) {
System.err.println("Usage:");
System.err.println(" SparkWordCount <sourceFile> <targetFile>");
System.exit(1);
}
SparkConf conf = new SparkConf()
.setAppName("Word Count");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<String> textFile = sc.textFile(args[0]);
JavaRDD<String> words = textFile.flatMap(LineIterator::new);
JavaPairRDD<String, Long> pairs =
words.mapToPair(s -> new Tuple2<>(s, 1L));
JavaPairRDD<String, Long> counts =
pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);
System.out.println("Starting task..");
long t = System.currentTimeMillis();
counts.saveAsTextFile(args[1] + "_" + t);
System.out.println("Time=" + (System.currentTimeMillis() - t));
}
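The flatMap call above relies on a project-specific LineIterator class that is not shown here. A minimal sketch of what such a class could look like, assuming it simply tokenizes one line into words (a hypothetical reconstruction, not the big-data-benchmark original):

import java.util.Iterator;
import java.util.NoSuchElementException;

public class LineIterator implements Iterator<String> {

    private final String[] tokens;
    private int pos = 0;

    // Built from a single input line, so flatMap(LineIterator::new) matches
    // the FlatMapFunction<String, String> signature (call(String) -> Iterator<String>).
    public LineIterator(String line) {
        this.tokens = line.trim().split("\\s+");
    }

    @Override
    public boolean hasNext() {
        return pos < tokens.length;
    }

    @Override
    public String next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return tokens[pos++];
    }
}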
Project: Sempala
File: Spark.java
/**
* Initializes a Spark connection. Use it afterwards to execute Spark SQL
* queries.
*
* @param appName
* the name of the app that will be used with this Spark
* connection
* @param database
* name of the database that will be used with this Spark
* connection
*/
public Spark(String appName, String database) {
// TODO check what happens if the same app name is already in use
this.sparkConfiguration = new SparkConf().setAppName(appName);
this.javaContext = new JavaSparkContext(sparkConfiguration);
this.hiveContext = new HiveContext(javaContext);
// TODO check what kind of exception can be thrown here if there is a
// problem with spark connection
this.hiveContext.sql(String.format("CREATE DATABASE %s", database));
// TODO check what kind of exception is thrown if the database already exists
// use the created database
this.hiveContext.sql((String.format("USE %s", database)));
}
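The TODOs above note that CREATE DATABASE fails when the database already exists. A minimal sketch of an idempotent variant, assuming the same HiveContext is available (IF NOT EXISTS is standard HiveQL; this helper is not part of the Sempala class):

import org.apache.spark.sql.hive.HiveContext;

public final class DatabaseBootstrap {

    private DatabaseBootstrap() {
    }

    // Creates the database only if it is missing, then switches to it.
    public static void ensureDatabase(HiveContext hiveContext, String database) {
        hiveContext.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", database));
        hiveContext.sql(String.format("USE %s", database));
    }
}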
Project: oryx2
File: KMeansUpdate.java
/**
* @param sparkContext active Spark Context
* @param trainData training data on which to build a model
* @param hyperParameters ordered list of hyper parameter values to use in building model
* @param candidatePath directory where additional model files can be written
* @return a {@link PMML} representation of a model trained on the given data
*/
@Override
public PMML buildModel(JavaSparkContext sparkContext,
JavaRDD<String> trainData,
List<?> hyperParameters,
Path candidatePath) {
int numClusters = (Integer) hyperParameters.get(0);
Preconditions.checkArgument(numClusters > 1);
log.info("Building KMeans Model with {} clusters", numClusters);
JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations,
numberOfRuns, initializationStrategy);
return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
Project: BLASpark
File: OtherOperations.java
public static DistributedMatrix GetLU(DistributedMatrix A, JavaSparkContext jsc) {
DistributedMatrix returnedMatrix;
if( A.getClass() == IndexedRowMatrix.class) {
returnedMatrix = OtherOperations.GetLU_IRW((IndexedRowMatrix) A);
}
else if (A.getClass() == CoordinateMatrix.class) {
returnedMatrix = OtherOperations.GetLU_COORD((CoordinateMatrix) A);
}
else if (A.getClass() == BlockMatrix.class){
// TODO: Implement this operation
//returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
returnedMatrix = null;
}
else {
returnedMatrix = null;
}
return returnedMatrix;
}
Project: BLASpark
File: OtherOperations.java
public static DistributedMatrix GetD(DistributedMatrix A, boolean inverseValues, JavaSparkContext jsc) {
DistributedMatrix returnedMatrix;
if( A.getClass() == IndexedRowMatrix.class) {
returnedMatrix = OtherOperations.GetD_IRW((IndexedRowMatrix) A, inverseValues, jsc);
}
else if (A.getClass() == CoordinateMatrix.class) {
returnedMatrix = OtherOperations.GetD_COORD((CoordinateMatrix) A, inverseValues, jsc);
}
else if (A.getClass() == BlockMatrix.class){
// TODO: Implement this operation
//returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
returnedMatrix = null;
}
else {
returnedMatrix = null;
}
return returnedMatrix;
}
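Both dispatch methods above expect one of MLlib's DistributedMatrix subtypes. A minimal, self-contained sketch of building an IndexedRowMatrix to pass in (standard MLlib API; the local master and the 2x2 values are only for illustration):

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.linalg.distributed.IndexedRow;
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix;

public class IndexedRowMatrixExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("IndexedRowMatrixExample");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Each IndexedRow pairs a row index with a row vector.
            JavaRDD<IndexedRow> rows = jsc.parallelize(Arrays.asList(
                    new IndexedRow(0, Vectors.dense(4.0, 1.0)),
                    new IndexedRow(1, Vectors.dense(1.0, 3.0))));
            IndexedRowMatrix matrix = new IndexedRowMatrix(rows.rdd());
            System.out.println("rows=" + matrix.numRows() + ", cols=" + matrix.numCols());
        }
    }
}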
Project: ViraPipe
File: InterleaveMulti.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
Path fqpath = new Path(fqPath);
String fqname = fqpath.getName();
String[] ns = fqname.split("\\.");
//TODO: Handle also compressed files
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
splitRDD.foreach( split -> {
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
});
}
Project: ViraPipe
File: Decompress.java
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
zips.foreach( splits -> {
Path path = splits._1.getPath();
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
});
}
Project: Apache-Spark-2x-for-Java-Developers
File: WordCount.java
public static void wordCountJava8( String filename )
{
// Define a configuration to use to interact with Spark
SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");
// Create a Java version of the Spark Context from the configuration
JavaSparkContext sc = new JavaSparkContext(conf);
// Load the input data, which is a text file read from the command line
JavaRDD<String> input = sc.textFile( filename );
// Java 8 with lambdas: split the input string into words
// TODO here a change has happened
JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );
// Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );
// Save the word count back out to a text file, causing evaluation.
counts.saveAsTextFile( "output" );
}
Project: Apache-Spark-2x-for-Java-Developers
File: SparkWordCount.java
public static void main(String[] args) throws Exception {
System.out.println(System.getProperty("hadoop.home.dir"));
String inputPath = args[0];
String outputPath = args[1];
FileUtils.deleteQuietly(new File(outputPath));
JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");
JavaRDD<String> rdd = sc.textFile(inputPath);
JavaPairRDD<String, Integer> counts = rdd
.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
.mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
.reduceByKey((x, y) -> x + y);
counts.saveAsTextFile(outputPath);
sc.close();
}
Project: incubator-sdap-mudrod
File: SessionExtractor.java
/**
* loadClickStremFromTxt: load click stream from a txt file
*
* @param clickthroughFile
* txt file
* @param sc
* the spark context
* @return clickstream list in JavaRDD format {@link ClickStream}
*/
public JavaRDD<ClickStream> loadClickStremFromTxt(String clickthroughFile, JavaSparkContext sc) {
return sc.textFile(clickthroughFile).flatMap(new FlatMapFunction<String, ClickStream>() {
/**
*
*/
private static final long serialVersionUID = 1L;
@SuppressWarnings("unchecked")
@Override
public Iterator<ClickStream> call(String line) throws Exception {
List<ClickStream> clickthroughs = (List<ClickStream>) ClickStream.parseFromTextLine(line);
return clickthroughs.iterator();
}
});
}
Project: incubator-sdap-mudrod
File: SparkDriver.java
public SparkDriver(Properties props) {
SparkConf conf = new SparkConf().setAppName(props.getProperty(MudrodConstants.SPARK_APP_NAME, "MudrodSparkApp")).setIfMissing("spark.master", props.getProperty(MudrodConstants.SPARK_MASTER))
.set("spark.hadoop.validateOutputSpecs", "false").set("spark.files.overwrite", "true");
String esHost = props.getProperty(MudrodConstants.ES_UNICAST_HOSTS);
String esPort = props.getProperty(MudrodConstants.ES_HTTP_PORT);
if (!"".equals(esHost)) {
conf.set("es.nodes", esHost);
}
if (!"".equals(esPort)) {
conf.set("es.port", esPort);
}
conf.set("spark.serializer", KryoSerializer.class.getName());
conf.set("es.batch.size.entries", "1500");
sc = new JavaSparkContext(conf);
sqlContext = new SQLContext(sc);
}
Project: betleopard
File: LiveBetMain.java
private void init() throws IOException {
final ClientConfig config = new ClientConfig();
client = HazelcastClient.newHazelcastClient(config);
final SparkConf conf = new SparkConf()
.set("hazelcast.server.addresses", "127.0.0.1:5701")
.set("hazelcast.server.groupName", "dev")
.set("hazelcast.server.groupPass", "dev-pass")
.set("hazelcast.spark.valueBatchingEnabled", "true")
.set("hazelcast.spark.readBatchSize", "5000")
.set("hazelcast.spark.writeBatchSize", "5000");
sc = new JavaSparkContext("local", "appname", conf);
loadHistoricalRaces();
createRandomUsers();
createFutureEvent();
}
Project: MinoanER
File: CNPNeighborsUnnormalized.java
/**
*
* @param topKvalueCandidates the topK results per entity, acquired from value similarity
* @param rawTriples1 the rdf triples of the first entity collection
* @param rawTriples2 the rdf triples of the second entity collection
* @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
* @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
* @param entityIds2 the mapping of entity urls to entity ids of the second collection, as it was used in blocking
* @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
* @param K the K for topK candidate matches
* @param N the N for topN rdf relations (and neighbors)
* @param jsc the java spark context used to load files and broadcast variables
* @return topK neighbor candidates per entity
*/
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates,
JavaRDD<String> rawTriples1,
JavaRDD<String> rawTriples2,
String SEPARATOR,
JavaRDD<String> entityIds1,
JavaRDD<String> entityIds2,
float MIN_SUPPORT_THRESHOLD,
int K,
int N,
JavaSparkContext jsc) {
Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
//JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsSUM(topKvalueCandidates, inNeighbors_BV, K);
return topKneighborCandidates;
}
Project: MinoanER
File: CNPARCS.java
/**
*
* @param topKvalueCandidates the topK results per entity, acquired from value similarity
* @param rawTriples1 the rdf triples of the first entity collection
* @param rawTriples2 the rdf triples of the second entity collection
* @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
* @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
* @param entityIds2 the mapping of entity urls to entity ids of the second collection, as it was used in blocking
* @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
* @param K the K for topK candidate matches
* @param N the N for topN rdf relations (and neighbors)
* @param jsc the java spark context used to load files and broadcast variables
* @return topK neighbor candidates per entity
*/
public JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> run2(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates,
JavaRDD<String> rawTriples1,
JavaRDD<String> rawTriples2,
String SEPARATOR,
JavaRDD<String> entityIds1,
JavaRDD<String> entityIds2,
float MIN_SUPPORT_THRESHOLD,
int K,
int N,
JavaSparkContext jsc) {
Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
JavaPairRDD<Integer, Int2FloatLinkedOpenHashMap> topKneighborCandidates = getTopKNeighborSimsSUMWithScores(topKvalueCandidates, inNeighbors_BV, K);
return topKneighborCandidates;
}
Project: MinoanER
File: CNPNeighbors.java
/**
*
* @param topKvalueCandidates the topK results per entity, acquired from value similarity
* @param rawTriples1 the rdf triples of the first entity collection
* @param rawTriples2 the rdf triples of the second entity collection
* @param SEPARATOR the delimiter that separates subjects, predicates and objects in the rawTriples1 and rawTriples2 files
* @param entityIds1 the mapping of entity urls to entity ids, as it was used in blocking
* @param entityIds2 the mapping of entity urls to entity ids of the second collection, as it was used in blocking
* @param MIN_SUPPORT_THRESHOLD the minimum support threshold, below which, relations are discarded from top relations
* @param K the K for topK candidate matches
* @param N the N for topN rdf relations (and neighbors)
* @param jsc the java spark context used to load files and broadcast variables
* @return topK neighbor candidates per entity
*/
public JavaPairRDD<Integer, IntArrayList> run(JavaPairRDD<Integer,Int2FloatLinkedOpenHashMap> topKvalueCandidates,
JavaRDD<String> rawTriples1,
JavaRDD<String> rawTriples2,
String SEPARATOR,
JavaRDD<String> entityIds1,
JavaRDD<String> entityIds2,
float MIN_SUPPORT_THRESHOLD,
int K,
int N,
JavaSparkContext jsc) {
Map<Integer,IntArrayList> inNeighbors = new HashMap<>(new RelationsRank().run(rawTriples1, SEPARATOR, entityIds1, MIN_SUPPORT_THRESHOLD, N, true, jsc));
inNeighbors.putAll(new RelationsRank().run(rawTriples2, SEPARATOR, entityIds2, MIN_SUPPORT_THRESHOLD, N, false, jsc));
Broadcast<Map<Integer,IntArrayList>> inNeighbors_BV = jsc.broadcast(inNeighbors);
//JavaPairRDD<Tuple2<Integer, Integer>, Float> neighborSims = getNeighborSims(topKvalueCandidates, inNeighbors_BV);
//JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSimsOld(neighborSims, K);
JavaPairRDD<Integer, IntArrayList> topKneighborCandidates = getTopKNeighborSims(topKvalueCandidates, inNeighbors_BV, K);
return topKneighborCandidates;
}
Project: oryx2
File: ExampleBatchLayerUpdate.java
@Override
public void runUpdate(JavaSparkContext sparkContext,
long timestamp,
JavaPairRDD<String,String> newData,
JavaPairRDD<String,String> pastData,
String modelDirString,
TopicProducer<String,String> modelUpdateTopic) throws IOException {
JavaPairRDD<String,String> allData = pastData == null ? newData : newData.union(pastData);
String modelString;
try {
modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
} catch (JsonProcessingException jpe) {
throw new IOException(jpe);
}
modelUpdateTopic.send("MODEL", modelString);
}
Project: MinoanER
File: BlocksFromEntityIndexTest.java
@Before
public void setUp() {
System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode
spark = SparkSession.builder()
.appName("test")
.config("spark.sql.warehouse.dir", "/file:/tmp")
.config("spark.executor.instances", 1)
.config("spark.executor.cores", 1)
.config("spark.executor.memory", "1G")
.config("spark.driver.maxResultSize", "1g")
.config("spark.master", "local")
.getOrCreate();
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
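A matching tear-down keeps each test run from leaking the local Spark resources it started. A minimal sketch, assuming the spark field from the set-up above and JUnit 4's @After (not part of the MinoanER test as shown):

@After
public void tearDown() {
    if (spark != null) {
        // Stops the SparkSession and the underlying SparkContext wrapped by jsc.
        spark.stop();
    }
}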
Project: MinoanER
File: BlockFilteringAdvancedTest.java
@Before
public void setUp() {
System.setProperty("hadoop.home.dir", "C:\\Users\\VASILIS\\Documents\\hadoop_home"); //only for local mode
spark = SparkSession.builder()
.appName("test")
.config("spark.sql.warehouse.dir", "/file:/tmp")
.config("spark.executor.instances", 1)
.config("spark.executor.cores", 1)
.config("spark.executor.memory", "1G")
.config("spark.driver.maxResultSize", "1g")
.config("spark.master", "local")
.getOrCreate();
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
Project: ParquetUtils
File: ParquetRepartTest.java
@BeforeClass
public static void createContext() throws IOException {
Configuration hdfsConfig = HDFSUtils.getConfiguration();
SparkConf config = new SparkConf();
config.setMaster("local[*]");
config.setAppName("my JUnit running Spark");
sc = new JavaSparkContext(config);
fileSystem = FileSystem.get(hdfsConfig);
sqlContext = new SQLContext(sc);
engine = new ParquetRepartEngine(fileSystem, sqlContext);
}
Project: oryx2
File: ALSUpdate.java
private static MatrixFactorizationModel pmmlToMFModel(JavaSparkContext sparkContext,
PMML pmml,
Path modelParentPath,
Broadcast<Map<String,Integer>> bUserIDToIndex,
Broadcast<Map<String,Integer>> bItemIDToIndex) {
String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
JavaPairRDD<String,float[]> userRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
JavaPairRDD<String,float[]> productRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));
int rank = userRDD.first()._2().length;
return new MatrixFactorizationModel(
rank,
readAndConvertFeatureRDD(userRDD, bUserIDToIndex),
readAndConvertFeatureRDD(productRDD, bItemIDToIndex));
}
Project: mutantpdb
File: App.java
public static void main( String[] args )
{
Dataset<Row> mutations = DataProvider.getMutationsToStructures();
List<String> pdbIds = mutations.select(col("pdbId"))
.distinct().toJavaRDD().map(t -> t.getString(0)).collect();
List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
SaprkUtils.stopSparkSession();
JavaSparkContext sc = SaprkUtils.getSparkContext();
Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);
MmtfReader//.readSequenceFile("/pdb/2017/full", pdbIds, sc)
.downloadMmtfFiles(Arrays.asList("5IRC"), sc)
.flatMapToPair(new StructureToPolymerChains())
.flatMapToPair(new AddResidueToKey(bcmut))
.mapValues(new StructureToBioJava())
.mapToPair(new FilterResidue())
.filter(t -> t._2!=null).keys()
.map(t -> t.replace(".", ","))
.saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");
sc.close();
}
Project: BLASpark
File: OtherOperations.java
public static DistributedMatrix[] GetLU(DistributedMatrix A, boolean diagonalInL, boolean diagonalInU, JavaSparkContext jsc) {
if((diagonalInL && diagonalInU) || (!diagonalInL && !diagonalInU)) {
LOG.error("Diagonal values must be in either upper or lower matrices");
System.exit(-1);
}
DistributedMatrix[] returnedMatrices;
if( A.getClass() == IndexedRowMatrix.class) {
returnedMatrices = OtherOperations.GetLU_IRW((IndexedRowMatrix) A, diagonalInL, diagonalInU, jsc);
}
else if (A.getClass() == CoordinateMatrix.class) {
returnedMatrices = OtherOperations.GetLU_COORD((CoordinateMatrix) A, diagonalInL, diagonalInU, jsc);
}
else if (A.getClass() == BlockMatrix.class){
// TODO: Implement this operation
//returnedMatrices = OtherOperations.GetLU_BCK((BlockMatrix) A, diagonalInL, diagonalInU, jsc);
returnedMatrices = null;
}
else {
returnedMatrices = null;
}
return returnedMatrices;
}
Project: BLASpark
File: L2.java
public static DenseVector DGEMV(double alpha, DistributedMatrix A, DenseVector x, double beta, DenseVector y, JavaSparkContext jsc){
// First form y := beta*y.
if (beta != 1.0) {
if (beta == 0.0) {
y = Vectors.zeros(y.size()).toDense();
}
else {
BLAS.scal(beta, y);
}
}
if (alpha == 0.0) {
return y;
}
DenseVector tmpVector = Vectors.zeros(y.size()).toDense();
// Form y := alpha*A*x + y.
// Case of IndexedRowMatrix
if( A.getClass() == IndexedRowMatrix.class) {
tmpVector = L2.DGEMV_IRW((IndexedRowMatrix) A, alpha, x, jsc);
}
else if (A.getClass() == CoordinateMatrix.class) {
tmpVector = L2.DGEMV_COORD((CoordinateMatrix) A, alpha, x, jsc);
}
else if (A.getClass() == BlockMatrix.class){
tmpVector = L2.DGEMV_BCK((BlockMatrix) A, alpha, x, jsc);
}
else {
tmpVector = null;
}
BLAS.axpy(1.0, tmpVector, y);
return y;
}
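DGEMV above follows the BLAS convention y := alpha*A*x + beta*y and dispatches on the concrete DistributedMatrix type. An illustrative call only, assuming an IndexedRowMatrix A such as the one built in the earlier sketch, an existing JavaSparkContext jsc, and the BLASpark L2 class on the classpath (imports omitted):

// result := 2.0 * A * x + 1.0 * y; with y all zeros this is simply 2 * A * x.
DenseVector x = Vectors.dense(1.0, 2.0).toDense();
DenseVector y = Vectors.dense(0.0, 0.0).toDense();
DenseVector result = L2.DGEMV(2.0, A, x, 1.0, y, jsc);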
Project: BLASpark
File: L2.java
private static DenseVector DGEMV_COORD(CoordinateMatrix matrix, double alpha, DenseVector vector, JavaSparkContext jsc) {
JavaRDD<MatrixEntry> items = matrix.entries().toJavaRDD();
DenseVector result = items.mapPartitions(new MatrixEntriesMultiplication(vector, alpha))
.reduce(new MatrixEntriesMultiplicationReducer());
return result;
}
Project: gcp
File: Spark1Batch.java
public static void main(String[] args) {
SparkConf sc = new SparkConf().setAppName("POC-Batch");
try(JavaSparkContext jsc = new JavaSparkContext(sc)) {
JavaRDD<ExampleXML> records = jsc.wholeTextFiles("input/")
.map(t -> t._2())
.map(new ParseXML());
System.out.printf("Amount of XMLs: %d\n", records.count());
}
}
Project: athena
File: MachineLearningManagerImpl.java
public DecisionTreeValidationSummary validateDecisionTreeAthenaFeatures(JavaSparkContext sc,
FeatureConstraint featureConstraint,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
DecisionTreeDetectionModel decisionTreeDetectionModel,
Indexing indexing, Marking marking) {
long start = System.nanoTime(); // <-- start
JavaPairRDD<Object, BSONObject> mongoRDD;
mongoRDD = sc.newAPIHadoopRDD(
mongodbConfig, // Configuration
MongoInputFormat.class, // InputFormat: read from a live cluster.
Object.class, // Key class
BSONObject.class // Value class
);
DecisionTreeDetectionAlgorithm decisionTreeDetectionAlgorithm = (DecisionTreeDetectionAlgorithm) decisionTreeDetectionModel.getDetectionAlgorithm();
DecisionTreeValidationSummary decisionTreeValidationSummary =
new DecisionTreeValidationSummary(sc.sc(), decisionTreeDetectionAlgorithm.getNumClasses(), indexing, marking);
DecisionTreeDistJob decisionTreeDistJob = new DecisionTreeDistJob();
decisionTreeDistJob.validate(mongoRDD,
athenaMLFeatureConfiguration,
decisionTreeDetectionModel,
decisionTreeValidationSummary);
long end = System.nanoTime(); // <-- end
long time = end - start;
decisionTreeValidationSummary.setTotalValidationTime(time);
return decisionTreeValidationSummary;
}
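Each of these athena validators reads its samples through the mongo-hadoop connector, so the mongodbConfig field is a Hadoop Configuration that at minimum carries the input URI. A minimal sketch of how such a configuration is typically assembled (the host, database, and collection names are placeholders, not values from the athena project):

import org.apache.hadoop.conf.Configuration;

public final class MongoConfigFactory {

    private MongoConfigFactory() {
    }

    // Points the mongo-hadoop MongoInputFormat at the collection it should read.
    public static Configuration createMongoInputConfig(String host, String db, String collection) {
        Configuration mongodbConfig = new Configuration();
        mongodbConfig.set("mongo.input.uri",
                String.format("mongodb://%s:27017/%s.%s", host, db, collection));
        return mongodbConfig;
    }
}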
Project: athena
File: MachineLearningManagerImpl.java
public GaussianMixtureValidationSummary validateGaussianMixtureAthenaFeatures(JavaSparkContext sc,
FeatureConstraint featureConstraint,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
GaussianMixtureDetectionModel gaussianMixtureDetectionModel,
Indexing indexing,
Marking marking) {
long start = System.nanoTime(); // <-- start
JavaPairRDD<Object, BSONObject> mongoRDD;
mongoRDD = sc.newAPIHadoopRDD(
mongodbConfig, // Configuration
MongoInputFormat.class, // InputFormat: read from a live cluster.
Object.class, // Key class
BSONObject.class // Value class
);
GaussianMixtureDetectionAlgorithm gaussianMixtureDetectionAlgorithm = (GaussianMixtureDetectionAlgorithm) gaussianMixtureDetectionModel.getDetectionAlgorithm();
GaussianMixtureValidationSummary gaussianMixtureValidationSummary =
new GaussianMixtureValidationSummary(sc.sc(), gaussianMixtureDetectionAlgorithm.getK(), indexing, marking);
GaussianMixtureDistJob gaussianMixtureDistJob = new GaussianMixtureDistJob();
gaussianMixtureDistJob.validate(mongoRDD,
athenaMLFeatureConfiguration,
gaussianMixtureDetectionModel,
gaussianMixtureValidationSummary);
long end = System.nanoTime(); // <-- end
long time = end - start;
gaussianMixtureValidationSummary.setTotalValidationTime(time);
return gaussianMixtureValidationSummary;
}
Project: Java-Data-Science-Cookbook
File: ScalaTest.java
public static void main( String[] args ){
String inputFile = "data/dummy.txt";
SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
JavaSparkContext sparkContext = new JavaSparkContext(configuration);
JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();
long numberA = logData.filter(new Function<String,Boolean>(){
private static final long serialVersionUID = 1L;
public Boolean call(String s){
return s.length() == 0;
}
}).count();
sparkContext.close();
System.out.println("Empty Lines: " + numberA);
}
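The anonymous Function above predates lambdas; with Java 8 the same empty-line filter collapses to a one-liner (an illustrative rewrite using the logData RDD from the snippet, not the book's original code):

// Counts lines whose length is zero, exactly like the anonymous Function version.
long numberA = logData.filter(s -> s.isEmpty()).count();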
Project: athena
File: MachineLearningManagerImpl.java
public LassoValidationSummary validateLassoAthenaFeatures(JavaSparkContext sc,
FeatureConstraint featureConstraint,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
LassoDetectionModel lassoDetectionModel,
Indexing indexing, Marking marking) {
long start = System.nanoTime(); // <-- start
JavaPairRDD<Object, BSONObject> mongoRDD;
mongoRDD = sc.newAPIHadoopRDD(
mongodbConfig, // Configuration
MongoInputFormat.class, // InputFormat: read from a live cluster.
Object.class, // Key class
BSONObject.class // Value class
);
LassoDetectionAlgorithm lassoDetectionAlgorithm =
(LassoDetectionAlgorithm) lassoDetectionModel.getDetectionAlgorithm();
LassoValidationSummary lassoValidationSummary = new LassoValidationSummary();
lassoValidationSummary.setLassoDetectionAlgorithm(lassoDetectionAlgorithm);
LassoDistJob lassoDistJob = new LassoDistJob();
lassoDistJob.validate(mongoRDD,
athenaMLFeatureConfiguration,
lassoDetectionModel,
lassoValidationSummary);
long end = System.nanoTime(); // <-- end
long time = end - start;
lassoValidationSummary.setValidationTime(time);
return lassoValidationSummary;
}
Project: athena
File: MachineLearningManagerImpl.java
public RidgeRegressionValidationSummary validateRidgeRegressionAthenaFeatures(JavaSparkContext sc,
FeatureConstraint featureConstraint,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
RidgeRegressionDetectionModel ridgeRegressionDetectionModel,
Indexing indexing, Marking marking) {
long start = System.nanoTime(); // <-- start
JavaPairRDD<Object, BSONObject> mongoRDD;
mongoRDD = sc.newAPIHadoopRDD(
mongodbConfig, // Configuration
MongoInputFormat.class, // InputFormat: read from a live cluster.
Object.class, // Key class
BSONObject.class // Value class
);
RidgeRegressionDetectionAlgorithm ridgeRegressionDetectionAlgorithm =
(RidgeRegressionDetectionAlgorithm) ridgeRegressionDetectionModel.getDetectionAlgorithm();
RidgeRegressionValidationSummary ridgeRegressionValidationSummary = new RidgeRegressionValidationSummary();
ridgeRegressionValidationSummary.setRidgeRegressionDetectionAlgorithm(ridgeRegressionDetectionAlgorithm);
RidgeRegressionDistJob ridgeRegressionDistJob = new RidgeRegressionDistJob();
ridgeRegressionDistJob.validate(mongoRDD,
athenaMLFeatureConfiguration,
ridgeRegressionDetectionModel,
ridgeRegressionValidationSummary);
long end = System.nanoTime(); // <-- end
long time = end - start;
ridgeRegressionValidationSummary.setValidationTime(time);
return ridgeRegressionValidationSummary;
}
Project: ViraPipe
File: Interleave.java
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
Path fqpath = new Path(fqPath);
String fqname = fqpath.getName();
String[] ns = fqname.split("\\.");
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
splitRDD.foreach( split -> {
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);
});
}
Project: ViraPipe
File: Interleave.java
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);
JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);
zips.foreach( splits -> {
Path path = splits._1.getPath();
FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);
writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
});
}
Project: Explainer
File: SparkUtils.java
public static SparkUtils getInstance() {
SparkUtils result = instance;
if (result == null) {
synchronized (SparkUtils.class) {
result = instance;
if (result == null) {
instance = result = new SparkUtils();
SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("Explainer");
jsc = new JavaSparkContext(sparkConf);
sqlContext = new SQLContext(jsc);
}
}
}
return result;
}
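One caveat about the double-checked locking above: it is only safe under the Java memory model if the singleton field is declared volatile; otherwise another thread may observe a partially constructed SparkUtils. A one-line sketch of the field declaration this pattern assumes (field name follows the snippet):

private static volatile SparkUtils instance;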
Project: oryx2
File: ALSUpdate.java
@Override
public void publishAdditionalModelData(JavaSparkContext sparkContext,
PMML pmml,
JavaRDD<String> newData,
JavaRDD<String> pastData,
Path modelParentPath,
TopicProducer<String, String> modelUpdateTopic) {
// Send item updates first, before users. That way, user-based endpoints like /recommend
// may take longer to not return 404, but when they do, the result will be more complete.
log.info("Sending item / Y data as model updates");
String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
JavaPairRDD<String,float[]> productRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));
String updateBroker = modelUpdateTopic.getUpdateBroker();
String topic = modelUpdateTopic.getTopic();
// For now, there is no use in sending known users for each item
productRDD.foreachPartition(new EnqueueFeatureVecsFn("Y", updateBroker, topic));
log.info("Sending user / X data as model updates");
String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
JavaPairRDD<String,float[]> userRDD = readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
if (noKnownItems) {
userRDD.foreachPartition(new EnqueueFeatureVecsFn("X", updateBroker, topic));
} else {
log.info("Sending known item data with model updates");
JavaRDD<String[]> allData =
(pastData == null ? newData : newData.union(pastData)).map(MLFunctions.PARSE_FN);
JavaPairRDD<String,Collection<String>> knownItems = knownsRDD(allData, true);
userRDD.join(knownItems).foreachPartition(
new EnqueueFeatureVecsAndKnownItemsFn("X", updateBroker, topic));
}
}
Project: athena
File: MachineLearningManagerImpl.java
public NaiveBayesValidationSummary validateNaiveBayesAthenaFeatures(JavaSparkContext sc,
FeatureConstraint featureConstraint,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
NaiveBayesDetectionModel naiveBayesDetectionModel,
Indexing indexing, Marking marking) {
long start = System.nanoTime(); // <-- start
JavaPairRDD<Object, BSONObject> mongoRDD;
mongoRDD = sc.newAPIHadoopRDD(
mongodbConfig, // Configuration
MongoInputFormat.class, // InputFormat: read from a live cluster.
Object.class, // Key class
BSONObject.class // Value class
);
NaiveBayesDetectionAlgorithm naiveBayesDetectionAlgorithm = (NaiveBayesDetectionAlgorithm) naiveBayesDetectionModel.getDetectionAlgorithm();
NaiveBayesValidationSummary naiveBayesValidationSummary =
new NaiveBayesValidationSummary(sc.sc(), naiveBayesDetectionAlgorithm.getNumClasses(), indexing, marking);
NaiveBayesDistJob naiveBayesDistJob = new NaiveBayesDistJob();
naiveBayesDistJob.validate(mongoRDD,
athenaMLFeatureConfiguration,
naiveBayesDetectionModel,
naiveBayesValidationSummary);
long end = System.nanoTime(); // <-- end
long time = end - start;
naiveBayesValidationSummary.setTotalValidationTime(time);
return naiveBayesValidationSummary;
}
Project: athena
File: MachineLearningManagerImpl.java
public GradientBoostedTreesValidationSummary validateGradientBoostedTreesAthenaFeatures(JavaSparkContext sc,
FeatureConstraint featureConstraint,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
GradientBoostedTreesDetectionModel gradientBoostedTreesDetectionModel,
Indexing indexing, Marking marking) {
long start = System.nanoTime(); // <-- start
JavaPairRDD<Object, BSONObject> mongoRDD;
mongoRDD = sc.newAPIHadoopRDD(
mongodbConfig, // Configuration
MongoInputFormat.class, // InputFormat: read from a live cluster.
Object.class, // Key class
BSONObject.class // Value class
);
GradientBoostedTreesDetectionAlgorithm gradientBoostedTreesDetectionAlgorithm =
(GradientBoostedTreesDetectionAlgorithm) gradientBoostedTreesDetectionModel.getDetectionAlgorithm();
GradientBoostedTreesValidationSummary gradientBoostedTreesValidationSummary =
new GradientBoostedTreesValidationSummary(sc.sc(),
gradientBoostedTreesDetectionAlgorithm.getNumClasses(), indexing, marking);
GradientBoostedTreesDistJob gradientBoostedTreesDistJob = new GradientBoostedTreesDistJob();
gradientBoostedTreesDistJob.validate(mongoRDD,
athenaMLFeatureConfiguration,
gradientBoostedTreesDetectionModel,
gradientBoostedTreesValidationSummary);
long end = System.nanoTime(); // <-- end
long time = end - start;
gradientBoostedTreesValidationSummary.setTotalValidationTime(time);
return gradientBoostedTreesValidationSummary;
}
Project: movie-recommender
File: SparkModule.java
@Provides
CassandraIo<RawRating> providesCassandraRatingIO(JavaSparkContext sparkContext) {
if (ratingCassandraIo != null) {
return ratingCassandraIo;
}
ratingCassandraIo = new CassandraIo<>(RawRating.class, "dev", "ratings");
ratingCassandraIo.setSparkContext(sparkContext);
return ratingCassandraIo;
}
Project: metadata-qa-marc
File: ParallelValidator.java
public static void main(String[] args) throws ParseException {
final Validator validator = new Validator(args);
ValidatorParameters params = validator.getParameters();
validator.setDoPrintInProcessRecord(false);
logger.info("Input file is " + params.getArgs());
SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
JavaSparkContext context = new JavaSparkContext(conf);
System.err.println(validator.getParameters().formatParameters());
JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);
JavaRDD<String> baseCountsRDD = inputFile
.flatMap(content -> {
MarcReader reader = ReadMarc.getMarcStringReader(content);
Record marc4jRecord = reader.next();
MarcRecord marcRecord = MarcFactory.createFromMarc4j(
marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
validator.processRecord(marcRecord, 1);
return ValidationErrorFormatter
.formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
.iterator();
}
);
baseCountsRDD.saveAsTextFile(validator.getParameters().getFileName());
}
Project: athena
File: MachineLearningManagerImpl.java
public LinearRegressionValidationSummary validateLinearRegressionAthenaFeatures(JavaSparkContext sc,
FeatureConstraint featureConstraint,
AthenaMLFeatureConfiguration athenaMLFeatureConfiguration,
LinearRegressionDetectionModel linearRegressionDetectionModel,
Indexing indexing, Marking marking) {
long start = System.nanoTime(); // <-- start
JavaPairRDD<Object, BSONObject> mongoRDD;
mongoRDD = sc.newAPIHadoopRDD(
mongodbConfig, // Configuration
MongoInputFormat.class, // InputFormat: read from a live cluster.
Object.class, // Key class
BSONObject.class // Value class
);
LinearRegressionDetectionAlgorithm linearRegressionDetectionAlgorithm =
(LinearRegressionDetectionAlgorithm) linearRegressionDetectionModel.getDetectionAlgorithm();
LinearRegressionValidationSummary linearRegressionValidationSummary =
new LinearRegressionValidationSummary();
linearRegressionValidationSummary.setLinearRegressionDetectionAlgorithm(linearRegressionDetectionAlgorithm);
LinearRegressionDistJob linearRegressionDistJob = new LinearRegressionDistJob();
linearRegressionDistJob.validate(mongoRDD,
athenaMLFeatureConfiguration,
linearRegressionDetectionModel,
linearRegressionValidationSummary);
long end = System.nanoTime(); // <-- end
long time = end - start;
linearRegressionValidationSummary.setValidationTime(time);
return linearRegressionValidationSummary;
}
Project: rdf2x
File: TestUtils.java
/**
* Parse RDF file from resources folder
* @param sc spark context to use for parsing
* @param fileName name of the file to parse
* @return RDD of quads from the requested file
*/
public static JavaRDD<Quad> getQuadsRDD(JavaSparkContext sc, String fileName) {
QuadParser parser = new ElephasQuadParser(
new QuadParserConfig()
.setBatchSize(2),
sc
);
String path = TestUtils.getDatasetPath(fileName);
return parser.parseQuads(path);
}