@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    JobBuilder.printUsage(this, "<path> <key>");
    return -1;
  }
  Path path = new Path(args[0]);
  IntWritable key = new IntWritable(Integer.parseInt(args[1]));

  Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
  Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
  Text val = new Text();

  // getEntry picks the reader for the key's partition and looks the key up in that part file.
  Writable entry = MapFileOutputFormat.getEntry(readers, partitioner, key, val);
  if (entry == null) {
    System.err.println("Key not found: " + key);
    return -1;
  }
  NcdcRecordParser parser = new NcdcRecordParser();
  parser.parse(val.toString());
  System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
  return 0;
}
private void assertData(int totalShardCount) throws IOException {
  Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
  for (int i = 0; i < totalShardCount; i++) {
    HdfsDirectory directory = new HdfsDirectory(configuration, new Path(path, ShardUtil.getShardName(i)));
    DirectoryReader reader = DirectoryReader.open(directory);
    int numDocs = reader.numDocs();
    for (int d = 0; d < numDocs; d++) {
      Document document = reader.document(d);
      IndexableField field = document.getField("id");
      Integer id = (Integer) field.numericValue();
      // Every document stored in shard i must hash to partition i.
      int partition = partitioner.getPartition(new IntWritable(id), null, totalShardCount);
      assertEquals(i, partition);
    }
    reader.close();
  }
}
private static void createShard(Configuration configuration, int i, Path path, int totalShardCount)
    throws IOException {
  HdfsDirectory hdfsDirectory = new HdfsDirectory(configuration, path);
  IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new KeywordAnalyzer());
  TieredMergePolicy mergePolicy = (TieredMergePolicy) conf.getMergePolicy();
  mergePolicy.setUseCompoundFile(false);
  IndexWriter indexWriter = new IndexWriter(hdfsDirectory, conf);

  // Sanity check: the shard index must match the hash partition of the document id.
  Partitioner<IntWritable, IntWritable> partitioner = new HashPartitioner<IntWritable, IntWritable>();
  int partition = partitioner.getPartition(new IntWritable(i), null, totalShardCount);
  assertEquals(i, partition);

  Document doc = getDoc(i);
  indexWriter.addDocument(doc);
  indexWriter.close();
}
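Both shard snippets above rely on the same invariant: a document's id is mapped to a shard index by HashPartitioner, so shard i can only ever hold ids whose partition is i. The standalone sketch below makes that mapping explicit; the class name ExpectedShard is hypothetical, and only Hadoop's IntWritable and the new-API HashPartitioner are assumed.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

// Hypothetical helper illustrating the shard/partition invariant checked above.
public class ExpectedShard {

  // Returns the shard a document id is expected to land in.
  public static int shardForId(int id, int totalShardCount) {
    HashPartitioner<IntWritable, IntWritable> partitioner =
        new HashPartitioner<IntWritable, IntWritable>();
    // HashPartitioner never looks at the value, so passing null is safe here,
    // exactly as in the test code above.
    return partitioner.getPartition(new IntWritable(id), null, totalShardCount);
  }

  public static void main(String[] args) {
    // IntWritable.hashCode() is the wrapped value itself, so id 7 with 4 shards
    // lands in shard 7 % 4 = 3.
    System.out.println(shardForId(7, 4));
  }
}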
/**
 * Test {@link LembosMapReduceRunner#initJob(String[])} works as expected for a map only job.
 *
 * @throws Exception if anything goes wrong
 */
@Test
public void testMapOnlyJob() throws Exception {
  final String moduleName = "LembosMapReduceRunnerTest-testMapOnlyJob";
  final String modulePath = TestUtils.getModulePath(moduleName);
  final Job job = getJob(moduleName, modulePath, null, null);

  assertNotNull(job.getMapperClass());
  assertNull(job.getCombinerClass());
  // assertNull(job.getGroupingComparator()); // Throws an exception because our map output key is
  //                                          // WritableComparable and can't subclass itself
  assertEquals(HashPartitioner.class, job.getPartitionerClass());
  assertEquals(Reducer.class, job.getReducerClass()); // Defaults to the Hadoop Reducer
  // assertNull(job.getSortComparator()); // Throws an exception because our map output key is
  //                                      // WritableComparable and can't subclass itself
  assertNull(job.getConfiguration().get("boolean"));
  assertNull(job.getConfiguration().get("double"));
  assertNull(job.getConfiguration().get("float"));
  assertNull(job.getConfiguration().get("int"));
  assertNull(job.getConfiguration().get("long"));
  assertNull(job.getConfiguration().get("string"));
}
/**
 * Get the {@link Partitioner} class for the job.
 *
 * @return the {@link Partitioner} class for the job.
 */
@SuppressWarnings("unchecked")
public Class<? extends Partitioner<?,?>> getPartitionerClass() throws ClassNotFoundException {
  return (Class<? extends Partitioner<?,?>>) conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
}
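For reference, the HashPartitioner default returned above does nothing more than reduce the key's hashCode() modulo the number of reducers. The sketch below mirrors that logic in a standalone class; the name HashLikePartitioner is made up for illustration, since Hadoop ships the real implementation as HashPartitioner.

import org.apache.hadoop.mapreduce.Partitioner;

// Illustration of what the HashPartitioner default does.
public class HashLikePartitioner<K, V> extends Partitioner<K, V> {

  @Override
  public int getPartition(K key, V value, int numReduceTasks) {
    // Mask the sign bit so a negative hashCode still maps to a valid partition,
    // then spread keys across reducers by modulo.
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}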
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  Path titlesDir = new Path(conf.get("pagerank.titles_dir"));
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
  Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
  IntWritable page = new IntWritable();
  Text title = new Text();

  float[] pageRanks = new float[topN.size()];
  String[] titles = new String[topN.size()];

  // The order of the entries is reversed. The priority queue is in
  // non-decreasing order and we want the highest PageRank first.
  for (int i = pageRanks.length - 1; i >= 0; i--) {
    Map.Entry<Float, Integer> entry = topN.poll();
    // Get the title of the page from the title index.
    page.set(entry.getValue());
    MapFileOutputFormat.getEntry(readers, partitioner, page, title);
    pageRanks[i] = entry.getKey();
    titles[i] = title.toString();
  }

  for (MapFile.Reader reader : readers) {
    reader.close();
  }

  for (int i = 0; i < pageRanks.length; i++) {
    context.write(new FloatWritable(pageRanks[i]), new Text(titles[i]));
  }
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  Path titlesDir = new Path(conf.get("inlinks.titles_dir"));
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(titlesDir, conf);
  Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
  IntWritable page = new IntWritable();
  Text title = new Text();

  int[] inLinks = new int[topN.size()];
  String[] titles = new String[topN.size()];

  // Fill the arrays in reverse: the priority queue yields the smallest in-link
  // count first, but the output should list the most-linked pages first.
  for (int i = inLinks.length - 1; i >= 0; i--) {
    Map.Entry<Integer, Integer> entry = topN.poll();
    // Look up the page title in the title index.
    page.set(entry.getValue());
    MapFileOutputFormat.getEntry(readers, partitioner, page, title);
    inLinks[i] = entry.getKey();
    titles[i] = title.toString();
  }

  for (MapFile.Reader reader : readers) {
    reader.close();
  }

  for (int i = 0; i < inLinks.length; i++) {
    context.write(new IntWritable(inLinks[i]), new Text(titles[i]));
  }
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    JobBuilder.printUsage(this, "<path> <key>");
    return -1;
  }
  Path path = new Path(args[0]);
  IntWritable key = new IntWritable(Integer.parseInt(args[1]));

  Reader[] readers = MapFileOutputFormat.getReaders(path, getConf());
  Partitioner<IntWritable, Text> partitioner = new HashPartitioner<IntWritable, Text>();
  Text val = new Text();

  // Pick the reader for the key's partition and position it at the first match.
  Reader reader = readers[partitioner.getPartition(key, val, readers.length)];
  Writable entry = reader.get(key, val);
  if (entry == null) {
    System.err.println("Key not found: " + key);
    return -1;
  }
  NcdcRecordParser parser = new NcdcRecordParser();
  IntWritable nextKey = new IntWritable();
  // Unlike the single-entry lookup above, keep reading while the key repeats,
  // so every record stored under the same key is printed.
  do {
    parser.parse(val.toString());
    System.out.printf("%s\t%s\n", parser.getStationId(), parser.getYear());
  } while (reader.next(nextKey, val) && key.equals(nextKey));
  return 0;
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
    ToolRunner.printGenericCommandUsage(System.err);
    return -1;
  }
  Job job = new Job(getConf());
  job.setJarByClass(getClass());
  FileInputFormat.addInputPath(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  /* by default start */
  job.setInputFormatClass(TextInputFormat.class);

  job.setMapperClass(Mapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Text.class);

  job.setPartitionerClass(HashPartitioner.class);
  job.setNumReduceTasks(1);
  job.setReducerClass(Reducer.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);

  job.setOutputFormatClass(TextOutputFormat.class);
  /* by default end */

  return job.waitForCompletion(true) ? 0 : 1;
}
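Because the snippet above spells out every default explicitly, it is also a convenient skeleton for overriding one of them. As a hedged illustration only (the class and its routing rule are hypothetical, not part of the original job), a custom Partitioner could replace the HashPartitioner line like this:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: send small keys to a dedicated reducer and
// spread the rest by modulo, instead of hashing everything.
public class SmallKeyPartitioner extends Partitioner<LongWritable, Text> {

  @Override
  public int getPartition(LongWritable key, Text value, int numReduceTasks) {
    if (numReduceTasks > 1 && key.get() < 1000L) {
      return 0;
    }
    return (int) ((key.get() & Long.MAX_VALUE) % numReduceTasks);
  }
}

// Wired into the job in place of the defaults:
//   job.setPartitionerClass(SmallKeyPartitioner.class);
//   job.setNumReduceTasks(4);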
/**
 * w/ reduce.
 * @throws Exception if failed
 */
@Test
public void reduce() throws Exception {
  FileEditor.put(new File(folder.getRoot(), "input/test.txt"), "Hello, world!");
  Path root = new Path(folder.getRoot().toURI());
  Path base = new Path(root, "output");
  ClassDescription client = new ClassDescription("com.example.StageClient");
  MapReduceStageInfo info = new MapReduceStageInfo(
      new StageInfo("simple", "simple", "simple"),
      Arrays.asList(new MapReduceStageInfo.Input(
          new Path(root, "input/*.txt").toString(),
          classOf(Text.class),
          classOf(TextInputFormat.class),
          classOf(Mapper.class),
          Collections.emptyMap())),
      Arrays.asList(new MapReduceStageInfo.Output(
          "out",
          classOf(NullWritable.class),
          classOf(Text.class),
          classOf(TextOutputFormat.class),
          Collections.emptyMap())),
      Collections.emptyList(),
      new MapReduceStageInfo.Shuffle(
          classOf(LongWritable.class),
          classOf(Text.class),
          classOf(HashPartitioner.class),
          null,
          classOf(LongWritable.Comparator.class),
          classOf(LongWritable.Comparator.class),
          classOf(SimpleReducer.class)),
      base.toString());
  MapReduceStageEmitter.emit(client, info, javac);
  int status = MapReduceRunner.execute(
      new Configuration(),
      client,
      "testing",
      Collections.emptyMap(),
      javac.compile());
  assertThat("exit status code", status, is(0));
  assertThat(collect("output"), contains("Hello, world!"));
}
@Override
public void map(ShortArrayWritable inKey, MatrixBlockWritable inValue, Context context)
    throws IOException, InterruptedException {
  // This task gets each block M_{i,j}, loads the corresponding stripe j
  // of the vector v_{k-1} and produces the partial result of the stripe i
  // of the vector v_k.
  Configuration conf = context.getConfiguration();
  int iter = Integer.parseInt(conf.get("pagerank.iteration"));
  int numPages = Integer.parseInt(conf.get("pagerank.num_pages"));
  short blockSize = Short.parseShort(conf.get("pagerank.block_size"));

  Writable[] blockIndexes = inKey.get();
  short i = ((ShortWritable) blockIndexes[0]).get();
  short j = ((ShortWritable) blockIndexes[1]).get();

  int vjSize = (j > numPages / blockSize) ? (numPages % blockSize) : blockSize;
  FloatWritable[] vj = new FloatWritable[vjSize];

  if (iter == 1) {
    // Initial PageRank vector with 1/n for all pages.
    for (int k = 0; k < vj.length; k++) {
      vj[k] = new FloatWritable(1.0f / numPages);
    }
  } else {
    // Load the stripe j of the vector v_{k-1} from the MapFiles.
    Path outputDir = MapFileOutputFormat.getOutputPath(context).getParent();
    Path vjDir = new Path(outputDir, "v" + (iter - 1));
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(vjDir, conf);
    Partitioner<ShortWritable, FloatArrayWritable> partitioner =
        new HashPartitioner<ShortWritable, FloatArrayWritable>();
    ShortWritable key = new ShortWritable(j);
    FloatArrayWritable value = new FloatArrayWritable();
    MapFileOutputFormat.getEntry(readers, partitioner, key, value);
    Writable[] writables = value.get();
    for (int k = 0; k < vj.length; k++) {
      vj[k] = (FloatWritable) writables[k];
    }
    for (MapFile.Reader reader : readers) {
      reader.close();
    }
  }

  // Initialize the partial result i of the vector v_k.
  int viSize = (i > numPages / blockSize) ? (numPages % blockSize) : blockSize;
  FloatWritable[] vi = new FloatWritable[viSize];
  for (int k = 0; k < vi.length; k++) {
    vi[k] = new FloatWritable(0);
  }

  // Multiply M_{i,j} by the stripe j of the vector v_{k-1} to obtain the
  // partial result i of the vector v_k.
  Writable[][] blockColumns = inValue.get();
  for (int k = 0; k < blockColumns.length; k++) {
    Writable[] blockColumn = blockColumns[k];
    if (blockColumn.length > 0) {
      int vDegree = ((ShortWritable) blockColumn[0]).get();
      for (int columnIndex = 1; columnIndex < blockColumn.length; columnIndex++) {
        int l = ((ShortWritable) blockColumn[columnIndex]).get();
        vi[l].set(vi[l].get() + (1.0f / vDegree) * vj[k].get());
      }
    }
  }

  context.write(new ShortWritable(i), new FloatArrayWritable(vi));
}
@Override
public void visitRank(PORank op) throws VisitorException {
  try {
    // Rank implementation has 3 vertices
    // Vertex 1 has POCounterTez produce output tuples and send to Vertex 3 via 1-1 edge.
    // Vertex 1 also sends the count of tuples of each task in Vertex 1 to Vertex 2 which is a single reducer.
    // Vertex 3 has PORankTez which consumes from Vertex 2 as broadcast input and also tuples from Vertex 1 and
    // produces tuples with updated ranks based on the count of tuples from Vertex 2.
    // This is different from MR implementation where POCounter updates job counters, and that is
    // copied by JobControlCompiler into the PORank job's jobconf.

    // Previous operator is always POCounterTez (Vertex 1)
    TezOperator counterOper = curTezOp;
    POCounterTez counterTez = (POCounterTez) counterOper.plan.getLeaves().get(0);

    // Construct Vertex 2
    TezOperator statsOper = getTezOp();
    tezPlan.add(statsOper);
    POCounterStatsTez counterStatsTez = new POCounterStatsTez(OperatorKey.genOpKey(scope));
    statsOper.plan.addAsLeaf(counterStatsTez);
    statsOper.setRequestedParallelism(1);
    statsOper.setDontEstimateParallelism(true);

    // Construct Vertex 3
    TezOperator rankOper = getTezOp();
    tezPlan.add(rankOper);
    PORankTez rankTez = new PORankTez(op);
    rankOper.plan.addAsLeaf(rankTez);
    curTezOp = rankOper;

    // Connect counterOper vertex to rankOper vertex by 1-1 edge
    rankOper.setRequestedParallelismByReference(counterOper);
    TezEdgeDescriptor edge = TezCompilerUtil.connect(tezPlan, counterOper, rankOper);
    rankOper.setUseMRMapSettings(counterOper.isUseMRMapSettings());
    TezCompilerUtil.configureValueOnlyTupleOutput(edge, DataMovementType.ONE_TO_ONE);
    counterTez.setTuplesOutputKey(rankOper.getOperatorKey().toString());
    rankTez.setTuplesInputKey(counterOper.getOperatorKey().toString());

    // Connect counterOper vertex to statsOper vertex by Shuffle edge
    edge = TezCompilerUtil.connect(tezPlan, counterOper, statsOper);
    // Task id
    edge.setIntermediateOutputKeyClass(IntWritable.class.getName());
    edge.partitionerClass = HashPartitioner.class;
    // Number of records in that task
    edge.setIntermediateOutputValueClass(LongWritable.class.getName());
    counterTez.setStatsOutputKey(statsOper.getOperatorKey().toString());
    counterStatsTez.setInputKey(counterOper.getOperatorKey().toString());

    // Connect statsOper vertex to rankOper vertex by Broadcast edge
    edge = TezCompilerUtil.connect(tezPlan, statsOper, rankOper);
    // Map of task id, offset count based on total number of records is in the value
    TezCompilerUtil.configureValueOnlyTupleOutput(edge, DataMovementType.BROADCAST);
    counterStatsTez.setOutputKey(rankOper.getOperatorKey().toString());
    rankTez.setStatsInputKey(statsOper.getOperatorKey().toString());

    phyToTezOpMap.put(op, rankOper);
  } catch (Exception e) {
    int errCode = 2034;
    String msg = "Error compiling operator " + op.getClass().getSimpleName();
    throw new TezCompilerException(msg, errCode, PigException.BUG, e);
  }
}
protected final GroupingOptions groupWithComparator(
    Class<? extends RawComparator<?>> comparator) {
  return groupingOptions(HashPartitioner.class, comparator);
}
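The groupingOptions(...) helper this method delegates to is not shown. Below is a minimal, hypothetical reconstruction of what it plausibly looks like, assuming Apache Crunch's GroupingOptions.Builder API; treat the builder method names partitionerClass and groupingComparatorClass as assumptions to verify against the Crunch version in use.

import org.apache.crunch.GroupingOptions;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical reconstruction of the missing helper; not the original code.
private GroupingOptions groupingOptions(
    Class<? extends Partitioner> partitionerClass,
    Class<? extends RawComparator<?>> comparator) {
  return GroupingOptions.builder()
      .partitionerClass(partitionerClass)
      .groupingComparatorClass(comparator)
      .build();
}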
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

  addInputOption();
  addOutputOption();
  addOption("numberOfColumns", "r", "Number of columns in the input matrix");
  addOption("similarityClassname", "s",
      "Name of distributed similarity class to instantiate, alternatively use "
          + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
  addOption("maxSimilaritiesPerRow", "m",
      "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
      String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

  Map<String, String> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
  String similarityClassnameArg = parsedArgs.get("--similarityClassname");
  String distributedSimilarityClassname;
  try {
    distributedSimilarityClassname =
        SimilarityType.valueOf(similarityClassnameArg).getSimilarityImplementationClassName();
  } catch (IllegalArgumentException iae) {
    distributedSimilarityClassname = similarityClassnameArg;
  }

  int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

  Path inputPath = getInputPath();
  Path outputPath = getOutputPath();
  Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

  Path weightsPath = new Path(tempDirPath, "weights");
  Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

  AtomicInteger currentPhase = new AtomicInteger();

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class,
        RowWeightMapper.class, VarIntWritable.class, WeightedOccurrence.class,
        WeightedOccurrencesPerColumnReducer.class, VarIntWritable.class, WeightedOccurrenceArray.class,
        SequenceFileOutputFormat.class);
    weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
    weights.waitForCompletion(true);
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
        CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class,
        SimilarityReducer.class, SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
        SequenceFileOutputFormat.class);
    Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
    pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
    pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
    pairwiseSimilarity.waitForCompletion(true);
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
        Mapper.class, SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
        EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
        SequenceFileOutputFormat.class);
    asMatrix.setPartitionerClass(HashPartitioner.class);
    asMatrix.setGroupingComparatorClass(SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
    asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
    asMatrix.waitForCompletion(true);
  }

  return 0;
}
public static void main(String[] args) throws Exception {
  if (args.length != 4) {
    System.out.println("USAGE: <Number of vertices> <Number of edges per vertex> <Number of partitions> <Outpath>");
    return;
  }
  System.out.println(Arrays.toString(args));
  Configuration conf = new Configuration();
  conf.setInt("hama.num.vertices", Integer.parseInt(args[0]));
  conf.setInt("hama.num.partitions", Integer.parseInt(args[2]));
  conf.setInt("number.edges", Integer.parseInt(args[1]));
  Job job = new Job(conf);

  // First job ("RangeWriter"): produce the intermediate "generated" sequence file
  // that serves as input for the second job.
  Path generated = new Path(new Path(args[3]).getParent(), "generated");
  FileOutputFormat.setOutputPath(job, generated);
  FileSystem.get(conf).delete(generated, true);

  job.setJobName("RangeWriter");
  job.setJarByClass(SortGenMapper.class);
  job.setMapperClass(SortGenMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);
  job.setInputFormatClass(RangeInputFormat.class);

  job.waitForCompletion(true);
  conf.setInt("max.id", Integer.valueOf(args[0]));

  // Second job ("Random Vertex Writer"): read the generated data and write random
  // vertices, spread across "hama.num.partitions" reducers by HashPartitioner.
  job = new Job(conf);
  FileOutputFormat.setOutputPath(job, new Path(args[3]));
  FileSystem.get(conf).delete(new Path(args[3]), true);

  job.setJobName("Random Vertex Writer");
  FileInputFormat.addInputPath(job, generated);
  job.setJarByClass(RandomMapper.class);
  job.setMapperClass(RandomMapper.class);
  job.setReducerClass(Reducer.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  job.setNumReduceTasks(conf.getInt("hama.num.partitions", 2));
  job.setPartitionerClass(HashPartitioner.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.waitForCompletion(true);
}