@Test
public void readBitcoinRawBlockInputFormatBzip2Compressed() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, job);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4comp.blk.bz2";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue(reader.next(key, block), "Input split for block version contains at least one block");
    assertEquals(998039, block.getLength(), "Compressed block must have a size of 998,039 bytes");
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse(reader.next(emptyKey, emptyBlock), "No further blocks in compressed file");
    reader.close();
}
@Test
public void readBitcoinTransactionInputFormatBzip2Compressed() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, job);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4comp.blk.bz2";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
    RecordReader<BytesWritable, BitcoinTransaction> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    BytesWritable key = new BytesWritable();
    BitcoinTransaction transaction = new BitcoinTransaction();
    int transactCount = 0;
    while (reader.next(key, transaction)) {
        transactCount++;
    }
    assertEquals(936, transactCount, "Compressed block must contain exactly 936 transactions");
    reader.close();
}
@Test
public void readBitcoinRawBlockInputFormatBzip2Compressed() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    Job job = Job.getInstance(conf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, conf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4comp.blk.bz2";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    assertEquals(1, splits.size(), "Only one split generated for compressed block");
    RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned null RecordReader");
    reader.initialize(splits.get(0), context);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue(reader.nextKeyValue(), "Input split for block version contains at least one block");
    block = reader.getCurrentValue();
    assertEquals(998039, block.getLength(), "Compressed block must have a size of 998,039 bytes");
    assertFalse(reader.nextKeyValue(), "No further blocks in compressed file");
    reader.close();
}
@Test
public void readBitcoinTransactionInputFormatBzip2Compressed() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    Job job = Job.getInstance(conf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, conf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4comp.blk.bz2";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    assertEquals(1, splits.size(), "Only one split generated for compressed block");
    RecordReader<BytesWritable, BitcoinTransaction> reader = format.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned null RecordReader");
    reader.initialize(splits.get(0), context);
    int transactCount = 0;
    while (reader.nextKeyValue()) {
        transactCount++;
    }
    assertEquals(936, transactCount, "Compressed block must contain exactly 936 transactions");
    reader.close();
}
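// Hedged sketch, not part of the test suite above: all four bz2 tests rely on
// Hadoop resolving BZip2Codec from the ".bz2" file extension. A minimal standalone
// reader exercising the same mechanism could look like this; the local path is
// illustrative, only the file name comes from the tests.
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class Bzip2ExtensionReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("testdata/version4comp.blk.bz2");
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path); // BZip2Codec, resolved from ".bz2"
        FileSystem fs = path.getFileSystem(conf);
        long total = 0;
        try (InputStream in = codec.createInputStream(fs.open(path))) {
            byte[] buf = new byte[8192];
            for (int n; (n = in.read(buf)) > 0; ) {
                total += n;
            }
        }
        // the tests above expect a single 998,039-byte raw block in this file
        System.out.println("decompressed bytes: " + total);
    }
}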
public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(DataBalancer.class);
    job.setJobName(DataBalancer.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setReducerClass(ReduceRecordOnly.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[2]));
    if (args.length > 3) {
        if (args[3].startsWith("bzip")) {
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        }
        if (args[3].startsWith("gz")) {
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        }
    }
    JobClient.runJob(job);
}
@Override
public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, super.getConf(), args);
    if (job == null) {
        return -1;
    }
    job.setMapperClass(CleanerMapper.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    return job.waitForCompletion(true) ? 0 : 1;
}
@Override
public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
        return -1;
    }
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    return job.waitForCompletion(true) ? 0 : 1;
}
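// Hedged companion to the two drivers above (imports as in the surrounding
// snippets): reading back the BLOCK-compressed SequenceFile that the first job
// writes. Codec and compression type are recorded in the file header, so the
// read side needs no bzip2-specific configuration. The part-file name is
// illustrative (a map-only job writes part-m-00000), and the key/value types
// follow the first job's output classes.
public static void dumpSequenceFile(Configuration conf, Path part) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(part));
    try {
        IntWritable key = new IntWritable();
        Text value = new Text();
        while (reader.next(key, value)) {
            System.out.println(key.get() + "\t" + value);
        }
    } finally {
        reader.close();
    }
}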
@Test
public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader()
        .getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH,
        Integer.MAX_VALUE);
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null);
    LineRecordReader reader = new LineRecordReader(conf, split);
    LongWritable key = new LongWritable();
    Text value = new Text();
    // drain the reader, then close it twice
    //noinspection StatementWithEmptyBody
    while (reader.next(key, value)) ;
    reader.close();
    reader.close();
    // the double close must not return the decompressor to the pool twice:
    // ten gets should still yield ten distinct decompressors
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i) {
        decompressors.add(CodecPool.getDecompressor(codec));
    }
    assertEquals(10, decompressors.size());
}
@Test
public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader()
        .getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH,
        Integer.MAX_VALUE);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    // read all records, then close the reader twice
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    //noinspection StatementWithEmptyBody
    while (reader.nextKeyValue()) ;
    reader.close();
    reader.close();
    // the double close must not return the decompressor to the pool twice:
    // ten gets should still yield ten distinct decompressors
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i) {
        decompressors.add(CodecPool.getDecompressor(codec));
    }
    assertEquals(10, decompressors.size());
}
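// What both testMultipleClose() variants guard against, as a minimal sketch
// (imports as in the tests above; the method name is illustrative): a record
// reader must return its borrowed Decompressor to the CodecPool at most once.
// If close() returned the same instance twice, two later callers could borrow
// the same object, which is why the tests demand 10 distinct decompressors
// after 10 gets.
public static void borrowAndReturnOnce(Configuration conf) throws IOException {
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Decompressor decompressor = CodecPool.getDecompressor(codec); // borrow
    try {
        // ... use codec.createInputStream(someStream, decompressor) here ...
    } finally {
        CodecPool.returnDecompressor(decompressor); // return exactly once
    }
}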
@Test
public void readBitcoinBlockInputFormatBzip2Compressed() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, job);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4comp.blk.bz2";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinBlockFileInputFormat format = new BitcoinBlockFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
    RecordReader<BytesWritable, BitcoinBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    BytesWritable key = new BytesWritable();
    BitcoinBlock block = new BitcoinBlock();
    assertTrue(reader.next(key, block), "Input split for block version contains at least one block");
    assertEquals(936, block.getTransactions().size(), "Compressed block must contain exactly 936 transactions");
    assertEquals(4, block.getTransactions().get(0).getListOfInputs().get(0).getTxInScript().length,
        "First transaction has one input with script length 4");
    assertEquals(2, block.getTransactions().get(0).getListOfOutputs().size(),
        "First transaction has two outputs");
    assertEquals(25, block.getTransactions().get(0).getListOfOutputs().get(0).getTxOutScript().length,
        "First output of the first transaction has script length 25");
    BytesWritable emptyKey = new BytesWritable();
    BitcoinBlock emptyBlock = new BitcoinBlock();
    assertFalse(reader.next(emptyKey, emptyBlock), "No further blocks in compressed file");
    reader.close();
}
@Test
public void readBitcoinBlockInputFormatBzip2Compressed() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    Job job = Job.getInstance(conf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, conf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4comp.blk.bz2";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinBlockFileInputFormat format = new BitcoinBlockFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    assertEquals(1, splits.size(), "Only one split generated for compressed block");
    RecordReader<BytesWritable, BitcoinBlock> reader = format.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned null RecordReader");
    reader.initialize(splits.get(0), context);
    BytesWritable key = new BytesWritable();
    BitcoinBlock block = new BitcoinBlock();
    assertTrue(reader.nextKeyValue(), "Input split for block version contains at least one block");
    block = reader.getCurrentValue();
    assertEquals(936, block.getTransactions().size(), "Compressed block must contain exactly 936 transactions");
    assertEquals(4, block.getTransactions().get(0).getListOfInputs().get(0).getTxInScript().length,
        "First transaction has one input with script length 4");
    assertEquals(2, block.getTransactions().get(0).getListOfOutputs().size(),
        "First transaction has two outputs");
    assertEquals(25, block.getTransactions().get(0).getListOfOutputs().get(0).getTxOutScript().length,
        "First output of the first transaction has script length 25");
    assertFalse(reader.nextKeyValue(), "No further blocks in compressed file");
    reader.close();
}
@DataProvider(name="test-hdfs-partitioner") public static Object[][] data() { List<Object[]> parameters = new ArrayList<Object[]>(); for (Class<?> compressionClass : new Class<?>[]{null, DefaultCodec.class, BZip2Codec.class}) { for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE}) { parameters.add(new Object[]{outputFileType, compressionClass}); } } return parameters.toArray(new Object[0][]); }
@DataProvider(name="test-hdfs-extractor") public static Object[][] data() { List<Object[]> parameters = new ArrayList<Object[]>(); for (Class<?> compressionClass : new Class<?>[]{null, DefaultCodec.class, BZip2Codec.class}) { for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE}) { parameters.add(new Object[]{outputFileType, compressionClass}); } } return parameters.toArray(new Object[0][]); }
private void setCompression(Path path, Job job) {
    String location = path.getName();
    if (location.endsWith(".bz2") || location.endsWith(".bz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        FileOutputFormat.setCompressOutput(job, false);
    }
}
@Override public void setStoreLocation(String location, Job job) throws IOException { job.getConfiguration().set("mapred.textoutputformat.separator", ""); FileOutputFormat.setOutputPath(job, new Path(location)); if (comp == Compression.bz2 || comp == Compression.bz) { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); } else if (comp == Compression.gz) { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); } }
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (comp == Compression.bz2 || comp == Compression.bz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (comp == Compression.gz) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
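// A hedged, table-driven generalization of setCompression() and
// setStoreLocation() above; the map and the helper name are illustrative, not
// from the original sources (imports as in the surrounding snippets, plus
// java.util.Map and java.util.HashMap).
private static final Map<String, Class<? extends CompressionCodec>> CODEC_BY_EXTENSION =
        new HashMap<String, Class<? extends CompressionCodec>>();
static {
    CODEC_BY_EXTENSION.put(".bz2", BZip2Codec.class);
    CODEC_BY_EXTENSION.put(".bz", BZip2Codec.class);
    CODEC_BY_EXTENSION.put(".gz", GzipCodec.class);
}

private static void setCompressionByExtension(Path path, Job job) {
    String name = path.getName();
    for (Map.Entry<String, Class<? extends CompressionCodec>> entry : CODEC_BY_EXTENSION.entrySet()) {
        if (name.endsWith(entry.getKey())) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, entry.getValue());
            return;
        }
    }
    FileOutputFormat.setCompressOutput(job, false);
}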
public void testBzip2TextCompression() throws IOException {
    runTextCompressionTest(new BZip2Codec(), 4);
}
public void testBzip2SequenceFileCompression() throws Exception {
    runSequenceFileCompressionTest(new BZip2Codec(), 4);
}
/**
 * Test using the bzip2 codec for reading.
 */
@Test
public void testBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_CYAN +
        "testBzip2() using non-native CBZip2InputStream (presumably)" + COLOR_NORMAL);

    // copy prebuilt (correct!) version of concat.bz2 to HDFS
    final String fn = "concat" + bzip2.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);

    writeFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2,
        "this is a test\nof bzip2\n");
    FileInputFormat.setInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat(); // extends FileInputFormat
    format.configure(jobConf);
    format.setMinSplitSize(256); // work around 2-byte splits issue
    // [135 splits for a 208-byte file and a 62-byte file(!)]

    InputSplit[] splits = format.getSplits(jobConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("part2.txt.bz2")) {
        splits[0] = splits[1];
        splits[1] = tmp;
    }

    List<Text> results = readSplit(format, splits[0], jobConf);
    assertEquals("splits[0] num lines", 6, results.size());
    assertEquals("splits[0][5]", "member #3", results.get(5).toString());

    results = readSplit(format, splits[1], jobConf);
    assertEquals("splits[1] num lines", 2, results.size());
    assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
    assertEquals("splits[1][1]", "of bzip2", results.get(1).toString());
}
/**
 * Extended bzip2 test, similar to the BuiltInGzipDecompressor test above.
 */
@Test
public void testMoreBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_MAGENTA +
        "testMoreBzip2() using non-native CBZip2InputStream (presumably)" + COLOR_NORMAL);

    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

    // copy multiple-member test file to HDFS
    String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);

    FileInputFormat.setInputPaths(jobConf, workDir);

    // here's first pair of BlockDecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2567, in1.available());
    assertEquals("concat bytes available", 3056, in2.available());

    /*
    // FIXME
    // The while-loop below dies at the beginning of the 2nd concatenated
    // member (after 17 lines successfully read) with:
    //
    // java.io.IOException: bad block header
    // at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
    // CBZip2InputStream.java:527)
    //
    // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
    // simply commented out for now (and HADOOP-6852 filed). If and when the
    // latter issue is resolved--perhaps by fixing an error here--this code
    // should be reenabled. Note that the doMultipleBzip2BufferSizes() test
    // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

    CompressionInputStream cin2 = bzip2.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();

    int numBytes, totalBytes = 0, lineNum = 0;
    while ((numBytes = in.readLine(out)) > 0) {
        ++lineNum;
        totalBytes += numBytes;
    }
    in.close();

    assertEquals("total uncompressed bytes in concatenated test file",
        5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file",
        84, lineNum);
    */

    // test CBZip2InputStream with lots of different input-buffer sizes
    doMultipleBzip2BufferSizes(jobConf);
}
@Test
public void readExcelInputFormatBzip2CompressedExcel2013MultiSheetAll() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, job);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2013testmultisheet.xlsx.bz2";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to that of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // row 1 (first sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 1 (first sheet)");
    assertEquals("[excel2013testmultisheet.xlsx.bz2]Sheet1!A1", spreadSheetKey.toString(),
        "Key of first row is \"[excel2013testmultisheet.xlsx.bz2]Sheet1!A1\"");
    assertEquals(4, spreadSheetValue.get().length, "Row 1 has 4 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 1, cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Row 1, cell 1 sheet name == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Row 1, cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Row 1, cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "Row 1, cell 3 == \"test3\"");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[3]).getFormattedValue(), "Row 1, cell 4 == \"test4\"");
    // row 2 (first sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 2 (first sheet)");
    assertEquals(1, spreadSheetValue.get().length, "Row 2 has 1 column");
    assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 2, cell 1 == \"4\"");
    // row 3 (first sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 3 (first sheet)");
    assertEquals(5, spreadSheetValue.get().length, "Row 3 has 5 columns");
    assertEquals("31/12/99", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 3, cell 1 == \"31/12/99\"");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Row 3, cell 2 == \"5\"");
    assertNull(spreadSheetValue.get()[2], "Row 3, cell 3 == null");
    assertNull(spreadSheetValue.get()[3], "Row 3, cell 4 == null");
    assertEquals("null", ((SpreadSheetCellDAO) spreadSheetValue.get()[4]).getFormattedValue(), "Row 3, cell 5 == \"null\"");
    // row 4 (first sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 4 (first sheet)");
    assertEquals(1, spreadSheetValue.get().length, "Row 4 has 1 column");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 4, cell 1 == \"1\"");
    // row 5 (first sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 5 (first sheet)");
    assertEquals(3, spreadSheetValue.get().length, "Row 5 has 3 columns");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 5, cell 1 == \"2\"");
    assertEquals("6", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Row 5, cell 2 == \"6\"");
    assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "Row 5, cell 3 == \"10\"");
    // row 6 (first sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 6 (first sheet)");
    assertEquals(3, spreadSheetValue.get().length, "Row 6 has 3 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 6, cell 1 == \"3\"");
    assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Row 6, cell 2 == \"4\"");
    assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "Row 6, cell 3 == \"15\"");
    // row 7 (second sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 7 (second sheet)");
    assertEquals("8", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 7, cell 1 == \"8\"");
    assertEquals("99", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Row 7, cell 2 == \"99\"");
    assertEquals(2, spreadSheetValue.get().length, "Row 7 has 2 columns");
    // row 8 (second sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 8 (second sheet)");
    assertEquals(1, spreadSheetValue.get().length, "Row 8 has 1 column");
    assertEquals("test", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Row 8, cell 1 == \"test\"");
    // row 9 (second sheet)
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input split for Excel file contains row 9 (second sheet)");
    assertEquals(3, spreadSheetValue.get().length, "Row 9 has 3 columns");
    assertNull(spreadSheetValue.get()[0], "Row 9, cell 1 == null");
    assertNull(spreadSheetValue.get()[1], "Row 9, cell 2 == null");
    assertEquals("seven", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "Row 9, cell 3 == \"seven\"");
    reader.close();
}
public static void main(String[] args) throws IOException {
    // first job: determine the maximum ID in the input
    JobConf job = new JobConf(DataGenerator.class);
    FileSystem dfs = FileSystem.get(job);
    String maxFile = "/maxtemp";
    dfs.delete(new Path(maxFile), true);
    job.setJobName(DataGenerator.class.getSimpleName() + "max ID");
    job.setMapperClass(MapMaxId.class);
    job.setCombinerClass(CombineMaxId.class);
    job.setReducerClass(ReduceMaxId.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(VLongWritable.class);
    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(maxFile));
    job.setNumReduceTasks(1);
    JobClient.runJob(job);

    // second job: generate the records, optionally compressing the output
    job = new JobConf(DataGenerator.class);
    job.set("hyracks.maxid.file", maxFile);
    job.setInt("hyracks.x", Integer.parseInt(args[2]));
    dfs.delete(new Path(args[1]), true);
    job.setJobName(DataGenerator.class.getSimpleName());
    job.setMapperClass(MapRecordGen.class);
    job.setReducerClass(ReduceRecordGen.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[3]));
    if (args.length > 4) {
        if (args[4].startsWith("bzip")) {
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        }
        if (args[4].startsWith("gz")) {
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        }
    }
    JobClient.runJob(job);
}
public static void main(String[] args) throws Exception {
    // four arguments are required, since the codec is read from args[3]
    if (args.length < 4) {
        System.out.println("ManyTxtToFewSeqJob <inputPath> <outputPath> <# mappers> <compressionCodec>");
        System.out.println();
        System.out.println("Example: ManyTxtToFewSeqJob ./input ./output 20 snappy");
        return;
    }

    // get values from args
    String inputPath = args[0];
    String outputPath = args[1];
    String numberOfMappers = args[2];
    String compressionCodec = args[3];

    // create job
    Job job = new Job();
    job.setJobName("ManyTxtToFewSeqJob");
    job.setJarByClass(ManyTxtToFewSeqJob.class);

    // define input format and path
    job.setInputFormatClass(ConfigurableInputFormat.class);
    ConfigurableInputFormat.setInputPath(job, inputPath);
    ConfigurableInputFormat.setMapperNumber(job, Integer.parseInt(numberOfMappers));

    // define output format, path, and compression
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
    if (compressionCodec.equalsIgnoreCase("gzip")) {
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else if (compressionCodec.equalsIgnoreCase("bzip2")) {
        SequenceFileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else {
        SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    }

    // define the mapper (map-only job, no reducer)
    job.setMapperClass(ConsalidatorMapper.class);

    // define the key and value formats
    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(0);

    // clear any previous output, then run
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);
    hdfs.delete(new Path(outputPath), true);

    job.waitForCompletion(true);
}
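// Hedged usage sketch for the driver above (jar name and paths are illustrative):
//
//   hadoop jar many-txt-to-few-seq.jar ManyTxtToFewSeqJob \
//       /data/input /data/output 20 bzip2
//
// Any codec argument other than "gzip" or "bzip2" falls through to Snappy.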
/**
 * Create a bzip2 input stream.
 *
 * @param is input stream
 * @return an uncompressed input stream
 * @throws IOException if an error occurs while creating the input stream
 */
public static InputStream createBZip2InputStream(final InputStream is) throws IOException {
    return new BZip2Codec().createInputStream(is);
}
/**
 * Create a bzip2 output stream.
 *
 * @param os the output stream to compress
 * @return a compressed output stream
 * @throws IOException if an error occurs while creating the output stream
 */
public static OutputStream createBZip2OutputStream(final OutputStream os) throws IOException {
    return new BZip2Codec().createOutputStream(os);
}
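// Hedged round-trip check for the two helpers above; assumes both live on a
// utility class, here called CompressionUtils (the class name is hypothetical),
// java.io imports as usual, and Java 9+ for InputStream.readAllBytes().
byte[] original = "hello bzip2".getBytes(java.nio.charset.StandardCharsets.UTF_8);
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
try (OutputStream out = CompressionUtils.createBZip2OutputStream(buffer)) {
    out.write(original); // close() flushes the bzip2 trailer
}
byte[] roundTripped;
try (InputStream in = CompressionUtils.createBZip2InputStream(
        new ByteArrayInputStream(buffer.toByteArray()))) {
    roundTripped = in.readAllBytes();
}
System.out.println("round trip ok: " + java.util.Arrays.equals(original, roundTripped));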