@Override public void writeFields(DataOutputStream out) throws IOException { FSImageSerialization.writeLong(inodeId, out); FSImageSerialization.writeString(path, out); FSImageSerialization.writeShort(replication, out); FSImageSerialization.writeLong(mtime, out); FSImageSerialization.writeLong(atime, out); FSImageSerialization.writeLong(blockSize, out); new ArrayWritable(Block.class, blocks).write(out); permissions.write(out); if (this.opCode == OP_ADD) { AclEditLogUtil.write(aclEntries, out); XAttrEditLogProto.Builder b = XAttrEditLogProto.newBuilder(); b.addAllXAttrs(PBHelper.convertXAttrProto(xAttrs)); b.build().writeDelimitedTo(out); FSImageSerialization.writeString(clientName,out); FSImageSerialization.writeString(clientMachine,out); FSImageSerialization.writeBoolean(overwrite, out); FSImageSerialization.writeByte(storagePolicyId, out); // write clientId and callId writeRpcIds(rpcClientId, rpcCallId, out); } }
@Override public void writeFields(DataOutputStream out) throws IOException { FSImageSerialization.writeString(trg, out); DeprecatedUTF8 info[] = new DeprecatedUTF8[srcs.length]; int idx = 0; for(int i=0; i<srcs.length; i++) { info[idx++] = new DeprecatedUTF8(srcs[i]); } new ArrayWritable(DeprecatedUTF8.class, info).write(out); FSImageSerialization.writeLong(timestamp, out); // rpc ids writeRpcIds(rpcClientId, rpcCallId, out); }
@Override public void writeFields(DataOutputStream out) throws IOException { FSImageSerialization.writeLong(inodeId, out); FSImageSerialization.writeString(path, out); FSImageSerialization.writeShort(replication, out); FSImageSerialization.writeLong(mtime, out); FSImageSerialization.writeLong(atime, out); FSImageSerialization.writeLong(blockSize, out); new ArrayWritable(Block.class, blocks).write(out); permissions.write(out); if (this.opCode == OP_ADD) { AclEditLogUtil.write(aclEntries, out); XAttrEditLogProto.Builder b = XAttrEditLogProto.newBuilder(); b.addAllXAttrs(PBHelperClient.convertXAttrProto(xAttrs)); b.build().writeDelimitedTo(out); FSImageSerialization.writeString(clientName,out); FSImageSerialization.writeString(clientMachine,out); FSImageSerialization.writeBoolean(overwrite, out); FSImageSerialization.writeByte(storagePolicyId, out); // write clientId and callId writeRpcIds(rpcClientId, rpcCallId, out); } }
/**
 * Renders the entries as {@code {key=value, key=value}}.
 *
 * <p>An empty map yields {@code {}}. A key or value that is this map itself is printed as
 * {@code (this Map)}, and an {@code ArrayWritable} value is printed through
 * {@link Arrays#toString(Object[])} applied to its elements.
 *
 * @return the textual form of this map
 */
@Override
public String toString() {
  Iterator<Entry<Writable, Writable>> entries = entrySet().iterator();
  if (!entries.hasNext()) {
    return "{}";
  }

  StringBuilder text = new StringBuilder();
  text.append('{');
  boolean more;
  do {
    Entry<Writable, Writable> entry = entries.next();
    Writable key = entry.getKey();
    Writable value = entry.getValue();

    text.append(key == this ? "(this Map)" : key);
    text.append('=');
    if (value instanceof ArrayWritable) {
      ArrayWritable array = (ArrayWritable) value;
      text.append(Arrays.toString(array.get()));
    } else {
      text.append(value == this ? "(this Map)" : value);
    }

    more = entries.hasNext();
    if (more) {
      text.append(", ");
    }
  } while (more);

  return text.append('}').toString();
}
@Test public void readExcelInputFormatExcel2003Empty() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2003empty.xls"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.locale.bcp47","de"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals(1, inputSplits.length,"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader,"Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue( reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains row 1"); assertEquals(0,spreadSheetValue.get().length,"Input Split for Excel file contain row 1 and is empty"); assertFalse(reader.next(spreadSheetKey,spreadSheetValue),"Input Split for Excel file contains no further row"); }
@Test public void readExcelInputFormatExcel2013Empty() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013empty.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals(1, inputSplits.length,"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader,"Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue( reader.next(spreadSheetKey,spreadSheetValue), "Input Split for Excel file contains row 1"); assertEquals(0,spreadSheetValue.get().length, "Input Split for Excel file contain row 1 and is empty"); assertFalse(reader.next(spreadSheetKey,spreadSheetValue), "Input Split for Excel file contains no further row"); }
@Test public void readExcelInputFormatExcel2013SingleSheetEncryptedNegative() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013encrypt.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); // for decryption simply set the password job.set("hadoopoffice.read.security.crypt.password","test2"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals(1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNull(reader, "Null record reader implies invalid password"); }
@Test public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013encrypt.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); // low footprint job.set("hadoopoffice.read.lowFootprint", "true"); // for decryption simply set the password job.set("hadoopoffice.read.security.crypt.password","test2"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals(1,inputSplits.length,"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNull(reader,"Null record reader implies invalid password"); }
@Test public void readExcelInputFormatExcel2003SingleSheetEncryptedNegativeLowFootprint() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2003encrypt.xls"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); // low footprint job.set("hadoopoffice.read.lowFootprint", "true"); // for decryption simply set the password job.set("hadoopoffice.read.security.crypt.password","test2"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals(1,inputSplits.length,"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNull(reader,"Null record reader implies invalid password"); }
@Test public void readExcelInputFormatExcel2013Empty() throws IOException, InterruptedException { Configuration conf = new Configuration(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013empty.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); // set locale to the one of the test data conf.set("hadoopoffice.locale.bcp47","de"); Job job = Job.getInstance(conf); FileInputFormat.setInputPaths(job, file); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); ExcelFileInputFormat format = new ExcelFileInputFormat(); List<InputSplit> splits = format.getSplits(job); assertEquals( 1, splits.size(),"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context); assertNotNull( reader,"Format returned null RecordReader"); reader.initialize(splits.get(0),context); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue( reader.nextKeyValue(),"Input Split for Excel file contains row 1"); spreadSheetKey=reader.getCurrentKey(); spreadSheetValue=reader.getCurrentValue(); assertEquals( 0,spreadSheetValue.get().length,"Input Split for Excel file contain row 1 and is empty"); assertFalse( reader.nextKeyValue(),"Input Split for Excel file contains no further row"); }
@Test public void readExcelInputFormatExcel2013SingleSheetEncryptedNegative() throws IOException, InterruptedException { Configuration conf = new Configuration(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013encrypt.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47","de"); // for decryption simply set the password conf.set("hadoopoffice.read.security.crypt.password","test2"); Job job = Job.getInstance(conf); FileInputFormat.setInputPaths(job, file); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); ExcelFileInputFormat format = new ExcelFileInputFormat(); List<InputSplit> splits = format.getSplits(job); assertEquals( 1, splits.size(),"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context); InterruptedException ex = assertThrows(InterruptedException.class, ()->reader.initialize(splits.get(0),context),"Exception is thrown in case of wrong password"); }
@Test public void readExcelInputFormatExcel2003SingleSheetEncryptedNegative() throws IOException, InterruptedException { Configuration conf = new Configuration(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2003encrypt.xls"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47","de"); // for decryption simply set the password conf.set("hadoopoffice.read.security.crypt.password","test2"); Job job = Job.getInstance(conf); FileInputFormat.setInputPaths(job, file); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); ExcelFileInputFormat format = new ExcelFileInputFormat(); List<InputSplit> splits = format.getSplits(job); assertEquals( 1, splits.size(),"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context); InterruptedException ex = assertThrows(InterruptedException.class, ()->reader.initialize(splits.get(0),context),"Exception is thrown in case of wrong password"); }
/**
 * Extracts the selector value for the query schema's selector field from a data element.
 * <p>
 * When the data schema marks the field as an array element, the first entry of the array's string
 * form is returned; otherwise the stored value's {@code toString()} is returned.
 *
 * @param dataMap data element to read from
 * @param qSchema query schema supplying the selector field name
 * @param dSchema data schema used to classify the field and to derive its map key
 * @return the selector as a string
 */
public static String getSelectorByQueryType(MapWritable dataMap, QuerySchema qSchema, DataSchema dSchema) {
  String fieldName = qSchema.getSelectorName();
  boolean arrayField = dSchema.isArrayElement(fieldName);
  Object element = dataMap.get(dSchema.getTextName(fieldName));

  if (!arrayField) {
    return element.toString();
  }

  String[] values;
  if (element instanceof WritableArrayWritable) {
    values = ((WritableArrayWritable) element).toStrings();
  } else {
    values = ((ArrayWritable) element).toStrings();
  }
  return values[0];
}
@Test public void readExcelInputFormatExcel2003SingleSheetEncryptedPositiveLowFootprint() throws IOException, InterruptedException { Configuration conf = new Configuration(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2003encrypt.xls"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47","de"); // low footprint conf.set("hadoopoffice.read.lowFootprint", "true"); // for decryption simply set the password conf.set("hadoopoffice.read.security.crypt.password","test"); Job job = Job.getInstance(conf); FileInputFormat.setInputPaths(job, file); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); ExcelFileInputFormat format = new ExcelFileInputFormat(); List<InputSplit> splits = format.getSplits(job); assertEquals( 1, splits.size(),"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context); assertNotNull( reader,"Format returned null RecordReader"); reader.initialize(splits.get(0),context); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue( reader.nextKeyValue(),"Input Split for Excel file contains row 1"); spreadSheetKey=reader.getCurrentKey(); spreadSheetValue=reader.getCurrentValue(); assertEquals( "[excel2003encrypt.xls]Sheet1!A1", spreadSheetKey.toString(),"Input Split for Excel file has keyname == \"[excel2003encrypt.xls]Sheet1!A1\""); assertEquals( 3, spreadSheetValue.get().length,"Input Split for Excel file contains row 1 with 3 columns"); assertEquals( "test1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(),"Input Split for Excel file contains row 1 with cell 1 == \"test1\""); assertEquals( "Sheet1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getSheetName(),"Input Split 
for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\""); assertEquals( "A1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getAddress(),"Input Split for Excel file contains row 1 with cell 1 address == \"A1\""); assertEquals( "test2", ((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(),"Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals( "test3", ((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(),"Input Split for Excel file contains row 1 with cell 3 == \"test3\""); }
@Test public void readExcelInputFormatExcel2003SingleSheetEncryptedNegativeLowFootprint() throws IOException, InterruptedException { Configuration conf = new Configuration(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2003encrypt.xls"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47","de"); // low footprint conf.set("hadoopoffice.read.lowFootprint", "true"); // for decryption simply set the password conf.set("hadoopoffice.read.security.crypt.password","test2"); Job job = Job.getInstance(conf); FileInputFormat.setInputPaths(job, file); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); ExcelFileInputFormat format = new ExcelFileInputFormat(); List<InputSplit> splits = format.getSplits(job); assertEquals( 1, splits.size(),"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context); InterruptedException ex = assertThrows(InterruptedException.class, ()->reader.initialize(splits.get(0),context),"Exception is thrown in case of wrong password"); }
@Test public void readExcelInputFormatExcel2013SingleSheetEncryptedNegativeLowFootprint() throws IOException, InterruptedException { Configuration conf = new Configuration(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013encrypt.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47","de"); // low footprint conf.set("hadoopoffice.read.lowFootprint", "true"); // for decryption simply set the password conf.set("hadoopoffice.read.security.crypt.password","test2"); Job job = Job.getInstance(conf); FileInputFormat.setInputPaths(job, file); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); ExcelFileInputFormat format = new ExcelFileInputFormat(); List<InputSplit> splits = format.getSplits(job); assertEquals( 1, splits.size(),"Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context); InterruptedException ex = assertThrows(InterruptedException.class, ()->reader.initialize(splits.get(0),context),"Exception is thrown in case of wrong password"); }
/**
 * Serializes a struct into an {@code ArrayWritable} holding one element per struct field.
 *
 * <p>Each field value is converted with {@code createPrimitive}, using the field's inspector cast
 * to {@code PrimitiveObjectInspector}. The number of fields is also stored in {@code serdeSize}.
 *
 * @param obj the struct to serialize
 * @param objectInspector inspector describing {@code obj}; its category must be {@code STRUCT}
 * @return an {@code ArrayWritable} with the converted field values in field order
 * @throws SerDeException if the inspector's category is not {@code STRUCT}
 */
@Override
public Writable serialize(Object obj, ObjectInspector objectInspector) throws SerDeException {
  boolean isStruct = objectInspector.getCategory().equals(ObjectInspector.Category.STRUCT);
  if (!isStruct) {
    throw new SerDeException("Cannot serialize " + objectInspector.getCategory() + ". Can only serialize a struct");
  }

  StructObjectInspector structInspector = (StructObjectInspector) objectInspector;
  List<? extends StructField> fields = structInspector.getAllStructFieldRefs();
  Writable[] converted = new Writable[fields.size()];

  int slot = 0;
  for (StructField field : fields) {
    Object fieldValue = structInspector.getStructFieldData(obj, field);
    PrimitiveObjectInspector fieldInspector = (PrimitiveObjectInspector) field.getFieldObjectInspector();
    converted[slot++] = createPrimitive(fieldValue, fieldInspector);
  }

  serdeSize = converted.length;
  return new ArrayWritable(Writable.class, converted);
}
@Override public Object getStructFieldData(final Object data, final StructField fieldRef) { if (data == null) { return null; } if (data instanceof ArrayWritable) { final ArrayWritable arr = (ArrayWritable) data; return arr.get()[((StructFieldImpl) fieldRef).getIndex()]; } //since setStructFieldData and create return a list, getStructFieldData should be able to //handle list data. This is required when table serde is ParquetHiveSerDe and partition serde //is something else. if (data instanceof List) { return ((List) data).get(((StructFieldImpl) fieldRef).getIndex()); } throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName()); }
public void write(Writable w) throws IOException { if (w instanceof TypedBytesWritable) { writeTypedBytes((TypedBytesWritable) w); } else if (w instanceof BytesWritable) { writeBytes((BytesWritable) w); } else if (w instanceof ByteWritable) { writeByte((ByteWritable) w); } else if (w instanceof BooleanWritable) { writeBoolean((BooleanWritable) w); } else if (w instanceof IntWritable) { writeInt((IntWritable) w); } else if (w instanceof VIntWritable) { writeVInt((VIntWritable) w); } else if (w instanceof LongWritable) { writeLong((LongWritable) w); } else if (w instanceof VLongWritable) { writeVLong((VLongWritable) w); } else if (w instanceof FloatWritable) { writeFloat((FloatWritable) w); } else if (w instanceof DoubleWritable) { writeDouble((DoubleWritable) w); } else if (w instanceof Text) { writeText((Text) w); } else if (w instanceof ArrayWritable) { writeArray((ArrayWritable) w); } else if (w instanceof MapWritable) { writeMap((MapWritable) w); } else if (w instanceof SortedMapWritable) { writeSortedMap((SortedMapWritable) w); } else if (w instanceof Record) { writeRecord((Record) w); } else { writeWritable(w); // last resort } }
/**
 * Writes an array as a vector header carrying the element count, followed by each element
 * written through {@code write}.
 *
 * @param aw the array to write
 * @throws IOException propagated from the header or element writes
 */
public void writeArray(ArrayWritable aw) throws IOException {
  final Writable[] elements = aw.get();
  final int count = elements.length;
  out.writeVectorHeader(count);
  for (int pos = 0; pos < count; pos++) {
    write(elements[pos]);
  }
}
/**
 * Reads the next type code from the input and maps it to a {@code Writable} class.
 *
 * <p>{@code INT} maps to {@code VIntWritable} and {@code LONG} to {@code VLongWritable}.
 *
 * @return the mapped class, or {@code null} when the input yields no type
 * @throws IOException propagated from reading the type code
 * @throws RuntimeException if the type code has no mapping here
 */
public Class<? extends Writable> readType() throws IOException {
  final Type code = in.readType();
  if (code == null) {
    return null;
  }

  final Class<? extends Writable> mapped;
  switch (code) {
    case BYTES:
      mapped = BytesWritable.class;
      break;
    case BYTE:
      mapped = ByteWritable.class;
      break;
    case BOOL:
      mapped = BooleanWritable.class;
      break;
    case INT:
      mapped = VIntWritable.class;
      break;
    case LONG:
      mapped = VLongWritable.class;
      break;
    case FLOAT:
      mapped = FloatWritable.class;
      break;
    case DOUBLE:
      mapped = DoubleWritable.class;
      break;
    case STRING:
      mapped = Text.class;
      break;
    case VECTOR:
      mapped = ArrayWritable.class;
      break;
    case MAP:
      mapped = MapWritable.class;
      break;
    case WRITABLE:
      mapped = Writable.class;
      break;
    default:
      throw new RuntimeException("unknown type");
  }
  return mapped;
}
public void write(Writable w) throws IOException { if (w instanceof TypedBytesWritable) { writeTypedBytes((TypedBytesWritable) w); } else if (w instanceof BytesWritable) { writeBytes((BytesWritable) w); } else if (w instanceof ByteWritable) { writeByte((ByteWritable) w); } else if (w instanceof BooleanWritable) { writeBoolean((BooleanWritable) w); } else if (w instanceof IntWritable) { writeInt((IntWritable) w); } else if (w instanceof VIntWritable) { writeVInt((VIntWritable) w); } else if (w instanceof LongWritable) { writeLong((LongWritable) w); } else if (w instanceof VLongWritable) { writeVLong((VLongWritable) w); } else if (w instanceof FloatWritable) { writeFloat((FloatWritable) w); } else if (w instanceof DoubleWritable) { writeDouble((DoubleWritable) w); } else if (w instanceof Text) { writeText((Text) w); } else if (w instanceof ArrayWritable) { writeArray((ArrayWritable) w); } else if (w instanceof MapWritable) { writeMap((MapWritable) w); } else if (w instanceof SortedMapWritable) { writeSortedMap((SortedMapWritable<?>) w); } else if (w instanceof Record) { writeRecord((Record) w); } else { writeWritable(w); // last resort } }
@Override public RecordReader<Text,ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { /** Create reader **/ try { // send configuration option to ms excel. The format of the Excel (old vs new) is detected automaitcally job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE,"ms-excel"); return new ExcelRecordReader( (FileSplit) split,job,reporter); } catch (FormatNotUnderstoodException e) { // log LOGIF.error(e); } catch (GeneralSecurityException gse) { LOGIF.error(gse); } return null; }
/**
 * Creates an empty value: an {@code ArrayWritable} of {@code SpreadSheetCellDAO} holding a
 * zero-length array.
 *
 * @return value
 */
@Override
public ArrayWritable createValue() {
  SpreadSheetCellDAO[] noCells = new SpreadSheetCellDAO[0];
  ArrayWritable emptyRow = new ArrayWritable(SpreadSheetCellDAO.class);
  emptyRow.set(noCells);
  return emptyRow;
}
/**
 * Copies the elements of an {@code ArrayWritable} into a new list.
 *
 * @param data an {@code ArrayWritable} or {@code null}
 * @return a new {@code ArrayList} with the array's elements, or {@code null} when {@code data}
 *     is {@code null}
 * @throws UnsupportedOperationException if {@code data} is not an {@code ArrayWritable}
 */
@Override
public List<Object> getStructFieldsDataAsList(final Object data) {
  if (data == null) {
    return null;
  }
  if (!(data instanceof ArrayWritable)) {
    throw new UnsupportedOperationException("Cannot inspect " + data.getClass().getCanonicalName());
  }

  final Object[] elements = ((ArrayWritable) data).get();
  return new ArrayList<Object>(Arrays.asList(elements));
}
@Override public RecordReader<Text,ArrayWritable> createRecordReader(InputSplit split, TaskAttemptContext ctx) throws IOException { /** Create reader **/ try { // send configuration option to ms excel. The format of the Excel (old vs new) is detected automaitcally ctx.getConfiguration().set(HadoopOfficeReadConfiguration.CONF_MIMETYPE,"ms-excel"); return new ExcelRecordReader(ctx.getConfiguration(), (FileSplit) split); } catch (FormatNotUnderstoodException e) { // log LOG.error(e); } catch (GeneralSecurityException gse) { LOG.error(gse); } return null; }
@Test public void readExcelInputFormatExcel2013SingleSheetEncryptedPositive() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013encrypt.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); // for decryption simply set the password job.set("hadoopoffice.read.security.crypt.password","test"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals( 1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue( reader.next(spreadSheetKey,spreadSheetValue), "Input Split for Excel file contains row 1"); assertEquals("[excel2013encrypt.xlsx]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[excel2013encrypt.xlsx]Sheet1!A1\""); assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns"); assertEquals("test1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test1\""); assertEquals("Sheet1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getSheetName(), "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\""); assertEquals("A1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getAddress(), "Input Split for Excel file contains row 1 with cell 1 address == \"A1\""); assertEquals("test2", 
((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals("test3", ((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 3 == \"test3\""); }
@Test public void readExcelInputFormatExcel2003SingleSheetEncryptedPositive() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2003encrypt.xls"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); // for decryption simply set the password job.set("hadoopoffice.read.security.crypt.password","test"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals(1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue(reader.next(spreadSheetKey,spreadSheetValue), "Input Split for Excel file contains row 1"); assertEquals("[excel2003encrypt.xls]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[excel2003encrypt.xls]Sheet1!A1\""); assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns"); assertEquals("test1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test1\""); assertEquals("Sheet1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getSheetName(), "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\""); assertEquals("A1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getAddress(), "Input Split for Excel file contains row 1 with cell 1 address == \"A1\""); assertEquals("test2", 
((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals("test3", ((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 3 == \"test3\""); }
/**
 * Converts a row of objects into an {@code ArrayWritable} built from each element's
 * {@code toString()} value, in the same order.
 *
 * @param data the row elements
 * @return an {@code ArrayWritable} created from the string forms of {@code data}
 */
@Override
public ArrayWritable readRow(Object[] data) {
  final int columnCount = data.length;
  String[] textValues = new String[columnCount];
  int column = 0;
  for (Object cell : data) {
    textValues[column++] = cell.toString();
  }
  return new ArrayWritable(textValues);
}
@Test public void readExcelInputFormatExcel2013LinkedWorkbook() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013linkedworkbooks.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); // enable option to read linked workbooks job.setBoolean("hadoopoffice.read.linkedworkbooks",true); job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks",false); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals(1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue( reader.next(spreadSheetKey,spreadSheetValue), "Input Split for Excel file contains row 1"); assertEquals("[excel2013linkedworkbooks.xlsx]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[excel2013linkedworkbooks.xlsx]Sheet1!A1\""); assertEquals( 3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns"); assertEquals("test1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test1\""); assertEquals( "Sheet1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getSheetName(), "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\""); assertEquals("A1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getAddress(), "Input Split for Excel file contains row 1 with cell 1 address == 
\"A1\""); assertEquals("test2", ((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals( "test3", ((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 3 == \"test3\""); assertTrue(reader.next(spreadSheetKey,spreadSheetValue), "Input Split for Excel file contains row 2"); assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns"); assertEquals( "3", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)"); assertEquals("5", ((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"5\""); }
@Test public void readExcelInputFormatExcel2013SingleSheetEncryptedKeyStorePositive() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName="excel2013encrypt.xlsx"; String fileNameSpreadSheet=classLoader.getResource(fileName).getFile(); Path file = new Path(fileNameSpreadSheet); FileInputFormat.setInputPaths(job, file); String keystoreFilename="keystore.jceks"; String filenameKeyStore=classLoader.getResource(keystoreFilename).getFile().toString(); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47","de"); // for decryption set the keystore to retrieve the password job.set("hadoopoffice.read.security.crypt.credential.keystore.file", filenameKeyStore); job.set("hadoopoffice.read.security.crypt.credential.keystore.type","JCEKS"); job.set("hadoopoffice.read.security.crypt.credential.keystore.password","changeit"); ExcelFileInputFormat format = new ExcelFileInputFormat(); format.configure(job); InputSplit[] inputSplits = format.getSplits(job,1); assertEquals( 1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue( reader.next(spreadSheetKey,spreadSheetValue), "Input Split for Excel file contains row 1"); assertEquals("[excel2013encrypt.xlsx]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[excel2013encrypt.xlsx]Sheet1!A1\""); assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns"); assertEquals("test1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test1\""); assertEquals("Sheet1", 
((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getSheetName(), "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\""); assertEquals("A1", ((SpreadSheetCellDAO)spreadSheetValue.get()[0]).getAddress(), "Input Split for Excel file contains row 1 with cell 1 address == \"A1\""); assertEquals("test2", ((SpreadSheetCellDAO)spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals("test3", ((SpreadSheetCellDAO)spreadSheetValue.get()[2]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 3 == \"test3\""); }
@Override public Object deserialize(Writable writable) throws SerDeException { // Different segments could contain different schemas. // Especially the column orders could be different. // Here we re-map the column names to the real column ids. SchemaWritable reader = (SchemaWritable) writable; if (this.projectCols != reader.columns) { // Don't have to do it every time, only when schema is changed. mapColIndex(reader.columns); projectCols = reader.columns; } if (!isMapNeeded) { serdeSize = columnNames.size(); return reader; } else { Writable[] projectWritables = reader.get(); Writable[] writables = new Writable[columnNames.size()]; for (int i = 0; i < validColIndexes.length; i++) { int colIndex = validColIndexes[i]; int mapColId = validColMapIds[i]; writables[colIndex] = projectWritables[mapColId]; } serdeSize = validColIndexes.length; return new ArrayWritable(Writable.class, writables); } }
/**
 * Converts a flat JSON-array-formatted string (for example {@code ["a", "b"]}) into an
 * {@link ArrayWritable} of its elements.
 *
 * <p>The first {@code [} and first {@code ]} are removed, whitespace surrounding the
 * remaining payload is trimmed, all double quotes are dropped, and the payload is split on
 * commas (whitespace around each comma is discarded). An array with no payload, such as
 * {@code []} or {@code [ ]}, produces an empty {@code ArrayWritable} rather than one holding
 * a single empty string. This is plain string splitting, not JSON parsing: nested arrays
 * and commas or quotes inside elements are not supported.
 *
 * @param jsonString the JSON-array-formatted input; must not be {@code null}
 * @return an {@code ArrayWritable} holding one entry per element
 */
public static ArrayWritable jsonArrayStringtoArrayWritable(String jsonString) {
    // Trim after stripping the brackets so "[ \"a\", \"b\" ]" does not leak blanks
    // into the first and last elements.
    String payload = jsonString.replaceFirst("\\[", "").replaceFirst("\\]", "").trim();

    // Checked before quotes are removed so that "[\"\"]" still yields one empty element.
    if (payload.isEmpty()) {
        logger.debug("elements = ");
        return new ArrayWritable(new String[0]);
    }

    String[] elements = payload.replaceAll("\"", "").split("\\s*,\\s*");
    logger.debug("elements = ");
    for (String element : elements) {
        logger.debug("element: " + element);
    }
    return new ArrayWritable(elements);
}
/**
 * Reads the encrypted workbook {@code excel2003encrypt.xls} with the low-footprint option
 * enabled and the decryption password set directly in the job configuration, then checks
 * that the first row is delivered as the three cells {@code test1}, {@code test2},
 * {@code test3}.
 *
 * @throws IOException if the input format fails while reading the workbook
 */
@Test
public void readExcelInputFormatExcel2003SingleSheetEncryptedPositiveLowFootprint() throws IOException {
    final JobConf conf = new JobConf(defaultConf);
    final ClassLoader loader = getClass().getClassLoader();

    // point the job at the encrypted workbook from the test resources
    final String workbookName = "excel2003encrypt.xls";
    final String workbookLocation = loader.getResource(workbookName).getFile();
    FileInputFormat.setInputPaths(conf, new Path(workbookLocation));

    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // low footprint
    conf.set("hadoopoffice.read.lowFootprint", "true");
    // for decryption simply set the password
    conf.set("hadoopoffice.read.security.crypt.password", "test");

    final ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    inputFormat.configure(conf);
    final InputSplit[] splits = inputFormat.getSplits(conf, 1);
    assertEquals(1, splits.length, "Only one split generated for Excel file");

    final RecordReader<Text, ArrayWritable> recordReader =
            inputFormat.getRecordReader(splits[0], conf, reporter);
    assertNotNull(recordReader, "Format returned null RecordReader");

    final Text key = new Text();
    final ArrayWritable value = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(recordReader.next(key, value), "Input Split for Excel file contains row 1");
    assertEquals(
            "[excel2003encrypt.xls]Sheet1!A1",
            key.toString(),
            "Input Split for Excel file has keyname == \"[excel2003encrypt.xls]Sheet1!A1\"");

    // the row's cells; the length is checked before any cell is touched
    final Object[] cells = value.get();
    assertEquals(3, cells.length, "Input Split for Excel file contains row 1 with 3 columns");

    final SpreadSheetCellDAO firstCell = (SpreadSheetCellDAO) cells[0];
    assertEquals(
            "test1",
            firstCell.getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals(
            "Sheet1",
            firstCell.getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals(
            "A1",
            firstCell.getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");

    final SpreadSheetCellDAO secondCell = (SpreadSheetCellDAO) cells[1];
    assertEquals(
            "test2",
            secondCell.getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");

    final SpreadSheetCellDAO thirdCell = (SpreadSheetCellDAO) cells[2];
    assertEquals(
            "test3",
            thirdCell.getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}