Java code examples for class org.apache.hadoop.mapred.FileSplit
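The snippets below come from a range of open-source projects, but they share one pattern: cast the incoming InputSplit to FileSplit, then use getPath(), getStart(), and getLength() to open the underlying file and position the stream at the split boundary. As a minimal sketch of that pattern (the class and field names here are illustrative and do not belong to any of the projects listed below):

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;

// Minimal sketch of how the FileSplit boundaries are usually consumed.
// "SketchRecordReader" is a hypothetical name, not one of the classes below.
public class SketchRecordReader {
  private final long start;          // first byte of the split
  private final long end;            // first byte past the split
  private final FSDataInputStream in;

  public SketchRecordReader(JobConf conf, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);  // resolve the FS from the path's scheme
    in = fs.open(file);
    in.seek(start);                            // position at the beginning of the split
  }
}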
Project: multiple-dimension-spread
File: HiveVectorizedReaderSetting.java
public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException{
  this.hiveReaderConfig = hiveReaderConfig;
  rbCtx = Utilities.getVectorizedRowBatchCtx( job );
  partitionValues = new Object[rbCtx.getPartitionColumnCount()];
  if( 0 < partitionValues.length ){
    rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
  }
  TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
  columnNames = rbCtx.getRowColumnNames();
  needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );
  projectionColumn = new boolean[columnNames.length];
  assignors = new IColumnVectorAssignor[columnNames.length];
  for( int id : needColumnIds ){
    projectionColumn[id] = true;
    assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
  }
}
Project: multiple-dimension-spread
File: MDSHiveLineInputFormat.java
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  long fileLength = fs.getLength( path );
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open( path );
  IJobReporter jobReporter = new HadoopJobReporter( reporter );
  jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );
  HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );
  if ( hiveConfig.isVectorMode() ){
    IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
    return (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
  }
  else{
    return new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
  }
}
Project: QDrill
File: HiveDrillNativeScanBatchCreator.java
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final long splitStart = split.getStart();
  final long splitLength = split.getLength();
  final List<Integer> rowGroupNums = Lists.newArrayList();
  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }
  return rowGroupNums;
}
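The predicate above (the same helper appears again in the dremio-oss and drill snippets below) assigns a row group to the split whose byte range [splitStart, splitStart + splitLength) contains the offset of the row group's first data page. A standalone illustration with made-up offsets, not real Parquet metadata:

// Self-contained sketch of the split-to-row-group predicate.
// The offsets are invented; in the real code they come from
// BlockMetaData.getColumns().get(0).getFirstDataPageOffset().
public class RowGroupSelectionSketch {
  public static void main(String[] args) {
    long[] firstDataPageOffsets = {4L, 1_048_576L, 2_097_152L};  // one entry per row group
    long splitStart = 1_000_000L;
    long splitLength = 1_000_000L;  // split covers bytes [1,000,000, 2,000,000)
    java.util.List<Integer> rowGroupNums = new java.util.ArrayList<>();
    for (int i = 0; i < firstDataPageOffsets.length; i++) {
      long firstDataPage = firstDataPageOffsets[i];
      if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
        rowGroupNums.add(i);
      }
    }
    System.out.println(rowGroupNums);  // prints [1]: only the second row group starts inside the split
  }
}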
Project: Hydrograph
File: DelimitedAndFixedWidthRecordReader.java
public DelimitedAndFixedWidthRecordReader(JobConf conf, FileSplit split)
    throws IOException {
  lengthsAndDelimiters = DelimitedAndFixedWidthHelper
      .modifyIdentifier(conf.get("lengthsAndDelimiters").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR));
  lengthsAndDelimitersType = conf.get("lengthsAndDelimitersType").split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR);
  quote = conf.get("quote");
  charsetName = conf.get("charsetName");
  start = split.getStart();
  pos = start;
  end = start + split.getLength();
  file = split.getPath();
  fs = file.getFileSystem(conf);
  fileIn = fs.open(split.getPath());
  fileIn.seek(start);
  inputStreamReader = new InputStreamReader(fileIn, charsetName);
  singleChar = new char[1];
  stringBuilder = new StringBuilder();
  isQuotePresent = isQuotePresent(quote);
}
Project: hadoop
File: StreamXmlRecordReader.java
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);
  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");
  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;
  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
Project: hadoop
File: AutoInputFormat.java
public RecordReader getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
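AutoInputFormat, which recurs below under several Hadoop forks, sniffs the first three bytes of each file and chooses a SequenceFile reader when it sees the 'SEQ' magic prefix, falling back to plain text otherwise. A hedged sketch of how it is typically wired into an old-API job (the input path is hypothetical, and the package name follows hadoop-streaming; adjust if your fork relocates it):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

// Sketch only: configure a job so that the getRecordReader shown above
// decides per file between SequenceFile and text input.
public class AutoInputFormatUsageSketch {
  public static JobConf configure() {
    JobConf job = new JobConf();
    job.setInputFormat(org.apache.hadoop.streaming.AutoInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("/data/mixed-input"));  // hypothetical path
    return job;
  }
}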
Project: dremio-oss
File: FileSplitParquetRecordReader.java
public FileSplitParquetRecordReader(
    final OperatorContext oContext,
    final ParquetReaderFactory readerFactory,
    final List<SchemaPath> columnsToRead,
    final List<SchemaPath> groupScanColumns,
    final List<FilterCondition> conditions,
    final FileSplit fileSplit,
    final ParquetMetadata footer,
    final JobConf jobConf,
    final boolean vectorize,
    final boolean enableDetailedTracing
) {
  this.oContext = oContext;
  this.columnsToRead = columnsToRead;
  this.groupScanColumns = groupScanColumns;
  this.conditions = conditions;
  this.fileSplit = fileSplit;
  this.footer = footer;
  this.jobConf = jobConf;
  this.readerFactory = readerFactory;
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
}
Project: dremio-oss
File: FileSplitParquetRecordReader.java
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split.
 */
private static List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final long splitStart = split.getStart();
  final long splitLength = split.getLength();
  final List<Integer> rowGroupNums = Lists.newArrayList();
  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }
  return rowGroupNums;
}
Project: aliyun-oss-hadoop-fs
File: StreamXmlRecordReader.java
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);
  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");
  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;
  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
Project: aliyun-oss-hadoop-fs
File: AutoInputFormat.java
public RecordReader getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
Project: big-c
File: StreamXmlRecordReader.java
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);
  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");
  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;
  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
Project: big-c
File: AutoInputFormat.java
public RecordReader getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
Project: emr-dynamodb-connector
File: ImportRecordReaderFactory.java
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
Project: indexr
File: IndexRRecordReader.java
public IndexRRecordReader(InputSplit inputSplit, Configuration configuration) throws IOException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Preconditions.checkState(fileSplit.getStart() == 0, "Segment should not splited");
  Path filePath = fileSplit.getPath();
  // Hive may ask to read a file located on local file system.
  // We have to get the real file system by path's schema.
  FileSystem fileSystem = FileSystem.get(filePath.toUri(), FileSystem.get(configuration).getConf());
  if (SegmentHelper.checkSegmentByPath(filePath)) {
    ByteBufferReader.Opener opener = ByteBufferReader.Opener.create(fileSystem, filePath);
    IntegratedSegment.Fd fd = IntegratedSegment.Fd.create(filePath.toString(), opener);
    if (fd != null) {
      segment = fd.open();
      offset = 0L;
      rowIterator = segment.rowTraversal().iterator();
      getIncludeColumns(configuration, segment);
    }
  } else {
    LOG.warn("ignore " + filePath);
  }
}
Project: es-hadoop-v2.2.0
File: EsHiveInputFormat.java
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // first, merge input table properties (since there's no access to them ...)
  Settings settings = HadoopSettingsManager.loadFrom(job);
  //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));
  Log log = LogFactory.getLog(getClass());
  // move on to initialization
  InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
  settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS, StringUtils.concatenateAndUriEncode(HiveUtils.columnToAlias(settings), ","));
  // set read resource
  settings.setResourceRead(settings.getResourceRead());
  HiveUtils.init(settings, log);
  // decorate original splits as FileSplit
  InputSplit[] shardSplits = super.getSplits(job, numSplits);
  FileSplit[] wrappers = new FileSplit[shardSplits.length];
  Path path = new Path(job.get(HiveConstants.TABLE_LOCATION));
  for (int i = 0; i < wrappers.length; i++) {
    wrappers[i] = new EsHiveSplit(shardSplits[i], path);
  }
  return wrappers;
}
Project: drill
File: HiveDrillNativeScanBatchCreator.java
/**
 * Get the list of row group numbers for given file input split. Logic used here is same as how Hive's parquet input
 * format finds the row group numbers for input split.
 */
private List<Integer> getRowGroupNumbersFromFileSplit(final FileSplit split,
    final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final long splitStart = split.getStart();
  final long splitLength = split.getLength();
  final List<Integer> rowGroupNums = Lists.newArrayList();
  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }
  return rowGroupNums;
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testOrcDataStream()
    throws Exception
{
  HiveOutputFormat<?, ?> outputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
  InputFormat<?, ?> inputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();
  File file = File.createTempFile("presto_test", "orc");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new OrcPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Project: hadoop-2.6.0-cdh5.4.3
File: StreamXmlRecordReader.java
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);
  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");
  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;
  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
Project: hadoop-2.6.0-cdh5.4.3
File: AutoInputFormat.java
public RecordReader getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
Project: presto
File: TestHiveFileFormats.java
@Test(enabled = false)
public void testRcTextPageSource()
    throws Exception
{
  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new ColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new RcFilePageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Project: systemml
File: RemoteParForColocatedNLineInputFormat.java
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException
{
  InputSplit[] tmp = super.getSplits(job, numSplits);
  //get partitioning information
  MatrixCharacteristics mc = MRJobConfiguration.getPartitionedMatrixSize(job);
  PDataPartitionFormat dpf = MRJobConfiguration.getPartitioningFormat(job);
  PartitionFormat pf = new PartitionFormat(dpf, -1);
  int blen = (int) (pf.isRowwise() ? pf.getNumRows(mc) : pf.getNumColumns(mc));
  String fname = MRJobConfiguration.getPartitioningFilename(job);
  //create wrapper splits
  InputSplit[] ret = new InputSplit[ tmp.length ];
  for( int i=0; i<tmp.length; i++ ) {
    //check for robustness of subsequent cast
    if( tmp[i] instanceof FileSplit )
      ret[i] = new RemoteParForColocatedFileSplit( (FileSplit) tmp[i], fname, blen );
    else
      ret[i] = tmp[i];
  }
  return ret;
}
Project: hadoop-2.6.0-cdh5.4.3
File: LineDocRecordReader.java
/**
 * Constructor
 * @param job
 * @param split
 * @throws IOException
 */
public LineDocRecordReader(Configuration job, FileSplit split)
    throws IOException {
  long start = split.getStart();
  long end = start + split.getLength();
  final Path file = split.getPath();
  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  InputStream in = fileIn;
  boolean skipFirstLine = false;
  if (start != 0) {
    skipFirstLine = true; // wait till BufferedInputStream to skip
    --start;
    fileIn.seek(start);
  }
  this.in = new BufferedInputStream(in);
  if (skipFirstLine) { // skip first line and re-establish "start".
    start += LineDocRecordReader.readData(this.in, null, EOL);
  }
  this.start = start;
  this.pos = start;
  this.end = end;
}
Project: systemml
File: DelegatingInputFormat.java
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
Reporter reporter) throws IOException {
// Find the InputFormat and then the RecordReader from the
// TaggedInputSplit.
TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
.newInstance(taggedInputSplit.getInputFormatClass(), conf);
InputSplit inputSplit = taggedInputSplit.getInputSplit();
if (inputSplit instanceof FileSplit) {
FileSplit fileSplit = (FileSplit) inputSplit;
conf.set(MRConfigurationNames.MR_MAP_INPUT_FILE, fileSplit.getPath().toString());
conf.setLong(MRConfigurationNames.MR_MAP_INPUT_START, fileSplit.getStart());
conf.setLong(MRConfigurationNames.MR_MAP_INPUT_LENGTH, fileSplit.getLength());
}
return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf,
reporter);
}
Project: hadoop-EAR
File: TeraValidate.java
public void map(Text key, Text value, OutputCollector<Text,Text> output,
    Reporter reporter) throws IOException {
  if (lastKey == null) {
    filename = getFilename((FileSplit) reporter.getInputSplit());
    output.collect(new Text(filename + ":begin"), key);
    lastKey = new Text();
    this.output = output;
  } else {
    if (key.compareTo(lastKey) < 0) {
      output.collect(error, new Text("misorder in " + filename +
          " last: '" + lastKey +
          "' current: '" + key + "'"));
    }
  }
  lastKey.set(key);
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testParquetUseColumnNames()
    throws Exception
{
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();
  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    // Reverse the order of the columns to test access by name, not by index
    Collections.reverse(testColumns);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(true);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Project: hadoop-EAR
File: StreamXmlRecordReader.java
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);
  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");
  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;
  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
Project: hops
File: StreamXmlRecordReader.java
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
    JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);
  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");
  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;
  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testParquet()
    throws Exception
{
  List<TestColumn> testColumns = getTestColumnsSupportedByParquet();
  HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
  InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new ParquetHiveSerDe();
  File file = File.createTempFile("presto_test", "parquet");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false);
    testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testDwrfDataStream()
    throws Exception
{
  List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
    ObjectInspector objectInspector = testColumn.getObjectInspector();
    return !hasType(objectInspector, PrimitiveCategory.DATE);
  }));
  HiveOutputFormat<?, ?> outputFormat = new com.facebook.hive.orc.OrcOutputFormat();
  InputFormat<?, ?> inputFormat = new com.facebook.hive.orc.OrcInputFormat();
  @SuppressWarnings("deprecation")
  SerDe serde = new com.facebook.hive.orc.OrcSerde();
  File file = File.createTempFile("presto_test", "dwrf");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    testPageSourceFactory(new DwrfPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, testColumns);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Project: hops
File: AutoInputFormat.java
public RecordReader getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
Project: incubator-mrql
File: StormParsedInputFormat.java
public ParsedRecordReader ( FileSplit split,
                            Configuration conf,
                            Class<? extends Parser> parser_class,
                            Trees args ) throws IOException {
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(conf);
  fsin = fs.open(split.getPath());
  try {
    parser = parser_class.newInstance();
  } catch (Exception ex) {
    throw new Error("Unrecognized parser:"+parser_class);
  };
  parser.initialize(args);
  parser.open(fsin,start,end);
  result = null;
}
Project: solr-hadoop-common
File: XMLInputFormat.java
public XMLRecordReader(FileSplit split, JobConf jobConf) throws IOException {
  log.info("Setting up XMLRecordReader for path: [" + split.getPath() + "]");
  log.info("startTag=" + jobConf.get(START_TAG_KEY) + ", endTag=" + jobConf.get(END_TAG_KEY));
  startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
  endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");
  // open the file and seek to the start of the split
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(jobConf);
  path = split.getPath().getName();
  fsin = fs.open(split.getPath());
  fsin.seek(start);
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testRCBinary()
    throws Exception
{
  List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
    // RC file does not support complex type as key of a map
    return !testColumn.getName().equals("t_map_null_key_complex_key_value");
  }));
  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new LazyBinaryColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
    testCursorProvider(new ColumnarBinaryHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
    testCursorProvider(new GenericHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Project: presto
File: TestHiveFileFormats.java
@Test(enabled = false)
public void testRcBinaryPageSource()
    throws Exception
{
  HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
  InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
  @SuppressWarnings("deprecation")
  SerDe serde = new LazyBinaryColumnarSerDe();
  File file = File.createTempFile("presto_test", "rc-binary");
  file.delete();
  try {
    FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
    testPageSourceFactory(new RcFilePageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
  }
  finally {
    //noinspection ResultOfMethodCallIgnored
    file.delete();
  }
}
Project: systemml
File: PickFromCompactInputFormat.java
public RangePickRecordReader(JobConf job, FileSplit split)
    throws IOException
{
  parseSelectedRangeString(job.get(SELECTED_RANGES));
  // check if the current part file needs to be processed
  path = split.getPath();
  totLength = split.getLength();
  currentStream = IOUtilFunctions.getFileSystem(path, job).open(path);
  currPart = getIndexInTheArray(path.getName());
  if ( currPart < beginPart || currPart > endPart ) {
    noRecordsNeeded = true;
    return;
  }
  int part0=job.getInt(PARTITION_OF_ZERO, -1);
  boolean contain0s=false;
  long numZeros =0;
  if(part0==currPart) {
    contain0s = true;
    numZeros = job.getLong(NUMBER_OF_ZERO, 0);
  }
  reader=new ReadWithZeros(currentStream, contain0s, numZeros);
}
Project: cascading.csv
File: CsvRecordReaderTest.java
/**
 * Helper function that iterates through the RecordReader and asserts the record count.
 */
public void testForReadAllRecordsNotStrict(String fileName, int expectedRecordCount) throws IOException {
  CsvInputFormat inputFormat = helper.createCSVInputFormat(conf);
  File inputFile = helper.getFile(fileName);
  Path inputPath = new Path(inputFile.getAbsoluteFile().toURI().toString());
  FileSplit split = helper.createFileSplit(inputPath, 0, inputFile.length());
  RecordReader createdReader = helper.createRecordReader(inputFormat, split, jobConf);
  LongWritable key = new LongWritable();
  ListWritable<Text> value = new ListWritable<Text>(Text.class);
  int actualRecordCount = 0;
  while (createdReader.next(key, value)) {
    actualRecordCount++;
  }
  assertEquals(expectedRecordCount, actualRecordCount);
}
Project: QDrill
File: DrillTextRecordReader.java
public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context,
    char delimiter, List<SchemaPath> columns) {
  this.delimiter = (byte) delimiter;
  this.split = split;
  setColumns(columns);
  if (!isStarQuery()) {
    String pathStr;
    for (SchemaPath path : columns) {
      assert path.getRootSegment().isNamed();
      pathStr = path.getRootSegment().getPath();
      Preconditions.checkArgument(pathStr.equals(COL_NAME) || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
          "Selected column(s) must have name 'columns' or must be plain '*'");
      if (path.getRootSegment().getChild() != null) {
        Preconditions.checkArgument(path.getRootSegment().getChild().isArray(), "Selected column must be an array index");
        int index = path.getRootSegment().getChild().getArraySegment().getIndex();
        columnIds.add(index);
      }
    }
    Collections.sort(columnIds);
    numCols = columnIds.size();
  }
  TextInputFormat inputFormat = new TextInputFormat();
  JobConf job = new JobConf(fsConf);
  job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
  job.setInputFormat(inputFormat.getClass());
  try {
    reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
    key = reader.createKey();
    value = reader.createValue();
    totalRecordsRead = 0;
  } catch (Exception e) {
    handleAndRaise("Failure in creating record reader", e);
  }
}
Project: QDrill
File: TextFormatPlugin.java
@Override
public RecordReader getRecordReader(FragmentContext context, DrillFileSystem dfs, FileWork fileWork,
    List<SchemaPath> columns) throws ExecutionSetupException {
  Path path = dfs.makeQualified(new Path(fileWork.getPath()));
  FileSplit split = new FileSplit(path, fileWork.getStart(), fileWork.getLength(), new String[]{""});
  if (context.getOptions().getOption(ExecConstants.ENABLE_NEW_TEXT_READER_KEY).bool_val == true) {
    TextParsingSettings settings = new TextParsingSettings();
    settings.set((TextFormatConfig)formatConfig);
    return new CompliantTextRecordReader(split, dfs, context, settings, columns);
  } else {
    char delim = ((TextFormatConfig)formatConfig).getFieldDelimiter();
    return new DrillTextRecordReader(split, dfs.getConf(), context, delim, columns);
  }
}