Example source code for the Java class org.apache.hadoop.mapred.InputFormat
Project: ditb
File: TestTableInputFormat.java
void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
final JobConf job = MapreduceTestingShim.getJobConf(mrCluster);
job.setInputFormat(clazz);
job.setOutputFormat(NullOutputFormat.class);
job.setMapperClass(ExampleVerifier.class);
job.setNumReduceTasks(0);
LOG.debug("submitting job.");
final RunningJob run = JobClient.runJob(job);
assertTrue("job failed!", run.isSuccessful());
assertEquals("Saw the wrong number of instances of the filtered-for row.", 2, run.getCounters()
.findCounter(TestTableInputFormat.class.getName() + ":row", "aaa").getCounter());
assertEquals("Saw any instances of the filtered out row.", 0, run.getCounters()
.findCounter(TestTableInputFormat.class.getName() + ":row", "bbb").getCounter());
assertEquals("Saw the wrong number of instances of columnA.", 1, run.getCounters()
.findCounter(TestTableInputFormat.class.getName() + ":family", "columnA").getCounter());
assertEquals("Saw the wrong number of instances of columnB.", 1, run.getCounters()
.findCounter(TestTableInputFormat.class.getName() + ":family", "columnB").getCounter());
assertEquals("Saw the wrong count of values for the filtered-for row.", 2, run.getCounters()
.findCounter(TestTableInputFormat.class.getName() + ":value", "value aaa").getCounter());
assertEquals("Saw the wrong count of values for the filtered-out row.", 0, run.getCounters()
.findCounter(TestTableInputFormat.class.getName() + ":value", "value bbb").getCounter());
}
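The ExampleVerifier mapper referenced above is defined elsewhere in the test; the following is only a rough sketch, under the assumption that it is a TableMap publishing the row/family/value counters the assertions read (the class name CountingVerifier and the details are illustrative, not the actual HBase test code):
public static class CountingVerifier extends MapReduceBase
    implements TableMap<NullWritable, NullWritable> {
  @Override
  public void map(ImmutableBytesWritable key, Result value,
      OutputCollector<NullWritable, NullWritable> output, Reporter reporter)
      throws IOException {
    // One counter hit per row key, plus one per family and one per cell value.
    reporter.incrCounter(TestTableInputFormat.class.getName() + ":row",
        Bytes.toString(key.get()), 1);
    for (Cell cell : value.listCells()) {
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":family",
          Bytes.toString(CellUtil.cloneFamily(cell)), 1);
      reporter.incrCounter(TestTableInputFormat.class.getName() + ":value",
          Bytes.toString(CellUtil.cloneValue(cell)), 1);
    }
  }
}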
Project: hadoop
File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
MapClass.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class, MapClass2.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
.getMapperTypeMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
assertEquals(MapClass.class, maps.get(new Path("/foo")));
assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
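For context, a typical production use of the per-path mappings verified above might look like the following sketch (the paths and the LogMapper/UserMapper/JoinReducer/MyJoinJob classes are hypothetical; JobClient.runJob throws IOException, so the fragment belongs inside a method that declares it):
JobConf job = new JobConf(MyJoinJob.class);
job.setJobName("two-source-join");
// Each input directory gets its own InputFormat and Mapper.
MultipleInputs.addInputPath(job, new Path("/data/logs"),
    TextInputFormat.class, LogMapper.class);
MultipleInputs.addInputPath(job, new Path("/data/users"),
    KeyValueTextInputFormat.class, UserMapper.class);
job.setReducerClass(JoinReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileOutputFormat.setOutputPath(job, new Path("/out/joined"));
JobClient.runJob(job);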
Project: hadoop
File: InputSampler.java
/**
* From each split sampled, take the first numSamples / numSplits records.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
int samplesPerSplit = numSamples / splitsToSample;
long records = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
samples.add(key);
key = reader.createKey();
++records;
if ((i+1) * samplesPerSplit <= records) {
break;
}
}
reader.close();
}
return (K[])samples.toArray();
}
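The usual consumer of this sampler is total-order sorting: the sampled keys become the cut points for TotalOrderPartitioner. A minimal sketch under the old mapred API (the SortJob class, paths and Text key/value types are illustrative; exception handling is elided):
JobConf job = new JobConf(SortJob.class);
job.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.addInputPath(job, new Path("/data/input"));
FileOutputFormat.setOutputPath(job, new Path("/data/sorted"));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(10);
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job, new Path("/tmp/_partitions"));
// Take up to 1000 keys from at most 10 splits and write the 9 cut points.
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.SplitSampler<Text, Text>(1000, 10);
InputSampler.writePartitionFile(job, sampler);
JobClient.runJob(job);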
Project: hadoop
File: InputSampler.java
/**
* For each split sampled, emit when the ratio of the number of records
* retained to the total record count is less than the specified
* frequency.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>();
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
long records = 0;
long kept = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
++records;
if ((double) kept / records < freq) {
++kept;
samples.add(key);
key = reader.createKey();
}
}
reader.close();
}
return (K[])samples.toArray();
}
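A small usage sketch for this frequency-based sampler, assuming job is a JobConf already configured with an input format and input paths (the Text types and 1% frequency are illustrative). Because of the unchecked toArray() cast flagged above, the result is best received as Object[]; assigning it to a typed K[] variable would fail at runtime:
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.IntervalSampler<Text, Text>(0.01, 5); // ~1% of records, at most 5 splits
Object[] sampledKeys = sampler.getSample(job.getInputFormat(), job);
System.out.println("sampled " + sampledKeys.length + " keys");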
Project: ditb
File: TableMapReduceUtil.java
/**
* @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
*/
public static void addDependencyJars(JobConf job) throws IOException {
org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars(
job,
// when making changes here, consider also mapreduce.TableMapReduceUtil
// pull job classes
job.getMapOutputKeyClass(),
job.getMapOutputValueClass(),
job.getOutputKeyClass(),
job.getOutputValueClass(),
job.getPartitionerClass(),
job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
job.getCombinerClass());
}
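A hedged sketch of where this helper is typically called from: an HBase scan job set up with the old mapred API, shipping the dependency jars before submission (the table, column and MyHBaseJob/MyTableMapper names are hypothetical, and some HBase versions already add the jars inside initTableMapJob, so the explicit call is shown purely for illustration):
JobConf job = new JobConf(conf, MyHBaseJob.class);
TableMapReduceUtil.initTableMapJob("my_table", "d:qual", MyTableMapper.class,
    ImmutableBytesWritable.class, Result.class, job);
job.setNumReduceTasks(0);
job.setOutputFormat(NullOutputFormat.class);
TableMapReduceUtil.addDependencyJars(job); // ship HBase + job classes with the job
JobClient.runJob(job);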
Project: aliyun-oss-hadoop-fs
File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
MapClass.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class, MapClass2.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
.getMapperTypeMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
assertEquals(MapClass.class, maps.get(new Path("/foo")));
assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: aliyun-oss-hadoop-fs
File: InputSampler.java
/**
* From each split sampled, take the first numSamples / numSplits records.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
int samplesPerSplit = numSamples / splitsToSample;
long records = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
samples.add(key);
key = reader.createKey();
++records;
if ((i+1) * samplesPerSplit <= records) {
break;
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: aliyun-oss-hadoop-fs
File: InputSampler.java
/**
* For each split sampled, emit when the ratio of the number of records
* retained to the total record count is less than the specified
* frequency.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>();
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
long records = 0;
long kept = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
++records;
if ((double) kept / records < freq) {
++kept;
samples.add(key);
key = reader.createKey();
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: big-c
File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
MapClass.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class, MapClass2.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
.getMapperTypeMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
assertEquals(MapClass.class, maps.get(new Path("/foo")));
assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: big-c
File: InputSampler.java
/**
* From each split sampled, take the first numSamples / numSplits records.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
int samplesPerSplit = numSamples / splitsToSample;
long records = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
samples.add(key);
key = reader.createKey();
++records;
if ((i+1) * samplesPerSplit <= records) {
break;
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: big-c
File: InputSampler.java
/**
* For each split sampled, emit when the ratio of the number of records
* retained to the total record count is less than the specified
* frequency.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>();
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
long records = 0;
long kept = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
++records;
if ((double) kept / records < freq) {
++kept;
samples.add(key);
key = reader.createKey();
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: hive-phoenix-handler
File: HiveInputFormat.java
public static InputFormat<WritableComparable, Writable> getInputFormatFromCache(
Class inputFormatClass, JobConf job) throws IOException {
InputFormat<WritableComparable, Writable> instance = inputFormats.get(inputFormatClass);
if (instance == null) {
try {
instance = (InputFormat<WritableComparable, Writable>) ReflectionUtil
.newInstance(inputFormatClass, job);
// HBase input formats are not thread safe today. See HIVE-8808.
String inputFormatName = inputFormatClass.getName().toLowerCase();
if (!inputFormatName.contains("hbase")) {
inputFormats.put(inputFormatClass, instance);
}
} catch (Exception e) {
throw new IOException("Cannot create an instance of InputFormat class "
+ inputFormatClass.getName() + " as specified in mapredWork!", e);
}
}
return instance;
}
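The inputFormats field used above is a class-level cache; a minimal sketch of its presumed declaration (the real field in HiveInputFormat may differ):
// Thread-safe cache keyed by InputFormat class; HBase formats are deliberately
// kept out of it by the method above (see HIVE-8808).
private static final Map<Class, InputFormat<WritableComparable, Writable>> inputFormats =
    new ConcurrentHashMap<Class, InputFormat<WritableComparable, Writable>>();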
Project: drill
File: SequenceFileRecordReader.java
private org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> getRecordReader(
final InputFormat<BytesWritable, BytesWritable> inputFormat,
final JobConf jobConf) throws ExecutionSetupException {
try {
final UserGroupInformation ugi = ImpersonationUtil.createProxyUgi(this.opUserName, this.queryUserName);
return ugi.doAs(new PrivilegedExceptionAction<org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable>>() {
@Override
public org.apache.hadoop.mapred.RecordReader<BytesWritable, BytesWritable> run() throws Exception {
return inputFormat.getRecordReader(split, jobConf, Reporter.NULL);
}
});
} catch (IOException | InterruptedException e) {
throw new ExecutionSetupException(
String.format("Error in creating sequencefile reader for file: %s, start: %d, length: %d",
split.getPath(), split.getStart(), split.getLength()), e);
}
}
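Stripped of the Drill-specific wrappers, the impersonation pattern above reduces to standard Hadoop UserGroupInformation calls; a self-contained sketch of that pattern (the user name and path are illustrative, exception handling is elided):
UserGroupInformation proxyUgi = UserGroupInformation.createProxyUser(
    "query_user", UserGroupInformation.getLoginUser());
FileStatus[] listing = proxyUgi.doAs(new PrivilegedExceptionAction<FileStatus[]>() {
  @Override
  public FileStatus[] run() throws Exception {
    // Everything inside run() executes with the proxy user's identity.
    return FileSystem.get(new Configuration()).listStatus(new Path("/data"));
  }
});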
Project: drill
File: ConvertHiveParquetScanToDrillParquetScan.java
/**
* Get the input format from the given {@link StorageDescriptor}.
* @param properties
* @param hiveReadEntry
* @param sd
* @return {@link InputFormat} class, or null if a failure occurred. Failures are logged as warnings.
*/
private Class<? extends InputFormat<?, ?>> getInputFormatFromSD(final Properties properties,
final HiveReadEntry hiveReadEntry, final StorageDescriptor sd, final HiveConf hiveConf) {
final Table hiveTable = hiveReadEntry.getTable();
try {
final String inputFormatName = sd.getInputFormat();
if (!Strings.isNullOrEmpty(inputFormatName)) {
return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
}
final JobConf job = new JobConf(hiveConf);
HiveUtilities.addConfToJob(job, properties);
return HiveUtilities.getInputFormatClass(job, sd, hiveTable);
} catch (final Exception e) {
logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
return null;
}
}
Project: drill
File: HiveUtilities.java
/**
* Utility method which gets the table or partition {@link InputFormat} class. It first
* tries to get the class name from the given StorageDescriptor object. If that doesn't contain it, it falls back to the
* StorageHandler class set in the table properties. If neither is found, an exception is thrown.
* @param job {@link JobConf} instance, needed in case the table is a StorageHandler-based table.
* @param sd {@link StorageDescriptor} instance of the partition currently being read, or of the table (for non-partitioned tables).
* @param table Table object
* @throws Exception
*/
public static Class<? extends InputFormat<?, ?>> getInputFormatClass(final JobConf job, final StorageDescriptor sd,
final Table table) throws Exception {
final String inputFormatName = sd.getInputFormat();
if (Strings.isNullOrEmpty(inputFormatName)) {
final String storageHandlerClass = table.getParameters().get(META_TABLE_STORAGE);
if (Strings.isNullOrEmpty(storageHandlerClass)) {
throw new ExecutionSetupException("Unable to get Hive table InputFormat class. There is neither " +
"InputFormat class explicitly specified nor StorageHandler class");
}
final HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(job, storageHandlerClass);
return (Class<? extends InputFormat<?, ?>>) storageHandler.getInputFormatClass();
} else {
return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName) ;
}
}
Project: hadoop-etl-udfs
File: HdfsSerDeImportServiceTest.java
private void runImportRCFile(ExaIterator ctx, List<HCatTableColumn> columns, List<HCatTableColumn> partitionColumns, List<OutputColumnSpec> outputColumns, String file) throws Exception {
List<HCatSerDeParameter> serDeParameters = new ArrayList<>();
serDeParameters.add(new HCatSerDeParameter("serialization.format", "1"));
String inputFormatClassName = "org.apache.hadoop.hive.ql.io.RCFileInputFormat";
String serDeClassName = "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe";
String hdfsUser = "hdfs";
boolean useKerberos = false;
List<String> hdfsServers = new ArrayList<>();
hdfsServers.add("file:///");
final Configuration conf = new Configuration();
FileSystem fs = HdfsService.getFileSystem(hdfsServers,conf);
InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) UdfUtils.getInstanceByName(inputFormatClassName);
SerDe serDe = (SerDe) UdfUtils.getInstanceByName(serDeClassName);
HdfsSerDeImportService.importFile(fs, file, partitionColumns, inputFormat, serDe, serDeParameters, hdfsServers, hdfsUser, columns, outputColumns, useKerberos, ctx);
}
Project: presto
File: HiveFileIterator.java
public HiveFileIterator(
Path path,
FileSystem fileSystem,
DirectoryLister directoryLister,
NamenodeStats namenodeStats,
String partitionName,
InputFormat<?, ?> inputFormat,
Properties schema,
List<HivePartitionKey> partitionKeys,
TupleDomain<HiveColumnHandle> effectivePredicate)
{
this.partitionName = requireNonNull(partitionName, "partitionName is null");
this.inputFormat = requireNonNull(inputFormat, "inputFormat is null");
this.schema = requireNonNull(schema, "schema is null");
this.partitionKeys = requireNonNull(partitionKeys, "partitionKeys is null");
this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null");
this.path = requireNonNull(path, "path is null");
this.fileSystem = requireNonNull(fileSystem, "fileSystem is null");
this.directoryLister = requireNonNull(directoryLister, "directoryLister is null");
this.namenodeStats = requireNonNull(namenodeStats, "namenodeStats is null");
}
Project: hadoop-2.6.0-cdh5.4.3
File: InputSampler.java
/**
* From each split sampled, take the first numSamples / numSplits records.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
int samplesPerSplit = numSamples / splitsToSample;
long records = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
samples.add(key);
key = reader.createKey();
++records;
if ((i+1) * samplesPerSplit <= records) {
break;
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: hadoop-2.6.0-cdh5.4.3
File: InputSampler.java
/**
* For each split sampled, emit when the ratio of the number of records
* retained to the total record count is less than the specified
* frequency.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>();
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
long records = 0;
long kept = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
++records;
if ((double) kept / records < freq) {
++kept;
samples.add(key);
key = reader.createKey();
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: FlexMap
File: InputSampler.java
/**
* From each split sampled, take the first numSamples / numSplits records.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
int samplesPerSplit = numSamples / splitsToSample;
long records = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
samples.add(key);
key = reader.createKey();
++records;
if ((i+1) * samplesPerSplit <= records) {
break;
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: hops
File: TestMultipleInputs.java
@Test
public void testAddInputPathWithMapper() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
MapClass.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class, MapClass2.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
.getMapperTypeMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
assertEquals(MapClass.class, maps.get(new Path("/foo")));
assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: hadoop-2.6.0-cdh5.4.3
File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
MapClass.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class, MapClass2.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
.getMapperTypeMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
assertEquals(MapClass.class, maps.get(new Path("/foo")));
assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testParquet()
throws Exception
{
List<TestColumn> testColumns = getTestColumnsSupportedByParquet();
HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
@SuppressWarnings("deprecation")
SerDe serde = new ParquetHiveSerDe();
File file = File.createTempFile("presto_test", "parquet");
file.delete();
try {
FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(false);
testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
}
finally {
//noinspection ResultOfMethodCallIgnored
file.delete();
}
}
Project: hadoop-EAR
File: InputSampler.java
/**
* For each split sampled, emit when the ratio of the number of records
* retained to the total record count is less than the specified
* frequency.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>();
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
long records = 0;
long kept = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
++records;
if ((double) kept / records < freq) {
++kept;
samples.add(key);
key = reader.createKey();
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: hadoop-EAR
File: DistCp.java
private static JobConf createJobConf(Configuration conf, boolean useFastCopy) {
Class<? extends InputFormat> inputFormat =
(useFastCopy) ? FastCopyInputFormat.class : CopyInputFormat.class;
JobConf jobconf = new JobConf(conf, DistCp.class);
jobconf.setJobName(NAME);
// turn off speculative execution, because DFS doesn't handle
// multiple writers to the same file.
jobconf.setReduceSpeculativeExecution(false);
jobconf.setMapOutputKeyClass(FilePairComparable.class);
jobconf.setMapOutputValueClass(Text.class);
jobconf.setOutputKeyClass(FilePairComparable.class);
jobconf.setOutputValueClass(Text.class);
jobconf.setInputFormat(inputFormat);
jobconf.setMapperClass(CopyFilesTask.class);
jobconf.setReducerClass(CopyFilesTask.class);
// Prevent the reducer from starting until all maps are done.
jobconf.setInt("mapred.job.rushreduce.reduce.threshold", 0);
jobconf.setFloat("mapred.reduce.slowstart.completed.maps", 1.0f);
return jobconf;
}
Project: hadoop-plus
File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
MapClass.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class, MapClass2.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
.getMapperTypeMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
assertEquals(MapClass.class, maps.get(new Path("/foo")));
assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testRCBinary()
throws Exception
{
List<TestColumn> testColumns = ImmutableList.copyOf(filter(TEST_COLUMNS, testColumn -> {
// RC file does not support complex type as key of a map
return !testColumn.getName().equals("t_map_null_key_complex_key_value");
}));
HiveOutputFormat<?, ?> outputFormat = new RCFileOutputFormat();
InputFormat<?, ?> inputFormat = new RCFileInputFormat<>();
@SuppressWarnings("deprecation")
SerDe serde = new LazyBinaryColumnarSerDe();
File file = File.createTempFile("presto_test", "rc-binary");
try {
FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
testCursorProvider(new ColumnarBinaryHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
testCursorProvider(new GenericHiveRecordCursorProvider(), split, inputFormat, serde, testColumns, NUM_ROWS);
}
finally {
//noinspection ResultOfMethodCallIgnored
file.delete();
}
}
Project: hadoop-plus
File: InputSampler.java
/**
* From each split sampled, take the first numSamples / numSplits records.
*/
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.length);
int splitStep = splits.length / splitsToSample;
int samplesPerSplit = numSamples / splitsToSample;
long records = 0;
for (int i = 0; i < splitsToSample; ++i) {
RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
job, Reporter.NULL);
K key = reader.createKey();
V value = reader.createValue();
while (reader.next(key, value)) {
samples.add(key);
key = reader.createKey();
++records;
if ((i+1) * samplesPerSplit <= records) {
break;
}
}
reader.close();
}
return (K[])samples.toArray();
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testParquetUseColumnNames()
throws Exception
{
List<TestColumn> testColumns = getTestColumnsSupportedByParquet();
HiveOutputFormat<?, ?> outputFormat = new MapredParquetOutputFormat();
InputFormat<?, ?> inputFormat = new MapredParquetInputFormat();
@SuppressWarnings("deprecation")
SerDe serde = new ParquetHiveSerDe();
File file = File.createTempFile("presto_test", "parquet");
file.delete();
try {
FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, testColumns, NUM_ROWS);
// Reverse the order of the columns to test access by name, not by index
Collections.reverse(testColumns);
HiveRecordCursorProvider cursorProvider = new ParquetRecordCursorProvider(true);
testCursorProvider(cursorProvider, split, inputFormat, serde, testColumns, NUM_ROWS);
}
finally {
//noinspection ResultOfMethodCallIgnored
file.delete();
}
}
Project: presto
File: TestHiveFileFormats.java
@Test
public void testOrcDataStream()
throws Exception
{
HiveOutputFormat<?, ?> outputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat();
InputFormat<?, ?> inputFormat = new org.apache.hadoop.hive.ql.io.orc.OrcInputFormat();
@SuppressWarnings("deprecation")
SerDe serde = new org.apache.hadoop.hive.ql.io.orc.OrcSerde();
File file = File.createTempFile("presto_test", "orc");
file.delete();
try {
FileSplit split = createTestFile(file.getAbsolutePath(), outputFormat, serde, null, TEST_COLUMNS, NUM_ROWS);
testPageSourceFactory(new OrcPageSourceFactory(TYPE_MANAGER), split, inputFormat, serde, TEST_COLUMNS);
}
finally {
//noinspection ResultOfMethodCallIgnored
file.delete();
}
}
Project: hazelcast-jet
File: ReadHdfsP.java
@Override
public void init(@Nonnull Context context) {
logger = context.jetInstance().getHazelcastInstance().getLoggingService().getLogger(ReadHdfsP.class);
try {
int totalParallelism = context.totalParallelism();
InputFormat inputFormat = jobConf.getInputFormat();
InputSplit[] splits = inputFormat.getSplits(jobConf, totalParallelism);
IndexedInputSplit[] indexedInputSplits = new IndexedInputSplit[splits.length];
Arrays.setAll(indexedInputSplits, i -> new IndexedInputSplit(i, splits[i]));
Address[] addrs = context.jetInstance().getCluster().getMembers()
.stream().map(Member::getAddress).toArray(Address[]::new);
assigned = assignSplitsToMembers(indexedInputSplits, addrs);
printAssignments(assigned);
} catch (IOException e) {
throw rethrow(e);
}
}
Project: hazelcast-jet
File: ReadHdfsP.java
@Override
@Nonnull
public List<Processor> get(int count) {
Map<Integer, List<IndexedInputSplit>> processorToSplits =
range(0, assignedSplits.size()).mapToObj(i -> new SimpleImmutableEntry<>(i, assignedSplits.get(i)))
.collect(groupingBy(e -> e.getKey() % count,
mapping(Entry::getValue, toList())));
range(0, count)
.forEach(processor -> processorToSplits.computeIfAbsent(processor, x -> emptyList()));
InputFormat inputFormat = jobConf.getInputFormat();
return processorToSplits
.values().stream()
.map(splits -> splits.isEmpty()
? Processors.noopP().get()
: new ReadHdfsP<>(splits.stream()
.map(IndexedInputSplit::getSplit)
.map(split -> uncheckCall(() ->
inputFormat.getRecordReader(split, jobConf, NULL)))
.collect(toList()), mapper)
).collect(toList());
}
Project: presto
File: HiveUtil.java
static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget)
{
String inputFormatName = getInputFormatName(schema);
try {
JobConf jobConf = new JobConf(configuration);
Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName);
if (symlinkTarget && (inputFormatClass == SymlinkTextInputFormat.class)) {
// symlink targets are always TextInputFormat
inputFormatClass = TextInputFormat.class;
}
return ReflectionUtils.newInstance(inputFormatClass, jobConf);
}
catch (ClassNotFoundException | RuntimeException e) {
throw new RuntimeException("Unable to create input format " + inputFormatName, e);
}
}
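The same resolve-and-instantiate step, reduced to the underlying Hadoop calls (the format class name is just an example; exception handling is elided). ReflectionUtils.newInstance hands the configuration to the new object when the class is Configurable or JobConfigurable, which is why the JobConf is passed in:
JobConf jobConf = new JobConf(new Configuration());
Class<?> clazz = Class.forName("org.apache.hadoop.mapred.TextInputFormat");
InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(clazz, jobConf);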
Project: pbase
File: TableMapReduceUtil.java
/**
* @see org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil#addDependencyJars(org.apache.hadoop.mapreduce.Job)
*/
public static void addDependencyJars(JobConf job) throws IOException {
org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addHBaseDependencyJars(job);
org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.addDependencyJars(
job,
// when making changes here, consider also mapreduce.TableMapReduceUtil
// pull job classes
job.getMapOutputKeyClass(),
job.getMapOutputValueClass(),
job.getOutputKeyClass(),
job.getOutputValueClass(),
job.getPartitionerClass(),
job.getClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class),
job.getClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class),
job.getCombinerClass());
}
Project: FlexMap
File: TestMultipleInputs.java
public void testAddInputPathWithMapper() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
MapClass.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class, MapClass2.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
.getMapperTypeMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
assertEquals(MapClass.class, maps.get(new Path("/foo")));
assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Project: QDrill
File: ConvertHiveParquetScanToDrillParquetScan.java
/**
* The rule matches when all of the following hold:
* 1) The GroupScan in the given DrillScanRel is a {@link HiveScan}
* 2) The {@link HiveScan} is not already rewritten using Drill's native readers
* 3) The InputFormat in the Hive table metadata and in the metadata of every partition is
* {@link MapredParquetInputFormat}
* 4) No error occurred while checking the above conditions. Errors are logged as warnings.
*
* @param call
* @return True if the rule can be applied. False otherwise
*/
@Override
public boolean matches(RelOptRuleCall call) {
final DrillScanRel scanRel = (DrillScanRel) call.rel(0);
final PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());
if (!(scanRel.getGroupScan() instanceof HiveScan) || ((HiveScan) scanRel.getGroupScan()).isNativeReader()) {
return false;
}
final HiveScan hiveScan = (HiveScan) scanRel.getGroupScan();
final Table hiveTable = hiveScan.hiveReadEntry.getTable();
final Class<? extends InputFormat> tableInputFormat = getInputFormatFromSD(hiveTable, hiveTable.getSd());
if (tableInputFormat == null || !tableInputFormat.equals(MapredParquetInputFormat.class)) {
return false;
}
final List<HivePartition> partitions = hiveScan.hiveReadEntry.getHivePartitionWrappers();
if (partitions == null) {
return true;
}
// Make sure all partitions have the same input format as the table input format
for (HivePartition partition : partitions) {
Class<? extends InputFormat> inputFormat = getInputFormatFromSD(hiveTable, partition.getPartition().getSd());
if (inputFormat == null || !inputFormat.equals(tableInputFormat)) {
return false;
}
}
return true;
}
Project: QDrill
File: ConvertHiveParquetScanToDrillParquetScan.java
/**
* Get the input format from the given {@link StorageDescriptor}.
* @param hiveTable
* @param sd
* @return {@link InputFormat} class, or null if a failure occurred. Failures are logged as warnings.
*/
private Class<? extends InputFormat> getInputFormatFromSD(final Table hiveTable, final StorageDescriptor sd) {
try {
return (Class<? extends InputFormat>) Class.forName(sd.getInputFormat());
} catch (ReflectiveOperationException e) {
logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
return null;
}
}
Project: QDrill
File: HiveScan.java
private void splitInput(final Properties properties, final StorageDescriptor sd, final Partition partition)
throws ReflectiveOperationException, IOException {
final JobConf job = new JobConf();
for (final Object obj : properties.keySet()) {
job.set((String) obj, (String) properties.get(obj));
}
for (final Map.Entry<String, String> entry : hiveReadEntry.hiveConfigOverride.entrySet()) {
job.set(entry.getKey(), entry.getValue());
}
InputFormat<?, ?> format = (InputFormat<?, ?>)
Class.forName(sd.getInputFormat()).getConstructor().newInstance();
job.setInputFormat(format.getClass());
final Path path = new Path(sd.getLocation());
final FileSystem fs = path.getFileSystem(job);
if (fs.exists(path)) {
FileInputFormat.addInputPath(job, path);
format = job.getInputFormat();
for (final InputSplit split : format.getSplits(job, 1)) {
inputSplits.add(split);
partitionMap.put(split, partition);
}
}
final String numRowsProp = properties.getProperty("numRows");
logger.trace("HiveScan num rows property = {}", numRowsProp);
if (numRowsProp != null) {
final long numRows = Long.valueOf(numRowsProp);
// starting from hive-0.13, when no statistics are available, this property is set to -1
// it's important to note that the value returned by hive may not be up to date
if (numRows > 0) {
rowCount += numRows;
}
}
}
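For reference, the core InputFormat calls used above (configure a JobConf, add paths, enumerate splits) also work against plain HDFS files without any Hive metadata; a minimal sketch with an illustrative path, exception handling elided:
JobConf job = new JobConf();
job.setInputFormat(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path("/data/events"));
InputFormat<?, ?> format = job.getInputFormat();
for (InputSplit split : format.getSplits(job, 1)) {
  System.out.println(split + " length=" + split.getLength());
}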
Project: hadoop
File: TestMultipleInputs.java
public void testAddInputPathWithFormat() {
final JobConf conf = new JobConf();
MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
MultipleInputs.addInputPath(conf, new Path("/bar"),
KeyValueTextInputFormat.class);
final Map<Path, InputFormat> inputs = MultipleInputs
.getInputFormatMap(conf);
assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
.getClass());
}