Java class org.apache.hadoop.util.bloom.BloomFilter source code examples
Project: bloomfilter-course
File: MRBloomFilter.java
@Override
protected void setup(Context context) throws IOException,
        InterruptedException {
    // Create a FileSystem object
    FileSystem fs = FileSystem.get(context.getConfiguration());

    // Get the cache files from the context
    URI[] uris = context.getCacheFiles();
    if (uris.length > 0) {
        // Create a new Bloom filter and deserialize it from the cached
        // file by passing readFields an FSDataInputStream
        filter = new BloomFilter();
        filter.readFields(fs.open(new Path(uris[0].toString())));
    } else {
        throw new IOException("Bloom filter file not in DistributedCache");
    }
}
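For context, the cache file consumed in setup() above has to be registered on the driver side. A minimal sketch, assuming a standard new-API Hadoop driver; the class name and path argument are illustrative, not from the source:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class BloomDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "bloom-filter-join");
        // Ship the serialized Bloom filter with the job so that mappers
        // can retrieve it through context.getCacheFiles() in setup()
        job.addCacheFile(new URI(args[0])); // illustrative: path to the filter file
        // ... configure mapper, input and output paths, then submit ...
    }
}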
Project: spork-streaming
File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag) input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key. If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);
    return TupleFactory.getInstance().newTuple(bloomOut());
}
Project: spork
File: Bloom.java
private void init() throws IOException {
    filter = new BloomFilter();
    String dir = "./" + getFilenameFromPath(bloomFile);
    String[] partFiles = new File(dir).list(new FilenameFilter() {
        @Override
        public boolean accept(File current, String name) {
            return name.startsWith("part");
        }
    });
    String dcFile = dir + "/" + partFiles[0];
    DataInputStream dis = new DataInputStream(new FileInputStream(dcFile));
    try {
        filter.readFields(dis);
    } finally {
        dis.close();
    }
}
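getFilenameFromPath is called above but does not appear in this listing. A minimal sketch of what such a helper could look like, assuming it keeps only the final path component; this is a hypothetical reconstruction, not the project's confirmed code:

private static String getFilenameFromPath(String path) {
    // Return the last component of the path,
    // e.g. "/user/foo/mybloom" -> "mybloom"
    return new org.apache.hadoop.fs.Path(path).getName();
}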
Project: spork
File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag) input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key. If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);
    return TupleFactory.getInstance().newTuple(bloomOut());
}
Project: PonIC
File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag) input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key. If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);
    return TupleFactory.getInstance().newTuple(bloomOut());
}
Project: sedge
File: BuildBloom.java
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag) input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key. If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);
    return TupleFactory.getInstance().newTuple(bloomOut());
}
Project: hiped2
File: BloomFilterDumper.java
public static BloomFilter readFromAvro(InputStream is) throws IOException {
    DataFileStream<Object> reader = new DataFileStream<Object>(
            is, new GenericDatumReader<Object>());
    reader.hasNext();  // loads the first record; the boolean result is ignored here
    BloomFilter filter = new BloomFilter();
    AvroBytesRecord.fromGenericRecord((GenericRecord) reader.next(), filter);
    IOUtils.closeQuietly(is);
    IOUtils.closeQuietly(reader);
    return filter;
}
Project: hiped2
File: BloomFilterCreator.java
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
        return result;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration conf = super.getConf();

    JobConf job = new JobConf(conf);
    job.setJarByClass(BloomFilterCreator.class);

    job.set(AvroJob.OUTPUT_SCHEMA, AvroBytesRecord.SCHEMA.toString());
    job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(AvroOutputFormat.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(BloomFilter.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(BloomFilter.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return JobClient.runJob(job).isSuccessful() ? 0 : 1;
}
Project: hiped2
File: BloomFilterCreator.java
@Override
public void map(Text key, Text value,
                OutputCollector<NullWritable, BloomFilter> output,
                Reporter reporter) throws IOException {
    System.out.println("K[" + key + "]");

    int age = Integer.valueOf(value.toString());
    if (age > 30) {
        filter.add(new Key(key.toString().getBytes()));
    }
    collector = output;
}
Project: hiped2
File: BloomFilterCreator.java
@Override
public void reduce(NullWritable key, Iterator<BloomFilter> values,
                   OutputCollector<AvroWrapper<GenericRecord>, NullWritable> output,
                   Reporter reporter) throws IOException {
    while (values.hasNext()) {
        BloomFilter bf = values.next();
        filter.or(bf);
        System.out.println(filter);
    }
    collector = output;
}
Project: hiped2
File: BloomFilterCreator.java
public static BloomFilter readFromAvro(InputStream is) throws IOException {
    DataFileStream<Object> reader = new DataFileStream<Object>(
            is, new GenericDatumReader<Object>());
    reader.hasNext();  // loads the first record; the boolean result is ignored here
    BloomFilter filter = new BloomFilter();
    AvroBytesRecord.fromGenericRecord((GenericRecord) reader.next(), filter);
    IOUtils.closeQuietly(is);
    IOUtils.closeQuietly(reader);
    return filter;
}
Project: bloomfilter-course
File: Trainer.java
public BloomFilter createBloomFilter(int numMembers, float falsePosRate) {
    // Calculate the optimal Bloom filter size
    // and the optimal number of hash functions
    int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
    int nbHash = getOptimalK(numMembers, vectorSize);

    // Create the new Bloom filter
    BloomFilter filter = new BloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH);
    return filter;
}
Project: bloomfilter-course
File: Trainer.java
public BloomFilter createBloomFilter(int numMembers, float falsePosRate) {
    // Calculate the optimal Bloom filter size and the optimal number
    // of hash functions (helper sketches follow below)
    int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
    int nbHash = getOptimalK(numMembers, vectorSize);

    // Create the new Bloom filter
    return new BloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH);
}
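The two sizing helpers invoked above are not part of this listing. A minimal sketch based on the standard Bloom filter formulas m = -n·ln(p)/(ln 2)² and k = (m/n)·ln 2; the signatures are assumptions, not the course's confirmed code:

// Optimal bit vector size m for n members at false-positive rate p
// (assumed helper, derived from m = -n*ln(p)/(ln 2)^2)
public static int getOptimalBloomFilterSize(int numMembers, float falsePosRate) {
    return (int) Math.ceil(-numMembers * Math.log(falsePosRate)
            / (Math.log(2) * Math.log(2)));
}

// Optimal number of hash functions k for n members and m bits
// (assumed helper, derived from k = (m/n)*ln 2)
public static int getOptimalK(float numMembers, float vectorSize) {
    return (int) Math.round(vectorSize / numMembers * Math.log(2));
}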
Project: Gaffer
File: AccumuloIDWithinSetRetriever.java
ElementIteratorReadIntoMemory() throws RetrieverException {
    vertices = extractVertices(seedsIter);

    // Create Bloom filter, read through set of entities and add them to
    // Bloom filter
    final BloomFilter filter = BloomFilterUtils.getBloomFilter(
            store.getProperties().getFalsePositiveRate(),
            vertices.size(),
            store.getProperties().getMaxBloomFilterToPassToAnIterator());
    addToBloomFilter(vertices, filter);

    initialise(filter);
}
Project: Gaffer
File: AccumuloIDBetweenSetsRetriever.java
ElementIteratorReadIntoMemory() throws RetrieverException {
    verticesA = extractVertices(seedSetAIter);
    verticesB = extractVertices(seedSetBIter);

    // Create Bloom filter, read through set of entities B and add them
    // to Bloom filter
    final BloomFilter filter = BloomFilterUtils.getBloomFilter(
            store.getProperties().getFalsePositiveRate(),
            verticesB.size(),
            store.getProperties().getMaxBloomFilterToPassToAnIterator());
    addToBloomFilter(verticesB, filter);

    initialise(filter);
}
Project: Gaffer
File: AccumuloSetRetriever.java
protected void addToBloomFilter(final Iterator<? extends Object> vertices, final BloomFilter filter)
        throws RetrieverException {
    try {
        while (vertices.hasNext()) {
            addToBloomFilter(vertices.next(), filter);
        }
    } finally {
        CloseableUtil.close(vertices);
    }
}
Project: Gaffer
File: AccumuloSetRetriever.java
protected void addToBloomFilter(final Iterator<? extends EntityId> seeds, final BloomFilter filter1,
        final BloomFilter filter2) throws RetrieverException {
    try {
        while (seeds.hasNext()) {
            addToBloomFilter(seeds.next(), filter1, filter2);
        }
    } finally {
        CloseableUtil.close(seeds);
    }
}
Project: Gaffer
File: AccumuloSetRetriever.java
private void addToBloomFilter(final Object vertex, final BloomFilter filter) throws RetrieverException {
    try {
        filter.add(new org.apache.hadoop.util.bloom.Key(elementConverter.serialiseVertex(vertex)));
    } catch (final AccumuloElementConversionException e) {
        throw new RetrieverException("Failed to add identifier to the bloom key", e);
    }
}
Project: spork-streaming
File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: spork-streaming
File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag) input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray) t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: spork-streaming
File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
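bloomOut() is called by exec and bloomOr above but does not appear in this listing. A minimal sketch of the serializing counterpart to bloomIn, assuming it simply writes the filter through its Writable interface (hypothetical reconstruction; requires java.io.ByteArrayOutputStream and java.io.DataOutputStream):

protected DataByteArray bloomOut() throws IOException {
    // Serialize the current filter into a byte array; BloomFilter
    // implements Writable, so write() mirrors readFields() in bloomIn
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(baos);
    filter.write(dos);
    dos.flush();
    return new DataByteArray(baos.toByteArray());
}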
Project: spork
File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: spork
File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag) input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray) t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: spork
File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
Project: PonIC
File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: PonIC
File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag) input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray) t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: PonIC
File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
Project: sedge
File: Bloom.java
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
Project: sedge
File: BuildBloomBase.java
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag) input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray) t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
Project: sedge
File: BuildBloomBase.java
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
Project: hiped2
File: BloomFilterDumper.java
public static BloomFilter fromFile(File f) throws IOException {
    return readFromAvro(FileUtils.openInputStream(f));
}
Project: hiped2
File: BloomFilterDumper.java
public static BloomFilter fromPath(Configuration config, Path path) throws IOException {
    FileSystem hdfs = path.getFileSystem(config);
    return readFromAvro(hdfs.open(path));
}
Project: hiped2
File: BloomFilterCreator.java
@Override
protected void reduce(NullWritable key, Iterable<BloomFilter> values, Context context)
        throws IOException, InterruptedException {
    for (BloomFilter bf : values) {
        filter.or(bf);
    }
}
Project: hiped2
File: BloomFilterCreator.java
public static BloomFilter fromFile(File f) throws IOException {
    return readFromAvro(FileUtils.openInputStream(f));
}
Project: bloomfilter-course
File: Tester.java
public void redisMembershipTestWithFilter(Path input, Path bloom)
        throws Exception {
    System.out.println("Testing Redis set membership of " + input
            + " using a Bloom filter " + bloom);

    // Create a FileSystem object
    FileSystem fs = FileSystem.get(getConf());

    // Connect to Redis at localhost, port 6379
    jedis = new Jedis("localhost", 6379);
    jedis.connect();

    // Create a new BloomFilter object and deserialize it from an
    // FSDataInputStream over the file
    BloomFilter filter = new BloomFilter();
    filter.readFields(fs.open(bloom));

    // Open the testing file for read
    String line = null;
    int numBFhits = 0, numhits = 0, numlines = 0;
    BufferedReader rdr = new BufferedReader(new InputStreamReader(fs.open(input)));

    // Create a new Key to re-use
    Key key = new Key();
    long start = System.currentTimeMillis();
    while ((line = rdr.readLine()) != null) {
        ++numlines;
        // Set the bytes of the key to the line's bytes with a weight of 1.0
        key.set(line.getBytes(), 1.0);
        // Membership-test the key against the Bloom filter
        if (filter.membershipTest(key)) {
            ++numBFhits;
            // Confirm against Redis using sismember
            if (jedis.sismember(REDIS_SET_KEY, line)) {
                ++numhits;
            }
        }
    }
    long finish = System.currentTimeMillis();

    // Close the file reader and the Redis client
    rdr.close();
    jedis.disconnect();

    System.out.println("Took " + (finish - start) + " ms to check Redis "
            + numlines + " times for " + numhits
            + " successful tests. Bloom filter hits: " + numBFhits
            + " False positives: " + (numBFhits - numhits));
}
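The false-positive count printed above can be sanity-checked against theory. An illustrative helper (not from the source) computing the expected false-positive probability for m bits, k hash functions, and n inserted elements:

// Expected false-positive rate: p = (1 - e^(-k*n/m))^k
public static double expectedFalsePositiveRate(long mBits, int kHashes, long nElements) {
    return Math.pow(1.0 - Math.exp(-(double) kHashes * nElements / mBits), kHashes);
}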
Project: bloomfilter-course
File: Trainer.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println("Usage: Trainer <totrain> <nummembers> <falseposrate> <bfoutfile>");
        return 1;
    }

    // Parse command line arguments
    Path inputFile = new Path(args[0]);
    int numMembers = Integer.parseInt(args[1]);
    float falsePosRate = Float.parseFloat(args[2]);
    Path bfFile = new Path(args[3]);

    // Create a new Jedis object using localhost at port 6379
    // and delete the REDIS_SET_KEY
    jedis = new Jedis("localhost", 6379);
    jedis.del(REDIS_SET_KEY);

    // Create a new Bloom filter
    BloomFilter filter = createBloomFilter(numMembers, falsePosRate);

    // Open the file for read
    FileSystem fs = FileSystem.get(getConf());
    String line = null;
    int numRecords = 0;
    BufferedReader rdr = new BufferedReader(new InputStreamReader(fs.open(inputFile)));
    while ((line = rdr.readLine()) != null) {
        if (!line.isEmpty()) {
            // Add the line to the Bloom filter
            filter.add(new Key(line.getBytes()));
            // Use the Jedis client's "sadd" method to add it to the Redis set
            jedis.sadd(REDIS_SET_KEY, line);
            ++numRecords;
        }
    }

    // Close the reader and disconnect the Jedis client
    rdr.close();
    jedis.disconnect();

    System.out.println("Trained Bloom filter with " + numRecords + " entries.");
    System.out.println("Serializing Bloom filter to HDFS at " + bfFile);

    // Create a new FSDataOutputStream using the FileSystem, write the
    // Bloom filter to it, and close the stream
    FSDataOutputStream strm = fs.create(bfFile);
    filter.write(strm);
    strm.flush();
    strm.close();

    System.out.println("Done training Bloom filter.");
    return 0;
}
Project: Gaffer
File: AccumuloSetRetriever.java
protected void addToBloomFilter(final Iterable<? extends Object> vertices, final BloomFilter filter)
        throws RetrieverException {
    addToBloomFilter(vertices.iterator(), filter);
}
Project: Gaffer
File: AccumuloSetRetriever.java
protected void addToBloomFilter(final EntityId seed, final BloomFilter filter1, final BloomFilter filter2)
        throws RetrieverException {
    addToBloomFilter(seed.getVertex(), filter1);
    addToBloomFilter(seed.getVertex(), filter2);
}
Project: Gaffer
File: AbstractCoreKeyIteratorSettingsFactory.java
@Override
public IteratorSetting getBloomFilterIteratorSetting(final BloomFilter filter) throws IteratorSettingException {
    return new IteratorSettingBuilder(AccumuloStoreConstants.BLOOM_FILTER_ITERATOR_PRIORITY,
            AccumuloStoreConstants.BLOOM_FILTER_ITERATOR_NAME, CoreKeyBloomFilterIterator.class)
            .bloomFilter(filter).build();
}
Project: spork-streaming
File: Bloom.java
private void init() throws IOException {
    filter = new BloomFilter();
    String dcFile = "./" + getFilenameFromPath(bloomFile) + "/part-r-00000";
    filter.readFields(new DataInputStream(new FileInputStream(dcFile)));
}