@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // TODO Create a FileSystem object
    FileSystem fs = FileSystem.get(context.getConfiguration());

    // TODO get the cache files from the context
    URI[] uris = context.getCacheFiles();
    if (uris.length > 0) {
        // TODO create a new Bloom filter
        filter = new BloomFilter();

        // TODO call the filter's readFields method, passing in an FSDataInputStream
        filter.readFields(fs.open(new Path(uris[0].toString())));
    } else {
        throw new IOException("Bloom filter file not in DistributedCache");
    }
}
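/*
 * Not shown in the source: the driver side must place the serialized Bloom
 * filter into the distributed cache so that context.getCacheFiles() in the
 * setup() above can locate it. A minimal sketch, assuming the new MapReduce
 * API (org.apache.hadoop.mapreduce.Job) and a hypothetical HDFS location
 * /bloom/filter.bin:
 */
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "bloom-filter-join");
job.addCacheFile(new Path("/bloom/filter.bin").toUri()); // hypothetical path, adjust as needed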
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) return null;

    // Strip off the initial level of bag
    DataBag values = (DataBag) input.get(0);
    Iterator<Tuple> it = values.iterator();
    Tuple t = it.next();

    // If the input tuple has only one field, then we'll extract
    // that field and serialize it into a key. If it has multiple
    // fields, we'll serialize the whole tuple.
    byte[] b;
    if (t.size() == 1) b = DataType.toBytes(t.get(0));
    else b = DataType.toBytes(t, DataType.TUPLE);

    Key k = new Key(b);
    filter = new BloomFilter(vSize, numHash, hType);
    filter.add(k);

    return TupleFactory.getInstance().newTuple(bloomOut());
}
private void init() throws IOException {
    filter = new BloomFilter();
    String dir = "./" + getFilenameFromPath(bloomFile);
    String[] partFiles = new File(dir).list(new FilenameFilter() {
        @Override
        public boolean accept(File current, String name) {
            return name.startsWith("part");
        }
    });

    String dcFile = dir + "/" + partFiles[0];
    DataInputStream dis = new DataInputStream(new FileInputStream(dcFile));
    try {
        filter.readFields(dis);
    } finally {
        dis.close();
    }
}
public static BloomFilter readFromAvro(InputStream is) throws IOException {
    DataFileStream<Object> reader =
        new DataFileStream<Object>(is, new GenericDatumReader<Object>());

    reader.hasNext();
    BloomFilter filter = new BloomFilter();
    AvroBytesRecord.fromGenericRecord((GenericRecord) reader.next(), filter);

    IOUtils.closeQuietly(is);
    IOUtils.closeQuietly(reader);
    return filter;
}
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args)
        .addOptions(CliCommonOpts.MrIoOpts.values()).build();
    int result = cli.runCmd();
    if (result != 0) {
        return result;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration conf = super.getConf();

    JobConf job = new JobConf(conf);
    job.setJarByClass(BloomFilterCreator.class);

    job.set(AvroJob.OUTPUT_SCHEMA, AvroBytesRecord.SCHEMA.toString());
    job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(AvroOutputFormat.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(BloomFilter.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(BloomFilter.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return JobClient.runJob(job).isSuccessful() ? 0 : 1;
}
@Override
public void map(Text key, Text value,
                OutputCollector<NullWritable, BloomFilter> output,
                Reporter reporter) throws IOException {
    System.out.println("K[" + key + "]");

    int age = Integer.valueOf(value.toString());
    if (age > 30) {
        filter.add(new Key(key.toString().getBytes()));
    }
    collector = output;
}
@Override
public void reduce(NullWritable key, Iterator<BloomFilter> values,
                   OutputCollector<AvroWrapper<GenericRecord>, NullWritable> output,
                   Reporter reporter) throws IOException {
    while (values.hasNext()) {
        BloomFilter bf = values.next();
        filter.or(bf);
        System.out.println(filter);
    }
    collector = output;
}
public BloomFilter createBloomFilter(int numMembers, float falsePosRate) {
    // TODO calculate the optimal Bloom filter size
    // TODO and the optimal number of hash functions
    int vectorSize = getOptimalBloomFilterSize(numMembers, falsePosRate);
    int nbHash = getOptimalK(numMembers, vectorSize);

    // TODO create new Bloom filter
    BloomFilter filter = new BloomFilter(vectorSize, nbHash, Hash.MURMUR_HASH);

    return filter;
}
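/*
 * getOptimalBloomFilterSize and getOptimalK are referenced above but not shown
 * in the source. A minimal sketch of what they would compute, assuming the
 * standard Bloom filter sizing formulas m = -n * ln(p) / (ln 2)^2 and
 * k = (m / n) * ln 2, where n is the expected number of members, p the desired
 * false positive rate, m the bit vector size, and k the number of hash functions.
 * The signatures are assumptions for illustration only.
 */
public static int getOptimalBloomFilterSize(int numMembers, float falsePosRate) {
    // Number of bits needed for numMembers entries at the given false positive rate
    return (int) Math.ceil(-numMembers * Math.log(falsePosRate)
            / (Math.log(2) * Math.log(2)));
}

public static int getOptimalK(float numMembers, float vectorSize) {
    // Number of hash functions that minimizes the false positive rate for this vector size
    return (int) Math.round(vectorSize / numMembers * Math.log(2));
}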
public BloomFilter createBloomFilter(int numMembers, float falsePosRate) {
    // TODO calculate the optimal Bloom filter size
    // TODO and the optimal number of hash functions

    // TODO create new Bloom filter

    return null;
}
ElementIteratorReadIntoMemory() throws RetrieverException {
    vertices = extractVertices(seedsIter);

    // Create Bloom filter, read through set of entities and add them to
    // Bloom filter
    final BloomFilter filter = BloomFilterUtils.getBloomFilter(
            store.getProperties().getFalsePositiveRate(), vertices.size(),
            store.getProperties().getMaxBloomFilterToPassToAnIterator());
    addToBloomFilter(vertices, filter);

    initialise(filter);
}
ElementIteratorReadIntoMemory() throws RetrieverException {
    verticesA = extractVertices(seedSetAIter);
    verticesB = extractVertices(seedSetBIter);

    // Create Bloom filter, read through set of entities B and add them
    // to Bloom filter
    final BloomFilter filter = BloomFilterUtils.getBloomFilter(
            store.getProperties().getFalsePositiveRate(), verticesB.size(),
            store.getProperties().getMaxBloomFilterToPassToAnIterator());
    addToBloomFilter(verticesB, filter);

    initialise(filter);
}
protected void addToBloomFilter(final Iterator<? extends Object> vertices, final BloomFilter filter)
        throws RetrieverException {
    try {
        while (vertices.hasNext()) {
            addToBloomFilter(vertices.next(), filter);
        }
    } finally {
        CloseableUtil.close(vertices);
    }
}
protected void addToBloomFilter(final Iterator<? extends EntityId> seeds, final BloomFilter filter1,
        final BloomFilter filter2) throws RetrieverException {
    try {
        while (seeds.hasNext()) {
            addToBloomFilter(seeds.next(), filter1, filter2);
        }
    } finally {
        CloseableUtil.close(seeds);
    }
}
private void addToBloomFilter(final Object vertex, final BloomFilter filter) throws RetrieverException {
    try {
        filter.add(new org.apache.hadoop.util.bloom.Key(elementConverter.serialiseVertex(vertex)));
    } catch (final AccumuloElementConversionException e) {
        throw new RetrieverException("Failed to add identifier to the bloom key", e);
    }
}
/**
 * For testing only, do not use directly.
 */
public void setFilter(DataByteArray dba) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(dba.get()));
    filter = new BloomFilter();
    filter.readFields(dis);
}
protected DataByteArray bloomOr(Tuple input) throws IOException {
    filter = new BloomFilter(vSize, numHash, hType);

    try {
        DataBag values = (DataBag) input.get(0);
        for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
            Tuple t = it.next();
            filter.or(bloomIn((DataByteArray) t.get(0)));
        }
    } catch (ExecException ee) {
        throw new IOException(ee);
    }

    return bloomOut();
}
protected BloomFilter bloomIn(DataByteArray b) throws IOException {
    DataInputStream dis = new DataInputStream(new ByteArrayInputStream(b.get()));
    BloomFilter f = new BloomFilter();
    f.readFields(dis);
    return f;
}
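/*
 * bloomOut(), called by bloomOr() and exec() above, is not shown in the source.
 * A minimal sketch of the inverse of bloomIn(): serialize the filter through
 * the Hadoop Writable interface into a DataByteArray. This is an assumed
 * implementation for illustration, not the confirmed one.
 */
protected DataByteArray bloomOut() throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(baos);
    filter.write(dos);   // BloomFilter implements Writable
    dos.flush();
    return new DataByteArray(baos.toByteArray());
}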
public static BloomFilter fromFile(File f) throws IOException {
    return readFromAvro(FileUtils.openInputStream(f));
}
public static BloomFilter fromPath(Configuration config, Path path) throws IOException {
    FileSystem hdfs = path.getFileSystem(config);
    return readFromAvro(hdfs.open(path));
}
@Override
protected void reduce(NullWritable key, Iterable<BloomFilter> values, Context context)
        throws IOException, InterruptedException {
    for (BloomFilter bf : values) {
        filter.or(bf);
    }
}
public void redisMembershipTestWithFilter(Path input, Path bloom) throws Exception {
    System.out.println("Testing Redis set membership of " + input
            + " using a Bloom filter " + bloom);

    // TODO create a fileSystem object
    FileSystem fs = FileSystem.get(getConf());

    // TODO connect to Redis at localhost, port 6379
    jedis = new Jedis("localhost", 6379);
    jedis.connect();

    // TODO Create a new BloomFilter object
    BloomFilter filter = new BloomFilter();

    // TODO call readFields with an FSDataInputStream from the file
    filter.readFields(fs.open(bloom));

    // TODO Open the testing file for read
    String line = null;
    int numBFhits = 0, numhits = 0, numlines = 0;
    BufferedReader rdr = new BufferedReader(new InputStreamReader(fs.open(input)));

    // TODO create a new Key to re-use
    Key key = new Key();

    long start = System.currentTimeMillis();
    while ((line = rdr.readLine()) != null) {
        // TODO increment numlines
        ++numlines;

        // TODO set the bytes of the key to line's bytes with a weight of 1.0
        key.set(line.getBytes(), 1.0);

        // TODO membership test the key
        if (filter.membershipTest(key)) {
            // TODO increment numBFhits
            ++numBFhits;

            // TODO test jedis using sismember
            if (jedis.sismember(REDIS_SET_KEY, line)) {
                // TODO increment numhits
                ++numhits;
            }
        }
    }
    long finish = System.currentTimeMillis();

    // TODO close the file reader and Redis client
    rdr.close();
    jedis.disconnect();

    System.out.println("Took " + (finish - start) + " ms to check Redis "
            + numlines + " times for " + numhits
            + " successful tests. Bloom filter hits: " + numBFhits
            + " False positives: " + (numBFhits - numhits));
}
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println("Usage: Trainer <totrain> <nummembers> <falseposrate> <bfoutfile>");
        return 1;
    }

    // Parse command line arguments
    Path inputFile = new Path(args[0]);
    int numMembers = Integer.parseInt(args[1]);
    float falsePosRate = Float.parseFloat(args[2]);
    Path bfFile = new Path(args[3]);

    // TODO Create a new Jedis object using localhost at port 6379
    jedis = new Jedis("localhost", 6379);

    // TODO delete the REDIS_SET_KEY
    jedis.del(REDIS_SET_KEY);

    // TODO Create a new Bloom filter
    BloomFilter filter = createBloomFilter(numMembers, falsePosRate);

    // TODO open the file for read
    FileSystem fs = FileSystem.get(getConf());
    String line = null;
    int numRecords = 0;
    BufferedReader rdr = new BufferedReader(new InputStreamReader(fs.open(inputFile)));

    while ((line = rdr.readLine()) != null) {
        // TODO if the line is not empty
        if (!line.isEmpty()) {
            // TODO add the line to the Bloom filter
            filter.add(new Key(line.getBytes()));

            // TODO use Jedis client's "sadd" method to set
            jedis.sadd(REDIS_SET_KEY, line);

            // TODO increment numRecords
            ++numRecords;
        }
    }

    // TODO Close reader, disconnect Jedis client
    rdr.close();
    jedis.disconnect();

    System.out.println("Trained Bloom filter with " + numRecords + " entries.");
    System.out.println("Serializing Bloom filter to HDFS at " + bfFile);

    // TODO create a new FSDataOutputStream using the FileSystem
    FSDataOutputStream strm = fs.create(bfFile);

    // TODO pass the stream to the Bloom filter
    filter.write(strm);

    // TODO close the stream
    strm.flush();
    strm.close();

    System.out.println("Done training Bloom filter.");
    return 0;
}
protected void addToBloomFilter(final Iterable<? extends Object> vertices, final BloomFilter filter)
        throws RetrieverException {
    addToBloomFilter(vertices.iterator(), filter);
}
protected void addToBloomFilter(final EntityId seed, final BloomFilter filter1, final BloomFilter filter2)
        throws RetrieverException {
    addToBloomFilter(seed.getVertex(), filter1);
    addToBloomFilter(seed.getVertex(), filter2);
}
@Override
public IteratorSetting getBloomFilterIteratorSetting(final BloomFilter filter) throws IteratorSettingException {
    return new IteratorSettingBuilder(AccumuloStoreConstants.BLOOM_FILTER_ITERATOR_PRIORITY,
            AccumuloStoreConstants.BLOOM_FILTER_ITERATOR_NAME,
            CoreKeyBloomFilterIterator.class).bloomFilter(filter).build();
}
private void init() throws IOException {
    filter = new BloomFilter();
    String dcFile = "./" + getFilenameFromPath(bloomFile) + "/part-r-00000";
    // Close the stream once the filter has been deserialized
    DataInputStream dis = new DataInputStream(new FileInputStream(dcFile));
    try {
        filter.readFields(dis);
    } finally {
        dis.close();
    }
}