/**
 * Round-trips the Bloom sizing helpers for a single 4096-byte block: the key
 * count returned by {@code ByteBloomFilter.idealMaxKeys} for the block's bit
 * size is fed back into {@code ByteBloomFilter.computeBitSize} at the same
 * error rate, and the resulting bit size divided by the block's bit size must
 * be within 0.0001 of 0.9999.
 */
@Test
public void testCompoundBloomSizing() {
  final double targetErrorRate = 0.01;
  final int bloomBlockByteSize = 4096;
  final int bloomBlockBitSize = bloomBlockByteSize * 8;

  // Bit size of one block -> number of keys it can hold at the target rate.
  final long maxKeysPerChunk =
      ByteBloomFilter.idealMaxKeys(bloomBlockBitSize, targetErrorRate);

  // Number of keys -> bit size needed for them at the same target rate.
  final long blockBits = bloomBlockByteSize * 8;
  final long recomputedBits =
      ByteBloomFilter.computeBitSize(maxKeysPerChunk, targetErrorRate);

  final double bloomSizeRatio = (double) recomputedBits / blockBits;
  assertTrue(Math.abs(bloomSizeRatio - 0.9999) < 0.0001);
}
private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs, Path sfPath) throws IOException { StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt); StoreFile.Reader r = sf.createReader(); final boolean pread = true; // does not really matter StoreFileScanner scanner = r.getStoreFileScanner(true, pread); { // Test for false negatives (not allowed). int numChecked = 0; for (KeyValue kv : kvs) { byte[] row = kv.getRow(); boolean present = isInBloom(scanner, row, kv.getQualifier()); assertTrue(testIdMsg + " Bloom filter false negative on row " + Bytes.toStringBinary(row) + " after " + numChecked + " successful checks", present); ++numChecked; } } // Test for false positives (some percentage allowed). We test in two modes: // "fake lookup" which ignores the key distribution, and production mode. for (boolean fakeLookupEnabled : new boolean[] { true, false }) { ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled); try { String fakeLookupModeStr = ", fake lookup is " + (fakeLookupEnabled ? "enabled" : "disabled"); CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter(); cbf.enableTestingStats(); int numFalsePos = 0; Random rand = new Random(EVALUATION_SEED); int nTrials = NUM_KV[t] * 10; for (int i = 0; i < nTrials; ++i) { byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand); if (isInBloom(scanner, query, bt, rand)) { numFalsePos += 1; } } double falsePosRate = numFalsePos * 1.0 / nTrials; LOG.debug(String.format(testIdMsg + " False positives: %d out of %d (%f)", numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr); // Check for obvious Bloom filter crashes. assertTrue("False positive is too high: " + falsePosRate + " (greater " + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr, falsePosRate < TOO_HIGH_ERROR_RATE); // Now a more precise check to see if the false positive rate is not // too high. 
The reason we use a relaxed restriction for the real-world // case as opposed to the "fake lookup" case is that our hash functions // are not completely independent. double maxZValue = fakeLookupEnabled ? 1.96 : 2.5; validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf, fakeLookupModeStr); // For checking the lower bound we need to eliminate the last chunk, // because it is frequently smaller and the false positive rate in it // is too low. This does not help if there is only one under-sized // chunk, though. int nChunks = cbf.getNumChunks(); if (nChunks > 1) { numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1); nTrials -= cbf.getNumQueriesForTesting(nChunks - 1); falsePosRate = numFalsePos * 1.0 / nTrials; LOG.info(testIdMsg + " False positive rate without last chunk is " + falsePosRate + fakeLookupModeStr); } validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf, fakeLookupModeStr); } finally { ByteBloomFilter.setFakeLookupMode(false); } } r.close(true); // end of test so evictOnClose }
private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs, Path sfPath) throws IOException { StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt, NoOpDataBlockEncoder.INSTANCE); StoreFile.Reader r = sf.createReader(); final boolean pread = true; // does not really matter StoreFileScanner scanner = r.getStoreFileScanner(true, pread); { // Test for false negatives (not allowed). int numChecked = 0; for (KeyValue kv : kvs) { byte[] row = kv.getRow(); boolean present = isInBloom(scanner, row, kv.getQualifier()); assertTrue(testIdMsg + " Bloom filter false negative on row " + Bytes.toStringBinary(row) + " after " + numChecked + " successful checks", present); ++numChecked; } } // Test for false positives (some percentage allowed). We test in two modes: // "fake lookup" which ignores the key distribution, and production mode. for (boolean fakeLookupEnabled : new boolean[] { true, false }) { ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled); try { String fakeLookupModeStr = ", fake lookup is " + (fakeLookupEnabled ? "enabled" : "disabled"); CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter(); cbf.enableTestingStats(); int numFalsePos = 0; Random rand = new Random(EVALUATION_SEED); int nTrials = NUM_KV[t] * 10; for (int i = 0; i < nTrials; ++i) { byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand); if (isInBloom(scanner, query, bt, rand)) { numFalsePos += 1; } } double falsePosRate = numFalsePos * 1.0 / nTrials; LOG.debug(String.format(testIdMsg + " False positives: %d out of %d (%f)", numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr); // Check for obvious Bloom filter crashes. assertTrue("False positive is too high: " + falsePosRate + " (greater " + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr, falsePosRate < TOO_HIGH_ERROR_RATE); // Now a more precise check to see if the false positive rate is not // too high. 
The reason we use a relaxed restriction for the real-world // case as opposed to the "fake lookup" case is that our hash functions // are not completely independent. double maxZValue = fakeLookupEnabled ? 1.96 : 2.5; validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf, fakeLookupModeStr); // For checking the lower bound we need to eliminate the last chunk, // because it is frequently smaller and the false positive rate in it // is too low. This does not help if there is only one under-sized // chunk, though. int nChunks = cbf.getNumChunks(); if (nChunks > 1) { numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1); nTrials -= cbf.getNumQueriesForTesting(nChunks - 1); falsePosRate = numFalsePos * 1.0 / nTrials; LOG.info(testIdMsg + " False positive rate without last chunk is " + falsePosRate + fakeLookupModeStr); } validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf, fakeLookupModeStr); } finally { ByteBloomFilter.setFakeLookupMode(false); } } r.close(true); // end of test so evictOnClose }
private void printMeta(HFile.Reader reader, Map<byte[], byte[]> fileInfo) throws IOException { System.out.println("Block index size as per heapsize: " + reader.indexSize()); System.out.println(asSeparateLines(reader.toString())); System.out.println("Trailer:\n " + asSeparateLines(reader.getTrailer().toString())); System.out.println("Fileinfo:"); for (Map.Entry<byte[], byte[]> e : fileInfo.entrySet()) { System.out.print(FOUR_SPACES + Bytes.toString(e.getKey()) + " = "); if (Bytes.compareTo(e.getKey(), Bytes.toBytes("MAX_SEQ_ID_KEY")) == 0) { long seqid = Bytes.toLong(e.getValue()); System.out.println(seqid); } else if (Bytes.compareTo(e.getKey(), Bytes.toBytes("TIMERANGE")) == 0) { TimeRangeTracker timeRangeTracker = new TimeRangeTracker(); Writables.copyWritable(e.getValue(), timeRangeTracker); System.out.println(timeRangeTracker.getMinimumTimestamp() + "...." + timeRangeTracker.getMaximumTimestamp()); } else if (Bytes.compareTo(e.getKey(), FileInfo.AVG_KEY_LEN) == 0 || Bytes.compareTo(e.getKey(), FileInfo.AVG_VALUE_LEN) == 0) { System.out.println(Bytes.toInt(e.getValue())); } else { System.out.println(Bytes.toStringBinary(e.getValue())); } } System.out.println("Mid-key: " + Bytes.toStringBinary(reader.midkey())); // Printing bloom information DataInput bloomMeta = reader.getBloomFilterMetadata(); BloomFilter bloomFilter = null; if (bloomMeta != null) bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader); System.out.println("Bloom filter:"); if (bloomFilter != null) { System.out.println(FOUR_SPACES + bloomFilter.toString().replaceAll( ByteBloomFilter.STATS_RECORD_SEP, "\n" + FOUR_SPACES)); } else { System.out.println(FOUR_SPACES + "Not present"); } }
private void printMeta(HFile.Reader reader, Map<byte[], byte[]> fileInfo) throws IOException { System.out.println("Block index size as per heapsize: " + reader.indexSize()); System.out.println(asSeparateLines(reader.toString())); System.out.println("Trailer:\n " + asSeparateLines(reader.getTrailer().toString())); System.out.println("Fileinfo:"); for (Map.Entry<byte[], byte[]> e : fileInfo.entrySet()) { System.out.print(FOUR_SPACES + Bytes.toString(e.getKey()) + " = "); if (Bytes.compareTo(e.getKey(), Bytes.toBytes("MAX_SEQ_ID_KEY")) == 0) { long seqid = Bytes.toLong(e.getValue()); System.out.println(seqid); } else if (Bytes.compareTo(e.getKey(), Bytes.toBytes("TIMERANGE")) == 0) { TimeRangeTracker timeRangeTracker = new TimeRangeTracker(); Writables.copyWritable(e.getValue(), timeRangeTracker); System.out.println(timeRangeTracker.getMinimumTimestamp() + "...." + timeRangeTracker.getMaximumTimestamp()); } else if (Bytes.compareTo(e.getKey(), FileInfo.AVG_KEY_LEN) == 0 || Bytes.compareTo(e.getKey(), FileInfo.AVG_VALUE_LEN) == 0) { System.out.println(Bytes.toInt(e.getValue())); } else { System.out.println(Bytes.toStringBinary(e.getValue())); } } System.out.println("Mid-key: " + Bytes.toStringBinary(reader.midkey())); // Printing general bloom information DataInput bloomMeta = reader.getGeneralBloomFilterMetadata(); BloomFilter bloomFilter = null; if (bloomMeta != null) bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader); System.out.println("Bloom filter:"); if (bloomFilter != null) { System.out.println(FOUR_SPACES + bloomFilter.toString().replaceAll( ByteBloomFilter.STATS_RECORD_SEP, "\n" + FOUR_SPACES)); } else { System.out.println(FOUR_SPACES + "Not present"); } // Printing delete bloom information bloomMeta = reader.getDeleteBloomFilterMetadata(); bloomFilter = null; if (bloomMeta != null) bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader); System.out.println("Delete Family Bloom filter:"); if (bloomFilter != null) { 
System.out.println(FOUR_SPACES + bloomFilter.toString().replaceAll(ByteBloomFilter.STATS_RECORD_SEP, "\n" + FOUR_SPACES)); } else { System.out.println(FOUR_SPACES + "Not present"); } }