/** * Validates the false positive ratio by computing its z-value and comparing * it to the provided threshold. * * @param falsePosRate experimental positive rate * @param nTrials the number of Bloom filter checks * @param zValueBoundary z-value boundary, positive for an upper bound and * negative for a lower bound * @param cbf the compound Bloom filter we are using * @param additionalMsg additional message to include in log output and * assertion failures */ private void validateFalsePosRate(double falsePosRate, int nTrials, double zValueBoundary, CompoundBloomFilter cbf, String additionalMsg) { double p = BloomFilterFactory.getErrorRate(conf); double zValue = (falsePosRate - p) / Math.sqrt(p * (1 - p) / nTrials); String assortedStatsStr = " (targetErrorRate=" + p + ", falsePosRate=" + falsePosRate + ", nTrials=" + nTrials + ")"; LOG.info("z-value is " + zValue + assortedStatsStr); boolean isUpperBound = zValueBoundary > 0; if (isUpperBound && zValue > zValueBoundary || !isUpperBound && zValue < zValueBoundary) { String errorMsg = "False positive rate z-value " + zValue + " is " + (isUpperBound ? "higher" : "lower") + " than " + zValueBoundary + assortedStatsStr + ". Per-chunk stats:\n" + cbf.formatTestingStats(); fail(errorMsg + additionalMsg); } }
private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs, Path sfPath) throws IOException { StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt); StoreFile.Reader r = sf.createReader(); final boolean pread = true; // does not really matter StoreFileScanner scanner = r.getStoreFileScanner(true, pread); { // Test for false negatives (not allowed). int numChecked = 0; for (KeyValue kv : kvs) { byte[] row = kv.getRow(); boolean present = isInBloom(scanner, row, kv.getQualifier()); assertTrue(testIdMsg + " Bloom filter false negative on row " + Bytes.toStringBinary(row) + " after " + numChecked + " successful checks", present); ++numChecked; } } // Test for false positives (some percentage allowed). We test in two modes: // "fake lookup" which ignores the key distribution, and production mode. for (boolean fakeLookupEnabled : new boolean[] { true, false }) { ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled); try { String fakeLookupModeStr = ", fake lookup is " + (fakeLookupEnabled ? "enabled" : "disabled"); CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter(); cbf.enableTestingStats(); int numFalsePos = 0; Random rand = new Random(EVALUATION_SEED); int nTrials = NUM_KV[t] * 10; for (int i = 0; i < nTrials; ++i) { byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand); if (isInBloom(scanner, query, bt, rand)) { numFalsePos += 1; } } double falsePosRate = numFalsePos * 1.0 / nTrials; LOG.debug(String.format(testIdMsg + " False positives: %d out of %d (%f)", numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr); // Check for obvious Bloom filter crashes. assertTrue("False positive is too high: " + falsePosRate + " (greater " + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr, falsePosRate < TOO_HIGH_ERROR_RATE); // Now a more precise check to see if the false positive rate is not // too high. The reason we use a relaxed restriction for the real-world // case as opposed to the "fake lookup" case is that our hash functions // are not completely independent. double maxZValue = fakeLookupEnabled ? 1.96 : 2.5; validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf, fakeLookupModeStr); // For checking the lower bound we need to eliminate the last chunk, // because it is frequently smaller and the false positive rate in it // is too low. This does not help if there is only one under-sized // chunk, though. int nChunks = cbf.getNumChunks(); if (nChunks > 1) { numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1); nTrials -= cbf.getNumQueriesForTesting(nChunks - 1); falsePosRate = numFalsePos * 1.0 / nTrials; LOG.info(testIdMsg + " False positive rate without last chunk is " + falsePosRate + fakeLookupModeStr); } validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf, fakeLookupModeStr); } finally { ByteBloomFilter.setFakeLookupMode(false); } } r.close(true); // end of test so evictOnClose }
private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs, Path sfPath) throws IOException { StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt, NoOpDataBlockEncoder.INSTANCE); StoreFile.Reader r = sf.createReader(); final boolean pread = true; // does not really matter StoreFileScanner scanner = r.getStoreFileScanner(true, pread); { // Test for false negatives (not allowed). int numChecked = 0; for (KeyValue kv : kvs) { byte[] row = kv.getRow(); boolean present = isInBloom(scanner, row, kv.getQualifier()); assertTrue(testIdMsg + " Bloom filter false negative on row " + Bytes.toStringBinary(row) + " after " + numChecked + " successful checks", present); ++numChecked; } } // Test for false positives (some percentage allowed). We test in two modes: // "fake lookup" which ignores the key distribution, and production mode. for (boolean fakeLookupEnabled : new boolean[] { true, false }) { ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled); try { String fakeLookupModeStr = ", fake lookup is " + (fakeLookupEnabled ? "enabled" : "disabled"); CompoundBloomFilter cbf = (CompoundBloomFilter) r.getGeneralBloomFilter(); cbf.enableTestingStats(); int numFalsePos = 0; Random rand = new Random(EVALUATION_SEED); int nTrials = NUM_KV[t] * 10; for (int i = 0; i < nTrials; ++i) { byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand); if (isInBloom(scanner, query, bt, rand)) { numFalsePos += 1; } } double falsePosRate = numFalsePos * 1.0 / nTrials; LOG.debug(String.format(testIdMsg + " False positives: %d out of %d (%f)", numFalsePos, nTrials, falsePosRate) + fakeLookupModeStr); // Check for obvious Bloom filter crashes. assertTrue("False positive is too high: " + falsePosRate + " (greater " + "than " + TOO_HIGH_ERROR_RATE + ")" + fakeLookupModeStr, falsePosRate < TOO_HIGH_ERROR_RATE); // Now a more precise check to see if the false positive rate is not // too high. The reason we use a relaxed restriction for the real-world // case as opposed to the "fake lookup" case is that our hash functions // are not completely independent. double maxZValue = fakeLookupEnabled ? 1.96 : 2.5; validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf, fakeLookupModeStr); // For checking the lower bound we need to eliminate the last chunk, // because it is frequently smaller and the false positive rate in it // is too low. This does not help if there is only one under-sized // chunk, though. int nChunks = cbf.getNumChunks(); if (nChunks > 1) { numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1); nTrials -= cbf.getNumQueriesForTesting(nChunks - 1); falsePosRate = numFalsePos * 1.0 / nTrials; LOG.info(testIdMsg + " False positive rate without last chunk is " + falsePosRate + fakeLookupModeStr); } validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf, fakeLookupModeStr); } finally { ByteBloomFilter.setFakeLookupMode(false); } } r.close(true); // end of test so evictOnClose }