//! An implementation of a [Bloom Filter](https://en.wikipedia.org/wiki/Bloom_filter). #[cfg(all(test, feature = "arbitrary"))] mod conformance; use crate::{sha256::Sha256, Hasher}; use bytes::{Buf, BufMut}; use commonware_codec::{ codec::{Read, Write}, error::Error as CodecError, EncodeSize, FixedSize, }; use commonware_utils::bitmap::BitMap; use core::{ marker::PhantomData, num::{NonZeroU64, NonZeroU8, NonZeroUsize}, }; #[cfg(feature = "std")] use { commonware_utils::rational::BigRationalExt, num_rational::BigRational, num_traits::{One, ToPrimitive, Zero}, }; /// Rational approximation of ln(2) with 6 digits of precision: 14397/20769. #[cfg(feature = "std")] const LN2: (u64, u64) = (14397, 20769); /// Rational approximation of 1/ln(2) with 6 digits of precision: 29145/20201. #[cfg(feature = "std")] const LN2_INV: (u64, u64) = (29145, 20201); /// A [Bloom Filter](https://en.wikipedia.org/wiki/Bloom_filter). /// /// This implementation uses the Kirsch-Mitzenmacher optimization to derive `k` hash functions /// from two hash values, which are in turn derived from a single hash digest. This provides /// efficient hashing for [BloomFilter::insert] and [BloomFilter::contains] operations. /// /// # Hasher Selection /// /// The `H` type parameter specifies the hash function to use. It defaults to [Sha256]. /// The hasher's digest must be at least 16 bytes (128 bits) long, this is enforced at /// compile time. /// /// When choosing a hasher, consider: /// /// - **Security**: If the bloom filter accepts untrusted input, use a cryptographically /// secure hash function to prevent attackers from crafting inputs that cause excessive /// collisions (degrading the filter to always return `true`). /// /// - **Determinism**: If the bloom filter must produce consistent results across runs /// or machines (e.g. for serialization or consensus-critical applications), avoid keyed /// or randomized hash functions. Both [Sha256] and [Blake3](crate::blake3::Blake3) /// are deterministic. /// /// - **Performance**: Hash function performance varies with the size of items inserted /// and queried. [Sha256] is faster for smaller items (up to ~2KB), while /// [Blake3](crate::blake3::Blake3) is faster for larger items (4KB+). #[derive(Clone, Debug)] pub struct BloomFilter { hashers: u8, bits: BitMap, _marker: PhantomData, } impl PartialEq for BloomFilter { fn eq(&self, other: &Self) -> bool { self.hashers == other.hashers && self.bits == other.bits } } impl Eq for BloomFilter {} impl BloomFilter { /// Compile-time assertion that the digest is at least 16 bytes. const _ASSERT_DIGEST_AT_LEAST_16_BYTES: () = assert!( ::SIZE >= 16, "digest must be at least 128 bits (16 bytes)" ); /// Creates a new [BloomFilter] with `hashers` hash functions and `bits` bits. /// /// The number of bits will be rounded up to the next power of 2. If that would /// overflow, the maximum power of 2 for the platform (2^63 on 64-bit) is used. pub fn new(hashers: NonZeroU8, bits: NonZeroUsize) -> Self { let bits = bits .get() .checked_next_power_of_two() .unwrap_or(1 << (usize::BITS - 1)); Self { hashers: hashers.get(), bits: BitMap::zeroes(bits as u64), _marker: PhantomData, } } /// Creates a new [BloomFilter] with optimal parameters for the expected number /// of items and desired false positive rate. /// /// Uses exact rational arithmetic for full determinism across all platforms. /// /// # Arguments /// /// * `expected_items` - Number of items expected to be inserted /// * `fp_rate` - False positive rate as a rational (e.g., `BigRational::from_frac_u64(1, 100)` for 1%) /// /// # Panics /// /// Panics if `fp_rate` is not in (0, 1). #[cfg(feature = "std")] pub fn with_rate(expected_items: NonZeroUsize, fp_rate: BigRational) -> Self { let bits = Self::optimal_bits(expected_items.get(), &fp_rate); let hashers = Self::optimal_hashers(expected_items.get(), bits); Self { hashers, bits: BitMap::zeroes(bits as u64), _marker: PhantomData, } } /// Returns the number of hashers used by the filter. pub const fn hashers(&self) -> NonZeroU8 { NonZeroU8::new(self.hashers).expect("hashers is never zero") } /// Returns the number of bits used by the filter. pub const fn bits(&self) -> NonZeroUsize { NonZeroUsize::new(self.bits.len() as usize).expect("bits is never zero") } /// Generate `num_hashers` bit indices for a given item. fn indices(&self, item: &[u8]) -> impl Iterator { #[allow(path_statements)] Self::_ASSERT_DIGEST_AT_LEAST_16_BYTES; // Extract two 64-bit hash values from the digest of the item let digest = H::hash(item); let h1 = u64::from_be_bytes(digest[0..8].try_into().unwrap()); let mut h2 = u64::from_be_bytes(digest[8..16].try_into().unwrap()); // Ensure h2 is odd (non-zero). If h2 were 0, all k hash functions would // produce the same index (h1), defeating the purpose of multiple hashers. h2 |= 1; // Generate `hashers` hashes using the Kirsch-Mitzenmacher optimization: // // `h_i(x) = (h1(x) + i * h2(x)) mod m` let hashers = self.hashers as u64; let mask = self.bits.len() - 1; (0..hashers).map(move |hasher| h1.wrapping_add(hasher.wrapping_mul(h2)) & mask) } /// Inserts an item into the [BloomFilter]. pub fn insert(&mut self, item: &[u8]) { let indices = self.indices(item); for index in indices { self.bits.set(index, true); } } /// Checks if an item is possibly in the [BloomFilter]. /// /// Returns `true` if the item is probably in the set, and `false` if it is definitely not. pub fn contains(&self, item: &[u8]) -> bool { let indices = self.indices(item); for index in indices { if !self.bits.get(index) { return false; } } true } /// Estimates the current false positive probability. /// /// This approximates the false positive rate as `f^k` where `f` is the fill ratio /// (proportion of bits set to 1) and `k` is the number of hash functions. /// /// Returns a [`BigRational`] for exact representation and cross-platform determinism. #[cfg(feature = "std")] pub fn estimated_false_positive_rate(&self) -> BigRational { let ones = self.bits.count_ones(); let len = self.bits.len(); let fill_ratio = BigRational::new(ones.into(), len.into()); fill_ratio.pow(self.hashers as i32) } /// Estimates the number of items that have been inserted. /// /// Uses the formula `n = -(m/k) * ln(1 - x/m)` where `m` is the number of bits, /// `k` is the number of hash functions, and `x` is the number of bits set to 1. /// /// Returns a [`BigRational`] using `log2_floor` for the logarithm computation. #[cfg(feature = "std")] pub fn estimated_count(&self) -> BigRational { let m = self.bits.len(); let x = self.bits.count_ones(); let k = self.hashers as u64; if x >= m { return BigRational::from_usize(usize::MAX); } // ln(1 - x/m) = log2(1 - x/m) * ln(2) let one_minus_fill = BigRational::new((m - x).into(), m.into()); let log2_val = one_minus_fill.log2_floor(16); let ln2 = BigRational::from_frac_u64(LN2.0, LN2.1); let ln_result = &log2_val * &ln2; // n = -(m/k) * ln(1 - x/m) let m_over_k = BigRational::new(m.into(), k.into()); -m_over_k * ln_result } /// Calculates the optimal number of hash functions for a given capacity and bit count. /// /// Uses [`BigRational`] for determinism. The result is clamped to [1, 16] since /// beyond ~10-12 hashes provides negligible improvement while increasing CPU cost. #[cfg(feature = "std")] pub fn optimal_hashers(expected_items: usize, bits: usize) -> u8 { if expected_items == 0 { return 1; } // k = (m/n) * ln(2) let ln2 = BigRational::from_frac_u64(LN2.0, LN2.1); let k_ratio = BigRational::from_usize(bits) * ln2 / BigRational::from_usize(expected_items); k_ratio.to_integer().to_u8().unwrap_or(16).clamp(1, 16) } /// Calculates the optimal number of bits for a given capacity and false positive rate. /// /// Uses exact rational arithmetic for full determinism across all platforms. /// The result is rounded up to the next power of 2. If that would overflow, the maximum /// power of 2 for the platform (2^63 on 64-bit) is used. /// /// Formula: m = -n * log2(p) / ln(2) /// /// # Panics /// /// Panics if `fp_rate` is not in (0, 1). #[cfg(feature = "std")] pub fn optimal_bits(expected_items: usize, fp_rate: &BigRational) -> usize { assert!( fp_rate > &BigRational::zero() && fp_rate < &BigRational::one(), "false positive rate must be in (0, 1)" ); // log2(p) is negative for p < 1. Use floor to get a more negative value, // which results in more bits (conservative choice to not exceed target FP rate). let log2_p = fp_rate.log2_floor(16); // m = -n * log2(p) / ln(2) = -n * log2(p) * (1/ln(2)) // Since log2(p) < 0 for p < 1, -log2(p) > 0 let n = BigRational::from_usize(expected_items); let ln2_inv = BigRational::from_frac_u64(LN2_INV.0, LN2_INV.1); let bits_rational = -(&n * &log2_p * &ln2_inv); let raw = bits_rational.ceil_to_u128().unwrap_or(1) as usize; raw.max(1) .checked_next_power_of_two() .unwrap_or(1 << (usize::BITS - 1)) } } impl Write for BloomFilter { fn write(&self, buf: &mut impl BufMut) { self.hashers.write(buf); self.bits.write(buf); } } impl Read for BloomFilter { // The number of hashers and the number of bits that the bitmap must have. type Cfg = (NonZeroU8, NonZeroU64); fn read_cfg( buf: &mut impl Buf, (hashers_cfg, bits_cfg): &Self::Cfg, ) -> Result { if !bits_cfg.get().is_power_of_two() { return Err(CodecError::Invalid( "BloomFilter", "bits must be a power of 2", )); } let hashers = u8::read_cfg(buf, &())?; if hashers != hashers_cfg.get() { return Err(CodecError::Invalid( "BloomFilter", "hashers doesn't match config", )); } let bits = BitMap::read_cfg(buf, &bits_cfg.get())?; if bits.len() != bits_cfg.get() { return Err(CodecError::Invalid( "BloomFilter", "bitmap length doesn't match config", )); } Ok(Self { hashers, bits, _marker: PhantomData, }) } } impl EncodeSize for BloomFilter { fn encode_size(&self) -> usize { self.hashers.encode_size() + self.bits.encode_size() } } #[cfg(feature = "arbitrary")] impl arbitrary::Arbitrary<'_> for BloomFilter { fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { // Ensure at least 1 hasher let hashers = u8::arbitrary(u)?.max(1); // Generate u64 in u16 range to avoid OOM, then round to power of two let bits_len = u.int_in_range(0..=u16::MAX as u64)?.next_power_of_two(); let mut bits = BitMap::with_capacity(bits_len); for _ in 0..bits_len { bits.push(u.arbitrary::()?); } Ok(Self { hashers, bits, _marker: PhantomData, }) } } #[cfg(test)] mod tests { use super::*; use commonware_codec::{Decode, Encode}; use commonware_utils::{NZUsize, NZU64, NZU8}; #[test] fn test_insert_and_contains() { let mut bf = BloomFilter::::new(NZU8!(10), NZUsize!(1000)); let item1 = b"hello"; let item2 = b"world"; let item3 = b"bloomfilter"; bf.insert(item1); bf.insert(item2); assert!(bf.contains(item1)); assert!(bf.contains(item2)); assert!(!bf.contains(item3)); } #[test] fn test_empty() { let bf = BloomFilter::::new(NZU8!(5), NZUsize!(100)); assert!(!bf.contains(b"anything")); } #[test] fn test_false_positives() { let mut bf = BloomFilter::::new(NZU8!(10), NZUsize!(100)); for i in 0..10usize { bf.insert(&i.to_be_bytes()); } // Check for inserted items for i in 0..10usize { assert!(bf.contains(&i.to_be_bytes())); } // Check for non-inserted items and count false positives let mut false_positives = 0; for i in 100..1100usize { if bf.contains(&i.to_be_bytes()) { false_positives += 1; } } // A small bloom filter with many items will have some false positives. // The exact number is probabilistic, but it should not be zero and not all should be FPs. assert!(false_positives > 0); assert!(false_positives < 1000); } #[test] fn test_codec_roundtrip() { let mut bf = BloomFilter::::new(NZU8!(5), NZUsize!(128)); bf.insert(b"test1"); bf.insert(b"test2"); let cfg = (NZU8!(5), NZU64!(128)); let encoded = bf.encode(); let decoded = BloomFilter::::decode_cfg(encoded, &cfg).unwrap(); assert_eq!(bf, decoded); } #[test] fn test_codec_empty() { let bf = BloomFilter::::new(NZU8!(4), NZUsize!(128)); let cfg = (NZU8!(4), NZU64!(128)); let encoded = bf.encode(); let decoded = BloomFilter::::decode_cfg(encoded, &cfg).unwrap(); assert_eq!(bf, decoded); } #[test] fn test_codec_with_invalid_hashers() { let mut bf = BloomFilter::::new(NZU8!(5), NZUsize!(128)); bf.insert(b"test1"); let encoded = bf.encode(); // Too large let cfg = (NZU8!(10), NZU64!(128)); let decoded = BloomFilter::::decode_cfg(encoded.clone(), &cfg); assert!(matches!( decoded, Err(CodecError::Invalid( "BloomFilter", "hashers doesn't match config" )) )); // Too small let cfg = (NZU8!(4), NZU64!(128)); let decoded = BloomFilter::::decode_cfg(encoded, &cfg); assert!(matches!( decoded, Err(CodecError::Invalid( "BloomFilter", "hashers doesn't match config" )) )); } #[test] fn test_codec_with_invalid_bits() { let mut bf = BloomFilter::::new(NZU8!(5), NZUsize!(128)); bf.insert(b"test1"); let encoded = bf.encode(); // Wrong bit count let cfg = (NZU8!(5), NZU64!(64)); let result = BloomFilter::::decode_cfg(encoded.clone(), &cfg); assert!(matches!(result, Err(CodecError::InvalidLength(128)))); let cfg = (NZU8!(5), NZU64!(256)); let result = BloomFilter::::decode_cfg(encoded.clone(), &cfg); assert!(matches!( result, Err(CodecError::Invalid( "BloomFilter", "bitmap length doesn't match config" )) )); // Non-power-of-2 bits let cfg = (NZU8!(5), NZU64!(100)); let result = BloomFilter::::decode_cfg(encoded, &cfg); assert!(matches!( result, Err(CodecError::Invalid( "BloomFilter", "bits must be a power of 2" )) )); } #[test] fn test_statistics() { let mut bf = BloomFilter::::new(NZU8!(7), NZUsize!(1024)); // Empty filter should have 0 estimated count and FP rate assert_eq!(bf.estimated_count(), BigRational::zero()); assert_eq!(bf.estimated_false_positive_rate(), BigRational::zero()); // Insert some items for i in 0..100usize { bf.insert(&i.to_be_bytes()); } // Estimated count should be reasonably close to 100 let estimated = bf.estimated_count(); let lower = BigRational::from_usize(75); let upper = BigRational::from_usize(125); assert!(estimated > lower && estimated < upper); // FP rate should be non-zero after insertions assert!(bf.estimated_false_positive_rate() > BigRational::zero()); assert!(bf.estimated_false_positive_rate() < BigRational::one()); } #[test] fn test_with_rate() { // Create a filter for 1000 items with 1% false positive rate let fp_rate = BigRational::from_frac_u64(1, 100); let mut bf = BloomFilter::::with_rate(NZUsize!(1000), fp_rate.clone()); // Verify getters return expected values let expected_bits = BloomFilter::::optimal_bits(1000, &fp_rate); let expected_hashers = BloomFilter::::optimal_hashers(1000, expected_bits); assert_eq!(bf.bits().get(), expected_bits); assert_eq!(bf.hashers().get(), expected_hashers); // Insert 1000 items for i in 0..1000usize { bf.insert(&i.to_be_bytes()); } // All inserted items should be found for i in 0..1000usize { assert!(bf.contains(&i.to_be_bytes())); } // Count false positives on non-inserted items let mut false_positives = 0; for i in 1000..2000usize { if bf.contains(&i.to_be_bytes()) { false_positives += 1; } } // With 1% target FP rate, we expect around 10 false positives out of 1000 // Allow some variance (should be well under 2%) assert!(false_positives < 20); } #[test] fn test_optimal_hashers() { // For 1000 items in 10000 bits, optimal k = (10000/1000) * ln(2) = 6.93 // Integer math truncates to 6 let k = BloomFilter::::optimal_hashers(1000, 10000); assert_eq!(k, 6); // For 100 items in 1000 bits, optimal k = (1000/100) * ln(2) = 6.93 // Integer math truncates to 6 let k = BloomFilter::::optimal_hashers(100, 1000); assert_eq!(k, 6); // Edge case: very few bits per item, clamped to 1 let k = BloomFilter::::optimal_hashers(1000, 100); assert_eq!(k, 1); // Edge case: many bits per item, clamped to 16 let k = BloomFilter::::optimal_hashers(100, 100000); assert_eq!(k, 16); // Edge case: zero items returns 1 let k = BloomFilter::::optimal_hashers(0, 1000); assert_eq!(k, 1); // Edge case: extreme values that would overflow (n << 16 wraps to 0 for n >= 2^48) // Should not panic, should return clamped value let k = BloomFilter::::optimal_hashers(1 << 48, 1000); assert_eq!(k, 1); let k = BloomFilter::::optimal_hashers(usize::MAX, usize::MAX); assert!((1..=16).contains(&k)); } #[test] fn test_optimal_bits() { // For 1000 items with 1% FP rate // Formula: m = -n * ln(p) / (ln(2))^2 = -1000 * ln(0.01) / 0.4804 = 9585 // Rounded to next power of 2 = 16384 let fp_1pct = BigRational::from_frac_u64(1, 100); let bits = BloomFilter::::optimal_bits(1000, &fp_1pct); assert_eq!(bits, 16384); assert!(bits.is_power_of_two()); // For 10000 items with 0.001% FP rate (need significantly more bits) // Formula: m = -10000 * ln(0.00001) / 0.4804 = 239627 // Rounded to next power of 2 = 262144 let fp_001pct = BigRational::from_frac_u64(1, 100_000); let bits_lower_fp = BloomFilter::::optimal_bits(10000, &fp_001pct); assert_eq!(bits_lower_fp, 262144); assert!(bits_lower_fp.is_power_of_two()); } #[test] fn test_bits_extreme_values() { let fp_001pct = BigRational::from_frac_u64(1, 10_000); let fp_1pct = BigRational::from_frac_u64(1, 100); // Very large expected_items let bits = BloomFilter::::optimal_bits(usize::MAX / 2, &fp_001pct); assert!(bits.is_power_of_two()); assert!(bits > 0); // Large but reasonable values let bits = BloomFilter::::optimal_bits(1_000_000_000, &fp_001pct); assert!(bits.is_power_of_two()); // Zero items let bits = BloomFilter::::optimal_bits(0, &fp_1pct); assert!(bits.is_power_of_two()); assert_eq!(bits, 1); // 0 * bpe rounds up to 1 } #[test] fn test_with_rate_deterministic() { let fp_rate = BigRational::from_frac_u64(1, 100); let bf1 = BloomFilter::::with_rate(NZUsize!(1000), fp_rate.clone()); let bf2 = BloomFilter::::with_rate(NZUsize!(1000), fp_rate); assert_eq!(bf1.bits(), bf2.bits()); assert_eq!(bf1.hashers(), bf2.hashers()); } #[test] fn test_optimal_bits_matches_formula() { // For 1000 items at 1% FP rate // m = -1000 * log2(0.01) / ln(2) = 9585 // Rounded to power of 2 = 16384 let fp_rate = BigRational::from_frac_u64(1, 100); let bits = BloomFilter::::optimal_bits(1000, &fp_rate); assert_eq!(bits, 16384); } }