Skip to content

Commit 3aad860

Browse files
authored
Merge pull request #4 from alecmocatta/fixes
fixes, stricter integer casts
2 parents 1104972 + d77ba3b commit 3aad860

File tree

8 files changed

+342
-87
lines changed

8 files changed

+342
-87
lines changed

.travis.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ addons:
1313

1414
env:
1515
global:
16-
- FMT_VERSION=nightly-2018-10-14
17-
- CLIPPY_VERSION=nightly-2018-10-14
18-
- DOCS_RS_VERSION=nightly-2018-10-14 # should be nightly-2018-06-03 but packed_simd fails https://github.com/onur/docs.rs/blob/32102ce458b34f750ef190182116d9583491a64b/Vagrantfile#L50
16+
- FMT_VERSION=nightly-2018-11-23
17+
- CLIPPY_VERSION=nightly-2018-11-23
18+
- DOCS_RS_VERSION=nightly-2018-10-20 # https://github.com/rust-lang/docs.rs/blob/084f76241f148ffeb8f9ddd900fcda2908a09f56/Vagrantfile#L50
1919

2020
matrix:
2121
include:

src/count_min.rs

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
// SOFTWARE.
2222

23+
use super::f64_to_usize;
2324
use serde::{de::Deserialize, ser::Serialize};
2425
use std::{
25-
borrow::Borrow, cmp::max, fmt, hash::{Hash, Hasher}, marker::PhantomData, ops
26+
borrow::Borrow, cmp::max, convert::TryFrom, fmt, hash::{Hash, Hasher}, marker::PhantomData, ops
2627
};
2728
use traits::{Intersect, IntersectPlusUnionIsPlus, New, UnionAssign};
2829
use twox_hash::XxHash;
@@ -165,7 +166,7 @@ where
165166

166167
fn optimal_width(tolerance: f64) -> usize {
167168
let e = tolerance;
168-
let width = (2.0 / e).round() as usize;
169+
let width = f64_to_usize((2.0 / e).round());
169170
max(2, width)
170171
.checked_next_power_of_two()
171172
.expect("Width would be way too large")
@@ -178,27 +179,19 @@ where
178179
}
179180

180181
fn optimal_k_num(probability: f64) -> usize {
181-
max(1, ((1.0 - probability).ln() / 0.5_f64.ln()) as usize)
182+
max(
183+
1,
184+
f64_to_usize(((1.0 - probability).ln() / 0.5_f64.ln()).floor()),
185+
)
182186
}
183187

184188
fn offsets<Q: ?Sized>(&self, key: &Q) -> impl Iterator<Item = usize>
185189
where
186190
Q: Hash,
187191
K: Borrow<Q>,
188192
{
189-
// if k_i < 2 {
190-
// let sip = &mut self.hashers[k_i as usize].clone();
191-
// key.hash(sip);
192-
// let hash = sip.finish();
193-
// hashes[k_i as usize] = hash;
194-
// hash as usize & self.mask
195-
// } else {
196-
// hashes[0]
197-
// .wrapping_add((k_i as u64).wrapping_mul(hashes[1]) %
198-
// 0xffffffffffffffc5) as usize & self.mask
199-
// }
200193
let mask = self.mask;
201-
hashes(key).map(move |hash| hash as usize & mask)
194+
hashes(key).map(move |hash| usize::try_from(hash & u64::try_from(mask).unwrap()).unwrap())
202195
}
203196
}
204197

src/distinct.rs

Lines changed: 101 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,83 @@
4444
// https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.RDD countApproxDistinct
4545
// is_x86_feature_detected ?
4646

47+
use super::{f64_to_u8, u64_to_f64, usize_to_f64};
4748
use packed_simd::{self, Cast, FromBits, IntoBits};
4849
use std::{
49-
cmp::{self, Ordering}, convert::identity, fmt, hash::{Hash, Hasher}, marker::PhantomData, ops::{self, Range}
50+
cmp::{self, Ordering}, convert::{identity, TryFrom}, fmt, hash::{Hash, Hasher}, marker::PhantomData, ops::{self, Range}
5051
};
5152
use traits::{Intersect, IntersectPlusUnionIsPlus, New, UnionAssign};
5253
use twox_hash::XxHash;
5354

5455
mod consts;
5556
use self::consts::*;
5657

58+
/// Like [`HyperLogLog`] but implements `Ord` and `Eq` by using the estimate of the cardinality.
59+
#[derive(Serialize, Deserialize)]
60+
#[serde(bound = "")]
61+
pub struct HyperLogLogMagnitude<V>(HyperLogLog<V>);
62+
impl<V: Hash> Ord for HyperLogLogMagnitude<V> {
63+
#[inline(always)]
64+
fn cmp(&self, other: &Self) -> Ordering {
65+
self.0.len().partial_cmp(&other.0.len()).unwrap()
66+
}
67+
}
68+
impl<V: Hash> PartialOrd for HyperLogLogMagnitude<V> {
69+
#[inline(always)]
70+
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
71+
self.0.len().partial_cmp(&other.0.len())
72+
}
73+
}
74+
impl<V: Hash> PartialEq for HyperLogLogMagnitude<V> {
75+
#[inline(always)]
76+
fn eq(&self, other: &Self) -> bool {
77+
self.0.len().eq(&other.0.len())
78+
}
79+
}
80+
impl<V: Hash> Eq for HyperLogLogMagnitude<V> {}
81+
impl<V: Hash> Clone for HyperLogLogMagnitude<V> {
82+
fn clone(&self) -> Self {
83+
HyperLogLogMagnitude(self.0.clone())
84+
}
85+
}
86+
impl<V: Hash> New for HyperLogLogMagnitude<V> {
87+
type Config = f64;
88+
fn new(config: &Self::Config) -> Self {
89+
HyperLogLogMagnitude(New::new(config))
90+
}
91+
}
92+
impl<V: Hash> Intersect for HyperLogLogMagnitude<V> {
93+
fn intersect<'a>(iter: impl Iterator<Item = &'a Self>) -> Option<Self>
94+
where
95+
Self: Sized + 'a,
96+
{
97+
Intersect::intersect(iter.map(|x| &x.0)).map(HyperLogLogMagnitude)
98+
}
99+
}
100+
impl<'a, V: Hash> UnionAssign<&'a HyperLogLogMagnitude<V>> for HyperLogLogMagnitude<V> {
101+
fn union_assign(&mut self, rhs: &'a Self) {
102+
self.0.union_assign(&rhs.0)
103+
}
104+
}
105+
impl<'a, V: Hash> ops::AddAssign<&'a V> for HyperLogLogMagnitude<V> {
106+
fn add_assign(&mut self, rhs: &'a V) {
107+
self.0.add_assign(rhs)
108+
}
109+
}
110+
impl<'a, V: Hash> ops::AddAssign<&'a Self> for HyperLogLogMagnitude<V> {
111+
fn add_assign(&mut self, rhs: &'a Self) {
112+
self.0.add_assign(&rhs.0)
113+
}
114+
}
115+
impl<V: Hash> fmt::Debug for HyperLogLogMagnitude<V> {
116+
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
117+
self.0.fmt(fmt)
118+
}
119+
}
120+
impl<V> IntersectPlusUnionIsPlus for HyperLogLogMagnitude<V> {
121+
const VAL: bool = <HyperLogLog<V> as IntersectPlusUnionIsPlus>::VAL;
122+
}
123+
57124
/// An implementation of the [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) data structure with *bias correction*.
58125
///
59126
/// See [*HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm*](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) and [*HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm*](https://ai.google/research/pubs/pub40671) for background on HyperLogLog with bias correction.
@@ -75,13 +142,13 @@ where
75142
/// Create an empty `HyperLogLog` data structure with the specified error tolerance.
76143
pub fn new(error_rate: f64) -> Self {
77144
assert!(0.0 < error_rate && error_rate < 1.0);
78-
let p = (f64::log2(1.04 / error_rate) * 2.0).ceil() as u8;
145+
let p = f64_to_u8((f64::log2(1.04 / error_rate) * 2.0).ceil());
79146
assert!(0 < p && p < 64);
80147
let alpha = Self::get_alpha(p);
81148
Self {
82149
alpha,
83150
zero: 1 << p,
84-
sum: (1 << p) as f64,
151+
sum: f64::from(1 << p),
85152
p,
86153
m: vec![0; 1 << p].into_boxed_slice(),
87154
marker: PhantomData,
@@ -93,7 +160,7 @@ where
93160
Self {
94161
alpha: hll.alpha,
95162
zero: hll.m.len(),
96-
sum: hll.m.len() as f64,
163+
sum: usize_to_f64(hll.m.len()),
97164
p: hll.p,
98165
m: vec![0; hll.m.len()].into_boxed_slice(),
99166
marker: PhantomData,
@@ -109,14 +176,14 @@ where
109176
let j = x & (self.m.len() as u64 - 1);
110177
let w = x >> self.p;
111178
let rho = Self::get_rho(w, 64 - self.p);
112-
let mjr = &mut self.m[j as usize];
179+
let mjr = &mut self.m[usize::try_from(j).unwrap()];
113180
let old = *mjr;
114181
let new = cmp::max(old, rho);
115182
self.zero -= if old == 0 { 1 } else { 0 };
116183

117184
// see pow_bithack()
118-
self.sum -= f64::from_bits(u64::max_value().wrapping_sub(old as u64) << 54 >> 2)
119-
- f64::from_bits(u64::max_value().wrapping_sub(new as u64) << 54 >> 2);
185+
self.sum -= f64::from_bits(u64::max_value().wrapping_sub(u64::from(old)) << 54 >> 2)
186+
- f64::from_bits(u64::max_value().wrapping_sub(u64::from(new)) << 54 >> 2);
120187

121188
*mjr = new;
122189
}
@@ -125,7 +192,8 @@ where
125192
pub fn len(&self) -> f64 {
126193
let v = self.zero;
127194
if v > 0 {
128-
let h = self.m.len() as f64 * (self.m.len() as f64 / v as f64).ln();
195+
let h =
196+
usize_to_f64(self.m.len()) * (usize_to_f64(self.m.len()) / usize_to_f64(v)).ln();
129197
if h <= Self::get_threshold(self.p - 4) {
130198
return h;
131199
}
@@ -170,8 +238,8 @@ where
170238
}
171239
}
172240
}
173-
self.zero = zero.wrapping_sum() as usize;
174-
self.sum = sum.sum() as f64;
241+
self.zero = usize::try_from(zero.wrapping_sum()).unwrap();
242+
self.sum = f64::from(sum.sum());
175243
// https://github.com/AdamNiederer/faster/issues/37
176244
// (src.m.simd_iter(faster::u8s(0)),self.m.simd_iter_mut(faster::u8s(0))).zip()
177245
}
@@ -208,14 +276,14 @@ where
208276
}
209277
}
210278
}
211-
self.zero = zero.wrapping_sum() as usize;
212-
self.sum = sum.sum() as f64;
279+
self.zero = usize::try_from(zero.wrapping_sum()).unwrap();
280+
self.sum = f64::from(sum.sum());
213281
}
214282

215283
/// Clears the `HyperLogLog` data structure, as if it was new.
216284
pub fn clear(&mut self) {
217285
self.zero = self.m.len();
218-
self.sum = self.m.len() as f64;
286+
self.sum = usize_to_f64(self.m.len());
219287
self.m.iter_mut().for_each(|x| {
220288
*x = 0;
221289
});
@@ -231,12 +299,12 @@ where
231299
4 => 0.673,
232300
5 => 0.697,
233301
6 => 0.709,
234-
_ => 0.7213 / (1.0 + 1.079 / (1_u64 << p) as f64),
302+
_ => 0.7213 / (1.0 + 1.079 / u64_to_f64(1_u64 << p)),
235303
}
236304
}
237305

238306
fn get_rho(w: u64, max_width: u8) -> u8 {
239-
let rho = max_width - (64 - w.leading_zeros() as u8) + 1;
307+
let rho = max_width - (64 - u8::try_from(w.leading_zeros()).unwrap()) + 1;
240308
assert!(0 < rho && rho < 65);
241309
rho
242310
}
@@ -245,7 +313,7 @@ where
245313
let bias_vector = BIAS_DATA[(p - 4) as usize];
246314
let neighbors = Self::get_nearest_neighbors(e, RAW_ESTIMATE_DATA[(p - 4) as usize]);
247315
assert_eq!(neighbors.len(), 6);
248-
bias_vector[neighbors].into_iter().sum::<f64>() / 6.0_f64
316+
bias_vector[neighbors].iter().sum::<f64>() / 6.0_f64
249317
}
250318

251319
fn get_nearest_neighbors(e: f64, estimate_vector: &[f64]) -> Range<usize> {
@@ -275,8 +343,8 @@ where
275343
}
276344

277345
fn ep(&self) -> f64 {
278-
let e = self.alpha * (self.m.len() * self.m.len()) as f64 / self.sum;
279-
if e <= (5 * self.m.len()) as f64 {
346+
let e = self.alpha * usize_to_f64(self.m.len() * self.m.len()) / self.sum;
347+
if e <= usize_to_f64(5 * self.m.len()) {
280348
e - Self::estimate_bias(e, self.p)
281349
} else {
282350
e
@@ -346,6 +414,14 @@ where
346414
self.push(rhs)
347415
}
348416
}
417+
impl<'a, V: ?Sized> ops::AddAssign<&'a Self> for HyperLogLog<V>
418+
where
419+
V: Hash,
420+
{
421+
fn add_assign(&mut self, rhs: &'a Self) {
422+
self.union(rhs)
423+
}
424+
}
349425
impl<V: ?Sized> IntersectPlusUnionIsPlus for HyperLogLog<V> {
350426
const VAL: bool = true;
351427
}
@@ -444,25 +520,25 @@ impl Sad<packed_simd::u8x8> {
444520
#[inline(always)]
445521
unsafe fn sad(a: packed_simd::u8x8, b: packed_simd::u8x8) -> u64 {
446522
assert_eq!(b, packed_simd::u8x8::splat(0));
447-
(0..8).map(|i| a.extract(i) as u64).sum()
523+
(0..8).map(|i| u64::from(a.extract(i))).sum()
448524
}
449525
}
450526

451527
#[cfg(test)]
452528
mod test {
453-
use super::HyperLogLog;
529+
use super::{super::f64_to_usize, HyperLogLog};
454530
use std::f64;
455531

456532
#[test]
457533
fn pow_bithack() {
458534
// build the float from x, manipulating it to be the mantissa we want.
459535
// no portability issues in theory https://doc.rust-lang.org/stable/std/primitive.f64.html#method.from_bits
460536
for x in 0_u8..65 {
461-
let a = 2.0_f64.powi(-(x as i32));
462-
let b = f64::from_bits(u64::max_value().wrapping_sub(x as u64) << 54 >> 2);
463-
let c = f32::from_bits(u32::max_value().wrapping_sub(x as u32) << 25 >> 2);
537+
let a = 2.0_f64.powi(-(i32::from(x)));
538+
let b = f64::from_bits(u64::max_value().wrapping_sub(u64::from(x)) << 54 >> 2);
539+
let c = f32::from_bits(u32::max_value().wrapping_sub(u32::from(x)) << 25 >> 2);
464540
assert_eq!(a, b);
465-
assert_eq!(a, c as f64);
541+
assert_eq!(a, f64::from(c));
466542
}
467543
}
468544

@@ -505,7 +581,7 @@ mod test {
505581
let actual = 100_000.0;
506582
let p = 0.05;
507583
let mut hll = HyperLogLog::new(p);
508-
for i in 0..actual as usize {
584+
for i in 0..f64_to_usize(actual) {
509585
hll.push(&i);
510586
}
511587

src/lib.rs

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
//! As these implementations are often in hot code paths, unsafe is used, albeit only when necessary to a) achieve the asymptotically optimal algorithm or b) mitigate an observed bottleneck.
2323
2424
#![doc(html_root_url = "https://docs.rs/streaming_algorithms/0.1.0")]
25-
#![feature(nll, specialization, convert_id)]
25+
#![feature(nll, specialization, convert_id, try_trait, try_from)]
2626
#![warn(
2727
missing_copy_implementations,
2828
missing_debug_implementations,
@@ -43,10 +43,7 @@
4343
clippy::if_not_else,
4444
clippy::op_ref,
4545
clippy::needless_pass_by_value,
46-
clippy::cast_possible_truncation,
47-
clippy::cast_sign_loss,
48-
clippy::cast_precision_loss,
49-
clippy::cast_lossless,
46+
clippy::suspicious_op_assign_impl,
5047
clippy::float_cmp
5148
)]
5249

@@ -70,3 +67,35 @@ pub use distinct::*;
7067
pub use sample::*;
7168
pub use top::*;
7269
pub use traits::*;
70+
71+
// TODO: replace all instances of the following with a.try_into().unwrap() if/when that exists https://github.com/rust-lang/rust/pull/47857
72+
#[allow(
73+
clippy::cast_possible_truncation,
74+
clippy::cast_sign_loss,
75+
clippy::cast_precision_loss
76+
)]
77+
fn f64_to_usize(a: f64) -> usize {
78+
assert!(a.is_sign_positive() && a <= usize::max_value() as f64 && a.fract() == 0.0);
79+
a as usize
80+
}
81+
82+
#[allow(
83+
clippy::cast_possible_truncation,
84+
clippy::cast_sign_loss,
85+
clippy::cast_precision_loss
86+
)]
87+
fn f64_to_u8(a: f64) -> u8 {
88+
assert!(a.is_sign_positive() && a <= f64::from(u8::max_value()) && a.fract() == 0.0);
89+
a as u8
90+
}
91+
92+
#[allow(clippy::cast_precision_loss)]
93+
fn usize_to_f64(a: usize) -> f64 {
94+
assert!(a as u64 <= 1_u64 << 53);
95+
a as f64
96+
}
97+
#[allow(clippy::cast_precision_loss)]
98+
fn u64_to_f64(a: u64) -> f64 {
99+
assert!(a <= 1_u64 << 53);
100+
a as f64
101+
}

0 commit comments

Comments
 (0)