vendor/rand-0.6.1/src/seq/index.rs

   1 // Copyright 2018 Developers of the Rand project.
   2 //
   3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   4 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   5 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
   6 // option. This file may not be copied, modified, or distributed
   7 // except according to those terms.
   8
   9 //! Index sampling
  10
  11 #[cfg(feature="alloc")] use core::slice;
  12
  13 #[cfg(feature="std")] use std::vec;
  14 #[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::{self, Vec};
  15 // BTreeMap is not as fast in tests, but better than nothing.
  16 #[cfg(feature="std")] use std::collections::{HashSet};
  17 #[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeSet;
  18
  19 #[cfg(feature="alloc")] use distributions::{Distribution, Uniform};
  20 use Rng;
  21
  22 /// A vector of indices.
  23 ///
  24 /// Multiple internal representations are possible.
  25 #[derive(Clone, Debug)]
  26 pub enum IndexVec {
  27     #[doc(hidden)] U32(Vec<u32>),
  28     #[doc(hidden)] USize(Vec<usize>),
  29 }
  30
  31 impl IndexVec {
  32     /// Returns the number of indices
  33     pub fn len(&self) -> usize {
  34         match self {
  35             &IndexVec::U32(ref v) => v.len(),
  36             &IndexVec::USize(ref v) => v.len(),
  37         }
  38     }
  39
  40     /// Return the value at the given `index`.
  41     ///
  42     /// (Note: we cannot implement `std::ops::Index` because of lifetime
  43     /// restrictions.)
  44     pub fn index(&self, index: usize) -> usize {
  45         match self {
  46             &IndexVec::U32(ref v) => v[index] as usize,
  47             &IndexVec::USize(ref v) => v[index],
  48         }
  49     }
  50
  51     /// Return result as a `Vec<usize>`. Conversion may or may not be trivial.
  52     pub fn into_vec(self) -> Vec<usize> {
  53         match self {
  54             IndexVec::U32(v) => v.into_iter().map(|i| i as usize).collect(),
  55             IndexVec::USize(v) => v,
  56         }
  57     }
  58
  59     /// Iterate over the indices as a sequence of `usize` values
  60     pub fn iter<'a>(&'a self) -> IndexVecIter<'a> {
  61         match self {
  62             &IndexVec::U32(ref v) => IndexVecIter::U32(v.iter()),
  63             &IndexVec::USize(ref v) => IndexVecIter::USize(v.iter()),
  64         }
  65     }
  66
  67     /// Convert into an iterator over the indices as a sequence of `usize` values
  68     pub fn into_iter(self) -> IndexVecIntoIter {
  69         match self {
  70             IndexVec::U32(v) => IndexVecIntoIter::U32(v.into_iter()),
  71             IndexVec::USize(v) => IndexVecIntoIter::USize(v.into_iter()),
  72         }
  73     }
  74 }
  75
  76 impl PartialEq for IndexVec {
  77     fn eq(&self, other: &IndexVec) -> bool {
  78         use self::IndexVec::*;
  79         match (self, other) {
  80             (&U32(ref v1), &U32(ref v2)) => v1 == v2,
  81             (&USize(ref v1), &USize(ref v2)) => v1 == v2,
  82             (&U32(ref v1), &USize(ref v2)) => (v1.len() == v2.len())
  83                 && (v1.iter().zip(v2.iter()).all(|(x, y)| *x as usize == *y)),
  84             (&USize(ref v1), &U32(ref v2)) => (v1.len() == v2.len())
  85                 && (v1.iter().zip(v2.iter()).all(|(x, y)| *x == *y as usize)),
  86         }
  87     }
  88 }
  89
  90 impl From<Vec<u32>> for IndexVec {
  91     fn from(v: Vec<u32>) -> Self {
  92         IndexVec::U32(v)
  93     }
  94 }
  95
  96 impl From<Vec<usize>> for IndexVec {
  97     fn from(v: Vec<usize>) -> Self {
  98         IndexVec::USize(v)
  99     }
 100 }
 101
 102 /// Return type of `IndexVec::iter`.
 103 #[derive(Debug)]
 104 pub enum IndexVecIter<'a> {
 105     #[doc(hidden)] U32(slice::Iter<'a, u32>),
 106     #[doc(hidden)] USize(slice::Iter<'a, usize>),
 107 }
 108
 109 impl<'a> Iterator for IndexVecIter<'a> {
 110     type Item = usize;
 111     fn next(&mut self) -> Option<usize> {
 112         use self::IndexVecIter::*;
 113         match self {
 114             &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
 115             &mut USize(ref mut iter) => iter.next().cloned(),
 116         }
 117     }
 118
 119     fn size_hint(&self) -> (usize, Option<usize>) {
 120         match self {
 121             &IndexVecIter::U32(ref v) => v.size_hint(),
 122             &IndexVecIter::USize(ref v) => v.size_hint(),
 123         }
 124     }
 125 }
 126
 127 impl<'a> ExactSizeIterator for IndexVecIter<'a> {}
 128
 129 /// Return type of `IndexVec::into_iter`.
 130 #[derive(Clone, Debug)]
 131 pub enum IndexVecIntoIter {
 132     #[doc(hidden)] U32(vec::IntoIter<u32>),
 133     #[doc(hidden)] USize(vec::IntoIter<usize>),
 134 }
 135
 136 impl Iterator for IndexVecIntoIter {
 137     type Item = usize;
 138
 139     fn next(&mut self) -> Option<Self::Item> {
 140         use self::IndexVecIntoIter::*;
 141         match self {
 142             &mut U32(ref mut v) => v.next().map(|i| i as usize),
 143             &mut USize(ref mut v) => v.next(),
 144         }
 145     }
 146
 147     fn size_hint(&self) -> (usize, Option<usize>) {
 148         use self::IndexVecIntoIter::*;
 149         match self {
 150             &U32(ref v) => v.size_hint(),
 151             &USize(ref v) => v.size_hint(),
 152         }
 153     }
 154 }
 155
 156 impl ExactSizeIterator for IndexVecIntoIter {}
 157
 158
 159 /// Randomly sample exactly `amount` distinct indices from `0..length`, and
 160 /// return them in random order (fully shuffled).
 161 ///
 162 /// This method is used internally by the slice sampling methods, but it can
 163 /// sometimes be useful to have the indices themselves so this is provided as
 164 /// an alternative.
 165 ///
 166 /// The implementation used is not specified; we automatically select the
 167 /// fastest available algorithm for the `length` and `amount` parameters
 168 /// (based on detailed profiling on an Intel Haswell CPU). Roughly speaking,
 169 /// complexity is `O(amount)`, except that when `amount` is small, performance
 170 /// is closer to `O(amount^2)`, and when `length` is close to `amount` then
 171 /// `O(length)`.
 172 ///
 173 /// Note that performance is significantly better over `u32` indices than over
 174 /// `u64` indices. Because of this we hide the underlying type behind an
 175 /// abstraction, `IndexVec`.
 176 ///
 177 /// If an allocation-free `no_std` function is required, it is suggested
 178 /// to adapt the internal `sample_floyd` implementation.
 179 ///
 180 /// Panics if `amount > length`.
 181 pub fn sample<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
 182     where R: Rng + ?Sized,
 183 {
 184     if amount > length {
 185         panic!("`amount` of samples must be less than or equal to `length`");
 186     }
 187     if length > (::core::u32::MAX as usize) {
 188         // We never want to use inplace here, but could use floyd's alg
 189         // Lazy version: always use the cache alg.
 190         return sample_rejection(rng, length, amount);
 191     }
 192     let amount = amount as u32;
 193     let length = length as u32;
 194
 195     // Choice of algorithm here depends on both length and amount. See:
 196     // https://github.com/rust-random/rand/pull/479
 197     // We do some calculations with f32. Accuracy is not very important.
 198
 199     if amount < 163 {
 200         const C: [[f32; 2]; 2] = [[1.6, 8.0/45.0], [10.0, 70.0/9.0]];
 201         let j = if length < 500_000 { 0 } else { 1 };
 202         let amount_fp = amount as f32;
 203         let m4 = C[0][j] * amount_fp;
 204         // Short-cut: when amount < 12, floyd's is always faster
 205         if amount > 11 && (length as f32) < (C[1][j] + m4) * amount_fp {
 206             sample_inplace(rng, length, amount)
 207         } else {
 208             sample_floyd(rng, length, amount)
 209         }
 210     } else {
 211         const C: [f32; 2] = [270.0, 330.0/9.0];
 212         let j = if length < 500_000 { 0 } else { 1 };
 213         if (length as f32) < C[j] * (amount as f32) {
 214             sample_inplace(rng, length, amount)
 215         } else {
 216             // note: could have a specific u32 impl, but I'm lazy and
 217             // generics don't have usable conversions
 218             sample_rejection(rng, length as usize, amount as usize)
 219         }
 220     }
 221 }
 222
 223 /// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
 224 /// combination algorithm.
 225 ///
 226 /// The output values are fully shuffled. (Overhead is under 50%.)
 227 ///
 228 /// This implementation uses `O(amount)` memory and `O(amount^2)` time.
 229 fn sample_floyd<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
 230     where R: Rng + ?Sized,
 231 {
 232     // For small amount we use Floyd's fully-shuffled variant. For larger
 233     // amounts this is slow due to Vec::insert performance, so we shuffle
 234     // afterwards. Benchmarks show little overhead from extra logic.
 235     let floyd_shuffle = amount < 50;
 236
 237     debug_assert!(amount <= length);
 238     let mut indices = Vec::with_capacity(amount as usize);
 239     for j in length - amount .. length {
 240         let t = rng.gen_range(0, j + 1);
 241         if floyd_shuffle {
 242             if let Some(pos) = indices.iter().position(|&x| x == t) {
 243                 indices.insert(pos, j);
 244                 continue;
 245             }
 246         } else {
 247             if indices.contains(&t) {
 248                 indices.push(j);
 249                 continue;
 250             }
 251         }
 252         indices.push(t);
 253     }
 254     if !floyd_shuffle {
 255         // Reimplement SliceRandom::shuffle with smaller indices
 256         for i in (1..amount).rev() {
 257             // invariant: elements with index > i have been locked in place.
 258             indices.swap(i as usize, rng.gen_range(0, i + 1) as usize);
 259         }
 260     }
 261     IndexVec::from(indices)
 262 }
 263
 264 /// Randomly sample exactly `amount` indices from `0..length`, using an inplace
 265 /// partial Fisher-Yates method.
 266 /// Sample an amount of indices using an inplace partial fisher yates method.
 267 ///
 268 /// This allocates the entire `length` of indices and randomizes only the first `amount`.
 269 /// It then truncates to `amount` and returns.
 270 ///
 271 /// This method is not appropriate for large `length` and potentially uses a lot
 272 /// of memory; because of this we only implement for `u32` index (which improves
 273 /// performance in all cases).
 274 ///
 275 /// Set-up is `O(length)` time and memory and shuffling is `O(amount)` time.
 276 fn sample_inplace<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
 277     where R: Rng + ?Sized,
 278 {
 279     debug_assert!(amount <= length);
 280     let mut indices: Vec<u32> = Vec::with_capacity(length as usize);
 281     indices.extend(0..length);
 282     for i in 0..amount {
 283         let j: u32 = rng.gen_range(i, length);
 284         indices.swap(i as usize, j as usize);
 285     }
 286     indices.truncate(amount as usize);
 287     debug_assert_eq!(indices.len(), amount as usize);
 288     IndexVec::from(indices)
 289 }
 290
 291 /// Randomly sample exactly `amount` indices from `0..length`, using rejection
 292 /// sampling.
 293 ///
 294 /// Since `amount <<< length` there is a low chance of a random sample in
 295 /// `0..length` being a duplicate. We test for duplicates and resample where
 296 /// necessary. The algorithm is `O(amount)` time and memory.
 297 fn sample_rejection<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
 298     where R: Rng + ?Sized,
 299 {
 300     debug_assert!(amount < length);
 301     #[cfg(feature="std")] let mut cache = HashSet::with_capacity(amount);
 302     #[cfg(not(feature="std"))] let mut cache = BTreeSet::new();
 303     let distr = Uniform::new(0, length);
 304     let mut indices = Vec::with_capacity(amount);
 305     for _ in 0..amount {
 306         let mut pos = distr.sample(rng);
 307         while !cache.insert(pos) {
 308             pos = distr.sample(rng);
 309         }
 310         indices.push(pos);
 311     }
 312
 313     debug_assert_eq!(indices.len(), amount);
 314     IndexVec::from(indices)
 315 }
 316
 317 #[cfg(test)]
 318 mod test {
 319     use super::*;
 320
 321     #[test]
 322     fn test_sample_boundaries() {
 323         let mut r = ::test::rng(404);
 324
 325         assert_eq!(sample_inplace(&mut r, 0, 0).len(), 0);
 326         assert_eq!(sample_inplace(&mut r, 1, 0).len(), 0);
 327         assert_eq!(sample_inplace(&mut r, 1, 1).into_vec(), vec![0]);
 328
 329         assert_eq!(sample_rejection(&mut r, 1, 0).len(), 0);
 330
 331         assert_eq!(sample_floyd(&mut r, 0, 0).len(), 0);
 332         assert_eq!(sample_floyd(&mut r, 1, 0).len(), 0);
 333         assert_eq!(sample_floyd(&mut r, 1, 1).into_vec(), vec![0]);
 334
 335         // These algorithms should be fast with big numbers. Test average.
 336         let sum: usize = sample_rejection(&mut r, 1 << 25, 10)
 337                 .into_iter().sum();
 338         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
 339
 340         let sum: usize = sample_floyd(&mut r, 1 << 25, 10)
 341                 .into_iter().sum();
 342         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
 343     }
 344
 345     #[test]
 346     fn test_sample_alg() {
 347         let seed_rng = ::test::rng;
 348
 349         // We can't test which algorithm is used directly, but Floyd's alg
 350         // should produce different results from the others. (Also, `inplace`
 351         // and `cached` currently use different sizes thus produce different results.)
 352
 353         // A small length and relatively large amount should use inplace
 354         let (length, amount): (usize, usize) = (100, 50);
 355         let v1 = sample(&mut seed_rng(420), length, amount);
 356         let v2 = sample_inplace(&mut seed_rng(420), length as u32, amount as u32);
 357         assert!(v1.iter().all(|e| e < length));
 358         assert_eq!(v1, v2);
 359
 360         // Test Floyd's alg does produce different results
 361         let v3 = sample_floyd(&mut seed_rng(420), length as u32, amount as u32);
 362         assert!(v1 != v3);
 363
 364         // A large length and small amount should use Floyd
 365         let (length, amount): (usize, usize) = (1<<20, 50);
 366         let v1 = sample(&mut seed_rng(421), length, amount);
 367         let v2 = sample_floyd(&mut seed_rng(421), length as u32, amount as u32);
 368         assert!(v1.iter().all(|e| e < length));
 369         assert_eq!(v1, v2);
 370
 371         // A large length and larger amount should use cache
 372         let (length, amount): (usize, usize) = (1<<20, 600);
 373         let v1 = sample(&mut seed_rng(422), length, amount);
 374         let v2 = sample_rejection(&mut seed_rng(422), length, amount);
 375         assert!(v1.iter().all(|e| e < length));
 376         assert_eq!(v1, v2);
 377     }
 378 }