From cf9f06cd16b25442fc791d1ea31f1cb920756cf3 Mon Sep 17 00:00:00 2001 From: Alexey Shuksto Date: Mon, 18 May 2020 20:50:01 +0300 Subject: [PATCH 1/2] improve rust version with memmap and custom hash --- build/rust/Cargo.lock | 71 +++---------- build/rust/Cargo.toml | 3 +- src/freq01.rs | 231 +++++++++++++++++++++++++++++++----------- 3 files changed, 191 insertions(+), 114 deletions(-) diff --git a/build/rust/Cargo.lock b/build/rust/Cargo.lock index edfc9c3..65d895c 100644 --- a/build/rust/Cargo.lock +++ b/build/rust/Cargo.lock @@ -1,14 +1,5 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -[[package]] -name = "aho-corasick" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada" -dependencies = [ - "memchr", -] - [[package]] name = "atty" version = "0.2.14" @@ -64,19 +55,12 @@ dependencies = [ "syn", ] -[[package]] -name = "fnv" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" - [[package]] name = "freq" version = "0.1.0" dependencies = [ "clap", - "fnv", - "regex", + "memmap", ] [[package]] @@ -90,9 +74,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61565ff7aaace3525556587bd2dc31d4a07071957be715e63ce7b1eccf51a8f4" +checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71" dependencies = [ "libc", ] @@ -119,10 +103,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f" [[package]] -name = "memchr" -version = "2.3.3" +name = "memmap" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi", +] [[package]] name = "os_str_bytes" @@ -158,40 +146,22 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8872cf6f48eee44265156c111456a700ab3483686b3f96df4cf5481c89157319" +checksum = "53f5ffe53a6b28e37c9c1ce74893477864d64f74778a93a4beb43c8fa167f639" dependencies = [ "unicode-xid", ] [[package]] name = "quote" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42934bc9c8ab0d3b273a16d8551c8f0fcff46be73276ca083ec2414c15c4ba5e" +checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea" dependencies = [ "proc-macro2", ] -[[package]] -name = "regex" -version = "1.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", - "thread_local", -] - -[[package]] -name = "regex-syntax" -version = "0.6.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" - [[package]] name = "strsim" version = "0.10.0" @@ -200,9 +170,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4696caa4048ac7ce2bcd2e484b3cef88c1004e41b8e945a277e2c25dc0b72060" +checksum = "1425de3c33b0941002740a420b1a906a350b88d08b82b2c8a01035a3f9447bac" dependencies = [ "proc-macro2", "quote", @@ -238,15 +208,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - [[package]] name = "unicode-segmentation" version = "1.6.0" diff --git a/build/rust/Cargo.toml b/build/rust/Cargo.toml index 63f419b..2dceea5 100644 --- a/build/rust/Cargo.toml +++ b/build/rust/Cargo.toml @@ -8,8 +8,7 @@ edition = "2018" [dependencies] clap = "3.0.0-beta.1" -fnv = "1" -regex = "1" +memmap = "0.7" [[bin]] name = "freq01" diff --git a/src/freq01.rs b/src/freq01.rs index 2000737..50b69a9 100644 --- a/src/freq01.rs +++ b/src/freq01.rs @@ -1,10 +1,15 @@ -use clap::Clap; -use fnv::FnvHashMap; use std::{ + cmp::Ordering, fs::File, - io::{BufRead, BufReader, BufWriter, Write}, + io::{BufWriter, Read, Write}, }; +use clap::Clap; +use memmap::*; + +const H: usize = 2_166_136_261; +const P: usize = 0x0100_0193; + /// Counts number of unique `[a-zA-Z]+` words in input. #[derive(Clap, Debug)] #[clap(version = "0.1.0")] @@ -15,44 +20,30 @@ struct Opts { output: Option, } -struct FreqDict { - dict: FnvHashMap, u32>, -} - fn main() { let opts: Opts = Opts::parse(); - let mut input = open_input(&opts); - let mut output = create_output(&opts); + let input = open_mmap(&opts); - let mut word = Vec::with_capacity(16); - let mut dict = FreqDict::new(); + let mut word = Vec::with_capacity(256); + let mut hash = H; + let mut dict = FrequencyHashMap::new(); - let mut buffer = [0u8; 16 * 1024]; - loop { - let read_count = input - .read(&mut buffer) - .unwrap_or_else(|e| panic!("Unable to read bytes from '{}': {}", opts.input, e)); - - if read_count == 0 { - break; - } - - for &byte in buffer.iter().take(read_count) { - if b'a' <= byte && byte <= b'z' { - word.push(byte); - continue; - } else if b'A' <= byte && byte <= b'Z' { - word.push(byte ^ 0x20); - continue; - } - - dict.add_word(&word); + for &byte in input.iter() { + if (b'a' <= byte && byte <= b'z') || (b'A' <= byte && byte <= b'Z') { + hash = (hash ^ (byte | 0x20) as usize) * P; + word.push(byte); + } else if !word.is_empty() { + dict.register(hash, &word); word.clear(); + hash = H; } } - dict.add_word(&word); + if !word.is_empty() { + dict.register(hash, &word); + } - for (count, word) in dict.get_freq() { + let mut output = create_output(&opts); + for (count, word) in dict.into_iter() { writeln!(&mut output, "{} {}", count, word).unwrap_or_else(|e| { let output = opts.output.as_ref().map_or("-", |s| s.as_str()); panic!("Unable to write results in '{}': {}", output, e) @@ -60,44 +51,170 @@ fn main() { } } -impl FreqDict { - fn new() -> Self { - FreqDict { - dict: FnvHashMap::default(), +struct FrequencyHashMap { + buckets: Vec>, + capacity: usize, + length: usize, + mask: usize, + max: usize, +} + +#[derive(Clone, Debug)] +struct FrequencyHashEntry { + key: Box<[u8]>, + value: usize, + hash: usize, +} + +struct FrequencyHashIntoIter { + iter: std::vec::IntoIter> +} + +impl FrequencyHashMap { + const INITIAL: usize = 128; + const LOAD_FACTOR: f32 = 0.9; + + fn new() -> FrequencyHashMap { + FrequencyHashMap { + buckets: vec![None; Self::INITIAL], + capacity: Self::INITIAL, + length: 0, + mask: Self::INITIAL - 1, + max: (Self::LOAD_FACTOR * Self::INITIAL as f32) as usize, + } + } + + fn register(&mut self, hash: usize, word: &[u8]) { + let mut index = hash & self.mask; + loop { + match unsafe { self.buckets.get_unchecked_mut(index) } { + Some(entry) => { + if entry.hash == hash && entry.key.eq_ignore_ascii_case(word) { + entry.value += 1; + return; + } else { + index = (index + 1) & self.mask + } + } + none => { + none.replace(FrequencyHashEntry { + key: word.to_ascii_lowercase().into_boxed_slice(), + value: 1, + hash, + }); + + self.length += 1; + if self.length > self.max { + self.ensure_capacity(); + } + break; + } + } + } + } + + fn ensure_capacity(&mut self) { + while self.length > self.max { + self.capacity *= 2; + self.mask = self.capacity - 1; + self.max = (Self::LOAD_FACTOR * self.capacity as f32) as usize; + } + + let new_buckets = vec![None; self.capacity]; + for bucket in std::mem::replace(&mut self.buckets, new_buckets) { + if let Some(entry) = bucket { + let mut index = entry.hash & self.mask; + loop { + match unsafe { self.buckets.get_unchecked_mut(index) } { + Some(_) => index = (index + 1) & self.mask, + none => { + none.replace(entry); + break; + } + } + } + } + } + } +} + +impl IntoIterator for FrequencyHashMap { + type Item = (usize, String); + type IntoIter = FrequencyHashIntoIter; + + fn into_iter(self) -> Self::IntoIter { + let mut buckets = self.buckets; + buckets.sort_unstable(); + + FrequencyHashIntoIter { + iter: buckets.into_iter() } } +} + +impl Ord for FrequencyHashEntry { + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&other.value, &self.value) + .then_with(|| Ord::cmp(self.key.as_ref(), other.key.as_ref())) + } +} + +impl PartialOrd for FrequencyHashEntry { + fn partial_cmp(&self, other: &Self) -> Option { + Some(Ord::cmp(&self, &other)) + } +} + +impl Eq for FrequencyHashEntry {} - fn add_word(&mut self, word: &[u8]) { - if !word.is_empty() { - if let Some(counter) = self.dict.get_mut(word) { - *counter += 1; - } else { - self.dict.insert(word.into(), 1); +impl PartialEq for FrequencyHashEntry { + fn eq(&self, other: &Self) -> bool { + Ord::cmp(&self, &other) == Ordering::Equal + } +} + +impl Iterator for FrequencyHashIntoIter { + type Item = (usize, String); + + fn next(&mut self) -> Option { + while let Some(opt) = self.iter.next() { + if let Some(entry) = opt { + let key = std::str::from_utf8(&entry.key).unwrap().to_owned(); + return Some((entry.value, key)); } } + None } - fn get_freq(&self) -> Vec<(u32, &str)> { - let mut freq = self.dict.iter() - .map(|(w, c)| { - let key = std::str::from_utf8(w).unwrap(); - (*c, key) - }) - .collect::>(); - freq.sort_unstable_by(|(c1, w1), (c2, w2)| { - Ord::cmp(c1, c2).reverse().then_with(|| Ord::cmp(w1, w2)) - }); - freq + fn size_hint(&self) -> (usize, Option) { + let (_, upper) = self.iter.size_hint(); + (0, upper) } } -fn open_input(opts: &Opts) -> Box { +fn open_mmap(opts: &Opts) -> Mmap { match opts.input.as_str() { - "-" => Box::new(BufReader::new(std::io::stdin())), + "-" => { + let mut buffer = vec![]; + std::io::stdin() + .read_to_end(&mut buffer) + .unwrap_or_else(|e| panic!("Unable to read STDIN: {}", e)); + let mut mmap = MmapOptions::new() + .len(buffer.len()) + .map_anon() + .unwrap_or_else(|e| panic!("Unable to read STDIN: {}", e)); + mmap.copy_from_slice(&buffer); + mmap.make_read_only() + .unwrap_or_else(|e| panic!("Unable to read STDIN: {}", e)) + } fnm => { let file = File::open(fnm) .unwrap_or_else(|e| panic!("Unable to open '{}' for reading: {}", fnm, e)); - Box::new(BufReader::new(file)) + unsafe { + MmapOptions::new() + .map(&file) + .unwrap_or_else(|e| panic!("Unable to read '{}' in memory: {}", fnm, e)) + } } } } From e695bee0a095a07437ca5a7579ea0d567b063054 Mon Sep 17 00:00:00 2001 From: Alexey Shuksto Date: Wed, 20 May 2020 12:55:48 +0300 Subject: [PATCH 2/2] add a bit of magic --- src/freq01.rs | 58 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/src/freq01.rs b/src/freq01.rs index 50b69a9..330e53d 100644 --- a/src/freq01.rs +++ b/src/freq01.rs @@ -7,7 +7,9 @@ use std::{ use clap::Clap; use memmap::*; -const H: usize = 2_166_136_261; +/// FNV1 hash basis +const H: usize = 0x811c_9dc5; +/// FNV1 hash prime const P: usize = 0x0100_0193; /// Counts number of unique `[a-zA-Z]+` words in input. @@ -24,22 +26,26 @@ fn main() { let opts: Opts = Opts::parse(); let input = open_mmap(&opts); - let mut word = Vec::with_capacity(256); let mut hash = H; let mut dict = FrequencyHashMap::new(); - for &byte in input.iter() { + let mut word_start = 0; + let mut word_end = 0; + for (idx, &byte) in input.iter().enumerate() { if (b'a' <= byte && byte <= b'z') || (b'A' <= byte && byte <= b'Z') { - hash = (hash ^ (byte | 0x20) as usize) * P; - word.push(byte); - } else if !word.is_empty() { - dict.register(hash, &word); - word.clear(); - hash = H; + hash ^= (byte & 0x1F) as usize; + hash *= P; + word_end = idx + 1; + } else { + if word_start < word_end { + dict.register(hash, &input[word_start..word_end]); + hash = H; + } + word_start = idx + 1; } } - if !word.is_empty() { - dict.register(hash, &word); + if word_start < word_end { + dict.register(hash, &input[word_start..word_end]); } let mut output = create_output(&opts); @@ -67,7 +73,7 @@ struct FrequencyHashEntry { } struct FrequencyHashIntoIter { - iter: std::vec::IntoIter> + iter: std::vec::IntoIter>, } impl FrequencyHashMap { @@ -89,7 +95,7 @@ impl FrequencyHashMap { loop { match unsafe { self.buckets.get_unchecked_mut(index) } { Some(entry) => { - if entry.hash == hash && entry.key.eq_ignore_ascii_case(word) { + if entry.same_as(hash, word) { entry.value += 1; return; } else { @@ -97,11 +103,7 @@ impl FrequencyHashMap { } } none => { - none.replace(FrequencyHashEntry { - key: word.to_ascii_lowercase().into_boxed_slice(), - value: 1, - hash, - }); + none.replace(FrequencyHashEntry::new(hash, word)); self.length += 1; if self.length > self.max { @@ -147,11 +149,29 @@ impl IntoIterator for FrequencyHashMap { buckets.sort_unstable(); FrequencyHashIntoIter { - iter: buckets.into_iter() + iter: buckets.into_iter(), } } } +impl FrequencyHashEntry { + #[inline] + fn new(hash: usize, word: &[u8]) -> FrequencyHashEntry { + FrequencyHashEntry { + key: word.iter().map(|b| b | 0x20).collect(), + hash, + value: 1, + } + } + + #[inline] + fn same_as(&self, hash: usize, word: &[u8]) -> bool { + hash == self.hash + && self.key.len() == word.len() + && Iterator::zip(self.key.iter(), word.iter()).all(|(&l, &r)| l == (r | 0x20)) + } +} + impl Ord for FrequencyHashEntry { fn cmp(&self, other: &Self) -> Ordering { Ord::cmp(&other.value, &self.value)