vendor/unicode-ident/src/lib.rs

   1 //! [![github]](https://github.com/dtolnay/unicode-ident)&ensp;[![crates-io]](https://crates.io/crates/unicode-ident)&ensp;[![docs-rs]](https://docs.rs/unicode-ident)
   2 //!
   3 //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
   4 //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
   5 //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logoColor=white&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K
   6 //!
   7 //! <br>
   8 //!
   9 //! Implementation of [Unicode Standard Annex #31][tr31] for determining which
  10 //! `char` values are valid in programming language identifiers.
  11 //!
  12 //! [tr31]: https://www.unicode.org/reports/tr31/
  13 //!
  14 //! This crate is a better optimized implementation of the older `unicode-xid`
  15 //! crate. This crate uses less static storage, and is able to classify both
  16 //! ASCII and non-ASCII codepoints with better performance, 2&ndash;10&times;
  17 //! faster than `unicode-xid`.
  18 //!
  19 //! <br>
  20 //!
  21 //! ## Comparison of performance
  22 //!
  23 //! The following table shows a comparison between five Unicode identifier
  24 //! implementations.
  25 //!
  26 //! - `unicode-ident` is this crate;
  27 //! - [`unicode-xid`] is a widely used crate run by the "unicode-rs" org;
  28 //! - `ucd-trie` and `fst` are two data structures supported by the
  29 //!   [`ucd-generate`] tool;
  30 //! - [`roaring`] is a Rust implementation of Roaring bitmap.
  31 //!
  32 //! The *static storage* column shows the total size of `static` tables that the
  33 //! crate bakes into your binary, measured in 1000s of bytes.
  34 //!
  35 //! The remaining columns show the **cost per call** to evaluate whether a
  36 //! single `char` has the XID\_Start or XID\_Continue Unicode property,
  37 //! comparing across different ratios of ASCII to non-ASCII codepoints in the
  38 //! input data.
  39 //!
  40 //! [`unicode-xid`]: https://github.com/unicode-rs/unicode-xid
  41 //! [`ucd-generate`]: https://github.com/BurntSushi/ucd-generate
  42 //! [`roaring`]: https://github.com/RoaringBitmap/roaring-rs
  43 //!
  44 //! | | static storage | 0% nonascii | 1% | 10% | 100% nonascii |
  45 //! |---|---|---|---|---|---|
  46 //! | **`unicode-ident`** | 9.75 K | 0.96 ns | 0.95 ns | 1.09 ns | 1.55 ns |
  47 //! | **`unicode-xid`** | 11.34 K | 1.88 ns | 2.14 ns | 3.48 ns | 15.63 ns |
  48 //! | **`ucd-trie`** | 9.95 K | 1.29 ns | 1.28 ns | 1.36 ns | 2.15 ns |
  49 //! | **`fst`** | 133 K | 55.1 ns | 54.9 ns | 53.2 ns | 28.5 ns |
  50 //! | **`roaring`** | 66.1 K | 2.78 ns | 3.09 ns | 3.37 ns | 4.70 ns |
  51 //!
  52 //! Source code for the benchmark is provided in the *bench* directory of this
  53 //! repo and may be repeated by running `cargo criterion`.
  54 //!
  55 //! <br>
  56 //!
  57 //! ## Comparison of data structures
  58 //!
  59 //! #### unicode-xid
  60 //!
  61 //! They use a sorted array of character ranges, and do a binary search to look
  62 //! up whether a given character lands inside one of those ranges.
  63 //!
  64 //! ```rust
  65 //! # const _: &str = stringify! {
  66 //! static XID_Continue_table: [(char, char); 763] = [
  67 //!     ('\u{30}', '\u{39}'),  // 0-9
  68 //!     ('\u{41}', '\u{5a}'),  // A-Z
  69 //! # "
  70 //!     …
  71 //! # "
  72 //!     ('\u{e0100}', '\u{e01ef}'),
  73 //! ];
  74 //! # };
  75 //! ```
  76 //!
  77 //! The static storage used by this data structure scales with the number of
  78 //! contiguous ranges of identifier codepoints in Unicode. Every table entry
  79 //! consumes 8 bytes, because it consists of a pair of 32-bit `char` values.
  80 //!
  81 //! In some ranges of the Unicode codepoint space, this is quite a sparse
  82 //! representation &ndash; there are some ranges where tens of thousands of
  83 //! adjacent codepoints are all valid identifier characters. In other places,
  84 //! the representation is quite inefficient. A characater like `µ` (U+00B5)
  85 //! which is surrounded by non-identifier codepoints consumes 64 bits in the
  86 //! table, while it would be just 1 bit in a dense bitmap.
  87 //!
  88 //! On a system with 64-byte cache lines, binary searching the table touches 7
  89 //! cache lines on average. Each cache line fits only 8 table entries.
  90 //! Additionally, the branching performed during the binary search is probably
  91 //! mostly unpredictable to the branch predictor.
  92 //!
  93 //! Overall, the crate ends up being about 10&times; slower on non-ASCII input
  94 //! compared to the fastest crate.
  95 //!
  96 //! A potential improvement would be to pack the table entries more compactly.
  97 //! Rust's `char` type is a 21-bit integer padded to 32 bits, which means every
  98 //! table entry is holding 22 bits of wasted space, adding up to 3.9 K. They
  99 //! could instead fit every table entry into 6 bytes, leaving out some of the
 100 //! padding, for a 25% improvement in space used. With some cleverness it may be
 101 //! possible to fit in 5 bytes or even 4 bytes by storing a low char and an
 102 //! extent, instead of low char and high char. I don't expect that performance
 103 //! would improve much but this could be the most efficient for space across all
 104 //! the libraries, needing only about 7 K to store.
 105 //!
 106 //! #### ucd-trie
 107 //!
 108 //! Their data structure is a compressed trie set specifically tailored for
 109 //! Unicode codepoints. The design is credited to Raph Levien in
 110 //! [rust-lang/rust#33098].
 111 //!
 112 //! [rust-lang/rust#33098]: https://github.com/rust-lang/rust/pull/33098
 113 //!
 114 //! ```rust
 115 //! pub struct TrieSet {
 116 //!     tree1_level1: &'static [u64; 32],
 117 //!     tree2_level1: &'static [u8; 992],
 118 //!     tree2_level2: &'static [u64],
 119 //!     tree3_level1: &'static [u8; 256],
 120 //!     tree3_level2: &'static [u8],
 121 //!     tree3_level3: &'static [u64],
 122 //! }
 123 //! ```
 124 //!
 125 //! It represents codepoint sets using a trie to achieve prefix compression. The
 126 //! final states of the trie are embedded in leaves or "chunks", where each
 127 //! chunk is a 64-bit integer. Each bit position of the integer corresponds to
 128 //! whether a particular codepoint is in the set or not. These chunks are not
 129 //! just a compact representation of the final states of the trie, but are also
 130 //! a form of suffix compression. In particular, if multiple ranges of 64
 131 //! contiguous codepoints have the same Unicode properties, then they all map to
 132 //! the same chunk in the final level of the trie.
 133 //!
 134 //! Being tailored for Unicode codepoints, this trie is partitioned into three
 135 //! disjoint sets: tree1, tree2, tree3. The first set corresponds to codepoints
 136 //! \[0, 0x800), the second \[0x800, 0x10000) and the third \[0x10000,
 137 //! 0x110000). These partitions conveniently correspond to the space of 1 or 2
 138 //! byte UTF-8 encoded codepoints, 3 byte UTF-8 encoded codepoints and 4 byte
 139 //! UTF-8 encoded codepoints, respectively.
 140 //!
 141 //! Lookups in this data structure are significantly more efficient than binary
 142 //! search. A lookup touches either 1, 2, or 3 cache lines based on which of the
 143 //! trie partitions is being accessed.
 144 //!
 145 //! One possible performance improvement would be for this crate to expose a way
 146 //! to query based on a UTF-8 encoded string, returning the Unicode property
 147 //! corresponding to the first character in the string. Without such an API, the
 148 //! caller is required to tokenize their UTF-8 encoded input data into `char`,
 149 //! hand the `char` into `ucd-trie`, only for `ucd-trie` to undo that work by
 150 //! converting back into the variable-length representation for trie traversal.
 151 //!
 152 //! #### fst
 153 //!
 154 //! Uses a [finite state transducer][fst]. This representation is built into
 155 //! [ucd-generate] but I am not aware of any advantage over the `ucd-trie`
 156 //! representation. In particular `ucd-trie` is optimized for storing Unicode
 157 //! properties while `fst` is not.
 158 //!
 159 //! [fst]: https://github.com/BurntSushi/fst
 160 //! [ucd-generate]: https://github.com/BurntSushi/ucd-generate
 161 //!
 162 //! As far as I can tell, the main thing that causes `fst` to have large size
 163 //! and slow lookups for this use case relative to `ucd-trie` is that it does
 164 //! not specialize for the fact that only 21 of the 32 bits in a `char` are
 165 //! meaningful. There are some dense arrays in the structure with large ranges
 166 //! that could never possibly be used.
 167 //!
 168 //! #### roaring
 169 //!
 170 //! This crate is a pure-Rust implementation of [Roaring Bitmap], a data
 171 //! structure designed for storing sets of 32-bit unsigned integers.
 172 //!
 173 //! [Roaring Bitmap]: https://roaringbitmap.org/about/
 174 //!
 175 //! Roaring bitmaps are compressed bitmaps which tend to outperform conventional
 176 //! compressed bitmaps such as WAH, EWAH or Concise. In some instances, they can
 177 //! be hundreds of times faster and they often offer significantly better
 178 //! compression.
 179 //!
 180 //! In this use case the performance was reasonably competitive but still
 181 //! substantially slower than the Unicode-optimized crates. Meanwhile the
 182 //! compression was significantly worse, requiring 6&times; as much storage for
 183 //! the data structure.
 184 //!
 185 //! I also benchmarked the [`croaring`] crate which is an FFI wrapper around the
 186 //! C reference implementation of Roaring Bitmap. This crate was consistently
 187 //! about 15% slower than pure-Rust `roaring`, which could just be FFI overhead.
 188 //! I did not investigate further.
 189 //!
 190 //! [`croaring`]: https://crates.io/crates/croaring
 191 //!
 192 //! #### unicode-ident
 193 //!
 194 //! This crate is most similar to the `ucd-trie` library, in that it's based on
 195 //! bitmaps stored in the leafs of a trie representation, achieving both prefix
 196 //! compression and suffix compression.
 197 //!
 198 //! The key differences are:
 199 //!
 200 //! - Uses a single 2-level trie, rather than 3 disjoint partitions of different
 201 //!   depth each.
 202 //! - Uses significantly larger chunks: 512 bits rather than 64 bits.
 203 //! - Compresses the XID\_Start and XID\_Continue properties together
 204 //!   simultaneously, rather than duplicating identical trie leaf chunks across
 205 //!   the two.
 206 //!
 207 //! The following diagram show the XID\_Start and XID\_Continue Unicode boolean
 208 //! properties in uncompressed form, in row-major order:
 209 //!
 210 //! <table>
 211 //! <tr><th>XID_Start</th><th>XID_Continue</th></tr>
 212 //! <tr>
 213 //! <td><img alt="XID_Start bitmap" width="256" src="https://user-images.githubusercontent.com/1940490/168647353-c6eeb922-afec-49b2-9ef5-c03e9d1e0760.png"></td>
 214 //! <td><img alt="XID_Continue bitmap" width="256" src="https://user-images.githubusercontent.com/1940490/168647367-f447cca7-2362-4d7d-8cd7-d21c011d329b.png"></td>
 215 //! </tr>
 216 //! </table>
 217 //!
 218 //! Uncompressed, these would take 140 K to store, which is beyond what would be
 219 //! reasonable. However, as you can see there is a large degree of similarity
 220 //! between the two bitmaps and across the rows, which lends well to
 221 //! compression.
 222 //!
 223 //! This crate stores one 512-bit "row" of the above bitmaps in the leaf level
 224 //! of a trie, and a single additional level to index into the leafs. It turns
 225 //! out there are 124 unique 512-bit chunks across the two bitmaps so 7 bits are
 226 //! sufficient to index them.
 227 //!
 228 //! The chunk size of 512 bits is selected as the size that minimizes the total
 229 //! size of the data structure. A smaller chunk, like 256 or 128 bits, would
 230 //! achieve better deduplication but require a larger index. A larger chunk
 231 //! would increase redundancy in the leaf bitmaps. 512 bit chunks are the
 232 //! optimum for total size of the index plus leaf bitmaps.
 233 //!
 234 //! In fact since there are only 124 unique chunks, we can use an 8-bit index
 235 //! with a spare bit to index at the half-chunk level. This achieves an
 236 //! additional 8.5% compression by eliminating redundancies between the second
 237 //! half of any chunk and the first half of any other chunk. Note that this is
 238 //! not the same as using chunks which are half the size, because it does not
 239 //! necessitate raising the size of the trie's first level.
 240 //!
 241 //! In contrast to binary search or the `ucd-trie` crate, performing lookups in
 242 //! this data structure is straight-line code with no need for branching.
 243
 244 #![no_std]
 245 #![allow(clippy::doc_markdown, clippy::must_use_candidate)]
 246
 247 #[rustfmt::skip]
 248 mod tables;
 249
 250 use crate::tables::{ASCII_CONTINUE, ASCII_START, CHUNK, LEAF, TRIE_CONTINUE, TRIE_START};
 251
 252 pub fn is_xid_start(ch: char) -> bool {
 253     if ch.is_ascii() {
 254         return ASCII_START.0[ch as usize];
 255     }
 256     let chunk = *TRIE_START.0.get(ch as usize / 8 / CHUNK).unwrap_or(&0);
 257     let offset = chunk as usize * CHUNK / 2 + ch as usize / 8 % CHUNK;
 258     unsafe { LEAF.0.get_unchecked(offset) }.wrapping_shr(ch as u32 % 8) & 1 != 0
 259 }
 260
 261 pub fn is_xid_continue(ch: char) -> bool {
 262     if ch.is_ascii() {
 263         return ASCII_CONTINUE.0[ch as usize];
 264     }
 265     let chunk = *TRIE_CONTINUE.0.get(ch as usize / 8 / CHUNK).unwrap_or(&0);
 266     let offset = chunk as usize * CHUNK / 2 + ch as usize / 8 % CHUNK;
 267     unsafe { LEAF.0.get_unchecked(offset) }.wrapping_shr(ch as u32 % 8) & 1 != 0
 268 }