diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f99d2e9..74ade77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,9 +1,10 @@ -name: Run CI +name: CI on: push: - branches: ["auto"] + branches: ["main"] pull_request: - branches: ["**"] + merge_group: + types: [checks_requested] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: @@ -14,12 +15,12 @@ env: jobs: ci: - name: Run CI + name: Build and Test runs-on: ubuntu-latest strategy: matrix: - rust: [1.56.0, nightly, beta, stable] + rust: [1.70.0, nightly, beta, stable] steps: - uses: actions/checkout@v2 @@ -35,8 +36,11 @@ jobs: run: | cargo build --no-default-features cargo build - - name: Tests - run: cargo test --all + cargo build --features malloc_size_of + - uses: actions-rs/cargo@v1 + with: + command: test + args: --all - name: Build codegen run: | cd string-cache-codegen && cargo build && cd .. @@ -47,7 +51,7 @@ jobs: build_result: - name: homu build finished + name: Result runs-on: ubuntu-latest needs: - "ci" diff --git a/Cargo.toml b/Cargo.toml index 6067114..e73215e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,12 +1,13 @@ [package] name = "string_cache" -version = "0.8.6" # Also update README.md when making a semver-breaking change -authors = [ "The Servo Project Developers" ] +version = "0.9.0" # Also update README.md when making a semver-breaking change +authors = ["The Servo Project Developers"] description = "A string interning library for Rust, developed as part of the Servo project." license = "MIT OR Apache-2.0" repository = "https://github.com/servo/string-cache" -documentation = "https://docs.rs/string_cache/" +documentation = "https://docs.rs/string_cache" edition = "2018" +rust-version = "1.70.0" # Do not `exclude` ./string-cache-codegen because we want to include # ./string-cache-codegen/shared.rs, and `include` is a pain to use @@ -23,9 +24,9 @@ default = ["serde_support"] [dependencies] precomputed-hash = "0.1" -once_cell = "1.10.0" serde = { version = "1", optional = true } -phf_shared = "0.10" +malloc_size_of = { version = "0.1", default-features = false, optional = true } +phf_shared = "0.13" new_debug_unreachable = "1.0.2" parking_lot = "0.12" diff --git a/README.md b/README.md index fdf4c0a..429d1ec 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ In `Cargo.toml`: ```toml [dependencies] -string_cache = "0.8" +string_cache = "0.9" ``` In `lib.rs`: @@ -31,10 +31,10 @@ In `Cargo.toml`: build = "build.rs" [dependencies] -string_cache = "0.8" +string_cache = "0.9" [build-dependencies] -string_cache_codegen = "0.5" +string_cache_codegen = "0.6" ``` In `build.rs`: diff --git a/integration-tests/Cargo.toml b/integration-tests/Cargo.toml index a0b047c..4562747 100644 --- a/integration-tests/Cargo.toml +++ b/integration-tests/Cargo.toml @@ -16,11 +16,11 @@ test = true unstable = [] [dependencies] -string_cache = { version = "0.8", path = ".." } +string_cache = { version = "0.9", path = ".." } [dev-dependencies] -rand = "0.8" -string_cache_codegen = { version = "0.5", path = "../string-cache-codegen" } +rand = { version = "0.8", features = ["small_rng"] } +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } [build-dependencies] -string_cache_codegen = { version = "0.5", path = "../string-cache-codegen" } +string_cache_codegen = { version = "0.6", path = "../string-cache-codegen" } diff --git a/integration-tests/build.rs b/integration-tests/build.rs index da40873..6293e4c 100644 --- a/integration-tests/build.rs +++ b/integration-tests/build.rs @@ -9,6 +9,7 @@ fn main() { "a", "b", "address", + "defaults", "area", "body", "font-weight", @@ -16,6 +17,9 @@ fn main() { "html", "head", "id", + "❤", + "❤💯", + "❤💯❤💯", ]) .write_to_file(&Path::new(&env::var("OUT_DIR").unwrap()).join("test_atom.rs")) .unwrap() diff --git a/integration-tests/src/bench.rs b/integration-tests/src/bench.rs index 4d8f012..45e7199 100644 --- a/integration-tests/src/bench.rs +++ b/integration-tests/src/bench.rs @@ -153,7 +153,7 @@ bench_all!([eq ne lt clone_string] for longer_string = super::longer_dynamic_a, super::longer_dynamic_b); bench_all!([eq ne intern as_ref clone is_static lt] - for static_atom = test_atom!("a"), test_atom!("b")); + for static_atom = test_atom!("defaults"), test_atom!("font-weight")); bench_all!([intern as_ref clone is_inline] for short_inline_atom = mk("e"), mk("f")); @@ -168,13 +168,13 @@ bench_all!([eq ne intern as_ref clone is_dynamic lt] for longer_dynamic_atom = mk(super::longer_dynamic_a), mk(super::longer_dynamic_b)); bench_all!([intern as_ref clone is_static] - for static_at_runtime = mk("a"), mk("b")); + for static_at_runtime = mk("defaults"), mk("font-weight")); bench_all!([ne lt x_static y_inline] - for static_vs_inline = test_atom!("a"), mk("f")); + for static_vs_inline = test_atom!("defaults"), mk("f")); bench_all!([ne lt x_static y_dynamic] - for static_vs_dynamic = test_atom!("a"), mk(super::longer_dynamic_b)); + for static_vs_dynamic = test_atom!("defaults"), mk(super::longer_dynamic_b)); bench_all!([ne lt x_inline y_dynamic] for inline_vs_dynamic = mk("e"), mk(super::longer_dynamic_b)); diff --git a/integration-tests/src/lib.rs b/integration-tests/src/lib.rs index aaacdff..a788d93 100644 --- a/integration-tests/src/lib.rs +++ b/integration-tests/src/lib.rs @@ -45,9 +45,12 @@ fn test_as_slice() { #[test] fn test_types() { assert!(Atom::from("").is_static()); - assert!(Atom::from("id").is_static()); - assert!(Atom::from("body").is_static()); - assert!(Atom::from("a").is_static()); + assert!(Atom::from("defaults").is_static()); + assert!(Atom::from("font-weight").is_static()); + assert!(Atom::from("id").is_inline()); + assert!(Atom::from("body").is_inline()); + assert!(Atom::from("a").is_inline()); + assert!(Atom::from("address").is_inline()); assert!(Atom::from("c").is_inline()); assert!(Atom::from("zz").is_inline()); assert!(Atom::from("zzz").is_inline()); @@ -168,11 +171,13 @@ fn repr() { // static atom table, the tag values, etc. // Static atoms - check_static("a", test_atom!("a")); - check_static("address", test_atom!("address")); - check_static("area", test_atom!("area")); + check_static("defaults", test_atom!("defaults")); + check_static("font-weight", test_atom!("font-weight")); // Inline atoms + check("a", 0x0000_0000_0000_6111); + check("address", 0x7373_6572_6464_6171); + check("area", 0x0000_0061_6572_6141); check("e", 0x0000_0000_0000_6511); check("xyzzy", 0x0000_797A_7A79_7851); check("xyzzy01", 0x3130_797A_7A79_7871); @@ -193,8 +198,13 @@ fn test_threads() { #[test] fn atom_macro() { + assert_eq!(test_atom!("a"), Atom::from("a")); assert_eq!(test_atom!("body"), Atom::from("body")); + assert_eq!(test_atom!("address"), Atom::from("address")); + assert_eq!(test_atom!("❤"), Atom::from("❤")); + assert_eq!(test_atom!("❤💯"), Atom::from("❤💯")); assert_eq!(test_atom!("font-weight"), Atom::from("font-weight")); + assert_eq!(test_atom!("❤💯❤💯"), Atom::from("❤💯❤💯")); } #[test] @@ -292,7 +302,8 @@ fn test_from_string() { #[test] fn test_try_static() { - assert!(Atom::try_static("head").is_some()); + assert!(Atom::try_static("defaults").is_some()); + assert!(Atom::try_static("head").is_none()); assert!(Atom::try_static("not in the static table").is_none()); } diff --git a/src/atom.rs b/src/atom.rs index 321b0a4..5a8aa7f 100644 --- a/src/atom.rs +++ b/src/atom.rs @@ -7,7 +7,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use crate::dynamic_set::{Entry, DYNAMIC_SET}; +use crate::dynamic_set::{dynamic_set, Entry}; use crate::static_sets::StaticAtomSet; use debug_unreachable::debug_unreachable; @@ -82,6 +82,15 @@ pub struct Atom { phantom: PhantomData, } +// This isn't really correct as the Atoms can technically take up space. But I guess it's ok +// as it is possible to measure the size of the atom set separately/ +#[cfg(feature = "malloc_size_of")] +impl malloc_size_of::MallocSizeOf for Atom { + fn size_of(&self, _ops: &mut malloc_size_of::MallocSizeOfOps) -> usize { + 0 + } +} + // FIXME: bound removed from the struct definition before of this error for pack_static: // "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable" // https://github.com/rust-lang/rust/issues/57563 @@ -99,6 +108,25 @@ impl Atom { } } + /// For the atom!() macros + #[inline(always)] + #[doc(hidden)] + pub const fn pack_inline(mut n: u64, len: u8) -> Self { + if cfg!(target_endian = "big") { + // Reverse order of top 7 bytes. + // Bottom 8 bits of `n` are zero, and we need that to remain so. + // String data is stored in top 7 bytes, tag and length in bottom byte. + n = n.to_le() << 8; + } + + let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n; + Self { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } + fn tag(&self) -> u8 { (self.unsafe_data.get() & TAG_MASK) as u8 } @@ -186,21 +214,23 @@ impl Hash for Atom { impl<'a, Static: StaticAtomSet> From> for Atom { fn from(string_to_add: Cow<'a, str>) -> Self { - Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { - let len = string_to_add.len(); - if len <= MAX_INLINE_LEN { - let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); - { - let dest = inline_atom_slice_mut(&mut data); - dest[..len].copy_from_slice(string_to_add.as_bytes()) - } - Atom { - // INLINE_TAG ensures this is never zero - unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, - phantom: PhantomData, - } - } else { - let ptr: std::ptr::NonNull = DYNAMIC_SET.insert(string_to_add, hash.g); + let len = string_to_add.len(); + if len == 0 { + Self::pack_static(Static::empty_string_index()) + } else if len <= MAX_INLINE_LEN { + let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET); + { + let dest = inline_atom_slice_mut(&mut data); + dest[..len].copy_from_slice(string_to_add.as_bytes()); + } + Atom { + // INLINE_TAG ensures this is never zero + unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, + phantom: PhantomData, + } + } else { + Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| { + let ptr: std::ptr::NonNull = dynamic_set().insert(string_to_add, hash.g); let data = ptr.as_ptr() as u64; debug_assert!(0 == data & TAG_MASK); Atom { @@ -208,8 +238,8 @@ impl<'a, Static: StaticAtomSet> From> for Atom { unsafe_data: unsafe { NonZeroU64::new_unchecked(data) }, phantom: PhantomData, } - } - }) + }) + } } } @@ -236,7 +266,7 @@ impl Drop for Atom { // Out of line to guide inlining. fn drop_slow(this: &mut Atom) { - DYNAMIC_SET.remove(this.unsafe_data.get() as *mut Entry); + dynamic_set().remove(this.unsafe_data.get() as *mut Entry); } } } @@ -254,8 +284,9 @@ impl ops::Deref for Atom { } INLINE_TAG => { let len = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET; + debug_assert!(len as usize <= MAX_INLINE_LEN); let src = inline_atom_slice(&self.unsafe_data); - str::from_utf8_unchecked(&src[..(len as usize)]) + str::from_utf8_unchecked(src.get_unchecked(..(len as usize))) } STATIC_TAG => Static::get().atoms[self.static_index() as usize], _ => debug_unreachable!(), @@ -361,28 +392,24 @@ impl Atom { #[inline(always)] fn inline_atom_slice(x: &NonZeroU64) -> &[u8] { - unsafe { let x: *const NonZeroU64 = x; let mut data = x as *const u8; // All except the lowest byte, which is first in little-endian, last in big-endian. if cfg!(target_endian = "little") { - data = data.offset(1); + data = unsafe { data.offset(1) }; } let len = 7; - slice::from_raw_parts(data, len) - } + unsafe { slice::from_raw_parts(data, len) } } #[inline(always)] -fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { - unsafe { +fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] { let x: *mut u64 = x; let mut data = x as *mut u8; // All except the lowest byte, which is first in little-endian, last in big-endian. if cfg!(target_endian = "little") { - data = data.offset(1); + data = unsafe { data.offset(1) }; } let len = 7; - slice::from_raw_parts_mut(data, len) - } + unsafe { slice::from_raw_parts_mut(data, len) } } diff --git a/src/dynamic_set.rs b/src/dynamic_set.rs index 46e7a54..4442b4d 100644 --- a/src/dynamic_set.rs +++ b/src/dynamic_set.rs @@ -7,13 +7,13 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use once_cell::sync::Lazy; use parking_lot::Mutex; use std::borrow::Cow; use std::mem; use std::ptr::NonNull; use std::sync::atomic::AtomicIsize; use std::sync::atomic::Ordering::SeqCst; +use std::sync::OnceLock; const NB_BUCKETS: usize = 1 << 12; // 4096 const BUCKET_MASK: u32 = (1 << 12) - 1; @@ -38,16 +38,20 @@ fn entry_alignment_is_sufficient() { assert!(mem::align_of::() >= ENTRY_ALIGNMENT); } -pub(crate) static DYNAMIC_SET: Lazy = Lazy::new(|| { +pub(crate) fn dynamic_set() -> &'static Set { // NOTE: Using const initialization for buckets breaks the small-stack test. // ``` // // buckets: [Mutex>>; NB_BUCKETS], // const MUTEX: Mutex>> = Mutex::new(None); // let buckets = Box::new([MUTEX; NB_BUCKETS]); // ``` - let buckets = (0..NB_BUCKETS).map(|_| Mutex::new(None)).collect(); - Set { buckets } -}); + static DYNAMIC_SET: OnceLock = OnceLock::new(); + + DYNAMIC_SET.get_or_init(|| { + let buckets = (0..NB_BUCKETS).map(|_| Mutex::new(None)).collect(); + Set { buckets } + }) +} impl Set { pub(crate) fn insert(&self, string: Cow, hash: u32) -> NonNull { diff --git a/src/lib.rs b/src/lib.rs index 441cb4e..3cc29b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,10 +25,10 @@ //! In `Cargo.toml`: //! ```toml //! [dependencies] -//! string_cache = "0.8" +//! string_cache = "0.9" //! //! [dev-dependencies] -//! string_cache_codegen = "0.5" +//! string_cache_codegen = "0.6" //! ``` //! //! In `build.rs`: diff --git a/string-cache-codegen/Cargo.toml b/string-cache-codegen/Cargo.toml index 5eb5125..20eced9 100644 --- a/string-cache-codegen/Cargo.toml +++ b/string-cache-codegen/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "string_cache_codegen" -version = "0.5.2" # Also update ../README.md when making a semver-breaking change +version = "0.6.1" # Also update ../README.md when making a semver-breaking change authors = [ "The Servo Project Developers" ] description = "A codegen library for string-cache, developed as part of the Servo project." license = "MIT OR Apache-2.0" @@ -13,7 +13,7 @@ name = "string_cache_codegen" path = "lib.rs" [dependencies] -phf_generator = "0.10" -phf_shared = "0.10" +phf_generator = "0.13" +phf_shared = "0.13" proc-macro2 = "1" quote = "1" diff --git a/string-cache-codegen/lib.rs b/string-cache-codegen/lib.rs index 0fe4819..525ef3a 100644 --- a/string-cache-codegen/lib.rs +++ b/string-cache-codegen/lib.rs @@ -19,10 +19,10 @@ //! build = "build.rs" //! //! [dependencies] -//! string_cache = "0.8" +//! string_cache = "0.9" //! //! [build-dependencies] -//! string_cache_codegen = "0.5" +//! string_cache_codegen = "0.6" //! ``` //! //! In `build.rs`: @@ -68,8 +68,9 @@ #![recursion_limit = "128"] +use proc_macro2::Ident; use quote::quote; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufWriter, Write}; use std::path::Path; @@ -81,7 +82,7 @@ pub struct AtomType { static_set_doc: Option, macro_name: String, macro_doc: Option, - atoms: HashSet, + atoms: BTreeSet, } impl AtomType { @@ -114,7 +115,7 @@ impl AtomType { atom_doc: None, static_set_doc: None, macro_doc: None, - atoms: HashSet::new(), + atoms: BTreeSet::new(), } } @@ -181,20 +182,71 @@ impl AtomType { ) } + #[cfg(test)] + /// Write generated code to destination [`Vec`] and return it as [`String`] + /// + /// Used mostly for testing or displaying a value. + pub fn write_to_string(&mut self, mut destination: Vec) -> io::Result { + destination.write_all( + self.to_tokens() + .to_string() + // Insert some newlines to make the generated code slightly easier to read. + .replace(" [ \"", "[\n\"") + .replace("\" , ", "\",\n") + .replace(" ( \"", "\n( \"") + .replace("; ", ";\n") + .as_bytes(), + )?; + let str = String::from_utf8(destination).unwrap(); + Ok(str) + } + fn to_tokens(&mut self) -> proc_macro2::TokenStream { // `impl Default for Atom` requires the empty string to be in the static set. // This also makes sure the set in non-empty, // which would cause divisions by zero in rust-phf. self.atoms.insert(String::new()); - let atoms: Vec<&str> = self.atoms.iter().map(|s| &**s).collect(); - let hash_state = phf_generator::generate_hash(&atoms); + // Strings over 7 bytes + empty string added to static set. + // Otherwise stored inline. + let (static_strs, inline_strs): (Vec<_>, Vec<_>) = self + .atoms + .iter() + .map(String::as_str) + .partition(|s| s.len() > 7 || s.is_empty()); + + // Static strings + let hash_state = phf_generator::generate_hash(&static_strs); let phf_generator::HashState { key, disps, map } = hash_state; let (disps0, disps1): (Vec<_>, Vec<_>) = disps.into_iter().unzip(); - let atoms: Vec<&str> = map.iter().map(|&idx| atoms[idx]).collect(); + let atoms: Vec<&str> = map.iter().map(|&idx| static_strs[idx]).collect(); let empty_string_index = atoms.iter().position(|s| s.is_empty()).unwrap() as u32; let indices = 0..atoms.len() as u32; + fn is_valid_ident(name: &str) -> bool { + let begins_with_letter_or_underscore = name + .chars() + .next() + .is_some_and(|c| c.is_alphabetic() || c == '_'); + let is_alphanumeric = name.chars().all(|c| c.is_alphanumeric() || c == '_'); + + begins_with_letter_or_underscore && is_alphanumeric + } + + let atoms_for_idents: Vec<&str> = atoms + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let atom_idents: Vec = atoms_for_idents.iter().map(|atom| new_term(atom)).collect(); + + let istrs_for_idents: Vec<&str> = inline_strs + .iter() + .copied() + .filter(|x| is_valid_ident(x)) + .collect(); + let istr_idents: Vec = istrs_for_idents.iter().map(|atom| new_term(atom)).collect(); + let hashes: Vec = atoms .iter() .map(|string| { @@ -221,23 +273,51 @@ impl AtomType { Some(ref doc) => quote!(#[doc = #doc]), None => quote!(), }; - let new_term = - |string: &str| proc_macro2::Ident::new(string, proc_macro2::Span::call_site()); + fn new_term(string: &str) -> Ident { + Ident::new(string, proc_macro2::Span::call_site()) + } let static_set_name = new_term(&format!("{}StaticSet", type_name)); let type_name = new_term(type_name); let macro_name = new_term(&*self.macro_name); let module = module.parse::().unwrap(); let atom_prefix = format!("ATOM_{}_", type_name.to_string().to_uppercase()); - let const_names: Vec<_> = atoms + let new_const_name = |atom: &str| { + let mut name = atom_prefix.clone(); + for c in atom.chars() { + name.push_str(&format!("_{:02X}", c as u32)) + } + new_term(&name) + }; + let const_names: Vec<_> = atoms.iter().copied().map(new_const_name).collect(); + let ident_const_names: Vec<_> = atoms_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + let ident_inline_const_names: Vec<_> = istrs_for_idents + .iter() + .copied() + .map(new_const_name) + .collect(); + + // Inline strings + let (inline_const_names, inline_values_and_lengths): (Vec<_>, Vec<_>) = inline_strs .iter() - .map(|atom| { - let mut name = atom_prefix.clone(); - for c in atom.chars() { - name.push_str(&format!("_{:02X}", c as u32)) + .map(|s| { + let const_name = new_const_name(s); + + let mut value = 0u64; + for (index, c) in s.bytes().enumerate() { + value = value | ((c as u64) << (index * 8 + 8)); } - new_term(&name) + + let len = s.len() as u8; + + (const_name, (value, len)) }) - .collect(); + .unzip(); + let (inline_values, inline_lengths): (Vec<_>, Vec<_>) = + inline_values_and_lengths.into_iter().unzip(); quote! { #atom_doc @@ -265,6 +345,9 @@ impl AtomType { #( pub const #const_names: #type_name = #type_name::pack_static(#indices); )* + #( + pub const #inline_const_names: #type_name = #type_name::pack_inline(#inline_values, #inline_lengths); + )* #macro_doc #[macro_export] @@ -272,6 +355,15 @@ impl AtomType { #( (#atoms) => { #module::#const_names }; )* + #( + (#inline_strs) => { #module::#inline_const_names }; + )* + #( + (#atom_idents) => { #module::#ident_const_names }; + )* + #( + (#istr_idents) => { #module::#ident_inline_const_names }; + )* } } } @@ -284,3 +376,18 @@ impl AtomType { self.write_to(BufWriter::new(File::create(path)?)) } } + +#[test] +fn test_iteration_order() { + let x1 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x1"); + + let x2 = crate::AtomType::new("foo::Atom", "foo_atom!") + .atoms(&["x", "xlink", "svg", "test"]) + .write_to_string(Vec::new()) + .expect("write to string cache x2"); + + assert_eq!(x1, x2); +}