hstr/
lib.rs

1#![cfg_attr(feature = "atom_size_128", feature(integer_atomics))]
2//! See [Atom] for more information.
3
4use core::str;
5use std::{
6    fmt::{Debug, Display},
7    hash::Hash,
8    mem::{self, forget, transmute},
9    num::NonZeroU8,
10    ops::Deref,
11    str::from_utf8_unchecked,
12};
13
14use debug_unreachable::debug_unreachable;
15use once_cell::sync::Lazy;
16
17pub use crate::dynamic::{global_atom_store_gc, AtomStore};
18use crate::tagged_value::TaggedValue;
19
20mod dynamic;
21mod global_store;
22mod tagged_value;
23#[cfg(test)]
24mod tests;
25
26/// An immutable string which is cheap to clone, compare, hash, and has small
27/// size.
28///
29/// # Usecase
30///
31/// This type is designed for the compilers or build tools like a bundler
32/// written in Rust. String interning is much costly than simply allocating a
33/// string, but in compilers, hashing and comparison of strings are very
34/// frequent operations for some of strings, and the other strings are mostly
35/// just passed around. According to DoD, we should optimize types
36/// for operations that occur frequently, and this type is the result.
37///
38/// # Features
39///
40/// ## No mutex on creation and destruction.
41///
42/// This type also considers concurrent processing of AST nodes. Parsers
43/// generates lots of [Atom]s, so the creation of [Atom] should never involve a
44/// global mutex, because parsers are embarrassingly parallel. Also, the AST
45/// nodes are typically dropped in parallel. So [Drop] implementation of [Atom]
46/// should not involve a global mutex, too.
47///
48///
49/// ## Small size (One `u64`)
50///
51/// The most of strings are simply passed around, so the size of [Atom] should
52/// be small as possible.
53///
54/// ```rust
55/// # use std::mem::size_of;
56/// # if !cfg!(feature = "atom_size_128") {
57/// use hstr::Atom;
58/// assert!(size_of::<Atom>() == size_of::<u64>());
59/// assert!(size_of::<Option<Atom>>() == size_of::<u64>());
60/// # }
61/// ````
62///
63///
64/// ## Fast equality check (in most cases)
65///
66/// Equality comparison is O(1) in most cases. If two atoms are from the same
67/// [AtomStore], or they are from different stores but they are
68/// [`AtomStore::merge`]d, they are compared by numeric equality.
69///
70/// If two strings are created from different [AtomStore]s, they are compared
71/// using `strcmp` by default. But `hstr` allows you to make `==` faster -
72/// [`AtomStore::merge`].
73///
74///
75/// ## Fast [Hash] implementation
76///
77/// [Atom] precompute the hash value of long strings when they are created, so
78/// it is `O(1)` to compute hash.
79///
80///
81/// ## Small strings as inline data
82///
83/// Small strings are stored in the [Atom] itself without any allocation.
84///
85///
86/// ## Creating atoms
87///
88/// If you are working on a module which creates lots of [Atom]s, you are
89/// recommended to use [AtomStore] API because it's faster. But if you are not,
90/// you can use global APIs for convenience.
91///
92/// ## Dealloc
93///
94/// - Atoms stored in the local `AtomStore` have the same lifetime as the store
95///   itself, and will be deallocated when the store is dropped.
96/// - Atoms created via the `atom!` macro or `String::into` are stored in the
97///   global atom store. By default, these atoms are never deallocated. To clean
98///   up unused atoms, call [global_atom_store_gc].
99pub struct Atom {
100    // If this Atom is a dynamic one, this is *const Entry
101    unsafe_data: TaggedValue,
102}
103
104#[doc(hidden)]
105pub type CachedAtom = Lazy<Atom>;
106
107#[doc(hidden)]
108pub const fn inline_atom(s: &str) -> Option<Atom> {
109    dynamic::inline_atom(s)
110}
111
112/// Create an atom from a string literal. This atom is never dropped.
113#[macro_export]
114macro_rules! atom {
115    ($s:expr) => {{
116        const INLINE: ::core::option::Option<$crate::Atom> = $crate::inline_atom($s);
117        // This condition can be evaluated at compile time to enable inlining as a
118        // simple constant
119        if INLINE.is_some() {
120            INLINE.unwrap()
121        } else {
122            // Otherwise we use a
123            #[inline(never)]
124            fn get_atom() -> $crate::Atom {
125                static CACHE: $crate::CachedAtom =
126                    $crate::CachedAtom::new(|| $crate::Atom::from($s));
127
128                (*CACHE).clone()
129            }
130
131            get_atom()
132        }
133    }};
134}
135
136impl Default for Atom {
137    #[inline(never)]
138    fn default() -> Self {
139        atom!("")
140    }
141}
142
143/// Immutable, so it's safe to be shared between threads
144unsafe impl Send for Atom {}
145
146/// Immutable, so it's safe to be shared between threads
147unsafe impl Sync for Atom {}
148
149impl Display for Atom {
150    #[inline]
151    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
152        Display::fmt(self.as_str(), f)
153    }
154}
155
156impl Debug for Atom {
157    #[inline]
158    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
159        Debug::fmt(self.as_str(), f)
160    }
161}
162
163#[cfg(feature = "serde")]
164impl serde::ser::Serialize for Atom {
165    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
166    where
167        S: serde::ser::Serializer,
168    {
169        serializer.serialize_str(self)
170    }
171}
172
173#[cfg(feature = "serde")]
174impl<'de> serde::de::Deserialize<'de> for Atom {
175    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
176    where
177        D: serde::Deserializer<'de>,
178    {
179        String::deserialize(deserializer).map(Self::new)
180    }
181}
182const DYNAMIC_TAG: u8 = 0b_00;
183const INLINE_TAG: u8 = 0b_01; // len in upper nybble
184const INLINE_TAG_INIT: NonZeroU8 = unsafe { NonZeroU8::new_unchecked(INLINE_TAG) };
185const TAG_MASK: u8 = 0b_11;
186const LEN_OFFSET: usize = 4;
187const LEN_MASK: u8 = 0xf0;
188
189// const STATIC_SHIFT_BITS: usize = 32;
190
191impl Atom {
192    #[inline(always)]
193    pub fn new<S>(s: S) -> Self
194    where
195        Self: From<S>,
196    {
197        Self::from(s)
198    }
199
200    #[inline(always)]
201    fn tag(&self) -> u8 {
202        self.unsafe_data.tag() & TAG_MASK
203    }
204
205    /// Return true if this is a dynamic Atom.
206    #[inline(always)]
207    fn is_dynamic(&self) -> bool {
208        self.tag() == DYNAMIC_TAG
209    }
210}
211
212impl Atom {
213    fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self {
214        let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit();
215        let buffer = unsafe { &mut *buffer.as_mut_ptr() };
216
217        if let Some(buffer_prefix) = buffer.get_mut(..s.len()) {
218            buffer_prefix.copy_from_slice(s.as_bytes());
219            let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) };
220            f(as_str);
221            Atom::from(&*as_str)
222        } else {
223            let mut string = s.to_owned();
224            f(&mut string);
225            Atom::from(string)
226        }
227    }
228
229    /// Like [`to_ascii_uppercase`].
230    ///
231    /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase
232    pub fn to_ascii_uppercase(&self) -> Self {
233        for (i, b) in self.bytes().enumerate() {
234            if let b'a'..=b'z' = b {
235                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase());
236            }
237        }
238        self.clone()
239    }
240
241    /// Like [`to_ascii_lowercase`].
242    ///
243    /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase
244    pub fn to_ascii_lowercase(&self) -> Self {
245        for (i, b) in self.bytes().enumerate() {
246            if let b'A'..=b'Z' = b {
247                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase());
248            }
249        }
250        self.clone()
251    }
252}
253
254impl Atom {
255    fn get_hash(&self) -> u64 {
256        match self.tag() {
257            DYNAMIC_TAG => {
258                unsafe { crate::dynamic::deref_from(self.unsafe_data) }
259                    .header
260                    .header
261                    .hash
262            }
263            INLINE_TAG => {
264                // This is passed as input to the caller's `Hasher` implementation, so it's okay
265                // that this isn't really a hash
266                self.unsafe_data.hash()
267            }
268            _ => unsafe { debug_unreachable!() },
269        }
270    }
271
272    fn as_str(&self) -> &str {
273        match self.tag() {
274            DYNAMIC_TAG => unsafe {
275                let item = crate::dynamic::deref_from(self.unsafe_data);
276                from_utf8_unchecked(transmute::<&[u8], &'static [u8]>(&item.slice))
277            },
278            INLINE_TAG => {
279                let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET;
280                let src = self.unsafe_data.data();
281                unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
282            }
283            _ => unsafe { debug_unreachable!() },
284        }
285    }
286}
287
288#[cfg(test)]
289impl Atom {
290    pub(crate) fn ref_count(&self) -> usize {
291        match self.tag() {
292            DYNAMIC_TAG => {
293                let ptr = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
294
295                triomphe::ThinArc::strong_count(&ptr.0)
296            }
297            _ => 1,
298        }
299    }
300}
301
302impl PartialEq for Atom {
303    #[inline(never)]
304    fn eq(&self, other: &Self) -> bool {
305        if self.unsafe_data == other.unsafe_data {
306            return true;
307        }
308
309        // If one is inline and the other is not, the length is different.
310        // If one is static and the other is not, it's different.
311        if self.tag() != other.tag() {
312            return false;
313        }
314
315        if self.is_dynamic() && other.is_dynamic() {
316            let te = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
317            let oe = unsafe { crate::dynamic::deref_from(other.unsafe_data) };
318
319            if te.header.header.hash != oe.header.header.hash {
320                return false;
321            }
322
323            return te.slice == oe.slice;
324        }
325
326        if self.get_hash() != other.get_hash() {
327            return false;
328        }
329
330        // If the store is different, the string may be the same, even though the
331        // `unsafe_data` is different
332        self.as_str() == other.as_str()
333    }
334}
335
336impl Eq for Atom {}
337
338impl Hash for Atom {
339    #[inline(always)]
340    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
341        state.write_u64(self.get_hash());
342    }
343}
344
345impl Drop for Atom {
346    #[inline(always)]
347    fn drop(&mut self) {
348        if self.is_dynamic() {
349            unsafe { drop(crate::dynamic::restore_arc(self.unsafe_data)) }
350        }
351    }
352}
353
354impl Clone for Atom {
355    #[inline(always)]
356    fn clone(&self) -> Self {
357        Self::from_alias(self.unsafe_data)
358    }
359}
360
361impl Atom {
362    #[inline]
363    pub(crate) fn from_alias(alias: TaggedValue) -> Self {
364        if alias.tag() & TAG_MASK == DYNAMIC_TAG {
365            unsafe {
366                let arc = crate::dynamic::restore_arc(alias);
367                forget(arc.clone());
368                forget(arc);
369            }
370        }
371
372        Self { unsafe_data: alias }
373    }
374}
375
376impl Deref for Atom {
377    type Target = str;
378
379    #[inline(always)]
380    fn deref(&self) -> &Self::Target {
381        self.as_str()
382    }
383}
384
385impl AsRef<str> for Atom {
386    #[inline(always)]
387    fn as_ref(&self) -> &str {
388        self.as_str()
389    }
390}
391
392impl PartialEq<str> for Atom {
393    #[inline]
394    fn eq(&self, other: &str) -> bool {
395        self.as_str() == other
396    }
397}
398
399impl PartialEq<&'_ str> for Atom {
400    #[inline]
401    fn eq(&self, other: &&str) -> bool {
402        self.as_str() == *other
403    }
404}
405
406impl PartialEq<Atom> for str {
407    #[inline]
408    fn eq(&self, other: &Atom) -> bool {
409        self == other.as_str()
410    }
411}
412
413/// NOT A PUBLIC API
414#[cfg(feature = "rkyv")]
415impl rkyv::Archive for Atom {
416    type Archived = rkyv::string::ArchivedString;
417    type Resolver = rkyv::string::StringResolver;
418
419    #[allow(clippy::unit_arg)]
420    unsafe fn resolve(&self, pos: usize, resolver: Self::Resolver, out: *mut Self::Archived) {
421        rkyv::string::ArchivedString::resolve_from_str(self, pos, resolver, out)
422    }
423}
424
425/// NOT A PUBLIC API
426#[cfg(feature = "rkyv")]
427impl<S: rkyv::ser::Serializer + ?Sized> rkyv::Serialize<S> for Atom {
428    fn serialize(&self, serializer: &mut S) -> Result<Self::Resolver, S::Error> {
429        String::serialize(&self.to_string(), serializer)
430    }
431}
432
433/// NOT A PUBLIC API
434#[cfg(feature = "rkyv")]
435impl<D> rkyv::Deserialize<Atom, D> for rkyv::string::ArchivedString
436where
437    D: ?Sized + rkyv::Fallible,
438{
439    fn deserialize(&self, deserializer: &mut D) -> Result<Atom, <D as rkyv::Fallible>::Error> {
440        let s: String = self.deserialize(deserializer)?;
441
442        Ok(Atom::new(s))
443    }
444}
445
446#[cfg(test)]
447mod macro_tests {
448
449    use super::*;
450    #[test]
451    fn test_atom() {
452        // Test enough to exceed the small string optimization
453        assert_eq!(atom!(""), Atom::default());
454        assert_eq!(atom!(""), Atom::from(""));
455        assert_eq!(atom!("a"), Atom::from("a"));
456        assert_eq!(atom!("ab"), Atom::from("ab"));
457        assert_eq!(atom!("abc"), Atom::from("abc"));
458        assert_eq!(atom!("abcd"), Atom::from("abcd"));
459        assert_eq!(atom!("abcde"), Atom::from("abcde"));
460        assert_eq!(atom!("abcdef"), Atom::from("abcdef"));
461        assert_eq!(atom!("abcdefg"), Atom::from("abcdefg"));
462        assert_eq!(atom!("abcdefgh"), Atom::from("abcdefgh"));
463        assert_eq!(atom!("abcdefghi"), Atom::from("abcdefghi"));
464    }
465
466    #[test]
467    fn test_inline_atom() {
468        // This is a silly test, just asserts that rustc can evaluate the pattern from
469        // our macro in a constant context.
470        const STR: Atom = {
471            let inline = inline_atom("hello");
472            if inline.is_some() {
473                inline.unwrap()
474            } else {
475                unreachable!();
476            }
477        };
478        assert_eq!(STR, Atom::from("hello"));
479    }
480}