hstr/
lib.rs

1#![cfg_attr(feature = "atom_size_128", feature(integer_atomics))]
2//! See [Atom] for more information.
3
4use core::str;
5use std::{
6    fmt::{Debug, Display},
7    hash::Hash,
8    mem::{self, forget, transmute, ManuallyDrop},
9    num::NonZeroU8,
10    ops::Deref,
11    str::from_utf8_unchecked,
12};
13
14use debug_unreachable::debug_unreachable;
15use once_cell::sync::Lazy;
16
17pub use crate::dynamic::{global_atom_store_gc, AtomStore};
18use crate::{
19    macros::{get_hash, impl_from_alias, partial_eq},
20    tagged_value::TaggedValue,
21};
22
23mod dynamic;
24mod global_store;
25mod macros;
26mod tagged_value;
27#[cfg(test)]
28mod tests;
29pub mod wtf8;
30mod wtf8_atom;
31
32pub use wtf8_atom::Wtf8Atom;
33
34/// An immutable string which is cheap to clone, compare, hash, and has small
35/// size.
36///
37/// # Usecase
38///
39/// This type is designed for the compilers or build tools like a bundler
40/// written in Rust. String interning is much costly than simply allocating a
41/// string, but in compilers, hashing and comparison of strings are very
42/// frequent operations for some of strings, and the other strings are mostly
43/// just passed around. According to DoD, we should optimize types
44/// for operations that occur frequently, and this type is the result.
45///
46/// # Features
47///
48/// ## No mutex on creation and destruction.
49///
50/// This type also considers concurrent processing of AST nodes. Parsers
51/// generates lots of [Atom]s, so the creation of [Atom] should never involve a
52/// global mutex, because parsers are embarrassingly parallel. Also, the AST
53/// nodes are typically dropped in parallel. So [Drop] implementation of [Atom]
54/// should not involve a global mutex, too.
55///
56///
57/// ## Small size (One `u64`)
58///
59/// The most of strings are simply passed around, so the size of [Atom] should
60/// be small as possible.
61///
62/// ```rust
63/// # use std::mem::size_of;
64/// # if !cfg!(feature = "atom_size_128") {
65/// use hstr::Atom;
66/// assert!(size_of::<Atom>() == size_of::<u64>());
67/// assert!(size_of::<Option<Atom>>() == size_of::<u64>());
68/// # }
69/// ````
70///
71///
72/// ## Fast equality check (in most cases)
73///
74/// Equality comparison is O(1) in most cases. If two atoms are from the same
75/// [AtomStore], or they are from different stores but they are
76/// [`AtomStore::merge`]d, they are compared by numeric equality.
77///
78/// If two strings are created from different [AtomStore]s, they are compared
79/// using `strcmp` by default. But `hstr` allows you to make `==` faster -
80/// [`AtomStore::merge`].
81///
82///
83/// ## Fast [Hash] implementation
84///
85/// [Atom] precompute the hash value of long strings when they are created, so
86/// it is `O(1)` to compute hash.
87///
88///
89/// ## Small strings as inline data
90///
91/// Small strings are stored in the [Atom] itself without any allocation.
92///
93///
94/// ## Creating atoms
95///
96/// If you are working on a module which creates lots of [Atom]s, you are
97/// recommended to use [AtomStore] API because it's faster. But if you are not,
98/// you can use global APIs for convenience.
99///
100/// ## Dealloc
101///
102/// - Atoms stored in the local `AtomStore` have the same lifetime as the store
103///   itself, and will be deallocated when the store is dropped.
104/// - Atoms created via the `atom!` macro or `String::into` are stored in the
105///   global atom store. By default, these atoms are never deallocated. To clean
106///   up unused atoms, call [global_atom_store_gc].
107pub struct Atom {
108    // If this Atom is a dynamic one, this is *const Entry
109    unsafe_data: TaggedValue,
110}
111
112#[doc(hidden)]
113pub type CachedAtom = Lazy<Atom>;
114
115#[doc(hidden)]
116pub const fn inline_atom(s: &str) -> Option<Atom> {
117    dynamic::inline_atom(s)
118}
119
120/// Create an atom from a string literal. This atom is never dropped.
121#[macro_export]
122macro_rules! atom {
123    ($s:expr) => {{
124        const INLINE: ::core::option::Option<$crate::Atom> = $crate::inline_atom($s);
125        // This condition can be evaluated at compile time to enable inlining as a
126        // simple constant
127        if INLINE.is_some() {
128            INLINE.unwrap()
129        } else {
130            // Otherwise we use a
131            #[inline(never)]
132            fn get_atom() -> $crate::Atom {
133                static CACHE: $crate::CachedAtom =
134                    $crate::CachedAtom::new(|| $crate::Atom::from($s));
135
136                (*CACHE).clone()
137            }
138
139            get_atom()
140        }
141    }};
142}
143
144impl Default for Atom {
145    #[inline(never)]
146    fn default() -> Self {
147        atom!("")
148    }
149}
150
151/// Immutable, so it's safe to be shared between threads
152unsafe impl Send for Atom {}
153
154/// Immutable, so it's safe to be shared between threads
155unsafe impl Sync for Atom {}
156
157impl Display for Atom {
158    #[inline]
159    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
160        Display::fmt(self.as_str(), f)
161    }
162}
163
164impl Debug for Atom {
165    #[inline]
166    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167        Debug::fmt(self.as_str(), f)
168    }
169}
170
171#[cfg(feature = "serde")]
172impl serde::ser::Serialize for Atom {
173    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
174    where
175        S: serde::ser::Serializer,
176    {
177        serializer.serialize_str(self)
178    }
179}
180
181#[cfg(feature = "serde")]
182impl<'de> serde::de::Deserialize<'de> for Atom {
183    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
184    where
185        D: serde::Deserializer<'de>,
186    {
187        String::deserialize(deserializer).map(Self::new)
188    }
189}
190const DYNAMIC_TAG: u8 = 0b_00;
191const INLINE_TAG: u8 = 0b_01; // len in upper nybble
192const INLINE_TAG_INIT: NonZeroU8 = unsafe { NonZeroU8::new_unchecked(INLINE_TAG) };
193const TAG_MASK: u8 = 0b_11;
194const LEN_OFFSET: usize = 4;
195const LEN_MASK: u8 = 0xf0;
196
197// const STATIC_SHIFT_BITS: usize = 32;
198
199impl Atom {
200    #[inline(always)]
201    pub fn new<S>(s: S) -> Self
202    where
203        Self: From<S>,
204    {
205        Self::from(s)
206    }
207
208    #[inline(always)]
209    fn tag(&self) -> u8 {
210        self.unsafe_data.tag() & TAG_MASK
211    }
212
213    /// Return true if this is a dynamic Atom.
214    #[inline(always)]
215    fn is_dynamic(&self) -> bool {
216        self.tag() == DYNAMIC_TAG
217    }
218}
219
220impl Atom {
221    fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self {
222        let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit();
223        let buffer = unsafe { &mut *buffer.as_mut_ptr() };
224
225        if let Some(buffer_prefix) = buffer.get_mut(..s.len()) {
226            buffer_prefix.copy_from_slice(s.as_bytes());
227            let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) };
228            f(as_str);
229            Atom::from(&*as_str)
230        } else {
231            let mut string = s.to_owned();
232            f(&mut string);
233            Atom::from(string)
234        }
235    }
236
237    /// Like [`to_ascii_uppercase`].
238    ///
239    /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase
240    pub fn to_ascii_uppercase(&self) -> Self {
241        for (i, b) in self.bytes().enumerate() {
242            if let b'a'..=b'z' = b {
243                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase());
244            }
245        }
246        self.clone()
247    }
248
249    /// Like [`to_ascii_lowercase`].
250    ///
251    /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase
252    pub fn to_ascii_lowercase(&self) -> Self {
253        for (i, b) in self.bytes().enumerate() {
254            if let b'A'..=b'Z' = b {
255                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase());
256            }
257        }
258        self.clone()
259    }
260}
261
262impl Atom {
263    fn get_hash(&self) -> u64 {
264        get_hash!(self)
265    }
266
267    fn as_str(&self) -> &str {
268        match self.tag() {
269            DYNAMIC_TAG => unsafe {
270                let item = crate::dynamic::deref_from(self.unsafe_data);
271                from_utf8_unchecked(transmute::<&[u8], &'static [u8]>(&item.slice))
272            },
273            INLINE_TAG => {
274                let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET;
275                let src = self.unsafe_data.data();
276                unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
277            }
278            _ => unsafe { debug_unreachable!() },
279        }
280    }
281}
282
283#[cfg(test)]
284impl Atom {
285    pub(crate) fn ref_count(&self) -> usize {
286        match self.tag() {
287            DYNAMIC_TAG => {
288                let ptr = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
289
290                triomphe::ThinArc::strong_count(&ptr.0)
291            }
292            _ => 1,
293        }
294    }
295}
296
297impl PartialEq for Atom {
298    #[inline(never)]
299    fn eq(&self, other: &Self) -> bool {
300        partial_eq!(self, other);
301
302        // If the store is different, the string may be the same, even though the
303        // `unsafe_data` is different
304        self.as_str() == other.as_str()
305    }
306}
307
308impl Eq for Atom {}
309
310impl Hash for Atom {
311    #[inline(always)]
312    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
313        state.write_u64(self.get_hash());
314    }
315}
316
317impl Drop for Atom {
318    #[inline(always)]
319    fn drop(&mut self) {
320        if self.is_dynamic() {
321            unsafe { drop(crate::dynamic::restore_arc(self.unsafe_data)) }
322        }
323    }
324}
325
326impl Clone for Atom {
327    #[inline(always)]
328    fn clone(&self) -> Self {
329        Self::from_alias(self.unsafe_data)
330    }
331}
332
333impl_from_alias!(Atom);
334
335impl Deref for Atom {
336    type Target = str;
337
338    #[inline(always)]
339    fn deref(&self) -> &Self::Target {
340        self.as_str()
341    }
342}
343
344impl AsRef<str> for Atom {
345    #[inline(always)]
346    fn as_ref(&self) -> &str {
347        self.as_str()
348    }
349}
350
351impl PartialEq<str> for Atom {
352    #[inline]
353    fn eq(&self, other: &str) -> bool {
354        self.as_str() == other
355    }
356}
357
358impl PartialEq<&'_ str> for Atom {
359    #[inline]
360    fn eq(&self, other: &&str) -> bool {
361        self.as_str() == *other
362    }
363}
364
365impl PartialEq<Atom> for str {
366    #[inline]
367    fn eq(&self, other: &Atom) -> bool {
368        self == other.as_str()
369    }
370}
371
372/// NOT A PUBLIC API
373#[cfg(feature = "rkyv")]
374impl rkyv::Archive for Atom {
375    type Archived = rkyv::string::ArchivedString;
376    type Resolver = rkyv::string::StringResolver;
377
378    #[allow(clippy::unit_arg)]
379    unsafe fn resolve(&self, pos: usize, resolver: Self::Resolver, out: *mut Self::Archived) {
380        rkyv::string::ArchivedString::resolve_from_str(self, pos, resolver, out)
381    }
382}
383
384/// NOT A PUBLIC API
385#[cfg(feature = "rkyv")]
386impl<S: rkyv::ser::Serializer + ?Sized> rkyv::Serialize<S> for Atom {
387    fn serialize(&self, serializer: &mut S) -> Result<Self::Resolver, S::Error> {
388        String::serialize(&self.to_string(), serializer)
389    }
390}
391
392/// NOT A PUBLIC API
393#[cfg(feature = "rkyv")]
394impl<D> rkyv::Deserialize<Atom, D> for rkyv::string::ArchivedString
395where
396    D: ?Sized + rkyv::Fallible,
397{
398    fn deserialize(&self, deserializer: &mut D) -> Result<Atom, <D as rkyv::Fallible>::Error> {
399        let s: String = self.deserialize(deserializer)?;
400
401        Ok(Atom::new(s))
402    }
403}
404
405impl Atom {
406    /// Converts a WTF-8 encoded [Wtf8Atom] to a regular UTF-8 [Atom] without
407    /// validation.
408    ///
409    /// # Safety
410    ///
411    /// The caller must ensure that the WTF-8 atom contains only valid UTF-8
412    /// data (no unpaired surrogates). This function performs no validation
413    /// and will create an invalid `Atom` if the input contains unpaired
414    /// surrogates.
415    ///
416    /// This is a zero-cost conversion that preserves all internal optimizations
417    /// (inline storage, precomputed hashes, etc.) since both types have
418    /// identical internal representation.
419    pub unsafe fn from_wtf8_unchecked(s: Wtf8Atom) -> Self {
420        let s = ManuallyDrop::new(s);
421        Atom {
422            unsafe_data: s.unsafe_data,
423        }
424    }
425}
426
427#[cfg(test)]
428mod macro_tests {
429
430    use super::*;
431    #[test]
432    fn test_atom() {
433        // Test enough to exceed the small string optimization
434        assert_eq!(atom!(""), Atom::default());
435        assert_eq!(atom!(""), Atom::from(""));
436        assert_eq!(atom!("a"), Atom::from("a"));
437        assert_eq!(atom!("ab"), Atom::from("ab"));
438        assert_eq!(atom!("abc"), Atom::from("abc"));
439        assert_eq!(atom!("abcd"), Atom::from("abcd"));
440        assert_eq!(atom!("abcde"), Atom::from("abcde"));
441        assert_eq!(atom!("abcdef"), Atom::from("abcdef"));
442        assert_eq!(atom!("abcdefg"), Atom::from("abcdefg"));
443        assert_eq!(atom!("abcdefgh"), Atom::from("abcdefgh"));
444        assert_eq!(atom!("abcdefghi"), Atom::from("abcdefghi"));
445    }
446
447    #[test]
448    fn test_inline_atom() {
449        // This is a silly test, just asserts that rustc can evaluate the pattern from
450        // our macro in a constant context.
451        const STR: Atom = {
452            let inline = inline_atom("hello");
453            if inline.is_some() {
454                inline.unwrap()
455            } else {
456                unreachable!();
457            }
458        };
459        assert_eq!(STR, Atom::from("hello"));
460    }
461}