hstr/
lib.rs

1#![cfg_attr(feature = "atom_size_128", feature(integer_atomics))]
2//! See [Atom] for more information.
3
4use core::str;
5use std::{
6    fmt::{Debug, Display},
7    hash::Hash,
8    mem::{self, forget, transmute},
9    num::NonZeroU8,
10    ops::Deref,
11    str::from_utf8_unchecked,
12};
13
14use debug_unreachable::debug_unreachable;
15use once_cell::sync::Lazy;
16
17pub use crate::dynamic::AtomStore;
18use crate::tagged_value::TaggedValue;
19
20mod dynamic;
21mod global_store;
22mod tagged_value;
23#[cfg(test)]
24mod tests;
25
26/// An immutable string which is cheap to clone, compare, hash, and has small
27/// size.
28///
29/// # Usecase
30///
31/// This type is designed for the compilers or build tools like a bundler
32/// written in Rust. String interning is much costly than simply allocating a
33/// string, but in compilers, hashing and comparison of strings are very
34/// frequent operations for some of strings, and the other strings are mostly
35/// just passed around. According to DoD, we should optimize types
36/// for operations that occur frequently, and this type is the result.
37///
38/// # Features
39///
40/// ## No mutex on creation and destruction.
41///
42/// This type also considers concurrent processing of AST nodes. Parsers
43/// generates lots of [Atom]s, so the creation of [Atom] should never involve a
44/// global mutex, because parsers are embarrassingly parallel. Also, the AST
45/// nodes are typically dropped in parallel. So [Drop] implementation of [Atom]
46/// should not involve a global mutex, too.
47///
48///
49/// ## Small size (One `u64`)
50///
51/// The most of strings are simply passed around, so the size of [Atom] should
52/// be small as possible.
53///
54/// ```rust
55/// # use std::mem::size_of;
56/// # if !cfg!(feature = "atom_size_128") {
57/// use hstr::Atom;
58/// assert!(size_of::<Atom>() == size_of::<u64>());
59/// assert!(size_of::<Option<Atom>>() == size_of::<u64>());
60/// # }
61/// ````
62///
63///
64/// ## Fast equality check (in most cases)
65///
66/// Equality comparison is O(1) in most cases. If two atoms are from the same
67/// [AtomStore], or they are from different stores but they are
68/// [`AtomStore::merge`]d, they are compared by numeric equality.
69///
70/// If two strings are created from different [AtomStore]s, they are compared
71/// using `strcmp` by default. But `hstr` allows you to make `==` faster -
72/// [`AtomStore::merge`].
73///
74///
75/// ## Fast [Hash] implementation
76///
77/// [Atom] precompute the hash value of long strings when they are created, so
78/// it is `O(1)` to compute hash.
79///
80///
81/// ## Small strings as inline data
82///
83/// Small strings are stored in the [Atom] itself without any allocation.
84///
85///
86/// # Creating atoms
87///
88/// If you are working on a module which creates lots of [Atom]s, you are
89/// recommended to use [AtomStore] API because it's faster. But if you are not,
90/// you can use global APIs for convenience.
91pub struct Atom {
92    // If this Atom is a dynamic one, this is *const Entry
93    unsafe_data: TaggedValue,
94}
95
96#[doc(hidden)]
97pub type CachedAtom = Lazy<Atom>;
98
99/// Create an atom from a string literal. This atom is never dropped.
100#[macro_export]
101macro_rules! atom {
102    ($s:tt) => {{
103        #[inline(never)]
104        fn get_atom() -> $crate::Atom {
105            static CACHE: $crate::CachedAtom = $crate::CachedAtom::new(|| $crate::Atom::from($s));
106
107            (*CACHE).clone()
108        }
109
110        get_atom()
111    }};
112}
113
114impl Default for Atom {
115    #[inline(never)]
116    fn default() -> Self {
117        atom!("")
118    }
119}
120
121/// Immutable, so it's safe to be shared between threads
122unsafe impl Send for Atom {}
123
124/// Immutable, so it's safe to be shared between threads
125unsafe impl Sync for Atom {}
126
127impl Display for Atom {
128    #[inline]
129    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130        Display::fmt(self.as_str(), f)
131    }
132}
133
134impl Debug for Atom {
135    #[inline]
136    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
137        Debug::fmt(self.as_str(), f)
138    }
139}
140
141#[cfg(feature = "serde")]
142impl serde::ser::Serialize for Atom {
143    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
144    where
145        S: serde::ser::Serializer,
146    {
147        serializer.serialize_str(self)
148    }
149}
150
151#[cfg(feature = "serde")]
152impl<'de> serde::de::Deserialize<'de> for Atom {
153    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
154    where
155        D: serde::Deserializer<'de>,
156    {
157        String::deserialize(deserializer).map(Self::new)
158    }
159}
160const DYNAMIC_TAG: u8 = 0b_00;
161const INLINE_TAG: u8 = 0b_01; // len in upper nybble
162const INLINE_TAG_INIT: NonZeroU8 = unsafe { NonZeroU8::new_unchecked(INLINE_TAG) };
163const STATIC_TAG: u8 = 0b_10;
164const TAG_MASK: u8 = 0b_11;
165const LEN_OFFSET: usize = 4;
166const LEN_MASK: u8 = 0xf0;
167
168// const STATIC_SHIFT_BITS: usize = 32;
169
170impl Atom {
171    #[inline(always)]
172    pub fn new<S>(s: S) -> Self
173    where
174        Self: From<S>,
175    {
176        Self::from(s)
177    }
178
179    #[inline(always)]
180    fn tag(&self) -> u8 {
181        self.unsafe_data.tag() & TAG_MASK
182    }
183
184    /// Return true if this is a dynamic Atom.
185    #[inline(always)]
186    fn is_dynamic(&self) -> bool {
187        self.tag() == DYNAMIC_TAG
188    }
189}
190
191impl Atom {
192    fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self {
193        let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit();
194        let buffer = unsafe { &mut *buffer.as_mut_ptr() };
195
196        if let Some(buffer_prefix) = buffer.get_mut(..s.len()) {
197            buffer_prefix.copy_from_slice(s.as_bytes());
198            let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) };
199            f(as_str);
200            Atom::from(&*as_str)
201        } else {
202            let mut string = s.to_owned();
203            f(&mut string);
204            Atom::from(string)
205        }
206    }
207
208    /// Like [`to_ascii_uppercase`].
209    ///
210    /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase
211    pub fn to_ascii_uppercase(&self) -> Self {
212        for (i, b) in self.bytes().enumerate() {
213            if let b'a'..=b'z' = b {
214                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase());
215            }
216        }
217        self.clone()
218    }
219
220    /// Like [`to_ascii_lowercase`].
221    ///
222    /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase
223    pub fn to_ascii_lowercase(&self) -> Self {
224        for (i, b) in self.bytes().enumerate() {
225            if let b'A'..=b'Z' = b {
226                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase());
227            }
228        }
229        self.clone()
230    }
231}
232
233impl Atom {
234    #[inline(never)]
235    fn get_hash(&self) -> u64 {
236        match self.tag() {
237            DYNAMIC_TAG => {
238                unsafe { crate::dynamic::deref_from(self.unsafe_data) }
239                    .header
240                    .header
241                    .header
242                    .hash
243            }
244            STATIC_TAG => {
245                todo!("static hash")
246            }
247            INLINE_TAG => {
248                // This is passed as input to the caller's `Hasher` implementation, so it's okay
249                // that this isn't really a hash
250                self.unsafe_data.hash()
251            }
252            _ => unsafe { debug_unreachable!() },
253        }
254    }
255
256    #[inline(never)]
257    fn as_str(&self) -> &str {
258        match self.tag() {
259            DYNAMIC_TAG => unsafe {
260                let item = crate::dynamic::deref_from(self.unsafe_data);
261                from_utf8_unchecked(transmute::<&[u8], &'static [u8]>(&item.slice))
262            },
263            STATIC_TAG => {
264                todo!("static as_str")
265            }
266            INLINE_TAG => {
267                let len = (self.unsafe_data.tag() & LEN_MASK) >> LEN_OFFSET;
268                let src = self.unsafe_data.data();
269                unsafe { std::str::from_utf8_unchecked(&src[..(len as usize)]) }
270            }
271            _ => unsafe { debug_unreachable!() },
272        }
273    }
274}
275
276#[cfg(test)]
277impl Atom {
278    pub(crate) fn ref_count(&self) -> usize {
279        match self.tag() {
280            DYNAMIC_TAG => {
281                let ptr = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
282
283                triomphe::ThinArc::strong_count(&ptr.0)
284            }
285            _ => 1,
286        }
287    }
288}
289
290impl PartialEq for Atom {
291    #[inline(never)]
292    fn eq(&self, other: &Self) -> bool {
293        if self.unsafe_data == other.unsafe_data {
294            return true;
295        }
296
297        // If one is inline and the other is not, the length is different.
298        // If one is static and the other is not, it's different.
299        if self.tag() != other.tag() {
300            return false;
301        }
302
303        if self.is_dynamic() && other.is_dynamic() {
304            let te = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
305            let oe = unsafe { crate::dynamic::deref_from(other.unsafe_data) };
306
307            if te.header.header.header.hash != oe.header.header.header.hash {
308                return false;
309            }
310
311            return te.slice == oe.slice;
312        }
313
314        if self.get_hash() != other.get_hash() {
315            return false;
316        }
317
318        // If the store is different, the string may be the same, even though the
319        // `unsafe_data` is different
320        self.as_str() == other.as_str()
321    }
322}
323
324impl Eq for Atom {}
325
326impl Hash for Atom {
327    #[inline(always)]
328    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
329        state.write_u64(self.get_hash());
330    }
331}
332
333impl Drop for Atom {
334    #[inline(always)]
335    fn drop(&mut self) {
336        if self.is_dynamic() {
337            unsafe { drop(crate::dynamic::restore_arc(self.unsafe_data)) }
338        }
339    }
340}
341
342impl Clone for Atom {
343    #[inline(always)]
344    fn clone(&self) -> Self {
345        Self::from_alias(self.unsafe_data)
346    }
347}
348
349impl Atom {
350    #[inline]
351    pub(crate) fn from_alias(alias: TaggedValue) -> Self {
352        if alias.tag() & TAG_MASK == DYNAMIC_TAG {
353            unsafe {
354                let arc = crate::dynamic::restore_arc(alias);
355                forget(arc.clone());
356                forget(arc);
357            }
358        }
359
360        Self { unsafe_data: alias }
361    }
362}
363
364impl Deref for Atom {
365    type Target = str;
366
367    #[inline(always)]
368    fn deref(&self) -> &Self::Target {
369        self.as_str()
370    }
371}
372
373impl AsRef<str> for Atom {
374    #[inline(always)]
375    fn as_ref(&self) -> &str {
376        self.as_str()
377    }
378}
379
380impl PartialEq<str> for Atom {
381    #[inline]
382    fn eq(&self, other: &str) -> bool {
383        self.as_str() == other
384    }
385}
386
387impl PartialEq<&'_ str> for Atom {
388    #[inline]
389    fn eq(&self, other: &&str) -> bool {
390        self.as_str() == *other
391    }
392}
393
394impl PartialEq<Atom> for str {
395    #[inline]
396    fn eq(&self, other: &Atom) -> bool {
397        self == other.as_str()
398    }
399}
400
401/// NOT A PUBLIC API
402#[cfg(feature = "rkyv")]
403impl rkyv::Archive for Atom {
404    type Archived = rkyv::string::ArchivedString;
405    type Resolver = rkyv::string::StringResolver;
406
407    #[allow(clippy::unit_arg)]
408    unsafe fn resolve(&self, pos: usize, resolver: Self::Resolver, out: *mut Self::Archived) {
409        rkyv::string::ArchivedString::resolve_from_str(self, pos, resolver, out)
410    }
411}
412
413/// NOT A PUBLIC API
414#[cfg(feature = "rkyv")]
415impl<S: rkyv::ser::Serializer + ?Sized> rkyv::Serialize<S> for Atom {
416    fn serialize(&self, serializer: &mut S) -> Result<Self::Resolver, S::Error> {
417        String::serialize(&self.to_string(), serializer)
418    }
419}
420
421/// NOT A PUBLIC API
422#[cfg(feature = "rkyv")]
423impl<D> rkyv::Deserialize<Atom, D> for rkyv::string::ArchivedString
424where
425    D: ?Sized + rkyv::Fallible,
426{
427    fn deserialize(&self, deserializer: &mut D) -> Result<Atom, <D as rkyv::Fallible>::Error> {
428        let s: String = self.deserialize(deserializer)?;
429
430        Ok(Atom::new(s))
431    }
432}