string_cache/
atom.rs

1// Copyright 2014 The Servo Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use crate::dynamic_set::{dynamic_set, Entry};
11use crate::static_sets::StaticAtomSet;
12use debug_unreachable::debug_unreachable;
13
14use std::borrow::Cow;
15use std::cmp::Ordering::{self, Equal};
16use std::fmt;
17use std::hash::{Hash, Hasher};
18use std::marker::PhantomData;
19use std::mem;
20use std::num::NonZeroU64;
21use std::ops;
22use std::slice;
23use std::str;
24use std::sync::atomic::Ordering::SeqCst;
25
26const DYNAMIC_TAG: u8 = 0b_00;
27const INLINE_TAG: u8 = 0b_01; // len in upper nybble
28const STATIC_TAG: u8 = 0b_10;
29const TAG_MASK: u64 = 0b_11;
30const LEN_OFFSET: u64 = 4;
31const LEN_MASK: u64 = 0xF0;
32
33const MAX_INLINE_LEN: usize = 7;
34const STATIC_SHIFT_BITS: usize = 32;
35
36/// Represents a string that has been interned.
37///
/// While the type definition for `Atom` indicates that it is generic on a particular
39/// implementation of an atom set, you don't need to worry about this.  Atoms can be static
40/// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they
41/// can be dynamic and created by you on an `EmptyStaticAtomSet`.
42///
43/// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted;
44/// this means that you may need to `.clone()` an atom to keep copies to it in different
45/// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`.
46///
47/// ## Creating an atom at runtime
48///
49/// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code
50/// may then do something like read data from somewhere and extract tokens that need to be
51/// compared to the atoms.  In this case, you can use `Atom::from(&str)` or
52/// `Atom::from(String)`.  These create a reference-counted atom which will be
53/// automatically freed when all references to it are dropped.
54///
55/// This means that your application can safely have a loop which tokenizes data, creates
56/// atoms from the tokens, and compares the atoms to a predefined set of keywords, without
57/// running the risk of arbitrary memory consumption from creating large numbers of atoms —
58/// as long as your application does not store clones of the atoms it creates along the
59/// way.
60///
61/// For example, the following is safe and will not consume arbitrary amounts of memory:
62///
63/// ```ignore
64/// let untrusted_data = "large amounts of text ...";
65///
66/// for token in untrusted_data.split_whitespace() {
67///     let atom = Atom::from(token); // interns the string
68///
69///     if atom == Atom::from("keyword") {
70///         // handle that keyword
71///     } else if atom == Atom::from("another_keyword") {
72///         // handle that keyword
73///     } else {
74///         println!("unknown keyword");
75///     }
76/// } // atom is dropped here, so it is not kept around in memory
77/// ```
#[derive(PartialEq, Eq)]
// NOTE: Deriving PartialEq requires that a given string must always be interned the same way.
pub struct Atom<Static> {
    // Tagged word (low two bits select the variant, see *_TAG constants):
    //  - DYNAMIC_TAG: the word is a pointer to a refcounted `Entry`,
    //  - INLINE_TAG:  up to 7 string bytes live in the upper bytes, with the
    //    length in bits 4..8 of the low byte,
    //  - STATIC_TAG:  an index into the static set, shifted up 32 bits.
    // Never zero (every variant sets at least one bit), so `Option<Atom>` is
    // pointer-sized thanks to the `NonZeroU64` niche.
    unsafe_data: NonZeroU64,
    // Ties the atom to its static set type without storing any data.
    phantom: PhantomData<Static>,
}
84
// This isn't really correct as the Atoms can technically take up space. But I guess it's ok
// as it is possible to measure the size of the atom set separately.
#[cfg(feature = "malloc_size_of")]
impl<Static: StaticAtomSet> malloc_size_of::MallocSizeOf for Atom<Static> {
    // Report zero: inline and static atoms own no heap data, and dynamic
    // atoms point into the shared interner, which should be measured once
    // on its own rather than per-handle.
    fn size_of(&self, _ops: &mut malloc_size_of::MallocSizeOfOps) -> usize {
        0
    }
}
93
// FIXME: bound removed from the struct definition because of this error for pack_static:
// "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable"
// https://github.com/rust-lang/rust/issues/57563
impl<Static> Atom<Static> {
    /// For the atom!() macros
    ///
    /// Packs `n`, an index into the static set, into a STATIC_TAG-tagged word.
    #[inline(always)]
    #[doc(hidden)]
    pub const fn pack_static(n: u32) -> Self {
        Self {
            unsafe_data: unsafe {
                // STATIC_TAG ensures this is non-zero
                NonZeroU64::new_unchecked((STATIC_TAG as u64) | ((n as u64) << STATIC_SHIFT_BITS))
            },
            phantom: PhantomData,
        }
    }

    /// For the atom!() macros
    ///
    /// `n` carries the string bytes in its top 7 bytes (bottom 8 bits zero);
    /// `len` is the string length, stored alongside INLINE_TAG in the low byte.
    #[inline(always)]
    #[doc(hidden)]
    pub const fn pack_inline(mut n: u64, len: u8) -> Self {
        if cfg!(target_endian = "big") {
            // Reverse order of top 7 bytes.
            // Bottom 8 bits of `n` are zero, and we need that to remain so.
            // String data is stored in top 7 bytes, tag and length in bottom byte.
            n = n.to_le() << 8;
        }

        let data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET) | n;
        Self {
            // INLINE_TAG ensures this is never zero
            unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
            phantom: PhantomData,
        }
    }

    /// Extract the 2-bit variant tag (DYNAMIC_TAG, INLINE_TAG or STATIC_TAG)
    /// from the low bits of the representation.
    fn tag(&self) -> u8 {
        (self.unsafe_data.get() & TAG_MASK) as u8
    }
}
134
135impl<Static: StaticAtomSet> Atom<Static> {
136    /// Return the internal representation. For testing.
137    #[doc(hidden)]
138    pub fn unsafe_data(&self) -> u64 {
139        self.unsafe_data.get()
140    }
141
142    /// Return true if this is a static Atom. For testing.
143    #[doc(hidden)]
144    pub fn is_static(&self) -> bool {
145        self.tag() == STATIC_TAG
146    }
147
148    /// Return true if this is a dynamic Atom. For testing.
149    #[doc(hidden)]
150    pub fn is_dynamic(&self) -> bool {
151        self.tag() == DYNAMIC_TAG
152    }
153
154    /// Return true if this is an inline Atom. For testing.
155    #[doc(hidden)]
156    pub fn is_inline(&self) -> bool {
157        self.tag() == INLINE_TAG
158    }
159
160    fn static_index(&self) -> u64 {
161        self.unsafe_data.get() >> STATIC_SHIFT_BITS
162    }
163
164    /// Get the hash of the string as it is stored in the set.
165    pub fn get_hash(&self) -> u32 {
166        match self.tag() {
167            DYNAMIC_TAG => {
168                let entry = self.unsafe_data.get() as *const Entry;
169                unsafe { (*entry).hash }
170            }
171            STATIC_TAG => Static::get().hashes[self.static_index() as usize],
172            INLINE_TAG => {
173                let data = self.unsafe_data.get();
174                // This may or may not be great...
175                ((data >> 32) ^ data) as u32
176            }
177            _ => unsafe { debug_unreachable!() },
178        }
179    }
180
181    pub fn try_static(string_to_add: &str) -> Option<Self> {
182        Self::try_static_internal(string_to_add).ok()
183    }
184
185    fn try_static_internal(string_to_add: &str) -> Result<Self, phf_shared::Hashes> {
186        let static_set = Static::get();
187        let hash = phf_shared::hash(&*string_to_add, &static_set.key);
188        let index = phf_shared::get_index(&hash, static_set.disps, static_set.atoms.len());
189
190        if static_set.atoms[index as usize] == string_to_add {
191            Ok(Self::pack_static(index))
192        } else {
193            Err(hash)
194        }
195    }
196}
197
198impl<Static: StaticAtomSet> Default for Atom<Static> {
199    #[inline]
200    fn default() -> Self {
201        Atom::pack_static(Static::empty_string_index())
202    }
203}
204
205impl<Static: StaticAtomSet> Hash for Atom<Static> {
206    #[inline]
207    fn hash<H>(&self, state: &mut H)
208    where
209        H: Hasher,
210    {
211        state.write_u32(self.get_hash())
212    }
213}
214
impl<'a, Static: StaticAtomSet> From<Cow<'a, str>> for Atom<Static> {
    /// Intern a string, preferring (in order): the static empty-string atom,
    /// an inline atom (length <= MAX_INLINE_LEN), a hit in the static set,
    /// and finally a refcounted entry in the global dynamic set.
    fn from(string_to_add: Cow<'a, str>) -> Self {
        let len = string_to_add.len();
        if len == 0 {
            Self::pack_static(Static::empty_string_index())
        } else if len <= MAX_INLINE_LEN {
            // Build the tag+length low byte, then copy the string bytes into
            // the upper 7 bytes of the word.
            let mut data: u64 = (INLINE_TAG as u64) | ((len as u64) << LEN_OFFSET);
            {
                let dest = inline_atom_slice_mut(&mut data);
                dest[..len].copy_from_slice(string_to_add.as_bytes());
            }
            Atom {
                // INLINE_TAG ensures this is never zero
                unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
                phantom: PhantomData,
            }
        } else {
            // On a static-set miss, reuse the hash computed during the lookup
            // for the dynamic-set insertion.
            Self::try_static_internal(&*string_to_add).unwrap_or_else(|hash| {
                let ptr: std::ptr::NonNull<Entry> = dynamic_set().insert(string_to_add, hash.g);
                let data = ptr.as_ptr() as u64;
                // DYNAMIC_TAG is 0b00, so the pointer's low tag bits must
                // already be clear (relies on `Entry`'s alignment; checked
                // in debug builds).
                debug_assert!(0 == data & TAG_MASK);
                Atom {
                    // The address of a ptr::NonNull is non-zero
                    unsafe_data: unsafe { NonZeroU64::new_unchecked(data) },
                    phantom: PhantomData,
                }
            })
        }
    }
}
245
246impl<Static: StaticAtomSet> Clone for Atom<Static> {
247    #[inline(always)]
248    fn clone(&self) -> Self {
249        if self.tag() == DYNAMIC_TAG {
250            let entry = self.unsafe_data.get() as *const Entry;
251            unsafe { &*entry }.ref_count.fetch_add(1, SeqCst);
252        }
253        Atom { ..*self }
254    }
255}
256
impl<Static> Drop for Atom<Static> {
    #[inline]
    fn drop(&mut self) {
        // Only dynamic atoms are refcounted; inline and static atoms own nothing.
        if self.tag() == DYNAMIC_TAG {
            let entry = self.unsafe_data.get() as *const Entry;
            // fetch_sub returns the previous count, so 1 means this was the
            // last reference and the entry must be removed from the set.
            if unsafe { &*entry }.ref_count.fetch_sub(1, SeqCst) == 1 {
                drop_slow(self)
            }
        }

        // Out of line to guide inlining.
        fn drop_slow<Static>(this: &mut Atom<Static>) {
            dynamic_set().remove(this.unsafe_data.get() as *mut Entry);
        }
    }
}
273
impl<Static: StaticAtomSet> ops::Deref for Atom<Static> {
    type Target = str;

    /// View the atom's string contents.
    #[inline]
    fn deref(&self) -> &str {
        // SAFETY: relies on the tagged representation being well-formed:
        // a dynamic atom holds a valid `Entry` pointer, an inline atom holds
        // valid UTF-8 in its upper bytes with length <= MAX_INLINE_LEN, and
        // a static atom holds an in-bounds index into the static set.
        unsafe {
            match self.tag() {
                DYNAMIC_TAG => {
                    let entry = self.unsafe_data.get() as *const Entry;
                    &(*entry).string
                }
                INLINE_TAG => {
                    // Length lives in bits 4..8 of the low byte.
                    let len = (self.unsafe_data() & LEN_MASK) >> LEN_OFFSET;
                    debug_assert!(len as usize <= MAX_INLINE_LEN);
                    let src = inline_atom_slice(&self.unsafe_data);
                    str::from_utf8_unchecked(src.get_unchecked(..(len as usize)))
                }
                STATIC_TAG => Static::get().atoms[self.static_index() as usize],
                _ => debug_unreachable!(),
            }
        }
    }
}
297
298impl<Static: StaticAtomSet> fmt::Debug for Atom<Static> {
299    #[inline]
300    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
301        let ty_str = unsafe {
302            match self.tag() {
303                DYNAMIC_TAG => "dynamic",
304                INLINE_TAG => "inline",
305                STATIC_TAG => "static",
306                _ => debug_unreachable!(),
307            }
308        };
309
310        write!(f, "Atom('{}' type={})", &*self, ty_str)
311    }
312}
313
314impl<Static: StaticAtomSet> PartialOrd for Atom<Static> {
315    #[inline]
316    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
317        if self.unsafe_data == other.unsafe_data {
318            return Some(Equal);
319        }
320        self.as_ref().partial_cmp(other.as_ref())
321    }
322}
323
324impl<Static: StaticAtomSet> Ord for Atom<Static> {
325    #[inline]
326    fn cmp(&self, other: &Self) -> Ordering {
327        if self.unsafe_data == other.unsafe_data {
328            return Equal;
329        }
330        self.as_ref().cmp(other.as_ref())
331    }
332}
333
334// AsciiExt requires mutating methods, so we just implement the non-mutating ones.
335// We don't need to implement is_ascii because there's no performance improvement
336// over the one from &str.
337impl<Static: StaticAtomSet> Atom<Static> {
338    fn from_mutated_str<F: FnOnce(&mut str)>(s: &str, f: F) -> Self {
339        let mut buffer = mem::MaybeUninit::<[u8; 64]>::uninit();
340        let buffer = unsafe { &mut *buffer.as_mut_ptr() };
341
342        if let Some(buffer_prefix) = buffer.get_mut(..s.len()) {
343            buffer_prefix.copy_from_slice(s.as_bytes());
344            let as_str = unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) };
345            f(as_str);
346            Atom::from(&*as_str)
347        } else {
348            let mut string = s.to_owned();
349            f(&mut string);
350            Atom::from(string)
351        }
352    }
353
354    /// Like [`to_ascii_uppercase`].
355    ///
356    /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase
357    pub fn to_ascii_uppercase(&self) -> Self {
358        for (i, b) in self.bytes().enumerate() {
359            if let b'a'..=b'z' = b {
360                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_uppercase());
361            }
362        }
363        self.clone()
364    }
365
366    /// Like [`to_ascii_lowercase`].
367    ///
368    /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase
369    pub fn to_ascii_lowercase(&self) -> Self {
370        for (i, b) in self.bytes().enumerate() {
371            if let b'A'..=b'Z' = b {
372                return Atom::from_mutated_str(self, |s| s[i..].make_ascii_lowercase());
373            }
374        }
375        self.clone()
376    }
377
378    /// Like [`eq_ignore_ascii_case`].
379    ///
380    /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case
381    pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
382        (self == other) || self.eq_str_ignore_ascii_case(&**other)
383    }
384
385    /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`.
386    ///
387    /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case
388    pub fn eq_str_ignore_ascii_case(&self, other: &str) -> bool {
389        (&**self).eq_ignore_ascii_case(other)
390    }
391}
392
#[inline(always)]
fn inline_atom_slice(x: &NonZeroU64) -> &[u8] {
    // View the word's 8 bytes in memory and drop the one holding the tag and
    // length: the lowest byte, which comes first in memory on little-endian
    // targets and last on big-endian ones.
    // SAFETY: `NonZeroU64` has the same size as `[u8; 8]` and stricter
    // alignment, and any byte pattern is a valid `u8`.
    let bytes: &[u8; 8] = unsafe { &*(x as *const NonZeroU64 as *const [u8; 8]) };
    if cfg!(target_endian = "little") {
        &bytes[1..]
    } else {
        &bytes[..7]
    }
}
404
#[inline(always)]
fn inline_atom_slice_mut(x: &mut u64) -> &mut [u8] {
    // Mutable view of the 7 string-data bytes; the lowest byte (tag and
    // length) comes first in memory on little-endian targets and last on
    // big-endian ones, and is excluded from the returned slice.
    // SAFETY: `u64` has the same size as `[u8; 8]` and stricter alignment,
    // and any byte pattern is a valid `u8`.
    let bytes: &mut [u8; 8] = unsafe { &mut *(x as *mut u64 as *mut [u8; 8]) };
    if cfg!(target_endian = "little") {
        &mut bytes[1..]
    } else {
        &mut bytes[..7]
    }
}