1extern crate alloc;
23
24use alloc::{
25 borrow::{Borrow, Cow},
26 string::String,
27 vec::Vec,
28};
29use core::{
30 cmp::Ordering,
31 fmt, hash,
32 iter::{FromIterator, IntoIterator},
33 mem::transmute,
34 ops::Deref,
35 slice, str,
36 str::FromStr,
37};
38
39mod not_quite_std;
40
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER (three bytes), written in
/// place of each surrogate by the lossy string conversions in this file.
static UTF8_REPLACEMENT_CHARACTER: &[u8] = b"\xEF\xBF\xBD";
42
/// A Unicode code point: any value from U+0000 to U+10FFFF inclusive.
///
/// Unlike `char`, a `CodePoint` may hold a surrogate (U+D800 to U+DFFF).
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
pub struct CodePoint {
    value: u32,
}

/// Formats the code point as `U+` followed by at least four uppercase hex digits.
impl fmt::Debug for CodePoint {
    #[inline]
    fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        write!(formatter, "U+{:04X}", self.value)
    }
}

impl CodePoint {
    /// Wraps `value` without checking that it is in range.
    ///
    /// # Safety
    ///
    /// `value` must be at most 0x10FFFF, the range that [`CodePoint::from_u32`]
    /// enforces.
    #[inline]
    pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
        CodePoint { value }
    }

    /// Wraps `value`, returning `None` when it is greater than 0x10FFFF.
    ///
    /// Surrogates are accepted.
    #[inline]
    pub fn from_u32(value: u32) -> Option<CodePoint> {
        if value <= 0x10ffff {
            Some(CodePoint { value })
        } else {
            None
        }
    }

    /// Converts a `char` into the code point with the same scalar value.
    #[inline]
    pub fn from_char(value: char) -> CodePoint {
        CodePoint {
            value: u32::from(value),
        }
    }

    /// Returns the numeric value of the code point.
    #[inline]
    pub fn to_u32(&self) -> u32 {
        self.value
    }

    /// Returns the code point as a `char`, or `None` if it is a surrogate
    /// (U+D800 to U+DFFF).
    #[inline]
    pub fn to_char(&self) -> Option<char> {
        // `char::from_u32` rejects exactly the surrogate range (and anything
        // above U+10FFFF), so no `unsafe` unchecked conversion is needed.
        char::from_u32(self.value)
    }

    /// Returns the code point as a `char`, substituting U+FFFD REPLACEMENT
    /// CHARACTER when it is a surrogate.
    #[inline]
    pub fn to_char_lossy(&self) -> char {
        self.to_char().unwrap_or('\u{FFFD}')
    }
}
122
/// An owned, growable buffer of WTF-8 bytes.
///
/// Dereferences to [`Wtf8`], so the borrowed slice's methods are available on
/// the buffer as well. Equality and ordering compare the underlying bytes.
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,
}
131
132impl Deref for Wtf8Buf {
133 type Target = Wtf8;
134
135 fn deref(&self) -> &Wtf8 {
136 unsafe { transmute(&*self.bytes) }
137 }
138}
139
140impl fmt::Debug for Wtf8Buf {
144 #[inline]
145 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
146 Wtf8::fmt(self, formatter)
147 }
148}
149
150impl Default for Wtf8Buf {
151 #[inline]
152 fn default() -> Self {
153 Self::new()
154 }
155}
156
157impl FromStr for Wtf8Buf {
158 type Err = core::convert::Infallible;
159
160 #[inline]
161 fn from_str(s: &str) -> Result<Self, Self::Err> {
162 Ok(Wtf8Buf {
163 bytes: s.as_bytes().to_vec(),
164 })
165 }
166}
167
168impl Wtf8Buf {
169 #[inline]
171 pub fn new() -> Wtf8Buf {
172 Wtf8Buf { bytes: Vec::new() }
173 }
174
175 #[inline]
178 pub fn with_capacity(n: usize) -> Wtf8Buf {
179 Wtf8Buf {
180 bytes: Vec::with_capacity(n),
181 }
182 }
183
184 #[inline]
190 pub fn from_string(string: String) -> Wtf8Buf {
191 Wtf8Buf {
192 bytes: string.into_bytes(),
193 }
194 }
195
196 #[inline]
202 #[allow(clippy::should_implement_trait)]
203 pub fn from_str(s: &str) -> Wtf8Buf {
204 Wtf8Buf {
205 bytes: s.as_bytes().to_vec(),
206 }
207 }
208
209 pub fn from_ill_formed_utf16(v: &[u16]) -> Wtf8Buf {
215 let mut string = Wtf8Buf::with_capacity(v.len());
216 for item in not_quite_std::decode_utf16(v.iter().cloned()) {
217 match item {
218 Ok(c) => string.push_char(c),
219 Err(s) => {
220 let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
222 not_quite_std::push_code_point(&mut string, code_point)
225 }
226 }
227 }
228 string
229 }
230
231 #[inline]
239 pub fn reserve(&mut self, additional: usize) {
240 self.bytes.reserve(additional)
241 }
242
243 #[inline]
246 pub fn capacity(&self) -> usize {
247 self.bytes.capacity()
248 }
249
250 #[inline]
252 pub fn push_str(&mut self, other: &str) {
253 self.bytes.extend_from_slice(other.as_bytes())
254 }
255
256 #[inline]
262 pub fn push_wtf8(&mut self, other: &Wtf8) {
263 match (self.final_lead_surrogate(), other.initial_trail_surrogate()) {
264 (Some(lead), Some(trail)) => {
266 let len_without_lead_surrogate = self.len() - 3;
267 self.bytes.truncate(len_without_lead_surrogate);
268 let other_without_trail_surrogate = &other.bytes[3..];
269 self.bytes.reserve(4 + other_without_trail_surrogate.len());
271 self.push_char(decode_surrogate_pair(lead, trail));
272 self.bytes.extend_from_slice(other_without_trail_surrogate);
273 }
274 _ => self.bytes.extend_from_slice(&other.bytes),
275 }
276 }
277
278 #[inline]
280 pub fn push_char(&mut self, c: char) {
281 not_quite_std::push_code_point(self, CodePoint::from_char(c))
282 }
283
284 #[inline]
290 pub fn push(&mut self, code_point: CodePoint) {
291 if let trail @ 0xdc00..=0xdfff = code_point.to_u32() {
292 if let Some(lead) = self.final_lead_surrogate() {
293 let len_without_lead_surrogate = self.len() - 3;
294 self.bytes.truncate(len_without_lead_surrogate);
295 self.push_char(decode_surrogate_pair(lead, trail as u16));
296 return;
297 }
298 }
299
300 not_quite_std::push_code_point(self, code_point)
302 }
303
304 #[inline]
311 pub fn truncate(&mut self, new_len: usize) {
312 assert!(not_quite_std::is_code_point_boundary(self, new_len));
313 self.bytes.truncate(new_len)
314 }
315
316 pub fn into_string(self) -> Result<String, Wtf8Buf> {
324 match self.next_surrogate(0) {
325 None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
326 Some(_) => Err(self),
327 }
328 }
329
330 pub fn into_string_lossy(mut self) -> String {
337 let mut pos = 0;
338 loop {
339 match self.next_surrogate(pos) {
340 Some((surrogate_pos, _)) => {
341 pos = surrogate_pos + 3;
342 self.bytes[surrogate_pos..pos].copy_from_slice(UTF8_REPLACEMENT_CHARACTER);
343 }
344 None => return unsafe { String::from_utf8_unchecked(self.bytes) },
345 }
346 }
347 }
348}
349
350impl FromIterator<CodePoint> for Wtf8Buf {
355 fn from_iter<T: IntoIterator<Item = CodePoint>>(iterable: T) -> Wtf8Buf {
356 let mut string = Wtf8Buf::new();
357 string.extend(iterable);
358 string
359 }
360}
361
362impl Extend<CodePoint> for Wtf8Buf {
367 fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iterable: T) {
368 let iterator = iterable.into_iter();
369 let (low, _high) = iterator.size_hint();
370 self.bytes.reserve(low);
372 for code_point in iterator {
373 self.push(code_point);
374 }
375 }
376}
377
/// A borrowed slice of WTF-8 bytes; the unsized counterpart of `Wtf8Buf`.
///
/// `#[repr(transparent)]` guarantees that the layout is the same as `[u8]`.
/// References to this type are created in this file by transmuting `&[u8]`,
/// which is only well-defined with that guarantee.
#[repr(transparent)]
pub struct Wtf8 {
    bytes: [u8],
}
385
386impl PartialEq for Wtf8 {
388 fn eq(&self, other: &Wtf8) -> bool {
389 self.bytes.eq(&other.bytes)
390 }
391}
392
/// Byte-wise equality on `Wtf8` is a full equivalence relation.
impl Eq for Wtf8 {}
395
396impl PartialOrd for Wtf8 {
398 #[inline]
399 fn partial_cmp(&self, other: &Wtf8) -> Option<Ordering> {
400 Some(self.bytes.cmp(&other.bytes))
401 }
402
403 #[inline]
404 fn lt(&self, other: &Wtf8) -> bool {
405 self.bytes.lt(&other.bytes)
406 }
407
408 #[inline]
409 fn le(&self, other: &Wtf8) -> bool {
410 self.bytes.le(&other.bytes)
411 }
412
413 #[inline]
414 fn gt(&self, other: &Wtf8) -> bool {
415 self.bytes.gt(&other.bytes)
416 }
417
418 #[inline]
419 fn ge(&self, other: &Wtf8) -> bool {
420 self.bytes.ge(&other.bytes)
421 }
422}
423
424impl Ord for Wtf8 {
426 #[inline]
427 fn cmp(&self, other: &Wtf8) -> Ordering {
428 self.bytes.cmp(&other.bytes)
429 }
430}
431
432impl fmt::Debug for Wtf8 {
436 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
437 formatter.write_str("\"")?;
438 let mut pos = 0;
439 loop {
440 match self.next_surrogate(pos) {
441 None => break,
442 Some((surrogate_pos, surrogate)) => {
443 formatter.write_str(unsafe {
444 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
445 })?;
446 write!(formatter, "\\u{{{surrogate:X}}}")?;
447 pos = surrogate_pos + 3;
448 }
449 }
450 }
451 formatter.write_str(unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
452 formatter.write_str("\"")
453 }
454}
455
456impl Wtf8 {
457 #[inline]
461 pub const fn from_str(value: &str) -> &Wtf8 {
462 unsafe { transmute(value.as_bytes()) }
463 }
464
465 #[inline]
467 pub const fn len(&self) -> usize {
468 self.bytes.len()
469 }
470
471 #[inline]
473 pub const fn is_empty(&self) -> bool {
474 self.bytes.is_empty()
475 }
476
477 #[inline]
484 pub fn slice(&self, begin: usize, end: usize) -> &Wtf8 {
485 if begin <= end
487 && not_quite_std::is_code_point_boundary(self, begin)
488 && not_quite_std::is_code_point_boundary(self, end)
489 {
490 unsafe { not_quite_std::slice_unchecked(self, begin, end) }
491 } else {
492 not_quite_std::slice_error_fail(self, begin, end)
493 }
494 }
495
496 #[inline]
503 pub fn slice_from(&self, begin: usize) -> &Wtf8 {
504 if not_quite_std::is_code_point_boundary(self, begin) {
506 unsafe { not_quite_std::slice_unchecked(self, begin, self.len()) }
507 } else {
508 not_quite_std::slice_error_fail(self, begin, self.len())
509 }
510 }
511
512 #[inline]
519 pub fn slice_to(&self, end: usize) -> &Wtf8 {
520 if not_quite_std::is_code_point_boundary(self, end) {
522 unsafe { not_quite_std::slice_unchecked(self, 0, end) }
523 } else {
524 not_quite_std::slice_error_fail(self, 0, end)
525 }
526 }
527
528 #[inline]
535 pub fn ascii_byte_at(&self, position: usize) -> u8 {
536 match self.bytes[position] {
537 ascii_byte @ 0x00..=0x7f => ascii_byte,
538 _ => 0xff,
539 }
540 }
541
542 #[inline]
544 pub fn code_points(&self) -> Wtf8CodePoints {
545 Wtf8CodePoints {
546 bytes: self.bytes.iter(),
547 }
548 }
549
550 #[inline]
556 pub fn as_str(&self) -> Option<&str> {
557 match self.next_surrogate(0) {
560 None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
561 Some(_) => None,
562 }
563 }
564
565 #[inline]
567 pub const fn as_bytes(&self) -> &[u8] {
568 &self.bytes
569 }
570
571 pub fn to_string_lossy(&self) -> Cow<str> {
579 let surrogate_pos = match self.next_surrogate(0) {
580 None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
581 Some((pos, _)) => pos,
582 };
583 let wtf8_bytes = &self.bytes;
584 let mut utf8_bytes = Vec::with_capacity(self.len());
585 utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
586 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER);
587 let mut pos = surrogate_pos + 3;
588 loop {
589 match self.next_surrogate(pos) {
590 Some((surrogate_pos, _)) => {
591 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
592 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER);
593 pos = surrogate_pos + 3;
594 }
595 None => {
596 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
597 return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
598 }
599 }
600 }
601 }
602
603 #[inline]
610 pub fn to_ill_formed_utf16(&self) -> IllFormedUtf16CodeUnits {
611 IllFormedUtf16CodeUnits {
612 code_points: self.code_points(),
613 extra: 0,
614 }
615 }
616
617 #[inline]
633 pub const unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Wtf8 {
634 unsafe { transmute(bytes) }
635 }
636
637 #[inline]
638 fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
639 let mut iter = self.bytes[pos..].iter();
640 loop {
641 let b = match iter.next() {
642 None => return None,
643 Some(&b) => b,
644 };
645 if b < 0x80 {
646 pos += 1;
647 } else if b < 0xe0 {
648 iter.next();
649 pos += 2;
650 } else if b == 0xed {
651 match (iter.next(), iter.next()) {
652 (Some(&b2), Some(&b3)) if b2 >= 0xa0 => {
653 return Some((pos, decode_surrogate(b2, b3)))
654 }
655 _ => pos += 3,
656 }
657 } else if b < 0xf0 {
658 iter.next();
659 iter.next();
660 pos += 3;
661 } else {
662 iter.next();
663 iter.next();
664 iter.next();
665 pos += 4;
666 }
667 }
668 }
669
670 #[inline]
671 fn final_lead_surrogate(&self) -> Option<u16> {
672 let len = self.len();
673 if len < 3 {
674 return None;
675 }
676 let seq = &self.bytes[len - 3..];
677 if seq[0] == 0xed && 0xa0 <= seq[1] && seq[1] <= 0xaf {
678 Some(decode_surrogate(seq[1], seq[2]))
679 } else {
680 None
681 }
682 }
683
684 #[inline]
685 fn initial_trail_surrogate(&self) -> Option<u16> {
686 let len = self.len();
687 if len < 3 {
688 return None;
689 }
690 let seq = &self.bytes[..3];
691 if seq[0] == 0xed && 0xb0 <= seq[1] && seq[1] <= 0xbf {
692 Some(decode_surrogate(seq[1], seq[2]))
693 } else {
694 None
695 }
696 }
697}
698
/// Rebuilds a 16-bit surrogate from the second and third bytes of its
/// three-byte encoding (the first byte is always 0xED).
///
/// Each byte contributes its low six bits on top of the 0xD800 base.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let upper_bits = u16::from(second_byte & 0x3f) << 6;
    let lower_bits = u16::from(third_byte & 0x3f);
    0xd800 | upper_bits | lower_bits
}
704
/// Combines a lead and a trail surrogate into the supplementary-plane `char`
/// they encode together.
///
/// `lead` is expected in 0xD800..=0xDBFF and `trail` in 0xDC00..=0xDFFF.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let high_ten = (u32::from(lead) - 0xd800) << 10;
    let low_ten = u32::from(trail) - 0xdc00;
    let code_point = 0x10000 + (high_ten | low_ten);
    // SAFETY: for surrogates in the expected ranges the result lies in
    // 0x10000..=0x10FFFF, which is a valid scalar value.
    unsafe { char::from_u32_unchecked(code_point) }
}
710
/// Iterator over the code points of a `Wtf8` slice.
///
/// Created by `Wtf8::code_points`; it walks the slice's bytes.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    bytes: slice::Iter<'a, u8>,
}
718
719impl<'a> Iterator for Wtf8CodePoints<'a> {
720 type Item = CodePoint;
721
722 #[inline]
723 fn next(&mut self) -> Option<CodePoint> {
724 not_quite_std::next_code_point(&mut self.bytes).map(|value| {
725 unsafe { CodePoint::from_u32_unchecked(value) }
727 })
728 }
729
730 #[inline]
731 fn size_hint(&self) -> (usize, Option<usize>) {
732 let (len, _) = self.bytes.size_hint();
733 (len.saturating_add(3) / 4, Some(len))
734 }
735}
736
/// Iterator over 16-bit code units produced from a `Wtf8` slice.
///
/// Created by `Wtf8::to_ill_formed_utf16`, which starts `extra` at zero.
#[derive(Clone)]
pub struct IllFormedUtf16CodeUnits<'a> {
    // Source of code points to encode.
    code_points: Wtf8CodePoints<'a>,
    // NOTE(review): presumably a buffered second code unit, with 0 meaning
    // "none" — confirm against `not_quite_std::next_utf16_code_unit`.
    extra: u16,
}
742
743impl<'a> Iterator for IllFormedUtf16CodeUnits<'a> {
744 type Item = u16;
745
746 #[inline]
747 fn next(&mut self) -> Option<u16> {
748 not_quite_std::next_utf16_code_unit(self)
749 }
750
751 #[inline]
752 fn size_hint(&self) -> (usize, Option<usize>) {
753 let (low, high) = self.code_points.size_hint();
754 (low, high.and_then(|n| n.checked_mul(2)))
758 }
759}
760
761impl PartialEq<&Wtf8> for Wtf8Buf {
762 fn eq(&self, other: &&Wtf8) -> bool {
763 **self == **other
764 }
765}
766
767impl PartialEq<Wtf8Buf> for &Wtf8 {
768 fn eq(&self, other: &Wtf8Buf) -> bool {
769 **self == **other
770 }
771}
772
773impl hash::Hash for CodePoint {
774 #[inline]
775 fn hash<H: hash::Hasher>(&self, state: &mut H) {
776 self.value.hash(state)
777 }
778}
779
780impl hash::Hash for Wtf8Buf {
781 #[inline]
782 fn hash<H: hash::Hasher>(&self, state: &mut H) {
783 Wtf8::hash(self, state)
784 }
785}
786
787impl hash::Hash for Wtf8 {
788 #[inline]
789 fn hash<H: hash::Hasher>(&self, state: &mut H) {
790 state.write(&self.bytes);
791 0xfeu8.hash(state)
792 }
793}
794
795impl Borrow<Wtf8> for Wtf8Buf {
796 #[inline]
797 fn borrow(&self) -> &Wtf8 {
798 self
799 }
800}
801
802impl ToOwned for Wtf8 {
803 type Owned = Wtf8Buf;
804
805 #[inline]
806 fn to_owned(&self) -> Wtf8Buf {
807 Wtf8Buf {
808 bytes: self.bytes.to_vec(),
809 }
810 }
811}
812
813impl<'a> From<&'a Wtf8> for Cow<'a, Wtf8> {
814 #[inline]
815 fn from(s: &'a Wtf8) -> Cow<'a, Wtf8> {
816 Cow::Borrowed(s)
817 }
818}
819
820impl<'a> From<&'a str> for &'a Wtf8 {
821 #[inline]
822 fn from(s: &'a str) -> &'a Wtf8 {
823 Wtf8::from_str(s)
824 }
825}
826
/// Unit tests for `CodePoint`, `Wtf8Buf` and `Wtf8`.
#[cfg(test)]
mod tests {
    use alloc::{format, vec};
    use core::mem::transmute;

    use super::*;

    #[test]
    fn code_point_from_u32() {
        assert!(CodePoint::from_u32(0).is_some());
        assert!(CodePoint::from_u32(0xd800).is_some());
        assert!(CodePoint::from_u32(0x10ffff).is_some());
        assert!(CodePoint::from_u32(0x110000).is_none());
    }

    #[test]
    fn code_point_to_u32() {
        fn c(value: u32) -> CodePoint {
            CodePoint::from_u32(value).unwrap()
        }
        assert_eq!(c(0).to_u32(), 0);
        assert_eq!(c(0xd800).to_u32(), 0xd800);
        assert_eq!(c(0x10ffff).to_u32(), 0x10ffff);
    }

    #[test]
    fn code_point_from_char() {
        assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
        assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1f4a9);
    }

    #[test]
    fn code_point_to_string() {
        let cp_a = CodePoint::from_char('a');
        assert_eq!(format!("{cp_a:?}"), "U+0061");
        let cp_poop = CodePoint::from_char('💩');
        assert_eq!(format!("{cp_poop:?}"), "U+1F4A9");
    }

    #[test]
    fn code_point_to_char() {
        fn c(value: u32) -> CodePoint {
            CodePoint::from_u32(value).unwrap()
        }
        assert_eq!(c(0x61).to_char(), Some('a'));
        assert_eq!(c(0x1f4a9).to_char(), Some('💩'));
        assert_eq!(c(0xd800).to_char(), None);
    }

    #[test]
    fn code_point_to_char_lossy() {
        fn c(value: u32) -> CodePoint {
            CodePoint::from_u32(value).unwrap()
        }
        assert_eq!(c(0x61).to_char_lossy(), 'a');
        assert_eq!(c(0x1f4a9).to_char_lossy(), '💩');
        assert_eq!(c(0xd800).to_char_lossy(), '\u{FFFD}');
    }

    #[test]
    fn wtf8buf_new() {
        assert_eq!(Wtf8Buf::new().bytes, b"");
    }

    #[test]
    fn wtf8buf_from_str() {
        assert_eq!(Wtf8Buf::from_str("").bytes, b"");
        assert_eq!(
            Wtf8Buf::from_str("aé 💩").bytes,
            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
        );
    }

    #[test]
    fn wtf8buf_from_string() {
        assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b"");
        assert_eq!(
            Wtf8Buf::from_string(String::from("aé 💩")).bytes,
            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
        );
    }

    #[test]
    fn wtf8buf_from_ill_formed_utf16() {
        assert_eq!(Wtf8Buf::from_ill_formed_utf16(&[]).bytes, b"");
        // The first 0xd83d has no trail surrogate after it and stays a lone
        // surrogate; the second one pairs with 0xdca9.
        assert_eq!(
            Wtf8Buf::from_ill_formed_utf16(&[0x61, 0xe9, 0x20, 0xd83d, 0xd83d, 0xdca9]).bytes,
            b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"
        );
    }

    #[test]
    fn wtf8buf_push_str() {
        let mut string = Wtf8Buf::new();
        assert_eq!(string.bytes, b"");
        string.push_str("aé 💩");
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    fn wtf8buf_push_char() {
        let mut string = Wtf8Buf::from_str("aé ");
        assert_eq!(string.bytes, b"a\xC3\xA9 ");
        string.push_char('💩');
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    fn wtf8buf_push() {
        let mut string = Wtf8Buf::from_str("aé ");
        assert_eq!(string.bytes, b"a\xC3\xA9 ");
        string.push(CodePoint::from_char('💩'));
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

        fn c(value: u32) -> CodePoint {
            CodePoint::from_u32(value).unwrap()
        }

        // Lead followed directly by trail: merged into one code point.
        let mut string = Wtf8Buf::new();
        string.push(c(0xd83d));
        string.push(c(0xdca9));
        assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");

        // Lead and trail separated by a space: both stay lone surrogates.
        let mut string = Wtf8Buf::new();
        string.push(c(0xd83d));
        string.push(c(0x20));
        string.push(c(0xdca9));
        assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");

        // Two lead surrogates in a row.
        let mut string = Wtf8Buf::new();
        string.push(c(0xd800));
        string.push(c(0xdbff));
        assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");

        // Lead surrogate followed by a non-surrogate.
        let mut string = Wtf8Buf::new();
        string.push(c(0xd800));
        string.push(c(0xe000));
        assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");

        // Non-surrogate followed by a trail surrogate.
        let mut string = Wtf8Buf::new();
        string.push(c(0xd7ff));
        string.push(c(0xdc00));
        assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");

        // One-byte code point followed by a trail surrogate.
        let mut string = Wtf8Buf::new();
        string.push(c(0x61));
        string.push(c(0xdc00));
        assert_eq!(string.bytes, b"\x61\xED\xB0\x80");

        // Trail surrogate pushed onto an empty buffer.
        let mut string = Wtf8Buf::new();
        string.push(c(0xdc00));
        assert_eq!(string.bytes, b"\xED\xB0\x80");
    }

    #[test]
    fn wtf8buf_push_wtf8() {
        let mut string = Wtf8Buf::from_str("aé");
        assert_eq!(string.bytes, b"a\xC3\xA9");
        string.push_wtf8(Wtf8::from_str(" 💩"));
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

        fn w(value: &[u8]) -> &Wtf8 {
            unsafe { transmute(value) }
        }

        // Lead followed directly by trail: merged into one code point.
        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\xBD"));
        string.push_wtf8(w(b"\xED\xB2\xA9"));
        assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");

        // Lead and trail separated by a space: both stay lone surrogates.
        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\xBD"));
        string.push_wtf8(w(b" "));
        string.push_wtf8(w(b"\xED\xB2\xA9"));
        assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");

        // Two lead surrogates in a row.
        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\x80"));
        string.push_wtf8(w(b"\xED\xAF\xBF"));
        assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");

        // Lead surrogate followed by a non-surrogate.
        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\x80"));
        string.push_wtf8(w(b"\xEE\x80\x80"));
        assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");

        // Non-surrogate followed by a trail surrogate.
        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\x9F\xBF"));
        string.push_wtf8(w(b"\xED\xB0\x80"));
        assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");

        // One-byte code point followed by a trail surrogate.
        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"a"));
        string.push_wtf8(w(b"\xED\xB0\x80"));
        assert_eq!(string.bytes, b"\x61\xED\xB0\x80");

        // Trail surrogate pushed onto an empty buffer.
        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xB0\x80"));
        assert_eq!(string.bytes, b"\xED\xB0\x80");
    }

    #[test]
    fn wtf8buf_truncate() {
        let mut string = Wtf8Buf::from_str("aé");
        string.truncate(1);
        assert_eq!(string.bytes, b"a");
    }

    #[test]
    #[should_panic]
    fn wtf8buf_truncate_fail_code_point_boundary() {
        let mut string = Wtf8Buf::from_str("aé");
        string.truncate(2);
    }

    #[test]
    #[should_panic]
    fn wtf8buf_truncate_fail_longer() {
        let mut string = Wtf8Buf::from_str("aé");
        string.truncate(4);
    }

    #[test]
    fn wtf8buf_into_string() {
        let mut string = Wtf8Buf::from_str("aé 💩");
        assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩")));
        string.push(CodePoint::from_u32(0xd800).unwrap());
        assert_eq!(string.clone().into_string(), Err(string));
    }

    #[test]
    fn wtf8buf_into_string_lossy() {
        let mut string = Wtf8Buf::from_str("aé 💩");
        assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩"));
        string.push(CodePoint::from_u32(0xd800).unwrap());
        assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
    }

    #[test]
    fn wtf8buf_from_iterator() {
        fn f(values: &[u32]) -> Wtf8Buf {
            values
                .iter()
                .map(|&c| CodePoint::from_u32(c).unwrap())
                .collect::<Wtf8Buf>()
        }
        assert_eq!(
            f(&[0x61, 0xe9, 0x20, 0x1f4a9]).bytes,
            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
        );

        // Adjacent lead + trail are merged while collecting.
        assert_eq!(f(&[0xd83d, 0xdca9]).bytes, b"\xF0\x9F\x92\xA9");
        assert_eq!(
            f(&[0xd83d, 0x20, 0xdca9]).bytes,
            b"\xED\xA0\xBD \xED\xB2\xA9"
        );
        assert_eq!(f(&[0xd800, 0xdbff]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
        assert_eq!(f(&[0xd800, 0xe000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
        assert_eq!(f(&[0xd7ff, 0xdc00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
        assert_eq!(f(&[0x61, 0xdc00]).bytes, b"\x61\xED\xB0\x80");
        assert_eq!(f(&[0xdc00]).bytes, b"\xED\xB0\x80");
    }

    #[test]
    fn wtf8buf_extend() {
        fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
            fn c(value: &u32) -> CodePoint {
                CodePoint::from_u32(*value).unwrap()
            }
            let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
            string.extend(extended.iter().map(c));
            string
        }

        assert_eq!(
            e(&[0x61, 0xe9], &[0x20, 0x1f4a9]).bytes,
            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
        );

        // A lead at the end of the buffer merges with an extended trail.
        assert_eq!(e(&[0xd83d], &[0xdca9]).bytes, b"\xF0\x9F\x92\xA9");
        assert_eq!(
            e(&[0xd83d, 0x20], &[0xdca9]).bytes,
            b"\xED\xA0\xBD \xED\xB2\xA9"
        );
        assert_eq!(e(&[0xd800], &[0xdbff]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
        assert_eq!(e(&[0xd800], &[0xe000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
        assert_eq!(e(&[0xd7ff], &[0xdc00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
        assert_eq!(e(&[0x61], &[0xdc00]).bytes, b"\x61\xED\xB0\x80");
        assert_eq!(e(&[], &[0xdc00]).bytes, b"\xED\xB0\x80");
    }

    #[test]
    fn wtf8buf_debug() {
        let mut string = Wtf8Buf::from_str("aé 💩");
        string.push(CodePoint::from_u32(0xd800).unwrap());
        assert_eq!(format!("{string:?}"), r#""aé 💩\u{D800}""#);
    }

    #[test]
    fn wtf8buf_as_slice() {
        assert_eq!(Wtf8Buf::from_str("aé"), Wtf8::from_str("aé"));
    }

    #[test]
    fn wtf8_debug() {
        let mut string = Wtf8Buf::from_str("aé 💩");
        string.push(CodePoint::from_u32(0xd800).unwrap());
        let string_ref = &*string;
        assert_eq!(format!("{string_ref:?}"), r#""aé 💩\u{D800}""#);
    }

    #[test]
    fn wtf8_from_str() {
        assert_eq!(&Wtf8::from_str("").bytes, b"");
        assert_eq!(
            &Wtf8::from_str("aé 💩").bytes,
            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
        );
    }

    #[test]
    fn wtf8_as_bytes() {
        assert_eq!(Wtf8::from_str("").as_bytes(), b"");
        assert_eq!(
            Wtf8::from_str("aé 💩").as_bytes(),
            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
        );
    }

    #[test]
    fn wtf8_from_bytes_unchecked() {
        assert_eq!(unsafe { &Wtf8::from_bytes_unchecked(b"").bytes }, b"");
        assert_eq!(
            unsafe { &Wtf8::from_bytes_unchecked(b"a\xC3\xA9 \xF0\x9F\x92\xA9").bytes },
            b"a\xC3\xA9 \xF0\x9F\x92\xA9"
        );
        assert_eq!(
            unsafe { Wtf8::from_bytes_unchecked(b"a\xC3\xA9 \xF0\x9F\x92\xA9") },
            Wtf8::from_str("aé 💩")
        )
    }

    #[test]
    fn wtf8_cow() {
        let s: Cow<Wtf8> = Cow::from(Wtf8::from_str("aé 💩"));
        assert!(matches!(s, Cow::Borrowed(_)));
        let owned: Wtf8Buf = s.into_owned();
        assert_eq!(owned, Wtf8Buf::from_str("aé 💩"));
    }

    #[test]
    fn wtf8_len() {
        assert_eq!(Wtf8::from_str("").len(), 0);
        assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
    }

    #[test]
    fn wtf8_slice() {
        assert_eq!(&Wtf8::from_str("aé 💩").slice(1, 4).bytes, b"\xC3\xA9 ");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_not_code_point_boundary() {
        Wtf8::from_str("aé 💩").slice(2, 4);
    }

    #[test]
    fn wtf8_slice_from() {
        assert_eq!(
            &Wtf8::from_str("aé 💩").slice_from(1).bytes,
            b"\xC3\xA9 \xF0\x9F\x92\xA9"
        );
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_from_not_code_point_boundary() {
        Wtf8::from_str("aé 💩").slice_from(2);
    }

    #[test]
    fn wtf8_slice_to() {
        assert_eq!(&Wtf8::from_str("aé 💩").slice_to(4).bytes, b"a\xC3\xA9 ");
    }

    #[test]
    #[should_panic]
    fn wtf8_slice_to_not_code_point_boundary() {
        // Byte 5 is inside the four-byte encoding of '💩'. This must call
        // `slice_to` (not `slice_from`) so that `slice_to`'s failure path is
        // the one being exercised.
        Wtf8::from_str("aé 💩").slice_to(5);
    }

    #[test]
    fn wtf8_ascii_byte_at() {
        let slice = Wtf8::from_str("aé 💩");
        assert_eq!(slice.ascii_byte_at(0), b'a');
        assert_eq!(slice.ascii_byte_at(1), b'\xFF');
        assert_eq!(slice.ascii_byte_at(2), b'\xFF');
        assert_eq!(slice.ascii_byte_at(3), b' ');
        assert_eq!(slice.ascii_byte_at(4), b'\xFF');
    }

    #[test]
    fn wtf8_code_points() {
        fn c(value: u32) -> CodePoint {
            CodePoint::from_u32(value).unwrap()
        }
        fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
            string
                .code_points()
                .map(|c| c.to_char())
                .collect::<Vec<_>>()
        }
        let mut string = Wtf8Buf::from_str("é ");
        assert_eq!(cp(&string), vec![Some('é'), Some(' ')]);
        string.push(c(0xd83d));
        assert_eq!(cp(&string), vec![Some('é'), Some(' '), None]);
        string.push(c(0xdca9));
        assert_eq!(cp(&string), vec![Some('é'), Some(' '), Some('💩')]);
    }

    #[test]
    fn wtf8_as_str() {
        assert_eq!(Wtf8::from_str("").as_str(), Some(""));
        assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
        let mut string = Wtf8Buf::new();
        string.push(CodePoint::from_u32(0xd800).unwrap());
        assert_eq!(string.as_str(), None);
    }

    #[test]
    fn wtf8_to_string_lossy() {
        assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
        assert_eq!(
            Wtf8::from_str("aé 💩").to_string_lossy(),
            Cow::Borrowed("aé 💩")
        );
        let mut string = Wtf8Buf::from_str("aé 💩");
        string.push(CodePoint::from_u32(0xd800).unwrap());
        assert_eq!(string.to_string_lossy(), {
            let o: Cow<str> = Cow::Owned(String::from("aé 💩�"));
            o
        });
    }

    #[test]
    fn wtf8_to_ill_formed_utf16() {
        let mut string = Wtf8Buf::from_str("aé ");
        string.push(CodePoint::from_u32(0xd83d).unwrap());
        string.push_char('💩');
        assert_eq!(
            string.to_ill_formed_utf16().collect::<Vec<_>>(),
            vec![0x61, 0xe9, 0x20, 0xd83d, 0xd83d, 0xdca9]
        );
    }
}