1 //! Utilities for the `str` primitive type.
3 //! *[See also the `str` primitive type](str).*
5 #![stable(feature = "rust1", since = "1.0.0")]
6 // Many of the imports in this module are only used in the test configuration.
7 // It's cleaner to just turn off the unused_imports warning than to fix them.
8 #![allow(unused_imports)]
10 use core
::borrow
::{Borrow, BorrowMut}
;
11 use core
::iter
::FusedIterator
;
14 use core
::str::pattern
::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}
;
15 use core
::unicode
::conversions
;
17 use crate::borrow
::ToOwned
;
18 use crate::boxed
::Box
;
19 use crate::slice
::{Concat, Join, SliceIndex}
;
20 use crate::string
::String
;
23 #[stable(feature = "rust1", since = "1.0.0")]
24 pub use core
::str::pattern
;
25 #[stable(feature = "encode_utf16", since = "1.8.0")]
26 pub use core
::str::EncodeUtf16
;
27 #[stable(feature = "split_ascii_whitespace", since = "1.34.0")]
28 pub use core
::str::SplitAsciiWhitespace
;
29 #[stable(feature = "split_inclusive", since = "1.51.0")]
30 pub use core
::str::SplitInclusive
;
31 #[stable(feature = "rust1", since = "1.0.0")]
32 pub use core
::str::SplitWhitespace
;
33 #[stable(feature = "rust1", since = "1.0.0")]
34 pub use core
::str::{from_utf8, from_utf8_mut, Bytes, CharIndices, Chars}
;
35 #[stable(feature = "rust1", since = "1.0.0")]
36 pub use core
::str::{from_utf8_unchecked, from_utf8_unchecked_mut, ParseBoolError}
;
37 #[stable(feature = "str_escape", since = "1.34.0")]
38 pub use core
::str::{EscapeDebug, EscapeDefault, EscapeUnicode}
;
39 #[stable(feature = "rust1", since = "1.0.0")]
40 pub use core
::str::{FromStr, Utf8Error}
;
42 #[stable(feature = "rust1", since = "1.0.0")]
43 pub use core
::str::{Lines, LinesAny}
;
44 #[stable(feature = "rust1", since = "1.0.0")]
45 pub use core
::str::{MatchIndices, RMatchIndices}
;
46 #[stable(feature = "rust1", since = "1.0.0")]
47 pub use core
::str::{Matches, RMatches}
;
48 #[stable(feature = "rust1", since = "1.0.0")]
49 pub use core
::str::{RSplit, Split}
;
50 #[stable(feature = "rust1", since = "1.0.0")]
51 pub use core
::str::{RSplitN, SplitN}
;
52 #[stable(feature = "rust1", since = "1.0.0")]
53 pub use core
::str::{RSplitTerminator, SplitTerminator}
;
54 #[unstable(feature = "utf8_chunks", issue = "99543")]
55 pub use core
::str::{Utf8Chunk, Utf8Chunks}
;
57 /// Note: `str` in `Concat<str>` is not meaningful here.
58 /// This type parameter of the trait only exists to enable another impl.
59 #[cfg(not(no_global_oom_handling))]
60 #[unstable(feature = "slice_concat_ext", issue = "27747")]
61 impl<S
: Borrow
<str>> Concat
<str> for [S
] {
64 fn concat(slice
: &Self) -> String
{
69 #[cfg(not(no_global_oom_handling))]
70 #[unstable(feature = "slice_concat_ext", issue = "27747")]
71 impl<S
: Borrow
<str>> Join
<&str> for [S
] {
74 fn join(slice
: &Self, sep
: &str) -> String
{
75 unsafe { String::from_utf8_unchecked(join_generic_copy(slice, sep.as_bytes())) }
79 #[cfg(not(no_global_oom_handling))]
80 macro_rules
! specialize_for_lengths
{
81 ($separator
:expr
, $target
:expr
, $iter
:expr
; $
($num
:expr
),*) => {{
82 let mut target
= $target
;
84 let sep_bytes
= $separator
;
85 match $separator
.len() {
87 // loops with hardcoded sizes run much faster
88 // specialize the cases with small separator lengths
91 copy_slice_and_advance
!(target
, sep_bytes
);
92 let content_bytes
= s
.borrow().as_ref();
93 copy_slice_and_advance
!(target
, content_bytes
);
98 // arbitrary non-zero size fallback
100 copy_slice_and_advance
!(target
, sep_bytes
);
101 let content_bytes
= s
.borrow().as_ref();
102 copy_slice_and_advance
!(target
, content_bytes
);
110 #[cfg(not(no_global_oom_handling))]
111 macro_rules
! copy_slice_and_advance
{
112 ($target
:expr
, $bytes
:expr
) => {
113 let len
= $bytes
.len();
114 let (head
, tail
) = { $target }
.split_at_mut(len
);
115 head
.copy_from_slice($bytes
);
120 // Optimized join implementation that works for both Vec<T> (T: Copy) and String's inner vec
121 // Currently (2018-05-13) there is a bug with type inference and specialization (see issue #36262)
122 // For this reason SliceConcat<T> is not specialized for T: Copy and SliceConcat<str> is the
123 // only user of this function. It is left in place for the time when that is fixed.
125 // the bounds for String-join are S: Borrow<str> and for Vec-join Borrow<[T]>
126 // [T] and str both impl AsRef<[T]> for some T
127 // => s.borrow().as_ref() and we always have slices
128 #[cfg(not(no_global_oom_handling))]
129 fn join_generic_copy
<B
, T
, S
>(slice
: &[S
], sep
: &[T
]) -> Vec
<T
>
132 B
: AsRef
<[T
]> + ?Sized
,
135 let sep_len
= sep
.len();
136 let mut iter
= slice
.iter();
138 // the first slice is the only one without a separator preceding it
139 let first
= match iter
.next() {
140 Some(first
) => first
,
141 None
=> return vec
![],
144 // compute the exact total length of the joined Vec
145 // if the `len` calculation overflows, we'll panic
146 // we would have run out of memory anyway and the rest of the function requires
147 // the entire Vec pre-allocated for safety
148 let reserved_len
= sep_len
149 .checked_mul(iter
.len())
151 slice
.iter().map(|s
| s
.borrow().as_ref().len()).try_fold(n
, usize::checked_add
)
153 .expect("attempt to join into collection with len > usize::MAX");
155 // prepare an uninitialized buffer
156 let mut result
= Vec
::with_capacity(reserved_len
);
157 debug_assert
!(result
.capacity() >= reserved_len
);
159 result
.extend_from_slice(first
.borrow().as_ref());
162 let pos
= result
.len();
163 let target
= result
.spare_capacity_mut().get_unchecked_mut(..reserved_len
- pos
);
165 // Convert the separator and slices to slices of MaybeUninit
166 // to simplify implementation in specialize_for_lengths
167 let sep_uninit
= core
::slice
::from_raw_parts(sep
.as_ptr().cast(), sep
.len());
168 let iter_uninit
= iter
.map(|it
| {
169 let it
= it
.borrow().as_ref();
170 core
::slice
::from_raw_parts(it
.as_ptr().cast(), it
.len())
173 // copy separator and slices over without bounds checks
174 // generate loops with hardcoded offsets for small separators
175 // massive improvements possible (~ x2)
176 let remain
= specialize_for_lengths
!(sep_uninit
, target
, iter_uninit
; 0, 1, 2, 3, 4);
178 // A weird borrow implementation may return different
179 // slices for the length calculation and the actual copy.
180 // Make sure we don't expose uninitialized bytes to the caller.
181 let result_len
= reserved_len
- remain
.len();
182 result
.set_len(result_len
);
187 #[stable(feature = "rust1", since = "1.0.0")]
188 impl Borrow
<str> for String
{
190 fn borrow(&self) -> &str {
195 #[stable(feature = "string_borrow_mut", since = "1.36.0")]
196 impl BorrowMut
<str> for String
{
198 fn borrow_mut(&mut self) -> &mut str {
203 #[cfg(not(no_global_oom_handling))]
204 #[stable(feature = "rust1", since = "1.0.0")]
205 impl ToOwned
for str {
208 fn to_owned(&self) -> String
{
209 unsafe { String::from_utf8_unchecked(self.as_bytes().to_owned()) }
212 fn clone_into(&self, target
: &mut String
) {
213 let mut b
= mem
::take(target
).into_bytes();
214 self.as_bytes().clone_into(&mut b
);
215 *target
= unsafe { String::from_utf8_unchecked(b) }
219 /// Methods for string slices.
222 /// Converts a `Box<str>` into a `Box<[u8]>` without copying or allocating.
229 /// let s = "this is a string";
230 /// let boxed_str = s.to_owned().into_boxed_str();
231 /// let boxed_bytes = boxed_str.into_boxed_bytes();
232 /// assert_eq!(*boxed_bytes, *s.as_bytes());
234 #[rustc_allow_incoherent_impl]
235 #[stable(feature = "str_box_extras", since = "1.20.0")]
236 #[must_use = "`self` will be dropped if the result is not used"]
238 pub fn into_boxed_bytes(self: Box
<str>) -> Box
<[u8]> {
242 /// Replaces all matches of a pattern with another string.
244 /// `replace` creates a new [`String`], and copies the data from this string slice into it.
245 /// While doing so, it attempts to find matches of a pattern. If it finds any, it
246 /// replaces them with the replacement string slice.
253 /// let s = "this is old";
255 /// assert_eq!("this is new", s.replace("old", "new"));
256 /// assert_eq!("than an old", s.replace("is", "an"));
259 /// When the pattern doesn't match:
262 /// let s = "this is old";
263 /// assert_eq!(s, s.replace("cookie monster", "little lamb"));
265 #[cfg(not(no_global_oom_handling))]
266 #[rustc_allow_incoherent_impl]
267 #[must_use = "this returns the replaced string as a new allocation, \
268 without modifying the original"]
269 #[stable(feature = "rust1", since = "1.0.0")]
271 pub fn replace
<'a
, P
: Pattern
<'a
>>(&'a
self, from
: P
, to
: &str) -> String
{
272 let mut result
= String
::new();
273 let mut last_end
= 0;
274 for (start
, part
) in self.match_indices(from
) {
275 result
.push_str(unsafe { self.get_unchecked(last_end..start) }
);
277 last_end
= start
+ part
.len();
279 result
.push_str(unsafe { self.get_unchecked(last_end..self.len()) }
);
283 /// Replaces first N matches of a pattern with another string.
285 /// `replacen` creates a new [`String`], and copies the data from this string slice into it.
286 /// While doing so, it attempts to find matches of a pattern. If it finds any, it
287 /// replaces them with the replacement string slice at most `count` times.
294 /// let s = "foo foo 123 foo";
295 /// assert_eq!("new new 123 foo", s.replacen("foo", "new", 2));
296 /// assert_eq!("faa fao 123 foo", s.replacen('o', "a", 3));
297 /// assert_eq!("foo foo new23 foo", s.replacen(char::is_numeric, "new", 1));
300 /// When the pattern doesn't match:
303 /// let s = "this is old";
304 /// assert_eq!(s, s.replacen("cookie monster", "little lamb", 10));
306 #[cfg(not(no_global_oom_handling))]
307 #[rustc_allow_incoherent_impl]
308 #[must_use = "this returns the replaced string as a new allocation, \
309 without modifying the original"]
310 #[stable(feature = "str_replacen", since = "1.16.0")]
311 pub fn replacen
<'a
, P
: Pattern
<'a
>>(&'a
self, pat
: P
, to
: &str, count
: usize) -> String
{
312 // Hope to reduce the number of re-allocations
313 let mut result
= String
::with_capacity(32);
314 let mut last_end
= 0;
315 for (start
, part
) in self.match_indices(pat
).take(count
) {
316 result
.push_str(unsafe { self.get_unchecked(last_end..start) }
);
318 last_end
= start
+ part
.len();
320 result
.push_str(unsafe { self.get_unchecked(last_end..self.len()) }
);
324 /// Returns the lowercase equivalent of this string slice, as a new [`String`].
326 /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property
329 /// Since some characters can expand into multiple characters when changing
330 /// the case, this function returns a [`String`] instead of modifying the
331 /// parameter in-place.
340 /// assert_eq!("hello", s.to_lowercase());
343 /// A tricky example, with sigma:
348 /// assert_eq!("σ", sigma.to_lowercase());
350 /// // but at the end of a word, it's ς, not σ:
351 /// let odysseus = "ὈΔΥΣΣΕΎΣ";
353 /// assert_eq!("ὀδυσσεύς", odysseus.to_lowercase());
356 /// Languages without case are not changed:
359 /// let new_year = "农历新年";
361 /// assert_eq!(new_year, new_year.to_lowercase());
363 #[cfg(not(no_global_oom_handling))]
364 #[rustc_allow_incoherent_impl]
365 #[must_use = "this returns the lowercase string as a new String, \
366 without modifying the original"]
367 #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
368 pub fn to_lowercase(&self) -> String
{
369 let out
= convert_while_ascii(self.as_bytes(), u8::to_ascii_lowercase
);
371 // Safety: we know this is a valid char boundary since
372 // out.len() is only progressed if ascii bytes are found
373 let rest
= unsafe { self.get_unchecked(out.len()..) }
;
375 // Safety: We have written only valid ASCII to our vec
376 let mut s
= unsafe { String::from_utf8_unchecked(out) }
;
378 for (i
, c
) in rest
[..].char_indices() {
380 // Σ maps to σ, except at the end of a word where it maps to ς.
381 // This is the only conditional (contextual) but language-independent mapping
382 // in `SpecialCasing.txt`,
383 // so hard-code it rather than have a generic "condition" mechanism.
384 // See https://github.com/rust-lang/rust/issues/26035
385 map_uppercase_sigma(rest
, i
, &mut s
)
387 match conversions
::to_lower(c
) {
388 [a
, '
\0'
, _
] => s
.push(a
),
403 fn map_uppercase_sigma(from
: &str, i
: usize, to
: &mut String
) {
404 // See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
405 // for the definition of `Final_Sigma`.
406 debug_assert
!('Σ'
.len_utf8() == 2);
407 let is_word_final
= case_ignoreable_then_cased(from
[..i
].chars().rev())
408 && !case_ignoreable_then_cased(from
[i
+ 2..].chars());
409 to
.push_str(if is_word_final { "ς" }
else { "σ" }
);
412 fn case_ignoreable_then_cased
<I
: Iterator
<Item
= char>>(iter
: I
) -> bool
{
413 use core
::unicode
::{Case_Ignorable, Cased}
;
414 match iter
.skip_while(|&c
| Case_Ignorable(c
)).next() {
421 /// Returns the uppercase equivalent of this string slice, as a new [`String`].
423 /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property
426 /// Since some characters can expand into multiple characters when changing
427 /// the case, this function returns a [`String`] instead of modifying the
428 /// parameter in-place.
437 /// assert_eq!("HELLO", s.to_uppercase());
440 /// Scripts without case are not changed:
443 /// let new_year = "农历新年";
445 /// assert_eq!(new_year, new_year.to_uppercase());
448 /// One character can become multiple:
450 /// let s = "tschüß";
452 /// assert_eq!("TSCHÜSS", s.to_uppercase());
454 #[cfg(not(no_global_oom_handling))]
455 #[rustc_allow_incoherent_impl]
456 #[must_use = "this returns the uppercase string as a new String, \
457 without modifying the original"]
458 #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
459 pub fn to_uppercase(&self) -> String
{
460 let out
= convert_while_ascii(self.as_bytes(), u8::to_ascii_uppercase
);
462 // Safety: we know this is a valid char boundary since
463 // out.len() is only progressed if ascii bytes are found
464 let rest
= unsafe { self.get_unchecked(out.len()..) }
;
466 // Safety: We have written only valid ASCII to our vec
467 let mut s
= unsafe { String::from_utf8_unchecked(out) }
;
469 for c
in rest
.chars() {
470 match conversions
::to_upper(c
) {
471 [a
, '
\0'
, _
] => s
.push(a
),
486 /// Converts a [`Box<str>`] into a [`String`] without copying or allocating.
493 /// let string = String::from("birthday gift");
494 /// let boxed_str = string.clone().into_boxed_str();
496 /// assert_eq!(boxed_str.into_string(), string);
498 #[stable(feature = "box_str", since = "1.4.0")]
499 #[rustc_allow_incoherent_impl]
500 #[must_use = "`self` will be dropped if the result is not used"]
502 pub fn into_string(self: Box
<str>) -> String
{
503 let slice
= Box
::<[u8]>::from(self);
504 unsafe { String::from_utf8_unchecked(slice.into_vec()) }
507 /// Creates a new [`String`] by repeating a string `n` times.
511 /// This function will panic if the capacity would overflow.
518 /// assert_eq!("abc".repeat(4), String::from("abcabcabcabc"));
521 /// A panic upon overflow:
524 /// // this will panic at runtime
525 /// let huge = "0123456789abcdef".repeat(usize::MAX);
527 #[cfg(not(no_global_oom_handling))]
528 #[rustc_allow_incoherent_impl]
530 #[stable(feature = "repeat_str", since = "1.16.0")]
531 pub fn repeat(&self, n
: usize) -> String
{
532 unsafe { String::from_utf8_unchecked(self.as_bytes().repeat(n)) }
535 /// Returns a copy of this string where each character is mapped to its
536 /// ASCII upper case equivalent.
538 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
539 /// but non-ASCII letters are unchanged.
541 /// To uppercase the value in-place, use [`make_ascii_uppercase`].
543 /// To uppercase ASCII characters in addition to non-ASCII characters, use
544 /// [`to_uppercase`].
549 /// let s = "Grüße, Jürgen ❤";
551 /// assert_eq!("GRüßE, JüRGEN ❤", s.to_ascii_uppercase());
554 /// [`make_ascii_uppercase`]: str::make_ascii_uppercase
555 /// [`to_uppercase`]: #method.to_uppercase
556 #[cfg(not(no_global_oom_handling))]
557 #[rustc_allow_incoherent_impl]
558 #[must_use = "to uppercase the value in-place, use `make_ascii_uppercase()`"]
559 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
561 pub fn to_ascii_uppercase(&self) -> String
{
562 let mut s
= self.to_owned();
563 s
.make_ascii_uppercase();
567 /// Returns a copy of this string where each character is mapped to its
568 /// ASCII lower case equivalent.
570 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
571 /// but non-ASCII letters are unchanged.
573 /// To lowercase the value in-place, use [`make_ascii_lowercase`].
575 /// To lowercase ASCII characters in addition to non-ASCII characters, use
576 /// [`to_lowercase`].
581 /// let s = "Grüße, Jürgen ❤";
583 /// assert_eq!("grüße, jürgen ❤", s.to_ascii_lowercase());
586 /// [`make_ascii_lowercase`]: str::make_ascii_lowercase
587 /// [`to_lowercase`]: #method.to_lowercase
588 #[cfg(not(no_global_oom_handling))]
589 #[rustc_allow_incoherent_impl]
590 #[must_use = "to lowercase the value in-place, use `make_ascii_lowercase()`"]
591 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
593 pub fn to_ascii_lowercase(&self) -> String
{
594 let mut s
= self.to_owned();
595 s
.make_ascii_lowercase();
600 /// Converts a boxed slice of bytes to a boxed string slice without checking
601 /// that the string contains valid UTF-8.
608 /// let smile_utf8 = Box::new([226, 152, 186]);
609 /// let smile = unsafe { std::str::from_boxed_utf8_unchecked(smile_utf8) };
611 /// assert_eq!("☺", &*smile);
613 #[stable(feature = "str_box_extras", since = "1.20.0")]
616 pub unsafe fn from_boxed_utf8_unchecked(v
: Box
<[u8]>) -> Box
<str> {
617 unsafe { Box::from_raw(Box::into_raw(v) as *mut str) }
620 /// Converts the bytes while the bytes are still ascii.
621 /// For better average performance, this happens in chunks of `2*size_of::<usize>()`.
622 /// Returns a vec with the converted bytes.
625 #[cfg(not(no_global_oom_handling))]
626 fn convert_while_ascii(b
: &[u8], convert
: fn(&u8) -> u8) -> Vec
<u8> {
627 let mut out
= Vec
::with_capacity(b
.len());
629 const USIZE_SIZE
: usize = mem
::size_of
::<usize>();
630 const MAGIC_UNROLL
: usize = 2;
631 const N
: usize = USIZE_SIZE
* MAGIC_UNROLL
;
632 const NONASCII_MASK
: usize = usize::from_ne_bytes([0x80; USIZE_SIZE
]);
636 while i
+ N
<= b
.len() {
637 // Safety: we have checked the sizes of `b` and `out` to know that our chunk ranges are in bounds
638 let in_chunk
= b
.get_unchecked(i
..i
+ N
);
639 let out_chunk
= out
.spare_capacity_mut().get_unchecked_mut(i
..i
+ N
);
642 for j
in 0..MAGIC_UNROLL
{
643 // read the bytes 1 usize at a time (unaligned since we haven't checked the alignment)
644 // safety: in_chunk is valid bytes in the range
645 bits
|= in_chunk
.as_ptr().cast
::<usize>().add(j
).read_unaligned();
647 // if our chunks aren't ascii, then return only the prior bytes as init
648 if bits
& NONASCII_MASK
!= 0 {
652 // perform the case conversions on N bytes (gets heavily autovec'd)
654 // safety: in_chunk and out_chunk are valid bytes in the range
655 let out
= out_chunk
.get_unchecked_mut(j
);
656 out
.write(convert(in_chunk
.get_unchecked(j
)));
659 // mark these bytes as initialised