]> git.proxmox.com Git - rustc.git/blame - vendor/toml_edit/src/parser/strings.rs
New upstream version 1.75.0+dfsg1
[rustc.git] / vendor / toml_edit / src / parser / strings.rs
CommitLineData
0a29b90c
FG
1use std::borrow::Cow;
2use std::char;
3use std::ops::RangeInclusive;
4
fe692bf9 5use winnow::combinator::alt;
49aad941 6use winnow::combinator::cut_err;
fe692bf9 7use winnow::combinator::delimited;
49aad941
FG
8use winnow::combinator::fail;
9use winnow::combinator::opt;
10use winnow::combinator::peek;
fe692bf9
FG
11use winnow::combinator::preceded;
12use winnow::combinator::repeat;
49aad941 13use winnow::combinator::success;
fe692bf9 14use winnow::combinator::terminated;
49aad941 15use winnow::prelude::*;
add651ee 16use winnow::stream::Stream;
fe692bf9
FG
17use winnow::token::any;
18use winnow::token::none_of;
19use winnow::token::one_of;
20use winnow::token::tag;
21use winnow::token::take_while;
add651ee 22use winnow::trace::trace;
0a29b90c
FG
23
24use crate::parser::errors::CustomError;
25use crate::parser::numbers::HEXDIG;
26use crate::parser::prelude::*;
27use crate::parser::trivia::{from_utf8_unchecked, newline, ws, ws_newlines, NON_ASCII, WSCHAR};
28
29// ;; String
30
31// string = ml-basic-string / basic-string / ml-literal-string / literal-string
add651ee
FG
32pub(crate) fn string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
33 trace(
34 "string",
35 alt((
36 ml_basic_string,
37 basic_string,
38 ml_literal_string,
39 literal_string.map(Cow::Borrowed),
40 )),
41 )
49aad941 42 .parse_next(input)
0a29b90c
FG
43}
44
45// ;; Basic String
46
47// basic-string = quotation-mark *basic-char quotation-mark
add651ee
FG
48pub(crate) fn basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
49 trace("basic-string", |input: &mut Input<'i>| {
50 let _ = one_of(QUOTATION_MARK).parse_next(input)?;
0a29b90c 51
add651ee
FG
52 let mut c = Cow::Borrowed("");
53 if let Some(ci) = opt(basic_chars).parse_next(input)? {
54 c = ci;
55 }
56 while let Some(ci) = opt(basic_chars).parse_next(input)? {
57 c.to_mut().push_str(&ci);
58 }
0a29b90c 59
add651ee
FG
60 let _ = cut_err(one_of(QUOTATION_MARK))
61 .context(StrContext::Label("basic string"))
62 .parse_next(input)?;
0a29b90c 63
add651ee
FG
64 Ok(c)
65 })
66 .parse_next(input)
0a29b90c
FG
67}
68
// quotation-mark = %x22            ; "
/// Byte that opens and closes a basic string.
pub(crate) const QUOTATION_MARK: u8 = b'"';
71
72// basic-char = basic-unescaped / escaped
add651ee 73fn basic_chars<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
0a29b90c
FG
74 alt((
75 // Deviate from the official grammar by batching the unescaped chars so we build a string a
76 // chunk at a time, rather than a `char` at a time.
fe692bf9
FG
77 take_while(1.., BASIC_UNESCAPED)
78 .try_map(std::str::from_utf8)
0a29b90c
FG
79 .map(Cow::Borrowed),
80 escaped.map(|c| Cow::Owned(String::from(c))),
81 ))
49aad941 82 .parse_next(input)
0a29b90c
FG
83}
84
85// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
86pub(crate) const BASIC_UNESCAPED: (
87 (u8, u8),
88 u8,
89 RangeInclusive<u8>,
90 RangeInclusive<u8>,
91 RangeInclusive<u8>,
92) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
93
94// escaped = escape escape-seq-char
add651ee 95fn escaped(input: &mut Input<'_>) -> PResult<char> {
49aad941 96 preceded(ESCAPE, escape_seq_char).parse_next(input)
0a29b90c
FG
97}
98
// escape = %x5C                    ; \
/// Byte that introduces an escape sequence.
pub(crate) const ESCAPE: u8 = b'\\';
101
102// escape-seq-char = %x22 ; " quotation mark U+0022
103// escape-seq-char =/ %x5C ; \ reverse solidus U+005C
104// escape-seq-char =/ %x62 ; b backspace U+0008
105// escape-seq-char =/ %x66 ; f form feed U+000C
106// escape-seq-char =/ %x6E ; n line feed U+000A
107// escape-seq-char =/ %x72 ; r carriage return U+000D
108// escape-seq-char =/ %x74 ; t tab U+0009
109// escape-seq-char =/ %x75 4HEXDIG ; uXXXX U+XXXX
110// escape-seq-char =/ %x55 8HEXDIG ; UXXXXXXXX U+XXXXXXXX
add651ee 111fn escape_seq_char(input: &mut Input<'_>) -> PResult<char> {
0a29b90c
FG
112 dispatch! {any;
113 b'b' => success('\u{8}'),
114 b'f' => success('\u{c}'),
115 b'n' => success('\n'),
116 b'r' => success('\r'),
117 b't' => success('\t'),
add651ee
FG
118 b'u' => cut_err(hexescape::<4>).context(StrContext::Label("unicode 4-digit hex code")),
119 b'U' => cut_err(hexescape::<8>).context(StrContext::Label("unicode 8-digit hex code")),
0a29b90c
FG
120 b'\\' => success('\\'),
121 b'"' => success('"'),
122 _ => {
49aad941 123 cut_err(fail::<_, char, _>)
add651ee
FG
124 .context(StrContext::Label("escape sequence"))
125 .context(StrContext::Expected(StrContextValue::CharLiteral('b')))
126 .context(StrContext::Expected(StrContextValue::CharLiteral('f')))
127 .context(StrContext::Expected(StrContextValue::CharLiteral('n')))
128 .context(StrContext::Expected(StrContextValue::CharLiteral('r')))
129 .context(StrContext::Expected(StrContextValue::CharLiteral('t')))
130 .context(StrContext::Expected(StrContextValue::CharLiteral('u')))
131 .context(StrContext::Expected(StrContextValue::CharLiteral('U')))
132 .context(StrContext::Expected(StrContextValue::CharLiteral('\\')))
133 .context(StrContext::Expected(StrContextValue::CharLiteral('"')))
0a29b90c
FG
134 }
135 }
49aad941 136 .parse_next(input)
0a29b90c
FG
137}
138
add651ee 139pub(crate) fn hexescape<const N: usize>(input: &mut Input<'_>) -> PResult<char> {
fe692bf9 140 take_while(0..=N, HEXDIG)
0a29b90c
FG
141 .verify(|b: &[u8]| b.len() == N)
142 .map(|b: &[u8]| unsafe { from_utf8_unchecked(b, "`is_ascii_digit` filters out on-ASCII") })
49aad941 143 .verify_map(|s| u32::from_str_radix(s, 16).ok())
fe692bf9 144 .try_map(|h| char::from_u32(h).ok_or(CustomError::OutOfRange))
49aad941 145 .parse_next(input)
0a29b90c
FG
146}
147
148// ;; Multiline Basic String
149
150// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
151// ml-basic-string-delim
add651ee
FG
152fn ml_basic_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
153 trace(
154 "ml-basic-string",
155 delimited(
156 ML_BASIC_STRING_DELIM,
157 preceded(opt(newline), cut_err(ml_basic_body)),
158 cut_err(ML_BASIC_STRING_DELIM),
159 )
160 .context(StrContext::Label("multiline basic string")),
0a29b90c 161 )
49aad941 162 .parse_next(input)
0a29b90c
FG
163}
164
// ml-basic-string-delim = 3quotation-mark
/// Three quotation marks: the opening and closing delimiter of a multiline basic string.
pub(crate) const ML_BASIC_STRING_DELIM: &[u8] = b"\"\"\"";
167
168// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
add651ee 169fn ml_basic_body<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
0a29b90c 170 let mut c = Cow::Borrowed("");
add651ee 171 if let Some(ci) = opt(mlb_content).parse_next(input)? {
0a29b90c
FG
172 c = ci;
173 }
add651ee 174 while let Some(ci) = opt(mlb_content).parse_next(input)? {
0a29b90c
FG
175 c.to_mut().push_str(&ci);
176 }
177
add651ee
FG
178 while let Some(qi) = opt(mlb_quotes(none_of(b'\"').value(()))).parse_next(input)? {
179 if let Some(ci) = opt(mlb_content).parse_next(input)? {
0a29b90c
FG
180 c.to_mut().push_str(qi);
181 c.to_mut().push_str(&ci);
add651ee 182 while let Some(ci) = opt(mlb_content).parse_next(input)? {
0a29b90c
FG
183 c.to_mut().push_str(&ci);
184 }
185 } else {
186 break;
187 }
188 }
189
add651ee 190 if let Some(qi) = opt(mlb_quotes(tag(ML_BASIC_STRING_DELIM).value(()))).parse_next(input)? {
0a29b90c
FG
191 c.to_mut().push_str(qi);
192 }
193
add651ee 194 Ok(c)
0a29b90c
FG
195}
196
197// mlb-content = mlb-char / newline / mlb-escaped-nl
198// mlb-char = mlb-unescaped / escaped
add651ee 199fn mlb_content<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
0a29b90c
FG
200 alt((
201 // Deviate from the official grammar by batching the unescaped chars so we build a string a
202 // chunk at a time, rather than a `char` at a time.
fe692bf9
FG
203 take_while(1.., MLB_UNESCAPED)
204 .try_map(std::str::from_utf8)
0a29b90c 205 .map(Cow::Borrowed),
49aad941 206 // Order changed fromg grammar so `escaped` can more easily `cut_err` on bad escape sequences
0a29b90c
FG
207 mlb_escaped_nl.map(|_| Cow::Borrowed("")),
208 escaped.map(|c| Cow::Owned(String::from(c))),
209 newline.map(|_| Cow::Borrowed("\n")),
210 ))
49aad941 211 .parse_next(input)
0a29b90c
FG
212}
213
214// mlb-quotes = 1*2quotation-mark
215fn mlb_quotes<'i>(
add651ee
FG
216 mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
217) -> impl Parser<Input<'i>, &'i str, ContextError> {
218 move |input: &mut Input<'i>| {
219 let start = input.checkpoint();
0a29b90c
FG
220 let res = terminated(b"\"\"", peek(term.by_ref()))
221 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
49aad941 222 .parse_next(input);
0a29b90c
FG
223
224 match res {
add651ee
FG
225 Err(winnow::error::ErrMode::Backtrack(_)) => {
226 input.reset(start);
227 terminated(b"\"", peek(term.by_ref()))
228 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
229 .parse_next(input)
230 }
0a29b90c
FG
231 res => res,
232 }
233 }
234}
235
236// mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
237pub(crate) const MLB_UNESCAPED: (
238 (u8, u8),
239 u8,
240 RangeInclusive<u8>,
241 RangeInclusive<u8>,
242 RangeInclusive<u8>,
243) = (WSCHAR, 0x21, 0x23..=0x5B, 0x5D..=0x7E, NON_ASCII);
244
245// mlb-escaped-nl = escape ws newline *( wschar / newline
246// When the last non-whitespace character on a line is a \,
247// it will be trimmed along with all whitespace
248// (including newlines) up to the next non-whitespace
249// character or closing delimiter.
add651ee 250fn mlb_escaped_nl(input: &mut Input<'_>) -> PResult<()> {
fe692bf9 251 repeat(1.., (ESCAPE, ws, ws_newlines))
49aad941 252 .map(|()| ())
0a29b90c 253 .value(())
49aad941 254 .parse_next(input)
0a29b90c
FG
255}
256
257// ;; Literal String
258
259// literal-string = apostrophe *literal-char apostrophe
add651ee
FG
260pub(crate) fn literal_string<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
261 trace(
262 "literal-string",
263 delimited(
264 APOSTROPHE,
265 cut_err(take_while(0.., LITERAL_CHAR)),
266 cut_err(APOSTROPHE),
267 )
268 .try_map(std::str::from_utf8)
269 .context(StrContext::Label("literal string")),
49aad941 270 )
49aad941 271 .parse_next(input)
0a29b90c
FG
272}
273
// apostrophe = %x27 ; ' apostrophe
/// Byte that opens and closes a literal string.
pub(crate) const APOSTROPHE: u8 = b'\'';
276
277// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
278pub(crate) const LITERAL_CHAR: (
279 u8,
280 RangeInclusive<u8>,
281 RangeInclusive<u8>,
282 RangeInclusive<u8>,
283) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
284
285// ;; Multiline Literal String
286
287// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
288// ml-literal-string-delim
add651ee
FG
289fn ml_literal_string<'i>(input: &mut Input<'i>) -> PResult<Cow<'i, str>> {
290 trace(
291 "ml-literal-string",
292 delimited(
293 (ML_LITERAL_STRING_DELIM, opt(newline)),
294 cut_err(ml_literal_body.map(|t| {
295 if t.contains("\r\n") {
296 Cow::Owned(t.replace("\r\n", "\n"))
297 } else {
298 Cow::Borrowed(t)
299 }
300 })),
301 cut_err(ML_LITERAL_STRING_DELIM),
302 )
303 .context(StrContext::Label("multiline literal string")),
0a29b90c 304 )
49aad941 305 .parse_next(input)
0a29b90c
FG
306}
307
// ml-literal-string-delim = 3apostrophe
/// Three apostrophes: the opening and closing delimiter of a multiline literal string.
pub(crate) const ML_LITERAL_STRING_DELIM: &[u8] = b"'''";
310
311// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
add651ee 312fn ml_literal_body<'i>(input: &mut Input<'i>) -> PResult<&'i str> {
0a29b90c 313 (
fe692bf9
FG
314 repeat(0.., mll_content).map(|()| ()),
315 repeat(
316 0..,
317 (
318 mll_quotes(none_of(APOSTROPHE).value(())),
319 repeat(1.., mll_content).map(|()| ()),
320 ),
321 )
49aad941 322 .map(|()| ()),
0a29b90c
FG
323 opt(mll_quotes(tag(ML_LITERAL_STRING_DELIM).value(()))),
324 )
325 .recognize()
fe692bf9 326 .try_map(std::str::from_utf8)
49aad941 327 .parse_next(input)
0a29b90c
FG
328}
329
330// mll-content = mll-char / newline
add651ee 331fn mll_content(input: &mut Input<'_>) -> PResult<u8> {
49aad941 332 alt((one_of(MLL_CHAR), newline)).parse_next(input)
0a29b90c
FG
333}
334
335// mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
336const MLL_CHAR: (
337 u8,
338 RangeInclusive<u8>,
339 RangeInclusive<u8>,
340 RangeInclusive<u8>,
341) = (0x9, 0x20..=0x26, 0x28..=0x7E, NON_ASCII);
342
343// mll-quotes = 1*2apostrophe
344fn mll_quotes<'i>(
add651ee
FG
345 mut term: impl winnow::Parser<Input<'i>, (), ContextError>,
346) -> impl Parser<Input<'i>, &'i str, ContextError> {
347 move |input: &mut Input<'i>| {
348 let start = input.checkpoint();
0a29b90c
FG
349 let res = terminated(b"''", peek(term.by_ref()))
350 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
49aad941 351 .parse_next(input);
0a29b90c
FG
352
353 match res {
add651ee
FG
354 Err(winnow::error::ErrMode::Backtrack(_)) => {
355 input.reset(start);
356 terminated(b"'", peek(term.by_ref()))
357 .map(|b| unsafe { from_utf8_unchecked(b, "`bytes` out non-ASCII") })
358 .parse_next(input)
359 }
0a29b90c
FG
360 res => res,
361 }
362 }
363}
364
// Unit tests for the string parsers. Every case goes through the top-level `string` parser.
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn basic_string() {
        let input =
            r#""I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF. \U0002070E""#;
        let expected = "I\'m a string. \"You can quote me\". Name\tJosé\nLocation\tSF. \u{2070E}";
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }

    #[test]
    fn ml_basic_string() {
        let cases = [
            (
                r#""""
Roses are red
Violets are blue""""#,
                r#"Roses are red
Violets are blue"#,
            ),
            (r#"""" \""" """"#, " \"\"\" "),
            (r#"""" \\""""#, " \\"),
        ];

        for &(input, expected) in &cases {
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        // Both inputs lack a complete closing delimiter.
        let invalid_cases = [r#"""" """#, r#"""" \""""#];

        for input in &invalid_cases {
            let parsed = string.parse(new_input(input));
            assert!(parsed.is_err());
        }
    }

    #[test]
    fn ml_basic_string_escape_ws() {
        // The whitespace that follows each line-ending backslash is trimmed by the parser,
        // so the indentation of the continuation lines does not affect the expected value.
        let inputs = [
            r#""""
The quick brown \


  fox jumps over \
    the lazy dog.""""#,
            r#""""\
       The quick brown \
       fox jumps over \
       the lazy dog.\
       """"#,
        ];
        for input in &inputs {
            let expected = "The quick brown fox jumps over the lazy dog.";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
        let empties = [
            r#""""\
       """"#,
            r#""""
\
  \
""""#,
        ];
        for input in &empties {
            let expected = "";
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    #[test]
    fn literal_string() {
        let inputs = [
            r"'C:\Users\nodejs\templates'",
            r"'\\ServerX\admin$\system32\'",
            r#"'Tom "Dubs" Preston-Werner'"#,
            r"'<\i\c*\s*>'",
        ];

        for input in &inputs {
            // Expected value is the input without its surrounding apostrophes.
            let expected = &input[1..input.len() - 1];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }
    }

    #[test]
    fn ml_literal_string() {
        let inputs = [
            r"'''I [dw]on't need \d{2} apples'''",
            r#"''''one_quote''''"#,
        ];
        for input in &inputs {
            // Expected value is the input without its three-apostrophe delimiters.
            let expected = &input[3..input.len() - 3];
            let parsed = string.parse(new_input(input));
            assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
        }

        let input = r#"'''
The first newline is
trimmed in raw strings.
   All other whitespace
   is preserved.
'''"#;
        // Skip the opening delimiter plus the newline right after it (4 bytes).
        let expected = &input[4..input.len() - 3];
        let parsed = string.parse(new_input(input));
        assert_eq!(parsed.as_deref(), Ok(expected), "Parsing {input:?}");
    }
}
478}