//! This example shows how to parse an escaped string. The rules for the
//! string are similar to those of JSON and Rust. A string is:
//!
//! - Enclosed by double quotes
//! - Can contain any raw unescaped code point besides \ and "
//! - Matches the following escape sequences: \b, \f, \n, \r, \t, \", \\, \/
//! - Matches code points like Rust: \u{XXXX}, where XXXX can be up to 6
//!   hex digits
//! - An escape followed by whitespace consumes all whitespace between the
//!   escape and the next non-whitespace character
12 use winnow
::ascii
::multispace1
;
13 use winnow
::combinator
::alt
;
14 use winnow
::combinator
::fold_repeat
;
15 use winnow
::combinator
::{delimited, preceded}
;
16 use winnow
::error
::{FromExternalError, ParseError}
;
17 use winnow
::prelude
::*;
18 use winnow
::token
::{take_till1, take_while}
;
20 /// Parse a string. Use a loop of `parse_fragment` and push all of the fragments
21 /// into an output string.
22 pub fn parse_string
<'a
, E
>(input
: &'a
str) -> IResult
<&'a
str, String
, E
>
24 E
: ParseError
<&'a
str> + FromExternalError
<&'a
str, std
::num
::ParseIntError
>,
26 // fold_repeat is the equivalent of iterator::fold. It runs a parser in a loop,
27 // and for each output value, calls a folding function on each output value.
28 let build_string
= fold_repeat(
30 // Our parser function – parses a single string fragment
32 // Our init value, an empty string
34 // Our folding function. For each fragment, append the fragment to the
36 |mut string
, fragment
| {
38 StringFragment
::Literal(s
) => string
.push_str(s
),
39 StringFragment
::EscapedChar(c
) => string
.push(c
),
40 StringFragment
::EscapedWS
=> {}
46 // Finally, parse the string. Note that, if `build_string` could accept a raw
47 // " character, the closing delimiter " would never match. When using
48 // `delimited` with a looping parser (like fold_repeat), be sure that the
49 // loop won't accidentally match your closing delimiter!
50 delimited('
"', build_string, '"'
).parse_next(input
)
/// A string fragment contains a fragment of a string being parsed: either
/// a non-empty Literal (a series of non-escaped characters), a single
/// parsed escaped character, or a block of escaped whitespace.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StringFragment<'a> {
    /// A run of raw characters borrowed directly from the input.
    Literal(&'a str),
    /// The character produced by a single escape sequence.
    EscapedChar(char),
    /// A backslash followed by whitespace; adds nothing to the output.
    EscapedWS,
}
63 /// Combine `parse_literal`, `parse_escaped_whitespace`, and `parse_escaped_char`
64 /// into a `StringFragment`.
65 fn parse_fragment
<'a
, E
>(input
: &'a
str) -> IResult
<&'a
str, StringFragment
<'a
>, E
>
67 E
: ParseError
<&'a
str> + FromExternalError
<&'a
str, std
::num
::ParseIntError
>,
70 // The `map` combinator runs a parser, then applies a function to the output
72 parse_literal
.map(StringFragment
::Literal
),
73 parse_escaped_char
.map(StringFragment
::EscapedChar
),
74 parse_escaped_whitespace
.value(StringFragment
::EscapedWS
),
79 /// Parse a non-empty block of text that doesn't include \ or "
80 fn parse_literal
<'a
, E
: ParseError
<&'a
str>>(input
: &'a
str) -> IResult
<&'a
str, &'a
str, E
> {
81 // `take_till1` parses a string of 0 or more characters that aren't one of the
83 let not_quote_slash
= take_till1("\"\\");
85 // `verify` runs a parser, then runs a verification function on the output of
86 // the parser. The verification function accepts the output only if it
87 // returns true. In this case, we want to ensure that the output of take_till1
90 .verify(|s
: &str| !s
.is_empty())
// Parser combinators are constructed from the bottom up:
// first we write parsers for the smallest elements (escaped characters),
// then we combine them into larger parsers.
98 /// Parse an escaped character: \n, \t, \r, \u{00AC}, etc.
99 fn parse_escaped_char
<'a
, E
>(input
: &'a
str) -> IResult
<&'a
str, char, E
>
101 E
: ParseError
<&'a
str> + FromExternalError
<&'a
str, std
::num
::ParseIntError
>,
105 // `alt` tries each parser in sequence, returning the result of
106 // the first successful match
109 // The `value` parser returns a fixed value (the first argument) if its
110 // parser (the second argument) succeeds. In these cases, it looks for
111 // the marker characters (n, r, t, etc) and returns the matching
112 // character (\n, \r, \t, etc).
126 /// Parse a unicode sequence, of the form u{XXXX}, where XXXX is 1 to 6
127 /// hexadecimal numerals. We will combine this later with `parse_escaped_char`
128 /// to parse sequences like \u{00AC}.
129 fn parse_unicode
<'a
, E
>(input
: &'a
str) -> IResult
<&'a
str, char, E
>
131 E
: ParseError
<&'a
str> + FromExternalError
<&'a
str, std
::num
::ParseIntError
>,
133 // `take_while` parses between `m` and `n` bytes (inclusive) that match
134 // a predicate. `parse_hex` here parses between 1 and 6 hexadecimal numerals.
135 let parse_hex
= take_while(1..=6, |c
: char| c
.is_ascii_hexdigit());
137 // `preceded` takes a prefix parser, and if it succeeds, returns the result
138 // of the body parser. In this case, it parses u{XXXX}.
139 let parse_delimited_hex
= preceded(
141 // `delimited` is like `preceded`, but it parses both a prefix and a suffix.
142 // It returns the result of the middle parser. In this case, it parses
143 // {XXXX}, where XXXX is 1 to 6 hex numerals, and returns XXXX
144 delimited('{', parse_hex, '}'
),
147 // `try_map` takes the result of a parser and applies a function that returns
148 // a Result. In this case we take the hex bytes from parse_hex and attempt to
149 // convert them to a u32.
150 let parse_u32
= parse_delimited_hex
.try_map(move |hex
| u32::from_str_radix(hex
, 16));
152 // verify_map is like try_map, but it takes an Option instead of a Result. If
153 // the function returns None, verify_map returns an error. In this case, because
154 // not all u32 values are valid unicode code points, we have to fallibly
155 // convert to char with from_u32.
156 parse_u32
.verify_map(std
::char::from_u32
).parse_next(input
)
159 /// Parse a backslash, followed by any amount of whitespace. This is used later
160 /// to discard any escaped whitespace.
161 fn parse_escaped_whitespace
<'a
, E
: ParseError
<&'a
str>>(
163 ) -> IResult
<&'a
str, &'a
str, E
> {
164 preceded('
\\'
, multispace1
).parse_next(input
)