]>
Commit | Line | Data |
---|---|---|
7cac9316 XL |
1 | use std::borrow::Cow; |
2 | use std::char; | |
3 | use std::str; | |
4 | use std::string; | |
5 | ||
6 | use self::Token::*; | |
7 | ||
/// A single lexical token produced by the tokenizer.
///
/// Variants that carry text borrow it from the tokenizer's input.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs, exactly as written.
    Whitespace(&'a str),
    /// A line ending (`\n`, or `\r\n` folded into one token).
    Newline,
    /// A comment, starting at its `#`.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII letters, digits, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string (basic or literal, single- or multi-line).
    String {
        /// The source text of the string, delimiters included.
        src: &'a str,
        /// The decoded contents; borrowed when no rewriting was needed.
        val: Cow<'a, str>,
    },
}
27 | ||
/// Errors reported while tokenizing.
///
/// Every `usize` payload is a byte offset into the tokenizer's input.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// A character that is not permitted inside a string.
    InvalidCharInString(usize, char),
    /// A backslash followed by a character that is not a known escape.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u` / `\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u` / `\U` escape whose numeric value is not a valid `char`.
    InvalidEscapeValue(usize, u32),
    /// A line ending inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// End of input reached before a string was closed; the offset is where
    /// the string started.
    UnterminatedString(usize),
    /// A quoted table key whose source text contains a newline.
    NewlineInTableKey(usize),
    /// A quoted table key whose value is empty.
    EmptyTableKey(usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset at which the required token was looked for.
        at: usize,
        /// Description of what was required.
        expected: &'static str,
        /// Description of what was found instead.
        found: &'static str,
    },
}
41 | ||
42 | #[derive(Clone)] | |
43 | pub struct Tokenizer<'a> { | |
44 | input: &'a str, | |
45 | chars: CrlfFold<'a>, | |
46 | } | |
47 | ||
/// Character iterator wrapper used by the tokenizer so that a `\r\n` pair is
/// seen as a single `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator over the input.
    chars: str::CharIndices<'a>,
}
52 | ||
/// The value of a string while it is being read.
///
/// The value stays a plain slice of the input for as long as possible and is
/// copied into a buffer only once it has to differ from the source text.
#[derive(Debug)]
enum MaybeString {
    /// No copy made yet; holds the byte offset at which the value starts.
    NotEscaped(usize),
    /// The value accumulated so far in an owned buffer.
    Owned(string::String),
}
58 | ||
59 | impl<'a> Tokenizer<'a> { | |
60 | pub fn new(input: &'a str) -> Tokenizer<'a> { | |
61 | let mut t = Tokenizer { | |
62 | input: input, | |
63 | chars: CrlfFold { | |
64 | chars: input.char_indices(), | |
65 | }, | |
66 | }; | |
67 | // Eat utf-8 BOM | |
68 | t.eatc('\u{feff}'); | |
69 | return t | |
70 | } | |
71 | ||
72 | pub fn next(&mut self) -> Result<Option<Token<'a>>, Error> { | |
73 | let token = match self.chars.next() { | |
74 | Some((_, '\n')) => Newline, | |
75 | Some((start, ' ')) => self.whitespace_token(start), | |
76 | Some((start, '\t')) => self.whitespace_token(start), | |
77 | Some((start, '#')) => self.comment_token(start), | |
78 | Some((_, '=')) => Equals, | |
79 | Some((_, '.')) => Period, | |
80 | Some((_, ',')) => Comma, | |
81 | Some((_, ':')) => Colon, | |
82 | Some((_, '+')) => Plus, | |
83 | Some((_, '{')) => LeftBrace, | |
84 | Some((_, '}')) => RightBrace, | |
85 | Some((_, '[')) => LeftBracket, | |
86 | Some((_, ']')) => RightBracket, | |
87 | Some((start, '\'')) => return self.literal_string(start).map(Some), | |
88 | Some((start, '"')) => return self.basic_string(start).map(Some), | |
89 | Some((start, ch)) if is_keylike(ch) => self.keylike(start), | |
90 | ||
91 | Some((start, ch)) => return Err(Error::Unexpected(start, ch)), | |
92 | None => return Ok(None), | |
93 | }; | |
94 | Ok(Some(token)) | |
95 | } | |
96 | ||
97 | pub fn peek(&mut self) -> Result<Option<Token<'a>>, Error> { | |
98 | self.clone().next() | |
99 | } | |
100 | ||
101 | pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> { | |
102 | match self.peek()? { | |
103 | Some(ref found) if expected == *found => {} | |
104 | Some(_) => return Ok(false), | |
105 | None => return Ok(false), | |
106 | } | |
107 | drop(self.next()); | |
108 | Ok(true) | |
109 | } | |
110 | ||
111 | pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> { | |
112 | let current = self.current(); | |
113 | match self.next()? { | |
114 | Some(found) => { | |
115 | if expected == found { | |
116 | Ok(()) | |
117 | } else { | |
118 | Err(Error::Wanted { | |
119 | at: current, | |
120 | expected: expected.describe(), | |
121 | found: found.describe(), | |
122 | }) | |
123 | } | |
124 | } | |
125 | None => { | |
126 | Err(Error::Wanted { | |
127 | at: self.input.len(), | |
128 | expected: expected.describe(), | |
129 | found: "eof", | |
130 | }) | |
131 | } | |
132 | } | |
133 | } | |
134 | ||
135 | pub fn table_key(&mut self) -> Result<Cow<'a, str>, Error> { | |
136 | let current = self.current(); | |
137 | match self.next()? { | |
138 | Some(Token::Keylike(k)) => Ok(k.into()), | |
139 | Some(Token::String { src, val }) => { | |
140 | let offset = self.substr_offset(src); | |
141 | if val == "" { | |
142 | return Err(Error::EmptyTableKey(offset)) | |
143 | } | |
144 | match src.find("\n") { | |
145 | None => Ok(val), | |
146 | Some(i) => Err(Error::NewlineInTableKey(offset + i)), | |
147 | } | |
148 | } | |
149 | Some(other) => { | |
150 | Err(Error::Wanted { | |
151 | at: current, | |
152 | expected: "a table key", | |
153 | found: other.describe(), | |
154 | }) | |
155 | } | |
156 | None => { | |
157 | Err(Error::Wanted { | |
158 | at: self.input.len(), | |
159 | expected: "a table key", | |
160 | found: "eof", | |
161 | }) | |
162 | } | |
163 | } | |
164 | } | |
165 | ||
166 | pub fn eat_whitespace(&mut self) -> Result<(), Error> { | |
167 | while self.eatc(' ') || self.eatc('\t') { | |
168 | // ... | |
169 | } | |
170 | Ok(()) | |
171 | } | |
172 | ||
173 | pub fn eat_comment(&mut self) -> Result<bool, Error> { | |
174 | if !self.eatc('#') { | |
175 | return Ok(false) | |
176 | } | |
177 | drop(self.comment_token(0)); | |
178 | self.eat_newline_or_eof().map(|()| true) | |
179 | } | |
180 | ||
181 | pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> { | |
182 | let current = self.current(); | |
183 | match self.next()? { | |
184 | None | | |
185 | Some(Token::Newline) => Ok(()), | |
186 | Some(other) => { | |
187 | Err(Error::Wanted { | |
188 | at: current, | |
189 | expected: "newline", | |
190 | found: other.describe(), | |
191 | }) | |
192 | } | |
193 | } | |
194 | } | |
195 | ||
196 | pub fn skip_to_newline(&mut self) { | |
197 | loop { | |
198 | match self.chars.next() { | |
199 | Some((_, '\n')) | | |
200 | None => break, | |
201 | _ => {} | |
202 | } | |
203 | } | |
204 | } | |
205 | ||
206 | fn eatc(&mut self, ch: char) -> bool { | |
207 | match self.chars.clone().next() { | |
208 | Some((_, ch2)) if ch == ch2 => { | |
209 | self.chars.next(); | |
210 | true | |
211 | } | |
212 | _ => false, | |
213 | } | |
214 | } | |
215 | ||
216 | pub fn current(&mut self) -> usize { | |
217 | self.chars.clone().next().map(|i| i.0).unwrap_or(self.input.len()) | |
218 | } | |
219 | ||
220 | pub fn input(&self) -> &'a str { | |
221 | self.input | |
222 | } | |
223 | ||
224 | fn whitespace_token(&mut self, start: usize) -> Token<'a> { | |
225 | while self.eatc(' ') || self.eatc('\t') { | |
226 | // ... | |
227 | } | |
228 | Whitespace(&self.input[start..self.current()]) | |
229 | } | |
230 | ||
231 | fn comment_token(&mut self, start: usize) -> Token<'a> { | |
232 | while let Some((_, ch)) = self.chars.clone().next() { | |
233 | if ch != '\t' && (ch < '\u{20}' || ch > '\u{10ffff}') { | |
234 | break | |
235 | } | |
236 | self.chars.next(); | |
237 | } | |
238 | Comment(&self.input[start..self.current()]) | |
239 | } | |
240 | ||
241 | fn read_string(&mut self, | |
242 | delim: char, | |
243 | start: usize, | |
244 | new_ch: &mut FnMut(&mut Tokenizer, &mut MaybeString, | |
245 | bool, usize, char) | |
246 | -> Result<(), Error>) | |
247 | -> Result<Token<'a>, Error> { | |
248 | let mut multiline = false; | |
249 | if self.eatc(delim) { | |
250 | if self.eatc(delim) { | |
251 | multiline = true; | |
252 | } else { | |
253 | return Ok(String { | |
254 | src: &self.input[start..start+2], | |
255 | val: Cow::Borrowed(""), | |
256 | }) | |
257 | } | |
258 | } | |
259 | let mut val = MaybeString::NotEscaped(self.current()); | |
260 | let mut n = 0; | |
261 | 'outer: loop { | |
262 | n += 1; | |
263 | match self.chars.next() { | |
264 | Some((i, '\n')) => { | |
265 | if multiline { | |
266 | if self.input.as_bytes()[i] == b'\r' { | |
267 | val.to_owned(&self.input[..i]); | |
268 | } | |
269 | if n == 1 { | |
270 | val = MaybeString::NotEscaped(self.current()); | |
271 | } else { | |
272 | val.push('\n'); | |
273 | } | |
274 | continue | |
275 | } else { | |
276 | return Err(Error::NewlineInString(i)) | |
277 | } | |
278 | } | |
279 | Some((i, ch)) if ch == delim => { | |
280 | if multiline { | |
281 | for _ in 0..2 { | |
282 | if !self.eatc(delim) { | |
283 | val.push(delim); | |
284 | continue 'outer | |
285 | } | |
286 | } | |
287 | } | |
288 | return Ok(String { | |
289 | src: &self.input[start..self.current()], | |
290 | val: val.into_cow(&self.input[..i]), | |
291 | }) | |
292 | } | |
293 | Some((i, c)) => try!(new_ch(self, &mut val, multiline, i, c)), | |
294 | None => return Err(Error::UnterminatedString(start)) | |
295 | } | |
296 | } | |
297 | } | |
298 | ||
299 | fn literal_string(&mut self, start: usize) -> Result<Token<'a>, Error> { | |
300 | self.read_string('\'', start, &mut |_me, val, _multi, i, ch| { | |
301 | if ch == '\u{09}' || ('\u{20}' <= ch && ch <= '\u{10ffff}') { | |
302 | val.push(ch); | |
303 | Ok(()) | |
304 | } else { | |
305 | Err(Error::InvalidCharInString(i, ch)) | |
306 | } | |
307 | }) | |
308 | } | |
309 | ||
310 | fn basic_string(&mut self, start: usize) -> Result<Token<'a>, Error> { | |
311 | self.read_string('"', start, &mut |me, val, multi, i, ch| { | |
312 | match ch { | |
313 | '\\' => { | |
314 | val.to_owned(&me.input[..i]); | |
315 | match me.chars.next() { | |
316 | Some((_, '"')) => val.push('"'), | |
317 | Some((_, '\\')) => val.push('\\'), | |
318 | Some((_, 'b')) => val.push('\u{8}'), | |
319 | Some((_, 'f')) => val.push('\u{c}'), | |
320 | Some((_, 'n')) => val.push('\n'), | |
321 | Some((_, 'r')) => val.push('\r'), | |
322 | Some((_, 't')) => val.push('\t'), | |
323 | Some((i, c @ 'u')) | | |
324 | Some((i, c @ 'U')) => { | |
325 | let len = if c == 'u' {4} else {8}; | |
326 | val.push(try!(me.hex(start, i, len))); | |
327 | } | |
328 | Some((_, '\n')) if multi => { | |
329 | while let Some((_, ch)) = me.chars.clone().next() { | |
330 | match ch { | |
331 | ' ' | '\t' | '\n' => { | |
332 | me.chars.next(); | |
333 | } | |
334 | _ => break, | |
335 | } | |
336 | } | |
337 | } | |
338 | Some((i, c)) => return Err(Error::InvalidEscape(i, c)), | |
339 | None => return Err(Error::UnterminatedString(start)), | |
340 | } | |
341 | Ok(()) | |
342 | } | |
343 | ch if '\u{20}' <= ch && ch <= '\u{10ffff}' => { | |
344 | val.push(ch); | |
345 | Ok(()) | |
346 | } | |
347 | _ => Err(Error::InvalidCharInString(i, ch)) | |
348 | } | |
349 | }) | |
350 | } | |
351 | ||
352 | fn hex(&mut self, start: usize, i: usize, len: usize) -> Result<char, Error> { | |
353 | let mut val = 0; | |
354 | for _ in 0..len { | |
355 | match self.chars.next() { | |
356 | Some((_, ch)) if '0' <= ch && ch <= '9' => { | |
357 | val = val * 16 + (ch as u32 - '0' as u32); | |
358 | } | |
359 | Some((_, ch)) if 'A' <= ch && ch <= 'F' => { | |
360 | val = val * 16 + (ch as u32 - 'A' as u32) + 10; | |
361 | } | |
362 | Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)), | |
363 | None => return Err(Error::UnterminatedString(start)), | |
364 | } | |
365 | } | |
366 | match char::from_u32(val) { | |
367 | Some(ch) => Ok(ch), | |
368 | None => Err(Error::InvalidEscapeValue(i, val)), | |
369 | } | |
370 | } | |
371 | ||
372 | fn keylike(&mut self, start: usize) -> Token<'a> { | |
373 | while let Some((_, ch)) = self.chars.clone().next() { | |
374 | if !is_keylike(ch) { | |
375 | break | |
376 | } | |
377 | self.chars.next(); | |
378 | } | |
379 | Keylike(&self.input[start..self.current()]) | |
380 | } | |
381 | ||
382 | pub fn substr_offset(&self, s: &'a str) -> usize { | |
383 | assert!(s.len() <= self.input.len()); | |
384 | let a = self.input.as_ptr() as usize; | |
385 | let b = s.as_ptr() as usize; | |
386 | assert!(a <= b); | |
387 | b - a | |
388 | } | |
389 | } | |
390 | ||
391 | impl<'a> Iterator for CrlfFold<'a> { | |
392 | type Item = (usize, char); | |
393 | ||
394 | fn next(&mut self) -> Option<(usize, char)> { | |
395 | self.chars.next().map(|(i, c)| { | |
396 | if c == '\r' { | |
397 | let mut attempt = self.chars.clone(); | |
398 | if let Some((_, '\n')) = attempt.next() { | |
399 | self.chars = attempt; | |
400 | return (i, '\n') | |
401 | } | |
402 | } | |
403 | (i, c) | |
404 | }) | |
405 | } | |
406 | } | |
407 | ||
408 | impl MaybeString { | |
409 | fn push(&mut self, ch: char) { | |
410 | match *self { | |
411 | MaybeString::NotEscaped(..) => {} | |
412 | MaybeString::Owned(ref mut s) => s.push(ch), | |
413 | } | |
414 | } | |
415 | ||
416 | fn to_owned(&mut self, input: &str) { | |
417 | match *self { | |
418 | MaybeString::NotEscaped(start) => { | |
419 | *self = MaybeString::Owned(input[start..].to_owned()); | |
420 | } | |
421 | MaybeString::Owned(..) => {} | |
422 | } | |
423 | } | |
424 | ||
425 | fn into_cow<'a>(self, input: &'a str) -> Cow<'a, str> { | |
426 | match self { | |
427 | MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]), | |
428 | MaybeString::Owned(s) => Cow::Owned(s), | |
429 | } | |
430 | } | |
431 | } | |
432 | ||
/// Reports whether `ch` may appear in a bare key: an ASCII letter, an ASCII
/// digit, `-` or `_`.
fn is_keylike(ch: char) -> bool {
    match ch {
        'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' => true,
        _ => false,
    }
}
440 | ||
441 | impl<'a> Token<'a> { | |
442 | pub fn describe(&self) -> &'static str { | |
443 | match *self { | |
444 | Token::Keylike(_) => "an identifier", | |
445 | Token::Equals => "an equals", | |
446 | Token::Period => "a period", | |
447 | Token::Comment(_) => "a comment", | |
448 | Token::Newline => "a newline", | |
449 | Token::Whitespace(_) => "whitespace", | |
450 | Token::Comma => "a comma", | |
451 | Token::RightBrace => "a right brace", | |
452 | Token::LeftBrace => "a left brace", | |
453 | Token::RightBracket => "a right bracket", | |
454 | Token::LeftBracket => "a left bracket", | |
455 | Token::String { .. } => "a string", | |
456 | Token::Colon => "a colon", | |
457 | Token::Plus => "a plus", | |
458 | } | |
459 | } | |
460 | } | |
461 | ||
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use super::{Tokenizer, Token, Error};

    /// Asserts that the first token of `input` fails with `err` and that the
    /// tokenizer then reports end of input.
    fn err(input: &str, err: Error) {
        let mut tokenizer = Tokenizer::new(input);
        assert_eq!(tokenizer.next().unwrap_err(), err);
        assert!(tokenizer.next().unwrap().is_none());
    }

    /// Asserts that `input` is exactly one string token spanning the whole
    /// input and decoding to `val`.
    fn single_string(input: &str, val: &str) {
        let mut tokenizer = Tokenizer::new(input);
        let token = tokenizer.next().unwrap().unwrap();
        assert_eq!(token, Token::String {
            src: input,
            val: Cow::Borrowed(val),
        });
        assert!(tokenizer.next().unwrap().is_none());
    }

    #[test]
    fn literal_strings() {
        let cases: &[(&str, &str)] = &[
            ("''", ""),
            ("''''''", ""),
            ("'''\n'''", ""),
            ("'a'", "a"),
            ("'\"a'", "\"a"),
            ("''''a'''", "'a"),
            ("'''\n'a\n'''", "'a\n"),
            ("'''a\n'a\r\n'''", "a\n'a\n"),
        ];
        for &(input, val) in cases {
            single_string(input, val);
        }
    }

    #[test]
    fn basic_strings() {
        let accepted: &[(&str, &str)] = &[
            (r#""""#, ""),
            (r#""""""""#, ""),
            (r#""a""#, "a"),
            (r#""""a""""#, "a"),
            (r#""\t""#, "\t"),
            (r#""\u0000""#, "\0"),
            (r#""\U00000000""#, "\0"),
            (r#""\U000A0000""#, "\u{A0000}"),
            (r#""\\t""#, "\\t"),
            ("\"\"\"\\\n\"\"\"", ""),
            ("\"\"\"\\\n \t \t \\\r\n \t \n \t \r\n\"\"\"", ""),
            (r#""\r""#, "\r"),
            (r#""\n""#, "\n"),
            (r#""\b""#, "\u{8}"),
            (r#""a\fa""#, "a\u{c}a"),
            (r#""\"a""#, "\"a"),
            ("\"\"\"\na\"\"\"", "a"),
            ("\"\"\"\n\"\"\"", ""),
        ];
        for &(input, val) in accepted {
            single_string(input, val);
        }

        let rejected: Vec<(&str, Error)> = vec![
            (r#""\a"#, Error::InvalidEscape(2, 'a')),
            ("\"\\\n", Error::InvalidEscape(2, '\n')),
            ("\"\\\r\n", Error::InvalidEscape(2, '\n')),
            ("\"\\", Error::UnterminatedString(0)),
            ("\"\u{0}", Error::InvalidCharInString(1, '\u{0}')),
            (r#""\U00""#, Error::InvalidHexEscape(5, '"')),
            (r#""\U00"#, Error::UnterminatedString(0)),
            (r#""\uD800"#, Error::InvalidEscapeValue(2, 0xd800)),
            (r#""\UFFFFFFFF"#, Error::InvalidEscapeValue(2, 0xffffffff)),
        ];
        for (input, expected) in rejected {
            err(input, expected);
        }
    }

    #[test]
    fn keylike() {
        let inputs = ["foo", "0bar", "bar0", "1234", "a-b", "a_B", "-_-", "___"];
        for &input in inputs.iter() {
            let mut tokenizer = Tokenizer::new(input);
            let token = tokenizer.next().unwrap().unwrap();
            assert_eq!(token, Token::Keylike(input));
            assert!(tokenizer.next().unwrap().is_none());
        }
    }

    #[test]
    fn all() {
        /// Tokenizes all of `input` and compares the result with `expected`.
        fn check(input: &str, expected: &[Token]) {
            let mut tokenizer = Tokenizer::new(input);
            let mut actual = Vec::new();
            loop {
                match tokenizer.next().unwrap() {
                    Some(token) => actual.push(token),
                    None => break,
                }
            }
            // Compare pairwise first so a mismatch names the offending token.
            for (got, want) in actual.iter().zip(expected) {
                assert_eq!(got, want);
            }
            assert_eq!(actual.len(), expected.len());
        }

        check(" a ", &[
            Token::Whitespace(" "),
            Token::Keylike("a"),
            Token::Whitespace(" "),
        ]);

        check(" a\t [[]] \t [] {} , . =\n# foo \r\n#foo \n ", &[
            Token::Whitespace(" "),
            Token::Keylike("a"),
            Token::Whitespace("\t "),
            Token::LeftBracket,
            Token::LeftBracket,
            Token::RightBracket,
            Token::RightBracket,
            Token::Whitespace(" \t "),
            Token::LeftBracket,
            Token::RightBracket,
            Token::Whitespace(" "),
            Token::LeftBrace,
            Token::RightBrace,
            Token::Whitespace(" "),
            Token::Comma,
            Token::Whitespace(" "),
            Token::Period,
            Token::Whitespace(" "),
            Token::Equals,
            Token::Newline,
            Token::Comment("# foo "),
            Token::Newline,
            Token::Comment("#foo "),
            Token::Newline,
            Token::Whitespace(" "),
        ]);
    }

    #[test]
    fn bare_cr_bad() {
        let rejected: Vec<(&str, Error)> = vec![
            ("\r", Error::Unexpected(0, '\r')),
            ("'\n", Error::NewlineInString(1)),
            ("'\u{0}", Error::InvalidCharInString(1, '\u{0}')),
            ("'", Error::UnterminatedString(0)),
            ("\u{0}", Error::Unexpected(0, '\u{0}')),
        ];
        for (input, expected) in rejected {
            err(input, expected);
        }
    }

    #[test]
    fn bad_comment() {
        // The comment token stops in front of the NUL, which is then
        // reported as an unexpected character.
        let mut tokenizer = Tokenizer::new("#\u{0}");
        tokenizer.next().unwrap().unwrap();
        assert_eq!(tokenizer.next(), Err(Error::Unexpected(1, '\u{0}')));
        assert!(tokenizer.next().unwrap().is_none());
    }
}