]>
Commit | Line | Data |
---|---|---|
353b0b11 FG |
1 | #[allow(unused, deprecated)] |
2 | use std::ascii::AsciiExt; | |
3 | use std::error::Error; | |
4 | use std::fmt; | |
5 | use std::iter::Enumerate; | |
6 | use std::str::Bytes; | |
7 | ||
8 | use super::{Mime, Source, ParamSource, Indexed, CHARSET, UTF_8}; | |
9 | ||
/// Reasons a media-type string can fail to parse.
///
/// `Clone`, `PartialEq` and `Eq` are derived in addition to `Debug` so that
/// errors can be duplicated and compared directly (for example with
/// `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// The input ended before a `/` separating type and subtype was seen.
    MissingSlash,
    /// The input ended while a parameter name was being read, before its `=`.
    MissingEqual,
    /// The input ended inside a quoted parameter value, before the closing `"`.
    MissingQuote,
    /// A byte that is not permitted at its position was encountered.
    InvalidToken {
        /// Byte offset of the offending byte within the input.
        pos: usize,
        /// The offending byte itself.
        byte: u8,
    },
}
20 | ||
21 | impl ParseError { | |
22 | fn s(&self) -> &str { | |
23 | use self::ParseError::*; | |
24 | ||
25 | match *self { | |
26 | MissingSlash => "a slash (/) was missing between the type and subtype", | |
27 | MissingEqual => "an equals sign (=) was missing between a parameter and its value", | |
28 | MissingQuote => "a quote (\") was missing from a parameter value", | |
29 | InvalidToken { .. } => "an invalid token was encountered", | |
30 | } | |
31 | } | |
32 | } | |
33 | ||
34 | impl fmt::Display for ParseError { | |
35 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
36 | if let ParseError::InvalidToken { pos, byte } = *self { | |
37 | write!(f, "{}, {:X} at position {}", self.s(), byte, pos) | |
38 | } else { | |
39 | f.write_str(self.s()) | |
40 | } | |
41 | } | |
42 | } | |
43 | ||
44 | impl Error for ParseError { | |
45 | // Minimum Rust is 1.15, Error::description was still required then | |
46 | #[allow(deprecated)] | |
47 | fn description(&self) -> &str { | |
48 | self.s() | |
49 | } | |
50 | } | |
51 | ||
52 | pub fn parse(s: &str) -> Result<Mime, ParseError> { | |
53 | if s == "*/*" { | |
54 | return Ok(::STAR_STAR); | |
55 | } | |
56 | ||
57 | let mut iter = s.bytes().enumerate(); | |
58 | // toplevel | |
59 | let mut start; | |
60 | let slash; | |
61 | loop { | |
62 | match iter.next() { | |
63 | Some((_, c)) if is_token(c) => (), | |
64 | Some((i, b'/')) if i > 0 => { | |
65 | slash = i; | |
66 | start = i + 1; | |
67 | break; | |
68 | }, | |
69 | None => return Err(ParseError::MissingSlash), // EOF and no toplevel is no Mime | |
70 | Some((pos, byte)) => return Err(ParseError::InvalidToken { | |
71 | pos: pos, | |
72 | byte: byte, | |
73 | }) | |
74 | }; | |
75 | ||
76 | } | |
77 | ||
78 | // sublevel | |
79 | let mut plus = None; | |
80 | loop { | |
81 | match iter.next() { | |
82 | Some((i, b'+')) if i > start => { | |
83 | plus = Some(i); | |
84 | }, | |
85 | Some((i, b';')) if i > start => { | |
86 | start = i; | |
87 | break; | |
88 | }, | |
89 | Some((_, c)) if is_token(c) => (), | |
90 | None => { | |
91 | return Ok(Mime { | |
92 | source: Source::Dynamic(s.to_ascii_lowercase()), | |
93 | slash: slash, | |
94 | plus: plus, | |
95 | params: ParamSource::None, | |
96 | }); | |
97 | }, | |
98 | Some((pos, byte)) => return Err(ParseError::InvalidToken { | |
99 | pos: pos, | |
100 | byte: byte, | |
101 | }) | |
102 | }; | |
103 | } | |
104 | ||
105 | // params | |
106 | let params = params_from_str(s, &mut iter, start)?; | |
107 | ||
108 | let src = match params { | |
109 | ParamSource::Utf8(_) => s.to_ascii_lowercase(), | |
110 | ParamSource::Custom(semicolon, ref indices) => lower_ascii_with_params(s, semicolon, indices), | |
111 | ParamSource::None => { | |
112 | // Chop off the empty list | |
113 | s[..start].to_ascii_lowercase() | |
114 | } | |
115 | }; | |
116 | ||
117 | Ok(Mime { | |
118 | source: Source::Dynamic(src), | |
119 | slash: slash, | |
120 | plus: plus, | |
121 | params: params, | |
122 | }) | |
123 | } | |
124 | ||
125 | ||
126 | fn params_from_str(s: &str, iter: &mut Enumerate<Bytes>, mut start: usize) -> Result<ParamSource, ParseError> { | |
127 | let semicolon = start; | |
128 | start += 1; | |
129 | let mut params = ParamSource::None; | |
130 | 'params: while start < s.len() { | |
131 | let name; | |
132 | // name | |
133 | 'name: loop { | |
134 | match iter.next() { | |
135 | Some((i, b' ')) if i == start => { | |
136 | start = i + 1; | |
137 | continue 'params; | |
138 | }, | |
139 | Some((_, c)) if is_token(c) => (), | |
140 | Some((i, b'=')) if i > start => { | |
141 | name = Indexed(start, i); | |
142 | start = i + 1; | |
143 | break 'name; | |
144 | }, | |
145 | None => return Err(ParseError::MissingEqual), | |
146 | Some((pos, byte)) => return Err(ParseError::InvalidToken { | |
147 | pos: pos, | |
148 | byte: byte, | |
149 | }), | |
150 | } | |
151 | } | |
152 | ||
153 | let value; | |
154 | // values must be restrict-name-char or "anything goes" | |
155 | let mut is_quoted = false; | |
156 | ||
157 | 'value: loop { | |
158 | if is_quoted { | |
159 | match iter.next() { | |
160 | Some((i, b'"')) if i > start => { | |
161 | value = Indexed(start, i); | |
162 | break 'value; | |
163 | }, | |
164 | Some((_, c)) if is_restricted_quoted_char(c) => (), | |
165 | None => return Err(ParseError::MissingQuote), | |
166 | Some((pos, byte)) => return Err(ParseError::InvalidToken { | |
167 | pos: pos, | |
168 | byte: byte, | |
169 | }), | |
170 | } | |
171 | } else { | |
172 | match iter.next() { | |
173 | Some((i, b'"')) if i == start => { | |
174 | is_quoted = true; | |
175 | start = i + 1; | |
176 | }, | |
177 | Some((_, c)) if is_token(c) => (), | |
178 | Some((i, b';')) if i > start => { | |
179 | value = Indexed(start, i); | |
180 | start = i + 1; | |
181 | break 'value; | |
182 | } | |
183 | None => { | |
184 | value = Indexed(start, s.len()); | |
185 | start = s.len(); | |
186 | break 'value; | |
187 | }, | |
188 | ||
189 | Some((pos, byte)) => return Err(ParseError::InvalidToken { | |
190 | pos: pos, | |
191 | byte: byte, | |
192 | }), | |
193 | } | |
194 | } | |
195 | } | |
196 | ||
197 | if is_quoted { | |
198 | 'ws: loop { | |
199 | match iter.next() { | |
200 | Some((i, b';')) => { | |
201 | // next param | |
202 | start = i + 1; | |
203 | break 'ws; | |
204 | }, | |
205 | Some((_, b' ')) => { | |
206 | // skip whitespace | |
207 | }, | |
208 | None => { | |
209 | // eof | |
210 | start = s.len(); | |
211 | break 'ws; | |
212 | }, | |
213 | Some((pos, byte)) => return Err(ParseError::InvalidToken { | |
214 | pos: pos, | |
215 | byte: byte, | |
216 | }), | |
217 | } | |
218 | } | |
219 | } | |
220 | ||
221 | match params { | |
222 | ParamSource::Utf8(i) => { | |
223 | let i = i + 2; | |
224 | let charset = Indexed(i, "charset".len() + i); | |
225 | let utf8 = Indexed(charset.1 + 1, charset.1 + "utf-8".len() + 1); | |
226 | params = ParamSource::Custom(semicolon, vec![ | |
227 | (charset, utf8), | |
228 | (name, value), | |
229 | ]); | |
230 | }, | |
231 | ParamSource::Custom(_, ref mut vec) => { | |
232 | vec.push((name, value)); | |
233 | }, | |
234 | ParamSource::None => { | |
235 | if semicolon + 2 == name.0 && CHARSET == &s[name.0..name.1] { | |
236 | if UTF_8 == &s[value.0..value.1] { | |
237 | params = ParamSource::Utf8(semicolon); | |
238 | continue 'params; | |
239 | } | |
240 | } | |
241 | params = ParamSource::Custom(semicolon, vec![(name, value)]); | |
242 | }, | |
243 | } | |
244 | } | |
245 | Ok(params) | |
246 | } | |
247 | ||
248 | fn lower_ascii_with_params(s: &str, semi: usize, params: &[(Indexed, Indexed)]) -> String { | |
249 | let mut owned = s.to_owned(); | |
250 | owned[..semi].make_ascii_lowercase(); | |
251 | ||
252 | for &(ref name, ref value) in params { | |
253 | owned[name.0..name.1].make_ascii_lowercase(); | |
254 | // Since we just converted this part of the string to lowercase, | |
255 | // we can skip the `Name == &str` unicase check and do a faster | |
256 | // memcmp instead. | |
257 | if &owned[name.0..name.1] == CHARSET.source { | |
258 | owned[value.0..value.1].make_ascii_lowercase(); | |
259 | } | |
260 | } | |
261 | ||
262 | owned | |
263 | } | |
264 | ||
265 | // From [RFC6838](http://tools.ietf.org/html/rfc6838#section-4.2): | |
266 | // | |
267 | // > All registered media types MUST be assigned top-level type and | |
268 | // > subtype names. The combination of these names serves to uniquely | |
269 | // > identify the media type, and the subtype name facet (or the absence | |
270 | // > of one) identifies the registration tree. Both top-level type and | |
271 | // > subtype names are case-insensitive. | |
272 | // > | |
273 | // > Type and subtype names MUST conform to the following ABNF: | |
274 | // > | |
275 | // > type-name = restricted-name | |
276 | // > subtype-name = restricted-name | |
277 | // > | |
278 | // > restricted-name = restricted-name-first *126restricted-name-chars | |
279 | // > restricted-name-first = ALPHA / DIGIT | |
280 | // > restricted-name-chars = ALPHA / DIGIT / "!" / "#" / | |
281 | // > "$" / "&" / "-" / "^" / "_" | |
282 | // > restricted-name-chars =/ "." ; Characters before first dot always | |
283 | // > ; specify a facet name | |
284 | // > restricted-name-chars =/ "+" ; Characters after last plus always | |
285 | // > ; specify a structured syntax suffix | |
286 | ||
287 | // However, [HTTP](https://tools.ietf.org/html/rfc7231#section-3.1.1.1): | |
288 | // | |
289 | // > media-type = type "/" subtype *( OWS ";" OWS parameter ) | |
290 | // > type = token | |
291 | // > subtype = token | |
292 | // > parameter = token "=" ( token / quoted-string ) | |
293 | // | |
294 | // Where token is defined as: | |
295 | // | |
296 | // > token = 1*tchar | |
297 | // > tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / | |
298 | // > "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA | |
299 | // | |
300 | // So, clearly, ¯\_(ツ)_/¯ | |
301 | ||
// Expands a comma-terminated list of integer flags into an array literal of
// `bool`s in which every non-zero flag becomes `true`.
macro_rules! byte_map {
    ($($bit:expr,)*) => {
        [$($bit != 0,)*]
    };
}
307 | ||
308 | static TOKEN_MAP: [bool; 256] = byte_map![ | |
309 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
310 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
311 | 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, | |
312 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, | |
313 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
314 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, | |
315 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
316 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, | |
317 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
318 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
319 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
320 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
321 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
322 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
323 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
324 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
325 | ]; | |
326 | ||
327 | fn is_token(c: u8) -> bool { | |
328 | TOKEN_MAP[c as usize] | |
329 | } | |
330 | ||
/// Reports whether `c` is accepted inside a quoted parameter value: every
/// byte except the ASCII control characters below space (0-31) and DEL (127).
fn is_restricted_quoted_char(c: u8) -> bool {
    const SPACE: u8 = 0x20;
    const DEL: u8 = 0x7F;
    c >= SPACE && c != DEL
}
334 | ||
// Checks every entry of `TOKEN_MAP` against the `tchar` definition: ASCII
// letters, ASCII digits and a fixed set of punctuation bytes.
//
// The expectation is spelled out with plain comparisons instead of `...`
// range patterns, so no lint suppression is needed and the test builds the
// same way on old and new compilers.
#[test]
fn test_lookup_tables() {
    // The punctuation members of `tchar`.
    const PUNCTUATION: &'static [u8] = b"!#$%&'*+-.^_`|~";

    for (i, &valid) in TOKEN_MAP.iter().enumerate() {
        // `TOKEN_MAP` has 256 entries, so every index fits in a `u8`.
        let i = i as u8;
        let is_alpha = (b'a' <= i && i <= b'z') || (b'A' <= i && i <= b'Z');
        let is_digit = b'0' <= i && i <= b'9';
        let should = is_alpha || is_digit || PUNCTUATION.contains(&i);
        assert_eq!(valid, should, "{:?} ({}) should be {}", i as char, i, should);
    }
}