]>
git.proxmox.com Git - rustc.git/blob - src/vendor/unicode-segmentation/scripts/unicode.py
3 # Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
4 # file at the top-level directory of this distribution and at
5 # http://rust-lang.org/COPYRIGHT.
7 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10 # option. This file may not be copied, modified, or distributed
11 # except according to those terms.
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - auxiliary/GraphemeBreakProperty.txt
# - auxiliary/WordBreakProperty.txt
# - ReadMe.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
23 import fileinput
, re
, os
, sys
, operator
25 preamble
= '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
26 // file at the top-level directory of this distribution and at
27 // http://rust-lang.org/COPYRIGHT.
29 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
30 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
31 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
32 // option. This file may not be copied, modified, or distributed
33 // except according to those terms.
35 // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
37 #![allow(missing_docs, non_upper_case_globals, non_snake_case)]
# Maps each two-letter General_Category value to the grouped categories
# that contain it. Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # 'No' (Other_Number) is part of the grouped category 'N', exactly like
    # 'Nd' and 'Nl'; mapping it to itself left it out of the 'N' table.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# (first, last) code points of the surrogate range; these are not valid
# rust characters
surrogate_codepoints = (0xd800, 0xdfff)
58 return surrogate_codepoints
[0] <= n
<= surrogate_codepoints
[1]
61 if not os
.path
.exists(os
.path
.basename(f
)):
62 os
.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
65 if not os
.path
.exists(os
.path
.basename(f
)):
66 sys
.stderr
.write("cannot load %s" % f
)
75 for line
in fileinput
.input(f
):
76 data
= line
.split(';');
79 cp
= int(data
[0], 16);
83 for i
in xrange(range_start
, cp
):
86 if data
[1].endswith(", First>"):
92 [code_org
, name
, gencat
, combine
, bidi
,
93 decomp
, deci
, digit
, num
, mirror
,
94 old
, iso
, upcase
, lowcase
, titlecase
] = udict
[code
];
96 # place letter in categories as appropriate
97 for cat
in [gencat
, "Assigned"] + expanded_categories
.get(gencat
, []):
98 if cat
not in gencats
:
100 gencats
[cat
].append(code
)
102 gencats
= group_cats(gencats
)
def group_cats(cats):
    """Apply `group_cat` to every category in a mapping.

    `cats` maps a category name to a collection of code points. The result
    is a new dict with the same keys, each holding whatever `group_cat`
    returns for that category's code points. `cats` itself is not modified.
    """
    return dict((name, group_cat(cats[name])) for name in cats)
def group_cat(cat):
    """Collapse code points into a sorted list of inclusive ranges.

    `cat` is any iterable of integer code points; duplicates are ignored.
    Runs of consecutive values are merged, so the result is a list of
    `(start, end)` tuples in ascending order with both bounds inclusive.
    An empty input yields an empty list.
    """
    letters = sorted(set(cat))
    if not letters:
        # Nothing to group; avoids indexing into an empty list.
        return []

    cat_out = []
    cur_start = cur_end = letters[0]
    for letter in letters[1:]:
        # Sanity check: sorted, de-duplicated input must be strictly increasing.
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            # Still contiguous: extend the current range.
            cur_end = letter
        else:
            # Gap found: close the current range and start a new one.
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    # Flush the range that was still open when the input ran out.
    cat_out.append((cur_start, cur_end))
    return cat_out
127 def ungroup_cat(cat
):
135 def format_table_content(f
, content
, indent
):
138 for chunk
in content
.split(","):
139 if len(line
) + len(chunk
) < 98:
146 f
.write(line
+ ",\n")
147 line
= " "*indent
+ chunk
150 def load_properties(f
, interestingprops
):
153 re1
= re
.compile("^ *([0-9A-F]+) *; *(\w+)")
154 re2
= re
.compile("^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
156 for line
in fileinput
.input(os
.path
.basename(f
)):
173 if interestingprops
and prop
not in interestingprops
:
177 if prop
not in props
:
179 props
[prop
].append((d_lo
, d_hi
))
181 # optimize if possible
183 props
[prop
] = group_cat(ungroup_cat(props
[prop
]))
188 return "'\\u{%x}'" % c
190 def emit_table(f
, name
, t_data
, t_type
= "&'static [(char, char)]", is_pub
=True,
191 pfun
=lambda x
: "(%s,%s)" % (escape_char(x
[0]), escape_char(x
[1])), is_const
=True):
196 pub_string
= "pub " + pub_string
197 f
.write(" %s %s: %s = &[\n" % (pub_string
, name
, t_type
))
205 format_table_content(f
, data
, 8)
208 def emit_util_mod(f
):
212 pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
213 use core::cmp::Ordering::{Equal, Less, Greater};
214 r.binary_search_by(|&(lo,hi)| {
215 if lo <= c && c <= hi { Equal }
216 else if hi < c { Less }
222 fn is_alphabetic(c: char) -> bool {
224 'a' ... 'z' | 'A' ... 'Z' => true,
225 c if c > '\x7f' => super::derived_property::Alphabetic(c),
231 fn is_numeric(c: char) -> bool {
234 c if c > '\x7f' => super::general_category::N(c),
240 pub fn is_alphanumeric(c: char) -> bool {
241 is_alphabetic(c) || is_numeric(c)
247 def emit_property_module(f
, mod
, tbl
, emit
):
248 f
.write("mod %s {\n" % mod
)
249 for cat
in sorted(emit
):
250 emit_table(f
, "%s_table" % cat
, tbl
[cat
], is_pub
=False)
251 f
.write(" #[inline]\n")
252 f
.write(" pub fn %s(c: char) -> bool {\n" % cat
)
253 f
.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat
)
257 def emit_break_module(f
, break_table
, break_cats
, name
):
258 Name
= name
.capitalize()
259 f
.write("""pub mod %s {
260 use core::result::Result::{Ok, Err};
262 pub use self::%sCat::*;
264 #[allow(non_camel_case_types)]
265 #[derive(Clone, Copy, PartialEq, Eq)]
267 """ % (name
, Name
, Name
))
269 break_cats
.append("Any")
271 for cat
in break_cats
:
272 f
.write((" %sC_" % Name
[0]) + cat
+ ",\n")
275 fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
276 use core::cmp::Ordering::{Equal, Less, Greater};
277 match r.binary_search_by(|&(lo, hi, _)| {
278 if lo <= c && c <= hi { Equal }
279 else if hi < c { Less }
283 let (_, _, cat) = r[idx];
290 pub fn %s_category(c: char) -> %sCat {
291 bsearch_range_value_table(c, %s_cat_table)
294 """ % (Name
, Name
, Name
[0], name
, Name
, name
))
296 emit_table(f
, "%s_cat_table" % name
, break_table
, "&'static [(char, char, %sCat)]" % Name
,
297 pfun
=lambda x
: "(%s,%s,%sC_%s)" % (escape_char(x
[0]), escape_char(x
[1]), Name
[0], x
[2]),
298 is_pub
=False, is_const
=True)
301 if __name__
== "__main__":
303 if os
.path
.exists(r
):
305 with
open(r
, "w") as rf
:
306 # write the file's preamble
309 # download and parse all the data
311 with
open("ReadMe.txt") as readme
:
312 pattern
= "for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
313 unicode_version
= re
.search(pattern
, readme
.read()).groups()
315 /// The version of [Unicode](http://www.unicode.org/)
316 /// that this version of unicode-segmentation is based on.
317 pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
318 """ % unicode_version
)
320 gencats
= load_gencats("UnicodeData.txt")
321 derived
= load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
324 for (name
, cat
, pfuns
) in ("general_category", gencats
, ["N"]), \
325 ("derived_property", derived
, ["Alphabetic"]):
326 emit_property_module(rf
, name
, cat
, pfuns
)
328 ### grapheme cluster module
329 # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
330 grapheme_cats
= load_properties("auxiliary/GraphemeBreakProperty.txt", [])
334 # This category also includes Cs (surrogate codepoints), but Rust's `char`s are
335 # Unicode Scalar Values only, and surrogates are thus invalid `char`s.
336 # Thus, we have to remove Cs from the Control category
338 # 0x0a and 0x0d (CR and LF) are not in the Control category for Graphemes.
339 # However, the Graphemes iterator treats these as a special case, so they
340 # should be included in grapheme_cats["Control"] for our implementation.
341 grapheme_cats
["Control"] = group_cat(list(
342 (set(ungroup_cat(grapheme_cats
["Control"]))
343 |
set(ungroup_cat(grapheme_cats
["CR"]))
344 |
set(ungroup_cat(grapheme_cats
["LF"])))
345 - set(ungroup_cat([surrogate_codepoints
]))))
346 del(grapheme_cats
["CR"])
347 del(grapheme_cats
["LF"])
350 for cat
in grapheme_cats
:
351 grapheme_table
.extend([(x
, y
, cat
) for (x
, y
) in grapheme_cats
[cat
]])
352 grapheme_table
.sort(key
=lambda w
: w
[0])
353 emit_break_module(rf
, grapheme_table
, grapheme_cats
.keys(), "grapheme")
356 word_cats
= load_properties("auxiliary/WordBreakProperty.txt", [])
358 for cat
in word_cats
:
359 word_table
.extend([(x
, y
, cat
) for (x
, y
) in word_cats
[cat
]])
360 word_table
.sort(key
=lambda w
: w
[0])
361 emit_break_module(rf
, word_table
, word_cats
.keys(), "word")