# Copyright 2011-2013 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - DerivedNormalizationProps.txt
# - EastAsianWidth.txt
# - auxiliary/GraphemeBreakProperty.txt
# - PropList.txt
# - ReadMe.txt
# - Scripts.txt
# - SpecialCasing.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated tables.rs file into git.
import fileinput, re, os, sys, operator, math
preamble = '''// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "./unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]

use version::UnicodeVersion;
use bool_trie::{BoolTrie, SmallBoolTrie};
'''
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)
def fetch(f):
    if not os.path.exists(os.path.basename(f)):
        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                  % f)

    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
        exit(1)
def is_surrogate(n):
    return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
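# Editor's sketch (not part of the original script), illustrating the check:
#   is_surrogate(0xd7ff)  # False
#   is_surrogate(0xd800)  # True
#   is_surrogate(0xe000)  # False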
def load_unicode_data(f):
    fetch(f)
    gencats = {}
    to_lower = {}
    to_upper = {}
    to_title = {}
    combines = {}
    canon_decomp = {}
    compat_decomp = {}

    udict = {}
    range_start = -1
    for line in fileinput.input(f):
        data = line.split(';')
        if len(data) != 15:
            continue
        cp = int(data[0], 16)
        if is_surrogate(cp):
            continue
        if range_start >= 0:
            for i in range(range_start, cp):
                udict[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = cp
            continue
        udict[cp] = data

    for code in udict:
        (code_org, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase) = udict[code]

        # generate char to char direct common and simple conversions
        # uppercase to lowercase
        if lowcase != "" and code_org != lowcase:
            to_lower[code] = (int(lowcase, 16), 0, 0)

        # lowercase to uppercase
        if upcase != "" and code_org != upcase:
            to_upper[code] = (int(upcase, 16), 0, 0)

        # title case
        if titlecase.strip() != "" and code_org != titlecase:
            to_title[code] = (int(titlecase, 16), 0, 0)

        # store decomposition, if given
        if decomp != "":
            if decomp.startswith('<'):
                # compatibility decomposition: skip the tag
                seq = []
                for i in decomp.split()[1:]:
                    seq.append(int(i, 16))
                compat_decomp[code] = seq
            else:
                # canonical decomposition
                seq = []
                for i in decomp.split():
                    seq.append(int(i, 16))
                canon_decomp[code] = seq

        # place letter in categories as appropriate
        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
            if cat not in gencats:
                gencats[cat] = []
            gencats[cat].append(code)

        # record combining class, if any
        if combine != "0":
            if combine not in combines:
                combines[combine] = []
            combines[combine].append(code)

    # generate Not_Assigned from Assigned
    gencats["Cn"] = gen_unassigned(gencats["Assigned"])
    # Assigned is not a real category
    del(gencats["Assigned"])
    # Other contains Not_Assigned
    gencats["C"].extend(gencats["Cn"])
    gencats = group_cats(gencats)
    combines = to_combines(group_cats(combines))

    return (canon_decomp, compat_decomp, gencats, combines, to_upper, to_lower, to_title)
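# Editor's sketch (not in the original): a UnicodeData.txt record is 15
# semicolon-separated fields. For U+0041 the record is
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# so load_unicode_data records to_lower[0x41] = (0x61, 0, 0) and files
# 0x41 under "Lu" plus the expanded categories "LC", "L", and "Assigned".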
def load_special_casing(f, to_upper, to_lower, to_title):
    fetch(f)
    for line in fileinput.input(f):
        data = line.split('#')[0].split(';')
        if len(data) == 5:
            # unconditional mapping
            code, lower, title, upper, _comment = data
        elif len(data) == 6:
            # conditional mapping
            code, lower, title, upper, condition, _comment = data
            if condition.strip():  # Only keep unconditional mappings
                continue
        else:
            continue
        code = code.strip()
        lower = lower.strip()
        title = title.strip()
        upper = upper.strip()
        key = int(code, 16)
        for (map_, values) in [(to_lower, lower), (to_upper, upper), (to_title, title)]:
            if values == code:
                continue
            values = [int(i, 16) for i in values.split()]
            # pad the mapping with zeros to exactly three elements
            for _ in range(len(values), 3):
                values.append(0)
            assert len(values) == 3
            map_[key] = values
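# Editor's sketch (not in the original): an unconditional SpecialCasing.txt
# entry such as
#   00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
# upgrades to_upper[0xdf] to the multi-char mapping [0x53, 0x53, 0] ("SS").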
def group_cats(cats):
    cats_out = {}
    for cat in cats:
        cats_out[cat] = group_cat(cats[cat])
    return cats_out

def group_cat(cat):
    cat_out = []
    letters = sorted(set(cat))
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out
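# Editor's sketch (not in the original): group_cat collapses a list of
# codepoints into inclusive (start, end) ranges, e.g.
#   group_cat([0x41, 0x42, 0x43, 0x61]) == [(0x41, 0x43), (0x61, 0x61)]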
def ungroup_cat(cat):
    cat_out = []
    for (lo, hi) in cat:
        for letter in range(lo, hi + 1):
            cat_out.append(letter)
    return cat_out
def gen_unassigned(assigned):
    assigned = set(assigned)
    return ([i for i in range(0, 0xd800) if i not in assigned] +
            [i for i in range(0xe000, 0x110000) if i not in assigned])
def to_combines(combs):
    combs_out = []
    for comb in combs:
        for (lo, hi) in combs[comb]:
            combs_out.append((lo, hi, comb))
    combs_out.sort(key=lambda comb: comb[0])
    return combs_out
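# Editor's sketch (not in the original): to_combines flattens the grouped
# per-class ranges into sorted (lo, hi, class) triples, e.g.
#   to_combines({"230": [(0x300, 0x314)]}) == [(0x300, 0x314, "230")]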
def format_table_content(f, content, indent):
    line = " " * indent
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            if first:
                line += chunk
            else:
                line += ", " + chunk
            first = False
        else:
            f.write(line + ",\n")
            line = " " * indent + chunk
    f.write(line)
def load_properties(f, interestingprops):
    fetch(f)
    props = {}
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    for line in fileinput.input(os.path.basename(f)):
        prop = None
        m = re1.match(line)
        if m:
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3)
            else:
                continue
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        if prop not in props:
            props[prop] = []
        props[prop].append((d_lo, d_hi))

    # optimize if possible
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props
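# Editor's sketch (not in the original): re1 and re2 match the two line
# shapes in the property files, e.g.
#   00AA          ; Alphabetic   ->  ("00AA", "Alphabetic")
#   0041..005A    ; Alphabetic   ->  ("0041", "005A", "Alphabetic")
# both land in props["Alphabetic"] as the inclusive ranges (0xaa, 0xaa)
# and (0x41, 0x5a).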
278 return "'\\u{%x}'" % c
if c
!= 0 else "'\\0'"
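# Editor's sketch (not in the original):
#   escape_char(0x61)  # returns "'\u{61}'", a Rust char literal
#   escape_char(0)     # returns "'\0'"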
def emit_table(f, name, t_data, t_type="&[(char, char)]", is_pub=True,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
    pub_string = ""
    if is_pub:
        pub_string = "pub "
    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
    data = ""
    first = True
    for dat in t_data:
        if not first:
            data += ","
        first = False
        data += pfun(dat)
    format_table_content(f, data, 8)
    f.write("\n    ];\n\n")
def compute_trie(rawdata, chunksize):
    root = []
    childmap = {}
    child_data = []
    for i in range(len(rawdata) // chunksize):
        data = rawdata[i * chunksize : (i + 1) * chunksize]
        child = '|'.join(map(str, data))
        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(data)
        root.append(childmap[child])
    return (root, child_data)
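# Editor's sketch (not in the original): identical chunks are stored once in
# child_data and shared through indices in root, e.g.
#   compute_trie([1, 0, 1, 0], 2) == ([0, 0], [1, 0])
# since both two-element chunks are the same "1|0" child.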
def emit_bool_trie(f, name, t_data, is_pub=True):
    CHUNK = 64
    rawdata = [False] * 0x110000
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            rawdata[cp] = True

    # convert to bitmap chunks of 64 bits each
    chunks = []
    for i in range(0x110000 // CHUNK):
        chunk = 0
        for j in range(64):
            if rawdata[i * 64 + j]:
                chunk |= 1 << j
        chunks.append(chunk)

    pub_string = ""
    if is_pub:
        pub_string = "pub "
    f.write("    %sconst %s: &super::BoolTrie = &super::BoolTrie {\n" % (pub_string, name))
    f.write("        r1: [\n")
    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
    format_table_content(f, data, 12)
    f.write("\n        ],\n")

    # 0x800..0x10000 trie
    (r2, r3) = compute_trie(chunks[0x800 // CHUNK : 0x10000 // CHUNK], 64 // CHUNK)
    f.write("        r2: [\n")
    data = ','.join(str(node) for node in r2)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")
    f.write("        r3: &[\n")
    data = ','.join('0x%016x' % chunk for chunk in r3)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")

    # 0x10000..0x110000 trie
    (mid, r6) = compute_trie(chunks[0x10000 // CHUNK : 0x110000 // CHUNK], 64 // CHUNK)
    (r4, r5) = compute_trie(mid, 64)
    f.write("        r4: [\n")
    data = ','.join(str(node) for node in r4)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")
    f.write("        r5: &[\n")
    data = ','.join(str(node) for node in r5)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")
    f.write("        r6: &[\n")
    data = ','.join('0x%016x' % chunk for chunk in r6)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")
    f.write("    };\n\n")
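# Editor's note (not in the original): the emitted BoolTrie splits the
# 0x0..0x110000 codepoint space into three zones: r1 stores the first 0x800
# codepoints as raw 64-bit bitmap words, r2/r3 form a one-level trie for
# 0x800..0x10000, and r4/r5/r6 a two-level trie for the supplementary
# planes, so duplicate bitmap words are stored only once.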
def emit_small_bool_trie(f, name, t_data, is_pub=True):
    last_chunk = max(hi // 64 for (lo, hi) in t_data)
    n_chunks = last_chunk + 1
    chunks = [0] * n_chunks
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            if cp // 64 >= len(chunks):
                print(cp, cp // 64, len(chunks), lo, hi)
            chunks[cp // 64] |= 1 << (cp & 63)

    pub_string = ""
    if is_pub:
        pub_string = "pub "
    f.write("    %sconst %s: &super::SmallBoolTrie = &super::SmallBoolTrie {\n"
            % (pub_string, name))

    (r1, r2) = compute_trie(chunks, 1)

    f.write("        r1: &[\n")
    data = ','.join(str(node) for node in r1)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")

    f.write("        r2: &[\n")
    data = ','.join('0x%016x' % node for node in r2)
    format_table_content(f, data, 12)
    f.write("\n        ],\n")

    f.write("    };\n\n")
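# Editor's note (not in the original): SmallBoolTrie is the one-level
# variant: r1 maps each 64-codepoint chunk directly to an index into the
# deduplicated bitmap words in r2, and the table only extends as far as the
# last member of the property. That suits properties concentrated near the
# start of the codepoint space, such as Cc and White_Space, which is why
# emit_property_module special-cases them below.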
def emit_property_module(f, mod, tbl, emit):
    f.write("pub mod %s {\n" % mod)
    for cat in sorted(emit):
        if cat in ["Cc", "White_Space", "Pattern_White_Space"]:
            emit_small_bool_trie(f, "%s_table" % cat, tbl[cat])
            f.write("    pub fn %s(c: char) -> bool {\n" % cat)
            f.write("        %s_table.lookup(c)\n" % cat)
            f.write("    }\n\n")
        else:
            emit_bool_trie(f, "%s_table" % cat, tbl[cat])
            f.write("    pub fn %s(c: char) -> bool {\n" % cat)
            f.write("        %s_table.lookup(c)\n" % cat)
            f.write("    }\n\n")
    f.write("}\n\n")
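# Editor's sketch (not in the original): for a category such as "Alphabetic"
# this emits Rust shaped roughly like
#   pub mod derived_property {
#       pub const Alphabetic_table: &super::BoolTrie = &super::BoolTrie { ... };
#       pub fn Alphabetic(c: char) -> bool {
#           Alphabetic_table.lookup(c)
#       }
#   }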
def emit_conversions_module(f, to_upper, to_lower, to_title):
    f.write("pub mod conversions {")
    f.write("""
    use core::option::Option;
    use core::option::Option::{Some, None};

    pub fn to_lower(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_lowercase_table) {
            None        => [c, '\\0', '\\0'],
            Some(index) => to_lowercase_table[index].1,
        }
    }

    pub fn to_upper(c: char) -> [char; 3] {
        match bsearch_case_table(c, to_uppercase_table) {
            None        => [c, '\\0', '\\0'],
            Some(index) => to_uppercase_table[index].1,
        }
    }

    fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
        table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
    }

""")
    t_type = "&[(char, [char; 3])]"
    pfun = lambda x: "(%s,[%s,%s,%s])" % (
        escape_char(x[0]), escape_char(x[1][0]), escape_char(x[1][1]), escape_char(x[1][2]))
    emit_table(f, "to_lowercase_table",
               sorted(to_lower.items(), key=operator.itemgetter(0)),
               is_pub=False, t_type=t_type, pfun=pfun)
    emit_table(f, "to_uppercase_table",
               sorted(to_upper.items(), key=operator.itemgetter(0)),
               is_pub=False, t_type=t_type, pfun=pfun)
    f.write("}\n\n")
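# Editor's sketch (not in the original): each emitted row pairs a key char
# with its padded three-char mapping; with to_upper[0xdf] = [0x53, 0x53, 0]
# from SpecialCasing.txt, pfun renders the uppercase row for U+00DF as
#   ('\u{df}',['\u{53}','\u{53}','\0'])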
def emit_norm_module(f, canon, compat, combine, norm_props):
    canon_keys = sorted(canon.keys())

    compat_keys = sorted(compat.keys())

    # invert two-element canonical decompositions into a composition map,
    # skipping anything in Full_Composition_Exclusion
    canon_comp = {}
    comp_exclusions = norm_props["Full_Composition_Exclusion"]
    for char in canon_keys:
        if any(lo <= char <= hi for lo, hi in comp_exclusions):
            continue
        decomp = canon[char]
        if len(decomp) == 2:
            if decomp[0] not in canon_comp:
                canon_comp[decomp[0]] = []
            canon_comp[decomp[0]].append((decomp[1], char))
    canon_comp_keys = sorted(canon_comp.keys())
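    # Editor's sketch (not in the original): since U+00E9 canonically
    # decomposes to [0x65, 0x301], canon_comp[0x65] gains the pair
    # (0x301, 0xe9), i.e. "e" + combining acute recomposes to "é".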
if __name__ == "__main__":
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        fetch("ReadMe.txt")
        with open("ReadMe.txt") as readme:
            pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
            unicode_version = re.search(pattern, readme.read()).groups()
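        # Editor's note (not in the original): this matches the ReadMe.txt
        # sentence of the form "... data files for Version 9.0.0 of the
        # Unicode Standard ...", yielding e.g. ('9', '0', '0').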
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of
/// `CharExt` and `UnicodeStrPrelude` traits are based on.
pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
    major: %s,
    minor: %s,
    micro: %s,
    _priv: (),
};
""" % unicode_version)
        (canon_decomp, compat_decomp, gencats, combines,
         to_upper, to_lower, to_title) = load_unicode_data("UnicodeData.txt")
        load_special_casing("SpecialCasing.txt", to_upper, to_lower, to_title)
        want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
                        "Cased", "Case_Ignorable"]
        derived = load_properties("DerivedCoreProperties.txt", want_derived)
        scripts = load_properties("Scripts.txt", [])
        props = load_properties("PropList.txt",
                                ["White_Space", "Join_Control", "Noncharacter_Code_Point",
                                 "Pattern_White_Space"])
        norm_props = load_properties("DerivedNormalizationProps.txt",
                                     ["Full_Composition_Exclusion"])

        # category tables
        for (name, cat, pfuns) in ("general_category", gencats, ["N", "Cc"]), \
                                  ("derived_property", derived, want_derived), \
                                  ("property", props, ["White_Space", "Pattern_White_Space"]):
            emit_property_module(rf, name, cat, pfuns)

        # normalizations and conversions module
        emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
        emit_conversions_module(rf, to_upper, to_lower, to_title)