]>
git.proxmox.com Git - rustc.git/blob - vendor/unicode-script/scripts/unicode.py
3 # Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
4 # file at the top-level directory of this distribution and at
5 # http://rust-lang.org/COPYRIGHT.
7 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10 # option. This file may not be copied, modified, or distributed
11 # except according to those terms.
13 # This script uses the following Unicode tables:
14 # - PropertyValueAliases.txt
15 # - ScriptExtensions.txt
18 # Since this should not require frequent updates, we just store this
19 # out-of-line and check the unicode.rs file into git.
21 import fileinput
, re
, os
, sys
23 preamble
= '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
24 // file at the top-level directory of this distribution and at
25 // http://rust-lang.org/COPYRIGHT.
27 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
28 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
29 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
30 // option. This file may not be copied, modified, or distributed
31 // except according to those terms.
33 // NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
35 #![allow(missing_docs, non_upper_case_globals, non_snake_case)]
37 pub use tables_impl::*;
41 use crate::ScriptExtension;
49 UNICODE_VERSION
= (13, 0, 0)
51 UNICODE_VERSION_NUMBER
= "%s.%s.%s" %UNICODE_VERSION
54 return "'\\u{%x}'" % c
57 if not os
.path
.exists(os
.path
.basename(f
)):
59 os
.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
60 % (UNICODE_VERSION
[0], UNICODE_VERSION
[1], f
))
62 os
.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
63 % (UNICODE_VERSION_NUMBER
, f
))
65 if not os
.path
.exists(os
.path
.basename(f
)):
66 sys
.stderr
.write("cannot load %s" % f
)
72 cats_out
[cat
] = group_cat(cats
[cat
])
77 Fetch the shorthand aliases for each longhand Script name
79 fetch("PropertyValueAliases.txt")
82 re1
= re
.compile(r
"^ *sc *; *(\w+) *; *(\w+)")
83 for line
in fileinput
.input(os
.path
.basename("PropertyValueAliases.txt")):
86 l
= m
.group(2).strip()
87 s
= m
.group(1).strip()
88 assert(s
not in longforms
)
89 assert(l
not in shortforms
)
95 return (longforms
, shortforms
)
97 def format_table_content(f
, content
, indent
):
100 for chunk
in content
.split(","):
101 if len(line
) + len(chunk
) < 98:
108 f
.write(line
+ ",\n")
109 line
= " "*indent
+ chunk
112 # Implementation from unicode-segmentation
113 def load_properties(f
, interestingprops
):
116 # Note: these regexes are different from those in unicode-segmentation,
117 # because we need to handle spaces here
118 re1
= re
.compile(r
"^ *([0-9A-F]+) *; *([^#]+) *#")
119 re2
= re
.compile(r
"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
121 for line
in fileinput
.input(os
.path
.basename(f
)):
129 prop
= m
.group(2).strip()
135 prop
= m
.group(3).strip()
138 if interestingprops
and prop
not in interestingprops
:
142 if prop
not in props
:
144 props
[prop
].append((d_lo
, d_hi
))
148 # Implementation from unicode-segmentation
149 def emit_table(f
, name
, t_data
, t_type
= "&'static [(char, char)]", is_pub
=True,
150 pfun
=lambda x
: "(%s,%s)" % (escape_char(x
[0]), escape_char(x
[1])), is_const
=True):
155 pub_string
= "pub " + pub_string
156 f
.write(" %s %s: %s = &[\n" % (pub_string
, name
, t_type
))
164 format_table_content(f
, data
, 8)
169 pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
170 use core::cmp::Ordering::{Equal, Less, Greater};
171 match r.binary_search_by(|&(lo, hi, _)| {
172 if lo <= c && c <= hi { Equal }
173 else if hi < c { Less }
177 let (_, _, cat) = r[idx];
185 pub fn get_script(c: char) -> Option<Script> {
186 bsearch_range_value_table(c, SCRIPTS)
190 pub fn get_script_extension(c: char) -> Option<ScriptExtension> {
191 bsearch_range_value_table(c, SCRIPT_EXTENSIONS)
195 def emit_enums(f
, script_list
, extension_list
, longforms
):
197 Emit the Script and ScriptExtension enums as well as any related utility functions
201 #[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
203 #[allow(non_camel_case_types)]
205 /// A value of the `Script` property
214 for (i
, script
) in enumerate(script_list
):
215 f
.write(" /// %s\n %s = %s,\n" % (script
, longforms
[script
], i
))
217 f
.write("pub const NEXT_SCRIPT: u8 = %s;" % len(script_list
))
220 pub mod script_extensions {
221 use crate::ScriptExtension;
222 pub const COMMON: ScriptExtension = ScriptExtension::new_common();
223 pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited();
224 pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown();
226 for (i
, script
) in enumerate(script_list
):
230 # need to replace L because `hex()` will spit out an L suffix for larger numbers
232 first
= hex(1 << i
).replace("L", "")
234 second
= hex(1 << (i
- 64)).replace("L", "")
236 third
= hex(1 << (i
- 128)).replace("L", "")
237 f
.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n" %
238 (longforms
[script
], longforms
[script
].upper(), first
, second
, third
))
239 if script
!= longforms
[script
]:
240 f
.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" %
241 (longforms
[script
], script
.upper(), longforms
[script
].upper()))
242 for ext
in extension_list
:
243 longform
= ", ".join([longforms
[s
] for s
in ext
])
244 name
= "_".join([s
.upper() for s
in ext
])
245 expr
= ext
[0].upper()
247 expr
= "%s.union(%s)" % (expr
, e
.upper())
248 f
.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longform
, name
, expr
))
253 # Generate implementation for the `Script`
254 generate_script_impl(f
)
257 def generate_script_impl(f
):
258 """Generates an `impl Script { ... }` section with all the required functions"""
260 # Open `impl Script` section.
261 f
.write("""impl Script {
264 # Generate impl of `inner_full_name`.
267 pub(crate) fn inner_full_name(self) -> &'static str {
269 Script::Unknown => "Unknown",
270 Script::Common => "Common",
271 Script::Inherited => "Inherited",
273 for script
in script_list
:
274 f
.write(" Script::%s => \"%s\",\n" % (longforms
[script
], longforms
[script
]))
279 # Generate impl of `inner_from_full_name`.
282 pub(crate) fn inner_from_full_name(input: &str) -> Option<Self> {
284 "Unknown" => Some(Script::Unknown),
285 "Common" => Some(Script::Common),
286 "Inherited" => Some(Script::Inherited),
288 for script
in script_list
:
289 f
.write(" \"%s\" => Some(Script::%s),\n" % (longforms
[script
], longforms
[script
]))
290 f
.write(" _ => None,\n" )
295 # Generate impl of `inner_short_name`
298 pub(crate) fn inner_short_name(self) -> &'static str {
300 Script::Unknown => "",
301 Script::Common => "Zyyy",
302 Script::Inherited => "Zinh",
304 for script
in script_list
:
305 f
.write(" Script::%s => \"%s\",\n" % (longforms
[script
], script
))
310 # Generate impl of `inner_from_short_name`
313 pub(crate) fn inner_from_short_name(input: &str) -> Option<Self> {
315 "Zyyy" => Some(Script::Common),
316 "Zinh" => Some(Script::Inherited),
318 for script
in script_list
:
319 f
.write(" \"%s\" => Some(Script::%s),\n" % (script
, longforms
[script
]))
320 f
.write(""" _ => None,\n""")
325 # Generate impl of `for_integer`
328 pub(crate) fn for_integer(value: u8) -> Self {
331 for (i
, script
) in enumerate(script_list
):
332 f
.write(" %s => Script::%s,\n" % (i
, longforms
[script
]))
333 f
.write(""" _ => unreachable!(),
338 # Close `impl Script` section
def extension_name(ext):
    """Build the Rust path of the `script_extensions` constant for `ext`.

    `ext` is an iterable of strings; each element is upper-cased and the
    results are joined with underscores, mirroring how `emit_enums` names
    the combined-extension constants it writes.
    """
    constant = "_".join(part.upper() for part in ext)
    return "script_extensions::%s" % constant
348 if __name__
== "__main__":
350 if os
.path
.exists(r
):
352 with
open(r
, "w") as rf
:
353 # write the file's preamble
356 /// The version of [Unicode](http://www.unicode.org/)
357 /// that this version of unicode-script is based on.
358 pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
359 """ % UNICODE_VERSION
)
362 (longforms
, shortforms
) = aliases()
364 scripts
= load_properties("Scripts.txt", [])
369 for script
in scripts
:
370 if script
not in ["Common", "Unknown", "Inherited"]:
371 script_list
.append(shortforms
[script
])
372 script_table
.extend([(x
, y
, shortforms
[script
]) for (x
, y
) in scripts
[script
]])
374 script_table
.sort(key
=lambda w
: w
[0])
377 extensions
= load_properties("ScriptExtensions.txt", [])
381 for ext
in extensions
:
382 split
= ext
.split(" ")
386 extension_list
.append(split
)
388 extension_table
.extend([(x
, y
, output_ext
) for (x
, y
) in extensions
[ext
]])
389 extension_table
.sort(key
=lambda w
: w
[0])
392 emit_enums(rf
, script_list
, extension_list
, longforms
)
395 emit_table(rf
, "SCRIPTS", script_table
, t_type
= "&'static [(char, char, Script)]",
396 is_pub
=False , pfun
=lambda x
: "(%s,%s, Script::%s)" % (escape_char(x
[0]), escape_char(x
[1]), longforms
[x
[2]]))
397 emit_table(rf
, "SCRIPT_EXTENSIONS", extension_table
, t_type
= "&'static [(char, char, ScriptExtension)]",
398 is_pub
=False , pfun
=lambda x
: "(%s,%s,%s)" % (escape_char(x
[0]), escape_char(x
[1]), extension_name(x
[2])))
400 # emit_table(rf, "FOObar", properties)