]>
git.proxmox.com Git - rustc.git/blob - src/vendor/unicode-segmentation/scripts/unicode_gen_breaktests.py
4 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
5 # file at the top-level directory of this distribution and at
6 # http://rust-lang.org/COPYRIGHT.
8 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
9 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
10 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
11 # option. This file may not be copied, modified, or distributed
12 # except according to those terms.
14 # This script uses the following Unicode tables:
15 # - auxiliary/GraphemeBreakTest.txt
16 # - auxiliary/WordBreakTest.txt
18 # Since this should not require frequent updates, we just store this
19 # out-of-line and check the unicode.rs file into git.
21 import unicode, re
, os
, fileinput
23 def load_test_data(f
, optsplit
=[]):
25 testRe1
= re
.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
29 for line
in fileinput
.input(os
.path
.basename(f
)):
30 # lines that include a test start with the ÷ character
31 if len(line
) < 2 or line
[0:2] != '÷':
34 m
= testRe1
.match(line
)
36 print "error: no match on line where test was expected: %s" % line
39 # process the characters in this test case
40 chars
= process_split_string(m
.group(1))
41 # skip test case if it contains invalid characters (viz., surrogates)
45 # now process test cases
46 (chars
, info
) = process_split_info(m
.group(2), chars
, optsplit
)
48 # make sure that we have break info for each break!
49 assert len(chars
) - 1 == len(info
)
51 outls
.append((chars
, info
))
55 def process_split_info(s
, c
, o
):
60 # are we on a × or a ÷?
65 # find each instance of '(÷|×) [x.y] '
67 # find the currently considered rule number
68 sInd
= s
.index('[') + 1
71 # if it's '× [a.b]' where 'a.b' is in o, then
72 # we consider it a split even though it's not
74 # if it's ÷ then it's always a split
75 if not isX
or s
[sInd
:eInd
] in o
:
76 outis
.append(s
[sInd
:eInd
])
80 workcs
.extend(c
.pop(0))
84 if s
[idx
:idx
+2] == '×':
87 if s
[idx
:idx
+2] == '÷':
96 def process_split_string(s
):
103 if i
== '÷' or i
== '×':
110 if unicode.is_surrogate(ival
):
123 outstr
+= "\\u{%x}" % c
137 outstr
+= "\\u{%x}" % c
142 def create_grapheme_data(f
):
143 # rules 9.1 and 9.2 are for extended graphemes only
144 optsplits
= ['9.1','9.2']
145 d
= load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits
)
151 allchars
= [cn
for s
in c
for cn
in s
]
156 for n
in range(0,len(i
)):
157 if i
[n
] in optsplits
:
158 extwork
.extend(c
[n
+1])
160 extgraphs
.append(extwork
)
162 extwork
.extend(c
[n
+1])
164 # these are the extended grapheme clusters
165 extgraphs
.append(extwork
)
168 test_same
.append((allchars
, c
))
170 test_diff
.append((allchars
, extgraphs
, c
))
172 stype
= "&'static [(&'static str, &'static [&'static str])]"
173 dtype
= "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
174 f
.write(" // official Unicode test data\n")
175 f
.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
176 unicode.emit_table(f
, "TEST_SAME", test_same
, stype
, True, showfun
, True)
177 unicode.emit_table(f
, "TEST_DIFF", test_diff
, dtype
, True, showfun
, True)
179 def create_words_data(f
):
180 d
= load_test_data("auxiliary/WordBreakTest.txt")
185 allchars
= [cn
for s
in c
for cn
in s
]
186 test
.append((allchars
, c
))
188 wtype
= "&'static [(&'static str, &'static [&'static str])]"
189 f
.write(" // official Unicode test data\n")
190 f
.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
191 unicode.emit_table(f
, "TEST_WORD", test
, wtype
, True, showfun
, True)
193 if __name__
== "__main__":
194 with
open("testdata.rs", "w") as rf
:
195 rf
.write(unicode.preamble
)
196 create_grapheme_data(rf
)
197 create_words_data(rf
)