]> git.proxmox.com Git - rustc.git/blob - src/vendor/unicode-segmentation/scripts/unicode_gen_breaktests.py
New upstream version 1.21.0+dfsg1
[rustc.git] / src / vendor / unicode-segmentation / scripts / unicode_gen_breaktests.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8
3 #
4 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
5 # file at the top-level directory of this distribution and at
6 # http://rust-lang.org/COPYRIGHT.
7 #
8 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
9 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
10 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
11 # option. This file may not be copied, modified, or distributed
12 # except according to those terms.
13
14 # This script uses the following Unicode tables:
15 # - auxiliary/GraphemeBreakTest.txt
16 # - auxiliary/WordBreakTest.txt
17 #
18 # Since this should not require frequent updates, we just store this
19 # out-of-line and check the generated testdata.rs file into git.
20
21 import unicode, re, os, fileinput
22
def load_test_data(f, optsplit=None):
    """Fetch a Unicode break-test table and parse every test case in it.

    ``f`` is the table path passed to ``unicode.fetch`` (for example
    ``auxiliary/GraphemeBreakTest.txt``); the file is then read back from
    the current directory under its base name.

    ``optsplit`` is an optional collection of rule numbers (strings such
    as ``'9.1'``).  It is forwarded to ``process_split_info``, which
    records a ``×`` governed by one of these rules as a split even though
    the test data marks it as a non-break.

    Returns a list of ``(chars, info)`` tuples, one per accepted test
    line: ``chars`` is a list of code-point lists and ``info`` holds the
    rule number recorded for each boundary between consecutive entries.

    Lines that do not start with ``÷`` are ignored.  A line that starts
    with ``÷`` but does not match the expected layout is reported on
    standard output and skipped.  Test cases for which
    ``process_split_string`` returns an empty list (surrogates) are
    skipped as well.
    """
    # A None default avoids sharing one mutable list between calls.
    if optsplit is None:
        optsplit = []

    test_re = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)

    outls = []
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character; using
        # startswith() keeps the check independent of how many code units
        # the marker occupies in the string type being read
        if not line.startswith('÷'):
            continue

        m = test_re.match(line)
        if not m:
            # A single parenthesised argument prints the same text whether
            # print is a statement or a function.
            print("error: no match on line where test was expected: %s" % line)
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        (chars, info) = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info)

        outls.append((chars, info))

    return outls
54
def process_split_info(s, c, o):
    """Regroup the characters of one test case according to its rules.

    ``s`` is the rule annotation of a test line: a run of items of the
    form ``(÷|×) [x.y] ...``, one per boundary between characters.
    ``c`` is the list of code-point lists for the same test case; it
    must hold one more entry than ``s`` has markers.
    ``o`` is a collection of rule numbers (strings) for which a ``×`` is
    treated as a split even though it is not marked as one.

    Returns ``(groups, rules)``: ``groups`` is the list of code-point
    lists after merging every entry that is joined to its predecessor by
    a ``×`` whose rule is not in ``o``; ``rules`` is the rule number of
    each boundary that was kept as a split.

    Entries are popped off the front of ``c`` as they are used, and a
    popped list may be extended in place when the following entry is
    merged into it.

    Raises ``ValueError`` if a marker is not followed by a bracketed rule
    number and ``IndexError`` if ``c`` runs out of entries.
    """
    outcs = []
    outis = []
    workcs = c.pop(0)

    # are we on a × or a ÷?  startswith()/find() compare the whole marker,
    # so the result does not depend on how wide the marker is in ``s``.
    is_x = s.startswith('×')

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        rule = s[s.index('[') + 1:s.index(']')]

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not is_x or rule in o:
            outis.append(rule)
            outcs.append(workcs)
            workcs = c.pop(0)
        else:
            workcs.extend(c.pop(0))

        # Advance to the next marker, searching from index 1 so that the
        # marker at the head of ``s`` is not found again.
        upcoming = [p for p in (s.find('×', 1), s.find('÷', 1)) if p != -1]
        if not upcoming:
            break
        nxt = min(upcoming)
        is_x = s.startswith('×', nxt)
        s = s[nxt:]

    outcs.append(workcs)
    return (outcs, outis)
95
def process_split_string(s):
    """Turn the character part of a test line into lists of code points.

    ``s`` is split on whitespace.  Each ``÷`` or ``×`` token closes the
    list collected so far (even if it is empty) and starts a new one;
    every other token is parsed as a hexadecimal code point.

    Returns the list of code-point lists, or an empty list as soon as
    ``unicode.is_surrogate`` reports one of the code points as a
    surrogate, so that the caller can skip the test case.
    """
    groups = []
    current = []

    for token in s.split():
        if token in ('÷', '×'):
            # boundary marker: finish the current group
            groups.append(current)
            current = []
        else:
            codepoint = int(token, 16)
            if unicode.is_surrogate(codepoint):
                return []
            current.append(codepoint)

    # keep whatever was collected after the last marker
    if current:
        groups.append(current)

    return groups
119
def showfun(x):
    """Format one test-table entry as the text of a Rust tuple literal.

    ``x`` is a sequence.  Its first item is an iterable of code points;
    it is written as one double-quoted string in which every code point
    appears as a braced lowercase-hex ``u{...}`` escape.  Each remaining
    item is an iterable of code-point sequences and is written as a
    ``&[...]`` slice of such strings separated by commas.

    Returns the resulting text, wrapped in parentheses.  The opening
    ``&[`` and closing ``]`` are always emitted, so an entry with no
    remaining items yields an empty slice.
    """
    def quoted(codepoints):
        # One double-quoted string literal built from escape sequences.
        return '"' + ''.join("\\u{%x}" % cp for cp in codepoints) + '"'

    # str.join places the separators, so no "first item" flags are needed.
    slices = '],&['.join(
        ','.join(quoted(segment) for segment in segments)
        for segments in x[1:]
    )
    return '(' + quoted(x[0]) + ',&[' + slices + '])'
141
def create_grapheme_data(f):
    """Write the grapheme-break test tables to the open file ``f``.

    The test cases come from ``auxiliary/GraphemeBreakTest.txt``, loaded
    with rules 9.1 and 9.2 treated as splits.  For every test case the
    extended grapheme clusters are rebuilt by merging the clusters again
    across those two rules.  Cases where that changes nothing go into
    ``TEST_SAME`` as ``(all code points, clusters)``; the others go into
    ``TEST_DIFF`` as ``(all code points, extended clusters, clusters)``.
    Both tables are emitted through ``unicode.emit_table``.
    """
    # rules 9.1 and 9.2 are for extended graphemes only
    ext_only_rules = ['9.1','9.2']
    cases = load_test_data("auxiliary/GraphemeBreakTest.txt", ext_only_rules)

    same_cases = []
    diff_cases = []

    for (clusters, rules) in cases:
        flat = [cp for cluster in clusters for cp in cluster]

        # Walk the boundaries: rules[pos] sits between clusters[pos] and
        # clusters[pos + 1].
        extended = []
        pending = list(clusters[0])
        for pos, rule in enumerate(rules):
            following = clusters[pos + 1]
            if rule in ext_only_rules:
                # not a break for extended graphemes: keep accumulating
                pending.extend(following)
            else:
                extended.append(pending)
                pending = list(following)

        # these are the extended grapheme clusters
        extended.append(pending)

        if extended == clusters:
            same_cases.append((flat, clusters))
        else:
            diff_cases.append((flat, extended, clusters))

    same_type = "&'static [(&'static str, &'static [&'static str])]"
    diff_type = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
    unicode.emit_table(f, "TEST_SAME", same_cases, same_type, True, showfun, True)
    unicode.emit_table(f, "TEST_DIFF", diff_cases, diff_type, True, showfun, True)
178
def create_words_data(f):
    """Write the word-break test table to the open file ``f``.

    The test cases come from ``auxiliary/WordBreakTest.txt``.  Each one
    becomes an ``(all code points, segments)`` entry of the ``TEST_WORD``
    table, which is emitted through ``unicode.emit_table``.
    """
    cases = load_test_data("auxiliary/WordBreakTest.txt")

    # The rule numbers returned with each case are not needed here.
    table = [
        ([cp for segment in segments for cp in segment], segments)
        for (segments, _rules) in cases
    ]

    word_type = "&'static [(&'static str, &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
    unicode.emit_table(f, "TEST_WORD", table, word_type, True, showfun, True)
192
if __name__ == "__main__":
    # Regenerate testdata.rs in the current directory: the shared preamble
    # first, then the grapheme tables, then the word table.
    with open("testdata.rs", "w") as out:
        out.write(unicode.preamble)
        for emit in (create_grapheme_data, create_words_data):
            emit(out)