src/vendor/unicode-segmentation/scripts/unicode_gen_breaktests.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8
   3 #
   4 # Copyright 2015 The Rust Project Developers. See the COPYRIGHT
   5 # file at the top-level directory of this distribution and at
   6 # http://rust-lang.org/COPYRIGHT.
   7 #
   8 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   9 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  10 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  11 # option. This file may not be copied, modified, or distributed
  12 # except according to those terms.
  13
  14 # This script uses the following Unicode tables:
  15 # - auxiliary/GraphemeBreakTest.txt
  16 # - auxiliary/WordBreakTest.txt
  17 #
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated testdata.rs file into git.
  20
  21 import unicode, re, os, fileinput
  22
  23 def load_test_data(f, optsplit=[]):
  24     outls = []
  25     testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
  26
  27     unicode.fetch(f)
  28     data = []
  29     for line in fileinput.input(os.path.basename(f)):
  30         # lines that include a test start with the ÷ character
  31         if len(line) < 2 or line[0:2] != '÷':
  32             continue
  33
  34         m = testRe1.match(line)
  35         if not m:
  36             print "error: no match on line where test was expected: %s" % line
  37             continue
  38
  39         # process the characters in this test case
  40         chars = process_split_string(m.group(1))
  41         # skip test case if it contains invalid characters (viz., surrogates)
  42         if not chars:
  43             continue
  44
  45         # now process test cases
  46         (chars, info) = process_split_info(m.group(2), chars, optsplit)
  47
  48         # make sure that we have break info for each break!
  49         assert len(chars) - 1 == len(info)
  50
  51         outls.append((chars, info))
  52
  53     return outls
  54
  55 def process_split_info(s, c, o):
  56     outcs = []
  57     outis = []
  58     workcs = c.pop(0)
  59
  60     # are we on a × or a ÷?
  61     isX = False
  62     if s[0:2] == '×':
  63         isX = True
  64
  65     # find each instance of '(÷|×) [x.y] '
  66     while s:
  67         # find the currently considered rule number
  68         sInd = s.index('[') + 1
  69         eInd = s.index(']')
  70
  71         # if it's '× [a.b]' where 'a.b' is in o, then
  72         # we consider it a split even though it's not
  73         # marked as one
  74         # if it's ÷ then it's always a split
  75         if not isX or s[sInd:eInd] in o:
  76             outis.append(s[sInd:eInd])
  77             outcs.append(workcs)
  78             workcs = c.pop(0)
  79         else:
  80             workcs.extend(c.pop(0))
  81
  82         idx = 1
  83         while idx < len(s):
  84             if s[idx:idx+2] == '×':
  85                 isX = True
  86                 break
  87             if s[idx:idx+2] == '÷':
  88                 isX = False
  89                 break
  90             idx += 1
  91         s = s[idx:]
  92
  93     outcs.append(workcs)
  94     return (outcs, outis)
  95
  96 def process_split_string(s):
  97     outls = []
  98     workls = []
  99
 100     inls = s.split()
 101
 102     for i in inls:
 103         if i == '÷' or i == '×':
 104             outls.append(workls)
 105             workls = []
 106             continue
 107
 108         ival = int(i,16)
 109
 110         if unicode.is_surrogate(ival):
 111             return []
 112
 113         workls.append(ival)
 114
 115     if workls:
 116         outls.append(workls)
 117
 118     return outls
 119
 120 def showfun(x):
 121     outstr = '("'
 122     for c in x[0]:
 123         outstr += "\\u{%x}" % c
 124     outstr += '",&['
 125     xfirst = True
 126     for xx in x[1:]:
 127         if not xfirst:
 128             outstr += '],&['
 129         xfirst = False
 130         sfirst = True
 131         for sp in xx:
 132             if not sfirst:
 133                 outstr += ','
 134             sfirst = False
 135             outstr += '"'
 136             for c in sp:
 137                 outstr += "\\u{%x}" % c
 138             outstr += '"'
 139     outstr += '])'
 140     return outstr
 141
 142 def create_grapheme_data(f):
 143     # rules 9.1 and 9.2 are for extended graphemes only
 144     optsplits = ['9.1','9.2']
 145     d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
 146
 147     test_same = []
 148     test_diff = []
 149
 150     for (c, i) in d:
 151         allchars = [cn for s in c for cn in s]
 152         extgraphs = []
 153         extwork = []
 154
 155         extwork.extend(c[0])
 156         for n in range(0,len(i)):
 157             if i[n] in optsplits:
 158                 extwork.extend(c[n+1])
 159             else:
 160                 extgraphs.append(extwork)
 161                 extwork = []
 162                 extwork.extend(c[n+1])
 163
 164         # these are the extended grapheme clusters
 165         extgraphs.append(extwork)
 166
 167         if extgraphs == c:
 168             test_same.append((allchars, c))
 169         else:
 170             test_diff.append((allchars, extgraphs, c))
 171
 172     stype = "&'static [(&'static str, &'static [&'static str])]"
 173     dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
 174     f.write("    // official Unicode test data\n")
 175     f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
 176     unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
 177     unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
 178
 179 def create_words_data(f):
 180     d = load_test_data("auxiliary/WordBreakTest.txt")
 181
 182     test = []
 183
 184     for (c, i) in d:
 185         allchars = [cn for s in c for cn in s]
 186         test.append((allchars, c))
 187
 188     wtype = "&'static [(&'static str, &'static [&'static str])]"
 189     f.write("    // official Unicode test data\n")
 190     f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
 191     unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
 192
 193 if __name__ == "__main__":
 194     with open("testdata.rs", "w") as rf:
 195         rf.write(unicode.preamble)
 196         create_grapheme_data(rf)
 197         create_words_data(rf)