src/etc/char_private.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright 2011-2016 The Rust Project Developers. See the COPYRIGHT
   4 # file at the top-level directory of this distribution and at
   5 # http://rust-lang.org/COPYRIGHT.
   6 #
   7 # Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   8 # http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   9 # <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  10 # option. This file may not be copied, modified, or distributed
  11 # except according to those terms.
  12
  13 # This script uses the following Unicode tables:
  14 # - UnicodeData.txt
  15
  16
  17 from collections import namedtuple
  18 import csv
  19 import os
  20 import subprocess
  21
# Exclusive upper bound of the code point values this script enumerates
# (the Unicode code space is U+0000..U+10FFFF). get_codepoints() pads its
# output with unassigned entries up to, but not including, this value.
NUM_CODEPOINTS=0x110000
  23
  24 def to_ranges(iter):
  25     current = None
  26     for i in iter:
  27         if current is None or i != current[1] or i in (0x10000, 0x20000):
  28             if current is not None:
  29                 yield tuple(current)
  30             current = [i, i + 1]
  31         else:
  32             current[1] += 1
  33     if current is not None:
  34         yield tuple(current)
  35
  36 def get_escaped(codepoints):
  37     for c in codepoints:
  38         if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
  39             yield c.value
  40
  41 def get_file(f):
  42     try:
  43         return open(os.path.basename(f))
  44     except FileNotFoundError:
  45         subprocess.run(["curl", "-O", f], check=True)
  46         return open(os.path.basename(f))
  47
  48 Codepoint = namedtuple('Codepoint', 'value class_')
  49
  50 def get_codepoints(f):
  51     r = csv.reader(f, delimiter=";")
  52     prev_codepoint = 0
  53     class_first = None
  54     for row in r:
  55         codepoint = int(row[0], 16)
  56         name = row[1]
  57         class_ = row[2]
  58
  59         if class_first is not None:
  60             if not name.endswith("Last>"):
  61                 raise ValueError("Missing Last after First")
  62
  63         for c in range(prev_codepoint + 1, codepoint):
  64             yield Codepoint(c, class_first)
  65
  66         class_first = None
  67         if name.endswith("First>"):
  68             class_first = class_
  69
  70         yield Codepoint(codepoint, class_)
  71         prev_codepoint = codepoint
  72
  73     if class_first != None:
  74         raise ValueError("Missing Last after First")
  75
  76     for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
  77         yield Codepoint(c, None)
  78
  79 def compress_singletons(singletons):
  80     uppers = [] # (upper, # items in lowers)
  81     lowers = []
  82
  83     for i in singletons:
  84         upper = i >> 8
  85         lower = i & 0xff
  86         if len(uppers) == 0 or uppers[-1][0] != upper:
  87             uppers.append((upper, 1))
  88         else:
  89             upper, count = uppers[-1]
  90             uppers[-1] = upper, count + 1
  91         lowers.append(lower)
  92
  93     return uppers, lowers
  94
  95 def compress_normal(normal):
  96     # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
  97     # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
  98     compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
  99
 100     prev_start = 0
 101     for start, count in normal:
 102         truelen = start - prev_start
 103         falselen = count
 104         prev_start = start + count
 105
 106         assert truelen < 0x8000 and falselen < 0x8000
 107         entry = []
 108         if truelen > 0x7f:
 109             entry.append(0x80 | (truelen >> 8))
 110             entry.append(truelen & 0xff)
 111         else:
 112             entry.append(truelen & 0x7f)
 113         if falselen > 0x7f:
 114             entry.append(0x80 | (falselen >> 8))
 115             entry.append(falselen & 0xff)
 116         else:
 117             entry.append(falselen & 0x7f)
 118
 119         compressed.append(entry)
 120
 121     return compressed
 122
 123 def print_singletons(uppers, lowers, uppersname, lowersname):
 124     print("const {}: &'static [(u8, u8)] = &[".format(uppersname))
 125     for u, c in uppers:
 126         print("    ({:#04x}, {}),".format(u, c))
 127     print("];")
 128     print("const {}: &'static [u8] = &[".format(lowersname))
 129     for i in range(0, len(lowers), 8):
 130         print("    {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
 131     print("];")
 132
 133 def print_normal(normal, normalname):
 134     print("const {}: &'static [u8] = &[".format(normalname))
 135     for v in normal:
 136         print("    {}".format(" ".join("{:#04x},".format(i) for i in v)))
 137     print("];")
 138
 139 def main():
 140     file = get_file("http://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
 141
 142     codepoints = get_codepoints(file)
 143
 144     CUTOFF=0x10000
 145     singletons0 = []
 146     singletons1 = []
 147     normal0 = []
 148     normal1 = []
 149     extra = []
 150
 151     for a, b in to_ranges(get_escaped(codepoints)):
 152         if a > 2 * CUTOFF:
 153             extra.append((a, b - a))
 154         elif a == b - 1:
 155             if a & CUTOFF:
 156                 singletons1.append(a & ~CUTOFF)
 157             else:
 158                 singletons0.append(a)
 159         elif a == b - 2:
 160             if a & CUTOFF:
 161                 singletons1.append(a & ~CUTOFF)
 162                 singletons1.append((a + 1) & ~CUTOFF)
 163             else:
 164                 singletons0.append(a)
 165                 singletons0.append(a + 1)
 166         else:
 167             if a >= 2 * CUTOFF:
 168                 extra.append((a, b - a))
 169             elif a & CUTOFF:
 170                 normal1.append((a & ~CUTOFF, b - a))
 171             else:
 172                 normal0.append((a, b - a))
 173
 174     singletons0u, singletons0l = compress_singletons(singletons0)
 175     singletons1u, singletons1l = compress_singletons(singletons1)
 176     normal0 = compress_normal(normal0)
 177     normal1 = compress_normal(normal1)
 178
 179     print("""\
 180 // Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
 181 // file at the top-level directory of this distribution and at
 182 // http://rust-lang.org/COPYRIGHT.
 183 //
 184 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 185 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 186 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 187 // option. This file may not be copied, modified, or distributed
 188 // except according to those terms.
 189
 190 // NOTE: The following code was generated by "src/etc/char_private.py",
 191 //       do not edit directly!
 192
 193 fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8],
 194          normal: &[u8]) -> bool {
 195     let xupper = (x >> 8) as u8;
 196     let mut lowerstart = 0;
 197     for &(upper, lowercount) in singletonuppers {
 198         let lowerend = lowerstart + lowercount as usize;
 199         if xupper == upper {
 200             for &lower in &singletonlowers[lowerstart..lowerend] {
 201                 if lower == x as u8 {
 202                     return false;
 203                 }
 204             }
 205         } else if xupper < upper {
 206             break;
 207         }
 208         lowerstart = lowerend;
 209     }
 210
 211     let mut x = x as i32;
 212     let mut normal = normal.iter().cloned();
 213     let mut current = true;
 214     while let Some(v) = normal.next() {
 215         let len = if v & 0x80 != 0 {
 216             ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
 217         } else {
 218             v as i32
 219         };
 220         x -= len;
 221         if x < 0 {
 222             break;
 223         }
 224         current = !current;
 225     }
 226     current
 227 }
 228
 229 pub(crate) fn is_printable(x: char) -> bool {
 230     let x = x as u32;
 231     let lower = x as u16;
 232     if x < 0x10000 {
 233         check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
 234     } else if x < 0x20000 {
 235         check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
 236     } else {\
 237 """)
 238     for a, b in extra:
 239         print("        if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
 240         print("            return false;")
 241         print("        }")
 242     print("""\
 243         true
 244     }
 245 }\
 246 """)
 247     print()
 248     print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
 249     print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
 250     print_normal(normal0, 'NORMAL0')
 251     print_normal(normal1, 'NORMAL1')
 252
# Run the generator only when executed as a script; the generated Rust
# source is written to standard output by main().
if __name__ == '__main__':
    main()