ceph/src/fmt/support/printable.py

   1 #!/usr/bin/env python3
   2
   3 # This script is based on
   4 # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
   5 # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.
   6
   7 # This script uses the following Unicode tables:
   8 # - UnicodeData.txt
   9
  10
  11 from collections import namedtuple
  12 import csv
  13 import os
  14 import subprocess
  15
  16 NUM_CODEPOINTS=0x110000
  17
  18 def to_ranges(iter):
  19     current = None
  20     for i in iter:
  21         if current is None or i != current[1] or i in (0x10000, 0x20000):
  22             if current is not None:
  23                 yield tuple(current)
  24             current = [i, i + 1]
  25         else:
  26             current[1] += 1
  27     if current is not None:
  28         yield tuple(current)
  29
  30 def get_escaped(codepoints):
  31     for c in codepoints:
  32         if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
  33             yield c.value
  34
  35 def get_file(f):
  36     try:
  37         return open(os.path.basename(f))
  38     except FileNotFoundError:
  39         subprocess.run(["curl", "-O", f], check=True)
  40         return open(os.path.basename(f))
  41
  42 Codepoint = namedtuple('Codepoint', 'value class_')
  43
  44 def get_codepoints(f):
  45     r = csv.reader(f, delimiter=";")
  46     prev_codepoint = 0
  47     class_first = None
  48     for row in r:
  49         codepoint = int(row[0], 16)
  50         name = row[1]
  51         class_ = row[2]
  52
  53         if class_first is not None:
  54             if not name.endswith("Last>"):
  55                 raise ValueError("Missing Last after First")
  56
  57         for c in range(prev_codepoint + 1, codepoint):
  58             yield Codepoint(c, class_first)
  59
  60         class_first = None
  61         if name.endswith("First>"):
  62             class_first = class_
  63
  64         yield Codepoint(codepoint, class_)
  65         prev_codepoint = codepoint
  66
  67     if class_first is not None:
  68         raise ValueError("Missing Last after First")
  69
  70     for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
  71         yield Codepoint(c, None)
  72
  73 def compress_singletons(singletons):
  74     uppers = [] # (upper, # items in lowers)
  75     lowers = []
  76
  77     for i in singletons:
  78         upper = i >> 8
  79         lower = i & 0xff
  80         if len(uppers) == 0 or uppers[-1][0] != upper:
  81             uppers.append((upper, 1))
  82         else:
  83             upper, count = uppers[-1]
  84             uppers[-1] = upper, count + 1
  85         lowers.append(lower)
  86
  87     return uppers, lowers
  88
  89 def compress_normal(normal):
  90     # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
  91     # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
  92     compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
  93
  94     prev_start = 0
  95     for start, count in normal:
  96         truelen = start - prev_start
  97         falselen = count
  98         prev_start = start + count
  99
 100         assert truelen < 0x8000 and falselen < 0x8000
 101         entry = []
 102         if truelen > 0x7f:
 103             entry.append(0x80 | (truelen >> 8))
 104             entry.append(truelen & 0xff)
 105         else:
 106             entry.append(truelen & 0x7f)
 107         if falselen > 0x7f:
 108             entry.append(0x80 | (falselen >> 8))
 109             entry.append(falselen & 0xff)
 110         else:
 111             entry.append(falselen & 0x7f)
 112
 113         compressed.append(entry)
 114
 115     return compressed
 116
 117 def print_singletons(uppers, lowers, uppersname, lowersname):
 118     print("  static constexpr singleton {}[] = {{".format(uppersname))
 119     for u, c in uppers:
 120         print("    {{{:#04x}, {}}},".format(u, c))
 121     print("  };")
 122     print("  static constexpr unsigned char {}[] = {{".format(lowersname))
 123     for i in range(0, len(lowers), 8):
 124         print("    {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
 125     print("  };")
 126
 127 def print_normal(normal, normalname):
 128     print("  static constexpr unsigned char {}[] = {{".format(normalname))
 129     for v in normal:
 130         print("    {}".format(" ".join("{:#04x},".format(i) for i in v)))
 131     print("  };")
 132
 133 def main():
 134     file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
 135
 136     codepoints = get_codepoints(file)
 137
 138     CUTOFF=0x10000
 139     singletons0 = []
 140     singletons1 = []
 141     normal0 = []
 142     normal1 = []
 143     extra = []
 144
 145     for a, b in to_ranges(get_escaped(codepoints)):
 146         if a > 2 * CUTOFF:
 147             extra.append((a, b - a))
 148         elif a == b - 1:
 149             if a & CUTOFF:
 150                 singletons1.append(a & ~CUTOFF)
 151             else:
 152                 singletons0.append(a)
 153         elif a == b - 2:
 154             if a & CUTOFF:
 155                 singletons1.append(a & ~CUTOFF)
 156                 singletons1.append((a + 1) & ~CUTOFF)
 157             else:
 158                 singletons0.append(a)
 159                 singletons0.append(a + 1)
 160         else:
 161             if a >= 2 * CUTOFF:
 162                 extra.append((a, b - a))
 163             elif a & CUTOFF:
 164                 normal1.append((a & ~CUTOFF, b - a))
 165             else:
 166                 normal0.append((a, b - a))
 167
 168     singletons0u, singletons0l = compress_singletons(singletons0)
 169     singletons1u, singletons1l = compress_singletons(singletons1)
 170     normal0 = compress_normal(normal0)
 171     normal1 = compress_normal(normal1)
 172
 173     print("""\
 174 FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
 175 """)
 176     print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
 177     print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
 178     print_normal(normal0, 'normal0')
 179     print_normal(normal1, 'normal1')
 180     print("""\
 181   auto lower = static_cast<uint16_t>(cp);
 182   if (cp < 0x10000) {
 183     return is_printable(lower, singletons0,
 184                         sizeof(singletons0) / sizeof(*singletons0),
 185                         singletons0_lower, normal0, sizeof(normal0));
 186   }
 187   if (cp < 0x20000) {
 188     return is_printable(lower, singletons1,
 189                         sizeof(singletons1) / sizeof(*singletons1),
 190                         singletons1_lower, normal1, sizeof(normal1));
 191   }\
 192 """)
 193     for a, b in extra:
 194         print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
 195     print("""\
 196   return cp < 0x{:x};
 197 }}\
 198 """.format(NUM_CODEPOINTS))
 199
 200 if __name__ == '__main__':
 201     main()