[rustc.git] / library / core / src / unicode / printable.py

#!/usr/bin/env python

# This script uses the following Unicode tables:
# - UnicodeData.txt


from collections import namedtuple
import csv
import os
import subprocess

# Exclusive upper bound of the code point space handled by this script
# (code points run from 0 to 0x10FFFF).  get_codepoints() uses it as the end
# of the range when emitting the code points that follow the last row read.
NUM_CODEPOINTS=0x110000

def to_ranges(iter):
    """Collapse a stream of integers into half-open ``(start, end)`` runs.

    Each value that equals the ``end`` of the run being built extends that
    run by one; any other value closes the run and opens a new one.  The
    values 0x10000 and 0x20000 always open a new run, so no run continues
    across either of those two boundaries.

    Yields nothing for an empty input.
    """
    forced_breaks = (0x10000, 0x20000)
    start = end = None
    for value in iter:
        extends_run = (
            start is not None
            and value == end
            and value not in forced_breaks
        )
        if extends_run:
            end += 1
            continue
        # Close the previous run (if there is one) before opening the next.
        if start is not None:
            yield (start, end)
        start, end = value, value + 1
    if start is not None:
        yield (start, end)

def get_escaped(codepoints):
    """Yield the values of the code points that are treated as non-printable.

    ``codepoints`` is an iterable of objects with ``value`` and ``class_``
    attributes.  A code point is yielded when its class is one of
    Cc, Cf, Cs, Co, Cn, Zl, Zp or Zs; a missing class (``None`` or empty) is
    treated as Cn.  The ASCII space character is never yielded even though
    its class is Zs.
    """
    # Built once per generator instead of re-splitting a string for every
    # code point; the input normally covers the whole code space.
    escaped_classes = frozenset(
        ("Cc", "Cf", "Cs", "Co", "Cn", "Zl", "Zp", "Zs")
    )
    space = ord(' ')
    for c in codepoints:
        if (c.class_ or "Cn") in escaped_classes and c.value != space:
            yield c.value

def get_file(f):
    """Open the local copy of the file named by URL ``f``, fetching it if absent.

    The local name is the last path component of ``f``, looked up in the
    current working directory.  If it does not exist, ``curl -O`` is run on
    ``f`` (raising ``subprocess.CalledProcessError`` on a non-zero exit) and
    the open is retried.

    Returns an open text-mode file object; the caller owns it.
    """
    local_name = os.path.basename(f)
    try:
        return open(local_name)
    except FileNotFoundError:
        # Not cached yet: download next to the script's working directory.
        subprocess.run(["curl", "-O", f], check=True)
        return open(local_name)

# One code point as produced by get_codepoints(): `value` is the integer code
# point and `class_` is the third column of its UnicodeData.txt row, or None
# for code points that get_codepoints() fills in without a class.
Codepoint = namedtuple('Codepoint', 'value class_')

def get_codepoints(f):
    """Yield a ``Codepoint`` for every code point from the parsed table upward.

    ``f`` is an iterable of ``;``-delimited rows whose first three columns
    are the hexadecimal code point, its name and its class.  Rows are
    expected in ascending order.  Code points that fall between two rows are
    filled in: with the class of the preceding row when that row's name ends
    in ``First>`` (a range start), otherwise with ``None``.  After the last
    row, every remaining code point below ``NUM_CODEPOINTS`` is yielded with
    class ``None``.

    Gap filling starts at code point 1, so code point 0 is only emitted if
    the input lists it.

    Raises:
        ValueError: a ``First>`` row is not immediately followed by a row
            whose name ends in ``Last>`` (including at end of input).
    """
    last_seen = 0
    range_class = None
    for fields in csv.reader(f, delimiter=";"):
        value = int(fields[0], 16)
        label = fields[1]
        category = fields[2]

        # A range opened by the previous row must be closed by this one.
        if range_class is not None and not label.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the hole between the previous row and this one.
        for missing in range(last_seen + 1, value):
            yield Codepoint(missing, range_class)

        range_class = category if label.endswith("First>") else None

        yield Codepoint(value, category)
        last_seen = value

    if range_class is not None:
        raise ValueError("Missing Last after First")

    for missing in range(last_seen + 1, NUM_CODEPOINTS):
        yield Codepoint(missing, None)

def compress_singletons(singletons):
    """Split values into grouped high parts and a flat list of low bytes.

    Each value is divided into ``value >> 8`` and ``value & 0xff``.  Adjacent
    values that share the same high part are merged into a single
    ``(high, count)`` entry, where ``count`` is how many low bytes belong to
    it.

    Returns:
        ``(uppers, lowers)``: ``uppers`` is the list of ``(high, count)``
        tuples and ``lowers`` is the list of low bytes in input order.
    """
    uppers = []  # (high part, number of entries in `lowers` that belong to it)
    lowers = []

    for value in singletons:
        high, low = divmod(value, 0x100)
        if uppers and uppers[-1][0] == high:
            # Same high part as the previous value: bump its count.
            uppers[-1] = (high, uppers[-1][1] + 1)
        else:
            uppers.append((high, 1))
        lowers.append(low)

    return uppers, lowers

def compress_normal(normal):
    """Encode ``(start, count)`` ranges as alternating run lengths.

    For each range two lengths are produced: the distance from the end of
    the previous range to ``start`` (the "true" run) and ``count`` (the
    "false" run).  Every length is encoded as:

    * one byte for 0x00..0x7f (00, 01, ..., 7e, 7f);
    * two bytes for 0x80..0x7fff, the first with its top bit set
      (80 80, 80 81, ..., ff fe, ff ff).

    Returns a list with one entry per input range; each entry is the list of
    bytes for the true length followed by the bytes for the false length.
    Both lengths are asserted to be below 0x8000.
    """
    def encode(length):
        # The top bit of the first byte signals that a second byte follows.
        if length > 0x7f:
            return [0x80 | (length >> 8), length & 0xff]
        return [length & 0x7f]

    compressed = []  # [truelen, (truelenaux), falselen, (falselenaux)]
    cursor = 0  # end of the previous range
    for start, count in normal:
        gap = start - cursor
        cursor = start + count

        assert gap < 0x8000 and count < 0x8000
        compressed.append(encode(gap) + encode(count))

    return compressed

def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print two Rust ``const`` slices holding the singleton tables.

    ``uppers`` is written as a ``&[(u8, u8)]`` named ``uppersname``, one
    ``(hex, count)`` pair per line.  ``lowers`` is written as a ``&[u8]``
    named ``lowersname``, eight hex bytes per line.  Each constant is
    preceded by ``#[rustfmt::skip]``.  Output goes to stdout.
    """
    print("#[rustfmt::skip]")
    print("const {}: &[(u8, u8)] = &[".format(uppersname))
    for high, count in uppers:
        print("    ({:#04x}, {}),".format(high, count))
    print("];")

    print("#[rustfmt::skip]")
    print("const {}: &[u8] = &[".format(lowersname))
    rows = [lowers[pos:pos + 8] for pos in range(0, len(lowers), 8)]
    for row in rows:
        cells = ["{:#04x},".format(byte) for byte in row]
        print("    {}".format(" ".join(cells)))
    print("];")

def print_normal(normal, normalname):
    """Print a Rust ``const`` byte slice holding the run-length table.

    ``normal`` is a sequence of byte lists; each list becomes one line of
    comma-terminated hex bytes inside a ``&[u8]`` named ``normalname``,
    preceded by ``#[rustfmt::skip]``.  Output goes to stdout.
    """
    print("#[rustfmt::skip]")
    print("const {}: &[u8] = &[".format(normalname))
    for entry in normal:
        cells = ["{:#04x},".format(byte) for byte in entry]
        print("    {}".format(" ".join(cells)))
    print("];")

def _classify_ranges(ranges, cutoff):
    """Distribute half-open ``(start, end)`` ranges over the output tables.

    * Ranges starting at or above ``2 * cutoff`` go to ``extra`` as
      ``(start, length)``; they are emitted as explicit comparisons.
    * Ranges of length 1 or 2 are stored value by value in ``singletons0``
      (below ``cutoff``) or ``singletons1`` (``cutoff`` bit set, stored with
      that bit cleared).
    * All other ranges go to ``normal0`` / ``normal1`` as ``(start, length)``,
      again with the ``cutoff`` bit cleared for the second table.

    Returns ``(singletons0, singletons1, normal0, normal1, extra)``.
    """
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    for a, b in ranges:
        length = b - a
        if a >= 2 * cutoff:
            # `>=`, not `>`: a short range starting exactly at 2 * cutoff must
            # not fall through to the singleton tables, whose entries are
            # split into u8 halves and cannot represent such a value.
            extra.append((a, length))
        elif length in (1, 2):
            if a & cutoff:
                singletons1.extend(v & ~cutoff for v in range(a, b))
            else:
                singletons0.extend(range(a, b))
        elif a & cutoff:
            normal1.append((a & ~cutoff, length))
        else:
            normal0.append((a, length))

    return singletons0, singletons1, normal0, normal1, extra


def main():
    """Generate the Rust source of ``is_printable`` and print it to stdout.

    Reads UnicodeData.txt (via ``get_file``, which downloads it when it is
    not present locally), determines the escaped code point ranges, splits
    them into singleton, run-length and "extra" tables, and prints the Rust
    ``check`` / ``is_printable`` functions followed by the table constants.
    """
    CUTOFF=0x10000

    # The code point generator reads lazily from the file, so everything that
    # consumes it has to run before the file is closed.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as data:
        codepoints = get_codepoints(data)
        singletons0, singletons1, normal0, normal1, extra = _classify_ranges(
            to_ranges(get_escaped(codepoints)), CUTOFF)

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    # NOTE(review): the path named in the generated header below may not match
    # this script's current location; it is left untouched so that the
    # regenerated output stays byte-identical.
    print("""\
// NOTE: The following code was generated by "src/libcore/unicode/printable.py",
//       do not edit directly!

fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool {
    let xupper = (x >> 8) as u8;
    let mut lowerstart = 0;
    for &(upper, lowercount) in singletonuppers {
        let lowerend = lowerstart + lowercount as usize;
        if xupper == upper {
            for &lower in &singletonlowers[lowerstart..lowerend] {
                if lower == x as u8 {
                    return false;
                }
            }
        } else if xupper < upper {
            break;
        }
        lowerstart = lowerend;
    }

    let mut x = x as i32;
    let mut normal = normal.iter().cloned();
    let mut current = true;
    while let Some(v) = normal.next() {
        let len = if v & 0x80 != 0 {
            ((v & 0x7f) as i32) << 8 | normal.next().unwrap() as i32
        } else {
            v as i32
        };
        x -= len;
        if x < 0 {
            break;
        }
        current = !current;
    }
    current
}

pub(crate) fn is_printable(x: char) -> bool {
    let x = x as u32;
    let lower = x as u16;
    if x < 0x10000 {
        check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
    } else if x < 0x20000 {
        check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
    } else {\
""")
    # Each entry of `extra` is (start, length); emit one range test per entry.
    for a, b in extra:
        print("        if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
        print("            return false;")
        print("        }")
    print("""\
        true
    }
}\
""")
    print()
    print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
    print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
    print_normal(normal0, 'NORMAL0')
    print_normal(normal1, 'NORMAL1')

# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
Commit	Line	Data
5bcae85e	1	#!/usr/bin/env python
5bcae85e SL	2
5bcae85e SL	3	# This script uses the following Unicode tables:
476ff2be	4	# - UnicodeData.txt
5bcae85e	5
476ff2be SL	6
	7	from collections import namedtuple
	8	import csv
5bcae85e SL	9	import os
	10	import subprocess
	11
476ff2be SL	12	NUM_CODEPOINTS=0x110000
476ff2be SL	13
5bcae85e SL	14	def to_ranges(iter):
	15	current = None
	16	for i in iter:
	17	if current is None or i != current[1] or i in (0x10000, 0x20000):
	18	if current is not None:
	19	yield tuple(current)
	20	current = [i, i + 1]
	21	else:
	22	current[1] += 1
	23	if current is not None:
	24	yield tuple(current)
	25
476ff2be SL	26	def get_escaped(codepoints):
	27	for c in codepoints:
	28	if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
	29	yield c.value
5bcae85e SL	30
	31	def get_file(f):
	32	try:
	33	return open(os.path.basename(f))
	34	except FileNotFoundError:
	35	subprocess.run(["curl", "-O", f], check=True)
	36	return open(os.path.basename(f))
	37
476ff2be SL	38	Codepoint = namedtuple('Codepoint', 'value class_')
	39
	40	def get_codepoints(f):
	41	r = csv.reader(f, delimiter=";")
	42	prev_codepoint = 0
	43	class_first = None
	44	for row in r:
	45	codepoint = int(row[0], 16)
	46	name = row[1]
	47	class_ = row[2]
	48
	49	if class_first is not None:
	50	if not name.endswith("Last>"):
	51	raise ValueError("Missing Last after First")
	52
	53	for c in range(prev_codepoint + 1, codepoint):
	54	yield Codepoint(c, class_first)
	55
	56	class_first = None
	57	if name.endswith("First>"):
	58	class_first = class_
	59
	60	yield Codepoint(codepoint, class_)
	61	prev_codepoint = codepoint
	62
e1599b0c	63	if class_first is not None:
476ff2be SL	64	raise ValueError("Missing Last after First")
	65
	66	for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
	67	yield Codepoint(c, None)
	68
cc61c64b XL	69	def compress_singletons(singletons):
	70	uppers = [] # (upper, # items in lowers)
	71	lowers = []
	72
	73	for i in singletons:
	74	upper = i >> 8
	75	lower = i & 0xff
	76	if len(uppers) == 0 or uppers[-1][0] != upper:
	77	uppers.append((upper, 1))
	78	else:
	79	upper, count = uppers[-1]
	80	uppers[-1] = upper, count + 1
	81	lowers.append(lower)
	82
	83	return uppers, lowers
	84
	85	def compress_normal(normal):
	86	# lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
	87	# lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
	88	compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
	89
	90	prev_start = 0
	91	for start, count in normal:
	92	truelen = start - prev_start
	93	falselen = count
	94	prev_start = start + count
	95
	96	assert truelen < 0x8000 and falselen < 0x8000
	97	entry = []
	98	if truelen > 0x7f:
	99	entry.append(0x80 \| (truelen >> 8))
	100	entry.append(truelen & 0xff)
	101	else:
	102	entry.append(truelen & 0x7f)
	103	if falselen > 0x7f:
	104	entry.append(0x80 \| (falselen >> 8))
	105	entry.append(falselen & 0xff)
	106	else:
	107	entry.append(falselen & 0x7f)
	108
	109	compressed.append(entry)
	110
	111	return compressed
	112
	113	def print_singletons(uppers, lowers, uppersname, lowersname):
60c5eb7d	114	print("#[rustfmt::skip]")
416331ca	115	print("const {}: &[(u8, u8)] = &[".format(uppersname))
cc61c64b XL	116	for u, c in uppers:
	117	print(" ({:#04x}, {}),".format(u, c))
	118	print("];")
60c5eb7d	119	print("#[rustfmt::skip]")
416331ca	120	print("const {}: &[u8] = &[".format(lowersname))
cc61c64b XL	121	for i in range(0, len(lowers), 8):
	122	print(" {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
	123	print("];")
	124
	125	def print_normal(normal, normalname):
60c5eb7d	126	print("#[rustfmt::skip]")
416331ca	127	print("const {}: &[u8] = &[".format(normalname))
cc61c64b XL	128	for v in normal:
	129	print(" {}".format(" ".join("{:#04x},".format(i) for i in v)))
	130	print("];")
	131
5bcae85e	132	def main():
136023e0	133	file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
5bcae85e	134
476ff2be	135	codepoints = get_codepoints(file)
5bcae85e SL	136
	137	CUTOFF=0x10000
	138	singletons0 = []
	139	singletons1 = []
	140	normal0 = []
	141	normal1 = []
	142	extra = []
	143
476ff2be	144	for a, b in to_ranges(get_escaped(codepoints)):
5bcae85e SL	145	if a > 2 * CUTOFF:
	146	extra.append((a, b - a))
	147	elif a == b - 1:
	148	if a & CUTOFF:
	149	singletons1.append(a & ~CUTOFF)
	150	else:
	151	singletons0.append(a)
	152	elif a == b - 2:
	153	if a & CUTOFF:
	154	singletons1.append(a & ~CUTOFF)
	155	singletons1.append((a + 1) & ~CUTOFF)
	156	else:
	157	singletons0.append(a)
	158	singletons0.append(a + 1)
	159	else:
	160	if a >= 2 * CUTOFF:
	161	extra.append((a, b - a))
	162	elif a & CUTOFF:
	163	normal1.append((a & ~CUTOFF, b - a))
	164	else:
	165	normal0.append((a, b - a))
	166
cc61c64b XL	167	singletons0u, singletons0l = compress_singletons(singletons0)
	168	singletons1u, singletons1l = compress_singletons(singletons1)
	169	normal0 = compress_normal(normal0)
	170	normal1 = compress_normal(normal1)
	171
5bcae85e	172	print("""\
83c7162d	173	// NOTE: The following code was generated by "src/libcore/unicode/printable.py",
5bcae85e SL	174	// do not edit directly!
5bcae85e SL	175
60c5eb7d	176	fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], normal: &[u8]) -> bool {
cc61c64b XL	177	let xupper = (x >> 8) as u8;
	178	let mut lowerstart = 0;
	179	for &(upper, lowercount) in singletonuppers {
	180	let lowerend = lowerstart + lowercount as usize;
	181	if xupper == upper {
	182	for &lower in &singletonlowers[lowerstart..lowerend] {
	183	if lower == x as u8 {
	184	return false;
	185	}
	186	}
	187	} else if xupper < upper {
5bcae85e SL	188	break;
5bcae85e SL	189	}
cc61c64b	190	lowerstart = lowerend;
5bcae85e	191	}
cc61c64b XL	192
	193	let mut x = x as i32;
	194	let mut normal = normal.iter().cloned();
	195	let mut current = true;
	196	while let Some(v) = normal.next() {
	197	let len = if v & 0x80 != 0 {
	198	((v & 0x7f) as i32) << 8 \| normal.next().unwrap() as i32
5bcae85e	199	} else {
cc61c64b XL	200	v as i32
	201	};
	202	x -= len;
	203	if x < 0 {
5bcae85e SL	204	break;
5bcae85e SL	205	}
cc61c64b	206	current = !current;
5bcae85e	207	}
cc61c64b	208	current
5bcae85e SL	209	}
5bcae85e SL	210
abe05a73	211	pub(crate) fn is_printable(x: char) -> bool {
5bcae85e SL	212	let x = x as u32;
	213	let lower = x as u16;
	214	if x < 0x10000 {
cc61c64b	215	check(lower, SINGLETONS0U, SINGLETONS0L, NORMAL0)
5bcae85e	216	} else if x < 0x20000 {
cc61c64b	217	check(lower, SINGLETONS1U, SINGLETONS1L, NORMAL1)
5bcae85e SL	218	} else {\
	219	""")
	220	for a, b in extra:
	221	print(" if 0x{:x} <= x && x < 0x{:x} {{".format(a, a + b))
	222	print(" return false;")
	223	print(" }")
	224	print("""\
	225	true
	226	}
	227	}\
	228	""")
	229	print()
cc61c64b XL	230	print_singletons(singletons0u, singletons0l, 'SINGLETONS0U', 'SINGLETONS0L')
	231	print_singletons(singletons1u, singletons1l, 'SINGLETONS1U', 'SINGLETONS1L')
	232	print_normal(normal0, 'NORMAL0')
	233	print_normal(normal1, 'NORMAL1')
5bcae85e SL	234
	235	if __name__ == '__main__':
	236	main()