#!/usr/bin/env python3

# This script is based on
# https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
# distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.

# This script uses the following Unicode tables:
# - UnicodeData.txt


from collections import namedtuple
import csv
import os
import subprocess

# One past the largest code point handled (U+10FFFF).
NUM_CODEPOINTS = 0x110000

# Size of one 16-bit plane. Planes 0 and 1 get their own compressed tables;
# everything from 2 * CUTOFF upwards is emitted as explicit range checks.
CUTOFF = 0x10000

# General categories that are reported as "escaped" (i.e. not printable).
# Code points without a category are treated as "Cn".
ESCAPED_CLASSES = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())


def to_ranges(iter):
    """Group runs of consecutive integers into half-open ``(start, end)`` pairs.

    A new range is always started at 0x10000 and 0x20000, so no yielded range
    crosses one of those plane boundaries.
    """
    current = None
    for i in iter:
        starts_new_range = (
            current is None or i != current[1] or i in (0x10000, 0x20000)
        )
        if starts_new_range:
            if current is not None:
                yield tuple(current)
            current = [i, i + 1]
        else:
            current[1] += 1
    if current is not None:
        yield tuple(current)


def get_escaped(codepoints):
    """Yield the value of every code point whose class is in ESCAPED_CLASSES.

    The ASCII space is never yielded even though its class ("Zs") is listed.
    """
    space = ord(' ')
    for c in codepoints:
        if (c.class_ or "Cn") in ESCAPED_CLASSES and c.value != space:
            yield c.value


def _open_table(path):
    """Open a data table for reading by the csv module."""
    # newline="" lets the csv module do its own line-ending handling.
    return open(path, encoding="utf-8", newline="")


def get_file(f):
    """Return an open text file for URL ``f``, downloading it if necessary.

    The file is looked up in the current directory under the URL's basename.
    If it does not exist it is fetched with ``curl`` first. The caller owns
    the returned file object and is responsible for closing it.

    Raises:
        subprocess.CalledProcessError: if the download fails.
    """
    path = os.path.basename(f)
    try:
        return _open_table(path)
    except FileNotFoundError:
        # -f makes curl exit non-zero on HTTP errors instead of saving the
        # server's error page as if it were the requested table.
        subprocess.run(["curl", "-f", "-O", f], check=True)
        return _open_table(path)


Codepoint = namedtuple('Codepoint', 'value class_')


def get_codepoints(f):
    """Yield a ``Codepoint`` for every value covered by the table ``f``.

    ``f`` is an iterable of ``;``-separated lines whose first three fields are
    the hexadecimal code point, its name and its class. Code points between a
    ``...First>`` row and the following ``...Last>`` row inherit the class of
    the ``First`` row; all other code points that are missing from the table,
    including everything after the last row up to ``NUM_CODEPOINTS``, are
    yielded with class ``None``.

    Raises:
        ValueError: if a ``First`` row is not followed by a ``Last`` row.
    """
    r = csv.reader(f, delimiter=";")
    prev_codepoint = 0
    class_first = None
    for row in r:
        if not row:
            # Tolerate blank lines instead of failing on row[0].
            continue
        codepoint = int(row[0], 16)
        name = row[1]
        class_ = row[2]

        if class_first is not None and not name.endswith("Last>"):
            raise ValueError("Missing Last after First")

        # Fill the gap since the previous row: with the class of the pending
        # First row if there is one, otherwise with None.
        for c in range(prev_codepoint + 1, codepoint):
            yield Codepoint(c, class_first)

        class_first = class_ if name.endswith("First>") else None
        yield Codepoint(codepoint, class_)
        prev_codepoint = codepoint

    if class_first is not None:
        raise ValueError("Missing Last after First")

    for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
        yield Codepoint(c, None)


def compress_singletons(singletons):
    """Split values into an upper-byte run table and a list of lower bytes.

    Returns ``(uppers, lowers)`` where ``lowers`` holds ``value & 0xff`` for
    every input value, in order, and ``uppers`` holds one ``(upper, count)``
    pair for each run of adjacent values sharing the same ``value >> 8``.
    """
    uppers = []  # (upper, # items in lowers)
    lowers = []
    for i in singletons:
        upper = i >> 8
        if uppers and uppers[-1][0] == upper:
            uppers[-1] = (upper, uppers[-1][1] + 1)
        else:
            uppers.append((upper, 1))
        lowers.append(i & 0xff)
    return uppers, lowers


def _encode_length(length):
    """Encode a length below 0x8000 as one byte (<= 0x7f) or two bytes."""
    if length > 0x7f:
        # The high bit of the first byte marks the two-byte form.
        return [0x80 | (length >> 8), length & 0xff]
    return [length & 0x7f]


def compress_normal(normal):
    """Run-length encode ``(start, count)`` ranges as lists of byte values.

    Each input range produces one entry made of the encoded gap since the end
    of the previous range followed by the encoded length of the range itself.
    """
    # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
    # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
    compressed = []  # [truelen, (truelenaux), falselen, (falselenaux)]

    prev_start = 0
    for start, count in normal:
        truelen = start - prev_start
        falselen = count
        prev_start = start + count

        assert truelen < 0x8000 and falselen < 0x8000
        compressed.append(_encode_length(truelen) + _encode_length(falselen))

    return compressed


def print_singletons(uppers, lowers, uppersname, lowersname):
    """Print the C++ array definitions for one pair of singleton tables."""
    print("  static constexpr singleton {}[] = {{".format(uppersname))
    for u, c in uppers:
        print("    {{{:#04x}, {}}},".format(u, c))
    print("  };")
    print("  static constexpr unsigned char {}[] = {{".format(lowersname))
    # Eight values per output line.
    for i in range(0, len(lowers), 8):
        chunk = lowers[i:i + 8]
        print("    {}".format(" ".join("{:#04x},".format(low) for low in chunk)))
    print("  };")


def print_normal(normal, normalname):
    """Print the C++ array definition for one run-length table."""
    print("  static constexpr unsigned char {}[] = {{".format(normalname))
    for v in normal:
        print("    {}".format(" ".join("{:#04x},".format(i) for i in v)))
    print("  };")


def classify_ranges(ranges):
    """Distribute half-open ``(start, end)`` ranges over the output tables.

    Ranges are expected not to cross a multiple of ``CUTOFF`` (``to_ranges``
    guarantees this for 0x10000 and 0x20000).

    Returns ``(singletons0, singletons1, normal0, normal1, extra)``:

    * ranges starting at or above ``2 * CUTOFF`` go to ``extra`` as
      ``(start, length)``, whatever their length;
    * ranges of one or two code points are listed value by value in
      ``singletons0`` / ``singletons1`` (plane 0 / plane 1, with the
      ``CUTOFF`` bit cleared);
    * longer ranges go to ``normal0`` / ``normal1`` as ``(start, length)``,
      again with the ``CUTOFF`` bit cleared from ``start``.
    """
    singletons0 = []
    singletons1 = []
    normal0 = []
    normal1 = []
    extra = []

    for a, b in ranges:
        length = b - a
        if a >= 2 * CUTOFF:
            # Must be >= rather than >: a short range starting exactly at
            # 2 * CUTOFF would otherwise land in singletons0 with an upper
            # part that does not fit into a byte.
            extra.append((a, length))
        elif length <= 2:
            target = singletons1 if a & CUTOFF else singletons0
            target.extend(c & ~CUTOFF for c in range(a, b))
        elif a & CUTOFF:
            normal1.append((a & ~CUTOFF, length))
        else:
            normal0.append((a, length))

    return singletons0, singletons1, normal0, normal1, extra


def main():
    """Read UnicodeData.txt and print the C++ ``is_printable`` function."""
    # The generators read lazily, so they must be fully consumed while the
    # file is still open; the with-block also closes the file afterwards.
    with get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt") as file:
        codepoints = get_codepoints(file)
        singletons0, singletons1, normal0, normal1, extra = classify_ranges(
            to_ranges(get_escaped(codepoints)))

    singletons0u, singletons0l = compress_singletons(singletons0)
    singletons1u, singletons1l = compress_singletons(singletons1)
    normal0 = compress_normal(normal0)
    normal1 = compress_normal(normal1)

    print("""\
FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
""")
    print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
    print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
    print_normal(normal0, 'normal0')
    print_normal(normal1, 'normal1')
    print("""\
  auto lower = static_cast<uint16_t>(cp);
  if (cp < 0x10000) {
    return is_printable(lower, singletons0,
                        sizeof(singletons0) / sizeof(*singletons0),
                        singletons0_lower, normal0, sizeof(normal0));
  }
  if (cp < 0x20000) {
    return is_printable(lower, singletons1,
                        sizeof(singletons1) / sizeof(*singletons1),
                        singletons1_lower, normal1, sizeof(normal1));
  }\
""")
    for a, b in extra:
        print("  if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
    print("""\
  return cp < 0x{:x};
}}\
""".format(NUM_CODEPOINTS))


if __name__ == '__main__':
    main()