printable.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. #!/usr/bin/env python3
  2. # This script is based on
  3. # https://github.com/rust-lang/rust/blob/master/library/core/src/unicode/printable.py
  4. # distributed under https://github.com/rust-lang/rust/blob/master/LICENSE-MIT.
  5. # This script uses the following Unicode tables:
  6. # - UnicodeData.txt
  7. from collections import namedtuple
  8. import csv
  9. import os
  10. import subprocess
  11. NUM_CODEPOINTS=0x110000
  12. def to_ranges(iter):
  13. current = None
  14. for i in iter:
  15. if current is None or i != current[1] or i in (0x10000, 0x20000):
  16. if current is not None:
  17. yield tuple(current)
  18. current = [i, i + 1]
  19. else:
  20. current[1] += 1
  21. if current is not None:
  22. yield tuple(current)
  23. def get_escaped(codepoints):
  24. for c in codepoints:
  25. if (c.class_ or "Cn") in "Cc Cf Cs Co Cn Zl Zp Zs".split() and c.value != ord(' '):
  26. yield c.value
  27. def get_file(f):
  28. try:
  29. return open(os.path.basename(f))
  30. except FileNotFoundError:
  31. subprocess.run(["curl", "-O", f], check=True)
  32. return open(os.path.basename(f))
  33. Codepoint = namedtuple('Codepoint', 'value class_')
  34. def get_codepoints(f):
  35. r = csv.reader(f, delimiter=";")
  36. prev_codepoint = 0
  37. class_first = None
  38. for row in r:
  39. codepoint = int(row[0], 16)
  40. name = row[1]
  41. class_ = row[2]
  42. if class_first is not None:
  43. if not name.endswith("Last>"):
  44. raise ValueError("Missing Last after First")
  45. for c in range(prev_codepoint + 1, codepoint):
  46. yield Codepoint(c, class_first)
  47. class_first = None
  48. if name.endswith("First>"):
  49. class_first = class_
  50. yield Codepoint(codepoint, class_)
  51. prev_codepoint = codepoint
  52. if class_first is not None:
  53. raise ValueError("Missing Last after First")
  54. for c in range(prev_codepoint + 1, NUM_CODEPOINTS):
  55. yield Codepoint(c, None)
  56. def compress_singletons(singletons):
  57. uppers = [] # (upper, # items in lowers)
  58. lowers = []
  59. for i in singletons:
  60. upper = i >> 8
  61. lower = i & 0xff
  62. if len(uppers) == 0 or uppers[-1][0] != upper:
  63. uppers.append((upper, 1))
  64. else:
  65. upper, count = uppers[-1]
  66. uppers[-1] = upper, count + 1
  67. lowers.append(lower)
  68. return uppers, lowers
  69. def compress_normal(normal):
  70. # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
  71. # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
  72. compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
  73. prev_start = 0
  74. for start, count in normal:
  75. truelen = start - prev_start
  76. falselen = count
  77. prev_start = start + count
  78. assert truelen < 0x8000 and falselen < 0x8000
  79. entry = []
  80. if truelen > 0x7f:
  81. entry.append(0x80 | (truelen >> 8))
  82. entry.append(truelen & 0xff)
  83. else:
  84. entry.append(truelen & 0x7f)
  85. if falselen > 0x7f:
  86. entry.append(0x80 | (falselen >> 8))
  87. entry.append(falselen & 0xff)
  88. else:
  89. entry.append(falselen & 0x7f)
  90. compressed.append(entry)
  91. return compressed
  92. def print_singletons(uppers, lowers, uppersname, lowersname):
  93. print(" static constexpr singleton {}[] = {{".format(uppersname))
  94. for u, c in uppers:
  95. print(" {{{:#04x}, {}}},".format(u, c))
  96. print(" };")
  97. print(" static constexpr unsigned char {}[] = {{".format(lowersname))
  98. for i in range(0, len(lowers), 8):
  99. print(" {}".format(" ".join("{:#04x},".format(l) for l in lowers[i:i+8])))
  100. print(" };")
  101. def print_normal(normal, normalname):
  102. print(" static constexpr unsigned char {}[] = {{".format(normalname))
  103. for v in normal:
  104. print(" {}".format(" ".join("{:#04x},".format(i) for i in v)))
  105. print(" };")
  106. def main():
  107. file = get_file("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
  108. codepoints = get_codepoints(file)
  109. CUTOFF=0x10000
  110. singletons0 = []
  111. singletons1 = []
  112. normal0 = []
  113. normal1 = []
  114. extra = []
  115. for a, b in to_ranges(get_escaped(codepoints)):
  116. if a > 2 * CUTOFF:
  117. extra.append((a, b - a))
  118. elif a == b - 1:
  119. if a & CUTOFF:
  120. singletons1.append(a & ~CUTOFF)
  121. else:
  122. singletons0.append(a)
  123. elif a == b - 2:
  124. if a & CUTOFF:
  125. singletons1.append(a & ~CUTOFF)
  126. singletons1.append((a + 1) & ~CUTOFF)
  127. else:
  128. singletons0.append(a)
  129. singletons0.append(a + 1)
  130. else:
  131. if a >= 2 * CUTOFF:
  132. extra.append((a, b - a))
  133. elif a & CUTOFF:
  134. normal1.append((a & ~CUTOFF, b - a))
  135. else:
  136. normal0.append((a, b - a))
  137. singletons0u, singletons0l = compress_singletons(singletons0)
  138. singletons1u, singletons1l = compress_singletons(singletons1)
  139. normal0 = compress_normal(normal0)
  140. normal1 = compress_normal(normal1)
  141. print("""\
  142. FMT_FUNC auto is_printable(uint32_t cp) -> bool {\
  143. """)
  144. print_singletons(singletons0u, singletons0l, 'singletons0', 'singletons0_lower')
  145. print_singletons(singletons1u, singletons1l, 'singletons1', 'singletons1_lower')
  146. print_normal(normal0, 'normal0')
  147. print_normal(normal1, 'normal1')
  148. print("""\
  149. auto lower = static_cast<uint16_t>(cp);
  150. if (cp < 0x10000) {
  151. return is_printable(lower, singletons0,
  152. sizeof(singletons0) / sizeof(*singletons0),
  153. singletons0_lower, normal0, sizeof(normal0));
  154. }
  155. if (cp < 0x20000) {
  156. return is_printable(lower, singletons1,
  157. sizeof(singletons1) / sizeof(*singletons1),
  158. singletons1_lower, normal1, sizeof(normal1));
  159. }\
  160. """)
  161. for a, b in extra:
  162. print(" if (0x{:x} <= cp && cp < 0x{:x}) return false;".format(a, a + b))
  163. print("""\
  164. return cp < 0x{:x};
  165. }}\
  166. """.format(NUM_CODEPOINTS))
  167. if __name__ == '__main__':
  168. main()