// unit-unicode1.cpp
  1. // __ _____ _____ _____
  2. // __| | __| | | | JSON for Modern C++ (supporting code)
  3. // | | |__ | | | | | | version 3.11.2
  4. // |_____|_____|_____|_|___| https://github.com/nlohmann/json
  5. //
  6. // SPDX-FileCopyrightText: 2013-2022 Niels Lohmann <https://nlohmann.me>
  7. // SPDX-License-Identifier: MIT
  8. #include "doctest_compatibility.h"
  9. // for some reason including this after the json header leads to linker errors with VS 2017...
  10. #include <locale>
  11. #include <nlohmann/json.hpp>
  12. using nlohmann::json;
  13. #include <fstream>
  14. #include <sstream>
  15. #include <iomanip>
  16. #include "make_test_data_available.hpp"
  17. TEST_CASE("Unicode (1/5)" * doctest::skip())
  18. {
  19. SECTION("\\uxxxx sequences")
  20. {
  21. // create an escaped string from a code point
  22. const auto codepoint_to_unicode = [](std::size_t cp)
  23. {
  24. // code points are represented as a six-character sequence: a
  25. // reverse solidus, followed by the lowercase letter u, followed
  26. // by four hexadecimal digits that encode the character's code
  27. // point
  28. std::stringstream ss;
  29. ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
  30. return ss.str();
  31. };
  32. SECTION("correct sequences")
  33. {
  34. // generate all UTF-8 code points; in total, 1112064 code points are
  35. // generated: 0x1FFFFF code points - 2048 invalid values between
  36. // 0xD800 and 0xDFFF.
  37. for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
  38. {
  39. // string to store the code point as in \uxxxx format
  40. std::string json_text = "\"";
  41. // decide whether to use one or two \uxxxx sequences
  42. if (cp < 0x10000u)
  43. {
  44. // The Unicode standard permanently reserves these code point
  45. // values for UTF-16 encoding of the high and low surrogates, and
  46. // they will never be assigned a character, so there should be no
  47. // reason to encode them. The official Unicode standard says that
  48. // no UTF forms, including UTF-16, can encode these code points.
  49. if (cp >= 0xD800u && cp <= 0xDFFFu)
  50. {
  51. // if we would not skip these code points, we would get a
  52. // "missing low surrogate" exception
  53. continue;
  54. }
  55. // code points in the Basic Multilingual Plane can be
  56. // represented with one \uxxxx sequence
  57. json_text += codepoint_to_unicode(cp);
  58. }
  59. else
  60. {
  61. // To escape an extended character that is not in the Basic
  62. // Multilingual Plane, the character is represented as a
  63. // 12-character sequence, encoding the UTF-16 surrogate pair
  64. const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
  65. const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
  66. json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
  67. }
  68. json_text += "\"";
  69. CAPTURE(json_text)
  70. json _;
  71. CHECK_NOTHROW(_ = json::parse(json_text));
  72. }
  73. }
  74. SECTION("incorrect sequences")
  75. {
  76. SECTION("incorrect surrogate values")
  77. {
  78. json _;
  79. CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uDC00\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 7: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uDC00'", json::parse_error&);
  80. CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD7FF\\uDC00\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF; last read: '\"\\uD7FF\\uDC00'", json::parse_error&);
  81. CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800]\""), "[json.exception.parse_error.101] parse error at line 1, column 8: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800]'", json::parse_error&);
  82. CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\v\""), "[json.exception.parse_error.101] parse error at line 1, column 9: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\v'", json::parse_error&);
  83. CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\u123\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: '\\u' must be followed by 4 hex digits; last read: '\"\\uD800\\u123\"'", json::parse_error&);
  84. CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uDBFF\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uDBFF'", json::parse_error&);
  85. CHECK_THROWS_WITH_AS(_ = json::parse("\"\\uD800\\uE000\""), "[json.exception.parse_error.101] parse error at line 1, column 13: syntax error while parsing value - invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF; last read: '\"\\uD800\\uE000'", json::parse_error&);
  86. }
  87. }
  88. #if 0
  89. SECTION("incorrect sequences")
  90. {
  91. SECTION("high surrogate without low surrogate")
  92. {
  93. // D800..DBFF are high surrogates and must be followed by low
  94. // surrogates DC00..DFFF; here, nothing follows
  95. for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
  96. {
  97. std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
  98. CAPTURE(json_text)
  99. CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
  100. }
  101. }
  102. SECTION("high surrogate with wrong low surrogate")
  103. {
  104. // D800..DBFF are high surrogates and must be followed by low
  105. // surrogates DC00..DFFF; here a different sequence follows
  106. for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
  107. {
  108. for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
  109. {
  110. if (0xDC00u <= cp2 && cp2 <= 0xDFFFu)
  111. {
  112. continue;
  113. }
  114. std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
  115. CAPTURE(json_text)
  116. CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
  117. }
  118. }
  119. }
  120. SECTION("low surrogate without high surrogate")
  121. {
  122. // low surrogates DC00..DFFF must follow high surrogates; here,
  123. // they occur alone
  124. for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
  125. {
  126. std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
  127. CAPTURE(json_text)
  128. CHECK_THROWS_AS(json::parse(json_text), json::parse_error&);
  129. }
  130. }
  131. }
  132. #endif
  133. }
  134. SECTION("read all unicode characters")
  135. {
  136. // read a file with all unicode characters stored as single-character
  137. // strings in a JSON array
  138. std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/all_unicode.json");
  139. json j;
  140. CHECK_NOTHROW(f >> j);
  141. // the array has 1112064 + 1 elements (a terminating "null" value)
  142. // Note: 1112064 = 0x1FFFFF code points - 2048 invalid values between
  143. // 0xD800 and 0xDFFF.
  144. CHECK(j.size() == 1112065);
  145. SECTION("check JSON Pointers")
  146. {
  147. for (const auto& s : j)
  148. {
  149. // skip non-string JSON values
  150. if (!s.is_string())
  151. {
  152. continue;
  153. }
  154. auto ptr = s.get<std::string>();
  155. // tilde must be followed by 0 or 1
  156. if (ptr == "~")
  157. {
  158. ptr += "0";
  159. }
  160. // JSON Pointers must begin with "/"
  161. ptr.insert(0, "/");
  162. CHECK_NOTHROW(json::json_pointer("/" + ptr));
  163. // check escape/unescape roundtrip
  164. auto escaped = nlohmann::detail::escape(ptr);
  165. nlohmann::detail::unescape(escaped);
  166. CHECK(escaped == ptr);
  167. }
  168. }
  169. }
  170. SECTION("ignore byte-order-mark")
  171. {
  172. SECTION("in a stream")
  173. {
  174. // read a file with a UTF-8 BOM
  175. std::ifstream f(TEST_DATA_DIRECTORY "/json_nlohmann_tests/bom.json");
  176. json j;
  177. CHECK_NOTHROW(f >> j);
  178. }
  179. SECTION("with an iterator")
  180. {
  181. std::string i = "\xef\xbb\xbf{\n \"foo\": true\n}";
  182. json _;
  183. CHECK_NOTHROW(_ = json::parse(i.begin(), i.end()));
  184. }
  185. }
  186. SECTION("error for incomplete/wrong BOM")
  187. {
  188. json _;
  189. CHECK_THROWS_AS(_ = json::parse("\xef\xbb"), json::parse_error&);
  190. CHECK_THROWS_AS(_ = json::parse("\xef\xbb\xbb"), json::parse_error&);
  191. }
  192. }
  193. namespace
  194. {
  195. void roundtrip(bool success_expected, const std::string& s);
  196. void roundtrip(bool success_expected, const std::string& s)
  197. {
  198. CAPTURE(s)
  199. json _;
  200. // create JSON string value
  201. const json j = s;
  202. // create JSON text
  203. const std::string ps = std::string("\"") + s + "\"";
  204. if (success_expected)
  205. {
  206. // serialization succeeds
  207. CHECK_NOTHROW(j.dump());
  208. // exclude parse test for U+0000
  209. if (s[0] != '\0')
  210. {
  211. // parsing JSON text succeeds
  212. CHECK_NOTHROW(_ = json::parse(ps));
  213. }
  214. // roundtrip succeeds
  215. CHECK_NOTHROW(_ = json::parse(j.dump()));
  216. // after roundtrip, the same string is stored
  217. const json jr = json::parse(j.dump());
  218. CHECK(jr.get<std::string>() == s);
  219. }
  220. else
  221. {
  222. // serialization fails
  223. CHECK_THROWS_AS(j.dump(), json::type_error&);
  224. // parsing JSON text fails
  225. CHECK_THROWS_AS(_ = json::parse(ps), json::parse_error&);
  226. }
  227. }
  228. } // namespace
  229. TEST_CASE("Markus Kuhn's UTF-8 decoder capability and stress test")
  230. {
  231. // Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
  232. // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  233. SECTION("1 Some correct UTF-8 text")
  234. {
  235. roundtrip(true, "κόσμε");
  236. }
  237. SECTION("2 Boundary condition test cases")
  238. {
  239. SECTION("2.1 First possible sequence of a certain length")
  240. {
  241. // 2.1.1 1 byte (U-00000000)
  242. roundtrip(true, std::string("\0", 1));
  243. // 2.1.2 2 bytes (U-00000080)
  244. roundtrip(true, "\xc2\x80");
  245. // 2.1.3 3 bytes (U-00000800)
  246. roundtrip(true, "\xe0\xa0\x80");
  247. // 2.1.4 4 bytes (U-00010000)
  248. roundtrip(true, "\xf0\x90\x80\x80");
  249. // 2.1.5 5 bytes (U-00200000)
  250. roundtrip(false, "\xF8\x88\x80\x80\x80");
  251. // 2.1.6 6 bytes (U-04000000)
  252. roundtrip(false, "\xFC\x84\x80\x80\x80\x80");
  253. }
  254. SECTION("2.2 Last possible sequence of a certain length")
  255. {
  256. // 2.2.1 1 byte (U-0000007F)
  257. roundtrip(true, "\x7f");
  258. // 2.2.2 2 bytes (U-000007FF)
  259. roundtrip(true, "\xdf\xbf");
  260. // 2.2.3 3 bytes (U-0000FFFF)
  261. roundtrip(true, "\xef\xbf\xbf");
  262. // 2.2.4 4 bytes (U-001FFFFF)
  263. roundtrip(false, "\xF7\xBF\xBF\xBF");
  264. // 2.2.5 5 bytes (U-03FFFFFF)
  265. roundtrip(false, "\xFB\xBF\xBF\xBF\xBF");
  266. // 2.2.6 6 bytes (U-7FFFFFFF)
  267. roundtrip(false, "\xFD\xBF\xBF\xBF\xBF\xBF");
  268. }
  269. SECTION("2.3 Other boundary conditions")
  270. {
  271. // 2.3.1 U-0000D7FF = ed 9f bf
  272. roundtrip(true, "\xed\x9f\xbf");
  273. // 2.3.2 U-0000E000 = ee 80 80
  274. roundtrip(true, "\xee\x80\x80");
  275. // 2.3.3 U-0000FFFD = ef bf bd
  276. roundtrip(true, "\xef\xbf\xbd");
  277. // 2.3.4 U-0010FFFF = f4 8f bf bf
  278. roundtrip(true, "\xf4\x8f\xbf\xbf");
  279. // 2.3.5 U-00110000 = f4 90 80 80
  280. roundtrip(false, "\xf4\x90\x80\x80");
  281. }
  282. }
  283. SECTION("3 Malformed sequences")
  284. {
  285. SECTION("3.1 Unexpected continuation bytes")
  286. {
  287. // Each unexpected continuation byte should be separately signalled as a
  288. // malformed sequence of its own.
  289. // 3.1.1 First continuation byte 0x80
  290. roundtrip(false, "\x80");
  291. // 3.1.2 Last continuation byte 0xbf
  292. roundtrip(false, "\xbf");
  293. // 3.1.3 2 continuation bytes
  294. roundtrip(false, "\x80\xbf");
  295. // 3.1.4 3 continuation bytes
  296. roundtrip(false, "\x80\xbf\x80");
  297. // 3.1.5 4 continuation bytes
  298. roundtrip(false, "\x80\xbf\x80\xbf");
  299. // 3.1.6 5 continuation bytes
  300. roundtrip(false, "\x80\xbf\x80\xbf\x80");
  301. // 3.1.7 6 continuation bytes
  302. roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf");
  303. // 3.1.8 7 continuation bytes
  304. roundtrip(false, "\x80\xbf\x80\xbf\x80\xbf\x80");
  305. // 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf)
  306. roundtrip(false, "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf");
  307. }
  308. SECTION("3.2 Lonely start characters")
  309. {
  310. // 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf)
  311. roundtrip(false, "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf");
  312. // 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef)
  313. roundtrip(false, "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef");
  314. // 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7)
  315. roundtrip(false, "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7");
  316. // 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb)
  317. roundtrip(false, "\xf8 \xf9 \xfa \xfb");
  318. // 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd)
  319. roundtrip(false, "\xfc \xfd");
  320. }
  321. SECTION("3.3 Sequences with last continuation byte missing")
  322. {
  323. // All bytes of an incomplete sequence should be signalled as a single
  324. // malformed sequence, i.e., you should see only a single replacement
  325. // character in each of the next 10 tests. (Characters as in section 2)
  326. // 3.3.1 2-byte sequence with last byte missing (U+0000)
  327. roundtrip(false, "\xc0");
  328. // 3.3.2 3-byte sequence with last byte missing (U+0000)
  329. roundtrip(false, "\xe0\x80");
  330. // 3.3.3 4-byte sequence with last byte missing (U+0000)
  331. roundtrip(false, "\xf0\x80\x80");
  332. // 3.3.4 5-byte sequence with last byte missing (U+0000)
  333. roundtrip(false, "\xf8\x80\x80\x80");
  334. // 3.3.5 6-byte sequence with last byte missing (U+0000)
  335. roundtrip(false, "\xfc\x80\x80\x80\x80");
  336. // 3.3.6 2-byte sequence with last byte missing (U-000007FF)
  337. roundtrip(false, "\xdf");
  338. // 3.3.7 3-byte sequence with last byte missing (U-0000FFFF)
  339. roundtrip(false, "\xef\xbf");
  340. // 3.3.8 4-byte sequence with last byte missing (U-001FFFFF)
  341. roundtrip(false, "\xf7\xbf\xbf");
  342. // 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF)
  343. roundtrip(false, "\xfb\xbf\xbf\xbf");
  344. // 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF)
  345. roundtrip(false, "\xfd\xbf\xbf\xbf\xbf");
  346. }
  347. SECTION("3.4 Concatenation of incomplete sequences")
  348. {
  349. // All the 10 sequences of 3.3 concatenated, you should see 10 malformed
  350. // sequences being signalled:
  351. roundtrip(false, "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf");
  352. }
  353. SECTION("3.5 Impossible bytes")
  354. {
  355. // The following two bytes cannot appear in a correct UTF-8 string
  356. // 3.5.1 fe
  357. roundtrip(false, "\xfe");
  358. // 3.5.2 ff
  359. roundtrip(false, "\xff");
  360. // 3.5.3 fe fe ff ff
  361. roundtrip(false, "\xfe\xfe\xff\xff");
  362. }
  363. }
  364. SECTION("4 Overlong sequences")
  365. {
  366. // The following sequences are not malformed according to the letter of
  367. // the Unicode 2.0 standard. However, they are longer then necessary and
  368. // a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8
  369. // decoder" should reject them just like malformed sequences for two
  370. // reasons: (1) It helps to debug applications if overlong sequences are
  371. // not treated as valid representations of characters, because this helps
  372. // to spot problems more quickly. (2) Overlong sequences provide
  373. // alternative representations of characters, that could maliciously be
  374. // used to bypass filters that check only for ASCII characters. For
  375. // instance, a 2-byte encoded line feed (LF) would not be caught by a
  376. // line counter that counts only 0x0a bytes, but it would still be
  377. // processed as a line feed by an unsafe UTF-8 decoder later in the
  378. // pipeline. From a security point of view, ASCII compatibility of UTF-8
  379. // sequences means also, that ASCII characters are *only* allowed to be
  380. // represented by ASCII bytes in the range 0x00-0x7f. To ensure this
  381. // aspect of ASCII compatibility, use only "safe UTF-8 decoders" that
  382. // reject overlong UTF-8 sequences for which a shorter encoding exists.
  383. SECTION("4.1 Examples of an overlong ASCII character")
  384. {
  385. // With a safe UTF-8 decoder, all of the following five overlong
  386. // representations of the ASCII character slash ("/") should be rejected
  387. // like a malformed UTF-8 sequence, for instance by substituting it with
  388. // a replacement character. If you see a slash below, you do not have a
  389. // safe UTF-8 decoder!
  390. // 4.1.1 U+002F = c0 af
  391. roundtrip(false, "\xc0\xaf");
  392. // 4.1.2 U+002F = e0 80 af
  393. roundtrip(false, "\xe0\x80\xaf");
  394. // 4.1.3 U+002F = f0 80 80 af
  395. roundtrip(false, "\xf0\x80\x80\xaf");
  396. // 4.1.4 U+002F = f8 80 80 80 af
  397. roundtrip(false, "\xf8\x80\x80\x80\xaf");
  398. // 4.1.5 U+002F = fc 80 80 80 80 af
  399. roundtrip(false, "\xfc\x80\x80\x80\x80\xaf");
  400. }
  401. SECTION("4.2 Maximum overlong sequences")
  402. {
  403. // Below you see the highest Unicode value that is still resulting in an
  404. // overlong sequence if represented with the given number of bytes. This
  405. // is a boundary test for safe UTF-8 decoders. All five characters should
  406. // be rejected like malformed UTF-8 sequences.
  407. // 4.2.1 U-0000007F = c1 bf
  408. roundtrip(false, "\xc1\xbf");
  409. // 4.2.2 U-000007FF = e0 9f bf
  410. roundtrip(false, "\xe0\x9f\xbf");
  411. // 4.2.3 U-0000FFFF = f0 8f bf bf
  412. roundtrip(false, "\xf0\x8f\xbf\xbf");
  413. // 4.2.4 U-001FFFFF = f8 87 bf bf bf
  414. roundtrip(false, "\xf8\x87\xbf\xbf\xbf");
  415. // 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf
  416. roundtrip(false, "\xfc\x83\xbf\xbf\xbf\xbf");
  417. }
  418. SECTION("4.3 Overlong representation of the NUL character")
  419. {
  420. // The following five sequences should also be rejected like malformed
  421. // UTF-8 sequences and should not be treated like the ASCII NUL
  422. // character.
  423. // 4.3.1 U+0000 = c0 80
  424. roundtrip(false, "\xc0\x80");
  425. // 4.3.2 U+0000 = e0 80 80
  426. roundtrip(false, "\xe0\x80\x80");
  427. // 4.3.3 U+0000 = f0 80 80 80
  428. roundtrip(false, "\xf0\x80\x80\x80");
  429. // 4.3.4 U+0000 = f8 80 80 80 80
  430. roundtrip(false, "\xf8\x80\x80\x80\x80");
  431. // 4.3.5 U+0000 = fc 80 80 80 80 80
  432. roundtrip(false, "\xfc\x80\x80\x80\x80\x80");
  433. }
  434. }
  435. SECTION("5 Illegal code positions")
  436. {
  437. // The following UTF-8 sequences should be rejected like malformed
  438. // sequences, because they never represent valid ISO 10646 characters and
  439. // a UTF-8 decoder that accepts them might introduce security problems
  440. // comparable to overlong UTF-8 sequences.
  441. SECTION("5.1 Single UTF-16 surrogates")
  442. {
  443. // 5.1.1 U+D800 = ed a0 80
  444. roundtrip(false, "\xed\xa0\x80");
  445. // 5.1.2 U+DB7F = ed ad bf
  446. roundtrip(false, "\xed\xad\xbf");
  447. // 5.1.3 U+DB80 = ed ae 80
  448. roundtrip(false, "\xed\xae\x80");
  449. // 5.1.4 U+DBFF = ed af bf
  450. roundtrip(false, "\xed\xaf\xbf");
  451. // 5.1.5 U+DC00 = ed b0 80
  452. roundtrip(false, "\xed\xb0\x80");
  453. // 5.1.6 U+DF80 = ed be 80
  454. roundtrip(false, "\xed\xbe\x80");
  455. // 5.1.7 U+DFFF = ed bf bf
  456. roundtrip(false, "\xed\xbf\xbf");
  457. }
  458. SECTION("5.2 Paired UTF-16 surrogates")
  459. {
  460. // 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80
  461. roundtrip(false, "\xed\xa0\x80\xed\xb0\x80");
  462. // 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf
  463. roundtrip(false, "\xed\xa0\x80\xed\xbf\xbf");
  464. // 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80
  465. roundtrip(false, "\xed\xad\xbf\xed\xb0\x80");
  466. // 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf
  467. roundtrip(false, "\xed\xad\xbf\xed\xbf\xbf");
  468. // 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80
  469. roundtrip(false, "\xed\xae\x80\xed\xb0\x80");
  470. // 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf
  471. roundtrip(false, "\xed\xae\x80\xed\xbf\xbf");
  472. // 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80
  473. roundtrip(false, "\xed\xaf\xbf\xed\xb0\x80");
  474. // 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf
  475. roundtrip(false, "\xed\xaf\xbf\xed\xbf\xbf");
  476. }
  477. SECTION("5.3 Noncharacter code positions")
  478. {
  479. // The following "noncharacters" are "reserved for internal use" by
  480. // applications, and according to older versions of the Unicode Standard
  481. // "should never be interchanged". Unicode Corrigendum #9 dropped the
  482. // latter restriction. Nevertheless, their presence in incoming UTF-8 data
  483. // can remain a potential security risk, depending on what use is made of
  484. // these codes subsequently. Examples of such internal use:
  485. //
  486. // - Some file APIs with 16-bit characters may use the integer value -1
  487. // = U+FFFF to signal an end-of-file (EOF) or error condition.
  488. //
  489. // - In some UTF-16 receivers, code point U+FFFE might trigger a
  490. // byte-swap operation (to convert between UTF-16LE and UTF-16BE).
  491. //
  492. // With such internal use of noncharacters, it may be desirable and safer
  493. // to block those code points in UTF-8 decoders, as they should never
  494. // occur legitimately in incoming UTF-8 data, and could trigger unsafe
  495. // behaviour in subsequent processing.
  496. // Particularly problematic noncharacters in 16-bit applications:
  497. // 5.3.1 U+FFFE = ef bf be
  498. roundtrip(true, "\xef\xbf\xbe");
  499. // 5.3.2 U+FFFF = ef bf bf
  500. roundtrip(true, "\xef\xbf\xbf");
  501. // 5.3.3 U+FDD0 .. U+FDEF
  502. roundtrip(true, "\xEF\xB7\x90");
  503. roundtrip(true, "\xEF\xB7\x91");
  504. roundtrip(true, "\xEF\xB7\x92");
  505. roundtrip(true, "\xEF\xB7\x93");
  506. roundtrip(true, "\xEF\xB7\x94");
  507. roundtrip(true, "\xEF\xB7\x95");
  508. roundtrip(true, "\xEF\xB7\x96");
  509. roundtrip(true, "\xEF\xB7\x97");
  510. roundtrip(true, "\xEF\xB7\x98");
  511. roundtrip(true, "\xEF\xB7\x99");
  512. roundtrip(true, "\xEF\xB7\x9A");
  513. roundtrip(true, "\xEF\xB7\x9B");
  514. roundtrip(true, "\xEF\xB7\x9C");
  515. roundtrip(true, "\xEF\xB7\x9D");
  516. roundtrip(true, "\xEF\xB7\x9E");
  517. roundtrip(true, "\xEF\xB7\x9F");
  518. roundtrip(true, "\xEF\xB7\xA0");
  519. roundtrip(true, "\xEF\xB7\xA1");
  520. roundtrip(true, "\xEF\xB7\xA2");
  521. roundtrip(true, "\xEF\xB7\xA3");
  522. roundtrip(true, "\xEF\xB7\xA4");
  523. roundtrip(true, "\xEF\xB7\xA5");
  524. roundtrip(true, "\xEF\xB7\xA6");
  525. roundtrip(true, "\xEF\xB7\xA7");
  526. roundtrip(true, "\xEF\xB7\xA8");
  527. roundtrip(true, "\xEF\xB7\xA9");
  528. roundtrip(true, "\xEF\xB7\xAA");
  529. roundtrip(true, "\xEF\xB7\xAB");
  530. roundtrip(true, "\xEF\xB7\xAC");
  531. roundtrip(true, "\xEF\xB7\xAD");
  532. roundtrip(true, "\xEF\xB7\xAE");
  533. roundtrip(true, "\xEF\xB7\xAF");
  534. // 5.3.4 U+nFFFE U+nFFFF (for n = 1..10)
  535. roundtrip(true, "\xF0\x9F\xBF\xBF");
  536. roundtrip(true, "\xF0\xAF\xBF\xBF");
  537. roundtrip(true, "\xF0\xBF\xBF\xBF");
  538. roundtrip(true, "\xF1\x8F\xBF\xBF");
  539. roundtrip(true, "\xF1\x9F\xBF\xBF");
  540. roundtrip(true, "\xF1\xAF\xBF\xBF");
  541. roundtrip(true, "\xF1\xBF\xBF\xBF");
  542. roundtrip(true, "\xF2\x8F\xBF\xBF");
  543. roundtrip(true, "\xF2\x9F\xBF\xBF");
  544. roundtrip(true, "\xF2\xAF\xBF\xBF");
  545. }
  546. }
  547. }