The source code and dockerfile for the GSW2024 AI Lab.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.

182 lines
4.5 KiB

4 months ago
  1. #include <sstream>
  2. #include "handler_test.h"
  3. #include "yaml-cpp/yaml.h" // IWYU pragma: keep
  4. #include "gtest/gtest.h"
  5. using ::testing::_;
  6. using ::testing::InSequence;
  7. namespace YAML {
  8. namespace {
  // Signature shared by all encoders below: write one code point (the int)
  // to the given stream.
  using EncodingFn = void (*)(std::ostream&, int);
  // Narrows `ch` to a single char by keeping only its lowest 8 bits.
  // The value goes through unsigned types first so the truncation is
  // well-defined before the final conversion to char.
  inline char Byte(int ch) {
    const unsigned int widened = static_cast<unsigned int>(ch);
    const unsigned char lowOctet = static_cast<unsigned char>(widened);
    return static_cast<char>(lowOctet);
  }
  14. void EncodeToUtf8(std::ostream& stream, int ch) {
  15. if (ch <= 0x7F) {
  16. stream << Byte(ch);
  17. } else if (ch <= 0x7FF) {
  18. stream << Byte(0xC0 | (ch >> 6));
  19. stream << Byte(0x80 | (ch & 0x3F));
  20. } else if (ch <= 0xFFFF) {
  21. stream << Byte(0xE0 | (ch >> 12));
  22. stream << Byte(0x80 | ((ch >> 6) & 0x3F));
  23. stream << Byte(0x80 | (ch & 0x3F));
  24. } else if (ch <= 0x1FFFFF) {
  25. stream << Byte(0xF0 | (ch >> 18));
  26. stream << Byte(0x80 | ((ch >> 12) & 0x3F));
  27. stream << Byte(0x80 | ((ch >> 6) & 0x3F));
  28. stream << Byte(0x80 | (ch & 0x3F));
  29. }
  30. }
  31. bool SplitUtf16HighChar(std::ostream& stream, EncodingFn encoding, int ch) {
  32. int biasedValue = ch - 0x10000;
  33. if (biasedValue < 0) {
  34. return false;
  35. }
  36. int high = 0xD800 | (biasedValue >> 10);
  37. int low = 0xDC00 | (biasedValue & 0x3FF);
  38. encoding(stream, high);
  39. encoding(stream, low);
  40. return true;
  41. }
  42. void EncodeToUtf16LE(std::ostream& stream, int ch) {
  43. if (!SplitUtf16HighChar(stream, &EncodeToUtf16LE, ch)) {
  44. stream << Byte(ch & 0xFF) << Byte(ch >> 8);
  45. }
  46. }
  47. void EncodeToUtf16BE(std::ostream& stream, int ch) {
  48. if (!SplitUtf16HighChar(stream, &EncodeToUtf16BE, ch)) {
  49. stream << Byte(ch >> 8) << Byte(ch & 0xFF);
  50. }
  51. }
  52. void EncodeToUtf32LE(std::ostream& stream, int ch) {
  53. stream << Byte(ch & 0xFF) << Byte((ch >> 8) & 0xFF) << Byte((ch >> 16) & 0xFF)
  54. << Byte((ch >> 24) & 0xFF);
  55. }
  56. void EncodeToUtf32BE(std::ostream& stream, int ch) {
  57. stream << Byte((ch >> 24) & 0xFF) << Byte((ch >> 16) & 0xFF)
  58. << Byte((ch >> 8) & 0xFF) << Byte(ch & 0xFF);
  59. }
  60. class EncodingTest : public HandlerTest {
  61. protected:
  62. void SetUpEncoding(EncodingFn encoding, bool declareEncoding) {
  63. if (declareEncoding) {
  64. encoding(m_yaml, 0xFEFF);
  65. }
  66. AddEntry(encoding, 0x0021, 0x007E); // Basic Latin
  67. AddEntry(encoding, 0x00A1, 0x00FF); // Latin-1 Supplement
  68. AddEntry(encoding, 0x0660, 0x06FF); // Arabic (largest contiguous block)
  69. // CJK unified ideographs (multiple lines)
  70. AddEntry(encoding, 0x4E00, 0x4EFF);
  71. AddEntry(encoding, 0x4F00, 0x4FFF);
  72. AddEntry(encoding, 0x5000, 0x51FF); // 512 character line
  73. AddEntry(encoding, 0x5200, 0x54FF); // 768 character line
  74. AddEntry(encoding, 0x5500, 0x58FF); // 1024 character line
  75. AddEntry(encoding, 0x103A0, 0x103C3); // Old Persian
  76. m_yaml.seekg(0, std::ios::beg);
  77. }
  78. void Run() {
  79. InSequence sequence;
  80. EXPECT_CALL(handler, OnDocumentStart(_));
  81. EXPECT_CALL(handler, OnSequenceStart(_, "?", 0, EmitterStyle::Block));
  82. for (std::size_t i = 0; i < m_entries.size(); i++) {
  83. EXPECT_CALL(handler, OnScalar(_, "!", 0, m_entries[i]));
  84. }
  85. EXPECT_CALL(handler, OnSequenceEnd());
  86. EXPECT_CALL(handler, OnDocumentEnd());
  87. Parse(m_yaml.str());
  88. }
  89. private:
  90. std::stringstream m_yaml;
  91. std::vector<std::string> m_entries;
  92. void AddEntry(EncodingFn encoding, int startCh, int endCh) {
  93. encoding(m_yaml, '-');
  94. encoding(m_yaml, ' ');
  95. encoding(m_yaml, '|');
  96. encoding(m_yaml, '\n');
  97. encoding(m_yaml, ' ');
  98. encoding(m_yaml, ' ');
  99. std::stringstream entry;
  100. for (int ch = startCh; ch <= endCh; ++ch) {
  101. encoding(m_yaml, ch);
  102. EncodeToUtf8(entry, ch);
  103. }
  104. encoding(m_yaml, '\n');
  105. EncodeToUtf8(entry, '\n');
  106. m_entries.push_back(entry.str());
  107. }
  108. };
  109. TEST_F(EncodingTest, UTF8_noBOM) {
  110. SetUpEncoding(&EncodeToUtf8, false);
  111. Run();
  112. }
  113. TEST_F(EncodingTest, UTF8_BOM) {
  114. SetUpEncoding(&EncodeToUtf8, true);
  115. Run();
  116. }
  117. TEST_F(EncodingTest, UTF16LE_noBOM) {
  118. SetUpEncoding(&EncodeToUtf16LE, false);
  119. Run();
  120. }
  121. TEST_F(EncodingTest, UTF16LE_BOM) {
  122. SetUpEncoding(&EncodeToUtf16LE, true);
  123. Run();
  124. }
  125. TEST_F(EncodingTest, UTF16BE_noBOM) {
  126. SetUpEncoding(&EncodeToUtf16BE, false);
  127. Run();
  128. }
  129. TEST_F(EncodingTest, UTF16BE_BOM) {
  130. SetUpEncoding(&EncodeToUtf16BE, true);
  131. Run();
  132. }
  133. TEST_F(EncodingTest, UTF32LE_noBOM) {
  134. SetUpEncoding(&EncodeToUtf32LE, false);
  135. Run();
  136. }
  137. TEST_F(EncodingTest, UTF32LE_BOM) {
  138. SetUpEncoding(&EncodeToUtf32LE, true);
  139. Run();
  140. }
  141. TEST_F(EncodingTest, UTF32BE_noBOM) {
  142. SetUpEncoding(&EncodeToUtf32BE, false);
  143. Run();
  144. }
  145. TEST_F(EncodingTest, UTF32BE_BOM) {
  146. SetUpEncoding(&EncodeToUtf32BE, true);
  147. Run();
  148. }
  149. }
  150. }