test_unicode.cpp 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  /*
  * LOCATION: see http://www.boost.org for most recent version.
  * FILE test_unicode.cpp
  * VERSION see <boost/version.hpp>
  * DESCRIPTION: Unicode specific tests (requires ICU).
  */
  17. #include <boost/regex/config.hpp>
  18. #ifdef BOOST_HAS_ICU
  19. #include "test.hpp"
  20. #ifdef BOOST_MSVC
  21. #pragma warning(disable:4127)
  22. #endif
  23. #ifndef BOOST_NO_STD_WSTRING
  // TEST_REGEX_SEARCH_U(expression, syntax, text, match_flags, expected)
  //
  // Registers one wide-character test case and runs it through test_icu().
  //
  //   expression   wide string literal holding the regular expression.
  //   syntax       passed to set_info() right after the expression; the call
  //                sites in this file pass `perl`.
  //   text         wide string literal holding the text to search.
  //   match_flags  passed to set_info() right after the text; the call sites
  //                pass `match_default`, optionally OR-ed with other match_* flags.
  //   expected     passed to set_info() last; the call sites pass
  //                make_array(...).  NOTE(review): presumably the expected
  //                match offsets with -2 as a terminator - confirm in test.hpp.
  //
  // Each literal is first copied into a local wchar_t array so that its
  // std::wstring can be built with an explicit length (array extent minus the
  // terminating NUL) rather than by scanning for the first NUL.
  // The do{...}while(0) wrapper makes the expansion behave as one statement.
  #define TEST_REGEX_SEARCH_U(expression, syntax, text, match_flags, expected)\
     do{\
        const wchar_t expression_chars_[] = { expression };\
        const wchar_t text_chars_[] = { text };\
        std::wstring expression_str_(\
           expression_chars_,\
           (sizeof(expression_chars_) / sizeof(expression_chars_[0])) - 1);\
        std::wstring text_str_(\
           text_chars_,\
           (sizeof(text_chars_) / sizeof(text_chars_[0])) - 1);\
        test_info<wchar_t>::set_info(\
           __FILE__, __LINE__, expression_str_, syntax, text_str_, match_flags, expected);\
        test_icu(wchar_t(0), test_regex_search_tag());\
     }while(0)
  // TEST_REGEX_CLASS_U(property_name, hex_digits)
  //
  // Shorthand for a TEST_REGEX_SEARCH_U case that searches a one-character
  // text for the bracketed class [[:property_name:]].
  //
  //   property_name  raw tokens (not a string literal); they are stringized and
  //                  spliced between L"[[:" and L":]]" to form the expression.
  //   hex_digits     raw hexadecimal digits; they are pasted onto \x and then
  //                  stringized, giving a wide literal containing that one
  //                  escaped character.
  //
  // The expansion uses the unqualified names perl, match_default and
  // make_array, so they must be visible where the macro is used.  The expected
  // array is always make_array(0, 1, -2, -2).  NOTE(review): presumably "one
  // match covering offsets 0 to 1" - confirm the encoding in test.hpp.
  #define TEST_REGEX_CLASS_U(property_name, hex_digits)\
     TEST_REGEX_SEARCH_U(\
        L"[[:" BOOST_JOIN(L, BOOST_STRINGIZE(property_name)) L":]]",\
        perl,\
        BOOST_JOIN(L, BOOST_STRINGIZE(BOOST_JOIN(\x, hex_digits))),\
        match_default,\
        make_array(0, 1, -2, -2))
  42. #else
  // Fallback used when BOOST_NO_STD_WSTRING is defined: both test macros
  // expand to nothing, so every invocation collapses to an empty statement
  // and its arguments are discarded without being evaluated.
  #define TEST_REGEX_SEARCH_U(expression, syntax, text, match_flags, expected)
  #define TEST_REGEX_CLASS_U(property_name, hex_digits)
  45. #endif
  46. void test_unicode()
  47. {
  48. using namespace boost::regex_constants;
  49. TEST_REGEX_CLASS_U(L*, 3108);
  50. TEST_REGEX_CLASS_U(Letter, 3108);
  51. TEST_REGEX_CLASS_U(Lu, 2145);
  52. TEST_REGEX_CLASS_U(Uppercase Letter, 2145);
  53. TEST_REGEX_CLASS_U(Ll, 2146);
  54. TEST_REGEX_CLASS_U(Lowercase Letter, 2146);
  55. TEST_REGEX_CLASS_U(Lt, 1FFC);
  56. TEST_REGEX_CLASS_U(Titlecase Letter, 1FFC);
  57. TEST_REGEX_CLASS_U(Lm, 1D61);
  58. TEST_REGEX_CLASS_U(Modifier Letter, 1D61);
  59. TEST_REGEX_CLASS_U(Lo, 1974);
  60. TEST_REGEX_CLASS_U(Other Letter, 1974);
  61. TEST_REGEX_CLASS_U(M*, 20EA);
  62. TEST_REGEX_CLASS_U(Mark, 20EA);
  63. TEST_REGEX_CLASS_U(Mn, 20EA);
  64. TEST_REGEX_CLASS_U(Non-Spacing Mark, 20EA);
  65. TEST_REGEX_CLASS_U(Mc, 1938);
  66. TEST_REGEX_CLASS_U(Spacing Combining Mark, 1938);
  67. TEST_REGEX_CLASS_U(Me, 0488);
  68. TEST_REGEX_CLASS_U(Enclosing Mark, 0488);
  69. TEST_REGEX_CLASS_U(N*, 0669);
  70. TEST_REGEX_CLASS_U(Number, 0669);
  71. TEST_REGEX_CLASS_U(Nd, 0669);
  72. TEST_REGEX_CLASS_U(Decimal Digit Number, 0669);
  73. TEST_REGEX_CLASS_U(Nl, 303A);
  74. TEST_REGEX_CLASS_U(Letter Number, 303A);
  75. TEST_REGEX_CLASS_U(No, 2793);
  76. TEST_REGEX_CLASS_U(Other Number, 2793);
  77. TEST_REGEX_CLASS_U(S*, 2144);
  78. TEST_REGEX_CLASS_U(Symbol, 2144);
  79. TEST_REGEX_CLASS_U(Sm, 2144);
  80. TEST_REGEX_CLASS_U(Math Symbol, 2144);
  81. TEST_REGEX_CLASS_U(Sc, 20B1);
  82. TEST_REGEX_CLASS_U(Currency Symbol, 20B1);
  83. TEST_REGEX_CLASS_U(Sk, 1FFE);
  84. TEST_REGEX_CLASS_U(Modifier Symbol, 1FFE);
  85. TEST_REGEX_CLASS_U(So, 19FF);
  86. TEST_REGEX_CLASS_U(Other Symbol, 19FF);
  87. TEST_REGEX_CLASS_U(P*, 005F);
  88. TEST_REGEX_CLASS_U(Punctuation, 005F);
  89. TEST_REGEX_CLASS_U(Pc, 005F);
  90. TEST_REGEX_CLASS_U(Connector Punctuation, 005F);
  91. TEST_REGEX_CLASS_U(Pd, 002D);
  92. TEST_REGEX_CLASS_U(Dash Punctuation, 002D);
  93. TEST_REGEX_CLASS_U(Ps, 0028);
  94. TEST_REGEX_CLASS_U(Open Punctuation, 0028);
  95. TEST_REGEX_CLASS_U(Pe, FF63);
  96. TEST_REGEX_CLASS_U(Close Punctuation, FF63);
  97. TEST_REGEX_CLASS_U(Pi, 2039);
  98. TEST_REGEX_CLASS_U(Initial Punctuation, 2039);
  99. TEST_REGEX_CLASS_U(Pf, 203A);
  100. TEST_REGEX_CLASS_U(Final Punctuation, 203A);
  101. TEST_REGEX_CLASS_U(Po, 2038);
  102. TEST_REGEX_CLASS_U(Other Punctuation, 2038);
  103. TEST_REGEX_CLASS_U(Z*, 202F);
  104. TEST_REGEX_CLASS_U(Separator, 202F);
  105. TEST_REGEX_CLASS_U(Zs, 202F);
  106. TEST_REGEX_CLASS_U(Space Separator, 202F);
  107. TEST_REGEX_CLASS_U(Zl, 2028);
  108. TEST_REGEX_CLASS_U(Line Separator, 2028);
  109. TEST_REGEX_CLASS_U(Zp, 2029);
  110. TEST_REGEX_CLASS_U(Paragraph Separator, 2029);
  111. #if !BOOST_WORKAROUND(BOOST_MSVC, < 1300)
  112. // Some tests have to be disabled for VC6 because the compiler
  113. // mangles the string literals...
  114. TEST_REGEX_CLASS_U(C*, 009F);
  115. TEST_REGEX_CLASS_U(Other, 009F);
  116. TEST_REGEX_CLASS_U(Cc, 009F);
  117. TEST_REGEX_CLASS_U(Control, 009F);
  118. #endif
  119. TEST_REGEX_CLASS_U(Cf, FFFB);
  120. TEST_REGEX_CLASS_U(Format, FFFB);
  121. //TEST_REGEX_CLASS_U(Cs, DC00);
  122. //TEST_REGEX_CLASS_U(Surrogate, DC00);
  123. TEST_REGEX_CLASS_U(Co, F8FF);
  124. TEST_REGEX_CLASS_U(Private Use, F8FF);
  125. TEST_REGEX_CLASS_U(Cn, FFFF);
  126. TEST_REGEX_CLASS_U(Not Assigned, FFFF);
  127. TEST_REGEX_CLASS_U(Any, 2038);
  128. TEST_REGEX_CLASS_U(Assigned, 2038);
  129. TEST_REGEX_CLASS_U(ASCII, 7f);
  130. TEST_REGEX_SEARCH_U(L"[[:Assigned:]]", perl, L"\xffff", match_default, make_array(-2, -2));
  131. TEST_REGEX_SEARCH_U(L"[[:ASCII:]]", perl, L"\x80", match_default, make_array(-2, -2));
  132. TEST_REGEX_SEARCH_U(L"\\N{KHMER DIGIT SIX}", perl, L"\x17E6", match_default, make_array(0, 1, -2, -2));
  133. TEST_REGEX_SEARCH_U(L"\\N{MODIFIER LETTER LOW ACUTE ACCENT}", perl, L"\x02CF", match_default, make_array(0, 1, -2, -2));
  134. TEST_REGEX_SEARCH_U(L"\\N{SUPERSCRIPT ONE}", perl, L"\x00B9", match_default, make_array(0, 1, -2, -2));
  135. TEST_REGEX_SEARCH_U(L"[\\N{KHMER DIGIT SIX}]", perl, L"\x17E6", match_default, make_array(0, 1, -2, -2));
  136. TEST_REGEX_SEARCH_U(L"[\\N{MODIFIER LETTER LOW ACUTE ACCENT}]", perl, L"\x02CF", match_default, make_array(0, 1, -2, -2));
  137. TEST_REGEX_SEARCH_U(L"[\\N{SUPERSCRIPT ONE}]", perl, L"\x00B9", match_default, make_array(0, 1, -2, -2));
  138. TEST_REGEX_SEARCH_U(L"\\N{CJK UNIFIED IDEOGRAPH-7FED}", perl, L"\x7FED", match_default, make_array(0, 1, -2, -2));
  139. #if !BOOST_WORKAROUND(BOOST_MSVC, < 1300)
  140. // Some tests have to be disabled for VC6 because the compiler
  141. // mangles the string literals...
  142. TEST_REGEX_SEARCH_U(L"\\w+", perl, L" e\x301" L"coute ", match_default, make_array(1, 8, -2, -2));
  143. TEST_REGEX_SEARCH_U(L"^", perl, L" \x2028 \x2029 \x000D\x000A \x000A \x000C \x000D \x0085 ",
  144. match_default | match_not_bol, make_array(2, 2, -2, 4, 4, -2, 7, 7, -2, 9, 9, -2, 11, 11, -2, 13, 13, -2, 15, 15, -2, -2));
  145. TEST_REGEX_SEARCH_U(L"$", perl, L" \x2028 \x2029 \x000D\x000A \x000A \x000C \x000D \x0085 ",
  146. match_default | match_not_eol, make_array(1, 1, -2, 3, 3, -2, 5, 5, -2, 8, 8, -2, 10, 10, -2, 12, 12, -2, 14, 14, -2, -2));
  147. TEST_REGEX_SEARCH_U(L".", perl, L" \x2028\x2029\x000D\x000A\x000A\x000C\x000D\x0085 ",
  148. match_default | match_not_dot_newline, make_array(0, 1, -2, 9, 10, -2, -2));
  149. #endif
  150. }
  151. #else
  // Stub compiled when BOOST_HAS_ICU is not defined: the function is still
  // defined but does nothing, since the Unicode tests require ICU.
  void test_unicode()
  {
  }
  153. #endif