cpp_slex_lexer.hpp 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827
  1. /*=============================================================================
  2. Boost.Wave: A Standard compliant C++ preprocessor library
  3. SLex (Spirit Lex) based C++ lexer
  4. http://www.boost.org/
  5. Copyright (c) 2001-2012 Hartmut Kaiser. Distributed under the Boost
  6. Software License, Version 1.0. (See accompanying file
  7. LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  8. =============================================================================*/
  9. #if !defined(SLEX_LEXER_HPP_5E8E1DF0_BB41_4938_B7E5_A4BB68222FF6_INCLUDED)
  10. #define SLEX_LEXER_HPP_5E8E1DF0_BB41_4938_B7E5_A4BB68222FF6_INCLUDED
  #include <ctime>        // std::time_t
  #include <fstream>      // std::ifstream, std::ofstream
  #include <iterator>     // std::iterator_traits
  #include <string>
  #if defined(BOOST_SPIRIT_DEBUG)
  #include <iostream>
  #endif // defined(BOOST_SPIRIT_DEBUG)
  #include <boost/assert.hpp>
  #include <boost/spirit/include/classic_core.hpp>
  #include <boost/wave/wave_config.hpp>
  #include <boost/wave/language_support.hpp>
  #include <boost/wave/token_ids.hpp>
  #include <boost/wave/util/file_position.hpp>
  #include <boost/wave/util/time_conversion_helper.hpp>
  #include <boost/wave/cpplexer/validate_universal_char.hpp>
  #include <boost/wave/cpplexer/convert_trigraphs.hpp>
  #include <boost/wave/cpplexer/cpplexer_exceptions.hpp>
  #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  #include <boost/wave/cpplexer/detect_include_guards.hpp>
  #endif
  #include <boost/wave/cpplexer/cpp_lex_interface.hpp>
  #include "../slex_interface.hpp"
  #include "../slex_token.hpp"
  #include "../slex_iterator.hpp"
  #include "lexer.hpp" // "spirit/lexer.hpp"
  33. ///////////////////////////////////////////////////////////////////////////////
  34. namespace boost {
  35. namespace wave {
  36. namespace cpplexer {
  37. namespace slex {
  38. namespace lexer {
  39. ///////////////////////////////////////////////////////////////////////////////
  // The following numbers are the array sizes of the token regex tables, which
  // we need to specify to make the CW compiler happy (at least up to V9.5).
  // Every size counts the table's entries plus its closing { token_id(0) }
  // terminator, so it has to follow each conditionally compiled entry.
  #if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
  #define INIT_DATA_SIZE 175
  #else
  #define INIT_DATA_SIZE 158
  #endif
  // init_data_cpp holds one additional entry (T_IMPORT) if the import keyword
  // is enabled; a fixed size of 15 would then be one slot too small.
  #if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
  #define INIT_DATA_CPP_SIZE 16
  #else
  #define INIT_DATA_CPP_SIZE 15
  #endif
  #define INIT_DATA_PP_NUMBER_SIZE 2
  #define INIT_DATA_CPP0X_SIZE 15
  50. ///////////////////////////////////////////////////////////////////////////////
  51. //
  52. // encapsulation of the boost::spirit::classic::slex based cpp lexer
  53. //
  54. ///////////////////////////////////////////////////////////////////////////////
  55. ///////////////////////////////////////////////////////////////////////////////
  56. // The following lexer_base class was necessary to workaround a CodeWarrior
  57. // bug (at least up to CW V9.5).
  58. template <typename IteratorT, typename PositionT>
  59. class lexer_base
  60. : public boost::spirit::classic::lexer<
  61. boost::wave::util::position_iterator<IteratorT, PositionT> >
  62. {
  63. protected:
  64. typedef boost::wave::util::position_iterator<IteratorT, PositionT>
  65. iterator_type;
  66. typedef typename std::iterator_traits<IteratorT>::value_type char_type;
  67. typedef boost::spirit::classic::lexer<iterator_type> base_type;
  68. lexer_base();
  69. // initialization data (regular expressions for the token definitions)
  70. struct lexer_data {
  71. token_id tokenid; // token data
  72. char_type const *tokenregex; // associated token to match
  73. typename base_type::callback_t tokencb; // associated callback function
  74. unsigned int lexerstate; // valid for lexer state
  75. };
  76. };
  77. ///////////////////////////////////////////////////////////////////////////////
  78. template <typename IteratorT, typename PositionT>
  79. class lexer
  80. : public lexer_base<IteratorT, PositionT>
  81. {
  82. public:
  83. typedef boost::wave::cpplexer::slex_token<PositionT> token_type;
  84. void init_dfa(boost::wave::language_support language);
  85. // get time of last compilation
  86. static std::time_t get_compilation_time()
  87. { return compilation_time.get_time(); }
  88. // helper for calculation of the time of last compilation
  89. static boost::wave::util::time_conversion_helper compilation_time;
  90. private:
  91. typedef lexer_base<IteratorT, PositionT> base_type;
  92. static typename base_type::lexer_data const init_data[INIT_DATA_SIZE]; // common patterns
  93. static typename base_type::lexer_data const init_data_cpp[INIT_DATA_CPP_SIZE]; // C++ only patterns
  94. static typename base_type::lexer_data const init_data_pp_number[INIT_DATA_PP_NUMBER_SIZE]; // pp-number only patterns
  95. static typename base_type::lexer_data const init_data_cpp0x[INIT_DATA_CPP0X_SIZE]; // C++0X only patterns
  96. };
  97. ///////////////////////////////////////////////////////////////////////////////
  // Building blocks for the token regular expressions. All of them expand to
  // adjacent string literals, which the compiler concatenates into one regex.

  // alternation, escaping of a single regex meta character, trigraph prefix
  #define OR                  "|"
  #define Q(c)                "\\" c
  #define TRI(c)              Q("?") Q("?") c

  // whitespace and C comments as they may appear inside of a pp directive
  #define BLANK               "[ \\t]"
  #define CCOMMENT                                                            \
      Q("/") Q("*") "[^*]*" Q("*") "+" "(" "[^/*][^*]*" Q("*") "+" ")*" Q("/")
  #define PPSPACE             "(" BLANK OR CCOMMENT ")*"

  // character classes and pieces of numeric literals
  #define OCTALDIGIT          "[0-7]"
  #define DIGIT               "[0-9]"
  #define HEXDIGIT            "[0-9a-fA-F]"
  #define OPTSIGN             "[-+]?"
  #define EXPSTART            "[eE]" "[-+]"
  #define EXPONENT            "(" "[eE]" OPTSIGN "[0-9]+" ")"
  #define NONDIGIT            "[a-zA-Z_]"

  // hexadecimal, octal or decimal integer (without suffix)
  #define INTEGER                                                             \
      "(" "(0x|0X)" HEXDIGIT "+" OR "0" OCTALDIGIT "*" OR "[1-9]" DIGIT "*" ")"
  #define INTEGER_SUFFIX      "(" "[uU][lL]?|[lL][uU]?" ")"

  // long long suffixes, the MS extensions add the i64 spelling
  #if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
  #define LONGINTEGER_SUFFIX  "(" "[uU]" "(" "[lL][lL]" ")" OR                \
                                  "(" "[lL][lL]" ")" "[uU]" "?" OR            \
                                  "i64"                                       \
                              ")"
  #else
  #define LONGINTEGER_SUFFIX  "(" "[uU]" "(" "[lL][lL]" ")" OR                \
                                  "(" "[lL][lL]" ")" "[uU]" "?" ")"
  #endif
  #define FLOAT_SUFFIX        "(" "[fF][lL]?" OR "[lL][fF]?" ")"

  // prefixes of character and string literals
  #define CHAR_SPEC           "L?"
  #define EXTCHAR_SPEC        "(" "[uU]" OR "u8" ")"

  // a backslash, either written directly or as the ??/ trigraph
  #define BACKSLASH           "(" Q("\\") OR TRI(Q("/")) ")"
  #define ESCAPESEQ           "(" BACKSLASH "("                               \
                                  "[abfnrtv?'\"]" OR                          \
                                  BACKSLASH OR                                \
                                  "x" HEXDIGIT "+" OR                         \
                                  OCTALDIGIT OCTALDIGIT "?" OCTALDIGIT "?"    \
                              "))"
  #define HEXQUAD             "(" HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT ")"
  #define UNIVERSALCHAR       "(" BACKSLASH "("                               \
                                  "u" HEXQUAD OR                              \
                                  "U" HEXQUAD HEXQUAD                         \
                              "))"

  // start of a pp directive ('#', its trigraph or its digraph) and line ends
  #define POUNDDEF            "(" "#" OR TRI("=") OR Q("%:") ")"
  #define NEWLINEDEF          "(" "\n" OR "\r" OR "\r\n" ")"

  // the include directive name(s)
  #if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
  #define INCLUDEDEF          "(include|include_next)"
  #else
  #define INCLUDEDEF          "include"
  #endif

  // preprocessing number
  #define PP_NUMBERDEF        Q(".") "?" DIGIT "(" DIGIT OR NONDIGIT OR EXPSTART OR Q(".") ")*"
  149. ///////////////////////////////////////////////////////////////////////////////
  // Lexer state constants. Every table entry in this file is registered for
  // LEXER_STATE_NORMAL, and the lexer is constructed with NUM_LEXER_STATES
  // states.
  #define LEXER_STATE_NORMAL  0
  #define LEXER_STATE_PP      1
  #define NUM_LEXER_STATES    1

  // Helpers producing one brace-initialized table row (lexer_data) for the
  // token T_<id>: without (TOKEN_DATA) or with (TOKEN_DATA_EX) a callback.
  #define TOKEN_DATA(id, regex)                                               \
      { T_##id, regex, 0, LEXER_STATE_NORMAL }
  #define TOKEN_DATA_EX(id, regex, callback)                                  \
      { T_##id, regex, callback, LEXER_STATE_NORMAL }
  161. ///////////////////////////////////////////////////////////////////////////////
  162. // common C++/C99 token definitions
  163. template <typename IteratorT, typename PositionT>
  164. typename lexer_base<IteratorT, PositionT>::lexer_data const
  165. lexer<IteratorT, PositionT>::init_data[INIT_DATA_SIZE] =
  166. {
  167. TOKEN_DATA(AND, "&"),
  168. TOKEN_DATA(ANDAND, "&&"),
  169. TOKEN_DATA(ASSIGN, "="),
  170. TOKEN_DATA(ANDASSIGN, "&="),
  171. TOKEN_DATA(OR, Q("|")),
  172. TOKEN_DATA(OR_TRIGRAPH, TRI("!")),
  173. TOKEN_DATA(ORASSIGN, Q("|=")),
  174. TOKEN_DATA(ORASSIGN_TRIGRAPH, TRI("!=")),
  175. TOKEN_DATA(XOR, Q("^")),
  176. TOKEN_DATA(XOR_TRIGRAPH, TRI("'")),
  177. TOKEN_DATA(XORASSIGN, Q("^=")),
  178. TOKEN_DATA(XORASSIGN_TRIGRAPH, TRI("'=")),
  179. TOKEN_DATA(COMMA, ","),
  180. TOKEN_DATA(COLON, ":"),
  181. TOKEN_DATA(DIVIDEASSIGN, Q("/=")),
  182. TOKEN_DATA(DIVIDE, Q("/")),
  183. TOKEN_DATA(DOT, Q(".")),
  184. TOKEN_DATA(ELLIPSIS, Q(".") Q(".") Q(".")),
  185. TOKEN_DATA(EQUAL, "=="),
  186. TOKEN_DATA(GREATER, ">"),
  187. TOKEN_DATA(GREATEREQUAL, ">="),
  188. TOKEN_DATA(LEFTBRACE, Q("{")),
  189. TOKEN_DATA(LEFTBRACE_ALT, "<" Q("%")),
  190. TOKEN_DATA(LEFTBRACE_TRIGRAPH, TRI("<")),
  191. TOKEN_DATA(LESS, "<"),
  192. TOKEN_DATA(LESSEQUAL, "<="),
  193. TOKEN_DATA(LEFTPAREN, Q("(")),
  194. TOKEN_DATA(LEFTBRACKET, Q("[")),
  195. TOKEN_DATA(LEFTBRACKET_ALT, "<:"),
  196. TOKEN_DATA(LEFTBRACKET_TRIGRAPH, TRI(Q("("))),
  197. TOKEN_DATA(MINUS, Q("-")),
  198. TOKEN_DATA(MINUSASSIGN, Q("-=")),
  199. TOKEN_DATA(MINUSMINUS, Q("-") Q("-")),
  200. TOKEN_DATA(PERCENT, Q("%")),
  201. TOKEN_DATA(PERCENTASSIGN, Q("%=")),
  202. TOKEN_DATA(NOT, "!"),
  203. TOKEN_DATA(NOTEQUAL, "!="),
  204. TOKEN_DATA(OROR, Q("|") Q("|")),
  205. TOKEN_DATA(OROR_TRIGRAPH, TRI("!") Q("|") OR Q("|") TRI("!") OR TRI("!") TRI("!")),
  206. TOKEN_DATA(PLUS, Q("+")),
  207. TOKEN_DATA(PLUSASSIGN, Q("+=")),
  208. TOKEN_DATA(PLUSPLUS, Q("+") Q("+")),
  209. TOKEN_DATA(ARROW, Q("->")),
  210. TOKEN_DATA(QUESTION_MARK, Q("?")),
  211. TOKEN_DATA(RIGHTBRACE, Q("}")),
  212. TOKEN_DATA(RIGHTBRACE_ALT, Q("%>")),
  213. TOKEN_DATA(RIGHTBRACE_TRIGRAPH, TRI(">")),
  214. TOKEN_DATA(RIGHTPAREN, Q(")")),
  215. TOKEN_DATA(RIGHTBRACKET, Q("]")),
  216. TOKEN_DATA(RIGHTBRACKET_ALT, ":>"),
  217. TOKEN_DATA(RIGHTBRACKET_TRIGRAPH, TRI(Q(")"))),
  218. TOKEN_DATA(SEMICOLON, ";"),
  219. TOKEN_DATA(SHIFTLEFT, "<<"),
  220. TOKEN_DATA(SHIFTLEFTASSIGN, "<<="),
  221. TOKEN_DATA(SHIFTRIGHT, ">>"),
  222. TOKEN_DATA(SHIFTRIGHTASSIGN, ">>="),
  223. TOKEN_DATA(STAR, Q("*")),
  224. TOKEN_DATA(COMPL, Q("~")),
  225. TOKEN_DATA(COMPL_TRIGRAPH, TRI("-")),
  226. TOKEN_DATA(STARASSIGN, Q("*=")),
  227. TOKEN_DATA(ASM, "asm"),
  228. TOKEN_DATA(AUTO, "auto"),
  229. TOKEN_DATA(BOOL, "bool"),
  230. TOKEN_DATA(FALSE, "false"),
  231. TOKEN_DATA(TRUE, "true"),
  232. TOKEN_DATA(BREAK, "break"),
  233. TOKEN_DATA(CASE, "case"),
  234. TOKEN_DATA(CATCH, "catch"),
  235. TOKEN_DATA(CHAR, "char"),
  236. TOKEN_DATA(CLASS, "class"),
  237. TOKEN_DATA(CONST, "const"),
  238. TOKEN_DATA(CONSTCAST, "const_cast"),
  239. TOKEN_DATA(CONTINUE, "continue"),
  240. TOKEN_DATA(DEFAULT, "default"),
  241. TOKEN_DATA(DELETE, "delete"),
  242. TOKEN_DATA(DO, "do"),
  243. TOKEN_DATA(DOUBLE, "double"),
  244. TOKEN_DATA(DYNAMICCAST, "dynamic_cast"),
  245. TOKEN_DATA(ELSE, "else"),
  246. TOKEN_DATA(ENUM, "enum"),
  247. TOKEN_DATA(EXPLICIT, "explicit"),
  248. TOKEN_DATA(EXPORT, "export"),
  249. TOKEN_DATA(EXTERN, "extern"),
  250. TOKEN_DATA(FLOAT, "float"),
  251. TOKEN_DATA(FOR, "for"),
  252. TOKEN_DATA(FRIEND, "friend"),
  253. TOKEN_DATA(GOTO, "goto"),
  254. TOKEN_DATA(IF, "if"),
  255. TOKEN_DATA(INLINE, "inline"),
  256. TOKEN_DATA(INT, "int"),
  257. TOKEN_DATA(LONG, "long"),
  258. TOKEN_DATA(MUTABLE, "mutable"),
  259. TOKEN_DATA(NAMESPACE, "namespace"),
  260. TOKEN_DATA(NEW, "new"),
  261. TOKEN_DATA(OPERATOR, "operator"),
  262. TOKEN_DATA(PRIVATE, "private"),
  263. TOKEN_DATA(PROTECTED, "protected"),
  264. TOKEN_DATA(PUBLIC, "public"),
  265. TOKEN_DATA(REGISTER, "register"),
  266. TOKEN_DATA(REINTERPRETCAST, "reinterpret_cast"),
  267. TOKEN_DATA(RETURN, "return"),
  268. TOKEN_DATA(SHORT, "short"),
  269. TOKEN_DATA(SIGNED, "signed"),
  270. TOKEN_DATA(SIZEOF, "sizeof"),
  271. TOKEN_DATA(STATIC, "static"),
  272. TOKEN_DATA(STATICCAST, "static_cast"),
  273. TOKEN_DATA(STRUCT, "struct"),
  274. TOKEN_DATA(SWITCH, "switch"),
  275. TOKEN_DATA(TEMPLATE, "template"),
  276. TOKEN_DATA(THIS, "this"),
  277. TOKEN_DATA(THROW, "throw"),
  278. TOKEN_DATA(TRY, "try"),
  279. TOKEN_DATA(TYPEDEF, "typedef"),
  280. TOKEN_DATA(TYPEID, "typeid"),
  281. TOKEN_DATA(TYPENAME, "typename"),
  282. TOKEN_DATA(UNION, "union"),
  283. TOKEN_DATA(UNSIGNED, "unsigned"),
  284. TOKEN_DATA(USING, "using"),
  285. TOKEN_DATA(VIRTUAL, "virtual"),
  286. TOKEN_DATA(VOID, "void"),
  287. TOKEN_DATA(VOLATILE, "volatile"),
  288. TOKEN_DATA(WCHART, "wchar_t"),
  289. TOKEN_DATA(WHILE, "while"),
  290. TOKEN_DATA(PP_DEFINE, POUNDDEF PPSPACE "define"),
  291. TOKEN_DATA(PP_IF, POUNDDEF PPSPACE "if"),
  292. TOKEN_DATA(PP_IFDEF, POUNDDEF PPSPACE "ifdef"),
  293. TOKEN_DATA(PP_IFNDEF, POUNDDEF PPSPACE "ifndef"),
  294. TOKEN_DATA(PP_ELSE, POUNDDEF PPSPACE "else"),
  295. TOKEN_DATA(PP_ELIF, POUNDDEF PPSPACE "elif"),
  296. TOKEN_DATA(PP_ENDIF, POUNDDEF PPSPACE "endif"),
  297. TOKEN_DATA(PP_ERROR, POUNDDEF PPSPACE "error"),
  298. TOKEN_DATA(PP_QHEADER, POUNDDEF PPSPACE \
  299. INCLUDEDEF PPSPACE Q("\"") "[^\\n\\r\"]+" Q("\"")),
  300. TOKEN_DATA(PP_HHEADER, POUNDDEF PPSPACE \
  301. INCLUDEDEF PPSPACE "<" "[^\\n\\r>]+" ">"),
  302. TOKEN_DATA(PP_INCLUDE, POUNDDEF PPSPACE \
  303. INCLUDEDEF PPSPACE),
  304. TOKEN_DATA(PP_LINE, POUNDDEF PPSPACE "line"),
  305. TOKEN_DATA(PP_PRAGMA, POUNDDEF PPSPACE "pragma"),
  306. TOKEN_DATA(PP_UNDEF, POUNDDEF PPSPACE "undef"),
  307. TOKEN_DATA(PP_WARNING, POUNDDEF PPSPACE "warning"),
  308. #if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
  309. TOKEN_DATA(MSEXT_INT8, "__int8"),
  310. TOKEN_DATA(MSEXT_INT16, "__int16"),
  311. TOKEN_DATA(MSEXT_INT32, "__int32"),
  312. TOKEN_DATA(MSEXT_INT64, "__int64"),
  313. TOKEN_DATA(MSEXT_BASED, "_?" "_based"),
  314. TOKEN_DATA(MSEXT_DECLSPEC, "_?" "_declspec"),
  315. TOKEN_DATA(MSEXT_CDECL, "_?" "_cdecl"),
  316. TOKEN_DATA(MSEXT_FASTCALL, "_?" "_fastcall"),
  317. TOKEN_DATA(MSEXT_STDCALL, "_?" "_stdcall"),
  318. TOKEN_DATA(MSEXT_TRY , "__try"),
  319. TOKEN_DATA(MSEXT_EXCEPT, "__except"),
  320. TOKEN_DATA(MSEXT_FINALLY, "__finally"),
  321. TOKEN_DATA(MSEXT_LEAVE, "__leave"),
  322. TOKEN_DATA(MSEXT_INLINE, "_?" "_inline"),
  323. TOKEN_DATA(MSEXT_ASM, "_?" "_asm"),
  324. TOKEN_DATA(MSEXT_PP_REGION, POUNDDEF PPSPACE "region"),
  325. TOKEN_DATA(MSEXT_PP_ENDREGION, POUNDDEF PPSPACE "endregion"),
  326. #endif // BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
  327. // TOKEN_DATA(OCTALINT, "0" OCTALDIGIT "*" INTEGER_SUFFIX "?"),
  328. // TOKEN_DATA(DECIMALINT, "[1-9]" DIGIT "*" INTEGER_SUFFIX "?"),
  329. // TOKEN_DATA(HEXAINT, "(0x|0X)" HEXDIGIT "+" INTEGER_SUFFIX "?"),
  330. TOKEN_DATA(LONGINTLIT, INTEGER LONGINTEGER_SUFFIX),
  331. TOKEN_DATA(INTLIT, INTEGER INTEGER_SUFFIX "?"),
  332. TOKEN_DATA(FLOATLIT,
  333. "(" DIGIT "*" Q(".") DIGIT "+" OR DIGIT "+" Q(".") ")"
  334. EXPONENT "?" FLOAT_SUFFIX "?" OR
  335. DIGIT "+" EXPONENT FLOAT_SUFFIX "?"),
  336. TOKEN_DATA(CCOMMENT, CCOMMENT),
  337. TOKEN_DATA(CPPCOMMENT, Q("/") Q("/[^\\n\\r]*") NEWLINEDEF ),
  338. TOKEN_DATA(CHARLIT, CHAR_SPEC "'"
  339. "(" ESCAPESEQ OR UNIVERSALCHAR OR "[^\\n\\r\\\\']" ")+" "'"),
  340. TOKEN_DATA(STRINGLIT, CHAR_SPEC Q("\"")
  341. "(" ESCAPESEQ OR UNIVERSALCHAR OR "[^\\n\\r\\\\\"]" ")*" Q("\"")),
  342. #if BOOST_WAVE_USE_STRICT_LEXER != 0
  343. TOKEN_DATA(IDENTIFIER, "([a-zA-Z_]" OR UNIVERSALCHAR ")([a-zA-Z0-9_]" OR UNIVERSALCHAR ")*"),
  344. #else
  345. TOKEN_DATA(IDENTIFIER, "([a-zA-Z_$]" OR UNIVERSALCHAR ")([a-zA-Z0-9_$]" OR UNIVERSALCHAR ")*"),
  346. #endif
  347. TOKEN_DATA(SPACE, "[ \t\v\f]+"),
  348. // TOKEN_DATA(SPACE2, "[\\v\\f]+"),
  349. TOKEN_DATA(CONTLINE, Q("\\") "\n"),
  350. TOKEN_DATA(NEWLINE, NEWLINEDEF),
  351. TOKEN_DATA(POUND_POUND, "##"),
  352. TOKEN_DATA(POUND_POUND_ALT, Q("%:") Q("%:")),
  353. TOKEN_DATA(POUND_POUND_TRIGRAPH, TRI("=") TRI("=")),
  354. TOKEN_DATA(POUND, "#"),
  355. TOKEN_DATA(POUND_ALT, Q("%:")),
  356. TOKEN_DATA(POUND_TRIGRAPH, TRI("=")),
  357. TOKEN_DATA(ANY_TRIGRAPH, TRI(Q("/"))),
  358. TOKEN_DATA(ANY, "."), // this should be the last recognized token
  359. { token_id(0) } // this should be the last entry
  360. };
  361. ///////////////////////////////////////////////////////////////////////////////
  362. // C++ only token definitions
  363. template <typename IteratorT, typename PositionT>
  364. typename lexer_base<IteratorT, PositionT>::lexer_data const
  365. lexer<IteratorT, PositionT>::init_data_cpp[INIT_DATA_CPP_SIZE] =
  366. {
  367. TOKEN_DATA(AND_ALT, "bitand"),
  368. TOKEN_DATA(ANDASSIGN_ALT, "and_eq"),
  369. TOKEN_DATA(ANDAND_ALT, "and"),
  370. TOKEN_DATA(OR_ALT, "bitor"),
  371. TOKEN_DATA(ORASSIGN_ALT, "or_eq"),
  372. TOKEN_DATA(OROR_ALT, "or"),
  373. TOKEN_DATA(XORASSIGN_ALT, "xor_eq"),
  374. TOKEN_DATA(XOR_ALT, "xor"),
  375. TOKEN_DATA(NOTEQUAL_ALT, "not_eq"),
  376. TOKEN_DATA(NOT_ALT, "not"),
  377. TOKEN_DATA(COMPL_ALT, "compl"),
  378. #if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0
  379. TOKEN_DATA(IMPORT, "import"),
  380. #endif
  381. TOKEN_DATA(ARROWSTAR, Q("->") Q("*")),
  382. TOKEN_DATA(DOTSTAR, Q(".") Q("*")),
  383. TOKEN_DATA(COLON_COLON, "::"),
  384. { token_id(0) } // this should be the last entry
  385. };
  386. ///////////////////////////////////////////////////////////////////////////////
  387. // C++ only token definitions
  388. template <typename IteratorT, typename PositionT>
  389. typename lexer_base<IteratorT, PositionT>::lexer_data const
  390. lexer<IteratorT, PositionT>::init_data_pp_number[INIT_DATA_PP_NUMBER_SIZE] =
  391. {
  392. TOKEN_DATA(PP_NUMBER, PP_NUMBERDEF),
  393. { token_id(0) } // this should be the last entry
  394. };
  395. ///////////////////////////////////////////////////////////////////////////////
  396. // C++ only token definitions
  397. #define T_EXTCHARLIT token_id(T_CHARLIT|AltTokenType)
  398. #define T_EXTSTRINGLIT token_id(T_STRINGLIT|AltTokenType)
  399. #define T_EXTRAWSTRINGLIT token_id(T_RAWSTRINGLIT|AltTokenType)
  400. template <typename IteratorT, typename PositionT>
  401. typename lexer_base<IteratorT, PositionT>::lexer_data const
  402. lexer<IteratorT, PositionT>::init_data_cpp0x[INIT_DATA_CPP0X_SIZE] =
  403. {
  404. TOKEN_DATA(EXTCHARLIT, EXTCHAR_SPEC "'"
  405. "(" ESCAPESEQ OR UNIVERSALCHAR OR "[^\\n\\r\\\\']" ")+" "'"),
  406. TOKEN_DATA(EXTSTRINGLIT, EXTCHAR_SPEC Q("\"")
  407. "(" ESCAPESEQ OR UNIVERSALCHAR OR "[^\\n\\r\\\\\"]" ")*" Q("\"")),
  408. TOKEN_DATA(RAWSTRINGLIT, CHAR_SPEC "R" Q("\"")
  409. "(" ESCAPESEQ OR UNIVERSALCHAR OR "[^\\\\\"]" ")*" Q("\"")),
  410. TOKEN_DATA(EXTRAWSTRINGLIT, EXTCHAR_SPEC "R" Q("\"")
  411. "(" ESCAPESEQ OR UNIVERSALCHAR OR "[^\\\\\"]" ")*" Q("\"")),
  412. TOKEN_DATA(ALIGNAS, "alignas"),
  413. TOKEN_DATA(ALIGNOF, "alignof"),
  414. TOKEN_DATA(CHAR16_T, "char16_t"),
  415. TOKEN_DATA(CHAR32_T, "char32_t"),
  416. TOKEN_DATA(CONSTEXPR, "constexpr"),
  417. TOKEN_DATA(DECLTYPE, "decltype"),
  418. TOKEN_DATA(NOEXCEPT, "noexcept"),
  419. TOKEN_DATA(NULLPTR, "nullptr"),
  420. TOKEN_DATA(STATICASSERT, "static_assert"),
  421. TOKEN_DATA(THREADLOCAL, "threadlocal"),
  422. { token_id(0) } // this should be the last entry
  423. };
  424. ///////////////////////////////////////////////////////////////////////////////
  // Undefine the macros which were required for the regular expression
  // definitions only, so that these rather generic names do not leak into
  // code including this header. BLANK, EXTCHAR_SPEC and NEWLINEDEF belong to
  // the same set of helpers and are cleaned up here as well.
  #undef INCLUDEDEF
  #undef POUNDDEF
  #undef NEWLINEDEF
  #undef BLANK
  #undef CCOMMENT
  #undef PPSPACE
  #undef DIGIT
  #undef OCTALDIGIT
  #undef HEXDIGIT
  #undef NONDIGIT
  #undef OPTSIGN
  #undef EXPSTART
  #undef EXPONENT
  #undef LONGINTEGER_SUFFIX
  #undef INTEGER_SUFFIX
  #undef INTEGER
  #undef FLOAT_SUFFIX
  #undef CHAR_SPEC
  #undef EXTCHAR_SPEC
  #undef BACKSLASH
  #undef ESCAPESEQ
  #undef HEXQUAD
  #undef UNIVERSALCHAR
  #undef PP_NUMBERDEF
  #undef Q
  #undef TRI
  #undef OR
  #undef TOKEN_DATA
  #undef TOKEN_DATA_EX
  452. ///////////////////////////////////////////////////////////////////////////////
  453. // initialize cpp lexer with token data
  454. template <typename IteratorT, typename PositionT>
  455. inline
  456. lexer_base<IteratorT, PositionT>::lexer_base()
  457. : base_type(NUM_LEXER_STATES)
  458. {
  459. }
  460. template <typename IteratorT, typename PositionT>
  461. inline void
  462. lexer<IteratorT, PositionT>::init_dfa(boost::wave::language_support lang)
  463. {
  464. if (this->has_compiled_dfa())
  465. return;
  466. // if pp-numbers should be preferred, insert the corresponding rule first
  467. if (boost::wave::need_prefer_pp_numbers(lang)) {
  468. for (int j = 0; 0 != init_data_pp_number[j].tokenid; ++j) {
  469. this->register_regex(init_data_pp_number[j].tokenregex,
  470. init_data_pp_number[j].tokenid, init_data_pp_number[j].tokencb,
  471. init_data_pp_number[j].lexerstate);
  472. }
  473. }
  474. // if in C99 mode, some of the keywords are not valid
  475. if (!boost::wave::need_c99(lang)) {
  476. for (int j = 0; 0 != init_data_cpp[j].tokenid; ++j) {
  477. this->register_regex(init_data_cpp[j].tokenregex,
  478. init_data_cpp[j].tokenid, init_data_cpp[j].tokencb,
  479. init_data_cpp[j].lexerstate);
  480. }
  481. }
  482. // if in C++0x mode, add all new keywords
  483. #if BOOST_WAVE_SUPPORT_CPP0X != 0
  484. if (boost::wave::need_cpp0x(lang)) {
  485. for (int j = 0; 0 != init_data_cpp0x[j].tokenid; ++j) {
  486. this->register_regex(init_data_cpp0x[j].tokenregex,
  487. init_data_cpp0x[j].tokenid, init_data_cpp0x[j].tokencb,
  488. init_data_cpp0x[j].lexerstate);
  489. }
  490. }
  491. #endif
  492. for (int i = 0; 0 != init_data[i].tokenid; ++i) {
  493. this->register_regex(init_data[i].tokenregex, init_data[i].tokenid,
  494. init_data[i].tokencb, init_data[i].lexerstate);
  495. }
  496. }
  497. ///////////////////////////////////////////////////////////////////////////////
  498. // get time of last compilation of this file
  499. template <typename IteratorT, typename PositionT>
  500. boost::wave::util::time_conversion_helper
  501. lexer<IteratorT, PositionT>::compilation_time(__DATE__ " " __TIME__);
  502. ///////////////////////////////////////////////////////////////////////////////
  503. } // namespace lexer
  504. ///////////////////////////////////////////////////////////////////////////////
  505. //
  506. template <typename IteratorT, typename PositionT>
  507. inline void
  508. init_lexer (lexer::lexer<IteratorT, PositionT> &lexer,
  509. boost::wave::language_support language, bool force_reinit = false)
  510. {
  511. if (lexer.has_compiled_dfa())
  512. return; // nothing to do
  513. using std::ifstream;
  514. using std::ofstream;
  515. using std::ios;
  516. using std::cerr;
  517. using std::endl;
  518. ifstream dfa_in("wave_slex_lexer.dfa", ios::in|ios::binary);
  519. lexer.init_dfa(language);
  520. if (force_reinit || !dfa_in.is_open() ||
  521. !lexer.load (dfa_in, (long)lexer.get_compilation_time()))
  522. {
  523. #if defined(BOOST_SPIRIT_DEBUG)
  524. cerr << "Compiling regular expressions for slex ...";
  525. #endif // defined(BOOST_SPIRIT_DEBUG)
  526. dfa_in.close();
  527. lexer.create_dfa();
  528. ofstream dfa_out ("wave_slex_lexer.dfa", ios::out|ios::binary|ios::trunc);
  529. if (dfa_out.is_open())
  530. lexer.save (dfa_out, (long)lexer.get_compilation_time());
  531. #if defined(BOOST_SPIRIT_DEBUG)
  532. cerr << " Done." << endl;
  533. #endif // defined(BOOST_SPIRIT_DEBUG)
  534. }
  535. }
  536. ///////////////////////////////////////////////////////////////////////////////
  537. //
  538. // lex_functor
  539. //
  540. ///////////////////////////////////////////////////////////////////////////////
  541. template <typename IteratorT, typename PositionT = wave::util::file_position_type>
  542. class slex_functor
  543. : public slex_input_interface<
  544. typename lexer::lexer<IteratorT, PositionT>::token_type
  545. >
  546. {
  547. public:
  548. typedef boost::wave::util::position_iterator<IteratorT, PositionT>
  549. iterator_type;
  550. typedef typename std::iterator_traits<IteratorT>::value_type char_type;
  551. typedef BOOST_WAVE_STRINGTYPE string_type;
  552. typedef typename lexer::lexer<IteratorT, PositionT>::token_type token_type;
  553. slex_functor(IteratorT const &first_, IteratorT const &last_,
  554. PositionT const &pos_, boost::wave::language_support language_)
  555. : first(first_, last_, pos_), language(language_), at_eof(false)
  556. {
  557. // initialize lexer dfa tables
  558. init_lexer(lexer, language_);
  559. }
  560. virtual ~slex_functor() {}
  561. // get the next token from the input stream
  562. token_type& get(token_type& result)
  563. {
  564. if (!at_eof) {
  565. do {
  566. // generate and return the next token
  567. std::string value;
  568. PositionT pos = first.get_position(); // begin of token position
  569. token_id id = token_id(lexer.next_token(first, last, &value));
  570. if ((token_id)(-1) == id)
  571. id = T_EOF; // end of input reached
  572. string_type token_val(value.c_str());
  573. if (boost::wave::need_emit_contnewlines(language) ||
  574. T_CONTLINE != id)
  575. {
  576. // The cast should avoid spurious warnings about missing case labels
  577. // for the other token ids's.
  578. switch (id) {
  579. case T_IDENTIFIER:
  580. // test identifier characters for validity (throws if
  581. // invalid chars found)
  582. if (!boost::wave::need_no_character_validation(language)) {
  583. using boost::wave::cpplexer::impl::validate_identifier_name;
  584. validate_identifier_name(token_val,
  585. pos.get_line(), pos.get_column(), pos.get_file());
  586. }
  587. break;
  588. case T_EXTCHARLIT:
  589. case T_EXTSTRINGLIT:
  590. case T_EXTRAWSTRINGLIT:
  591. id = token_id(id & ~AltTokenType);
  592. BOOST_FALLTHROUGH;
  593. case T_CHARLIT:
  594. case T_STRINGLIT:
  595. case T_RAWSTRINGLIT:
  596. // test literal characters for validity (throws if invalid
  597. // chars found)
  598. if (boost::wave::need_convert_trigraphs(language)) {
  599. using boost::wave::cpplexer::impl::convert_trigraphs;
  600. token_val = convert_trigraphs(token_val);
  601. }
  602. if (!boost::wave::need_no_character_validation(language)) {
  603. using boost::wave::cpplexer::impl::validate_literal;
  604. validate_literal(token_val,
  605. pos.get_line(), pos.get_column(), pos.get_file());
  606. }
  607. break;
  608. case T_LONGINTLIT: // supported in C99 and long_long mode
  609. if (!boost::wave::need_long_long(language)) {
  610. // syntax error: not allowed in C++ mode
  611. BOOST_WAVE_LEXER_THROW(
  612. boost::wave::cpplexer::lexing_exception,
  613. invalid_long_long_literal, value.c_str(),
  614. pos.get_line(), pos.get_column(),
  615. pos.get_file().c_str());
  616. }
  617. break;
  618. #if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
  619. case T_PP_HHEADER:
  620. case T_PP_QHEADER:
  621. case T_PP_INCLUDE:
  622. // convert to the corresponding ..._next token, if appropriate
  623. {
  624. // Skip '#' and whitespace and see whether we find an
  625. // 'include_next' here.
  626. typename string_type::size_type start = value.find("include");
  627. if (0 == value.compare(start, 12, "include_next", 12))
  628. id = token_id(id | AltTokenType);
  629. break;
  630. }
  631. #endif // BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
  632. case T_EOF:
  633. // T_EOF is returned as a valid token, the next call will
  634. // return T_EOI, i.e. the actual end of input
  635. at_eof = true;
  636. token_val.clear();
  637. break;
  638. case T_OR_TRIGRAPH:
  639. case T_XOR_TRIGRAPH:
  640. case T_LEFTBRACE_TRIGRAPH:
  641. case T_RIGHTBRACE_TRIGRAPH:
  642. case T_LEFTBRACKET_TRIGRAPH:
  643. case T_RIGHTBRACKET_TRIGRAPH:
  644. case T_COMPL_TRIGRAPH:
  645. case T_POUND_TRIGRAPH:
  646. case T_ANY_TRIGRAPH:
  647. if (boost::wave::need_convert_trigraphs(language))
  648. {
  649. using boost::wave::cpplexer::impl::convert_trigraph;
  650. token_val = convert_trigraph(token_val);
  651. }
  652. break;
  653. }
  654. result = token_type(id, token_val, pos);
  655. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  656. return guards.detect_guard(result);
  657. #else
  658. return result;
  659. #endif
  660. }
  661. // skip the T_CONTLINE token
  662. } while (true);
  663. }
  664. return result = token_type(); // return T_EOI
  665. }
  666. void set_position(PositionT const &pos)
  667. {
  668. // set position has to change the file name and line number only
  669. first.get_position().set_file(pos.get_file());
  670. first.get_position().set_line(pos.get_line());
  671. }
  672. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  673. bool has_include_guards(std::string& guard_name) const
  674. { return guards.detected(guard_name); }
  675. #endif
  676. private:
  677. iterator_type first;
  678. iterator_type last;
  679. boost::wave::language_support language;
  680. static lexer::lexer<IteratorT, PositionT> lexer; // needed only once
  681. bool at_eof;
  682. #if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0
  683. include_guards<token_type> guards;
  684. #endif
  685. };
  686. template <typename IteratorT, typename PositionT>
  687. lexer::lexer<IteratorT, PositionT> slex_functor<IteratorT, PositionT>::lexer;
  688. #undef T_EXTCHARLIT
  689. #undef T_EXTSTRINGLIT
  690. #undef T_EXTRAWSTRINGLIT
  691. ///////////////////////////////////////////////////////////////////////////////
  692. //
  693. // The 'new_lexer' function allows the opaque generation of a new lexer object.
  694. // It is coupled to the iterator type to allow to decouple the lexer/iterator
  695. // configurations at compile time.
  696. //
  697. // This function is declared inside the cpp_slex_token.hpp file, which is
  698. // referenced by the source file calling the lexer and the source file, which
  699. // instantiates the lex_functor. But it is defined here, so it will be
  700. // instantiated only while compiling the source file, which instantiates the
  701. // lex_functor. While the cpp_slex_token.hpp file may be included everywhere,
  702. // this file (cpp_slex_lexer.hpp) should be included only once. This allows
  703. // to decouple the lexer interface from the lexer implementation and reduces
  704. // compilation time.
  705. //
  706. ///////////////////////////////////////////////////////////////////////////////
  707. ///////////////////////////////////////////////////////////////////////////////
  708. //
  709. // The new_lexer_gen<>::new_lexer function (declared in cpp_slex_token.hpp)
  710. // should be defined inline, if the lex_functor shouldn't be instantiated
  711. // separately from the lex_iterator.
  712. //
  713. // Separate (explicit) instantiation helps to reduce compilation time.
  714. //
  715. ///////////////////////////////////////////////////////////////////////////////
  716. #if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION != 0
  717. #define BOOST_WAVE_SLEX_NEW_LEXER_INLINE
  718. #else
  719. #define BOOST_WAVE_SLEX_NEW_LEXER_INLINE inline
  720. #endif
  721. template <typename IteratorT, typename PositionT>
  722. BOOST_WAVE_SLEX_NEW_LEXER_INLINE
  723. lex_input_interface<slex_token<PositionT> > *
  724. new_lexer_gen<IteratorT, PositionT>::new_lexer(IteratorT const &first,
  725. IteratorT const &last, PositionT const &pos,
  726. boost::wave::language_support language)
  727. {
  728. return new slex_functor<IteratorT, PositionT>(first, last, pos,
  729. language);
  730. }
  731. #undef BOOST_WAVE_SLEX_NEW_LEXER_INLINE
  732. ///////////////////////////////////////////////////////////////////////////////
  733. } // namespace slex
  734. } // namespace cpplexer
  735. } // namespace wave
  736. } // namespace boost
  737. #endif // !defined(SLEX_LEXER_HPP_5E8E1DF0_BB41_4938_B7E5_A4BB68222FF6_INCLUDED)