word_count.cpp

//  Copyright (c) 2001-2010 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

//  This example is the equivalent to the following lex program:
/*
//[wcp_flex_version
    %{
        int c = 0, w = 0, l = 0;
    %}
    word   [^ \t\n]+
    eol    \n
    %%
    {word} { ++w; c += yyleng; }
    {eol}  { ++c; ++l; }
    .      { ++c; }
    %%
    main()
    {
        yylex();
        printf("%d %d %d\n", l, w, c);
    }
//]
*/
//  Its purpose is to do the word count function of the wc command in UNIX. It
//  prints the number of lines, words, and characters in a file.
//
//  The example additionally demonstrates how to use the add_pattern(...)(...)
//  syntax to define lexer patterns. These patterns are essentially parameter-
//  less 'macros' for regular expressions, which simplify their definition.
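//
//  As a hedged illustration of that chained syntax (hypothetical pattern
//  names, not used in this example): add_pattern returns an object that can
//  be invoked repeatedly, and later patterns may reference earlier ones:
//
//      this->self.add_pattern
//          ("DIGIT",  "[0-9]")
//          ("NUMBER", "{DIGIT}+")
//      ;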
// #define BOOST_SPIRIT_LEXERTL_DEBUG
#define BOOST_VARIANT_MINIMIZE_SIZE

#include <boost/config/warning_disable.hpp>
//[wcp_includes
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>
//]

#include <iostream>
#include <string>

#include "example.hpp"

//[wcp_namespaces
using namespace boost::spirit;
using namespace boost::spirit::ascii;
//]
///////////////////////////////////////////////////////////////////////////////
//  Token definition: We use the lexertl based lexer engine as the underlying
//  lexer type.
///////////////////////////////////////////////////////////////////////////////
//[wcp_token_ids
enum tokenids
{
    IDANY = lex::min_token_id + 10
};
//]
//[wcp_token_definition
template <typename Lexer>
struct word_count_tokens : lex::lexer<Lexer>
{
    word_count_tokens()
    {
        // define patterns (lexer macros) to be used during token definition
        // below
        this->self.add_pattern
            ("WORD", "[^ \t\n]+")
        ;

        // define tokens and associate them with the lexer
        word = "{WORD}";    // reference the pattern 'WORD' as defined above

        // this lexer will recognize 3 token types: words, newlines, and
        // everything else
        this->self.add
            (word)          // no token id is needed here
            ('\n')          // characters are usable as tokens as well
            (".", IDANY)    // string literals will not be escaped by the library
        ;
    }

    // the token 'word' exposes the matched string as its parser attribute
    lex::token_def<std::string> word;
};
//]
///////////////////////////////////////////////////////////////////////////////
//  Grammar definition
///////////////////////////////////////////////////////////////////////////////
//[wcp_grammar_definition
template <typename Iterator>
struct word_count_grammar : qi::grammar<Iterator>
{
    template <typename TokenDef>
    word_count_grammar(TokenDef const& tok)
      : word_count_grammar::base_type(start)
      , c(0), w(0), l(0)
    {
        using boost::phoenix::ref;
        using boost::phoenix::size;

        start =  *(   tok.word          [++ref(w), ref(c) += size(_1)]
                  |   lit('\n')         [++ref(c), ++ref(l)]
                  |   qi::token(IDANY)  [++ref(c)]
                  )
              ;
    }

    std::size_t c, w, l;
    qi::rule<Iterator> start;
};
//]
///////////////////////////////////////////////////////////////////////////////
//[wcp_main
int main(int argc, char* argv[])
{
/*<  Define the token type to be used: `std::string` is available as the
     type of the token attribute
>*/  typedef lex::lexertl::token<
        char const*, boost::mpl::vector<std::string>
    > token_type;
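/*<  A hedged aside: the `boost::mpl::vector<std::string>` above enumerates
     every attribute type any token may expose; a lexer whose tokens also
     carried, say, integer values would extend the list, e.g.
     `boost::mpl::vector<std::string, unsigned int>` (hypothetical, not
     needed for this example).
>*/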
/*<  Define the lexer type to be used, implementing the state machine
>*/  typedef lex::lexertl::lexer<token_type> lexer_type;

/*<  Define the iterator type exposed by the lexer type
>*/  typedef word_count_tokens<lexer_type>::iterator_type iterator_type;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    word_count_tokens<lexer_type> word_count;           // Our lexer
    word_count_grammar<iterator_type> g (word_count);   // Our parser

    // read the file into memory
    std::string str (read_from_file(1 == argc ? "word_count.input" : argv[1]));
    char const* first = str.c_str();
    char const* last = &first[str.size()];

/*<  Parsing is done based on the token stream, not the character
     stream read from the input. The function `tokenize_and_parse()` wraps
     the passed iterator range `[first, last)` by the lexical analyzer and
     uses its exposed iterators to parse the token stream.
>*/  bool r = lex::tokenize_and_parse(first, last, word_count, g);
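
    // A minimal sketch of the equivalent two-step invocation (assuming the
    // lexer's begin()/end() token-iterator interface; not exercised here):
    //
    //     iterator_type iter = word_count.begin(first, last);
    //     iterator_type end  = word_count.end();
    //     bool r2 = qi::parse(iter, end, g);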
    if (r) {
        std::cout << "lines: " << g.l << ", words: " << g.w
                  << ", characters: " << g.c << "\n";
    }
    else {
        std::string rest(first, last);
        std::cerr << "Parsing failed\n" << "stopped at: \""
                  << rest << "\"\n";
    }
    return 0;
}
//]