/*============================================================================= Boost.Wave: A Standard compliant C++ preprocessor library Spirit based lexer http://www.boost.org/ Copyright (c) 2001, Daniel C. Nuffer. Copyright (c) 2001-2012 Hartmut Kaiser. Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) TODO List: X callback objects (called when a match is made.) X callback passed first & last iterator, and a reference to a lexercontrol object that supports some operations on the lexer. set state terminate state stack (push, pop, top) set new token return value ignore the current token yymore get length of matched token get current lexer state X DFA serialization to save recomputing the DFA. lexer states. organize the file into hpp and ipp. arrange the classes in a logical order. use buffering - iterator only needs be an input iterator, lexer & callback are not templatized on iterator type, but char type. next_token is templatized on iterator type. beginning/ending contexts. ^ and $ DFA minimization. DFA table compression. =============================================================================*/ #ifndef BOOST_SPIRIT_LEXER_HPP #define BOOST_SPIRIT_LEXER_HPP /////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include // for auto_ptr/unique_ptr #include #include #include // for pair #include #include #include #include #if defined(BOOST_NO_STD_ITERATOR_TRAITS) #define BOOST_SPIRIT_IT_NS impl #else #define BOOST_SPIRIT_IT_NS std #endif /////////////////////////////////////////////////////////////////////////////// namespace boost { namespace spirit { namespace classic { typedef unsigned char uchar; typedef unsigned int node_id_t; const node_id_t invalid_node = node_id_t(-1); typedef std::set node_set; typedef std::vector uchar_vector; typedef std::map followpos_t; typedef std::vector state_match_t; template class lexer_control; class bad_regex : public std::exception { }; namespace lexerimpl { class node { public: virtual ~node() {} virtual node* clone() const = 0; virtual bool nullable() const = 0; virtual node_set firstpos() const = 0; virtual node_set lastpos() const = 0; virtual void compute_followpos(followpos_t& followpos) const = 0; virtual void compute_state_match(state_match_t& state_match) const = 0; virtual void get_eof_ids(node_set& eof_set) const = 0; virtual void assign_node_ids(node_id_t& node_count) = 0; #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const = 0; #endif }; class char_node : public node { public: char_node(const uchar c); char_node(const char_node& x); virtual ~char_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const; #endif private: uchar m_char; node_id_t m_node_num; }; inline char_node::char_node(const uchar c) : node() , m_char(c) , m_node_num(0) { } inline char_node::char_node(const char_node& x) : node(x) , m_char(x.m_char) , m_node_num(x.m_node_num) { } inline node * char_node::clone() const { return new char_node(*this); } inline bool char_node::nullable() const { return false; } inline node_set char_node::firstpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline node_set char_node::lastpos() const { return firstpos(); } inline void char_node::compute_followpos(followpos_t&) const { return; } inline void char_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num].resize(256); state_match[m_node_num][m_char] = 1; } inline void char_node::get_eof_ids(node_set&) const { return; } inline void char_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) inline void char_node::dump(std::ostream& out) const { out << "\nchar_node m_char = " << m_char; out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator(out, ",")); } #endif class epsilon_node : public node { public: epsilon_node(); epsilon_node(const epsilon_node& x); virtual ~epsilon_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const; #endif private: node_id_t m_node_num; }; inline epsilon_node::epsilon_node() : node() , m_node_num(0) { } inline epsilon_node::epsilon_node(const epsilon_node& x) : node(x) , m_node_num(x.m_node_num) { } inline node * epsilon_node::clone() const { return new epsilon_node(*this); } inline bool epsilon_node::nullable() const { return true; } inline node_set epsilon_node::firstpos() const { return node_set(); } inline node_set epsilon_node::lastpos() const { return node_set(); } inline void epsilon_node::compute_followpos(followpos_t&) const { return; } inline void epsilon_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num].resize(256, 1); } inline void epsilon_node::get_eof_ids(node_set&) const { return; } inline void epsilon_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) inline void epsilon_node::dump(std::ostream& out) const { out << "\nepsilon_node"; out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator(out, ",")); } #endif class or_node : public node { public: or_node(node* left, node* right); or_node(const or_node& x); virtual ~or_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const; #endif private: #ifndef BOOST_NO_CXX11_SMART_PTR std::unique_ptr m_left; std::unique_ptr m_right; #else std::auto_ptr m_left; std::auto_ptr m_right; #endif }; inline or_node::or_node(node* left, node* right) : node() , m_left(left) , m_right(right) { } inline or_node::or_node(const or_node& x) : node(x) , m_left(x.m_left->clone()) , m_right(x.m_right->clone()) { } inline node * or_node::clone() const { return new or_node(m_left->clone(), m_right->clone()); } inline bool or_node::nullable() const { return m_left->nullable() || m_right->nullable(); } inline node_set or_node::firstpos() const { node_set rval; node_set l = m_left->firstpos(); node_set r = m_right->firstpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } inline node_set or_node::lastpos() const { node_set rval; node_set l = m_left->lastpos(); node_set r = m_right->lastpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } inline void or_node::compute_followpos(followpos_t& followpos) const { m_left->compute_followpos(followpos); m_right->compute_followpos(followpos); } inline void or_node::compute_state_match(state_match_t& state_match) const { m_left->compute_state_match(state_match); m_right->compute_state_match(state_match); } inline void or_node::get_eof_ids(node_set& eof_nodes) const { m_left->get_eof_ids(eof_nodes); m_right->get_eof_ids(eof_nodes); } inline void or_node::assign_node_ids(node_id_t& node_count) { m_left->assign_node_ids(node_count); m_right->assign_node_ids(node_count); } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) inline void or_node::dump(std::ostream& out) const { m_left->dump(out); out << "\nor_node"; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator(out, ",")); m_right->dump(out); } #endif class cat_node : public node { public: cat_node(node* left, node* right); cat_node(const cat_node& x); virtual ~cat_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const; #endif private: #ifndef BOOST_NO_CXX11_SMART_PTR std::unique_ptr m_left; std::unique_ptr m_right; #else std::auto_ptr m_left; std::auto_ptr m_right; #endif }; inline cat_node::cat_node(node* left, node* right) : node() , m_left(left) , m_right(right) { } inline cat_node::cat_node(const cat_node& x) : node(x) , m_left(x.m_left->clone()) , m_right(x.m_right->clone()) { } inline node * cat_node::clone() const { return new cat_node(m_left->clone(), m_right->clone()); } inline bool cat_node::nullable() const { return m_left->nullable() && m_right->nullable(); } inline node_set cat_node::firstpos() const { if (m_left->nullable()) { node_set rval; node_set l = m_left->firstpos(); node_set r = m_right->firstpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } else { return m_left->firstpos(); } } inline node_set cat_node::lastpos() const { if (m_right->nullable()) { node_set rval; node_set l = m_left->lastpos(); node_set r = m_right->lastpos(); std::set_union(l.begin(), l.end(), r.begin(), r.end(), std::inserter(rval, rval.begin())); return rval; } else { return m_right->lastpos(); } } inline void cat_node::compute_followpos(followpos_t& followpos) const { node_set l = m_left->lastpos(); for (node_set::iterator i = l.begin(); i != l.end(); ++i) { node_set rf = m_right->firstpos(); followpos[*i].insert(rf.begin(), rf.end()); } m_left->compute_followpos(followpos); m_right->compute_followpos(followpos); } inline void cat_node::compute_state_match(state_match_t& state_match) const { m_left->compute_state_match(state_match); m_right->compute_state_match(state_match); } inline void cat_node::get_eof_ids(node_set& eof_nodes) const { m_left->get_eof_ids(eof_nodes); m_right->get_eof_ids(eof_nodes); } inline void cat_node::assign_node_ids(node_id_t& node_count) { m_left->assign_node_ids(node_count); m_right->assign_node_ids(node_count); } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) inline void cat_node::dump(std::ostream& out) const { m_left->dump(out); out << "\ncat_node"; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator(out, ",")); m_right->dump(out); } #endif class star_node : public node { public: star_node(node* left); star_node(const star_node& x); virtual ~star_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const; #endif private: #ifndef BOOST_NO_CXX11_SMART_PTR std::unique_ptr m_left; #else std::auto_ptr m_left; #endif }; inline star_node::star_node(node* left) : node() , m_left(left) { } inline star_node::star_node(const star_node& x) : node(x) , m_left(x.m_left->clone()) { } inline node * star_node::clone() const { return new star_node(m_left->clone()); } inline bool star_node::nullable() const { return true; } inline node_set star_node::firstpos() const { return m_left->firstpos(); } inline node_set star_node::lastpos() const { return m_left->lastpos(); } inline void star_node::compute_followpos(followpos_t& followpos) const { node_set lp = this->lastpos(); for (node_set::iterator i = lp.begin(); i != lp.end(); ++i) { node_set fp = this->firstpos(); followpos[*i].insert(fp.begin(), fp.end()); } m_left->compute_followpos(followpos); } inline void star_node::compute_state_match(state_match_t& state_match) const { m_left->compute_state_match(state_match); } inline void star_node::get_eof_ids(node_set& eof_nodes) const { m_left->get_eof_ids(eof_nodes); } inline void star_node::assign_node_ids(node_id_t& node_count) { m_left->assign_node_ids(node_count); } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) inline void star_node::dump(std::ostream& out) const { m_left->dump(out); out << "\nstar_node"; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator(out, ",")); } #endif class eof_node : public node { public: eof_node(); eof_node(const eof_node& x); virtual ~eof_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const; #endif private: node_id_t m_node_num; }; inline eof_node::eof_node() : node() , m_node_num(0) { } inline eof_node::eof_node(const eof_node& x) : node(x) , m_node_num(x.m_node_num) { } inline node * eof_node::clone() const { return new eof_node(*this); } inline bool eof_node::nullable() const { return false; } inline node_set eof_node::firstpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline node_set eof_node::lastpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline void eof_node::compute_followpos(followpos_t&) const { return; } inline void eof_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num].resize(256, 0); } inline void eof_node::get_eof_ids(node_set& eof_nodes) const { eof_nodes.insert(m_node_num); } inline void eof_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) inline void eof_node::dump(std::ostream& out) const { out << "\neof_node"; out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator(out, ",")); } #endif class ccl_node : public node { public: ccl_node(const std::vector& v); ccl_node(const uchar c1, const uchar c2); ccl_node(const ccl_node& x); virtual ~ccl_node(){} virtual node* clone() const; virtual bool nullable() const; virtual node_set firstpos() const; virtual node_set lastpos() const; virtual void compute_followpos(followpos_t& followpos) const; virtual void compute_state_match(state_match_t& state_match ) const; virtual void get_eof_ids(node_set& eof_set) const; virtual void assign_node_ids(node_id_t& node_count); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) virtual void dump(std::ostream& out) const; #endif private: std::vector m_match; node_id_t m_node_num; }; inline ccl_node::ccl_node(const std::vector& v) : node() , m_match(v) , m_node_num(0) { m_match.resize(256); // make sure it's the right size } inline ccl_node::ccl_node(const uchar c1, const uchar c2) : node() , m_match(256, uchar(0)) , m_node_num(0) { BOOST_ASSERT(c1 < c2); for (std::size_t i = c1; i <= std::size_t(c2); ++i) { m_match[i] = 1; } } inline ccl_node::ccl_node(const ccl_node& x) : node(x) , m_match(x.m_match) , m_node_num(x.m_node_num) { } inline node * ccl_node::clone() const { return new ccl_node(*this); } inline bool ccl_node::nullable() const { return false; } inline node_set ccl_node::firstpos() const { node_set rval; rval.insert(m_node_num); return rval; } inline node_set ccl_node::lastpos() const { return firstpos(); } inline void ccl_node::compute_followpos(followpos_t&) const { return; } inline void ccl_node::compute_state_match(state_match_t& state_match) const { if (state_match.size() < m_node_num + 1) state_match.resize(m_node_num + 1); state_match[m_node_num] = m_match; } inline void ccl_node::get_eof_ids(node_set&) const { return; } inline void ccl_node::assign_node_ids(node_id_t& node_count) { m_node_num = node_count++; } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) inline void ccl_node::dump(std::ostream& out) const { out << "\nccl_node m_match = "; for (std::size_t i = 0; i < m_match.size(); ++i) { if (m_match[i]) out << i << ", "; } out << " m_node_num = " << m_node_num; out << " nullable() = " << (nullable() ? "true" : "false"); out << " firstpos() = "; node_set fp = firstpos(); std::copy(fp.begin(), fp.end(), std::ostream_iterator(out, ",")); out << " lastpos() = "; node_set lp = lastpos(); std::copy(lp.begin(), lp.end(), std::ostream_iterator(out, ",")); } #endif template class make_concat { typedef typename ScannerT::iterator_t iterator_type; public: make_concat(std::stack& the_stack) : m_stack(the_stack) {} void operator()(iterator_type const &, iterator_type const &) const { node* right = m_stack.top(); m_stack.pop(); node* left = m_stack.top(); m_stack.pop(); node* newnode = new cat_node(left, right); m_stack.push(newnode); } std::stack& m_stack; }; template struct get_byte_aux; template<> struct get_byte_aux<1> { template unsigned char operator()(CharT c, unsigned int byte) { BOOST_ASSERT(byte == 0); return c; } }; template<> struct get_byte_aux<2> { template unsigned char operator()(CharT c, unsigned int byte) { static unsigned long mask[] = { 0xFF00, 0x00FF }; BOOST_ASSERT(byte < 2); return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); } }; template<> struct get_byte_aux<4> { template unsigned char operator()(CharT c, unsigned int byte) { static unsigned long mask[] = { 0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF }; BOOST_ASSERT(byte < 4); return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); } }; template inline unsigned char get_byte(CharT c, unsigned int byte) { return get_byte_aux()(c, byte); } template class make_star { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; make_star(std::stack& the_stack) : m_stack(the_stack) {} void operator()(const char_t) const { node* left = m_stack.top(); m_stack.pop(); node* newnode = new star_node(left); m_stack.push(newnode); } std::stack& m_stack; }; template class make_or { typedef typename ScannerT::iterator_t iterator_type; public: make_or(std::stack& the_stack) : m_stack(the_stack) {} void operator()(iterator_type const&, iterator_type const&) const { node* right = m_stack.top(); m_stack.pop(); node* left = m_stack.top(); m_stack.pop(); node* newnode = new or_node(left, right); m_stack.push(newnode); } std::stack& m_stack; }; template class make_plus { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; make_plus(std::stack& the_stack) : m_stack(the_stack) {} void operator()(const char_t) const { node* left = m_stack.top(); m_stack.pop(); node* copy = left->clone(); node* new_star = new star_node(copy); node* new_cat = new cat_node(left, new_star); m_stack.push(new_cat); } std::stack& m_stack; }; template class make_optional { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; make_optional(std::stack& the_stack) : m_stack(the_stack) {} void operator()(const char_t) const { node* left = m_stack.top(); m_stack.pop(); node* new_or = new or_node(left, new epsilon_node()); m_stack.push(new_or); } std::stack& m_stack; }; /////////////////////////////////////////////////////////////////////////////// // utility function template inline utility::impl::range const& full_range() { static utility::impl::range full((std::numeric_limits::min)(), (std::numeric_limits::max)()); return full; } namespace ccl_utils { template inline utility::impl::range_run negate_range_run( const utility::impl::range_run& rr) { utility::impl::range_run newrr; newrr.set(full_range()); for (typename utility::impl::range_run::const_iterator iter = rr.begin(); iter != rr.end(); ++iter) newrr.clear(*iter); return newrr; } template inline node* create_mb_node_seq(char_t c) { node* newnode = new char_node(get_byte(c, 0)); for (unsigned int i = 1; i < sizeof(c); ++i) { node* cnode = new char_node(get_byte(c, i)); node* top_node = new cat_node(newnode, cnode); newnode = top_node; } return newnode; } template inline void handle_mb_char(char_t c, bool first_time, std::stack& stack) { node* newnode = create_mb_node_seq(c); if (first_time) { stack.push(newnode); } else { node* top = stack.top(); stack.pop(); node* newtop = new or_node(top, newnode); stack.push(newtop); } } // forward decl only template inline void handle_mb_range(char_t c1, char_t c2, bool first_time, std::stack& stack); template inline void create_nodes(const utility::impl::range_run& rr, std::stack& stack) { if (sizeof(char_t) == 1) { std::vector ccl; ccl.resize(256); for (typename utility::impl::range_run::const_iterator iter = rr.begin(); iter != rr.end(); ++iter) { for (int i = iter->first; i <= iter->last; ++i) { // this is always true because of the limited datatype // BOOST_ASSERT(uchar(i) < 256 && ccl.size() == 256); ccl[uchar(i)] = 1; } } node* new_ccl = new ccl_node(ccl); stack.push(new_ccl); } else { bool mb_first_time = true; for (typename utility::impl::range_run::const_iterator iter = rr.begin(); iter != rr.end(); ++iter) { if (iter->first == iter->last) { handle_mb_char(iter->first, mb_first_time, stack); } else { handle_mb_range(iter->first, iter->last, mb_first_time, stack); } mb_first_time = false; } } } template inline std::size_t compute_differing_byte(char_t c1, char_t c2) { std::size_t rval = 0; while (rval < sizeof(c1) && get_byte(c1, (unsigned int)rval) == get_byte(c2, (unsigned int)rval)) { ++rval; } return rval; } template inline node* create_mb_node_type1(std::size_t j, char_t c1, char_t c2) { std::size_t diff = get_byte(c2, (unsigned int)j) - get_byte(c1, (unsigned int)j); if (diff == 1) { return 0; } else if (diff == 2) { return new char_node(get_byte(c1, (unsigned int)j)+1); } else { return new ccl_node(get_byte(c1, (unsigned int)j)+1, get_byte(c2, (unsigned int)j)-1); } } template inline node * create_mb_node_for_byte(std::size_t i, std::size_t j, std::size_t sizem1, std::size_t differing_byte, char_t c1, char_t c2, node* newnode) { node* cnode; if (i == sizem1 && j == differing_byte && j != sizem1) { node* tmp = create_mb_node_type1(j, c1, c2); if (tmp == 0) { delete newnode; return 0; } else cnode = tmp; } else if (i == differing_byte && j == sizem1) { if (i != sizem1) { cnode = new ccl_node(get_byte(c1, (unsigned int)j), 0xFF); } else { cnode = new ccl_node(get_byte(c1, (unsigned int)j), get_byte(c2, (unsigned int)j)); } } else if (i != differing_byte && i != sizem1 && j == (sizem1 - i + differing_byte)) { cnode = new ccl_node(get_byte(c1, (unsigned int)j)+1, 0xFF); } else if (i + j - differing_byte > sizem1) { cnode = new ccl_node(0, 0xFF); } else {//if (is plain) cnode = new char_node(get_byte(c1, (unsigned int)j)); } node* top_node = new cat_node(newnode, cnode); return top_node; } // On platforms, where wchar_t is a typedef for unsigned short, the // comparision for a negative value is pointless template struct correct_char_aux { }; template <> struct correct_char_aux { template static char_t correct(char_t c) { if (c < 0) c = 0; return c; } }; template <> struct correct_char_aux { template static char_t correct(char_t c) { return c; } }; template struct correct_char { static char_t correct(char_t c) { return correct_char_aux::is_signed >:: correct(c); } }; template inline void handle_mb_range(char_t c1, char_t c2, bool first_time, std::stack& stack) { // The algorithm can't handle negative value chars, which don't make // much sense anyway. This comparision is pointless for wchar_t's on // platforms, where wchar_t is a typedef for unsigned short c1 = correct_char::correct(c1); //if (c1 < 0) // c1 = 0; BOOST_ASSERT(c1 < c2); node* newnode = 0; node* savednode = 0; const std::size_t differing_byte = compute_differing_byte(c1, c2); const std::size_t sizem1 = sizeof(c1) - 1; const std::size_t ndb = sizem1 - differing_byte; for (std::size_t i = differing_byte; i < sizeof(c1); ++i) { // generate node for the first byte if (differing_byte == 0 && i == ndb) { node* tmp = create_mb_node_type1(0, c1, c2); if (tmp == 0) continue; else newnode = tmp; } else { newnode = new char_node(get_byte(c1, 0)); } for (std::size_t j = 1; j < sizeof(c1); ++j) { newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte, c1, c2, newnode); if (newnode == 0) goto end_outer_for; } // or together the various parts if (savednode) { node* top_node = new or_node(savednode, newnode); savednode = top_node; } else { savednode = newnode; } end_outer_for: continue; } for (std::size_t k = 0; k < ndb; ++k) { newnode = new char_node(get_byte(c2, 0)); for (std::size_t j = 1; j < sizeof(c2); ++j) { node* cnode; if (k == differing_byte && j == sizem1 && k != sizem1) cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)); else if (k != differing_byte && k != sizem1 && j == (sizem1 - k + differing_byte)) cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)-1); else if (k + j - differing_byte > sizem1) cnode = new ccl_node(0, 0xFF); else //if (is plain) cnode = new char_node(get_byte(c2, (unsigned int)j)); node* top_node = new cat_node(newnode, cnode); newnode = top_node; } // or together the various parts if (savednode) { node* top_node = new or_node(savednode, newnode); savednode = top_node; } else { savednode = newnode; } } if (first_time) { stack.push(savednode); } else { node* top = stack.top(); stack.pop(); node* newtop = new or_node(top, savednode); stack.push(newtop); } } } // namespace ccl_utils template class make_char { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; make_char(std::stack& the_stack) : m_stack(the_stack) {} void operator()(iterator_type const& first, iterator_type const& last) const { const escape_char_parser lex_escape_ch = escape_char_parser(); char_t the_char; iterator_type first_ = first; ScannerT scan(first_, last); lex_escape_ch[assign(the_char)].parse(scan); node* newnode = ccl_utils::create_mb_node_seq(the_char); m_stack.push(newnode); } std::stack& m_stack; }; template class make_ccl { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; make_ccl(std::stack& the_stack) : m_stack(the_stack) {} static bool is_equal_to_string(iterator_type first, iterator_type const & last, const char* str) { while (first != last &&*str &&*first ==*str) { ++first; ++str; } return*str == 0; } template static void fill_ccl(utility::impl::range_run& rr, const ParserT& parser) { for (int i = 0; i < 256; ++i) { if (parser.test(static_cast(uchar(i)))) rr.set(utility::impl::range(char_t(i), char_t(i))); } } void operator()(iterator_type const& first_, iterator_type const& last) const { BOOST_ASSERT(*first_ == '['); iterator_type first = first_; ++first; // skip over '[' bool negated_ccl = false; if (*first == '^') { negated_ccl = true; ++first; } utility::impl::range_run rr; while (first != last &&*first != ']') { if (*first == '[') // it's a ccl_expr like [:space:] { // check for [:space:], etc. if (is_equal_to_string(first, last, "[:alnum:]")) { fill_ccl(rr, alnum_p); } else if (is_equal_to_string(first, last, "[:alpha:]")) { fill_ccl(rr, alpha_p); } else if (is_equal_to_string(first, last, "[:blank:]")) { fill_ccl(rr, blank_p); } else if (is_equal_to_string(first, last, "[:cntrl:]")) { fill_ccl(rr, cntrl_p); } else if (is_equal_to_string(first, last, "[:digit:]")) { fill_ccl(rr, digit_p); } else if (is_equal_to_string(first, last, "[:graph:]")) { fill_ccl(rr, graph_p); } else if (is_equal_to_string(first, last, "[:lower:]")) { fill_ccl(rr, lower_p); } else if (is_equal_to_string(first, last, "[:print:]")) { fill_ccl(rr, print_p); } else if (is_equal_to_string(first, last, "[:punct:]")) { fill_ccl(rr, punct_p); } else if (is_equal_to_string(first, last, "[:space:]")) { fill_ccl(rr, space_p); } else if (is_equal_to_string(first, last, "[:upper:]")) { fill_ccl(rr, upper_p); } else if (is_equal_to_string(first, last, "[:xdigit:]")) { fill_ccl(rr, xdigit_p); } // this can't happen, because it's parsed before we get here. //else // throw bad_regex(); // Advance past the character class expression while (first != last &&*first != ']') ++first; BOOST_ASSERT(*first == ']'); ++first; } else { const escape_char_parser lex_escape_ch = escape_char_parser(); char_t c1; ScannerT scan(first, last); lex_escape_ch[assign(c1)].parse(scan); if (*scan.first == '-') // insert a range { ++scan.first; char_t c2; lex_escape_ch[assign(c2)].parse(scan); BOOST_ASSERT(c1 < c2); // Throw exception? rr.set(utility::impl::range(c1, c2)); } else // insert 1 char { rr.set(utility::impl::range(c1, c1)); } } } if (negated_ccl) { rr = ccl_utils::negate_range_run(rr); } ccl_utils::create_nodes(rr, m_stack); } std::stack& m_stack; }; template class make_any_char { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; std::stack& m_stack; make_any_char(std::stack& the_stack) : m_stack(the_stack) {} void operator()(const char_t c) const { BOOST_ASSERT(c == '.'); do_any_char(); } void do_any_char() const { static utility::impl::range_run rr; rr.set(full_range()); char_t newline = '\n'; rr.clear(utility::impl::range(newline, newline)); ccl_utils::create_nodes(rr, m_stack); } }; template class make_string { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; std::stack& m_stack; make_string(std::stack& the_stack) : m_stack(the_stack) {} void operator()(iterator_type const& first, iterator_type const& last) const { BOOST_ASSERT(*first == '"'); iterator_type first_ = first; ScannerT scan(first_, last); ++scan.first; // skip over '"' // empty string not allowed if (*scan.first == '"') { boost::throw_exception(bad_regex()); } const escape_char_parser lex_escape_ch = escape_char_parser(); char_t c; lex_escape_ch[assign(c)].parse(scan); node* top_node = ccl_utils::create_mb_node_seq(c); while (*scan.first != '"' && scan.first != scan.last) { lex_escape_ch[assign(c)].parse(scan); node* cur_node = ccl_utils::create_mb_node_seq(c); top_node = new cat_node(top_node, cur_node); } m_stack.push(top_node); } }; inline node* repeat_node(node* n, int num) { node* list_of_nodes = n; for (int i = 1; i < num; ++i) { list_of_nodes = new cat_node(list_of_nodes, n->clone()); } return list_of_nodes; } inline node* optional_node(node* n) { return new or_node(n, new epsilon_node()); } template class make_rep1 { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; std::stack& m_stack; make_rep1(std::stack& the_stack) : m_stack(the_stack) {} void operator()(iterator_type const& first, iterator_type const& last) const { BOOST_ASSERT(*first == '{'); iterator_type first_ = first; ScannerT scan(first_, last); ++scan.first; // skip over '{' unsigned int count; uint_p[assign(count)].parse(scan); if (count == 0) boost::throw_exception(bad_regex()); node* top_node = m_stack.top(); m_stack.pop(); top_node = repeat_node(top_node, count); m_stack.push(top_node); } }; template class make_rep2 { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; std::stack& m_stack; make_rep2(std::stack& the_stack) : m_stack(the_stack) {} void operator()(iterator_type const& first, iterator_type const& last) const { BOOST_ASSERT(*first == '{'); iterator_type first_ = first; ScannerT scan (first_, last); ++scan.first; // skip over '{' unsigned int count; uint_p[assign(count)].parse(scan); if (count == 0) boost::throw_exception(bad_regex()); node* top_node = m_stack.top(); m_stack.pop(); top_node = new cat_node(repeat_node(top_node, count), new star_node(top_node->clone())); m_stack.push(top_node); } }; template class make_rep3 { typedef typename ScannerT::iterator_t iterator_type; public: typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; std::stack& m_stack; make_rep3(std::stack& the_stack) : m_stack(the_stack) {} void operator()(iterator_type const& first, iterator_type const& last) const { BOOST_ASSERT(*first == '{'); iterator_type first_ = first; ScannerT scan(first_, last); ++scan.first; // skip over '{' unsigned int count1, count2; uint_p[assign(count1)].parse(scan); if (count1 == 0) boost::throw_exception(bad_regex()); ++scan.first; // skip over ',' uint_p[assign(count2)].parse(scan); if (count2 <= count1) boost::throw_exception(bad_regex()); node* top_node = m_stack.top(); m_stack.pop(); node* repeats = repeat_node(top_node, count1); top_node = new cat_node(repeats, repeat_node(optional_node(top_node->clone()), count2 - count1)); m_stack.push(top_node); } }; /////////////////////////////////////////////////////////////////////////////// // // Lexer grammar // // Defines the grammar, which mandates the syntax of the understood // lexeme definitions passed to lexer::register_regex. // /////////////////////////////////////////////////////////////////////////////// class lexer_grammar : public boost::spirit::classic::grammar { public: lexer_grammar(std::stack &node_stack_) : node_stack(node_stack_) {} template struct definition { typedef rule rule_t; typedef typename ScannerT::iterator_t iterator_type; typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string, escseq, ccl_char; symbols<> ccl_expr; definition(lexer_grammar const &self) { regex = re >> !('/' >> re) >> !ch_p('$') ; re = series >>*( ('|' >> series)[make_or(self.node_stack)] ) ; series = singleton >>*( singleton[make_concat(self.node_stack)] ) ; singleton = ch_p('.')[make_any_char(self.node_stack)] >> singleton2 | fullccl >> singleton2 | ('"' >> string >> '"') [ make_string(self.node_stack) ] >> singleton2 | '(' >> re >> ')' >> singleton2 | ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq) [ make_char(self.node_stack) ] >> singleton2 ; singleton2 = ch_p('*')[make_star(self.node_stack)] >> singleton2 | ch_p('+')[make_plus(self.node_stack)] >> singleton2 | ch_p('?')[make_optional(self.node_stack)] >> singleton2 | ('{' >> uint_p >> '}') [ make_rep1(self.node_stack) ] >> singleton2 | ('{' >> uint_p >> ',' >> '}') [ make_rep2(self.node_stack) ] >> singleton2 | ('{' >> uint_p >> ',' >> uint_p >> '}') [ make_rep3(self.node_stack) ] >> singleton2 | epsilon_p ; fullccl = ('[' >> !ch_p('^') >> ccl >> ']') [ make_ccl(self.node_stack) ] ; ccl = *(ccl_expr | (ccl_char >> !('-' >> ccl_char))) ; ccl_char = ( (anychar_p - chset<>("\\\n]")) | escseq ) ; ccl_expr = "[:alnum:]", "[:alpha:]", "[:blank:]", "[:cntrl:]", "[:digit:]", "[:graph:]", "[:lower:]", "[:print:]", "[:punct:]", "[:space:]", "[:upper:]", "[:xdigit:]" ; string = +( (anychar_p - chset<>("\"\\")) | escseq ) ; typedef uint_parser::digits / 3 + 1 > oct_parser_t; typedef uint_parser::digits / 4 + 1 > hex_parser_t; escseq = ch_p('\\') >> ( oct_parser_t() | as_lower_d['x'] >> hex_parser_t() | (anychar_p - chset<>('\n')) ) ; #define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG); BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG); #undef BOOST_SLEX_DEBUG } rule const& start() const { return regex; } }; std::stack &node_stack; }; // class lexer_grammar template inline node * parse(lexer_grammar& g, StringT const& str) { typedef scanner > scanner_t; typedef typename rule::template result::type result_t; typename StringT::const_iterator first = str.begin(); typename StringT::const_iterator last = str.end(); scanner_t scan(first, last); // typename rule::result_t hit = g.parse(scan); result_t hit = g.parse(scan); if (!hit || !scan.at_end()) { while (g.node_stack.size()) { delete g.node_stack.top(); g.node_stack.pop(); } return 0; } BOOST_ASSERT(g.node_stack.size() == 1); node* rval = g.node_stack.top(); g.node_stack.pop(); node* an_eof_node = new eof_node(); rval = new cat_node(rval, an_eof_node); return rval; } inline void make_case_insensitive(state_match_t& state_match) { // TODO: Fix this. // This doesn't take into account foreign languages, figure out how to // do that. Also this approach is broken for this implementation of // wide chars. for (state_match_t::iterator iter = state_match.begin(); iter != state_match.end(); ++iter) { int i, j; for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) { // if either is set, turn them both on (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]); } } } template struct regex_match_helper; template<> struct regex_match_helper // single byte char { template static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type > *token) { typedef std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type > string_type; typedef typename string_type::size_type size_type; node_id_t s = 0; node_id_t last_accepting_index = invalid_node; IteratorT p = first; IteratorT last_accepting_cpos = first; while (p != last) { s = dfa.transition_table[s][(uchar)*p]; if (s == invalid_node) break; if (token) token->append((size_type)1, *p); ++p; if (dfa.acceptance_index[s] != invalid_node) { last_accepting_index = s; last_accepting_cpos = p; } } if (last_accepting_index != invalid_node) { #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << dfa.acceptance_index[last_accepting_index] << '\n'; #endif first = last_accepting_cpos; regex_index = dfa.acceptance_index[last_accepting_index]; return true; } else return false; } }; template<> struct regex_match_helper // wide char { template static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type > *token) { typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; typedef std::basic_string string_type; typedef typename string_type::size_type size_type; node_id_t s = 0; node_id_t last_accepting_index = invalid_node; IteratorT wp = first; IteratorT last_accepting_cpos = first; while (wp != last) { for (unsigned int i = 0; i < sizeof(char_t); ++i) { s = dfa.transition_table[s][get_byte(*wp,i)]; if (s == invalid_node) { goto break_while; } } if (token) token->append((size_type)1, *wp); ++wp; if (dfa.acceptance_index[s] != invalid_node) { last_accepting_index = s; last_accepting_cpos = wp; } } break_while: if (last_accepting_index != invalid_node) { #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << dfa.acceptance_index[last_accepting_index] << '\n'; #endif first = last_accepting_cpos; regex_index = dfa.acceptance_index[last_accepting_index]; return true; } else return false; } }; template struct regex_match { static bool do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, int& regex_index, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type > *token) { return regex_match_helper::do_match( dfa, first, last, regex_index, token); } }; } // namespace lexerimpl /////////////////////////////////////////////////////////////////////////////// // template &)> class lexer { public: typedef CallbackT callback_t; typedef typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type char_t; struct regex_info { std::basic_string str; TokenT token; CallbackT callback; regex_info(const std::basic_string& _str, const TokenT& _token, const CallbackT& _callback) : str(_str) , token(_token) , callback(_callback) {} }; struct dfa_table { std::vector > transition_table; std::vector acceptance_index; }; typedef std::vector node_table_t; typedef std::vector transition_table_t; typedef std::vector dfa_t; lexer(unsigned int states = 1); void register_regex(const std::basic_string& regex, const TokenT& id, const CallbackT& cb = CallbackT(), unsigned int state = 0); TokenT next_token(IteratorT &first, IteratorT const &last, std::basic_string *token = 0); void create_dfa(); bool has_compiled_dfa() { return m_compiled_dfa; } void set_case_insensitive(bool insensitive); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) void dump(std::ostream& out); #endif typedef std::vector > regex_list_t; bool load (std::ifstream &in, long unique_id = 0); bool save (std::ofstream &out, long unique_id = 0); enum { SLEX_SIGNATURE = 0x58454C53, // "SLEX" SLEX_VERSION_100 = 0x0100, // persistance version SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100, SLEX_MINOR_VERSION_MASK = 0xFF }; private: void create_dfa_for_state(int state); static bool regex_match(const dfa_t& dfa, IteratorT& first, IteratorT& last, int& regex_index); mutable std::stack node_stack; lexerimpl::lexer_grammar g; mutable bool m_compiled_dfa; mutable dfa_t m_dfa; regex_list_t m_regex_list; bool m_case_insensitive; unsigned int m_state; std::stack m_state_stack; unsigned int m_num_states; }; template inline lexer::lexer(unsigned int states) : g(node_stack) , m_compiled_dfa(false) , m_regex_list(states) , m_case_insensitive(false) , m_state(0) , m_state_stack() , m_num_states(states) { BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer", BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX); } template inline void lexer::register_regex( const std::basic_string& regex, const TokenT& id, const CallbackT& callback, unsigned int state) { if (state > m_num_states) { m_regex_list.resize(state); m_num_states = state; } m_regex_list[state].push_back(regex_info(regex, id, callback)); } template inline TokenT lexer::next_token( IteratorT &first, IteratorT const& last, std::basic_string< typename BOOST_SPIRIT_IT_NS::iterator_traits::value_type > *token) { if (!m_compiled_dfa) { create_dfa(); } IteratorT saved = first; int regex_index; if (!lexerimpl::regex_match 1)>:: do_match(m_dfa[m_state], first, last, regex_index, token)) return -1; // TODO: can't return -1, need to return some invalid token. // how to figure this out? We can use traits I guess. else { regex_info regex = m_regex_list[m_state][regex_index]; TokenT rval = regex.token; if (regex.callback) { // execute corresponding callback lexer_control controller(rval, m_state, m_state_stack); regex.callback(saved, first, last, regex.token, controller); if (controller.ignore_current_token_set()) { if (token) token->erase(); return next_token(first, last, token); } } return rval; } } namespace lexerimpl { inline bool find_acceptance_state(const node_set& eof_node_ids, const node_set& current_set, node_id_t& acceptance_node_id) { for(node_set::const_iterator nsi = eof_node_ids.begin(); nsi != eof_node_ids.end(); ++nsi) { node_id_t eof_node_id =*nsi; if (current_set.end() != current_set.find(eof_node_id)) { // store the first one we come to as the // matched pattern acceptance_node_id = eof_node_id; // don't bother searching for more return true; } } return false; } template #ifndef BOOST_NO_CXX11_SMART_PTR inline std::unique_ptr #else inline std::auto_ptr #endif parse_regexes(const RegexListT& regex_list, GrammarT& g) { // parse the expressions into a tree if (regex_list.begin() == regex_list.end()) boost::throw_exception(bad_regex()); typename RegexListT::const_iterator ri = regex_list.begin(); #ifndef BOOST_NO_CXX11_SMART_PTR std::unique_ptr tree(lexerimpl::parse(g, (*ri).str)); #else std::auto_ptr tree(lexerimpl::parse(g, (*ri).str)); #endif if (tree.get() == 0) boost::throw_exception(bad_regex()); ++ri; for (/**/; ri != regex_list.end(); ++ri) { #ifndef BOOST_NO_CXX11_SMART_PTR std::unique_ptr next_tree(lexerimpl::parse(g, (*ri).str)); #else std::auto_ptr next_tree(lexerimpl::parse(g, (*ri).str)); #endif if (next_tree.get() == 0) boost::throw_exception(bad_regex()); #ifndef BOOST_NO_CXX11_SMART_PTR tree = std::unique_ptr(new or_node(tree.release(), next_tree.release())); #else tree = std::auto_ptr(new or_node(tree.release(), next_tree.release())); #endif } return tree; } } //namespace lexerimpl template inline void lexer::create_dfa() { m_dfa.resize(m_num_states); for (unsigned int i = 0; i < m_num_states; ++i) create_dfa_for_state(i); } // Algorithm from Compilers: Principles, Techniques, and Tools p. 141 template inline void lexer::create_dfa_for_state(int state) { using lexerimpl::node; #ifndef BOOST_NO_CXX11_SMART_PTR std::unique_ptr tree = lexerimpl::parse_regexes(m_regex_list[state], g); #else std::auto_ptr tree = lexerimpl::parse_regexes(m_regex_list[state], g); #endif node_id_t dummy = 0; tree->assign_node_ids(dummy); #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) tree->dump(std::cout); #endif // compute followpos(root) followpos_t followpos; tree->compute_followpos(followpos); // the dfa states <-> nfa state groups std::map dstates1; std::map dstates2; // the dfa transitions m_dfa[state].transition_table.push_back( std::vector(256, invalid_node)); m_dfa[state].acceptance_index.push_back(invalid_node); // whether the dfa state has been processed yet std::vector marked; // used to give a unique id to each dfa state node_id_t num_states = 0; // initially, the only unmarked state in Dstates is firstpos(root). marked.push_back(0); node_set fpr = tree->firstpos(); dstates1[fpr] = 0; dstates2[0] = fpr; state_match_t state_match; tree->compute_state_match(state_match); if (m_case_insensitive) lexerimpl::make_case_insensitive(state_match); node_set eof_node_ids; tree->get_eof_ids(eof_node_ids); // translate the eof_node_ids into a 0-based index std::map eof_node_id_map; unsigned int x = 0; for (node_set::iterator node_id_it = eof_node_ids.begin(); node_id_it != eof_node_ids.end(); ++node_id_it) { eof_node_id_map[*node_id_it] = x++; } // figure out if this is an acceptance state node_id_t eof_node_id; if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id)) { m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id]; } std::vector::iterator i = std::find(marked.begin(), marked.end(), node_id_t(0)); while (marked.end() != i) { *i = 1; node_id_t T = node_id_t(std::distance(marked.begin(), i)); BOOST_ASSERT(T < dstates2.size()); node_set Tstates = dstates2[T]; for (node_id_t j = 0; j < 256; ++j) { node_set U; for (node_set::iterator k = Tstates.begin(); k != Tstates.end(); ++k) { node_id_t p =*k; BOOST_ASSERT(p < state_match.size()); BOOST_ASSERT(j < state_match[p].size()); if (state_match[p][j]) { node_set fpp = followpos[p]; U.insert(fpp.begin(), fpp.end()); } } if (U.size() > 0) { std::map::iterator l = dstates1.find(U); node_id_t target_state; if (l == dstates1.end()) // not in the states yet { ++num_states; dstates1[U] = target_state = num_states; dstates2[target_state] = U; marked.push_back(0); m_dfa[state].transition_table.push_back( std::vector(256, invalid_node)); m_dfa[state].acceptance_index.push_back(invalid_node); // figure out if this is an acceptance state node_id_t eof_node_id; if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id)) { m_dfa[state].acceptance_index[target_state] = eof_node_id_map[eof_node_id]; } } else { target_state = dstates1[U]; } BOOST_ASSERT(T < m_dfa[state].transition_table.size()); BOOST_ASSERT(j < m_dfa[state].transition_table[T].size()); m_dfa[state].transition_table[T][j] = target_state; } } i = std::find(marked.begin(), marked.end(), node_id_t(0)); } m_compiled_dfa = true; } template inline void lexer::set_case_insensitive(bool insensitive) { m_case_insensitive = insensitive; } #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) template inline void lexer::dump(std::ostream& out) { for (unsigned x = 0; x < m_dfa.size(); ++x) { out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n"; for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i) { out << "state " << i << ":"; for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j) { if (m_dfa[x].transition_table[i][j] != invalid_node) out << j << "->" << m_dfa[x].transition_table[i][j] << " "; } out << "\n"; } out << "acceptance states: "; for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k) { if (m_dfa[x].acceptance_index[k] != invalid_node) out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> "; } out << endl; } } #endif /////////////////////////////////////////////////////////////////////////////// // load the lexer tables #define slex_in(strm, val) \ strm.read((char*)&val, sizeof(val)); \ if (std::ios::goodbit != strm.rdstate()) return false template inline bool lexer::load (std::ifstream &in, long unique_id) { // ensure correct signature and version long signature = 0; slex_in (in, signature); if (signature != SLEX_SIGNATURE) return false; // not for us long version = 0; slex_in (in, version); if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION) return false; // to new for us long uid = 0; slex_in (in, uid); if (uid != unique_id) return false; // not saved by us // load auxiliary members int num_states = 0; slex_in (in, num_states); // load the dfa tables dfa_t in_dfa; std::size_t dfa_size = 0; slex_in (in, dfa_size); in_dfa.resize(dfa_size); for (std::size_t dfa = 0; dfa < dfa_size; ++dfa) { // load the transition tables std::size_t tt_size = 0; transition_table_t &tt_table = in_dfa[dfa].transition_table; slex_in (in, tt_size); tt_table.resize(tt_size); for (std::size_t tt = 0; tt < tt_size; ++tt) { std::size_t nt_size = 0; node_table_t &nt_table = tt_table[tt]; slex_in (in, nt_size); nt_table.resize(nt_size); for (std::size_t nt = 0; nt < nt_size; ++nt) { slex_in (in, nt_table[nt]); } } // load the acceptance index table std::size_t ai_size = 0; node_table_t &ai_table = in_dfa[dfa].acceptance_index; slex_in (in, ai_size); ai_table.resize(ai_size); for (std::size_t ai = 0; ai < ai_size; ++ai) { slex_in (in, ai_table[ai]); } } m_dfa.swap(in_dfa); // success, swap in the read values m_num_states = num_states; m_compiled_dfa = true; return true; } #undef slex_in /////////////////////////////////////////////////////////////////////////////// // save the lexer tables #define slex_out(strm, val) \ strm.write((char*)&val, sizeof(val)); \ if (std::ios::goodbit != strm.rdstate()) return false template inline bool lexer::save (std::ofstream &out, long unique_id) { // save signature and version information long out_long = SLEX_SIGNATURE; slex_out(out, out_long); out_long = SLEX_VERSION_100; slex_out(out, out_long); slex_out(out, unique_id); // save auxiliary members slex_out(out, m_num_states); // save the dfa tables typedef typename dfa_t::const_iterator dfa_iter_t; typedef transition_table_t::const_iterator transition_table_iter_t; typedef node_table_t::const_iterator node_table_iter_t; std::size_t out_size_t = m_dfa.size(); slex_out(out, out_size_t); dfa_iter_t end = m_dfa.end(); for (dfa_iter_t it = m_dfa.begin(); it != end; ++it) { // save the transition table out_size_t = (*it).transition_table.size(); slex_out(out, out_size_t); transition_table_iter_t tt_end = (*it).transition_table.end(); for (transition_table_iter_t tt_it = (*it).transition_table.begin(); tt_it != tt_end; ++tt_it) { out_size_t = (*tt_it).size(); slex_out(out, out_size_t); node_table_iter_t nt_end = (*tt_it).end(); for (node_table_iter_t nt_it = (*tt_it).begin(); nt_it != nt_end; ++nt_it) { slex_out(out, (*nt_it)); } } // save the acceptance index table out_size_t = (*it).acceptance_index.size(); slex_out(out, out_size_t); node_table_iter_t nt_end = (*it).acceptance_index.end(); for (node_table_iter_t nt_it = (*it).acceptance_index.begin(); nt_it != nt_end; ++nt_it) { slex_out(out, (*nt_it)); } } return true; } #undef slex_out /* a lexer_control object supports some operations on the lexer. get current lexer state set state terminate state stack (push, pop, top) set new token return value ignore the current token yymore get length of matched token */ template class lexer_control { public: lexer_control(TokenT& token, unsigned int& current_state, std::stack& state_stack); // functions dealing with the lexer state // set the state to state void set_state(unsigned int state); // get the current state unsigned int get_state(); // pushes the current state onto the top of the state stack and // switches to new_state void push_state(unsigned int new_state); // pops the top of the state stack and switches to it. void pop_state(); // returns the top of the stack without altering the stack's contents. unsigned int top_state(); // functions dealing with the token returned. // set a new TokenT return value, overriding that one that was // registered via. register_regex() void set_token(const TokenT& token); // tell the lexer to return an invalid token, signifying termination. void terminate(); // ignore the current token, and move on to the next one. The current // token will NOT be returned from next_token() void ignore_current_token(); // returns true if ignore_current_token() has been called, // false otherwise. bool ignore_current_token_set(); private: TokenT& m_token; bool m_ignore_current_token; unsigned int& m_current_state; std::stack& m_state_stack; }; template inline lexer_control::lexer_control(TokenT& token, unsigned int& current_state, std::stack& state_stack) : m_token(token) , m_ignore_current_token(false) , m_current_state(current_state) , m_state_stack(state_stack) { } template inline void lexer_control::set_state(unsigned int state) { m_current_state = state; } template inline unsigned int lexer_control::get_state() { return m_current_state; } template inline void lexer_control::push_state(unsigned int new_state) { m_state_stack.push(m_current_state); m_current_state = new_state; } template inline void lexer_control::pop_state() { m_current_state = m_state_stack.top(); m_state_stack.pop(); } template inline unsigned int lexer_control::top_state() { return m_state_stack.top(); } template inline void lexer_control::set_token(const TokenT& token) { m_token = token; } template inline void lexer_control::terminate() { m_token = -1; // TOOD: fix this, need to be determined by traits } template inline void lexer_control::ignore_current_token() { m_ignore_current_token = true; } template inline bool lexer_control::ignore_current_token_set() { return m_ignore_current_token; } } // namespace classic } // namespace spirit } // namespace boost #undef BOOST_SPIRIT_IT_NS #endif