libstdc++
regex_nfa.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010, 2011 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_nfa.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 namespace __regex
00034 {
00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00036 
00037   // Base class for, um, automata.  Could be an NFA or a DFA.  Your choice.
00038   class _Automaton
00039   {
00040   public:
00041     typedef unsigned int _SizeT;
00042 
00043   public:
00044     virtual
00045     ~_Automaton() { }
00046 
00047     virtual _SizeT
00048     _M_sub_count() const = 0;
00049 
00050 #ifdef _GLIBCXX_DEBUG
00051     virtual std::ostream&
00052     _M_dot(std::ostream& __ostr) const = 0;
00053 #endif
00054   };
00055 
00056   // Generic shared pointer to an automaton.  
00057   typedef std::shared_ptr<_Automaton> _AutomatonPtr;
00058 
00059   // Operation codes that define the type of transitions within the base NFA
00060   // that represents the regular expression.
00061   enum _Opcode
00062   {
00063       _S_opcode_unknown       =   0,
00064       _S_opcode_alternative   =   1,
00065       _S_opcode_subexpr_begin =   4,
00066       _S_opcode_subexpr_end   =   5,
00067       _S_opcode_match         = 100,
00068       _S_opcode_accept        = 255
00069   };
00070 
00071   // Provides a generic facade for a templated match_results.
00072   struct _Results
00073   {
00074     virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
00075     virtual void _M_set_matched(int __i, bool __is_matched) = 0;
00076   };
00077 
00078   // Tags current state (for subexpr begin/end).
00079   typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
00080 
00081   template<typename _FwdIterT, typename _TraitsT>
00082     struct _StartTagger
00083     {
00084       explicit
00085       _StartTagger(int __i)
00086       : _M_index(__i)
00087       { }
00088 
00089       void
00090       operator()(const _PatternCursor& __pc, _Results& __r)
00091       { __r._M_set_pos(_M_index, 0, __pc); }
00092 
00093       int       _M_index;
00094     };
00095 
00096   template<typename _FwdIterT, typename _TraitsT>
00097     struct _EndTagger
00098     {
00099       explicit
00100       _EndTagger(int __i)
00101       : _M_index(__i)
00102       { }
00103 
00104       void
00105       operator()(const _PatternCursor& __pc, _Results& __r)
00106       { __r._M_set_pos(_M_index, 1, __pc); }
00107 
00108       int       _M_index;
00109       _FwdIterT _M_pos;
00110     };
00111   // Indicates if current state matches cursor current.
00112   typedef std::function<bool (const _PatternCursor&)> _Matcher;
00113 
00114   // Matches any character
00115   inline bool
00116   _AnyMatcher(const _PatternCursor&)
00117   { return true; }
00118 
00119   // Matches a single character
00120   template<typename _InIterT, typename _TraitsT>
00121     struct _CharMatcher
00122     {
00123       typedef typename _TraitsT::char_type char_type;
00124 
00125       explicit
00126       _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
00127       : _M_traits(__t), _M_c(_M_traits.translate(__c))
00128       { }
00129 
00130       bool
00131       operator()(const _PatternCursor& __pc) const
00132       {
00133     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00134     _CursorT __c = static_cast<_CursorT>(__pc);
00135     return _M_traits.translate(__c._M_current()) == _M_c;
00136       }
00137 
00138       const _TraitsT& _M_traits;
00139       char_type       _M_c;
00140     };
00141 
00142   // Matches a character range (bracket expression)
00143   template<typename _InIterT, typename _TraitsT>
00144     struct _RangeMatcher
00145     {
00146       typedef typename _TraitsT::char_type _CharT;
00147       typedef std::basic_string<_CharT>    _StringT;
00148 
00149       explicit
00150       _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
00151       : _M_traits(__t), _M_is_non_matching(__is_non_matching)
00152       { }
00153 
00154       bool
00155       operator()(const _PatternCursor& __pc) const
00156       {
00157     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00158     _CursorT __c = static_cast<_CursorT>(__pc);
00159     return true;
00160       }
00161 
00162       void
00163       _M_add_char(_CharT __c)
00164       { }
00165 
00166       void
00167       _M_add_collating_element(const _StringT& __s)
00168       { }
00169 
00170       void
00171       _M_add_equivalence_class(const _StringT& __s)
00172       { }
00173 
00174       void
00175       _M_add_character_class(const _StringT& __s)
00176       { }
00177 
00178       void
00179       _M_make_range()
00180       { }
00181 
00182       const _TraitsT& _M_traits;
00183       bool            _M_is_non_matching;
00184     };
00185 
00186   // Identifies a state in the NFA.
00187   typedef int _StateIdT;
00188 
00189   // The special case in which a state identifier is not an index.
00190   static const _StateIdT _S_invalid_state_id  = -1;
00191 
00192 
00193   // An individual state in an NFA
00194   //
00195   // In this case a "state" is an entry in the NFA definition coupled with its
00196   // outgoing transition(s).  All states have a single outgoing transition,
00197   // except for accepting states (which have no outgoing transitions) and alt
00198   // states, which have two outgoing transitions.
00199   //
00200   struct _State
00201   {
00202     typedef int  _OpcodeT;
00203 
00204     _OpcodeT     _M_opcode;    // type of outgoing transition
00205     _StateIdT    _M_next;      // outgoing transition
00206     _StateIdT    _M_alt;       // for _S_opcode_alternative
00207     unsigned int _M_subexpr;   // for _S_opcode_subexpr_*
00208     _Tagger      _M_tagger;    // for _S_opcode_subexpr_*
00209     _Matcher     _M_matches;   // for _S_opcode_match
00210 
00211     explicit _State(_OpcodeT __opcode)
00212     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
00213     { }
00214 
00215     _State(const _Matcher& __m)
00216     : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
00217     { }
00218 
00219     _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
00220     : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
00221       _M_tagger(__t)
00222     { }
00223 
00224     _State(_StateIdT __next, _StateIdT __alt)
00225     : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
00226     { }
00227 
00228 #ifdef _GLIBCXX_DEBUG
00229     std::ostream&
00230     _M_print(std::ostream& ostr) const;
00231 
00232     // Prints graphviz dot commands for state.
00233     std::ostream&
00234     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
00235 #endif
00236   };
00237 
00238   
00239   // The Grep Matcher works on sets of states.  Here are sets of states.
00240   typedef std::set<_StateIdT> _StateSet;
00241 
00242  // A collection of all states making up an NFA
00243   //
00244   // An NFA is a 4-tuple M = (K, S, s, F), where
00245   //    K is a finite set of states,
00246   //    S is the alphabet of the NFA,
00247   //    s is the initial state,
00248   //    F is a set of final (accepting) states.
00249   //
00250   // This NFA class is templated on S, a type that will hold values of the
00251   // underlying alphabet (without regard to semantics of that alphabet).  The
00252   // other elements of the tuple are generated during construction of the NFA
00253   // and are available through accessor member functions.
00254   //
00255   class _Nfa
00256   : public _Automaton, public std::vector<_State>
00257   {
00258   public:
00259     typedef _State                              _StateT;
00260     typedef unsigned int                        _SizeT;
00261     typedef regex_constants::syntax_option_type _FlagT;
00262 
00263   public:
00264     _Nfa(_FlagT __f)
00265     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
00266     { }
00267 
00268     ~_Nfa()
00269     { }
00270 
00271     _FlagT
00272     _M_options() const
00273     { return _M_flags; }
00274 
00275     _StateIdT
00276     _M_start() const
00277     { return _M_start_state; }
00278 
00279     const _StateSet&
00280     _M_final_states() const
00281     { return _M_accepting_states; }
00282 
00283     _SizeT
00284     _M_sub_count() const
00285     { return _M_subexpr_count; }
00286 
00287     _StateIdT
00288     _M_insert_accept()
00289     {
00290       this->push_back(_StateT(_S_opcode_accept));
00291       _M_accepting_states.insert(this->size()-1);
00292       return this->size()-1;
00293     }
00294 
00295     _StateIdT
00296     _M_insert_alt(_StateIdT __next, _StateIdT __alt)
00297     {
00298       this->push_back(_StateT(__next, __alt));
00299       return this->size()-1;
00300     }
00301 
00302     _StateIdT
00303     _M_insert_matcher(_Matcher __m)
00304     {
00305       this->push_back(_StateT(__m));
00306       return this->size()-1;
00307     }
00308 
00309     _StateIdT
00310     _M_insert_subexpr_begin(const _Tagger& __t)
00311     {
00312       this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t));
00313       return this->size()-1;
00314     }
00315 
00316     _StateIdT 
00317     _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
00318     {
00319       this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
00320       return this->size()-1;
00321     }
00322 
00323 #ifdef _GLIBCXX_DEBUG
00324     std::ostream&
00325     _M_dot(std::ostream& __ostr) const;
00326 #endif
00327 
00328   private:
00329     _FlagT     _M_flags;
00330     _StateIdT  _M_start_state;
00331     _StateSet  _M_accepting_states;
00332     _SizeT     _M_subexpr_count;
00333   };
00334 
00335   // Describes a sequence of one or more %_State, its current start and end(s).
00336   //
00337   // This structure contains fragments of an NFA during construction.
00338   class _StateSeq
00339   {
00340   public:
00341     // Constructs a single-node sequence
00342     _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
00343     : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
00344     { }
00345     // Constructs a split sequence from two other sequencces
00346     _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
00347     : _M_nfa(__e1._M_nfa),
00348       _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
00349       _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
00350     { }
00351 
00352     // Constructs a split sequence from a single sequence
00353     _StateSeq(const _StateSeq& __e, _StateIdT __id)
00354     : _M_nfa(__e._M_nfa),
00355       _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
00356       _M_end1(__id), _M_end2(__e._M_end1)
00357     { }
00358 
00359     // Constructs a copy of a %_StateSeq
00360     _StateSeq(const _StateSeq& __rhs)
00361     : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
00362       _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
00363     { }
00364 
00365 
00366     _StateSeq& operator=(const _StateSeq& __rhs);
00367 
00368     _StateIdT
00369     _M_front() const
00370     { return _M_start; }
00371 
00372     // Extends a sequence by one.
00373     void
00374     _M_push_back(_StateIdT __id);
00375 
00376     // Extends and maybe joins a sequence.
00377     void
00378     _M_append(_StateIdT __id);
00379 
00380     void
00381     _M_append(_StateSeq& __rhs);
00382 
00383     // Clones an entire sequence.
00384     _StateIdT
00385     _M_clone();
00386 
00387   private:
00388     _Nfa&     _M_nfa;
00389     _StateIdT _M_start;
00390     _StateIdT _M_end1;
00391     _StateIdT _M_end2;
00392 
00393   };
00394 
00395 _GLIBCXX_END_NAMESPACE_VERSION
00396 } // namespace __regex
00397 } // namespace std
00398 
00399 #include <bits/regex_nfa.tcc>
00400