libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2010, 2011 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_nfa.h 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 namespace __regex 00034 { 00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00036 00037 // Base class for, um, automata. Could be an NFA or a DFA. Your choice. 00038 class _Automaton 00039 { 00040 public: 00041 typedef unsigned int _SizeT; 00042 00043 public: 00044 virtual 00045 ~_Automaton() { } 00046 00047 virtual _SizeT 00048 _M_sub_count() const = 0; 00049 00050 #ifdef _GLIBCXX_DEBUG 00051 virtual std::ostream& 00052 _M_dot(std::ostream& __ostr) const = 0; 00053 #endif 00054 }; 00055 00056 // Generic shared pointer to an automaton. 00057 typedef std::shared_ptr<_Automaton> _AutomatonPtr; 00058 00059 // Operation codes that define the type of transitions within the base NFA 00060 // that represents the regular expression. 00061 enum _Opcode 00062 { 00063 _S_opcode_unknown = 0, 00064 _S_opcode_alternative = 1, 00065 _S_opcode_subexpr_begin = 4, 00066 _S_opcode_subexpr_end = 5, 00067 _S_opcode_match = 100, 00068 _S_opcode_accept = 255 00069 }; 00070 00071 // Provides a generic facade for a templated match_results. 00072 struct _Results 00073 { 00074 virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0; 00075 virtual void _M_set_matched(int __i, bool __is_matched) = 0; 00076 }; 00077 00078 // Tags current state (for subexpr begin/end). 00079 typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger; 00080 00081 template<typename _FwdIterT, typename _TraitsT> 00082 struct _StartTagger 00083 { 00084 explicit 00085 _StartTagger(int __i) 00086 : _M_index(__i) 00087 { } 00088 00089 void 00090 operator()(const _PatternCursor& __pc, _Results& __r) 00091 { __r._M_set_pos(_M_index, 0, __pc); } 00092 00093 int _M_index; 00094 }; 00095 00096 template<typename _FwdIterT, typename _TraitsT> 00097 struct _EndTagger 00098 { 00099 explicit 00100 _EndTagger(int __i) 00101 : _M_index(__i) 00102 { } 00103 00104 void 00105 operator()(const _PatternCursor& __pc, _Results& __r) 00106 { __r._M_set_pos(_M_index, 1, __pc); } 00107 00108 int _M_index; 00109 _FwdIterT _M_pos; 00110 }; 00111 // Indicates if current state matches cursor current. 00112 typedef std::function<bool (const _PatternCursor&)> _Matcher; 00113 00114 // Matches any character 00115 inline bool 00116 _AnyMatcher(const _PatternCursor&) 00117 { return true; } 00118 00119 // Matches a single character 00120 template<typename _InIterT, typename _TraitsT> 00121 struct _CharMatcher 00122 { 00123 typedef typename _TraitsT::char_type char_type; 00124 00125 explicit 00126 _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT()) 00127 : _M_traits(__t), _M_c(_M_traits.translate(__c)) 00128 { } 00129 00130 bool 00131 operator()(const _PatternCursor& __pc) const 00132 { 00133 typedef const _SpecializedCursor<_InIterT>& _CursorT; 00134 _CursorT __c = static_cast<_CursorT>(__pc); 00135 return _M_traits.translate(__c._M_current()) == _M_c; 00136 } 00137 00138 const _TraitsT& _M_traits; 00139 char_type _M_c; 00140 }; 00141 00142 // Matches a character range (bracket expression) 00143 template<typename _InIterT, typename _TraitsT> 00144 struct _RangeMatcher 00145 { 00146 typedef typename _TraitsT::char_type _CharT; 00147 typedef std::basic_string<_CharT> _StringT; 00148 00149 explicit 00150 _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT()) 00151 : _M_traits(__t), _M_is_non_matching(__is_non_matching) 00152 { } 00153 00154 bool 00155 operator()(const _PatternCursor& __pc) const 00156 { 00157 typedef const _SpecializedCursor<_InIterT>& _CursorT; 00158 _CursorT __c = static_cast<_CursorT>(__pc); 00159 return true; 00160 } 00161 00162 void 00163 _M_add_char(_CharT __c) 00164 { } 00165 00166 void 00167 _M_add_collating_element(const _StringT& __s) 00168 { } 00169 00170 void 00171 _M_add_equivalence_class(const _StringT& __s) 00172 { } 00173 00174 void 00175 _M_add_character_class(const _StringT& __s) 00176 { } 00177 00178 void 00179 _M_make_range() 00180 { } 00181 00182 const _TraitsT& _M_traits; 00183 bool _M_is_non_matching; 00184 }; 00185 00186 // Identifies a state in the NFA. 00187 typedef int _StateIdT; 00188 00189 // The special case in which a state identifier is not an index. 00190 static const _StateIdT _S_invalid_state_id = -1; 00191 00192 00193 // An individual state in an NFA 00194 // 00195 // In this case a "state" is an entry in the NFA definition coupled with its 00196 // outgoing transition(s). All states have a single outgoing transition, 00197 // except for accepting states (which have no outgoing transitions) and alt 00198 // states, which have two outgoing transitions. 00199 // 00200 struct _State 00201 { 00202 typedef int _OpcodeT; 00203 00204 _OpcodeT _M_opcode; // type of outgoing transition 00205 _StateIdT _M_next; // outgoing transition 00206 _StateIdT _M_alt; // for _S_opcode_alternative 00207 unsigned int _M_subexpr; // for _S_opcode_subexpr_* 00208 _Tagger _M_tagger; // for _S_opcode_subexpr_* 00209 _Matcher _M_matches; // for _S_opcode_match 00210 00211 explicit _State(_OpcodeT __opcode) 00212 : _M_opcode(__opcode), _M_next(_S_invalid_state_id) 00213 { } 00214 00215 _State(const _Matcher& __m) 00216 : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m) 00217 { } 00218 00219 _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t) 00220 : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s), 00221 _M_tagger(__t) 00222 { } 00223 00224 _State(_StateIdT __next, _StateIdT __alt) 00225 : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt) 00226 { } 00227 00228 #ifdef _GLIBCXX_DEBUG 00229 std::ostream& 00230 _M_print(std::ostream& ostr) const; 00231 00232 // Prints graphviz dot commands for state. 00233 std::ostream& 00234 _M_dot(std::ostream& __ostr, _StateIdT __id) const; 00235 #endif 00236 }; 00237 00238 00239 // The Grep Matcher works on sets of states. Here are sets of states. 00240 typedef std::set<_StateIdT> _StateSet; 00241 00242 // A collection of all states making up an NFA 00243 // 00244 // An NFA is a 4-tuple M = (K, S, s, F), where 00245 // K is a finite set of states, 00246 // S is the alphabet of the NFA, 00247 // s is the initial state, 00248 // F is a set of final (accepting) states. 00249 // 00250 // This NFA class is templated on S, a type that will hold values of the 00251 // underlying alphabet (without regard to semantics of that alphabet). The 00252 // other elements of the tuple are generated during construction of the NFA 00253 // and are available through accessor member functions. 00254 // 00255 class _Nfa 00256 : public _Automaton, public std::vector<_State> 00257 { 00258 public: 00259 typedef _State _StateT; 00260 typedef unsigned int _SizeT; 00261 typedef regex_constants::syntax_option_type _FlagT; 00262 00263 public: 00264 _Nfa(_FlagT __f) 00265 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0) 00266 { } 00267 00268 ~_Nfa() 00269 { } 00270 00271 _FlagT 00272 _M_options() const 00273 { return _M_flags; } 00274 00275 _StateIdT 00276 _M_start() const 00277 { return _M_start_state; } 00278 00279 const _StateSet& 00280 _M_final_states() const 00281 { return _M_accepting_states; } 00282 00283 _SizeT 00284 _M_sub_count() const 00285 { return _M_subexpr_count; } 00286 00287 _StateIdT 00288 _M_insert_accept() 00289 { 00290 this->push_back(_StateT(_S_opcode_accept)); 00291 _M_accepting_states.insert(this->size()-1); 00292 return this->size()-1; 00293 } 00294 00295 _StateIdT 00296 _M_insert_alt(_StateIdT __next, _StateIdT __alt) 00297 { 00298 this->push_back(_StateT(__next, __alt)); 00299 return this->size()-1; 00300 } 00301 00302 _StateIdT 00303 _M_insert_matcher(_Matcher __m) 00304 { 00305 this->push_back(_StateT(__m)); 00306 return this->size()-1; 00307 } 00308 00309 _StateIdT 00310 _M_insert_subexpr_begin(const _Tagger& __t) 00311 { 00312 this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t)); 00313 return this->size()-1; 00314 } 00315 00316 _StateIdT 00317 _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t) 00318 { 00319 this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t)); 00320 return this->size()-1; 00321 } 00322 00323 #ifdef _GLIBCXX_DEBUG 00324 std::ostream& 00325 _M_dot(std::ostream& __ostr) const; 00326 #endif 00327 00328 private: 00329 _FlagT _M_flags; 00330 _StateIdT _M_start_state; 00331 _StateSet _M_accepting_states; 00332 _SizeT _M_subexpr_count; 00333 }; 00334 00335 // Describes a sequence of one or more %_State, its current start and end(s). 00336 // 00337 // This structure contains fragments of an NFA during construction. 00338 class _StateSeq 00339 { 00340 public: 00341 // Constructs a single-node sequence 00342 _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id) 00343 : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e) 00344 { } 00345 // Constructs a split sequence from two other sequencces 00346 _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2) 00347 : _M_nfa(__e1._M_nfa), 00348 _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)), 00349 _M_end1(__e1._M_end1), _M_end2(__e2._M_end1) 00350 { } 00351 00352 // Constructs a split sequence from a single sequence 00353 _StateSeq(const _StateSeq& __e, _StateIdT __id) 00354 : _M_nfa(__e._M_nfa), 00355 _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)), 00356 _M_end1(__id), _M_end2(__e._M_end1) 00357 { } 00358 00359 // Constructs a copy of a %_StateSeq 00360 _StateSeq(const _StateSeq& __rhs) 00361 : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start), 00362 _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2) 00363 { } 00364 00365 00366 _StateSeq& operator=(const _StateSeq& __rhs); 00367 00368 _StateIdT 00369 _M_front() const 00370 { return _M_start; } 00371 00372 // Extends a sequence by one. 00373 void 00374 _M_push_back(_StateIdT __id); 00375 00376 // Extends and maybe joins a sequence. 00377 void 00378 _M_append(_StateIdT __id); 00379 00380 void 00381 _M_append(_StateSeq& __rhs); 00382 00383 // Clones an entire sequence. 00384 _StateIdT 00385 _M_clone(); 00386 00387 private: 00388 _Nfa& _M_nfa; 00389 _StateIdT _M_start; 00390 _StateIdT _M_end1; 00391 _StateIdT _M_end2; 00392 00393 }; 00394 00395 _GLIBCXX_END_NAMESPACE_VERSION 00396 } // namespace __regex 00397 } // namespace std 00398 00399 #include <bits/regex_nfa.tcc> 00400