libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2010, 2011 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_compiler.h 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 namespace __regex 00034 { 00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00036 00037 struct _Scanner_base 00038 { 00039 typedef unsigned int _StateT; 00040 00041 static constexpr _StateT _S_state_at_start = 1 << 0; 00042 static constexpr _StateT _S_state_in_brace = 1 << 2; 00043 static constexpr _StateT _S_state_in_bracket = 1 << 3; 00044 00045 virtual ~_Scanner_base() { }; 00046 }; 00047 00048 // 00049 // @brief Scans an input range for regex tokens. 00050 // 00051 // The %_Scanner class interprets the regular expression pattern in the input 00052 // range passed to its constructor as a sequence of parse tokens passed to 00053 // the regular expression compiler. The sequence of tokens provided depends 00054 // on the flag settings passed to the constructor: different regular 00055 // expression grammars will interpret the same input pattern in 00056 // syntactically different ways. 00057 // 00058 template<typename _InputIterator> 00059 class _Scanner: public _Scanner_base 00060 { 00061 public: 00062 typedef _InputIterator _IteratorT; 00063 typedef typename std::iterator_traits<_IteratorT>::value_type _CharT; 00064 typedef std::basic_string<_CharT> _StringT; 00065 typedef regex_constants::syntax_option_type _FlagT; 00066 typedef const std::ctype<_CharT> _CtypeT; 00067 00068 // Token types returned from the scanner. 00069 enum _TokenT 00070 { 00071 _S_token_anychar, 00072 _S_token_backref, 00073 _S_token_bracket_begin, 00074 _S_token_bracket_end, 00075 _S_token_inverse_class, 00076 _S_token_char_class_name, 00077 _S_token_closure0, 00078 _S_token_closure1, 00079 _S_token_collelem_multi, 00080 _S_token_collelem_single, 00081 _S_token_collsymbol, 00082 _S_token_comma, 00083 _S_token_dash, 00084 _S_token_dup_count, 00085 _S_token_eof, 00086 _S_token_equiv_class_name, 00087 _S_token_interval_begin, 00088 _S_token_interval_end, 00089 _S_token_line_begin, 00090 _S_token_line_end, 00091 _S_token_opt, 00092 _S_token_or, 00093 _S_token_ord_char, 00094 _S_token_quoted_char, 00095 _S_token_subexpr_begin, 00096 _S_token_subexpr_end, 00097 _S_token_word_begin, 00098 _S_token_word_end, 00099 _S_token_unknown 00100 }; 00101 00102 public: 00103 _Scanner(_IteratorT __begin, _IteratorT __end, _FlagT __flags, 00104 std::locale __loc) 00105 : _M_current(__begin) , _M_end(__end) , _M_flags(__flags), 00106 _M_ctype(std::use_facet<_CtypeT>(__loc)), _M_state(_S_state_at_start) 00107 { _M_advance(); } 00108 00109 void 00110 _M_advance(); 00111 00112 _TokenT 00113 _M_token() const 00114 { return _M_curToken; } 00115 00116 const _StringT& 00117 _M_value() const 00118 { return _M_curValue; } 00119 00120 #ifdef _GLIBCXX_DEBUG 00121 std::ostream& 00122 _M_print(std::ostream&); 00123 #endif 00124 00125 private: 00126 void 00127 _M_eat_escape(); 00128 00129 void 00130 _M_scan_in_brace(); 00131 00132 void 00133 _M_scan_in_bracket(); 00134 00135 void 00136 _M_eat_charclass(); 00137 00138 void 00139 _M_eat_equivclass(); 00140 00141 void 00142 _M_eat_collsymbol(); 00143 00144 private: 00145 _IteratorT _M_current; 00146 _IteratorT _M_end; 00147 _FlagT _M_flags; 00148 _CtypeT& _M_ctype; 00149 _TokenT _M_curToken; 00150 _StringT _M_curValue; 00151 _StateT _M_state; 00152 }; 00153 00154 template<typename _InputIterator> 00155 void 00156 _Scanner<_InputIterator>:: 00157 _M_advance() 00158 { 00159 if (_M_current == _M_end) 00160 { 00161 _M_curToken = _S_token_eof; 00162 return; 00163 } 00164 00165 _CharT __c = *_M_current; 00166 if (_M_state & _S_state_in_bracket) 00167 { 00168 _M_scan_in_bracket(); 00169 return; 00170 } 00171 if (_M_state & _S_state_in_brace) 00172 { 00173 _M_scan_in_brace(); 00174 return; 00175 } 00176 #if 0 00177 // TODO: re-enable line anchors when _M_assertion is implemented. 00178 // See PR libstdc++/47724 00179 else if (_M_state & _S_state_at_start && __c == _M_ctype.widen('^')) 00180 { 00181 _M_curToken = _S_token_line_begin; 00182 ++_M_current; 00183 return; 00184 } 00185 else if (__c == _M_ctype.widen('$')) 00186 { 00187 _M_curToken = _S_token_line_end; 00188 ++_M_current; 00189 return; 00190 } 00191 #endif 00192 else if (__c == _M_ctype.widen('.')) 00193 { 00194 _M_curToken = _S_token_anychar; 00195 ++_M_current; 00196 return; 00197 } 00198 else if (__c == _M_ctype.widen('*')) 00199 { 00200 _M_curToken = _S_token_closure0; 00201 ++_M_current; 00202 return; 00203 } 00204 else if (__c == _M_ctype.widen('+')) 00205 { 00206 _M_curToken = _S_token_closure1; 00207 ++_M_current; 00208 return; 00209 } 00210 else if (__c == _M_ctype.widen('|')) 00211 { 00212 _M_curToken = _S_token_or; 00213 ++_M_current; 00214 return; 00215 } 00216 else if (__c == _M_ctype.widen('[')) 00217 { 00218 _M_curToken = _S_token_bracket_begin; 00219 _M_state |= (_S_state_in_bracket | _S_state_at_start); 00220 ++_M_current; 00221 return; 00222 } 00223 else if (__c == _M_ctype.widen('\\')) 00224 { 00225 _M_eat_escape(); 00226 return; 00227 } 00228 else if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00229 { 00230 if (__c == _M_ctype.widen('(')) 00231 { 00232 _M_curToken = _S_token_subexpr_begin; 00233 ++_M_current; 00234 return; 00235 } 00236 else if (__c == _M_ctype.widen(')')) 00237 { 00238 _M_curToken = _S_token_subexpr_end; 00239 ++_M_current; 00240 return; 00241 } 00242 else if (__c == _M_ctype.widen('{')) 00243 { 00244 _M_curToken = _S_token_interval_begin; 00245 _M_state |= _S_state_in_brace; 00246 ++_M_current; 00247 return; 00248 } 00249 } 00250 00251 _M_curToken = _S_token_ord_char; 00252 _M_curValue.assign(1, __c); 00253 ++_M_current; 00254 } 00255 00256 00257 template<typename _InputIterator> 00258 void 00259 _Scanner<_InputIterator>:: 00260 _M_scan_in_brace() 00261 { 00262 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 00263 { 00264 _M_curToken = _S_token_dup_count; 00265 _M_curValue.assign(1, *_M_current); 00266 ++_M_current; 00267 while (_M_current != _M_end 00268 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00269 { 00270 _M_curValue += *_M_current; 00271 ++_M_current; 00272 } 00273 return; 00274 } 00275 else if (*_M_current == _M_ctype.widen(',')) 00276 { 00277 _M_curToken = _S_token_comma; 00278 ++_M_current; 00279 return; 00280 } 00281 if (_M_flags & (regex_constants::basic | regex_constants::grep)) 00282 { 00283 if (*_M_current == _M_ctype.widen('\\')) 00284 _M_eat_escape(); 00285 } 00286 else 00287 { 00288 if (*_M_current == _M_ctype.widen('}')) 00289 { 00290 _M_curToken = _S_token_interval_end; 00291 _M_state &= ~_S_state_in_brace; 00292 ++_M_current; 00293 return; 00294 } 00295 } 00296 } 00297 00298 template<typename _InputIterator> 00299 void 00300 _Scanner<_InputIterator>:: 00301 _M_scan_in_bracket() 00302 { 00303 if (_M_state & _S_state_at_start && *_M_current == _M_ctype.widen('^')) 00304 { 00305 _M_curToken = _S_token_inverse_class; 00306 _M_state &= ~_S_state_at_start; 00307 ++_M_current; 00308 return; 00309 } 00310 else if (*_M_current == _M_ctype.widen('[')) 00311 { 00312 ++_M_current; 00313 if (_M_current == _M_end) 00314 { 00315 _M_curToken = _S_token_eof; 00316 return; 00317 } 00318 00319 if (*_M_current == _M_ctype.widen('.')) 00320 { 00321 _M_curToken = _S_token_collsymbol; 00322 _M_eat_collsymbol(); 00323 return; 00324 } 00325 else if (*_M_current == _M_ctype.widen(':')) 00326 { 00327 _M_curToken = _S_token_char_class_name; 00328 _M_eat_charclass(); 00329 return; 00330 } 00331 else if (*_M_current == _M_ctype.widen('=')) 00332 { 00333 _M_curToken = _S_token_equiv_class_name; 00334 _M_eat_equivclass(); 00335 return; 00336 } 00337 } 00338 else if (*_M_current == _M_ctype.widen('-')) 00339 { 00340 _M_curToken = _S_token_dash; 00341 ++_M_current; 00342 return; 00343 } 00344 else if (*_M_current == _M_ctype.widen(']')) 00345 { 00346 if (!(_M_flags & regex_constants::ECMAScript) 00347 || !(_M_state & _S_state_at_start)) 00348 { 00349 // special case: only if _not_ chr first after 00350 // '[' or '[^' and if not ECMAscript 00351 _M_curToken = _S_token_bracket_end; 00352 ++_M_current; 00353 return; 00354 } 00355 } 00356 _M_curToken = _S_token_collelem_single; 00357 _M_curValue.assign(1, *_M_current); 00358 ++_M_current; 00359 } 00360 00361 template<typename _InputIterator> 00362 void 00363 _Scanner<_InputIterator>:: 00364 _M_eat_escape() 00365 { 00366 ++_M_current; 00367 if (_M_current == _M_end) 00368 { 00369 _M_curToken = _S_token_eof; 00370 return; 00371 } 00372 _CharT __c = *_M_current; 00373 ++_M_current; 00374 00375 if (__c == _M_ctype.widen('(')) 00376 { 00377 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00378 { 00379 _M_curToken = _S_token_ord_char; 00380 _M_curValue.assign(1, __c); 00381 } 00382 else 00383 _M_curToken = _S_token_subexpr_begin; 00384 } 00385 else if (__c == _M_ctype.widen(')')) 00386 { 00387 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00388 { 00389 _M_curToken = _S_token_ord_char; 00390 _M_curValue.assign(1, __c); 00391 } 00392 else 00393 _M_curToken = _S_token_subexpr_end; 00394 } 00395 else if (__c == _M_ctype.widen('{')) 00396 { 00397 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00398 { 00399 _M_curToken = _S_token_ord_char; 00400 _M_curValue.assign(1, __c); 00401 } 00402 else 00403 { 00404 _M_curToken = _S_token_interval_begin; 00405 _M_state |= _S_state_in_brace; 00406 } 00407 } 00408 else if (__c == _M_ctype.widen('}')) 00409 { 00410 if (!(_M_flags & (regex_constants::basic | regex_constants::grep))) 00411 { 00412 _M_curToken = _S_token_ord_char; 00413 _M_curValue.assign(1, __c); 00414 } 00415 else 00416 { 00417 if (!(_M_state && _S_state_in_brace)) 00418 __throw_regex_error(regex_constants::error_badbrace); 00419 _M_state &= ~_S_state_in_brace; 00420 _M_curToken = _S_token_interval_end; 00421 } 00422 } 00423 else if (__c == _M_ctype.widen('x')) 00424 { 00425 ++_M_current; 00426 if (_M_current == _M_end) 00427 { 00428 _M_curToken = _S_token_eof; 00429 return; 00430 } 00431 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 00432 { 00433 _M_curValue.assign(1, *_M_current); 00434 ++_M_current; 00435 if (_M_current == _M_end) 00436 { 00437 _M_curToken = _S_token_eof; 00438 return; 00439 } 00440 if (_M_ctype.is(_CtypeT::digit, *_M_current)) 00441 { 00442 _M_curValue += *_M_current; 00443 ++_M_current; 00444 return; 00445 } 00446 } 00447 } 00448 else if (__c == _M_ctype.widen('^') 00449 || __c == _M_ctype.widen('.') 00450 || __c == _M_ctype.widen('*') 00451 || __c == _M_ctype.widen('$') 00452 || __c == _M_ctype.widen('\\')) 00453 { 00454 _M_curToken = _S_token_ord_char; 00455 _M_curValue.assign(1, __c); 00456 } 00457 else if (_M_ctype.is(_CtypeT::digit, __c)) 00458 { 00459 _M_curToken = _S_token_backref; 00460 _M_curValue.assign(1, __c); 00461 } 00462 else 00463 __throw_regex_error(regex_constants::error_escape); 00464 } 00465 00466 00467 // Eats a character class or throwns an exception. 00468 // current point to ':' delimiter on entry, char after ']' on return 00469 template<typename _InputIterator> 00470 void 00471 _Scanner<_InputIterator>:: 00472 _M_eat_charclass() 00473 { 00474 ++_M_current; // skip ':' 00475 if (_M_current == _M_end) 00476 __throw_regex_error(regex_constants::error_ctype); 00477 for (_M_curValue.clear(); 00478 _M_current != _M_end && *_M_current != _M_ctype.widen(':'); 00479 ++_M_current) 00480 _M_curValue += *_M_current; 00481 if (_M_current == _M_end) 00482 __throw_regex_error(regex_constants::error_ctype); 00483 ++_M_current; // skip ':' 00484 if (*_M_current != _M_ctype.widen(']')) 00485 __throw_regex_error(regex_constants::error_ctype); 00486 ++_M_current; // skip ']' 00487 } 00488 00489 00490 template<typename _InputIterator> 00491 void 00492 _Scanner<_InputIterator>:: 00493 _M_eat_equivclass() 00494 { 00495 ++_M_current; // skip '=' 00496 if (_M_current == _M_end) 00497 __throw_regex_error(regex_constants::error_collate); 00498 for (_M_curValue.clear(); 00499 _M_current != _M_end && *_M_current != _M_ctype.widen('='); 00500 ++_M_current) 00501 _M_curValue += *_M_current; 00502 if (_M_current == _M_end) 00503 __throw_regex_error(regex_constants::error_collate); 00504 ++_M_current; // skip '=' 00505 if (*_M_current != _M_ctype.widen(']')) 00506 __throw_regex_error(regex_constants::error_collate); 00507 ++_M_current; // skip ']' 00508 } 00509 00510 00511 template<typename _InputIterator> 00512 void 00513 _Scanner<_InputIterator>:: 00514 _M_eat_collsymbol() 00515 { 00516 ++_M_current; // skip '.' 00517 if (_M_current == _M_end) 00518 __throw_regex_error(regex_constants::error_collate); 00519 for (_M_curValue.clear(); 00520 _M_current != _M_end && *_M_current != _M_ctype.widen('.'); 00521 ++_M_current) 00522 _M_curValue += *_M_current; 00523 if (_M_current == _M_end) 00524 __throw_regex_error(regex_constants::error_collate); 00525 ++_M_current; // skip '.' 00526 if (*_M_current != _M_ctype.widen(']')) 00527 __throw_regex_error(regex_constants::error_collate); 00528 ++_M_current; // skip ']' 00529 } 00530 00531 #ifdef _GLIBCXX_DEBUG 00532 template<typename _InputIterator> 00533 std::ostream& 00534 _Scanner<_InputIterator>:: 00535 _M_print(std::ostream& ostr) 00536 { 00537 switch (_M_curToken) 00538 { 00539 case _S_token_anychar: 00540 ostr << "any-character\n"; 00541 break; 00542 case _S_token_backref: 00543 ostr << "backref\n"; 00544 break; 00545 case _S_token_bracket_begin: 00546 ostr << "bracket-begin\n"; 00547 break; 00548 case _S_token_bracket_end: 00549 ostr << "bracket-end\n"; 00550 break; 00551 case _S_token_char_class_name: 00552 ostr << "char-class-name \"" << _M_curValue << "\"\n"; 00553 break; 00554 case _S_token_closure0: 00555 ostr << "closure0\n"; 00556 break; 00557 case _S_token_closure1: 00558 ostr << "closure1\n"; 00559 break; 00560 case _S_token_collelem_multi: 00561 ostr << "coll-elem-multi \"" << _M_curValue << "\"\n"; 00562 break; 00563 case _S_token_collelem_single: 00564 ostr << "coll-elem-single \"" << _M_curValue << "\"\n"; 00565 break; 00566 case _S_token_collsymbol: 00567 ostr << "collsymbol \"" << _M_curValue << "\"\n"; 00568 break; 00569 case _S_token_comma: 00570 ostr << "comma\n"; 00571 break; 00572 case _S_token_dash: 00573 ostr << "dash\n"; 00574 break; 00575 case _S_token_dup_count: 00576 ostr << "dup count: " << _M_curValue << "\n"; 00577 break; 00578 case _S_token_eof: 00579 ostr << "EOF\n"; 00580 break; 00581 case _S_token_equiv_class_name: 00582 ostr << "equiv-class-name \"" << _M_curValue << "\"\n"; 00583 break; 00584 case _S_token_interval_begin: 00585 ostr << "interval begin\n"; 00586 break; 00587 case _S_token_interval_end: 00588 ostr << "interval end\n"; 00589 break; 00590 case _S_token_line_begin: 00591 ostr << "line begin\n"; 00592 break; 00593 case _S_token_line_end: 00594 ostr << "line end\n"; 00595 break; 00596 case _S_token_opt: 00597 ostr << "opt\n"; 00598 break; 00599 case _S_token_or: 00600 ostr << "or\n"; 00601 break; 00602 case _S_token_ord_char: 00603 ostr << "ordinary character: \"" << _M_value() << "\"\n"; 00604 break; 00605 case _S_token_quoted_char: 00606 ostr << "quoted char\n"; 00607 break; 00608 case _S_token_subexpr_begin: 00609 ostr << "subexpr begin\n"; 00610 break; 00611 case _S_token_subexpr_end: 00612 ostr << "subexpr end\n"; 00613 break; 00614 case _S_token_word_begin: 00615 ostr << "word begin\n"; 00616 break; 00617 case _S_token_word_end: 00618 ostr << "word end\n"; 00619 break; 00620 case _S_token_unknown: 00621 ostr << "-- unknown token --\n"; 00622 break; 00623 } 00624 return ostr; 00625 } 00626 #endif 00627 00628 // Builds an NFA from an input iterator interval. 00629 template<typename _InIter, typename _TraitsT> 00630 class _Compiler 00631 { 00632 public: 00633 typedef _InIter _IterT; 00634 typedef typename std::iterator_traits<_InIter>::value_type _CharT; 00635 typedef std::basic_string<_CharT> _StringT; 00636 typedef regex_constants::syntax_option_type _FlagT; 00637 00638 public: 00639 _Compiler(const _InIter& __b, const _InIter& __e, 00640 _TraitsT& __traits, _FlagT __flags); 00641 00642 const _Nfa& 00643 _M_nfa() const 00644 { return _M_state_store; } 00645 00646 private: 00647 typedef _Scanner<_InIter> _ScannerT; 00648 typedef typename _ScannerT::_TokenT _TokenT; 00649 typedef std::stack<_StateSeq, std::vector<_StateSeq> > _StackT; 00650 typedef _RangeMatcher<_InIter, _TraitsT> _RMatcherT; 00651 00652 // accepts a specific token or returns false. 00653 bool 00654 _M_match_token(_TokenT __token); 00655 00656 void 00657 _M_disjunction(); 00658 00659 bool 00660 _M_alternative(); 00661 00662 bool 00663 _M_term(); 00664 00665 bool 00666 _M_assertion(); 00667 00668 bool 00669 _M_quantifier(); 00670 00671 bool 00672 _M_atom(); 00673 00674 bool 00675 _M_bracket_expression(); 00676 00677 bool 00678 _M_bracket_list(_RMatcherT& __matcher); 00679 00680 bool 00681 _M_follow_list(_RMatcherT& __matcher); 00682 00683 bool 00684 _M_follow_list2(_RMatcherT& __matcher); 00685 00686 bool 00687 _M_expression_term(_RMatcherT& __matcher); 00688 00689 bool 00690 _M_range_expression(_RMatcherT& __matcher); 00691 00692 bool 00693 _M_start_range(_RMatcherT& __matcher); 00694 00695 bool 00696 _M_collating_symbol(_RMatcherT& __matcher); 00697 00698 bool 00699 _M_equivalence_class(_RMatcherT& __matcher); 00700 00701 bool 00702 _M_character_class(_RMatcherT& __matcher); 00703 00704 int 00705 _M_cur_int_value(int __radix); 00706 00707 private: 00708 _TraitsT& _M_traits; 00709 _ScannerT _M_scanner; 00710 _StringT _M_cur_value; 00711 _Nfa _M_state_store; 00712 _StackT _M_stack; 00713 }; 00714 00715 template<typename _InIter, typename _TraitsT> 00716 _Compiler<_InIter, _TraitsT>:: 00717 _Compiler(const _InIter& __b, const _InIter& __e, _TraitsT& __traits, 00718 _Compiler<_InIter, _TraitsT>::_FlagT __flags) 00719 : _M_traits(__traits), _M_scanner(__b, __e, __flags, _M_traits.getloc()), 00720 _M_state_store(__flags) 00721 { 00722 typedef _StartTagger<_InIter, _TraitsT> _Start; 00723 typedef _EndTagger<_InIter, _TraitsT> _End; 00724 00725 _StateSeq __r(_M_state_store, 00726 _M_state_store._M_insert_subexpr_begin(_Start(0))); 00727 _M_disjunction(); 00728 if (!_M_stack.empty()) 00729 { 00730 __r._M_append(_M_stack.top()); 00731 _M_stack.pop(); 00732 } 00733 __r._M_append(_M_state_store._M_insert_subexpr_end(0, _End(0))); 00734 __r._M_append(_M_state_store._M_insert_accept()); 00735 } 00736 00737 template<typename _InIter, typename _TraitsT> 00738 bool 00739 _Compiler<_InIter, _TraitsT>:: 00740 _M_match_token(_Compiler<_InIter, _TraitsT>::_TokenT token) 00741 { 00742 if (token == _M_scanner._M_token()) 00743 { 00744 _M_cur_value = _M_scanner._M_value(); 00745 _M_scanner._M_advance(); 00746 return true; 00747 } 00748 return false; 00749 } 00750 00751 template<typename _InIter, typename _TraitsT> 00752 void 00753 _Compiler<_InIter, _TraitsT>:: 00754 _M_disjunction() 00755 { 00756 this->_M_alternative(); 00757 if (_M_match_token(_ScannerT::_S_token_or)) 00758 { 00759 _StateSeq __alt1 = _M_stack.top(); _M_stack.pop(); 00760 this->_M_disjunction(); 00761 _StateSeq __alt2 = _M_stack.top(); _M_stack.pop(); 00762 _M_stack.push(_StateSeq(__alt1, __alt2)); 00763 } 00764 } 00765 00766 template<typename _InIter, typename _TraitsT> 00767 bool 00768 _Compiler<_InIter, _TraitsT>:: 00769 _M_alternative() 00770 { 00771 if (this->_M_term()) 00772 { 00773 _StateSeq __re = _M_stack.top(); _M_stack.pop(); 00774 this->_M_alternative(); 00775 if (!_M_stack.empty()) 00776 { 00777 __re._M_append(_M_stack.top()); 00778 _M_stack.pop(); 00779 } 00780 _M_stack.push(__re); 00781 return true; 00782 } 00783 return false; 00784 } 00785 00786 template<typename _InIter, typename _TraitsT> 00787 bool 00788 _Compiler<_InIter, _TraitsT>:: 00789 _M_term() 00790 { 00791 if (this->_M_assertion()) 00792 return true; 00793 if (this->_M_atom()) 00794 { 00795 this->_M_quantifier(); 00796 return true; 00797 } 00798 return false; 00799 } 00800 00801 template<typename _InIter, typename _TraitsT> 00802 bool 00803 _Compiler<_InIter, _TraitsT>:: 00804 _M_assertion() 00805 { 00806 if (_M_match_token(_ScannerT::_S_token_line_begin)) 00807 { 00808 // __m.push(_Matcher::_S_opcode_line_begin); 00809 return true; 00810 } 00811 if (_M_match_token(_ScannerT::_S_token_line_end)) 00812 { 00813 // __m.push(_Matcher::_S_opcode_line_end); 00814 return true; 00815 } 00816 if (_M_match_token(_ScannerT::_S_token_word_begin)) 00817 { 00818 // __m.push(_Matcher::_S_opcode_word_begin); 00819 return true; 00820 } 00821 if (_M_match_token(_ScannerT::_S_token_word_end)) 00822 { 00823 // __m.push(_Matcher::_S_opcode_word_end); 00824 return true; 00825 } 00826 return false; 00827 } 00828 00829 template<typename _InIter, typename _TraitsT> 00830 bool 00831 _Compiler<_InIter, _TraitsT>:: 00832 _M_quantifier() 00833 { 00834 if (_M_match_token(_ScannerT::_S_token_closure0)) 00835 { 00836 if (_M_stack.empty()) 00837 __throw_regex_error(regex_constants::error_badrepeat); 00838 _StateSeq __r(_M_stack.top(), -1); 00839 __r._M_append(__r._M_front()); 00840 _M_stack.pop(); 00841 _M_stack.push(__r); 00842 return true; 00843 } 00844 if (_M_match_token(_ScannerT::_S_token_closure1)) 00845 { 00846 if (_M_stack.empty()) 00847 __throw_regex_error(regex_constants::error_badrepeat); 00848 _StateSeq __r(_M_state_store, 00849 _M_state_store. 00850 _M_insert_alt(_S_invalid_state_id, 00851 _M_stack.top()._M_front())); 00852 _M_stack.top()._M_append(__r); 00853 return true; 00854 } 00855 if (_M_match_token(_ScannerT::_S_token_opt)) 00856 { 00857 if (_M_stack.empty()) 00858 __throw_regex_error(regex_constants::error_badrepeat); 00859 _StateSeq __r(_M_stack.top(), -1); 00860 _M_stack.pop(); 00861 _M_stack.push(__r); 00862 return true; 00863 } 00864 if (_M_match_token(_ScannerT::_S_token_interval_begin)) 00865 { 00866 if (_M_stack.empty()) 00867 __throw_regex_error(regex_constants::error_badrepeat); 00868 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 00869 __throw_regex_error(regex_constants::error_badbrace); 00870 _StateSeq __r(_M_stack.top()); 00871 int __min_rep = _M_cur_int_value(10); 00872 for (int __i = 1; __i < __min_rep; ++__i) 00873 _M_stack.top()._M_append(__r._M_clone()); 00874 if (_M_match_token(_ScannerT::_S_token_comma)) 00875 if (_M_match_token(_ScannerT::_S_token_dup_count)) 00876 { 00877 int __n = _M_cur_int_value(10) - __min_rep; 00878 if (__n < 0) 00879 __throw_regex_error(regex_constants::error_badbrace); 00880 for (int __i = 0; __i < __n; ++__i) 00881 { 00882 _StateSeq __r(_M_state_store, 00883 _M_state_store. 00884 _M_insert_alt(_S_invalid_state_id, 00885 _M_stack.top()._M_front())); 00886 _M_stack.top()._M_append(__r); 00887 } 00888 } 00889 else 00890 { 00891 _StateSeq __r(_M_stack.top(), -1); 00892 __r._M_push_back(__r._M_front()); 00893 _M_stack.pop(); 00894 _M_stack.push(__r); 00895 } 00896 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 00897 __throw_regex_error(regex_constants::error_brace); 00898 return true; 00899 } 00900 return false; 00901 } 00902 00903 template<typename _InIter, typename _TraitsT> 00904 bool 00905 _Compiler<_InIter, _TraitsT>:: 00906 _M_atom() 00907 { 00908 typedef _CharMatcher<_InIter, _TraitsT> _CMatcher; 00909 typedef _StartTagger<_InIter, _TraitsT> _Start; 00910 typedef _EndTagger<_InIter, _TraitsT> _End; 00911 00912 if (_M_match_token(_ScannerT::_S_token_anychar)) 00913 { 00914 _M_stack.push(_StateSeq(_M_state_store, 00915 _M_state_store._M_insert_matcher 00916 (_AnyMatcher))); 00917 return true; 00918 } 00919 if (_M_match_token(_ScannerT::_S_token_ord_char)) 00920 { 00921 _M_stack.push(_StateSeq(_M_state_store, 00922 _M_state_store._M_insert_matcher 00923 (_CMatcher(_M_cur_value[0], _M_traits)))); 00924 return true; 00925 } 00926 if (_M_match_token(_ScannerT::_S_token_quoted_char)) 00927 { 00928 // note that in the ECMA grammar, this case covers backrefs. 00929 _M_stack.push(_StateSeq(_M_state_store, 00930 _M_state_store._M_insert_matcher 00931 (_CMatcher(_M_cur_value[0], _M_traits)))); 00932 return true; 00933 } 00934 if (_M_match_token(_ScannerT::_S_token_backref)) 00935 { 00936 // __m.push(_Matcher::_S_opcode_ordchar, _M_cur_value); 00937 return true; 00938 } 00939 if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 00940 { 00941 int __mark = _M_state_store._M_sub_count(); 00942 _StateSeq __r(_M_state_store, 00943 _M_state_store. 00944 _M_insert_subexpr_begin(_Start(__mark))); 00945 this->_M_disjunction(); 00946 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00947 __throw_regex_error(regex_constants::error_paren); 00948 if (!_M_stack.empty()) 00949 { 00950 __r._M_append(_M_stack.top()); 00951 _M_stack.pop(); 00952 } 00953 __r._M_append(_M_state_store._M_insert_subexpr_end 00954 (__mark, _End(__mark))); 00955 _M_stack.push(__r); 00956 return true; 00957 } 00958 return _M_bracket_expression(); 00959 } 00960 00961 template<typename _InIter, typename _TraitsT> 00962 bool 00963 _Compiler<_InIter, _TraitsT>:: 00964 _M_bracket_expression() 00965 { 00966 if (_M_match_token(_ScannerT::_S_token_bracket_begin)) 00967 { 00968 _RMatcherT __matcher(_M_match_token(_ScannerT::_S_token_line_begin), 00969 _M_traits); 00970 if (!_M_bracket_list(__matcher) 00971 || !_M_match_token(_ScannerT::_S_token_bracket_end)) 00972 __throw_regex_error(regex_constants::error_brack); 00973 _M_stack.push(_StateSeq(_M_state_store, 00974 _M_state_store._M_insert_matcher(__matcher))); 00975 return true; 00976 } 00977 return false; 00978 } 00979 00980 // If the dash is the last character in the bracket expression, it is not 00981 // special. 00982 template<typename _InIter, typename _TraitsT> 00983 bool 00984 _Compiler<_InIter, _TraitsT>:: 00985 _M_bracket_list(_RMatcherT& __matcher) 00986 { 00987 if (_M_follow_list(__matcher)) 00988 { 00989 if (_M_match_token(_ScannerT::_S_token_dash)) 00990 __matcher._M_add_char(_M_cur_value[0]); 00991 return true; 00992 } 00993 return false; 00994 } 00995 00996 template<typename _InIter, typename _TraitsT> 00997 bool 00998 _Compiler<_InIter, _TraitsT>:: 00999 _M_follow_list(_RMatcherT& __matcher) 01000 { return _M_expression_term(__matcher) && _M_follow_list2(__matcher); } 01001 01002 template<typename _InIter, typename _TraitsT> 01003 bool 01004 _Compiler<_InIter, _TraitsT>:: 01005 _M_follow_list2(_RMatcherT& __matcher) 01006 { 01007 if (_M_expression_term(__matcher)) 01008 return _M_follow_list2(__matcher); 01009 return true; 01010 } 01011 01012 template<typename _InIter, typename _TraitsT> 01013 bool 01014 _Compiler<_InIter, _TraitsT>:: 01015 _M_expression_term(_RMatcherT& __matcher) 01016 { 01017 return (_M_collating_symbol(__matcher) 01018 || _M_character_class(__matcher) 01019 || _M_equivalence_class(__matcher) 01020 || (_M_start_range(__matcher) 01021 && _M_range_expression(__matcher))); 01022 } 01023 01024 template<typename _InIter, typename _TraitsT> 01025 bool 01026 _Compiler<_InIter, _TraitsT>:: 01027 _M_range_expression(_RMatcherT& __matcher) 01028 { 01029 if (!_M_collating_symbol(__matcher)) 01030 if (!_M_match_token(_ScannerT::_S_token_dash)) 01031 __throw_regex_error(regex_constants::error_range); 01032 __matcher._M_make_range(); 01033 return true; 01034 } 01035 01036 template<typename _InIter, typename _TraitsT> 01037 bool 01038 _Compiler<_InIter, _TraitsT>:: 01039 _M_start_range(_RMatcherT& __matcher) 01040 { return _M_match_token(_ScannerT::_S_token_dash); } 01041 01042 template<typename _InIter, typename _TraitsT> 01043 bool 01044 _Compiler<_InIter, _TraitsT>:: 01045 _M_collating_symbol(_RMatcherT& __matcher) 01046 { 01047 if (_M_match_token(_ScannerT::_S_token_collelem_single)) 01048 { 01049 __matcher._M_add_char(_M_cur_value[0]); 01050 return true; 01051 } 01052 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 01053 { 01054 __matcher._M_add_collating_element(_M_cur_value); 01055 return true; 01056 } 01057 return false; 01058 } 01059 01060 template<typename _InIter, typename _TraitsT> 01061 bool 01062 _Compiler<_InIter, _TraitsT>:: 01063 _M_equivalence_class(_RMatcherT& __matcher) 01064 { 01065 if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 01066 { 01067 __matcher._M_add_equivalence_class(_M_cur_value); 01068 return true; 01069 } 01070 return false; 01071 } 01072 01073 template<typename _InIter, typename _TraitsT> 01074 bool 01075 _Compiler<_InIter, _TraitsT>:: 01076 _M_character_class(_RMatcherT& __matcher) 01077 { 01078 if (_M_match_token(_ScannerT::_S_token_char_class_name)) 01079 { 01080 __matcher._M_add_character_class(_M_cur_value); 01081 return true; 01082 } 01083 return false; 01084 } 01085 01086 template<typename _InIter, typename _TraitsT> 01087 int 01088 _Compiler<_InIter, _TraitsT>:: 01089 _M_cur_int_value(int __radix) 01090 { 01091 int __v = 0; 01092 for (typename _StringT::size_type __i = 0; 01093 __i < _M_cur_value.length(); ++__i) 01094 __v =__v * __radix + _M_traits.value(_M_cur_value[__i], __radix); 01095 return __v; 01096 } 01097 01098 template<typename _InIter, typename _TraitsT> 01099 _AutomatonPtr 01100 __compile(const _InIter& __b, const _InIter& __e, _TraitsT& __t, 01101 regex_constants::syntax_option_type __f) 01102 { return _AutomatonPtr(new _Nfa(_Compiler<_InIter, _TraitsT>(__b, __e, __t, 01103 __f)._M_nfa())); } 01104 01105 _GLIBCXX_END_NAMESPACE_VERSION 01106 } // namespace __regex 01107 } // namespace std 01108 01109 /* vim: set ts=8 sw=2 sts=2: */