libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2019 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_executor.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00034 00035 namespace __detail 00036 { 00037 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00038 bool __dfs_mode> 00039 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00040 _M_search() 00041 { 00042 if (_M_search_from_first()) 00043 return true; 00044 if (_M_flags & regex_constants::match_continuous) 00045 return false; 00046 _M_flags |= regex_constants::match_prev_avail; 00047 while (_M_begin != _M_end) 00048 { 00049 ++_M_begin; 00050 if (_M_search_from_first()) 00051 return true; 00052 } 00053 return false; 00054 } 00055 00056 // The _M_main function operates in different modes, DFS mode or BFS mode, 00057 // indicated by template parameter __dfs_mode, and dispatches to one of the 00058 // _M_main_dispatch overloads. 00059 // 00060 // ------------------------------------------------------------ 00061 // 00062 // DFS mode: 00063 // 00064 // It applies a Depth-First-Search (aka backtracking) on given NFA and input 00065 // string. 00066 // At the very beginning the executor stands in the start state, then it 00067 // tries every possible state transition in current state recursively. Some 00068 // state transitions consume input string, say, a single-char-matcher or a 00069 // back-reference matcher; some don't, like assertion or other anchor nodes. 00070 // When the input is exhausted and/or the current state is an accepting 00071 // state, the whole executor returns true. 00072 // 00073 // TODO: This approach is exponentially slow for certain input. 00074 // Try to compile the NFA to a DFA. 00075 // 00076 // Time complexity: \Omega(match_length), O(2^(_M_nfa.size())) 00077 // Space complexity: \theta(match_results.size() + match_length) 00078 // 00079 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00080 bool __dfs_mode> 00081 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00082 _M_main_dispatch(_Match_mode __match_mode, __dfs) 00083 { 00084 _M_has_sol = false; 00085 *_M_states._M_get_sol_pos() = _BiIter(); 00086 _M_cur_results = _M_results; 00087 _M_dfs(__match_mode, _M_states._M_start); 00088 return _M_has_sol; 00089 } 00090 00091 // ------------------------------------------------------------ 00092 // 00093 // BFS mode: 00094 // 00095 // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html) 00096 // explained this algorithm clearly. 00097 // 00098 // It first computes epsilon closure (states that can be achieved without 00099 // consuming characters) for every state that's still matching, 00100 // using the same DFS algorithm, but doesn't re-enter states (using 00101 // _M_states._M_visited to check), nor follow _S_opcode_match. 00102 // 00103 // Then apply DFS using every _S_opcode_match (in _M_states._M_match_queue) 00104 // as the start state. 00105 // 00106 // It significantly reduces potential duplicate states, so has a better 00107 // upper bound; but it requires more overhead. 00108 // 00109 // Time complexity: \Omega(match_length * match_results.size()) 00110 // O(match_length * _M_nfa.size() * match_results.size()) 00111 // Space complexity: \Omega(_M_nfa.size() + match_results.size()) 00112 // O(_M_nfa.size() * match_results.size()) 00113 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00114 bool __dfs_mode> 00115 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00116 _M_main_dispatch(_Match_mode __match_mode, __bfs) 00117 { 00118 _M_states._M_queue(_M_states._M_start, _M_results); 00119 bool __ret = false; 00120 while (1) 00121 { 00122 _M_has_sol = false; 00123 if (_M_states._M_match_queue.empty()) 00124 break; 00125 std::fill_n(_M_states._M_visited_states.get(), _M_nfa.size(), false); 00126 auto __old_queue = std::move(_M_states._M_match_queue); 00127 for (auto& __task : __old_queue) 00128 { 00129 _M_cur_results = std::move(__task.second); 00130 _M_dfs(__match_mode, __task.first); 00131 } 00132 if (__match_mode == _Match_mode::_Prefix) 00133 __ret |= _M_has_sol; 00134 if (_M_current == _M_end) 00135 break; 00136 ++_M_current; 00137 } 00138 if (__match_mode == _Match_mode::_Exact) 00139 __ret = _M_has_sol; 00140 _M_states._M_match_queue.clear(); 00141 return __ret; 00142 } 00143 00144 // Return whether now match the given sub-NFA. 00145 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00146 bool __dfs_mode> 00147 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00148 _M_lookahead(_StateIdT __next) 00149 { 00150 // Backreferences may refer to captured content. 00151 // We may want to make this faster by not copying, 00152 // but let's not be clever prematurely. 00153 _ResultsVec __what(_M_cur_results); 00154 _Executor __sub(_M_current, _M_end, __what, _M_re, _M_flags); 00155 __sub._M_states._M_start = __next; 00156 if (__sub._M_search_from_first()) 00157 { 00158 for (size_t __i = 0; __i < __what.size(); __i++) 00159 if (__what[__i].matched) 00160 _M_cur_results[__i] = __what[__i]; 00161 return true; 00162 } 00163 return false; 00164 } 00165 00166 // __rep_count records how many times (__rep_count.second) 00167 // this node is visited under certain input iterator 00168 // (__rep_count.first). This prevent the executor from entering 00169 // infinite loop by refusing to continue when it's already been 00170 // visited more than twice. It's `twice` instead of `once` because 00171 // we need to spare one more time for potential group capture. 00172 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00173 bool __dfs_mode> 00174 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00175 _M_rep_once_more(_Match_mode __match_mode, _StateIdT __i) 00176 { 00177 const auto& __state = _M_nfa[__i]; 00178 auto& __rep_count = _M_rep_count[__i]; 00179 if (__rep_count.second == 0 || __rep_count.first != _M_current) 00180 { 00181 auto __back = __rep_count; 00182 __rep_count.first = _M_current; 00183 __rep_count.second = 1; 00184 _M_dfs(__match_mode, __state._M_alt); 00185 __rep_count = __back; 00186 } 00187 else 00188 { 00189 if (__rep_count.second < 2) 00190 { 00191 __rep_count.second++; 00192 _M_dfs(__match_mode, __state._M_alt); 00193 __rep_count.second--; 00194 } 00195 } 00196 } 00197 00198 // _M_alt branch is "match once more", while _M_next is "get me out 00199 // of this quantifier". Executing _M_next first or _M_alt first don't 00200 // mean the same thing, and we need to choose the correct order under 00201 // given greedy mode. 00202 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00203 bool __dfs_mode> 00204 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00205 _M_handle_repeat(_Match_mode __match_mode, _StateIdT __i) 00206 { 00207 const auto& __state = _M_nfa[__i]; 00208 00209 // Greedy. 00210 if (!__state._M_neg) 00211 { 00212 _M_rep_once_more(__match_mode, __i); 00213 // If it's DFS executor and already accepted, we're done. 00214 if (!__dfs_mode || !_M_has_sol) 00215 _M_dfs(__match_mode, __state._M_next); 00216 } 00217 else // Non-greedy mode 00218 { 00219 if (__dfs_mode) 00220 { 00221 // vice-versa. 00222 _M_dfs(__match_mode, __state._M_next); 00223 if (!_M_has_sol) 00224 _M_rep_once_more(__match_mode, __i); 00225 } 00226 else 00227 { 00228 // DON'T attempt anything, because there's already another 00229 // state with higher priority accepted. This state cannot 00230 // be better by attempting its next node. 00231 if (!_M_has_sol) 00232 { 00233 _M_dfs(__match_mode, __state._M_next); 00234 // DON'T attempt anything if it's already accepted. An 00235 // accepted state *must* be better than a solution that 00236 // matches a non-greedy quantifier one more time. 00237 if (!_M_has_sol) 00238 _M_rep_once_more(__match_mode, __i); 00239 } 00240 } 00241 } 00242 } 00243 00244 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00245 bool __dfs_mode> 00246 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00247 _M_handle_subexpr_begin(_Match_mode __match_mode, _StateIdT __i) 00248 { 00249 const auto& __state = _M_nfa[__i]; 00250 00251 auto& __res = _M_cur_results[__state._M_subexpr]; 00252 auto __back = __res.first; 00253 __res.first = _M_current; 00254 _M_dfs(__match_mode, __state._M_next); 00255 __res.first = __back; 00256 } 00257 00258 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00259 bool __dfs_mode> 00260 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00261 _M_handle_subexpr_end(_Match_mode __match_mode, _StateIdT __i) 00262 { 00263 const auto& __state = _M_nfa[__i]; 00264 00265 auto& __res = _M_cur_results[__state._M_subexpr]; 00266 auto __back = __res; 00267 __res.second = _M_current; 00268 __res.matched = true; 00269 _M_dfs(__match_mode, __state._M_next); 00270 __res = __back; 00271 } 00272 00273 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00274 bool __dfs_mode> 00275 inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00276 _M_handle_line_begin_assertion(_Match_mode __match_mode, _StateIdT __i) 00277 { 00278 const auto& __state = _M_nfa[__i]; 00279 if (_M_at_begin()) 00280 _M_dfs(__match_mode, __state._M_next); 00281 } 00282 00283 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00284 bool __dfs_mode> 00285 inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00286 _M_handle_line_end_assertion(_Match_mode __match_mode, _StateIdT __i) 00287 { 00288 const auto& __state = _M_nfa[__i]; 00289 if (_M_at_end()) 00290 _M_dfs(__match_mode, __state._M_next); 00291 } 00292 00293 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00294 bool __dfs_mode> 00295 inline void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00296 _M_handle_word_boundary(_Match_mode __match_mode, _StateIdT __i) 00297 { 00298 const auto& __state = _M_nfa[__i]; 00299 if (_M_word_boundary() == !__state._M_neg) 00300 _M_dfs(__match_mode, __state._M_next); 00301 } 00302 00303 // Here __state._M_alt offers a single start node for a sub-NFA. 00304 // We recursively invoke our algorithm to match the sub-NFA. 00305 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00306 bool __dfs_mode> 00307 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00308 _M_handle_subexpr_lookahead(_Match_mode __match_mode, _StateIdT __i) 00309 { 00310 const auto& __state = _M_nfa[__i]; 00311 if (_M_lookahead(__state._M_alt) == !__state._M_neg) 00312 _M_dfs(__match_mode, __state._M_next); 00313 } 00314 00315 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00316 bool __dfs_mode> 00317 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00318 _M_handle_match(_Match_mode __match_mode, _StateIdT __i) 00319 { 00320 const auto& __state = _M_nfa[__i]; 00321 00322 if (_M_current == _M_end) 00323 return; 00324 if (__dfs_mode) 00325 { 00326 if (__state._M_matches(*_M_current)) 00327 { 00328 ++_M_current; 00329 _M_dfs(__match_mode, __state._M_next); 00330 --_M_current; 00331 } 00332 } 00333 else 00334 if (__state._M_matches(*_M_current)) 00335 _M_states._M_queue(__state._M_next, _M_cur_results); 00336 } 00337 00338 template<typename _BiIter, typename _TraitsT> 00339 struct _Backref_matcher 00340 { 00341 _Backref_matcher(bool __icase, const _TraitsT& __traits) 00342 : _M_traits(__traits) { } 00343 00344 bool 00345 _M_apply(_BiIter __expected_begin, 00346 _BiIter __expected_end, _BiIter __actual_begin, 00347 _BiIter __actual_end) 00348 { 00349 return _M_traits.transform(__expected_begin, __expected_end) 00350 == _M_traits.transform(__actual_begin, __actual_end); 00351 } 00352 00353 const _TraitsT& _M_traits; 00354 }; 00355 00356 template<typename _BiIter, typename _CharT> 00357 struct _Backref_matcher<_BiIter, std::regex_traits<_CharT>> 00358 { 00359 using _TraitsT = std::regex_traits<_CharT>; 00360 _Backref_matcher(bool __icase, const _TraitsT& __traits) 00361 : _M_icase(__icase), _M_traits(__traits) { } 00362 00363 bool 00364 _M_apply(_BiIter __expected_begin, 00365 _BiIter __expected_end, _BiIter __actual_begin, 00366 _BiIter __actual_end) 00367 { 00368 if (!_M_icase) 00369 return _GLIBCXX_STD_A::__equal4(__expected_begin, __expected_end, 00370 __actual_begin, __actual_end); 00371 typedef std::ctype<_CharT> __ctype_type; 00372 const auto& __fctyp = use_facet<__ctype_type>(_M_traits.getloc()); 00373 return _GLIBCXX_STD_A::__equal4(__expected_begin, __expected_end, 00374 __actual_begin, __actual_end, 00375 [this, &__fctyp](_CharT __lhs, _CharT __rhs) 00376 { 00377 return __fctyp.tolower(__lhs) 00378 == __fctyp.tolower(__rhs); 00379 }); 00380 } 00381 00382 bool _M_icase; 00383 const _TraitsT& _M_traits; 00384 }; 00385 00386 // First fetch the matched result from _M_cur_results as __submatch; 00387 // then compare it with 00388 // (_M_current, _M_current + (__submatch.second - __submatch.first)). 00389 // If matched, keep going; else just return and try another state. 00390 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00391 bool __dfs_mode> 00392 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00393 _M_handle_backref(_Match_mode __match_mode, _StateIdT __i) 00394 { 00395 __glibcxx_assert(__dfs_mode); 00396 00397 const auto& __state = _M_nfa[__i]; 00398 auto& __submatch = _M_cur_results[__state._M_backref_index]; 00399 if (!__submatch.matched) 00400 return; 00401 auto __last = _M_current; 00402 for (auto __tmp = __submatch.first; 00403 __last != _M_end && __tmp != __submatch.second; 00404 ++__tmp) 00405 ++__last; 00406 if (_Backref_matcher<_BiIter, _TraitsT>( 00407 _M_re.flags() & regex_constants::icase, 00408 _M_re._M_automaton->_M_traits)._M_apply( 00409 __submatch.first, __submatch.second, _M_current, __last)) 00410 { 00411 if (__last != _M_current) 00412 { 00413 auto __backup = _M_current; 00414 _M_current = __last; 00415 _M_dfs(__match_mode, __state._M_next); 00416 _M_current = __backup; 00417 } 00418 else 00419 _M_dfs(__match_mode, __state._M_next); 00420 } 00421 } 00422 00423 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00424 bool __dfs_mode> 00425 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00426 _M_handle_accept(_Match_mode __match_mode, _StateIdT __i) 00427 { 00428 if (__dfs_mode) 00429 { 00430 __glibcxx_assert(!_M_has_sol); 00431 if (__match_mode == _Match_mode::_Exact) 00432 _M_has_sol = _M_current == _M_end; 00433 else 00434 _M_has_sol = true; 00435 if (_M_current == _M_begin 00436 && (_M_flags & regex_constants::match_not_null)) 00437 _M_has_sol = false; 00438 if (_M_has_sol) 00439 { 00440 if (_M_nfa._M_flags & regex_constants::ECMAScript) 00441 _M_results = _M_cur_results; 00442 else // POSIX 00443 { 00444 __glibcxx_assert(_M_states._M_get_sol_pos()); 00445 // Here's POSIX's logic: match the longest one. However 00446 // we never know which one (lhs or rhs of "|") is longer 00447 // unless we try both of them and compare the results. 00448 // The member variable _M_sol_pos records the end 00449 // position of the last successful match. It's better 00450 // to be larger, because POSIX regex is always greedy. 00451 // TODO: This could be slow. 00452 if (*_M_states._M_get_sol_pos() == _BiIter() 00453 || std::distance(_M_begin, 00454 *_M_states._M_get_sol_pos()) 00455 < std::distance(_M_begin, _M_current)) 00456 { 00457 *_M_states._M_get_sol_pos() = _M_current; 00458 _M_results = _M_cur_results; 00459 } 00460 } 00461 } 00462 } 00463 else 00464 { 00465 if (_M_current == _M_begin 00466 && (_M_flags & regex_constants::match_not_null)) 00467 return; 00468 if (__match_mode == _Match_mode::_Prefix || _M_current == _M_end) 00469 if (!_M_has_sol) 00470 { 00471 _M_has_sol = true; 00472 _M_results = _M_cur_results; 00473 } 00474 } 00475 } 00476 00477 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00478 bool __dfs_mode> 00479 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00480 _M_handle_alternative(_Match_mode __match_mode, _StateIdT __i) 00481 { 00482 const auto& __state = _M_nfa[__i]; 00483 00484 if (_M_nfa._M_flags & regex_constants::ECMAScript) 00485 { 00486 // TODO: Fix BFS support. It is wrong. 00487 _M_dfs(__match_mode, __state._M_alt); 00488 // Pick lhs if it matches. Only try rhs if it doesn't. 00489 if (!_M_has_sol) 00490 _M_dfs(__match_mode, __state._M_next); 00491 } 00492 else 00493 { 00494 // Try both and compare the result. 00495 // See "case _S_opcode_accept:" handling above. 00496 _M_dfs(__match_mode, __state._M_alt); 00497 auto __has_sol = _M_has_sol; 00498 _M_has_sol = false; 00499 _M_dfs(__match_mode, __state._M_next); 00500 _M_has_sol |= __has_sol; 00501 } 00502 } 00503 00504 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00505 bool __dfs_mode> 00506 void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00507 _M_dfs(_Match_mode __match_mode, _StateIdT __i) 00508 { 00509 if (_M_states._M_visited(__i)) 00510 return; 00511 00512 switch (_M_nfa[__i]._M_opcode()) 00513 { 00514 case _S_opcode_repeat: 00515 _M_handle_repeat(__match_mode, __i); break; 00516 case _S_opcode_subexpr_begin: 00517 _M_handle_subexpr_begin(__match_mode, __i); break; 00518 case _S_opcode_subexpr_end: 00519 _M_handle_subexpr_end(__match_mode, __i); break; 00520 case _S_opcode_line_begin_assertion: 00521 _M_handle_line_begin_assertion(__match_mode, __i); break; 00522 case _S_opcode_line_end_assertion: 00523 _M_handle_line_end_assertion(__match_mode, __i); break; 00524 case _S_opcode_word_boundary: 00525 _M_handle_word_boundary(__match_mode, __i); break; 00526 case _S_opcode_subexpr_lookahead: 00527 _M_handle_subexpr_lookahead(__match_mode, __i); break; 00528 case _S_opcode_match: 00529 _M_handle_match(__match_mode, __i); break; 00530 case _S_opcode_backref: 00531 _M_handle_backref(__match_mode, __i); break; 00532 case _S_opcode_accept: 00533 _M_handle_accept(__match_mode, __i); break; 00534 case _S_opcode_alternative: 00535 _M_handle_alternative(__match_mode, __i); break; 00536 default: 00537 __glibcxx_assert(false); 00538 } 00539 } 00540 00541 // Return whether now is at some word boundary. 00542 template<typename _BiIter, typename _Alloc, typename _TraitsT, 00543 bool __dfs_mode> 00544 bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: 00545 _M_word_boundary() const 00546 { 00547 if (_M_current == _M_begin && (_M_flags & regex_constants::match_not_bow)) 00548 return false; 00549 if (_M_current == _M_end && (_M_flags & regex_constants::match_not_eow)) 00550 return false; 00551 00552 bool __left_is_word = false; 00553 if (_M_current != _M_begin 00554 || (_M_flags & regex_constants::match_prev_avail)) 00555 { 00556 auto __prev = _M_current; 00557 if (_M_is_word(*std::prev(__prev))) 00558 __left_is_word = true; 00559 } 00560 bool __right_is_word = 00561 _M_current != _M_end && _M_is_word(*_M_current); 00562 00563 return __left_is_word != __right_is_word; 00564 } 00565 } // namespace __detail 00566 00567 _GLIBCXX_END_NAMESPACE_VERSION 00568 } // namespace