libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2019 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_scanner.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // FIXME make comments doxygen format. 00032 00033 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep 00034 // and awk 00035 // 1) grep is basic except '\n' is treated as '|' 00036 // 2) egrep is extended except '\n' is treated as '|' 00037 // 3) awk is extended except special escaping rules, and there's no 00038 // back-reference. 00039 // 00040 // References: 00041 // 00042 // ECMAScript: ECMA-262 15.10 00043 // 00044 // basic, extended: 00045 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 00046 // 00047 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 00048 00049 namespace std _GLIBCXX_VISIBILITY(default) 00050 { 00051 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00052 00053 namespace __detail 00054 { 00055 template<typename _CharT> 00056 _Scanner<_CharT>:: 00057 _Scanner(typename _Scanner::_IterT __begin, 00058 typename _Scanner::_IterT __end, 00059 _FlagT __flags, std::locale __loc) 00060 : _ScannerBase(__flags), 00061 _M_current(__begin), _M_end(__end), 00062 _M_ctype(std::use_facet<_CtypeT>(__loc)), 00063 _M_eat_escape(_M_is_ecma() 00064 ? &_Scanner::_M_eat_escape_ecma 00065 : &_Scanner::_M_eat_escape_posix) 00066 { _M_advance(); } 00067 00068 template<typename _CharT> 00069 void 00070 _Scanner<_CharT>:: 00071 _M_advance() 00072 { 00073 if (_M_current == _M_end) 00074 { 00075 _M_token = _S_token_eof; 00076 return; 00077 } 00078 00079 if (_M_state == _S_state_normal) 00080 _M_scan_normal(); 00081 else if (_M_state == _S_state_in_bracket) 00082 _M_scan_in_bracket(); 00083 else if (_M_state == _S_state_in_brace) 00084 _M_scan_in_brace(); 00085 else 00086 { 00087 __glibcxx_assert(false); 00088 } 00089 } 00090 00091 // Differences between styles: 00092 // 1) "\(", "\)", "\{" in basic. It's not escaping. 00093 // 2) "(?:", "(?=", "(?!" in ECMAScript. 00094 template<typename _CharT> 00095 void 00096 _Scanner<_CharT>:: 00097 _M_scan_normal() 00098 { 00099 auto __c = *_M_current++; 00100 00101 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr) 00102 { 00103 _M_token = _S_token_ord_char; 00104 _M_value.assign(1, __c); 00105 return; 00106 } 00107 if (__c == '\\') 00108 { 00109 if (_M_current == _M_end) 00110 __throw_regex_error( 00111 regex_constants::error_escape, 00112 "Unexpected end of regex when escaping."); 00113 00114 if (!_M_is_basic() 00115 || (*_M_current != '(' 00116 && *_M_current != ')' 00117 && *_M_current != '{')) 00118 { 00119 (this->*_M_eat_escape)(); 00120 return; 00121 } 00122 __c = *_M_current++; 00123 } 00124 if (__c == '(') 00125 { 00126 if (_M_is_ecma() && *_M_current == '?') 00127 { 00128 if (++_M_current == _M_end) 00129 __throw_regex_error( 00130 regex_constants::error_paren, 00131 "Unexpected end of regex when in an open parenthesis."); 00132 00133 if (*_M_current == ':') 00134 { 00135 ++_M_current; 00136 _M_token = _S_token_subexpr_no_group_begin; 00137 } 00138 else if (*_M_current == '=') 00139 { 00140 ++_M_current; 00141 _M_token = _S_token_subexpr_lookahead_begin; 00142 _M_value.assign(1, 'p'); 00143 } 00144 else if (*_M_current == '!') 00145 { 00146 ++_M_current; 00147 _M_token = _S_token_subexpr_lookahead_begin; 00148 _M_value.assign(1, 'n'); 00149 } 00150 else 00151 __throw_regex_error( 00152 regex_constants::error_paren, 00153 "Invalid special open parenthesis."); 00154 } 00155 else if (_M_flags & regex_constants::nosubs) 00156 _M_token = _S_token_subexpr_no_group_begin; 00157 else 00158 _M_token = _S_token_subexpr_begin; 00159 } 00160 else if (__c == ')') 00161 _M_token = _S_token_subexpr_end; 00162 else if (__c == '[') 00163 { 00164 _M_state = _S_state_in_bracket; 00165 _M_at_bracket_start = true; 00166 if (_M_current != _M_end && *_M_current == '^') 00167 { 00168 _M_token = _S_token_bracket_neg_begin; 00169 ++_M_current; 00170 } 00171 else 00172 _M_token = _S_token_bracket_begin; 00173 } 00174 else if (__c == '{') 00175 { 00176 _M_state = _S_state_in_brace; 00177 _M_token = _S_token_interval_begin; 00178 } 00179 else if (__c != ']' && __c != '}') 00180 { 00181 auto __it = _M_token_tbl; 00182 auto __narrowc = _M_ctype.narrow(__c, '\0'); 00183 for (; __it->first != '\0'; ++__it) 00184 if (__it->first == __narrowc) 00185 { 00186 _M_token = __it->second; 00187 return; 00188 } 00189 __glibcxx_assert(false); 00190 } 00191 else 00192 { 00193 _M_token = _S_token_ord_char; 00194 _M_value.assign(1, __c); 00195 } 00196 } 00197 00198 // Differences between styles: 00199 // 1) different semantics of "[]" and "[^]". 00200 // 2) Escaping in bracket expr. 00201 template<typename _CharT> 00202 void 00203 _Scanner<_CharT>:: 00204 _M_scan_in_bracket() 00205 { 00206 if (_M_current == _M_end) 00207 __throw_regex_error( 00208 regex_constants::error_brack, 00209 "Unexpected end of regex when in bracket expression."); 00210 00211 auto __c = *_M_current++; 00212 00213 if (__c == '-') 00214 _M_token = _S_token_bracket_dash; 00215 else if (__c == '[') 00216 { 00217 if (_M_current == _M_end) 00218 __throw_regex_error(regex_constants::error_brack, 00219 "Unexpected character class open bracket."); 00220 00221 if (*_M_current == '.') 00222 { 00223 _M_token = _S_token_collsymbol; 00224 _M_eat_class(*_M_current++); 00225 } 00226 else if (*_M_current == ':') 00227 { 00228 _M_token = _S_token_char_class_name; 00229 _M_eat_class(*_M_current++); 00230 } 00231 else if (*_M_current == '=') 00232 { 00233 _M_token = _S_token_equiv_class_name; 00234 _M_eat_class(*_M_current++); 00235 } 00236 else 00237 { 00238 _M_token = _S_token_ord_char; 00239 _M_value.assign(1, __c); 00240 } 00241 } 00242 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 00243 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases 00244 // `*/empty_range.cc`. 00245 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 00246 { 00247 _M_token = _S_token_bracket_end; 00248 _M_state = _S_state_normal; 00249 } 00250 // ECMAScript and awk permits escaping in bracket. 00251 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 00252 (this->*_M_eat_escape)(); 00253 else 00254 { 00255 _M_token = _S_token_ord_char; 00256 _M_value.assign(1, __c); 00257 } 00258 _M_at_bracket_start = false; 00259 } 00260 00261 // Differences between styles: 00262 // 1) "\}" in basic style. 00263 template<typename _CharT> 00264 void 00265 _Scanner<_CharT>:: 00266 _M_scan_in_brace() 00267 { 00268 if (_M_current == _M_end) 00269 __throw_regex_error( 00270 regex_constants::error_brace, 00271 "Unexpected end of regex when in brace expression."); 00272 00273 auto __c = *_M_current++; 00274 00275 if (_M_ctype.is(_CtypeT::digit, __c)) 00276 { 00277 _M_token = _S_token_dup_count; 00278 _M_value.assign(1, __c); 00279 while (_M_current != _M_end 00280 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00281 _M_value += *_M_current++; 00282 } 00283 else if (__c == ',') 00284 _M_token = _S_token_comma; 00285 // basic use \}. 00286 else if (_M_is_basic()) 00287 { 00288 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 00289 { 00290 _M_state = _S_state_normal; 00291 _M_token = _S_token_interval_end; 00292 ++_M_current; 00293 } 00294 else 00295 __throw_regex_error(regex_constants::error_badbrace, 00296 "Unexpected character in brace expression."); 00297 } 00298 else if (__c == '}') 00299 { 00300 _M_state = _S_state_normal; 00301 _M_token = _S_token_interval_end; 00302 } 00303 else 00304 __throw_regex_error(regex_constants::error_badbrace, 00305 "Unexpected character in brace expression."); 00306 } 00307 00308 template<typename _CharT> 00309 void 00310 _Scanner<_CharT>:: 00311 _M_eat_escape_ecma() 00312 { 00313 if (_M_current == _M_end) 00314 __throw_regex_error(regex_constants::error_escape, 00315 "Unexpected end of regex when escaping."); 00316 00317 auto __c = *_M_current++; 00318 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00319 00320 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 00321 { 00322 _M_token = _S_token_ord_char; 00323 _M_value.assign(1, *__pos); 00324 } 00325 else if (__c == 'b') 00326 { 00327 _M_token = _S_token_word_bound; 00328 _M_value.assign(1, 'p'); 00329 } 00330 else if (__c == 'B') 00331 { 00332 _M_token = _S_token_word_bound; 00333 _M_value.assign(1, 'n'); 00334 } 00335 // N3376 28.13 00336 else if (__c == 'd' 00337 || __c == 'D' 00338 || __c == 's' 00339 || __c == 'S' 00340 || __c == 'w' 00341 || __c == 'W') 00342 { 00343 _M_token = _S_token_quoted_class; 00344 _M_value.assign(1, __c); 00345 } 00346 else if (__c == 'c') 00347 { 00348 if (_M_current == _M_end) 00349 __throw_regex_error( 00350 regex_constants::error_escape, 00351 "Unexpected end of regex when reading control code."); 00352 _M_token = _S_token_ord_char; 00353 _M_value.assign(1, *_M_current++); 00354 } 00355 else if (__c == 'x' || __c == 'u') 00356 { 00357 _M_value.erase(); 00358 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++) 00359 { 00360 if (_M_current == _M_end 00361 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 00362 __throw_regex_error( 00363 regex_constants::error_escape, 00364 "Unexpected end of regex when ascii character."); 00365 _M_value += *_M_current++; 00366 } 00367 _M_token = _S_token_hex_num; 00368 } 00369 // ECMAScript recognizes multi-digit back-references. 00370 else if (_M_ctype.is(_CtypeT::digit, __c)) 00371 { 00372 _M_value.assign(1, __c); 00373 while (_M_current != _M_end 00374 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00375 _M_value += *_M_current++; 00376 _M_token = _S_token_backref; 00377 } 00378 else 00379 { 00380 _M_token = _S_token_ord_char; 00381 _M_value.assign(1, __c); 00382 } 00383 } 00384 00385 // Differences between styles: 00386 // 1) Extended doesn't support backref, but basic does. 00387 template<typename _CharT> 00388 void 00389 _Scanner<_CharT>:: 00390 _M_eat_escape_posix() 00391 { 00392 if (_M_current == _M_end) 00393 __throw_regex_error(regex_constants::error_escape, 00394 "Unexpected end of regex when escaping."); 00395 00396 auto __c = *_M_current; 00397 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 00398 00399 if (__pos != nullptr && *__pos != '\0') 00400 { 00401 _M_token = _S_token_ord_char; 00402 _M_value.assign(1, __c); 00403 } 00404 // We MUST judge awk before handling backrefs. There's no backref in awk. 00405 else if (_M_is_awk()) 00406 { 00407 _M_eat_escape_awk(); 00408 return; 00409 } 00410 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 00411 { 00412 _M_token = _S_token_backref; 00413 _M_value.assign(1, __c); 00414 } 00415 else 00416 { 00417 #ifdef __STRICT_ANSI__ 00418 // POSIX says it is undefined to escape ordinary characters 00419 __throw_regex_error(regex_constants::error_escape, 00420 "Unexpected escape character."); 00421 #else 00422 _M_token = _S_token_ord_char; 00423 _M_value.assign(1, __c); 00424 #endif 00425 } 00426 ++_M_current; 00427 } 00428 00429 template<typename _CharT> 00430 void 00431 _Scanner<_CharT>:: 00432 _M_eat_escape_awk() 00433 { 00434 auto __c = *_M_current++; 00435 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00436 00437 if (__pos != nullptr) 00438 { 00439 _M_token = _S_token_ord_char; 00440 _M_value.assign(1, *__pos); 00441 } 00442 // \ddd for oct representation 00443 else if (_M_ctype.is(_CtypeT::digit, __c) 00444 && __c != '8' 00445 && __c != '9') 00446 { 00447 _M_value.assign(1, __c); 00448 for (int __i = 0; 00449 __i < 2 00450 && _M_current != _M_end 00451 && _M_ctype.is(_CtypeT::digit, *_M_current) 00452 && *_M_current != '8' 00453 && *_M_current != '9'; 00454 __i++) 00455 _M_value += *_M_current++; 00456 _M_token = _S_token_oct_num; 00457 return; 00458 } 00459 else 00460 __throw_regex_error(regex_constants::error_escape, 00461 "Unexpected escape character."); 00462 } 00463 00464 // Eats a character class or throws an exception. 00465 // __ch could be ':', '.' or '=', _M_current is the char after ']' when 00466 // returning. 00467 template<typename _CharT> 00468 void 00469 _Scanner<_CharT>:: 00470 _M_eat_class(char __ch) 00471 { 00472 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 00473 _M_value += *_M_current++; 00474 if (_M_current == _M_end 00475 || *_M_current++ != __ch 00476 || _M_current == _M_end // skip __ch 00477 || *_M_current++ != ']') // skip ']' 00478 { 00479 if (__ch == ':') 00480 __throw_regex_error(regex_constants::error_ctype, 00481 "Unexpected end of character class."); 00482 else 00483 __throw_regex_error(regex_constants::error_collate, 00484 "Unexpected end of character class."); 00485 } 00486 } 00487 00488 #ifdef _GLIBCXX_DEBUG 00489 template<typename _CharT> 00490 std::ostream& 00491 _Scanner<_CharT>:: 00492 _M_print(std::ostream& ostr) 00493 { 00494 switch (_M_token) 00495 { 00496 case _S_token_anychar: 00497 ostr << "any-character\n"; 00498 break; 00499 case _S_token_backref: 00500 ostr << "backref\n"; 00501 break; 00502 case _S_token_bracket_begin: 00503 ostr << "bracket-begin\n"; 00504 break; 00505 case _S_token_bracket_neg_begin: 00506 ostr << "bracket-neg-begin\n"; 00507 break; 00508 case _S_token_bracket_end: 00509 ostr << "bracket-end\n"; 00510 break; 00511 case _S_token_char_class_name: 00512 ostr << "char-class-name \"" << _M_value << "\"\n"; 00513 break; 00514 case _S_token_closure0: 00515 ostr << "closure0\n"; 00516 break; 00517 case _S_token_closure1: 00518 ostr << "closure1\n"; 00519 break; 00520 case _S_token_collsymbol: 00521 ostr << "collsymbol \"" << _M_value << "\"\n"; 00522 break; 00523 case _S_token_comma: 00524 ostr << "comma\n"; 00525 break; 00526 case _S_token_dup_count: 00527 ostr << "dup count: " << _M_value << "\n"; 00528 break; 00529 case _S_token_eof: 00530 ostr << "EOF\n"; 00531 break; 00532 case _S_token_equiv_class_name: 00533 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 00534 break; 00535 case _S_token_interval_begin: 00536 ostr << "interval begin\n"; 00537 break; 00538 case _S_token_interval_end: 00539 ostr << "interval end\n"; 00540 break; 00541 case _S_token_line_begin: 00542 ostr << "line begin\n"; 00543 break; 00544 case _S_token_line_end: 00545 ostr << "line end\n"; 00546 break; 00547 case _S_token_opt: 00548 ostr << "opt\n"; 00549 break; 00550 case _S_token_or: 00551 ostr << "or\n"; 00552 break; 00553 case _S_token_ord_char: 00554 ostr << "ordinary character: \"" << _M_value << "\"\n"; 00555 break; 00556 case _S_token_subexpr_begin: 00557 ostr << "subexpr begin\n"; 00558 break; 00559 case _S_token_subexpr_no_group_begin: 00560 ostr << "no grouping subexpr begin\n"; 00561 break; 00562 case _S_token_subexpr_lookahead_begin: 00563 ostr << "lookahead subexpr begin\n"; 00564 break; 00565 case _S_token_subexpr_end: 00566 ostr << "subexpr end\n"; 00567 break; 00568 case _S_token_unknown: 00569 ostr << "-- unknown token --\n"; 00570 break; 00571 case _S_token_oct_num: 00572 ostr << "oct number " << _M_value << "\n"; 00573 break; 00574 case _S_token_hex_num: 00575 ostr << "hex number " << _M_value << "\n"; 00576 break; 00577 case _S_token_quoted_class: 00578 ostr << "quoted class " << "\\" << _M_value << "\n"; 00579 break; 00580 default: 00581 _GLIBCXX_DEBUG_ASSERT(false); 00582 } 00583 return ostr; 00584 } 00585 #endif 00586 00587 } // namespace __detail 00588 _GLIBCXX_END_NAMESPACE_VERSION 00589 } // namespace