libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2019 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_scanner.h 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 namespace std _GLIBCXX_VISIBILITY(default) 00032 { 00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00034 00035 namespace __detail 00036 { 00037 /** 00038 * @addtogroup regex-detail 00039 * @{ 00040 */ 00041 00042 struct _ScannerBase 00043 { 00044 public: 00045 /// Token types returned from the scanner. 00046 enum _TokenT : unsigned 00047 { 00048 _S_token_anychar, 00049 _S_token_ord_char, 00050 _S_token_oct_num, 00051 _S_token_hex_num, 00052 _S_token_backref, 00053 _S_token_subexpr_begin, 00054 _S_token_subexpr_no_group_begin, 00055 _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n' 00056 _S_token_subexpr_end, 00057 _S_token_bracket_begin, 00058 _S_token_bracket_neg_begin, 00059 _S_token_bracket_end, 00060 _S_token_interval_begin, 00061 _S_token_interval_end, 00062 _S_token_quoted_class, 00063 _S_token_char_class_name, 00064 _S_token_collsymbol, 00065 _S_token_equiv_class_name, 00066 _S_token_opt, 00067 _S_token_or, 00068 _S_token_closure0, 00069 _S_token_closure1, 00070 _S_token_line_begin, 00071 _S_token_line_end, 00072 _S_token_word_bound, // neg if _M_value[0] == 'n' 00073 _S_token_comma, 00074 _S_token_dup_count, 00075 _S_token_eof, 00076 _S_token_bracket_dash, 00077 _S_token_unknown = -1u 00078 }; 00079 00080 protected: 00081 typedef regex_constants::syntax_option_type _FlagT; 00082 00083 enum _StateT 00084 { 00085 _S_state_normal, 00086 _S_state_in_brace, 00087 _S_state_in_bracket, 00088 }; 00089 00090 protected: 00091 _ScannerBase(_FlagT __flags) 00092 : _M_state(_S_state_normal), 00093 _M_flags(__flags), 00094 _M_escape_tbl(_M_is_ecma() 00095 ? _M_ecma_escape_tbl 00096 : _M_awk_escape_tbl), 00097 _M_spec_char(_M_is_ecma() 00098 ? _M_ecma_spec_char 00099 : _M_flags & regex_constants::basic 00100 ? _M_basic_spec_char 00101 : _M_flags & regex_constants::extended 00102 ? _M_extended_spec_char 00103 : _M_flags & regex_constants::grep 00104 ? ".[\\*^$\n" 00105 : _M_flags & regex_constants::egrep 00106 ? ".[\\()*+?{|^$\n" 00107 : _M_flags & regex_constants::awk 00108 ? _M_extended_spec_char 00109 : nullptr), 00110 _M_at_bracket_start(false) 00111 { __glibcxx_assert(_M_spec_char); } 00112 00113 protected: 00114 const char* 00115 _M_find_escape(char __c) 00116 { 00117 auto __it = _M_escape_tbl; 00118 for (; __it->first != '\0'; ++__it) 00119 if (__it->first == __c) 00120 return &__it->second; 00121 return nullptr; 00122 } 00123 00124 bool 00125 _M_is_ecma() const 00126 { return _M_flags & regex_constants::ECMAScript; } 00127 00128 bool 00129 _M_is_basic() const 00130 { return _M_flags & (regex_constants::basic | regex_constants::grep); } 00131 00132 bool 00133 _M_is_extended() const 00134 { 00135 return _M_flags & (regex_constants::extended 00136 | regex_constants::egrep 00137 | regex_constants::awk); 00138 } 00139 00140 bool 00141 _M_is_grep() const 00142 { return _M_flags & (regex_constants::grep | regex_constants::egrep); } 00143 00144 bool 00145 _M_is_awk() const 00146 { return _M_flags & regex_constants::awk; } 00147 00148 protected: 00149 // TODO: Make them static in the next abi change. 00150 const std::pair<char, _TokenT> _M_token_tbl[9] = 00151 { 00152 {'^', _S_token_line_begin}, 00153 {'$', _S_token_line_end}, 00154 {'.', _S_token_anychar}, 00155 {'*', _S_token_closure0}, 00156 {'+', _S_token_closure1}, 00157 {'?', _S_token_opt}, 00158 {'|', _S_token_or}, 00159 {'\n', _S_token_or}, // grep and egrep 00160 {'\0', _S_token_or}, 00161 }; 00162 const std::pair<char, char> _M_ecma_escape_tbl[8] = 00163 { 00164 {'0', '\0'}, 00165 {'b', '\b'}, 00166 {'f', '\f'}, 00167 {'n', '\n'}, 00168 {'r', '\r'}, 00169 {'t', '\t'}, 00170 {'v', '\v'}, 00171 {'\0', '\0'}, 00172 }; 00173 const std::pair<char, char> _M_awk_escape_tbl[11] = 00174 { 00175 {'"', '"'}, 00176 {'/', '/'}, 00177 {'\\', '\\'}, 00178 {'a', '\a'}, 00179 {'b', '\b'}, 00180 {'f', '\f'}, 00181 {'n', '\n'}, 00182 {'r', '\r'}, 00183 {'t', '\t'}, 00184 {'v', '\v'}, 00185 {'\0', '\0'}, 00186 }; 00187 const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|"; 00188 const char* _M_basic_spec_char = ".[\\*^$"; 00189 const char* _M_extended_spec_char = ".[\\()*+?{|^$"; 00190 00191 _StateT _M_state; 00192 _FlagT _M_flags; 00193 _TokenT _M_token; 00194 const std::pair<char, char>* _M_escape_tbl; 00195 const char* _M_spec_char; 00196 bool _M_at_bracket_start; 00197 }; 00198 00199 /** 00200 * @brief Scans an input range for regex tokens. 00201 * 00202 * The %_Scanner class interprets the regular expression pattern in 00203 * the input range passed to its constructor as a sequence of parse 00204 * tokens passed to the regular expression compiler. The sequence 00205 * of tokens provided depends on the flag settings passed to the 00206 * constructor: different regular expression grammars will interpret 00207 * the same input pattern in syntactically different ways. 00208 */ 00209 template<typename _CharT> 00210 class _Scanner 00211 : public _ScannerBase 00212 { 00213 public: 00214 typedef const _CharT* _IterT; 00215 typedef std::basic_string<_CharT> _StringT; 00216 typedef regex_constants::syntax_option_type _FlagT; 00217 typedef const std::ctype<_CharT> _CtypeT; 00218 00219 _Scanner(_IterT __begin, _IterT __end, 00220 _FlagT __flags, std::locale __loc); 00221 00222 void 00223 _M_advance(); 00224 00225 _TokenT 00226 _M_get_token() const 00227 { return _M_token; } 00228 00229 const _StringT& 00230 _M_get_value() const 00231 { return _M_value; } 00232 00233 #ifdef _GLIBCXX_DEBUG 00234 std::ostream& 00235 _M_print(std::ostream&); 00236 #endif 00237 00238 private: 00239 void 00240 _M_scan_normal(); 00241 00242 void 00243 _M_scan_in_bracket(); 00244 00245 void 00246 _M_scan_in_brace(); 00247 00248 void 00249 _M_eat_escape_ecma(); 00250 00251 void 00252 _M_eat_escape_posix(); 00253 00254 void 00255 _M_eat_escape_awk(); 00256 00257 void 00258 _M_eat_class(char); 00259 00260 _IterT _M_current; 00261 _IterT _M_end; 00262 _CtypeT& _M_ctype; 00263 _StringT _M_value; 00264 void (_Scanner::* _M_eat_escape)(); 00265 }; 00266 00267 //@} regex-detail 00268 } // namespace __detail 00269 _GLIBCXX_END_NAMESPACE_VERSION 00270 } // namespace std 00271 00272 #include <bits/regex_scanner.tcc>